Commit ad4e5415 authored by ljia

* ADD Weisfeiler-Lehman subtree kernel and its result on dataset Acyclic.

parent fe1c63c0
@@ -3,9 +3,17 @@ a python package for graph kernels.
## requirements
* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
## results with minimal RMSE for each kernel on dataset Acyclic
| Kernels       | RMSE (℃)  | std (℃)  | parameter    |
|---------------|:---------:|:--------:|-------------:|
| shortest path | 36.400524 | 5.352940 | -            |
| marginalized  | 17.8991   | 6.59104  | p_quit = 0.1 |
| path          | 14.270816 | 6.366698 | -            |
| WL subtree    | 9.01403   | 6.35786  | height = 1   |
# py-graph
a python package for graph kernels.
## requirements
* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
## results with minimal RMSE for each kernel on dataset Acyclic
| Kernels       | RMSE (℃)  | std (℃)  | parameter    |
|---------------|:---------:|:--------:|-------------:|
| shortest path | 36.400524 | 5.352940 | - |
| marginalized | 17.8991 | 6.59104 | p_quit = 0.1 |
| path | 14.270816 | 6.366698 | - |
| WL subtree | 9.01403 | 6.35786 | height = 1 |
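
A minimal usage sketch for the WL subtree kernel added in this commit (the module path is assumed from the package layout; the toy graphs below stand in for the Acyclic dataset, whose graphs carry a `label` node attribute and consecutive integer node ids):

```python
import networkx as nx
from pygraph.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel

# two toy labeled graphs standing in for the Acyclic dataset
G1 = nx.Graph()
G1.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'C'}), (2, {'label': 'O'})])
G1.add_edges_from([(0, 1), (1, 2)])
G2 = nx.Graph()
G2.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'O'})])
G2.add_edges_from([(0, 1)])

# kernel matrix of the WL subtree kernel with height 1
Kmatrix = weisfeilerlehmankernel([G1, G2], height = 1, base_kernel = 'subtree')
```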
## updates
### 2017.12.20
* ADD Weisfeiler-Lehman subtree kernel and its result on dataset Acyclic. - linlin
### 2017.12.07
* ADD mean average path kernel and its result on dataset Acyclic. - linlin
* ADD delta kernel. - linlin
* MOD restructure the code of the marginalized kernel. - linlin
### 2017.12.05
* ADD marginalized kernel and its result. - linlin
* ADD list of required python packages in file README.md. - linlin
### 2017.11.24
* ADD shortest path kernel and its result. - linlin
@@ -7,8 +7,23 @@ import numpy as np
import sys
import pathlib
from collections import Counter
sys.path.insert(0, "../")

import networkx as nx
import numpy as np
import time

from pygraph.kernels.spkernel import spkernel
from pygraph.kernels.pathKernel import pathkernel


def weisfeilerlehmankernel(*args, height = 0, base_kernel = 'subtree'):
"""Calculate Weisfeiler-Lehman kernels between graphs.
Parameters
......@@ -19,41 +34,160 @@ def weisfeilerlehmankernel(*args):
G1, G2 : NetworkX graphs
2 graphs between which the kernel is calculated.
height : subtree height
base_kernel : base kernel used in each iteration of WL kernel
the default base kernel is subtree kernel
Return
------
Kmatrix/Kernel : Numpy matrix/int
Kernel matrix, each element of which is the Weisfeiler-Lehman kernel between 2 praphs. / Weisfeiler-Lehman Kernel between 2 graphs.
Notes
-----
This function now supports WL subtree kernel and WL shortest path kernel.
References
----------
[1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research. 2011;12(Sep):2539-61.
"""
    if len(args) == 1: # for a list of graphs
        start_time = time.time()

        # for WL subtree kernel
        if base_kernel == 'subtree':
            Kmatrix = _wl_subtreekernel_do(args[0], height = height, base_kernel = 'subtree')

        # for WL edge kernel
        elif base_kernel == 'edge':
            print('edge') # @todo: WL edge kernel not implemented yet

        # for WL shortest path kernel
        elif base_kernel == 'sp':
            Gn = args[0]
            Kmatrix = np.zeros((len(Gn), len(Gn)))
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _weisfeilerlehmankernel_do(Gn[i], Gn[j])
                    Kmatrix[j][i] = Kmatrix[i][j]

        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---" % (base_kernel, len(args[0]), (time.time() - start_time)))

        return Kmatrix
    else: # for only 2 graphs
        start_time = time.time()

        # for WL subtree kernel
        if base_kernel == 'subtree':
            args = [args[0], args[1]]
            # the subtree helper returns a 2x2 kernel matrix; the kernel between the two graphs is its off-diagonal entry
            kernel = _wl_subtreekernel_do(args, height = height, base_kernel = 'subtree')[0][1]

        # for WL edge kernel
        elif base_kernel == 'edge':
            print('edge') # @todo: WL edge kernel not implemented yet

        # for WL shortest path kernel
        elif base_kernel == 'sp':
            kernel = _weisfeilerlehmankernel_do(args[0], args[1])

        print("\n --- Weisfeiler-Lehman %s kernel built in %s seconds ---" % (base_kernel, time.time() - start_time))

        return kernel

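# Usage sketch: the kernel matrix built for a list of graphs can be plugged
# into any estimator that accepts precomputed kernels, e.g. kernel ridge
# regression from sklearn; the alpha value and the name 'targets' below are
# arbitrary examples:
#
#   from sklearn.kernel_ridge import KernelRidge
#   Kmatrix = weisfeilerlehmankernel(Gn, height = 1, base_kernel = 'subtree')
#   estimator = KernelRidge(alpha = 0.001, kernel = 'precomputed')
#   estimator.fit(Kmatrix, targets) # targets: e.g. boiling points of the graphs
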
def _wl_subtreekernel_do(*args, height = 0, base_kernel = 'subtree'):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman subtree kernel between 2 graphs.
    """
    Gn = args[0]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    all_num_of_labels_occured = 0 # number of distinct labels that have occurred as node labels at least once in all graphs so far

    # iterate each height
    for h in range(height + 1):
        all_labels_ori = set() # all unique original labels in all graphs in this iteration
        all_num_of_each_label = [] # number of occurrences of each label in each graph in this iteration
        all_set_compressed = {} # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occured = all_num_of_labels_occured # number of distinct labels that have occurred as node labels at least once in all graphs so far

        # for each graph
        for idx, G in enumerate(Gn):
            # get the set of original labels
            labels_ori = list(nx.get_node_attributes(G, 'label').values())
            num_of_each_label = dict(Counter(labels_ori)) # number of occurrences of each label in this graph
            num_of_labels = len(num_of_each_label) # number of all unique labels
            all_labels_ori.update(labels_ori)
            # num_of_labels_occured += num_of_labels #@todo not precise
            num_of_labels_occured = all_num_of_labels_occured + len(all_labels_ori) + len(all_set_compressed)

            set_multisets = []
            for node in G.nodes(data = True):
                # Multiset-label determination.
                multiset = [ G.node[neighbors]['label'] for neighbors in G[node[0]] ]
                # sorting each multiset
                multiset.sort()
                multiset = node[1]['label'] + ''.join(multiset) # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets)) # set of unique multiset labels
            # a dictionary mapping original labels to new ones
            set_compressed = {}
            # if a label occurred before, assign its former compressed label; else assign the number of labels occurred + 1 as the compressed label
            for value in set_unique:
                if value in all_set_compressed.keys():
                    set_compressed.update({ value : all_set_compressed[value] })
                else:
                    set_compressed.update({ value : str(num_of_labels_occured + 1) })
                    num_of_labels_occured += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes; node ids are assumed to be consecutive integers starting from 0
            for node in G.nodes(data = True):
                node[1]['label'] = set_compressed[set_multisets[node[0]]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, 'label').values())
            num_of_each_label.update(dict(Counter(labels_comp)))
            all_num_of_each_label.append(num_of_each_label)

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                labels = set(list(all_num_of_each_label[i].keys()) + list(all_num_of_each_label[j].keys()))
                vector1 = np.matrix([ (all_num_of_each_label[i][label] if (label in all_num_of_each_label[i].keys()) else 0) for label in labels ])
                vector2 = np.matrix([ (all_num_of_each_label[j][label] if (label in all_num_of_each_label[j].keys()) else 0) for label in labels ])
                Kmatrix[i][j] += np.dot(vector1, vector2.transpose())
                Kmatrix[j][i] = Kmatrix[i][j]

        all_num_of_labels_occured += len(all_labels_ori)

    return Kmatrix

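# Worked micro-example of the inner product above: if in some iteration graph i
# has label counts {'1': 2, '2': 1} and graph j has label counts {'1': 1, '3': 1},
# then over the label set {'1', '2', '3'} the count vectors are [2, 1, 0] and
# [1, 0, 1], so this iteration adds 2*1 + 1*0 + 0*1 = 2 to Kmatrix[i][j].
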
def _weisfeilerlehmankernel_do(G1, G2):
    """Calculate the Weisfeiler-Lehman kernel between 2 graphs, applying a base kernel (originally the shortest path kernel; currently pathkernel, see the comments below to change it) to the two graphs in each iteration.
@@ -72,26 +206,27 @@ def _weisfeilerlehmankernel_do(G1, G2):
    kernel = 0 # init kernel
    num_nodes1 = G1.number_of_nodes()
    num_nodes2 = G2.number_of_nodes()
    height = 12 #min(num_nodes1, num_nodes2) #Q how to determine the upper bound of the height?

    # the first iteration
    # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
    # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }
    kernel += pathkernel(G1, G2) # change your base kernel here (and one more below)

    for h in range(0, height):
        # if labelset1 != labelset2:
        #     break

        # Weisfeiler-Lehman test of graph isomorphism
        relabel(G1)
        relabel(G2)

        # calculate kernel
        kernel += pathkernel(G1, G2) # change your base kernel here (and one more before)

        # get label sets of both graphs
        # labelset1 = { G1.nodes(data = True)[i]['label'] for i in range(num_nodes1) }
        # labelset2 = { G2.nodes(data = True)[i]['label'] for i in range(num_nodes2) }

    return kernel

@@ -106,6 +241,11 @@ def relabel(G):
        The graph whose nodes are relabeled.
    '''
    # get the set of original labels
    labels_ori = list(nx.get_node_attributes(G, 'label').values())
    num_of_each_label = dict(Counter(labels_ori))
    num_of_labels = len(num_of_each_label)

    set_multisets = []
    for node in G.nodes(data = True):
        # Multiset-label determination.
@@ -118,8 +258,13 @@ def relabel(G):
    # label compression
    set_unique = list(set(set_multisets)) # set of unique multiset labels
    set_compressed = { value : str(set_unique.index(value) + num_of_labels + 1) for value in set_unique } # assign new labels, shifted past the original ones

    # relabel nodes; node ids are assumed to be consecutive integers starting from 0
    for node in G.nodes(data = True):
        node[1]['label'] = set_compressed[set_multisets[node[0]]]

    # get the set of compressed labels
    labels_comp = list(nx.get_node_attributes(G, 'label').values())
    num_of_each_label.update(dict(Counter(labels_comp)))
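
# Minimal sketch of one relabeling step on a toy graph (node ids must be
# consecutive integers starting from 0, as the relabeling loop assumes):
#
#   G = nx.Graph()
#   G.add_nodes_from([(0, {'label': 'C'}), (1, {'label': 'O'}), (2, {'label': 'C'})])
#   G.add_edges_from([(0, 1), (1, 2)])
#   relabel(G)
#   print(nx.get_node_attributes(G, 'label'))
#   # the two end nodes share the multiset label 'CO' and receive the same
#   # compressed label; the middle node's 'OCC' receives a different one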