Commit 6fcdb5ff authored by ljia

modified: README.md

	new file: notebooks/run_cyclicpatternkernel.ipynb
	modified: notebooks/run_marginalizedkernel_acyclic.ipynb
	modified: notebooks/run_pathkernel_acyclic.ipynb
	modified: notebooks/run_spkernel_acyclic.ipynb
	modified: notebooks/run_treeletkernel_acyclic.ipynb
	new file: notebooks/run_treepatternkernel.ipynb
	modified: notebooks/run_weisfeilerLehmankernel_acyclic.ipynb
	modified: pygraph/kernels/README.md
	new file: pygraph/kernels/cyclicPatternKernel.py
	modified: pygraph/kernels/deltaKernel.py
	modified: pygraph/kernels/pathKernel.py
	modified: pygraph/kernels/results.md
	modified: pygraph/kernels/spKernel.py
	new file: pygraph/kernels/treePatternKernel.py
	modified: pygraph/kernels/treeletKernel.py
	modified: pygraph/kernels/untildPathKernel.py
	modified: pygraph/kernels/weisfeilerLehmanKernel.py
	modified: pygraph/utils/graphfiles.py
	modified: pygraph/utils/utils.py
parent 66e58d69
# py-graph
A python package for graph kernels.
## Requirements
* numpy - 1.13.3
* scipy - 1.0.0
* matplotlib - 2.1.0
* networkx - 2.0
* sklearn - 0.19.1
* tabulate - 0.8.2
## Results with minimal test RMSE for each kernel on dataset Acyclic
All kernels are tested on dataset Acyclic, which consists of 185 molecules (graphs).
All kernels except the cyclic pattern kernel are tested on dataset Acyclic, which consists of 185 molecules (graphs). (The cyclic pattern kernel is tested on datasets MAO and PAH.)
The criteria used for prediction are SVM for classification and kernel ridge regression for regression.
For prediction we randomly divide the data into train and test subsets, where 90% of the entire dataset is used for training and the rest for testing. 10 splits are performed. For each split, we first train on the training data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally report the corresponding performance. The final results correspond to the average of the performances on the test sets. (A code sketch of this protocol follows the result tables below.)
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time |
|---------------|:-------:|:------:|-------------:|-------:|
| Shortest path | 35.19 | 4.50 | - | 14.58" |
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" |
| Path | 14.00 | 6.93 | - | 36.21" |
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" |
| Treelet | 8.31 | 3.38 | - | 0.50" |
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.59" |
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time |
|------------------|:-------:|:------:|------------------:|-------:|
| Shortest path | 35.19 | 4.50 | - | 14.58" |
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" |
| Path | 18.41 | 10.78 | - | 29.43" |
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" |
| WL shortest path | 35.16 | 4.50 | height = 2 | 40.24" |
| WL edge | 33.41 | 4.73 | height = 5 | 5.66" |
| Treelet | 8.31 | 3.38 | - | 0.50" |
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.59" |
| Tree pattern | 7.27 | 2.21 | lamda = 1, h = 2 | 37.24" |
| Cyclic pattern | 0.9 | 0.11 | cycle bound = 100 | 0.31" |
* RMSE stands for the arithmetic mean of the root mean squared errors on all splits.
* STD stands for the standard deviation of the root mean squared errors on all splits.
* Parameter is the one with which the kernel achieves the best results.
* k_time is the time spent on building the kernel matrix.
* The targets of training data are normalized before calculating *path kernel* and *treelet kernel*.
* The targets of training data are normalized before calculating *treelet kernel*.
* See detailed results in [results.md](pygraph/kernels/results.md).
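
A minimal sketch of this protocol (an illustration with hypothetical names; the package's own `kernel_train_test` in `pygraph/utils/utils.py` additionally tunes kernel hyperparameters, which is omitted here), assuming a precomputed kernel matrix `K` and regression targets `y`:

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit

def evaluate(K, y, n_splits=10, alpha=1e-3):
    """Average test RMSE of kernel ridge regression over random 90/10 splits."""
    rmses = []
    splitter = ShuffleSplit(n_splits=n_splits, test_size=0.1, random_state=0)
    for train, test in splitter.split(y):
        model = KernelRidge(alpha=alpha, kernel='precomputed')
        model.fit(K[np.ix_(train, train)], y[train])    # train-vs-train kernel
        y_pred = model.predict(K[np.ix_(test, train)])  # test-vs-train kernel
        rmses.append(np.sqrt(mean_squared_error(y[test], y_pred)))
    return np.mean(rmses), np.std(rmses)
```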
......@@ -362,6 +362,155 @@
" 0.9 24.4241 4.95119 25.8082 3.31207 256.738"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" --- This is a regression problem ---\n",
"\n",
"\n",
" Loading dataset from file...\n",
"\n",
" Calculating kernel matrix, this could take a while...\n",
"\n",
" --- marginalized kernel matrix of size 185 built in 1133.0229969024658 seconds ---\n",
"[[ 0.0287062 0.0124634 0.00444444 ..., 0.00606061 0.00606061\n",
" 0.00606061]\n",
" [ 0.0124634 0.01108958 0.00333333 ..., 0.00454545 0.00454545\n",
" 0.00454545]\n",
" [ 0.00444444 0.00333333 0.0287062 ..., 0.00819912 0.00819912\n",
" 0.00975875]\n",
" ..., \n",
" [ 0.00606061 0.00454545 0.00819912 ..., 0.02846735 0.02836907\n",
" 0.02896354]\n",
" [ 0.00606061 0.00454545 0.00819912 ..., 0.02836907 0.02831424\n",
" 0.0288712 ]\n",
" [ 0.00606061 0.00454545 0.00975875 ..., 0.02896354 0.0288712\n",
" 0.02987915]]\n",
"\n",
" Saving kernel matrix to file...\n",
"\n",
" Mean performance on train set: 12.186285\n",
"With standard deviation: 7.038988\n",
"\n",
" Mean performance on test set: 18.024312\n",
"With standard deviation: 6.292466\n",
"\n",
"\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 18.0243 6.29247 12.1863 7.03899 1133.02\n"
]
}
],
"source": [
"%load_ext line_profiler\n",
"\n",
"import numpy as np\n",
"import sys\n",
"sys.path.insert(0, \"../\")\n",
"from pygraph.utils.utils import kernel_train_test\n",
"from pygraph.kernels.marginalizedKernel import marginalizedkernel, _marginalizedkernel_do\n",
"\n",
"datafile = '../../../../datasets/acyclic/Acyclic/dataset_bps.ds'\n",
"kernel_file_path = 'kernelmatrices_weisfeilerlehman_subtree_acyclic/'\n",
"\n",
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type', itr = 20, p_quit = 0.1)\n",
"\n",
"# kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n",
"# hyper_name = 'p_quit', hyper_range = np.linspace(0.1, 0.9, 9), normalize = False)\n",
"\n",
"%lprun -f _marginalizedkernel_do \\\n",
" kernel_train_test(datafile, kernel_file_path, marginalizedkernel, kernel_para, \\\n",
" normalize = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Timer unit: 1e-06 s\n",
"\n",
"Total time: 828.879 s\n",
"File: ../pygraph/kernels/marginalizedKernel.py\n",
"Function: _marginalizedkernel_do at line 67\n",
"\n",
"Line # Hits Time Per Hit % Time Line Contents\n",
"==============================================================\n",
" 67 def _marginalizedkernel_do(G1, G2, node_label, edge_label, p_quit, itr):\n",
" 68 \"\"\"Calculate marginalized graph kernel between 2 graphs.\n",
" 69 \n",
" 70 Parameters\n",
" 71 ----------\n",
" 72 G1, G2 : NetworkX graphs\n",
" 73 2 graphs between which the kernel is calculated.\n",
" 74 node_label : string\n",
" 75 node attribute used as label.\n",
" 76 edge_label : string\n",
" 77 edge attribute used as label.\n",
" 78 p_quit : integer\n",
" 79 the termination probability in the random walks generating step.\n",
" 80 itr : integer\n",
" 81 time of iterations to calculate R_inf.\n",
" 82 \n",
" 83 Return\n",
" 84 ------\n",
" 85 kernel : float\n",
" 86 Marginalized Kernel between 2 graphs.\n",
" 87 \"\"\"\n",
" 88 # init parameters\n",
" 89 17205 12886.0 0.7 0.0 kernel = 0\n",
" 90 17205 52542.0 3.1 0.0 num_nodes_G1 = nx.number_of_nodes(G1)\n",
" 91 17205 28240.0 1.6 0.0 num_nodes_G2 = nx.number_of_nodes(G2)\n",
" 92 17205 15595.0 0.9 0.0 p_init_G1 = 1 / num_nodes_G1 # the initial probability distribution in the random walks generating step (uniform distribution over |G|)\n",
" 93 17205 11587.0 0.7 0.0 p_init_G2 = 1 / num_nodes_G2\n",
" 94 \n",
" 95 17205 11663.0 0.7 0.0 q = p_quit * p_quit\n",
" 96 17205 10728.0 0.6 0.0 r1 = q\n",
" 97 \n",
" 98 # initial R_inf\n",
" 99 17205 38412.0 2.2 0.0 R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) # matrix to save all the R_inf for all pairs of nodes\n",
" 100 \n",
" 101 # calculate R_inf with a simple interative method\n",
" 102 344100 329235.0 1.0 0.0 for i in range(1, itr):\n",
" 103 326895 900354.0 2.8 0.1 R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])\n",
" 104 326895 2287346.0 7.0 0.3 R_inf_new.fill(r1)\n",
" 105 \n",
" 106 # calculate R_inf for each pair of nodes\n",
" 107 2653464 3667117.0 1.4 0.4 for node1 in G1.nodes(data = True):\n",
" 108 2326569 7522840.0 3.2 0.9 neighbor_n1 = G1[node1[0]]\n",
" 109 2326569 3492118.0 1.5 0.4 p_trans_n1 = (1 - p_quit) / len(neighbor_n1) # the transition probability distribution in the random walks generating step (uniform distribution over the vertices adjacent to the current vertex)\n",
" 110 24024379 27775021.0 1.2 3.4 for node2 in G2.nodes(data = True):\n",
" 111 21697810 69471941.0 3.2 8.4 neighbor_n2 = G2[node2[0]]\n",
" 112 21697810 32446626.0 1.5 3.9 p_trans_n2 = (1 - p_quit) / len(neighbor_n2) \n",
" 113 \n",
" 114 59095092 52545370.0 0.9 6.3 for neighbor1 in neighbor_n1:\n",
" 115 104193150 92513935.0 0.9 11.2 for neighbor2 in neighbor_n2:\n",
" 116 \n",
" 117 t = p_trans_n1 * p_trans_n2 * \\\n",
" 118 66795868 285324518.0 4.3 34.4 deltakernel(G1.node[neighbor1][node_label] == G2.node[neighbor2][node_label]) * \\\n",
" 119 66795868 137934393.0 2.1 16.6 deltakernel(neighbor_n1[neighbor1][edge_label] == neighbor_n2[neighbor2][edge_label])\n",
" 120 66795868 106834143.0 1.6 12.9 R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][neighbor2] # ref [1] equation (8)\n",
" 121 \n",
" 122 326895 1123677.0 3.4 0.1 R_inf[:] = R_inf_new\n",
" 123 \n",
" 124 # add elements of R_inf up and calculate kernel\n",
" 125 139656 330283.0 2.4 0.0 for node1 in G1.nodes(data = True):\n",
" 126 1264441 1435263.0 1.1 0.2 for node2 in G2.nodes(data = True): \n",
" 127 1141990 1377134.0 1.2 0.2 s = p_init_G1 * p_init_G2 * deltakernel(node1[1][node_label] == node2[1][node_label])\n",
" 128 1141990 1375456.0 1.2 0.2 kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6)\n",
" 129 \n",
" 130 17205 10801.0 0.6 0.0 return kernel"
]
},
{
"cell_type": "code",
"execution_count": 3,
......
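
The profile above shows that the bulk of the time goes into the innermost neighbour loops, where the two `deltakernel` label comparisons are re-evaluated on every one of the `itr` iterations. A minimal sketch of the same computation (a hypothetical rewrite, not the package's implementation; assumes networkx 2.0 `G.node` access and no isolated nodes): evaluate the label deltas once while building the weighted adjacency matrix `W` of the direct product graph, so each update of the R_inf fixed point (ref [1], equation (8)) reduces to a matrix-vector product:

import networkx as nx
import numpy as np

def marginalizedkernel_product_sketch(G1, G2, node_label='atom', edge_label='bond_type', p_quit=0.1, itr=20):
    nodes1, nodes2 = list(G1.nodes()), list(G2.nodes())
    pos1 = {u: i for i, u in enumerate(nodes1)}  # node id -> product-graph index part
    pos2 = {v: j for j, v in enumerate(nodes2)}
    n1, n2 = len(nodes1), len(nodes2)
    q = p_quit * p_quit

    # weighted adjacency of the direct product graph, built once;
    # degrees are assumed nonzero (no isolated nodes)
    W = np.zeros((n1 * n2, n1 * n2))
    for u in nodes1:
        t1 = (1 - p_quit) / len(G1[u])           # uniform transition probability in G1
        for v in nodes2:
            t2 = (1 - p_quit) / len(G2[v])
            for up in G1[u]:
                for vp in G2[v]:
                    if (G1.node[up][node_label] == G2.node[vp][node_label]
                            and G1[u][up][edge_label] == G2[v][vp][edge_label]):
                        W[pos1[u] * n2 + pos2[v], pos1[up] * n2 + pos2[vp]] = t1 * t2

    # R_inf fixed point, ref [1] equation (8): itr - 1 updates from zero,
    # mirroring `for i in range(1, itr)` in the profiled code above
    R = np.zeros(n1 * n2)
    for _ in range(itr - 1):
        R = q + W.dot(R)

    # ref [1] equation (6): uniform initial distributions and a node-label delta
    s = np.array([float(G1.node[u][node_label] == G2.node[v][node_label])
                  for u in nodes1 for v in nodes2]) / (n1 * n2)
    return float(s.dot(R))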
......@@ -2,23 +2,24 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The line_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext line_profiler\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
"\n",
"\n",
" Loading dataset from file...\n",
"\n",
" Calculating kernel matrix, this could take a while...\n",
"\n",
" --- mean average path kernel matrix of size 185 built in 45.52756929397583 seconds ---\n",
" --- mean average path kernel matrix of size 185 built in 29.430902242660522 seconds ---\n",
"[[ 0.55555556 0.22222222 0. ..., 0. 0. 0. ]\n",
" [ 0.22222222 0.27777778 0. ..., 0. 0. 0. ]\n",
" [ 0. 0. 0.55555556 ..., 0.03030303 0.03030303\n",
......@@ -33,16 +34,16 @@
"\n",
" Saving kernel matrix to file...\n",
"\n",
" Mean performance on train set: 3.761907\n",
"With standard deviation: 0.702594\n",
" Mean performance on train set: 3.619948\n",
"With standard deviation: 0.512351\n",
"\n",
" Mean performance on test set: 14.001515\n",
"With standard deviation: 6.936023\n",
" Mean performance on test set: 18.418852\n",
"With standard deviation: 10.781119\n",
"\n",
"\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 14.0015 6.93602 3.76191 0.702594 45.5276\n"
" 18.4189 10.7811 3.61995 0.512351 29.4309\n"
]
}
],
......@@ -59,10 +60,10 @@
"\n",
"kernel_para = dict(node_label = 'atom', edge_label = 'bond_type')\n",
"\n",
"kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = True)\n",
"kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)\n",
"\n",
"# %lprun -f _pathkernel_do \\\n",
"# kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = True)"
"# kernel_train_test(datafile, kernel_file_path, pathkernel, kernel_para, normalize = False)"
]
},
{
......@@ -81,7 +82,7 @@
"# without y normalization\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 18.4189 10.7811 3.61995 0.512351 37.0017"
" 18.4189 10.7811 3.61995 0.512351 29.4309"
]
},
{
......
......@@ -2,44 +2,42 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The line_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext line_profiler\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
"\n",
"\n",
" Loading dataset from file...\n",
"\n",
" Calculating kernel matrix, this could take a while...\n",
"--- shortest path kernel matrix of size 185 built in 14.576777696609497 seconds ---\n",
"[[ 3. 1. 3. ..., 1. 1. 1.]\n",
" [ 1. 6. 1. ..., 0. 0. 3.]\n",
" [ 3. 1. 3. ..., 1. 1. 1.]\n",
" ..., \n",
" [ 1. 0. 1. ..., 55. 21. 7.]\n",
" [ 1. 0. 1. ..., 21. 55. 7.]\n",
" [ 1. 3. 1. ..., 7. 7. 55.]]\n",
"\n",
" Saving kernel matrix to file...\n",
"\n",
"--- shortest path kernel matrix of size 185 built in 13.3865065574646 seconds ---\n",
"[[ 3. 1. 3. ... 1. 1. 1.]\n",
" [ 1. 6. 1. ... 0. 0. 3.]\n",
" [ 3. 1. 3. ... 1. 1. 1.]\n",
" ...\n",
" [ 1. 0. 1. ... 55. 21. 7.]\n",
" [ 1. 0. 1. ... 21. 55. 7.]\n",
" [ 1. 3. 1. ... 7. 7. 55.]]\n",
"\n",
" Starting calculate accuracy/rmse...\n",
"calculate performance: 94%|█████████▎| 936/1000 [00:01<00:00, 757.54it/s]\n",
" Mean performance on train set: 28.360361\n",
"With standard deviation: 1.357183\n",
"\n",
" Mean performance on test set: 35.191954\n",
"With standard deviation: 4.495767\n",
"calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 771.22it/s]\n",
"\n",
"\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 35.192 4.49577 28.3604 1.35718 14.5768\n"
" 35.192 4.49577 28.3604 1.35718 13.3865\n"
]
}
],
......
......@@ -2,15 +2,13 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The line_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext line_profiler\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
......@@ -19,68 +17,34 @@
"\n",
" Calculating kernel matrix, this could take a while...\n",
"\n",
" --- treelet kernel matrix of size 185 built in 0.48417091369628906 seconds ---\n",
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [ 1.00000000e+00 1.00000000e+00 4.00000000e+00 ..., 3.00000000e+00\n",
" 3.00000000e+00 3.00000000e+00]\n",
" ..., \n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 1.80000000e+01\n",
" 1.30548713e+01 8.19020657e+00]\n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 1.30548713e+01\n",
" 2.20000000e+01 9.71901120e+00]\n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 8.19020657e+00\n",
" 9.71901120e+00 1.60000000e+01]]\n",
"\n",
" Saving kernel matrix to file...\n",
"\n",
" --- treelet kernel matrix of size 185 built in 0.47543811798095703 seconds ---\n",
"[[4.00000000e+00 2.60653066e+00 1.00000000e+00 ... 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [2.60653066e+00 6.00000000e+00 1.00000000e+00 ... 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [1.00000000e+00 1.00000000e+00 4.00000000e+00 ... 3.00000000e+00\n",
" 3.00000000e+00 3.00000000e+00]\n",
" ...\n",
" [1.26641655e-14 1.26641655e-14 3.00000000e+00 ... 1.80000000e+01\n",
" 1.30548713e+01 8.19020657e+00]\n",
" [1.26641655e-14 1.26641655e-14 3.00000000e+00 ... 1.30548713e+01\n",
" 2.20000000e+01 9.71901120e+00]\n",
" [1.26641655e-14 1.26641655e-14 3.00000000e+00 ... 8.19020657e+00\n",
" 9.71901120e+00 1.60000000e+01]]\n",
"\n",
" Starting calculate accuracy/rmse...\n",
"calculate performance: 98%|█████████▊| 983/1000 [00:01<00:00, 796.45it/s]\n",
" Mean performance on train set: 2.688029\n",
"With standard deviation: 1.541623\n",
"\n",
" Mean performance on test set: 10.099738\n",
"With standard deviation: 5.035844\n",
"calculate performance: 100%|██████████| 1000/1000 [00:01<00:00, 745.11it/s]\n",
"\n",
"\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 10.0997 5.03584 2.68803 1.54162 0.484171\n",
"\n",
" --- This is a regression problem ---\n",
"\n",
"\n",
" Loading dataset from file...\n",
"\n",
" Calculating kernel matrix, this could take a while...\n",
"\n",
" --- treelet kernel matrix of size 185 built in 0.5003015995025635 seconds ---\n",
"[[ 4.00000000e+00 2.60653066e+00 1.00000000e+00 ..., 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [ 2.60653066e+00 6.00000000e+00 1.00000000e+00 ..., 1.26641655e-14\n",
" 1.26641655e-14 1.26641655e-14]\n",
" [ 1.00000000e+00 1.00000000e+00 4.00000000e+00 ..., 3.00000000e+00\n",
" 3.00000000e+00 3.00000000e+00]\n",
" ..., \n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 1.80000000e+01\n",
" 1.30548713e+01 8.19020657e+00]\n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 1.30548713e+01\n",
" 2.20000000e+01 9.71901120e+00]\n",
" [ 1.26641655e-14 1.26641655e-14 3.00000000e+00 ..., 8.19020657e+00\n",
" 9.71901120e+00 1.60000000e+01]]\n",
"\n",
" Saving kernel matrix to file...\n",
"\n",
" Mean performance on train set: 2.908869\n",
"With standard deviation: 1.267900\n",
"\n",
" Mean performance on test set: 8.307902\n",
"With standard deviation: 3.378376\n",
"\n",
"\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 8.3079 3.37838 2.90887 1.2679 0.500302\n"
" 10.0997 5.03584 2.68803 1.54162 0.475438\n"
]
}
],
......@@ -99,8 +63,6 @@
"\n",
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)\n",
"\n",
"kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = True)\n",
"\n",
"# %lprun -f treeletkernel \\\n",
"# kernel_train_test(datafile, kernel_file_path, treeletkernel, kernel_para, normalize = False)"
]
......@@ -121,14 +83,58 @@
"# without y normalization\n",
" RMSE_test std_test RMSE_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 10.0997 5.03584 2.68803 1.54162 0.484171"
" 10.0997 5.03584 2.68803 1.54162 0.484171\n",
"\n",
" \n",
"\n",
"# G0 -> WL subtree h = 0\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 13.9223 2.88611 13.373 0.653301 0.186731\n",
"\n",
"# G0 U G1 U G6 U G8 U G13 -> WL subtree h = 1\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 8.97706 2.90771 6.7343 1.17505 0.223171\n",
" \n",
"# all patterns \\ { G3 U G4 U G5 U G10 } -> WL subtree h = 2 \n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 7.31274 1.96289 3.73909 0.406267 0.294902\n",
"\n",
"# all patterns \\ { G4 U G5 } -> WL subtree h = 3\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 8.39977 2.78309 3.8606 1.58686 0.348912\n",
"\n",
"# all patterns \\ { G5 } \n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 9.47647 4.22113 3.18029 1.5669 0.423638\n",
" \n",
" \n",
" \n",
"# G0, -> WL subtree h = 0\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 13.9223 2.88611 13.373 0.653301 0.186731 \n",
" \n",
"# G0 U G1 U G2 U G6 U G8 U G13 -> WL subtree h = 1\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 8.62431 2.54327 5.63422 0.255002 0.290797\n",
" \n",
"# all patterns \\ { G5 U G10 } -> WL subtree h = 2\n",
" rmse_test std_test rmse_train std_train k_time\n",
"----------- ---------- ------------ ----------- --------\n",
" 10.1294 3.50275 3.69664 1.55116 0.418498"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
"scrolled": true
},
"outputs": [
{
......
......@@ -11,26 +11,30 @@ A python package for graph kernels.
* tabulate - 0.8.2
## Results with minimal test RMSE for each kernel on dataset Acyclic
All kernels are tested on dataset Acyclic, which consists of 185 molecules (graphs).
All kernels except the cyclic pattern kernel are tested on dataset Acyclic, which consists of 185 molecules (graphs). (The cyclic pattern kernel is tested on datasets MAO and PAH.)
The criteria used for prediction are SVM for classification and kernel ridge regression for regression.
For prediction we randomly divide the data into train and test subsets, where 90% of the entire dataset is used for training and the rest for testing. 10 splits are performed. For each split, we first train on the training data, then evaluate the performance on the test set. We choose the optimal parameters for the test set and finally report the corresponding performance. The final results correspond to the average of the performances on the test sets.
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time |
|---------------|:-------:|:------:|-------------:|-------:|
| Shortest path | 35.19 | 4.50 | - | 14.58" |
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" |
| Path | 14.00 | 6.93 | - | 36.21" |
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" |
| Treelet | 8.31 | 3.38 | - | 0.50" |
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.59" |
| Kernels | RMSE(℃) | STD(℃) | Parameter | k_time |
|------------------|:-------:|:------:|------------------:|-------:|
| Shortest path | 35.19 | 4.50 | - | 14.58" |
| Marginalized | 18.02 | 6.29 | p_quit = 0.1 | 4'19" |
| Path | 18.41 | 10.78 | - | 29.43" |
| WL subtree | 7.55 | 2.33 | height = 1 | 0.84" |
| WL shortest path | 35.16 | 4.50 | height = 2 | 40.24" |
| WL edge | 33.41 | 4.73 | height = 5 | 5.66" |
| Treelet | 8.31 | 3.38 | - | 0.50" |
| Path up to d | 7.43 | 2.69 | depth = 2 | 0.59" |
| Tree pattern | 7.27 | 2.21 | lamda = 1, h = 2 | 37.24" |
| Cyclic pattern | 0.9 | 0.11 | cycle bound = 100 | 0.31" |
* RMSE stands for the arithmetic mean of the root mean squared errors on all splits.
* STD stands for the standard deviation of the root mean squared errors on all splits.
* Parameter is the one with which the kernel achieves the best results.
* k_time is the time spent on building the kernel matrix.
* The targets of training data are normalized before calculating *path kernel* and *treelet kernel*.
* The targets of training data are normalized before calculating *treelet kernel*.
* See detailed results in [results.md](pygraph/kernels/results.md).
## References
......@@ -46,6 +50,10 @@ For prediction we randomly divide the data into train and test subsets, where 90% of
[6] Liva Ralaivola, Sanjay J Swamidass, Hiroto Saigo, and Pierre Baldi. Graph kernels for chemical informatics. Neural networks, 18(8):1093–1110, 2005.
[7] Pierre Mahé and Jean-Philippe Vert. Graph kernels based on tree patterns for molecules. Machine learning, 75(1):3–35, 2009.
[8] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004.
## Updates
### 2018.01.24
* ADD *path kernel up to depth d* and its results on dataset Acyclic.
......
"""
@author: linlin <jajupmochi@gmail.com>
@references:
[1] Tamás Horváth, Thomas Gärtner, and Stefan Wrobel. Cyclic pattern kernels for predictive graph mining. In Proceedings of the tenth ACM SIGKDD international conference on Knowledge discovery and data mining, pages 158–167. ACM, 2004.
[2] Hopcroft, J.; Tarjan, R. (1973). “Efficient algorithms for graph manipulation”. Communications of the ACM 16: 372–378. doi:10.1145/362248.362272.
[3] Finding all the elementary circuits of a directed graph. D. B. Johnson, SIAM Journal on Computing 4, no. 1, 77-84, 1975. http://dx.doi.org/10.1137/0204007
"""
import sys
import pathlib
sys.path.insert(0, "../")
import time
import networkx as nx
import numpy as np
from tqdm import tqdm
def cyclicpatternkernel(*args, node_label='atom', edge_label='bond_type', labeled=True, cycle_bound=None):
    """Calculate cyclic pattern graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    cycle_bound : integer
        Upper bound on the number of simple cycles per graph; a graph whose cycle count exceeds it contributes no patterns.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the cyclic pattern kernel between 2 graphs.
    """
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    start_time = time.time()

    # Get all cyclic and tree patterns of all graphs before calculating the kernels to save time;
    # this may consume a lot of memory for large datasets.
    all_patterns = [get_patterns(Gn[i], node_label=node_label, edge_label=edge_label, labeled=labeled, cycle_bound=cycle_bound)
                    for i in tqdm(range(0, len(Gn)), desc='retrieve patterns', file=sys.stdout)]

    for i in tqdm(range(0, len(Gn)), desc='calculate kernels', file=sys.stdout):
        for j in range(i, len(Gn)):
            Kmatrix[i][j] = _cyclicpatternkernel_do(all_patterns[i], all_patterns[j])
            Kmatrix[j][i] = Kmatrix[i][j]

    run_time = time.time() - start_time
    print("\n --- kernel matrix of cyclic pattern kernel of size %d built in %s seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time
def _cyclicpatternkernel_do(patterns1, patterns2):
    """Calculate the cyclic pattern kernel between 2 graphs from their pattern sets.

    Parameters
    ----------
    patterns1, patterns2 : list
        Lists of canonical pattern representations (strings) of the 2 graphs.

    Return
    ------
    kernel : float
        Cyclic pattern kernel between the 2 graphs, i.e. the number of patterns they have in common.
    """
    return len(set(patterns1) & set(patterns2))
def get_patterns(G, node_label='atom', edge_label='bond_type', labeled=True, cycle_bound=None):
    """Find all cyclic and tree patterns in a graph.

    Parameters
    ----------
    G : NetworkX graphs
        The graph in which patterns are searched.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    labeled : boolean
        Whether the graphs are labeled. The default is True.
    cycle_bound : integer
        Upper bound on the total number of simple cycles; if it is exceeded, an empty pattern list is returned.

    Return
    ------
    patterns : list
        List of canonical keys of the patterns found, each a string consisting of the labels of the nodes and edges on the pattern.
    """
    number_simplecycles = 0
    bridges = nx.Graph()
    patterns = []

    bicomponents = nx.biconnected_component_subgraphs(G)  # all biconnected components of G; uses the algorithm of reference [2], which is presumably slightly different from the one used in paper [1]
    for subgraph in bicomponents:
        if nx.number_of_edges(subgraph) > 1:
            # all simple cycles in this biconnected component; uses the algorithm of reference [3], which has time complexity O((n + e)(N + 1)) for n nodes, e edges and N simple cycles, and might be slower than the algorithm applied in paper [1]
            simple_cycles = list(nx.simple_cycles(subgraph.to_directed()))
            # in paper [1], when applying another algorithm (subroutine RT), this becomes len(simple_cycles) == cycle_bound - number_simplecycles + 1; check again.
            if cycle_bound is not None and len(simple_cycles) > cycle_bound - number_simplecycles:
                return []
            else:
                # calculate a canonical representation for each simple cycle: the lexicographically
                # smallest label string over all rotations and both traversal directions
                all_canonkeys = []
                for cycle in simple_cycles:
                    canonlist = [G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label]
                                 for node in cycle[:-1]]
                    canonkey = ''.join(canonlist)
                    canonkey = canonkey if canonkey < canonkey[::-1] else canonkey[::-1]
                    for i in range(1, len(cycle[:-1])):
                        canonlist = [G.node[node][node_label] + G[node][cycle[cycle.index(node) + 1]][edge_label]
                                     for node in cycle[i:-1] + cycle[:i]]
                        canonkey_t = ''.join(canonlist)
                        canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
                        canonkey = canonkey if canonkey < canonkey_t else canonkey_t
                    all_canonkeys.append(canonkey)

                patterns = list(set(patterns) | set(all_canonkeys))
                number_simplecycles += len(simple_cycles)
        else:
            bridges.add_edges_from(subgraph.edges(data=True))

    # calculate a canonical representation for each connected component in the bridge set
    components = list(nx.connected_component_subgraphs(bridges))  # all connected components of the bridge graph
    tree_patterns = []
    for tree in components:
        break  # tree patterns are not implemented yet
    # patterns += pi(bridges)

    return patterns
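

# Hypothetical usage sketch (an illustration, not part of the committed file):
# two identical toy labeled graphs forming one triangle; label values are
# strings because the canonical keys above concatenate node and edge labels.
if __name__ == '__main__':
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
                       (2, 0, {'bond_type': '1'})])
    g2 = g1.copy()
    Kmatrix, run_time = cyclicpatternkernel(g1, g2, node_label='atom',
                                            edge_label='bond_type', cycle_bound=100)
    print(Kmatrix)  # each entry counts the cyclic patterns the two graphs share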
def deltakernel(condition):
    """Return 1 if condition holds, 0 otherwise.

    Parameters
    ----------
    condition : Boolean
        A condition, according to which the kernel is set to 1 or 0.

    Return
    ------
    kernel : integer
        Delta kernel.

    References
    ----------
    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between labeled graphs. In Proceedings of the 20th International Conference on Machine Learning, Washington, DC, United States, 2003.
    """
    return (1 if condition else 0)
\ No newline at end of file
    return condition  # (1 if condition else 0)
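
# A quick sanity check (illustrative addition, not part of the commit): bool is
# a subclass of int in Python, so returning the condition directly behaves the
# same as the old `1 if condition else 0` branch in arithmetic contexts.
assert deltakernel('C' == 'C') * 0.5 == 0.5  # True acts as 1
assert deltakernel('C' == 'N') * 0.5 == 0.0  # False acts as 0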