Skip to content

Commit 85032ce

Browse files
committed
working: add AttrLabel support for gedlibpy.
1 parent 4fe2acb commit 85032ce

14 files changed

+5929
-614
lines changed
Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
"""
@File: compare_gedlib_with_coords_in_string_and_attr_format.py

@Author: jajupmochi
@Date: May 22 2025
"""
7+
from typing import List, Optional, Tuple

import networkx as nx
import numpy as np
11+
12+
# ANSI-colored prefixes for console messages: each one is an SGR color escape,
# the bracketed tag, the reset escape and a trailing space.
ISSUE_TAG = "\033[91m[issue]\033[0m "  # Red
INFO_TAG = "\033[94m[info]\033[0m "  # Blue
SUCCESS_TAG = "\033[92m[success]\033[0m "  # Green
15+
16+
17+
def _collect_label_names(graphs):
    """Return ``(node_label_names, edge_label_names)`` found in ``graphs``.

    The names are the attribute keys of the first node (resp. edge) of the
    first graph that has at least one node (resp. edge). An empty list is
    returned for a kind when no graph contains any element of that kind.
    """
    node_names, edge_names = [], []
    for g in graphs:
        if g.number_of_nodes() > 0:
            # ``nodes(data=True)`` yields ``(node, attr_dict)`` pairs.
            node_names = list(next(iter(g.nodes(data=True)))[1].keys())
            break
    for g in graphs:
        if g.number_of_edges() > 0:
            # The attribute dict is the last item of each edge tuple.
            edge_names = list(next(iter(g.edges(data=True)))[-1].keys())
            break
    return node_names, edge_names


def fit_model_ged(
        graphs_X: List[nx.Graph],
        graphs_Y: Optional[List[nx.Graph]] = None,
        ged_options: Optional[dict] = None,
        parallel: Optional[bool] = None,
        n_jobs: Optional[int] = None,
        chunksize: Optional[int] = None,
        copy_graphs: bool = True,
        read_resu_from_file: int = 1,
        output_dir: Optional[str] = None,
        params_idx: Optional[str] = None,
        reorder_graphs: bool = False,
        verbose: int = 2,
        **kwargs
):
    """
    Fit a ``GEDModel`` and compute the graph edit distance matrix.

    Parameters
    ----------
    graphs_X:
        Graphs the model is fitted on.
    graphs_Y:
        Optional second set of graphs. When ``None`` the matrix is computed
        among ``graphs_X`` via ``fit_transform``; otherwise the model is
        fitted on ``graphs_X`` and ``graphs_Y`` is passed to ``transform``.
    ged_options:
        Required dict with the keys ``'method'``, ``'edit_cost_fun'``,
        ``'edit_costs'``, ``'optim_method'`` and ``'repeats'``.
    parallel:
        ``True`` selects the ``'imap_unordered'`` mode, ``False`` is mapped
        to ``None``; any other value is forwarded to ``GEDModel`` unchanged.
    n_jobs, chunksize, copy_graphs, verbose:
        Forwarded to ``GEDModel`` unchanged.
    read_resu_from_file, output_dir:
        Currently unused: the result caching they controlled is disabled.
    params_idx:
        Optional identifier, only used in the summary message.
    reorder_graphs:
        If true, both graph lists are reordered by their ``'id'`` key first.
    **kwargs:
        Accepted for call-site compatibility and ignored.

    Returns
    -------
    tuple
        ``(model, matrix, stats)`` where ``stats`` holds ``'n_pairs'``,
        ``'matrix_shape'``, ``'run_time'`` and ``'run_time_per_pair'``
        (``nan`` when there are no pairs).

    Raises
    ------
    TypeError
        If ``ged_options`` is ``None``.
    AssertionError
        If fitting or transforming fails; the original exception is chained
        as the cause.
    """
    if ged_options is None:
        # Fail early with a clear message instead of an opaque
        # "'NoneType' object is not subscriptable" further down.
        raise TypeError(
            'ged_options must be a dict with the keys "method", '
            '"edit_cost_fun", "edit_costs", "optim_method" and "repeats".'
        )

    # NOTE: loading/saving the fitted model from/to a pickle file under
    # `output_dir` (keyed by `params_idx`) is currently disabled, so
    # `read_resu_from_file` and `output_dir` have no effect.

    # Reorder graphs if specified:
    if reorder_graphs:
        # NOTE(review): `reorder_graphs_by_index` is neither defined nor
        # imported in the visible part of this file -- confirm where it
        # lives before calling with `reorder_graphs=True`.
        graphs_X = reorder_graphs_by_index(graphs_X, idx_key='id')
        if graphs_Y is not None:
            graphs_Y = reorder_graphs_by_index(graphs_Y, idx_key='id')

    # Compute metric matrix otherwise:
    print(f'{INFO_TAG}Computing metric matrix...')
    all_graphs = graphs_X + graphs_Y if graphs_Y else graphs_X
    nl_names, el_names = _collect_label_names(all_graphs)

    from gklearn.experiments.ged.ged_model.parallel_version import GEDModel

    if parallel is False:
        parallel = None
    elif parallel is True:
        parallel = 'imap_unordered'

    model = GEDModel(
        ed_method=ged_options['method'],
        edit_cost_fun=ged_options['edit_cost_fun'],
        init_edit_cost_constants=ged_options['edit_costs'],
        optim_method=ged_options['optim_method'],
        node_labels=nl_names, edge_labels=el_names,
        parallel=parallel,
        n_jobs=n_jobs,
        chunksize=chunksize,
        copy_graphs=copy_graphs,
        # make sure it is a full deep copy. and faster!
        verbose=verbose
    )

    # Train model.
    try:
        if graphs_Y is None:
            # Compute the distance matrix for the same set of graphs:
            matrix = model.fit_transform(
                graphs_X, y=graphs_Y,
                save_dm_train=True, repeats=ged_options['repeats'],
            )
        else:
            model.fit(graphs_X, repeats=ged_options['repeats'])
            matrix = model.transform(
                graphs_Y,
                save_dm_test=True, repeats=ged_options['repeats'],
            )
    except OSError as exception:
        # Match against the full message: `args[0]` can be an errno integer
        # (or be missing altogether) for OSError instances.
        if 'GLIBC_2.23' in str(exception):
            msg = (
                'This error is very likely due to the low version of GLIBC '
                'on your system. '
                'The required version of GLIBC is 2.23. This may happen on the '
                'CentOS 7 system, where the highest version of GLIBC is 2.17. '
                'You may check your GLIBC version by bash command `rpm -q glibc`. '
                'The `graphkit-learn` library comes with GLIBC_2.23, which you can '
                'install by enabling the `--build-gedlib` option: '
                '`python3 setup.py install --build-gedlib`. This will compile the C++ '
                'module `gedlib`, which requires a C++ compiler and CMake.'
            )
            raise AssertionError(msg) from exception
        # Raise explicitly rather than `assert False`, which is stripped
        # when Python runs with -O and would silently swallow the failure.
        raise AssertionError(exception) from exception
    except Exception as exception:
        raise AssertionError(exception) from exception

    # Save history:
    # For graph kernels it is n * (n - 1) / 2:
    if graphs_Y is None:
        n_pairs = len(graphs_X) * (len(graphs_X) - 1) / 2
    else:
        n_pairs = len(graphs_X) * len(graphs_Y)
    # Fewer than two graphs (or an empty `graphs_Y`) yields zero pairs;
    # report NaN instead of raising ZeroDivisionError.
    run_time_per_pair = model.run_time / n_pairs if n_pairs else float('nan')

    # Print out the information:
    params_msg = f' for parameters {params_idx}' if params_idx else ''
    print(
        f'{SUCCESS_TAG}Computed metric matrix of size {matrix.shape} in {model.run_time:.3f} '
        f'seconds ({run_time_per_pair:.9f} s per pair){params_msg}.'
    )

    stats = {
        'n_pairs': n_pairs,
        'matrix_shape': matrix.shape,
        'run_time': model.run_time,
        'run_time_per_pair': run_time_per_pair,
    }

    return model, matrix, stats
153+
154+
155+
def show_some_graphs(graphs, max_graphs: int = 5) -> None:
    """
    Print a short summary of the first few graphs in ``graphs``.

    For each shown graph the number of nodes and edges is printed, followed
    by the node and edge views including their attribute dicts.

    Parameters
    ----------
    graphs:
        Sliceable sequence of graphs exposing ``number_of_nodes``,
        ``number_of_edges``, ``nodes`` and ``edges``.
    max_graphs:
        Maximum number of graphs to show (the first five by default).
    """
    print(f'{INFO_TAG}Showing some graphs:')
    for i, g in enumerate(graphs[:max_graphs]):
        print(f'Graph {i}:')
        print('Number of nodes:', g.number_of_nodes())
        print('Number of edges:', g.number_of_edges())
        print('Nodes:', g.nodes(data=True))
        print('Edges:', g.edges(data=True))
        print()
167+
168+
169+
def convert_graphs_coords_from_attr_to_string(graphs: List[nx.Graph]) -> None:
    """
    Convert the coordinates of nodes in graphs from the attribute format `AttrLabel` to the string format `GXLLabel`.

    The graphs are modified in place: for every node that has a ``'coords'``
    attribute, its first two entries are stored as strings under the keys
    ``'x'`` and ``'y'`` and the ``'coords'`` attribute is removed. Nodes
    without ``'coords'`` are left untouched.
    """
    for g in graphs:
        for _, attrs in g.nodes(data=True):
            if 'coords' not in attrs:
                continue
            coords = attrs['coords']
            # Read both components before mutating, so that a too-short
            # `coords` fails without leaving the node half converted.
            x, y = coords[0], coords[1]
            attrs['x'] = str(x)
            attrs['y'] = str(y)
            del attrs['coords']
    print(f'{INFO_TAG}Converted coordinates from attribute format to string format.')
182+
183+
184+
def fit_model_attr_version(
        seed: int = 42, n_graphs: int = 100
) -> Tuple[np.ndarray, float]:
    """
    Fit the GED model with graphs that have coordinates on nodes in attribute format `AttrLabel`.

    Random graphs are generated with a 2-dimensional continuous node feature
    stored under the key ``'coords'`` and passed to ``fit_model_ged`` using
    the ``'GEOMETRIC'`` edit cost function.

    Parameters
    ----------
    seed:
        Seed forwarded to the graph generator.
    n_graphs:
        Number of graphs to generate.

    Returns
    -------
    tuple
        ``(matrix, run_time)``: the matrix returned by ``fit_model_ged`` and
        the run time reported in its stats.
    """
    print(
        f'\n{INFO_TAG}Fitting model with graphs with coordinates in attribute format...'
    )

    from gklearn.experiments.ged.ged_model.graph_generator import GraphGenerator
    generator = GraphGenerator(
        num_graphs=n_graphs,
        max_num_nodes=20,
        min_num_nodes=10,
        max_num_edges=50,
        min_num_edges=20,
        node_feat_type='float',
        edge_feat_type=None,
        with_discrete_n_features=False,
        with_discrete_e_features=False,
        with_continuous_n_features=True,
        with_continuous_e_features=False,
        continuous_n_feature_key='coords',
        continuous_n_feature_dim=2,
        continuous_e_feature_dim=0,
        seed=seed
    )
    graphs = generator.generate_graphs()
    # The coordinates are kept as generated here (no string conversion), so
    # the message must say "attribute format".
    print(
        f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in attribute format.'
    )
    show_some_graphs(graphs)

    # Set GED options:
    ged_options = {
        'method': 'BIPARTITE',
        'edit_cost_fun': 'GEOMETRIC',
        'edit_costs': [3, 3, 1, 3, 3, 1],
        'optim_method': 'init',
        'repeats': 1
    }

    fit_settings = {
        'parallel': None,
        'n_jobs': 1,  # min(12, max(os.cpu_count() - 2, 0)),
        'chunksize': None,  # None == automatic determination
        'copy_graphs': True,
        'reorder_graphs': False,
    }

    # Fit model and compute GED matrix:
    model, matrix, stats = fit_model_ged(
        graphs,
        graphs_Y=None,
        ged_options=ged_options,
        read_resu_from_file=0,
        output_dir=None,
        params_idx=None,
        verbose=2,
        **fit_settings
    )
    print("Model:", model)
    print("Matrix shape:", matrix.shape)
    print("Run time:", stats['run_time'])

    return matrix, stats['run_time']
251+
252+
253+
def fit_model_string_version(
        seed: int = 42, n_graphs: int = 100
) -> Tuple[np.ndarray, float]:
    """
    Fit the GED model with graphs that have coordinates on nodes in string format `GXLLabel`.

    Random graphs are generated with a 2-dimensional continuous node feature
    under the key ``'coords'``, which is then converted in place to the
    string attributes ``'x'`` and ``'y'``. The graphs are passed to
    ``fit_model_ged`` using the ``'NON_SYMBOLIC'`` edit cost function.

    Parameters
    ----------
    seed:
        Seed forwarded to the graph generator.
    n_graphs:
        Number of graphs to generate.

    Returns
    -------
    tuple
        ``(matrix, run_time)``: the matrix returned by ``fit_model_ged`` and
        the run time reported in its stats.
    """
    print(f'\n{INFO_TAG}Fitting model with graphs with coordinates in string format...')

    from gklearn.experiments.ged.ged_model.graph_generator import GraphGenerator
    generator = GraphGenerator(
        num_graphs=n_graphs,
        max_num_nodes=20,
        min_num_nodes=10,
        max_num_edges=50,
        min_num_edges=20,
        node_feat_type='float',
        edge_feat_type=None,
        with_discrete_n_features=False,
        with_discrete_e_features=False,
        with_continuous_n_features=True,
        with_continuous_e_features=False,
        continuous_n_feature_key='coords',
        continuous_n_feature_dim=2,
        continuous_e_feature_dim=0,
        seed=seed
    )
    graphs = generator.generate_graphs()
    # Replace the 'coords' attribute by string-valued 'x' / 'y' attributes.
    convert_graphs_coords_from_attr_to_string(graphs)
    print(
        f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in string format.'
    )
    show_some_graphs(graphs)

    # Set GED options:
    ged_options = {
        'method': 'BIPARTITE',
        'edit_cost_fun': 'NON_SYMBOLIC',
        'edit_costs': [3, 3, 1, 3, 3, 1],
        'optim_method': 'init',
        'repeats': 1
    }

    fit_settings = {
        'parallel': None,
        'n_jobs': 1,  # min(12, max(os.cpu_count() - 2, 0)),
        'chunksize': None,  # None == automatic determination
        'copy_graphs': True,
        'reorder_graphs': False,
    }

    # Fit model and compute GED matrix:
    model, matrix, stats = fit_model_ged(
        graphs,
        graphs_Y=None,
        ged_options=ged_options,
        read_resu_from_file=0,
        output_dir=None,
        params_idx=None,
        verbose=2,
        **fit_settings
    )
    print("Model:", model)
    print("Matrix shape:", matrix.shape)
    print("Run time:", stats['run_time'])

    return matrix, stats['run_time']
319+
320+
321+
def compare_gedlib_with_coords_in_string_and_attr_format(
        seed: int = 42, n_graphs: int = 100
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compare the output and the performance of GEDLIB with the same graphs with coordinates on nodes,
    but one is in string format `GXLLabel` and the other is in the complex attribute format `AttrLabel`.

    Both versions are run with the same ``seed`` and ``n_graphs``. The
    function reports whether the two cost matrices are exactly equal, prints
    their top-left 5x5 corners, and prints the total and per-pair run times.

    Parameters
    ----------
    seed:
        Seed forwarded to both runs.
    n_graphs:
        Number of graphs generated for each run.

    Returns
    -------
    tuple
        ``(cost_matrix_s, cost_matrix_a)``: the string-version and the
        attribute-version cost matrices.
    """
    # Both runs are needed: every statement below reads the string-version
    # results as well as the attribute-version ones.
    cost_matrix_s, run_time_s = fit_model_string_version(seed=seed, n_graphs=n_graphs)
    cost_matrix_a, run_time_a = fit_model_attr_version(seed=seed, n_graphs=n_graphs)
    if not np.array_equal(cost_matrix_s, cost_matrix_a):
        print(
            f'{ISSUE_TAG}The cost matrices are not equal! '
            f'String version: {cost_matrix_s.shape}, '
            f'Attribute version: {cost_matrix_a.shape}'
        )
    else:
        print(
            f'{SUCCESS_TAG}The cost matrices are equal! '
            f'String version: {cost_matrix_s.shape}, '
            f'Attribute version: {cost_matrix_a.shape}'
        )

    # Print the first 5 rows and columns of the matrices:
    print('First 5 rows and columns of the string version cost matrix:')
    print(cost_matrix_s[:5, :5])
    print('First 5 rows and columns of the attribute version cost matrix:')
    print(cost_matrix_a[:5, :5])

    # Print the run times:
    print(f'String version run time: {run_time_s:.3f} seconds.')
    print(f'Attribute version run time: {run_time_a:.3f} seconds.')

    # Print the run time per pair:
    n_pairs = cost_matrix_s.shape[0] * (cost_matrix_s.shape[0] - 1) / 2
    # With fewer than two graphs there are no pairs; report NaN instead of
    # raising ZeroDivisionError.
    per_pair_s = run_time_s / n_pairs if n_pairs else float('nan')
    per_pair_a = run_time_a / n_pairs if n_pairs else float('nan')
    print(
        f'String version run time per pair: {per_pair_s:.9f} seconds.'
    )
    print(
        f'Attribute version run time per pair: {per_pair_a:.9f} seconds.'
    )

    return cost_matrix_s, cost_matrix_a
363+
364+
365+
if __name__ == '__main__':
    # Small comparison run of both label formats on a handful of graphs.
    seed = 42
    n_graphs = 10
    compare_gedlib_with_coords_in_string_and_attr_format(seed=seed, n_graphs=n_graphs)

0 commit comments

Comments
 (0)