jajupmochi
diff --git a/‎gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py‎
Lines changed: 119 additions & 19 deletions b/‎gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py‎
Lines changed: 119 additions & 19 deletions
diff --git a/‎gklearn/experiments/ged/ged_model/fit_ged_model.py‎
Lines changed: 1 addition & 1 deletion b/‎gklearn/experiments/ged/ged_model/fit_ged_model.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎gklearn/experiments/ged/ged_model/parallel_version.py‎ renamed to ‎gklearn/experiments/ged/ged_model/ged_model_parallel.py‎
Lines changed: 65 additions & 3 deletions b/‎gklearn/experiments/ged/ged_model/parallel_version.py‎ renamed to ‎gklearn/experiments/ged/ged_model/ged_model_parallel.py‎
Lines changed: 65 additions & 3 deletions
@@ -67,17 +67,19 @@ def fit_model_ged(
 					list(all_graphs[idx_edge[0]].edges)[0]].keys()
 			)
 
-	from gklearn.experiments.ged.ged_model.parallel_version import GEDModel
+	from gklearn.experiments.ged.ged_model.ged_model_parallel import GEDModel
 
 	if parallel is False:
 		parallel = None
 	elif parallel is True:
 		parallel = 'imap_unordered'
 
 	model = GEDModel(
+		# env_type=ged_options['env_type'],
 		ed_method=ged_options['method'],
 		edit_cost_fun=ged_options['edit_cost_fun'],
 		init_edit_cost_constants=ged_options['edit_costs'],
+		edit_cost_config=ged_options.get('edit_cost_config', {}),
 		optim_method=ged_options['optim_method'],
 		node_labels=nl_names, edge_labels=el_names,
 		parallel=parallel,
@@ -157,7 +159,7 @@ def show_some_graphs(graphs):
 	Show some graphs from the list of graphs.
 	"""
 	print(f'{INFO_TAG}Showing some graphs:')
-	for i, g in enumerate(graphs[:5]):
+	for i, g in enumerate(graphs[:3]):
 		print(f'Graph {i}:')
 		print('Number of nodes:', g.number_of_nodes())
 		print('Number of edges:', g.number_of_edges())
@@ -177,12 +179,15 @@ def convert_graphs_coords_from_attr_to_string(graphs: List[nx.Graph]):
 				coords = node[1]['coords']
 				node[1]['x'] = str(coords[0])
 				node[1]['y'] = str(coords[1])
+				for idx in range(2, len(coords)):
+					# If there are more than 2 coordinates, store them with extra keys:
+					node[1][f'coord_{idx}'] = str(coords[idx])
 				del node[1]['coords']
 	print(f'{INFO_TAG}Converted coordinates from attribute format to string format.')
 
 
 def fit_model_attr_version(
-		seed: int = 42, n_graphs: int = 100
+		seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False
 ) -> (np.array, float):
 	"""
 	Fit the GED model with graphs that have coordinates on nodes in attribute format `AttrLabel`.
@@ -205,28 +210,43 @@ def fit_model_attr_version(
 		with_continuous_n_features=True,
 		with_continuous_e_features=False,
 		continuous_n_feature_key='coords',
-		continuous_n_feature_dim=2,
+		continuous_n_feature_dim=n_emb_dim,
 		continuous_e_feature_dim=0,
 		seed=seed
 	)
 	graphs = generator.generate_graphs()
+	# Check graph node label format:
+	one_n_labels = graphs[0].nodes[list(graphs[0].nodes)[0]]
+	assert 'coords' in one_n_labels and isinstance(one_n_labels['coords'], np.ndarray) and (
+		len(one_n_labels['coords']) > 0 and one_n_labels['coords'].dtype in [
+			np.float64, np.float32]
+	), (
+		'The node labels should contain "coords" key with a numpy array as value.'
+	)
 	print(
 		f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in string format.'
 	)
 	show_some_graphs(graphs)
 
 	# Set GED options:
 	ged_options = {
+		'env_type': 'attr',  # Use the attribute-based environment
 		'method': 'BIPARTITE',
 		'edit_cost_fun': 'GEOMETRIC',
 		'edit_costs': [3, 3, 1, 3, 3, 1],
+		'edit_cost_config': {
+			'node_coord_metric': 'euclidean',
+			'node_embed_metric': 'cosine_distance',
+			'edge_weight_metric': 'euclidean',
+			'edge_embed_metric': 'cosine_distance',
+		},
 		'optim_method': 'init',
-		'repeats': 1
+		'repeats': 1,
 	}
 
 	fit_settings = {
-		'parallel': None,
-		'n_jobs': 1,  # min(12, max(os.cpu_count() - 2, 0)),
+		'parallel': parallel,  # Use parallel processing if specified
+		'n_jobs': 10,  # min(12, max(os.cpu_count() - 2, 0)),
 		'chunksize': None,  # None == automatic determination
 		'copy_graphs': True,
 		'reorder_graphs': False,
@@ -251,7 +271,7 @@ def fit_model_attr_version(
 
 
 def fit_model_string_version(
-		seed: int = 42, n_graphs: int = 100
+		seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False
 ) -> (np.array, float):
 	"""
 	Fit the GED model with graphs that have coordinates on nodes in string format `GXLLabel`.
@@ -272,19 +292,26 @@ def fit_model_string_version(
 		with_continuous_n_features=True,
 		with_continuous_e_features=False,
 		continuous_n_feature_key='coords',
-		continuous_n_feature_dim=2,
+		continuous_n_feature_dim=n_emb_dim,
 		continuous_e_feature_dim=0,
 		seed=seed
 	)
 	graphs = generator.generate_graphs()
 	convert_graphs_coords_from_attr_to_string(graphs)
+	# Check graph node label format:
+	one_n_labels = graphs[0].nodes[list(graphs[0].nodes)[0]]
+	assert 'x' in one_n_labels and 'y' in one_n_labels and isinstance(
+		one_n_labels['x'], str) and isinstance(one_n_labels['y'], str), (
+		'The node labels should contain "x" and "y" keys with string values.'
+	)
 	print(
 		f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in string format.'
 	)
 	show_some_graphs(graphs)
 
 	# Set GED options:
 	ged_options = {
+		'env_type': 'gxl',  # Use the GXLLabel environment
 		'method': 'BIPARTITE',
 		'edit_cost_fun': 'NON_SYMBOLIC',
 		'edit_costs': [3, 3, 1, 3, 3, 1],
@@ -293,8 +320,8 @@ def fit_model_string_version(
 	}
 
 	fit_settings = {
-		'parallel': None,
-		'n_jobs': 1,  # min(12, max(os.cpu_count() - 2, 0)),
+		'parallel': parallel,  # Use parallel processing if specified
+		'n_jobs': 10,  # min(12, max(os.cpu_count() - 2, 0)),
 		'chunksize': None,  # None == automatic determination
 		'copy_graphs': True,
 		'reorder_graphs': False,
@@ -319,25 +346,31 @@ def fit_model_string_version(
 
 
 def compare_gedlib_with_coords_in_string_and_attr_format(
-		seed: int = 42, n_graphs: int = 100
+		seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False
 ) -> (np.array, np.array):
 	"""
 	Compare the output and the performance of GEDLIB with the same graphs with coordinates on nodes,
 	but one is in string format `GXLLabel` and the other is in the complex attribute format `AttrLabel`.
 	"""
-	# cost_matrix_s, run_time_s = fit_model_string_version(seed=seed, n_graphs=n_graphs)
-	cost_matrix_a, run_time_a = fit_model_attr_version(seed=seed, n_graphs=n_graphs)
-	if not np.array_equal(cost_matrix_s, cost_matrix_a):
+	cost_matrix_a, run_time_a = fit_model_attr_version(
+		seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel
+	)
+	cost_matrix_s, run_time_s = fit_model_string_version(
+		seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel
+	)
+	if not np.allclose(cost_matrix_s, cost_matrix_a, rtol=1e-9):
 		print(
 			f'{ISSUE_TAG}The cost matrices are not equal! '
 			f'String version: {cost_matrix_s.shape}, '
-			f'Attribute version: {cost_matrix_a.shape}'
+			f'Attribute version: {cost_matrix_a.shape}, '
+			f'Relevant tolerance: 1e-9.'
 		)
 	else:
 		print(
 			f'{SUCCESS_TAG}The cost matrices are equal! '
 			f'String version: {cost_matrix_s.shape}, '
-			f'Attribute version: {cost_matrix_a.shape}'
+			f'Attribute version: {cost_matrix_a.shape}, '
+			f'Relevant tolerance: 1e-9.'
 		)
 
 	# Print the first 5 rows and columns of the matrices:
@@ -366,5 +399,72 @@ def compare_gedlib_with_coords_in_string_and_attr_format(
 	# Test the class
 	# feat_type = 'str'
 	seed = 42
-	n_graphs = 10
-	compare_gedlib_with_coords_in_string_and_attr_format(seed=seed, n_graphs=n_graphs)
+	n_graphs = 500
+	n_emb_dim = 100
+	parellel = True
+	compare_gedlib_with_coords_in_string_and_attr_format(
+		seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parellel
+	)
+
+	# # Comparison of the two versions:
+	#
+	# General Settings:
+	# - n_graphs: 500
+	# - node numbers: 10-20
+	# - edge numbers: 20-50
+	# - Regenerate GEDEnv for each pair of computation (not optimized).
+	# - Coordinates as labels of strings in GXLLabel or one label of np.array in AttrLabel,
+	#   where the latter is optimized by the Eigen C++ library for vectorized operations.
+	#
+	# ## Without parallelization:
+	#
+	# ### n_emb_dim = 2:
+	# - String version run time: 7.4e-4 s per pair (92.3 s total).
+	# - Attribute version run time: 5.0e-4 s per pair (62.4 s total).
+	# The Attr version is ~ 1.5x faster than the String version.
+	#
+	# ### n_emb_dim = 20:
+	# - String version run time: 5.4e-3 s per pair (675.1 s total).
+	# - Attribute version run time: 5.5e-4 s per pair (69.0 s total).
+	# The Attr version is ~ 10x faster than the String version.
+	#
+	# ### n_emb_dim = 100:
+	# - String version run time: too long to compute (over 1 h ~ 3698.5 s).
+	# - Attribute version run time: 8.0e-4 s per pair (99.9 s total).
+	# The Attr version is ~ 37x faster than the String version.
+	#
+	# ### Conclusion:
+	# - The Attribute version is faster than the String version.
+	# - With the increase of the dimensionality of the coordinates (n_emb_dim):
+	#   -- Attribute version takes almost the same level of time to compute pairwise
+	#      distances (e.g., ~ 1.6x slower when n_emb_dim = 100 than 2).
+	#   -- String version becomes unusable (~ 40x slower when n_emb_dim = 100 than 2),
+	#      and ~ 37x slower than the Attribute version with n_emb_dim = 100.
+	#
+	# ## With parallelization (n_jobs=10):
+	#
+	# ### n_emb_dim = 2:
+	# - String version run time: 3.6e-4 s per pair (45.3 s total).
+	# - Attribute version run time: 3.6e-4 s per pair (45.3 s total).
+	# The two versions are almost equal in terms of run time.
+	#
+	# ### n_emb_dim = 20:
+	# - String version run time: 9.8e-4 s per pair (122.4 s total).
+	# - Attribute version run time: 4.1e-4 s per pair (50.7 s total).
+	# The Attribute version is ~ 2.4x faster than the String version.
+	#
+	# ### n_emb_dim = 100:
+	# - String version run time: 5.3e-3 s per pair (664.3 s total).
+	# - Attribute version run time: 4.4e-4 s per pair (54.3 s total).
+	# The Attribute version is ~ 12.2x faster than the String version.
+	#
+	# ### Conclusion:
+	# - The Attribute version is still faster than the String version.
+	# - The parallelization helps to reduce the run time of both versions,
+	#   but the improvement on the String version is much more significant,
+	#   e.g., ~ 5.6x faster (3698.5 s -> 664.3 s) than the non-parallelized version with n_emb_dim = 100.
+	# - On the other hand, the improvement brought by parallelization is not so significant
+	#   for the Attribute version, e.g., ~ 1.8x faster than the non-parallelized version
+	#   with n_emb_dim = 100.
+	#   -- I assume the reason is that the construction of the GEDEnvAttr and the
+	#      Python-C++ interface conversion becomes the bottleneck of the process.
@@ -67,7 +67,7 @@ def fit_model_ged(
 					list(all_graphs[idx_edge[0]].edges)[0]].keys()
 			)
 
-	from gklearn.experiments.ged.ged_model.parallel_version import GEDModel
+	from gklearn.experiments.ged.ged_model.ged_model_parallel import GEDModel
 
 	if parallel is False:
 		parallel = None
 
@@ -52,9 +52,11 @@ class GEDModel(BaseEstimator):  # , ABC):
 
 	def __init__(
 			self,
+			env_type: str | None = None,
 			ed_method='BIPARTITE',
 			edit_cost_fun='CONSTANT',
 			init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
+			edit_cost_config: dict = {},
 			optim_method='init',
 			optim_options={'y_distance': euclid_d, 'mode': 'reg'},
 			node_labels=[],
@@ -66,12 +68,33 @@ def __init__(
 			copy_graphs=True,  # make sure it is a full deep copy. and faster!
 			verbose=2
 	):
-		"""`__init__` for `GEDModel` object."""
+		"""`__init__` for `GEDModel` object.
+
+		Parameters
+		----------
+		env_type : str, optional
+			The type of the GED environment. Default is None. If None, try to determine
+			the type automatically based on the given graph node / edge labels.
+
+			Available types are:
+
+			- 'attr': Attribute-based environment (with complex node and edge labels).
+			Each node or edge can have multiple key-value label pairs, and each value can
+			be of the following types: int, float, str, list/np.ndarray of int or float.
+			This is the default type if no node or edge labels are provided.
+
+			- 'gxl' or 'str': GXLLabel environment (with string labels). Each node or
+			edge can have multiple key-value label pairs, but all values must be strings.
+			The type will be set to GXL only if at least one node or edge label is
+			provided.
+		"""
 		# @todo: the default settings of the parameters are different from those in the self.compute method.
 		#		self._graphs = None
+		self.env_type = env_type
 		self.ed_method = ed_method
 		self.edit_cost_fun = edit_cost_fun
 		self.init_edit_cost_constants = init_edit_cost_constants
+		self.edit_cost_config = edit_cost_config
 		self.optim_method = optim_method
 		self.optim_options = optim_options
 		self.node_labels = node_labels
@@ -1079,12 +1102,15 @@ def _wrapper_compute_ged(self, itr):
 
 	def compute_ged(self, Gi, Gj, **kwargs):
 		"""
-		Compute GED between two graph according to edit_cost.
+		Compute GED between two graphs according to edit_cost.
 		"""
+		env_type = self.get_env_type(graph=Gi)
 		ged_options = {
+			'env_type': env_type,
 			'edit_cost': self.edit_cost_fun,
 			'method': self.ed_method,
-			'edit_cost_constants': self._edit_cost_constants
+			'edit_cost_constants': self._edit_cost_constants,
+			'edit_cost_config': self.edit_cost_config,
 		}
 		repeats = kwargs.get('repeats', 1)
 		dis, pi_forward, pi_backward = pairwise_ged(
@@ -1103,6 +1129,42 @@ def compute_ged(self, Gi, Gj, **kwargs):
 		return dis, None
 
 
+	def get_env_type(self, graph: nx.Graph | None = None):
+		"""
+		Check the environment type of the graph.
+		If `env_type` is set on initialization, return it.
+		Otherwise, check the given graph's node and edge labels to determine the type.
+
+		Only one node and one edge are checked to determine the type.
+		This function expects that all nodes have the same type of labels, as do all
+		edges.
+		"""
+		if self.env_type is not None:
+			return self.env_type
+		if graph is None:
+			raise ValueError(
+				'Graph is not provided while `env_type` not set on initialization. '
+				'Cannot determine environment type.'
+			)
+		# Use 'gxl' env type only if all node and edge labels are strings, and at least one
+		# node or edge label is present:
+		one_n_labels = graph.nodes[list(graph.nodes)[0]]
+		for k, v in one_n_labels.items():
+			if not isinstance(v, str):
+				return 'attr'
+		if nx.number_of_edges(graph) != 0:
+			one_e_labels = graph.edges[list(graph.edges)[0]]
+			for k, v in one_e_labels.items():
+				if not isinstance(v, str):
+					return 'attr'
+		if len(one_n_labels) > 0 or (
+				nx.number_of_edges(graph) != 0 and len(one_e_labels) > 0
+		):
+			return 'gxl'
+		return 'attr'
+
+
+
 	# 	def _compute_kernel_list(self, g1, g_list):
 	# 		start_time = time.time()
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ def fit_model_ged(`
`67`	`67`	`list(all_graphs[idx_edge[0]].edges)[0]].keys()`
`68`	`68`	`)`
`69`	`69`
`70`		`- from gklearn.experiments.ged.ged_model.parallel_version import GEDModel`
	`70`	`+ from gklearn.experiments.ged.ged_model.ged_model_parallel import GEDModel`
`71`	`71`
`72`	`72`	`if parallel is False:`
`73`	`73`	`parallel = None`