feat: add network analysis #352

Open · wants to merge 2 commits into main
304 changes: 185 additions & 119 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions geochemistrypi/data_mining/constants.py
@@ -26,8 +26,8 @@

OPTION = ["Yes", "No"]
DATA_OPTION = ["Own Data", "Testing Data (Built-in)"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection", "Data For Network Analysis"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection", "Network Analysis"]
MODE_OPTION_WITH_MISSING_VALUES = ["Regression", "Classification", "Clustering"]

# The model provided to use
@@ -69,6 +69,7 @@
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
ABNORMALDETECTION_MODELS = ["Isolation Forest"]
NETWORKANALYSIS_MODELS = ["Bron Kerbosch Community Detection", "Louvain Method Community Detection"]

# The model can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
@@ -90,7 +91,7 @@


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression", "bron_kerbosch, louvain_method"]
RAY_FLAML = ["Multi-layer Perceptron"]

MISSING_VALUE_STRATEGY = ["Drop Rows with Missing Values ", "Impute Missing Values"]
Collaborator (review comment): Rename the file as "Data_NetworkAnalysis.xlsx"

Binary file not shown.
Empty file.
86 changes: 86 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_common.py
@@ -0,0 +1,86 @@
from itertools import combinations

import numpy as np
import pandas as pd


# Pair each dataframe with every other dataframe in a list of dataframes
def pair_dataframes(dataframes):
    pairs = []
    for pair in combinations(enumerate(dataframes), 2):
        idx1, df1 = pair[0]
        idx2, df2 = pair[1]
        pairs.append((df1, df2, idx1, idx2))
    return pairs


# Convert indices and distances into triplets of mineral IDs and their distance
def convert_to_triplets(indices, distances, mineral_a_ids, mineral_b_ids):
    triplets = []

    for i, (neighbors, distances_row) in enumerate(zip(indices, distances)):
        for neighbor, distance in zip(neighbors, distances_row):
            triplet = (mineral_a_ids[i], mineral_b_ids[neighbor], distance)
            triplets.append(triplet)

    return triplets


# Clean the dataframe of triplets by sorting, removing duplicates, and handling nulls or zeros in distance
def triplets_df_clean(triplets_df):
    triplets_df[["Node1", "Node2"]] = np.sort(triplets_df[["Node1", "Node2"]], axis=1)
    triplets_df = triplets_df.drop_duplicates(subset=["Node1", "Node2"])
    triplets_df["Distance"] = triplets_df["Distance"].apply(lambda x: 0.001 if pd.isnull(x) or x == 0 else x)
    triplets_df = triplets_df.dropna(subset=["Distance"])
    triplets_df = triplets_df.sort_values(by="Node1")
    return triplets_df


# Construct an adjacency matrix from graph data
def construct_adj_matrix(graph_data):
    nodes = np.unique(graph_data[["Node1", "Node2"]].values)
    num_nodes = len(nodes)
    adj_matrix = np.zeros((num_nodes, num_nodes))
    node_index_mapping = {node: idx for idx, node in enumerate(nodes)}
    for index, row in graph_data.iterrows():
        node1, node2, distance = int(row["Node1"]), int(row["Node2"]), row["Distance"]
        adj_matrix[node_index_mapping[node1], node_index_mapping[node2]] = distance
        adj_matrix[node_index_mapping[node2], node_index_mapping[node1]] = distance
    mapping_df = pd.DataFrame(list(node_index_mapping.items()), columns=["Original_Node", "Mapped_Node"])
    return adj_matrix, mapping_df


# Update community IDs based on a mapping and calculate unique and repeated counts
def accurate_statistic_algo(communities, ids, group_ids):
    result_df = communities.copy()
    # Flatten the per-mineral ID and group-ID lists (concatenation also handles unevenly sized groups)
    flat_ids = np.concatenate([np.asarray(x).ravel() for x in ids])
    flat_group_ids = np.concatenate([np.asarray(x).ravel() for x in group_ids])

    # Skip the first row, which repeats the column headers, and replace each sample ID with its group ID
    for i, row in communities.iloc[1:].iterrows():
        for j, val in row.items():
            if val in flat_ids:
                idx = flat_ids.tolist().index(val)
                if idx < len(flat_group_ids):
                    result_df.at[i, j] = flat_group_ids[idx]
    # Count unique and repeated group IDs per community row
    repeated_counts = []
    unique_counts = []
    for index, row in result_df.iterrows():
        row_values = row[1:].dropna()
        seen_values = set()
        repeated_set = set()
        for num in row_values:
            if num in seen_values:
                repeated_set.add(num)
            else:
                seen_values.add(num)

        repeated_count = len(repeated_set)
        unique_count = len(seen_values) - repeated_count

        repeated_counts.append(repeated_count)
        unique_counts.append(unique_count)

    result_df["Repeated_Counts"] = repeated_counts
    result_df["Unique_Counts"] = unique_counts

    return result_df
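
To make the helper flow concrete, here is a minimal sketch on made-up data (the mineral IDs, feature values, and neighbor results below are hypothetical, not from the PR):

import numpy as np
import pandas as pd

df_a = pd.DataFrame({"SiO2": [50.1, 52.3], "MgO": [7.2, 6.8]})  # mineral 0
df_b = pd.DataFrame({"SiO2": [49.5], "MgO": [8.0]})             # mineral 1

pairs = pair_dataframes([df_a, df_b])  # [(df_a, df_b, 0, 1)]

# Pretend a k=1 nearest-neighbor search produced these indices and distances
indices = np.array([[0], [0]])
distances = np.array([[1.2], [0.0]])
triplets = convert_to_triplets(indices, distances, mineral_a_ids=[0, 1], mineral_b_ids=[2])

# Zero distance becomes 0.001 during cleaning; duplicates and NaNs are dropped
triplets_df = triplets_df_clean(pd.DataFrame(triplets, columns=["Node1", "Node2", "Distance"]))
adj_matrix, mapping_df = construct_adj_matrix(triplets_df)  # 3x3 matrix plus node mapping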
73 changes: 73 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_community.py
@@ -0,0 +1,73 @@
import communities.algorithms
import pandas as pd


# Apply the Bron-Kerbosch algorithm to find maximal cliques in an adjacency matrix
def bron_kerbosch_algo(adj_matrix, mapping_df):
    # Use the Bron-Kerbosch algorithm from the communities package, with pivoting enabled
    communities_list = communities.algorithms.bron_kerbosch(adj_matrix, pivot=True)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    mapping_dict = dict(zip(mapping_df["Mapped_Node"], mapping_df["Original_Node"]))

    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # The first data row repeats the header; downstream code (accurate_statistic_algo) skips it
    community_data = [list(column_names)]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]

    # Create a dataframe from the community data
    bk_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_bk_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in bk_df.iterrows():
        mapped_row = []
        for column in bk_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_bk_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_bk_df


# Apply the Louvain method to find communities in an adjacency matrix
def louvain_method_algo(adj_matrix, mapping_df):
    # Use the Louvain method from the communities package
    communities_list, _ = communities.algorithms.louvain_method(adj_matrix)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    mapping_dict = dict(zip(mapping_df["Mapped_Node"], mapping_df["Original_Node"]))
    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # The first data row repeats the header; downstream code (accurate_statistic_algo) skips it
    community_data = [list(column_names)]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]
    # Create a dataframe from the community data
    louvain_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_louvain_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in louvain_df.iterrows():
        mapped_row = []

        for column in louvain_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_louvain_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_louvain_df
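
A small sketch of calling these two functions directly (assumes the communities package is installed; the adjacency matrix and node IDs are made up for illustration):

import numpy as np
import pandas as pd

# Toy symmetric adjacency matrix for four nodes (hypothetical weights)
adj = np.array([
    [0.0, 1.0, 1.0, 0.0],
    [1.0, 0.0, 1.0, 0.0],
    [1.0, 1.0, 0.0, 1.0],
    [0.0, 0.0, 1.0, 0.0],
])
mapping = pd.DataFrame({"Original_Node": [10, 11, 12, 13], "Mapped_Node": [0, 1, 2, 3]})

cliques_df = bron_kerbosch_algo(adj, mapping)     # maximal cliques, e.g. {10, 11, 12}
partition_df = louvain_method_algo(adj, mapping)  # modularity-based communities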
40 changes: 40 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_distance.py
@@ -0,0 +1,40 @@
import numpy as np
import scipy.spatial.distance


# Calculate the Mahalanobis distance between two single points x and y
def mahalanobis_distance_single(x, y, inv_cov):
    # Compute the difference between the two points
    x_minus_y = x - y
    # Calculate the Mahalanobis distance using the inverse covariance matrix
    return np.sqrt(np.dot(np.dot(x_minus_y, inv_cov), x_minus_y.T))


# Calculate the Mahalanobis distance between two sets of points
def mahalanobis_distance_calculator(mineral_a, mineral_b):
    # Calculate the (pseudo-)inverse covariance matrix of the first set of points
    inv_cov = np.linalg.pinv(np.cov(mineral_a, rowvar=False))
    # Use scipy's cdist function to calculate the distance between each pair of points
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, lambda u, v: mahalanobis_distance_single(u, v, inv_cov))
    return distance


# Calculate the Euclidean distance between two sets of points
def euclidean_distance_calculator(mineral_a, mineral_b):
    # Use scipy's cdist function with the Euclidean metric
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, metric="euclidean")
    return distance


# Compute the distance between two sets of points using either the Mahalanobis or Euclidean metric
def compute_distance_between_2(mineral_a, mineral_b, k=1, metric="euclidean"):
    # Note: k is accepted for API symmetry but is not used here
    # Select the distance calculation method based on the specified metric
    if metric == "mahalanobis":
        return mahalanobis_distance_calculator(mineral_a, mineral_b)
    elif metric == "euclidean":
        return euclidean_distance_calculator(mineral_a, mineral_b)
    else:
        # Raise an error if an unsupported metric is specified
        raise ValueError("Unsupported distance metric. Supported metrics are 'mahalanobis' and 'euclidean'.")
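
For reference, a quick sketch of the distance helpers on toy arrays (the values are illustrative only):

import numpy as np

mineral_a = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # 3 samples, 2 features
mineral_b = np.array([[1.5, 2.5], [4.0, 5.0]])              # 2 samples, 2 features

d_eu = compute_distance_between_2(mineral_a, mineral_b, metric="euclidean")    # shape (3, 2)
d_ma = compute_distance_between_2(mineral_a, mineral_b, metric="mahalanobis")  # shape (3, 2)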
13 changes: 13 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_nearest.py
@@ -0,0 +1,13 @@
import numpy as np


# Find the top k nearest neighbors for each point
def top_k_nearest_neighbors(distance, k=5):
    # Sort the distances and keep the indices of the k nearest neighbors
    nearest_neighbors_indices = np.argsort(distance, axis=1)[:, :k]

    # Retrieve the distances of the nearest neighbors using the sorted indices
    nearest_neighbors_distances = np.take_along_axis(distance, nearest_neighbors_indices, axis=1)

    # Return the indices and distances of the nearest neighbors
    return nearest_neighbors_indices, nearest_neighbors_distances
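
A quick sketch of this helper on a hypothetical 3x4 distance matrix (rows = query points, columns = candidates):

import numpy as np

distance = np.array([
    [0.9, 0.1, 0.5, 0.3],
    [0.2, 0.8, 0.4, 0.6],
    [0.7, 0.3, 0.2, 0.9],
])
idx, dist = top_k_nearest_neighbors(distance, k=2)
# idx[0] -> [1, 3], dist[0] -> [0.1, 0.3]: the two closest candidates for row 0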
136 changes: 136 additions & 0 deletions geochemistrypi/data_mining/model/network.py
@@ -0,0 +1,136 @@
import os

import pandas as pd

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..utils.base import save_data
from ._base import WorkflowBase
from .func.algo_network._common import accurate_statistic_algo, construct_adj_matrix, convert_to_triplets, pair_dataframes, triplets_df_clean
from .func.algo_network._community import bron_kerbosch_algo, louvain_method_algo
from .func.algo_network._distance import euclidean_distance_calculator, mahalanobis_distance_calculator
from .func.algo_network._nearest import top_k_nearest_neighbors


class NetworkAnalysisWorkflowBase(WorkflowBase):
    def __init__(self) -> None:
        # Initialize default values
        self.distance_calculator = "EU"  # Default distance calculator
        self.community_detection_algo = "BK"  # Default community detection algorithm
        self.minerals = []  # List to store dataframes of minerals
        self.ids = []  # List to store ids
        self.labels = []  # List to store labels
        self.k = 1  # Number of nearest neighbors
        self.distances = pd.DataFrame()  # DataFrame to store distances
        self.communities_sample = pd.DataFrame()  # DataFrame to store sample communities
        self.communities_group = pd.DataFrame()  # DataFrame to store grouped communities

    def fit(self) -> None:
        # Merge features and labels
        merged_df = pd.concat([self.X, self.y], axis=1)
        split_dfs = []
        # Group by mineral type and split
        for mineral_type, group in merged_df.groupby("mineral_type"):
            split_dfs.append(group)
        extracted_dfs = []
        # Extract the label (last) column from each group
        for i, df in enumerate(split_dfs):
            last_column = df.iloc[:, -1:]
            extracted_dfs.append(last_column)
        # Remove the label column from the feature dataframes
        for i, df in enumerate(split_dfs):
            split_dfs[i] = df.iloc[:, :-1]
        self.minerals = split_dfs
        self.labels = extracted_dfs

    @classmethod
    def manual_hyper_parameters(cls) -> dict:
        """Manual hyper-parameters specification."""
        return dict()

    def generate_ids(self):
        # Generate unique ids for each mineral
        offset = 0
        for df in self.minerals:
            self.ids.append(list(range(offset, offset + len(df))))
            offset += len(df)

    def compute_distance(self):
        # Compute pairwise distances between minerals
        all_triplets = []
        pair_combinations = pair_dataframes(self.minerals)
        # Select the distance calculator
        if self.distance_calculator == "EU":
            distance_func = euclidean_distance_calculator
        elif self.distance_calculator == "MA":
            distance_func = mahalanobis_distance_calculator
        else:
            raise ValueError("Unsupported distance calculator. Supported options are 'EU' and 'MA'.")
        # Compute distances for each pair
        for pair in pair_combinations:
            mineral1, mineral2, index1, index2 = pair
            a_to_b_indices, a_to_b_distances = top_k_nearest_neighbors(distance_func(mineral1, mineral2), self.k)
            all_triplets += convert_to_triplets(a_to_b_indices, a_to_b_distances, self.ids[index1], self.ids[index2])
        self.distances = triplets_df_clean(pd.DataFrame(all_triplets, columns=["Node1", "Node2", "Distance"]))
        # Save distances to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.distances, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)

    def accuracy_statistic(self):
        # Compute accuracy statistics
        self.communities_group = accurate_statistic_algo(self.communities_sample, self.ids, self.labels)
        # Save accuracy statistics to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_group, f"{self.naming} Result Accuracy Statistic", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)


class bron_kerbosch(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Bron_kerbosch"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "BK"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Bron-Kerbosch algorithm
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = bron_kerbosch_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class louvain_method(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Louvain_method"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "LU"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Louvain method
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = louvain_method_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class BronKerboschWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = bron_kerbosch(X, y)
        instance.community_detection()


class LouvainMethodWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = louvain_method(X, y)
        instance.community_detection()
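
A minimal usage sketch of the workflow classes on toy data. This assumes GEOPI_OUTPUT_ARTIFACTS_DATA_PATH points at a writable directory (compute_distance and community_detection save their results there) and that y carries the "mineral_type" column that fit() groups on; the feature values are hypothetical:

import os
import pandas as pd

os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = "./output"  # hypothetical output dir

# Features plus a "mineral_type" label column
X = pd.DataFrame({"SiO2": [50.1, 52.3, 49.5, 48.8], "MgO": [7.2, 6.8, 8.0, 8.3]})
y = pd.DataFrame({"mineral_type": [0, 0, 1, 1]})

BronKerboschWorkflow().community_detection(X, y)  # or LouvainMethodWorkflow()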