feat: add network analysis #352

Open · wants to merge 2 commits into main
304 changes: 185 additions & 119 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions geochemistrypi/data_mining/constants.py
@@ -26,8 +26,8 @@

OPTION = ["Yes", "No"]
DATA_OPTION = ["Own Data", "Testing Data (Built-in)"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection"]
TEST_DATA_OPTION = ["Data For Regression", "Data For Classification", "Data For Clustering", "Data For Dimensional Reduction", "Data For Abnormal Detection", "Data For Network Analysis"]
MODE_OPTION = ["Regression", "Classification", "Clustering", "Dimensional Reduction", "Abnormal Detection", "Network Analysis"]
MODE_OPTION_WITH_MISSING_VALUES = ["Regression", "Classification", "Clustering"]

# The model provided to use
@@ -69,6 +69,7 @@
CLUSTERING_MODELS = ["KMeans", "DBSCAN", "Agglomerative", "AffinityPropagation"]
DECOMPOSITION_MODELS = ["PCA", "T-SNE", "MDS"]
ABNORMALDETECTION_MODELS = ["Isolation Forest"]
NETWORKANALYSIS_MODELS = ["Bron Kerbosch Community Detection", "Louvain Method Community Detection"]

# The model can deal with missing values
# Reference: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
@@ -90,7 +91,7 @@


# Special AutoML models
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression"]
NON_AUTOML_MODELS = ["Linear Regression", "Polynomial Regression", "bron_kerbosch, louvain_method"]
RAY_FLAML = ["Multi-layer Perceptron"]

MISSING_VALUE_STRATEGY = ["Drop Rows with Missing Values ", "Impute Missing Values"]
Collaborator (review comment): Rename the file as "Data_NetworkAnalysis.xlsx"

Binary file not shown.
Empty file.
86 changes: 86 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_common.py
@@ -0,0 +1,86 @@
from itertools import combinations

import numpy as np
import pandas as pd


# Pair each dataframe with every other dataframe in a list of dataframes
def pair_dataframes(dataframes):
    pairs = []
    for pair in combinations(enumerate(dataframes), 2):
        idx1, df1 = pair[0]
        idx2, df2 = pair[1]
        pairs.append((df1, df2, idx1, idx2))
    return pairs


# Convert indices and distances into triplets of mineral IDs and their distance
def convert_to_triplets(indices, distances, mineral_a_ids, mineral_b_ids):
    triplets = []

    for i, (neighbors, distances_row) in enumerate(zip(indices, distances)):
        for neighbor, distance in zip(neighbors, distances_row):
            triplet = (mineral_a_ids[i], mineral_b_ids[neighbor], distance)
            triplets.append(triplet)

    return triplets


# Clean the dataframe of triplets by sorting, removing duplicates, and handling nulls or zeros in distance
def triplets_df_clean(triplets_df):
    triplets_df[["Node1", "Node2"]] = np.sort(triplets_df[["Node1", "Node2"]], axis=1)
    triplets_df = triplets_df.drop_duplicates(subset=["Node1", "Node2"])
    triplets_df["Distance"] = triplets_df["Distance"].apply(lambda x: 0.001 if pd.isnull(x) or x == 0 else x)
    triplets_df = triplets_df.dropna(subset=["Distance"])
    triplets_df = triplets_df.sort_values(by="Node1")
    return triplets_df


# Construct an adjacency matrix from graph data
def construct_adj_matrix(graph_data):
    nodes = np.unique(graph_data[["Node1", "Node2"]].values)
    num_nodes = len(nodes)
    adj_matrix = np.zeros((num_nodes, num_nodes))
    node_index_mapping = {node: idx for idx, node in enumerate(nodes)}
    for index, row in graph_data.iterrows():
        node1, node2, distance = int(row["Node1"]), int(row["Node2"]), row["Distance"]
        adj_matrix[node_index_mapping[node1], node_index_mapping[node2]] = distance
        adj_matrix[node_index_mapping[node2], node_index_mapping[node1]] = distance
    mapping_df = pd.DataFrame(list(node_index_mapping.items()), columns=["Original_Node", "Mapped_Node"])
    return adj_matrix, mapping_df


# Update community IDs based on a mapping and calculate unique and repeated counts
def accurate_statistic_algo(communities, ids, group_ids):
    result_df = communities.copy()
    # Flatten the per-mineral ID and group-ID lists (concatenation also handles unevenly sized groups)
    flat_ids = np.concatenate([np.asarray(x).ravel() for x in ids])
    flat_group_ids = np.concatenate([np.asarray(x).ravel() for x in group_ids])

    # Skip the first row, which repeats the column headers, and replace each sample ID with its group ID
    for i, row in communities.iloc[1:].iterrows():
        for j, val in row.items():
            if val in flat_ids:
                idx = flat_ids.tolist().index(val)
                if idx < len(flat_group_ids):
                    result_df.at[i, j] = flat_group_ids[idx]
    # Count unique and repeated group IDs per community row
    repeated_counts = []
    unique_counts = []
    for index, row in result_df.iterrows():
        row_values = row[1:].dropna()
        seen_values = set()
        repeated_set = set()
        for num in row_values:
            if num in seen_values:
                repeated_set.add(num)
            else:
                seen_values.add(num)

        repeated_count = len(repeated_set)
        unique_count = len(seen_values) - repeated_count

        repeated_counts.append(repeated_count)
        unique_counts.append(unique_count)

    result_df["Repeated_Counts"] = repeated_counts
    result_df["Unique_Counts"] = unique_counts

    return result_df
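
To make the helper flow concrete, here is a minimal sketch on made-up data (the mineral IDs, feature values, and neighbor results below are hypothetical, not from the PR):

import numpy as np
import pandas as pd

df_a = pd.DataFrame({"SiO2": [50.1, 52.3], "MgO": [7.2, 6.8]})  # mineral 0
df_b = pd.DataFrame({"SiO2": [49.5], "MgO": [8.0]})             # mineral 1

pairs = pair_dataframes([df_a, df_b])  # [(df_a, df_b, 0, 1)]

# Pretend a k=1 nearest-neighbor search produced these indices and distances
indices = np.array([[0], [0]])
distances = np.array([[1.2], [0.0]])
triplets = convert_to_triplets(indices, distances, mineral_a_ids=[0, 1], mineral_b_ids=[2])

# Zero distance becomes 0.001 during cleaning; duplicates and NaNs are dropped
triplets_df = triplets_df_clean(pd.DataFrame(triplets, columns=["Node1", "Node2", "Distance"]))
adj_matrix, mapping_df = construct_adj_matrix(triplets_df)  # 3x3 matrix plus node mapping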
73 changes: 73 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_community.py
@@ -0,0 +1,73 @@
import communities.algorithms
import pandas as pd


# Apply the Bron-Kerbosch algorithm to find maximal cliques in an adjacency matrix
def bron_kerbosch_algo(adj_matrix, mapping_df):
    # Use the Bron-Kerbosch algorithm from the communities package, with pivoting enabled
    communities_list = communities.algorithms.bron_kerbosch(adj_matrix, pivot=True)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    mapping_dict = dict(zip(mapping_df["Mapped_Node"], mapping_df["Original_Node"]))

    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # The first data row repeats the header; downstream code (accurate_statistic_algo) skips it
    community_data = [list(column_names)]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]

    # Create a dataframe from the community data
    bk_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_bk_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in bk_df.iterrows():
        mapped_row = []
        for column in bk_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_bk_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_bk_df


# Apply the Louvain method to find communities in an adjacency matrix
def louvain_method_algo(adj_matrix, mapping_df):
    # Use the Louvain method from the communities package
    communities_list, _ = communities.algorithms.louvain_method(adj_matrix)
    # Retrieve the mapping from the dataframe to map back to original node IDs
    mapping_dict = dict(zip(mapping_df["Mapped_Node"], mapping_df["Original_Node"]))
    # Prepare column names for the dataframe based on the largest community size
    column_names = ["Community"] + [f"Node{i + 1}" for i in range(len(max(communities_list, key=len)))]

    # The first data row repeats the header; downstream code (accurate_statistic_algo) skips it
    community_data = [list(column_names)]
    # Append community data with community ID and nodes in each community
    community_data += [[f"Community {idx + 1}"] + list(community) for idx, community in enumerate(communities_list)]
    # Create a dataframe from the community data
    louvain_df = pd.DataFrame(community_data, columns=column_names)
    # Initialize a dataframe for mapped community data
    mapped_louvain_df = pd.DataFrame(columns=column_names)
    # Map the community nodes back to their original IDs
    for index, row in louvain_df.iterrows():
        mapped_row = []

        for column in louvain_df.columns[1:]:
            community_nodes = row[column]

            if not isinstance(community_nodes, list):
                community_nodes = [community_nodes]

            original_nodes = [mapping_dict.get(node, float("nan")) for node in community_nodes]
            mapped_row.append(original_nodes)

        mapped_louvain_df.loc[index] = [row["Community"]] + [item for sublist in mapped_row for item in sublist]
    return mapped_louvain_df
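
A small sketch of calling these two functions directly (assumes the communities package is installed; the adjacency matrix and node IDs are made up for illustration):

import numpy as np
import pandas as pd

# Toy symmetric adjacency matrix for four nodes (hypothetical weights)
adj = np.array([
    [0.0, 1.0, 1.0, 0.0],
    [1.0, 0.0, 1.0, 0.0],
    [1.0, 1.0, 0.0, 1.0],
    [0.0, 0.0, 1.0, 0.0],
])
mapping = pd.DataFrame({"Original_Node": [10, 11, 12, 13], "Mapped_Node": [0, 1, 2, 3]})

cliques_df = bron_kerbosch_algo(adj, mapping)     # maximal cliques, e.g. {10, 11, 12}
partition_df = louvain_method_algo(adj, mapping)  # modularity-based communities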
40 changes: 40 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_distance.py
@@ -0,0 +1,40 @@
import numpy as np
import scipy.spatial.distance


# Calculate the Mahalanobis distance between two single points x and y
def mahalanobis_distance_single(x, y, inv_cov):
    # Compute the difference between the two points
    x_minus_y = x - y
    # Calculate the Mahalanobis distance using the inverse covariance matrix
    return np.sqrt(np.dot(np.dot(x_minus_y, inv_cov), x_minus_y.T))


# Calculate the Mahalanobis distance between two sets of points
def mahalanobis_distance_calculator(mineral_a, mineral_b):
    # Calculate the (pseudo-)inverse covariance matrix of the first set of points
    inv_cov = np.linalg.pinv(np.cov(mineral_a, rowvar=False))
    # Use scipy's cdist function to calculate the distance between each pair of points
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, lambda u, v: mahalanobis_distance_single(u, v, inv_cov))
    return distance


# Calculate the Euclidean distance between two sets of points
def euclidean_distance_calculator(mineral_a, mineral_b):
    # Use scipy's cdist function with the Euclidean metric
    distance = scipy.spatial.distance.cdist(mineral_a, mineral_b, metric="euclidean")
    return distance


# Compute the distance between two sets of points using either the Mahalanobis or Euclidean metric
def compute_distance_between_2(mineral_a, mineral_b, k=1, metric="euclidean"):
    # Note: k is accepted for API symmetry but is not used here
    # Select the distance calculation method based on the specified metric
    if metric == "mahalanobis":
        return mahalanobis_distance_calculator(mineral_a, mineral_b)
    elif metric == "euclidean":
        return euclidean_distance_calculator(mineral_a, mineral_b)
    else:
        # Raise an error if an unsupported metric is specified
        raise ValueError("Unsupported distance metric. Supported metrics are 'mahalanobis' and 'euclidean'.")
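
For reference, a quick sketch of the distance helpers on toy arrays (the values are illustrative only):

import numpy as np

mineral_a = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # 3 samples, 2 features
mineral_b = np.array([[1.5, 2.5], [4.0, 5.0]])              # 2 samples, 2 features

d_eu = compute_distance_between_2(mineral_a, mineral_b, metric="euclidean")    # shape (3, 2)
d_ma = compute_distance_between_2(mineral_a, mineral_b, metric="mahalanobis")  # shape (3, 2)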
13 changes: 13 additions & 0 deletions geochemistrypi/data_mining/model/func/algo_network/_nearest.py
@@ -0,0 +1,13 @@
import numpy as np


# Find the top k nearest neighbors for each point
def top_k_nearest_neighbors(distance, k=5):
    # Sort the distances and keep the indices of the k nearest neighbors
    nearest_neighbors_indices = np.argsort(distance, axis=1)[:, :k]

    # Retrieve the distances of the nearest neighbors using the sorted indices
    nearest_neighbors_distances = np.take_along_axis(distance, nearest_neighbors_indices, axis=1)

    # Return the indices and distances of the nearest neighbors
    return nearest_neighbors_indices, nearest_neighbors_distances
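
A quick sketch of this helper on a hypothetical 3x4 distance matrix (rows = query points, columns = candidates):

import numpy as np

distance = np.array([
    [0.9, 0.1, 0.5, 0.3],
    [0.2, 0.8, 0.4, 0.6],
    [0.7, 0.3, 0.2, 0.9],
])
idx, dist = top_k_nearest_neighbors(distance, k=2)
# idx[0] -> [1, 3], dist[0] -> [0.1, 0.3]: the two closest candidates for row 0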
136 changes: 136 additions & 0 deletions geochemistrypi/data_mining/model/network.py
@@ -0,0 +1,136 @@
import os

import pandas as pd

from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..utils.base import save_data
from ._base import WorkflowBase
from .func.algo_network._common import accurate_statistic_algo, construct_adj_matrix, convert_to_triplets, pair_dataframes, triplets_df_clean
from .func.algo_network._community import bron_kerbosch_algo, louvain_method_algo
from .func.algo_network._distance import euclidean_distance_calculator, mahalanobis_distance_calculator
from .func.algo_network._nearest import top_k_nearest_neighbors


class NetworkAnalysisWorkflowBase(WorkflowBase):
    def __init__(self) -> None:
        # Initialize default values
        self.distance_calculator = "EU"  # Default distance calculator
        self.community_detection_algo = "BK"  # Default community detection algorithm
        self.minerals = []  # List to store dataframes of minerals
        self.ids = []  # List to store ids
        self.labels = []  # List to store labels
        self.k = 1  # Number of nearest neighbors
        self.distances = pd.DataFrame()  # DataFrame to store distances
        self.communities_sample = pd.DataFrame()  # DataFrame to store sample communities
        self.communities_group = pd.DataFrame()  # DataFrame to store grouped communities

    def fit(self) -> None:
        # Merge features and labels
        merged_df = pd.concat([self.X, self.y], axis=1)
        split_dfs = []
        # Group by mineral type and split
        for mineral_type, group in merged_df.groupby("mineral_type"):
            split_dfs.append(group)
        extracted_dfs = []
        # Extract the label (last) column from each group
        for i, df in enumerate(split_dfs):
            last_column = df.iloc[:, -1:]
            extracted_dfs.append(last_column)
        # Remove the label column from the feature dataframes
        for i, df in enumerate(split_dfs):
            split_dfs[i] = df.iloc[:, :-1]
        self.minerals = split_dfs
        self.labels = extracted_dfs

    @classmethod
    def manual_hyper_parameters(cls) -> dict:
        """Manual hyper-parameters specification."""
        return dict()

    def generate_ids(self):
        # Generate unique ids for each mineral
        offset = 0
        for df in self.minerals:
            self.ids.append(list(range(offset, offset + len(df))))
            offset += len(df)

    def compute_distance(self):
        # Compute pairwise distances between minerals
        all_triplets = []
        pair_combinations = pair_dataframes(self.minerals)
        # Select the distance calculator
        if self.distance_calculator == "EU":
            distance_func = euclidean_distance_calculator
        elif self.distance_calculator == "MA":
            distance_func = mahalanobis_distance_calculator
        else:
            raise ValueError("Unsupported distance calculator. Supported options are 'EU' and 'MA'.")
        # Compute distances for each pair
        for pair in pair_combinations:
            mineral1, mineral2, index1, index2 = pair
            a_to_b_indices, a_to_b_distances = top_k_nearest_neighbors(distance_func(mineral1, mineral2), self.k)
            all_triplets += convert_to_triplets(a_to_b_indices, a_to_b_distances, self.ids[index1], self.ids[index2])
        self.distances = triplets_df_clean(pd.DataFrame(all_triplets, columns=["Node1", "Node2", "Distance"]))
        # Save distances to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.distances, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)

    def accuracy_statistic(self):
        # Compute accuracy statistics
        self.communities_group = accurate_statistic_algo(self.communities_sample, self.ids, self.labels)
        # Save accuracy statistics to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_group, f"{self.naming} Result Accuracy Statistic", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)


class bron_kerbosch(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Bron_kerbosch"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "BK"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Bron-Kerbosch algorithm
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = bron_kerbosch_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class louvain_method(NetworkAnalysisWorkflowBase):
    def __init__(self, X, y) -> None:
        super().__init__()
        self.naming = "Louvain_method"  # Name of the algorithm
        self.X = X  # Features
        self.y = y  # Labels
        self.community_detection_algo = "LU"  # Set community detection algorithm

    def community_detection(self):
        # Perform community detection using the Louvain method
        self.fit()
        self.generate_ids()
        self.compute_distance()
        adj_matrix, mapping_df = construct_adj_matrix(self.distances)
        self.communities_sample = louvain_method_algo(adj_matrix, mapping_df)
        # Save communities to file
        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
        save_data(self.communities_sample, f"{self.naming} Result", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
        self.accuracy_statistic()


class BronKerboschWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = bron_kerbosch(X, y)
        instance.community_detection()


class LouvainMethodWorkflow(NetworkAnalysisWorkflowBase):
    def community_detection(self, X, y):
        instance = louvain_method(X, y)
        instance.community_detection()
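
A minimal usage sketch of the workflow classes on toy data. This assumes GEOPI_OUTPUT_ARTIFACTS_DATA_PATH points at a writable directory (compute_distance and community_detection save their results there) and that y carries the "mineral_type" column that fit() groups on; the feature values are hypothetical:

import os
import pandas as pd

os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = "./output"  # hypothetical output dir

# Features plus a "mineral_type" label column
X = pd.DataFrame({"SiO2": [50.1, 52.3, 49.5, 48.8], "MgO": [7.2, 6.8, 8.0, 8.3]})
y = pd.DataFrame({"mineral_type": [0, 0, 1, 1]})

BronKerboschWorkflow().community_detection(X, y)  # or LouvainMethodWorkflow()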