Skip to content

Commit

Permalink
Merge pull request #392 from ZJUEarthData/dev/HaibinWang
Browse files Browse the repository at this point in the history
refactor: move classification common function name to enum
  • Loading branch information
SanyHe authored Sep 25, 2024
2 parents 6f7b278 + fadc949 commit 81f00e5
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 22 deletions.
50 changes: 30 additions & 20 deletions geochemistrypi/data_mining/model/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,24 +144,26 @@ def _classification_report(y_true: pd.DataFrame, y_predict: pd.DataFrame, algori
mlflow.log_artifact(os.path.join(store_path, f"Classification Report - {algorithm_name}.txt"))

@staticmethod
def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
def _cross_validation(trained_model: object, X_train: pd.DataFrame, y_train: pd.DataFrame, graph_name: str, average: str, cv_num: int, algorithm_name: str, store_path: str) -> None:
"""Perform cross validation on the model."""
print("-----* Cross Validation *-----")
print(f"-----* {graph_name} *-----")
print(f"K-Folds: {cv_num}")
scores = cross_validation(trained_model, X_train, y_train, average=average, cv_num=cv_num)
scores_str = json.dumps(scores, indent=4)
save_text(scores_str, f"Cross Validation - {algorithm_name}", store_path)
save_text(scores_str, f"{graph_name} - {algorithm_name}", store_path)

@staticmethod
def _plot_confusion_matrix(y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
def _plot_confusion_matrix(
y_test: pd.DataFrame, y_test_predict: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the confusion matrix of the model."""
print("-----* Confusion Matrix *-----")
print(f"-----* {graph_name} *-----")
data = plot_confusion_matrix(y_test, y_test_predict, trained_model)
save_fig(f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
index = [f"true_{i}" for i in range(int(y_test.nunique().values))]
columns = [f"pred_{i}" for i in range(int(y_test.nunique().values))]
data = pd.DataFrame(data, columns=columns, index=index)
save_data(data, name_column, f"Confusion Matrix - {algorithm_name}", local_path, mlflow_path, True)
save_data(data, name_column, f"{graph_name} - {algorithm_name}", local_path, mlflow_path, True)

@staticmethod
def _plot_precision_recall(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, graph_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
Expand Down Expand Up @@ -192,29 +194,29 @@ def _plot_precision_recall_threshold(
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print("-----* ROC Curve *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, algorithm_name)
save_fig(f"ROC Curve - {algorithm_name}", local_path, mlflow_path)
def _plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, name_column: str, graph_name: str, trained_model: object, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
print(f"-----* {graph_name} *-----")
y_probs, fpr, tpr, thresholds = plot_ROC(X_test, y_test, trained_model, graph_name, algorithm_name)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
y_probs = pd.DataFrame(y_probs, columns=["Probabilities"])
fpr = pd.DataFrame(fpr, columns=["False Positive Rate"])
tpr = pd.DataFrame(tpr, columns=["True Positive Rate"])
thresholds = pd.DataFrame(thresholds, columns=["Thresholds"])
save_data(y_probs, name_column, "ROC Curve - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, "ROC Curve - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, "ROC Curve - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, "ROC Curve - Thresholds", local_path, mlflow_path)
save_data(y_probs, name_column, f"{graph_name} - Probabilities", local_path, mlflow_path)
save_data(fpr, name_column, f"{graph_name} - False Positive Rate", local_path, mlflow_path)
save_data(tpr, name_column, f"{graph_name} - True Positive Rate", local_path, mlflow_path)
save_data(thresholds, name_column, f"{graph_name} - Thresholds", local_path, mlflow_path)

@staticmethod
def _plot_2d_decision_boundary(
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
X: pd.DataFrame, X_test: pd.DataFrame, name_column1: str, name_column2: str, trained_model: object, graph_name: str, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str
) -> None:
"""Plot the decision boundary of the trained model with the testing data set below."""
print("-----* Two-dimensional Decision Boundary Diagram *-----")
print(f"-----* {graph_name} *-----")
plot_2d_decision_boundary(X, X_test, trained_model, image_config)
save_fig(f"Decision Boundary - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, "Decision Boundary - X", local_path, mlflow_path)
save_data(X_test, name_column2, "Decision Boundary - X Test", local_path, mlflow_path)
save_fig(f"{graph_name} - {algorithm_name}", local_path, mlflow_path)
save_data(X, name_column1, f"{graph_name} - X", local_path, mlflow_path)
save_data(X_test, name_column2, f"{graph_name} - X Test", local_path, mlflow_path)

@staticmethod
def sample_balance(X_train: pd.DataFrame, y_train: pd.DataFrame, name_column: str, local_path: str, mlflow_path: str) -> tuple:
Expand Down Expand Up @@ -286,6 +288,7 @@ def common_components(self) -> None:
trained_model=self.model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -296,6 +299,7 @@ def common_components(self) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -326,6 +330,7 @@ def common_components(self) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -348,6 +353,7 @@ def common_components(self) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -374,6 +380,7 @@ def common_components(self, is_automl: bool) -> None:
trained_model=self.auto_model,
X_train=ClassificationWorkflowBase.X_train,
y_train=ClassificationWorkflowBase.y_train,
graph_name=ClassificationCommonFunction.CROSS_VALIDATION.value,
average=average,
cv_num=10,
algorithm_name=self.naming,
Expand All @@ -384,6 +391,7 @@ def common_components(self, is_automl: bool) -> None:
y_test_predict=ClassificationWorkflowBase.y_test_predict,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.CONFUSION_MATRIX.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down Expand Up @@ -414,6 +422,7 @@ def common_components(self, is_automl: bool) -> None:
y_test=ClassificationWorkflowBase.y_test,
name_column=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
graph_name=ClassificationCommonFunction.ROC_CURVE.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand All @@ -436,6 +445,7 @@ def common_components(self, is_automl: bool) -> None:
name_column2=ClassificationWorkflowBase.name_test,
trained_model=self.auto_model,
image_config=self.image_config,
graph_name=ClassificationCommonFunction.TWO_DIMENSIONAL_DECISION_BOUNDARY_DIAGRAM.value,
algorithm_name=self.naming,
local_path=GEOPI_OUTPUT_ARTIFACTS_IMAGE_MODEL_OUTPUT_PATH,
mlflow_path=MLFLOW_ARTIFACT_IMAGE_MODEL_OUTPUT_PATH,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def plot_precision_recall_threshold(X_test: pd.DataFrame, y_test: pd.DataFrame,
return y_probs, precisions, recalls, thresholds


def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, algorithm_name: str) -> tuple:
def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object, graph_name: str, algorithm_name: str) -> tuple:
"""Plot the ROC curve.
Parameters
Expand Down Expand Up @@ -324,7 +324,7 @@ def plot_ROC(X_test: pd.DataFrame, y_test: pd.DataFrame, trained_model: object,
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")
plt.title(f"ROC Curve - {algorithm_name}")
plt.title(f"{graph_name} - {algorithm_name}")
return y_probs, fpr, tpr, thresholds


Expand Down

0 comments on commit 81f00e5

Please sign in to comment.