From bda2e640c8c0db458e0b2c6babd6a860980aec12 Mon Sep 17 00:00:00 2001 From: ThomasBouche Date: Tue, 23 Aug 2022 09:31:41 +0200 Subject: [PATCH] :bug: fix dataset names --- eurybia/core/smartdrift.py | 43 +++++++++++++++--------- eurybia/core/smartplotter.py | 4 +-- tests/unit_tests/core/test_smartdrift.py | 14 ++++++++ 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/eurybia/core/smartdrift.py b/eurybia/core/smartdrift.py index 8e9e691..897eddf 100644 --- a/eurybia/core/smartdrift.py +++ b/eurybia/core/smartdrift.py @@ -8,7 +8,6 @@ import pickle import shutil import tempfile -import warnings from pathlib import Path from typing import Dict, Text @@ -290,7 +289,7 @@ def compile( loss_function=hyperparameter["loss_function"], eval_metric=hyperparameter["eval_metric"], task_type="CPU", - allow_writing_files=False + allow_writing_files=False, ) datadrift_classifier = datadrift_classifier.fit(train_pool_cat, eval_set=test_pool_cat, silent=True) @@ -318,8 +317,12 @@ def compile( self.pb_cols, self.err_mods = pb_cols, err_mods if self.deployed_model is not None: self.js_divergence = compute_js_divergence( - self.df_predict.loc[lambda df: df["dataset"] == "Baseline dataset", :]["Score"].values, - self.df_predict.loc[lambda df: df["dataset"] == "Current dataset", :]["Score"].values, + self.df_predict.loc[lambda df: df["dataset"] == self.dataset_names["df_baseline"].values[0], :][ + "Score" + ].values, + self.df_predict.loc[lambda df: df["dataset"] == self.dataset_names["df_current"].values[0], :][ + "Score" + ].values, n_bins=20, ) @@ -405,22 +408,27 @@ def _analyze_consistency(self, full_validation=False, ignore_cols: list = list() new_cols = [c for c in self.df_baseline.columns if c not in self.df_current.columns] removed_cols = [c for c in self.df_current.columns if c not in self.df_baseline.columns] if len(new_cols) > 0: - print(f"""The following variables are no longer available in the - current dataset and will not be analyzed: \n {new_cols}""") + 
print( + f"""The following variables are no longer available in the + current dataset and will not be analyzed: \n {new_cols}""" + ) if len(removed_cols) > 0: - print(f"""The following variables are only available in the - current dataset and will not be analyzed: \n {removed_cols}""") + print( + f"""The following variables are only available in the + current dataset and will not be analyzed: \n {removed_cols}""" + ) common_cols = [c for c in self.df_current.columns if c in self.df_baseline.columns] # dtypes err_dtypes = [ c for c in common_cols if self.df_baseline.dtypes.map(str)[c] != self.df_current.dtypes.map(str)[c] ] if len(err_dtypes) > 0: - print(f"""The following variables have mismatching dtypes - and will not be analyzed: \n {err_dtypes}""") + print( + f"""The following variables have mismatching dtypes + and will not be analyzed: \n {err_dtypes}""" + ) # Feature values err_mods: Dict[Text, Dict] = {} - variables_mm_mods = [] if full_validation is True: invalid_cols = ignore_cols + new_cols + removed_cols + err_dtypes for column in self.df_baseline.columns: @@ -433,8 +441,10 @@ def _analyze_consistency(self, full_validation=False, ignore_cols: list = list() err_mods[column] = {} err_mods[column]["New distinct values"] = new_mods err_mods[column]["Removed distinct values"] = removed_mods - print(f"""The variable {column} has mismatching unique values: -{new_mods} | {removed_mods}\n""") + print( + f"""The variable {column} has mismatching unique values: +{new_mods} | {removed_mods}\n""" + ) return ({"New columns": new_cols, "Removed columns": removed_cols, "Type errors": err_dtypes}, err_mods) def _predict(self, deployed_model=None, encoding=None): @@ -716,13 +726,14 @@ def _compute_datadrift_stat_test(self, max_size=50000, categ_max=20): test = ksmirnov_test(current[features].to_numpy(), baseline[features].to_numpy()) except BaseException as e: raise Exception( - """ + """ There is a problem with the format of {} column between the two datasets. 
Error: """.format( - str(features) + str(features) + ) + + str(e) ) - + str(e)) test_results[features] = test return pd.DataFrame.from_dict(test_results, orient="index") diff --git a/eurybia/core/smartplotter.py b/eurybia/core/smartplotter.py index 9505d6a..aba87d7 100644 --- a/eurybia/core/smartplotter.py +++ b/eurybia/core/smartplotter.py @@ -557,7 +557,7 @@ def generate_modeldrift_data( template: Optional[str] = None, title: Optional[str] = None, xaxis_title: Optional[str] = None, - yaxis_title: Optional[str] = None, + yaxis_title: Optional[dict] = None, xaxis: Optional[str] = None, height: Optional[str] = None, width: Optional[str] = None, @@ -578,7 +578,7 @@ def generate_modeldrift_data( Plot title xaxis_title: str, optional X axis title - yaxis_title: str, optional + yaxis_title: dict, optional y axis title xaxis: str, optional X axis options (spike line, margin, range ...) diff --git a/tests/unit_tests/core/test_smartdrift.py b/tests/unit_tests/core/test_smartdrift.py index 27eb6e6..e0fed75 100644 --- a/tests/unit_tests/core/test_smartdrift.py +++ b/tests/unit_tests/core/test_smartdrift.py @@ -102,6 +102,20 @@ def test_compile_model_encoder(self): smart_drift.compile() assert isinstance(smart_drift.xpl, shapash.explainer.smart_explainer.SmartExplainer) + def test_compile_dataset_names(self): + """ + test compile() with custom dataset names specified + """ + smart_drift = SmartDrift( + self.titanic_df_1, + self.titanic_df_2, + deployed_model=self.rf, + encoding=self.categ_encoding, + dataset_names={"df_current": "titanic 2", "df_baseline": "titanic 1"}, + ) + smart_drift.compile() + assert isinstance(smart_drift.xpl, shapash.explainer.smart_explainer.SmartExplainer) + def test_generate_report_fullvalid(self): """ test generate_report() with fullvalidation option specified to True