From 2ff709c0f3d40c15e43afcd84e94edae0827b4da Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 13:18:48 -0400 Subject: [PATCH 01/34] Create PSI Detector.py PSI Drift Detector --- PSI Detector.py | 124 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 PSI Detector.py diff --git a/PSI Detector.py b/PSI Detector.py new file mode 100644 index 00000000..fd6ebac9 --- /dev/null +++ b/PSI Detector.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[57]: + + +from detector import * +class PSI_Detector(BatchDetector): + + input_type = "batch" + + def __init__( + self, + + ): + + # This initializes batch detector's parent class + super().__init__() + + # Initialize any parameters to be used in this algorithm + + + def set_reference(self, X, y_true=None, y_pred=None): + + # leave this, it uses the parent class to validate input + # and sets the self.reference variable to refer to the reference dataset + + X, _, _ = super()._validate_input(X, None, None) + X = pd.DataFrame( + X, columns=self._input_cols + ) + + # Initialize reference dataset + self.reference = copy.deepcopy(X) + + self.reset() + + + def reset(self): + + + super().reset() + + + def update(self, X, by_feature=True, X_by_feature=None, y_true=None, y_pred=None): + + # this function will update the detector with the test batch + eps = 1e-4 + # create a variable to store psi values for each feature + feature_psi = [] + o = pd.DataFrame( + columns=["Column", 'Moderate population change', 'Significant population change',"PSI"]) + z = 0 + # 1. iterate through each feature in reference and test data, identify minimum and maximum value + for column in self.reference.columns: + min_val = min(min(self.reference[column]), min(X[column])) + max_val = max(max(self.reference[column]), max(X[column])) + + # 2. use _bin_data function to bucketize reference, append to reference buckets array + bins = self._bin_data(self.reference[column],min_val,max_val) + bins_initial = pd.cut(self.reference[column], bins = bins, labels = range(1,len(bins))) + df_initial = pd.DataFrame({'initial': self.reference[column], 'bin': bins_initial}) + grp_initial = df_initial.groupby('bin').count() + grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial']) + # 3. use _bin_data function to bucketize test, append to reference test array + bins_new = pd.cut(X[column], bins = bins, labels = range(1,len(bins))) + df_new = pd.DataFrame({'new': X[column], 'bin': bins_new}) + grp_new = df_new.groupby('bin').count() + grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new']) + # 4. Call PSI function to calculate PSI on test and reference bucket representation, + psi_value = self._PSI(grp_initial,grp_new) + feature_psi.append([column,psi_value]) + # store PSI for each feature in feature_psi array + if psi_value >= 0.1 and psi_value <= 0.2: + o.loc[z] = [column,'Yes','No',psi_value] + z += 1 + elif psi_value > 0.2: + o.loc[z] = [column,'No','Yes',psi_value] + z += 1 + # 5. Aggregate PSI values to determine if dataset is drifting + if o.any()['Column'] == True: + self.drift_state == 'drift' + return feature_psi, o + else: + return 'no drift detected',feature_psi + # If PSI indicates drift, set self.drift_state == 'drift' + + # Create a dictionary to store if each individual feature is drifting + + # Update self.reference dataset to refer to test data, X + self.reference = X + + + + def _bin_data(self, feature, min, max): + eps = 1e-4 + if len(feature.unique()) < 10: + bins = [min + (max - min)*(i)/len(feature.unique()) for i in range(len(feature.unique())+1)] + bins[0] = min - eps # Correct the lower boundary + bins[-1] = max + eps # Correct the higher boundary + return bins + else: + bins = [min + (max - min)*(i)/10 for i in range(10+1)] + bins[0] = min - eps # Correct the lower boundary + bins[-1] = max + eps # Correct the higher boundary + return bins + # return an array containing the sample counts within each bucket + + + def _PSI(self, reference_feature, test_feature): + eps = 1e-4 + # Compare the bins to calculate PSI + psi_df = reference_feature.join(test_feature, on = "bin", how = "inner") + + # Add a small value for when the percent is zero + psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: eps if x == 0 else x) + psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: eps if x == 0 else x) + + # Calculate the psi + psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new']) + + # Return the mean of psi values + return np.mean(psi_df['psi']) + From 2c4a9e38673f8f6e3a2406582dd5498d30fcfed4 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:06:48 -0400 Subject: [PATCH 02/34] Move file to a different location --- PSI Detector.py => menelaus/data_drift/PSI Detector.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename PSI Detector.py => menelaus/data_drift/PSI Detector.py (100%) diff --git a/PSI Detector.py b/menelaus/data_drift/PSI Detector.py similarity index 100% rename from PSI Detector.py rename to menelaus/data_drift/PSI Detector.py From 360f552419a32ec43de92e0f22f9a5a8c8e24634 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:19:52 -0400 Subject: [PATCH 03/34] Update PSI Detector.py --- menelaus/data_drift/PSI Detector.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/menelaus/data_drift/PSI Detector.py b/menelaus/data_drift/PSI Detector.py index fd6ebac9..e8842e1c 100644 --- a/menelaus/data_drift/PSI Detector.py +++ b/menelaus/data_drift/PSI Detector.py @@ -1,10 +1,6 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[57]: +from detector import * -from detector import * class PSI_Detector(BatchDetector): input_type = "batch" From fd3cac56c6539fd7b5a74b5311942e32601d2b1e Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:22:21 -0400 Subject: [PATCH 04/34] Update PSI Detector.py --- menelaus/data_drift/PSI Detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/menelaus/data_drift/PSI Detector.py b/menelaus/data_drift/PSI Detector.py index e8842e1c..27fac056 100644 --- a/menelaus/data_drift/PSI Detector.py +++ b/menelaus/data_drift/PSI Detector.py @@ -1,4 +1,4 @@ -from detector import * +from menelaus.detector import BatchDetector class PSI_Detector(BatchDetector): From b806132f0f99903c7e2022c04a381caf520e712c Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:27:55 -0400 Subject: [PATCH 05/34] Update PSI Detector.py --- menelaus/data_drift/PSI Detector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/menelaus/data_drift/PSI Detector.py b/menelaus/data_drift/PSI Detector.py index 27fac056..7d638c8b 100644 --- a/menelaus/data_drift/PSI Detector.py +++ b/menelaus/data_drift/PSI Detector.py @@ -1,5 +1,6 @@ from menelaus.detector import BatchDetector - +import pandas as pd +import numpy as np class PSI_Detector(BatchDetector): From 6e18b840e39294b64467a6d16c7246b6ecb384a3 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 15:46:04 -0400 Subject: [PATCH 06/34] Update PSI Detector.py --- menelaus/data_drift/PSI Detector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/menelaus/data_drift/PSI Detector.py b/menelaus/data_drift/PSI Detector.py index 7d638c8b..b1a9d67a 100644 --- a/menelaus/data_drift/PSI Detector.py +++ b/menelaus/data_drift/PSI Detector.py @@ -1,6 +1,8 @@ from menelaus.detector import BatchDetector import pandas as pd import numpy as np +import copy + class PSI_Detector(BatchDetector): From 5c7de0ccf4c07cb1bb51922fcbe2ff1e8733e04b Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:06:29 -0400 Subject: [PATCH 07/34] rename --- menelaus/data_drift/{PSI Detector.py => psi_detector.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename menelaus/data_drift/{PSI Detector.py => psi_detector.py} (100%) diff --git a/menelaus/data_drift/PSI Detector.py b/menelaus/data_drift/psi_detector.py similarity index 100% rename from menelaus/data_drift/PSI Detector.py rename to menelaus/data_drift/psi_detector.py From 5c0039f671e9192d0dc006c58fdda3bfc2ea9601 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:16:50 -0400 Subject: [PATCH 08/34] Reformatted code with black --- menelaus/data_drift/psi_detector.py | 135 ++++++++++++++-------------- 1 file changed, 70 insertions(+), 65 deletions(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index b1a9d67a..a0fd1ec3 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -5,119 +5,124 @@ class PSI_Detector(BatchDetector): - input_type = "batch" def __init__( self, - ): - # This initializes batch detector's parent class - super().__init__() - - # Initialize any parameters to be used in this algorithm + super().__init__() + # Initialize any parameters to be used in this algorithm def set_reference(self, X, y_true=None, y_pred=None): - # leave this, it uses the parent class to validate input - # and sets the self.reference variable to refer to the reference dataset - + # and sets the self.reference variable to refer to the reference dataset + X, _, _ = super()._validate_input(X, None, None) - X = pd.DataFrame( - X, columns=self._input_cols - ) + X = pd.DataFrame(X, columns=self._input_cols) # Initialize reference dataset self.reference = copy.deepcopy(X) - self.reset() - - - def reset(self): - + self.reset() + def reset(self): super().reset() - def update(self, X, by_feature=True, X_by_feature=None, y_true=None, y_pred=None): - - # this function will update the detector with the test batch + # this function will update the detector with the test batch eps = 1e-4 - # create a variable to store psi values for each feature + # create a variable to store psi values for each feature feature_psi = [] o = pd.DataFrame( - columns=["Column", 'Moderate population change', 'Significant population change',"PSI"]) + columns=[ + "Column", + "Moderate population change", + "Significant population change", + "PSI", + ] + ) z = 0 - # 1. iterate through each feature in reference and test data, identify minimum and maximum value + # 1. iterate through each feature in reference and test data, identify minimum and maximum value for column in self.reference.columns: min_val = min(min(self.reference[column]), min(X[column])) max_val = max(max(self.reference[column]), max(X[column])) - - # 2. use _bin_data function to bucketize reference, append to reference buckets array - bins = self._bin_data(self.reference[column],min_val,max_val) - bins_initial = pd.cut(self.reference[column], bins = bins, labels = range(1,len(bins))) - df_initial = pd.DataFrame({'initial': self.reference[column], 'bin': bins_initial}) - grp_initial = df_initial.groupby('bin').count() - grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial']) - # 3. use _bin_data function to bucketize test, append to reference test array - bins_new = pd.cut(X[column], bins = bins, labels = range(1,len(bins))) - df_new = pd.DataFrame({'new': X[column], 'bin': bins_new}) - grp_new = df_new.groupby('bin').count() - grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new']) + + # 2. use _bin_data function to bucketize reference, append to reference buckets array + bins = self._bin_data(self.reference[column], min_val, max_val) + bins_initial = pd.cut( + self.reference[column], bins=bins, labels=range(1, len(bins)) + ) + df_initial = pd.DataFrame( + {"initial": self.reference[column], "bin": bins_initial} + ) + grp_initial = df_initial.groupby("bin").count() + grp_initial["percent_initial"] = grp_initial["initial"] / sum( + grp_initial["initial"] + ) + # 3. use _bin_data function to bucketize test, append to reference test array + bins_new = pd.cut(X[column], bins=bins, labels=range(1, len(bins))) + df_new = pd.DataFrame({"new": X[column], "bin": bins_new}) + grp_new = df_new.groupby("bin").count() + grp_new["percent_new"] = grp_new["new"] / sum(grp_new["new"]) # 4. Call PSI function to calculate PSI on test and reference bucket representation, - psi_value = self._PSI(grp_initial,grp_new) - feature_psi.append([column,psi_value]) - # store PSI for each feature in feature_psi array + psi_value = self._PSI(grp_initial, grp_new) + feature_psi.append([column, psi_value]) + # store PSI for each feature in feature_psi array if psi_value >= 0.1 and psi_value <= 0.2: - o.loc[z] = [column,'Yes','No',psi_value] + o.loc[z] = [column, "Yes", "No", psi_value] z += 1 elif psi_value > 0.2: - o.loc[z] = [column,'No','Yes',psi_value] + o.loc[z] = [column, "No", "Yes", psi_value] z += 1 # 5. Aggregate PSI values to determine if dataset is drifting - if o.any()['Column'] == True: - self.drift_state == 'drift' + if o.any()["Column"] == True: + self.drift_state == "drift" return feature_psi, o else: - return 'no drift detected',feature_psi + return "no drift detected", feature_psi # If PSI indicates drift, set self.drift_state == 'drift' - - # Create a dictionary to store if each individual feature is drifting - # Update self.reference dataset to refer to test data, X - self.reference = X + # Create a dictionary to store if each individual feature is drifting - + # Update self.reference dataset to refer to test data, X + self.reference = X def _bin_data(self, feature, min, max): eps = 1e-4 if len(feature.unique()) < 10: - bins = [min + (max - min)*(i)/len(feature.unique()) for i in range(len(feature.unique())+1)] - bins[0] = min - eps # Correct the lower boundary - bins[-1] = max + eps # Correct the higher boundary + bins = [ + min + (max - min) * (i) / len(feature.unique()) + for i in range(len(feature.unique()) + 1) + ] + bins[0] = min - eps # Correct the lower boundary + bins[-1] = max + eps # Correct the higher boundary return bins else: - bins = [min + (max - min)*(i)/10 for i in range(10+1)] - bins[0] = min - eps # Correct the lower boundary - bins[-1] = max + eps # Correct the higher boundary + bins = [min + (max - min) * (i) / 10 for i in range(10 + 1)] + bins[0] = min - eps # Correct the lower boundary + bins[-1] = max + eps # Correct the higher boundary return bins - # return an array containing the sample counts within each bucket - + # return an array containing the sample counts within each bucket def _PSI(self, reference_feature, test_feature): eps = 1e-4 # Compare the bins to calculate PSI - psi_df = reference_feature.join(test_feature, on = "bin", how = "inner") - + psi_df = reference_feature.join(test_feature, on="bin", how="inner") + # Add a small value for when the percent is zero - psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: eps if x == 0 else x) - psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: eps if x == 0 else x) - + psi_df["percent_initial"] = psi_df["percent_initial"].apply( + lambda x: eps if x == 0 else x + ) + psi_df["percent_new"] = psi_df["percent_new"].apply( + lambda x: eps if x == 0 else x + ) + # Calculate the psi - psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new']) - - # Return the mean of psi values - return np.mean(psi_df['psi']) + psi_df["psi"] = (psi_df["percent_initial"] - psi_df["percent_new"]) * np.log( + psi_df["percent_initial"] / psi_df["percent_new"] + ) + # Return the mean of psi values + return np.mean(psi_df["psi"]) From 2b808ab1cb7e6f24339fe046246c09cf333a9888 Mon Sep 17 00:00:00 2001 From: Thomas Schill <33845624+tms-bananaquit@users.noreply.github.com> Date: Fri, 17 Nov 2023 13:01:10 -0500 Subject: [PATCH 09/34] add PSI to data_drift.init, add skeleton for unit tests --- menelaus/data_drift/__init__.py | 7 ++++--- tests/menelaus/data_drift/test_psi_detector.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/menelaus/data_drift/test_psi_detector.py diff --git a/menelaus/data_drift/__init__.py b/menelaus/data_drift/__init__.py index c14608c8..67f121ad 100644 --- a/menelaus/data_drift/__init__.py +++ b/menelaus/data_drift/__init__.py @@ -13,9 +13,10 @@ are applied and when the results are verified. Data drift detection is also applicable in unsupervised learning settings. """ +from menelaus.data_drift.cdbd import CDBD from menelaus.data_drift.hdddm import HDDDM +from menelaus.data_drift.histogram_density_method import HistogramDensityMethod from menelaus.data_drift.kdq_tree import KdqTreeStreaming, KdqTreeBatch -from menelaus.data_drift.pca_cd import PCACD from menelaus.data_drift.nndvi import NNDVI -from menelaus.data_drift.cdbd import CDBD -from menelaus.data_drift.histogram_density_method import HistogramDensityMethod +from menelaus.data_drift.pca_cd import PCACD +from menelaus.data_drift.psi_detector import PSI diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py new file mode 100644 index 00000000..efe8c3b3 --- /dev/null +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -0,0 +1,16 @@ +import pytest +import numpy as np +from menelaus.data_drift.psi_detector import PSI + +def test_validation(...): + ... + +def test_set_reference(...): + ... + +def test_no_drift(...): + ... + + +def test_drift(...): + ... \ No newline at end of file From 954157413479e8a37f72824b47f0ccc832eaef1e Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:36:24 -0500 Subject: [PATCH 10/34] Update refs.bib Adding PSI --- docs/source/refs.bib | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/refs.bib b/docs/source/refs.bib index 897cb234..2c28700d 100644 --- a/docs/source/refs.bib +++ b/docs/source/refs.bib @@ -183,4 +183,20 @@ @misc{souza2020 year={2020}, howpublished="\url{https://arxiv.org/abs/2005.00113}", note={Online; accessed 20-July-2022}, -} \ No newline at end of file +} + +@misc{Psi2022, + title={Is your ML model stable? Checking model stability and population drift with PSI and CSI}, + author={Vinícius Trevisan}, + year={2022}, + howpublished="\url{https://towardsdatascience.com/checking-model-stability-and-population-shift-with-psi-and-csi-6d12af008783}", + note={Online; accessed 20-June-2023}, +} + +@misc{Psi2022, + title={Population Stability Index (PSI)}, + author={Selva Prabhakaran}, + year={2022}, + howpublished="\url{https://www.machinelearningplus.com/deployment/population-stability-index-psi/}", + note={Online; accessed 20-June-2023}, +} From 395d84003d5c0ee2d466b348ca5ec438189b68ba Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:28:24 -0500 Subject: [PATCH 11/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 221 +++++++++++++++------------- 1 file changed, 118 insertions(+), 103 deletions(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index a0fd1ec3..9e84e6ce 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -1,128 +1,143 @@ from menelaus.detector import BatchDetector import pandas as pd import numpy as np -import copy - - class PSI_Detector(BatchDetector): + """ + Parent class for PSI-based drift detector, it serves as a fundamental framework for batch data applications. + + The PSI (Population Stability Index) is employed for detecting distributional shifts between a reference population + and a comparison population. This detector assesses changes by calculating the PSI, which measures the distributional + change based on percentiles. The psi function in the detector compares the distributions of scores in reference and + test populations and calculates the PSI values for different bins. + + In summary, the PSI drift detector provides a robust mechanism for monitoring and detecting distributional changes in + populations, making it adaptable for various data settings and applications. + + Ref. :cite:t:`Psi2022` + """ input_type = "batch" def __init__( self, + eps=1e-4, + threshold=0.1 ): - # This initializes batch detector's parent class - super().__init__() + """ + Args: + eps:The eps parameter in the function represents a small constant (1e-4) introduced to prevent division by zero + when calculating percentages, ensuring numerical stability. + threshold: It represents the threshold for detecting drift, if the calculated PSI value for a feature exceeds + this threshold, it indicates drift in that feature, and the drift_state is set to 'drift'. This threshold is a + user-defined value, and when crossed, it signifies a significant distributional change between the reference + and test populations. + """ + super().__init__() + self.eps = eps + self.threshold = threshold - # Initialize any parameters to be used in this algorithm def set_reference(self, X, y_true=None, y_pred=None): - # leave this, it uses the parent class to validate input - # and sets the self.reference variable to refer to the reference dataset - + """ + Set the detector's reference batch to an updated value; typically + used in ``update``. + + Attributes: + X (numpy.array): updated reference batch + y_true (numpy.array): true labels, not used in NNDVI + y_pred (numpy.array): predicted labels, not used in NNDVI + """ X, _, _ = super()._validate_input(X, None, None) - X = pd.DataFrame(X, columns=self._input_cols) - - # Initialize reference dataset - self.reference = copy.deepcopy(X) - - self.reset() - + self.reference = X.reshape(len(X),) + + def reset(self): + """ + Initialize relevant attributes to original values, to ensure information + only stored from samples_since_reset onwards. Intended for use + after ``drift_state == 'drift'``. + """ super().reset() - def update(self, X, by_feature=True, X_by_feature=None, y_true=None, y_pred=None): - # this function will update the detector with the test batch - eps = 1e-4 - # create a variable to store psi values for each feature - feature_psi = [] - o = pd.DataFrame( - columns=[ - "Column", - "Moderate population change", - "Significant population change", - "PSI", - ] - ) - z = 0 - # 1. iterate through each feature in reference and test data, identify minimum and maximum value - for column in self.reference.columns: - min_val = min(min(self.reference[column]), min(X[column])) - max_val = max(max(self.reference[column]), max(X[column])) - - # 2. use _bin_data function to bucketize reference, append to reference buckets array - bins = self._bin_data(self.reference[column], min_val, max_val) - bins_initial = pd.cut( - self.reference[column], bins=bins, labels=range(1, len(bins)) - ) - df_initial = pd.DataFrame( - {"initial": self.reference[column], "bin": bins_initial} - ) - grp_initial = df_initial.groupby("bin").count() - grp_initial["percent_initial"] = grp_initial["initial"] / sum( - grp_initial["initial"] - ) - # 3. use _bin_data function to bucketize test, append to reference test array - bins_new = pd.cut(X[column], bins=bins, labels=range(1, len(bins))) - df_new = pd.DataFrame({"new": X[column], "bin": bins_new}) - grp_new = df_new.groupby("bin").count() - grp_new["percent_new"] = grp_new["new"] / sum(grp_new["new"]) - # 4. Call PSI function to calculate PSI on test and reference bucket representation, - psi_value = self._PSI(grp_initial, grp_new) - feature_psi.append([column, psi_value]) - # store PSI for each feature in feature_psi array - if psi_value >= 0.1 and psi_value <= 0.2: - o.loc[z] = [column, "Yes", "No", psi_value] - z += 1 - elif psi_value > 0.2: - o.loc[z] = [column, "No", "Yes", psi_value] - z += 1 - # 5. Aggregate PSI values to determine if dataset is drifting - if o.any()["Column"] == True: - self.drift_state == "drift" - return feature_psi, o - else: - return "no drift detected", feature_psi - # If PSI indicates drift, set self.drift_state == 'drift' + + def update(self, X: np.array, y_true=None, y_pred=None): + """ + Update the detector with a new test batch. If drift is detected, new + reference batch becomes most recent test batch. - # Create a dictionary to store if each individual feature is drifting + Args: + X (numpy.array): next batch of data to detect drift on. + y_true (numpy.array): true labels, not used in PSI + y_pred (numpy.array): predicted labels, not used in PSI + """ + if self._drift_state == "drift": + self.reset() - # Update self.reference dataset to refer to test data, X - self.reference = X + X, _, _ = super()._validate_input(X, None, None) + + super().update(X=X, y_true=None, y_pred=None) + test_batch = (np.array(X)).reshape(len(X),) + min_val = min(min(self.reference), min(test_batch)) + max_val = max(max(self.reference), max(test_batch)) + bins = self._bin_data(self.reference,min_val,max_val) + bins_initial = pd.cut(self.reference, bins = bins, labels = range(1,len(bins))) + df_initial = pd.DataFrame({'initial': self.reference, 'bin': bins_initial}) + grp_initial = df_initial.groupby('bin').count() + grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial']) + bins_new = pd.cut(test_batch, bins = bins, labels = range(1,len(bins))) + df_new = pd.DataFrame({'new': test_batch, 'bin': bins_new}) + grp_new = df_new.groupby('bin').count() + grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new']) + psi_value = self._PSI(grp_initial,grp_new) + self.PSI_value = psi_value + if psi_value >= self.threshold: + self._drift_state = "drift" + self.set_reference(test_batch) def _bin_data(self, feature, min, max): - eps = 1e-4 - if len(feature.unique()) < 10: - bins = [ - min + (max - min) * (i) / len(feature.unique()) - for i in range(len(feature.unique()) + 1) - ] - bins[0] = min - eps # Correct the lower boundary - bins[-1] = max + eps # Correct the higher boundary + """ + Bin the given feature based on the specified minimum and maximum values. + + Args: + feature (numpy.array): The feature to be binned. + min (float): The minimum value for binning. + max (float): The maximum value for binning. + + Returns: + list: A list of bin edges for the given feature. + """ + if len(np.unique(feature)) < 10: + bins = [min + (max - min)*(i)/len(np.unique(feature)) for i in range(len(np.unique(feature))+1)] + bins[0] = min - self.eps + bins[-1] = max + self.eps return bins else: - bins = [min + (max - min) * (i) / 10 for i in range(10 + 1)] - bins[0] = min - eps # Correct the lower boundary - bins[-1] = max + eps # Correct the higher boundary + bins = [min + (max - min)*(i)/10 for i in range(10+1)] + bins[0] = min - self.eps + bins[-1] = max + self.eps return bins - # return an array containing the sample counts within each bucket + def _PSI(self, reference_feature, test_feature): - eps = 1e-4 - # Compare the bins to calculate PSI - psi_df = reference_feature.join(test_feature, on="bin", how="inner") - - # Add a small value for when the percent is zero - psi_df["percent_initial"] = psi_df["percent_initial"].apply( - lambda x: eps if x == 0 else x - ) - psi_df["percent_new"] = psi_df["percent_new"].apply( - lambda x: eps if x == 0 else x - ) - - # Calculate the psi - psi_df["psi"] = (psi_df["percent_initial"] - psi_df["percent_new"]) * np.log( - psi_df["percent_initial"] / psi_df["percent_new"] - ) - - # Return the mean of psi values - return np.mean(psi_df["psi"]) + """ + Calculate the Population Stability Index (PSI) between reference and test features. + + Args: + reference_feature (pandas.DataFrame): Reference feature distribution. + test_feature (pandas.DataFrame): Test feature distribution. + + Returns: + float: The calculated PSI value indicating distributional change. + """ + psi_df = reference_feature.join(test_feature, on = "bin", how = "inner") + psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: self.eps if x == 0 else x) + psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: self.eps if x == 0 else x) + psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new']) + return np.mean(psi_df['psi']) + def PSI_value(self): + """ + Get the last calculated PSI value. + + Returns: + float: The PSI value from the most recent update. + """ + return self.PSI_value From 6a131c9fc2a8ce8e181f863038bc80d463de70ed Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:41:59 -0500 Subject: [PATCH 12/34] formating code with black --- menelaus/data_drift/psi_detector.py | 91 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 9e84e6ce..3abc9d76 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -1,41 +1,39 @@ from menelaus.detector import BatchDetector import pandas as pd import numpy as np + + class PSI_Detector(BatchDetector): """ Parent class for PSI-based drift detector, it serves as a fundamental framework for batch data applications. - + The PSI (Population Stability Index) is employed for detecting distributional shifts between a reference population and a comparison population. This detector assesses changes by calculating the PSI, which measures the distributional - change based on percentiles. The psi function in the detector compares the distributions of scores in reference and + change based on percentiles. The psi function in the detector compares the distributions of scores in reference and test populations and calculates the PSI values for different bins. - - In summary, the PSI drift detector provides a robust mechanism for monitoring and detecting distributional changes in + + In summary, the PSI drift detector provides a robust mechanism for monitoring and detecting distributional changes in populations, making it adaptable for various data settings and applications. - + Ref. :cite:t:`Psi2022` """ + input_type = "batch" - def __init__( - self, - eps=1e-4, - threshold=0.1 - ): + def __init__(self, eps=1e-4, threshold=0.1): """ Args: - eps:The eps parameter in the function represents a small constant (1e-4) introduced to prevent division by zero + eps:The eps parameter in the function represents a small constant (1e-4) introduced to prevent division by zero when calculating percentages, ensuring numerical stability. - threshold: It represents the threshold for detecting drift, if the calculated PSI value for a feature exceeds - this threshold, it indicates drift in that feature, and the drift_state is set to 'drift'. This threshold is a - user-defined value, and when crossed, it signifies a significant distributional change between the reference + threshold: It represents the threshold for detecting drift, if the calculated PSI value for a feature exceeds + this threshold, it indicates drift in that feature, and the drift_state is set to 'drift'. This threshold is a + user-defined value, and when crossed, it signifies a significant distributional change between the reference and test populations. """ - super().__init__() + super().__init__() self.eps = eps self.threshold = threshold - def set_reference(self, X, y_true=None, y_pred=None): """ Set the detector's reference batch to an updated value; typically @@ -47,9 +45,10 @@ def set_reference(self, X, y_true=None, y_pred=None): y_pred (numpy.array): predicted labels, not used in NNDVI """ X, _, _ = super()._validate_input(X, None, None) - self.reference = X.reshape(len(X),) - - + self.reference = X.reshape( + len(X), + ) + def reset(self): """ Initialize relevant attributes to original values, to ensure information @@ -58,7 +57,6 @@ def reset(self): """ super().reset() - def update(self, X: np.array, y_true=None, y_pred=None): """ Update the detector with a new test batch. If drift is detected, new @@ -68,26 +66,30 @@ def update(self, X: np.array, y_true=None, y_pred=None): X (numpy.array): next batch of data to detect drift on. y_true (numpy.array): true labels, not used in PSI y_pred (numpy.array): predicted labels, not used in PSI - """ + """ if self._drift_state == "drift": self.reset() X, _, _ = super()._validate_input(X, None, None) super().update(X=X, y_true=None, y_pred=None) - test_batch = (np.array(X)).reshape(len(X),) + test_batch = (np.array(X)).reshape( + len(X), + ) min_val = min(min(self.reference), min(test_batch)) max_val = max(max(self.reference), max(test_batch)) - bins = self._bin_data(self.reference,min_val,max_val) - bins_initial = pd.cut(self.reference, bins = bins, labels = range(1,len(bins))) - df_initial = pd.DataFrame({'initial': self.reference, 'bin': bins_initial}) - grp_initial = df_initial.groupby('bin').count() - grp_initial['percent_initial'] = grp_initial['initial'] / sum(grp_initial['initial']) - bins_new = pd.cut(test_batch, bins = bins, labels = range(1,len(bins))) - df_new = pd.DataFrame({'new': test_batch, 'bin': bins_new}) - grp_new = df_new.groupby('bin').count() - grp_new['percent_new'] = grp_new['new'] / sum(grp_new['new']) - psi_value = self._PSI(grp_initial,grp_new) + bins = self._bin_data(self.reference, min_val, max_val) + bins_initial = pd.cut(self.reference, bins=bins, labels=range(1, len(bins))) + df_initial = pd.DataFrame({"initial": self.reference, "bin": bins_initial}) + grp_initial = df_initial.groupby("bin").count() + grp_initial["percent_initial"] = grp_initial["initial"] / sum( + grp_initial["initial"] + ) + bins_new = pd.cut(test_batch, bins=bins, labels=range(1, len(bins))) + df_new = pd.DataFrame({"new": test_batch, "bin": bins_new}) + grp_new = df_new.groupby("bin").count() + grp_new["percent_new"] = grp_new["new"] / sum(grp_new["new"]) + psi_value = self._PSI(grp_initial, grp_new) self.PSI_value = psi_value if psi_value >= self.threshold: self._drift_state = "drift" @@ -106,17 +108,19 @@ def _bin_data(self, feature, min, max): list: A list of bin edges for the given feature. """ if len(np.unique(feature)) < 10: - bins = [min + (max - min)*(i)/len(np.unique(feature)) for i in range(len(np.unique(feature))+1)] + bins = [ + min + (max - min) * (i) / len(np.unique(feature)) + for i in range(len(np.unique(feature)) + 1) + ] bins[0] = min - self.eps bins[-1] = max + self.eps return bins else: - bins = [min + (max - min)*(i)/10 for i in range(10+1)] + bins = [min + (max - min) * (i) / 10 for i in range(10 + 1)] bins[0] = min - self.eps bins[-1] = max + self.eps return bins - def _PSI(self, reference_feature, test_feature): """ Calculate the Population Stability Index (PSI) between reference and test features. @@ -128,11 +132,18 @@ def _PSI(self, reference_feature, test_feature): Returns: float: The calculated PSI value indicating distributional change. """ - psi_df = reference_feature.join(test_feature, on = "bin", how = "inner") - psi_df['percent_initial'] = psi_df['percent_initial'].apply(lambda x: self.eps if x == 0 else x) - psi_df['percent_new'] = psi_df['percent_new'].apply(lambda x: self.eps if x == 0 else x) - psi_df['psi'] = (psi_df['percent_initial'] - psi_df['percent_new']) * np.log(psi_df['percent_initial'] / psi_df['percent_new']) - return np.mean(psi_df['psi']) + psi_df = reference_feature.join(test_feature, on="bin", how="inner") + psi_df["percent_initial"] = psi_df["percent_initial"].apply( + lambda x: self.eps if x == 0 else x + ) + psi_df["percent_new"] = psi_df["percent_new"].apply( + lambda x: self.eps if x == 0 else x + ) + psi_df["psi"] = (psi_df["percent_initial"] - psi_df["percent_new"]) * np.log( + psi_df["percent_initial"] / psi_df["percent_new"] + ) + return np.mean(psi_df["psi"]) + def PSI_value(self): """ Get the last calculated PSI value. From 9fed735cfbd69a33909b06e98a25741d42d82e3c Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:32:11 -0500 Subject: [PATCH 13/34] Update test_psi_detector.py --- .../menelaus/data_drift/test_psi_detector.py | 74 ++++++++++++++++--- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index efe8c3b3..4f224203 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -1,16 +1,72 @@ import pytest import numpy as np -from menelaus.data_drift.psi_detector import PSI +from menelaus.data_drift.psi_detector import PSI_Detector -def test_validation(...): - ... +def test_psi_init(): + """Test correct default initialization for PSI""" + det = PSI_Detector() + assert det.eps == 1e-4 + assert det.threshold == 0.1 + assert det.batches_since_reset == 0 + assert det.drift_state is None + +def test_psi_set_reference(): + """Assert PSI.set_reference works as intended""" + det = PSI_Detector() + ref = np.random.randint(0, 5, (3, 3)) + det.set_reference(ref) + assert np.array_equal(ref, det.reference_batch) + +def test_psi_update_1(): + """Ensure PSI can update with small random batches""" + det = PSI_Detector() + det.set_reference(np.random.randint(0, 5, (10, 10))) + det.update(X=np.random.randint(0, 5, (10, 10))) -def test_set_reference(...): - ... +def test_psi_update_2(): + """Ensure PSI can update with drift actions triggered""" + det = PSI_Detector() + # XXX - AS added this method of forcing drift in psi, which + # is otherwise hard to induce drift in, for small data + # examples. More stable alternatives may exist + np.random.seed(123) + det.set_reference(np.random.randint(0, 5, (10, 10))) + det.update(X=np.random.randint(10, 40, (10, 10))) + assert det.drift_state is not None -def test_no_drift(...): - ... +def test_psi_update_3(): + """Check PSI.update behavior after drift alarm""" + det = PSI_Detector() + det.set_reference(np.random.randint(0, 5, (5, 5))) + det._drift_state = "drift" + det.update(X=np.random.randint(0, 5, (5, 5))) + assert det.drift_state is None +def test_psi_update_4(): + """Check failure when batch shapes don't match""" + det = PSI_Detector() + det.set_reference(np.random.randint(0, 5, (5, 6))) + with pytest.raises(ValueError): + det.update(np.random.randint(0, 5, (5, 5))) -def test_drift(...): - ... \ No newline at end of file +def test_psi_reset(): + """Check psi.reset works as intended""" + det = PSI_Detector() + det.batches_since_reset = 1 + det.drift_state = "drift" + det.reset() + assert det.batches_since_reset == 0 + assert det.drift_state is None + + +def test_psi_compute_PSI(): + """Check psi._compute_threshold works correctly""" + det = PSI_Detector() + # XXX - Hardcoded known example added by AS, in the future a + # dynamic way to test this function may be used + np.random.seed(123) + threshold = det._PSI( + v_ref=np.random.randint(0, 2, 5), + v_test=np.random.randint(0, 2, 5), + ) + assert threshold >= 0 and threshold <= 1 From 36f96e2e848fab79cc494683e44e244f9f239bc3 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:36:10 -0500 Subject: [PATCH 14/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 3abc9d76..3960343d 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -3,7 +3,7 @@ import numpy as np -class PSI_Detector(BatchDetector): +class PSI(BatchDetector): """ Parent class for PSI-based drift detector, it serves as a fundamental framework for batch data applications. From 3db9642d24d5a17328a276f05ed5b233ac855314 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:37:12 -0500 Subject: [PATCH 15/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 4f224203..7ac8fdc5 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -1,10 +1,10 @@ import pytest import numpy as np -from menelaus.data_drift.psi_detector import PSI_Detector +from menelaus.data_drift.psi_detector import PSI def test_psi_init(): """Test correct default initialization for PSI""" - det = PSI_Detector() + det = PSI() assert det.eps == 1e-4 assert det.threshold == 0.1 assert det.batches_since_reset == 0 @@ -12,20 +12,20 @@ def test_psi_init(): def test_psi_set_reference(): """Assert PSI.set_reference works as intended""" - det = PSI_Detector() + det = PSI() ref = np.random.randint(0, 5, (3, 3)) det.set_reference(ref) assert np.array_equal(ref, det.reference_batch) def test_psi_update_1(): """Ensure PSI can update with small random batches""" - det = PSI_Detector() + det = PSI() det.set_reference(np.random.randint(0, 5, (10, 10))) det.update(X=np.random.randint(0, 5, (10, 10))) def test_psi_update_2(): """Ensure PSI can update with drift actions triggered""" - det = PSI_Detector() + det = PSI() # XXX - AS added this method of forcing drift in psi, which # is otherwise hard to induce drift in, for small data # examples. More stable alternatives may exist @@ -36,7 +36,7 @@ def test_psi_update_2(): def test_psi_update_3(): """Check PSI.update behavior after drift alarm""" - det = PSI_Detector() + det = PSI() det.set_reference(np.random.randint(0, 5, (5, 5))) det._drift_state = "drift" det.update(X=np.random.randint(0, 5, (5, 5))) @@ -44,14 +44,14 @@ def test_psi_update_3(): def test_psi_update_4(): """Check failure when batch shapes don't match""" - det = PSI_Detector() + det = PSI() det.set_reference(np.random.randint(0, 5, (5, 6))) with pytest.raises(ValueError): det.update(np.random.randint(0, 5, (5, 5))) def test_psi_reset(): """Check psi.reset works as intended""" - det = PSI_Detector() + det = PSI() det.batches_since_reset = 1 det.drift_state = "drift" det.reset() @@ -61,7 +61,7 @@ def test_psi_reset(): def test_psi_compute_PSI(): """Check psi._compute_threshold works correctly""" - det = PSI_Detector() + det = PSI() # XXX - Hardcoded known example added by AS, in the future a # dynamic way to test this function may be used np.random.seed(123) From 46c72e79e88ff29bc78aa376accc2b1891c7ca09 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 14:48:08 -0500 Subject: [PATCH 16/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 7ac8fdc5..1cb20caa 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -1,6 +1,6 @@ import pytest import numpy as np -from menelaus.data_drift.psi_detector import PSI +from menelaus.data_drift import PSI def test_psi_init(): """Test correct default initialization for PSI""" From 4fbbe4c26fbce2dc595117eb35847708a79bf0f0 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:00:37 -0500 Subject: [PATCH 17/34] Update test_psi_detector.py --- .../menelaus/data_drift/test_psi_detector.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 1cb20caa..07af81e4 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -13,15 +13,15 @@ def test_psi_init(): def test_psi_set_reference(): """Assert PSI.set_reference works as intended""" det = PSI() - ref = np.random.randint(0, 5, (3, 3)) + ref = np.random.randint(0, 5, (9, 1)) det.set_reference(ref) assert np.array_equal(ref, det.reference_batch) def test_psi_update_1(): """Ensure PSI can update with small random batches""" det = PSI() - det.set_reference(np.random.randint(0, 5, (10, 10))) - det.update(X=np.random.randint(0, 5, (10, 10))) + det.set_reference(np.random.randint(0, 5, (100, 1))) + det.update(X=np.random.randint(0, 5, (100, 1))) def test_psi_update_2(): """Ensure PSI can update with drift actions triggered""" @@ -30,24 +30,24 @@ def test_psi_update_2(): # is otherwise hard to induce drift in, for small data # examples. More stable alternatives may exist np.random.seed(123) - det.set_reference(np.random.randint(0, 5, (10, 10))) - det.update(X=np.random.randint(10, 40, (10, 10))) + det.set_reference(np.random.randint(0, 5, (100, 1))) + det.update(X=np.random.randint(10, 40, (100, 1))) assert det.drift_state is not None def test_psi_update_3(): """Check PSI.update behavior after drift alarm""" det = PSI() - det.set_reference(np.random.randint(0, 5, (5, 5))) + det.set_reference(np.random.randint(0, 5, (25, 1))) det._drift_state = "drift" - det.update(X=np.random.randint(0, 5, (5, 5))) + det.update(X=np.random.randint(0, 5, (25, 1))) assert det.drift_state is None def test_psi_update_4(): """Check failure when batch shapes don't match""" det = PSI() - det.set_reference(np.random.randint(0, 5, (5, 6))) + det.set_reference(np.random.randint(0, 5, (30, 1))) with pytest.raises(ValueError): - det.update(np.random.randint(0, 5, (5, 5))) + det.update(np.random.randint(0, 5, (25, 1))) def test_psi_reset(): """Check psi.reset works as intended""" @@ -66,7 +66,7 @@ def test_psi_compute_PSI(): # dynamic way to test this function may be used np.random.seed(123) threshold = det._PSI( - v_ref=np.random.randint(0, 2, 5), - v_test=np.random.randint(0, 2, 5), + reference_feature=np.random.randint(0, 2, 5), + test_feature=np.random.randint(0, 2, 5), ) assert threshold >= 0 and threshold <= 1 From 326687290895979cb691ba17086e1158962c9c18 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:13:33 -0500 Subject: [PATCH 18/34] Update test_psi_detector.py --- .../menelaus/data_drift/test_psi_detector.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 07af81e4..4d27ec79 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -1,5 +1,6 @@ import pytest import numpy as np +import pandas as pd from menelaus.data_drift import PSI def test_psi_init(): @@ -15,7 +16,7 @@ def test_psi_set_reference(): det = PSI() ref = np.random.randint(0, 5, (9, 1)) det.set_reference(ref) - assert np.array_equal(ref, det.reference_batch) + assert np.array_equal(ref, det.reference) def test_psi_update_1(): """Ensure PSI can update with small random batches""" @@ -42,13 +43,6 @@ def test_psi_update_3(): det.update(X=np.random.randint(0, 5, (25, 1))) assert det.drift_state is None -def test_psi_update_4(): - """Check failure when batch shapes don't match""" - det = PSI() - det.set_reference(np.random.randint(0, 5, (30, 1))) - with pytest.raises(ValueError): - det.update(np.random.randint(0, 5, (25, 1))) - def test_psi_reset(): """Check psi.reset works as intended""" det = PSI() @@ -62,11 +56,8 @@ def test_psi_reset(): def test_psi_compute_PSI(): """Check psi._compute_threshold works correctly""" det = PSI() - # XXX - Hardcoded known example added by AS, in the future a - # dynamic way to test this function may be used np.random.seed(123) - threshold = det._PSI( - reference_feature=np.random.randint(0, 2, 5), - test_feature=np.random.randint(0, 2, 5), - ) + PSI.set_reference(np.random.randint(0,100,100)) + PSI.update(np.random.randint(0,100,100)) + threshold = PSI.PSI_value assert threshold >= 0 and threshold <= 1 From 4d4d532413e5c70f62522f7dbac2fee288e82d75 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:19:03 -0500 Subject: [PATCH 19/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 4d27ec79..7f1cd949 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -57,7 +57,9 @@ def test_psi_compute_PSI(): """Check psi._compute_threshold works correctly""" det = PSI() np.random.seed(123) - PSI.set_reference(np.random.randint(0,100,100)) - PSI.update(np.random.randint(0,100,100)) + ref = np.random.randint(0,100,100) + test = np.random.randint(0,100,100) + PSI.set_reference(ref) + PSI.update(test) threshold = PSI.PSI_value assert threshold >= 0 and threshold <= 1 From 6bb398dd97fe56020ade73e610c741fe85aaccfb Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:23:44 -0500 Subject: [PATCH 20/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 7f1cd949..a077dfe8 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -14,7 +14,7 @@ def test_psi_init(): def test_psi_set_reference(): """Assert PSI.set_reference works as intended""" det = PSI() - ref = np.random.randint(0, 5, (9, 1)) + ref = np.random.randint(0, 5, 9) det.set_reference(ref) assert np.array_equal(ref, det.reference) @@ -56,10 +56,9 @@ def test_psi_reset(): def test_psi_compute_PSI(): """Check psi._compute_threshold works correctly""" det = PSI() - np.random.seed(123) ref = np.random.randint(0,100,100) test = np.random.randint(0,100,100) - PSI.set_reference(ref) - PSI.update(test) + PSI.set_reference(X = ref) + PSI.update(X = test) threshold = PSI.PSI_value assert threshold >= 0 and threshold <= 1 From 38d15e479bce5e01038744af7600f55eaf625cb7 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:38:22 -0500 Subject: [PATCH 21/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index a077dfe8..fb983d44 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -14,9 +14,9 @@ def test_psi_init(): def test_psi_set_reference(): """Assert PSI.set_reference works as intended""" det = PSI() - ref = np.random.randint(0, 5, 9) + ref = np.random.randint(0, 5, (100,1)) det.set_reference(ref) - assert np.array_equal(ref, det.reference) + assert (det.reference).ndim == 1 def test_psi_update_1(): """Ensure PSI can update with small random batches""" @@ -58,7 +58,7 @@ def test_psi_compute_PSI(): det = PSI() ref = np.random.randint(0,100,100) test = np.random.randint(0,100,100) - PSI.set_reference(X = ref) - PSI.update(X = test) - threshold = PSI.PSI_value + det.set_reference(X = ref) + det.update(X = test) + threshold = det.PSI_value assert threshold >= 0 and threshold <= 1 From 67dd5aecd4d9956aa2ebec2c31a74a255a2cb106 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:55:00 -0500 Subject: [PATCH 22/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index fb983d44..a2e1ed75 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -27,9 +27,6 @@ def test_psi_update_1(): def test_psi_update_2(): """Ensure PSI can update with drift actions triggered""" det = PSI() - # XXX - AS added this method of forcing drift in psi, which - # is otherwise hard to induce drift in, for small data - # examples. More stable alternatives may exist np.random.seed(123) det.set_reference(np.random.randint(0, 5, (100, 1))) det.update(X=np.random.randint(10, 40, (100, 1))) From 4e1cd4483b0ea4b6da3342251abfb6dbfc867319 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:02:59 -0500 Subject: [PATCH 23/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 3960343d..491fbdd8 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -151,4 +151,4 @@ def PSI_value(self): Returns: float: The PSI value from the most recent update. """ - return self.PSI_value + return self.PSI_value \ No newline at end of file From cc85fac0dd7ea81e28e6a05499938ee7e3a8d44a Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:10:19 -0500 Subject: [PATCH 24/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 491fbdd8..3960343d 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -151,4 +151,4 @@ def PSI_value(self): Returns: float: The PSI value from the most recent update. """ - return self.PSI_value \ No newline at end of file + return self.PSI_value From 5d9f06fbd6c417137f636837e593a9ffb94232fc Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:18:28 -0500 Subject: [PATCH 25/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index a2e1ed75..631da45e 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -57,5 +57,4 @@ def test_psi_compute_PSI(): test = np.random.randint(0,100,100) det.set_reference(X = ref) det.update(X = test) - threshold = det.PSI_value - assert threshold >= 0 and threshold <= 1 + assert det.PSI_value >= 0 and det.PSI_value <= 1 From 9e91dcc7b1b6a288d6f1edc66ab1c1c7b0f21f12 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:27:40 -0500 Subject: [PATCH 26/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 631da45e..bfeffb7f 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -57,4 +57,9 @@ def test_psi_compute_PSI(): test = np.random.randint(0,100,100) det.set_reference(X = ref) det.update(X = test) + result = det.PSI_value + det = PSI() + det.set_reference(X = ref) + det.update(X = test) + assert det.PSI_value == result assert det.PSI_value >= 0 and det.PSI_value <= 1 From a46710cfd7aaaddf8ab13e24b1c7b77c2c454859 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:40:24 -0500 Subject: [PATCH 27/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index bfeffb7f..36b46bcf 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -61,5 +61,5 @@ def test_psi_compute_PSI(): det = PSI() det.set_reference(X = ref) det.update(X = test) - assert det.PSI_value == result - assert det.PSI_value >= 0 and det.PSI_value <= 1 + assertEqual(det.PSI_value == result) + assertTrue(det.PSI_value >= 0 and det.PSI_value <= 1) From dab9abf18482ffa00d42c139144f0d4753865282 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:46:42 -0500 Subject: [PATCH 28/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 36b46bcf..bfeffb7f 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -61,5 +61,5 @@ def test_psi_compute_PSI(): det = PSI() det.set_reference(X = ref) det.update(X = test) - assertEqual(det.PSI_value == result) - assertTrue(det.PSI_value >= 0 and det.PSI_value <= 1) + assert det.PSI_value == result + assert det.PSI_value >= 0 and det.PSI_value <= 1 From 2e9656acfc9876b90cb0d2d8316a368866ac824a Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:48:46 -0500 Subject: [PATCH 29/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 3960343d..ce335846 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -90,7 +90,7 @@ def update(self, X: np.array, y_true=None, y_pred=None): grp_new = df_new.groupby("bin").count() grp_new["percent_new"] = grp_new["new"] / sum(grp_new["new"]) psi_value = self._PSI(grp_initial, grp_new) - self.PSI_value = psi_value + self._PSI_value = psi_value if psi_value >= self.threshold: self._drift_state = "drift" self.set_reference(test_batch) @@ -144,7 +144,7 @@ def _PSI(self, reference_feature, test_feature): ) return np.mean(psi_df["psi"]) - def PSI_value(self): + def _PSI_value(self): """ Get the last calculated PSI value. From 4d8cbc21e8cbd942f6fe5037b5f9a23fb3a68741 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:51:57 -0500 Subject: [PATCH 30/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index bfeffb7f..2f2eb480 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -57,9 +57,9 @@ def test_psi_compute_PSI(): test = np.random.randint(0,100,100) det.set_reference(X = ref) det.update(X = test) - result = det.PSI_value + result = det._PSI_value det = PSI() det.set_reference(X = ref) det.update(X = test) - assert det.PSI_value == result - assert det.PSI_value >= 0 and det.PSI_value <= 1 + assert det._PSI_value == result + assert det._PSI_value >= 0 and det._PSI_value <= 1 From 1530e7fbf96565b5dd39b3fb537daaccd99597a7 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:08:22 -0500 Subject: [PATCH 31/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index 2f2eb480..f88a8d14 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -48,18 +48,3 @@ def test_psi_reset(): det.reset() assert det.batches_since_reset == 0 assert det.drift_state is None - - -def test_psi_compute_PSI(): - """Check psi._compute_threshold works correctly""" - det = PSI() - ref = np.random.randint(0,100,100) - test = np.random.randint(0,100,100) - det.set_reference(X = ref) - det.update(X = test) - result = det._PSI_value - det = PSI() - det.set_reference(X = ref) - det.update(X = test) - assert det._PSI_value == result - assert det._PSI_value >= 0 and det._PSI_value <= 1 From 9bf792ab898c281911a2a7ded5af5d820d849246 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:09:22 -0500 Subject: [PATCH 32/34] Update psi_detector.py --- menelaus/data_drift/psi_detector.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index ce335846..9ac653dd 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -90,11 +90,10 @@ def update(self, X: np.array, y_true=None, y_pred=None): grp_new = df_new.groupby("bin").count() grp_new["percent_new"] = grp_new["new"] / sum(grp_new["new"]) psi_value = self._PSI(grp_initial, grp_new) - self._PSI_value = psi_value if psi_value >= self.threshold: self._drift_state = "drift" self.set_reference(test_batch) - + return psi_value def _bin_data(self, feature, min, max): """ Bin the given feature based on the specified minimum and maximum values. @@ -143,12 +142,3 @@ def _PSI(self, reference_feature, test_feature): psi_df["percent_initial"] / psi_df["percent_new"] ) return np.mean(psi_df["psi"]) - - def _PSI_value(self): - """ - Get the last calculated PSI value. - - Returns: - float: The PSI value from the most recent update. - """ - return self.PSI_value From b3a724e2bdde3067d4aff6182b025b7a5be7c645 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:16:31 -0500 Subject: [PATCH 33/34] Update test_psi_detector.py --- tests/menelaus/data_drift/test_psi_detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/menelaus/data_drift/test_psi_detector.py b/tests/menelaus/data_drift/test_psi_detector.py index f88a8d14..ff5343ce 100644 --- a/tests/menelaus/data_drift/test_psi_detector.py +++ b/tests/menelaus/data_drift/test_psi_detector.py @@ -21,15 +21,15 @@ def test_psi_set_reference(): def test_psi_update_1(): """Ensure PSI can update with small random batches""" det = PSI() - det.set_reference(np.random.randint(0, 5, (100, 1))) - det.update(X=np.random.randint(0, 5, (100, 1))) + det.set_reference(np.random.randint(0, 5, (10, 1))) + det.update(X=np.random.randint(0, 5, (10, 1))) def test_psi_update_2(): """Ensure PSI can update with drift actions triggered""" det = PSI() np.random.seed(123) - det.set_reference(np.random.randint(0, 5, (100, 1))) - det.update(X=np.random.randint(10, 40, (100, 1))) + det.set_reference(np.random.randint(0, 100, (200, 1))) + det.update(X=np.random.randint(150, 200, (100, 1))) assert det.drift_state is not None def test_psi_update_3(): From 614eb0fbab1babf92ac1ad8d0ca1b66142858d56 Mon Sep 17 00:00:00 2001 From: Tim Chen <115333718+951378644@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:19:55 -0500 Subject: [PATCH 34/34] update --- menelaus/data_drift/psi_detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/menelaus/data_drift/psi_detector.py b/menelaus/data_drift/psi_detector.py index 9ac653dd..efad3a82 100644 --- a/menelaus/data_drift/psi_detector.py +++ b/menelaus/data_drift/psi_detector.py @@ -94,6 +94,7 @@ def update(self, X: np.array, y_true=None, y_pred=None): self._drift_state = "drift" self.set_reference(test_batch) return psi_value + def _bin_data(self, feature, min, max): """ Bin the given feature based on the specified minimum and maximum values.