Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

160 psi detector #162

Open
wants to merge 34 commits into
base: dev
Choose a base branch
from
Open
Changes from 8 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
2ff709c
Create PSI Detector.py
951378644 Sep 19, 2023
2c4a9e3
Move file to a different location
951378644 Sep 19, 2023
360f552
Update PSI Detector.py
951378644 Sep 19, 2023
fd3cac5
Update PSI Detector.py
951378644 Sep 19, 2023
b806132
Update PSI Detector.py
951378644 Sep 19, 2023
6e18b84
Update PSI Detector.py
951378644 Sep 19, 2023
5c7de0c
rename
951378644 Sep 19, 2023
5c0039f
Reformatted code with black
951378644 Sep 19, 2023
2b808ab
add PSI to data_drift.init, add skeleton for unit tests
tms-bananaquit Nov 17, 2023
9541574
Update refs.bib
951378644 Nov 21, 2023
395d840
Update psi_detector.py
951378644 Nov 21, 2023
6a131c9
formating code with black
951378644 Nov 21, 2023
9fed735
Update test_psi_detector.py
951378644 Nov 22, 2023
36f96e2
Update psi_detector.py
951378644 Nov 22, 2023
3db9642
Update test_psi_detector.py
951378644 Nov 22, 2023
46c72e7
Update test_psi_detector.py
951378644 Nov 22, 2023
4fbbe4c
Update test_psi_detector.py
951378644 Nov 22, 2023
3266872
Update test_psi_detector.py
951378644 Nov 22, 2023
4d4d532
Update test_psi_detector.py
951378644 Nov 22, 2023
6bb398d
Update test_psi_detector.py
951378644 Nov 22, 2023
38d15e4
Update test_psi_detector.py
951378644 Nov 22, 2023
67dd5ae
Update test_psi_detector.py
951378644 Nov 22, 2023
4e1cd44
Update psi_detector.py
951378644 Nov 22, 2023
cc85fac
Update psi_detector.py
951378644 Nov 22, 2023
5d9f06f
Update test_psi_detector.py
951378644 Nov 22, 2023
9e91dcc
Update test_psi_detector.py
951378644 Nov 22, 2023
a46710c
Update test_psi_detector.py
951378644 Nov 22, 2023
dab9abf
Update test_psi_detector.py
951378644 Nov 22, 2023
2e9656a
Update psi_detector.py
951378644 Nov 22, 2023
4d8cbc2
Update test_psi_detector.py
951378644 Nov 22, 2023
1530e7f
Update test_psi_detector.py
951378644 Nov 22, 2023
9bf792a
Update psi_detector.py
951378644 Nov 22, 2023
b3a724e
Update test_psi_detector.py
951378644 Nov 22, 2023
614eb0f
update
951378644 Nov 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions menelaus/data_drift/psi_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from menelaus.detector import BatchDetector
import pandas as pd
import numpy as np
import copy


class PSI_Detector(BatchDetector):
    """Batch drift detector based on the Population Stability Index (PSI).

    For every column of the reference data, the reference and test values
    are bucketed into shared bins and the PSI between the two bin
    distributions is computed. A feature whose PSI lies in ``[0.1, 0.2]`` is
    reported as a moderate population change; a PSI above ``0.2`` is
    reported as a significant population change.
    """

    input_type = "batch"

    def __init__(
        self,
    ):
        """Initialize the detector; PSI takes no parameters of its own."""
        # This initializes batch detector's parent class
        super().__init__()

    def set_reference(self, X, y_true=None, y_pred=None):
        """Store the reference batch that later test batches are compared to.

        Args:
            X: reference data; validated by the parent class and stored as a
                DataFrame whose columns are ``self._input_cols``.
            y_true: not used by this detector.
            y_pred: not used by this detector.
        """
        # The parent class validates the input.
        X, _, _ = super()._validate_input(X, None, None)
        X = pd.DataFrame(X, columns=self._input_cols)

        # Keep an independent copy so later changes to the caller's data do
        # not alter the reference.
        self.reference = copy.deepcopy(X)

        self.reset()

    def reset(self):
        """Reset the detector by delegating to the parent class."""
        super().reset()

    def update(self, X, by_feature=True, X_by_feature=None, y_true=None, y_pred=None):
        """Compare a test batch against the reference, feature by feature.

        Args:
            X: test batch. A DataFrame is used as-is; anything else is
                wrapped in a DataFrame labelled with the reference columns.
            by_feature: accepted for interface compatibility; not used.
            X_by_feature: accepted for interface compatibility; not used.
            y_true: not used by this detector.
            y_pred: not used by this detector.

        Returns:
            tuple: ``(feature_psi, report)`` when at least one feature is
            flagged, where ``feature_psi`` is a list of ``[column, psi]``
            pairs and ``report`` is a DataFrame with the columns
            ``Column``, ``Moderate population change``,
            ``Significant population change`` and ``PSI`` (one row per
            flagged feature). Otherwise ``("no drift detected",
            feature_psi)``.
        """
        # NOTE(review): unlike set_reference, this method does not run the
        # parent's _validate_input / update — confirm whether it should.
        if not isinstance(X, pd.DataFrame):
            # Plain arrays would otherwise be indexed by row in X[column].
            X = pd.DataFrame(X, columns=self.reference.columns)

        feature_psi = []
        flagged = []
        for column in self.reference.columns:
            reference_values = self.reference[column]
            test_values = X[column]

            # Bin edges must span both batches so no test value falls
            # outside the bins.
            lower = min(reference_values.min(), test_values.min())
            upper = max(reference_values.max(), test_values.max())
            bins = self._bin_data(reference_values, lower, upper)

            grp_initial = self._bin_percentages(reference_values, bins, "initial")
            grp_new = self._bin_percentages(test_values, bins, "new")

            psi_value = self._PSI(grp_initial, grp_new)
            feature_psi.append([column, psi_value])

            if 0.1 <= psi_value <= 0.2:
                flagged.append([column, "Yes", "No", psi_value])
            elif psi_value > 0.2:
                flagged.append([column, "No", "Yes", psi_value])

        # The reference is intentionally left unchanged: the previous
        # `self.reference = X` sat after the return statements and never ran.
        # NOTE(review): decide whether the reference should roll forward.
        if flagged:
            self.drift_state = "drift"
            report = pd.DataFrame(
                flagged,
                columns=[
                    "Column",
                    "Moderate population change",
                    "Significant population change",
                    "PSI",
                ],
            )
            return feature_psi, report

        # Clear any drift flag left over from an earlier batch.
        # NOTE(review): assumes the parent accepts None as "no drift".
        self.drift_state = None
        return "no drift detected", feature_psi

    @staticmethod
    def _bin_percentages(values, bins, name):
        """Count ``values`` per bin and add each bin's share of the total.

        Args:
            values: Series of one feature's values.
            bins: bin edges as produced by ``_bin_data``.
            name: label for the count column; the share column is named
                ``percent_<name>``.

        Returns:
            DataFrame indexed by ``bin`` with columns ``<name>`` (count) and
            ``percent_<name>`` (count divided by the total count).
        """
        binned = pd.cut(values, bins=bins, labels=range(1, len(bins)))
        frame = pd.DataFrame({name: values, "bin": binned})
        # observed=False keeps empty bins; dropping them would remove those
        # bins from the inner join in _PSI.
        counts = frame.groupby("bin", observed=False).count()
        counts[f"percent_{name}"] = counts[name] / counts[name].sum()
        return counts

    def _bin_data(self, feature, min, max):
        """Build equal-width bin edges covering ``[min, max]``.

        Ten bins are used, or one bin per distinct value when ``feature``
        has fewer than ten distinct values. The outer edges are widened by a
        small epsilon so the extreme values fall inside the first and last
        bins.

        Args:
            feature: Series whose number of distinct values sets the bin
                count.
            min: lower bound of the range to cover.
            max: upper bound of the range to cover.

        Returns:
            list: ``n_bins + 1`` bin edges in increasing order.
        """
        # `min` and `max` shadow the builtins here; the names are kept so
        # existing keyword calls continue to work.
        eps = 1e-4
        n_unique = len(feature.unique())
        n_bins = n_unique if n_unique < 10 else 10
        bins = [min + (max - min) * i / n_bins for i in range(n_bins + 1)]
        bins[0] = min - eps  # Correct the lower boundary
        bins[-1] = max + eps  # Correct the higher boundary
        return bins

    def _PSI(self, reference_feature, test_feature):
        """Compute the Population Stability Index between two binned features.

        Args:
            reference_feature: DataFrame indexed by ``bin`` with a
                ``percent_initial`` column.
            test_feature: DataFrame indexed by ``bin`` with a
                ``percent_new`` column.

        Returns:
            The sum over bins of
            ``(percent_initial - percent_new) * ln(percent_initial / percent_new)``.
        """
        eps = 1e-4
        # Compare the bins to calculate PSI
        psi_df = reference_feature.join(test_feature, on="bin", how="inner")

        # Substitute a small value for empty bins so the logarithm and the
        # division stay finite.
        expected = psi_df["percent_initial"].mask(psi_df["percent_initial"] == 0, eps)
        actual = psi_df["percent_new"].mask(psi_df["percent_new"] == 0, eps)

        psi_terms = (expected - actual) * np.log(expected / actual)

        # PSI is the sum of the per-bin terms; the 0.1 / 0.2 thresholds in
        # update() apply to the sum, not the mean.
        return psi_terms.sum()
Loading