Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make concept_drift.ADWIN a child of change_detection.ADWIN #83

Merged
merged 17 commits into from
Jul 22, 2022
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,13 @@ follows:

```python
import pandas as pd
from menelaus.concept_drift import ADWIN
from menelaus.concept_drift import ADWINacc
from menelaus.data_drift import KdqTreeStreaming

df = pd.read_csv('example.csv')

# use a detector that searches for concept drift
detector = ADWIN()
detector = ADWINacc()
for i, row in df.iterrows():
detector.update(row['y_true'], row['y_predicted'], X=None)
if detector.drift_state is not None:
Expand Down
248 changes: 189 additions & 59 deletions docs/source/examples/change_detection/change_detection_examples.ipynb

Large diffs are not rendered by default.

69 changes: 47 additions & 22 deletions docs/source/examples/concept_drift/concept_drift_examples.ipynb

Large diffs are not rendered by default.

185 changes: 133 additions & 52 deletions examples/change_detection_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@
# conditional distributions P(y|var1) and P(y|var2). The drift occurs from index
# 1000 to 1250, and affects 66% of the sample.
#
# Rainfall is a real-world weather dataset. We use the first 1000 samples, where
# no drift has been injected. However, many features are cyclical and have not
# been corrected, so change does occur.
#
# These change detectors can be applied to any given single variable; below,
# they are applied to var2.

# In[ ]:
# In[1]:


## Imports ##
Expand All @@ -23,11 +27,11 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from menelaus.change_detection import PageHinkley, CUSUM
from menelaus.datasets import fetch_circle_data
from menelaus.change_detection import ADWIN, CUSUM, PageHinkley
from menelaus.datasets import fetch_circle_data, fetch_rainfall_data


# In[ ]:
# In[2]:


## Import Data ##
Expand All @@ -36,29 +40,46 @@
df = fetch_circle_data()
drift_start, drift_end = 1000, 1250

rainfall_df = fetch_rainfall_data()

# ## Page-Hinkley (PH) Test

# In[ ]:
# ## Cumulative Sum (CUSUM) Test

# - This monitors a moving average of var2, starting from an initial estimate of mean
# and standard deviation.
#
# - It will only alarm if 50 or more samples have been observed since
# initialization/drift.
#
# - This will alarm if var2 passes a critical value controlled by delta and
# threshold in either direction, positive or negative.
#
#

## Setup ##
# In[5]:

# Set up one-directional PH test: this will only alarm if the mean of the
# monitored variable decreases, and only after seeing 30 or more samples.
ph = PageHinkley(delta=0.01, threshold=15, direction="negative", burn_in=30)

## Setup ##
cusum = CUSUM(
target=np.mean(df.loc[:drift_start, "var2"]), # mean / std of 'Var 2' pre-drift
sd_hat=np.std(df.loc[:drift_start, "var2"]),
burn_in=50,
delta=0.005,
threshold=40,
direction=None,
)

# setup DF to record results
status = pd.DataFrame(columns=["index", "actual value", "drift_detected"])

# iterate through data; feed each sample to the detector, in turn
for i in range(len(df)):
obs = df["var2"][i]
ph.update(X=obs)
status.loc[i] = [i, obs, ph.drift_state]
cusum.update(obs)
status.loc[i] = [i, obs, cusum.drift_state]


# In[ ]:
# In[6]:


## Plotting ##
Expand All @@ -69,7 +90,7 @@
plt.grid(False, axis="x")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("PH Test Results", fontsize=22)
plt.title("CUSUM Test Results", fontsize=22)
plt.ylabel("Value", fontsize=18)
plt.xlabel("Index", fontsize=18)
ylims = [-0.05, 1.1]
Expand All @@ -87,55 +108,40 @@
plt.legend()


# Page-Hinkley alarms shortly after the drift induction window closes, and then makes
# several apparently erroneous alarms afterwards. The parameters may not be
# well-chosen for the new regime.
# Change detection algorithms come out of process control, so a priori
# characterization of the bounds of the process, not performed here, would not
# be unreasonable.
# CUSUM alarms several times within the drift induction window, roughly halfway
# through. After the alarm is reset, change is detected a few more times,
# including an apparently erroneous detection after the drift induction window
# has passed. The current threshold settings may then be too sensitive for the
# new regime.
#

# In[ ]:
# In[7]:


plt.show()
# plt.savefig("example_Page-Hinkley_detections.png")

plt.show()
# plt.savefig("example_CUSUM_detections.png")

# ## Cumulative Sum (CUSUM) Test

# - This monitors a moving average of var2, starting from an initial estimate of mean
# and standard deviation.
#
# - It will only alarm if 50 or more samples have been observed since
# initialization/drift.
#
# - This will alarm if var2 passes a critical value controlled by delta and
# threshold in either direction, positive or negative.
#
#
# ## Page-Hinkley (PH) Test

# In[ ]:


## Setup ##
cusum = CUSUM(
target=np.mean(df.loc[:drift_start, "var2"]), # mean / std of 'Var 2' pre-drift
sd_hat=np.std(df.loc[:drift_start, "var2"]),
burn_in=50,
delta=0.005,
threshold=40,
direction=None,
)

# Set up one-directional PH test: this will only alarm if the mean of the
# monitored variable decreases, and only after seeing 30 or more samples.
ph = PageHinkley(delta=0.01, threshold=15, direction="negative", burn_in=30)

# setup DF to record results
status = pd.DataFrame(columns=["index", "actual value", "drift_detected"])

# iterate through data; feed each sample to the detector, in turn
for i in range(len(df)):
obs = df["var2"][i]
cusum.update(obs)
status.loc[i] = [i, obs, cusum.drift_state]
ph.update(X=obs)
status.loc[i] = [i, obs, ph.drift_state]


# In[ ]:
Expand All @@ -149,7 +155,7 @@
plt.grid(False, axis="x")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("CUSUM Test Results", fontsize=22)
plt.title("PH Test Results", fontsize=22)
plt.ylabel("Value", fontsize=18)
plt.xlabel("Index", fontsize=18)
ylims = [-0.05, 1.1]
Expand All @@ -165,19 +171,94 @@
color="red",
)
plt.legend()
plt.show()
# plt.savefig("example_Page-Hinkley_detections.png")


# CUSUM alarms several times within the drift induction window, roughly halfway
# through. After the alarm is reset, change is detected a few more times,
# including an apparently erroneous detection after the drift induction window
# has passed. The current threshold settings may then be too sensitive for the
# new regime.
# Page-Hinkley alarms shortly after the drift induction window closes, and then makes
# several apparently erroneous alarms afterwards. The parameters may not be
# well-chosen for the new regime.
# Change detection algorithms come out of process control, so a priori
# characterization of the bounds of the process, not performed here, would not
# be unreasonable.
#

# In[ ]:
# # ADaptive WINdowing (ADWIN)
#
# ADWIN is a change detection algorithm that can be used to monitor a single real-valued variable. ADWIN maintains a window over the data stream, which grows to the right as new elements are received. When the means of two subwindows of that window differ by enough, ADWIN drops older elements from its window until this ceases to be the case.

# In[58]:


## Setup ##

adwin = ADWIN()

# setup DF to record results
status = pd.DataFrame(columns=["index", "actual value", "drift_detected", "ADWIN mean"])
df2 = rainfall_df.loc[:1000, 'max_sustained_wind_speed']
rec_list = []

# iterate through data; feed each sample to the detector, in turn
for i in range(len(df2)):
obs = df2[i]
adwin.update(X=obs)
status.loc[i] = [i, obs, adwin.drift_state, adwin.mean()]

# when drift is detected, record the start and end of ADWIN's retraining window
if adwin.drift_state == "drift":
retrain_start = adwin.retraining_recs[0]
retrain_end = adwin.retraining_recs[1]
rec_list.append([retrain_start, retrain_end])


# In[66]:


## Plotting ##

# plot the monitored variable and the status of the detector
plt.figure(figsize=(20, 6))
plt.scatter("index", "actual value", data=status, label="max_sustained_wind_speed", alpha=.5)
plt.plot("index", "ADWIN mean", data=status, color='red', linewidth=2)
plt.grid(False, axis="x")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.title("ADWIN Results", fontsize=22)
plt.ylabel("Value", fontsize=18)
plt.xlabel("Index", fontsize=18)
ylims = [-2, 6]
plt.ylim(ylims)

plt.vlines(
x=status.loc[status["drift_detected"] == "drift"]["index"],
ymin=ylims[0],
ymax=ylims[1],
label="Drift Detected",
color="red",
)

# Create a list of lines that indicate the retraining windows.
# Space them evenly, vertically.
rec_list = pd.DataFrame(rec_list)
rec_list["y_val"] = np.linspace(
start=0.6 * (ylims[1] - ylims[0]) + ylims[0],
stop=0.8 * ylims[1],
num=len(rec_list),
)

# Draw black lines that indicate the retraining windows recorded at each drift
plt.hlines(
y=rec_list["y_val"][::-1],
xmin=rec_list[0],
xmax=rec_list[1],
color="black",
label="New Observation Windows",
)

plt.legend()
plt.show()
# plt.savefig("example_CUSUM_detections.png")
# plt.savefig("example_ADWIN.png")


# ADWIN monitors the running average of the `max_sustained_wind_speed` column and, once that mean begins to change enough around index 600, shrinks its observation window (in black) to only include more recent samples. This process repeats as further changes are detected. We can see that the size of the observation window shrinks and grows as the incoming data changes.
Loading