From ef3ec02d5073c42a1a274de5c9148b7789b9eee0 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 31 Jan 2023 15:41:42 -0500 Subject: [PATCH 01/35] initial injection automation script --- menelaus/injection/injection_automation.py | 154 +++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 menelaus/injection/injection_automation.py diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py new file mode 100644 index 00000000..408d126c --- /dev/null +++ b/menelaus/injection/injection_automation.py @@ -0,0 +1,154 @@ +import pandas as pd +import random +from scipy.io.arff import loadarff + +from menelaus.concept_drift import LinearFourRates, ADWINAccuracy, DDM, EDDM, STEPD, MD3 +from menelaus.data_drift import PCACD, KdqTreeStreaming, KdqTreeBatch, NNDVI +import class_manipulation +import feature_manipulation +import noise + + +def select_random_classes(series): + classes = series.unique() + + if len(classes) < 2: + raise ValueError(f'Insufficient classes in series: {len(classes)}') + else: + class_a = classes[random.randint(0, len(classes) - 1)] + class_b = classes[random.randint(0, len(classes) - 1)] + + while class_a == class_b: + class_b = classes[random.randint(0, len(classes) - 1)] + + return [class_a, class_b] + + +class InjectionTesting: + def __init__(self, data_path, seed=None): + file_type = data_path.split('.')[-1] + self.numeric_cols = [] + self.categorical_cols = [] + + if file_type == 'csv': + self.df = pd.read_csv(data_path) + elif file_type == 'arff': + raw_data = loadarff(data_path) + self.df = pd.DataFrame(raw_data[0]) + else: + raise ValueError(f'Invalid file type: {file_type}') + + for col in self.df.columns: + if pd.api.types.is_numeric_dtype(self.df[col]): + self.numeric_cols.append(col) + elif self.df[col].nunique() < len(self.df): + self.categorical_cols.append(col) + + if seed: + random.seed(seed) + + + def select_rows(self, start, end): + start_drift = int(start * len(self.df)) + end_drift = int(end * len(self.df)) + + return [start_drift, end_drift] + + + def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): + rand_cols = [] + start_drift, end_drift = self.select_rows(start, end) + + for i in range(num_drift_cols): + rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + rand_cols.append(rand_col) + + self.df = noise.brownian_noise(self.df, rand_col, x, start_drift, end_drift) + + return rand_cols + + + def inject_random_class_manipulation(self, manipulation_type, start=.75, end=1, num_drift_cols=1): + rand_cols = [] + all_rand_classes = [] + start_drift, end_drift = self.select_rows(start, end) + + for i in range(num_drift_cols): + rand_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + rand_cols.append(rand_col) + rand_classes = select_random_classes(self.df[rand_col]) + all_rand_classes.append(rand_classes) + + if manipulation_type == 'class_swap': + self.df = class_manipulation.class_swap(self.df, rand_col, rand_classes[0], rand_classes[1], start_drift, end_drift) + elif manipulation_type == 'class_join': + new_label = f'{rand_classes[0]}_{rand_classes[1]}' + self.df = class_manipulation.class_join(self.df, rand_col, rand_classes[0], rand_classes[1], new_label, start_drift, end_drift) + else: + raise ValueError(f'Invalid class manipulation type: {manipulation_type}') + + return rand_cols, all_rand_classes + + + def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): + all_swap_cols = [] + start_drift, 
end_drift = self.select_rows(start, end) + + for i in range(num_swaps): + col_type = self.numeric_cols if random.randint(0, 1) == 0 else self.categorical_cols + + if len(col_type) < 2: + col_type = self.numeric_cols if col_type == self.categorical_cols else self.categorical_cols + if len(col_type) < 2: + raise ValueError('Insufficient numeric and categorical columns for swaps') + + col_a = col_type[random.randint(0, len(col_type) - 1)] + col_b = col_type[random.randint(0, len(col_type) - 1)] + + while col_a == col_b: + col_b = col_type[random.randint(0, len(col_type) - 1)] + + swap_cols = [col_a, col_b] + all_swap_cols.append(swap_cols) + self.df = feature_manipulation.feature_swap(self.df, col_a, col_b, start_drift, end_drift) + + return all_swap_cols + + + def inject_random_feature_hide_and_sample(self): + rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] + sample_size = min(self.df[rand_col].value_counts()) + self.df = feature_manipulation.feature_hide_and_sample(self.df, rand_col, sample_size) + + return rand_col + + + def test_adwin_detector(self, col): + detector = ADWINAccuracy() + + for i, row in self.df.iterrows(): + detector.update(X=None, y_true=row[col], y_pred=0) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): + detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) + + for i, row in self.df.iterrows(): + detector.update(row[cols]) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + + def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): + detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) + + for i, row in self.df.iterrows(): + detector.update(row) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + +if __name__ == '__main__': + file = 'souza_data/gassensor.arff' + tester = InjectionTesting(file) + drift_cols = tester.inject_random_brownian_noise(10) + tester.test_pcacd_detector() From 0761bde7336a0922fcf769f67828becd9dfe21b8 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 31 Jan 2023 15:59:23 -0500 Subject: [PATCH 02/35] adding souza data locally only --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 734db991..2862e2f2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ _build *.coverage *.DS_Store .idea/ +menelaus/injection/souza_data # Images From 40b5a55100205bbaf684b5d5bba929af05d75bb3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Thu, 2 Feb 2023 16:07:33 -0500 Subject: [PATCH 03/35] implementing drift detection scatter plot visualization --- .gitignore | 1 + menelaus/injection/injection_automation.py | 46 ++++++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2862e2f2..0f6395fd 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ _build *.DS_Store .idea/ menelaus/injection/souza_data +menelaus/injection/plots # Images diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 408d126c..3e4b42b3 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,3 +1,4 @@ +import matplotlib.pyplot as plt import pandas as pd import random from scipy.io.arff import loadarff @@ -125,30 +126,67 @@ def inject_random_feature_hide_and_sample(self): def 
test_adwin_detector(self, col): detector = ADWINAccuracy() + drift_state = [] for i, row in self.df.iterrows(): detector.update(X=None, y_true=row[col], y_pred=0) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) + drift_state = [] for i, row in self.df.iterrows(): detector.update(row[cols]) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) + drift_state = [] for i, row in self.df.iterrows(): detector.update(row) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + + + def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): + plt.figure(figsize=(20, 6)) + y_min = None + y_max = None + + for col in cols: + plt.scatter(self.df.index, self.df[col], label=col) + local_min = self.df[col].min() + local_max = self.df[col].max() + + if y_min is None or y_min > local_min: + y_min = local_min + if y_max is None or y_max < local_max: + y_max = local_max + + plt.grid(False, axis='x') + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + plt.title('Scatter Results', fontsize=22) + plt.xlabel('Index', fontsize=18) + plt.ylabel('Value', fontsize=18) + plt.ylim((y_min, y_max)) + plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') + plt.legend() + plt.savefig(output_file) if __name__ == '__main__': file = 'souza_data/gassensor.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(10) + drift_cols = tester.inject_random_brownian_noise(1000) tester.test_pcacd_detector() + tester.plot_drift_scatter(drift_cols) From 820be858311dced0afb0c80d161c4734397ec2d3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 3 Feb 2023 13:53:00 -0500 Subject: [PATCH 04/35] minor tweaks --- menelaus/injection/injection_automation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 3e4b42b3..85cdbea3 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,4 +1,5 @@ import matplotlib.pyplot as plt +import os import pandas as pd import random from scipy.io.arff import loadarff @@ -124,12 +125,12 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, col): + def test_adwin_detector(self, cols): detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=None, y_true=row[col], y_pred=0) + detector.update(X=None, y_true=row[cols], y_pred=0) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -181,12 +182,14 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): plt.ylim((y_min, y_max)) plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') plt.legend() + + 
os.makedirs(os.path.dirname(output_file), exist_ok=True) plt.savefig(output_file) if __name__ == '__main__': - file = 'souza_data/gassensor.arff' + file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(1000) - tester.test_pcacd_detector() + drift_cols = tester.inject_random_brownian_noise(10) + tester.test_adwin_detector(drift_cols) tester.plot_drift_scatter(drift_cols) From 4b9ca33d58b8f8324ee579a5a73a63b8d64c8d5c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 10 Feb 2023 18:24:01 -0500 Subject: [PATCH 05/35] adding nndvi drift detection --- menelaus/injection/injection_automation.py | 29 ++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 85cdbea3..94e208a0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -147,6 +147,32 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo self.df['drift_state'] = drift_state + def test_nndvi_detector(self, cols, group_name=None, k_nn=2, sampling_times=50): + if not group_name: + group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_name in cols: + group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + filtered_df = self.df.copy() + for filter_col in filtered_df.columns: + if filter_col != group_name and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + filtered_df.drop(columns=filter_col, inplace=True) + + grouped_df = filtered_df.groupby(group_name) + status = pd.DataFrame(columns=[group_name, 'drift']) + batches = {group_id: group.sample(frac=0.1).drop(columns=group_name).values for group_id, group in grouped_df} + + detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) + detector.set_reference(batches.pop(min(self.df[group_name]))) + + for group_id, batch in batches.items(): + detector.update(pd.DataFrame(batch)) + status = pd.concat([status, pd.DataFrame({group_name: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + + return status + + def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) drift_state = [] @@ -191,5 +217,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector(drift_cols) - tester.plot_drift_scatter(drift_cols) + tester.test_nndvi_detector(drift_cols) From 8f397a1bee6f17fd4324b1cb3af44909dc84bc1c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Mon, 13 Feb 2023 12:05:54 -0500 Subject: [PATCH 06/35] allow manual column type specification --- menelaus/injection/injection_automation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 94e208a0..5d9feaa7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -27,7 +27,7 @@ def select_random_classes(series): class InjectionTesting: - def __init__(self, data_path, seed=None): + def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): file_type = data_path.split('.')[-1] 
        self.numeric_cols = []
         self.categorical_cols = []
@@ -40,11 +40,12 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non
         else:
             raise ValueError(f'Invalid file type: {file_type}')
 
-        for col in self.df.columns:
-            if pd.api.types.is_numeric_dtype(self.df[col]):
-                self.numeric_cols.append(col)
-            elif self.df[col].nunique() < len(self.df):
-                self.categorical_cols.append(col)
+        if numeric_cols is None or categorical_cols is None:
+            for col in self.df.columns:
+                if pd.api.types.is_numeric_dtype(self.df[col]) and numeric_cols is None:
+
self.numeric_cols.append(col) elif self.df[col].nunique() < len(self.df) and categorical_cols is None: self.categorical_cols.append(col) - if numeric_cols is None: + if numeric_cols is not None: self.numeric_cols = numeric_cols - if categorical_cols is None: + if categorical_cols is not None: self.categorical_cols = categorical_cols if seed: @@ -142,6 +144,26 @@ def test_adwin_detector(self, cols): return detector + def test_cbdb_detector(self, cols, group_col=None, subsets=8): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = CDBD(subsets=subsets) + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -154,28 +176,28 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo return detector - def test_nndvi_detector(self, cols, group_name=None, k_nn=2, sampling_times=50): - if not group_name: - group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - while group_name in cols: - group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] filtered_df = self.df.copy() for filter_col in filtered_df.columns: - if filter_col != group_name and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + if filter_col != group_col and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): filtered_df.drop(columns=filter_col, inplace=True) - grouped_df = filtered_df.groupby(group_name) - status = pd.DataFrame(columns=[group_name, 'drift']) - batches = {group_id: group.sample(frac=0.1).drop(columns=group_name).values for group_id, group in grouped_df} + grouped_df = filtered_df.groupby(group_col) + status = pd.DataFrame(columns=[group_col, 'drift']) + batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) - detector.set_reference(batches.pop(min(self.df[group_name]))) + detector.set_reference(batches.pop(min(self.df[group_col]))) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) - status = pd.concat([status, pd.DataFrame({group_name: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + status = pd.concat([status, pd.DataFrame({group_col: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) return detector, status @@ -189,6 +211,7 @@ def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state + return detector def 
plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): @@ -224,4 +247,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_nndvi_detector(drift_cols) + tester.test_cbdb_detector(drift_cols) From 4d06ba3cc896cbdb734997268391b5913e961cd4 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Feb 2023 13:24:21 -0500 Subject: [PATCH 09/35] hdddm detector implementation --- menelaus/injection/injection_automation.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 63420431..210ff692 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -164,6 +164,26 @@ def test_cbdb_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_hdddm_detector(self, cols, group_col=None, subsets=8): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = HDDDM(subsets=subsets) + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -247,4 +267,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_cbdb_detector(drift_cols) + tester.test_hdddm_detector(drift_cols) From f2f97fcf98a9d4fa9862fd8a06f6f5e2874fa9cf Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Feb 2023 14:37:52 -0500 Subject: [PATCH 10/35] kdq tree batch detector implementation --- menelaus/injection/injection_automation.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 210ff692..1766d9a4 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -184,6 +184,26 @@ def test_hdddm_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_kdq_tree_batch_detector(self, cols, group_col=None): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = KdqTreeBatch() + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + 
detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -267,4 +287,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_hdddm_detector(drift_cols) + tester.test_kdq_tree_batch_detector(drift_cols) From 6f1d923e9e5061d4b228a1161abb2f2657dae53c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Sun, 19 Feb 2023 15:44:48 -0500 Subject: [PATCH 11/35] add basic linear model training for concept drift testing --- menelaus/injection/injection_automation.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 1766d9a4..55253674 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -2,6 +2,7 @@ import os import pandas as pd import random +import sklearn from scipy.io.arff import loadarff from menelaus.concept_drift import LinearFourRates, ADWINAccuracy, DDM, EDDM, STEPD, MD3 @@ -58,10 +59,28 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non def select_rows(self, start, end): - start_drift = int(start * len(self.df)) - end_drift = int(end * len(self.df)) + start_row = int(start * len(self.df)) + end_row = int(end * len(self.df)) - return [start_drift, end_drift] + return [start_row, end_row] + + + def train_linear_model(self, x_cols, y_col=None, start=0, end=0.75): + if not y_col: + if len(x_cols) < len(self.numeric_cols): + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + + while y_col in x_cols: + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + else: + raise ValueError('Insufficient numerical columns to select a y variable') + + model = sklearn.linear_model.LinearRegression() + start_train, end_train = self.select_rows(start, end) + train_df = self.df.iloc[start_train:end_train, ] + model.fit(train_df[x_cols], train_df[y_col]) + + return model, y_col def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): @@ -132,12 +151,16 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, cols): + def test_adwin_detector(self, cols, model=None, y_col=None): + if not model: + model, y_col = self.train_linear_model(x_cols=cols) + + self.df['y_pred'] = model.predict(self.df[cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=None, y_true=row[cols], y_pred=0) + detector.update(X=row[cols], y_true=row[y_col], y_pred=row['y_pred']) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -287,4 +310,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_kdq_tree_batch_detector(drift_cols) + tester.test_adwin_detector(drift_cols) From de581265a60ecfb5260f9e37a19aba99d5d7d722 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Sun, 19 Feb 2023 15:52:50 -0500 Subject: 
[PATCH 12/35] tuning linear model automation and concept drift --- menelaus/injection/injection_automation.py | 27 ++++++++++------------ 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 55253674..5a568fa6 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -65,22 +65,18 @@ def select_rows(self, start, end): return [start_row, end_row] - def train_linear_model(self, x_cols, y_col=None, start=0, end=0.75): - if not y_col: - if len(x_cols) < len(self.numeric_cols): - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] - - while y_col in x_cols: - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] - else: - raise ValueError('Insufficient numerical columns to select a y variable') + def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): + if not x_cols or not y_col: + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + x_cols = self.numeric_cols.copy() + x_cols.remove(y_col) model = sklearn.linear_model.LinearRegression() start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] model.fit(train_df[x_cols], train_df[y_col]) - return model, y_col + return model, x_cols, y_col def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): @@ -151,16 +147,16 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, cols, model=None, y_col=None): + def test_adwin_detector(self, model=None, x_cols=None, y_col=None): if not model: - model, y_col = self.train_linear_model(x_cols=cols) + model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) - self.df['y_pred'] = model.predict(self.df[cols]) + self.df['y_pred'] = model.predict(self.df[x_cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=row[cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -310,4 +306,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector(drift_cols) + tester.test_adwin_detector() + print(tester.df['drift_state'].describe()) From 6c4191f553ba058b4a69b7685211eb8ed1ad1658 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Feb 2023 19:30:12 -0500 Subject: [PATCH 13/35] linear four rates detector implementation --- menelaus/injection/injection_automation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a568fa6..262604d2 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -235,6 +235,24 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo return detector + def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, + detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): + if not model: + model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) + + self.df['y_pred'] = 
model.predict(self.df[x_cols]) + detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, + num_mc=num_mc, burn_in=burn_in, subsample=subsample) + drift_state = [] + + for i, row in self.df.iterrows(): + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -306,5 +324,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector() + tester.test_lfr_detector() print(tester.df['drift_state'].describe()) From f9897f8af460db5a4c921c6c93495f44e33760ec Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Feb 2023 19:56:37 -0500 Subject: [PATCH 14/35] adding naive logistic classifier model --- menelaus/injection/injection_automation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 262604d2..6d79921b 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -79,6 +79,24 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col + def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): + if not x_cols or not y_col: + y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + x_cols = self.numeric_cols.copy() + + encoder = sklearn.preprocessing.LabelEncoder() + encoder.fit(self.df[y_col]) + self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) + y_col = f'{y_col}_encoded' + + model = sklearn.linear_model.LogisticRegression() + start_train, end_train = self.select_rows(start, end) + train_df = self.df.iloc[start_train:end_train, ] + model.fit(train_df[x_cols], train_df[y_col]) + + return model, x_cols, y_col + + def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -238,7 +256,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, From 3a6e0e04bcbb619fe5a509e790eeb379b88926d3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 12:28:54 -0500 Subject: [PATCH 15/35] adding binary option to logistic classifier --- menelaus/injection/injection_automation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 6d79921b..5a78f493 100644 --- a/menelaus/injection/injection_automation.py +++ 
b/menelaus/injection/injection_automation.py @@ -79,7 +79,7 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col - def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): + def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): if not x_cols or not y_col: y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] x_cols = self.numeric_cols.copy() @@ -89,6 +89,9 @@ def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) y_col = f'{y_col}_encoded' + if limit_classes: + self.df = self.df[self.df[y_col] < limit_classes] + model = sklearn.linear_model.LogisticRegression() start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] @@ -256,7 +259,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col, limit_classes=2) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, From 11b6f157b4e714f39120af9eb4fb07f6175561ec Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 18:10:37 -0500 Subject: [PATCH 16/35] ddm detector implementation --- menelaus/injection/injection_automation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a78f493..97f3c9e7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -204,6 +204,22 @@ def test_cbdb_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): + if not model: + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + + self.df['y_pred'] = model.predict(self.df[x_cols]) + detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) + drift_state = [] + + for i, row in self.df.iterrows(): + detector.update(y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_hdddm_detector(self, cols, group_col=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -345,5 +361,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_lfr_detector() + tester.test_ddm_detector() print(tester.df['drift_state'].describe()) From 4339ae68b84dd36f56495221ad76fc0be1799979 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 18:24:01 -0500 Subject: [PATCH 17/35] eddm detector implementation --- menelaus/injection/injection_automation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 
deletion(-)
diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py
index 97f3c9e7..30bbdeb4 100644
--- a/menelaus/injection/injection_automation.py
+++ b/menelaus/injection/injection_automation.py
@@ -220,6 +220,22 @@ def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100
         return detector
 
 
+    def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5):
+        if not model:
+            model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col)
+
+        self.df['y_pred'] = model.predict(self.df[x_cols])
+        detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh)
+        drift_state = []
+
+        for i, row in self.df.iterrows():
+            detector.update(y_true=row[y_col], y_pred=row['y_pred'])
+            drift_state.append(detector.drift_state)
+
+        self.df['drift_state'] = drift_state
+        return detector
+
+
     def test_hdddm_detector(self, cols, group_col=None, subsets=8):
         if not group_col:
             group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)]
 
         while group_col in cols:
             group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)]
@@ -361,5 +377,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'):
     file = 'souza_data/INSECTS-abrupt_balanced_norm.arff'
     tester = InjectionTesting(file)
     drift_cols = tester.inject_random_brownian_noise(10)
-    tester.test_ddm_detector()
+    tester.test_eddm_detector()
     print(tester.df['drift_state'].describe())
From c68a81f827d7edb68bf59e203448196b094c1061 Mon Sep 17 00:00:00 2001
From: Alex Isherwood
Date: Wed, 22 Feb 2023 18:31:37 -0500
Subject: [PATCH 18/35] stepd detector implementation

---
 menelaus/injection/injection_automation.py | 18 +++++++++++++++++
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py
index 30bbdeb4..8eb9f63f 100644
---
a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -184,15 +184,18 @@ def test_adwin_detector(self, model=None, x_cols=None, y_col=None): return detector - def test_cbdb_detector(self, cols, group_col=None, subsets=8): + def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = CDBD(subsets=subsets) detector.set_reference(reference_df) drift_state = [] @@ -236,15 +239,18 @@ def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30 return detector - def test_hdddm_detector(self, cols, group_col=None, subsets=8): + def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = HDDDM(subsets=subsets) detector.set_reference(reference_df) drift_state = [] @@ -256,15 +262,18 @@ def test_hdddm_detector(self, cols, group_col=None, subsets=8): return detector, drift_state - def test_kdq_tree_batch_detector(self, cols, group_col=None): + def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = KdqTreeBatch() detector.set_reference(reference_df) drift_state = [] @@ -306,13 +315,16 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): + def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + if not reference_group: + reference_group = self.df[group_col].min() + filtered_df = self.df.copy() for filter_col in filtered_df.columns: if filter_col != group_col and not 
pd.api.types.is_numeric_dtype(filtered_df[filter_col]): @@ -323,7 +335,7 @@ def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) - detector.set_reference(batches.pop(min(self.df[group_col]))) + detector.set_reference(batches.pop(reference_group)) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) From 3d3ba2259393f22f107a6ded8fb66818f3c5dd12 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 28 Feb 2023 18:24:35 -0500 Subject: [PATCH 20/35] md3 detector implementation --- menelaus/injection/injection_automation.py | 26 +++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 8a9e4933..6570d707 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -32,6 +32,7 @@ def select_random_classes(series): class InjectionTesting: def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): file_type = data_path.split('.')[-1] + self.seed = seed self.numeric_cols = [] self.categorical_cols = [] @@ -315,6 +316,29 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector + def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): + if not model: + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + + cols = x_cols.copy() + cols.append(y_col) + self.df['y_pred'] = model.predict(self.df[x_cols]) + detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels) + detector.set_reference(X=self.df[cols], target_name=y_col) + drift_state = [] + + for i, row in self.df.iterrows(): + if detector.waiting_for_oracle: + oracle_label = pd.DataFrame([row[cols]]) + detector.give_oracle_label(oracle_label) + + detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -405,5 +429,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_stepd_detector() + tester.test_md3_detector() print(tester.df['drift_state'].describe()) From cff5906ec5b7a88d51f366873dce4efb3787d7ed Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 28 Feb 2023 19:44:23 -0500 Subject: [PATCH 21/35] naive linear svc implementation --- menelaus/injection/injection_automation.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 6570d707..40606681 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -80,7 +80,7 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col - def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, 
limit_classes=None): + def train_classifier_model(self, model_type='svc', x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): if not x_cols or not y_col: y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] x_cols = self.numeric_cols.copy() @@ -93,7 +93,13 @@ def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, limit if limit_classes: self.df = self.df[self.df[y_col] < limit_classes] - model = sklearn.linear_model.LogisticRegression() + if model_type == 'svc': + model = sklearn.svm.SVC(kernel='linear') + elif model_type == 'logistic': + model = sklearn.linear_model.LogisticRegression() + else: + raise ValueError(f'Model type not supported: {model_type}') + start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] model.fit(train_df[x_cols], train_df[y_col]) @@ -210,7 +216,7 @@ def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) @@ -226,7 +232,7 @@ def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100 def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh) @@ -301,7 +307,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col, limit_classes=2) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, limit_classes=2) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, @@ -318,7 +324,7 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) cols = x_cols.copy() cols.append(y_col) @@ -382,7 +388,7 @@ def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=250): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = 
STEPD(window_size=window_size) From 98f58b6c398465c90aaf808020591b607881f998 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 1 Mar 2023 13:53:36 -0500 Subject: [PATCH 22/35] md3 detector bug fixing --- menelaus/injection/injection_automation.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 40606681..b095504d 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -44,15 +44,15 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non else: raise ValueError(f'Invalid file type: {file_type}') - if numeric_cols is None or categorical_cols is None: + if not numeric_cols or not categorical_cols: for col in self.df.columns: if pd.api.types.is_numeric_dtype(self.df[col]) and numeric_cols is None: self.numeric_cols.append(col) elif self.df[col].nunique() < len(self.df) and categorical_cols is None: self.categorical_cols.append(col) - if numeric_cols is not None: + if numeric_cols: self.numeric_cols = numeric_cols - if categorical_cols is not None: + if categorical_cols: self.categorical_cols = categorical_cols if seed: @@ -322,10 +322,14 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): + def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=None): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + + if not oracle_labels: + oracle_labels = self.df[y_col].nunique() + training_size = int(len(self.df) * end) cols = x_cols.copy() cols.append(y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) @@ -333,8 +337,8 @@ def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5 detector.set_reference(X=self.df[cols], target_name=y_col) drift_state = [] - for i, row in self.df.iterrows(): - if detector.waiting_for_oracle: + for i, row in self.df.iloc[training_size:len(self.df), ].iterrows(): + while detector.waiting_for_oracle: oracle_label = pd.DataFrame([row[cols]]) detector.give_oracle_label(oracle_label) From f655b8235b44836a96576d253a9d9bdae83f2d7c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 1 Mar 2023 17:06:17 -0500 Subject: [PATCH 23/35] md3 detector working --- menelaus/injection/injection_automation.py | 29 ++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index b095504d..46e155e0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -322,31 +322,35 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=None): + def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=1000): if not model: model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + retrain_model, _, _ = 
self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end)
-        if not oracle_labels:
-            oracle_labels = self.df[y_col].nunique()
-
-        training_size = int(len(self.df) * end)
+        end_train = int(end * len(self.df))
         cols = x_cols.copy()
         cols.append(y_col)
         self.df['y_pred'] = model.predict(self.df[x_cols])
+        self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols])
         detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels)
         detector.set_reference(X=self.df[cols], target_name=y_col)
         drift_state = []
 
-        for i, row in self.df.iloc[training_size:len(self.df), ].iterrows():
-            while detector.waiting_for_oracle:
+        for i, row in self.df.iloc[end_train:len(self.df), ].iterrows():
+            if detector.waiting_for_oracle:
                 oracle_label = pd.DataFrame([row[cols]])
                 detector.give_oracle_label(oracle_label)
 
-            detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred'])
-            drift_state.append(detector.drift_state)
+                if not detector.waiting_for_oracle:
+                    retrain_model.fit(detector.reference_batch_features, detector.reference_batch_target.values.ravel())
+                    self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols])
-        self.df['drift_state'] = drift_state
-        return detector
+                drift_state.append(detector.drift_state)
+            else:
+                detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred_retrain'])
+                drift_state.append(detector.drift_state)
+
+
return detector, drift_state - def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): + def test_nndvi_detector(self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + if cols: + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] if not reference_group: reference_group = self.df[group_col].min() @@ -382,12 +383,15 @@ def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2 return detector, status - def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): + def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='intersection'): + if not cols: + cols = self.numeric_cols.copy() + detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) drift_state = [] for i, row in self.df.iterrows(): - detector.update(row) + detector.update(row[cols]) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -442,5 +446,7 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if __name__ == '__main__': file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(10) - detector, drift = tester.test_md3_detector() + _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') + nndvi, status = tester.test_nndvi_detector(k_nn=1000, sampling_times=1000) + print(classes) + print(status) From 6bb1bb9289b76e1d6cd8ad6b16366c6526f4a62a Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 7 Mar 2023 14:38:34 -0500 Subject: [PATCH 25/35] testing --- menelaus/injection/injection_automation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index f387a18e..5a95fe42 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -444,9 +444,9 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if __name__ == '__main__': - file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' + file = 'souza_data/gassensor.arff' tester = InjectionTesting(file) _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') - nndvi, status = tester.test_nndvi_detector(k_nn=1000, sampling_times=1000) + nndvi, status = tester.test_nndvi_detector(k_nn=50, sampling_times=100) print(classes) print(status) From ac726479d4f03e918f65d99c84e1ca77f0ae0181 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 11:36:59 -0400 Subject: [PATCH 26/35] script reformatting --- menelaus/injection/injection_automation.py | 383 ++++++++++++++------- 1 file changed, 260 insertions(+), 123 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a95fe42..a26fa9e7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -18,7 +18,7 @@ def select_random_classes(series): classes = series.unique() if len(classes) < 2: - raise ValueError(f'Insufficient classes in series: {len(classes)}') + raise ValueError(f"Insufficient classes in series: 
{len(classes)}") else: class_a = classes[random.randint(0, len(classes) - 1)] class_b = classes[random.randint(0, len(classes) - 1)] @@ -31,18 +31,18 @@ def select_random_classes(series): class InjectionTesting: def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): - file_type = data_path.split('.')[-1] + file_type = data_path.split(".")[-1] self.seed = seed self.numeric_cols = [] self.categorical_cols = [] - if file_type == 'csv': + if file_type == "csv": self.df = pd.read_csv(data_path) - elif file_type == 'arff': + elif file_type == "arff": raw_data = loadarff(data_path) self.df = pd.DataFrame(raw_data[0]) else: - raise ValueError(f'Invalid file type: {file_type}') + raise ValueError(f"Invalid file type: {file_type}") if not numeric_cols or not categorical_cols: for col in self.df.columns: @@ -58,14 +58,12 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non if seed: random.seed(seed) - def select_rows(self, start, end): start_row = int(start * len(self.df)) end_row = int(end * len(self.df)) return [start_row, end_row] - def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): if not x_cols or not y_col: y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] @@ -74,40 +72,52 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): model = sklearn.linear_model.LinearRegression() start_train, end_train = self.select_rows(start, end) - train_df = self.df.iloc[start_train:end_train, ] + train_df = self.df.iloc[ + start_train:end_train, + ] model.fit(train_df[x_cols], train_df[y_col]) return model, x_cols, y_col - - def train_classifier_model(self, model_type='svc', x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): + def train_classifier_model( + self, + model_type="svc", + x_cols=None, + y_col=None, + start=0, + end=0.75, + limit_classes=None, + ): if not x_cols or not y_col: - y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + y_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] x_cols = self.numeric_cols.copy() encoder = sklearn.preprocessing.LabelEncoder() encoder.fit(self.df[y_col]) - self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) - y_col = f'{y_col}_encoded' + self.df[f"{y_col}_encoded"] = encoder.transform(self.df[y_col]) + y_col = f"{y_col}_encoded" if limit_classes: self.df = self.df[self.df[y_col] < limit_classes] - if model_type == 'svc': - model = sklearn.svm.SVC(kernel='linear') - elif model_type == 'logistic': + if model_type == "svc": + model = sklearn.svm.SVC(kernel="linear") + elif model_type == "logistic": model = sklearn.linear_model.LogisticRegression() else: - raise ValueError(f'Model type not supported: {model_type}') + raise ValueError(f"Model type not supported: {model_type}") start_train, end_train = self.select_rows(start, end) - train_df = self.df.iloc[start_train:end_train, ] + train_df = self.df.iloc[ + start_train:end_train, + ] model.fit(train_df[x_cols], train_df[y_col]) return model, x_cols, y_col - - def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): + def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -119,40 +129,69 @@ def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): return rand_cols - - def inject_random_class_manipulation(self, manipulation_type, start=.75, end=1, num_drift_cols=1): + def 
inject_random_class_manipulation( + self, manipulation_type, start=0.75, end=1, num_drift_cols=1 + ): rand_cols = [] all_rand_classes = [] start_drift, end_drift = self.select_rows(start, end) for i in range(num_drift_cols): - rand_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + rand_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] rand_cols.append(rand_col) rand_classes = select_random_classes(self.df[rand_col]) all_rand_classes.append(rand_classes) - if manipulation_type == 'class_swap': - self.df = class_manipulation.class_swap(self.df, rand_col, rand_classes[0], rand_classes[1], start_drift, end_drift) - elif manipulation_type == 'class_join': - new_label = f'{rand_classes[0]}_{rand_classes[1]}' - self.df = class_manipulation.class_join(self.df, rand_col, rand_classes[0], rand_classes[1], new_label, start_drift, end_drift) + if manipulation_type == "class_swap": + self.df = class_manipulation.class_swap( + self.df, + rand_col, + rand_classes[0], + rand_classes[1], + start_drift, + end_drift, + ) + elif manipulation_type == "class_join": + new_label = f"{rand_classes[0]}_{rand_classes[1]}" + self.df = class_manipulation.class_join( + self.df, + rand_col, + rand_classes[0], + rand_classes[1], + new_label, + start_drift, + end_drift, + ) else: - raise ValueError(f'Invalid class manipulation type: {manipulation_type}') + raise ValueError( + f"Invalid class manipulation type: {manipulation_type}" + ) return rand_cols, all_rand_classes - - def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): + def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): all_swap_cols = [] start_drift, end_drift = self.select_rows(start, end) for i in range(num_swaps): - col_type = self.numeric_cols if random.randint(0, 1) == 0 else self.categorical_cols + col_type = ( + self.numeric_cols + if random.randint(0, 1) == 0 + else self.categorical_cols + ) if len(col_type) < 2: - col_type = self.numeric_cols if col_type == self.categorical_cols else self.categorical_cols + col_type = ( + self.numeric_cols + if col_type == self.categorical_cols + else self.categorical_cols + ) if len(col_type) < 2: - raise ValueError('Insufficient numeric and categorical columns for swaps') + raise ValueError( + "Insufficient numeric and categorical columns for swaps" + ) col_a = col_type[random.randint(0, len(col_type) - 1)] col_b = col_type[random.randint(0, len(col_type) - 1)] @@ -162,41 +201,46 @@ def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.feature_swap(self.df, col_a, col_b, start_drift, end_drift) + self.df = feature_manipulation.feature_swap( + self.df, col_a, col_b, start_drift, end_drift + ) return all_swap_cols - def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.feature_hide_and_sample(self.df, rand_col, sample_size) + self.df = feature_manipulation.feature_hide_and_sample( + self.df, rand_col, sample_size + ) return rand_col - def test_adwin_detector(self, model=None, x_cols=None, y_col=None): if not model: model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) - self.df['y_pred'] = model.predict(self.df[x_cols]) + self.df["y_pred"] = model.predict(self.df[x_cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): 
- detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -213,45 +257,76 @@ def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets return detector, drift_state - - def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): + def test_ddm_detector( + self, + model=None, + x_cols=None, + y_col=None, + n_threshold=100, + warning_scale=7, + drift_scale=10, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = DDM( + n_threshold=n_threshold, + warning_scale=warning_scale, + drift_scale=drift_scale, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5): + def test_eddm_detector( + self, + model=None, + x_cols=None, + y_col=None, + n_threshold=30, + warning_thresh=0.7, + drift_thresh=0.5, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = EDDM( + n_threshold=n_threshold, + warning_thresh=warning_thresh, + drift_thresh=drift_thresh, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subsets=8): + def test_hdddm_detector( + self, cols, group_col=None, reference_group=None, subsets=8 + ): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) 
+ ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -268,13 +343,16 @@ def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subset return detector, drift_state - def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -291,8 +369,9 @@ def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=Non return detector, drift_state - - def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): + def test_kdq_tree_streaming_detector( + self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50 + ): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -300,90 +379,151 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo detector.update(row[cols]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, - detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): + def test_lfr_detector( + self, + model=None, + x_cols=None, + y_col=None, + time_decay_factor=0.6, + warning_level=0.01, + detect_level=0.001, + num_mc=5000, + burn_in=10, + subsample=10, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, limit_classes=2) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, - num_mc=num_mc, burn_in=burn_in, subsample=subsample) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, limit_classes=2 + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = LinearFourRates( + time_decay_factor=time_decay_factor, + warning_level=warning_level, + detect_level=detect_level, + num_mc=num_mc, + burn_in=burn_in, + subsample=subsample, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=1000): + def test_md3_detector( + self, + model=None, + x_cols=None, + y_col=None, + start=0, + end=0.75, + sensitivity=1.5, + oracle_labels=1000, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) - 
retrain_model, _, _ = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, start=start, end=end + ) + retrain_model, _, _ = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, start=start, end=end + ) end_train = int(end * len(self.df)) cols = x_cols.copy() cols.append(y_col) - self.df['y_pred'] = model.predict(self.df[x_cols]) - self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols]) - detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels) + self.df["y_pred"] = model.predict(self.df[x_cols]) + self.df["y_pred_retrain"] = retrain_model.predict(self.df[x_cols]) + detector = MD3( + clf=model, + sensitivity=sensitivity, + oracle_data_length_required=oracle_labels, + ) detector.set_reference(X=self.df[cols], target_name=y_col) drift_state = [] - for i, row in self.df.iloc[end_train:len(self.df), ].iterrows(): + for i, row in self.df.iloc[ + end_train : len(self.df), + ].iterrows(): if detector.waiting_for_oracle: oracle_label = pd.DataFrame([row[cols]]) detector.give_oracle_label(oracle_label) if not detector.waiting_for_oracle: - retrain_model.fit(detector.reference_batch_features, detector.reference_batch_target.values.ravel()) - self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols]) + retrain_model.fit( + detector.reference_batch_features, + detector.reference_batch_target.values.ravel(), + ) + self.df["y_pred_retrain"] = retrain_model.predict(self.df[x_cols]) drift_state.append(detector.drift_state) else: - detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred_retrain']) + detector.update( + X=pd.DataFrame([row[x_cols]]), + y_true=row[y_col], + y_pred=row["y_pred_retrain"], + ) drift_state.append(detector.drift_state) return detector, drift_state - - def test_nndvi_detector(self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50): + def test_nndvi_detector( + self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50 + ): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if cols: while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() filtered_df = self.df.copy() for filter_col in filtered_df.columns: - if filter_col != group_col and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + if filter_col != group_col and not pd.api.types.is_numeric_dtype( + filtered_df[filter_col] + ): filtered_df.drop(columns=filter_col, inplace=True) grouped_df = filtered_df.groupby(group_col) - status = pd.DataFrame(columns=[group_col, 'drift']) - batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} + status = pd.DataFrame(columns=[group_col, "drift"]) + batches = { + group_id: group.sample(frac=0.1).drop(columns=group_col).values + for group_id, group in grouped_df + } detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) detector.set_reference(batches.pop(reference_group)) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) - status = 
pd.concat([status, pd.DataFrame({group_col: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + status = pd.concat( + [ + status, + pd.DataFrame( + {group_col: [group_id], "drift": [detector.drift_state]} + ), + ], + ignore_index=True, + ) return detector, status - - def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='intersection'): + def test_pcacd_detector( + self, cols=None, window_size=50, divergence_metric="intersection" + ): if not cols: cols = self.numeric_cols.copy() @@ -394,27 +534,27 @@ def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='inte detector.update(row[cols]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=250): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) - self.df['y_pred'] = model.predict(self.df[x_cols]) + self.df["y_pred"] = model.predict(self.df[x_cols]) detector = STEPD(window_size=window_size) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): + def plot_drift_scatter(self, cols, output_file="plots/drift_scatter_test.png"): plt.figure(figsize=(20, 6)) y_min = None y_max = None @@ -429,24 +569,21 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if y_max is None or y_max < local_max: y_max = local_max - plt.grid(False, axis='x') + plt.grid(False, axis="x") plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.title('Scatter Results', fontsize=22) - plt.xlabel('Index', fontsize=18) - plt.ylabel('Value', fontsize=18) + plt.title("Scatter Results", fontsize=22) + plt.xlabel("Index", fontsize=18) + plt.ylabel("Value", fontsize=18) plt.ylim((y_min, y_max)) - plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') + plt.vlines( + x=self.df[self.df["drift_state"] == "drift"].index, + ymin=y_min, + ymax=y_max, + label="Drift Detected", + color="red", + ) plt.legend() os.makedirs(os.path.dirname(output_file), exist_ok=True) plt.savefig(output_file) - - -if __name__ == '__main__': - file = 'souza_data/gassensor.arff' - tester = InjectionTesting(file) - _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') - nndvi, status = tester.test_nndvi_detector(k_nn=50, sampling_times=100) - print(classes) - print(status) From b69371e24df09087082bade612a5acd6d4b5a8a5 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 14:20:22 -0400 Subject: [PATCH 27/35] fixing injector functions for class modifications --- menelaus/injection/injection_automation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index a26fa9e7..8d3f6dd0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -9,7 +9,7 @@ from menelaus.data_drift import PCACD, 
KdqTreeStreaming, KdqTreeBatch, NNDVI from menelaus.data_drift.cdbd import CDBD from menelaus.data_drift.hdddm import HDDDM -import class_manipulation +import label_manipulation import feature_manipulation import noise @@ -125,7 +125,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.brownian_noise(self.df, rand_col, x, start_drift, end_drift) + self.df = noise.BrownianNoiseInjector.__call__(self.df, rand_col, x, start_drift, end_drift) return rand_cols @@ -145,7 +145,7 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = class_manipulation.class_swap( + self.df = label_manipulation.LabelSwapInjector().__call__( self.df, rand_col, rand_classes[0], @@ -155,7 +155,7 @@ def inject_random_class_manipulation( ) elif manipulation_type == "class_join": new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = class_manipulation.class_join( + self.df = label_manipulation.LabelJoinInjector().__call__( self.df, rand_col, rand_classes[0], @@ -201,7 +201,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.feature_swap( + self.df = feature_manipulation.FeatureSwapInjector().__call__( self.df, col_a, col_b, start_drift, end_drift ) @@ -210,7 +210,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.feature_hide_and_sample( + self.df = feature_manipulation.FeatureCoverInjector().__call__( self.df, rand_col, sample_size ) From 8ae9d929c2c0aa9649e6a203f2effe84ca6f1726 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 19:46:12 -0400 Subject: [PATCH 28/35] bug fixing --- menelaus/injection/injection_automation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 8d3f6dd0..353cc055 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -125,7 +125,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.BrownianNoiseInjector.__call__(self.df, rand_col, x, start_drift, end_drift) + self.df = noise.BrownianNoiseInjector(self.df, rand_col, x, start_drift, end_drift) return rand_cols @@ -145,7 +145,7 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = label_manipulation.LabelSwapInjector().__call__( + self.df = label_manipulation.LabelSwapInjector( self.df, rand_col, rand_classes[0], @@ -155,7 +155,7 @@ def inject_random_class_manipulation( ) elif manipulation_type == "class_join": new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = label_manipulation.LabelJoinInjector().__call__( + self.df = label_manipulation.LabelJoinInjector( self.df, rand_col, rand_classes[0], @@ -201,7 +201,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = 
feature_manipulation.FeatureSwapInjector().__call__( + self.df = feature_manipulation.FeatureSwapInjector( self.df, col_a, col_b, start_drift, end_drift ) @@ -210,7 +210,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.FeatureCoverInjector().__call__( + self.df = feature_manipulation.FeatureCoverInjector( self.df, rand_col, sample_size ) From 9f3c4fdd655974d39d6e8faca417de4fb0dd9d95 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Thu, 16 Mar 2023 16:09:58 -0400 Subject: [PATCH 29/35] injection automation testing --- menelaus/injection/injection_automation.py | 41 +++++++++++-------- .../injection/test_injection_automation.py | 17 ++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 tests/menelaus/injection/test_injection_automation.py diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 353cc055..34a309df 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -9,9 +9,9 @@ from menelaus.data_drift import PCACD, KdqTreeStreaming, KdqTreeBatch, NNDVI from menelaus.data_drift.cdbd import CDBD from menelaus.data_drift.hdddm import HDDDM -import label_manipulation -import feature_manipulation -import noise +from menelaus.injection import label_manipulation +from menelaus.injection import feature_manipulation +from menelaus.injection import noise def select_random_classes(series): @@ -30,19 +30,23 @@ def select_random_classes(series): class InjectionTesting: - def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): - file_type = data_path.split(".")[-1] + def __init__(self, data, seed=None, numeric_cols=None, categorical_cols=None): self.seed = seed self.numeric_cols = [] self.categorical_cols = [] - if file_type == "csv": - self.df = pd.read_csv(data_path) - elif file_type == "arff": - raw_data = loadarff(data_path) - self.df = pd.DataFrame(raw_data[0]) + if isinstance(data, pd.DataFrame): + self.df = data.copy() else: - raise ValueError(f"Invalid file type: {file_type}") + file_type = data.split(".")[-1] + + if file_type == "csv": + self.df = pd.read_csv(data) + elif file_type == "arff": + raw_data = loadarff(data) + self.df = pd.DataFrame(raw_data[0]) + else: + raise ValueError(f"Invalid file type: {file_type}") if not numeric_cols or not categorical_cols: for col in self.df.columns: @@ -118,6 +122,7 @@ def train_classifier_model( return model, x_cols, y_col def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): + injector = noise.BrownianNoiseInjector() rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -125,7 +130,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.BrownianNoiseInjector(self.df, rand_col, x, start_drift, end_drift) + self.df = injector(self.df, start_drift, end_drift, rand_col, x) return rand_cols @@ -145,7 +150,8 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = label_manipulation.LabelSwapInjector( + injector = label_manipulation.LabelSwapInjector() + self.df = injector( self.df, rand_col, rand_classes[0], @@ 
-154,8 +160,9 @@ def inject_random_class_manipulation( end_drift, ) elif manipulation_type == "class_join": + injector = label_manipulation.LabelJoinInjector() new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = label_manipulation.LabelJoinInjector( + self.df = injector( self.df, rand_col, rand_classes[0], @@ -172,6 +179,7 @@ def inject_random_class_manipulation( return rand_cols, all_rand_classes def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): + injector = feature_manipulation.FeatureSwapInjector() all_swap_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -201,16 +209,17 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.FeatureSwapInjector( + self.df = injector( self.df, col_a, col_b, start_drift, end_drift ) return all_swap_cols def inject_random_feature_hide_and_sample(self): + injector = feature_manipulation.FeatureCoverInjector() rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.FeatureCoverInjector( + self.df = injector( self.df, rand_col, sample_size ) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py new file mode 100644 index 00000000..2f33b91d --- /dev/null +++ b/tests/menelaus/injection/test_injection_automation.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd +from menelaus.datasets import fetch_circle_data, fetch_rainfall_data +from menelaus.injection.injection_automation import InjectionTesting + + +def test_brownian_injection(): + df = pd.DataFrame(np.random.rand(100, 5), columns=['a', 'b', 'c', 'd', 'e']) + tester = InjectionTesting(df) + start = 0.75 + end = 1 + + col = tester.inject_random_brownian_noise(50, start=start, end=end, num_drift_cols=1) + std_normal = tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ] + std_drift = tester.df.iloc[int(start * len(df)) + 1 : int(end * len(df)), ][col].std().iloc[0, ] + + assert(std_drift > std_normal) From b5c3d53c898f1a71c8700e0841f381760073dad3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Mar 2023 13:23:47 -0400 Subject: [PATCH 30/35] class manipulation tests --- menelaus/injection/injection_automation.py | 16 ++++----- .../injection/test_injection_automation.py | 35 +++++++++++++++---- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 34a309df..38a2c4ab 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -153,23 +153,23 @@ def inject_random_class_manipulation( injector = label_manipulation.LabelSwapInjector() self.df = injector( self.df, + start_drift, + end_drift, rand_col, rand_classes[0], rand_classes[1], - start_drift, - end_drift, ) elif manipulation_type == "class_join": injector = label_manipulation.LabelJoinInjector() new_label = f"{rand_classes[0]}_{rand_classes[1]}" self.df = injector( self.df, + start_drift, + end_drift, rand_col, rand_classes[0], rand_classes[1], new_label, - start_drift, - end_drift, ) else: raise ValueError( @@ -209,9 +209,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = injector( - self.df, col_a, col_b, start_drift, end_drift - ) + self.df = injector(self.df, 
col_a, col_b, start_drift, end_drift) return all_swap_cols @@ -219,9 +217,7 @@ def inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = injector( - self.df, rand_col, sample_size - ) + self.df = injector(self.df, rand_col, sample_size) return rand_col diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 2f33b91d..1f1c2b3d 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -1,17 +1,40 @@ import numpy as np import pandas as pd -from menelaus.datasets import fetch_circle_data, fetch_rainfall_data from menelaus.injection.injection_automation import InjectionTesting -def test_brownian_injection(): - df = pd.DataFrame(np.random.rand(100, 5), columns=['a', 'b', 'c', 'd', 'e']) +def test_brownian_noise(): + df = pd.DataFrame(np.random.rand(100, 5), columns=["a", "b", "c", "d", "e"]) tester = InjectionTesting(df) start = 0.75 end = 1 col = tester.inject_random_brownian_noise(50, start=start, end=end, num_drift_cols=1) - std_normal = tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ] - std_drift = tester.df.iloc[int(start * len(df)) + 1 : int(end * len(df)), ][col].std().iloc[0, ] + std_normal = (tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ]) + std_drift = (tester.df.iloc[int(start * len(df)) + 1:int(end * len(df)), ][col].std().iloc[0, ]) - assert(std_drift > std_normal) + assert std_drift > std_normal + + +def test_class_manipulation(): + df = pd.DataFrame(np.random.choice(a=["a", "b", "c"], size=100, p=[0.4, 0.3, 0.3])) + swap_tester = InjectionTesting(df) + join_tester = InjectionTesting(df) + start = 0 + end = 1 + + cols, all_swap_classes = swap_tester.inject_random_class_manipulation( + manipulation_type="class_swap", start=start, end=end + ) + col = cols[0] + swap_classes = all_swap_classes[0] + + assert len(df[df[col] == swap_classes[0]]) == len(swap_tester.df[swap_tester.df[col] == swap_classes[1]]) + assert len(df[df[col] == swap_classes[1]]) == len(swap_tester.df[swap_tester.df[col] == swap_classes[0]]) + + cols, all_join_classes = join_tester.inject_random_class_manipulation(manipulation_type="class_join", start=start, end=end) + col = cols[0] + join_classes = all_join_classes[0] + + assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 + assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 From de44802a69843c2cb08176cb21ecbb6d0a3caf77 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Mar 2023 19:33:25 -0400 Subject: [PATCH 31/35] feature injection tests --- menelaus/injection/injection_automation.py | 6 +++-- .../injection/test_injection_automation.py | 23 ++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 38a2c4ab..cacef7cf 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -209,13 +209,15 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = injector(self.df, col_a, col_b, start_drift, end_drift) + self.df = injector(self.df, start_drift, end_drift, col_a, col_b) return all_swap_cols def 
inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() - rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] + rand_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] sample_size = min(self.df[rand_col].value_counts()) self.df = injector(self.df, rand_col, sample_size) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 1f1c2b3d..6a21c079 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -17,7 +17,7 @@ def test_brownian_noise(): def test_class_manipulation(): - df = pd.DataFrame(np.random.choice(a=["a", "b", "c"], size=100, p=[0.4, 0.3, 0.3])) + df = pd.DataFrame(np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3])) swap_tester = InjectionTesting(df) join_tester = InjectionTesting(df) start = 0 @@ -38,3 +38,24 @@ def test_class_manipulation(): assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 + +def test_feature_swap(): + df = pd.DataFrame() + df['a'] = [0] * 100 + df['b'] = [1] * 100 + tester = InjectionTesting(df) + start = 0.75 + end = 1 + + tester.inject_random_feature_swap(start=start, end=end) + assert(tester.df['a'].sum() == 25) + assert(tester.df['b'].sum() == 75) + +def test_feature_hide_and_sample(): + df = pd.DataFrame() + df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) + df['b'] = np.random.rand(100, 1) + tester = InjectionTesting(df) + + tester.inject_random_feature_hide_and_sample() + assert(len(tester.df) < len(df)) From 4ca0089517ffaadefcd46fd8c14c0999323ae779 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 12:49:44 -0400 Subject: [PATCH 32/35] basic detector tests --- .../injection/test_injection_automation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 6a21c079..450af2a7 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -39,6 +39,7 @@ def test_class_manipulation(): assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 + def test_feature_swap(): df = pd.DataFrame() df['a'] = [0] * 100 @@ -51,6 +52,7 @@ def test_feature_swap(): assert(tester.df['a'].sum() == 25) assert(tester.df['b'].sum() == 75) + def test_feature_hide_and_sample(): df = pd.DataFrame() df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) @@ -59,3 +61,24 @@ def test_feature_hide_and_sample(): tester.inject_random_feature_hide_and_sample() assert(len(tester.df) < len(df)) + + +def test_detectors(): + df = pd.DataFrame() + df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) + df['b'] = np.random.rand(100, 1) + df['c'] = np.random.rand(100, 1) + tester = InjectionTesting(df) + + tester.test_adwin_detector() + tester.test_cbdb_detector(cols=['b']) + tester.test_ddm_detector() + tester.test_eddm_detector() + tester.test_hdddm_detector(cols=['b']) + tester.test_kdq_tree_batch_detector(cols=['b']) + tester.test_kdq_tree_streaming_detector(cols=['b']) + tester.test_lfr_detector() + tester.test_md3_detector() + tester.test_nndvi_detector() + 
tester.test_pcacd_detector() + tester.test_stepd_detector() From 29fa809f55172d10939d52516f871d983314ee4b Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 13:43:48 -0400 Subject: [PATCH 33/35] switching to numpy random number generator --- menelaus/injection/injection_automation.py | 44 +++++++++---------- .../injection/test_injection_automation.py | 32 ++++++++------ 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index cacef7cf..c553a622 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,7 +1,7 @@ import matplotlib.pyplot as plt +import numpy as np import os import pandas as pd -import random import sklearn from scipy.io.arff import loadarff @@ -20,11 +20,11 @@ def select_random_classes(series): if len(classes) < 2: raise ValueError(f"Insufficient classes in series: {len(classes)}") else: - class_a = classes[random.randint(0, len(classes) - 1)] - class_b = classes[random.randint(0, len(classes) - 1)] + class_a = classes[np.random.randint(0, len(classes))] + class_b = classes[np.random.randint(0, len(classes))] while class_a == class_b: - class_b = classes[random.randint(0, len(classes) - 1)] + class_b = classes[np.random.randint(0, len(classes))] return [class_a, class_b] @@ -60,7 +60,7 @@ def __init__(self, data, seed=None, numeric_cols=None, categorical_cols=None): self.categorical_cols = categorical_cols if seed: - random.seed(seed) + np.random.seed(seed) def select_rows(self, start, end): start_row = int(start * len(self.df)) @@ -70,7 +70,7 @@ def select_rows(self, start, end): def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): if not x_cols or not y_col: - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + y_col = self.numeric_cols[np.random.randint(0, len(self.numeric_cols))] x_cols = self.numeric_cols.copy() x_cols.remove(y_col) @@ -94,7 +94,7 @@ def train_classifier_model( ): if not x_cols or not y_col: y_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] x_cols = self.numeric_cols.copy() @@ -127,7 +127,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): start_drift, end_drift = self.select_rows(start, end) for i in range(num_drift_cols): - rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + rand_col = self.numeric_cols[np.random.randint(0, len(self.numeric_cols))] rand_cols.append(rand_col) self.df = injector(self.df, start_drift, end_drift, rand_col, x) @@ -143,7 +143,7 @@ def inject_random_class_manipulation( for i in range(num_drift_cols): rand_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] rand_cols.append(rand_col) rand_classes = select_random_classes(self.df[rand_col]) @@ -186,7 +186,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): for i in range(num_swaps): col_type = ( self.numeric_cols - if random.randint(0, 1) == 0 + if np.random.randint(0, 1) == 0 else self.categorical_cols ) @@ -201,11 +201,11 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): "Insufficient numeric and categorical columns for swaps" ) - col_a = col_type[random.randint(0, len(col_type) - 1)] - col_b = col_type[random.randint(0, len(col_type) - 1)] + col_a = col_type[np.random.randint(0, len(col_type))] 
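# NOTE on the hunk above (editorial annotation, not part of the commit):
# np.random.randint excludes its upper bound, unlike random.randint, so the
# replacement `if np.random.randint(0, 1) == 0` always evaluates to True and
# always selects numeric_cols; `np.random.randint(0, 2)` would preserve the
# original 50/50 choice between numeric and categorical column types.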
+ col_b = col_type[np.random.randint(0, len(col_type))] while col_a == col_b: - col_b = col_type[random.randint(0, len(col_type) - 1)] + col_b = col_type[np.random.randint(0, len(col_type))] swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) @@ -216,7 +216,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() rand_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] sample_size = min(self.df[rand_col].value_counts()) self.df = injector(self.df, rand_col, sample_size) @@ -241,12 +241,12 @@ def test_adwin_detector(self, model=None, x_cols=None, y_col=None): def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -327,12 +327,12 @@ def test_hdddm_detector( ): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -353,12 +353,12 @@ def test_hdddm_detector( def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -485,13 +485,13 @@ def test_nndvi_detector( ): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if cols: while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 450af2a7..4f0e93b3 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -69,16 +69,22 @@ def test_detectors(): df['b'] = np.random.rand(100, 1) df['c'] = np.random.rand(100, 1) tester = InjectionTesting(df) - - tester.test_adwin_detector() - tester.test_cbdb_detector(cols=['b']) - tester.test_ddm_detector() - tester.test_eddm_detector() - tester.test_hdddm_detector(cols=['b']) - tester.test_kdq_tree_batch_detector(cols=['b']) - tester.test_kdq_tree_streaming_detector(cols=['b']) - tester.test_lfr_detector() - tester.test_md3_detector() - tester.test_nndvi_detector() - tester.test_pcacd_detector() - tester.test_stepd_detector() + failed = False + + try: + tester.test_adwin_detector() + tester.test_cbdb_detector(cols=['b']) + tester.test_ddm_detector() + tester.test_eddm_detector() + tester.test_hdddm_detector(cols=['b']) + 
tester.test_kdq_tree_batch_detector(cols=['b']) + tester.test_kdq_tree_streaming_detector(cols=['b']) + tester.test_lfr_detector() + tester.test_md3_detector() + tester.test_nndvi_detector() + tester.test_pcacd_detector() + tester.test_stepd_detector() + except Exception as e: + failed = True + + assert(failed is False) From 41614986a1e28f3ff84258dc5bf400c8dbf5be3a Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 14:51:50 -0400 Subject: [PATCH 34/35] scatter plot testing --- .gitignore | 1 + tests/menelaus/injection/test_injection_automation.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0f6395fd..27247819 100644 --- a/.gitignore +++ b/.gitignore @@ -22,5 +22,6 @@ menelaus/injection/plots examples/*.png menelaus/*.png tests/*.png +tests/menelaus/injection/plots *.tox* diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 4f0e93b3..defebe55 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -84,6 +84,7 @@ def test_detectors(): tester.test_nndvi_detector() tester.test_pcacd_detector() tester.test_stepd_detector() + tester.plot_drift_scatter(cols=['b']) except Exception as e: failed = True From f7a261bacfabcca6126b47c2f165d0ba6d10ced4 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 24 Mar 2023 14:37:26 -0400 Subject: [PATCH 35/35] type coercing plot inputs and testing --- .gitignore | 3 +- menelaus/injection/injection_automation.py | 68 ++++++++++--------- .../injection/test_injection_automation.py | 12 ++-- 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 27247819..7422e0d9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ _build *.DS_Store .idea/ menelaus/injection/souza_data -menelaus/injection/plots +menelaus/injection/sample_scripts +plots # Images diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index c553a622..c1ef35cd 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -562,35 +562,41 @@ def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=2 return detector def plot_drift_scatter(self, cols, output_file="plots/drift_scatter_test.png"): - plt.figure(figsize=(20, 6)) - y_min = None - y_max = None - - for col in cols: - plt.scatter(self.df.index, self.df[col], label=col) - local_min = self.df[col].min() - local_max = self.df[col].max() - - if y_min is None or y_min > local_min: - y_min = local_min - if y_max is None or y_max < local_max: - y_max = local_max - - plt.grid(False, axis="x") - plt.xticks(fontsize=16) - plt.yticks(fontsize=16) - plt.title("Scatter Results", fontsize=22) - plt.xlabel("Index", fontsize=18) - plt.ylabel("Value", fontsize=18) - plt.ylim((y_min, y_max)) - plt.vlines( - x=self.df[self.df["drift_state"] == "drift"].index, - ymin=y_min, - ymax=y_max, - label="Drift Detected", - color="red", - ) - plt.legend() + if hasattr(cols, '__iter__'): + if isinstance(cols, str): + cols = [cols] + + plt.figure(figsize=(20, 6)) + y_min = None + y_max = None + + for col in cols: + plt.scatter(self.df.index, self.df[col], label=col) + local_min = self.df[col].min() + local_max = self.df[col].max() + + if y_min is None or y_min > local_min: + y_min = local_min + if y_max is None or y_max < local_max: + y_max = local_max + + plt.grid(False, axis="x") + plt.xticks(fontsize=16) + 
plt.yticks(fontsize=16) + plt.title("Scatter Results", fontsize=22) + plt.xlabel("Index", fontsize=18) + plt.ylabel("Value", fontsize=18) + plt.ylim((y_min, y_max)) + plt.vlines( + x=self.df[self.df["drift_state"] == "drift"].index, + ymin=y_min, + ymax=y_max, + label="Drift Detected", + color="red", + ) + plt.legend() - os.makedirs(os.path.dirname(output_file), exist_ok=True) - plt.savefig(output_file) + os.makedirs(os.path.dirname(output_file), exist_ok=True) + plt.savefig(output_file) + else: + raise TypeError(f'Variable cols must be an iterable object or string') diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index defebe55..2a650eba 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -5,7 +5,7 @@ def test_brownian_noise(): df = pd.DataFrame(np.random.rand(100, 5), columns=["a", "b", "c", "d", "e"]) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=2) start = 0.75 end = 1 @@ -18,8 +18,8 @@ def test_brownian_noise(): def test_class_manipulation(): df = pd.DataFrame(np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3])) - swap_tester = InjectionTesting(df) - join_tester = InjectionTesting(df) + swap_tester = InjectionTesting(df, seed=3) + join_tester = InjectionTesting(df, seed=5) start = 0 end = 1 @@ -44,7 +44,7 @@ def test_feature_swap(): df = pd.DataFrame() df['a'] = [0] * 100 df['b'] = [1] * 100 - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=7) start = 0.75 end = 1 @@ -57,7 +57,7 @@ def test_feature_hide_and_sample(): df = pd.DataFrame() df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) df['b'] = np.random.rand(100, 1) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=11) tester.inject_random_feature_hide_and_sample() assert(len(tester.df) < len(df)) @@ -68,7 +68,7 @@ def test_detectors(): df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) df['b'] = np.random.rand(100, 1) df['c'] = np.random.rand(100, 1) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=13) failed = False try:
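# For context, a minimal end-to-end use of the InjectionTesting class that these
# patches build up might look like the sketch below. It follows the constructor
# added in PATCH 29 (which accepts a DataFrame directly) and the method
# signatures from the reformatted script in PATCH 26; the column names, noise
# magnitude, window size, and output path here are illustrative only.
import numpy as np
import pandas as pd

from menelaus.injection.injection_automation import InjectionTesting

# Build a small all-numeric frame and seed the tester for reproducibility.
df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"])
tester = InjectionTesting(df, seed=42)

# Inject Brownian noise into one randomly chosen numeric column over the last
# quarter of the rows, run a streaming detector over that column, then plot
# the rows where drift was flagged.
drift_cols = tester.inject_random_brownian_noise(x=10, start=0.75, end=1)
tester.test_kdq_tree_streaming_detector(cols=drift_cols, window_size=200)
tester.plot_drift_scatter(cols=drift_cols, output_file="plots/brownian_example.png")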