From ef3ec02d5073c42a1a274de5c9148b7789b9eee0 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 31 Jan 2023 15:41:42 -0500 Subject: [PATCH 01/35] initial injection automation script --- menelaus/injection/injection_automation.py | 154 +++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 menelaus/injection/injection_automation.py diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py new file mode 100644 index 00000000..408d126c --- /dev/null +++ b/menelaus/injection/injection_automation.py @@ -0,0 +1,154 @@ +import pandas as pd +import random +from scipy.io.arff import loadarff + +from menelaus.concept_drift import LinearFourRates, ADWINAccuracy, DDM, EDDM, STEPD, MD3 +from menelaus.data_drift import PCACD, KdqTreeStreaming, KdqTreeBatch, NNDVI +import class_manipulation +import feature_manipulation +import noise + + +def select_random_classes(series): + classes = series.unique() + + if len(classes) < 2: + raise ValueError(f'Insufficient classes in series: {len(classes)}') + else: + class_a = classes[random.randint(0, len(classes) - 1)] + class_b = classes[random.randint(0, len(classes) - 1)] + + while class_a == class_b: + class_b = classes[random.randint(0, len(classes) - 1)] + + return [class_a, class_b] + + +class InjectionTesting: + def __init__(self, data_path, seed=None): + file_type = data_path.split('.')[-1] + self.numeric_cols = [] + self.categorical_cols = [] + + if file_type == 'csv': + self.df = pd.read_csv(data_path) + elif file_type == 'arff': + raw_data = loadarff(data_path) + self.df = pd.DataFrame(raw_data[0]) + else: + raise ValueError(f'Invalid file type: {file_type}') + + for col in self.df.columns: + if pd.api.types.is_numeric_dtype(self.df[col]): + self.numeric_cols.append(col) + elif self.df[col].nunique() < len(self.df): + self.categorical_cols.append(col) + + if seed: + random.seed(seed) + + + def select_rows(self, start, end): + start_drift = int(start * len(self.df)) + end_drift = int(end * len(self.df)) + + return [start_drift, end_drift] + + + def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): + rand_cols = [] + start_drift, end_drift = self.select_rows(start, end) + + for i in range(num_drift_cols): + rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + rand_cols.append(rand_col) + + self.df = noise.brownian_noise(self.df, rand_col, x, start_drift, end_drift) + + return rand_cols + + + def inject_random_class_manipulation(self, manipulation_type, start=.75, end=1, num_drift_cols=1): + rand_cols = [] + all_rand_classes = [] + start_drift, end_drift = self.select_rows(start, end) + + for i in range(num_drift_cols): + rand_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + rand_cols.append(rand_col) + rand_classes = select_random_classes(self.df[rand_col]) + all_rand_classes.append(rand_classes) + + if manipulation_type == 'class_swap': + self.df = class_manipulation.class_swap(self.df, rand_col, rand_classes[0], rand_classes[1], start_drift, end_drift) + elif manipulation_type == 'class_join': + new_label = f'{rand_classes[0]}_{rand_classes[1]}' + self.df = class_manipulation.class_join(self.df, rand_col, rand_classes[0], rand_classes[1], new_label, start_drift, end_drift) + else: + raise ValueError(f'Invalid class manipulation type: {manipulation_type}') + + return rand_cols, all_rand_classes + + + def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): + all_swap_cols = [] + start_drift, 
end_drift = self.select_rows(start, end) + + for i in range(num_swaps): + col_type = self.numeric_cols if random.randint(0, 1) == 0 else self.categorical_cols + + if len(col_type) < 2: + col_type = self.numeric_cols if col_type == self.categorical_cols else self.categorical_cols + if len(col_type) < 2: + raise ValueError('Insufficient numeric and categorical columns for swaps') + + col_a = col_type[random.randint(0, len(col_type) - 1)] + col_b = col_type[random.randint(0, len(col_type) - 1)] + + while col_a == col_b: + col_b = col_type[random.randint(0, len(col_type) - 1)] + + swap_cols = [col_a, col_b] + all_swap_cols.append(swap_cols) + self.df = feature_manipulation.feature_swap(self.df, col_a, col_b, start_drift, end_drift) + + return all_swap_cols + + + def inject_random_feature_hide_and_sample(self): + rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] + sample_size = min(self.df[rand_col].value_counts()) + self.df = feature_manipulation.feature_hide_and_sample(self.df, rand_col, sample_size) + + return rand_col + + + def test_adwin_detector(self, col): + detector = ADWINAccuracy() + + for i, row in self.df.iterrows(): + detector.update(X=None, y_true=row[col], y_pred=0) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): + detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) + + for i, row in self.df.iterrows(): + detector.update(row[cols]) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + + def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): + detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) + + for i, row in self.df.iterrows(): + detector.update(row) + assert detector.drift_state != 'drift', f'Drift detected in row {i}' + + +if __name__ == '__main__': + file = 'souza_data/gassensor.arff' + tester = InjectionTesting(file) + drift_cols = tester.inject_random_brownian_noise(10) + tester.test_pcacd_detector() From 0761bde7336a0922fcf769f67828becd9dfe21b8 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 31 Jan 2023 15:59:23 -0500 Subject: [PATCH 02/35] adding souza data locally only --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 734db991..2862e2f2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ _build *.coverage *.DS_Store .idea/ +menelaus/injection/souza_data # Images From 40b5a55100205bbaf684b5d5bba929af05d75bb3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Thu, 2 Feb 2023 16:07:33 -0500 Subject: [PATCH 03/35] implementing drift detection scatter plot visualization --- .gitignore | 1 + menelaus/injection/injection_automation.py | 46 ++++++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2862e2f2..0f6395fd 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ _build *.DS_Store .idea/ menelaus/injection/souza_data +menelaus/injection/plots # Images diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 408d126c..3e4b42b3 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,3 +1,4 @@ +import matplotlib.pyplot as plt import pandas as pd import random from scipy.io.arff import loadarff @@ -125,30 +126,67 @@ def inject_random_feature_hide_and_sample(self): def 
test_adwin_detector(self, col): detector = ADWINAccuracy() + drift_state = [] for i, row in self.df.iterrows(): detector.update(X=None, y_true=row[col], y_pred=0) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) + drift_state = [] for i, row in self.df.iterrows(): detector.update(row[cols]) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) + drift_state = [] for i, row in self.df.iterrows(): detector.update(row) - assert detector.drift_state != 'drift', f'Drift detected in row {i}' + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + + + def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): + plt.figure(figsize=(20, 6)) + y_min = None + y_max = None + + for col in cols: + plt.scatter(self.df.index, self.df[col], label=col) + local_min = self.df[col].min() + local_max = self.df[col].max() + + if y_min is None or y_min > local_min: + y_min = local_min + if y_max is None or y_max < local_max: + y_max = local_max + + plt.grid(False, axis='x') + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + plt.title('Scatter Results', fontsize=22) + plt.xlabel('Index', fontsize=18) + plt.ylabel('Value', fontsize=18) + plt.ylim((y_min, y_max)) + plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') + plt.legend() + plt.savefig(output_file) if __name__ == '__main__': file = 'souza_data/gassensor.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(10) + drift_cols = tester.inject_random_brownian_noise(1000) tester.test_pcacd_detector() + tester.plot_drift_scatter(drift_cols) From 820be858311dced0afb0c80d161c4734397ec2d3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 3 Feb 2023 13:53:00 -0500 Subject: [PATCH 04/35] minor tweaks --- menelaus/injection/injection_automation.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 3e4b42b3..85cdbea3 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,4 +1,5 @@ import matplotlib.pyplot as plt +import os import pandas as pd import random from scipy.io.arff import loadarff @@ -124,12 +125,12 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, col): + def test_adwin_detector(self, cols): detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=None, y_true=row[col], y_pred=0) + detector.update(X=None, y_true=row[cols], y_pred=0) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -181,12 +182,14 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): plt.ylim((y_min, y_max)) plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') plt.legend() + + 
os.makedirs(os.path.dirname(output_file), exist_ok=True) plt.savefig(output_file) if __name__ == '__main__': - file = 'souza_data/gassensor.arff' + file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(1000) - tester.test_pcacd_detector() + drift_cols = tester.inject_random_brownian_noise(10) + tester.test_adwin_detector(drift_cols) tester.plot_drift_scatter(drift_cols) From 4b9ca33d58b8f8324ee579a5a73a63b8d64c8d5c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 10 Feb 2023 18:24:01 -0500 Subject: [PATCH 05/35] adding nndvi drift detection --- menelaus/injection/injection_automation.py | 29 ++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 85cdbea3..94e208a0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -147,6 +147,32 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo self.df['drift_state'] = drift_state + def test_nndvi_detector(self, cols, group_name=None, k_nn=2, sampling_times=50): + if not group_name: + group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_name in cols: + group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + filtered_df = self.df.copy() + for filter_col in filtered_df.columns: + if filter_col != group_name and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + filtered_df.drop(columns=filter_col, inplace=True) + + grouped_df = filtered_df.groupby(group_name) + status = pd.DataFrame(columns=[group_name, 'drift']) + batches = {group_id: group.sample(frac=0.1).drop(columns=group_name).values for group_id, group in grouped_df} + + detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) + detector.set_reference(batches.pop(min(self.df[group_name]))) + + for group_id, batch in batches.items(): + detector.update(pd.DataFrame(batch)) + status = pd.concat([status, pd.DataFrame({group_name: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + + return status + + def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) drift_state = [] @@ -191,5 +217,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector(drift_cols) - tester.plot_drift_scatter(drift_cols) + tester.test_nndvi_detector(drift_cols) From 8f397a1bee6f17fd4324b1cb3af44909dc84bc1c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Mon, 13 Feb 2023 12:05:54 -0500 Subject: [PATCH 06/35] allow manual column type specification --- menelaus/injection/injection_automation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 94e208a0..5d9feaa7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -27,7 +27,7 @@ def select_random_classes(series): class InjectionTesting: - def __init__(self, data_path, seed=None): + def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): file_type = data_path.split('.')[-1] 
        self.numeric_cols = []
         self.categorical_cols = []
@@ -40,11 +40,12 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non
         else:
             raise ValueError(f'Invalid file type: {file_type}')
 
-        for col in self.df.columns:
-            if pd.api.types.is_numeric_dtype(self.df[col]):
-                self.numeric_cols.append(col)
-            elif self.df[col].nunique() < len(self.df):
-                self.categorical_cols.append(col)
+        if numeric_cols is None or categorical_cols is None:
+            for col in self.df.columns:
+                if pd.api.types.is_numeric_dtype(self.df[col]) and numeric_cols is None:
+
self.numeric_cols.append(col) elif self.df[col].nunique() < len(self.df) and categorical_cols is None: self.categorical_cols.append(col) - if numeric_cols is None: + if numeric_cols is not None: self.numeric_cols = numeric_cols - if categorical_cols is None: + if categorical_cols is not None: self.categorical_cols = categorical_cols if seed: @@ -142,6 +144,26 @@ def test_adwin_detector(self, cols): return detector + def test_cbdb_detector(self, cols, group_col=None, subsets=8): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = CDBD(subsets=subsets) + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -154,28 +176,28 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo return detector - def test_nndvi_detector(self, cols, group_name=None, k_nn=2, sampling_times=50): - if not group_name: - group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - while group_name in cols: - group_name = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] filtered_df = self.df.copy() for filter_col in filtered_df.columns: - if filter_col != group_name and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + if filter_col != group_col and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): filtered_df.drop(columns=filter_col, inplace=True) - grouped_df = filtered_df.groupby(group_name) - status = pd.DataFrame(columns=[group_name, 'drift']) - batches = {group_id: group.sample(frac=0.1).drop(columns=group_name).values for group_id, group in grouped_df} + grouped_df = filtered_df.groupby(group_col) + status = pd.DataFrame(columns=[group_col, 'drift']) + batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) - detector.set_reference(batches.pop(min(self.df[group_name]))) + detector.set_reference(batches.pop(min(self.df[group_col]))) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) - status = pd.concat([status, pd.DataFrame({group_name: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + status = pd.concat([status, pd.DataFrame({group_col: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) return detector, status @@ -189,6 +211,7 @@ def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state + return detector def 
plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): @@ -224,4 +247,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_nndvi_detector(drift_cols) + tester.test_cbdb_detector(drift_cols) From 4d06ba3cc896cbdb734997268391b5913e961cd4 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Feb 2023 13:24:21 -0500 Subject: [PATCH 09/35] hdddm detector implementation --- menelaus/injection/injection_automation.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 63420431..210ff692 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -164,6 +164,26 @@ def test_cbdb_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_hdddm_detector(self, cols, group_col=None, subsets=8): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = HDDDM(subsets=subsets) + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -247,4 +267,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_cbdb_detector(drift_cols) + tester.test_hdddm_detector(drift_cols) From f2f97fcf98a9d4fa9862fd8a06f6f5e2874fa9cf Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Feb 2023 14:37:52 -0500 Subject: [PATCH 10/35] kdq tree batch detector implementation --- menelaus/injection/injection_automation.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 210ff692..1766d9a4 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -184,6 +184,26 @@ def test_hdddm_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_kdq_tree_batch_detector(self, cols, group_col=None): + if not group_col: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + + reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] + test_df = self.df[self.df[group_col] != self.df[group_col].min()] + detector = KdqTreeBatch() + detector.set_reference(reference_df) + drift_state = [] + + for group_id, subset_data in test_df.groupby(group_col): + 
detector.update(subset_data[cols]) + drift_state.append(detector.drift_state) + + return detector, drift_state + + def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -267,4 +287,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_hdddm_detector(drift_cols) + tester.test_kdq_tree_batch_detector(drift_cols) From 6f1d923e9e5061d4b228a1161abb2f2657dae53c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Sun, 19 Feb 2023 15:44:48 -0500 Subject: [PATCH 11/35] add basic linear model training for concept drift testing --- menelaus/injection/injection_automation.py | 35 ++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 1766d9a4..55253674 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -2,6 +2,7 @@ import os import pandas as pd import random +import sklearn from scipy.io.arff import loadarff from menelaus.concept_drift import LinearFourRates, ADWINAccuracy, DDM, EDDM, STEPD, MD3 @@ -58,10 +59,28 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non def select_rows(self, start, end): - start_drift = int(start * len(self.df)) - end_drift = int(end * len(self.df)) + start_row = int(start * len(self.df)) + end_row = int(end * len(self.df)) - return [start_drift, end_drift] + return [start_row, end_row] + + + def train_linear_model(self, x_cols, y_col=None, start=0, end=0.75): + if not y_col: + if len(x_cols) < len(self.numeric_cols): + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + + while y_col in x_cols: + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + else: + raise ValueError('Insufficient numerical columns to select a y variable') + + model = sklearn.linear_model.LinearRegression() + start_train, end_train = self.select_rows(start, end) + train_df = self.df.iloc[start_train:end_train, ] + model.fit(train_df[x_cols], train_df[y_col]) + + return model, y_col def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): @@ -132,12 +151,16 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, cols): + def test_adwin_detector(self, cols, model=None, y_col=None): + if not model: + model, y_col = self.train_linear_model(x_cols=cols) + + self.df['y_pred'] = model.predict(self.df[cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=None, y_true=row[cols], y_pred=0) + detector.update(X=row[cols], y_true=row[y_col], y_pred=row['y_pred']) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -287,4 +310,4 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_kdq_tree_batch_detector(drift_cols) + tester.test_adwin_detector(drift_cols) From de581265a60ecfb5260f9e37a19aba99d5d7d722 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Sun, 19 Feb 2023 15:52:50 -0500 Subject: 
[PATCH 12/35] tuning linear model automation and concept drift --- menelaus/injection/injection_automation.py | 27 ++++++++++------------ 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 55253674..5a568fa6 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -65,22 +65,18 @@ def select_rows(self, start, end): return [start_row, end_row] - def train_linear_model(self, x_cols, y_col=None, start=0, end=0.75): - if not y_col: - if len(x_cols) < len(self.numeric_cols): - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] - - while y_col in x_cols: - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] - else: - raise ValueError('Insufficient numerical columns to select a y variable') + def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): + if not x_cols or not y_col: + y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + x_cols = self.numeric_cols.copy() + x_cols.remove(y_col) model = sklearn.linear_model.LinearRegression() start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] model.fit(train_df[x_cols], train_df[y_col]) - return model, y_col + return model, x_cols, y_col def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): @@ -151,16 +147,16 @@ def inject_random_feature_hide_and_sample(self): return rand_col - def test_adwin_detector(self, cols, model=None, y_col=None): + def test_adwin_detector(self, model=None, x_cols=None, y_col=None): if not model: - model, y_col = self.train_linear_model(x_cols=cols) + model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) - self.df['y_pred'] = model.predict(self.df[cols]) + self.df['y_pred'] = model.predict(self.df[x_cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=row[cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -310,4 +306,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector(drift_cols) + tester.test_adwin_detector() + print(tester.df['drift_state'].describe()) From 6c4191f553ba058b4a69b7685211eb8ed1ad1658 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Feb 2023 19:30:12 -0500 Subject: [PATCH 13/35] linear four rates detector implementation --- menelaus/injection/injection_automation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a568fa6..262604d2 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -235,6 +235,24 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo return detector + def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, + detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): + if not model: + model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) + + self.df['y_pred'] = 
model.predict(self.df[x_cols]) + detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, + num_mc=num_mc, burn_in=burn_in, subsample=subsample) + drift_state = [] + + for i, row in self.df.iterrows(): + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -306,5 +324,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_adwin_detector() + tester.test_lfr_detector() print(tester.df['drift_state'].describe()) From f9897f8af460db5a4c921c6c93495f44e33760ec Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Feb 2023 19:56:37 -0500 Subject: [PATCH 14/35] adding naive logistic classifier model --- menelaus/injection/injection_automation.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 262604d2..6d79921b 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -79,6 +79,24 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col + def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): + if not x_cols or not y_col: + y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + x_cols = self.numeric_cols.copy() + + encoder = sklearn.preprocessing.LabelEncoder() + encoder.fit(self.df[y_col]) + self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) + y_col = f'{y_col}_encoded' + + model = sklearn.linear_model.LogisticRegression() + start_train, end_train = self.select_rows(start, end) + train_df = self.df.iloc[start_train:end_train, ] + model.fit(train_df[x_cols], train_df[y_col]) + + return model, x_cols, y_col + + def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -238,7 +256,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, From 3a6e0e04bcbb619fe5a509e790eeb379b88926d3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 12:28:54 -0500 Subject: [PATCH 15/35] adding binary option to logistic classifier --- menelaus/injection/injection_automation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 6d79921b..5a78f493 100644 --- a/menelaus/injection/injection_automation.py +++ 
b/menelaus/injection/injection_automation.py @@ -79,7 +79,7 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col - def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): + def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): if not x_cols or not y_col: y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] x_cols = self.numeric_cols.copy() @@ -89,6 +89,9 @@ def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75): self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) y_col = f'{y_col}_encoded' + if limit_classes: + self.df = self.df[self.df[y_col] < limit_classes] + model = sklearn.linear_model.LogisticRegression() start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] @@ -256,7 +259,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col, limit_classes=2) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, From 11b6f157b4e714f39120af9eb4fb07f6175561ec Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 18:10:37 -0500 Subject: [PATCH 16/35] ddm detector implementation --- menelaus/injection/injection_automation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a78f493..97f3c9e7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -204,6 +204,22 @@ def test_cbdb_detector(self, cols, group_col=None, subsets=8): return detector, drift_state + def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): + if not model: + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + + self.df['y_pred'] = model.predict(self.df[x_cols]) + detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) + drift_state = [] + + for i, row in self.df.iterrows(): + detector.update(y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_hdddm_detector(self, cols, group_col=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -345,5 +361,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_lfr_detector() + tester.test_ddm_detector() print(tester.df['drift_state'].describe()) From 4339ae68b84dd36f56495221ad76fc0be1799979 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 22 Feb 2023 18:24:01 -0500 Subject: [PATCH 17/35] eddm detector implementation --- menelaus/injection/injection_automation.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 
deletion(-)
diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py
index 97f3c9e7..30bbdeb4 100644
--- a/menelaus/injection/injection_automation.py
+++ b/menelaus/injection/injection_automation.py
@@ -220,6 +220,22 @@ def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100
         return detector
 
 
+    def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5):
+        if not model:
+            model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col)
+
+        self.df['y_pred'] = model.predict(self.df[x_cols])
+        detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh)
+        drift_state = []
+
+        for i, row in self.df.iterrows():
+            detector.update(y_true=row[y_col], y_pred=row['y_pred'])
+            drift_state.append(detector.drift_state)
+
+        self.df['drift_state'] = drift_state
+        return detector
+
+
     def test_hdddm_detector(self, cols, group_col=None, subsets=8):
         if not group_col:
             group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)]
 
         while group_col in cols:
             group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)]
@@ -361,5 +377,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'):
     file = 'souza_data/INSECTS-abrupt_balanced_norm.arff'
     tester = InjectionTesting(file)
     drift_cols = tester.inject_random_brownian_noise(10)
-    tester.test_ddm_detector()
+    tester.test_eddm_detector()
     print(tester.df['drift_state'].describe())
From c68a81f827d7edb68bf59e203448196b094c1061 Mon Sep 17 00:00:00 2001
From: Alex Isherwood
Date: Wed, 22 Feb 2023 18:31:37 -0500
Subject: [PATCH 18/35] stepd detector implementation

---
 menelaus/injection/injection_automation.py | 18 +++++++++++++++++
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py
index 30bbdeb4..8eb9f63f 100644
---
a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -184,15 +184,18 @@ def test_adwin_detector(self, model=None, x_cols=None, y_col=None): return detector - def test_cbdb_detector(self, cols, group_col=None, subsets=8): + def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = CDBD(subsets=subsets) detector.set_reference(reference_df) drift_state = [] @@ -236,15 +239,18 @@ def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30 return detector - def test_hdddm_detector(self, cols, group_col=None, subsets=8): + def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = HDDDM(subsets=subsets) detector.set_reference(reference_df) drift_state = [] @@ -256,15 +262,18 @@ def test_hdddm_detector(self, cols, group_col=None, subsets=8): return detector, drift_state - def test_kdq_tree_batch_detector(self, cols, group_col=None): + def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - reference_df = self.df[self.df[group_col] == self.df[group_col].min()][cols] - test_df = self.df[self.df[group_col] != self.df[group_col].min()] + if not reference_group: + reference_group = self.df[group_col].min() + + reference_df = self.df[self.df[group_col] == reference_group][cols] + test_df = self.df[self.df[group_col] != reference_group] detector = KdqTreeBatch() detector.set_reference(reference_df) drift_state = [] @@ -306,13 +315,16 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): + def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] while group_col in cols: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + if not reference_group: + reference_group = self.df[group_col].min() + filtered_df = self.df.copy() for filter_col in filtered_df.columns: if filter_col != group_col and not 
pd.api.types.is_numeric_dtype(filtered_df[filter_col]): @@ -323,7 +335,7 @@ def test_nndvi_detector(self, cols, group_col=None, k_nn=2, sampling_times=50): batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) - detector.set_reference(batches.pop(min(self.df[group_col]))) + detector.set_reference(batches.pop(reference_group)) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) From 3d3ba2259393f22f107a6ded8fb66818f3c5dd12 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 28 Feb 2023 18:24:35 -0500 Subject: [PATCH 20/35] md3 detector implementation --- menelaus/injection/injection_automation.py | 26 +++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 8a9e4933..6570d707 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -32,6 +32,7 @@ def select_random_classes(series): class InjectionTesting: def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): file_type = data_path.split('.')[-1] + self.seed = seed self.numeric_cols = [] self.categorical_cols = [] @@ -315,6 +316,29 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector + def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): + if not model: + model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + + cols = x_cols.copy() + cols.append(y_col) + self.df['y_pred'] = model.predict(self.df[x_cols]) + detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels) + detector.set_reference(X=self.df[cols], target_name=y_col) + drift_state = [] + + for i, row in self.df.iterrows(): + if detector.waiting_for_oracle: + oracle_label = pd.DataFrame([row[cols]]) + detector.give_oracle_label(oracle_label) + + detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred']) + drift_state.append(detector.drift_state) + + self.df['drift_state'] = drift_state + return detector + + def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] @@ -405,5 +429,5 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) drift_cols = tester.inject_random_brownian_noise(10) - tester.test_stepd_detector() + tester.test_md3_detector() print(tester.df['drift_state'].describe()) From cff5906ec5b7a88d51f366873dce4efb3787d7ed Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 28 Feb 2023 19:44:23 -0500 Subject: [PATCH 21/35] naive linear svc implementation --- menelaus/injection/injection_automation.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 6570d707..40606681 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -80,7 +80,7 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): return model, x_cols, y_col - def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, 
limit_classes=None): + def train_classifier_model(self, model_type='svc', x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): if not x_cols or not y_col: y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] x_cols = self.numeric_cols.copy() @@ -93,7 +93,13 @@ def train_logistic_model(self, x_cols=None, y_col=None, start=0, end=0.75, limit if limit_classes: self.df = self.df[self.df[y_col] < limit_classes] - model = sklearn.linear_model.LogisticRegression() + if model_type == 'svc': + model = sklearn.svm.SVC(kernel='linear') + elif model_type == 'logistic': + model = sklearn.linear_model.LogisticRegression() + else: + raise ValueError(f'Model type not supported: {model_type}') + start_train, end_train = self.select_rows(start, end) train_df = self.df.iloc[start_train:end_train, ] model.fit(train_df[x_cols], train_df[y_col]) @@ -210,7 +216,7 @@ def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) @@ -226,7 +232,7 @@ def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100 def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh) @@ -301,7 +307,7 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col, limit_classes=2) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, limit_classes=2) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, @@ -318,7 +324,7 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) cols = x_cols.copy() cols.append(y_col) @@ -382,7 +388,7 @@ def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=250): if not model: - model, x_cols, y_col = self.train_logistic_model(x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) detector = 
STEPD(window_size=window_size) From 98f58b6c398465c90aaf808020591b607881f998 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 1 Mar 2023 13:53:36 -0500 Subject: [PATCH 22/35] md3 detector bug fixing --- menelaus/injection/injection_automation.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 40606681..b095504d 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -44,15 +44,15 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non else: raise ValueError(f'Invalid file type: {file_type}') - if numeric_cols is None or categorical_cols is None: + if not numeric_cols or not categorical_cols: for col in self.df.columns: if pd.api.types.is_numeric_dtype(self.df[col]) and numeric_cols is None: self.numeric_cols.append(col) elif self.df[col].nunique() < len(self.df) and categorical_cols is None: self.categorical_cols.append(col) - if numeric_cols is not None: + if numeric_cols: self.numeric_cols = numeric_cols - if categorical_cols is not None: + if categorical_cols: self.categorical_cols = categorical_cols if seed: @@ -322,10 +322,14 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5, oracle_labels=1000): + def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=None): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + + if not oracle_labels: + oracle_labels = self.df[y_col].nunique() + training_size = int(len(self.df) * end) cols = x_cols.copy() cols.append(y_col) self.df['y_pred'] = model.predict(self.df[x_cols]) @@ -333,8 +337,8 @@ def test_md3_detector(self, model=None, x_cols=None, y_col=None, sensitivity=1.5 detector.set_reference(X=self.df[cols], target_name=y_col) drift_state = [] - for i, row in self.df.iterrows(): - if detector.waiting_for_oracle: + for i, row in self.df.iloc[training_size:len(self.df), ].iterrows(): + while detector.waiting_for_oracle: oracle_label = pd.DataFrame([row[cols]]) detector.give_oracle_label(oracle_label) From f655b8235b44836a96576d253a9d9bdae83f2d7c Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 1 Mar 2023 17:06:17 -0500 Subject: [PATCH 23/35] md3 detector working --- menelaus/injection/injection_automation.py | 29 ++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index b095504d..46e155e0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -322,31 +322,35 @@ def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_fact return detector - def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=None): + def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=1000): if not model: model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + retrain_model, _, _ = 
self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end)
-        if not oracle_labels:
-            oracle_labels = self.df[y_col].nunique()
-
-        training_size = int(len(self.df) * end)
+        end_train = int(end * len(self.df))
         cols = x_cols.copy()
         cols.append(y_col)
         self.df['y_pred'] = model.predict(self.df[x_cols])
+        self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols])
         detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels)
         detector.set_reference(X=self.df[cols], target_name=y_col)
         drift_state = []
 
-        for i, row in self.df.iloc[training_size:len(self.df), ].iterrows():
-            while detector.waiting_for_oracle:
+        for i, row in self.df.iloc[end_train:len(self.df), ].iterrows():
+            if detector.waiting_for_oracle:
                 oracle_label = pd.DataFrame([row[cols]])
                 detector.give_oracle_label(oracle_label)
 
-            detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred'])
-            drift_state.append(detector.drift_state)
+                if not detector.waiting_for_oracle:
+                    retrain_model.fit(detector.reference_batch_features, detector.reference_batch_target.values.ravel())
+                    self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols])
-        self.df['drift_state'] = drift_state
-        return detector
+                drift_state.append(detector.drift_state)
+            else:
+                detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred_retrain'])
+                drift_state.append(detector.drift_state)
+
+
return detector, drift_state - def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2, sampling_times=50): + def test_nndvi_detector(self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50): if not group_col: group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] - while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + if cols: + while group_col in cols: + group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] if not reference_group: reference_group = self.df[group_col].min() @@ -382,12 +383,15 @@ def test_nndvi_detector(self, cols, group_col=None, reference_group=None, k_nn=2 return detector, status - def test_pcacd_detector(self, window_size=50, divergence_metric='intersection'): + def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='intersection'): + if not cols: + cols = self.numeric_cols.copy() + detector = PCACD(window_size=window_size, divergence_metric=divergence_metric) drift_state = [] for i, row in self.df.iterrows(): - detector.update(row) + detector.update(row[cols]) drift_state.append(detector.drift_state) self.df['drift_state'] = drift_state @@ -442,5 +446,7 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if __name__ == '__main__': file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' tester = InjectionTesting(file) - drift_cols = tester.inject_random_brownian_noise(10) - detector, drift = tester.test_md3_detector() + _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') + nndvi, status = tester.test_nndvi_detector(k_nn=1000, sampling_times=1000) + print(classes) + print(status) From 6bb1bb9289b76e1d6cd8ad6b16366c6526f4a62a Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 7 Mar 2023 14:38:34 -0500 Subject: [PATCH 25/35] testing --- menelaus/injection/injection_automation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index f387a18e..5a95fe42 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -444,9 +444,9 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if __name__ == '__main__': - file = 'souza_data/INSECTS-abrupt_balanced_norm.arff' + file = 'souza_data/gassensor.arff' tester = InjectionTesting(file) _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') - nndvi, status = tester.test_nndvi_detector(k_nn=1000, sampling_times=1000) + nndvi, status = tester.test_nndvi_detector(k_nn=50, sampling_times=100) print(classes) print(status) From ac726479d4f03e918f65d99c84e1ca77f0ae0181 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 11:36:59 -0400 Subject: [PATCH 26/35] script reformatting --- menelaus/injection/injection_automation.py | 383 ++++++++++++++------- 1 file changed, 260 insertions(+), 123 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 5a95fe42..a26fa9e7 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -18,7 +18,7 @@ def select_random_classes(series): classes = series.unique() if len(classes) < 2: - raise ValueError(f'Insufficient classes in series: {len(classes)}') + raise ValueError(f"Insufficient classes in series: 
{len(classes)}") else: class_a = classes[random.randint(0, len(classes) - 1)] class_b = classes[random.randint(0, len(classes) - 1)] @@ -31,18 +31,18 @@ def select_random_classes(series): class InjectionTesting: def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): - file_type = data_path.split('.')[-1] + file_type = data_path.split(".")[-1] self.seed = seed self.numeric_cols = [] self.categorical_cols = [] - if file_type == 'csv': + if file_type == "csv": self.df = pd.read_csv(data_path) - elif file_type == 'arff': + elif file_type == "arff": raw_data = loadarff(data_path) self.df = pd.DataFrame(raw_data[0]) else: - raise ValueError(f'Invalid file type: {file_type}') + raise ValueError(f"Invalid file type: {file_type}") if not numeric_cols or not categorical_cols: for col in self.df.columns: @@ -58,14 +58,12 @@ def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=Non if seed: random.seed(seed) - def select_rows(self, start, end): start_row = int(start * len(self.df)) end_row = int(end * len(self.df)) return [start_row, end_row] - def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): if not x_cols or not y_col: y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] @@ -74,40 +72,52 @@ def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): model = sklearn.linear_model.LinearRegression() start_train, end_train = self.select_rows(start, end) - train_df = self.df.iloc[start_train:end_train, ] + train_df = self.df.iloc[ + start_train:end_train, + ] model.fit(train_df[x_cols], train_df[y_col]) return model, x_cols, y_col - - def train_classifier_model(self, model_type='svc', x_cols=None, y_col=None, start=0, end=0.75, limit_classes=None): + def train_classifier_model( + self, + model_type="svc", + x_cols=None, + y_col=None, + start=0, + end=0.75, + limit_classes=None, + ): if not x_cols or not y_col: - y_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + y_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] x_cols = self.numeric_cols.copy() encoder = sklearn.preprocessing.LabelEncoder() encoder.fit(self.df[y_col]) - self.df[f'{y_col}_encoded'] = encoder.transform(self.df[y_col]) - y_col = f'{y_col}_encoded' + self.df[f"{y_col}_encoded"] = encoder.transform(self.df[y_col]) + y_col = f"{y_col}_encoded" if limit_classes: self.df = self.df[self.df[y_col] < limit_classes] - if model_type == 'svc': - model = sklearn.svm.SVC(kernel='linear') - elif model_type == 'logistic': + if model_type == "svc": + model = sklearn.svm.SVC(kernel="linear") + elif model_type == "logistic": model = sklearn.linear_model.LogisticRegression() else: - raise ValueError(f'Model type not supported: {model_type}') + raise ValueError(f"Model type not supported: {model_type}") start_train, end_train = self.select_rows(start, end) - train_df = self.df.iloc[start_train:end_train, ] + train_df = self.df.iloc[ + start_train:end_train, + ] model.fit(train_df[x_cols], train_df[y_col]) return model, x_cols, y_col - - def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): + def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -119,40 +129,69 @@ def inject_random_brownian_noise(self, x, start=.75, end=1, num_drift_cols=1): return rand_cols - - def inject_random_class_manipulation(self, manipulation_type, start=.75, end=1, num_drift_cols=1): + def 
inject_random_class_manipulation( + self, manipulation_type, start=0.75, end=1, num_drift_cols=1 + ): rand_cols = [] all_rand_classes = [] start_drift, end_drift = self.select_rows(start, end) for i in range(num_drift_cols): - rand_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + rand_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] rand_cols.append(rand_col) rand_classes = select_random_classes(self.df[rand_col]) all_rand_classes.append(rand_classes) - if manipulation_type == 'class_swap': - self.df = class_manipulation.class_swap(self.df, rand_col, rand_classes[0], rand_classes[1], start_drift, end_drift) - elif manipulation_type == 'class_join': - new_label = f'{rand_classes[0]}_{rand_classes[1]}' - self.df = class_manipulation.class_join(self.df, rand_col, rand_classes[0], rand_classes[1], new_label, start_drift, end_drift) + if manipulation_type == "class_swap": + self.df = class_manipulation.class_swap( + self.df, + rand_col, + rand_classes[0], + rand_classes[1], + start_drift, + end_drift, + ) + elif manipulation_type == "class_join": + new_label = f"{rand_classes[0]}_{rand_classes[1]}" + self.df = class_manipulation.class_join( + self.df, + rand_col, + rand_classes[0], + rand_classes[1], + new_label, + start_drift, + end_drift, + ) else: - raise ValueError(f'Invalid class manipulation type: {manipulation_type}') + raise ValueError( + f"Invalid class manipulation type: {manipulation_type}" + ) return rand_cols, all_rand_classes - - def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): + def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): all_swap_cols = [] start_drift, end_drift = self.select_rows(start, end) for i in range(num_swaps): - col_type = self.numeric_cols if random.randint(0, 1) == 0 else self.categorical_cols + col_type = ( + self.numeric_cols + if random.randint(0, 1) == 0 + else self.categorical_cols + ) if len(col_type) < 2: - col_type = self.numeric_cols if col_type == self.categorical_cols else self.categorical_cols + col_type = ( + self.numeric_cols + if col_type == self.categorical_cols + else self.categorical_cols + ) if len(col_type) < 2: - raise ValueError('Insufficient numeric and categorical columns for swaps') + raise ValueError( + "Insufficient numeric and categorical columns for swaps" + ) col_a = col_type[random.randint(0, len(col_type) - 1)] col_b = col_type[random.randint(0, len(col_type) - 1)] @@ -162,41 +201,46 @@ def inject_random_feature_swap(self, start=.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.feature_swap(self.df, col_a, col_b, start_drift, end_drift) + self.df = feature_manipulation.feature_swap( + self.df, col_a, col_b, start_drift, end_drift + ) return all_swap_cols - def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.feature_hide_and_sample(self.df, rand_col, sample_size) + self.df = feature_manipulation.feature_hide_and_sample( + self.df, rand_col, sample_size + ) return rand_col - def test_adwin_detector(self, model=None, x_cols=None, y_col=None): if not model: model, x_cols, y_col = self.train_linear_model(x_cols=x_cols, y_col=y_col) - self.df['y_pred'] = model.predict(self.df[x_cols]) + self.df["y_pred"] = model.predict(self.df[x_cols]) detector = ADWINAccuracy() drift_state = [] for i, row in self.df.iterrows(): 
- detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -213,45 +257,76 @@ def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets return detector, drift_state - - def test_ddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=100, warning_scale=7, drift_scale=10): + def test_ddm_detector( + self, + model=None, + x_cols=None, + y_col=None, + n_threshold=100, + warning_scale=7, + drift_scale=10, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = DDM(n_threshold=n_threshold, warning_scale=warning_scale, drift_scale=drift_scale) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = DDM( + n_threshold=n_threshold, + warning_scale=warning_scale, + drift_scale=drift_scale, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_eddm_detector(self, model=None, x_cols=None, y_col=None, n_threshold=30, warning_thresh=0.7, drift_thresh=0.5): + def test_eddm_detector( + self, + model=None, + x_cols=None, + y_col=None, + n_threshold=30, + warning_thresh=0.7, + drift_thresh=0.5, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = EDDM(n_threshold=n_threshold, warning_thresh=warning_thresh, drift_thresh=drift_thresh) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = EDDM( + n_threshold=n_threshold, + warning_thresh=warning_thresh, + drift_thresh=drift_thresh, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subsets=8): + def test_hdddm_detector( + self, cols, group_col=None, reference_group=None, subsets=8 + ): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) 
+ ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -268,13 +343,16 @@ def test_hdddm_detector(self, cols, group_col=None, reference_group=None, subset return detector, drift_state - def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() @@ -291,8 +369,9 @@ def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=Non return detector, drift_state - - def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50): + def test_kdq_tree_streaming_detector( + self, cols, window_size=500, alpha=0.05, bootstrap_samples=500, count_ubound=50 + ): detector = KdqTreeStreaming(window_size, alpha, bootstrap_samples, count_ubound) drift_state = [] @@ -300,90 +379,151 @@ def test_kdq_tree_streaming_detector(self, cols, window_size=500, alpha=0.05, bo detector.update(row[cols]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_lfr_detector(self, model=None, x_cols=None, y_col=None, time_decay_factor=0.6, warning_level=0.01, - detect_level=0.001, num_mc=5000, burn_in=10, subsample=10): + def test_lfr_detector( + self, + model=None, + x_cols=None, + y_col=None, + time_decay_factor=0.6, + warning_level=0.01, + detect_level=0.001, + num_mc=5000, + burn_in=10, + subsample=10, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, limit_classes=2) - - self.df['y_pred'] = model.predict(self.df[x_cols]) - detector = LinearFourRates(time_decay_factor=time_decay_factor, warning_level=warning_level, detect_level=detect_level, - num_mc=num_mc, burn_in=burn_in, subsample=subsample) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, limit_classes=2 + ) + + self.df["y_pred"] = model.predict(self.df[x_cols]) + detector = LinearFourRates( + time_decay_factor=time_decay_factor, + warning_level=warning_level, + detect_level=detect_level, + num_mc=num_mc, + burn_in=burn_in, + subsample=subsample, + ) drift_state = [] for i, row in self.df.iterrows(): - detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row['y_pred']) + detector.update(X=row[x_cols], y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def test_md3_detector(self, model=None, x_cols=None, y_col=None, start=0, end=0.75, sensitivity=1.5, oracle_labels=1000): + def test_md3_detector( + self, + model=None, + x_cols=None, + y_col=None, + start=0, + end=0.75, + sensitivity=1.5, + oracle_labels=1000, + ): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) - 
retrain_model, _, _ = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col, start=start, end=end) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, start=start, end=end + ) + retrain_model, _, _ = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col, start=start, end=end + ) end_train = int(end * len(self.df)) cols = x_cols.copy() cols.append(y_col) - self.df['y_pred'] = model.predict(self.df[x_cols]) - self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols]) - detector = MD3(clf=model, sensitivity=sensitivity, oracle_data_length_required=oracle_labels) + self.df["y_pred"] = model.predict(self.df[x_cols]) + self.df["y_pred_retrain"] = retrain_model.predict(self.df[x_cols]) + detector = MD3( + clf=model, + sensitivity=sensitivity, + oracle_data_length_required=oracle_labels, + ) detector.set_reference(X=self.df[cols], target_name=y_col) drift_state = [] - for i, row in self.df.iloc[end_train:len(self.df), ].iterrows(): + for i, row in self.df.iloc[ + end_train : len(self.df), + ].iterrows(): if detector.waiting_for_oracle: oracle_label = pd.DataFrame([row[cols]]) detector.give_oracle_label(oracle_label) if not detector.waiting_for_oracle: - retrain_model.fit(detector.reference_batch_features, detector.reference_batch_target.values.ravel()) - self.df['y_pred_retrain'] = retrain_model.predict(self.df[x_cols]) + retrain_model.fit( + detector.reference_batch_features, + detector.reference_batch_target.values.ravel(), + ) + self.df["y_pred_retrain"] = retrain_model.predict(self.df[x_cols]) drift_state.append(detector.drift_state) else: - detector.update(X=pd.DataFrame([row[x_cols]]), y_true=row[y_col], y_pred=row['y_pred_retrain']) + detector.update( + X=pd.DataFrame([row[x_cols]]), + y_true=row[y_col], + y_pred=row["y_pred_retrain"], + ) drift_state.append(detector.drift_state) return detector, drift_state - - def test_nndvi_detector(self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50): + def test_nndvi_detector( + self, cols=None, group_col=None, reference_group=None, k_nn=2, sampling_times=50 + ): if not group_col: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if cols: while group_col in cols: - group_col = self.categorical_cols[random.randint(0, len(self.categorical_cols) - 1)] + group_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] if not reference_group: reference_group = self.df[group_col].min() filtered_df = self.df.copy() for filter_col in filtered_df.columns: - if filter_col != group_col and not pd.api.types.is_numeric_dtype(filtered_df[filter_col]): + if filter_col != group_col and not pd.api.types.is_numeric_dtype( + filtered_df[filter_col] + ): filtered_df.drop(columns=filter_col, inplace=True) grouped_df = filtered_df.groupby(group_col) - status = pd.DataFrame(columns=[group_col, 'drift']) - batches = {group_id: group.sample(frac=0.1).drop(columns=group_col).values for group_id, group in grouped_df} + status = pd.DataFrame(columns=[group_col, "drift"]) + batches = { + group_id: group.sample(frac=0.1).drop(columns=group_col).values + for group_id, group in grouped_df + } detector = NNDVI(k_nn=k_nn, sampling_times=sampling_times) detector.set_reference(batches.pop(reference_group)) for group_id, batch in batches.items(): detector.update(pd.DataFrame(batch)) - status = 
pd.concat([status, pd.DataFrame({group_col: [group_id], 'drift': [detector.drift_state]})], ignore_index=True) + status = pd.concat( + [ + status, + pd.DataFrame( + {group_col: [group_id], "drift": [detector.drift_state]} + ), + ], + ignore_index=True, + ) return detector, status - - def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='intersection'): + def test_pcacd_detector( + self, cols=None, window_size=50, divergence_metric="intersection" + ): if not cols: cols = self.numeric_cols.copy() @@ -394,27 +534,27 @@ def test_pcacd_detector(self, cols=None, window_size=50, divergence_metric='inte detector.update(row[cols]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=250): if not model: - model, x_cols, y_col = self.train_classifier_model(model_type='svc', x_cols=x_cols, y_col=y_col) + model, x_cols, y_col = self.train_classifier_model( + model_type="svc", x_cols=x_cols, y_col=y_col + ) - self.df['y_pred'] = model.predict(self.df[x_cols]) + self.df["y_pred"] = model.predict(self.df[x_cols]) detector = STEPD(window_size=window_size) drift_state = [] for i, row in self.df.iterrows(): - detector.update(y_true=row[y_col], y_pred=row['y_pred']) + detector.update(y_true=row[y_col], y_pred=row["y_pred"]) drift_state.append(detector.drift_state) - self.df['drift_state'] = drift_state + self.df["drift_state"] = drift_state return detector - - def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): + def plot_drift_scatter(self, cols, output_file="plots/drift_scatter_test.png"): plt.figure(figsize=(20, 6)) y_min = None y_max = None @@ -429,24 +569,21 @@ def plot_drift_scatter(self, cols, output_file='plots/drift_scatter_test.png'): if y_max is None or y_max < local_max: y_max = local_max - plt.grid(False, axis='x') + plt.grid(False, axis="x") plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.title('Scatter Results', fontsize=22) - plt.xlabel('Index', fontsize=18) - plt.ylabel('Value', fontsize=18) + plt.title("Scatter Results", fontsize=22) + plt.xlabel("Index", fontsize=18) + plt.ylabel("Value", fontsize=18) plt.ylim((y_min, y_max)) - plt.vlines(x=self.df[self.df['drift_state'] == 'drift'].index, ymin=y_min, ymax=y_max, label='Drift Detected', color='red') + plt.vlines( + x=self.df[self.df["drift_state"] == "drift"].index, + ymin=y_min, + ymax=y_max, + label="Drift Detected", + color="red", + ) plt.legend() os.makedirs(os.path.dirname(output_file), exist_ok=True) plt.savefig(output_file) - - -if __name__ == '__main__': - file = 'souza_data/gassensor.arff' - tester = InjectionTesting(file) - _, classes = tester.inject_random_class_manipulation(manipulation_type='class_swap') - nndvi, status = tester.test_nndvi_detector(k_nn=50, sampling_times=100) - print(classes) - print(status) From b69371e24df09087082bade612a5acd6d4b5a8a5 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 14:20:22 -0400 Subject: [PATCH 27/35] fixing injector functions for class modifications --- menelaus/injection/injection_automation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index a26fa9e7..8d3f6dd0 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -9,7 +9,7 @@ from menelaus.data_drift import PCACD, 
KdqTreeStreaming, KdqTreeBatch, NNDVI from menelaus.data_drift.cdbd import CDBD from menelaus.data_drift.hdddm import HDDDM -import class_manipulation +import label_manipulation import feature_manipulation import noise @@ -125,7 +125,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.brownian_noise(self.df, rand_col, x, start_drift, end_drift) + self.df = noise.BrownianNoiseInjector.__call__(self.df, rand_col, x, start_drift, end_drift) return rand_cols @@ -145,7 +145,7 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = class_manipulation.class_swap( + self.df = label_manipulation.LabelSwapInjector().__call__( self.df, rand_col, rand_classes[0], @@ -155,7 +155,7 @@ def inject_random_class_manipulation( ) elif manipulation_type == "class_join": new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = class_manipulation.class_join( + self.df = label_manipulation.LabelJoinInjector().__call__( self.df, rand_col, rand_classes[0], @@ -201,7 +201,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.feature_swap( + self.df = feature_manipulation.FeatureSwapInjector().__call__( self.df, col_a, col_b, start_drift, end_drift ) @@ -210,7 +210,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.feature_hide_and_sample( + self.df = feature_manipulation.FeatureCoverInjector().__call__( self.df, rand_col, sample_size ) From 8ae9d929c2c0aa9649e6a203f2effe84ca6f1726 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Wed, 15 Mar 2023 19:46:12 -0400 Subject: [PATCH 28/35] bug fixing --- menelaus/injection/injection_automation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 8d3f6dd0..353cc055 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -125,7 +125,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.BrownianNoiseInjector.__call__(self.df, rand_col, x, start_drift, end_drift) + self.df = noise.BrownianNoiseInjector(self.df, rand_col, x, start_drift, end_drift) return rand_cols @@ -145,7 +145,7 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = label_manipulation.LabelSwapInjector().__call__( + self.df = label_manipulation.LabelSwapInjector( self.df, rand_col, rand_classes[0], @@ -155,7 +155,7 @@ def inject_random_class_manipulation( ) elif manipulation_type == "class_join": new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = label_manipulation.LabelJoinInjector().__call__( + self.df = label_manipulation.LabelJoinInjector( self.df, rand_col, rand_classes[0], @@ -201,7 +201,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = 
feature_manipulation.FeatureSwapInjector().__call__( + self.df = feature_manipulation.FeatureSwapInjector( self.df, col_a, col_b, start_drift, end_drift ) @@ -210,7 +210,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.FeatureCoverInjector().__call__( + self.df = feature_manipulation.FeatureCoverInjector( self.df, rand_col, sample_size ) From 9f3c4fdd655974d39d6e8faca417de4fb0dd9d95 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Thu, 16 Mar 2023 16:09:58 -0400 Subject: [PATCH 29/35] injection automation testing --- menelaus/injection/injection_automation.py | 41 +++++++++++-------- .../injection/test_injection_automation.py | 17 ++++++++ 2 files changed, 42 insertions(+), 16 deletions(-) create mode 100644 tests/menelaus/injection/test_injection_automation.py diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 353cc055..34a309df 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -9,9 +9,9 @@ from menelaus.data_drift import PCACD, KdqTreeStreaming, KdqTreeBatch, NNDVI from menelaus.data_drift.cdbd import CDBD from menelaus.data_drift.hdddm import HDDDM -import label_manipulation -import feature_manipulation -import noise +from menelaus.injection import label_manipulation +from menelaus.injection import feature_manipulation +from menelaus.injection import noise def select_random_classes(series): @@ -30,19 +30,23 @@ def select_random_classes(series): class InjectionTesting: - def __init__(self, data_path, seed=None, numeric_cols=None, categorical_cols=None): - file_type = data_path.split(".")[-1] + def __init__(self, data, seed=None, numeric_cols=None, categorical_cols=None): self.seed = seed self.numeric_cols = [] self.categorical_cols = [] - if file_type == "csv": - self.df = pd.read_csv(data_path) - elif file_type == "arff": - raw_data = loadarff(data_path) - self.df = pd.DataFrame(raw_data[0]) + if isinstance(data, pd.DataFrame): + self.df = data.copy() else: - raise ValueError(f"Invalid file type: {file_type}") + file_type = data.split(".")[-1] + + if file_type == "csv": + self.df = pd.read_csv(data) + elif file_type == "arff": + raw_data = loadarff(data) + self.df = pd.DataFrame(raw_data[0]) + else: + raise ValueError(f"Invalid file type: {file_type}") if not numeric_cols or not categorical_cols: for col in self.df.columns: @@ -118,6 +122,7 @@ def train_classifier_model( return model, x_cols, y_col def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): + injector = noise.BrownianNoiseInjector() rand_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -125,7 +130,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] rand_cols.append(rand_col) - self.df = noise.BrownianNoiseInjector(self.df, rand_col, x, start_drift, end_drift) + self.df = injector(self.df, start_drift, end_drift, rand_col, x) return rand_cols @@ -145,7 +150,8 @@ def inject_random_class_manipulation( all_rand_classes.append(rand_classes) if manipulation_type == "class_swap": - self.df = label_manipulation.LabelSwapInjector( + injector = label_manipulation.LabelSwapInjector() + self.df = injector( self.df, rand_col, rand_classes[0], @@ 
-154,8 +160,9 @@ def inject_random_class_manipulation( end_drift, ) elif manipulation_type == "class_join": + injector = label_manipulation.LabelJoinInjector() new_label = f"{rand_classes[0]}_{rand_classes[1]}" - self.df = label_manipulation.LabelJoinInjector( + self.df = injector( self.df, rand_col, rand_classes[0], @@ -172,6 +179,7 @@ def inject_random_class_manipulation( return rand_cols, all_rand_classes def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): + injector = feature_manipulation.FeatureSwapInjector() all_swap_cols = [] start_drift, end_drift = self.select_rows(start, end) @@ -201,16 +209,17 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = feature_manipulation.FeatureSwapInjector( + self.df = injector( self.df, col_a, col_b, start_drift, end_drift ) return all_swap_cols def inject_random_feature_hide_and_sample(self): + injector = feature_manipulation.FeatureCoverInjector() rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = feature_manipulation.FeatureCoverInjector( + self.df = injector( self.df, rand_col, sample_size ) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py new file mode 100644 index 00000000..2f33b91d --- /dev/null +++ b/tests/menelaus/injection/test_injection_automation.py @@ -0,0 +1,17 @@ +import numpy as np +import pandas as pd +from menelaus.datasets import fetch_circle_data, fetch_rainfall_data +from menelaus.injection.injection_automation import InjectionTesting + + +def test_brownian_injection(): + df = pd.DataFrame(np.random.rand(100, 5), columns=['a', 'b', 'c', 'd', 'e']) + tester = InjectionTesting(df) + start = 0.75 + end = 1 + + col = tester.inject_random_brownian_noise(50, start=start, end=end, num_drift_cols=1) + std_normal = tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ] + std_drift = tester.df.iloc[int(start * len(df)) + 1 : int(end * len(df)), ][col].std().iloc[0, ] + + assert(std_drift > std_normal) From b5c3d53c898f1a71c8700e0841f381760073dad3 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Mar 2023 13:23:47 -0400 Subject: [PATCH 30/35] class manipulation tests --- menelaus/injection/injection_automation.py | 16 ++++----- .../injection/test_injection_automation.py | 35 +++++++++++++++---- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 34a309df..38a2c4ab 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -153,23 +153,23 @@ def inject_random_class_manipulation( injector = label_manipulation.LabelSwapInjector() self.df = injector( self.df, + start_drift, + end_drift, rand_col, rand_classes[0], rand_classes[1], - start_drift, - end_drift, ) elif manipulation_type == "class_join": injector = label_manipulation.LabelJoinInjector() new_label = f"{rand_classes[0]}_{rand_classes[1]}" self.df = injector( self.df, + start_drift, + end_drift, rand_col, rand_classes[0], rand_classes[1], new_label, - start_drift, - end_drift, ) else: raise ValueError( @@ -209,9 +209,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = injector( - self.df, col_a, col_b, start_drift, end_drift - ) + self.df = injector(self.df, 
col_a, col_b, start_drift, end_drift) return all_swap_cols @@ -219,9 +217,7 @@ def inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] sample_size = min(self.df[rand_col].value_counts()) - self.df = injector( - self.df, rand_col, sample_size - ) + self.df = injector(self.df, rand_col, sample_size) return rand_col diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 2f33b91d..1f1c2b3d 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -1,17 +1,40 @@ import numpy as np import pandas as pd -from menelaus.datasets import fetch_circle_data, fetch_rainfall_data from menelaus.injection.injection_automation import InjectionTesting -def test_brownian_injection(): - df = pd.DataFrame(np.random.rand(100, 5), columns=['a', 'b', 'c', 'd', 'e']) +def test_brownian_noise(): + df = pd.DataFrame(np.random.rand(100, 5), columns=["a", "b", "c", "d", "e"]) tester = InjectionTesting(df) start = 0.75 end = 1 col = tester.inject_random_brownian_noise(50, start=start, end=end, num_drift_cols=1) - std_normal = tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ] - std_drift = tester.df.iloc[int(start * len(df)) + 1 : int(end * len(df)), ][col].std().iloc[0, ] + std_normal = (tester.df.iloc[0 : int(start * len(df)), ][col].std().iloc[0, ]) + std_drift = (tester.df.iloc[int(start * len(df)) + 1:int(end * len(df)), ][col].std().iloc[0, ]) - assert(std_drift > std_normal) + assert std_drift > std_normal + + +def test_class_manipulation(): + df = pd.DataFrame(np.random.choice(a=["a", "b", "c"], size=100, p=[0.4, 0.3, 0.3])) + swap_tester = InjectionTesting(df) + join_tester = InjectionTesting(df) + start = 0 + end = 1 + + cols, all_swap_classes = swap_tester.inject_random_class_manipulation( + manipulation_type="class_swap", start=start, end=end + ) + col = cols[0] + swap_classes = all_swap_classes[0] + + assert len(df[df[col] == swap_classes[0]]) == len(swap_tester.df[swap_tester.df[col] == swap_classes[1]]) + assert len(df[df[col] == swap_classes[1]]) == len(swap_tester.df[swap_tester.df[col] == swap_classes[0]]) + + cols, all_join_classes = join_tester.inject_random_class_manipulation(manipulation_type="class_join", start=start, end=end) + col = cols[0] + join_classes = all_join_classes[0] + + assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 + assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 From de44802a69843c2cb08176cb21ecbb6d0a3caf77 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 17 Mar 2023 19:33:25 -0400 Subject: [PATCH 31/35] feature injection tests --- menelaus/injection/injection_automation.py | 6 +++-- .../injection/test_injection_automation.py | 23 ++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index 38a2c4ab..cacef7cf 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -209,13 +209,15 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) - self.df = injector(self.df, col_a, col_b, start_drift, end_drift) + self.df = injector(self.df, start_drift, end_drift, col_a, col_b) return all_swap_cols def 
inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() - rand_col = self.df.columns[random.randint(0, len(self.df.columns) - 1)] + rand_col = self.categorical_cols[ + random.randint(0, len(self.categorical_cols) - 1) + ] sample_size = min(self.df[rand_col].value_counts()) self.df = injector(self.df, rand_col, sample_size) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 1f1c2b3d..6a21c079 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -17,7 +17,7 @@ def test_brownian_noise(): def test_class_manipulation(): - df = pd.DataFrame(np.random.choice(a=["a", "b", "c"], size=100, p=[0.4, 0.3, 0.3])) + df = pd.DataFrame(np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3])) swap_tester = InjectionTesting(df) join_tester = InjectionTesting(df) start = 0 @@ -38,3 +38,24 @@ def test_class_manipulation(): assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 + +def test_feature_swap(): + df = pd.DataFrame() + df['a'] = [0] * 100 + df['b'] = [1] * 100 + tester = InjectionTesting(df) + start = 0.75 + end = 1 + + tester.inject_random_feature_swap(start=start, end=end) + assert(tester.df['a'].sum() == 25) + assert(tester.df['b'].sum() == 75) + +def test_feature_hide_and_sample(): + df = pd.DataFrame() + df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) + df['b'] = np.random.rand(100, 1) + tester = InjectionTesting(df) + + tester.inject_random_feature_hide_and_sample() + assert(len(tester.df) < len(df)) From 4ca0089517ffaadefcd46fd8c14c0999323ae779 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 12:49:44 -0400 Subject: [PATCH 32/35] basic detector tests --- .../injection/test_injection_automation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 6a21c079..450af2a7 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -39,6 +39,7 @@ def test_class_manipulation(): assert len(join_tester.df[join_tester.df[col] == join_classes[0]]) == 0 assert len(join_tester.df[join_tester.df[col] == join_classes[1]]) == 0 + def test_feature_swap(): df = pd.DataFrame() df['a'] = [0] * 100 @@ -51,6 +52,7 @@ def test_feature_swap(): assert(tester.df['a'].sum() == 25) assert(tester.df['b'].sum() == 75) + def test_feature_hide_and_sample(): df = pd.DataFrame() df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) @@ -59,3 +61,24 @@ def test_feature_hide_and_sample(): tester.inject_random_feature_hide_and_sample() assert(len(tester.df) < len(df)) + + +def test_detectors(): + df = pd.DataFrame() + df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) + df['b'] = np.random.rand(100, 1) + df['c'] = np.random.rand(100, 1) + tester = InjectionTesting(df) + + tester.test_adwin_detector() + tester.test_cbdb_detector(cols=['b']) + tester.test_ddm_detector() + tester.test_eddm_detector() + tester.test_hdddm_detector(cols=['b']) + tester.test_kdq_tree_batch_detector(cols=['b']) + tester.test_kdq_tree_streaming_detector(cols=['b']) + tester.test_lfr_detector() + tester.test_md3_detector() + tester.test_nndvi_detector() + 
tester.test_pcacd_detector() + tester.test_stepd_detector() From 29fa809f55172d10939d52516f871d983314ee4b Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 13:43:48 -0400 Subject: [PATCH 33/35] switching to numpy random number generator --- menelaus/injection/injection_automation.py | 44 +++++++++---------- .../injection/test_injection_automation.py | 32 ++++++++------ 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index cacef7cf..c553a622 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -1,7 +1,7 @@ import matplotlib.pyplot as plt +import numpy as np import os import pandas as pd -import random import sklearn from scipy.io.arff import loadarff @@ -20,11 +20,11 @@ def select_random_classes(series): if len(classes) < 2: raise ValueError(f"Insufficient classes in series: {len(classes)}") else: - class_a = classes[random.randint(0, len(classes) - 1)] - class_b = classes[random.randint(0, len(classes) - 1)] + class_a = classes[np.random.randint(0, len(classes))] + class_b = classes[np.random.randint(0, len(classes))] while class_a == class_b: - class_b = classes[random.randint(0, len(classes) - 1)] + class_b = classes[np.random.randint(0, len(classes))] return [class_a, class_b] @@ -60,7 +60,7 @@ def __init__(self, data, seed=None, numeric_cols=None, categorical_cols=None): self.categorical_cols = categorical_cols if seed: - random.seed(seed) + np.random.seed(seed) def select_rows(self, start, end): start_row = int(start * len(self.df)) @@ -70,7 +70,7 @@ def select_rows(self, start, end): def train_linear_model(self, x_cols=None, y_col=None, start=0, end=0.75): if not x_cols or not y_col: - y_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + y_col = self.numeric_cols[np.random.randint(0, len(self.numeric_cols))] x_cols = self.numeric_cols.copy() x_cols.remove(y_col) @@ -94,7 +94,7 @@ def train_classifier_model( ): if not x_cols or not y_col: y_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] x_cols = self.numeric_cols.copy() @@ -127,7 +127,7 @@ def inject_random_brownian_noise(self, x, start=0.75, end=1, num_drift_cols=1): start_drift, end_drift = self.select_rows(start, end) for i in range(num_drift_cols): - rand_col = self.numeric_cols[random.randint(0, len(self.numeric_cols) - 1)] + rand_col = self.numeric_cols[np.random.randint(0, len(self.numeric_cols))] rand_cols.append(rand_col) self.df = injector(self.df, start_drift, end_drift, rand_col, x) @@ -143,7 +143,7 @@ def inject_random_class_manipulation( for i in range(num_drift_cols): rand_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] rand_cols.append(rand_col) rand_classes = select_random_classes(self.df[rand_col]) @@ -186,7 +186,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): for i in range(num_swaps): col_type = ( self.numeric_cols - if random.randint(0, 1) == 0 + if np.random.randint(0, 1) == 0 else self.categorical_cols ) @@ -201,11 +201,11 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): "Insufficient numeric and categorical columns for swaps" ) - col_a = col_type[random.randint(0, len(col_type) - 1)] - col_b = col_type[random.randint(0, len(col_type) - 1)] + col_a = col_type[np.random.randint(0, len(col_type))] 
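# NOTE on the hunk above (editorial annotation, not part of the commit):
# np.random.randint excludes its upper bound, unlike random.randint, so the
# replacement `if np.random.randint(0, 1) == 0` always evaluates to True and
# always selects numeric_cols; `np.random.randint(0, 2)` would preserve the
# original 50/50 choice between numeric and categorical column types.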
+ col_b = col_type[np.random.randint(0, len(col_type))] while col_a == col_b: - col_b = col_type[random.randint(0, len(col_type) - 1)] + col_b = col_type[np.random.randint(0, len(col_type))] swap_cols = [col_a, col_b] all_swap_cols.append(swap_cols) @@ -216,7 +216,7 @@ def inject_random_feature_swap(self, start=0.75, end=1, num_swaps=1): def inject_random_feature_hide_and_sample(self): injector = feature_manipulation.FeatureCoverInjector() rand_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] sample_size = min(self.df[rand_col].value_counts()) self.df = injector(self.df, rand_col, sample_size) @@ -241,12 +241,12 @@ def test_adwin_detector(self, model=None, x_cols=None, y_col=None): def test_cbdb_detector(self, cols, group_col=None, reference_group=None, subsets=8): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -327,12 +327,12 @@ def test_hdddm_detector( ): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -353,12 +353,12 @@ def test_hdddm_detector( def test_kdq_tree_batch_detector(self, cols, group_col=None, reference_group=None): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: @@ -485,13 +485,13 @@ def test_nndvi_detector( ): if not group_col: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if cols: while group_col in cols: group_col = self.categorical_cols[ - random.randint(0, len(self.categorical_cols) - 1) + np.random.randint(0, len(self.categorical_cols)) ] if not reference_group: diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 450af2a7..4f0e93b3 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -69,16 +69,22 @@ def test_detectors(): df['b'] = np.random.rand(100, 1) df['c'] = np.random.rand(100, 1) tester = InjectionTesting(df) - - tester.test_adwin_detector() - tester.test_cbdb_detector(cols=['b']) - tester.test_ddm_detector() - tester.test_eddm_detector() - tester.test_hdddm_detector(cols=['b']) - tester.test_kdq_tree_batch_detector(cols=['b']) - tester.test_kdq_tree_streaming_detector(cols=['b']) - tester.test_lfr_detector() - tester.test_md3_detector() - tester.test_nndvi_detector() - tester.test_pcacd_detector() - tester.test_stepd_detector() + failed = False + + try: + tester.test_adwin_detector() + tester.test_cbdb_detector(cols=['b']) + tester.test_ddm_detector() + tester.test_eddm_detector() + tester.test_hdddm_detector(cols=['b']) + 
tester.test_kdq_tree_batch_detector(cols=['b']) + tester.test_kdq_tree_streaming_detector(cols=['b']) + tester.test_lfr_detector() + tester.test_md3_detector() + tester.test_nndvi_detector() + tester.test_pcacd_detector() + tester.test_stepd_detector() + except Exception as e: + failed = True + + assert(failed is False) From 41614986a1e28f3ff84258dc5bf400c8dbf5be3a Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Tue, 21 Mar 2023 14:51:50 -0400 Subject: [PATCH 34/35] scatter plot testing --- .gitignore | 1 + tests/menelaus/injection/test_injection_automation.py | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 0f6395fd..27247819 100644 --- a/.gitignore +++ b/.gitignore @@ -22,5 +22,6 @@ menelaus/injection/plots examples/*.png menelaus/*.png tests/*.png +tests/menelaus/injection/plots *.tox* diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index 4f0e93b3..defebe55 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -84,6 +84,7 @@ def test_detectors(): tester.test_nndvi_detector() tester.test_pcacd_detector() tester.test_stepd_detector() + tester.plot_drift_scatter(cols=['b']) except Exception as e: failed = True From f7a261bacfabcca6126b47c2f165d0ba6d10ced4 Mon Sep 17 00:00:00 2001 From: Alex Isherwood Date: Fri, 24 Mar 2023 14:37:26 -0400 Subject: [PATCH 35/35] type coercing plot inputs and testing --- .gitignore | 3 +- menelaus/injection/injection_automation.py | 68 ++++++++++--------- .../injection/test_injection_automation.py | 12 ++-- 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 27247819..7422e0d9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ _build *.DS_Store .idea/ menelaus/injection/souza_data -menelaus/injection/plots +menelaus/injection/sample_scripts +plots # Images diff --git a/menelaus/injection/injection_automation.py b/menelaus/injection/injection_automation.py index c553a622..c1ef35cd 100644 --- a/menelaus/injection/injection_automation.py +++ b/menelaus/injection/injection_automation.py @@ -562,35 +562,41 @@ def test_stepd_detector(self, model=None, x_cols=None, y_col=None, window_size=2 return detector def plot_drift_scatter(self, cols, output_file="plots/drift_scatter_test.png"): - plt.figure(figsize=(20, 6)) - y_min = None - y_max = None - - for col in cols: - plt.scatter(self.df.index, self.df[col], label=col) - local_min = self.df[col].min() - local_max = self.df[col].max() - - if y_min is None or y_min > local_min: - y_min = local_min - if y_max is None or y_max < local_max: - y_max = local_max - - plt.grid(False, axis="x") - plt.xticks(fontsize=16) - plt.yticks(fontsize=16) - plt.title("Scatter Results", fontsize=22) - plt.xlabel("Index", fontsize=18) - plt.ylabel("Value", fontsize=18) - plt.ylim((y_min, y_max)) - plt.vlines( - x=self.df[self.df["drift_state"] == "drift"].index, - ymin=y_min, - ymax=y_max, - label="Drift Detected", - color="red", - ) - plt.legend() + if hasattr(cols, '__iter__'): + if isinstance(cols, str): + cols = [cols] + + plt.figure(figsize=(20, 6)) + y_min = None + y_max = None + + for col in cols: + plt.scatter(self.df.index, self.df[col], label=col) + local_min = self.df[col].min() + local_max = self.df[col].max() + + if y_min is None or y_min > local_min: + y_min = local_min + if y_max is None or y_max < local_max: + y_max = local_max + + plt.grid(False, axis="x") + plt.xticks(fontsize=16) + 
plt.yticks(fontsize=16) + plt.title("Scatter Results", fontsize=22) + plt.xlabel("Index", fontsize=18) + plt.ylabel("Value", fontsize=18) + plt.ylim((y_min, y_max)) + plt.vlines( + x=self.df[self.df["drift_state"] == "drift"].index, + ymin=y_min, + ymax=y_max, + label="Drift Detected", + color="red", + ) + plt.legend() - os.makedirs(os.path.dirname(output_file), exist_ok=True) - plt.savefig(output_file) + os.makedirs(os.path.dirname(output_file), exist_ok=True) + plt.savefig(output_file) + else: + raise TypeError(f'Variable cols must be an iterable object or string') diff --git a/tests/menelaus/injection/test_injection_automation.py b/tests/menelaus/injection/test_injection_automation.py index defebe55..2a650eba 100644 --- a/tests/menelaus/injection/test_injection_automation.py +++ b/tests/menelaus/injection/test_injection_automation.py @@ -5,7 +5,7 @@ def test_brownian_noise(): df = pd.DataFrame(np.random.rand(100, 5), columns=["a", "b", "c", "d", "e"]) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=2) start = 0.75 end = 1 @@ -18,8 +18,8 @@ def test_brownian_noise(): def test_class_manipulation(): df = pd.DataFrame(np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3])) - swap_tester = InjectionTesting(df) - join_tester = InjectionTesting(df) + swap_tester = InjectionTesting(df, seed=3) + join_tester = InjectionTesting(df, seed=5) start = 0 end = 1 @@ -44,7 +44,7 @@ def test_feature_swap(): df = pd.DataFrame() df['a'] = [0] * 100 df['b'] = [1] * 100 - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=7) start = 0.75 end = 1 @@ -57,7 +57,7 @@ def test_feature_hide_and_sample(): df = pd.DataFrame() df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) df['b'] = np.random.rand(100, 1) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=11) tester.inject_random_feature_hide_and_sample() assert(len(tester.df) < len(df)) @@ -68,7 +68,7 @@ def test_detectors(): df['a'] = np.random.choice(a=["x", "y", "z"], size=100, p=[0.4, 0.3, 0.3]) df['b'] = np.random.rand(100, 1) df['c'] = np.random.rand(100, 1) - tester = InjectionTesting(df) + tester = InjectionTesting(df, seed=13) failed = False try:
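# For context, a minimal end-to-end use of the InjectionTesting class that these
# patches build up might look like the sketch below. It follows the constructor
# added in PATCH 29 (which accepts a DataFrame directly) and the method
# signatures from the reformatted script in PATCH 26; the column names, noise
# magnitude, window size, and output path here are illustrative only.
import numpy as np
import pandas as pd

from menelaus.injection.injection_automation import InjectionTesting

# Build a small all-numeric frame and seed the tester for reproducibility.
df = pd.DataFrame(np.random.rand(1000, 3), columns=["a", "b", "c"])
tester = InjectionTesting(df, seed=42)

# Inject Brownian noise into one randomly chosen numeric column over the last
# quarter of the rows, run a streaming detector over that column, then plot
# the rows where drift was flagged.
drift_cols = tester.inject_random_brownian_noise(x=10, start=0.75, end=1)
tester.test_kdq_tree_streaming_detector(cols=drift_cols, window_size=200)
tester.plot_drift_scatter(cols=drift_cols, output_file="plots/brownian_example.png")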