Merge pull request #2 from joao-parana/outlier_detection

wip: new feature for outlier detection using zscore and iqr.
joao-parana · Aug 28, 2023 · d7589b5 · d7589b5
2 parents f17ff23 + bfb9e3c
commit d7589b5
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 2 deletions.
diff --git a/features/05.outliers-detection.feature b/features/05.outliers-detection.feature
@@ -0,0 +1,54 @@
+Feature: Outlier Detection in multivariate and univariate time series in wide format
+  Value Statement:
+    As a data analyst
+    I want the ability to identify outliers in multivariate and univariate time series in wide format
+    So I can start analyzing the data right away and come up with solutions for the business.
+
+  Scenario: Detecting outliers in a univariate time series in wide format
+
+    Given a time series dataset
+      | Timestamp       | Value |
+      | 2023-01-01      | 10    |
+      | 2023-01-02      | 15    |
+      | 2023-01-03      | 12    |
+      | 2023-01-04      | 14    |
+      | 2023-01-05      | 120   |
+      | 2023-01-06      | 13    |
+      | 2023-01-07      | 16    |
+      | 2023-01-08      | 18    |
+      | 2023-01-09      | 14    |
+      | 2023-01-10      | 17    |
+
+    When the Z-Score outlier detection algorithm is applied
+    Then outliers should be identified using Z-Score
+      | Timestamp       | Value | Outlier Detection Method |
+      | 2023-01-05      | 120   | Z-Score                 |
+
+    And non-outliers should not be flagged as outliers
+      | Timestamp       | Value | Outlier Detection Method |
+      | 2023-01-01      | 10    | None                    |
+      | 2023-01-02      | 15    | None                    |
+      | 2023-01-03      | 12    | None                    |
+      | 2023-01-04      | 14    | None                    |
+      | 2023-01-06      | 13    | None                    |
+      | 2023-01-07      | 16    | None                    |
+      | 2023-01-08      | 18    | None                    |
+      | 2023-01-09      | 14    | None                    |
+      | 2023-01-10      | 17    | None                    |
+
+    When the IQR-based outlier detection algorithm is applied
+    Then outliers should be identified using IQR
+      | Timestamp       | Value | Outlier Detection Method |
+      | 2023-01-05      | 120   | IQR                     |
+
+    And non-outliers should not be flagged as outliers
+      | Timestamp       | Value | Outlier Detection Method |
+      | 2023-01-01      | 10    | None                    |
+      | 2023-01-02      | 15    | None                    |
+      | 2023-01-03      | 12    | None                    |
+      | 2023-01-04      | 14    | None                    |
+      | 2023-01-06      | 13    | None                    |
+      | 2023-01-07      | 16    | None                    |
+      | 2023-01-08      | 18    | None                    |
+      | 2023-01-09      | 14    | None                    |
+      | 2023-01-10      | 17    | None                    |
diff --git a/features/steps/05.outliers-detection.feature_steps.py b/features/steps/05.outliers-detection.feature_steps.py
@@ -0,0 +1,57 @@
+# Import necessary libraries
+from behave import given, when, then
+import pandas as pd
+import numpy as np
+
+from t8s.util import Util
+
+
+# Daily readings of the fixture series, one per day starting on 2023-01-01.
+# The 120 on the fifth day is the value the steps below expect to be flagged.
+_READINGS = (10, 15, 12, 14, 120, 13, 16, 18, 14, 17)
+
+# (timestamp, value) pairs for 2023-01-01 .. 2023-01-10
+time_series_data = [
+    (f"2023-01-{day:02d}", reading)
+    for day, reading in enumerate(_READINGS, start=1)
+]
+
+# Fixture DataFrame with one row per pair: columns "timestamp" and "tag"
+df = pd.DataFrame.from_records(time_series_data, columns=["timestamp", "tag"])
+
+
+@given('a time series dataset')
+def step_given_time_series(context):
+    """Store the scenario's time series on the context as ``context.time_series``.
+
+    When the step carries a data table, its ``Timestamp`` and ``Value``
+    columns are loaded, so the data written in the feature file is the data
+    under test. Without a table the module-level fixture ``df`` is used.
+    The resulting frame always has the columns ``timestamp`` and ``tag``.
+    """
+    table = getattr(context, 'table', None)
+    if table is not None:
+        frame = pd.DataFrame(
+            [(row['Timestamp'], row['Value']) for row in table],
+            columns=['timestamp', 'tag'],
+        )
+        # Table cells arrive as strings; the detectors need numbers.
+        frame['tag'] = pd.to_numeric(frame['tag'])
+        context.time_series = frame
+    else:
+        # Copy so a step that mutates the frame cannot leak into other scenarios.
+        context.time_series = df.copy()
+
+@when('the Z-Score outlier detection algorithm is applied')
+def step_when_zscore_detection(context):
+    """Run z-score detection on the ``tag`` column of ``context.time_series``.
+
+    The timestamps of the rows flagged by ``Util.detect_outliers`` are stored
+    as a list in ``context.outliers``.
+    """
+    series_frame = context.time_series
+    is_outlier = Util.detect_outliers(series_frame, 'tag', 'zscore')
+    flagged_rows = series_frame[is_outlier]
+    context.outliers = flagged_rows['timestamp'].tolist()
+
+@then('outliers should be identified using Z-Score')
+def step_then_zscore_outliers(context):
+    """Check that z-score detection flagged exactly the expected timestamps.
+
+    The expected timestamps come from the ``Timestamp`` column of the step's
+    data table; without a table the single known outlier of the fixture
+    series is expected.
+    """
+    table = getattr(context, 'table', None)
+    if table is not None:
+        expected_outliers = [row['Timestamp'] for row in table]
+    else:
+        expected_outliers = ['2023-01-05']
+    assert context.outliers == expected_outliers, (
+        f'expected outliers {expected_outliers}, got {context.outliers}'
+    )
+
+@when('the IQR-based outlier detection algorithm is applied')
+def step_when_iqr_detection(context):
+    """Run IQR detection on the ``tag`` column of ``context.time_series``.
+
+    The timestamps of the rows flagged by ``Util.detect_outliers`` are stored
+    as a list in ``context.outliers``.
+    """
+    series_frame = context.time_series
+    is_outlier = Util.detect_outliers(series_frame, 'tag', 'iqr')
+    flagged_rows = series_frame[is_outlier]
+    context.outliers = flagged_rows['timestamp'].tolist()
+
+@then('outliers should be identified using IQR')
+def step_then_iqr_outliers(context):
+    """Check that IQR detection flagged exactly the expected timestamps.
+
+    The expected timestamps come from the ``Timestamp`` column of the step's
+    data table; without a table the single known outlier of the fixture
+    series is expected.
+    """
+    table = getattr(context, 'table', None)
+    if table is not None:
+        expected_outliers = [row['Timestamp'] for row in table]
+    else:
+        expected_outliers = ['2023-01-05']
+    assert context.outliers == expected_outliers, (
+        f'expected outliers {expected_outliers}, got {context.outliers}'
+    )
+
+@then('non-outliers should not be flagged as outliers')
+def step_then_non_outliers(context):
+    """Check that every timestamp outside ``context.outliers`` is an expected one.
+
+    The expected non-outlier timestamps come from the ``Timestamp`` column of
+    the step's data table; without a table the nine regular days of the
+    fixture series are expected. Order follows ``context.time_series``.
+    """
+    table = getattr(context, 'table', None)
+    if table is not None:
+        expected_non_outliers = [row['Timestamp'] for row in table]
+    else:
+        expected_non_outliers = [
+            '2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
+            '2023-01-06', '2023-01-07', '2023-01-08', '2023-01-09',
+            '2023-01-10',
+        ]
+    flagged = set(context.outliers)
+    detected_non_outliers = [
+        stamp
+        for stamp in context.time_series['timestamp'].tolist()
+        if stamp not in flagged
+    ]
+    assert detected_non_outliers == expected_non_outliers, (
+        f'expected non-outliers {expected_non_outliers}, '
+        f'got {detected_non_outliers}'
+    )
diff --git a/src/t8s/util.py b/src/t8s/util.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 from pandas import Series
+from scipy.stats import zscore
 from t8s.ts import TimeSerie
 from t8s.ts_writer import TSWriter, WriteParquetFile
 from t8s.log_config import LogConfig
@@ -116,6 +117,27 @@ def identify_all_start_and_end_of_nan_block(df: pd.DataFrame) -> tuple[dict[str,
 
         return (ret_all_s_e_nan_block, last_idx)
 
+    @staticmethod
+    def detect_outliers(
+        df: pd.DataFrame, col: str, method: str, threshold: 'float | None' = None
+    ) -> list[bool]:
+        """Flag the outliers of one numeric DataFrame column.
+
+        Args:
+            df: DataFrame holding the column to inspect.
+            col: Name of a numeric column of ``df``.
+            method: ``'zscore'`` flags values whose absolute z-score exceeds
+                ``threshold``; ``'iqr'`` flags values lying more than
+                ``threshold`` interquartile ranges below the first quartile
+                or above the third quartile.
+            threshold: Cut-off for the chosen method. ``None`` selects the
+                method's default: 2.0 for ``'zscore'``, 1.5 for ``'iqr'``.
+
+        Returns:
+            One boolean per row, in row order; ``True`` marks an outlier.
+            Missing values are never flagged, and an empty column yields an
+            empty list.
+
+        Raises:
+            ValueError: If ``method`` is neither ``'zscore'`` nor ``'iqr'``.
+        """
+        column = df[col]
+        if method not in ('zscore', 'iqr'):
+            raise ValueError(f'Unknown method: {method}')
+
+        values = column.to_numpy(dtype=float, na_value=np.nan)
+        if values.size == 0:
+            return []
+
+        if method == 'zscore':
+            limit = 2.0 if threshold is None else threshold
+            # 'omit' keeps a single NaN from turning every score into NaN,
+            # which would silently flag nothing.
+            z_scores = zscore(values, nan_policy='omit')
+            return (np.abs(z_scores) > limit).tolist()
+
+        factor = 1.5 if threshold is None else threshold
+        # nanpercentile ignores missing values when locating the quartiles.
+        q25, q75 = np.nanpercentile(values, [25, 75])
+        iqr = q75 - q25
+        lower_bound = q25 - factor * iqr
+        upper_bound = q75 + factor * iqr
+        # Comparisons against NaN are False, so missing values stay unflagged.
+        return ((values < lower_bound) | (values > upper_bound)).tolist()
+
+
     @staticmethod
     def get_limits(df: pd.DataFrame, factor: int) -> pd.DataFrame:
         limits = pd.DataFrame(columns=df.columns)
@@ -156,5 +178,3 @@ def get_tuple_max_min_factors(df, col) -> tuple[int, int]:
             max_factor_dict[col] = get_tuple_max_min_factors(df, col)
 
         return max_factor_dict
-
-