Skip to content

Commit

Permalink
Merge pull request #2 from joao-parana/outlier_detection
Browse files Browse the repository at this point in the history
wip: new feature for outlier detection using zscore and iqr.
  • Loading branch information
joao-parana authored Aug 28, 2023
2 parents f17ff23 + bfb9e3c commit d7589b5
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 2 deletions.
54 changes: 54 additions & 0 deletions features/05.outliers-detection.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
Feature: Outlier Detection in multivariate and univariate time series in wide format
Value Statement:
As a data analyst
I want the ability to identify outliers in multivariate and univariate time series in wide format
So I can start analyzing the data right away and come up with solutions for the business.

Scenario: Detecting outliers in a univariate time series in wide format

Given a time series dataset
| Timestamp | Value |
| 2023-01-01 | 10 |
| 2023-01-02 | 15 |
| 2023-01-03 | 12 |
| 2023-01-04 | 14 |
| 2023-01-05 | 120 |
| 2023-01-06 | 13 |
| 2023-01-07 | 16 |
| 2023-01-08 | 18 |
| 2023-01-09 | 14 |
| 2023-01-10 | 17 |

When the Z-Score outlier detection algorithm is applied
Then outliers should be identified using Z-Score
| Timestamp | Value | Outlier Detection Method |
| 2023-01-05 | 120 | Z-Score |

And non-outliers should not be flagged as outliers
| Timestamp | Value | Outlier Detection Method |
| 2023-01-01 | 10 | None |
| 2023-01-02 | 15 | None |
| 2023-01-03 | 12 | None |
| 2023-01-04 | 14 | None |
| 2023-01-06 | 13 | None |
| 2023-01-07 | 16 | None |
| 2023-01-08 | 18 | None |
| 2023-01-09 | 14 | None |
| 2023-01-10 | 17 | None |

When the IQR-based outlier detection algorithm is applied
Then outliers should be identified using IQR
| Timestamp | Value | Outlier Detection Method |
| 2023-01-05 | 120 | IQR |

And non-outliers should not be flagged as outliers
| Timestamp | Value | Outlier Detection Method |
| 2023-01-01 | 10 | None |
| 2023-01-02 | 15 | None |
| 2023-01-03 | 12 | None |
| 2023-01-04 | 14 | None |
| 2023-01-06 | 13 | None |
| 2023-01-07 | 16 | None |
| 2023-01-08 | 18 | None |
| 2023-01-09 | 14 | None |
| 2023-01-10 | 17 | None |
57 changes: 57 additions & 0 deletions features/steps/05.outliers-detection.feature_steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Import necessary libraries
from behave import given, when, then
import pandas as pd
import numpy as np

from t8s.util import Util


# Fixture: ten consecutive daily readings for January 2023, one value per day.
time_series_data = list(
    zip(
        [f"2023-01-{day:02d}" for day in range(1, 11)],
        [10, 15, 12, 14, 120, 13, 16, 18, 14, 17],
    )
)

# Two-column frame ("timestamp", "tag") built from the fixture rows.
df = pd.DataFrame(data=time_series_data, columns=["timestamp", "tag"])


@given('a time series dataset')
def step_given_time_series(context):
    """Store the time series fixture on the behave context.

    A copy of the module-level frame is stored rather than the frame itself,
    so a step that mutates ``context.time_series`` in place cannot leak that
    change into later scenarios that reuse the same fixture.
    """
    context.time_series = df.copy()

@when('the Z-Score outlier detection algorithm is applied')
def step_when_zscore_detection(context):
    """Run Z-Score detection on the 'tag' column and record flagged timestamps.

    The timestamps of the rows selected by the returned mask are stored on
    ``context.outliers`` as a list.
    """
    series_frame = context.time_series
    is_outlier = Util.detect_outliers(series_frame, 'tag', 'zscore')
    flagged_rows = series_frame[is_outlier]
    context.outliers = flagged_rows['timestamp'].tolist()

@then('outliers should be identified using Z-Score')
def step_then_zscore_outliers(context):
    """Check that Z-Score detection flagged exactly the expected timestamps.

    The expected timestamps are read from the "Timestamp" column of the
    step's data table when the scenario supplies one, so the feature file
    stays the source of truth. Without such a table the previous hard-coded
    expectation is used.
    """
    table = getattr(context, 'table', None)
    if table is not None and 'Timestamp' in table.headings:
        expected_outliers = [row['Timestamp'] for row in table]
    else:
        expected_outliers = ['2023-01-05']
    assert context.outliers == expected_outliers, (
        f'Z-Score outliers mismatch: expected {expected_outliers}, '
        f'got {context.outliers}'
    )

@when('the IQR-based outlier detection algorithm is applied')
def step_when_iqr_detection(context):
    """Run IQR detection on the 'tag' column and record flagged timestamps.

    The timestamps of the rows selected by the returned mask are stored on
    ``context.outliers`` as a list.
    """
    series_frame = context.time_series
    is_outlier = Util.detect_outliers(series_frame, 'tag', 'iqr')
    flagged_rows = series_frame[is_outlier]
    context.outliers = flagged_rows['timestamp'].tolist()

@then('outliers should be identified using IQR')
def step_then_iqr_outliers(context):
    """Check that IQR detection flagged exactly the expected timestamps.

    The expected timestamps are read from the "Timestamp" column of the
    step's data table when the scenario supplies one, so the feature file
    stays the source of truth. Without such a table the previous hard-coded
    expectation is used.
    """
    table = getattr(context, 'table', None)
    if table is not None and 'Timestamp' in table.headings:
        expected_outliers = [row['Timestamp'] for row in table]
    else:
        expected_outliers = ['2023-01-05']
    assert context.outliers == expected_outliers, (
        f'IQR outliers mismatch: expected {expected_outliers}, '
        f'got {context.outliers}'
    )

@then('non-outliers should not be flagged as outliers')
def step_then_non_outliers(context):
    """Check that every timestamp not flagged as an outlier is as expected.

    The expected timestamps are read from the "Timestamp" column of the
    step's data table when the scenario supplies one; otherwise the previous
    hard-coded list is used. The comparison is order-sensitive and follows
    the row order of ``context.time_series``.
    """
    table = getattr(context, 'table', None)
    if table is not None and 'Timestamp' in table.headings:
        expected_non_outliers = [row['Timestamp'] for row in table]
    else:
        expected_non_outliers = [
            '2023-01-01',
            '2023-01-02',
            '2023-01-03',
            '2023-01-04',
            '2023-01-06',
            '2023-01-07',
            '2023-01-08',
            '2023-01-09',
            '2023-01-10',
        ]

    # Set membership avoids rescanning the outlier list for every row.
    flagged = set(context.outliers)
    detected_non_outliers = [
        stamp
        for stamp in context.time_series['timestamp'].tolist()
        if stamp not in flagged
    ]
    assert detected_non_outliers == expected_non_outliers, (
        f'non-outliers mismatch: expected {expected_non_outliers}, '
        f'got {detected_non_outliers}'
    )
24 changes: 22 additions & 2 deletions src/t8s/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
from pandas import Series
from scipy.stats import zscore
from t8s.ts import TimeSerie
from t8s.ts_writer import TSWriter, WriteParquetFile
from t8s.log_config import LogConfig
Expand Down Expand Up @@ -116,6 +117,27 @@ def identify_all_start_and_end_of_nan_block(df: pd.DataFrame) -> tuple[dict[str,

return (ret_all_s_e_nan_block, last_idx)

@staticmethod
def detect_outliers(df: pd.DataFrame, col: str, method: str) -> list:
# Compute the z-score for the given column in the DataFrame and return a list of booleans indicating the outliers
values = df[col]

if method == 'zscore':
z_scores = zscore(values)
limit = 2.0
return [abs(z_score) > limit for z_score in z_scores]

elif method == 'iqr':
q25 = np.percentile(values, 25)
q75 = np.percentile(values, 75)
iqr = q75 - q25
lower_bound = q25 - 1.5 * iqr
upper_bound = q75 + 1.5 * iqr
return [value < lower_bound or value > upper_bound for value in values]

else:
raise ValueError(f'Unknown method: {method}')

@staticmethod
def get_limits(df: pd.DataFrame, factor: int) -> pd.DataFrame:
limits = pd.DataFrame(columns=df.columns)
Expand Down Expand Up @@ -156,5 +178,3 @@ def get_tuple_max_min_factors(df, col) -> tuple[int, int]:
max_factor_dict[col] = get_tuple_max_min_factors(df, col)

return max_factor_dict


0 comments on commit d7589b5

Please sign in to comment.