Commit bfb9e3c: Merge branch 'master' into outlier_detection
joao-parana authored Aug 28, 2023
2 parents aa8527c + f17ff23
Showing 17 changed files with 774 additions and 165 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ lixo*
.ruff_cache
.pytest_cache
*.mp4

.venv/
requirements.txt
44 changes: 39 additions & 5 deletions README.md
@@ -13,6 +13,8 @@
- [Installation](#installation)
- [Testing](#testing)
- [Publishing](#publishing)
- [Graphics](#graphics)
- [Virtual Environment](#virtual-environment)
- [License](#license)

## Installation
@@ -31,24 +33,23 @@ pyright --level warning .
## Testing

```bash
# To configure the test environment, set the T8S_WORKSPACE_DIR environment variable, for example:
export T8S_WORKSPACE_DIR=/Volumes/dev/t8s
```
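Test code can then pick the workspace up from the environment. As an illustrative sketch only (not code from this repository, and the directory layout below is an assumption):

```python
# Hypothetical sketch: resolving test data paths from T8S_WORKSPACE_DIR.
import os
from pathlib import Path

# Fall back to the current directory when the variable is not set.
workspace = Path(os.environ.get("T8S_WORKSPACE_DIR", "."))
parquet_dir = workspace / "data" / "parquet"
```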

![BDD](docs/bdd.png)

See also [BDD](docs/behave.md)

```bash
# To inspect the test environment configuration:
hatch config show
hatch clean
hatch build
# Edit your main.py code
hatch run python3 main.py
./test-all.sh
# Using BDD with behave (https://behave.readthedocs.io/en/latest/)
rm logs/timeseries.log
hatch run python -m behave --logging-level INFO --no-capture --no-capture-stderr --no-skipped features
cat logs/timeseries.log
@@ -60,6 +61,39 @@ cat logs/timeseries.log
hatch publish
```

## Graphics

Execute the examples below:

```bash
alias st='streamlit run --server.headless true --theme.base light '
st graphics/graph-01.py
st graphics/graph-02.py
```

Then open the displayed URL in your browser.

## Virtual Environment

To generate `requirements.txt` from scratch:

```bash
python3 -m venv .venv
source .venv/Scripts/activate  # Windows (Git Bash); on Linux/macOS use: source .venv/bin/activate
# Install packages
python3 -m pip install --upgrade pip
python3 -m pip install --upgrade pyright
python3 -m pip install tensorflow tensorflow-metadata tensorflow-datasets
python3 -m pip install -e .
# Freeze installation
python3 -m pip freeze > requirements.txt
# Do your job
# . . .
#
deactivate
rm -rf .venv/*
```

## License

`t8s` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.
Binary file added data/parquet/ts_03.parquet
Binary file not shown.
Binary file added datasets/mach13.parquet
Binary file not shown.
Binary file added datasets/machine13_01.parquet
Binary file not shown.
20 changes: 19 additions & 1 deletion features/01.create_timeserie.feature
@@ -42,4 +42,22 @@ Value Statement:
When I convert a Datafusion Table to a Pandas Dataframe using the Datafusion API and the query mentioned above
And I create a time series
Then I have a time series with the `correct` number of rows and columns, schema and time interval to be checked
    # https://github.com/apache/arrow-datafusion-python/blob/main/examples/sql-on-pandas.py

Scenario: Fourth, I need to display the descriptive statistics of a time series
Given a time series
When I call the get_statistics function
Then I have a descriptive statistics object for the time series

Scenario: Fifth, I need to find the standard deviation of a time series to check for outliers using a naive method
Given a time series
When I call the get_min_max_variation_factors function in the Util class
Then I have a dictionary object with minimum and maximum multiplication factor for each feature
And I can use this information to check for outliers using a naive method

Scenario: Sixth, I need to select only a subset of features in a time series persisted as a parquet file
Given a time series
When I pass select_features as a list of feature names to the TimeSerie constructor
Then I have a time series with only a subset of features as defined in the list
And I can read from the file system only the resources I need, improving performance when reading data.
# It is done using parquet module from pyarrow package
18 changes: 6 additions & 12 deletions features/04.identify_nans.feature
@@ -5,18 +5,12 @@ Feature: Identify NaN values in multivariate and univariate Timeseries on wide f
So I can start analyzing the data right away and come up with solutions for the business.

Background:
Given that I have a T8S_WORKSPACE_DIR and a wide format time series persisted to a Parquet file

Scenario: Identify NaN values in multivariate Timeseries on wide format
Given that I create a multivariate Timeseries using the selected parquet file in the T8S_WORKSPACE/data/parquet directory
When I check the multivariate Timeseries for NaN values
Then I build a dictionary of NaN values blocks to use elsewhere
And I check the result of NaNs blocks.
# Constraint: The Timeseries has no invalid values
Given that I have a TimeSerie with a bunch of NaNs blocks saved as a parquet file in T8S_WORKSPACE_DIR

Scenario: Identify NaN values in univariate Timeseries on wide format
Given that I create a univariate Timeseries set using the selected parquet file in the T8S_WORKSPACE/data/parquet directory
When I check the univariate Timeseries for NaN values
Then I build a dictionary list of NaN values blocks to use elsewhere
And I check the result of NaNs blocks of univariate Timeseries.
Given that I read a multivariate Timeseries and convert to univariate timeseries list
When I check the first univariate Timeseries from list for NaN values
Then I build a dataframe describing blocks of NaN values to use elsewhere
And I check the result of NaNs blocks of univariate Timeseries
And I can also add a column with the corrections indicated by the imputation and see the result graphically.
# Constraint: The Timeseries has no invalid values
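Describing contiguous blocks of NaN values, as the scenario above does, can be sketched in plain pandas (an illustrative approach, not necessarily how t8s implements it):

```python
# Sketch: locate contiguous blocks of NaN values in a univariate series.
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0, np.nan, 6.0])
is_nan = s.isna()
# A new run id starts whenever the NaN / non-NaN state changes.
run_id = (is_nan != is_nan.shift()).cumsum()
# Keep only the NaN runs and summarize each as (start, end, length).
blocks = [
    (int(g.index[0]), int(g.index[-1]), len(g))
    for _, g in s[is_nan].groupby(run_id[is_nan])
]
print(blocks)  # -> [(1, 2, 2), (4, 4, 1)]
```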
110 changes: 103 additions & 7 deletions features/steps/01.create_timeserie_steps.py
@@ -8,14 +8,18 @@
from t8s.util import Util
from t8s.io import IO
from t8s.ts import TimeSerie
from t8s.ts import TSStats
from t8s.ts_writer import TSWriter, WriteParquetFile
from t8s.ts_builder import TSBuilder, ReadParquetFile # , ReadCsvFile
from behave import given, when, then, use_step_matcher, step # type: ignore
from behave.model import Table # type: ignore
from behave_pandas import table_to_dataframe, dataframe_to_table # type: ignore
from logging import INFO, DEBUG, WARNING, ERROR, CRITICAL

logger = LogConfig().getLogger()

epsilon = 1e-6

"""
Feature: Create a time series set using Dataframe, CSV and Parquet
@@ -214,19 +218,111 @@ def table_to_ts(context):

@given(u'a Datafusion SQL query')
def step_impl1(context):
    logger.info(u'STEP: Given a Datafusion SQL query')
    logger.info(context.ts1)

@when(u'I convert a Datafusion Table to a Pandas Dataframe using the Datafusion API and the query mentioned above')
def step_impl2(context):
    logger.info(u'STEP: When I convert a Datafusion Table to a Pandas Dataframe using the Datafusion API and the query mentioned above')

@when(u'I create a time series')
def step_impl3(context):
    logger.info(u'STEP: When I create a time series')

@then(u'I have a time series with the `correct` number of rows and columns, schema and time interval to be checked')
def step_impl4(context):
    logger.info(u'STEP: Then I have a time series with the `correct` number of rows and columns, schema and time interval to be checked')

# -----------------------------------------------------------------------------------------------

@given(u'a time series')
def step_impl_5(context):
    logger.info(u'STEP: Given a time series')
    logger.info(context.ts1)

@when(u'I call the get_statistics function')
def step_impl_6(context):
    logger.info(u'STEP: When I call the get_statistics function')
    logger.info(context.ts1)
    stats: TSStats = context.ts1.get_statistics()
    context.stats = stats

@then(u'I have a descriptive statistics object for the time series')
def step_impl_7(context):
    logger.info(u'STEP: Then I have a descriptive statistics object for the time series')
    print(f'ts1 statistics =\n{context.stats}')
    # Use abs() so the comparison bounds the difference on both sides;
    # a one-sided check always passes when the difference is negative.
    assert context.stats is not None
    assert context.stats.count('timestamp') == 4
    assert abs(context.stats.mean('velocidade') - 2325) <= epsilon
    assert abs(context.stats.mean('temperatura') - 25.299999) <= epsilon
    assert abs(context.stats.min('velocidade') - 1100) <= epsilon
    assert abs(context.stats.min('temperatura') - 23.2) <= epsilon
    assert abs(context.stats.q1('velocidade') - 1175) <= epsilon
    assert abs(context.stats.q1('temperatura') - 24.55) <= epsilon
    assert abs(context.stats.q2('velocidade') - 2100) <= epsilon
    assert abs(context.stats.q2('temperatura') - 25.50) <= epsilon
    assert abs(context.stats.q3('velocidade') - 3250) <= epsilon
    assert abs(context.stats.q3('temperatura') - 26.25) <= epsilon
    assert abs(context.stats.max('velocidade') - 4000) <= epsilon
    assert abs(context.stats.max('temperatura') - 27) <= epsilon
    assert abs(context.stats.std('velocidade') - 1417.450806) <= epsilon
    assert abs(context.stats.std('temperatura') - 1.620699) <= epsilon
"""
ts1 statistics =
timestamp temperatura velocidade
Contagem 4 4.000000 4.000000
Média 2022-01-01 01:30:00 25.299999 2325.000000
Mínimo 2022-01-01 00:00:00 23.200001 1100.000000
Primeiro quartil 2022-01-01 00:45:00 24.550000 1175.000000
Mediana 2022-01-01 01:30:00 25.500000 2100.000000
Terceiro quartil 2022-01-01 02:15:00 26.250000 3250.000000
Máximo 2022-01-01 03:00:00 27.000000 4000.000000
Desvio padrão NaN 1.620699 1417.450806
"""

# -----------------------------------------------------------------------------------------------

@when(u'I call the get_min_max_variation_factors function in the Util class')
def step_impl_8(context):
    logger.info(u'STEP: When I call the get_min_max_variation_factors function in the Util class')
    min_max_variation_factors = Util.get_min_max_variation_factors(context.ts1.df)
    context.min_max_variation_factors = min_max_variation_factors

@then(u'I have a dictionary object with minimum and maximum multiplication factor for each feature')
def step_impl_9(context):
    logger.info(u'STEP: Then I have a dictionary object with minimum and maximum multiplication factor for each feature')
    logger.info(f'min_max_variation_factors =\n{context.min_max_variation_factors}')

@then(u'I can use this information to check for outliers using a naive method')
def step_impl_10(context):
    logger.info(u'STEP: Then I can use this information to check for outliers using a naive method')
    logger.info('If min_max_variation_factors is greater than 3 in at least one column, we say that there is an outlier in the time series')

# -----------------------------------------------------------------------------------------------

@when(u'I pass select_features as a list of feature names to the TimeSerie constructor')
def read_ts_for_selected_features(context):
    logger.info(u'STEP: When I pass select_features as a list of feature names to the TimeSerie constructor')
    ctx = TSBuilder(ReadParquetFile())
    logger.debug("Client: ReadStrategy is set to read Parquet file.")
    path = Path(context.PARQUET_PATH) / 'ts_01.parquet'
    start = datetime.now()
    select_features = ['timestamp', 'velocidade']
    ts_for_selected_features = ctx.build_from_file(path, select_features)
    end = datetime.now()
    context.ts_for_selected_features = ts_for_selected_features
    context.elapsed_time_for_read_only_selected_features = end - start

@then(u'I have a time series with only a subset of features as defined in the list')
def check_time_serie_with_two_features(context):
    logger.info(u'STEP: Then I have a time series with only a subset of features as defined in the list')
    logger.info(f'context.ts_for_selected_features =\n{context.ts_for_selected_features}')
    assert context.ts_for_selected_features.format == 'wide', 'format must be wide'
    assert context.ts_for_selected_features.features == '2', 'features_qty must be 2 in this case'
    assert context.ts_for_selected_features.df.columns[0] == 'timestamp', 'first column must be timestamp'
    assert context.ts_for_selected_features.df.columns[1] == 'velocidade', 'second column must be velocidade'

@then(u'I can read from the file system only the resources I need, improving performance when reading data.')
def time_for_read_from_parquet_file_only_2_columns(context):
    logger.info(u'STEP: Then I can read from the file system only the resources I need, improving performance when reading data.')
    logger.info(f'context.elapsed_time_for_read_only_selected_features = {context.elapsed_time_for_read_only_selected_features}')
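The naive outlier check exercised in the steps above can be sketched as follows. This is a hypothetical re-implementation of what `Util.get_min_max_variation_factors` might compute, not the library's actual code:

```python
# Sketch: per-column min/max multiplicative variation between consecutive samples.
# A max factor above 3 in any column is taken as a naive outlier signal.
import pandas as pd

def min_max_variation_factors(df: pd.DataFrame) -> dict:
    factors = {}
    for col in df.select_dtypes("number").columns:
        # Ratio of each sample to its predecessor; the first row drops out.
        ratios = (df[col] / df[col].shift(1)).dropna()
        factors[col] = (float(ratios.min()), float(ratios.max()))
    return factors

df = pd.DataFrame({"velocidade": [1100.0, 1200.0, 4000.0, 1150.0]})
factors = min_max_variation_factors(df)
# 4000/1200 is roughly 3.33 > 3, so 'velocidade' would be flagged.
has_outlier = any(max_f > 3 for _, max_f in factors.values())
```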
4 changes: 2 additions & 2 deletions features/steps/02.convert_timeserie.feature_steps.py
@@ -10,8 +10,8 @@
from t8s.ts import TimeSerie
from t8s.ts_writer import TSWriter, WriteParquetFile
from behave import given, when, then, use_step_matcher, step # type: ignore
from behave.model import Table  # type: ignore
from behave_pandas import table_to_dataframe, dataframe_to_table  # type: ignore
from logging import INFO, DEBUG, WARNING, ERROR, CRITICAL

logger = LogConfig().getLogger()