Skip to content

Commit

Permalink
add: time_stats for numeric value columns added
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobgabriel committed Oct 27, 2024
1 parent b60ff8b commit 69d4f17
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 31 deletions.
78 changes: 50 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,58 @@

Timeseries-Shaper is a Python library for efficiently filtering and preprocessing time series data using pandas. It provides a set of tools to handle various data transformations, making data preparation tasks easier and more intuitive.

## Features
Besides that multiple engineering specific methods are utilized to make it fast and easy to work with time series data.

### Load
## Features | Structure

- **Load Parquet**: Load parquet files for further processing


### Transform



#### Filter

- **Filter Missing Values**: Quickly filter out or fill missing values in your time series data.
- **Boolean Filters**: Apply boolean logic to filter data based on specific conditions.
- **Integer and Double Filters**: Perform numeric operations and filters specific to integer and double data types.
- **String Filters**: Manipulate and filter data based on string operations.


#### Calculation

- **Numeric Calc**: Apply calculations to numeric value columns (see `calculator/numeric_calc.py`).


#### Descriptive Statistics

- **Boolean Stats**: Summary statistics for boolean columns.
- **Numeric Stats**: Summary statistics for integer and double columns.
- **String Stats**: Summary statistics for string columns.
- **Timeseries Stats**: Summary statistics for timestamp columns.
```
├── timeseries_shaper
│ ├── __init__.py
│ ├── base.py
│ ├── calculator
│ │ ├── __init__.py
│ │ └── numeric_calc.py
│ ├── cycles
│ │ ├── __init__.py
│ │ ├── cycle_processor.py
│ │ └── cycles_extractor.py
│ ├── events
│ │ ├── __init__.py
│ │ ├── outlier_detection.py
│ │ ├── statistical_process_control.py
│ │ ├── tolerance_deviation.py
│ │ └── value_mapping.py
│ ├── filter
│ │ ├── __init__.py
│ │ ├── boolean_filter.py
│ │ ├── custom_filter.py
│ │ ├── datetime_filter.py
│ │ ├── numeric_filter.py
│ │ └── string_filter.py
│ ├── functions
│ │ ├── __init__.py
│ │ └── lambda_func.py
│ ├── loader
│ │ ├── __init__.py
│ │ ├── metadata
│ │ │ ├── __init__.py
│ │ │ ├── metadata_api_loader.py
│ │ │ └── metadata_json_loader.py
│ │ └── timeseries
│ │ ├── __init__.py
│ │ ├── parquet_loader.py
│ │ ├── s3proxy_parquet_loader.py
│ │ └── timescale_loader.py
│ ├── stats
│ │ ├── __init__.py
│ │ ├── boolean_stats.py
│ │ ├── numeric_stats.py
│ │ ├── string_stats.py
│ │ └── timestamp_stats.py
│ ├── time_stats
│ │ ├── __init__.py
│ │ └── time_stats_numeric.py
```


## Installation
Expand Down
30 changes: 30 additions & 0 deletions path_extract_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os

def generate_markdown_tree(directory, prefix=""):
    """Generate a markdown tree structure of a given directory, excluding __pycache__ folders.

    Args:
        directory (str): Path of the directory to walk.
        prefix (str): Indentation prefix carried through the recursion.

    Returns:
        list[str]: One rendered line per entry, in sorted order.
    """
    result = []
    # Filter __pycache__ out BEFORE enumerating: skipping it inside the loop
    # breaks the connectors — when __pycache__ sorts last, the preceding entry
    # would wrongly get "├── " even though it is the last item displayed.
    items = sorted(item for item in os.listdir(directory) if item != "__pycache__")
    for index, item in enumerate(items):
        path = os.path.join(directory, item)
        connector = "├── " if index < len(items) - 1 else "└── "
        result.append(f"{prefix}{connector}{item}")
        if os.path.isdir(path):
            sub_prefix = "│ " if index < len(items) - 1 else " "
            result.extend(generate_markdown_tree(path, prefix + sub_prefix))
    return result

def save_markdown_tree(directory, output_file="folder_structure.md"):
    """Saves the markdown tree structure to a file with utf-8 encoding, excluding __pycache__ folders."""
    lines = generate_markdown_tree(directory)
    content = "\n".join(lines)
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(content)
    print(f"Markdown folder structure saved to {output_file}")

# Replace 'your_directory_path' with the path to your folder
# NOTE: runs on import — this module is intended to be executed as a script.
directory_path = './src'  # root directory rendered into folder_structure.md
save_markdown_tree(directory_path)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pandas==2.1.0
pdoc==14.4.0 # for docs
pdoc==14.6.1 # for docs
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

setuptools.setup(
name = "timeseries-shaper",
version = "0.0.0.10",
version = "0.0.0.11",
author = "Jakob Gabriel",
author_email = "[email protected]",
description = "timeseries-shaper filters, transforms and abstracts your timeseries dataframe",
description = "timeseries-shaper filters, transforms and engineer your timeseries dataframe",
long_description = long_description,
long_description_content_type = "text/markdown",
url = "https://jakobgabriel.github.io/timeseries-shaper/timeseries_shaper.html",
Expand Down
Empty file.
89 changes: 89 additions & 0 deletions src/timeseries_shaper/time_stats/time_stats_numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
from ..base import Base

class TimeGroupedStatistics(Base):
    """
    A class for calculating time-grouped statistics on numeric data, with class methods to apply various statistical functions.
    """

    # Maps each supported method name to (aggregator over the resampled
    # series, output column label). Keeping them in one table avoids a long
    # if/elif chain and gives a single place to add new statistics.
    _STAT_DISPATCH = {
        'mean': (lambda g: g.mean(), 'mean'),
        'sum': (lambda g: g.sum(), 'sum'),
        'min': (lambda g: g.min(), 'min'),
        'max': (lambda g: g.max(), 'max'),
        # diff: last value - first value within each interval
        'diff': (lambda g: g.last() - g.first(), 'difference'),
        # range: max value - min value within each interval
        'range': (lambda g: g.max() - g.min(), 'range'),
    }

    @classmethod
    def calculate_statistic(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, stat_method: str) -> pd.DataFrame:
        """
        Calculate a specified statistic on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            stat_method (str): The statistical method to apply ('mean', 'sum', 'min', 'max', 'diff', 'range').

        Returns:
            pd.DataFrame: A DataFrame with the time intervals and the calculated statistic.

        Raises:
            ValueError: If stat_method is not one of the supported names.
        """
        # Resample once; all aggregators operate on the same grouped series.
        grouped = dataframe.set_index(time_column).resample(freq)[value_column]

        try:
            aggregate, label = cls._STAT_DISPATCH[stat_method]
        except KeyError:
            raise ValueError(
                "Invalid stat_method. Choose from 'mean', 'sum', 'min', 'max', 'diff', 'range'."
            ) from None

        return aggregate(grouped).to_frame(label)

    @classmethod
    def calculate_statistics(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, stat_methods: list) -> pd.DataFrame:
        """
        Calculate multiple specified statistics on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            stat_methods (list): A list of statistical methods to apply (e.g., ['mean', 'sum', 'diff', 'range']).

        Returns:
            pd.DataFrame: A DataFrame with the time intervals and one column per requested statistic.
        """
        frames = [
            cls.calculate_statistic(dataframe, time_column, value_column, freq, method)
            for method in stat_methods
        ]
        # pd.concat raises on an empty list; preserve the original behaviour
        # of returning an empty DataFrame when no methods are requested.
        if not frames:
            return pd.DataFrame()
        # One concat over a shared resampled index replaces the original's
        # repeated outer joins (O(k^2) in the number of statistics).
        return pd.concat(frames, axis=1)

    @classmethod
    def calculate_custom_func(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, func) -> pd.DataFrame:
        """
        Apply a custom aggregation function on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            func (callable): Custom function applied to each resampled group; its
                return value becomes the 'custom' column for that interval.

        Returns:
            pd.DataFrame: A DataFrame with a single 'custom' column of results.
        """
        grouped_df = dataframe.set_index(time_column).resample(freq)
        result = grouped_df[value_column].apply(func).to_frame('custom')
        return result

0 comments on commit 69d4f17

Please sign in to comment.