Skip to content

Commit

Permalink
add: time_stats for numeric value columns added
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobgabriel committed Oct 27, 2024
1 parent b60ff8b commit 69d4f17
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 31 deletions.
78 changes: 50 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,58 @@

Timeseries-Shaper is a Python library for efficiently filtering and preprocessing time series data using pandas. It provides a set of tools to handle various data transformations, making data preparation tasks easier and more intuitive.

## Features
Besides that multiple engineering specific methods are utilized to make it fast and easy to work with time series data.

### Load
## Features | Structure

- **Load Parquet**: Load parquet files for further processing


### Transform



#### Filter

- **Filter Missing Values**: Quickly filter out or fill missing values in your time series data.
- **Boolean Filters**: Apply boolean logic to filter data based on specific conditions.
- **Integer and Double Filters**: Perform numeric operations and filters specific to integer and double data types.
- **String Filters**: Manipulate and filter data based on string operations.


#### Calculation

- **Numeric Calc**: Apply calculations to numeric value columns (see `calculator/numeric_calc.py`).


#### Descriptive Statistics

- **Boolean Stats**: Summary statistics for boolean columns.
- **Numeric Stats**: Summary statistics for integer and double columns.
- **String Stats**: Summary statistics for string columns.
- **Timeseries Stats**: Summary statistics for timestamp columns.
```
├── timeseries_shaper
│ ├── __init__.py
│ ├── base.py
│ ├── calculator
│ │ ├── __init__.py
│ │ └── numeric_calc.py
│ ├── cycles
│ │ ├── __init__.py
│ │ ├── cycle_processor.py
│ │ └── cycles_extractor.py
│ ├── events
│ │ ├── __init__.py
│ │ ├── outlier_detection.py
│ │ ├── statistical_process_control.py
│ │ ├── tolerance_deviation.py
│ │ └── value_mapping.py
│ ├── filter
│ │ ├── __init__.py
│ │ ├── boolean_filter.py
│ │ ├── custom_filter.py
│ │ ├── datetime_filter.py
│ │ ├── numeric_filter.py
│ │ └── string_filter.py
│ ├── functions
│ │ ├── __init__.py
│ │ └── lambda_func.py
│ ├── loader
│ │ ├── __init__.py
│ │ ├── metadata
│ │ │ ├── __init__.py
│ │ │ ├── metadata_api_loader.py
│ │ │ └── metadata_json_loader.py
│ │ └── timeseries
│ │ ├── __init__.py
│ │ ├── parquet_loader.py
│ │ ├── s3proxy_parquet_loader.py
│ │ └── timescale_loader.py
│ ├── stats
│ │ ├── __init__.py
│ │ ├── boolean_stats.py
│ │ ├── numeric_stats.py
│ │ ├── string_stats.py
│ │ └── timestamp_stats.py
│ ├── time_stats
│ │ ├── __init__.py
│ │ └── time_stats_numeric.py
```


## Installation
Expand Down
30 changes: 30 additions & 0 deletions path_extract_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os

def generate_markdown_tree(directory, prefix=""):
    """Generate a markdown tree structure of a given directory, excluding __pycache__ folders.

    Args:
        directory (str): Path of the directory to walk.
        prefix (str): Indentation prefix carried through the recursion.

    Returns:
        list[str]: One rendered line per entry, in sorted order.
    """
    result = []
    # Filter __pycache__ out BEFORE enumerating: skipping it inside the loop
    # breaks the connectors — when __pycache__ sorts last, the preceding entry
    # would wrongly get "├── " even though it is the last item displayed.
    items = sorted(item for item in os.listdir(directory) if item != "__pycache__")
    for index, item in enumerate(items):
        path = os.path.join(directory, item)
        connector = "├── " if index < len(items) - 1 else "└── "
        result.append(f"{prefix}{connector}{item}")
        if os.path.isdir(path):
            sub_prefix = "│ " if index < len(items) - 1 else " "
            result.extend(generate_markdown_tree(path, prefix + sub_prefix))
    return result

def save_markdown_tree(directory, output_file="folder_structure.md"):
    """Saves the markdown tree structure to a file with utf-8 encoding, excluding __pycache__ folders."""
    lines = generate_markdown_tree(directory)
    content = "\n".join(lines)
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(content)
    print(f"Markdown folder structure saved to {output_file}")

# Replace 'your_directory_path' with the path to your folder
# NOTE: runs on import — this module is intended to be executed as a script.
directory_path = './src'  # root directory rendered into folder_structure.md
save_markdown_tree(directory_path)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
pandas==2.1.0
pdoc==14.4.0 # for docs
pdoc==14.6.1 # for docs
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

setuptools.setup(
name = "timeseries-shaper",
version = "0.0.0.10",
version = "0.0.0.11",
author = "Jakob Gabriel",
author_email = "[email protected]",
description = "timeseries-shaper filters, transforms and abstracts your timeseries dataframe",
description = "timeseries-shaper filters, transforms and engineer your timeseries dataframe",
long_description = long_description,
long_description_content_type = "text/markdown",
url = "https://jakobgabriel.github.io/timeseries-shaper/timeseries_shaper.html",
Expand Down
Empty file.
89 changes: 89 additions & 0 deletions src/timeseries_shaper/time_stats/time_stats_numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
from ..base import Base

class TimeGroupedStatistics(Base):
    """
    A class for calculating time-grouped statistics on numeric data, with class methods to apply various statistical functions.
    """

    # Maps each supported method name to (aggregator over the resampled
    # series, output column label). Keeping them in one table avoids a long
    # if/elif chain and gives a single place to add new statistics.
    _STAT_DISPATCH = {
        'mean': (lambda g: g.mean(), 'mean'),
        'sum': (lambda g: g.sum(), 'sum'),
        'min': (lambda g: g.min(), 'min'),
        'max': (lambda g: g.max(), 'max'),
        # diff: last value - first value within each interval
        'diff': (lambda g: g.last() - g.first(), 'difference'),
        # range: max value - min value within each interval
        'range': (lambda g: g.max() - g.min(), 'range'),
    }

    @classmethod
    def calculate_statistic(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, stat_method: str) -> pd.DataFrame:
        """
        Calculate a specified statistic on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            stat_method (str): The statistical method to apply ('mean', 'sum', 'min', 'max', 'diff', 'range').

        Returns:
            pd.DataFrame: A DataFrame with the time intervals and the calculated statistic.

        Raises:
            ValueError: If stat_method is not one of the supported names.
        """
        # Resample once; all aggregators operate on the same grouped series.
        grouped = dataframe.set_index(time_column).resample(freq)[value_column]

        try:
            aggregate, label = cls._STAT_DISPATCH[stat_method]
        except KeyError:
            raise ValueError(
                "Invalid stat_method. Choose from 'mean', 'sum', 'min', 'max', 'diff', 'range'."
            ) from None

        return aggregate(grouped).to_frame(label)

    @classmethod
    def calculate_statistics(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, stat_methods: list) -> pd.DataFrame:
        """
        Calculate multiple specified statistics on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            stat_methods (list): A list of statistical methods to apply (e.g., ['mean', 'sum', 'diff', 'range']).

        Returns:
            pd.DataFrame: A DataFrame with the time intervals and one column per requested statistic.
        """
        frames = [
            cls.calculate_statistic(dataframe, time_column, value_column, freq, method)
            for method in stat_methods
        ]
        # pd.concat raises on an empty list; preserve the original behaviour
        # of returning an empty DataFrame when no methods are requested.
        if not frames:
            return pd.DataFrame()
        # One concat over a shared resampled index replaces the original's
        # repeated outer joins (O(k^2) in the number of statistics).
        return pd.concat(frames, axis=1)

    @classmethod
    def calculate_custom_func(cls, dataframe: pd.DataFrame, time_column: str, value_column: str, freq: str, func) -> pd.DataFrame:
        """
        Apply a custom aggregation function on the value column over the grouped time intervals.

        Args:
            dataframe (pd.DataFrame): The DataFrame containing the data.
            time_column (str): The name of the time column to group and sort by.
            value_column (str): The name of the numeric column to calculate statistics on.
            freq (str): Frequency string for time grouping (e.g., 'H' for hourly, 'D' for daily).
            func (callable): Custom function applied to each resampled group; its
                return value becomes the 'custom' column for that interval.

        Returns:
            pd.DataFrame: A DataFrame with a single 'custom' column of results.
        """
        grouped_df = dataframe.set_index(time_column).resample(freq)
        result = grouped_df[value_column].apply(func).to_frame('custom')
        return result

0 comments on commit 69d4f17

Please sign in to comment.