Skip to content

Commit

Permalink
Speed up data processing (#683)
Browse files Browse the repository at this point in the history
* Profile data tests #682

Generate a CPU intensity graph of the functions used during pytest.

* Add a pytest project file
* Add pytest-profiling
* Add graphviz
* Document how to profile performance

* Speedup read_excel for SMHI data #682

This shaved off 6 seconds from the test.

* Use calamine when importing SMHI data

* Cache SMHI data #682

Speed up repeat usage of SMHI data to a fraction.

* Add feather-format
* Use a feather file cache for the imported SMHI data.

* Make default excel file path and df cache period configurable #682

Generalize the df cache decorator.

* Add a cache_utilities module #682

Refactor-out the cache_df decorator.

* Remove duplicate requirement #682

feather is part of pyarrow which is already in requirements.

* Remove feather-format

* cache_df is not just for read_excel #682

Do not mention excel in cache_df.

Also clarify why column names are cached separately.

* Improve documentation of cache_df #682

* More explicit hint on functions supported.
* Clarify options.

* Make cache_df work without path #682

* Make default path a valid string.
* Remove obsolete error handling for no path.

* Make cache_df decoration more apparent #682

* Hint a return type that is the same as the decorated function.

* Use relative import of cache_df #682

Fixes a problem when using unittest rather than pytest.

* Revert "Speedup read_excel for SMHI data #682"

This reverts commit 216339e

* Speedup read_excel for SMHI data using openpyxl #682

Use openpyxl to read SMHI data.

Note: calamine is only 15% faster which is not worth an additional dependency.
  • Loading branch information
joakimbits authored Oct 23, 2024
1 parent f35ee0f commit 19f34d1
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 3 deletions.
7 changes: 7 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ We utilize Python libraries such as Pandas and NumPy to perform various calculat

If you notice any test failing, please submit a ticket about it.

### Performance profiling

Requires Graphviz installed on the OS: https://www.graphviz.org/download/
```sh
py.test tests --profile-svg && open prof/combined.svg
```

### How to Update Data on Site

To recalculate and refresh the site's data, navigate to the `/data` folder and execute the following command:
Expand Down
109 changes: 109 additions & 0 deletions data/issues/emissions/cache_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import functools
import hashlib
import os
from typing import Callable, Optional

import pandas as pd
import pyarrow.feather as feather


def cache_df(f: Optional[Callable[..., pd.DataFrame]] = None,
             path: str = '', freq: str = '1Y') -> Callable[..., pd.DataFrame]:
    """
    Cache the DataFrame to an intermediate file and use it if created within the same period.
    Args:
        f: A function to cache (e.g., a function that loads a DataFrame).
        path: Path to the file to be cached. If provided without f, acts as a decorator.
        freq: Cache period, e.g. '1D', '1M', '1Y'. Defaults to '1Y'. If provided without f, acts as a decorator.
    Returns:
        Caching of the output - not calling the function unless we entered a new period.
    Example usage:
        Create a test Excel file:
        >>> df_test = pd.DataFrame({"A": [1], "B": [2]})
        >>> test_path = "test_data.xlsx"
        >>> df_test.to_excel(test_path, index=False)
        Use the decorator to cache the DataFrame loaded from the file:
        >>> @cache_df
        ... def load_data(path):
        ...     print("Creating DataFrame from file (first call)...")
        ...     return pd.read_excel(path)
        >>> load_data.__name__
        'cache_df_load_data'
        >>> print(load_data(test_path))
        Creating DataFrame from file (first call)...
           A  B
        0  1  2
        >>> print(load_data(test_path))  # Data loaded from cache, no print output
           A  B
        0  1  2
        Use the decorator with a short expiration time:
        >>> @cache_df(path=test_path, freq='1ms')
        ... def load_data_short_expiry(path=test_path):
        ...     print("Creating DataFrame from file (short expiration)...")
        ...     return pd.read_excel(path)
        >>> print(load_data_short_expiry())
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2
        >>> import time
        >>> time.sleep(0.001)  # Sleep for 1 millisecond
        >>> print(load_data_short_expiry())  # Data expired, loading again from file
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2
        Clean up the test files:
        >>> file_hash = hashlib.md5(test_path.encode()).hexdigest()
        >>> os.remove(test_path)
        >>> os.remove(f"cache_df_load_data_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_{file_hash}.pkl")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.pkl")
    """
    if f is None:
        # Called with configuration only, e.g. @cache_df(path=..., freq=...):
        # return a decorator that will receive the function afterwards.
        return lambda func: cache_df(func, path=path, freq=freq)

    @functools.wraps(f)  # preserve docstring/__qualname__ of the wrapped function
    def caching_f(*args, **kwargs):
        # Resolve the data source: explicit 'path' kwarg wins, then the first
        # positional argument, then the decorator-level default.
        input_path = kwargs.get('path') or (args[0] if args else path)

        # Hash the path so each distinct source gets its own cache files.
        path_hash = hashlib.md5(input_path.encode()).hexdigest()
        df_file = f'cache_df_{f.__name__}_{path_hash}.feather'
        columns_file = f'cache_df_{f.__name__}_{path_hash}.pkl'

        # Reuse the cache only if its mtime falls in the same period as now
        # (e.g. same calendar year for freq='1Y').
        if os.path.exists(df_file):
            stat = os.stat(df_file)
            cache_mtime = pd.Timestamp(stat.st_mtime_ns // 1_000_000, unit='ms')
            if pd.Period(pd.Timestamp.now(), freq=freq) == pd.Period(cache_mtime, freq=freq):
                df = pd.read_feather(df_file)
                # Restore the original column names: feather only supports
                # string headings, so they are stored separately (see below).
                if os.path.exists(columns_file):
                    df.columns = pd.read_pickle(columns_file)
                return df

        # Cache is missing or stale: recompute and persist the result.
        df = f(*args, **kwargs)
        feather.write_feather(df, df_file)

        # Save the original column names separately since feather does not support different heading types
        pd.to_pickle(df.columns, columns_file)

        return df

    # Documented, stable wrapper name (the doctest above relies on it).
    caching_f.__name__ = 'cache_df_' + f.__name__
    return caching_f


if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings when executed
    # directly as a script.
    from doctest import testmod

    testmod()
7 changes: 5 additions & 2 deletions data/issues/emissions/historical_data_calculations.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import pandas as pd

from .cache_utilities import cache_df

PATH_SMHI = 'https://nationellaemissionsdatabasen.smhi.se/' + \
'api/getexcelfile/?county=0&municipality=0&sub=CO2'


def get_smhi_data():
@cache_df(path=PATH_SMHI)
def get_smhi_data(path=PATH_SMHI):
"""
Downloads data from SMHI and loads it into a pandas dataframe.
Returns:
pandas.DataFrame: The dataframe containing the SMHI data.
"""

df_raw = pd.read_excel(PATH_SMHI)
df_raw = pd.read_excel(path, engine="openpyxl")

# Remove the first 4 rows and reset the index
df_raw = df_raw.drop([0, 1, 2]).reset_index(drop=True)
Expand Down
6 changes: 6 additions & 0 deletions data/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "klimatkollen-data"
4 changes: 3 additions & 1 deletion data/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ xlrd
openpyxl
pyarrow
scipy
pytest
pytest
pytest-profiling
graphviz

0 comments on commit 19f34d1

Please sign in to comment.