-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Profile data tests #682 Generate a CPU intensity graph of the functions used during pytest. * Add a pytest project file * Add pytest-profiling * Add graphviz * Document how to profile performance * Speedup read_excel for SMHI data #682 This shaved off 6 seconds from the test. * Use calamine when importing SMHI data * Cache SMHI data #682 Speed up repeat usage of SMHI data to a fraction. * Add feather-format * Use a feather file cache for the imported SMHI data. * Make default excel file path and df cache period configurable #682 Generalize the df cache decorator. * Add a cache_utilities module #682 Refactor-out the cache_df decorator. * Remove duplicate requirement #682 feather is part of pyarrow which is already in requirements. * Remove feather-format * cache_df is not just for read_excel #682 Do not mention excel in cache_df. Also clarify why column names are cached separately. * Improve documentation of cache_df #682 * More explicit hint on functions supported. * Clarify options. * Make cache_df work without path #682 * Make default path a valid string. * Remove obsolete error handling for no path. * Make cache_df decoration more apparent #682 * Hint a return type that is the same as the decorated function. * Use relative import of cache_df #682 Fixes a problem when using unittest rather than pytest. * Revert "Speedup read_excel for SMHI data #682" This reverts commit 216339e * Speedup read_excel for SMHI data using openpyxl #682 Use openpyxl to read SMHI data. Note: calamine is only 15% faster which is not worth an additional dependency.
- Loading branch information
1 parent
f35ee0f
commit 19f34d1
Showing
5 changed files
with
130 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import functools
import hashlib
import os
from typing import Callable, Optional

import pandas as pd
import pyarrow.feather as feather


def cache_df(
    f: Optional[Callable[..., pd.DataFrame]] = None,
    path: str = '',
    freq: str = '1Y',
) -> Callable[..., pd.DataFrame]:
    """
    Cache the DataFrame to an intermediate file and use it if created within the same period.

    Args:
        f: A function to cache (e.g., a function that loads a DataFrame).
        path: Path to the file to be cached. If provided without f, acts as a decorator.
        freq: Cache period, e.g. '1D', '1M', '1Y'. Defaults to '1Y'. If provided without f, acts as a decorator.

    Returns:
        Caching of the output - not calling the function unless we entered a new period.

    Example usage:
        Create a test Excel file:
        >>> df_test = pd.DataFrame({"A": [1], "B": [2]})
        >>> test_path = "test_data.xlsx"
        >>> df_test.to_excel(test_path, index=False)

        Use the decorator to cache the DataFrame loaded from the file:
        >>> @cache_df
        ... def load_data(path):
        ...     print("Creating DataFrame from file (first call)...")
        ...     return pd.read_excel(path)
        >>> load_data.__name__
        'cache_df_load_data'
        >>> print(load_data(test_path))
        Creating DataFrame from file (first call)...
           A  B
        0  1  2
        >>> print(load_data(test_path))  # Data loaded from cache, no print output
           A  B
        0  1  2

        Use the decorator with a short expiration time:
        >>> @cache_df(path=test_path, freq='1ms')
        ... def load_data_short_expiry(path=test_path):
        ...     print("Creating DataFrame from file (short expiration)...")
        ...     return pd.read_excel(path)
        >>> print(load_data_short_expiry())
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2
        >>> import time
        >>> time.sleep(0.001)  # Sleep for 1 millisecond
        >>> print(load_data_short_expiry())  # Data expired, loading again from file
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2

        Clean up the test files:
        >>> file_hash = hashlib.md5(test_path.encode()).hexdigest()
        >>> os.remove(test_path)
        >>> os.remove(f"cache_df_load_data_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_{file_hash}.pkl")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.pkl")
    """
    if f is None:
        # Called with options only, e.g. ``@cache_df(path=..., freq=...)``:
        # return a decorator that closes over those options.
        return lambda f: cache_df(f, path=path, freq=freq)

    @functools.wraps(f)
    def caching_f(*args, **kwargs):
        # Resolve the source path: explicit keyword, first positional
        # argument, or the decorator-level default, in that order.
        input_path = kwargs.get('path') or (args[0] if args else path)

        # Hash the path so the cache file names stay filesystem-safe for any
        # path; str() also accepts pathlib.Path-like inputs.
        path_hash = hashlib.md5(str(input_path).encode()).hexdigest()
        df_file = f'cache_df_{f.__name__}_{path_hash}.feather'
        columns_file = f'cache_df_{f.__name__}_{path_hash}.pkl'

        # Reuse the cached file only when it was written within the same
        # period (per ``freq``) as now.
        if os.path.exists(df_file):
            stat = os.stat(df_file)
            cache_mtime = pd.Timestamp(stat.st_mtime_ns // 1_000_000, unit='ms')
            if pd.Period(pd.Timestamp.now(), freq=freq) == pd.Period(cache_mtime, freq=freq):
                # Load cached data
                df = pd.read_feather(df_file)
                # Restore the original column names saved alongside the data.
                if os.path.exists(columns_file):
                    original_columns = pd.read_pickle(columns_file)
                    df.columns = original_columns
                return df

        # Cache miss or expired cache: call the wrapped function and persist
        # its result for subsequent calls in this period.
        df = f(*args, **kwargs)
        feather.write_feather(df, df_file)

        # Save the original column names separately since feather does not
        # support different heading types.
        pd.to_pickle(df.columns, columns_file)

        return df

    # functools.wraps copied f's metadata; keep the distinctive prefixed name
    # (the doctests above and debugging both rely on it).
    caching_f.__name__ = 'cache_df_' + f.__name__
    return caching_f
|
||
|
||
if __name__ == "__main__":
    # When run as a script, execute the doctest examples embedded in this
    # module (see the cache_df docstring).
    import doctest

    doctest.testmod()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
[build-system] | ||
requires = ["hatchling"] | ||
build-backend = "hatchling.build" | ||
|
||
[project] | ||
name = "klimatkollen-data" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,4 +4,6 @@ xlrd | |
openpyxl | ||
pyarrow | ||
scipy | ||
pytest | ||
pytest | ||
pytest-profiling | ||
graphviz |