Skip to content

Commit

Permalink
Speed up data processing (#683)
Browse files Browse the repository at this point in the history
* Profile data tests #682

Generate a CPU intensity graph of the functions used during pytest.

* Add a pytest project file
* Add pytest-profiling
* Add graphviz
* Document how to profile performance

* Speedup read_excel for SMHI data #682

This shaved off 6 seconds from the test.

* Use calamine when importing SMHI data

* Cache SMHI data #682

Speed up repeat usage of SMHI data to a fraction.

* Add feather-format
* Use a feather file cache for the imported SMHI data.

* Make default excel file path and df cache period configurable #682

Generalize the df cache decorator.

* Add a cache_utilities module #682

Refactor-out the cache_df decorator.

* Remove duplicate requirement #682

feather is part of pyarrow which is already in requirements.

* Remove feather-format

* cache_df is not just for read_excel #682

Do not mention excel in cache_df.

Also clarify why column names are cached separately.

* Improve documentation of cache_df #682

* More explicit hint on functions supported.
* Clarify options.

* Make cache_df work without path #682

* Make default path a valid string.
* Remove obsolete error handling for no path.

* Make cache_df decoration more apparent #682

* Hint a return type that is the same as the decorated function.

* Use relative import of cache_df #682

Fixes a problem when using unittest rather than pytest.

* Revert "Speedup read_excel for SMHI data #682"

This reverts commit 216339e

* Speedup read_excel for SMHI data using openpyxl #682

Use openpyxl to read SMHI data.

Note: calamine is only 15% faster which is not worth an additional dependency.
  • Loading branch information
joakimbits authored Oct 23, 2024
1 parent f35ee0f commit 19f34d1
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 3 deletions.
7 changes: 7 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ We utilize Python libraries such as Pandas and NumPy to perform various calculat

If you notice any test failing, please submit a ticket about it.

### Performance profiling

Requires Graphviz installed on the OS: https://www.graphviz.org/download/
```sh
py.test tests --profile-svg && open prof/combined.svg
```

### How to Update Data on Site

To recalculate and refresh the site's data, navigate to the `/data` folder and execute the following command:
Expand Down
109 changes: 109 additions & 0 deletions data/issues/emissions/cache_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import functools
import hashlib
import os
from typing import Callable, Optional

import pandas as pd
import pyarrow.feather as feather


def cache_df(f: Optional[Callable[..., pd.DataFrame]] = None,
             path: str = '', freq: str = '1Y') -> Callable[..., pd.DataFrame]:
    """
    Cache the DataFrame to an intermediate file and use it if created within the same period.
    Args:
        f: A function to cache (e.g., a function that loads a DataFrame).
        path: Path to the file to be cached. If provided without f, acts as a decorator.
        freq: Cache period, e.g. '1D', '1M', '1Y'. Defaults to '1Y'. If provided without f, acts as a decorator.
    Returns:
        Caching of the output - not calling the function unless we entered a new period.
    Example usage:
        Create a test Excel file:
        >>> df_test = pd.DataFrame({"A": [1], "B": [2]})
        >>> test_path = "test_data.xlsx"
        >>> df_test.to_excel(test_path, index=False)
        Use the decorator to cache the DataFrame loaded from the file:
        >>> @cache_df
        ... def load_data(path):
        ...     print("Creating DataFrame from file (first call)...")
        ...     return pd.read_excel(path)
        >>> load_data.__name__
        'cache_df_load_data'
        >>> print(load_data(test_path))
        Creating DataFrame from file (first call)...
           A  B
        0  1  2
        >>> print(load_data(test_path))  # Data loaded from cache, no print output
           A  B
        0  1  2
        Use the decorator with a short expiration time:
        >>> @cache_df(path=test_path, freq='1ms')
        ... def load_data_short_expiry(path=test_path):
        ...     print("Creating DataFrame from file (short expiration)...")
        ...     return pd.read_excel(path)
        >>> print(load_data_short_expiry())
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2
        >>> import time
        >>> time.sleep(0.001)  # Sleep for 1 millisecond
        >>> print(load_data_short_expiry())  # Data expired, loading again from file
        Creating DataFrame from file (short expiration)...
           A  B
        0  1  2
        Clean up the test files:
        >>> file_hash = hashlib.md5(test_path.encode()).hexdigest()
        >>> os.remove(test_path)
        >>> os.remove(f"cache_df_load_data_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_{file_hash}.pkl")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.feather")
        >>> os.remove(f"cache_df_load_data_short_expiry_{file_hash}.pkl")
    """
    if f is None:
        # Called with configuration only, e.g. @cache_df(path=..., freq=...):
        # return a decorator that will receive the function afterwards.
        return lambda func: cache_df(func, path=path, freq=freq)

    @functools.wraps(f)  # preserve docstring/__qualname__ of the wrapped function
    def caching_f(*args, **kwargs):
        # Resolve the data source: explicit 'path' kwarg wins, then the first
        # positional argument, then the decorator-level default.
        input_path = kwargs.get('path') or (args[0] if args else path)

        # Hash the path so each distinct source gets its own cache files.
        path_hash = hashlib.md5(input_path.encode()).hexdigest()
        df_file = f'cache_df_{f.__name__}_{path_hash}.feather'
        columns_file = f'cache_df_{f.__name__}_{path_hash}.pkl'

        # Reuse the cache only if its mtime falls in the same period as now
        # (e.g. same calendar year for freq='1Y').
        if os.path.exists(df_file):
            stat = os.stat(df_file)
            cache_mtime = pd.Timestamp(stat.st_mtime_ns // 1_000_000, unit='ms')
            if pd.Period(pd.Timestamp.now(), freq=freq) == pd.Period(cache_mtime, freq=freq):
                df = pd.read_feather(df_file)
                # Restore the original column names: feather only supports
                # string headings, so they are stored separately (see below).
                if os.path.exists(columns_file):
                    df.columns = pd.read_pickle(columns_file)
                return df

        # Cache is missing or stale: recompute and persist the result.
        df = f(*args, **kwargs)
        feather.write_feather(df, df_file)

        # Save the original column names separately since feather does not support different heading types
        pd.to_pickle(df.columns, columns_file)

        return df

    # Documented, stable wrapper name (the doctest above relies on it).
    caching_f.__name__ = 'cache_df_' + f.__name__
    return caching_f


if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings when executed
    # directly as a script.
    from doctest import testmod

    testmod()
7 changes: 5 additions & 2 deletions data/issues/emissions/historical_data_calculations.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import pandas as pd

from .cache_utilities import cache_df

PATH_SMHI = 'https://nationellaemissionsdatabasen.smhi.se/' + \
'api/getexcelfile/?county=0&municipality=0&sub=CO2'


def get_smhi_data():
@cache_df(path=PATH_SMHI)
def get_smhi_data(path=PATH_SMHI):
"""
Downloads data from SMHI and loads it into a pandas dataframe.
Returns:
pandas.DataFrame: The dataframe containing the SMHI data.
"""

df_raw = pd.read_excel(PATH_SMHI)
df_raw = pd.read_excel(path, engine="openpyxl")

# Remove the first 4 rows and reset the index
df_raw = df_raw.drop([0, 1, 2]).reset_index(drop=True)
Expand Down
6 changes: 6 additions & 0 deletions data/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "klimatkollen-data"
4 changes: 3 additions & 1 deletion data/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ xlrd
openpyxl
pyarrow
scipy
pytest
pytest
pytest-profiling
graphviz

0 comments on commit 19f34d1

Please sign in to comment.