Skip to content

Commit

Permalink
update columnflow (fixes memory leak)
Browse files Browse the repository at this point in the history
  • Loading branch information
mafrahm committed Nov 29, 2024
1 parent 74ee544 commit a8820f0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
24 changes: 24 additions & 0 deletions hbw/columnflow_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,30 @@ def patch_default_version():
logger.info(f"using default version '{default_version}' for all AnalysisTasks")


@memoize
def patch_materialization_strategy():
    """
    Force every DaskArrayReader to use the PARTITIONS materialization strategy.

    Replaces ``DaskArrayReader.__init__`` with a thin wrapper that sets the
    ``materialization_strategy`` keyword argument to
    ``DaskArrayReader.MaterializationStrategy.PARTITIONS`` and then delegates to the
    original constructor. A strategy passed by keyword is overwritten.

    Whether this improves memory usage still has to be tested; enabling it requires
    reproducing all existing outputs with this type of partitioning.

    NOTE(review): presumably ``@memoize`` keeps this from running twice; if it did run
    twice, the wrapper would be stacked on top of itself - confirm.
    """
    import functools

    from columnflow.columnar_util import DaskArrayReader

    # keep a handle on the unpatched constructor so the wrapper can delegate to it
    original_init = DaskArrayReader.__init__

    # functools.wraps copies the name, docstring and signature metadata of the original
    # constructor and exposes it as ``patched_init.__wrapped__``
    @functools.wraps(original_init)
    def patched_init(self, *args, **kwargs):
        logger.debug(f"patched DaskArrayReader.__init__ with {DaskArrayReader.MaterializationStrategy.PARTITIONS}")
        # unconditionally override the strategy before calling the original __init__
        # NOTE(review): assumes callers never pass the strategy positionally, since that
        # would clash with the keyword set here - confirm against the constructor signature
        kwargs["materialization_strategy"] = DaskArrayReader.MaterializationStrategy.PARTITIONS
        original_init(self, *args, **kwargs)

    # install the wrapper in place of the original constructor
    DaskArrayReader.__init__ = patched_init


@memoize
def patch_all():
patch_mltraining()
Expand Down

0 comments on commit a8820f0

Please sign in to comment.