Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/calib and sel #99

Open
wants to merge 23 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions hbw/analysis/create_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def analysis_factory(configs: od.UniqueObjectIndex):
)
else:
raise ValueError(
f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n",
f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n"
f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y",
)
# cpn_task.run()
Expand Down Expand Up @@ -141,42 +141,39 @@ def analysis_factory(configs: od.UniqueObjectIndex):

# 2017
add_lazy_config(
# {
# "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9",
# },
"c17",
1700,
)

# 2022 preEE
add_lazy_config(
# {
# "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12",
# "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13",
# },
"c22pre",
2200,
)

# 2022 postEE
add_lazy_config(
# {
# "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12",
# "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13",
# "cmsdb.campaigns.run3_2022_postEE_nano_uhh_v12": "campaign_run3_2022_postEE_nano_uhh_v12",
# },
"c22post",
2210,
)

add_lazy_config(
"c22pre_das",
2201,
)
add_lazy_config(
"c22post_das",
2211,
)

#
# modify store_parts
#

software_tasks = ("cf.BundleBashSandbox", "cf.BundleCMSSWSandbox", "cf.BundleSoftware")
shareable_analysis_tasks = ("cf.CalibrateEvents", "cf.GetDatasetLFNs")
limited_config_shared_tasks = ("cf.CalibrateEvents", "cf.GetDatasetLFNs", "cf.SelectEvents", "cf.ReduceEvents")
skip_new_version_schema = ("cf.CalibrateEvents", "cf.GetDatasetLFNs")
skip_new_version_schema = ()
known_parts = (
# from cf
"analysis", "task_family", "config", "configs", "dataset", "shift", "version",
Expand Down Expand Up @@ -237,11 +234,13 @@ def reorganize_parts(task, store_parts):
"analysis",
"calibrator", "calibrators", "calib",
"selector", "sel",
"producer", "producers", "prod",
"config", "configs",
"producers", "prod",
"ml_data", "ml_model", "ml_models",
"weightprod", "inf_model",
"task_family",
"config", "dataset", "shift",
"calibrator", "producer",
"shift", "dataset",
]
parts_order_end = ["version"]

Expand Down
83 changes: 83 additions & 0 deletions hbw/analysis/processes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# coding: utf-8

"""
Creation and modification of processes in the HH -> bbWW analysis.
NOTE: it is crucial to modify processes before the campaign is created. Otherwise,
the changes will not be reflected in the campaign and there will be inconsistencies.
"""

# import order as od


from hbw.config.processes import create_parent_process
from hbw.config.styling import color_palette
from cmsdb.util import add_decay_process


def modify_cmsdb_processes():
    """
    Adjust and extend the cmsdb process instances used in the HH -> bbWW analysis.

    Creates additional parent processes (*qcd_ele*, *v_lep*, *t_bkg*, *background*)
    and attaches heavy-flavour/light-flavour ("hf"/"lf") decay sub-processes to the
    DY processes. As noted in the module docstring, this must run before the
    campaign is created so the changes are reflected consistently.
    """
    # process instances are imported here, at call time, from the cmsdb registry
    from cmsdb.processes import (
        qcd_mu, qcd_em, qcd_bctoe,
        tt, ttv, st, w_lnu, vv, h,
        dy, dy_m4to10, dy_m10to50, dy_m50toinf, dy_m50toinf_0j, dy_m50toinf_1j, dy_m50toinf_2j,
    )

    qcd_mu.label = "QCD Muon enriched"
    # group the electron-enriched QCD samples under one parent process
    qcd_ele = create_parent_process(
        [qcd_em, qcd_bctoe],
        name="qcd_ele",
        id=31199,
        label="QCD Electron enriched",
    )

    # combined single-boson leptonic background (W -> l nu and Drell-Yan)
    v_lep = create_parent_process(
        [w_lnu, dy],
        name="v_lep",
        id=64575573,  # random number
        label="W and DY",
    )

    # combined top-quark backgrounds (single top, ttbar, ttbar+V)
    t_bkg = create_parent_process(
        [st, tt, ttv],
        name="t_bkg",
        id=97842611,  # random number
        label="tt + st",
    )

    # catch-all background parent process
    # NOTE(review): w_lnu is listed here directly although it is already a child of
    # v_lep above — confirm this double listing is intended and does not double count
    background = create_parent_process(  # noqa: F841
        [t_bkg, v_lep, vv, w_lnu, h, qcd_ele, qcd_mu],
        name="background",
        id=99999,
        label="background",
        color=color_palette["blue"],
    )

    # decay channel definitions used to split DY into light- and heavy-flavour parts;
    # "br" of -1 means the branching ratio is left undefined here
    decay_map = {
        "lf": {
            "name": "lf",
            "id": 50,
            "label": "(lf)",
            "br": -1,
        },
        "hf": {
            "name": "hf",
            "id": 60,
            "label": "(hf)",
            "br": -1,
        },
    }

    for dy_proc_inst in (
        dy, dy_m4to10, dy_m10to50, dy_m50toinf, dy_m50toinf_0j, dy_m50toinf_1j, dy_m50toinf_2j,
    ):
        # only the sub-processes (not the inclusive "dy") get linked to a production-mode parent
        add_production_mode_parent = dy_proc_inst.name != "dy"
        for flavour in ("hf", "lf"):
            # the 'add_decay_process' function helps us to create all parent-daughter relationships
            add_decay_process(
                dy_proc_inst,
                decay_map[flavour],
                add_production_mode_parent=add_production_mode_parent,
                name_func=lambda parent_name, decay_name: f"{parent_name}_{decay_name}",
                label_func=lambda parent_label, decay_label: f"{parent_label} {decay_label}",
                xsecs=None,
                aux={"flavour": flavour},
            )
143 changes: 127 additions & 16 deletions hbw/calibration/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,170 @@
import law

from columnflow.calibration import Calibrator, calibrator
from columnflow.calibration.cms.met import met_phi
from columnflow.calibration.cms.jets import jec, jer
from columnflow.production.cms.seeds import deterministic_seeds
from columnflow.util import maybe_import
from columnflow.util import maybe_import, try_float
from columnflow.columnar_util import set_ak_column, EMPTY_FLOAT

from hbw.calibration.jet import jec_nominal, bjet_regression
from hbw.util import MET_COLUMN

from hbw.calibration.jet import bjet_regression

ak = maybe_import("awkward")
np = maybe_import("numpy")


logger = law.logger.get_logger(__name__)


@calibrator(
    uses={deterministic_seeds},
    # jec uncertainty_sources: set to None to use config default
    jec_sources=["Total"],
    version=1,
    # add dummy produces such that this calibrator will always be run when requested
    # (temporary workaround until init's are only run as often as necessary)
    produces={"FatJet.pt"},
)
def fatjet(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
    """
    FatJet calibrator, combining JEC and JER.

    Applies the derived FatJet JEC calibrator (``self.fatjet_jec_cls``) to all
    datasets and additionally applies JER (``self.fatjet_jer_cls``) to MC only;
    both classes are derived in the init below. Raises when invoked on any shift
    other than nominal.
    """
    if self.task.local_shift != "nominal":
        raise Exception("FatJet Calibrator should not be run for shifts other than nominal")

    # apply the fatjet JEC and JER
    events = self[self.fatjet_jec_cls](events, **kwargs)
    if self.dataset_inst.is_mc:
        events = self[self.fatjet_jer_cls](events, **kwargs)

    return events


@fatjet.init
def fatjet_init(self: Calibrator) -> None:
    """
    Initialize the :py:func:`fatjet` calibrator by deriving its JEC and JER
    calibrator classes and registering them as dependencies.

    Only does work in the context of the ``cf.CalibrateEvents`` task itself and
    when the dataset instance is already set.
    """
    if not self.task or self.task.task_family != "cf.CalibrateEvents":
        # init only required for task itself
        return

    if not getattr(self, "dataset_inst", None):
        return

    # NOTE: removed dead `self.calibrators = []` leftover from the AK4 init;
    # the fatjet calibrator runs via fatjet_jec_cls/fatjet_jer_cls and never
    # reads a `calibrators` attribute

    fatjet_jec_cls_dict = {
        "jet_name": "FatJet",
        "gen_jet_name": "GenJetAK8",
        # MET propagation is performed in AK4 jet calibrator; fatjet should never use any MET columns
        "propagate_met": False,
        "met_name": "DO_NOT_USE",
        "raw_met_name": "DO_NOT_USE",
    }
    fatjet_jer_cls_dict = fatjet_jec_cls_dict.copy()
    # NOTE: deterministic FatJet seeds are not yet possible to produce
    # fatjet_jer_cls_dict["deterministic_seed_index"] = 0

    # data only gets nominal JEC; MC additionally evaluates the configured uncertainty sources
    uncertainty_sources = [] if self.dataset_inst.is_data else self.jec_sources
    jec_cls_name = f"fatjet_jec{'_nominal' if uncertainty_sources == [] else ''}"
    self.fatjet_jec_cls = jec.derive(jec_cls_name, cls_dict={
        **fatjet_jec_cls_dict,
        "uncertainty_sources": uncertainty_sources,
    })
    self.fatjet_jer_cls = jer.derive("deterministic_fatjet_jer", cls_dict=fatjet_jer_cls_dict)

    # expose the derived calibrators as dependencies of this calibrator
    self.uses |= {self.fatjet_jec_cls, self.fatjet_jer_cls}
    self.produces |= {self.fatjet_jec_cls, self.fatjet_jer_cls}


@calibrator(
    uses={deterministic_seeds, MET_COLUMN("{pt,phi}")},
    produces={deterministic_seeds},
    # jec uncertainty_sources: set to None to use config default
    jec_sources=["Total"],
    bjet_regression=True,
    version=1,
)
def jet_base(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
    """
    AK4 jet calibrator chain.

    Produces deterministic seeds, then runs all calibrators collected in
    ``self.calibrators`` (built in the init below) in order. Afterwards, any
    non-finite values in produced columns are replaced: MET pt/phi columns fall
    back to their pre-calibration values, all other columns to ``EMPTY_FLOAT``.
    """
    events = self[deterministic_seeds](events, **kwargs)

    # keep a copy of non-propagated MET to replace infinite values
    pre_calib_met = events[self.config_inst.x.met_name]

    logger.info(f"Running calibrators '{[calib.cls_name for calib in self.calibrators]}' (in that order)")
    for calibrator_inst in self.calibrators:
        events = self[calibrator_inst](events, **kwargs)

    # workaround for infinite values in MET pt/phi
    for route in self.produced_columns:
        col = route.string_column
        # evaluate the route once and reuse it for both the mask and the replacement
        values = route.apply(events)
        m = ~np.isfinite(values)
        if ak.any(m):
            # replace infinite values
            replace_value = EMPTY_FLOAT
            if self.config_inst.x.met_name in col:
                # use pre-calibrated MET to replace infinite values of MET pt/phi
                replace_value = pre_calib_met[col.split(".")[-1].split("_")[0]]
            logger.info(
                f"Found infinite values in {col}; Values will be replaced with "
                f"{replace_value if try_float(replace_value) else replace_value[m]}",
            )
            events = set_ak_column(events, col, ak.where(m, replace_value, values))

    return events


@jet_base.init
def jet_base_init(self: Calibrator) -> None:
    """
    Initialize the AK4 jet calibrator chain of :py:func:`jet_base`.

    Builds ``self.calibrators``, the ordered list of calibrators to run:
    JEC (nominal only for data, with the configured ``jec_sources`` for MC),
    optional bjet regression, deterministic JER (MC only), and the MET-phi
    correction for run-2 configs. All derived calibrators are registered as
    dependencies.
    """
    if not self.task or self.task.task_family != "cf.CalibrateEvents":
        # init only required for task itself
        return

    if not getattr(self, "dataset_inst", None):
        return

    # MET column names as configured per config instance
    met_name = self.config_inst.x.met_name
    raw_met_name = self.config_inst.x.raw_met_name

    # list of calibrators to apply (in that order)
    self.calibrators = []

    # data only gets nominal JEC; MC evaluates the configured uncertainty sources
    uncertainty_sources = [] if self.dataset_inst.is_data else self.jec_sources
    jec_cls_name = f"ak4_jec{'_nominal' if uncertainty_sources == [] else ''}"

    jec_cls = jec.derive(
        jec_cls_name,
        cls_dict={
            "uncertainty_sources": uncertainty_sources,
            "met_name": met_name,
            "raw_met_name": raw_met_name,
        },
    )
    self.calibrators.append(jec_cls)

    if self.bjet_regression:
        self.calibrators.append(bjet_regression)

    # run JER only on MC
    if self.dataset_inst.is_mc:
        # version of jer that uses the first random number from deterministic_seeds
        deterministic_jer_cls = jer.derive(
            "deterministic_jer",
            cls_dict={
                "deterministic_seed_index": 0,
                "met_name": met_name,
            },
        )
        self.calibrators.append(deterministic_jer_cls)

    if self.config_inst.x.run == 2:
        # derive met_phi calibrator (currently only for run 2)
        met_phi_cls = met_phi.derive("met_phi", cls_dict={"met_name": met_name})
        self.calibrators.append(met_phi_cls)

    # expose all derived calibrators as dependencies of this calibrator
    self.uses |= set(self.calibrators)
    self.produces |= set(self.calibrators)


# derived calibrator variants toggling the bjet regression
# (resolved diff residue: the old `base`-derived variants `default`, `full` and the
# `skip_jecunc` flag were removed in favor of the two `jet_base` derivations below)
skip_jecunc = jet_base.derive("skip_jecunc", cls_dict=dict(bjet_regression=False))
with_b_reg = jet_base.derive("with_b_reg", cls_dict=dict(bjet_regression=True))
10 changes: 6 additions & 4 deletions hbw/categorization/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from columnflow.selection import SelectionResult
from columnflow.columnar_util import has_ak_column, optional_column

from hbw.util import MET_COLUMN

np = maybe_import("numpy")
ak = maybe_import("awkward")

Expand Down Expand Up @@ -160,15 +162,15 @@ def catid_fake(
return events, mask


@categorizer(uses={MET_COLUMN("pt")})
def catid_highmet(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]:
    """Category mask for events with configured MET pt of at least 20 (GeV)."""
    # the MET collection name is taken from the config (resolved diff residue:
    # removed the duplicated old hard-coded `events.MET.pt` line)
    mask = events[self.config_inst.x.met_name].pt >= 20
    return events, mask


@categorizer(uses={MET_COLUMN("pt")})
def catid_lowmet(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]:
    """Category mask for events with configured MET pt below 20 (GeV)."""
    # complementary to catid_highmet; MET collection name taken from the config
    # (resolved diff residue: removed the duplicated old hard-coded `events.MET.pt` line)
    mask = events[self.config_inst.x.met_name].pt < 20
    return events, mask

#
Expand Down
8 changes: 5 additions & 3 deletions hbw/columnflow_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,7 @@ def TaskArrayFunction_str(self):

TaskArrayFunction.__str__ = TaskArrayFunction_str
logger.info(
"patched TaskArrayFunction.__str__ to include the CSP version attribute "
"(NOTE that this currently does not work for the "
"MLTrainingMixin tasks (e.g. MLPreTraining and MLTraining))",
"patched TaskArrayFunction.__str__ to include the CSP version attribute",
)


Expand Down Expand Up @@ -136,6 +134,10 @@ def patched_init(self, *args, **kwargs):

@memoize
def patch_all():
# change the "retries" parameter default
from columnflow.tasks.framework.remote import RemoteWorkflow
RemoteWorkflow.retries = RemoteWorkflow.retries.copy(default=3)

patch_mltraining()
patch_htcondor_workflow_naf_resources()
# patch_column_alias_strategy()
Expand Down
Loading
Loading