Skip to content

Commit

Permalink
refactor inference model
Browse files Browse the repository at this point in the history
  • Loading branch information
mafrahm committed Oct 16, 2023
1 parent 02f0657 commit 4201c92
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 166 deletions.
2 changes: 1 addition & 1 deletion hbw/config/defaults_and_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def default_ml_model(cls, container, task_params):

# the ml_model parameter is only used by `MLTraining` and `MLEvaluation`, therefore use some default
# NOTE: default_ml_model does not work for the MLTraining task
if cls.task_family in ("cf.MLTraining", "cf.MLEvaulation", "cf.MergeMLEvents", "cf.PrepareMLEvents"):
if cls.task_family in ("cf.MLTraining", "cf.MLEvaluation", "cf.MergeMLEvents", "cf.PrepareMLEvents"):
# TODO: we might want to distinguish between two default ML models (sl vs dl)
default_ml_model = "dense_default"

Expand Down
21 changes: 17 additions & 4 deletions hbw/inference/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,23 @@
}
signals = {*signals_ggHH, *signals_qqHH}

# collection of all datasets (only 2017 ATM)
e_datasets = {f"data_e_{i}" for i in ["b", "c", "d", "e", "f"]}
mu_datasets = {f"data_mu_{i}" for i in ["b", "c", "d", "e", "f"]}
datasets = {*e_datasets, *mu_datasets}
# mapping between lepton categories and the data datasets entering them (only 2017 ATM)
data_datasets = {
    "1e": {f"data_e_{era}" for era in ["b", "c", "d", "e", "f"]},
    "1mu": {f"data_mu_{era}" for era in ["b", "c", "d", "e", "f"]},
    "2e": {"data_e_b"},  # TODO: 2 lep datasets in cmsdb + config
    "2mu": {"data_mu_b"},  # TODO
    "emu": {"data_mu_b"},  # TODO
}

# flat set of every dataset referenced by any lepton category
merged_datasets = {
    dataset
    for category_datasets in data_datasets.values()
    for dataset in category_datasets
}

# mapping between process names in the config and the names used by the inference model;
# processes not listed here keep their config name unchanged
inference_procnames = {
    # key: config process name, value: inference model process name
    # NOTE: "foo": "bar" is a placeholder entry; real renamings (e.g. "st" -> "ST",
    # "tt" -> "TT") can be enabled here when the datacard naming convention requires it
    "foo": "bar",
    # "st": "ST",
    # "tt": "TT",
}

# mapping, which processes are used for which QCDScale (rate) uncertainty
processes_per_QCDScale = {
Expand Down
172 changes: 11 additions & 161 deletions hbw/inference/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
hbw inference model.
"""

from columnflow.inference import inference_model, ParameterType, ParameterTransformation
from columnflow.config_util import get_datasets_from_process
from columnflow.inference import InferenceModel, inference_model
import hbw.inference.constants as const # noqa

from hbw.inference.functions import (
add_inference_categories, add_inference_processes, add_inference_parameters,
)

#
# Defaults for all the Inference Model parameters
#
Expand Down Expand Up @@ -134,172 +137,17 @@
@inference_model(
**default_cls_dict,
)
def default(self):
def default(self: InferenceModel):
"""
This is the default Inference model.
Idea: first build an inclusive Inference Model with all Channels/Processes/Systematics,
then remove anything not listed in the attributes.
"""
year = self.config_inst.campaign.x.year # noqa; not used right now
ecm = self.config_inst.campaign.ecm

#
# categories
#

# TODO: use ML model inst if possible
ml_model_processes = [
"ggHH_kl_1_kt_1_sl_hbbhww",
"tt",
"st",
"v_lep",
# "w_lnu",
# "dy_lep",
]

# if process names need to be changed to fit some convention
inference_procnames = {
"foo": "bar",
# "st": "ST",
# "tt": "TT",
}

for proc in ml_model_processes:
for lep in ("e", "mu"):
cat_name = f"cat_1{lep}_{proc}"
if cat_name not in self.channels:
continue

cat_kwargs = {
"config_category": f"1{lep}__ml_{proc}",
# "config_variable": f"mlscore.{proc}_rebin",
"config_variable": f"mlscore.{proc}_manybins",
"mc_stats": self.mc_stats,
}
if self.skip_data:
cat_kwargs["data_from_processes"] = self.processes
else:
cat_kwargs["config_data_datasets"] = [f"data_e_{i}" for i in ["b", "c", "d", "e", "f"]]

self.add_category(cat_name, **cat_kwargs) # noqa

# add processes with corresponding datasets to all categories of the inference model
used_datasets = set()
for proc in self.processes:
if not self.config_inst.has_process(proc):
raise Exception(f"Process {proc} not included in the config {self.config_inst.name}")

# get datasets corresponding to this process
datasets = [
d.name for d in
get_datasets_from_process(self.config_inst, proc, strategy="inclusive")
]

# check that no dataset is used multiple times
if datasets_already_used := used_datasets.intersection(datasets):
raise Exception(f"{datasets_already_used} datasets are used for multiple processes")
used_datasets |= set(datasets)

self.add_process(
inference_procnames.get(proc, proc),
config_process=proc,
is_signal=("HH_" in proc),
config_mc_datasets=datasets,
)

#
# parameters
#

# groups
self.add_parameter_group("experiment")
self.add_parameter_group("theory")

# lumi
lumi = self.config_inst.x.luminosity
for unc_name in lumi.uncertainties:
if unc_name not in self.systematics:
continue

self.add_parameter(
unc_name,
type=ParameterType.rate_gauss,
effect=lumi.get(names=unc_name, direction=("down", "up"), factor=True),
transformations=[ParameterTransformation.symmetrize],
)

# add QCD scale (rate) uncertainties to inference model
# TODO: combine scale and mtop uncertainties for specific processes?
# TODO: some scale/pdf uncertainties should be rounded to 3 digits, others to 4 digits
# NOTE: it might be easier to just take the recommended uncertainty values from HH conventions at
# https://gitlab.cern.ch/hh/naming-conventions instead of taking the values from CMSDB
for k, procs in const.processes_per_QCDScale.items():
syst_name = f"QCDScale_{k}"
if syst_name not in self.systematics:
continue

for proc in procs:
if proc not in self.processes:
continue
process_inst = self.config_inst.get_process(proc)
if "scale" not in process_inst.xsecs[ecm]:
continue
self.add_parameter(
syst_name,
process=inference_procnames.get(proc, proc),
type=ParameterType.rate_gauss,
effect=tuple(map(
lambda f: round(f, 3),
process_inst.xsecs[ecm].get(names=("scale"), direction=("down", "up"), factor=True),
)),
)
self.add_parameter_to_group(syst_name, "theory")

# add PDF rate uncertainties to inference model
for k, procs in const.processes_per_pdf_rate.items():
syst_name = f"pdf_{k}"
if syst_name not in self.systematics:
continue

for proc in procs:
if proc not in self.processes:
continue
process_inst = self.config_inst.get_process(proc)
if "pdf" not in process_inst.xsecs[ecm]:
continue

self.add_parameter(
f"pdf_{k}",
process=inference_procnames.get(proc, proc),
type=ParameterType.rate_gauss,
effect=tuple(map(
lambda f: round(f, 3),
process_inst.xsecs[ecm].get(names=("pdf"), direction=("down", "up"), factor=True),
)),
)
self.add_parameter_to_group(syst_name, "theory")

for shape_uncertainty, shape_processes in const.processes_per_shape.items():
if shape_uncertainty not in self.systematics:
continue

# If "all" is included, takes all processes except for the ones specified (starting with !)
if "all" in shape_processes:
_remove_processes = {proc[:1] for proc in shape_processes if proc.startswith("!")}
shape_processes = set(self.processes) - _remove_processes

self.add_parameter(
shape_uncertainty,
process=shape_processes,
type=ParameterType.shape,
config_shift_source=const.source_per_shape[shape_uncertainty],
)

is_theory = "pdf" in shape_uncertainty or "murf" in shape_uncertainty
if is_theory:
self.add_parameter_to_group(syst_name, "theory")
else:
self.add_parameter_to_group(syst_name, "experiment")
add_inference_categories(self)
add_inference_processes(self)
add_inference_parameters(self)

#
# post-processing
Expand Down Expand Up @@ -336,5 +184,7 @@ def default(self):
"lumi_13TeV_2017",
]

# use the lightweight test ML model instead of the default one
cls_dict["ml_model_name"] = "dense_test"

# minimal model for quick test purposes
test = default.derive("test", cls_dict=cls_dict)
Loading

0 comments on commit 4201c92

Please sign in to comment.