Merge pull request #50 from uhh-cms/refactor_inference_model
Refactoring inference model
apaasch authored Oct 18, 2023
2 parents 6b89069 + 4a05465 commit a6cfb29
Showing 10 changed files with 432 additions and 354 deletions.
9 changes: 7 additions & 2 deletions hbw/config/categories.py
@@ -23,6 +23,11 @@ def add_categories_selection(config: od.Config) -> None:
    Adds categories that are typically produced in `SelectEvents` to a *config*.
"""

+    config.x.lepton_channels = {
+        "sl": ("1e", "1mu"),
+        "dl": ("2e", "2mu", "emu"),
+    }[config.x.lepton_tag]
+
config.add_category(
name="incl",
id=1,
@@ -138,7 +143,7 @@ def add_categories_production(config: od.Config) -> None:
#

    category_blocks = OrderedDict({
-        "lep": [cat_1e, cat_1mu],
+        "lep": [config.get_category(lep_ch) for lep_ch in config.x.lepton_channels],
        "jet": [cat_resolved, cat_boosted],
        "b": [cat_1b, cat_2b],
    })
@@ -172,7 +177,7 @@ def add_categories_ml(config, ml_model_inst):
))

    category_blocks = OrderedDict({
-        "lep": [config.get_category("1e"), config.get_category("1mu")],
+        "lep": [config.get_category(lep_ch) for lep_ch in config.x.lepton_channels],
        "jet": [config.get_category("resolved"), config.get_category("boosted")],
        "b": [config.get_category("1b"), config.get_category("2b")],
        "dnn": ml_categories,
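The dict-indexing idiom in the first hunk selects the channel tuple once at config-setup time and fails fast for unsupported tags. A standalone sketch of the behavior (the tag value here is illustrative):

# standalone sketch of the new channel selection; the tag value is illustrative
lepton_tag = "dl"  # set from the config's "is_sl"/"is_dl" tag (see config_run2.py below)

# indexing the dict literal directly raises a KeyError for any other value,
# so a misconfigured lepton_tag fails already at config-creation time
lepton_channels = {
    "sl": ("1e", "1mu"),
    "dl": ("2e", "2mu", "emu"),
}[lepton_tag]

print(lepton_channels)  # ('2e', '2mu', 'emu')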
7 changes: 6 additions & 1 deletion hbw/config/config_run2.py
@@ -23,7 +23,6 @@
from hbw.config.defaults_and_groups import set_config_defaults_and_groups
from hbw.util import four_vec


thisdir = os.path.dirname(os.path.abspath(__file__))

logger = law.logger.get_logger(__name__)
@@ -56,6 +55,12 @@ def add_config(

# create a config by passing the campaign, so id and name will be identical
cfg = analysis.add_config(campaign, name=config_name, id=config_id, tags=analysis.tags)
+    if cfg.has_tag("is_sl"):
+        cfg.x.lepton_tag = "sl"
+    elif cfg.has_tag("is_dl"):
+        cfg.x.lepton_tag = "dl"
+    else:
+        raise Exception(f"config {cfg.name} needs either the 'is_sl' or 'is_dl' tag")

# use custom get_dataset_lfns function
cfg.x.get_dataset_lfns = get_dataset_lfns
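For context, a sketch of where the tag is expected to come from; the analysis and campaign objects below are hypothetical stand-ins, while the tags=analysis.tags forwarding is taken from the hunk above:

# hypothetical sketch: the analysis object carries the channel tag, which
# add_config() forwards to the new config via tags=analysis.tags
import order as od

analysis = od.Analysis(name="hbw_sl", id=1, tags={"is_sl"})
campaign = od.Campaign(name="run2_2017_nano_v9", id=1)

cfg = analysis.add_config(campaign, name="c17", id=17, tags=analysis.tags)
assert cfg.has_tag("is_sl")  # -> cfg.x.lepton_tag will be set to "sl"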
2 changes: 1 addition & 1 deletion hbw/config/defaults_and_groups.py
@@ -22,7 +22,7 @@ def default_ml_model(cls, container, task_params):

# the ml_model parameter is only used by `MLTraining` and `MLEvaluation`, therefore use some default
# NOTE: default_ml_model does not work for the MLTraining task
if cls.task_family in ("cf.MLTraining", "cf.MLEvaulation", "cf.MergeMLEvents", "cf.PrepareMLEvents"):
if cls.task_family in ("cf.MLTraining", "cf.MLEvaluation", "cf.MergeMLEvents", "cf.PrepareMLEvents"):
# TODO: we might want to distinguish between two default ML models (sl vs dl)
default_ml_model = "dense_default"

223 changes: 223 additions & 0 deletions hbw/inference/base.py
@@ -0,0 +1,223 @@
# coding: utf-8

"""
Base class and common helpers for creating inference models
"""

import law
import order as od

from columnflow.inference import InferenceModel, ParameterType, ParameterTransformation
from columnflow.ml import MLModel
from columnflow.util import maybe_import
from columnflow.config_util import get_datasets_from_process

import hbw.inference.constants as const # noqa


np = maybe_import("numpy")
ak = maybe_import("awkward")

logger = law.logger.get_logger(__name__)


class HBWInferenceModelBase(InferenceModel):
"""
    This is the base class for all inference models in our hbw analysis.
"""

#
# Model attributes, can be changed via `cls.derive`
#

# default ml model
ml_model_name = "dense_default"

# list of all processes/channels/systematics to include in the datacards
processes = []
channels = []
systematics = []

mc_stats = True
skip_data = True

def __init__(self, config_inst: od.Config, *args, **kwargs):
super().__init__(config_inst)

self.add_inference_categories()
self.add_inference_processes()
self.add_inference_parameters()

#
# post-processing
#

self.cleanup()

def add_inference_categories(self: InferenceModel):
"""
This function creates categories for the inference model
"""

# get processes used in MLTraining
ml_model_inst = MLModel.get_cls(self.ml_model_name)(self.config_inst)
ml_model_processes = ml_model_inst.processes

lepton_channels = self.config_inst.x.lepton_channels

for proc in ml_model_processes:
for lep in lepton_channels:
cat_name = f"cat_{lep}_{proc}"
if cat_name not in self.channels:
continue

cat_kwargs = {
"config_category": f"{lep}__ml_{proc}",
# "config_variable": f"mlscore.{proc}_rebin",
"config_variable": f"mlscore.{proc}_manybins",
"mc_stats": self.mc_stats,
}
if self.skip_data:
cat_kwargs["data_from_processes"] = self.processes
else:
cat_kwargs["config_data_datasets"] = const.data_datasets[lep]

self.add_category(cat_name, **cat_kwargs) # noqa

def add_inference_processes(self: InferenceModel):
"""
Function that adds processes with their corresponding datasets to all categories
of the inference model
"""
used_datasets = set()
for proc in self.processes:
if not self.config_inst.has_process(proc):
raise Exception(f"Process {proc} not included in the config {self.config_inst.name}")

# get datasets corresponding to this process
datasets = [
d.name for d in
get_datasets_from_process(self.config_inst, proc, strategy="inclusive")
]

# check that no dataset is used multiple times
if datasets_already_used := used_datasets.intersection(datasets):
raise Exception(f"{datasets_already_used} datasets are used for multiple processes")
used_datasets |= set(datasets)

self.add_process(
const.inference_procnames.get(proc, proc),
config_process=proc,
is_signal=("HH_" in proc),
config_mc_datasets=datasets,
)

def add_inference_parameters(self: InferenceModel):
"""
Function that adds all parameters (systematic variations) to the inference model
"""
# define the two relevant types of parameter groups
self.add_parameter_group("experiment")
self.add_parameter_group("theory")

# add rate + shape parameters to the inference model
self.add_rate_parameters()
self.add_shape_parameters()

# TODO: check that all requested systematics are in final MLModel?

def add_rate_parameters(self: InferenceModel):
"""
Function that adds all rate parameters to the inference model
"""
ecm = self.config_inst.campaign.ecm

# lumi
lumi = self.config_inst.x.luminosity
for unc_name in lumi.uncertainties:
if unc_name not in self.systematics:
continue

self.add_parameter(
unc_name,
type=ParameterType.rate_gauss,
effect=lumi.get(names=unc_name, direction=("down", "up"), factor=True),
transformations=[ParameterTransformation.symmetrize],
)

# add QCD scale (rate) uncertainties to inference model
# TODO: combine scale and mtop uncertainties for specific processes?
# TODO: some scale/pdf uncertainties should be rounded to 3 digits, others to 4 digits
# NOTE: it might be easier to just take the recommended uncertainty values from HH conventions at
# https://gitlab.cern.ch/hh/naming-conventions instead of taking the values from CMSDB
for k, procs in const.processes_per_QCDScale.items():
syst_name = f"QCDScale_{k}"
if syst_name not in self.systematics:
continue

for proc in procs:
if proc not in self.processes:
continue
process_inst = self.config_inst.get_process(proc)
if "scale" not in process_inst.xsecs[ecm]:
continue
self.add_parameter(
syst_name,
process=const.inference_procnames.get(proc, proc),
type=ParameterType.rate_gauss,
effect=tuple(map(
lambda f: round(f, 3),
process_inst.xsecs[ecm].get(names=("scale"), direction=("down", "up"), factor=True),
)),
)
self.add_parameter_to_group(syst_name, "theory")

# add PDF rate uncertainties to inference model
for k, procs in const.processes_per_pdf_rate.items():
syst_name = f"pdf_{k}"
if syst_name not in self.systematics:
continue

for proc in procs:
if proc not in self.processes:
continue
process_inst = self.config_inst.get_process(proc)
if "pdf" not in process_inst.xsecs[ecm]:
continue

self.add_parameter(
f"pdf_{k}",
process=const.inference_procnames.get(proc, proc),
type=ParameterType.rate_gauss,
effect=tuple(map(
lambda f: round(f, 3),
process_inst.xsecs[ecm].get(names=("pdf"), direction=("down", "up"), factor=True),
)),
)
self.add_parameter_to_group(syst_name, "theory")

def add_shape_parameters(self: InferenceModel):
"""
        Function that adds all shape parameters to the inference model
"""
for shape_uncertainty, shape_processes in const.processes_per_shape.items():
if shape_uncertainty not in self.systematics:
continue

# If "all" is included, takes all processes except for the ones specified (starting with !)
if "all" in shape_processes:
_remove_processes = {proc[:1] for proc in shape_processes if proc.startswith("!")}
shape_processes = set(self.processes) - _remove_processes

self.add_parameter(
shape_uncertainty,
process=shape_processes,
type=ParameterType.shape,
config_shift_source=const.source_per_shape[shape_uncertainty],
)

is_theory = "pdf" in shape_uncertainty or "murf" in shape_uncertainty
if is_theory:
self.add_parameter_to_group(shape_uncertainty, "theory")
else:
self.add_parameter_to_group(shape_uncertainty, "experiment")
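Concrete inference models are then obtained by overriding the class attributes listed at the top of the class via columnflow's derive mechanism. A minimal sketch; all process, channel, and systematic names in cls_dict are hypothetical placeholders, not actual analysis choices:

# minimal sketch of deriving a concrete model from the base class;
# the names in cls_dict are hypothetical placeholders
from hbw.inference.base import HBWInferenceModelBase

example_model = HBWInferenceModelBase.derive("example_model", cls_dict={
    "ml_model_name": "dense_default",
    "processes": ["ggHH_kl_1_kt_1_sl_hbbhww", "tt", "st"],
    "channels": ["cat_1e_ggHH_kl_1_kt_1_sl_hbbhww", "cat_1mu_ggHH_kl_1_kt_1_sl_hbbhww"],
    "systematics": ["lumi_13TeV_2017"],
})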
21 changes: 17 additions & 4 deletions hbw/inference/constants.py
@@ -16,10 +16,23 @@
}
signals = {*signals_ggHH, *signals_qqHH}

-# collection of all datasets (only 2017 ATM)
-e_datasets = {f"data_e_{i}" for i in ["b", "c", "d", "e", "f"]}
-mu_datasets = {f"data_mu_{i}" for i in ["b", "c", "d", "e", "f"]}
-datasets = {*e_datasets, *mu_datasets}
+# mapping between lepton categories and datasets (only 2017 ATM)
+data_datasets = {
+    "1e": {f"data_e_{i}" for i in ["b", "c", "d", "e", "f"]},
+    "1mu": {f"data_mu_{i}" for i in ["b", "c", "d", "e", "f"]},
+    "2e": {"data_e_b"},  # TODO: 2 lep datasets in cmsdb + config
+    "2mu": {"data_mu_b"},  # TODO
+    "emu": {"data_mu_b"},  # TODO
+}
+merged_datasets = set().union(*data_datasets.values())
+
+# mapping between process names in the config and inference model
+inference_procnames = {
+    # key: config process name, value: inference model process name
+    "foo": "bar",
+    # "st": "ST",
+    # "tt": "TT",
+}

# mapping of which processes are used for which QCDScale (rate) uncertainty
processes_per_QCDScale = {
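A short usage sketch of the new mapping, matching how add_inference_categories in base.py consumes it (the channel key is chosen for illustration):

# usage sketch: per-channel data datasets, as looked up in add_inference_categories
import hbw.inference.constants as const

lep = "1mu"  # one of the lepton category names keyed in data_datasets
datasets = const.data_datasets[lep]  # {"data_mu_b", ..., "data_mu_f"}
all_data = const.merged_datasets     # union over all lepton channels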