Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/calib and sel #99

Open
wants to merge 23 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions hbw/analysis/create_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def analysis_factory(configs: od.UniqueObjectIndex):
)
else:
raise ValueError(
f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n",
f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n"
f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y",
)
# cpn_task.run()
Expand Down Expand Up @@ -141,42 +141,39 @@ def analysis_factory(configs: od.UniqueObjectIndex):

# 2017
add_lazy_config(
# {
# "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9",
# },
"c17",
1700,
)

# 2022 preEE
add_lazy_config(
# {
# "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12",
# "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13",
# },
"c22pre",
2200,
)

# 2022 postEE
add_lazy_config(
# {
# "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12",
# "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13",
# "cmsdb.campaigns.run3_2022_postEE_nano_uhh_v12": "campaign_run3_2022_postEE_nano_uhh_v12",
# },
"c22post",
2210,
)

add_lazy_config(
"c22pre_das",
2201,
)
add_lazy_config(
"c22post_das",
2211,
)

#
# modify store_parts
#

software_tasks = ("cf.BundleBashSandbox", "cf.BundleCMSSWSandbox", "cf.BundleSoftware")
shareable_analysis_tasks = ("cf.CalibrateEvents", "cf.GetDatasetLFNs")
limited_config_shared_tasks = ("cf.CalibrateEvents", "cf.GetDatasetLFNs", "cf.SelectEvents", "cf.ReduceEvents")
skip_new_version_schema = ("cf.CalibrateEvents", "cf.GetDatasetLFNs")
skip_new_version_schema = ()
known_parts = (
# from cf
"analysis", "task_family", "config", "configs", "dataset", "shift", "version",
Expand Down Expand Up @@ -237,11 +234,13 @@ def reorganize_parts(task, store_parts):
"analysis",
"calibrator", "calibrators", "calib",
"selector", "sel",
"producer", "producers", "prod",
"config", "configs",
"producers", "prod",
"ml_data", "ml_model", "ml_models",
"weightprod", "inf_model",
"task_family",
"config", "dataset", "shift",
"calibrator", "producer",
"shift", "dataset",
]
parts_order_end = ["version"]

Expand Down
83 changes: 83 additions & 0 deletions hbw/analysis/processes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# coding: utf-8

"""
Creation and modification of processes in the HH -> bbWW analysis.
NOTE: it is crucial to modify processes before the campaign is created. Otherwise,
the changes will not be reflected in the campaign and there will be inconsistencies.
"""

# import order as od


from hbw.config.processes import create_parent_process
from hbw.config.styling import color_palette
from cmsdb.util import add_decay_process


def modify_cmsdb_processes():
    """
    Adjust and extend the cmsdb process instances used in the HH -> bbWW analysis.

    Creates additional parent processes (*qcd_ele*, *v_lep*, *t_bkg*, *background*)
    and attaches heavy-flavour/light-flavour ("hf"/"lf") decay sub-processes to the
    DY processes. As noted in the module docstring, this must run before the
    campaign is created so the changes are reflected consistently.
    """
    # process instances are imported here, at call time, from the cmsdb registry
    from cmsdb.processes import (
        qcd_mu, qcd_em, qcd_bctoe,
        tt, ttv, st, w_lnu, vv, h,
        dy, dy_m4to10, dy_m10to50, dy_m50toinf, dy_m50toinf_0j, dy_m50toinf_1j, dy_m50toinf_2j,
    )

    qcd_mu.label = "QCD Muon enriched"
    # group the electron-enriched QCD samples under one parent process
    qcd_ele = create_parent_process(
        [qcd_em, qcd_bctoe],
        name="qcd_ele",
        id=31199,
        label="QCD Electron enriched",
    )

    # combined single-boson leptonic background (W -> l nu and Drell-Yan)
    v_lep = create_parent_process(
        [w_lnu, dy],
        name="v_lep",
        id=64575573,  # random number
        label="W and DY",
    )

    # combined top-quark backgrounds (single top, ttbar, ttbar+V)
    t_bkg = create_parent_process(
        [st, tt, ttv],
        name="t_bkg",
        id=97842611,  # random number
        label="tt + st",
    )

    # catch-all background parent process
    # NOTE(review): w_lnu is listed here directly although it is already a child of
    # v_lep above — confirm this double listing is intended and does not double count
    background = create_parent_process(  # noqa: F841
        [t_bkg, v_lep, vv, w_lnu, h, qcd_ele, qcd_mu],
        name="background",
        id=99999,
        label="background",
        color=color_palette["blue"],
    )

    # decay channel definitions used to split DY into light- and heavy-flavour parts;
    # "br" of -1 means the branching ratio is left undefined here
    decay_map = {
        "lf": {
            "name": "lf",
            "id": 50,
            "label": "(lf)",
            "br": -1,
        },
        "hf": {
            "name": "hf",
            "id": 60,
            "label": "(hf)",
            "br": -1,
        },
    }

    for dy_proc_inst in (
        dy, dy_m4to10, dy_m10to50, dy_m50toinf, dy_m50toinf_0j, dy_m50toinf_1j, dy_m50toinf_2j,
    ):
        # only the sub-processes (not the inclusive "dy") get linked to a production-mode parent
        add_production_mode_parent = dy_proc_inst.name != "dy"
        for flavour in ("hf", "lf"):
            # the 'add_decay_process' function helps us to create all parent-daughter relationships
            add_decay_process(
                dy_proc_inst,
                decay_map[flavour],
                add_production_mode_parent=add_production_mode_parent,
                name_func=lambda parent_name, decay_name: f"{parent_name}_{decay_name}",
                label_func=lambda parent_label, decay_label: f"{parent_label} {decay_label}",
                xsecs=None,
                aux={"flavour": flavour},
            )
143 changes: 127 additions & 16 deletions hbw/calibration/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,59 +7,170 @@
import law

from columnflow.calibration import Calibrator, calibrator
from columnflow.calibration.cms.met import met_phi
from columnflow.calibration.cms.jets import jec, jer
from columnflow.production.cms.seeds import deterministic_seeds
from columnflow.util import maybe_import
from columnflow.util import maybe_import, try_float
from columnflow.columnar_util import set_ak_column, EMPTY_FLOAT

from hbw.calibration.jet import jec_nominal, bjet_regression
from hbw.util import MET_COLUMN

from hbw.calibration.jet import bjet_regression

ak = maybe_import("awkward")
np = maybe_import("numpy")


logger = law.logger.get_logger(__name__)


@calibrator(
    uses={deterministic_seeds},
    # jec uncertainty_sources: set to None to use config default
    jec_sources=["Total"],
    version=1,
    # add dummy produces such that this calibrator will always be run when requested
    # (temporary workaround until init's are only run as often as necessary)
    produces={"FatJet.pt"},
)
def fatjet(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
    """
    FatJet calibrator, combining JEC and JER.

    Applies the derived FatJet JEC calibrator (``self.fatjet_jec_cls``) to all
    datasets and additionally applies JER (``self.fatjet_jer_cls``) to MC only;
    both classes are derived in the init below. Raises when invoked on any shift
    other than nominal.
    """
    if self.task.local_shift != "nominal":
        raise Exception("FatJet Calibrator should not be run for shifts other than nominal")

    # apply the fatjet JEC and JER
    events = self[self.fatjet_jec_cls](events, **kwargs)
    if self.dataset_inst.is_mc:
        events = self[self.fatjet_jer_cls](events, **kwargs)

    return events


@fatjet.init
def fatjet_init(self: Calibrator) -> None:
    """
    Initialize the :py:func:`fatjet` calibrator by deriving its JEC and JER
    calibrator classes and registering them as dependencies.

    Only does work in the context of the ``cf.CalibrateEvents`` task itself and
    when the dataset instance is already set.
    """
    if not self.task or self.task.task_family != "cf.CalibrateEvents":
        # init only required for task itself
        return

    if not getattr(self, "dataset_inst", None):
        return

    # NOTE: removed dead `self.calibrators = []` leftover from the AK4 init;
    # the fatjet calibrator runs via fatjet_jec_cls/fatjet_jer_cls and never
    # reads a `calibrators` attribute

    fatjet_jec_cls_dict = {
        "jet_name": "FatJet",
        "gen_jet_name": "GenJetAK8",
        # MET propagation is performed in AK4 jet calibrator; fatjet should never use any MET columns
        "propagate_met": False,
        "met_name": "DO_NOT_USE",
        "raw_met_name": "DO_NOT_USE",
    }
    fatjet_jer_cls_dict = fatjet_jec_cls_dict.copy()
    # NOTE: deterministic FatJet seeds are not yet possible to produce
    # fatjet_jer_cls_dict["deterministic_seed_index"] = 0

    # data only gets nominal JEC; MC additionally evaluates the configured uncertainty sources
    uncertainty_sources = [] if self.dataset_inst.is_data else self.jec_sources
    jec_cls_name = f"fatjet_jec{'_nominal' if uncertainty_sources == [] else ''}"
    self.fatjet_jec_cls = jec.derive(jec_cls_name, cls_dict={
        **fatjet_jec_cls_dict,
        "uncertainty_sources": uncertainty_sources,
    })
    self.fatjet_jer_cls = jer.derive("deterministic_fatjet_jer", cls_dict=fatjet_jer_cls_dict)

    # expose the derived calibrators as dependencies of this calibrator
    self.uses |= {self.fatjet_jec_cls, self.fatjet_jer_cls}
    self.produces |= {self.fatjet_jec_cls, self.fatjet_jer_cls}


@calibrator(
    uses={deterministic_seeds, MET_COLUMN("{pt,phi}")},
    produces={deterministic_seeds},
    # jec uncertainty_sources: set to None to use config default
    jec_sources=["Total"],
    bjet_regression=True,
    version=1,
)
def jet_base(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array:
    """
    AK4 jet calibrator chain.

    Produces deterministic seeds, then runs all calibrators collected in
    ``self.calibrators`` (built in the init below) in order. Afterwards, any
    non-finite values in produced columns are replaced: MET pt/phi columns fall
    back to their pre-calibration values, all other columns to ``EMPTY_FLOAT``.
    """
    events = self[deterministic_seeds](events, **kwargs)

    # keep a copy of non-propagated MET to replace infinite values
    pre_calib_met = events[self.config_inst.x.met_name]

    logger.info(f"Running calibrators '{[calib.cls_name for calib in self.calibrators]}' (in that order)")
    for calibrator_inst in self.calibrators:
        events = self[calibrator_inst](events, **kwargs)

    # workaround for infinite values in MET pt/phi
    for route in self.produced_columns:
        col = route.string_column
        # evaluate the route once and reuse it for both the mask and the replacement
        values = route.apply(events)
        m = ~np.isfinite(values)
        if ak.any(m):
            # replace infinite values
            replace_value = EMPTY_FLOAT
            if self.config_inst.x.met_name in col:
                # use pre-calibrated MET to replace infinite values of MET pt/phi
                replace_value = pre_calib_met[col.split(".")[-1].split("_")[0]]
            logger.info(
                f"Found infinite values in {col}; Values will be replaced with "
                f"{replace_value if try_float(replace_value) else replace_value[m]}",
            )
            events = set_ak_column(events, col, ak.where(m, replace_value, values))

    return events


@jet_base.init
def jet_base_init(self: Calibrator) -> None:
    """
    Initialize the AK4 jet calibrator chain of :py:func:`jet_base`.

    Builds ``self.calibrators``, the ordered list of calibrators to run:
    JEC (nominal only for data, with the configured ``jec_sources`` for MC),
    optional bjet regression, deterministic JER (MC only), and the MET-phi
    correction for run-2 configs. All derived calibrators are registered as
    dependencies.
    """
    if not self.task or self.task.task_family != "cf.CalibrateEvents":
        # init only required for task itself
        return

    if not getattr(self, "dataset_inst", None):
        return

    # MET column names as configured per config instance
    met_name = self.config_inst.x.met_name
    raw_met_name = self.config_inst.x.raw_met_name

    # list of calibrators to apply (in that order)
    self.calibrators = []

    # data only gets nominal JEC; MC evaluates the configured uncertainty sources
    uncertainty_sources = [] if self.dataset_inst.is_data else self.jec_sources
    jec_cls_name = f"ak4_jec{'_nominal' if uncertainty_sources == [] else ''}"

    jec_cls = jec.derive(
        jec_cls_name,
        cls_dict={
            "uncertainty_sources": uncertainty_sources,
            "met_name": met_name,
            "raw_met_name": raw_met_name,
        },
    )
    self.calibrators.append(jec_cls)

    if self.bjet_regression:
        self.calibrators.append(bjet_regression)

    # run JER only on MC
    if self.dataset_inst.is_mc:
        # version of jer that uses the first random number from deterministic_seeds
        deterministic_jer_cls = jer.derive(
            "deterministic_jer",
            cls_dict={
                "deterministic_seed_index": 0,
                "met_name": met_name,
            },
        )
        self.calibrators.append(deterministic_jer_cls)

    if self.config_inst.x.run == 2:
        # derive met_phi calibrator (currently only for run 2)
        met_phi_cls = met_phi.derive("met_phi", cls_dict={"met_name": met_name})
        self.calibrators.append(met_phi_cls)

    # expose all derived calibrators as dependencies of this calibrator
    self.uses |= set(self.calibrators)
    self.produces |= set(self.calibrators)


# derived calibrator variants toggling the bjet regression
# (resolved diff residue: the old `base`-derived variants `default`, `full` and the
# `skip_jecunc` flag were removed in favor of the two `jet_base` derivations below)
skip_jecunc = jet_base.derive("skip_jecunc", cls_dict=dict(bjet_regression=False))
with_b_reg = jet_base.derive("with_b_reg", cls_dict=dict(bjet_regression=True))
10 changes: 6 additions & 4 deletions hbw/categorization/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from columnflow.selection import SelectionResult
from columnflow.columnar_util import has_ak_column, optional_column

from hbw.util import MET_COLUMN

np = maybe_import("numpy")
ak = maybe_import("awkward")

Expand Down Expand Up @@ -160,15 +162,15 @@ def catid_fake(
return events, mask


@categorizer(uses={MET_COLUMN("pt")})
def catid_highmet(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]:
    """Category mask for events with configured MET pt of at least 20 (GeV)."""
    # the MET collection name is taken from the config (resolved diff residue:
    # removed the duplicated old hard-coded `events.MET.pt` line)
    mask = events[self.config_inst.x.met_name].pt >= 20
    return events, mask


@categorizer(uses={MET_COLUMN("pt")})
def catid_lowmet(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]:
    """Category mask for events with configured MET pt below 20 (GeV)."""
    # complementary to catid_highmet; MET collection name taken from the config
    # (resolved diff residue: removed the duplicated old hard-coded `events.MET.pt` line)
    mask = events[self.config_inst.x.met_name].pt < 20
    return events, mask

#
Expand Down
8 changes: 5 additions & 3 deletions hbw/columnflow_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,7 @@ def TaskArrayFunction_str(self):

TaskArrayFunction.__str__ = TaskArrayFunction_str
logger.info(
"patched TaskArrayFunction.__str__ to include the CSP version attribute "
"(NOTE that this currently does not work for the "
"MLTrainingMixin tasks (e.g. MLPreTraining and MLTraining))",
"patched TaskArrayFunction.__str__ to include the CSP version attribute",
)


Expand Down Expand Up @@ -136,6 +134,10 @@ def patched_init(self, *args, **kwargs):

@memoize
def patch_all():
# change the "retries" parameter default
from columnflow.tasks.framework.remote import RemoteWorkflow
RemoteWorkflow.retries = RemoteWorkflow.retries.copy(default=3)

patch_mltraining()
patch_htcondor_workflow_naf_resources()
# patch_column_alias_strategy()
Expand Down
Loading
Loading