cleanup

mafrahm · mafrahm · commit 3b6ea9b8d4f6 · 2023-10-23T14:26:39.000+02:00
diff --git a/hbw/analysis/create_analysis.py b/hbw/analysis/create_analysis.py
@@ -65,12 +65,12 @@ def create_hbw_analysis(
     campaign_run2_2017_nano_v9 = cmsdb.campaigns.run2_2017_nano_v9.campaign_run2_2017_nano_v9
 
     # default config
-    #c17 = add_config(  # noqa
-    #    analysis_inst,
-    #    campaign_run2_2017_nano_v9.copy(),
-    #    config_name="c17",
-    #    config_id=2,
-    #)
+    c17 = add_config(  # noqa
+        analysis_inst,
+        campaign_run2_2017_nano_v9.copy(),
+        config_name="c17",
+        config_id=2,
+    )
 
     # config with limited number of files
     l17 = add_config(  # noqa
diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py
@@ -36,7 +36,6 @@ def add_config(
     limit_dataset_files: int | None = None,
 ) -> od.Config:
     # validations
-    print("add", analysis, config_name, config_id)
     assert campaign.x.year in [2016, 2017, 2018]
     if campaign.x.year == 2016:
         assert campaign.x.vfp in ["pre", "post"]
diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py
@@ -53,12 +53,7 @@ def ml_inputs_producer(cls, container, task_params):
 def default_producers(cls, container, task_params):
     """ Default producers chosen based on the Inference model and the ML Model """
 
-    # how it was before merge default, use the ml_inputs and event_weights
-    # TODO: we might need two ml_inputs producers in the future (sl vs dl)
-    #default_producers = ["dl_ml_inputs"]
-    #if dataset_inst and dataset_inst.is_mc:
-        # run event weights producer only if it's a MC dataset
-    #    default_producers.append("event_weights")
+    # by default, use the ML inputs producer and the event_weights producer
     default_producers = [ml_inputs_producer(cls, container, task_params), "event_weights"]
 
     # check if a ml_model has been set
diff --git a/hbw/config/dl/variables.py b/hbw/config/dl/variables.py
@@ -11,9 +11,11 @@
 np = maybe_import("numpy")
 ak = maybe_import("awkward")
 
-from columnflow.columnar_util import EMPTY_FLOAT  # noqa
+from hbw.config.styling import default_var_binning, default_var_unit
+from hbw.util import call_once_on_config
 
 
+@call_once_on_config()
 def add_dl_variables(config: od.Config) -> None:
     # bjet features
     config.add_variable(
@@ -99,3 +101,156 @@ def add_dl_variables(config: od.Config) -> None:
         binning=(40, 0, 3),
         x_title=r"$ \Delta \phi(ll,jj)$",
     )
+
+
+@call_once_on_config()
+def add_dl_ml_variables(config: od.Config) -> None:
+    """ Adds the ML input variables (``mli_*``) to a *config*; each expression is the column of the same name. """
+
+    # reconstructed variables
+    config.add_variable(
+        name="mli_ht",
+        expression="mli_ht",
+        binning=(40, 0, 1200),
+        unit="GeV",
+        x_title="HT",
+    )
+    config.add_variable(
+        name="mli_n_jet",
+        expression="mli_n_jet",
+        binning=(11, -0.5, 10.5),
+        x_title="Number of jets",
+    )
+    config.add_variable(
+        name="mli_n_deepjet",
+        expression="mli_n_deepjet",
+        binning=(11, -0.5, 10.5),
+        x_title="Number of b-tagged jets (deepjet medium WP)",
+    )
+    config.add_variable(
+        name="mli_deepjetsum",
+        expression="mli_deepjetsum",
+        binning=(40, 0, 4),
+        x_title="sum of deepjet scores",
+    )
+    config.add_variable(
+        name="mli_b_deepjetsum",
+        expression="mli_b_deepjetsum",
+        binning=(40, 0, 4),
+        x_title="sum of bjet deepjet scores",
+    )
+    config.add_variable(
+        name="mli_dr_bb",
+        expression="mli_dr_bb",
+        binning=(40, 0, 8),
+        x_title=r"$\Delta R(b,b)$",
+    )
+    config.add_variable(
+        name="mli_dphi_bb",
+        expression="mli_dphi_bb",
+        binning=(40, 0, 3.2),
+        x_title=r"$\Delta\Phi(b,b)$",
+    )
+    config.add_variable(
+        name="mli_mbb",
+        expression="mli_mbb",
+        binning=(40, 0, 400),
+        unit="GeV",
+        x_title=r"m(b,b)",
+    )
+    config.add_variable(
+        name="mli_mindr_lb",
+        expression="mli_mindr_lb",
+        binning=(40, 0, 8),
+        x_title=r"min $\Delta R(l,b)$",
+    )
+    config.add_variable(
+        name="mli_dphi_bb_nu",
+        expression="mli_dphi_bb_nu",
+        binning=(40, 0, 3.2),
+        x_title=r"$\Delta\Phi(bb,\nu)$",
+    )
+    config.add_variable(
+        name="mli_dr_bb_l",
+        expression="mli_dr_bb_l",
+        binning=(40, 0, 8),
+        x_title=r"$\Delta R(bb,l)$",
+    )
+    config.add_variable(
+        name="mli_mll",
+        expression="mli_mll",
+        binning=(40, 0, 80),
+        unit="GeV",  # invariant mass, same unit as mli_mbb / mli_mbbllMET
+        x_title=r"$m_{ll}$",
+    )
+    config.add_variable(
+        name="mli_dr_ll",
+        expression="mli_dr_ll",
+        binning=(40, 0, 8),
+        x_title=r"$\Delta R(ll)$",
+    )
+    config.add_variable(
+        name="mli_min_dr_llbb",  # name must match the expression (was "mli__min_dr_llbb")
+        expression="mli_min_dr_llbb",
+        binning=(40, 0, 8),
+        x_title=r"$\Delta R(bb,ll)$",
+    )
+    config.add_variable(
+        name="mli_bb_pt",
+        expression="mli_bb_pt",
+        binning=(40, 0, 500),
+        unit="GeV",
+        x_title=r"$p_{T}^{bb}$",  # "$bb_p_T$" is a double subscript and fails to parse as mathtext
+    )
+    config.add_variable(
+        name="mli_mllMET",
+        expression="mli_mllMET",
+        binning=(40, 0, 200),
+        unit="GeV",  # invariant mass, same unit as mli_mbb / mli_mbbllMET
+        x_title=r"$m_{llMET}$",
+    )
+    config.add_variable(
+        name="mli_dr_bb_llMET",
+        expression="mli_dr_bb_llMET",
+        binning=(40, 0, 8),
+        x_title=r"$\Delta R(bb,llMET)$",
+    )
+    config.add_variable(
+        name="mli_dphi_bb_llMET",
+        expression="mli_dphi_bb_llMET",
+        binning=(40, 0, 3.2),  # angle: same range as the other dphi variables (was 0..8)
+        x_title=r"$\Delta \phi(bb,llMET)$",
+    )
+    config.add_variable(
+        name="mli_mbbllMET",
+        expression="mli_mbbllMET",
+        binning=(40, 0, 500),
+        unit="GeV",
+        x_title=r"$m_{bbllMET}$",
+    )
+    config.add_variable(
+        name="mli_dphi_ll",
+        expression="mli_dphi_ll",
+        binning=(40, 0, 3.2),  # angle: same range as the other dphi variables, and no GeV unit
+        x_title=r"$\Delta \phi_{ll}$",
+    )
+    config.add_variable(
+        name="mli_ll_pt",
+        expression="mli_ll_pt",
+        binning=(40, 0, 200),
+        unit="GeV",
+        x_title=r"$ll p_T$",
+    )
+
+    # per-object kinematics (no eta for MET); NOTE(review): unit falls back to the variable name if unmapped
+    for obj in ["b1", "b2", "lep", "lep2", "met"]:
+        for var in ["pt", "eta"]:
+            if var == "eta" and obj == "met":
+                continue
+            config.add_variable(
+                name=f"mli_{obj}_{var}",
+                expression=f"mli_{obj}_{var}",
+                binning=default_var_binning[var],
+                unit=default_var_unit.get(var, var),
+                x_title=f"{obj} {var}",
+            )
diff --git a/hbw/config/ml_variables.py b/hbw/config/ml_variables.py
@@ -6,7 +6,6 @@
 
 import order as od
 
-# from columnflow.columnar_util import EMPTY_FLOAT
 from hbw.config.styling import default_var_binning, default_var_unit
 from hbw.util import call_once_on_config
 
diff --git a/hbw/ml/base.py b/hbw/ml/base.py
@@ -184,7 +184,7 @@ def prepare_inputs(
 
                 # calculate some stats per dataset
                 filenames = [inp["mlevents"].path for inp in files]
-                
+
                 N_events = sum([len(ak.from_parquet(fn)) for fn in filenames])
                 if N_events == 0:
                     # skip empty datasets
@@ -378,8 +378,7 @@ def prepare_ml_model(
 
         from keras.models import Sequential
         from keras.layers import Dense, BatchNormalization
-        # from hbw.ml.tf_util import cumulated_crossentropy
-        import tensorflow as tf
+        from hbw.ml.tf_util import cumulated_crossentropy
 
         n_inputs = len(set(self.input_features))
         n_outputs = len(self.processes)
@@ -400,9 +399,8 @@ def prepare_ml_model(
         # compile the network
         # NOTE: the custom loss needed due to output layer changes for negative weights
         optimizer = keras.optimizers.Adam(learning_rate=0.00050)
-        categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
         model.compile(
-            loss=categorical_crossentropy, #cumulated_crossentropy,
+            loss=cumulated_crossentropy,
             optimizer=optimizer,
             weighted_metrics=["categorical_accuracy"],
         )
diff --git a/hbw/ml/dense_classifier.py b/hbw/ml/dense_classifier.py
@@ -25,45 +25,40 @@
 class DenseClassifier(ModelFitMixin, DenseModelMixin, MLClassifierBase):
 
     processes = [
-        #"sg",
-        "ggHH_kl_5_kt_1_dl_hbbhww",
-        #"tt",
-        #"st",
+        "ggHH_kl_1_kt_1_sl_hbbhww",
+        "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww",
+        "tt",
+        "st",
         "v_lep",
-        "t_bkg",
-        #"w_lnu",
-        #"dy_lep",
+        # "w_lnu",
+        # "dy_lep",
     ]
 
     ml_process_weights = {
-        "ggHH_kl_0_kt_1_dl_hbbhww": 1,
-        "ggHH_kl_1_kt_1_dl_hbbhww": 1,
-        "ggHH_kl_5_kt_1_dl_hbbhww": 1,
-        "sg": 1,
-        "tt": 1,
-        "st": 1,
-        "v_lep": 1,
-        "tt_bkg": 1,
+        "ggHH_kl_1_kt_1_sl_hbbhww": 1,
+        "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww": 1,
+        "tt": 2,
+        "st": 2,
+        "v_lep": 2,
         "w_lnu": 2,
-        "dy_lep": 1,
+        "dy_lep": 2,
     }
 
     dataset_names = {
-        #"ggHH_kl_0_kt_1_dl_hbbhww_powheg",
-        #"ggHH_kl_1_kt_1_dl_hbbhww_powheg",
-        "ggHH_kl_5_kt_1_dl_hbbhww_powheg",
+        "ggHH_kl_1_kt_1_sl_hbbhww_powheg",
+        "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww_madgraph",
         # TTbar
         "tt_sl_powheg",
         "tt_dl_powheg",
         "tt_fh_powheg",
         # SingleTop
         "st_tchannel_t_powheg",
-        # "st_tchannel_tbar_powheg", #problem in previous task for production
+        "st_tchannel_tbar_powheg",
         "st_twchannel_t_powheg",
         "st_twchannel_tbar_powheg",
-        #"st_schannel_lep_amcatnlo", #problem with normalizatino weights.. 
+        "st_schannel_lep_amcatnlo",
         # "st_schannel_had_amcatnlo",
-        # WJets commented out because no events avaible and hence no nomralization weights 
+        # WJets
         "w_lnu_ht70To100_madgraph",
         "w_lnu_ht100To200_madgraph",
         "w_lnu_ht200To400_madgraph",
@@ -84,41 +79,29 @@ class DenseClassifier(ModelFitMixin, DenseModelMixin, MLClassifierBase):
     }
 
     input_features = [
-        "mli_mll", "mli_min_dr_llbb", "mli_dr_ll", "mli_bb_pt",
         "mli_ht", "mli_n_jet", "mli_n_deepjet",
-        "mli_deepjetsum", "mli_b_deepjetsum", "mli_l_deepjetsum",
+        # "mli_deepjetsum", "mli_b_deepjetsum", "mli_l_deepjetsum",
         "mli_dr_bb", "mli_dphi_bb", "mli_mbb", "mli_mindr_lb",
-        "mli_dphi_ll", "mli_dphi_bb_nu", "mli_dphi_bb_llMET", "mli_mllMET",
-        "mli_mbbllMET", "mli_dr_bb_llMET", "mli_ll_pt", "mli_met_pt",
-        #"mli_met_eta", "meli_met_pt", 
-        #"mli_dr_jj", "mli_dphi_jj", "mli_mjj", "mli_mindr_lj",
-        #"mli_dphi_lnu", "mli_mlnu", "mli_mjjlnu", "mli_mjjl", "mli_dphi_bb_jjlnu", "mli_dr_bb_jjlnu",
-        #"mli_dphi_bb_jjl", "mli_dr_bb_jjl", "mli_dphi_bb_nu", "mli_dphi_jj_nu", "mli_dr_bb_l", "mli_dr_jj_l",
-        #"mli_mbbjjlnu", "mli_mbbjjl", "mli_s_min",
+        "mli_dr_jj", "mli_dphi_jj", "mli_mjj", "mli_mindr_lj",
+        "mli_dphi_lnu", "mli_mlnu", "mli_mjjlnu", "mli_mjjl", "mli_dphi_bb_jjlnu", "mli_dr_bb_jjlnu",
+        "mli_dphi_bb_jjl", "mli_dr_bb_jjl", "mli_dphi_bb_nu", "mli_dphi_jj_nu", "mli_dr_bb_l", "mli_dr_jj_l",
+        "mli_mbbjjlnu", "mli_mbbjjl", "mli_s_min",
     ] + [
         f"mli_{obj}_{var}"
-        for obj in ["b1", "b2", "lep", "lep2"]
+        for obj in ["b1", "b2", "j1", "j2", "lep", "met"]
         for var in ["pt", "eta"]
-    ] 
-    """
-      + [
+    ] + [
         f"mli_{obj}_{var}"
         for obj in ["fj"]
         for var in ["pt", "eta", "phi", "mass", "msoftdrop", "deepTagMD_HbbvsQCD"]
     ]
-    """
 
     store_name = "inputs_v1"
 
-    folds = 3
-    layers = (164, 164, 164)
-    activation = "relu"
-    learningrate = 0.0005
-    batchsize = 8000 #2 ** 12
-    epochs = 150
-    dropout = 0.50
-    negative_weights = "abs"
+    folds = 5
     validation_fraction = 0.20
+    learningrate = 0.00050
+    negative_weights = "handle"
 
     # overwriting DenseModelMixin parameters
     activation = "relu"
@@ -204,21 +187,19 @@ def training_selector(self, config_inst: od.Config, requested_selector: str) ->
 
     def training_producers(self, config_inst: od.Config, requested_producers: Sequence[str]) -> list[str]:
         # fix MLTraining Phase Space
-        return ["dl_ml_inputs"] if self.config_ist.has_tag("is_sl") else [""]
+        return ["ml_inputs"]
 
 
 # copies of the default DenseClassifier for testing hard-coded changes
 for i in range(10):
     dense_copy = DenseClassifier.derive(f"dense_{i}")
 
 cls_dict_test = {
-    "folds": 5,
-    "epochs": 100,
-    "processes": ["ggHH_kl_5_kt_1_dl_hbbhww", "v_lep", "t_bkg"],
+    "epochs": 4,
+    "processes": ["ggHH_kl_1_kt_1_sl_hbbhww", "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww", "tt", "st", "v_lep"],
     "dataset_names": {
-        "ggHH_kl_5_kt_1_dl_hbbhww_powheg",  # "tt_dl_powheg",
-        # "st_tchannel_t_powheg", #"w_lnu_ht400To600_madgraph",
-        "dy_lep_m50_ht400to600_madgraph",
+        "ggHH_kl_1_kt_1_sl_hbbhww_powheg", "qqHH_CV_1_C2V_1_kl_1_sl_hbbhww_madgraph", "tt_dl_powheg",
+        "st_tchannel_t_powheg", "w_lnu_ht400To600_madgraph",
     },
 }
 
diff --git a/hbw/ml/dl.py b/hbw/ml/dl.py
diff --git a/hbw/production/ml_inputs.py b/hbw/production/ml_inputs.py