#       extension: .py
#       format_name: percent
#       format_version: '1.3'
- #       jupytext_version: 1.14.1
+ #       jupytext_version: 1.14.7
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
# %% [markdown]
# ### Imports: setting up our environment

- # %% tags=[]
+ # %%
import asyncio
import logging
import os
8989#
9090# The input files are all in the 1–3 GB range.
9191
92- # %% tags=[]
92+ # %%
9393### GLOBAL CONFIGURATION
9494# input files per process, set to e.g. 10 (smaller number = faster)
9595N_FILES_MAX_PER_SAMPLE = 5
# - calculating systematic uncertainties at the event and object level,
# - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`.

- # %% tags=[]
+ # %%
# functions creating systematic variations
def flat_variation(ones):
    # 2.5% weight variations
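The hunk cuts this cell off here. For orientation: a "flat" variation in this kind of processor is just the nominal per-event weight scaled up and down by a fixed fraction. A minimal sketch of how such a helper could be written (illustrative only; the actual body is elided by the diff):

import numpy as np

def flat_variation(ones):
    # input: array of per-event weights equal to 1.0
    # output: (n_events, 2) array with the weights scaled by +2.5% and -2.5%
    return (1.0 + np.array([0.025, -0.025], dtype=np.float32)) * ones[:, None]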
@@ -316,8 +316,13 @@ def postprocess(self, accumulator):
#
# Here, we gather all the required information about the files we want to process: paths to the files and associated metadata.

- # %% tags=[]
- fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"])  # local files on /data for ssl-dev
+ # %%
+ fileset = utils.construct_fileset(
+     N_FILES_MAX_PER_SAMPLE,
+     use_xcache=False,
+     af_name=config["benchmarking"]["AF_NAME"],
+     input_from_eos=config["benchmarking"]["INPUT_FROM_EOS"]
+ )  # local files on /data for ssl-dev as af_name

print(f"processes in fileset: {list(fileset.keys())}")
print(f"\nexample of information in fileset:\n{{\n  'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
@@ -329,7 +334,7 @@ def postprocess(self, accumulator):
#
# Define the `func_adl` query used to extract the relevant columns and filter events.

- # %% tags=[]
+ # %%
def get_query(source: ObjectStream) -> ObjectStream:
    """Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns
    """
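The query body itself lies outside the hunk. As a rough illustration of the `func_adl` style used here (the cut and the column names below are made up for illustration, not the notebook's actual selection), an event filter plus column extraction is chained from `Where` and `Select` calls:

from func_adl import ObjectStream

def example_query(source: ObjectStream) -> ObjectStream:
    # keep events with exactly one lepton above 25 GeV, then return a few jet columns
    return source.Where(
        lambda e: e.Electron_pt.Where(lambda pt: pt > 25).Count()
        + e.Muon_pt.Where(lambda pt: pt > 25).Count() == 1
    ).Select(
        lambda e: {"Jet_pt": e.Jet_pt, "Jet_btagCSVV2": e.Jet_btagCSVV2}
    )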
@@ -355,7 +360,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# Using the query created with `func_adl`, `ServiceX` reads the CMS Open Data files and builds cached files containing only the event information requested by the query.

- # %% tags=[]
+ # %%
if USE_SERVICEX:
    # dummy dataset on which to generate the query
    dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot")
@@ -385,7 +390,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# When `USE_SERVICEX` is false, the input files need to be processed during this step as well.

- # %% tags=[]
+ # %%
NanoAODSchema.warn_missing_crossrefs = False  # silences warnings about branches we will not use here
if USE_DASK:
    executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
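The rest of this cell is outside the hunk. For context, in coffea 0.7 the chosen executor is wrapped in a `processor.Runner`, which is then called on the fileset; a sketch of that pattern follows (the local-executor fallback, the chunk size, and the `TtbarAnalysis` processor name are assumptions standing in for the elided code):

if USE_DASK:
    executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
else:
    executor = processor.FuturesExecutor(workers=4)  # assumed local fallback

run = processor.Runner(
    executor=executor,
    schema=NanoAODSchema,
    savemetrics=True,   # makes run() also return performance metrics
    chunksize=100_000,  # assumed value
)

# "TtbarAnalysis" stands in for the coffea processor class defined earlier in the notebook
all_histograms, metrics = run(fileset, "Events", processor_instance=TtbarAnalysis())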
@@ -410,7 +415,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

print(f"\nexecution took {exec_time:.2f} seconds")

- # %% tags=[]
+ # %%
# track metrics
dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094"  # TODO: xcache support
metrics.update({
@@ -448,15 +453,15 @@ def get_query(source: ObjectStream) -> ObjectStream:
# Let's have a look at the data we obtained.
# We built histograms in two phase space regions, for multiple physics processes and systematic variations.

- # %% tags=[]
+ # %%
utils.set_style()

all_histograms[110j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, 1 b-tag")
plt.xlabel("HT [GeV]");

- # %% tags=[]
+ # %%
all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, >= 2 b-tags")
@@ -471,7 +476,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# We are making use of [UHI](https://uhi.readthedocs.io/) here to re-bin (see the standalone toy example after this cell).

- # %% tags=[]
+ # %%
# b-tagging variations
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2)
@@ -482,7 +487,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
plt.xlabel("HT [GeV]")
plt.title("b-tagging variations");

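The slice used in the b-tagging plots above, `110j::hist.rebin(2)`, is UHI shorthand: a complex number selects by axis value (start at the bin edge at 110 GeV) rather than by bin index, and `hist.rebin(2)` merges pairs of adjacent bins. A self-contained toy example of the same syntax (the histogram definition here is made up purely for illustration):

import numpy as np
import hist

# toy histogram: 25 bins from 50 to 550 GeV
h = hist.Hist.new.Reg(25, 50, 550, name="observable", label="observable [GeV]").Weight()
h.fill(observable=np.random.normal(300, 80, 10_000))

# start at the bin edge at 110 GeV and merge pairs of adjacent bins
h_rebinned = h[110j :: hist.rebin(2)]
print(h_rebinned.axes[0].edges)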
- # %% tags=[]
+ # %%
# jet energy scale variations
all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2)
@@ -497,7 +502,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We'll save everything to disk for subsequent usage.
# This also builds pseudo-data by combining events from the various simulation setups we have processed.

- # %% tags=[]
+ # %%
utils.save_histograms(all_histograms, fileset, "histograms.root")

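Since the output file is meant to be reused later, a quick sanity check is to read it back with `uproot` (the key name in the commented-out line is an assumption; use whatever `f.keys()` actually reports):

import uproot

with uproot.open("histograms.root") as f:
    print(f.keys()[:10])  # list the first few stored histograms
    # h = f["4j1b_ttbar"].to_hist()  # assumed key name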
# %% [markdown]
@@ -510,7 +515,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# A statistical model has been defined in `cabinetry_config.yml`, ready to be used with our output.
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

- # %% tags=[]
+ # %%
config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
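The remainder of this cell falls outside the hunk, but the later cells use a `ws` object and a `workspace.json` file, so the elided steps presumably collect the template histograms and build the workspace. The standard `cabinetry` calls for that step look roughly like this (a sketch, not necessarily line-for-line what the notebook does):

cabinetry.templates.collect(config)             # read the histograms produced above
ws = cabinetry.workspace.build(config)          # build the pyhf workspace
cabinetry.workspace.save(ws, "workspace.json")  # file inspected in the next cell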
@@ -523,13 +528,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.

- # %% tags=[]
+ # %%
# !pyhf inspect workspace.json | head -n 20

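The same inspection can be done from Python rather than via the CLI; a minimal sketch, assuming the workspace was saved to workspace.json as above:

import json
import pyhf

with open("workspace.json") as f:
    spec = json.load(f)

pyhf_ws = pyhf.Workspace(spec)
print(pyhf_ws.channels)  # region names contained in the workspace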
# %% [markdown]
# Let's try out what we built: the next cell will perform a maximum likelihood fit of our statistical model to the pseudodata we built.

- # %% tags=[]
+ # %%
model, data = cabinetry.model_utils.model_and_data(ws)
fit_results = cabinetry.fit.fit(model, data)

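Beyond the lines shown here, `cabinetry` can also visualize the fit results directly; for example, a pull plot of the nuisance parameters (excluding the normalization factor, which carries no constraint term) is a single call. This is an optional extra, not part of the diff:

cabinetry.visualize.pulls(fit_results, exclude="ttbar_norm")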
@@ -540,31 +545,31 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?

- # %% tags=[]
+ # %%
poi_index = model.config.poi_index
print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}")

# %% [markdown]
# Let's also visualize the model before and after the fit, in both the regions we are using.
# The binning here corresponds to the binning used for the fit.

- # %% tags=[]
+ # %%
model_prediction = cabinetry.model_utils.prediction(model)
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
figs[0]["figure"]

- # %% tags=[]
+ # %%
figs[1]["figure"]

# %% [markdown]
# We can see very good post-fit agreement.

- # %% tags=[]
+ # %%
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
figs[0]["figure"]

- # %% tags=[]
+ # %%
figs[1]["figure"]

# %% [markdown]