Commit 4f5a661

fix: sync changes from #185 to script (#186)
* run jupytext to sync notebook changes to script version
Parent: 24a069b
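The sync named in the commit message can be done with `jupytext`; below is a minimal sketch using its Python API (the CLI equivalent is `jupytext --sync`; the notebook path is an assumption based on the script name in this diff):

```python
# Minimal sketch of the jupytext sync behind this commit; assumes jupytext
# is installed and the paired notebook sits next to the script.
import jupytext

# read the notebook that carries the changes from #185 ...
notebook = jupytext.read("ttbar_analysis_pipeline.ipynb")

# ... and re-emit it as the percent-format script tracked in this repo;
# this refreshes the header (jupytext_version) and the cell markers
jupytext.write(notebook, "ttbar_analysis_pipeline.py", fmt="py:percent")
```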

File tree: 1 file changed (28 additions, 23 deletions)

analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
```diff
@@ -6,7 +6,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.14.1
+#       jupytext_version: 1.14.7
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
@@ -40,7 +40,7 @@
 # %% [markdown]
 # ### Imports: setting up our environment

-# %% tags=[]
+# %%
 import asyncio
 import logging
 import os
@@ -89,7 +89,7 @@
 #
 # The input files are all in the 1–3 GB range.

-# %% tags=[]
+# %%
 ### GLOBAL CONFIGURATION
 # input files per process, set to e.g. 10 (smaller number = faster)
 N_FILES_MAX_PER_SAMPLE = 5
@@ -114,7 +114,7 @@
 # - calculating systematic uncertainties at the event and object level,
 # - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`.

-# %% tags=[]
+# %%
 # functions creating systematic variations
 def flat_variation(ones):
     # 2.5% weight variations
@@ -316,8 +316,13 @@ def postprocess(self, accumulator):
 #
 # Here, we gather all the required information about the files we want to process: paths to the files and associated metadata.

-# %% tags=[]
-fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"])  # local files on /data for ssl-dev
+# %%
+fileset = utils.construct_fileset(
+    N_FILES_MAX_PER_SAMPLE,
+    use_xcache=False,
+    af_name=config["benchmarking"]["AF_NAME"],
+    input_from_eos=config["benchmarking"]["INPUT_FROM_EOS"]
+)  # local files on /data for ssl-dev as af_name

 print(f"processes in fileset: {list(fileset.keys())}")
 print(f"\nexample of information in fileset:\n{{\n  'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
```
```diff
@@ -329,7 +334,7 @@ def postprocess(self, accumulator):
 #
 # Define the func_adl query used to extract columns and filter events.

-# %% tags=[]
+# %%
 def get_query(source: ObjectStream) -> ObjectStream:
     """Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns
     """
```
```diff
@@ -355,7 +360,7 @@
 #
 # Using the queries created with `func_adl`, we use `ServiceX` to read the CMS Open Data files and build cached files containing only the event information dictated by the query.

-# %% tags=[]
+# %%
 if USE_SERVICEX:
     # dummy dataset on which to generate the query
     dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot")
@@ -385,7 +390,7 @@
 #
 # When `USE_SERVICEX` is false, the input files need to be processed during this step as well.

-# %% tags=[]
+# %%
 NanoAODSchema.warn_missing_crossrefs = False  # silences warnings about branches we will not use here
 if USE_DASK:
     executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
```
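The executor chosen here is then handed to a coffea `Runner`; a sketch of the typical coffea 0.7 wiring (the processor class name and chunk size are placeholders, not necessarily this repo's exact values):

```python
# Typical coffea 0.7 wiring for the executor above; TtbarAnalysis and the
# chunksize are placeholders, not necessarily this repo's exact values.
from coffea import processor
from coffea.nanoevents import NanoAODSchema

run = processor.Runner(
    executor=executor,     # DaskExecutor or a local FuturesExecutor
    schema=NanoAODSchema,  # interpret the inputs as NanoAOD
    savemetrics=True,      # also return performance metrics
    chunksize=100_000,     # events per work unit
)
all_histograms, metrics = run(fileset, treename="Events", processor_instance=TtbarAnalysis())
```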
```diff
@@ -410,7 +415,7 @@

 print(f"\nexecution took {exec_time:.2f} seconds")

-# %% tags=[]
+# %%
 # track metrics
 dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094"  # TODO: xcache support
 metrics.update({
@@ -448,15 +453,15 @@
 # Let's have a look at the data we obtained.
 # We built histograms in two phase space regions, for multiple physics processes and systematic variations.

-# %% tags=[]
+# %%
 utils.set_style()

 all_histograms[110j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
 plt.legend(frameon=False)
 plt.title(">= 4 jets, 1 b-tag")
 plt.xlabel("HT [GeV]");

-# %% tags=[]
+# %%
 all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
 plt.legend(frameon=False)
 plt.title(">= 4 jets, >= 2 b-tags")
@@ -471,7 +476,7 @@
 #
 # We are making use of [UHI](https://uhi.readthedocs.io/) here to re-bin.

-# %% tags=[]
+# %%
 # b-tagging variations
 all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
 all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2)
```
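The `110j::hist.rebin(2)` indexing in these cells is UHI syntax; a self-contained sketch of what it does (the axis binning and fill values are made up for the example):

```python
# Self-contained illustration of the UHI slice used above; the axis binning
# and fill values are illustrative, not the analysis histograms.
import hist
import numpy as np

h = hist.Hist.new.Reg(25, 50, 550, name="HT", label="HT [GeV]").Double()
h.fill(HT=np.random.normal(300, 80, size=1_000))

# 110j is a complex-number index meaning "at coordinate 110" (here GeV):
# keep bins from the 110 GeV edge upward, then merge pairs of bins
rebinned = h[110j :: hist.rebin(2)]
print(rebinned)
```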
```diff
@@ -482,7 +487,7 @@
 plt.xlabel("HT [GeV]")
 plt.title("b-tagging variations");

-# %% tags=[]
+# %%
 # jet energy scale variations
 all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
 all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2)
@@ -497,7 +502,7 @@
 # We'll save everything to disk for subsequent usage.
 # This also builds pseudo-data by combining events from the various simulation setups we have processed.

-# %% tags=[]
+# %%
 utils.save_histograms(all_histograms, fileset, "histograms.root")

 # %% [markdown]
@@ -510,7 +515,7 @@
 # A statistical model has been defined in `config.yml`, ready to be used with our output.
 # We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

-# %% tags=[]
+# %%
 config = cabinetry.configuration.load("cabinetry_config.yml")

 # rebinning: lower edge 110 GeV, merge bins 2->1
@@ -523,13 +528,13 @@
 # %% [markdown]
 # We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.

-# %% tags=[]
+# %%
 # !pyhf inspect workspace.json | head -n 20

 # %% [markdown]
 # Let's try out what we built: the next cell will perform a maximum likelihood fit of our statistical model to the pseudodata we built.

-# %% tags=[]
+# %%
 model, data = cabinetry.model_utils.model_and_data(ws)
 fit_results = cabinetry.fit.fit(model, data)

@@ -540,31 +545,31 @@
 # %% [markdown]
 # For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?

-# %% tags=[]
+# %%
 poi_index = model.config.poi_index
 print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}")

 # %% [markdown]
 # Let's also visualize the model before and after the fit, in both the regions we are using.
 # The binning here corresponds to the binning used for the fit.

-# %% tags=[]
+# %%
 model_prediction = cabinetry.model_utils.prediction(model)
 figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
 figs[0]["figure"]

-# %% tags=[]
+# %%
 figs[1]["figure"]

 # %% [markdown]
 # We can see very good post-fit agreement.

-# %% tags=[]
+# %%
 model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
 figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
 figs[0]["figure"]

-# %% tags=[]
+# %%
 figs[1]["figure"]

 # %% [markdown]
```
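For reference, the `cabinetry` fit in this final hunk is roughly equivalent to driving `pyhf` directly; a sketch assuming the `workspace.json` written earlier in the notebook:

```python
# Rough pyhf equivalent of the cabinetry fit above, as a sketch; assumes the
# workspace.json produced earlier in the notebook.
import json
import pyhf

with open("workspace.json") as f:
    ws = pyhf.Workspace(json.load(f))

model = ws.model()
data = ws.data(model)

bestfit = pyhf.infer.mle.fit(data, model)  # maximum likelihood estimate
print(f"ttbar_norm: {bestfit[model.config.poi_index]:.3f}")
```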
