#       extension: .py
#       format_name: percent
#       format_version: '1.3'
- #       jupytext_version: 1.14.1
+ #       jupytext_version: 1.14.7
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
# %% [markdown]
# ### Imports: setting up our environment

- # %% tags=[]
+ # %%
import asyncio
import logging
import os
8989#
9090# The input files are all in the 1–3 GB range.
9191
92- # %% tags=[]
92+ # %%
9393### GLOBAL CONFIGURATION
9494# input files per process, set to e.g. 10 (smaller number = faster)
9595N_FILES_MAX_PER_SAMPLE = 5
# - calculating systematic uncertainties at the event and object level,
# - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`.

- # %% tags=[]
+ # %%
# functions creating systematic variations
def flat_variation(ones):
    # 2.5% weight variations
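The hunk cuts this cell off here. For orientation: a "flat" variation in this kind of processor is just the nominal per-event weight scaled up and down by a fixed fraction. A minimal sketch of how such a helper could be written (illustrative only; the actual body is elided by the diff):

import numpy as np

def flat_variation(ones):
    # input: array of per-event weights equal to 1.0
    # output: (n_events, 2) array with the weights scaled by +2.5% and -2.5%
    return (1.0 + np.array([0.025, -0.025], dtype=np.float32)) * ones[:, None]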
@@ -316,8 +316,13 @@ def postprocess(self, accumulator):
#
# Here, we gather all the required information about the files we want to process: paths to the files and associated metadata.

- # %% tags=[]
- fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"])  # local files on /data for ssl-dev
+ # %%
+ fileset = utils.construct_fileset(
+     N_FILES_MAX_PER_SAMPLE,
+     use_xcache=False,
+     af_name=config["benchmarking"]["AF_NAME"],
+     input_from_eos=config["benchmarking"]["INPUT_FROM_EOS"]
+ )  # local files on /data for ssl-dev as af_name

print(f"processes in fileset: {list(fileset.keys())}")
print(f"\nexample of information in fileset:\n{{\n  'files': [{fileset['ttbar__nominal']['files'][0]}, ...],")
@@ -329,7 +334,7 @@ def postprocess(self, accumulator):
#
# Define the `func_adl` query used to extract the relevant columns and filter events.

- # %% tags=[]
+ # %%
def get_query(source: ObjectStream) -> ObjectStream:
    """Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns
    """
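The query body itself lies outside the hunk. As a rough illustration of the `func_adl` style used here (the cut and the column names below are made up for illustration, not the notebook's actual selection), an event filter plus column extraction is chained from `Where` and `Select` calls:

from func_adl import ObjectStream

def example_query(source: ObjectStream) -> ObjectStream:
    # keep events with exactly one lepton above 25 GeV, then return a few jet columns
    return source.Where(
        lambda e: e.Electron_pt.Where(lambda pt: pt > 25).Count()
        + e.Muon_pt.Where(lambda pt: pt > 25).Count() == 1
    ).Select(
        lambda e: {"Jet_pt": e.Jet_pt, "Jet_btagCSVV2": e.Jet_btagCSVV2}
    )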
@@ -355,7 +360,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# Using the query created with `func_adl`, `ServiceX` reads the CMS Open Data files and builds cached files containing only the event information requested by the query.

- # %% tags=[]
+ # %%
if USE_SERVICEX:
    # dummy dataset on which to generate the query
    dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot")
@@ -385,7 +390,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# When `USE_SERVICEX` is false, the input files need to be processed during this step as well.

- # %% tags=[]
+ # %%
NanoAODSchema.warn_missing_crossrefs = False  # silences warnings about branches we will not use here
if USE_DASK:
    executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
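The rest of this cell is outside the hunk. For context, in coffea 0.7 the chosen executor is wrapped in a `processor.Runner`, which is then called on the fileset; a sketch of that pattern follows (the local-executor fallback, the chunk size, and the `TtbarAnalysis` processor name are assumptions standing in for the elided code):

if USE_DASK:
    executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"]))
else:
    executor = processor.FuturesExecutor(workers=4)  # assumed local fallback

run = processor.Runner(
    executor=executor,
    schema=NanoAODSchema,
    savemetrics=True,   # makes run() also return performance metrics
    chunksize=100_000,  # assumed value
)

# "TtbarAnalysis" stands in for the coffea processor class defined earlier in the notebook
all_histograms, metrics = run(fileset, "Events", processor_instance=TtbarAnalysis())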
@@ -410,7 +415,7 @@ def get_query(source: ObjectStream) -> ObjectStream:

print(f"\nexecution took {exec_time:.2f} seconds")

- # %% tags=[]
+ # %%
# track metrics
dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094"  # TODO: xcache support
metrics.update({
@@ -448,15 +453,15 @@ def get_query(source: ObjectStream) -> ObjectStream:
# Let's have a look at the data we obtained.
# We built histograms in two phase space regions, for multiple physics processes and systematic variations.

- # %% tags=[]
+ # %%
utils.set_style()

all_histograms[110j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, 1 b-tag")
plt.xlabel("HT [GeV]");

- # %% tags=[]
+ # %%
all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey")
plt.legend(frameon=False)
plt.title(">= 4 jets, >= 2 b-tags")
@@ -471,7 +476,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
#
# We are making use of [UHI](https://uhi.readthedocs.io/) here to re-bin (see the standalone toy example after this cell).

- # %% tags=[]
+ # %%
# b-tagging variations
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2)
@@ -482,7 +487,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
plt.xlabel("HT [GeV]")
plt.title("b-tagging variations");

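The slice used in the b-tagging plots above, `110j::hist.rebin(2)`, is UHI shorthand: a complex number selects by axis value (start at the bin edge at 110 GeV) rather than by bin index, and `hist.rebin(2)` merges pairs of adjacent bins. A self-contained toy example of the same syntax (the histogram definition here is made up purely for illustration):

import numpy as np
import hist

# toy histogram: 25 bins from 50 to 550 GeV
h = hist.Hist.new.Reg(25, 50, 550, name="observable", label="observable [GeV]").Weight()
h.fill(observable=np.random.normal(300, 80, 10_000))

# start at the bin edge at 110 GeV and merge pairs of adjacent bins
h_rebinned = h[110j :: hist.rebin(2)]
print(h_rebinned.axes[0].edges)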
- # %% tags=[]
+ # %%
# jet energy scale variations
all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2)
all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2)
@@ -497,7 +502,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# We'll save everything to disk for subsequent usage.
# This also builds pseudo-data by combining events from the various simulation setups we have processed.

- # %% tags=[]
+ # %%
utils.save_histograms(all_histograms, fileset, "histograms.root")

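Since the output file is meant to be reused later, a quick sanity check is to read it back with `uproot` (the key name in the commented-out line is an assumption; use whatever `f.keys()` actually reports):

import uproot

with uproot.open("histograms.root") as f:
    print(f.keys()[:10])  # list the first few stored histograms
    # h = f["4j1b_ttbar"].to_hist()  # assumed key name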
# %% [markdown]
@@ -510,7 +515,7 @@ def get_query(source: ObjectStream) -> ObjectStream:
# A statistical model has been defined in `cabinetry_config.yml`, ready to be used with our output.
# We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built.

- # %% tags=[]
+ # %%
config = cabinetry.configuration.load("cabinetry_config.yml")

# rebinning: lower edge 110 GeV, merge bins 2->1
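The remainder of this cell falls outside the hunk, but the later cells use a `ws` object and a `workspace.json` file, so the elided steps presumably collect the template histograms and build the workspace. The standard `cabinetry` calls for that step look roughly like this (a sketch, not necessarily line-for-line what the notebook does):

cabinetry.templates.collect(config)             # read the histograms produced above
ws = cabinetry.workspace.build(config)          # build the pyhf workspace
cabinetry.workspace.save(ws, "workspace.json")  # file inspected in the next cell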
@@ -523,13 +528,13 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference.

- # %% tags=[]
+ # %%
# !pyhf inspect workspace.json | head -n 20

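The same inspection can be done from Python rather than via the CLI; a minimal sketch, assuming the workspace was saved to workspace.json as above:

import json
import pyhf

with open("workspace.json") as f:
    spec = json.load(f)

pyhf_ws = pyhf.Workspace(spec)
print(pyhf_ws.channels)  # region names contained in the workspace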
# %% [markdown]
# Let's try out what we built: the next cell will perform a maximum likelihood fit of our statistical model to the pseudodata we built.

- # %% tags=[]
+ # %%
model, data = cabinetry.model_utils.model_and_data(ws)
fit_results = cabinetry.fit.fit(model, data)

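Beyond the lines shown here, `cabinetry` can also visualize the fit results directly; for example, a pull plot of the nuisance parameters (excluding the normalization factor, which carries no constraint term) is a single call. This is an optional extra, not part of the diff:

cabinetry.visualize.pulls(fit_results, exclude="ttbar_norm")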
@@ -540,31 +545,31 @@ def get_query(source: ObjectStream) -> ObjectStream:
# %% [markdown]
# For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction?

- # %% tags=[]
+ # %%
poi_index = model.config.poi_index
print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}")

# %% [markdown]
# Let's also visualize the model before and after the fit, in both the regions we are using.
# The binning here corresponds to the binning used for the fit.

- # %% tags=[]
+ # %%
model_prediction = cabinetry.model_utils.prediction(model)
figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config)
figs[0]["figure"]

- # %% tags=[]
+ # %%
figs[1]["figure"]

# %% [markdown]
# We can see very good post-fit agreement.

- # %% tags=[]
+ # %%
model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results)
figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config)
figs[0]["figure"]

- # %% tags=[]
+ # %%
figs[1]["figure"]

# %% [markdown]