From e09f5676867852495d7cd4a284bdb41dc0e18aff Mon Sep 17 00:00:00 2001
From: Mathis Frahm <mathisfrahm@gmx.de>
Date: Tue, 6 Feb 2024 13:30:24 +0100
Subject: [PATCH] add function to convert dataset information for top datasets

---
 scripts/get_das_info.py | 108 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/scripts/get_das_info.py b/scripts/get_das_info.py
index 03301cdc..15308973 100644
--- a/scripts/get_das_info.py
+++ b/scripts/get_das_info.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 
-# USAGE: python GetDASinfo.py -d das_string
+# USAGE: python get_das_info.py -d das_string
 # e.g. /JetHT/Run2018C-UL2018_MiniAODv2_JMENanoAODv9-v1/NANOAOD
 
 from __future__ import annotations
@@ -13,6 +13,9 @@
 
 
 def convert_to_desired_structure(data: dict) -> str:
+    """
+    Function that converts dataset info into one order Dataset per query
+    """
     return f"""cpn.add_dataset(
     name="PLACEHOLDER",
     id={data['dataset_id']},
@@ -22,10 +25,105 @@ def convert_to_desired_structure(data: dict) -> str:
     ],
     n_files={data['nfiles']},
     n_events={data['nevents']},
+)
+"""
+
+
+identifier_map = {
+    "_TuneCP5Down_": "tune_down",
+    "_TuneCP5Up_": "tune_up",
+    "_TuneCP5CR1_": "cr_1",
+    "_TuneCP5CR2_": "cr_2",
+    "_Hdamp-158_": "hdamp_down",
+    "_Hdamp-418_": "hdamp_up",
+    "_MT-171p5_": "mtop_down",
+    "_MT-173p5_": "mtop_up",
+    # dataset types that I have no use for but want to keep anyway (they are emitted as commented-out entries)
+    "_MT-166p5_": "xxx",
+    "_MT-169p5_": "xxx",
+    "_MT-175p5_": "xxx",
+    "_MT-178p5_": "xxx",
+    "_DS_TuneCP5_": "xxx",
+    "_TuneCP5_ERDOn_": "xxx",
+    "_TuneCH3_": "xxx",
+    # nominal entry must stay last: the first matching key wins and "_TuneCP5_" is a substring of other keys above
+    "_TuneCP5_": "nominal",
+}
+
+
+def convert_for_top_queries(data: dict) -> str:
+    """
+    Function that converts dataset info either into an order Dataset for nominal datasets
+    or into a DatasetInfo for variations of datasets such as tune or mtop.
+
+    Exemplary usage:
+    python get_das_info.py -f convert_for_top_queries -d "/TTtoLNu2Q*/Run3Summer22EENanoAODv12-130X_*/NANOAODSIM"
+    """
+    dataset_type = None
+
+    for identifier in identifier_map:
+        if identifier in data["name"]:
+            dataset_type = identifier_map[identifier]
+            break
+
+    if not dataset_type:
+        return f"""
+        #####
+        #####ERROR! Did not manage to determine type of dataset {data['name']}
+        #####
+        """
+
+    if dataset_type == "nominal":
+        return f"""cpn.add_dataset(
+    name="PLACEHOLDER",
+    id={data['dataset_id']},
+    processes=[procs.PLACEHOLDER],
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "{data['name']}",  # noqa
+            ],
+            n_files={data['nfiles']},
+            n_events={data['nevents']},
+        ),
+    ),
 )"""
+    elif dataset_type == "xxx":
+        # comment out this dataset
+        return f"""        # {dataset_type}=DatasetInfo(
+        #     keys=[
+        #         "{data['name']}",  # noqa
+        #     ],
+        #     n_files={data['nfiles']},
+        #     n_events={data['nevents']},
+        # ),"""
+    else:
+        # some known variation of the dataset
+        return f"""        {dataset_type}=DatasetInfo(
+            keys=[
+                "{data['name']}",  # noqa
+            ],
+            n_files={data['nfiles']},
+            n_events={data['nevents']},
+        ),"""
+
+
+convert_functions = {
+    "convert_to_desired_structure": convert_to_desired_structure,
+    "convert_for_top_queries": convert_for_top_queries,
+}
+
 
+def print_das_info(
+    das_strings: list[str],
+    keys_of_interest: tuple | None = None,
+    convert_function_str: str | None = None,
+):
+    # get the requested convert function
+    if not convert_function_str:
+        convert_function_str = "convert_to_desired_structure"
+    convert_function = convert_functions[convert_function_str]
 
-def print_das_info(das_strings: list[str], keys_of_interest: tuple | None = None):
     for das_string in das_strings:
         # set default keys of interest
         keys_of_interest = keys_of_interest or (
@@ -76,13 +174,13 @@ def print_das_info(das_strings: list[str], keys_of_interest: tuple | None = None
                     info_of_interest["nfiles"] = dataset_info.get("nfiles", "")
                     info_of_interest["nevents"] = dataset_info.get("nevents", "")
 
-            desired_output = convert_to_desired_structure(info_of_interest)
+            desired_output = convert_function(info_of_interest)
             print(desired_output)
-            print()
 
 
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-d", "--dataset", dest="dataset", nargs="+", help="das name")
+    parser.add_argument("-f", "--function", dest="function", help="function that converts info into code")
     args = parser.parse_args()
-    print_das_info(args.dataset)
+    print_das_info(args.dataset, convert_function_str=args.function)