From b03754aa2feeba1679f4785127142fce8bbe0739 Mon Sep 17 00:00:00 2001
From: svburke <sean.burke2@nih.gov>
Date: Wed, 12 Mar 2025 15:04:07 -0500
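Subject: [PATCH 1/4] initial_fix

Add a df_data_file_list dataframe to ccdi_to_gdc that collects the
submitter_id, file_url, file_name, file_size and md5sum columns. Rows are
appended after each of the four submitted_aligned_reads and
submitted_unaligned_reads sections (Archer_Fusion and WXS), and the new
dataframe is added to the dict of dataframes that are written out.

Also point the prefect.yaml clone step at the
CCDIDC-1640-Fix-CCDI-output-to-GDC branch instead of main (changed back
to main in patch 4/4), and apply whitespace/quote-style cleanups.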

---
 prefect.yaml          |  2 +-
 src/s3_ccdi_to_gdc.py | 69 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/prefect.yaml b/prefect.yaml
index 058f137..ebd80ce 100644
--- a/prefect.yaml
+++ b/prefect.yaml
@@ -960,7 +960,7 @@ deployments:
       - prefect.projects.steps.git_clone_project:
           id: clone-step
           repository: https://github.com/CBIIT/ChildhoodCancerDataInitiative-Prefect_Pipeline.git
-          branch: main
+          branch: CCDIDC-1640-Fix-CCDI-output-to-GDC
       - prefect.projects.steps.pip_install_requirements:
           requirements_file: requirements.txt
           directory: "{{ clone-step.directory }}"
diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py
index 9c9345d..84e4f9d 100644
--- a/src/s3_ccdi_to_gdc.py
+++ b/src/s3_ccdi_to_gdc.py
@@ -91,7 +91,7 @@ def save_dataframe_as_nested_json(df: pd.DataFrame, output_file: str):
         output_file (str): The path to the output JSON file.
     """
 
-    #set up logger for prefect
+    # set up logger for prefect
     logger = get_run_logger()
 
     try:
@@ -214,9 +214,8 @@ def resolve_experiment_name(group):
 def ccdi_to_gdc(
     file_path: str, ccdi_gdc_translation_file: str, platform_preservation_file: str
 ):
-    
-    logger = get_run_logger()
 
+    logger = get_run_logger()
 
     ################
     #
@@ -309,6 +308,30 @@ def ccdi_to_gdc(
         # platform and preservation
         platform_preservation_conv = pd.read_csv(platform_preservation_file, sep="\t")
 
+    #####################
+    #####################
+    ##
+    ## SETUP FOR DATA FILES
+    ##
+    #####################
+    #####################
+
+    # We have to capture all data files that are present in the meta_dfs. There should not be an issue
+    # with capturing all data files, as the next script will only access the files that are called for by file metadata.
+    # Thus we will pass along the file_name, file_size, md5sum, and file_url to the next script.
+    df_data_file_list = pd.DataFrame(
+        {
+            "submitter_id": pd.Series(dtype="str"),
+            "file_url": pd.Series(dtype="str"),
+            "file_name": pd.Series(dtype="str"),
+            "file_size": pd.Series(dtype="str"),
+            "md5sum": pd.Series(dtype="str"),
+        }
+    )
+
+    # Create a list of all data file metadata columns
+    data_file_columns = ["submitter_id", "file_url", "file_name", "file_size", "md5sum"]
+
     #####################
     #####################
     ##
@@ -898,6 +921,9 @@ def ccdi_to_gdc(
         df_submitted_aligned_reads_Archer_Fusion.drop_duplicates()
     )
 
+    # Add files to the data file list dataframe
+    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+
     ###################################
     # submitted_aligned_reads_WXS
     ###################################
@@ -924,6 +950,9 @@ def ccdi_to_gdc(
 
     # CURRENTLY EMPTY, ALL WXS IS CRAM
 
+    # Add files to the data file list dataframe
+    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+
     ###################################
     # submitted_unaligned_reads_Archer_Fusion
     ###################################
@@ -952,6 +981,9 @@ def ccdi_to_gdc(
 
     # CURRENTLY EMPTY, ALL ARCHER FUSION IS BAM ONLY
 
+    # Add files to the data file list dataframe
+    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+
     ###################################
     # submitted_unaligned_reads_WXS
     ###################################
@@ -1008,6 +1040,9 @@ def ccdi_to_gdc(
         df_submitted_unaligned_reads_WXS.drop_duplicates()
     )
 
+    # Add files to the data file list dataframe
+    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+
     ###################################
     # DF file write out
     ###################################
@@ -1018,19 +1053,20 @@ def ccdi_to_gdc(
 
     # The original data frame identification seems to be problematic in Prefect
     # Instead, I will just list the data frames that are produced
-    dataframes={
-        'df_aligned_reads_index': df_aligned_reads_index,
-        'df_aliquot': df_aliquot,
-        'df_case': df_case,
-        'df_demographic': df_demographic,
-        'df_diagnosis': df_diagnosis,
-        'df_raw_methylation_array': df_raw_methylation_array,
-        'df_read_group': df_read_group,
-        'df_sample': df_sample,
-        'df_submitted_aligned_reads_Archer_Fusion': df_submitted_aligned_reads_Archer_Fusion,
-        'df_submitted_aligned_reads_WXS': df_submitted_aligned_reads_WXS,
-        'df_submitted_unaligned_reads_Archer_Fusion': df_submitted_unaligned_reads_Archer_Fusion,
-        'df_submitted_unaligned_reads_WXS': df_submitted_unaligned_reads_WXS
+    dataframes = {
+        "df_aligned_reads_index": df_aligned_reads_index,
+        "df_aliquot": df_aliquot,
+        "df_case": df_case,
+        "df_demographic": df_demographic,
+        "df_diagnosis": df_diagnosis,
+        "df_raw_methylation_array": df_raw_methylation_array,
+        "df_read_group": df_read_group,
+        "df_sample": df_sample,
+        "df_submitted_aligned_reads_Archer_Fusion": df_submitted_aligned_reads_Archer_Fusion,
+        "df_submitted_aligned_reads_WXS": df_submitted_aligned_reads_WXS,
+        "df_submitted_unaligned_reads_Archer_Fusion": df_submitted_unaligned_reads_Archer_Fusion,
+        "df_submitted_unaligned_reads_WXS": df_submitted_unaligned_reads_WXS,
+        "df_data_file_list": df_data_file_list,
     }
 
     # Save each DataFrame as a TSV file
@@ -1047,5 +1083,4 @@ def ccdi_to_gdc(
             else:
                 logger.info(f"Skipped {name} (empty DataFrame).")
 
-
     return output_dir

From 32275bd10d2c852b2ac1b34336c6b8c1de2efe1a Mon Sep 17 00:00:00 2001
From: svburke <sean.burke2@nih.gov>
Date: Wed, 12 Mar 2025 15:30:53 -0500
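Subject: [PATCH 2/4] Update s3_ccdi_to_gdc.py

Drop submitter_id from df_data_file_list and from data_file_columns, so
only file_url, file_name, file_size and md5sum are collected.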

---
 src/s3_ccdi_to_gdc.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py
index 84e4f9d..4ce1a25 100644
--- a/src/s3_ccdi_to_gdc.py
+++ b/src/s3_ccdi_to_gdc.py
@@ -321,7 +321,6 @@ def ccdi_to_gdc(
     # Thus we will pass along the file_name, file_size, md5sum, and file_url to the next script.
     df_data_file_list = pd.DataFrame(
         {
-            "submitter_id": pd.Series(dtype="str"),
             "file_url": pd.Series(dtype="str"),
             "file_name": pd.Series(dtype="str"),
             "file_size": pd.Series(dtype="str"),
@@ -330,7 +329,7 @@ def ccdi_to_gdc(
     )
 
     # Create a list of all data file metadata columns
-    data_file_columns = ["submitter_id", "file_url", "file_name", "file_size", "md5sum"]
+    data_file_columns = ["file_url", "file_name", "file_size", "md5sum"]
 
     #####################
     #####################

From 1ca5cfd2f325ea5ec766715ba557a61ca96e8491 Mon Sep 17 00:00:00 2001
From: svburke <sean.burke2@nih.gov>
Date: Wed, 12 Mar 2025 16:00:46 -0500
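Subject: [PATCH 3/4] Update s3_ccdi_to_gdc.py

Also append rows to df_data_file_list after the raw_methylation_array
section, and reformat the four existing pd.concat calls across multiple
lines (formatting only for those four calls).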

---
 src/s3_ccdi_to_gdc.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py
index 4ce1a25..e251621 100644
--- a/src/s3_ccdi_to_gdc.py
+++ b/src/s3_ccdi_to_gdc.py
@@ -723,6 +723,11 @@ def ccdi_to_gdc(
 
     df_raw_methylation_array = df_raw_methylation_array.drop_duplicates()
 
+    df_data_file_list = pd.concat(
+        [df_data_file_list[data_file_columns], data[data_file_columns]],
+        ignore_index=True,
+    )
+
     ###################################
     # read_group
     ###################################
@@ -921,7 +926,10 @@ def ccdi_to_gdc(
     )
 
     # Add files to the data file list dataframe
-    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+    df_data_file_list = pd.concat(
+        [df_data_file_list[data_file_columns], data[data_file_columns]],
+        ignore_index=True,
+    )
 
     ###################################
     # submitted_aligned_reads_WXS
@@ -950,7 +958,10 @@ def ccdi_to_gdc(
     # CURRENTLY EMPTY, ALL WXS IS CRAM
 
     # Add files to the data file list dataframe
-    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+    df_data_file_list = pd.concat(
+        [df_data_file_list[data_file_columns], data[data_file_columns]],
+        ignore_index=True,
+    )
 
     ###################################
     # submitted_unaligned_reads_Archer_Fusion
@@ -981,7 +992,10 @@ def ccdi_to_gdc(
     # CURRENTLY EMPTY, ALL ARCHER FUSION IS BAM ONLY
 
     # Add files to the data file list dataframe
-    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+    df_data_file_list = pd.concat(
+        [df_data_file_list[data_file_columns], data[data_file_columns]],
+        ignore_index=True,
+    )
 
     ###################################
     # submitted_unaligned_reads_WXS
@@ -1040,7 +1054,10 @@ def ccdi_to_gdc(
     )
 
     # Add files to the data file list dataframe
-    df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True)
+    df_data_file_list = pd.concat(
+        [df_data_file_list[data_file_columns], data[data_file_columns]],
+        ignore_index=True,
+    )
 
     ###################################
     # DF file write out

From ee53fc5b766e8fb1e4807c9564a37045cc1b366c Mon Sep 17 00:00:00 2001
From: svburke <42675174+svburke@users.noreply.github.com>
Date: Thu, 13 Mar 2025 09:42:46 -0500
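Subject: [PATCH 4/4] Update prefect.yaml

Set the clone step's branch back to main, undoing the switch to
CCDIDC-1640-Fix-CCDI-output-to-GDC made in patch 1/4.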

---
 prefect.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefect.yaml b/prefect.yaml
index ebd80ce..058f137 100644
--- a/prefect.yaml
+++ b/prefect.yaml
@@ -960,7 +960,7 @@ deployments:
       - prefect.projects.steps.git_clone_project:
           id: clone-step
           repository: https://github.com/CBIIT/ChildhoodCancerDataInitiative-Prefect_Pipeline.git
-          branch: CCDIDC-1640-Fix-CCDI-output-to-GDC
+          branch: main
       - prefect.projects.steps.pip_install_requirements:
           requirements_file: requirements.txt
           directory: "{{ clone-step.directory }}"