From b03754aa2feeba1679f4785127142fce8bbe0739 Mon Sep 17 00:00:00 2001 From: svburke Date: Wed, 12 Mar 2025 15:04:07 -0500 Subject: [PATCH 1/4] initial_fix --- prefect.yaml | 2 +- src/s3_ccdi_to_gdc.py | 69 ++++++++++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/prefect.yaml b/prefect.yaml index 058f137..ebd80ce 100644 --- a/prefect.yaml +++ b/prefect.yaml @@ -960,7 +960,7 @@ deployments: - prefect.projects.steps.git_clone_project: id: clone-step repository: https://github.com/CBIIT/ChildhoodCancerDataInitiative-Prefect_Pipeline.git - branch: main + branch: CCDIDC-1640-Fix-CCDI-output-to-GDC - prefect.projects.steps.pip_install_requirements: requirements_file: requirements.txt directory: "{{ clone-step.directory }}" diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py index 9c9345d..84e4f9d 100644 --- a/src/s3_ccdi_to_gdc.py +++ b/src/s3_ccdi_to_gdc.py @@ -91,7 +91,7 @@ def save_dataframe_as_nested_json(df: pd.DataFrame, output_file: str): output_file (str): The path to the output JSON file. """ - #set up logger for prefect + # set up logger for prefect logger = get_run_logger() try: @@ -214,9 +214,8 @@ def resolve_experiment_name(group): def ccdi_to_gdc( file_path: str, ccdi_gdc_translation_file: str, platform_preservation_file: str ): - - logger = get_run_logger() + logger = get_run_logger() ################ # @@ -309,6 +308,30 @@ def ccdi_to_gdc( # platform and preservation platform_preservation_conv = pd.read_csv(platform_preservation_file, sep="\t") + ##################### + ##################### + ## + ## SETUP FOR DATA FILES + ## + ##################### + ##################### + + # We have to capture all data files that are present in the meta_dfs. There should not be an issue + # with capturing all data files, as the next script will only access the files that are called for by file metadata. + # Thus we will pass along the file_name, file_size, md5sum, and file_url to the next script. + df_data_file_list = pd.DataFrame( + { + "submitter_id": pd.Series(dtype="str"), + "file_url": pd.Series(dtype="str"), + "file_name": pd.Series(dtype="str"), + "file_size": pd.Series(dtype="str"), + "md5sum": pd.Series(dtype="str"), + } + ) + + # Create a list of all data file metadata columns + data_file_columns = ["submitter_id", "file_url", "file_name", "file_size", "md5sum"] + ##################### ##################### ## @@ -898,6 +921,9 @@ def ccdi_to_gdc( df_submitted_aligned_reads_Archer_Fusion.drop_duplicates() ) + # Add files to the data file list dataframe + df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + ################################### # submitted_aligned_reads_WXS ################################### @@ -924,6 +950,9 @@ def ccdi_to_gdc( # CURRENTLY EMPTY, ALL WXS IS CRAM + # Add files to the data file list dataframe + df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + ################################### # submitted_unaligned_reads_Archer_Fusion ################################### @@ -952,6 +981,9 @@ def ccdi_to_gdc( # CURRENTLY EMPTY, ALL ARCHER FUSION IS BAM ONLY + # Add files to the data file list dataframe + df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + ################################### # submitted_unaligned_reads_WXS ################################### @@ -1008,6 +1040,9 @@ def ccdi_to_gdc( df_submitted_unaligned_reads_WXS.drop_duplicates() ) + # Add files to the data file list dataframe + df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + ################################### # DF file write out ################################### @@ -1018,19 +1053,20 @@ def ccdi_to_gdc( # The original data frame identification seems to be problematic in Prefect # Instead, I will just list the data frames that are produced - dataframes={ - 'df_aligned_reads_index': df_aligned_reads_index, - 'df_aliquot': df_aliquot, - 'df_case': df_case, - 'df_demographic': df_demographic, - 'df_diagnosis': df_diagnosis, - 'df_raw_methylation_array': df_raw_methylation_array, - 'df_read_group': df_read_group, - 'df_sample': df_sample, - 'df_submitted_aligned_reads_Archer_Fusion': df_submitted_aligned_reads_Archer_Fusion, - 'df_submitted_aligned_reads_WXS': df_submitted_aligned_reads_WXS, - 'df_submitted_unaligned_reads_Archer_Fusion': df_submitted_unaligned_reads_Archer_Fusion, - 'df_submitted_unaligned_reads_WXS': df_submitted_unaligned_reads_WXS + dataframes = { + "df_aligned_reads_index": df_aligned_reads_index, + "df_aliquot": df_aliquot, + "df_case": df_case, + "df_demographic": df_demographic, + "df_diagnosis": df_diagnosis, + "df_raw_methylation_array": df_raw_methylation_array, + "df_read_group": df_read_group, + "df_sample": df_sample, + "df_submitted_aligned_reads_Archer_Fusion": df_submitted_aligned_reads_Archer_Fusion, + "df_submitted_aligned_reads_WXS": df_submitted_aligned_reads_WXS, + "df_submitted_unaligned_reads_Archer_Fusion": df_submitted_unaligned_reads_Archer_Fusion, + "df_submitted_unaligned_reads_WXS": df_submitted_unaligned_reads_WXS, + "df_data_file_list": df_data_file_list, } # Save each DataFrame as a TSV file @@ -1047,5 +1083,4 @@ def ccdi_to_gdc( else: logger.info(f"Skipped {name} (empty DataFrame).") - return output_dir From 32275bd10d2c852b2ac1b34336c6b8c1de2efe1a Mon Sep 17 00:00:00 2001 From: svburke Date: Wed, 12 Mar 2025 15:30:53 -0500 Subject: [PATCH 2/4] Update s3_ccdi_to_gdc.py --- src/s3_ccdi_to_gdc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py index 84e4f9d..4ce1a25 100644 --- a/src/s3_ccdi_to_gdc.py +++ b/src/s3_ccdi_to_gdc.py @@ -321,7 +321,6 @@ def ccdi_to_gdc( # Thus we will pass along the file_name, file_size, md5sum, and file_url to the next script. df_data_file_list = pd.DataFrame( { - "submitter_id": pd.Series(dtype="str"), "file_url": pd.Series(dtype="str"), "file_name": pd.Series(dtype="str"), "file_size": pd.Series(dtype="str"), @@ -330,7 +329,7 @@ def ccdi_to_gdc( ) # Create a list of all data file metadata columns - data_file_columns = ["submitter_id", "file_url", "file_name", "file_size", "md5sum"] + data_file_columns = ["file_url", "file_name", "file_size", "md5sum"] ##################### ##################### From 1ca5cfd2f325ea5ec766715ba557a61ca96e8491 Mon Sep 17 00:00:00 2001 From: svburke Date: Wed, 12 Mar 2025 16:00:46 -0500 Subject: [PATCH 3/4] Update s3_ccdi_to_gdc.py --- src/s3_ccdi_to_gdc.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/s3_ccdi_to_gdc.py b/src/s3_ccdi_to_gdc.py index 4ce1a25..e251621 100644 --- a/src/s3_ccdi_to_gdc.py +++ b/src/s3_ccdi_to_gdc.py @@ -723,6 +723,11 @@ def ccdi_to_gdc( df_raw_methylation_array = df_raw_methylation_array.drop_duplicates() + df_data_file_list = pd.concat( + [df_data_file_list[data_file_columns], data[data_file_columns]], + ignore_index=True, + ) + ################################### # read_group ################################### @@ -921,7 +926,10 @@ def ccdi_to_gdc( ) # Add files to the data file list dataframe - df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + df_data_file_list = pd.concat( + [df_data_file_list[data_file_columns], data[data_file_columns]], + ignore_index=True, + ) ################################### # submitted_aligned_reads_WXS @@ -950,7 +958,10 @@ def ccdi_to_gdc( # CURRENTLY EMPTY, ALL WXS IS CRAM # Add files to the data file list dataframe - df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + df_data_file_list = pd.concat( + [df_data_file_list[data_file_columns], data[data_file_columns]], + ignore_index=True, + ) ################################### # submitted_unaligned_reads_Archer_Fusion @@ -981,7 +992,10 @@ def ccdi_to_gdc( # CURRENTLY EMPTY, ALL ARCHER FUSION IS BAM ONLY # Add files to the data file list dataframe - df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + df_data_file_list = pd.concat( + [df_data_file_list[data_file_columns], data[data_file_columns]], + ignore_index=True, + ) ################################### # submitted_unaligned_reads_WXS @@ -1040,7 +1054,10 @@ def ccdi_to_gdc( ) # Add files to the data file list dataframe - df_data_file_list = pd.concat([df_data_file_list[data_file_columns], data[data_file_columns]], ignore_index=True) + df_data_file_list = pd.concat( + [df_data_file_list[data_file_columns], data[data_file_columns]], + ignore_index=True, + ) ################################### # DF file write out From ee53fc5b766e8fb1e4807c9564a37045cc1b366c Mon Sep 17 00:00:00 2001 From: svburke <42675174+svburke@users.noreply.github.com> Date: Thu, 13 Mar 2025 09:42:46 -0500 Subject: [PATCH 4/4] Update prefect.yaml --- prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prefect.yaml b/prefect.yaml index ebd80ce..058f137 100644 --- a/prefect.yaml +++ b/prefect.yaml @@ -960,7 +960,7 @@ deployments: - prefect.projects.steps.git_clone_project: id: clone-step repository: https://github.com/CBIIT/ChildhoodCancerDataInitiative-Prefect_Pipeline.git - branch: CCDIDC-1640-Fix-CCDI-output-to-GDC + branch: main - prefect.projects.steps.pip_install_requirements: requirements_file: requirements.txt directory: "{{ clone-step.directory }}"