diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py index 8815c1e8..e9c7834f 100755 --- a/build/beatAML/GetBeatAML.py +++ b/build/beatAML/GetBeatAML.py @@ -665,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path): print(improve_map_file) t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file) t_df = t_df[t_df.entrez_id.notna()] + t_df = t_df[t_df.entrez_id != 0] t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates() t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip') @@ -676,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path): p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics') p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map) p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]] + p_df = p_df[p_df.entrez_id != 0] p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip') # New Mutation Data print("Starting Mutation Data") m_df = pd.read_csv(mutations_file, sep = '\t') - m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file) m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]] + m_df = m_df[m_df.entrez_id != 0] m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip') if args.exp: diff --git a/build/bladderpdo/00_createBladderPDOSampleFile.py b/build/bladderpdo/00_createBladderPDOSampleFile.py index 07c11cce..757da98f 100644 --- a/build/bladderpdo/00_createBladderPDOSampleFile.py +++ b/build/bladderpdo/00_createBladderPDOSampleFile.py @@ -31,7 +31,7 @@ def _parse_model_type(sample_id): if "_xenoorganoid" in low: return "xenograft derived organoid" if "_organoid" in low: - return "organoid" + return "patient derived organoid" if "_xenograft" in low: return "patient derived xenograft" if "_parental" in low: 
diff --git a/build/bladderpdo/01_createBladderPDOOmicsFiles.py b/build/bladderpdo/01_createBladderPDOOmicsFiles.py index 5e1fca0c..e4c87b5a 100644 --- a/build/bladderpdo/01_createBladderPDOOmicsFiles.py +++ b/build/bladderpdo/01_createBladderPDOOmicsFiles.py @@ -104,8 +104,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes): final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]] final_mutations['study'] = "Lee etal 2018 Bladder PDOs" final_mutations = final_mutations.dropna(subset=["entrez_id"]) - final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int) - final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int) + final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int) + #drop entrez_ids equal to zero or N/A. + final_mutations = final_mutations.dropna(subset=["entrez_id"]) + final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int) + final_mutations = final_mutations[final_mutations["entrez_id"] != 0] return final_mutations def get_bladder_pdo_copynumber(synObject, samples, genes): @@ -124,7 +127,12 @@ def get_bladder_pdo_copynumber(synObject, samples, genes): final_copynumber['study'] = "Lee etal 2018 Bladder PDOs" final_copynumber = final_copynumber.dropna(subset=["entrez_id"]) final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int) - final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int) + #Drop genes that don't map to genes.csv + valid_entrez = set(genes['entrez_id'].astype(int)) + final_copynumber = final_copynumber[ + final_copynumber['entrez_id'].isin(valid_entrez) + ] + final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int) return final_copynumber diff --git a/build/broad_sanger/02-broadSangerOmics.R b/build/broad_sanger/02-broadSangerOmics.R index f5ee3bad..ee1bc1a2 100755 --- a/build/broad_sanger/02-broadSangerOmics.R +++ 
b/build/broad_sanger/02-broadSangerOmics.R @@ -405,6 +405,7 @@ depmap_files<-function(fi,value){ res<-exp_file|> mutate(entrez_id=as.numeric(EntrezGeneID))|> + filter(entrez_id %in% genes$entrez_id) |> left_join(as.data.frame(depmap_vtab)) ##now many variants are missing??? @@ -442,7 +443,8 @@ depmap_files<-function(fi,value){ print("wide to long") res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)), names_to='gene_entrez',values_to='transcriptomics', - values_transform=list(expression=as.numeric)) + values_transform=list(transcriptomics=as.numeric))|> + dplyr::mutate(transcriptomics = 2^transcriptomics - 1) colnames(res)[1]<-'other_id' print('fixing gene names') diff --git a/build/broad_sanger/05b_separate_datasets.py b/build/broad_sanger/05b_separate_datasets.py index dd28c990..6097ca32 100644 --- a/build/broad_sanger/05b_separate_datasets.py +++ b/build/broad_sanger/05b_separate_datasets.py @@ -40,6 +40,9 @@ def main(): # Extract information to separate out datasets exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list() exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list() + + #Ensure that the improve_sample_id column is in integer form. + exp = exp.with_columns(pl.col("improve_sample_id").cast(pl.Float64).cast(pl.Int64)) # Write Filtered Experiments File to TSV. Then delete it from memory. exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower() diff --git a/build/cptac/getCptacData.py b/build/cptac/getCptacData.py index 697df3c1..7464d397 100755 --- a/build/cptac/getCptacData.py +++ b/build/cptac/getCptacData.py @@ -129,7 +129,7 @@ def buildTumorSampleTable(sample_names, cancer_type, samples, maxval): samples = samples.reset_index(drop=True) return samples, maxval -def formatMutData(df, dtype, ctype, samp_names, source, samples): +def formatMutData(df, dtype, ctype, samp_names, source, genes, samples): ''' Formats mutational data. 
''' @@ -159,6 +159,10 @@ def formatMutData(df, dtype, ctype, samp_names, source, samples): 'Mutation': 'mutation' }) blongdf = blongdf[['improve_sample_id', 'entrez_id', 'mutation', 'variant_classification', 'source', 'study']] + + #Ensure that genes that don't map to genes_file are dropped. + valid = set(genes['entrez_id'].astype(int)) + blongdf = blongdf[blongdf.entrez_id.isin(valid)] return blongdf @@ -366,7 +370,7 @@ def main(): df.dropna(how='all', axis=0, inplace=True) print(cancertype + ' ' + dtype) if dtype == 'somatic_mutation': - fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], samples) + fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], genes, samples) fdf = fdf.reset_index(drop=True) dtype_key = 'mutations' elif dtype == 'CNV': @@ -393,6 +397,7 @@ def main(): print(df.to_string()) df['entrez_id'] = df['entrez_id'].fillna(0) df['entrez_id'] = df['entrez_id'].astype(int) + df = df[df.entrez_id != 0] df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip') if __name__ == '__main__': diff --git a/build/crcpdo/01-samples-crcpdo.py b/build/crcpdo/01-samples-crcpdo.py index 63ca423c..05b0352f 100644 --- a/build/crcpdo/01-samples-crcpdo.py +++ b/build/crcpdo/01-samples-crcpdo.py @@ -118,13 +118,13 @@ def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str for index, row in samples_df.iterrows(): if "Tumor-Organoid" in samples_df.loc[index, 'other_id']: samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O" - samples_df.loc[index, 'model_type'] = "organoid" + samples_df.loc[index, 'model_type'] = "patient derived organoid" if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']: samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B" - samples_df.loc[index, 'model_type'] = "ex vivo" + samples_df.loc[index, 'model_type'] = "tumor" if "Normal-Organoid" in samples_df.loc[index, 
'other_id']: samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O" - samples_df.loc[index, 'model_type'] = "organoid" + samples_df.loc[index, 'model_type'] = "patient derived organoid" samples_df['other_id_source'] = "vandeWetering_2015" samples_df['cancer_type'] = "Colorectal Carcinoma" samples_df['species'] = "Homo sapiens (Human)" diff --git a/build/hcmi/01-createHCMISamplesFile.py b/build/hcmi/01-createHCMISamplesFile.py index e5db163d..8f75f330 100755 --- a/build/hcmi/01-createHCMISamplesFile.py +++ b/build/hcmi/01-createHCMISamplesFile.py @@ -22,19 +22,19 @@ def align_to_linkml_schema(input_df): ------- pd.DataFrame A copy of the input DataFrame with the 'model_type' column values mapped to - a set of predefined categories ('tumor', 'organoid', 'cell line'). + a set of predefined categories ('tumor', 'patient derived organoid', 'cell line'). The mapping is designed to align the DataFrame with the LinkML schema requirements. """ mapping_dict = { 'Solid Tissue': 'tumor', - '3D Organoid': 'organoid', + '3D Organoid': 'patient derived organoid', 'Peripheral Blood Components NOS': 'tumor', 'Buffy Coat': np.nan, None: np.nan, 'Peripheral Whole Blood': 'tumor', 'Adherent Cell Line': 'cell line', - '3D Neurosphere': 'organoid', + '3D Neurosphere': 'patient derived organoid', '2D Modified Conditionally Reprogrammed Cells': 'cell line', 'Pleural Effusion': np.nan, 'Human Original Cells': 'cell line', @@ -50,6 +50,9 @@ def align_to_linkml_schema(input_df): input_df.dropna(subset=['model_type'], inplace=True) input_df = input_df.sort_values(by='improve_sample_id') + #Apparently any missing cancer type is normal tissue. 
+ input_df['cancer_type'] = input_df['cancer_type'].replace('', np.nan) + input_df['cancer_type'] = input_df['cancer_type'].fillna('Normal Tissue') return input_df def download_from_github(raw_url, save_path): diff --git a/build/hcmi/02-getHCMIData.py b/build/hcmi/02-getHCMIData.py index a4aac9f8..0d451fcd 100644 --- a/build/hcmi/02-getHCMIData.py +++ b/build/hcmi/02-getHCMIData.py @@ -402,7 +402,7 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file): # Load mapping files using Polars genes = pl.read_csv(entrez_map_file) # Map gene_name to entrez_id - + valid_entrez = genes["entrez_id"].cast(pl.Int64).unique().to_list() # Process each dataframe based on its data_type while dataframe_list: df = dataframe_list.pop() @@ -428,8 +428,16 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file): mapped_df = mapped_df.select(['entrez_id', 'mutation', 'Variant_Classification', 'file_id']) mapped_df = mapped_df.with_columns([pl.lit('GDC').alias('source'), pl.lit('HCMI').alias('study')]) - mapped_df = mapped_df.with_columns(mapped_df["entrez_id"].cast(str)) - + mapped_df = mapped_df.with_columns([ + pl.col("entrez_id").cast(pl.Int64), + pl.lit('GDC' ).alias('source'), + pl.lit('HCMI').alias('study'), + ]) + #drop genes not in genes file. 
+ mapped_df = mapped_df.filter( + (pl.col("entrez_id") != 0) & + pl.col("entrez_id").is_in(valid_entrez) + ) final_dataframe = pl.concat([final_dataframe, mapped_df]) del df, mapped_df gc.collect() diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R index ae45826c..0ec5704b 100755 --- a/build/mpnst/00_sample_gen.R +++ b/build/mpnst/00_sample_gen.R @@ -55,7 +55,7 @@ sampTable<-manifest|> ##third, generate a sample for the MTs if they were generated pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder)) -pdxmt$model_type=rep('organoid',nrow(pdxmt)) +pdxmt$model_type=rep('xenograft derived organoid',nrow(pdxmt)) print(pdxmt) main<-rbind(sampTable,pdxmt)|> diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R index a1276eff..9097465a 100755 --- a/build/mpnst/01_mpnst_get_omics.R +++ b/build/mpnst/01_mpnst_get_omics.R @@ -34,7 +34,7 @@ samples_df <- fread(patients)|> pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') tumor_samps<-subset(samples_df,model_type=='tumor') -mt_samps<-subset(samples_df,model_type=='organoid') +mt_samps<-subset(samples_df,model_type=='xenograft derived organoid') ##now get the manifest from synapse manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> diff --git a/build/pancpdo/01-createPancPDOSamplesFile.py b/build/pancpdo/01-createPancPDOSamplesFile.py index fe918fc8..a031e6bc 100755 --- a/build/pancpdo/01-createPancPDOSamplesFile.py +++ b/build/pancpdo/01-createPancPDOSamplesFile.py @@ -50,13 +50,13 @@ def align_to_linkml_schema(input_df): mapping_dict = { 'Solid Tissue': 'tumor', - '3D Organoid': 'organoid', + '3D Organoid': 'patient derived organoid', 'Peripheral Blood Components NOS': 'tumor', 'Buffy Coat': np.nan, None: np.nan, 'Peripheral Whole Blood': 'tumor', 'Adherent Cell Line': 'cell line', - '3D Neurosphere': 'organoid', + '3D Neurosphere': 'patient derived organoid', '2D Modified Conditionally Reprogrammed Cells': 'cell line', 'Pleural 
Effusion': np.nan, 'Human Original Cells': 'cell line', @@ -301,6 +301,10 @@ def filter_and_subset_data(df, maxval, mapfile): if not missing_ids.empty: print("\nWarning: Some samples could not be assigned an 'improve_sample_id'.") print(missing_ids) + + # Missing cancer type indicates that it is normal tissue. + longtab['cancer_type'] = longtab['cancer_type'].replace('', np.nan) + longtab['cancer_type'] = longtab['cancer_type'].fillna('Normal Tissue') return longtab def main(): diff --git a/build/sarcpdo/00_createSarcPDOSampleFile.py b/build/sarcpdo/00_createSarcPDOSampleFile.py index af69c196..956b35e2 100644 --- a/build/sarcpdo/00_createSarcPDOSampleFile.py +++ b/build/sarcpdo/00_createSarcPDOSampleFile.py @@ -86,10 +86,12 @@ def download_and_format_rna_samples(synLoginObject): rna_samples['model_type'] = modeltypeDF[0] # add rows by hand for SARC0139_1 that are missing from sample sheet but present in rnaseq data addrow1 = {'other_id' : 'SARC0139_1_Tumor', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'tumor'} - addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'organoid'} + addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'patient derived organoid'} rna_samples.loc[len(rna_samples)] = addrow1 rna_samples.loc[len(rna_samples)] = addrow2 - + + rna_samples.loc[rna_samples['model_type'] == 'organoid', 'model_type'] = 'patient derived organoid' + return rna_samples diff --git a/build/sarcpdo/01_createSarcPDOOmicsFiles.py b/build/sarcpdo/01_createSarcPDOOmicsFiles.py index 2a5829d8..ce1b0e40 100644 --- a/build/sarcpdo/01_createSarcPDOOmicsFiles.py +++ 
b/build/sarcpdo/01_createSarcPDOOmicsFiles.py @@ -40,6 +40,8 @@ def download_and_format_transcriptomic(synLoginObject, genesTable, samplesTable) final = melted_joined_renamed[['entrez_id', 'improve_sample_id', 'transcriptomics', 'source', 'study']] #dropduplicates (see a few lines above - should be down here) final = final.drop_duplicates() + # make sure entrez id is in int format. + final['entrez_id'] = final['entrez_id'].astype(int) return final def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTable): @@ -79,6 +81,9 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl mutationData =mutationData.rename({"Name": "mutation"}, axis=1) # drop duplicates mutationData = mutationData.drop_duplicates() + # make sure entrez_id is in integer format + mutationData['entrez_id'] = mutationData['entrez_id'].astype(int) + return mutationData