-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding handler for output file generation
- Loading branch information
1 parent
4cd378e
commit 75d7d3e
Showing
1 changed file
with
124 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,7 @@ import sys, os, subprocess | |
import argparse, textwrap | ||
|
||
__author__ = 'Skyler Kuhn' | ||
__version__ = 'v1.0.0' | ||
__version__ = 'v1.1.0' | ||
__email__ = '[email protected]' | ||
|
||
|
||
|
@@ -112,51 +112,137 @@ def run(sub_args): | |
@param sub_args <parser.parse_args() object>: | ||
Parsed arguments for run sub-command | ||
""" | ||
# Create dictionary to map transcript ID to its sequence | ||
transcriptome = {} | ||
# Initialize the output directory | ||
initialize(sub_args.output) | ||
# Truncates non-frame shift mutations | ||
# +/- N positions from mutation start | ||
# site in the amino acid sequence | ||
subset = int(sub_args.subset) | ||
# Dictionary to quickly map each | ||
# transcript ID to its coding DNA | ||
# sequence or CDS sequence. The | ||
# build coomand can be used to | ||
# generate this reference file. | ||
transcriptome = {} | ||
for sid, sequence in fasta(sub_args.transcripts): | ||
# Grab the transcipt id and remove the version suffix | ||
transcript_id = sid.split(' ')[0].split('.')[0] | ||
transcriptome[transcript_id] = sequence | ||
|
||
|
||
# Run AASAP against each user supplied input file | ||
for file in sub_args.input: | ||
# Parse field of interest from each | ||
# excel file. Each input file is | ||
# required have the following fields: | ||
# 'Transcript_ID', 'Variant_Classification', | ||
# 'HGVSc', 'Hugo_Symbol' where 'Transcript_ID' | ||
# and 'Variant_Classification','HGVSc' are | ||
# mandatory. | ||
err('Opening {}'.format(file)) | ||
df = excel(file, subset=['Transcript_ID','Variant_Classification','HGVSc','Hugo_Symbol','Gene']) | ||
|
||
for i,row in df.iterrows(): | ||
variant_class = str(row['Variant_Classification']) | ||
transcript = str(row['Transcript_ID']) | ||
hgvs = str(row['HGVSc']) | ||
if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'): | ||
try: | ||
sequence = transcriptome[transcript] | ||
except KeyError: | ||
err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript), | ||
"Please verify the correct reference file is provided!")) | ||
continue # Skip over un-annotated transcript | ||
try: | ||
mutated_dna, variant_position = mutate(sequence, hgvs) | ||
mutated_amino_acid = translate(mutated_dna) | ||
aa_variant_position = convert_aa_cooridate(variant_position) | ||
if variant_class.lower().startswith('frame_shift'): | ||
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset) | ||
else: | ||
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset) | ||
print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(variant_class, transcript, hgvs, variant_position, | ||
sequence, mutated_dna, mutated_amino_acid, truncated_aa)) | ||
except NonCodingVariantError as e: | ||
err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript)) | ||
except UnsupportedVariantTypeError as e: | ||
err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript)) | ||
except VariantParsingError as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript)) | ||
except NonMatchingReferenceBases as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript), | ||
"Please verify the correct reference file is provided!") | ||
except InvalidCodonError as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript), | ||
"Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna)) | ||
# Create output file name from input file | ||
# Output file name generated by removing the | ||
# suffix or input file name extension and | ||
# adding a new extension '.aasap.tsv' | ||
output_file = os.path.join(sub_args.output, "{}.aasap.tsv".format(os.path.splitext(os.path.basename(file))[0])) | ||
err('Writing output file {}'.format(output_file)) | ||
with open(output_file, 'w') as ofh: | ||
# Write header to output file | ||
ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( | ||
"Variant_Classification", | ||
"Hugo_Symbol", | ||
"Transcript_ID", | ||
"HGVSc", | ||
"Variant_Start_Position", | ||
"Transcript_Sequence", | ||
"Mutated_Transcript_Sequence", | ||
"Mutated_AA_Sequence", | ||
"Subset_AA_Sequence")) | ||
|
||
# Mutate each recorded variant in the input file. | ||
for i,row in df.iterrows(): | ||
# Variant class is used to determine the size | ||
# of the downstream portion of the subset AA | ||
# sequence from the variant start site. Frame | ||
# shift mutations will report until the end of | ||
# the coding AA sequence or until a stop codon | ||
# is reached. | ||
variant_class = str(row['Variant_Classification']) | ||
transcript = str(row['Transcript_ID']) | ||
hgvs = str(row['HGVSc']) | ||
hugo = str(row['Hugo_Symbol']) | ||
if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'): | ||
try: | ||
sequence = transcriptome[transcript] | ||
except KeyError: | ||
# Skip over un-annotated transcript. | ||
# Recorded transcript is not annotated | ||
# in the user provided reference file, | ||
# which may indicate that the user did | ||
# not provide the same reference files | ||
# to call variants and to generate the | ||
# MAF file in the build sub command | ||
err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript), | ||
"Please verify the correct reference file is provided!")) | ||
continue | ||
try: | ||
# Mutate the coding DNA sequence based on | ||
# the recorded HGVS representation of the | ||
# mutation, and get the mutation start site. | ||
# HGVS terms representing mutations in non-exonic | ||
# regions will return a NonCodingVariantError. | ||
# HGVS terms without a parser or HGVS terms which | ||
# are not supported will return a | ||
# UnsupportedVariantTypeError. | ||
# HGVS terms which cannot be parsed during term | ||
# tokenization will return a VariantParsingError. | ||
# HGVS terms containing the variants reference | ||
# sequence will be checked against the transcripts | ||
# sequence. If the transcript sequence does not | ||
# match the HGVS term sequence then a | ||
# NonMatchingReferenceBases error is raised. | ||
mutated_dna, variant_position = mutate(sequence, hgvs) | ||
# Translate the mutated coding DNA sequence into | ||
# an amino acid sequence. Sequences containing | ||
# codons with non-stardard nucleotide | ||
# representations (i.e. not "A,a,C,c,G,T,t") | ||
# will not be translated and will return | ||
# InvalidCodonError. | ||
mutated_amino_acid = translate(mutated_dna) | ||
# Convert coding DNA varaint start site to | ||
# amino acid coordinate system. | ||
aa_variant_position = convert_aa_cooridate(variant_position) | ||
if variant_class.lower().startswith('frame_shift'): | ||
# In the Subset_AA_sequence representation of | ||
# the mutated amino acid seuqnce, the downstream | ||
# portion of frameshift mutations are reported | ||
# until one of the following conditions are met: | ||
# the end of the mutated, coding AA sequence is | ||
# reached, OR until the first terminating stop | ||
# codon is reached. | ||
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset) | ||
else: | ||
# In the Subset_AA_sequence representation of | ||
# the mutated amino acid seuqnce, the upstream and | ||
# downstream portion of non-frame shift mutations are | ||
# +/- N amino acids of the mutation start site. This | ||
# vairable is adjustable via the --subset cli option. | ||
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset) | ||
# Write results to output file | ||
ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(variant_class, hugo, transcript, hgvs, variant_position, | ||
sequence, mutated_dna, mutated_amino_acid, truncated_aa)) | ||
except NonCodingVariantError as e: | ||
err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript)) | ||
except UnsupportedVariantTypeError as e: | ||
err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript)) | ||
except VariantParsingError as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript)) | ||
except NonMatchingReferenceBases as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript), | ||
"Please verify the correct reference file is provided!") | ||
except InvalidCodonError as e: | ||
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript), | ||
"Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna)) | ||
|
||
|
||
def parsed_arguments(): | ||
|