Skip to content

Commit

Permalink
Adding handler for output file generation
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jul 30, 2021
1 parent 4cd378e commit 75d7d3e
Showing 1 changed file with 124 additions and 38 deletions.
162 changes: 124 additions & 38 deletions aasap
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import sys, os, subprocess
import argparse, textwrap

__author__ = 'Skyler Kuhn'
__version__ = 'v1.0.0'
__version__ = 'v1.1.0'
__email__ = '[email protected]'


Expand Down Expand Up @@ -112,51 +112,137 @@ def run(sub_args):
@param sub_args <parser.parse_args() object>:
Parsed arguments for run sub-command
"""
# Create dictionary to map transcript ID to its sequence
transcriptome = {}
# Initialize the output directory
initialize(sub_args.output)
# Truncates non-frame shift mutations
# +/- N positions from mutation start
# site in the amino acid sequence
subset = int(sub_args.subset)
# Dictionary to quickly map each
# transcript ID to its coding DNA
# sequence or CDS sequence. The
# build coomand can be used to
# generate this reference file.
transcriptome = {}
for sid, sequence in fasta(sub_args.transcripts):
# Grab the transcipt id and remove the version suffix
transcript_id = sid.split(' ')[0].split('.')[0]
transcriptome[transcript_id] = sequence


# Run AASAP against each user supplied input file
for file in sub_args.input:
# Parse field of interest from each
# excel file. Each input file is
# required have the following fields:
# 'Transcript_ID', 'Variant_Classification',
# 'HGVSc', 'Hugo_Symbol' where 'Transcript_ID'
# and 'Variant_Classification','HGVSc' are
# mandatory.
err('Opening {}'.format(file))
df = excel(file, subset=['Transcript_ID','Variant_Classification','HGVSc','Hugo_Symbol','Gene'])

for i,row in df.iterrows():
variant_class = str(row['Variant_Classification'])
transcript = str(row['Transcript_ID'])
hgvs = str(row['HGVSc'])
if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'):
try:
sequence = transcriptome[transcript]
except KeyError:
err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript),
"Please verify the correct reference file is provided!"))
continue # Skip over un-annotated transcript
try:
mutated_dna, variant_position = mutate(sequence, hgvs)
mutated_amino_acid = translate(mutated_dna)
aa_variant_position = convert_aa_cooridate(variant_position)
if variant_class.lower().startswith('frame_shift'):
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset)
else:
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset)
print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(variant_class, transcript, hgvs, variant_position,
sequence, mutated_dna, mutated_amino_acid, truncated_aa))
except NonCodingVariantError as e:
err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript))
except UnsupportedVariantTypeError as e:
err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript))
except VariantParsingError as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript))
except NonMatchingReferenceBases as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript),
"Please verify the correct reference file is provided!")
except InvalidCodonError as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript),
"Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna))
# Create output file name from input file
# Output file name generated by removing the
# suffix or input file name extension and
# adding a new extension '.aasap.tsv'
output_file = os.path.join(sub_args.output, "{}.aasap.tsv".format(os.path.splitext(os.path.basename(file))[0]))
err('Writing output file {}'.format(output_file))
with open(output_file, 'w') as ofh:
# Write header to output file
ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
"Variant_Classification",
"Hugo_Symbol",
"Transcript_ID",
"HGVSc",
"Variant_Start_Position",
"Transcript_Sequence",
"Mutated_Transcript_Sequence",
"Mutated_AA_Sequence",
"Subset_AA_Sequence"))

# Mutate each recorded variant in the input file.
for i,row in df.iterrows():
# Variant class is used to determine the size
# of the downstream portion of the subset AA
# sequence from the variant start site. Frame
# shift mutations will report until the end of
# the coding AA sequence or until a stop codon
# is reached.
variant_class = str(row['Variant_Classification'])
transcript = str(row['Transcript_ID'])
hgvs = str(row['HGVSc'])
hugo = str(row['Hugo_Symbol'])
if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'):
try:
sequence = transcriptome[transcript]
except KeyError:
# Skip over un-annotated transcript.
# Recorded transcript is not annotated
# in the user provided reference file,
# which may indicate that the user did
# not provide the same reference files
# to call variants and to generate the
# MAF file in the build sub command
err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript),
"Please verify the correct reference file is provided!"))
continue
try:
# Mutate the coding DNA sequence based on
# the recorded HGVS representation of the
# mutation, and get the mutation start site.
# HGVS terms representing mutations in non-exonic
# regions will return a NonCodingVariantError.
# HGVS terms without a parser or HGVS terms which
# are not supported will return a
# UnsupportedVariantTypeError.
# HGVS terms which cannot be parsed during term
# tokenization will return a VariantParsingError.
# HGVS terms containing the variants reference
# sequence will be checked against the transcripts
# sequence. If the transcript sequence does not
# match the HGVS term sequence then a
# NonMatchingReferenceBases error is raised.
mutated_dna, variant_position = mutate(sequence, hgvs)
# Translate the mutated coding DNA sequence into
# an amino acid sequence. Sequences containing
# codons with non-stardard nucleotide
# representations (i.e. not "A,a,C,c,G,T,t")
# will not be translated and will return
# InvalidCodonError.
mutated_amino_acid = translate(mutated_dna)
# Convert coding DNA varaint start site to
# amino acid coordinate system.
aa_variant_position = convert_aa_cooridate(variant_position)
if variant_class.lower().startswith('frame_shift'):
# In the Subset_AA_sequence representation of
# the mutated amino acid seuqnce, the downstream
# portion of frameshift mutations are reported
# until one of the following conditions are met:
# the end of the mutated, coding AA sequence is
# reached, OR until the first terminating stop
# codon is reached.
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset)
else:
# In the Subset_AA_sequence representation of
# the mutated amino acid seuqnce, the upstream and
# downstream portion of non-frame shift mutations are
# +/- N amino acids of the mutation start site. This
# vairable is adjustable via the --subset cli option.
truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset)
# Write results to output file
ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(variant_class, hugo, transcript, hgvs, variant_position,
sequence, mutated_dna, mutated_amino_acid, truncated_aa))
except NonCodingVariantError as e:
err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript))
except UnsupportedVariantTypeError as e:
err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript))
except VariantParsingError as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript))
except NonMatchingReferenceBases as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript),
"Please verify the correct reference file is provided!")
except InvalidCodonError as e:
err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript),
"Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna))


def parsed_arguments():
Expand Down

0 comments on commit 75d7d3e

Please sign in to comment.