Adding handler for output file generation

CCBR · Jul 30, 2021 · 75d7d3e · 75d7d3e
1 parent 4cd378e
commit 75d7d3e
Showing 1 changed file with 124 additions and 38 deletions.
diff --git a/aasap b/aasap
@@ -31,7 +31,7 @@ import sys, os, subprocess
 import argparse, textwrap
 
 __author__ = 'Skyler Kuhn'
-__version__ = 'v1.0.0'
+__version__ = 'v1.1.0'
 __email__ = '[email protected]'
 
 
@@ -112,51 +112,137 @@ def run(sub_args):
     @param sub_args <parser.parse_args() object>:
         Parsed arguments for run sub-command
     """
-    # Create dictionary to map transcript ID to its sequence
-    transcriptome = {}
+    # Initialize the output directory
+    initialize(sub_args.output)
+    # Truncates non-frame shift mutations 
+    # +/- N positions from mutation start 
+    # site in the amino acid sequence
     subset = int(sub_args.subset)
+    # Dictionary to quickly map each 
+    # transcript ID to its coding DNA
+    # sequence or CDS sequence. The 
+    # build coomand can be used to 
+    # generate this reference file.
+    transcriptome = {}
     for sid, sequence in fasta(sub_args.transcripts):
         # Grab the transcipt id and remove the version suffix
         transcript_id = sid.split(' ')[0].split('.')[0]
         transcriptome[transcript_id] = sequence
-
+
+    # Run AASAP against each user supplied input file 
     for file in sub_args.input:
+        # Parse field of interest from each 
+        # excel file. Each input file is 
+        # required have the following fields:
+        # 'Transcript_ID', 'Variant_Classification',
+        # 'HGVSc', 'Hugo_Symbol' where 'Transcript_ID'
+        # and 'Variant_Classification','HGVSc' are 
+        # mandatory.
         err('Opening {}'.format(file))
         df = excel(file, subset=['Transcript_ID','Variant_Classification','HGVSc','Hugo_Symbol','Gene'])
-
-        for i,row in df.iterrows():
-            variant_class = str(row['Variant_Classification'])
-            transcript = str(row['Transcript_ID'])
-            hgvs = str(row['HGVSc'])
-            if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'):
-                try:
-                    sequence = transcriptome[transcript]
-                except KeyError:
-                    err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript),
-                    "Please verify the correct reference file is provided!"))
-                    continue # Skip over un-annotated transcript
-                try:
-                    mutated_dna, variant_position = mutate(sequence, hgvs)
-                    mutated_amino_acid = translate(mutated_dna)
-                    aa_variant_position = convert_aa_cooridate(variant_position)
-                    if variant_class.lower().startswith('frame_shift'):
-                        truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset)
-                    else:
-                        truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset)
-                    print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(variant_class, transcript, hgvs, variant_position, 
-                        sequence, mutated_dna, mutated_amino_acid, truncated_aa))
-                except NonCodingVariantError as e: 
-                    err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript))
-                except UnsupportedVariantTypeError as e:
-                    err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript))
-                except VariantParsingError as e:
-                    err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript))
-                except NonMatchingReferenceBases as e:
-                    err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript),
-                    "Please verify the correct reference file is provided!")
-                except InvalidCodonError as e:
-                    err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript),
-                    "Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna))
+        # Create output file name from input file
+        # Output file name generated by removing the
+        # suffix or input file name extension and 
+        # adding a new extension '.aasap.tsv' 
+        output_file = os.path.join(sub_args.output, "{}.aasap.tsv".format(os.path.splitext(os.path.basename(file))[0]))
+        err('Writing output file {}'.format(output_file))
+        with open(output_file, 'w') as ofh:
+            # Write header to output file
+            ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                "Variant_Classification", 
+                "Hugo_Symbol", 
+                "Transcript_ID", 
+                "HGVSc", 
+                "Variant_Start_Position", 
+                "Transcript_Sequence", 
+                "Mutated_Transcript_Sequence", 
+                "Mutated_AA_Sequence", 
+                "Subset_AA_Sequence"))
+
+            # Mutate each recorded variant in the input file. 
+            for i,row in df.iterrows():
+                # Variant class is used to determine the size 
+                # of the downstream portion of the subset AA
+                # sequence from the variant start site. Frame
+                # shift mutations will report until the end of 
+                # the coding AA sequence or until a stop codon 
+                # is reached.
+                variant_class = str(row['Variant_Classification'])
+                transcript = str(row['Transcript_ID'])
+                hgvs = str(row['HGVSc'])
+                hugo = str(row['Hugo_Symbol'])
+                if (hgvs and hgvs != 'nan') and (transcript and transcript != 'nan') and (variant_class and variant_class != 'nan'):
+                    try:
+                        sequence = transcriptome[transcript]
+                    except KeyError:
+                        # Skip over un-annotated transcript.
+                        # Recorded transcript is not annotated
+                        # in the user provided reference file,
+                        # which may indicate that the user did 
+                        # not provide the same reference files
+                        # to call variants and to generate the 
+                        # MAF file in the build sub command
+                        err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript),
+                        "Please verify the correct reference file is provided!"))
+                        continue
+                    try:
+                        # Mutate the coding DNA sequence based on
+                        # the recorded HGVS representation of the 
+                        # mutation, and get the mutation start site.
+                        # HGVS terms representing mutations in non-exonic 
+                        # regions will return a NonCodingVariantError. 
+                        # HGVS terms without a parser or HGVS terms which 
+                        # are not supported will return a 
+                        # UnsupportedVariantTypeError.
+                        # HGVS terms which cannot be parsed during term
+                        # tokenization will return a VariantParsingError.
+                        # HGVS terms containing the variants reference 
+                        # sequence will be checked against the transcripts
+                        # sequence. If the transcript sequence does not 
+                        # match the HGVS term sequence then a 
+                        # NonMatchingReferenceBases error is raised.
+                        mutated_dna, variant_position = mutate(sequence, hgvs)
+                        # Translate the mutated coding DNA sequence into 
+                        # an amino acid sequence. Sequences containing 
+                        # codons with non-stardard nucleotide 
+                        # representations (i.e. not "A,a,C,c,G,T,t") 
+                        # will not be translated and will return  
+                        # InvalidCodonError.
+                        mutated_amino_acid = translate(mutated_dna)
+                        # Convert coding DNA varaint start site to 
+                        # amino acid coordinate system.
+                        aa_variant_position = convert_aa_cooridate(variant_position)
+                        if variant_class.lower().startswith('frame_shift'):
+                            # In the Subset_AA_sequence representation of 
+                            # the mutated amino acid seuqnce, the downstream 
+                            # portion of frameshift mutations are reported 
+                            # until one of the following conditions are met:
+                            # the end of the mutated, coding AA sequence is
+                            # reached, OR until the first terminating stop 
+                            # codon is reached. 
+                            truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset)
+                        else:
+                            # In the Subset_AA_sequence representation of 
+                            # the mutated amino acid seuqnce, the upstream and 
+                            # downstream portion of non-frame shift mutations are
+                            # +/- N amino acids of the mutation start site. This 
+                            # vairable is adjustable via the --subset cli option.
+                            truncated_aa = truncate(mutated_amino_acid, aa_variant_position, subset, subset)
+                        # Write results to output file
+                        ofh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(variant_class, hugo, transcript, hgvs, variant_position, 
+                            sequence, mutated_dna, mutated_amino_acid, truncated_aa))
+                    except NonCodingVariantError as e: 
+                        err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript))
+                    except UnsupportedVariantTypeError as e:
+                        err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript))
+                    except VariantParsingError as e:
+                        err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript))
+                    except NonMatchingReferenceBases as e:
+                        err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript),
+                        "Please verify the correct reference file is provided!")
+                    except InvalidCodonError as e:
+                        err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript),
+                        "Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna))
 
 
 def parsed_arguments():