sd17fall · vivienyuwenchen · Sep 13, 2017 · Sep 14, 2017 · Sep 14, 2017 · Sep 14, 2017
diff --git a/gene_finder.py b/gene_finder.py
@@ -1,14 +1,17 @@
 # -*- coding: utf-8 -*-
 """
-YOUR HEADER COMMENT HERE
+Finds amino acid sequences coded by DNA input
 
-@author: YOUR NAME HERE
+@author: Vivien Chen
 
 """
 
 import random
 from amino_acids import aa, codons, aa_table   # you may find these useful
 from load import load_seq
+dna = load_seq("./data/X73525.fa")
+import doctest
+from pickle import dump, load
 
 
 def shuffle_string(s):
@@ -25,13 +28,31 @@ def get_complement(nucleotide):
 
         nucleotide: a nucleotide (A, C, G, or T) represented as a string
         returns: the complementary nucleotide
+
+        I added two more doctests because each complementary nucleotide is in
+        its own if/else if branch. If one branch doesn't work and there is no
+        doctest for that branch, the mistake will not be caught. In fact, I
+        found out that I spelled nucleotide wrong in the G branch.
+
     >>> get_complement('A')
     'T'
     >>> get_complement('C')
     'G'
+    >>> get_complement('G')
+    'C'
+    >>> get_complement('T')
+    'A'
     """
-    # TODO: implement this
-    pass
+    if nucleotide == 'A':
+        return 'T'
+    elif nucleotide == 'C':
+        return 'G'
+    elif nucleotide == 'G':
+        return 'C'
+    elif nucleotide == 'T':
+        return 'A'
+
+#doctest.run_docstring_examples(get_complement, globals(), verbose=True)
 
 
 def get_reverse_complement(dna):
@@ -40,13 +61,27 @@ def get_reverse_complement(dna):
 
         dna: a DNA sequence represented as a string
         returns: the reverse complementary DNA sequence represented as a string
+
+        I did not add any more doctests because each string contains all the
+        nucleotides, so two doctests with random strings is sufficient.
+
     >>> get_reverse_complement("ATGCCCGCTTT")
     'AAAGCGGGCAT'
     >>> get_reverse_complement("CCGCGTTCA")
     'TGAACGCGG'
     """
-    # TODO: implement this
-    pass
+    reversed_dna = dna[::-1]
+    comp_list = []
+
+    for nuc in range(0, len(reversed_dna)):
+        comp_nuc = get_complement(reversed_dna[nuc])
+        comp_list.append(comp_nuc)
+
+    comp_str = ''.join(comp_list)
+
+    return comp_str
+
+#doctest.run_docstring_examples(get_reverse_complement, globals(), verbose=True)
 
 
 def rest_of_ORF(dna):
@@ -57,13 +92,29 @@ def rest_of_ORF(dna):
 
         dna: a DNA sequence
         returns: the open reading frame represented as a string
+
+        I added two more doctests: one in which the frame stop codon is TAA and
+        another in which there is no frame stop codon. This way, all the
+        branches are tested for error.
+
     >>> rest_of_ORF("ATGTGAA")
     'ATG'
     >>> rest_of_ORF("ATGAGATAGG")
     'ATGAGA'
+    >>> rest_of_ORF("ATGGAATAAGTA")
+    'ATGGAA'
+    >>> rest_of_ORF("ATGAGAGA")
+    'ATGAGAGA'
     """
-    # TODO: implement this
-    pass
+    stop_codons = ['TAA', 'TAG', 'TGA']
+
+    for i in range(3, len(dna)-2, 3):
+        if dna[i:i+3] in stop_codons:
+            return dna[:i]
+
+    return dna
+
+#doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True)
 
 
 def find_all_ORFs_oneframe(dna):
@@ -76,11 +127,30 @@ def find_all_ORFs_oneframe(dna):
 
         dna: a DNA sequence
         returns: a list of non-nested ORFs
+
+        I added another doctest that includes three nested ATGs in the default
+        frame of the sequence in order to make sure that the function does not
+        return nested ORFs in that frame.
+
     >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
     ['ATGCATGAATGTAGA', 'ATGTGCCC']
+    >>> find_all_ORFs_oneframe('ATGATGATGTAAC')
+    ['ATGATGATG']
     """
-    # TODO: implement this
-    pass
+    all_ORFs = []
+    stopped = True
+    stop_codons = ['TAA', 'TAG', 'TGA']
+
+    for i in range(0, len(dna)-2, 3):
+        if dna[i:i+3] == 'ATG' and stopped:
+            all_ORFs.append(rest_of_ORF(dna[i:]))
+            stopped = False
+        elif dna[i:i+3] in stop_codons:
+            stopped = True
+
+    return all_ORFs
+
+#doctest.run_docstring_examples(find_all_ORFs_oneframe, globals(), verbose=True)
 
 
 def find_all_ORFs(dna):
@@ -93,11 +163,21 @@ def find_all_ORFs(dna):
         dna: a DNA sequence
         returns: a list of non-nested ORFs
 
+        I did not add any more doctests because this function depends on the
+        previous functions, so assuming the doctests of those were fruitful,
+        one doctest is sufficient for this function. The doctest includes one
+        ORF in each frame, so the function works if the doctest yields true.
+
     >>> find_all_ORFs("ATGCATGAATGTAG")
     ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
     """
-    # TODO: implement this
-    pass
+    frame1 = find_all_ORFs_oneframe(dna)
+    frame2 = find_all_ORFs_oneframe(dna[1:])
+    frame3 = find_all_ORFs_oneframe(dna[2:])
+
+    return frame1 + frame2 + frame3
+
+#doctest.run_docstring_examples(find_all_ORFs, globals(), verbose=True)
 
 
 def find_all_ORFs_both_strands(dna):
@@ -106,21 +186,40 @@ def find_all_ORFs_both_strands(dna):
 
         dna: a DNA sequence
         returns: a list of non-nested ORFs
+
+        I did not add any more doctests because this function depends on the
+        previous functions, so assuming the doctests of those were fruitful,
+        one doctest is sufficient for this function. The doctest includes one
+        ORF in each strand, so the function works if the doctest yields true.
+
     >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
     ['ATGCGAATG', 'ATGCTACATTCGCAT']
     """
-    # TODO: implement this
-    pass
+    strand1 = find_all_ORFs(dna)
+    strand2 = find_all_ORFs(get_reverse_complement(dna))
+
+    return strand1 + strand2
+
+#doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True)
 
 
 def longest_ORF(dna):
     """ Finds the longest ORF on both strands of the specified DNA and returns it
         as a string
+
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    # TODO: implement this
-    pass
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    humongest = 0
+
+    for i in range(0, len(all_ORFs)-1):
+        if len(all_ORFs[i]) < len(all_ORFs[i+1]):
+            humongest = all_ORFs[i+1]
+
+    return humongest
+
+#doctest.run_docstring_examples(longest_ORF, globals(), verbose=True)
 
 
 def longest_ORF_noncoding(dna, num_trials):
@@ -130,8 +229,18 @@ def longest_ORF_noncoding(dna, num_trials):
         dna: a DNA sequence
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
-    # TODO: implement this
-    pass
+    dna_shuffles = []
+
+    for i in range(0, num_trials):
+        dna_shuffles.append(shuffle_string(dna))
+
+    for i in range(0, len(dna_shuffles)-1):
+        if longest_ORF(dna_shuffles[i]) < longest_ORF(dna_shuffles[i+1]):
+            humongest = longest_ORF(dna_shuffles[i+1])
+
+    return len(humongest)
+
+#print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 100)
 
 
 def coding_strand_to_AA(dna):
@@ -143,13 +252,22 @@ def coding_strand_to_AA(dna):
         returns: a string containing the sequence of amino acids encoded by the
                  the input DNA fragment
 
-        >>> coding_strand_to_AA("ATGCGA")
-        'MR'
-        >>> coding_strand_to_AA("ATGCCCGCTTT")
-        'MPA'
+    >>> coding_strand_to_AA("ATGCGA")
+    'MR'
+    >>> coding_strand_to_AA("ATGCCCGCTTT")
+    'MPA'
     """
-    # TODO: implement this
-    pass
+    amino_acids = []
+
+    for i in range(0, len(dna)-2, 3):
+        amino_acid = aa_table[dna[i:i+3]]
+        amino_acids.append(amino_acid)
+
+    amino_acids_str = ''.join(amino_acids)
+
+    return amino_acids_str
+
+#doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)
 
 
 def gene_finder(dna):
@@ -158,9 +276,28 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    # TODO: implement this
-    pass
+    threshold = longest_ORF_noncoding(dna, 1500)
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    amino_acids = []
+
+    for i in range(0, len(all_ORFs)):
+        if len(all_ORFs[i]) > threshold:
+            amino_acid = coding_strand_to_AA(all_ORFs[i])
+            amino_acids.append(amino_acid)
+
+    return amino_acids
+
 
 if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
+    #import doctest
+    #doctest.testmod()
+
+    genes = gene_finder(dna)
+
+    #write to genes.txt file
+    file_ = open('genes.txt', 'wb')
+    dump(genes, file_)
+
+    #read from genes.txt file
+    file_ = open('genes.txt', 'rb+')
+    print(load(file_))
diff --git a/genes.txt b/genes.txt
@@ -0,0 +1,18 @@
+(lp0
+S'MSLRVRQIDRREWLLAQTATECQRHGREATLEYPTRQGMWVRLSDAEKRWSAWIKPGDWLEHVSPALAGAAVSAGAEHLVVPWLAATERPFELPVPHLSCRRLCVENPVPGSALPEGKLLHIMSDRGGLWFEHLPELPAVGGGRPKMLRWPLRFVIGSSDTQRSLLGRIGIGDVLLIRTSRAEVYCYAKKLGHFNRVEGGIIVETLDIQHIEEENNTTETAETLPGLNQLPVKLEFVLYRKNVTLAELEAMGQQQLLSLPTNAELNVEIMANGVLLGNGELVQMNDTLGVEIHEWLSESGNGE'
+p1
+aS'MSSNKTEKPTKKRLEDSAKKGQSFKSKDLIIACLTLGGIAYLVSYGSFNEFMGIIKIIIADNFDQSMADYSLAVFGIGLKYLIPFMLLCLVCSALPALLQAGFVLATEALKPNLSALNPVEGAKKLFSMRTVKDTVKTLLYLSSFVVAAIICWKKYKVEIFSQLNGNIVGIAVIWRELLLALVLTCLACALIVLLLDAIAEYFLTMKDMKMDKEEVKREMKEQEGNPEVKSKRREVHMEILSEQVKSDIENSRLIVANPTHITIGIYFKPELMPIPMISVYETNQRALAVRAYAEKVGVPVIVDIKLARSLFKTHRRYDLVSLEEIDEVLRLLVWLEEVENAGKDVIQPQENEVRH'
+p2
+aS'MGIFASAGCGKTMLMHMLIEQTEADVFVIGLIGERGREVTEFVDMLRASHKKEKCVLVFATSDFPSVDRCNAAQLATTVAEYFRDQGKRVVLFIDSMTRYARALRDVALASGERPARRGYPASVFDNLPRLLERPGATSEGSITAFYTVLLESEEEADPMADEIRSILDGHLYLSRKLAGQGHYPAIDVLKSVSRVFGQVTTPTHAEQASAVRKLMTRLEELQLFIDLGEYRPGENIDNDRAMQMRDSLKAWLCQPVAQYSSFDDTLSGMNAFADQN'
+p3
+aS'MGDVSAVSSSGNILLPQQDEVGGLSEALKKAVEKHKTEYSGDKKDRDYGDAFVMHKETALPLLLAAWRHGAPAKSEHHNGNVSGLHHNGKSELRIAEKLLKVTAEKSVGLISAEAKVDKSAALLSSKNRPLESVSGKKLSADLKAVESVSEVTDNATGISDDNIKALPGDNKAIAGEGVRKEGAPLARDVAPARMAAANTGKPEDKDHKKVKDVSQLPLQPTTIADLSQLTGGDEKMPLAAQSKPMMTIFPTADGVKGEDSSLTYRFQRWGNDYSVNIQARQAGEFSLIPSNTQVEHRLHDQWQNGNPQRWHLTRDDQQNPQQQQHRQQSGEEDDA'
+p4
+aS'MGNDISLIALLAFSTLLPFIIASGTCFVKFSIVFVMVRNALGLQQIPSNMTLNGVALLLSMFVMWPIMHDAYVYFEDEDVTFNDISSLSKHVDEGLDGYRDYLIKYSDRELVQFFENAQLKRQYGEETETVKRDKDEIEKPSIFALLPAYALSEIKSAFKIGFYLYLPFVVVDLVVSSVLLALGMMMMSPVTISTPIKLVLFVALDGWTLLSKGLILQYMDIAT'
+p5
+aS'MFYALYFEIHHLVASAALGFARVAPIFFFLPFLNSGVLSGAPRNAIIILVALGVWPHALNEAPPFLSVAMIPLVLQEAAVGVMLGCLLSWPFWVMHALGCIIDNQRGATLSSSIDPANGIDTSEMANFLNMFAAVVYLQNGGLVTMVDVLNKSYQLCDPMNECTPSLPPLLTFINQVAQNALVLASPVVLVLLLSEVFLGLLSRFAPQMNAFAISLTVKSGIAVLIMLLYFSPVLPDNVLRLSFQATGLSSWFYERGATHVLE'
+p6
+aS'MCNNFPSGSALPGTGFSTHKRRQDKCGTGNSNGRSVAASQGTTRCSAPAETAAPARAGETCSSQSPGLIQADHRFSASLNRTHIPCRVGYSSVASRPWRWHSVAVCANSHSRRSICLTRNDIRRHPPRQIAVCAVAAADFVDRLASGASAGDYRFAIDHANDVQPAYLTVLTKTPLLAAPEY'
+p7
+aS'MCRRRDLSKNAAYAFQYIDCRVMSLPGQLSAQIQVTVKDRANFIRHRVRLFLAFQQYRIKGSNASLAGRPWAFQQAGQIIEYGGGITSTSRTLSRRQCHVSQSTRITGHGIDKKHDPFSLVAKIFRYGCRQLRRIAAIDRGEIGSGKNQHAFFFLMRSAQHIHEFSDLTASFTDKTDNKDIRLRLLDQHMHQHGLTASCGGKNAHSLAYATGQ'
+p8
+a.