From b396b876802abdebfb0db5f4f6a5622d4dfd2b5a Mon Sep 17 00:00:00 2001
From: Shreya Rangarajan <Shreya.Rangarajan@students.olin.edu>
Date: Thu, 21 Sep 2017 22:12:09 -0400
Subject: [PATCH 1/2] Gene Finder

---
 gene_finder.py | 102 +++++++++++++++++++++++++++++++++++--------------
 load.py        |   2 +-
 2 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 3b1e7dd..71b1d05 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -1,16 +1,15 @@
 # -*- coding: utf-8 -*-
 """
-YOUR HEADER COMMENT HERE
-
-@author: YOUR NAME HERE
+Created on Mon Sep  11 2017
+Gene Finder Mini Project
 
+@author: Shreya Rangarajan
 """
 
 import random
 from amino_acids import aa, codons, aa_table   # you may find these useful
 from load import load_seq
 
-
 def shuffle_string(s):
     """Shuffles the characters in the input string
         NOTE: this is a helper function, you do not
@@ -19,7 +18,6 @@ def shuffle_string(s):
 
 # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###
 
-
 def get_complement(nucleotide):
     """ Returns the complementary nucleotide
 
@@ -30,9 +28,14 @@ def get_complement(nucleotide):
     >>> get_complement('C')
     'G'
     """
-    # TODO: implement this
-    pass
-
+    # Watson-Crick pairs: A<->T and C<->G.
+    complements = {
+        'A': 'T',
+        'C': 'G',
+        'T': 'A',
+        'G': 'C',
+    }
+    return complements.get(nucleotide)  # None for any other character
 
 def get_reverse_complement(dna):
     """ Computes the reverse complementary sequence of DNA for the specfied DNA
@@ -45,9 +48,11 @@ def get_reverse_complement(dna):
     >>> get_reverse_complement("CCGCGTTCA")
     'TGAACGCGG'
     """
-    # TODO: implement this
-    pass
+    reverse_complement = ''
+    for i in dna[::-1]:
+        reverse_complement += get_complement(i)
 
+    return reverse_complement
 
 def rest_of_ORF(dna):
     """ Takes a DNA sequence that is assumed to begin with a start
@@ -62,9 +67,13 @@ def rest_of_ORF(dna):
     >>> rest_of_ORF("ATGAGATAGG")
     'ATGAGA'
     """
-    # TODO: implement this
-    pass
-
+    stop_codons = ('TAA', 'TAG', 'TGA')
+    # Walk the sequence one in-frame codon at a time; whatever precedes
+    # the first stop codon is the ORF.
+    for start in range(0, len(dna) - 1, 3):
+        if dna[start:start + 3] in stop_codons:
+            return dna[:start]
+    return dna
 
 def find_all_ORFs_oneframe(dna):
     """ Finds all non-nested open reading frames in the given DNA
@@ -79,9 +88,17 @@ def find_all_ORFs_oneframe(dna):
     >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
     ['ATGCATGAATGTAGA', 'ATGTGCCC']
     """
-    # TODO: implement this
-    pass
-
+    new_ORFs_oneframe_list = []
+    i = 0
+    while i < len(dna):
+        if dna[i:i+3] == 'ATG':
+            # Extract once; += 3 below then skips the stop codon.
+            orf = rest_of_ORF(dna[i:])
+            new_ORFs_oneframe_list.append(orf)
+            i += len(orf)
+        i += 3
+
+    return new_ORFs_oneframe_list
 
 def find_all_ORFs(dna):
     """ Finds all non-nested open reading frames in the given DNA sequence in
@@ -96,9 +113,11 @@ def find_all_ORFs(dna):
     >>> find_all_ORFs("ATGCATGAATGTAG")
     ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
     """
-    # TODO: implement this
-    pass
+    new_ORFs_list_all = []
+    for i in range(3):
+        new_ORFs_list_all = new_ORFs_list_all + find_all_ORFs_oneframe(dna[i:])
 
+    return new_ORFs_list_all
 
 def find_all_ORFs_both_strands(dna):
     """ Finds all non-nested open reading frames in the given DNA sequence on both
@@ -109,9 +128,10 @@ def find_all_ORFs_both_strands(dna):
     >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
     ['ATGCGAATG', 'ATGCTACATTCGCAT']
     """
-    # TODO: implement this
-    pass
+    reverse_complement_DNA = get_reverse_complement(dna)
+    two_strands_list = find_all_ORFs(dna) + find_all_ORFs(reverse_complement_DNA)
 
+    return two_strands_list
 
 def longest_ORF(dna):
     """ Finds the longest ORF on both strands of the specified DNA and returns it
@@ -119,8 +139,10 @@ def longest_ORF(dna):
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    # TODO: implement this
-    pass
+    longest_ORF_string = ''
+    longest_ORF_string = max(find_all_ORFs_both_strands(dna))
+
+    return longest_ORF_string
 
 
 def longest_ORF_noncoding(dna, num_trials):
@@ -130,9 +152,13 @@ def longest_ORF_noncoding(dna, num_trials):
         dna: a DNA sequence
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
-    # TODO: implement this
-    pass
+    longest_ORF_overnumtrials = []
+    for i in range(num_trials):
+        shuffle_DNA = shuffle_string(dna)
+        longest_ORF_overnumtrials.append(len(longest_ORF(shuffle_DNA)))
+    max_of_longest_orf = max(longest_ORF_overnumtrials)
 
+    return max_of_longest_orf
 
 def coding_strand_to_AA(dna):
     """ Computes the Protein encoded by a sequence of DNA.  This function
@@ -148,9 +174,17 @@ def coding_strand_to_AA(dna):
         >>> coding_strand_to_AA("ATGCCCGCTTT")
         'MPA'
     """
-    # TODO: implement this
-    pass
+    amino_acids_list = ''
+    index = 0
 
+    if len(dna)%3 != 0:
+        dna = dna[:-(len(dna)%3)]
+
+    for index in range(0,len(dna),3):
+        codon = dna[index:index+3]
+        amino_acid_letter = aa_table[codon]
+        amino_acids_list = amino_acids_list + amino_acid_letter
+    return amino_acids_list
 
 def gene_finder(dna):
     """ Returns the amino acid sequences that are likely coded by the specified dna
@@ -158,9 +192,19 @@ def gene_finder(dna):
         dna: a DNA sequence
         returns: a list of all amino acid sequences coded by the sequence dna.
     """
-    # TODO: implement this
-    pass
+    # ORFs no longer than the longest ORF found in 1500 shuffles of the
+    # input are treated as noise; only longer ones are translated.
+    threshold = longest_ORF_noncoding(dna, 1500)
+    aa_sequence = []
+    for orf in find_all_ORFs_both_strands(dna):
+        if len(orf) > threshold:
+            aa_sequence.append(coding_strand_to_AA(orf))
+
+    return aa_sequence
 
 if __name__ == "__main__":
     import doctest
-    doctest.testmod()
+    dna = load_seq("./data/X73525.fa")
+    print(gene_finder(dna))
+    #doctest.testmod()
+    # doctest.run_docstring_examples(coding_strand_to_AA, globals())
diff --git a/load.py b/load.py
index eb8f44d..b7a355c 100644
--- a/load.py
+++ b/load.py
@@ -98,4 +98,4 @@ def load_metagenome():
                  tuples.  The sequence is represented as an uppercase
                  string of nucleotides
     """
-    return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna')
\ No newline at end of file
+    return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna')

From 265e05d379b53e0b25acfbff17a14b31a0f3fecf Mon Sep 17 00:00:00 2001
From: Shreya Rangarajan <Shreya.Rangarajan@students.olin.edu>
Date: Sun, 24 Sep 2017 18:38:25 -0400
Subject: [PATCH 2/2] Gene Finder Final Submission

---
 gene_finder.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gene_finder.py b/gene_finder.py
index 71b1d05..93d5bbc 100644
--- a/gene_finder.py
+++ b/gene_finder.py
@@ -139,7 +139,9 @@ def longest_ORF(dna):
     >>> longest_ORF("ATGCGAATGTAGCATCAAA")
     'ATGCTACATTCGCAT'
     """
-    longest_ORF_string = ''
-    longest_ORF_string = max(find_all_ORFs_both_strands(dna))
+    # Compare by length: max() on bare strings would pick the alphabetically
+    # greatest ORF. default='' keeps DNA with no ORFs from raising ValueError.
+    all_ORFs = find_all_ORFs_both_strands(dna)
+    longest_ORF_string = max(all_ORFs, key=len, default='')
 
     return longest_ORF_string
@@ -153,7 +155,7 @@ def longest_ORF_noncoding(dna, num_trials):
         num_trials: the number of random shuffles
         returns: the maximum length longest ORF """
     longest_ORF_overnumtrials = []
-    for i in range(num_trials):
+    for _ in range(num_trials):
         shuffle_DNA = shuffle_string(dna)
         longest_ORF_overnumtrials.append(len(longest_ORF(shuffle_DNA)))
     max_of_longest_orf = max(longest_ORF_overnumtrials)