Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gene Finder Final Submission #9

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 72 additions & 29 deletions gene_finder.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*-
"""
YOUR HEADER COMMENT HERE

@author: YOUR NAME HERE
Created on Mon Sep 11 2017
Gene Finder Mini Project

@author: Shreya Rangarajan
"""

import random
from amino_acids import aa, codons, aa_table # you may find these useful
from load import load_seq


def shuffle_string(s):
"""Shuffles the characters in the input string
NOTE: this is a helper function, you do not
Expand All @@ -19,7 +18,6 @@ def shuffle_string(s):

# YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###


def get_complement(nucleotide):
""" Returns the complementary nucleotide

Expand All @@ -30,9 +28,14 @@ def get_complement(nucleotide):
>>> get_complement('C')
'G'
"""
# TODO: implement this
pass

if nucleotide == 'A':
return 'T'
if nucleotide == 'C':
return 'G'
if nucleotide == 'T':
return 'A'
if nucleotide == 'G':
return 'C'

def get_reverse_complement(dna):
""" Computes the reverse complementary sequence of DNA for the specfied DNA
Expand All @@ -45,9 +48,11 @@ def get_reverse_complement(dna):
>>> get_reverse_complement("CCGCGTTCA")
'TGAACGCGG'
"""
# TODO: implement this
pass
reverse_complement = ''
for i in dna[::-1]:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[::-1] is one of my favorite Python gimmicks 😃

reverse_complement += get_complement(i)

return reverse_complement

def rest_of_ORF(dna):
""" Takes a DNA sequence that is assumed to begin with a start
Expand All @@ -62,9 +67,13 @@ def rest_of_ORF(dna):
>>> rest_of_ORF("ATGAGATAGG")
'ATGAGA'
"""
# TODO: implement this
pass

stop_codons = ['TAA', 'TAG', 'TGA']
for i in range(0,len(dna)-1,3):
seg = dna[i:i+3]
if seg in stop_codons:
dna = dna[0:i]
return dna
return dna

def find_all_ORFs_oneframe(dna):
""" Finds all non-nested open reading frames in the given DNA
Expand All @@ -79,9 +88,17 @@ def find_all_ORFs_oneframe(dna):
>>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
['ATGCATGAATGTAGA', 'ATGTGCCC']
"""
# TODO: implement this
pass

new_ORFs_oneframe_list = []
start_codon = 'ATG'
i = 0
while i < len(dna):
seg = dna[i:i+3]
if seg == start_codon:
new_ORFs_oneframe_list.append(rest_of_ORF(dna[i:]))
i += len(rest_of_ORF(dna[i:]))
i += 3

return new_ORFs_oneframe_list

def find_all_ORFs(dna):
""" Finds all non-nested open reading frames in the given DNA sequence in
Expand All @@ -96,9 +113,11 @@ def find_all_ORFs(dna):
>>> find_all_ORFs("ATGCATGAATGTAG")
['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
"""
# TODO: implement this
pass
new_ORFs_list_all = []
for i in range(3):
new_ORFs_list_all = new_ORFs_list_all + find_all_ORFs_oneframe(dna[i:])

return new_ORFs_list_all

def find_all_ORFs_both_strands(dna):
""" Finds all non-nested open reading frames in the given DNA sequence on both
Expand All @@ -109,18 +128,20 @@ def find_all_ORFs_both_strands(dna):
>>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
['ATGCGAATG', 'ATGCTACATTCGCAT']
"""
# TODO: implement this
pass
reverse_complement_DNA = get_reverse_complement(dna)
two_strands_list = find_all_ORFs(dna) + find_all_ORFs(reverse_complement_DNA)

return two_strands_list

def longest_ORF(dna):
""" Finds the longest ORF on both strands of the specified DNA and returns it
as a string
>>> longest_ORF("ATGCGAATGTAGCATCAAA")
'ATGCTACATTCGCAT'
"""
# TODO: implement this
pass
longest_ORF_string = max(find_all_ORFs_both_strands(dna))

return longest_ORF_string


def longest_ORF_noncoding(dna, num_trials):
Expand All @@ -130,9 +151,13 @@ def longest_ORF_noncoding(dna, num_trials):
dna: a DNA sequence
num_trials: the number of random shuffles
returns: the maximum length longest ORF """
# TODO: implement this
pass
longest_ORF_overnumtrials = []
for _ in range(num_trials):
shuffle_DNA = shuffle_string(dna)
longest_ORF_overnumtrials.append(len(longest_ORF(shuffle_DNA)))
max_of_longest_orf = max(longest_ORF_overnumtrials)

return max_of_longest_orf

def coding_strand_to_AA(dna):
""" Computes the Protein encoded by a sequence of DNA. This function
Expand All @@ -148,19 +173,37 @@ def coding_strand_to_AA(dna):
>>> coding_strand_to_AA("ATGCCCGCTTT")
'MPA'
"""
# TODO: implement this
pass
amino_acids_list = ''
index = 0

if len(dna)%3 != 0:
dna = dna[:-(len(dna)%3)]

for index in range(0,len(dna),3):
codon = dna[index:index+3]
amino_acid_letter = aa_table[codon]
amino_acids_list = amino_acids_list + amino_acid_letter
return amino_acids_list

def gene_finder(dna):
""" Returns the amino acid sequences that are likely coded by the specified dna

dna: a DNA sequence
returns: a list of all amino acid sequences coded by the sequence dna.
"""
# TODO: implement this
pass
threshold = longest_ORF_noncoding(dna,1500)
all_ORFs_bothstrands = []
aa_sequence = []

all_ORFs = find_all_ORFs_both_strands(dna)
for orfs in all_ORFs:
if len(orfs) > threshold:
aa_sequence.append(coding_strand_to_AA(orfs))
return aa_sequence

if __name__ == "__main__":
import doctest
doctest.testmod()
dna = load_seq("./data/X73525.fa")
print(gene_finder(dna))
#doctest.testmod()
# doctest.run_docstring_examples(coding_strand_to_AA, globals())
2 changes: 1 addition & 1 deletion load.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,4 @@ def load_metagenome():
tuples. The sequence is represented as an uppercase
string of nucleotides
"""
return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna')
return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna')