Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 165 additions & 28 deletions gene_finder.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# -*- coding: utf-8 -*-
"""
YOUR HEADER COMMENT HERE
Finds amino acid sequences coded by DNA input

@author: YOUR NAME HERE
@author: Vivien Chen

"""

import random
from amino_acids import aa, codons, aa_table # you may find these useful
from load import load_seq
dna = load_seq("./data/X73525.fa")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you usually want to separate any function calls from these import statements. It would make more sense to put it under if name == "main":

import doctest
from pickle import dump, load

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow pickle toolbox! great job



def shuffle_string(s):
Expand All @@ -25,13 +28,31 @@ def get_complement(nucleotide):

nucleotide: a nucleotide (A, C, G, or T) represented as a string
returns: the complementary nucleotide

I added two more doctests because each complementary nucleotide is in
its own if/else if branch. If one branch doesn't work and there is no
doctest for that branch, the mistake will not be caught. In fact, I
found out that I spelled nucleotide wrong in the G branch.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great practice that you are adding your own tests


>>> get_complement('A')
'T'
>>> get_complement('C')
'G'
>>> get_complement('G')
'C'
>>> get_complement('T')
'A'
"""
# TODO: implement this
pass
if nucleotide == 'A':
return 'T'
elif nucleotide == 'C':
return 'G'
elif nucleotide == 'G':
return 'C'
elif nucleotide == 'T':
return 'A'

#doctest.run_docstring_examples(get_complement, globals(), verbose=True)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove comments in your final upload



def get_reverse_complement(dna):
Expand All @@ -40,13 +61,27 @@ def get_reverse_complement(dna):

dna: a DNA sequence represented as a string
returns: the reverse complementary DNA sequence represented as a string

I did not add any more doctests because each string contains all the
nucleotides, so two doctests with random strings is sufficient.

>>> get_reverse_complement("ATGCCCGCTTT")
'AAAGCGGGCAT'
>>> get_reverse_complement("CCGCGTTCA")
'TGAACGCGG'
"""
# TODO: implement this
pass
reversed_dna = dna[::-1]
comp_list = []

for nuc in range(0, len(reversed_dna)):
comp_nuc = get_complement(reversed_dna[nuc])
comp_list.append(comp_nuc)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanna get fancy with just one line of code :
return ''.join([get_complement(reversed_dna[nuc]) for nuc in dna[::-1]])

comp_str = ''.join(comp_list)

return comp_str

#doctest.run_docstring_examples(get_reverse_complement, globals(), verbose=True)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another comment to remove



def rest_of_ORF(dna):
Expand All @@ -57,13 +92,29 @@ def rest_of_ORF(dna):

dna: a DNA sequence
returns: the open reading frame represented as a string

I added two more doctests: one in which the frame stop codon is TAA and
another in which there is no frame stop codon. This way, all the
branches are tested for error.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

>>> rest_of_ORF("ATGTGAA")
'ATG'
>>> rest_of_ORF("ATGAGATAGG")
'ATGAGA'
>>> rest_of_ORF("ATGGAATAAGTA")
'ATGGAA'
>>> rest_of_ORF("ATGAGAGA")
'ATGAGAGA'
"""
# TODO: implement this
pass
stop_codons = ['TAA', 'TAG', 'TGA']

for i in range(3, len(dna)-2, 3):
if dna[i:i+3] in stop_codons:
return dna[:i]

return dna

#doctest.run_docstring_examples(rest_of_ORF, globals(), verbose=True)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

another comment to remove



def find_all_ORFs_oneframe(dna):
Expand All @@ -76,11 +127,30 @@ def find_all_ORFs_oneframe(dna):

dna: a DNA sequence
returns: a list of non-nested ORFs

I added another doctest that includes three nested ATGs in the default
frame of the sequence in order to make sure that the function does not
return nested ORFs in that frame.

>>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
['ATGCATGAATGTAGA', 'ATGTGCCC']
>>> find_all_ORFs_oneframe('ATGATGATGTAAC')
['ATGATGATG']
"""
# TODO: implement this
pass
all_ORFs = []
stopped = True
stop_codons = ['TAA', 'TAG', 'TGA']

for i in range(0, len(dna)-2, 3):
if dna[i:i+3] == 'ATG' and stopped:
all_ORFs.append(rest_of_ORF(dna[i:]))
stopped = False
elif dna[i:i+3] in stop_codons:
stopped = True

return all_ORFs

#doctest.run_docstring_examples(find_all_ORFs_oneframe, globals(), verbose=True)


def find_all_ORFs(dna):
Expand All @@ -93,11 +163,21 @@ def find_all_ORFs(dna):
dna: a DNA sequence
returns: a list of non-nested ORFs

I did not add any more doctests because this function depends on the
previous functions, so assuming the doctests of those were fruitful,
one doctest is sufficient for this function. The doctest includes one
ORF in each frame, so the function works if the doctest yields true.

>>> find_all_ORFs("ATGCATGAATGTAG")
['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
"""
# TODO: implement this
pass
frame1 = find_all_ORFs_oneframe(dna)
frame2 = find_all_ORFs_oneframe(dna[1:])
frame3 = find_all_ORFs_oneframe(dna[2:])

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could also use a for loop here
[find_all_ORFs_oneframe(dna[i:]) for i in range 3]

return frame1 + frame2 + frame3

#doctest.run_docstring_examples(find_all_ORFs, globals(), verbose=True)


def find_all_ORFs_both_strands(dna):
Expand All @@ -106,21 +186,40 @@ def find_all_ORFs_both_strands(dna):

dna: a DNA sequence
returns: a list of non-nested ORFs

I did not add any more doctests because this function depends on the
previous functions, so assuming the doctests of those were fruitful,
one doctest is sufficient for this function. The doctest includes one
ORF in each strand, so the function works if the doctest yields true.

>>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
['ATGCGAATG', 'ATGCTACATTCGCAT']
"""
# TODO: implement this
pass
strand1 = find_all_ORFs(dna)
strand2 = find_all_ORFs(get_reverse_complement(dna))

return strand1 + strand2

#doctest.run_docstring_examples(find_all_ORFs_both_strands, globals(), verbose=True)


def longest_ORF(dna):
""" Finds the longest ORF on both strands of the specified DNA and returns it
as a string

>>> longest_ORF("ATGCGAATGTAGCATCAAA")
'ATGCTACATTCGCAT'
"""
# TODO: implement this
pass
all_ORFs = find_all_ORFs_both_strands(dna)
humongest = 0

for i in range(0, len(all_ORFs)-1):
if len(all_ORFs[i]) < len(all_ORFs[i+1]):
humongest = all_ORFs[i+1]

return humongest

#doctest.run_docstring_examples(longest_ORF, globals(), verbose=True)


def longest_ORF_noncoding(dna, num_trials):
Expand All @@ -130,8 +229,18 @@ def longest_ORF_noncoding(dna, num_trials):
dna: a DNA sequence
num_trials: the number of random shuffles
returns: the maximum length longest ORF """
# TODO: implement this
pass
dna_shuffles = []

for i in range(0, num_trials):
dna_shuffles.append(shuffle_string(dna))

for i in range(0, len(dna_shuffles)-1):
if longest_ORF(dna_shuffles[i]) < longest_ORF(dna_shuffles[i+1]):
humongest = longest_ORF(dna_shuffles[i+1])

return len(humongest)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would be more memory efficient to just save the max_length of longest_ORF than to save the actual longest_ORF

#print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 100)


def coding_strand_to_AA(dna):
Expand All @@ -143,13 +252,22 @@ def coding_strand_to_AA(dna):
returns: a string containing the sequence of amino acids encoded by the
the input DNA fragment

>>> coding_strand_to_AA("ATGCGA")
'MR'
>>> coding_strand_to_AA("ATGCCCGCTTT")
'MPA'
>>> coding_strand_to_AA("ATGCGA")
'MR'
>>> coding_strand_to_AA("ATGCCCGCTTT")
'MPA'
"""
# TODO: implement this
pass
amino_acids = []

for i in range(0, len(dna)-2, 3):
amino_acid = aa_table[dna[i:i+3]]
amino_acids.append(amino_acid)

amino_acids_str = ''.join(amino_acids)

return amino_acids_str

#doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)


def gene_finder(dna):
Expand All @@ -158,9 +276,28 @@ def gene_finder(dna):
dna: a DNA sequence
returns: a list of all amino acid sequences coded by the sequence dna.
"""
# TODO: implement this
pass
threshold = longest_ORF_noncoding(dna, 1500)
all_ORFs = find_all_ORFs_both_strands(dna)
amino_acids = []

for i in range(0, len(all_ORFs)):
if len(all_ORFs[i]) > threshold:
amino_acid = coding_strand_to_AA(all_ORFs[i])
amino_acids.append(amino_acid)

return amino_acids


if __name__ == "__main__":
import doctest
doctest.testmod()
#import doctest
#doctest.testmod()

genes = gene_finder(dna)

#write to genes.txt file
file_ = open('genes.txt', 'wb')
dump(genes, file_)

#read from genes.txt file
file_ = open('genes.txt', 'rb+')
print(load(file_))
18 changes: 18 additions & 0 deletions genes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
(lp0
S'MSLRVRQIDRREWLLAQTATECQRHGREATLEYPTRQGMWVRLSDAEKRWSAWIKPGDWLEHVSPALAGAAVSAGAEHLVVPWLAATERPFELPVPHLSCRRLCVENPVPGSALPEGKLLHIMSDRGGLWFEHLPELPAVGGGRPKMLRWPLRFVIGSSDTQRSLLGRIGIGDVLLIRTSRAEVYCYAKKLGHFNRVEGGIIVETLDIQHIEEENNTTETAETLPGLNQLPVKLEFVLYRKNVTLAELEAMGQQQLLSLPTNAELNVEIMANGVLLGNGELVQMNDTLGVEIHEWLSESGNGE'
p1
aS'MSSNKTEKPTKKRLEDSAKKGQSFKSKDLIIACLTLGGIAYLVSYGSFNEFMGIIKIIIADNFDQSMADYSLAVFGIGLKYLIPFMLLCLVCSALPALLQAGFVLATEALKPNLSALNPVEGAKKLFSMRTVKDTVKTLLYLSSFVVAAIICWKKYKVEIFSQLNGNIVGIAVIWRELLLALVLTCLACALIVLLLDAIAEYFLTMKDMKMDKEEVKREMKEQEGNPEVKSKRREVHMEILSEQVKSDIENSRLIVANPTHITIGIYFKPELMPIPMISVYETNQRALAVRAYAEKVGVPVIVDIKLARSLFKTHRRYDLVSLEEIDEVLRLLVWLEEVENAGKDVIQPQENEVRH'
p2
aS'MGIFASAGCGKTMLMHMLIEQTEADVFVIGLIGERGREVTEFVDMLRASHKKEKCVLVFATSDFPSVDRCNAAQLATTVAEYFRDQGKRVVLFIDSMTRYARALRDVALASGERPARRGYPASVFDNLPRLLERPGATSEGSITAFYTVLLESEEEADPMADEIRSILDGHLYLSRKLAGQGHYPAIDVLKSVSRVFGQVTTPTHAEQASAVRKLMTRLEELQLFIDLGEYRPGENIDNDRAMQMRDSLKAWLCQPVAQYSSFDDTLSGMNAFADQN'
p3
aS'MGDVSAVSSSGNILLPQQDEVGGLSEALKKAVEKHKTEYSGDKKDRDYGDAFVMHKETALPLLLAAWRHGAPAKSEHHNGNVSGLHHNGKSELRIAEKLLKVTAEKSVGLISAEAKVDKSAALLSSKNRPLESVSGKKLSADLKAVESVSEVTDNATGISDDNIKALPGDNKAIAGEGVRKEGAPLARDVAPARMAAANTGKPEDKDHKKVKDVSQLPLQPTTIADLSQLTGGDEKMPLAAQSKPMMTIFPTADGVKGEDSSLTYRFQRWGNDYSVNIQARQAGEFSLIPSNTQVEHRLHDQWQNGNPQRWHLTRDDQQNPQQQQHRQQSGEEDDA'
p4
aS'MGNDISLIALLAFSTLLPFIIASGTCFVKFSIVFVMVRNALGLQQIPSNMTLNGVALLLSMFVMWPIMHDAYVYFEDEDVTFNDISSLSKHVDEGLDGYRDYLIKYSDRELVQFFENAQLKRQYGEETETVKRDKDEIEKPSIFALLPAYALSEIKSAFKIGFYLYLPFVVVDLVVSSVLLALGMMMMSPVTISTPIKLVLFVALDGWTLLSKGLILQYMDIAT'
p5
aS'MFYALYFEIHHLVASAALGFARVAPIFFFLPFLNSGVLSGAPRNAIIILVALGVWPHALNEAPPFLSVAMIPLVLQEAAVGVMLGCLLSWPFWVMHALGCIIDNQRGATLSSSIDPANGIDTSEMANFLNMFAAVVYLQNGGLVTMVDVLNKSYQLCDPMNECTPSLPPLLTFINQVAQNALVLASPVVLVLLLSEVFLGLLSRFAPQMNAFAISLTVKSGIAVLIMLLYFSPVLPDNVLRLSFQATGLSSWFYERGATHVLE'
p6
aS'MCNNFPSGSALPGTGFSTHKRRQDKCGTGNSNGRSVAASQGTTRCSAPAETAAPARAGETCSSQSPGLIQADHRFSASLNRTHIPCRVGYSSVASRPWRWHSVAVCANSHSRRSICLTRNDIRRHPPRQIAVCAVAAADFVDRLASGASAGDYRFAIDHANDVQPAYLTVLTKTPLLAAPEY'
p7
aS'MCRRRDLSKNAAYAFQYIDCRVMSLPGQLSAQIQVTVKDRANFIRHRVRLFLAFQQYRIKGSNASLAGRPWAFQQAGQIIEYGGGITSTSRTLSRRQCHVSQSTRITGHGIDKKHDPFSLVAKIFRYGCRQLRRIAAIDRGEIGSGKNQHAFFFLMRSAQHIHEFSDLTASFTDKTDNKDIRLRLLDQHMHQHGLTASCGGKNAHSLAYATGQ'
p8
a.