From b396b876802abdebfb0db5f4f6a5622d4dfd2b5a Mon Sep 17 00:00:00 2001 From: Shreya Rangarajan Date: Thu, 21 Sep 2017 22:12:09 -0400 Subject: [PATCH 1/2] Gene Finder --- gene_finder.py | 102 +++++++++++++++++++++++++++++++++++-------------- load.py | 2 +- 2 files changed, 74 insertions(+), 30 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 3b1e7dd..71b1d05 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- """ -YOUR HEADER COMMENT HERE - -@author: YOUR NAME HERE +Created on Mon Sep 11 2017 +Gene Finder Mini Project +@author: Shreya Rangarajan """ import random from amino_acids import aa, codons, aa_table # you may find these useful from load import load_seq - def shuffle_string(s): """Shuffles the characters in the input string NOTE: this is a helper function, you do not @@ -19,7 +18,6 @@ def shuffle_string(s): # YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ### - def get_complement(nucleotide): """ Returns the complementary nucleotide @@ -30,9 +28,14 @@ def get_complement(nucleotide): >>> get_complement('C') 'G' """ - # TODO: implement this - pass - + if nucleotide == 'A': + return 'T' + if nucleotide == 'C': + return 'G' + if nucleotide == 'T': + return 'A' + if nucleotide == 'G': + return 'C' def get_reverse_complement(dna): """ Computes the reverse complementary sequence of DNA for the specfied DNA @@ -45,9 +48,11 @@ def get_reverse_complement(dna): >>> get_reverse_complement("CCGCGTTCA") 'TGAACGCGG' """ - # TODO: implement this - pass + reverse_complement = '' + for i in dna[::-1]: + reverse_complement += get_complement(i) + return reverse_complement def rest_of_ORF(dna): """ Takes a DNA sequence that is assumed to begin with a start @@ -62,9 +67,13 @@ def rest_of_ORF(dna): >>> rest_of_ORF("ATGAGATAGG") 'ATGAGA' """ - # TODO: implement this - pass - + stop_codons = ['TAA', 'TAG', 'TGA'] + for i in range(0,len(dna)-1,3): + seg = dna[i:i+3] + if seg in stop_codons: + dna = dna[0:i] + return dna + return dna def find_all_ORFs_oneframe(dna): """ Finds all non-nested open reading frames in the given DNA @@ -79,9 +88,17 @@ def find_all_ORFs_oneframe(dna): >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC") ['ATGCATGAATGTAGA', 'ATGTGCCC'] """ - # TODO: implement this - pass - + new_ORFs_oneframe_list = [] + start_codon = 'ATG' + i = 0 + while i < len(dna): + seg = dna[i:i+3] + if seg == start_codon: + new_ORFs_oneframe_list.append(rest_of_ORF(dna[i:])) + i += len(rest_of_ORF(dna[i:])) + i += 3 + + return new_ORFs_oneframe_list def find_all_ORFs(dna): """ Finds all non-nested open reading frames in the given DNA sequence in @@ -96,9 +113,11 @@ def find_all_ORFs(dna): >>> find_all_ORFs("ATGCATGAATGTAG") ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG'] """ - # TODO: implement this - pass + new_ORFs_list_all = [] + for i in range(3): + new_ORFs_list_all = new_ORFs_list_all + find_all_ORFs_oneframe(dna[i:]) + return new_ORFs_list_all def find_all_ORFs_both_strands(dna): """ Finds all non-nested open reading frames in the given DNA sequence on both @@ -109,9 +128,10 @@ def find_all_ORFs_both_strands(dna): >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA") ['ATGCGAATG', 'ATGCTACATTCGCAT'] """ - # TODO: implement this - pass + reverse_complement_DNA = get_reverse_complement(dna) + two_strands_list = find_all_ORFs(dna) + find_all_ORFs(reverse_complement_DNA) + return two_strands_list def longest_ORF(dna): """ Finds the longest ORF on both strands of the specified DNA and returns it @@ -119,8 +139,10 @@ def longest_ORF(dna): >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ - # TODO: implement this - pass + longest_ORF_string = '' + longest_ORF_string = max(find_all_ORFs_both_strands(dna)) + + return longest_ORF_string def longest_ORF_noncoding(dna, num_trials): @@ -130,9 +152,13 @@ def longest_ORF_noncoding(dna, num_trials): dna: a DNA sequence num_trials: the number of random shuffles returns: the maximum length longest ORF """ - # TODO: implement this - pass + longest_ORF_overnumtrials = [] + for i in range(num_trials): + shuffle_DNA = shuffle_string(dna) + longest_ORF_overnumtrials.append(len(longest_ORF(shuffle_DNA))) + max_of_longest_orf = max(longest_ORF_overnumtrials) + return max_of_longest_orf def coding_strand_to_AA(dna): """ Computes the Protein encoded by a sequence of DNA. This function @@ -148,9 +174,17 @@ def coding_strand_to_AA(dna): >>> coding_strand_to_AA("ATGCCCGCTTT") 'MPA' """ - # TODO: implement this - pass + amino_acids_list = '' + index = 0 + if len(dna)%3 != 0: + dna = dna[:-(len(dna)%3)] + + for index in range(0,len(dna),3): + codon = dna[index:index+3] + amino_acid_letter = aa_table[codon] + amino_acids_list = amino_acids_list + amino_acid_letter + return amino_acids_list def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna @@ -158,9 +192,19 @@ def gene_finder(dna): dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ - # TODO: implement this - pass + threshold = longest_ORF_noncoding(dna,1500) + all_ORFs_bothstrands = [] + aa_sequence = [] + + all_ORFs = find_all_ORFs_both_strands(dna) + for orfs in all_ORFs: + if len(orfs) > threshold: + aa_sequence.append(coding_strand_to_AA(orfs)) + return aa_sequence if __name__ == "__main__": import doctest - doctest.testmod() + dna = load_seq("./data/X73525.fa") + print(gene_finder(dna)) + #doctest.testmod() + # doctest.run_docstring_examples(coding_strand_to_AA, globals()) diff --git a/load.py b/load.py index eb8f44d..b7a355c 100644 --- a/load.py +++ b/load.py @@ -98,4 +98,4 @@ def load_metagenome(): tuples. The sequence is represented as an uppercase string of nucleotides """ - return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna') \ No newline at end of file + return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna') From 265e05d379b53e0b25acfbff17a14b31a0f3fecf Mon Sep 17 00:00:00 2001 From: Shreya Rangarajan Date: Sun, 24 Sep 2017 18:38:25 -0400 Subject: [PATCH 2/2] Gene Finder Final Submission --- gene_finder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gene_finder.py b/gene_finder.py index 71b1d05..93d5bbc 100644 --- a/gene_finder.py +++ b/gene_finder.py @@ -139,7 +139,6 @@ def longest_ORF(dna): >>> longest_ORF("ATGCGAATGTAGCATCAAA") 'ATGCTACATTCGCAT' """ - longest_ORF_string = '' longest_ORF_string = max(find_all_ORFs_both_strands(dna)) return longest_ORF_string @@ -153,7 +152,7 @@ def longest_ORF_noncoding(dna, num_trials): num_trials: the number of random shuffles returns: the maximum length longest ORF """ longest_ORF_overnumtrials = [] - for i in range(num_trials): + for _ in range(num_trials): shuffle_DNA = shuffle_string(dna) longest_ORF_overnumtrials.append(len(longest_ORF(shuffle_DNA))) max_of_longest_orf = max(longest_ORF_overnumtrials)