Skip to content

Commit

Permalink
-sequence_locator: updating test cases; working on #20, #23
Browse files Browse the repository at this point in the history
-riplike: modified bootstrapping to use random.choices() (#12)
  • Loading branch information
kwade4 committed Aug 9, 2019
1 parent 454473c commit d052fda
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 94 deletions.
9 changes: 4 additions & 5 deletions poplars/riplike.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# TODO: consistent reference coordinates across outputs

import sys
import subprocess
import random
import argparse
import os
Expand Down Expand Up @@ -50,7 +48,7 @@ def update_alignment(seq, reference):
# eliminate insertions in query relative to references
try:
conseq = dict(fasta)['CON_OF_CONS']
except:
except KeyError:
print("ERROR: reference alignment in poplars.riplike does not contain CON_OF_CONS entry")
raise

Expand Down Expand Up @@ -135,9 +133,10 @@ def riplike(seq, reference, window=400, step=5, nrep=100):
# use nonparametric bootstrap to determine significance
count = 0.
n = len(best_seq)
sample = random.choices(best_seq, k=n*nrep)
for rep in range(nrep):
boot = [best_seq[round(random.random() * (n - 1))] for _ in range(n)]
if sum(boot) / len(boot) < second_p:
boot = sample[rep: rep + n]
if sum(boot) / n < second_p:
count += 1
quant = count / nrep

Expand Down
19 changes: 10 additions & 9 deletions poplars/sequence_locator.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,16 +166,17 @@ def global_to_local_index(self, coord_pair, base):
local_pair = [start_offset, end_offset]
return local_pair

def local_to_global_index(self, local_pair, base):
def local_to_global_index(self, region, coord_pair, base):
"""
Converts a pair of local indices (relative to the region of interest) to global indices
"""
start = local_pair[0] + self.get_local_coords(base)[0] - 1
end = self.get_local_coords(base)[0] + local_pair[1] - 1
global_pair = [start, end]

global_start = region.get_global_coords(base)[0] + coord_pair[0] - 1
global_end = region.get_local_coords(base)[0] + coord_pair[1] - 1
global_pair = [global_start, global_end]
return global_pair

def get_overlap(self, coord_pair, base):
def get_overlap(self, region, coord_pair, base):
"""
Gets the sequence regions that overlap with the region of interest
:param coord_pair: the coordinates
Expand All @@ -189,9 +190,9 @@ def get_overlap(self, coord_pair, base):
start = max(local_pair[0], 0)
end = min(local_pair[1] + 1, len(seq))
if base == 'nucl':
overlap = (seq[start: end], self.local_to_global_index([start, end], base))
overlap = (seq[start: end], self.local_to_global_index(region, [start, end], base))
else:
overlap = (seq[start - 1: end - 1], self.local_to_global_index([start, end - 1], base))
overlap = (seq[start - 1: end - 1], self.local_to_global_index(region, [start, end - 1], base))
return overlap


Expand Down Expand Up @@ -454,7 +455,7 @@ def find_matches(virus, base, ref_regions, match_coordinates):

for ref_region in ref_regions:
if ref_region.region_name != "Complete":
ov_seq, ov_coord = ref_region.get_overlap([start_aln, end_aln], base)
ov_seq, ov_coord = ref_region.get_overlap(ref_region, [start_aln, end_aln], base)

if ov_seq:
query_region = GenomeRegion(ref_region.region_name)
Expand Down Expand Up @@ -634,7 +635,7 @@ def retrieve(virus, base, ref_regions, region, outfile=None, qstart=1, qend='end

# Set local and global coordinates
query_region.set_local_coords([qstart, qend], base)
global_coords = query_region.local_to_global_index([qstart, qend], base)
global_coords = query_region.local_to_global_index(ref_region, [qstart, qend], base)
query_region.set_global_coords(global_coords, base)

# Set sequences protein and nucleotide sequences
Expand Down
144 changes: 64 additions & 80 deletions poplars/tests/test_sequence_locator.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ def testBeyondEnd(self):
self.assertEqual(expected_seq, result_seq)


class TestHandleArgs(unittest.TestCase):
class TestHandleArgs(InputTestCase):

maxDiff = None

Expand Down Expand Up @@ -862,86 +862,70 @@ def testDefaultSIVProt(self):
result_ref_aa_seq = result[2][2:5]
self.assertEqual(expected_ref_aa_seq, result_ref_aa_seq)

def testInputNuclProtHIV(self):
"""
Tests the scenario when the user selects HIV, specifies the reference sequences,
and selects nucleotide alignment
"""

result = handle_args('hiv', 'nucl', self.hiv_nt_seq_path, self.hiv_global_ncoords_path,
self.hiv_aa_seq_path, self.hiv_global_pcoords_path)

expected_ref_nt = [
['K03455|HIVHXB2CG',
'TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAG'
'GGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACC'
'CTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGT'
'ACTTCAAGAACTGCTGACATCGAGCTTGCTACAAG']]
result_ref_nt = result[0][0][1]
self.assertEqual(expected_ref_nt, result_ref_nt)

expected_ref_prot = [
['Gag|HIVHXB2CG', 'MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQTGSEELRSLYNTVATLYCV'
'HQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALS'
'EGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKR'
'WIILGLNKIVRMYSPTSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTACQG'
'VGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEGHQMKDCTERQANFLGKIWPS'
'YKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLRSLFGNDPSSQ'],
['Matrix|HIVHXB2CG', 'MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQTGSEELRSLYNTVATL'
'YCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNY'],
['Capsid|HIVHXB2CG', 'PIVQNIQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRVHP'
'VHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPTSILDIRQGPKEPFRDYVDRFYK'
'TLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTACQGVGGPGHKARVL']]
result_ref_prot = result[1]
self.assertEqual(expected_ref_prot, result_ref_prot)

def testInputAllSIV(self):
"""
Tests the scenario when the user selects SIV, specifies the reference sequences, and selects protein alignment
"""
result = handle_args('siv', 'prot', None, None, self.siv_aa_seq_path, self.siv_global_pcoords_path)

# def testInputNuclProtHIV(self):
# """
# Tests the scenario when the user selects HIV, specifies the reference sequences,
# and selects nucleotide alignment
# """
# result = handle_args('hiv', 'nucl', self.nucl_query, revcomp='n',
# ref_nt=TEST_HIV_GENOME, ref_aa=TEST_HIV_PROTS)
#
# expected_query = [
# ['query', 'GACTCGAAAGCGAAAGTTCCAGAGAAGTTCTCTCGACGCAGGACTCGGCTTGCTGAGGTGCACACAGCAAGAGGCGAGAGCGGCGACTGGTGAGTA'
# 'CGCCAAATTTTGACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGTGGGGGAAAATTAGATGCATGGGAAAAAATTCG'
# 'GTTACGGCCAGGGGGAAAGAAAAAATATAGAATGAAACATTTAGTATGGGCAAGCAGAGAGTTAGAAAGATTCGCACTTAACCC']]
# result_query = result[0]
# self.assertEqual(expected_query, result_query)
#
# expected_ref_nt = [
# ['K03455|HIVHXB2CG',
# 'TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAG'
# 'GGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACC'
# 'CTGTGAGCCTGCATGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGT'
# 'ACTTCAAGAACTGCTGACATCGAGCTTGCTACAAG']]
# result_ref_nt = result[1]
# self.assertEqual(expected_ref_nt, result_ref_nt)
#
# expected_ref_prot = [
# ['Gag|HIVHXB2CG', 'MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQTGSEELRSLYNTVATLYCV'
# 'HQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALS'
# 'EGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKR'
# 'WIILGLNKIVRMYSPTSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTACQG'
# 'VGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEGHQMKDCTERQANFLGKIWPS'
# 'YKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLRSLFGNDPSSQ'],
# ['Matrix|HIVHXB2CG', 'MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQTGSEELRSLYNTVATL'
# 'YCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNY'],
# ['Capsid|HIVHXB2CG', 'PIVQNIQGQMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAAEWDRVHP'
# 'VHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPTSILDIRQGPKEPFRDYVDRFYK'
# 'TLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTACQGVGGPGHKARVL']]
# result_ref_prot = result[2]
# self.assertEqual(expected_ref_prot, result_ref_prot)
#
# def testInputAllSIV(self):
# """
# Tests the scenario when the user selects SIV, specifies the reference sequences, and selects protein alignment
# """
# result = handle_args('siv', 'prot', self.prot_query, revcomp='n',
# ref_nt=TEST_SIV_GENOME, ref_aa=TEST_SIV_PROTS)
#
# expected_query = [
# ['query', 'MAYTQTATTSALLDTVRGNNSLVNDLAKRRLYDTAVEEFNARDRRPKVNFSKVISEEQTLIATRAYPEFQITFYNTQNAVHSLAGGLRSLELEYLM'
# 'MQIPYGSLTYDIGGNFASHLFKGRAYVHCCMPNLDVRDIMRHEGQKDSIELYLSRLERGGKTVPNFQKEAFDRYAEIPEDAVCHNTFQTMRHQPMQ'
# 'QSGRVYAIALHSIYDIPADEFGAALLRKNVHTCYAAFHFSENLLLEDSYVNLDEINACFSRDGDKLTFSFASESTLNYCHSYSNILKYVCKTYFPA'
# 'SNREVYMKEFLV']]
# result_query = result[0]
# self.assertEqual(expected_query, result_query)
#
# expected_ref_nt = [
# ['M33262|SIVMM239', 'GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAG'
# 'GCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATAC'
# 'CCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTG'
# 'GAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGA'
# 'TTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAAT']]
# result_ref_nt = result[1]
# self.assertEqual(expected_ref_nt, result_ref_nt)
#
# expected_ref_prot = [
# ['RNase|SIVMM239', 'YTDGSCNKQSKEGKAGYITDRGKDKVKVLEQTTNQQAELEAFLMALTDSGPKANIIVDSQYVMGIITGCPTESESRLVNQIIEEMIK'
# 'KSEIYVAWVPAHKGIGGNQEIDHLVSQGIRQVL'],
# ['Integrase|SIVMM239', 'FLEKIEPAQEEHDKYHSNVKELVFKFGLPRIVARQIVDTCDKCHQKGEAIHGQANSDLGTWQMDCTHLEGKIIIVAVHVASGF'
# 'IEAEVIPQETGRQTALFLLKLAGRWPITHLHTDNGANFASQEVKMVAWWAGIEHTFGVPYNPQSQGVVEAMNHHLKNQIDRIR'
# 'EQANSVETIVLMAVHCMNFKRRGGIGDMTPAERLINMITTEQEIQFQQSKNSKFKNFRVYYREGRDQLWKGPGELLWKGEGAV'
# 'ILKVGTDIKVVPRRKAKIIKDYGGGKEVDSSSHMEDTGEAREVA'],
# ['Vif|SIVMM239', 'MEEEKRWIAVPTWRIPERLERWHSLIKYLKYKTKDLQKVCYVPHFKVGWAWWTCSRVIFPLQEGSHLEVQGYWHLTPEKGWLSTYAVRI'
# 'TWYSKNFWTDVTPNYADILLHSTYFPCFTAGEVRRAIRGEQLLSCCRFPRAHKYQVPSLQYLALKVVSDVRSQGENPTWKQWRRDNRRG'
# 'LRMAKQNSRGDKQRGGKPPTKGANFPGLAKVLGILA'],
# ['Vpx|SIVMM239', 'MSDPRERIPPGNSGEETIGEAFEWLNRTVEEINREAVNHLPRELIFQVWQRSWEYWHDEQGMSPSYVKYRYLCLIQKALFMHCKKGCRC'
# 'LGEGHGAGGWRPGPPPPPPPGLA'],
# ['Vpr|SIVMM239', 'MEERPPENEGPQREPWDEWVVEVLEELKEEALKHFDPRLLTALGNHIYNRHGDTLEGAGELIRILQRALFMHFRGGCIHSRIGQPGGGN'
# 'PLSAIPPSRSML']]
# result_ref_prot = result[2]
# self.assertEqual(expected_ref_prot, result_ref_prot)
expected_ref_nt = [
['M33262|SIVMM239', 'GCATGCACATTTTAAAGGCTTTTGCTAAATATAGCCAAAAGTCCTTCTACAAATTTTCTAAGAGTTCTGATTCAAAGCAGTAACAG'
'GCCTTGTCTCATCATGAACTTTGGCATTTCATCTACAGCTAAGTTTATATCATAAATAGTTCTTTACAGGCAGCACCAACTTATAC'
'CCTTATAGCATACTTTACTGTGTGAAAATTGCATCTTTCATTAAGCTTACTGTAAATTTACTGGCTGTCTTCCTTGCAGGTTTCTG'
'GAAGGGATTTATTACAGTGCAAGAAGACATAGAATCTTAGACATATACTTAGAAAAGGAAGAAGGCATCATACCAGATTGGCAGGA'
'TTACACCTCAGGACCAGGAATTAGATACCCAAAGACATTTGGCTGGCTATGGAAAT']]
result_ref_nt = result[0]
self.assertEqual(expected_ref_nt, result_ref_nt)

expected_ref_prot = [
['RNase|SIVMM239', 'YTDGSCNKQSKEGKAGYITDRGKDKVKVLEQTTNQQAELEAFLMALTDSGPKANIIVDSQYVMGIITGCPTESESRLVNQIIEEMIK'
'KSEIYVAWVPAHKGIGGNQEIDHLVSQGIRQVL'],
['Integrase|SIVMM239', 'FLEKIEPAQEEHDKYHSNVKELVFKFGLPRIVARQIVDTCDKCHQKGEAIHGQANSDLGTWQMDCTHLEGKIIIVAVHVASGF'
'IEAEVIPQETGRQTALFLLKLAGRWPITHLHTDNGANFASQEVKMVAWWAGIEHTFGVPYNPQSQGVVEAMNHHLKNQIDRIR'
'EQANSVETIVLMAVHCMNFKRRGGIGDMTPAERLINMITTEQEIQFQQSKNSKFKNFRVYYREGRDQLWKGPGELLWKGEGAV'
'ILKVGTDIKVVPRRKAKIIKDYGGGKEVDSSSHMEDTGEAREVA'],
['Vif|SIVMM239', 'MEEEKRWIAVPTWRIPERLERWHSLIKYLKYKTKDLQKVCYVPHFKVGWAWWTCSRVIFPLQEGSHLEVQGYWHLTPEKGWLSTYAVRI'
'TWYSKNFWTDVTPNYADILLHSTYFPCFTAGEVRRAIRGEQLLSCCRFPRAHKYQVPSLQYLALKVVSDVRSQGENPTWKQWRRDNRRG'
'LRMAKQNSRGDKQRGGKPPTKGANFPGLAKVLGILA'],
['Vpx|SIVMM239', 'MSDPRERIPPGNSGEETIGEAFEWLNRTVEEINREAVNHLPRELIFQVWQRSWEYWHDEQGMSPSYVKYRYLCLIQKALFMHCKKGCRC'
'LGEGHGAGGWRPGPPPPPPPGLA'],
['Vpr|SIVMM239', 'MEERPPENEGPQREPWDEWVVEVLEELKEEALKHFDPRLLTALGNHIYNRHGDTLEGAGELIRILQRALFMHFRGGCIHSRIGQPGGGN'
'PLSAIPPSRSML']]
result_ref_prot = result[1]
self.assertEqual(expected_ref_prot, result_ref_prot)


if __name__ == '__main__':
Expand Down

0 comments on commit d052fda

Please sign in to comment.