Skip to content

Commit

Permalink
fixes #12
Browse files Browse the repository at this point in the history
  • Loading branch information
ArtPoon committed Jun 25, 2019
1 parent 6a1415b commit 66a078c
Showing 1 changed file with 53 additions and 62 deletions.
115 changes: 53 additions & 62 deletions poplars/riplike.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,46 +15,41 @@
with open(ref_seq) as handle:
reference = convert_fasta(handle)

def pdistance(seq1, seq2):
"""
Calculate p-distance between two aligned sequences
:param seq1: First sequence
:param seq2: Second sequence
:return: <ndiff> is the number of differences, <denom> is the number of valid positions

def hamming(fasta):
"""
denom = 0. # number of valid columns
ndiff = 0
for i, nt1 in enumerate(seq1):
nt2 = seq2[i]
if nt1 == '-' or nt2 == '-':
continue
denom += 1
if nt1 != nt2:
ndiff += 1
return ndiff, denom


def bootstrap(s1, s2, reps=100):
Convert list of lists into boolean outcomes (difference between query and reference)
:param fasta: object returned by align()
:return: dictionary of boolean lists keyed by reference label
"""
Sample positions at random with replacement.
:param s1: first sequence
:param s2: second sequence (must be aligned to s1)
:param reps: number of replicates to generate
:yield: tuples of sequences generated by bootstrap resampling
"""
seqlen = len(s1)
assert len(s2)==seqlen, "s1 and s2 must be of same length in bootstrap()"

for rep in range(reps):
aln = dict(fasta)
assert "query" in aln, "Argument <fasta> must contain 'query' entry"
query = aln.pop('query')
_ = aln.pop('CON_OF_CONS')

# iterate over remaining sequences as references
results = {}
for h, s in aln.items():
result = []
bootstrap = [random.randint(0, seqlen-1) for _ in range(seqlen)]
b1 = ''.join([s1[i] for i in bootstrap])
b2 = ''.join([s2[i] for i in bootstrap])
yield b1, b2
for i, nt1 in enumerate(query):
nt2 = s[i]
if nt1 == '-' or nt2 == '-':
result.append(None)
continue
result.append(int(nt1 != nt2))
results.update({h: result})

return results



def update_alignment(seq):
"""
Append query sequence <seq> to reference alignment and remove insertions relative to
global consensus sequence.
:param seq:
:return: a list of [header, sequence] lists
"""
# append query sequence to reference alignment
fasta = align(seq, reference)

Expand All @@ -81,32 +76,33 @@ def riplike(seq, window=400, step=5, nrep=100):
:param window: width of sliding window in nucleotides
:param step: step size of sliding window in nucleotides
:param nrep: number of replicates for nonparametric bootstrap sampling
:return: list of result dictionaries in order of window position
"""

results = []

fasta = update_alignment(seq)
query = dict(fasta)['query'] # aligned query
seqlen = len(query)

ham = hamming(fasta)

for left in range(0, seqlen-window, step):
best_p, second_p = 1., 1. # maximum p-distance
best_ref, second_ref = None, None
best_seq = ''

# cut slice from query sequence for this window
q1 = query[left:(left+window)]

best_seq = []

# iterate over reference genomes
for h, s in fasta:
if h=='query' or h=='CON_OF_CONS':
for h, s in ham.items():
if h == 'query' or h == 'CON_OF_CONS':
continue

# slice window segment from reference
s1 = s[left:(left+window)]
s2 = [x for x in s1 if x is not None]

# calculate p-distance
ndiff, denom = pdistance(s1, q1)
ndiff = sum(s2)
denom = len(s2)
if denom == 0:
# no overlap! TODO: require minimum overlap?
continue
Expand All @@ -115,30 +111,25 @@ def riplike(seq, window=400, step=5, nrep=100):
if pd < best_p:
# query is closer to this reference
second_p = best_p; second_ref = best_ref
best_p = pd; best_ref = h; best_seq = s1
best_p = pd; best_ref = h; best_seq = s2
elif pd < second_p:
# replace second best
second_p = pd; second_ref = h

if best_ref is None:
outfile.write('{},{},None,,None,,\n'.format(header, left))
continue

result = {'left': left, 'best_ref': best_ref, 'best_p': best_p,
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
result = {'left': left, 'best_ref': best_ref, 'best_p': best_p,
'second_ref': second_ref, 'second_p': None if second_ref is None else second_p}
# print(result)

quant = None
if second_ref is not None:
if second_ref is not None and nrep > 0:
# use nonparametric bootstrap to determine significance
boot_dist = []
for bs, bq in bootstrap(best_seq, q1, reps=nrep):
ndiff, denom = pdistance(bs, bq)
if denom > 0:
boot_dist.append(ndiff/denom)

# how many are closer than second best?
quant = list(map(lambda x: x < second_p, boot_dist))
quant = sum(quant) / float(len(quant))
count = 0.
n = len(best_seq)
for rep in range(nrep):
boot = [best_seq[round(random.random()*(n-1))] for _ in range(n)]
if sum(boot)/len(boot) < second_p:
count += 1
quant = count / nrep

result.update({'quant': quant})
results.append(result)
Expand Down Expand Up @@ -172,7 +163,7 @@ def main():
for result in results:
args.outfile.write(
'{},{left},{best_ref},{best_p},{second_ref},{second_p},{quant}\n'
.format(header, **result)
.format(h, **result)
)

args.outfile.close()
Expand Down

0 comments on commit 66a078c

Please sign in to comment.