-
Notifications
You must be signed in to change notification settings - Fork 1
/
new_mutation.py
executable file
·331 lines (281 loc) · 9.13 KB
/
new_mutation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# This program answers the question "For two mutations close enough together
# that they might be in the same read, how often are they actually both
# present on a read spanning both their positions?" Its input is a filtered
# Strelka output file and an indexed BAM and BAI file for a single sample.
# NOTE: BAM is zero based. VCF is one based. Subtract 1 from your VCF
# mutation position before looking it up in BAM!
# WARNING: We wrote this code against a BAM file where paired reads had the
# same name, but this is not standardized: it will NOT WORK RIGHT if paired
# reads append /1, /2 or _1, _2 or _a, _b or anything like that!
import pysam
strelkafile = "/home/mkkuhner/seagate/data/wgs_dna/filtered_strelka/163-23740-14O-37--163-25059N-78R-BLD.snv.strelka.filtered.vcf"
samfile = pysam.AlignmentFile("163-23740-14O-37.final.bam","rb",
index_filename="163-23740-14O-37.final.bai",require_index=True)
read_cutoff = 1
# how many non ref/alt bases can we tolerate at a mutation pair
# before throwing it out?
max_unexpected_bases = 5
# lowest acceptable mapping quality for a read
min_quality = 25
# functions
def tier1(entry):
entry = entry.split(",")
entry = int(entry[0])
return entry
def tier1_cutoff(entry, cutoff):
val = tier1(entry)
assert val >= 0
if val >= cutoff: return val
else: return 0
def hasbin(result,bin):
if result[bin] > 1:
return True
return False
def score_read(base1,base2,mut1,mut2,scorearray):
chr1,pos1,ref1,alt1,vaf1 = mut1
chr2,pos2,ref2,alt2,vaf2 = mut2
# score the mutation
if base1 == ref1 and base2 == ref2:
scorearray[0] += 1
if base1 == ref1 and base2 == alt2:
scorearray[1] += 1
if base1 == alt1 and base2 == ref2:
scorearray[2] += 1
if base1 == alt1 and base2 == alt2:
scorearray[3] += 1
scorearray[4] += 1
def summarize(pairlist):
# taxonomy:
# less than 12 total reads: toosmall
# just bin0: wt
# just bin1 or just bin2: missed
# all three bins: fourgamete
# bins 1 and 2 but not 3: trans
# bin 3 alone: cis
# bin 3 with bin 1 or 2 but not both: nested
results = {}
results["empty"] = 0
results["anomaly"] = 0
results["toosmall"] = 0
results["wt"] = 0
results["missed"] = 0
results["fourgamete"] = 0
results["cis"] = 0
results["trans"] = 0
results["nested"] = 0
for tally in pairlist:
# empty
if tally[4] == 0:
results["empty"] += 1
continue
# anomaly
if tally[4] - max_unexpected_bases >= sum(tally[0:4]):
results["anomaly"] += 1
continue
# toosmall
if tally[4] < 12:
results["toosmall"] += 1
continue
# wt
if not hasbin(tally,1) and not hasbin(tally,2) and not hasbin(tally,3):
results["wt"] += 1
continue
# missed
if hasbin(tally,1) and not hasbin(tally,2) and not hasbin(tally,3):
results["missed"] += 1
continue
if not hasbin(tally,1) and hasbin(tally,2) and not hasbin(tally,3):
results["missed"] += 1
continue
# fourgamete
if hasbin(tally,1) and hasbin(tally,2) and hasbin(tally,3):
results["fourgamete"] += 1
continue
# trans
if hasbin(tally,1) and hasbin(tally,2) and not hasbin(tally,3):
results["trans"] += 1
continue
# cis
if not hasbin(tally,1) and not hasbin(tally,2) and hasbin(tally,3):
results["cis"] += 1
continue
# nested
if (hasbin(tally,1) or hasbin(tally,2)) and hasbin(tally,3):
results["nested"] += 1
continue
print("found anomaly:",tally)
assert False
return results
########################################################################
# main program
# parse the vcf to find eligible mutation pairs
prevch = 0
prevpos = -500
mutpairs = []
mut1 = None
mut2 = None
for line in open(strelkafile,"r"):
if line.startswith("#"): continue # skip headers
# find a mutation
chr,pos,id,ref,alt,qual,filter,info,format,normal,tumor = line.rstrip().split()
pos = int(pos)
# compute its VAF
# parse the NORMAL and TUMOR fields to get raw read counts
trimmed_normal = []
trimmed_tumor = []
norm = normal.split(":")[4:]
for entry in norm:
trimmed_normal.append(tier1_cutoff(entry,read_cutoff))
tumor = tumor.split(":")[4:]
for entry in tumor:
trimmed_tumor.append(tier1_cutoff(entry,read_cutoff))
assert sum(trimmed_normal) > 0
assert sum(trimmed_tumor) > 0
# compute VAF based on trimmed read-counts
numer = 0.0
denom = 0.0
for norm, tum in zip(trimmed_normal, trimmed_tumor):
if norm == 0 and tum != 0:
numer += tum
denom += tum
vaf = numer/denom
if vaf > 0.5: continue
# pair the mutations
muttype = info.split(";")[0]
if muttype != "NT=ref": continue # heterozygous sites are too confusing for now
if chr == "X": continue # no X chromosomes please
if chr != prevch: # new chromosome, so no pair
prevch = chr
prevpos = pos
mut1 = [chr,pos,ref,alt,vaf]
mut2 = None
continue
if pos - prevpos > 500: # too far away, so no pair
prevpos = pos
mut1 = [chr,pos,ref,alt,vaf]
mut2 = None
continue
# if we have a previous mutation in hand, this is a pair!
if mut1 != None:
mut2 = [chr,pos,ref,alt,vaf]
mutpairs.append([mut1,mut2])
mut1 = mut2 = None
prevpos = -500
else: # we already used up mut1 in a different pair, so we skip
mut1 = [chr,pos,ref,alt,vaf]
mut2 = None
prevpos = pos
##########################################################################3
# test mutation pairs in bam file
pairresults = []
pairresults_same = []
for mut1,mut2 in mutpairs:
print("trying a mut-pair")
chr1,pos1,ref1,alt1,vaf1 = mut1
chr2,pos2,ref2,alt2,vaf2 = mut2
# correct for zero versus one based
pos1 -= 1
pos2 -= 1
assert chr1 == chr2
myresult = [0,0,0,0,0]
myresult_sameonly = [0,0,0,0,0]
readcount = 0 #DEBUG
# pull reads for mutation position 1 into a list
reads1 = samfile.fetch(chr1,pos1,pos1+1)
reads2 = samfile.fetch(chr2,pos2,pos2+1)
for read1 in reads1:
print("trying a read")
if read1.mapping_quality < min_quality:
continue # a bad read
readcount += 1 #DEBUG
# find out which, if any, mutation positions are present
aligned_pairs = read1.get_aligned_pairs(matches_only=True)
found1 = False
found2 = False
for mypair in aligned_pairs:
if mypair[1] == pos1:
found1 = True
base1 = read1.query_sequence[mypair[0]]
if mypair[1] == pos2:
found2 = True
base2 = read1.query_sequence[mypair[0]]
print(readcount,found1,found2)
# mutation position 1 is missing: skip this read
if not found1:
print("bail on not found1")
# both mutation positions 1 and 2 are present: same-end score
elif found1 and found2:
score_read(base1,base2,mut1,mut2,myresult_sameonly)
score_read(base1,base2,mut1,mut2,myresult)
print("score and bail")
# only mutation position 1 is present: find the mate and try an
# opposite-end score
elif found1 and not found2:
print("trying mate-pair logic")
name1 = read1.query_name
for read2 in reads2:
if read2.mapping_quality < min_quality:
continue
found2 = False
name2 = read2.query_name
if name1 != name2:
continue
if (read1.is_read1 == read2.is_read1): # these are the same read!
continue
found2 = True
# break
if not found2:
print("not found2 after finding mate-pair")
continue
#
# aligned_pairs2 = read2.get_aligned_pairs(matches_only=True)
# found2 = False
# for mypair in aligned_pairs2:
# if mypair[1] == pos2:
# found2 = True
# base2 = read2.query_sequence[mypair[0]]
# print("success finding mate-pair that works")
# break
# if found1 and found2: # the mate works, score it
# print("scoring mate-pair that works")
# # NO! score_read(base1,base2,mut1,mut2,myresult_sameonly)
# score_read(base1,base2,mut1,mut2,myresult)
# continue
pairresults.append(myresult)
pairresults_same.append(myresult_sameonly)
if len(pairresults) > 2:
exit()
print("Total pairs",len(mutpairs))
print
results = summarize(pairresults)
print("All reads:")
print("cis",results["cis"],)
print("trans",results["trans"])
print("nested",results["nested"])
print("toosmall",results["toosmall"])
print("wt",results["wt"])
print("missed",results["missed"])
print("fourgamete",results["fourgamete"])
print("empty",results["empty"])
print("anomaly",results["anomaly"])
total = 0
for categ in results.keys():
total += results[categ]
if total != len(mutpairs):
print("WARNING: not all pairs accounted for in total pairs!")
results = summarize(pairresults_same)
print("Same-end reads:")
print("cis",results["cis"],)
print("trans",results["trans"])
print("nested",results["nested"],)
print("toosmall",results["toosmall"],)
print("wt",results["wt"],)
print("missed",results["missed"],)
print("fourgamete",results["fourgamete"])
print("empty",results["empty"],)
print("anomaly",results["anomaly"])
total = 0
for categ in results.keys():
total += results[categ]
if total != len(mutpairs):
print("WARNING: not all pairs accounted for in same-end pairs!")