Skip to content

Commit 20c0170

Browse files
committed
Deal with the fact that the Kircher data file is missing a base
1 parent e6d0901 commit 20c0170

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

ExtractSequenceFromDataFile.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
if __name__ == "__main__":
66
filetype = "?"
77
chrom = "?"
8-
seq = defaultdict(lambda: set("ATGC"))
98
for line in open(argv[2]):
109
data = line.strip().split()
1110
try:
@@ -16,8 +15,10 @@
1615
if filetype == "?":
1716
if len(data) == 10:
1817
filetype = "kircher"
18+
seq = defaultdict(lambda: set("N"))
1919
elif len(data) == 5:
2020
filetype = "patwardhan"
21+
seq = defaultdict(lambda: set("ATGC"))
2122

2223
if filetype == "kircher":
2324
if data[9] == argv[1]:
@@ -33,5 +34,7 @@
3334

3435
min = min(seq.keys())
3536
max = max(seq.keys())
36-
seq = wrap("".join(seq[i].pop() for i in sorted(seq.keys())))
37+
# TCF7L2 has a missing base in the data file, so walk every position from
38+
# min to max; a position absent from seq falls back to the defaultdict default.
39+
seq = wrap("".join(seq[i].pop() for i in range(min, max+1)))
3740
print(f">Homo_sapiens chr{chrom}:{min}-{max} 100.0", *seq, sep="\n")

0 commit comments

Comments
 (0)