-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathallele_cv.py
executable file
·60 lines (46 loc) · 1.46 KB
/
allele_cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def read_pos_dat(vcf_file):
    """Group the lines of a tab-separated file by their first column.

    Each line is stripped of trailing whitespace and split on tabs. The
    first field (the contig) becomes the dictionary key; the second field
    (the position, kept as a string) is paired with the whole stripped line.

    Lines with fewer than two tab-separated fields (blank lines included)
    cannot supply a position: they are printed to stdout and skipped.

    Args:
        vcf_file: Path of the tab-separated file to read.

    Returns:
        A dict mapping each contig to a list of ``(pos, line)`` tuples in
        file order, where ``line`` is the full stripped line.
    """
    dict_of_dat = {}
    with open(vcf_file) as handle:
        for line in handle:
            line_string = line.rstrip()
            line_dat = line_string.split('\t')
            if len(line_dat) < 2:
                # No position column: report the raw line and move on
                # rather than hiding every possible error behind a bare
                # ``except``.
                print(line)
                continue
            contig, pos = line_dat[0], line_dat[1]
            dict_of_dat.setdefault(contig, []).append((pos, line_string))
    return dict_of_dat
if __name__ == '__main__':
    # Reference ("koop") positions and alleles, grouped by contig.
    koop_positions = read_pos_dat('trimmed_koop_alleles.tsv')
    # Positions from the other dataset, to be looked up in the reference.
    query_positions = read_pos_dat('compare_koop_allele_check.txt')
    output_file = 'all_danzfer_compare.tsv'

    # Each matched row is the query line followed by the reference line for
    # the same contig and position, separated by a tab.
    matched_rows = []
    for contig, query_snps in query_positions.items():
        if contig not in koop_positions:
            # Contig absent from the reference: nothing to compare against.
            continue
        # Position -> reference line for this contig. Built directly from
        # the list (no intermediate set) so that, if a position occurs more
        # than once, the last occurrence in the file wins deterministically.
        subject_positions_dict = dict(koop_positions[contig])
        for position, query_line in query_snps:
            compare_alleles = subject_positions_dict.get(position)
            if compare_alleles is None:
                # Position not present in the reference for this contig.
                continue
            matched_rows.append(query_line + '\t' + compare_alleles + '\n')

    # Append all matches in one go; the file is only touched when there is
    # something to write, and any I/O error is allowed to surface instead of
    # being silently discarded.
    if matched_rows:
        with open(output_file, 'a') as out_handle:
            out_handle.writelines(matched_rows)