-
Notifications
You must be signed in to change notification settings - Fork 1
/
tax2_16s.py
80 lines (49 loc) · 1.62 KB
/
tax2_16s.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python
from Bio import SeqIO
import sys
'''
#reads a list of taxonomy from a file and extracts the closest taxonomy maches of fasta files from greengenes db
#usage:
#tax2_16s.py taxa.list gg_13_5.fasta gg_13_5_taxonomy.txt output16s.fasta
#taxonomy must be same format as gg, e.g. k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacterales; f__; g__; s__
#Before use, clean up the taxonomy file - remove trailing levels and square brackets
sed -i 's/\[//g' gg_13_5_taxonomy.txt
sed -i 's/\]//g' gg_13_5_taxonomy.txt
sed -i 's/ c__;//g' gg_13_5_taxonomy.txt
sed -i 's/ o__;//g' gg_13_5_taxonomy.txt
sed -i 's/ f__;//g' gg_13_5_taxonomy.txt
sed -i 's/ g__;//g' gg_13_5_taxonomy.txt
sed -i ':a;N;$!ba;s/ s__\n/\n/g' gg_13_5_taxonomy.txt
sed -i ':a;N;$!ba;s/;\n/\n/g' gg_13_5_taxonomy.txt
sed -i 's/|/; /g' gg_13_5_taxonomy.txt
'''
e=open(sys.argv[1],'r') #input genus list
f=open(sys.argv[2],'r') #gg fastas
h=open(sys.argv[3],'r') #taxonomy key
g=open(sys.argv[4],'w') #outfile - 16s of the genus list
print 'loading seqs'
x = SeqIO.to_dict(SeqIO.parse(f,'fasta'))
tax={}
print 'parsing taxonomy'
for i in h:
taxid=i.split()[0]
genus=i.split()[-1]
genus=genus.rstrip("\n")
genus=genus.rstrip(";")
if genus not in tax.keys():
tax[genus]=taxid
print "writing"
genera=[]
for i in e:
k=i.rstrip("\n")
k=k.split("; ")
genera.append(k)
for i in genera:
fulltax="; ".join(str(t) for t in i)
for p in range(1,len(i)):
v=i[-p]
if v in tax.keys():
g.write(">"+fulltax+"\n"+str(x[tax[v]].seq)+"\n")
break
if v not in tax.keys():
print fulltax,v,"not found"