-
Notifications
You must be signed in to change notification settings - Fork 1
/
lca_count.py
145 lines (96 loc) · 3.58 KB
/
lca_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
import sys
import os
import re
import glob
#made in project 075.10
digits = re.compile(r'(\d+)')
def tokenize(filename):
return tuple(int(token) if match else token
for token, match in
((fragment, digits.search(fragment))
for fragment in digits.split(filename)))
#makes an otu table from input species lists
folder = sys.argv[1] #working folder
outfile=sys.argv[2] #output file prefix
#delim=sys.argv[3] #taxonomy output delimiter
fthresh=int(sys.argv[3]) #threshold of number of samples that have each otu
otuthresh=int(sys.argv[4])#total count threshold for each otu
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
print filelist
data={}
allspecies={}
tax_levels=["|superkingdom_","|kingdom_","|phylum_","|order_","|family_","|genus_","|species_"]
filenames=[]
for i in filelist:
file1 = open(i,'r')
filename = ".".join(str(p) for p in i.split("/")[-1].split(".")[:-1])
filenames.append(filename)
data[filename]={}
#example input format
#|no rank__root|no rank__cellular organisms|superkingdom__Eukaryota|no rank__Opisthokonta|kingdom__Fungi|subkingdom__Dikarya|phylum__Ascomycota|no rank__saccharomyceta|subphylum__Pezizomycotina|no rank__leotiomyceta|class__Eurotiomycetes|subclass__Eurotiomycetidae|order__Eurotiales|family__Aspergillaceae|genus__Aspergillus|species__Aspergillus flavus|no rank__Aspergillus flavus NRRL3357
for line in file1:
for level in tax_levels:
if level not in allspecies.keys():
allspecies[level]=[]
if level not in data[filename].keys():
data[filename][level]={}
if line[0]<>"#":
fulltax = line.split("\t")[0]
lowest_tax=fulltax.split("|")[-1]
levs=[]
for p in fulltax.split("|"):
levs.append("|"+p.split("__")[0]+"_")
freq= int(line.split("\t")[-1].rstrip("\n").rstrip("\r"))
if level in levs: #lev==level: #n.b. this undercounted higher levels..
k1=fulltax.split("|")[:levs.index(level)+1]
levtax="|".join(str(p) for p in k1)
if levtax not in allspecies[level]:
allspecies[level].append(levtax)
if levtax not in data[filename][level].keys():
data[filename][level][levtax]=freq
else:
data[filename][level][levtax]=data[filename][level][levtax]+freq
file1.close()
filenames.sort(key=tokenize)
for v in allspecies.keys():
allspecies[v].sort()
#print allspecies
#remove if below threshold
allspecies2={}
for level in tax_levels:
allspecies2[level]=[]
for i in allspecies[level]:
c=0
otutotal=0
for j in filenames:
if i in data[j][level].keys():
c=c+1
otutotal=otutotal+data[j][level][i]
if c >= fthresh and otutotal>=otuthresh:
allspecies2[level].append(i)
g=open(outfile+"_all_"+".tab",'w')
for level in tax_levels:
lname=level.lstrip("|")
print level,"###########################################################"
g2=open(outfile+"_"+lname+".tab",'w')
g.write(level+"\t\t"+"\t".join(str(x) for x in filenames)+"\n")
g2.write("#OTU_ID"+"\t"+"\t".join(str(x) for x in filenames)+"\n")
for i in allspecies2[level]:
print (i.split("__")[-1])
#g.write(i+"\t") #change to write the whole taxonomy
g.write(i+"\t"+i.split("__")[-1])
#g2.write(i) #change to write the whole taxonomy
g2.write(i.split("__")[-1])
for j in filenames:
if i in data[j][level].keys():
g.write("\t"+str(data[j][level][i]))
g2.write("\t"+str(data[j][level][i]))
else:
g.write("\t"+"0")
g2.write("\t"+"0")
g.write("\n")
g2.write("\n")
g.write("\n")
g2.write("\n")