-
Notifications
You must be signed in to change notification settings - Fork 0
/
meta_otu_table.py
136 lines (80 loc) · 2.46 KB
/
meta_otu_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import sys
import os
import re
import glob
# Pre-compiled pattern that captures runs of digits so re.split keeps them.
digits = re.compile(r'(\d+)')

def tokenize(filename):
    """Natural-sort key: split *filename* into a tuple of strings and ints.

    "sample10.txt" -> ("sample", 10, ".txt"), so "f2" sorts before "f10".
    """
    key = []
    for fragment in digits.split(filename):
        # Digit runs become ints; everything else stays a string.
        key.append(int(fragment) if digits.search(fragment) else fragment)
    return tuple(key)
# Makes an OTU table from input species lists.
# Usage: meta_otu_table.py <folder> <outfile> <ext> <level> <delim> <file_delim>
folder = sys.argv[1]      # working folder containing per-sample taxonomy files
outfile = sys.argv[2]     # output OTU table path
fmt = sys.argv[3]         # file extension, without the leading dot
level = sys.argv[4]       # taxonomic level prefix to keep (e.g. "s" for "s__...")
delim = sys.argv[5]       # delimiter inside the taxonomy string
file_delim = sys.argv[6]  # delimiter in the file name used to derive the sample name

filelist = glob.glob("%s/*.%s" % (folder, fmt))
filelist.sort(key=tokenize)  # natural order: f2 before f10
print(filelist)

data = {}        # sample name -> {taxonomy string -> frequency}
allspecies = []  # every taxonomy string seen, across all samples
for path in filelist:
    # BUG FIX: the original split on the literal string "file_delim" instead of
    # the file_delim variable, so sample names were never actually trimmed.
    sample = path.split("/")[-1].split(file_delim)[0]
    data[sample] = {}
    with open(path, "r") as infile:
        for line in infile:
            if line.startswith("#"):  # skip comment/header lines
                continue
            fields = line.split("\t")
            sp1 = fields[0]                     # full taxonomy string
            sp = sp1.split(delim)[-1]           # last (most specific) rank
            lev = sp.split("__")[0]             # rank prefix, e.g. "s"
            freq = float(fields[1].rstrip("\n").rstrip("\r"))
            # Keep rows that are explicitly unclassified or at the wanted level.
            if sp1 == "unclassified" or lev == level:
                if sp1 not in allspecies:
                    allspecies.append(sp1)
                data[sample][sp1] = freq

allspecies.sort()
print("\n".join(str(x.split(delim)[-1]) for x in allspecies))

with open(outfile, "w") as g:
    # Header row: one column per taxonomy string.
    g.write("\t" + "\t".join(str(x) for x in allspecies) + "\n")

    sorted_data = sorted(data, key=tokenize)
    print(sorted_data)

    sums = {}       # parent-sample name -> {taxonomy string -> summed frequency}
    sum_names = []  # parent-sample names in first-seen order
    for name in sorted_data:
        out = name + "\t"
        # First part of the bin file name identifies the parent sample, so we
        # can sum over all bins belonging to that sample.
        sum_name = name.split(".")[0]
        if sum_name not in sums:
            sums[sum_name] = {sp: 0 for sp in allspecies}
            sum_names.append(sum_name)
        for sp in allspecies:
            if sp in data[name]:
                out += str(data[name][sp]) + "\t"
                sums[sum_name][sp] += data[name][sp]
            else:
                out += "0" + "\t"  # absent species get an explicit zero
        g.write(out + "\n")

    # Record the summary (per-sample sums over all bins).
    sum_names.sort(key=tokenize)
    for sample in sum_names:
        g.write(sample)
        for sp in allspecies:
            g.write("\t" + str(sums[sample][sp]))
        g.write("\n")