-
Notifications
You must be signed in to change notification settings - Fork 1
/
gene-matrix-from-uclust2.py
99 lines (61 loc) · 1.69 KB
/
gene-matrix-from-uclust2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import re
import glob
import sys
import subprocess
#T. Allnutt, CSIRO, 2015
#script to parse the uclust .uc file and make a table of clusters with their %identity
#usage: gene-matrix-from-uclust2.py clusters_sorted.uc pangenome.txt 60
#nb '95' is the identiy threshold below which presence of a gene is not scored
digits = re.compile(r'(\d+)')
def tokenize(filename):
return tuple(int(token) if match else token
for token, match in
((fragment, digits.search(fragment))
for fragment in digits.split(filename)))
f = open(sys.argv[1],'r')
g = open(sys.argv[2],'w')
id_thresh=float(sys.argv[3])
#shared_tot=int(sys.argv[3])
data = {}
hitlist=[]
c=1
ct=0
n=0
for line in f:
k = line.split("\t")
if k[0]<>"S":
hit=k[8].split("_")[0] #nb the "_" separates the cds number from the genome id, so only score the genome id as a hit
if hit not in hitlist:
hitlist.append(hit)
if k[0]=="C":
n=0
ct=ct+1
cds = k[8]
data[cds]={} #new cluster
data[cds][hit]=float(1) #self identity
print "cluster",ct,cds
if k[0]=="H":
id = float(k[3])/100
if id*100>id_thresh:
n=n+1
print "cluster",ct,cds,n, id
if hit not in data[cds].keys():
data[cds][hit]=id
elif id>data[cds][hit]: #keep highest id only
data[cds][hit]=id
print "writing"
hitlist.sort(key=tokenize)
title="\t"+"\t".join(str(p)for p in hitlist)+"\n"
g.write(title)
output=""
for i in data.keys():
c=c+1
output = output+i
for j in hitlist:
if j in data[i].keys():
output = output +"\t"+str(data[i][j])
else:
output = output +"\t"+"0"
g.write(output+"\n")
output=""