Skip to content

Commit 1eea2fe

Browse files
committed
a new script to cluster similar sequences
1 parent 1d346d5 commit 1eea2fe

File tree

1 file changed

+174
-0
lines changed

1 file changed

+174
-0
lines changed

misc/pafcluster.js

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#!/usr/bin/env k8
2+
3+
"use strict";
4+
5+
Array.prototype.delete_at = function(i) {
6+
for (let j = i; j < this.length - 1; ++j)
7+
this[j] = this[j + 1];
8+
--this.length;
9+
}
10+
11+
function* getopt(argv, ostr, longopts) {
12+
if (argv.length == 0) return;
13+
let pos = 0, cur = 0;
14+
while (cur < argv.length) {
15+
let lopt = "", opt = "?", arg = "";
16+
while (cur < argv.length) { // skip non-option arguments
17+
if (argv[cur][0] == "-" && argv[cur].length > 1) {
18+
if (argv[cur] == "--") cur = argv.length;
19+
break;
20+
} else ++cur;
21+
}
22+
if (cur == argv.length) break;
23+
let a = argv[cur];
24+
if (a[0] == "-" && a[1] == "-") { // a long option
25+
pos = -1;
26+
let c = 0, k = -1, tmp = "", o;
27+
const pos_eq = a.indexOf("=");
28+
if (pos_eq > 0) {
29+
o = a.substring(2, pos_eq);
30+
arg = a.substring(pos_eq + 1);
31+
} else o = a.substring(2);
32+
for (let i = 0; i < longopts.length; ++i) {
33+
let y = longopts[i];
34+
if (y[y.length - 1] == "=") y = y.substring(0, y.length - 1);
35+
if (o.length <= y.length && o == y.substring(0, o.length)) {
36+
k = i, tmp = y;
37+
++c; // c is the number of matches
38+
if (o == y) { // exact match
39+
c = 1;
40+
break;
41+
}
42+
}
43+
}
44+
if (c == 1) { // find a unique match
45+
lopt = tmp;
46+
if (pos_eq < 0 && longopts[k][longopts[k].length-1] == "=" && cur + 1 < argv.length) {
47+
arg = argv[cur+1];
48+
argv.delete_at(cur + 1);
49+
}
50+
}
51+
} else { // a short option
52+
if (pos == 0) pos = 1;
53+
opt = a[pos++];
54+
let k = ostr.indexOf(opt);
55+
if (k < 0) {
56+
opt = "?";
57+
} else if (k + 1 < ostr.length && ostr[k+1] == ":") { // requiring an argument
58+
if (pos >= a.length) {
59+
arg = argv[cur+1];
60+
argv.delete_at(cur + 1);
61+
} else arg = a.substring(pos);
62+
pos = -1;
63+
}
64+
}
65+
if (pos < 0 || pos >= argv[cur].length) {
66+
argv.delete_at(cur);
67+
pos = 0;
68+
}
69+
if (lopt != "") yield { opt: `--${lopt}`, arg: arg };
70+
else if (opt != "?") yield { opt: `-${opt}`, arg: arg };
71+
else yield { opt: "?", arg: "" };
72+
}
73+
}
74+
75+
function* k8_readline(fn) {
76+
let buf = new Bytes();
77+
let file = new File(fn);
78+
while (file.readline(buf) >= 0) {
79+
yield buf.toString();
80+
}
81+
file.close();
82+
buf.destroy();
83+
}
84+
85+
function main(args) {
86+
let opt = { min_cov:.9, max_dv:.01 };
87+
for (const o of getopt(args, "c:d:", [])) {
88+
if (o.opt == '-c') opt.min_cov = parseFloat(o.arg);
89+
else if (o.opt == '-d') opt.max_dv = parseFloat(o.arg);
90+
}
91+
if (args.length == 0) {
92+
print("Usage: pafcluster.js [options] <ava.paf>");
93+
print("Options:");
94+
print(` -c FLOAT min coverage [${opt.min_cov}]`);
95+
print(` -d FLOAT max divergence [${opt.max_dv}]`);
96+
return;
97+
}
98+
99+
// read
100+
let a = [], len = {};
101+
for (const line of k8_readline(args[0])) {
102+
let m, t = line.split("\t");
103+
if (t[4] != "+") continue;
104+
const len1 = parseInt(t[1]), len2 = parseInt(t[6]);
105+
let s1 = -1, dv = -1.0;
106+
for (let i = 12; i < t.length; ++i) {
107+
if ((m = /^(s1|dv):\S:(\S+)/.exec(t[i])) != null) {
108+
if (m[1] == "s1") s1 = parseInt(m[2]);
109+
else if (m[1] == "dv") dv = parseFloat(m[2]);
110+
}
111+
}
112+
if (s1 < 0 || dv < 0) continue;
113+
const cov1 = (parseInt(t[3]) - parseInt(t[2])) / len1;
114+
const cov2 = (parseInt(t[8]) - parseInt(t[7])) / len2;
115+
const min_cov = cov1 < cov2? cov1 : cov2;
116+
const max_cov = cov1 > cov2? cov1 : cov2;
117+
a.push({ name1:t[0], name2:t[5], len1:len1, len2:len2, min_cov:min_cov, max_cov:max_cov, s1:s1, dv:dv });
118+
len[t[0]] = len1, len[t[5]] = len2;
119+
}
120+
warn(`Read ${a.length} hits`);
121+
122+
// filter duplicated hits
123+
let pair = {}, n_flt = 0, b = [];
124+
a.sort(function(x, y) { return y.s1 - x.s1 });
125+
for (let i = 0; i < a.length; ++i) {
126+
const key = `${a[i].name1}\t${a[i].name2}`;
127+
if (pair[key] != null) {
128+
++n_flt;
129+
continue;
130+
}
131+
pair[key] = 1;
132+
b.push(a[i]);
133+
}
134+
pair = null;
135+
warn(`Filtered ${n_flt} hits`);
136+
a = b;
137+
138+
// core loop
139+
while (a.length > 1) {
140+
// select the sequence with the highest sum of s1
141+
let h = {};
142+
for (let i = 0; i < a.length; ++i) {
143+
if (h[a[i].name1] == null) h[a[i].name1] = 0;
144+
h[a[i].name1] += a[i].s1;
145+
}
146+
let max_s1 = 0, max_name = "";
147+
for (const name in h)
148+
if (max_s1 < h[name])
149+
max_s1 = h[name], max_name = name;
150+
// find contigs in the same group
151+
h = {};
152+
h[max_name] = 1;
153+
for (let i = 0; i < a.length; ++i) {
154+
if (a[i].name1 != max_name && a[i].name2 != max_name)
155+
continue;
156+
if (a[i].min_cov >= opt.min_cov && a[i].dv <= opt.max_dv)
157+
h[a[i].name1] = h[a[i].name2] = 1;
158+
}
159+
let n = 0;
160+
for (const key in h) ++n;
161+
print(`SD\t${max_name}\t${n}`);
162+
for (const key in h) print(`CL\t${key}\t${len[key]}`);
163+
print("//");
164+
// filter out redundant hits
165+
let b = [];
166+
for (let i = 0; i < a.length; ++i)
167+
if (h[a[i].name1] == null && h[a[i].name2] == null)
168+
b.push(a[i]);
169+
warn(`Reduced the number of hits from ${a.length} to ${b.length}`);
170+
a = b;
171+
}
172+
}
173+
174+
main(arguments);

0 commit comments

Comments
 (0)