Skip to content

Commit 1877818

Browse files
committed
r1255: moved jump code to a separate file
1 parent 9ede5c4 commit 1877818

File tree

4 files changed

+170
-171
lines changed

4 files changed

+170
-171
lines changed

Makefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
22
CPPFLAGS= -DHAVE_KALLOC
33
INCLUDES=
44
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o \
5-
lchain.o align.o hit.o seed.o map.o format.o pe.o esterr.o splitidx.o \
5+
lchain.o align.o hit.o seed.o jump.o map.o format.o pe.o esterr.o splitidx.o \
66
ksw2_ll_sse.o
77
PROG= minimap2
88
PROG_EXTRA= sdust minimap2-lite
@@ -115,8 +115,9 @@ esterr.o: mmpriv.h minimap.h bseq.h kseq.h
115115
example.o: minimap.h kseq.h
116116
format.o: kalloc.h mmpriv.h minimap.h bseq.h kseq.h
117117
hit.o: mmpriv.h minimap.h bseq.h kseq.h kalloc.h khash.h
118-
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h kvec.h kalloc.h khash.h
119-
index.o: ksort.h
118+
index.o: kthread.h bseq.h minimap.h mmpriv.h kseq.h ksw2.h kalloc.h kvec.h
119+
index.o: khash.h ksort.h
120+
jump.o: mmpriv.h minimap.h bseq.h kseq.h
120121
kalloc.o: kalloc.h
121122
ksw2_extd2_sse.o: ksw2.h kalloc.h
122123
ksw2_exts2_sse.o: ksw2.h kalloc.h

hit.c

Lines changed: 0 additions & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -464,170 +464,3 @@ void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int ma
464464
}
465465
mm_set_inv_mapq(km, n_regs, regs);
466466
}
467-
468-
/******************************
469-
* Instructed split alignment *
470-
******************************/
471-
472-
#define MM_MIN_EXON_LEN 20
473-
474-
static int32_t mm_jump_check(void *km, const mm_idx_t *mi, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t ext, int32_t is_left) // TODO: check close N
475-
{
476-
int32_t clip, clen, e = !r->rev ^ !is_left; // 0 for left of the alignment; 1 for right
477-
uint32_t cigar;
478-
if (!r->p || r->p->n_cigar <= 0) return -1; // only working with CIGAR
479-
clip = e == 0? r->qs : qlen - r->qe;
480-
cigar = r->p->cigar[e == 0? 0 : r->p->n_cigar - 1];
481-
clen = (cigar&0xf) == MM_CIGAR_MATCH? cigar>>4 : 0;
482-
if (clen <= ext) return -1;
483-
if (is_left) {
484-
if (clip >= r->rs) return -1; // no space to jump
485-
} else {
486-
if (clip >= mi->seq[r->rid].len - r->re) return -1; // no space to jump
487-
}
488-
return 0;
489-
}
490-
491-
static uint8_t *mm_jump_get_qseq_seq(void *km, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t is_left, int32_t ql0, uint8_t *qseq)
492-
{
493-
int32_t i, k;
494-
if (!r->rev) {
495-
if (is_left) memcpy(qseq, qseq0, ql0);
496-
else memcpy(qseq, &qseq0[qlen - ql0], ql0);
497-
} else {
498-
if (is_left)
499-
for (i = qlen - 1, k = 0; i >= qlen - ql0; --i)
500-
qseq[k++] = qseq0[i] >= 4? qseq0[i] : 3 - qseq0[i];
501-
else
502-
for (i = ql0 - 1, k = 0; i >= 0; --i)
503-
qseq[k++] = qseq0[i] >= 4? qseq0[i] : 3 - qseq0[i];
504-
}
505-
return qseq;
506-
}
507-
508-
static void mm_jump_split_left(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
509-
{
510-
uint8_t *tseq = 0, *qseq = 0;
511-
int32_t i, n, i0 = -1, m = 0, l;
512-
int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
513-
int32_t clip = !r->rev? r->qs : qlen - r->qe;
514-
int32_t extt = clip < ext? clip : ext;
515-
const mm_idx_jjump1_t *a;
516-
517-
if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 1) < 0) return;
518-
a = mm_idx_jump_get(mi, r->rid, r->rs - extt, r->rs + ext, &n);
519-
if (n == 0) return;
520-
521-
for (i = 0; i < n; ++i) { // traverse possible jumps
522-
const mm_idx_jjump1_t *ai = &a[i];
523-
int32_t tlen, tl1, j, mm1, mm2;
524-
assert(ai->off >= r->rs - extt && ai->off < r->rs + ext);
525-
if (ts_strand * ai->strand < 0) continue; // wrong strand
526-
if (ai->off2 >= ai->off) continue; // wrong direction
527-
if (ai->off2 < clip + ext) continue; // not long enough
528-
if (tseq == 0) {
529-
tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
530-
qseq = tseq + clip + ext;
531-
mm_jump_get_qseq_seq(km, qlen, qseq0, r, 1, clip + ext, qseq);
532-
}
533-
tl1 = clip + (ai->off - r->rs);
534-
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off, r->rs + ext, &tseq[tl1]);
535-
assert(tlen == r->rs + ext - ai->off);
536-
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2 - tl1, ai->off2, tseq);
537-
assert(tlen == tl1);
538-
for (j = 0, mm1 = 0; j < tl1; ++j)
539-
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
540-
++mm1;
541-
for (mm2 = 0; j < clip + ext; ++j)
542-
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
543-
++mm2;
544-
if (mm1 == 0 && mm2 == 1)
545-
i0 = i, ++m; // i0 points to the rightmost i
546-
}
547-
kfree(km, tseq);
548-
549-
l = m > 0? a[i0].off - r->rs : 0; // may be negative
550-
if (m == 1 && clip + l >= opt->jump_min_alen) { // add one more exon
551-
mm_enlarge_cigar(r, 2);
552-
memmove(r->p->cigar + 2, r->p->cigar, r->p->n_cigar * 4);
553-
r->p->cigar[0] = (clip + l) << 4 | MM_CIGAR_MATCH;
554-
r->p->cigar[1] = (a[i0].off - a[i0].off2) << 4 | MM_CIGAR_N_SKIP;
555-
r->p->cigar[2] = ((r->p->cigar[2]>>4) - l) << 4 | MM_CIGAR_MATCH;
556-
r->p->n_cigar += 2;
557-
r->rs = a[i0].off2 - (clip + l);
558-
if (!r->rev) r->qs = 0;
559-
else r->qe = qlen;
560-
} else if (m > 0 && a[i0].off > r->rs) { // trim by l; l is always positive
561-
r->p->cigar[0] -= l << 4 | MM_CIGAR_MATCH;
562-
r->rs += l;
563-
if (!r->rev) r->qs += l;
564-
else r->qe -= l;
565-
}
566-
}
567-
568-
static void mm_jump_split_right(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
569-
{
570-
uint8_t *tseq = 0, *qseq = 0;
571-
int32_t i, n, i0 = -1, m = 0, l;
572-
int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
573-
int32_t clip = !r->rev? qlen - r->qe : r->qs;
574-
int32_t extt = clip < ext? clip : ext;
575-
const mm_idx_jjump1_t *a;
576-
577-
if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 1) < 0) return;
578-
a = mm_idx_jump_get(mi, r->rid, r->re - ext, r->re + extt, &n);
579-
if (n == 0) return;
580-
581-
for (i = 0; i < n; ++i) { // traverse possible jumps
582-
const mm_idx_jjump1_t *ai = &a[i];
583-
int32_t tlen, tl1, j, mm1, mm2;
584-
assert(ai->off >= r->rs - extt && ai->off < r->rs + ext);
585-
if (ts_strand * ai->strand < 0) continue; // wrong strand
586-
if (ai->off2 <= ai->off) continue; // wrong direction
587-
if (ai->off2 + clip + ext > mi->seq[r->rid].len) continue; // not long enough
588-
if (tseq == 0) {
589-
tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
590-
qseq = tseq + clip + ext;
591-
mm_jump_get_qseq_seq(km, qlen, qseq0, r, 0, clip + ext, qseq);
592-
}
593-
tl1 = clip + (r->re - ai->off);
594-
tlen = mm_idx_getseq2(mi, 0, r->rid, r->re - ext, ai->off, tseq);
595-
assert(tlen == ai->off - (r->re - ext));
596-
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2, ai->off2 + tl1, &tseq[clip + ext - tl1]);
597-
assert(tlen == tl1);
598-
for (j = 0, mm2 = 0; j < clip + ext - tl1; ++j)
599-
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
600-
++mm2;
601-
for (mm1 = 0; j < clip + ext; ++j)
602-
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
603-
++mm1;
604-
if (mm1 == 0 && mm2 == 1)
605-
i0 = i0 >= 0? i0 : i, ++m; // i0 points to the leftmost i
606-
}
607-
kfree(km, tseq);
608-
609-
l = m > 0? r->re - a[i0].off : 0; // may be negative
610-
if (m == 1 && clip + l >= opt->jump_min_alen) { // add one more exon
611-
mm_enlarge_cigar(r, 2);
612-
memmove(r->p->cigar + 2, r->p->cigar, r->p->n_cigar * 4);
613-
r->p->cigar[r->p->n_cigar - 1] = ((r->p->cigar[r->p->n_cigar - 1]>>4) - l) << 4 | MM_CIGAR_MATCH;
614-
r->p->cigar[r->p->n_cigar] = (a[i0].off2 - a[i0].off) << 4 | MM_CIGAR_N_SKIP;
615-
r->p->cigar[r->p->n_cigar + 1] = (clip + l) << 4 | MM_CIGAR_MATCH;
616-
r->p->n_cigar += 2;
617-
r->re = a[i0].off2 + (clip + l);
618-
if (!r->rev) r->qe = qlen;
619-
else r->qs = 0;
620-
} else if (m > 0 && r->re > a[i0].off) { // trim by l; l is always positive
621-
r->p->cigar[r->p->n_cigar - 1] -= l << 4 | MM_CIGAR_MATCH;
622-
r->re -= l;
623-
if (!r->rev) r->qe -= l;
624-
else r->qs += l;
625-
}
626-
}
627-
628-
void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand)
629-
{
630-
assert((opt->flag & MM_F_EQX) == 0);
631-
mm_jump_split_left(km, mi, opt, qlen, qseq, r, ts_strand);
632-
mm_jump_split_right(km, mi, opt, qlen, qseq, r, ts_strand);
633-
}

jump.c

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#include "mmpriv.h"
2+
#include "kalloc.h"
3+
4+
#define MM_MIN_EXON_LEN 20
5+
6+
static int32_t mm_jump_check(void *km, const mm_idx_t *mi, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t ext, int32_t is_left) // TODO: check close N
7+
{
8+
int32_t clip, clen, e = !r->rev ^ !is_left; // 0 for left of the alignment; 1 for right
9+
uint32_t cigar;
10+
if (!r->p || r->p->n_cigar <= 0) return -1; // only working with CIGAR
11+
clip = e == 0? r->qs : qlen - r->qe;
12+
cigar = r->p->cigar[e == 0? 0 : r->p->n_cigar - 1];
13+
clen = (cigar&0xf) == MM_CIGAR_MATCH? cigar>>4 : 0;
14+
if (clen <= ext) return -1;
15+
if (is_left) {
16+
if (clip >= r->rs) return -1; // no space to jump
17+
} else {
18+
if (clip >= mi->seq[r->rid].len - r->re) return -1; // no space to jump
19+
}
20+
return 0;
21+
}
22+
23+
static uint8_t *mm_jump_get_qseq_seq(void *km, int32_t qlen, const uint8_t *qseq0, const mm_reg1_t *r, int32_t is_left, int32_t ql0, uint8_t *qseq)
24+
{
25+
int32_t i, k;
26+
if (!r->rev) {
27+
if (is_left) memcpy(qseq, qseq0, ql0);
28+
else memcpy(qseq, &qseq0[qlen - ql0], ql0);
29+
} else {
30+
if (is_left)
31+
for (i = qlen - 1, k = 0; i >= qlen - ql0; --i)
32+
qseq[k++] = qseq0[i] >= 4? qseq0[i] : 3 - qseq0[i];
33+
else
34+
for (i = ql0 - 1, k = 0; i >= 0; --i)
35+
qseq[k++] = qseq0[i] >= 4? qseq0[i] : 3 - qseq0[i];
36+
}
37+
return qseq;
38+
}
39+
40+
static void mm_jump_split_left(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
41+
{
42+
uint8_t *tseq = 0, *qseq = 0;
43+
int32_t i, n, i0 = -1, m = 0, l;
44+
int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
45+
int32_t clip = !r->rev? r->qs : qlen - r->qe;
46+
int32_t extt = clip < ext? clip : ext;
47+
const mm_idx_jjump1_t *a;
48+
49+
if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 1) < 0) return;
50+
a = mm_idx_jump_get(mi, r->rid, r->rs - extt, r->rs + ext, &n);
51+
if (n == 0) return;
52+
53+
for (i = 0; i < n; ++i) { // traverse possible jumps
54+
const mm_idx_jjump1_t *ai = &a[i];
55+
int32_t tlen, tl1, j, mm1, mm2;
56+
assert(ai->off >= r->rs - extt && ai->off < r->rs + ext);
57+
if (ts_strand * ai->strand < 0) continue; // wrong strand
58+
if (ai->off2 >= ai->off) continue; // wrong direction
59+
if (ai->off2 < clip + ext) continue; // not long enough
60+
if (tseq == 0) {
61+
tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
62+
qseq = tseq + clip + ext;
63+
mm_jump_get_qseq_seq(km, qlen, qseq0, r, 1, clip + ext, qseq);
64+
}
65+
tl1 = clip + (ai->off - r->rs);
66+
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off, r->rs + ext, &tseq[tl1]);
67+
assert(tlen == r->rs + ext - ai->off);
68+
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2 - tl1, ai->off2, tseq);
69+
assert(tlen == tl1);
70+
for (j = 0, mm1 = 0; j < tl1; ++j)
71+
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
72+
++mm1;
73+
for (mm2 = 0; j < clip + ext; ++j)
74+
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
75+
++mm2;
76+
if (mm1 == 0 && mm2 == 1)
77+
i0 = i, ++m; // i0 points to the rightmost i
78+
}
79+
kfree(km, tseq);
80+
81+
l = m > 0? a[i0].off - r->rs : 0; // may be negative
82+
if (m == 1 && clip + l >= opt->jump_min_alen) { // add one more exon
83+
mm_enlarge_cigar(r, 2);
84+
memmove(r->p->cigar + 2, r->p->cigar, r->p->n_cigar * 4);
85+
r->p->cigar[0] = (clip + l) << 4 | MM_CIGAR_MATCH;
86+
r->p->cigar[1] = (a[i0].off - a[i0].off2) << 4 | MM_CIGAR_N_SKIP;
87+
r->p->cigar[2] = ((r->p->cigar[2]>>4) - l) << 4 | MM_CIGAR_MATCH;
88+
r->p->n_cigar += 2;
89+
r->rs = a[i0].off2 - (clip + l);
90+
if (!r->rev) r->qs = 0;
91+
else r->qe = qlen;
92+
} else if (m > 0 && a[i0].off > r->rs) { // trim by l; l is always positive
93+
r->p->cigar[0] -= l << 4 | MM_CIGAR_MATCH;
94+
r->rs += l;
95+
if (!r->rev) r->qs += l;
96+
else r->qe -= l;
97+
}
98+
}
99+
100+
static void mm_jump_split_right(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq0, mm_reg1_t *r, int32_t ts_strand)
101+
{
102+
uint8_t *tseq = 0, *qseq = 0;
103+
int32_t i, n, i0 = -1, m = 0, l;
104+
int32_t ext = 1 + (opt->b + opt->a - 1) / opt->a + 1;
105+
int32_t clip = !r->rev? qlen - r->qe : r->qs;
106+
int32_t extt = clip < ext? clip : ext;
107+
const mm_idx_jjump1_t *a;
108+
109+
if (mm_jump_check(km, mi, qlen, qseq0, r, ext + MM_MIN_EXON_LEN, 1) < 0) return;
110+
a = mm_idx_jump_get(mi, r->rid, r->re - ext, r->re + extt, &n);
111+
if (n == 0) return;
112+
113+
for (i = 0; i < n; ++i) { // traverse possible jumps
114+
const mm_idx_jjump1_t *ai = &a[i];
115+
int32_t tlen, tl1, j, mm1, mm2;
116+
assert(ai->off >= r->rs - extt && ai->off < r->rs + ext);
117+
if (ts_strand * ai->strand < 0) continue; // wrong strand
118+
if (ai->off2 <= ai->off) continue; // wrong direction
119+
if (ai->off2 + clip + ext > mi->seq[r->rid].len) continue; // not long enough
120+
if (tseq == 0) {
121+
tseq = Kcalloc(km, uint8_t, (clip + ext) * 2); // tseq and qseq are allocated together
122+
qseq = tseq + clip + ext;
123+
mm_jump_get_qseq_seq(km, qlen, qseq0, r, 0, clip + ext, qseq);
124+
}
125+
tl1 = clip + (r->re - ai->off);
126+
tlen = mm_idx_getseq2(mi, 0, r->rid, r->re - ext, ai->off, tseq);
127+
assert(tlen == ai->off - (r->re - ext));
128+
tlen = mm_idx_getseq2(mi, 0, r->rid, ai->off2, ai->off2 + tl1, &tseq[clip + ext - tl1]);
129+
assert(tlen == tl1);
130+
for (j = 0, mm2 = 0; j < clip + ext - tl1; ++j)
131+
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
132+
++mm2;
133+
for (mm1 = 0; j < clip + ext; ++j)
134+
if (qseq[j] != tseq[j] || qseq[j] > 3 || tseq[j] > 3)
135+
++mm1;
136+
if (mm1 == 0 && mm2 == 1)
137+
i0 = i0 >= 0? i0 : i, ++m; // i0 points to the leftmost i
138+
}
139+
kfree(km, tseq);
140+
141+
l = m > 0? r->re - a[i0].off : 0; // may be negative
142+
if (m == 1 && clip + l >= opt->jump_min_alen) { // add one more exon
143+
mm_enlarge_cigar(r, 2);
144+
memmove(r->p->cigar + 2, r->p->cigar, r->p->n_cigar * 4);
145+
r->p->cigar[r->p->n_cigar - 1] = ((r->p->cigar[r->p->n_cigar - 1]>>4) - l) << 4 | MM_CIGAR_MATCH;
146+
r->p->cigar[r->p->n_cigar] = (a[i0].off2 - a[i0].off) << 4 | MM_CIGAR_N_SKIP;
147+
r->p->cigar[r->p->n_cigar + 1] = (clip + l) << 4 | MM_CIGAR_MATCH;
148+
r->p->n_cigar += 2;
149+
r->re = a[i0].off2 + (clip + l);
150+
if (!r->rev) r->qe = qlen;
151+
else r->qs = 0;
152+
} else if (m > 0 && r->re > a[i0].off) { // trim by l; l is always positive
153+
r->p->cigar[r->p->n_cigar - 1] -= l << 4 | MM_CIGAR_MATCH;
154+
r->re -= l;
155+
if (!r->rev) r->qe -= l;
156+
else r->qs += l;
157+
}
158+
}
159+
160+
void mm_jump_split(void *km, const mm_idx_t *mi, const mm_mapopt_t *opt, int32_t qlen, const uint8_t *qseq, mm_reg1_t *r, int32_t ts_strand)
161+
{
162+
assert((opt->flag & MM_F_EQX) == 0);
163+
mm_jump_split_left(km, mi, opt, qlen, qseq, r, ts_strand);
164+
mm_jump_split_right(km, mi, opt, qlen, qseq, r, ts_strand);
165+
}

minimap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <stdio.h>
66
#include <sys/types.h>
77

8-
#define MM_VERSION "2.28-r1251-dirty"
8+
#define MM_VERSION "2.28-r1255-dirty"
99

1010
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
1111
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name

0 commit comments

Comments
 (0)