Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "tseq" output & make available to Python #1201

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions format.c
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,20 @@ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_r
return mm_gen_cs_or_MD(km, buf, max_len, mi, r, seq, 1, 0, 0);
}

int mm_gen_tseq(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r)
{
uint8_t *tseq;
kstring_t str;
str.s = *buf, str.l = 0, str.m = *max_len;
tseq = (uint8_t*)kmalloc(km, r->re - r->rs);
int len = mm_idx_getseq(mi, r->rid, r->rs, r->re, tseq);
for (int i=0; i<len; i++)
mm_sprintf_lite(&str, "%c", "ACGTN"[tseq[i]]);
*max_len = str.m;
*buf = str.s;
return str.l;
}

static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
int type;
Expand Down
1 change: 1 addition & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
*/
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
int mm_gen_tseq(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r);

// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
Expand Down
1 change: 1 addition & 0 deletions python/cmappy.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ cdef extern from "minimap.h":
void *mm_tbuf_get_km(mm_tbuf_t *b)
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden)
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq)
int mm_gen_tseq(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r)

#
# Helper header (because it is hard to expose mm_reg1_t with Cython)
Expand Down
18 changes: 13 additions & 5 deletions python/mappy.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ cdef class Alignment:
cdef int8_t _strand, _trans_strand
cdef uint8_t _mapq, _is_primary
cdef int _seg_id
cdef _ctg, _cigar, _cs, _MD # these are python objects
cdef _ctg, _cigar, _cs, _MD, _tseq # these are python objects

def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str):
def __cinit__(self, ctg, cl, cs, ce, strand, qs, qe, mapq, cigar, is_primary, mlen, blen, NM, trans_strand, seg_id, cs_str, MD_str, tseq_str):
self._ctg = ctg if isinstance(ctg, str) else ctg.decode()
self._ctg_len, self._r_st, self._r_en = cl, cs, ce
self._strand, self._q_st, self._q_en = strand, qs, qe
Expand All @@ -28,6 +28,7 @@ cdef class Alignment:
self._seg_id = seg_id
self._cs = cs_str
self._MD = MD_str
self._tseq = tseq_str

@property
def ctg(self): return self._ctg
Expand Down Expand Up @@ -80,6 +81,9 @@ cdef class Alignment:
@property
def MD(self): return self._MD

@property
def tseq(self): return self._tseq

@property
def cigar_str(self):
return "".join(map(lambda x: str(x[0]) + 'MIDNSHP=XB'[x[1]], self._cigar))
Expand All @@ -97,6 +101,7 @@ cdef class Alignment:
str(self._mlen), str(self._blen), str(self._mapq), tp, ts, "cg:Z:" + self.cigar_str]
if self._cs != "": a.append("cs:Z:" + self._cs)
if self._MD != "": a.append("MD:Z:" + self._MD)
if self._tseq != "": a.append("tq:Z:" + self._tseq)
return "\t".join(a)

cdef class ThreadBuffer:
Expand Down Expand Up @@ -163,7 +168,7 @@ cdef class Aligner:
def __bool__(self):
return (self._idx != NULL)

def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None):
def map(self, seq, seq2=None, buf=None, cs=False, MD=False, tseq=False, max_frag_len=None, extra_flags=None):
cdef cmappy.mm_reg1_t *regs
cdef cmappy.mm_hitpy_t h
cdef ThreadBuffer b
Expand Down Expand Up @@ -195,7 +200,7 @@ cdef class Aligner:
i = 0
while i < n_regs:
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
cigar, _cs, _MD = [], '', ''
cigar, _cs, _MD, _tseq = [], '', '', ''
for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array
c = h.cigar32[k]
cigar.append([c>>4, c&0xf])
Expand All @@ -206,7 +211,10 @@ cdef class Aligner:
if MD:
l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, &regs[i], _seq)
_MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD)
if tseq:
l_cs_str = cmappy.mm_gen_tseq(km, &cs_str, &m_cs_str, self._idx, &regs[i])
_tseq = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode()
yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD, _tseq)
cmappy.mm_free_reg1(&regs[i])
i += 1
finally:
Expand Down