Skip to content

Commit 2b0ae25

Browse files
committed
feat: add an option to output MD in the XA tag
1 parent 79b230d commit 2b0ae25

File tree

4 files changed

+30
-16
lines changed

4 files changed

+30
-16
lines changed

bwape.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
621621
return pacseq;
622622
}
623623

624-
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
624+
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line, int with_md)
625625
{
626626
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
627627
int i, j, n_seqs;
@@ -692,7 +692,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
692692

693693
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
694694
for (j = 0; j < 2; ++j)
695-
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
695+
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, with_md);
696696
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
697697
if (pac == 0) free(pacseq);
698698

@@ -732,12 +732,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
732732

733733
int bwa_sai2sam_pe(int argc, char *argv[])
734734
{
735-
int c;
735+
int c, with_md = 0;
736736
pe_opt_t *popt;
737737
char *prefix, *rg_line = 0;
738738

739739
popt = bwa_init_pe_opt();
740-
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
740+
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:d")) >= 0) {
741741
switch (c) {
742742
case 'r':
743743
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
@@ -751,6 +751,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
751751
case 'c': popt->ap_prior = atof(optarg); break;
752752
case 'f': xreopen(optarg, "w", stdout); break;
753753
case 'A': popt->force_isize = 1; break;
754+
case 'd': with_md = 1; break;
754755
default: return 1;
755756
}
756757
}
@@ -768,6 +769,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
768769
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
769770
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
770771
fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
772+
fprintf(stderr, " -d output the MD to each alignment in the XA tag, otherwise use \".\"\n\n");
771773
fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
772774
fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
773775
fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
@@ -778,7 +780,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
778780
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
779781
return 1;
780782
}
781-
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
783+
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line, with_md);
782784
free(prefix); free(popt);
783785
return 0;
784786
}

bwase.c

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -284,11 +284,11 @@ void bwa_correct_trimmed(bwa_seq_t *s)
284284
s->len = s->full_len;
285285
}
286286

287-
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
287+
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md)
288288
{
289289
ubyte_t *pacseq;
290290
int i, j, k;
291-
kstring_t *str;
291+
kstring_t *str = (kstring_t*)calloc(1, sizeof(kstring_t));
292292

293293
if (!_pacseq) {
294294
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
@@ -305,15 +305,25 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
305305
q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar);
306306
q->n_cigar = n_cigar;
307307
if (q->cigar) s->multi[k++] = *q;
308-
} else s->multi[k++] = *q;
308+
} else {
309+
s->multi[k++] = *q;
310+
if (with_md) { // create the cigar, needed for bwa_cal_md1 below
311+
q->n_cigar = 1;
312+
q->cigar = calloc(q->n_cigar, sizeof(uint32_t));
313+
q->cigar[0] = __cigar_create(FROM_M, s->len);
314+
}
315+
}
316+
if (with_md) {
317+
int nm;
318+
q->md = bwa_cal_md1(q->n_cigar, q->cigar, s->len, q->pos, q->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm);
319+
}
309320
}
310321
s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation
311322
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
312323
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar);
313324
if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH;
314325
}
315326
// generate MD tag
316-
str = (kstring_t*)calloc(1, sizeof(kstring_t));
317327
for (i = 0; i != n_seqs; ++i) {
318328
bwa_seq_t *s = seqs + i;
319329
if (s->type != BWA_TYPE_NO_MATCH) {
@@ -504,7 +514,7 @@ void bwase_initialize()
504514
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
505515
}
506516

507-
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
517+
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line, int with_md)
508518
{
509519
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
510520
int i, n_seqs, m_aln;
@@ -557,7 +567,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
557567
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
558568

559569
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
560-
bwa_refine_gapped(bns, n_seqs, seqs, 0);
570+
bwa_refine_gapped(bns, n_seqs, seqs, 0, with_md);
561571
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
562572

563573
fprintf(stderr, "[bwa_aln_core] print alignments... ");
@@ -578,11 +588,12 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
578588

579589
int bwa_sai2sam_se(int argc, char *argv[])
580590
{
581-
int c, n_occ = 3;
591+
int c, n_occ = 3, with_md = 0;
582592
char *prefix, *rg_line = 0;
583-
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
593+
while ((c = getopt(argc, argv, "hdn:f:r:")) >= 0) {
584594
switch (c) {
585595
case 'h': break;
596+
case 'd': with_md = 1; break;
586597
case 'r':
587598
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
588599
break;
@@ -593,14 +604,14 @@ int bwa_sai2sam_se(int argc, char *argv[])
593604
}
594605

595606
if (optind + 3 > argc) {
596-
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
607+
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-d] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
597608
return 1;
598609
}
599610
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
600611
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
601612
return 1;
602613
}
603-
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
614+
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line, with_md);
604615
free(prefix);
605616
return 0;
606617
}

bwase.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ extern "C" {
1414
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
1515
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
1616
// Refine the approximate position of the sequence to an actual placement for the sequence.
17-
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
17+
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, int with_md);
1818
// Backfill certain alignment properties mainly centering around number of matches.
1919
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
2020
// Calculate the end position of a read given a certain sequence.

bwtaln.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ typedef struct {
6161
int ref_shift;
6262
bwtint_t pos;
6363
bwa_cigar_t *cigar;
64+
char *md;
6465
} bwt_multi1_t;
6566

6667
typedef struct {

0 commit comments

Comments
 (0)