Skip to content

Commit

Permalink
ExtractUMI bug fix, '--add-as-tag' parameter, and separator parameter…
Browse files Browse the repository at this point in the history
…. SuperDeduper now supports umis in tag and had a column parameter
  • Loading branch information
bnjenner committed Sep 29, 2024
1 parent 36f2ea7 commit a94dd8a
Show file tree
Hide file tree
Showing 10 changed files with 230 additions and 116 deletions.
1 change: 1 addition & 0 deletions common/src/ioHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ std::vector<ReadPtr> TabReadImpl::load_read(std::istream *input) {
return reads;
}


template <>
InputReader<SingleEndRead, SingleEndReadFastqImpl>::value_type InputReader<SingleEndRead, SingleEndReadFastqImpl>::next() {
return InputReader<SingleEndRead, SingleEndReadFastqImpl>::value_type(new SingleEndRead(load_read(input)));
Expand Down
80 changes: 56 additions & 24 deletions common/src/read.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,30 +75,6 @@ class Read {
}
const std::string get_id_first() const { return id; }
const std::string get_id_second() const { return id2; }

const std::string get_umi(const char& del) {
size_t idx = id.rfind(del);
if (idx == std::string::npos) {
throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation");
}
std::string umi = id.substr(idx + 1);

// clean up DRAGEN format if present
idx = umi.rfind('+');
if (idx == std::string::npos) {
return umi;
} else {
std::vector<std::string> result;
boost::split(result, id, boost::is_any_of(":"));
if (result.size() < 8) {
throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format");
}
umi = result[7];
idx = umi.rfind('+');
umi.erase(idx);
return umi;
}
}

std::vector<std::string> get_comment() const { return comments;}
static char complement(char bp);
Expand All @@ -119,6 +95,62 @@ class Read {
std::reverse(begin(q), end(q));
return q; }

const std::string get_umi(const char &del, const size_t &col, bool &both_reads) {

size_t idx = id.rfind(del);
if (idx == std::string::npos) {
throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation");
}

std::vector<std::string> result;
boost::split(result, id, boost::is_any_of(":"));

std::string umi;
if (col == 0) {
umi = result[result.size() - 1];
} else if (result.size() > col) {
throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns to extract UMI");
} else {
umi = result[col - 1];
}

for (const auto &s : DEL_OPTIONS) {

idx = umi.find(s);

if (idx != std::string::npos) {
both_reads = true;
umi.erase(idx, 1);
return umi;
}
}
return umi;
}

const std::string get_umi_tag(bool &both_reads) {

size_t idx;
for (const auto &c : get_comment()) {
idx = c.find("RX:Z:");
if (idx != std::string::npos) {
std::string umi = c.substr(idx + 5);

for (const auto &s : DEL_OPTIONS) {

idx = umi.find(s);

if (idx != std::string::npos) {
both_reads = true;
umi.erase(idx, 1);
return umi;
}
}
return umi;
}
}
throw HtsRuntimeException("No UMI Tag found in read header. Be sure hts_ExtractUMI is run prior to the current operation");
}


void add_comment( const std::string& tag ) { if (tag != "") comments.push_back(tag);}
void join_comment( const std::vector<std::string>& new_comments, size_t offset = 0) { comments.insert(comments.end(), new_comments.begin() + offset, new_comments.end()); }
Expand Down
4 changes: 2 additions & 2 deletions common/src/typedefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ typedef std::vector<Vec> Mat;

const uint64_t QUAL_MAX = 94;
const uint64_t DEFAULT_QUAL_OFFSET = 33;
const std::vector<char> READ_OPTIONS{'F', 'R', 'B'}; // possible read options
const std::vector<char> DEL_OPTIONS{'-', '_', ':', ' '}; // possible char options (last one is to allow unset)
const std::vector<char> READ_OPTIONS{'F', 'R', 'B', 'P'}; // possible read options
const std::vector<char> DEL_OPTIONS{'-', '_', ':', '+', ' '}; // possible char options (last one is to allow unset)


#endif
111 changes: 58 additions & 53 deletions hts_ExtractUMI/src/hts_ExtractUMI.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {

// bases some parameters and allows passing UMIs between PE reads
struct UMI {
std::string seq1;
std::string seq2;
std::string seq;
std::string qual;
size_t length;
bool discard;
Expand All @@ -51,9 +50,10 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {

void add_extra_options(po::options_description &desc) {
desc.add_options()
("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE")
("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both, P = Paired End), ignored if SE")
("umi-length,l", po::value<size_t>()->default_value(6)->notifier(boost::bind(&check_range<size_t>, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)")
("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')")
("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':'). Ignored if --add-as-tag is used")
("separator,s", po::value<char>()->default_value('+')->notifier(boost::bind(&check_values<char>, "separator", _1, DEL_OPTIONS)), "Character to separate the UMI sequence when using Paired End mode (Possible options: '-', '_', ':').")
("qual-score,q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset")
("avg-qual-score,Q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset")
("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs")
Expand Down Expand Up @@ -108,44 +108,33 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
}


void set_dragen(Read &r, const UMI &umi, const bool &se = false) {

std::string new_id;
if (se) {
new_id = umi.seq1;
} else {
new_id = umi.seq1 + "+" + umi.seq2;
}

void set_dragen(Read &r, const UMI &umi) {
std::vector<std::string> result;
boost::split(result, r.get_id_first(), boost::is_any_of(":"));
if (result.size() < 7) {
throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'");
} else if (result.size() == 7) {
result.push_back(new_id);
result.push_back(umi.seq);
} else {
result[7] = new_id;
result[7] = umi.seq;
}

r.set_id_first(boost::algorithm::join(result, ":"));
}

void set_tag(Read &r, const UMI &umi, const bool &se = false) {
std::string new_id;
if (se) {
new_id = umi.seq1;
} else {
new_id = umi.seq1 + "-" + umi.seq2;
}
r.add_comment("RX:Z:" + new_id);
void set_tag(Read &r, const UMI &umi) {
r.add_comment("RX:Z:" + umi.seq);
}

void set_paired(Read &r, const UMI &umi, const char &del) {
r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID
}

void extract_umi(Read &r, UMI &umi, const char &del, const bool &dragen = false, const bool &both_reads = false) {
void extract_umi(Read &r, UMI &umi, const char &del, const char &sep, const bool &dragen = false, const bool &paired_reads = false) {

std::string tmp_seq;

if ((umi.seq1.empty() || dragen) || both_reads ) {
if ((umi.seq.empty() || dragen) || paired_reads ) {

tmp_seq = r.get_seq().substr(0, umi.length);

Expand All @@ -166,17 +155,16 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
r.setLCut(umi.length);
}

if ((!umi.seq1.empty() && dragen) || (!umi.seq1.empty() && both_reads && umi.add_as_tag)) {
umi.seq2 = tmp_seq; // necessary copy?
if ((!umi.seq.empty() && dragen) || (!umi.seq.empty() && paired_reads)) {
umi.seq = umi.seq + sep + tmp_seq;
} else {
umi.seq1 = tmp_seq; // necessary copy?
umi.seq = tmp_seq;
}

}

if (!umi.discard) {
if (!dragen && !umi.add_as_tag) {
r.set_id_first(r.get_id_first() + del + umi.seq1); // append UMI to read ID
if (!dragen && !umi.add_as_tag && !paired_reads) {
r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID
}
} else {
r.setDiscard();
Expand All @@ -191,6 +179,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
size_t umi_length = vm["umi-length"].as<size_t>();
bool add_as_tag = vm["add-as-tag"].as<bool>();
char del = vm["delimiter"].as<char>();
char sep = vm["separator"].as<char>();
size_t qual_offset = vm["qual-offset"].as<size_t>();
size_t qual_threshold = vm["qual-score"].as<size_t>();
size_t avg_qual_threshold = vm["avg-qual-score"].as<size_t>();
Expand All @@ -199,15 +188,14 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
bool dragen = vm["DRAGEN"].as<bool>();


if (dragen & (del != ':')) {
throw HtsIOException("Delimiter (--delimiter) must be ':' to be compatible with --DRAGEN parameter");
if (dragen & (del != ':' || sep != '+')) {
throw HtsIOException("Delimiter (--delimiter) must be ':' and Separator (--separator) was be '+' to be compatible with --DRAGEN parameter");
}


// init UMI struct
UMI umi = {
"", // umi R1 sequence
"", // umi R2 sequence
"", // umi sequence
"", // qual sequence
umi_length, // umi length
false, // discard status (init)
Expand All @@ -227,9 +215,9 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
if (dragen && (umi_length > 15)) {
throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads");
}
extract_umi( ser->non_const_read_one(), umi, del, dragen );
if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); }
if (add_as_tag) { set_tag( ser->non_const_read_one(), umi, true); }
extract_umi( ser->non_const_read_one(), umi, del, sep, dragen );
if (dragen) { set_dragen( ser->non_const_read_one(), umi ); }
if (add_as_tag) { set_tag( ser->non_const_read_one(), umi ); }
},
[&](PairedEndRead * per) {
if (dragen && (umi_length > 8)) {
Expand All @@ -239,22 +227,39 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
throw HtsIOException("DRAGEN parameter (--DRAGEN) is not compatible with --add-as-tag parameter");
}
if (read == 'F') {
extract_umi( per->non_const_read_one(), umi, del );
extract_umi( per->non_const_read_two(), umi, del );
extract_umi( per->non_const_read_one(), umi, del, sep );
extract_umi( per->non_const_read_two(), umi, del, sep );

} else if (read == 'R') {
extract_umi( per->non_const_read_two(), umi, del );
extract_umi( per->non_const_read_one(), umi, del );
extract_umi( per->non_const_read_two(), umi, del, sep );
extract_umi( per->non_const_read_one(), umi, del, sep );

} else if (read == 'B') {
extract_umi( per->non_const_read_one(), umi, del, dragen, true );
std::tie(umi.qual) = std::make_tuple(""); // reset umi struct
extract_umi( per->non_const_read_two(), umi, del, dragen, true );
if (dragen) {
set_dragen( per->non_const_read_one(), umi );
set_dragen( per->non_const_read_two(), umi );
} else if (add_as_tag) {
set_tag( per->non_const_read_one(), umi );
set_tag( per->non_const_read_two(), umi );
}
extract_umi( per->non_const_read_one(), umi, del, sep );
if (dragen) { set_dragen( per->non_const_read_one(), umi ); }
if (add_as_tag) { set_tag( per->non_const_read_one(), umi ); }
std::tie(umi.seq) = std::make_tuple(""); // reset umi struct
extract_umi( per->non_const_read_two(), umi, del, sep );
if (dragen) { set_dragen( per->non_const_read_two(), umi ); }
if (add_as_tag) { set_tag( per->non_const_read_two(), umi ); }
dragen = false;
add_as_tag = false;

} else if (read == 'P') {
extract_umi( per->non_const_read_one(), umi, del, sep, dragen, true );
extract_umi( per->non_const_read_two(), umi, del, sep, dragen, true );
if (!dragen && !add_as_tag) {
set_paired( per->non_const_read_one(), umi, del );
set_paired( per->non_const_read_two(), umi, del );
}
}
if (dragen) {
set_dragen( per->non_const_read_one(), umi );
set_dragen( per->non_const_read_two(), umi );
}
if (add_as_tag) {
set_tag( per->non_const_read_one(), umi );
set_tag( per->non_const_read_two(), umi );
}
}
);
Expand All @@ -263,7 +268,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
auto i = reader.next();
counters.input(*i);
i->accept(read_visit);
std::tie(umi.seq1, umi.seq2, umi.qual, umi.discard) = std::make_tuple("", "", "", false); // reset umi struct
std::tie(umi.seq, umi.qual, umi.discard) = std::make_tuple("", "", false); // reset umi struct
writer(*i);
counters.output(*i);
}
Expand Down
Loading

0 comments on commit a94dd8a

Please sign in to comment.