diff --git a/common/src/ioHandler.cpp b/common/src/ioHandler.cpp index 8cb9074..11d9106 100644 --- a/common/src/ioHandler.cpp +++ b/common/src/ioHandler.cpp @@ -206,6 +206,7 @@ std::vector TabReadImpl::load_read(std::istream *input) { return reads; } + template <> InputReader::value_type InputReader::next() { return InputReader::value_type(new SingleEndRead(load_read(input))); diff --git a/common/src/read.h b/common/src/read.h index b0618d7..ef74983 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -75,30 +75,6 @@ class Read { } const std::string get_id_first() const { return id; } const std::string get_id_second() const { return id2; } - - const std::string get_umi(const char& del) { - size_t idx = id.rfind(del); - if (idx == std::string::npos) { - throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation"); - } - std::string umi = id.substr(idx + 1); - - // clean up DRAGEN format if present - idx = umi.rfind('+'); - if (idx == std::string::npos) { - return umi; - } else { - std::vector result; - boost::split(result, id, boost::is_any_of(":")); - if (result.size() < 8) { - throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format"); - } - umi = result[7]; - idx = umi.rfind('+'); - umi.erase(idx); - return umi; - } - } std::vector get_comment() const { return comments;} static char complement(char bp); @@ -119,6 +95,62 @@ class Read { std::reverse(begin(q), end(q)); return q; } + const std::string get_umi(const char &del, const size_t &col, bool &both_reads) { + + size_t idx = id.rfind(del); + if (idx == std::string::npos) { + throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation"); + } + + std::vector result; + boost::split(result, id, boost::is_any_of(":")); + + std::string umi; + if (col == 0) { + umi = result[result.size() - 1]; + } else if (result.size() > col) { + throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns to extract UMI"); + } else { + umi = result[col - 1]; + } + + for (const auto &s : DEL_OPTIONS) { + + idx = umi.find(s); + + if (idx != std::string::npos) { + both_reads = true; + umi.erase(idx, 1); + return umi; + } + } + return umi; + } + + const std::string get_umi_tag(bool &both_reads) { + + size_t idx; + for (const auto &c : get_comment()) { + idx = c.find("RX:Z:"); + if (idx != std::string::npos) { + std::string umi = c.substr(idx + 5); + + for (const auto &s : DEL_OPTIONS) { + + idx = umi.find(s); + + if (idx != std::string::npos) { + both_reads = true; + umi.erase(idx, 1); + return umi; + } + } + return umi; + } + } + throw HtsRuntimeException("No UMI Tag found in read header. Be sure hts_ExtractUMI is run prior to the current operation"); + } + void add_comment( const std::string& tag ) { if (tag != "") comments.push_back(tag);} void join_comment( const std::vector& new_comments, size_t offset = 0) { comments.insert(comments.end(), new_comments.begin() + offset, new_comments.end()); } diff --git a/common/src/typedefs.h b/common/src/typedefs.h index d77c6c8..1ca7b89 100644 --- a/common/src/typedefs.h +++ b/common/src/typedefs.h @@ -16,8 +16,8 @@ typedef std::vector Mat; const uint64_t QUAL_MAX = 94; const uint64_t DEFAULT_QUAL_OFFSET = 33; -const std::vector READ_OPTIONS{'F', 'R', 'B'}; // possible read options -const std::vector DEL_OPTIONS{'-', '_', ':', ' '}; // possible char options (last one is to allow unset) +const std::vector READ_OPTIONS{'F', 'R', 'B', 'P'}; // possible read options +const std::vector DEL_OPTIONS{'-', '_', ':', '+', ' '}; // possible char options (last one is to allow unset) #endif diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index 575427e..4eb323a 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -28,8 +28,7 @@ class ExtractUMI: public MainTemplate { // bases some parameters and allows passing UMIs between PE reads struct UMI { - std::string seq1; - std::string seq2; + std::string seq; std::string qual; size_t length; bool discard; @@ -51,9 +50,10 @@ class ExtractUMI: public MainTemplate { void add_extra_options(po::options_description &desc) { desc.add_options() - ("read,r", po::value()->default_value('F')->notifier(boost::bind(&check_values, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE") + ("read,r", po::value()->default_value('F')->notifier(boost::bind(&check_values, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both, P = Paired End), ignored if SE") ("umi-length,l", po::value()->default_value(6)->notifier(boost::bind(&check_range, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)") - ("delimiter,d", po::value()->default_value('_')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')") + ("delimiter,d", po::value()->default_value('_')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':'). Ignored if --add-as-tag is used") + ("separator,s", po::value()->default_value('+')->notifier(boost::bind(&check_values, "separator", _1, DEL_OPTIONS)), "Character to separate the UMI sequence when using Paired End mode (Possible options: '-', '_', ':').") ("qual-score,q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset") ("avg-qual-score,Q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset") ("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs") @@ -108,44 +108,33 @@ class ExtractUMI: public MainTemplate { } - void set_dragen(Read &r, const UMI &umi, const bool &se = false) { - - std::string new_id; - if (se) { - new_id = umi.seq1; - } else { - new_id = umi.seq1 + "+" + umi.seq2; - } - + void set_dragen(Read &r, const UMI &umi) { std::vector result; boost::split(result, r.get_id_first(), boost::is_any_of(":")); if (result.size() < 7) { throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'"); } else if (result.size() == 7) { - result.push_back(new_id); + result.push_back(umi.seq); } else { - result[7] = new_id; + result[7] = umi.seq; } r.set_id_first(boost::algorithm::join(result, ":")); } - void set_tag(Read &r, const UMI &umi, const bool &se = false) { - std::string new_id; - if (se) { - new_id = umi.seq1; - } else { - new_id = umi.seq1 + "-" + umi.seq2; - } - r.add_comment("RX:Z:" + new_id); + void set_tag(Read &r, const UMI &umi) { + r.add_comment("RX:Z:" + umi.seq); } + void set_paired(Read &r, const UMI &umi, const char &del) { + r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID + } - void extract_umi(Read &r, UMI &umi, const char &del, const bool &dragen = false, const bool &both_reads = false) { + void extract_umi(Read &r, UMI &umi, const char &del, const char &sep, const bool &dragen = false, const bool &paired_reads = false) { std::string tmp_seq; - if ((umi.seq1.empty() || dragen) || both_reads ) { + if ((umi.seq.empty() || dragen) || paired_reads ) { tmp_seq = r.get_seq().substr(0, umi.length); @@ -166,17 +155,16 @@ class ExtractUMI: public MainTemplate { r.setLCut(umi.length); } - if ((!umi.seq1.empty() && dragen) || (!umi.seq1.empty() && both_reads && umi.add_as_tag)) { - umi.seq2 = tmp_seq; // necessary copy? + if ((!umi.seq.empty() && dragen) || (!umi.seq.empty() && paired_reads)) { + umi.seq = umi.seq + sep + tmp_seq; } else { - umi.seq1 = tmp_seq; // necessary copy? + umi.seq = tmp_seq; } - } if (!umi.discard) { - if (!dragen && !umi.add_as_tag) { - r.set_id_first(r.get_id_first() + del + umi.seq1); // append UMI to read ID + if (!dragen && !umi.add_as_tag && !paired_reads) { + r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID } } else { r.setDiscard(); @@ -191,6 +179,7 @@ class ExtractUMI: public MainTemplate { size_t umi_length = vm["umi-length"].as(); bool add_as_tag = vm["add-as-tag"].as(); char del = vm["delimiter"].as(); + char sep = vm["separator"].as(); size_t qual_offset = vm["qual-offset"].as(); size_t qual_threshold = vm["qual-score"].as(); size_t avg_qual_threshold = vm["avg-qual-score"].as(); @@ -199,15 +188,14 @@ class ExtractUMI: public MainTemplate { bool dragen = vm["DRAGEN"].as(); - if (dragen & (del != ':')) { - throw HtsIOException("Delimiter (--delimiter) must be ':' to be compatible with --DRAGEN parameter"); + if (dragen & (del != ':' || sep != '+')) { + throw HtsIOException("Delimiter (--delimiter) must be ':' and Separator (--separator) was be '+' to be compatible with --DRAGEN parameter"); } // init UMI struct UMI umi = { - "", // umi R1 sequence - "", // umi R2 sequence + "", // umi sequence "", // qual sequence umi_length, // umi length false, // discard status (init) @@ -227,9 +215,9 @@ class ExtractUMI: public MainTemplate { if (dragen && (umi_length > 15)) { throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads"); } - extract_umi( ser->non_const_read_one(), umi, del, dragen ); - if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); } - if (add_as_tag) { set_tag( ser->non_const_read_one(), umi, true); } + extract_umi( ser->non_const_read_one(), umi, del, sep, dragen ); + if (dragen) { set_dragen( ser->non_const_read_one(), umi ); } + if (add_as_tag) { set_tag( ser->non_const_read_one(), umi ); } }, [&](PairedEndRead * per) { if (dragen && (umi_length > 8)) { @@ -239,22 +227,39 @@ class ExtractUMI: public MainTemplate { throw HtsIOException("DRAGEN parameter (--DRAGEN) is not compatible with --add-as-tag parameter"); } if (read == 'F') { - extract_umi( per->non_const_read_one(), umi, del ); - extract_umi( per->non_const_read_two(), umi, del ); + extract_umi( per->non_const_read_one(), umi, del, sep ); + extract_umi( per->non_const_read_two(), umi, del, sep ); + } else if (read == 'R') { - extract_umi( per->non_const_read_two(), umi, del ); - extract_umi( per->non_const_read_one(), umi, del ); + extract_umi( per->non_const_read_two(), umi, del, sep ); + extract_umi( per->non_const_read_one(), umi, del, sep ); + } else if (read == 'B') { - extract_umi( per->non_const_read_one(), umi, del, dragen, true ); - std::tie(umi.qual) = std::make_tuple(""); // reset umi struct - extract_umi( per->non_const_read_two(), umi, del, dragen, true ); - if (dragen) { - set_dragen( per->non_const_read_one(), umi ); - set_dragen( per->non_const_read_two(), umi ); - } else if (add_as_tag) { - set_tag( per->non_const_read_one(), umi ); - set_tag( per->non_const_read_two(), umi ); - } + extract_umi( per->non_const_read_one(), umi, del, sep ); + if (dragen) { set_dragen( per->non_const_read_one(), umi ); } + if (add_as_tag) { set_tag( per->non_const_read_one(), umi ); } + std::tie(umi.seq) = std::make_tuple(""); // reset umi struct + extract_umi( per->non_const_read_two(), umi, del, sep ); + if (dragen) { set_dragen( per->non_const_read_two(), umi ); } + if (add_as_tag) { set_tag( per->non_const_read_two(), umi ); } + dragen = false; + add_as_tag = false; + + } else if (read == 'P') { + extract_umi( per->non_const_read_one(), umi, del, sep, dragen, true ); + extract_umi( per->non_const_read_two(), umi, del, sep, dragen, true ); + if (!dragen && !add_as_tag) { + set_paired( per->non_const_read_one(), umi, del ); + set_paired( per->non_const_read_two(), umi, del ); + } + } + if (dragen) { + set_dragen( per->non_const_read_one(), umi ); + set_dragen( per->non_const_read_two(), umi ); + } + if (add_as_tag) { + set_tag( per->non_const_read_one(), umi ); + set_tag( per->non_const_read_two(), umi ); } } ); @@ -263,7 +268,7 @@ class ExtractUMI: public MainTemplate { auto i = reader.next(); counters.input(*i); i->accept(read_visit); - std::tie(umi.seq1, umi.seq2, umi.qual, umi.discard) = std::make_tuple("", "", "", false); // reset umi struct + std::tie(umi.seq, umi.qual, umi.discard) = std::make_tuple("", "", false); // reset umi struct writer(*i); counters.output(*i); } diff --git a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp index 8ae1897..a0762bd 100644 --- a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp +++ b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp @@ -16,7 +16,6 @@ class ExtractUMITest : public ::testing::Test { // init struct for testing ExtractUMI::UMI umi = { "", // umi R1 sequence - "", // umi R2 sequence "", // qual sequence 6, // umi length false, // discard status (init) @@ -36,7 +35,7 @@ TEST_F(ExtractUMITest, BasicExtract) { // SE extract test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi, ':'); + eu.extract_umi(ser->non_const_read_one(), umi, ':', '-'); ASSERT_EQ("Read1:NAAAAA", (ser->non_const_read_one()).get_id_first()); }; @@ -51,14 +50,14 @@ TEST_F(ExtractUMITest, ReadOneExtract) { // R1 extract test std::shared_ptr tab(new ReadBaseOutTab(hts_of)); // reset umi struct - std::tie(umi.seq1, umi.qual) = std::make_tuple("", ""); + std::tie(umi.seq, umi.qual) = std::make_tuple("", ""); WriterHelper writer(tab, tab); while (ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_one(), umi, ':'); - eu.extract_umi(per->non_const_read_two(), umi, ':'); + eu.extract_umi(per->non_const_read_one(), umi, ':', '-'); + eu.extract_umi(per->non_const_read_two(), umi, ':', '-'); writer(*per); } ASSERT_EQ("Read1:NAAAAA\tGACATTAAGCAA\t############\tRead2:NAAAAA\tTTTTTTGACATTAAGCAA\t!!!!!!############\n", out1->str()); @@ -75,14 +74,14 @@ TEST_F(ExtractUMITest, ReadTwoExtract) { // R2 extract test std::shared_ptr tab(new ReadBaseOutTab(hts_of)); // reset umi struct - std::tie(umi.seq1, umi.qual) = std::make_tuple("", ""); + std::tie(umi.seq, umi.qual) = std::make_tuple("", ""); WriterHelper writer(tab, tab); while (ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_two(), umi, ':'); - eu.extract_umi(per->non_const_read_one(), umi, ':'); + eu.extract_umi(per->non_const_read_two(), umi, ':', '-'); + eu.extract_umi(per->non_const_read_one(), umi, ':', '-'); writer(*per); } ASSERT_EQ("Read1:TTTTTT\tNAAAAAGACATTAAGCAA\t!!!!!!############\tRead2:TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); @@ -98,14 +97,15 @@ TEST_F(ExtractUMITest, BothExtract) { // Both extract test std::shared_ptr tab(new ReadBaseOutTab(hts_of)); // reset umi struct - std::tie(umi.seq1, umi.qual) = std::make_tuple("", ""); + std::tie(umi.seq, umi.qual) = std::make_tuple("", ""); WriterHelper writer(tab, tab); while (ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_one(), umi, ':', false, true); - eu.extract_umi(per->non_const_read_two(), umi, ':', false, true); + eu.extract_umi(per->non_const_read_one(), umi, ':', '-', false, false); + std::tie(umi.seq) = std::make_tuple(""); // reset umi struct + eu.extract_umi(per->non_const_read_two(), umi, ':', '-', false, false); writer(*per); } ASSERT_EQ("Read1:NAAAAA\tGACATTAAGCAA\t############\tRead2:TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); @@ -118,11 +118,11 @@ TEST_F(ExtractUMITest, QualFilt) { // Hard Filter extract test InputReader ifp(in1); // reset umi struct - std::tie(umi.seq1, umi.seq2, umi.qual, umi.qual_threshold) = std::make_tuple("", "", "", 20); + std::tie(umi.seq, umi.qual, umi.qual_threshold) = std::make_tuple("", "", 20); auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi, ':'); + eu.extract_umi(ser->non_const_read_one(), umi, ':', '-'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -133,11 +133,11 @@ TEST_F(ExtractUMITest, AvgQualFilt) { // Average Filter extract test InputReader ifp(in1); // reset umi struct - std::tie(umi.seq1, umi.qual, umi.discard, umi.qual_threshold, umi.avg_qual_threshold) = std::make_tuple("", "", false, 0, 20); + std::tie(umi.seq, umi.qual, umi.discard, umi.qual_threshold, umi.avg_qual_threshold) = std::make_tuple("", "", false, 0, 20); auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi, ':'); + eu.extract_umi(ser->non_const_read_one(), umi, ':', '-'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -148,11 +148,11 @@ TEST_F(ExtractUMITest, HomopolymerFilt) { // Homopolymer filter test InputReader ifp(in2); // reset umi struct - std::tie(umi.seq1, umi.qual, umi.discard, umi.avg_qual_threshold, umi.homopolymer) = std::make_tuple("", "", false, 0, true); + std::tie(umi.seq, umi.qual, umi.discard, umi.avg_qual_threshold, umi.homopolymer) = std::make_tuple("", "", false, 0, true); auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi, ':'); + eu.extract_umi(ser->non_const_read_one(), umi, ':', '-'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -163,11 +163,11 @@ TEST_F(ExtractUMITest, NFilt) { // N filter test InputReader ifp(in1); // reset umi struct - std::tie(umi.seq1, umi.qual, umi.discard, umi.homopolymer, umi.discard_n) = std::make_tuple("", "", false, false, true); + std::tie(umi.seq, umi.qual, umi.discard, umi.homopolymer, umi.discard_n) = std::make_tuple("", "", false, false, true); auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi, ':'); + eu.extract_umi(ser->non_const_read_one(), umi, ':', '-'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -182,14 +182,14 @@ TEST_F(ExtractUMITest, DRAGEN) { // DRAGEN test std::shared_ptr tab(new ReadBaseOutTab(hts_of)); // reset umi struct - std::tie(umi.seq1, umi.qual, umi.discard, umi.homopolymer, umi.discard_n) = std::make_tuple("", "", false, false, false); + std::tie(umi.seq, umi.qual, umi.discard, umi.homopolymer, umi.discard_n) = std::make_tuple("", "", false, false, false); WriterHelper writer(tab, tab); while (ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_one(), umi, ':', true, false); - eu.extract_umi(per->non_const_read_two(), umi, ':', true, false); + eu.extract_umi(per->non_const_read_one(), umi, ':', '+', true, false); + eu.extract_umi(per->non_const_read_two(), umi, ':', '+', true, false); eu.set_dragen(per->non_const_read_one(), umi); eu.set_dragen(per->non_const_read_two(), umi); writer(*per); @@ -197,7 +197,7 @@ TEST_F(ExtractUMITest, DRAGEN) { // DRAGEN test ASSERT_EQ("A00887_1:1:2:3:4:5:6:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\tA00887_2:1:2:3:4:5:6:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); }; -TEST_F(ExtractUMITest, TagExtract) { // Both extract test +TEST_F(ExtractUMITest, TagExtract) { // Tag extract test std::istringstream in1(readData_1); std::istringstream in2(readData_2); @@ -207,18 +207,18 @@ TEST_F(ExtractUMITest, TagExtract) { // Both extract test std::shared_ptr tab(new ReadBaseOutTab(hts_of)); // reset umi struct - std::tie(umi.seq1, umi.qual, umi.add_as_tag) = std::make_tuple("", "", true); + std::tie(umi.seq, umi.qual, umi.add_as_tag) = std::make_tuple("", "", true); WriterHelper writer(tab, tab); while (ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi( per->non_const_read_one(), umi, ':', false, true ); - std::tie(umi.qual) = std::make_tuple(""); // reset umi struct - eu.extract_umi( per->non_const_read_two(), umi, ':', false, true ); + eu.extract_umi( per->non_const_read_one(), umi, ':', '-', false, true ); eu.set_tag( per->non_const_read_one(), umi ); + std::tie(umi.seq) = std::make_tuple(""); // reset umi struct + eu.extract_umi( per->non_const_read_two(), umi, ':', '-', false, true ); eu.set_tag( per->non_const_read_two(), umi ); writer(*per); } - ASSERT_EQ("Read1\tGACATTAAGCAA\t############\tRead2\tGACATTAAGCAA\t############\tRX:Z:NAAAAA-TTTTTT\tRX:Z:NAAAAA-TTTTTT\n", out1->str()); + ASSERT_EQ("Read1\tGACATTAAGCAA\t############\tRead2\tGACATTAAGCAA\t############\tRX:Z:NAAAAA\tRX:Z:TTTTTT\n", out1->str()); }; \ No newline at end of file diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index f7fd14d..ff81b0c 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,12 +116,15 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi-delimiter,d", po::value()->default_value(' ')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID. Possible options: '-', '_', ':'. Default is unset."); + ("umi-delimiter,d", po::value()->default_value(' ')->notifier(boost::bind(&check_values, "umi-delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID. Possible options: '-', '_', ':'. Default is unset.") + ("umi-column,c", po::value()->default_value(0)->notifier(boost::bind(&check_range, "umi-column", _1, 0, 100)), "Sepcifies column (1-based index) to look for UMI sequence if apart of read ID. Default is 0, refering to the last column") + ("umi-tag,C", po::bool_switch()->default_value(false), "Enables UMI mode but extracts UMI from read header tags."); } template - void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const char del = ' '){ + void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const char del = ' ', const size_t col = 0, const bool tag = false){ double tmpAvg; + bool both_reads = false; std::string umi_seq; boost::optional> bit; WriterHelper writer(pe, se, false); @@ -134,7 +137,16 @@ class SuperDeduper: public MainTemplate { if (del != ' ') { umi_seq = ""; for (const auto &r : i -> get_reads()) { - umi_seq += r -> get_umi(del); + umi_seq += r -> get_umi(del, col, both_reads); + if (both_reads) { break; } + } + bit = i -> bitjoin(i -> str_to_bit(umi_seq), i -> get_key(start, length)); + + } else if (tag) { + umi_seq = ""; + for (const auto &r : i -> get_reads()) { + umi_seq += r -> get_umi_tag(both_reads); + if (both_reads) { break; } } bit = i -> bitjoin(i -> str_to_bit(umi_seq), i -> get_key(start, length)); @@ -142,7 +154,6 @@ class SuperDeduper: public MainTemplate { bit = i -> get_key(start, length); } - //check for existance, store or compare quality and replace: if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored counters.increment_ignored(); @@ -185,9 +196,12 @@ class SuperDeduper: public MainTemplate { const size_t log_freq = vm["log_freq"].as(); const size_t qual_offset = vm["qual-offset"].as(); const char del = vm["umi-delimiter"].as(); + const size_t col = vm["umi-column"].as(); + const bool tag = vm["umi-tag"].as(); + WriterHelper writer(pe, se, false, false); - load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, del); + load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, del, col, tag); for(auto const &i : read_map) { if (i.second.get() != nullptr) { counter.output(*i.second.get()); diff --git a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp index 7c9b4fd..5163b90 100644 --- a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp +++ b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp @@ -10,8 +10,8 @@ class SDTest : public ::testing::Test { public: po::variables_map vm; const std::string readData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nBAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nCAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCNNNNNNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; - const std::string umiData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CNNNNN 1:N:0:1\nNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; - + const std::string umiData1 = "@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CNNNNN 1:N:0:1\nNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; + const std::string umiData2 = "@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT+TTTCCC 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT+TTTCCC 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT+TTTCCC 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CNNNNN+TTTCCC 1:N:0:1\nNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; SuperDeduper sd; }; @@ -41,8 +41,8 @@ TEST_F(SDTest, HashMapLoadTest) { }; -TEST_F(SDTest, UMITest) { - std::istringstream in1(umiData); +TEST_F(SDTest, UMITest_Both) { + std::istringstream in1(umiData1); size_t start = 5; size_t length = 5; double avg_auto_write = 100; @@ -63,3 +63,60 @@ TEST_F(SDTest, UMITest) { }; + +TEST_F(SDTest, UMITest_Paired) { + std::istringstream in1(umiData2); + size_t start = 5; + size_t length = 5; + double avg_auto_write = 100; + BitMap read_map; + SuperDeduperCounters counter("hts_SuperDeduper", vm); + + InputReader ifp(in1); + std::shared_ptr out1(new std::ostringstream); + std::shared_ptr hts_of(new HtsOfstream(out1)); + std::shared_ptr tab(new ReadBaseOutTab(hts_of)); + + sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100, 30, ':'); + std::cout << read_map.size() << '\n'; + ASSERT_EQ(sd.read_map.size(), 1u); + ASSERT_EQ(counter.TotalFragmentsInput, 4u); + ASSERT_EQ(counter.Duplicate, 2u); + ASSERT_EQ(counter.Ignored, 1u); + +}; + +TEST_F(SDTest, UMITest_Tag) { + std::istringstream in1(readData); + size_t start = 5; + size_t length = 5; + double avg_auto_write = 100; + BitMap read_map; + SuperDeduperCounters counter("hts_SuperDeduper", vm); + + InputReader ifp(in1); + std::shared_ptr out1(new std::ostringstream); + std::shared_ptr hts_of(new HtsOfstream(out1)); + std::shared_ptr tab(new ReadBaseOutTab(hts_of)); + + WriterHelper writer(tab, tab); + while (ifp.has_next()) { + auto i = ifp.next(); + SingleEndRead *ser = dynamic_cast(i.get()); + ser->non_const_read_one().add_comment("RX:Z:CCTTTT+TTTCCC"); + writer(*ser); + } + + std::string umiData3 = out1->str(); + std::istringstream in2(umiData3); + InputReader ift(in2); + + sd.load_map(ift, counter, tab, tab, avg_auto_write, 3, start, length, 100, 30, ' ', 0, true); + + std::cout << read_map.size() << '\n'; + ASSERT_EQ(sd.read_map.size(), 1u); + ASSERT_EQ(counter.TotalFragmentsInput, 4u); + ASSERT_EQ(counter.Duplicate, 2u); + ASSERT_EQ(counter.Ignored, 1u); + +}; \ No newline at end of file diff --git a/regression/chain.json b/regression/chain.json index 5240303..8824ddb 100644 --- a/regression/chain.json +++ b/regression/chain.json @@ -14,7 +14,9 @@ "read2-input": [ "./test_r2.fastq.gz"], "start": 10, "stats-file": "chain.json", + "umi-column": 0, "umi-delimiter": " ", + "umi-tag": false, "uncompressed": false } }, diff --git a/regression/hts_ExtractUMI.json b/regression/hts_ExtractUMI.json index b9a242e..9174f4a 100644 --- a/regression/hts_ExtractUMI.json +++ b/regression/hts_ExtractUMI.json @@ -16,6 +16,7 @@ "read": "F", "read1-input": [ "./test_r1.fastq.gz"], "read2-input": [ "./test_r2.fastq.gz"], + "separator": "+", "stats-file": "hts_ExtractUMI.json", "tab-output": "hts_ExtractUMI", "umi-length": 6, diff --git a/regression/hts_SuperDeduper.json b/regression/hts_SuperDeduper.json index 6cdaa53..fc9d1d8 100644 --- a/regression/hts_SuperDeduper.json +++ b/regression/hts_SuperDeduper.json @@ -15,7 +15,9 @@ "start": 10, "stats-file": "hts_SuperDeduper.json", "tab-output": "hts_SuperDeduper", + "umi-column": 0, "umi-delimiter": " ", + "umi-tag": false, "uncompressed": false } },