From 290aa54e858aa886124ab1b8d30a465ba1d3c770 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 13 Apr 2024 03:05:40 -0700 Subject: [PATCH 01/16] hts_SuperDeduper umi support init --- common/src/read.cpp | 10 ++++++++- common/src/read.h | 3 +++ hts_SuperDeduper/src/hts_SuperDeduper.h | 22 +++++++++++++++---- .../test/hts_testSuperDeduper.cpp | 2 +- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/common/src/read.cpp b/common/src/read.cpp index 5d272c4b..7a489ce4 100644 --- a/common/src/read.cpp +++ b/common/src/read.cpp @@ -16,7 +16,7 @@ std::string strjoin(const std::vector & v, const std::string& delim } std::string ReadBase::bit_to_str(const BitSet &bits) { - size_t str_len = bits.size()/2; + size_t str_len = bits.size(); std::string out; out.resize(str_len); size_t i = bits.size() -1; @@ -35,6 +35,14 @@ std::string ReadBase::bit_to_str(const BitSet &bits) { return out; } +boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2 ) { + size_t bits = bit1 -> size() + bit2 -> size(); + BitSet bittag(bits); + for(size_t i = 0; i < bit1 -> size(); i++) { bittag[i] = (int)(*bit1)[i]; } + for(size_t i = 0; i < bit2 -> size(); i++) { bittag[i + (bit1 -> size())] = (int)(*bit2)[i]; } + return bittag; +} + // Read Read Read::subread(size_t _start, size_t _length){ return Read(seq.substr(_start, _length), qual.substr(_start,_length), id); diff --git a/common/src/read.h b/common/src/read.h index 13c22815..5606ce3d 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -69,6 +69,8 @@ class Read { } const std::string get_id_first() const { return id; } const std::string get_id_second() const { return id2; } + const std::string get_umi() { return id.substr(id.rfind("_") + 2); } + std::vector get_comment() const { return comments;} static char complement(char bp); @@ -165,6 +167,7 @@ class ReadBase { return bit; } static std::string bit_to_str(const BitSet &bits); + boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2 ); static boost::optional reverse_complement(const std::string& str, int start, int length); virtual double avg_q_score(const size_t qual_offset = DEFAULT_QUAL_OFFSET) = 0; diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index e1051155..d212dfe0 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -11,6 +11,7 @@ #include #include #include +#include extern template class InputReader; extern template class InputReader; @@ -114,22 +115,33 @@ class SuperDeduper: public MainTemplate { ("length,l", po::value()->default_value(10)->notifier(boost::bind(&check_range, "length", _1, 1, 10000)), "Length of unique ID (min 1, max 10000)") ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double - ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off)."); + ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") + ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior)"); + } template - void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET){ + void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const bool umi = false){ double tmpAvg; + std::string umi_seq; + boost::optional> umi_bit(0); WriterHelper writer(pe, se, false); while(reader.has_next()) { auto i = reader.next(); counters.input(*i, log_freq); tmpAvg = i->avg_q_score(qual_offset); + + if (umi) { + umi_seq = ""; + for (const auto &r : i -> get_reads_non_const()) { umi_seq += r -> get_umi(); } + umi_bit = i -> str_to_bit(umi_seq); + }; + //check for existance, store or compare quality and replace: if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored counters.increment_ignored(); - } else if (auto key=i->get_key(start, length)) { // check for duplicate + } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)))) { // check for duplicate // find faster than count on some compilers, new key if(read_map.find(*key) == read_map.end()) { // first time the key is seen if ( tmpAvg >= avg_automatic_write ) { // if its greater than avg_automatic_write then write out @@ -167,8 +179,10 @@ class SuperDeduper: public MainTemplate { const size_t length = vm["length"].as(); const size_t log_freq = vm["log_freq"].as(); const size_t qual_offset = vm["qual-offset"].as(); + const bool umi = vm["umi"].as(); + WriterHelper writer(pe, se, false, false); - load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset); + load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, umi); for(auto const &i : read_map) { if (i.second.get() != nullptr) { counter.output(*i.second.get()); diff --git a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp index bb8b223b..becbdf70 100644 --- a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp +++ b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp @@ -27,7 +27,7 @@ TEST_F(SDTest, HashMapLoadTest) { std::shared_ptr hts_of(new HtsOfstream(out1)); std::shared_ptr tab(new ReadBaseOutTab(hts_of)); - sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100); + sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100, false); std::cout << read_map.size() << '\n'; ASSERT_EQ(sd.read_map.size(), 1u); ASSERT_EQ(counter.TotalFragmentsInput, 4u); From 129bb4d1a9fb0bc0d434e548f97bbd985186ec22 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 13 Apr 2024 03:39:48 -0700 Subject: [PATCH 02/16] hts_SuperDeduper UMI Dedup implemented --- common/src/read.cpp | 3 ++- common/src/read.h | 13 ++++++++++--- hts_SuperDeduper/src/hts_SuperDeduper.h | 6 +++--- hts_SuperDeduper/test/hts_testSuperDeduper.cpp | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/common/src/read.cpp b/common/src/read.cpp index 7a489ce4..ba8447f8 100644 --- a/common/src/read.cpp +++ b/common/src/read.cpp @@ -35,7 +35,8 @@ std::string ReadBase::bit_to_str(const BitSet &bits) { return out; } -boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2 ) { +boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2, const bool& include) { + if (!include) { return bit2; } size_t bits = bit1 -> size() + bit2 -> size(); BitSet bittag(bits); for(size_t i = 0; i < bit1 -> size(); i++) { bittag[i] = (int)(*bit1)[i]; } diff --git a/common/src/read.h b/common/src/read.h index 5606ce3d..49d24d30 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -9,6 +9,7 @@ #include #include #include "typedefs.h" +#include "hts_exception.h" typedef boost::dynamic_bitset<> BitSet; std::string strjoin(const std::vector & v, const std::string& delim); @@ -69,7 +70,13 @@ class Read { } const std::string get_id_first() const { return id; } const std::string get_id_second() const { return id2; } - const std::string get_umi() { return id.substr(id.rfind("_") + 2); } + + const std::string get_umi() { + if (id.find('_') != std::string::npos) { + return id.substr(id.rfind("_") + 2); + } + throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to hts_Superdeduper"); + } std::vector get_comment() const { return comments;} static char complement(char bp); @@ -117,7 +124,7 @@ class Read { bool getDiscard() const { return discard; } void setDiscard() { discard = true; } size_t getLength() const { return length; } - size_t getLengthTrue() const { return cut_R <= cut_L ? 1 : cut_R - cut_L; } + size_t getLengthTrue() const { return cut_R <= cut_L ? 1 : cut_R - cut_L; } // number of bp that are trimmed off left side size_t getLTrim() const { return cut_L; } // number of bp that are trimmed off right side @@ -167,7 +174,7 @@ class ReadBase { return bit; } static std::string bit_to_str(const BitSet &bits); - boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2 ); + static boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2, const bool &include); static boost::optional reverse_complement(const std::string& str, int start, int length); virtual double avg_q_score(const size_t qual_offset = DEFAULT_QUAL_OFFSET) = 0; diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index d212dfe0..582cd6e0 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,7 +116,7 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior)"); + ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); } @@ -124,7 +124,7 @@ class SuperDeduper: public MainTemplate { void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const bool umi = false){ double tmpAvg; std::string umi_seq; - boost::optional> umi_bit(0); + boost::optional> umi_bit; WriterHelper writer(pe, se, false); while(reader.has_next()) { @@ -141,7 +141,7 @@ class SuperDeduper: public MainTemplate { //check for existance, store or compare quality and replace: if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored counters.increment_ignored(); - } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)))) { // check for duplicate + } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)), umi)) { // check for duplicate // find faster than count on some compilers, new key if(read_map.find(*key) == read_map.end()) { // first time the key is seen if ( tmpAvg >= avg_automatic_write ) { // if its greater than avg_automatic_write then write out diff --git a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp index becbdf70..bb8b223b 100644 --- a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp +++ b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp @@ -27,7 +27,7 @@ TEST_F(SDTest, HashMapLoadTest) { std::shared_ptr hts_of(new HtsOfstream(out1)); std::shared_ptr tab(new ReadBaseOutTab(hts_of)); - sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100, false); + sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100); std::cout << read_map.size() << '\n'; ASSERT_EQ(sd.read_map.size(), 1u); ASSERT_EQ(counter.TotalFragmentsInput, 4u); From 9003e626bc683ed42a199964bdb3fdb3de17cfec Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 13 Apr 2024 03:46:29 -0700 Subject: [PATCH 03/16] umi parameter rename and update test JSON files for SuperDeduper --- hts_SuperDeduper/src/hts_SuperDeduper.h | 4 ++-- regression/chain.json | 1 + regression/hts_SuperDeduper.json | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index 582cd6e0..ab9b1e49 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,7 +116,7 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); + ("umi-mode,m", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); } @@ -179,7 +179,7 @@ class SuperDeduper: public MainTemplate { const size_t length = vm["length"].as(); const size_t log_freq = vm["log_freq"].as(); const size_t qual_offset = vm["qual-offset"].as(); - const bool umi = vm["umi"].as(); + const bool umi = vm["umi-mode"].as(); WriterHelper writer(pe, se, false, false); load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, umi); diff --git a/regression/chain.json b/regression/chain.json index 63f5b6e0..02c2d0f0 100644 --- a/regression/chain.json +++ b/regression/chain.json @@ -14,6 +14,7 @@ "read2-input": [ "./test_r2.fastq.gz"], "start": 10, "stats-file": "chain.json", + "umi-mode": false, "uncompressed": false } }, diff --git a/regression/hts_SuperDeduper.json b/regression/hts_SuperDeduper.json index 4ab6241c..f360d666 100644 --- a/regression/hts_SuperDeduper.json +++ b/regression/hts_SuperDeduper.json @@ -15,6 +15,7 @@ "start": 10, "stats-file": "hts_SuperDeduper.json", "tab-output": "hts_SuperDeduper", + "umi-mode": false, "uncompressed": false } }, From 8d36ed04d5ea3e9246b4474b5baf5851873c9b1f Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 13 Apr 2024 13:30:16 -0700 Subject: [PATCH 04/16] hts_ExtractUMI del paramter --- hts_ExtractUMI/src/hts_ExtractUMI.h | 25 ++++++++++++---------- hts_ExtractUMI/test/hts_TestExtractUMI.cpp | 22 +++++++++---------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index 1684fb9c..af3a9ffb 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -23,6 +23,7 @@ class ExtractUMI: public MainTemplate { public: std::vector read_options{'F', 'R', 'B'}; // possible paramters for read options + std::vector del_options{'-', '_', ':', '+'}; // possible paramters for read options // bases some parameters and allows passing UMIs between PE reads struct UMI { @@ -41,14 +42,15 @@ class ExtractUMI: public MainTemplate { ExtractUMI() { program_name = "hts_ExtractUMI"; app_description = - "The hts_ExtractUMI application trims a set number of bases from the 5'\n"; - app_description += " (left) end of a read and appends it to the read ID.\n"; + "The hts_ExtractUMI application trims a set number of bases from the 5' (left)\n"; + app_description += " end of a read and appends it to the end of the read ID.\n"; } void add_extra_options(po::options_description &desc) { desc.add_options() ("read,r", po::value()->default_value('F')->notifier(boost::bind(&check_values, "read", _1, read_options)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE") ("umi-length,l", po::value()->default_value(6)->notifier(boost::bind(&check_range, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)") + ("delimiter,d", po::value()->default_value(':')->notifier(boost::bind(&check_values, "delimter", _1, del_options)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':', '+')") ("qual-score,q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset") ("avg-qual-score,Q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset") ("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs") @@ -104,7 +106,7 @@ class ExtractUMI: public MainTemplate { } - void extract_umi(Read &r, UMI &umi) { + void extract_umi(Read &r, UMI &umi, const char &del) { if (umi.seq.empty()) { @@ -130,7 +132,7 @@ class ExtractUMI: public MainTemplate { } if (!umi.discard) { - r.set_id_first(r.get_id_first() + "_" + umi.seq); + r.set_id_first(r.get_id_first() + del + umi.seq); } else { r.setDiscard(); } @@ -143,6 +145,7 @@ class ExtractUMI: public MainTemplate { char read = vm["read"].as(); size_t umi_length = vm["umi-length"].as(); + char del = vm["delimiter"].as(); size_t qual_offset = vm["qual-offset"].as(); size_t qual_threshold = vm["qual-score"].as(); size_t avg_qual_threshold = vm["avg-qual-score"].as(); @@ -168,19 +171,19 @@ class ExtractUMI: public MainTemplate { auto read_visit = make_read_visitor_func( [&](SingleEndRead *ser) { - extract_umi( ser->non_const_read_one(), umi); + extract_umi( ser->non_const_read_one(), umi, del ); }, [&](PairedEndRead *per) { if (read == 'F') { - extract_umi( per->non_const_read_one(), umi ); - extract_umi( per->non_const_read_two(), umi ); + extract_umi( per->non_const_read_one(), umi, del ); + extract_umi( per->non_const_read_two(), umi, del ); } else if (read == 'R') { - extract_umi( per->non_const_read_two(), umi ); - extract_umi( per->non_const_read_one(), umi ); + extract_umi( per->non_const_read_two(), umi, del ); + extract_umi( per->non_const_read_one(), umi, del ); } else { - extract_umi( per->non_const_read_one(), umi ); + extract_umi( per->non_const_read_one(), umi, del ); std::tie(umi.seq, umi.qual) = std::make_tuple("", ""); // reset umi struct - extract_umi( per->non_const_read_two(), umi ); + extract_umi( per->non_const_read_two(), umi, del ); } } ); diff --git a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp index 3353eba6..62ba351e 100644 --- a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp +++ b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp @@ -32,7 +32,7 @@ TEST_F(ExtractUMITest, BasicExtract) { // SE extract test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi); + eu.extract_umi(ser->non_const_read_one(), umi, ':'); ASSERT_EQ("Read1_NAAAAA", (ser->non_const_read_one()).get_id_first()); }; @@ -53,11 +53,11 @@ TEST_F(ExtractUMITest, ReadOneExtract) { // R1 extract test while(ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_one(), umi); - eu.extract_umi(per->non_const_read_two(), umi); + eu.extract_umi(per->non_const_read_one(), umi, ':'); + eu.extract_umi(per->non_const_read_two(), umi, ':'); writer(*per); } - ASSERT_EQ("Read1_NAAAAA\tGACATTAAGCAA\t############\tRead2_NAAAAA\tTTTTTTGACATTAAGCAA\t!!!!!!############\n", out1->str()); + ASSERT_EQ("Read1:NAAAAA\tGACATTAAGCAA\t############\tRead2:NAAAAA\tTTTTTTGACATTAAGCAA\t!!!!!!############\n", out1->str()); }; @@ -77,11 +77,11 @@ TEST_F(ExtractUMITest, ReadTwoExtract) { // R2 extract test while(ifp.has_next()) { auto i = ifp.next(); PairedEndRead *per = dynamic_cast(i.get()); - eu.extract_umi(per->non_const_read_two(), umi); - eu.extract_umi(per->non_const_read_one(), umi); + eu.extract_umi(per->non_const_read_two(), umi, ':'); + eu.extract_umi(per->non_const_read_one(), umi, ':'); writer(*per); } - ASSERT_EQ("Read1_TTTTTT\tNAAAAAGACATTAAGCAA\t!!!!!!############\tRead2_TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); + ASSERT_EQ("Read1:TTTTTT\tNAAAAAGACATTAAGCAA\t!!!!!!############\tRead2:TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); }; @@ -95,7 +95,7 @@ TEST_F(ExtractUMITest, QualFilt) { // Hard Filter extract test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi); + eu.extract_umi(ser->non_const_read_one(), umi, ':'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -110,7 +110,7 @@ TEST_F(ExtractUMITest, AvgQualFilt) { // Average Filter extract test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi); + eu.extract_umi(ser->non_const_read_one(), umi, ':'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -125,7 +125,7 @@ TEST_F(ExtractUMITest, HomopolymerFilt) { // Homopolymer filter test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi); + eu.extract_umi(ser->non_const_read_one(), umi, ':'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; @@ -140,6 +140,6 @@ TEST_F(ExtractUMITest, NFilt) { // N filter test auto i = ifp.next(); SingleEndRead *ser = dynamic_cast(i.get()); - eu.extract_umi(ser->non_const_read_one(), umi); + eu.extract_umi(ser->non_const_read_one(), umi, ':'); ASSERT_EQ(true, (ser->non_const_read_one()).getDiscard()); }; From 45a370d9db32053d61a556bb7efe65cf75893d96 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 13 Apr 2024 13:38:12 -0700 Subject: [PATCH 05/16] Revert "umi parameter rename and update test JSON files for SuperDeduper" This reverts commit 9003e626bc683ed42a199964bdb3fdb3de17cfec. --- hts_SuperDeduper/src/hts_SuperDeduper.h | 4 ++-- regression/chain.json | 1 - regression/hts_SuperDeduper.json | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index ab9b1e49..582cd6e0 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,7 +116,7 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi-mode,m", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); + ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); } @@ -179,7 +179,7 @@ class SuperDeduper: public MainTemplate { const size_t length = vm["length"].as(); const size_t log_freq = vm["log_freq"].as(); const size_t qual_offset = vm["qual-offset"].as(); - const bool umi = vm["umi-mode"].as(); + const bool umi = vm["umi"].as(); WriterHelper writer(pe, se, false, false); load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, umi); diff --git a/regression/chain.json b/regression/chain.json index 02c2d0f0..63f5b6e0 100644 --- a/regression/chain.json +++ b/regression/chain.json @@ -14,7 +14,6 @@ "read2-input": [ "./test_r2.fastq.gz"], "start": 10, "stats-file": "chain.json", - "umi-mode": false, "uncompressed": false } }, diff --git a/regression/hts_SuperDeduper.json b/regression/hts_SuperDeduper.json index f360d666..4ab6241c 100644 --- a/regression/hts_SuperDeduper.json +++ b/regression/hts_SuperDeduper.json @@ -15,7 +15,6 @@ "start": 10, "stats-file": "hts_SuperDeduper.json", "tab-output": "hts_SuperDeduper", - "umi-mode": false, "uncompressed": false } }, From 3095dc916a0aad1f70b1fa9369bf6b0b6a0b2940 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Mon, 15 Apr 2024 05:11:25 -0700 Subject: [PATCH 06/16] Working implementation of UMI mode in hts_SuperDeduper --- common/src/read.cpp | 10 ++++--- common/src/read.h | 20 ++++++++++---- common/src/typedefs.h | 3 +++ hts_ExtractUMI/src/hts_ExtractUMI.h | 11 ++++---- hts_SuperDeduper/src/hts_SuperDeduper.h | 17 ++++++------ .../test/hts_testSuperDeduper.cpp | 27 +++++++++++++++++++ regression/chain.json | 1 + regression/hts_SuperDeduper.json | 1 + 8 files changed, 68 insertions(+), 22 deletions(-) diff --git a/common/src/read.cpp b/common/src/read.cpp index ba8447f8..cff23650 100644 --- a/common/src/read.cpp +++ b/common/src/read.cpp @@ -16,7 +16,7 @@ std::string strjoin(const std::vector & v, const std::string& delim } std::string ReadBase::bit_to_str(const BitSet &bits) { - size_t str_len = bits.size(); + size_t str_len = bits.size()/2; std::string out; out.resize(str_len); size_t i = bits.size() -1; @@ -35,8 +35,12 @@ std::string ReadBase::bit_to_str(const BitSet &bits) { return out; } -boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2, const bool& include) { - if (!include) { return bit2; } +boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2, const char& del) { + if (del == '\0') { + return bit2; + } else if (bit1 == boost::none) { + return boost::none; + } size_t bits = bit1 -> size() + bit2 -> size(); BitSet bittag(bits); for(size_t i = 0; i < bit1 -> size(); i++) { bittag[i] = (int)(*bit1)[i]; } diff --git a/common/src/read.h b/common/src/read.h index 49d24d30..d1274be1 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -71,11 +71,21 @@ class Read { const std::string get_id_first() const { return id; } const std::string get_id_second() const { return id2; } - const std::string get_umi() { - if (id.find('_') != std::string::npos) { - return id.substr(id.rfind("_") + 2); + const std::string get_umi(const char& del) { + size_t idx = id.rfind(del); + if (idx == std::string::npos) { + throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to hts_Superdeduper"); + } + std::string umi = id.substr(idx + 1); + + // clean up DRAGEN format if present + idx = umi.rfind('+'); + if (idx == std::string::npos) { + return umi; + } else { + umi.erase(idx); + return umi; } - throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to hts_Superdeduper"); } std::vector get_comment() const { return comments;} @@ -174,7 +184,7 @@ class ReadBase { return bit; } static std::string bit_to_str(const BitSet &bits); - static boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2, const bool &include); + static boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2, const char& del); static boost::optional reverse_complement(const std::string& str, int start, int length); virtual double avg_q_score(const size_t qual_offset = DEFAULT_QUAL_OFFSET) = 0; diff --git a/common/src/typedefs.h b/common/src/typedefs.h index 5816792f..c96a5426 100644 --- a/common/src/typedefs.h +++ b/common/src/typedefs.h @@ -16,5 +16,8 @@ typedef std::vector Mat; const uint64_t QUAL_MAX = 94; const uint64_t DEFAULT_QUAL_OFFSET = 33; +const std::vector READ_OPTIONS{'F', 'R', 'B'}; // possible read options +const std::vector DEL_OPTIONS{'-', '_', ':', '\0'}; // possible char options (last one is to allow unset) + #endif diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index d0501d91..c13225eb 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -22,9 +22,6 @@ extern template class InputReader; class ExtractUMI: public MainTemplate { public: - std::vector read_options{'F', 'R', 'B'}; // possible paramters for read options - std::vector del_options{'-', '_', ':', '+'}; // possible paramters for read options - // bases some parameters and allows passing UMIs between PE reads struct UMI { std::string seq1; @@ -49,9 +46,9 @@ class ExtractUMI: public MainTemplate { void add_extra_options(po::options_description &desc) { desc.add_options() - ("read,r", po::value()->default_value('F')->notifier(boost::bind(&check_values, "read", _1, read_options)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE") + ("read,r", po::value()->default_value('F')->notifier(boost::bind(&check_values, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE") ("umi-length,l", po::value()->default_value(6)->notifier(boost::bind(&check_range, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)") - ("delimiter,d", po::value()->default_value('_')->notifier(boost::bind(&check_values, "delimiter", _1, del_options)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':', '+')") + ("delimiter,d", po::value()->default_value('_')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')") ("qual-score,q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset") ("avg-qual-score,Q", po::value()->default_value(0)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset") ("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs") @@ -144,7 +141,9 @@ class ExtractUMI: public MainTemplate { } if ((!umi.discard)) { - if (!dragen) { r.set_id_first(r.get_id_first() + del + umi.seq1); } + if (!dragen) { + r.set_id_first(r.get_id_first() + del + umi.seq1); + } } else { r.setDiscard(); } diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index 582cd6e0..f2434450 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,12 +116,11 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi,u", po::bool_switch()->default_value(false), "Includes UMI in unique ID (assumes hts_ExtractUMI has been ran prior to hts_SuperDeduper)"); - + ("umi-delimiter,d", po::value()->default_value('\0')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and speicifes character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')"); } template - void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const bool umi = false){ + void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const char del = '\0'){ double tmpAvg; std::string umi_seq; boost::optional> umi_bit; @@ -132,16 +131,18 @@ class SuperDeduper: public MainTemplate { counters.input(*i, log_freq); tmpAvg = i->avg_q_score(qual_offset); - if (umi) { + if (del != '\0') { umi_seq = ""; - for (const auto &r : i -> get_reads_non_const()) { umi_seq += r -> get_umi(); } + for (const auto &r : i -> get_reads_non_const()) { + umi_seq += r -> get_umi(del); + } umi_bit = i -> str_to_bit(umi_seq); }; //check for existance, store or compare quality and replace: if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored counters.increment_ignored(); - } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)), umi)) { // check for duplicate + } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)), del)) { // check for duplicate // find faster than count on some compilers, new key if(read_map.find(*key) == read_map.end()) { // first time the key is seen if ( tmpAvg >= avg_automatic_write ) { // if its greater than avg_automatic_write then write out @@ -179,10 +180,10 @@ class SuperDeduper: public MainTemplate { const size_t length = vm["length"].as(); const size_t log_freq = vm["log_freq"].as(); const size_t qual_offset = vm["qual-offset"].as(); - const bool umi = vm["umi"].as(); + const char del = vm["umi-delimiter"].as(); WriterHelper writer(pe, se, false, false); - load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, umi); + load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, del); for(auto const &i : read_map) { if (i.second.get() != nullptr) { counter.output(*i.second.get()); diff --git a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp index bb8b223b..7c9b4fda 100644 --- a/hts_SuperDeduper/test/hts_testSuperDeduper.cpp +++ b/hts_SuperDeduper/test/hts_testSuperDeduper.cpp @@ -10,6 +10,9 @@ class SDTest : public ::testing::Test { public: po::variables_map vm; const std::string readData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nBAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nCAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCNNNNNNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; + const std::string umiData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CNNNNN 1:N:0:1\nNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n"; + + SuperDeduper sd; }; @@ -36,3 +39,27 @@ TEST_F(SDTest, HashMapLoadTest) { }; + + +TEST_F(SDTest, UMITest) { + std::istringstream in1(umiData); + size_t start = 5; + size_t length = 5; + double avg_auto_write = 100; + BitMap read_map; + SuperDeduperCounters counter("hts_SuperDeduper", vm); + + InputReader ifp(in1); + std::shared_ptr out1(new std::ostringstream); + std::shared_ptr hts_of(new HtsOfstream(out1)); + std::shared_ptr tab(new ReadBaseOutTab(hts_of)); + + sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100, 30, ':'); + std::cout << read_map.size() << '\n'; + ASSERT_EQ(sd.read_map.size(), 1u); + ASSERT_EQ(counter.TotalFragmentsInput, 4u); + ASSERT_EQ(counter.Duplicate, 2u); + ASSERT_EQ(counter.Ignored, 1u); + + +}; diff --git a/regression/chain.json b/regression/chain.json index 63f5b6e0..accbf2b4 100644 --- a/regression/chain.json +++ b/regression/chain.json @@ -14,6 +14,7 @@ "read2-input": [ "./test_r2.fastq.gz"], "start": 10, "stats-file": "chain.json", + "umi-delimiter": "\u0000", "uncompressed": false } }, diff --git a/regression/hts_SuperDeduper.json b/regression/hts_SuperDeduper.json index 4ab6241c..78c2e22e 100644 --- a/regression/hts_SuperDeduper.json +++ b/regression/hts_SuperDeduper.json @@ -15,6 +15,7 @@ "start": 10, "stats-file": "hts_SuperDeduper.json", "tab-output": "hts_SuperDeduper", + "umi-delimiter": "\u0000", "uncompressed": false } }, From ed859686662214bdaa5183e764b5bcce2366c994 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Mon, 15 Apr 2024 05:20:23 -0700 Subject: [PATCH 07/16] hts_SuperDeduper help typo fix --- hts_SuperDeduper/src/hts_SuperDeduper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index f2434450..bfde5c6f 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,7 +116,7 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi-delimiter,d", po::value()->default_value('\0')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and speicifes character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')"); + ("umi-delimiter,d", po::value()->default_value('\0')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')"); } template From 68af22b7b5adedc3dd63eb8ba1b3c2baa74e3e8c Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Tue, 16 Apr 2024 00:13:19 -0700 Subject: [PATCH 08/16] bitjoin() fix, watches for key to return boost::none --- common/src/read.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/read.cpp b/common/src/read.cpp index cff23650..acba578c 100644 --- a/common/src/read.cpp +++ b/common/src/read.cpp @@ -38,7 +38,7 @@ std::string ReadBase::bit_to_str(const BitSet &bits) { boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2, const char& del) { if (del == '\0') { return bit2; - } else if (bit1 == boost::none) { + } else if ((bit1 == boost::none) || (bit2 == boost::none)) { return boost::none; } size_t bits = bit1 -> size() + bit2 -> size(); From 0a4010c9d31f5649a49ff36284c4cef896df0722 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Thu, 18 Apr 2024 19:33:39 -0700 Subject: [PATCH 09/16] update hts_ExtractUMI --- hts_ExtractUMI/src/hts_ExtractUMI.h | 13 ++++++++++++- hts_ExtractUMI/test/hts_TestExtractUMI.cpp | 10 ++++++---- hts_SuperDeduper/src/hts_SuperDeduper.h | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index c13225eb..dba53d1c 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include extern template class InputReader; @@ -103,7 +105,16 @@ class ExtractUMI: public MainTemplate { void set_dragen(Read &r, const UMI &umi) { - r.set_id_first(r.get_id_first() + ":" + umi.seq1 + "+" + umi.seq2); + std::vector result; + boost::split(result, r.get_id_first(), boost::is_any_of(":")); + if (result.size() < 7) { + throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'"); + } if (result.size() == 7) { + result.push_back(umi.seq1 + "+" + umi.seq2); + } else { + result[7] = umi.seq1 + "+" + umi.seq2; + } + r.set_id_first(boost::algorithm::join(result, ":")); } diff --git a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp index a6bfc3a0..678c8c6e 100644 --- a/hts_ExtractUMI/test/hts_TestExtractUMI.cpp +++ b/hts_ExtractUMI/test/hts_TestExtractUMI.cpp @@ -8,6 +8,8 @@ class ExtractUMITest : public ::testing::Test { const std::string readData_1 = "@Read1\nNAAAAAGACATTAAGCAA\n+\n!!!!!!############\n"; const std::string readData_2 = "@Read2\nTTTTTTGACATTAAGCAA\n+\n!!!!!!############\n"; + const std::string dragenData_1 = "@A00887_1:1:2:3:4:5:6\nNAAAAAGACATTAAGCAA\n+\n!!!!!!############\n"; + const std::string dragenData_2 = "@A00887_2:1:2:3:4:5:6\nTTTTTTGACATTAAGCAA\n+\n!!!!!!############\n"; ExtractUMI eu; @@ -169,9 +171,9 @@ TEST_F(ExtractUMITest, NFilt) { // N filter test }; -TEST_F(ExtractUMITest, DRAGEN) { // TRAGEN test - std::istringstream in1(readData_1); - std::istringstream in2(readData_2); +TEST_F(ExtractUMITest, DRAGEN) { // DRAGEN test + std::istringstream in1(dragenData_1); + std::istringstream in2(dragenData_2); InputReader ifp(in1, in2); std::shared_ptr out1(new std::ostringstream); @@ -191,5 +193,5 @@ TEST_F(ExtractUMITest, DRAGEN) { // TRAGEN test eu.set_dragen(per->non_const_read_two(), umi); writer(*per); } - ASSERT_EQ("Read1:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\tRead2:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); + ASSERT_EQ("A00887_1:1:2:3:4:5:6:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\tA00887_2:1:2:3:4:5:6:NAAAAA+TTTTTT\tGACATTAAGCAA\t############\n", out1->str()); }; \ No newline at end of file diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index bfde5c6f..6263820b 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -116,7 +116,7 @@ class SuperDeduper: public MainTemplate { ("avg-qual-score,q", po::value()->default_value(30)->notifier(boost::bind(&check_range, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)") ("inform-avg-qual-score,a", po::value()->default_value(5)->notifier(boost::bind(&check_range, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double ("log_freq,e", po::value()->default_value(1000000)->notifier(boost::bind(&check_range, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).") - ("umi-delimiter,d", po::value()->default_value('\0')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), "Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')"); + ("umi-delimiter,d", po::value()->default_value('\0')->notifier(boost::bind(&check_values, "delimiter", _1, DEL_OPTIONS)), " Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')"); } template From 756b809a329d3a66d77c372ee5b965ec9ffc10e7 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Thu, 18 Apr 2024 22:05:37 -0700 Subject: [PATCH 10/16] hts_ExtractUMI hotfix --- hts_ExtractUMI/src/hts_ExtractUMI.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index dba53d1c..7f8f3062 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -104,15 +104,23 @@ class ExtractUMI: public MainTemplate { } - void set_dragen(Read &r, const UMI &umi) { + void set_dragen(Read &r, const UMI &umi, const bool &se = false) { + + std::string new_id; + if (se) { + new_id = umi.seq1; + } else { + new_id = umi.seq1 + "+" + umi.seq2; + } + std::vector result; boost::split(result, r.get_id_first(), boost::is_any_of(":")); if (result.size() < 7) { throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'"); } if (result.size() == 7) { - result.push_back(umi.seq1 + "+" + umi.seq2); + result.push_back(new_id); } else { - result[7] = umi.seq1 + "+" + umi.seq2; + result[7] = new_id; } r.set_id_first(boost::algorithm::join(result, ":")); } @@ -203,6 +211,7 @@ class ExtractUMI: public MainTemplate { throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads"); } extract_umi( ser->non_const_read_one(), umi, del ); + if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); } }, [&](PairedEndRead * per) { if (dragen && (umi_length > 8)) { From 61465bc85de613366d2122d1d079368beed06067 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Thu, 25 Apr 2024 15:01:50 -0700 Subject: [PATCH 11/16] hts_ExtractUMI typo --- hts_ExtractUMI/src/hts_ExtractUMI.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index 49c040e6..09e92a4f 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -13,9 +13,8 @@ #include #include #include -<<<<<<< HEAD -======= ->>>>>>> hts_SuperDeduper + + #include extern template class InputReader; From f52f4d3ae9fab5dfa997389de2b7394c9b014826 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Thu, 25 Apr 2024 15:09:37 -0700 Subject: [PATCH 12/16] hts_ExtractUMI merge conflict resolve --- hts_ExtractUMI/src/hts_ExtractUMI.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index 09e92a4f..75f1da31 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -108,11 +108,7 @@ class ExtractUMI: public MainTemplate { void set_dragen(Read &r, const UMI &umi, const bool &se = false) { -<<<<<<< HEAD - std::string new_id; -======= std::string new_id; ->>>>>>> hts_SuperDeduper if (se) { new_id = umi.seq1; } else { @@ -123,20 +119,13 @@ class ExtractUMI: public MainTemplate { boost::split(result, r.get_id_first(), boost::is_any_of(":")); if (result.size() < 7) { throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'"); -<<<<<<< HEAD } else if (result.size() == 7) { -======= - } if (result.size() == 7) { ->>>>>>> hts_SuperDeduper result.push_back(new_id); } else { result[7] = new_id; } -<<<<<<< HEAD + r.set_id_first(boost::algorithm::join(result, ":")); -======= - r.set_id_first(boost::algorithm::join(result, ":")); ->>>>>>> hts_SuperDeduper } @@ -224,13 +213,8 @@ class ExtractUMI: public MainTemplate { if (dragen && (umi_length > 15)) { throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads"); } -<<<<<<< HEAD - extract_umi( ser->non_const_read_one(), umi, del, dragen ); - if (dragen) { set_dragen( ser->non_const_read_one(), umi, true); } -======= extract_umi( ser->non_const_read_one(), umi, del ); if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); } ->>>>>>> hts_SuperDeduper }, [&](PairedEndRead * per) { if (dragen && (umi_length > 8)) { From 0d89a74fe95305d21a8a97b93add63f42dfaa0e7 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Thu, 25 Apr 2024 15:38:47 -0700 Subject: [PATCH 13/16] More explicit extraction of UMI for DRAGEN format, hts_SuperDeduper --- common/src/read.h | 8 ++++++++ hts_ExtractUMI/src/hts_ExtractUMI.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/common/src/read.h b/common/src/read.h index d1274be1..59c2f630 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,13 @@ class Read { if (idx == std::string::npos) { return umi; } else { + std::vector result; + boost::split(result, id, boost::is_any_of(":")); + if (result.size() < 7) { + throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format"); + } + umi = result[7]; + idx = umi.rfind('+'); umi.erase(idx); return umi; } diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h index 75f1da31..d6669121 100644 --- a/hts_ExtractUMI/src/hts_ExtractUMI.h +++ b/hts_ExtractUMI/src/hts_ExtractUMI.h @@ -213,7 +213,7 @@ class ExtractUMI: public MainTemplate { if (dragen && (umi_length > 15)) { throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads"); } - extract_umi( ser->non_const_read_one(), umi, del ); + extract_umi( ser->non_const_read_one(), umi, del, dragen ); if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); } }, [&](PairedEndRead * per) { From 370a4366668a710380c8616945a7830248964c4e Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 27 Apr 2024 14:57:19 -0700 Subject: [PATCH 14/16] Error message update, switch to const get_reads() --- common/src/read.h | 2 +- hts_SuperDeduper/src/hts_SuperDeduper.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/read.h b/common/src/read.h index 59c2f630..c2754ea6 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -75,7 +75,7 @@ class Read { const std::string get_umi(const char& del) { size_t idx = id.rfind(del); if (idx == std::string::npos) { - throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to hts_Superdeduper"); + throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation"); } std::string umi = id.substr(idx + 1); diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index 6263820b..a8fe67a9 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -133,7 +133,7 @@ class SuperDeduper: public MainTemplate { if (del != '\0') { umi_seq = ""; - for (const auto &r : i -> get_reads_non_const()) { + for (const auto &r : i -> get_reads()) { umi_seq += r -> get_umi(del); } umi_bit = i -> str_to_bit(umi_seq); From 8987f88a69d3d379d9f1045c955026582efa2ece Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 27 Apr 2024 16:57:07 -0700 Subject: [PATCH 15/16] get_umi(), result.size() < 8 --- common/src/read.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/read.h b/common/src/read.h index c2754ea6..24d80d5e 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -86,7 +86,7 @@ class Read { } else { std::vector result; boost::split(result, id, boost::is_any_of(":")); - if (result.size() < 7) { + if (result.size() < 8) { throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format"); } umi = result[7]; From 1b3a7ee30111e03fcd7ec4d0eeba41d60e9f9881 Mon Sep 17 00:00:00 2001 From: Bradley Jenner Date: Sat, 27 Apr 2024 17:14:05 -0700 Subject: [PATCH 16/16] remove SuperDeduper logic from bit_join() --- common/src/read.cpp | 6 ++---- common/src/read.h | 2 +- hts_SuperDeduper/src/hts_SuperDeduper.h | 12 ++++++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/common/src/read.cpp b/common/src/read.cpp index acba578c..fd00e0b4 100644 --- a/common/src/read.cpp +++ b/common/src/read.cpp @@ -35,10 +35,8 @@ std::string ReadBase::bit_to_str(const BitSet &bits) { return out; } -boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2, const char& del) { - if (del == '\0') { - return bit2; - } else if ((bit1 == boost::none) || (bit2 == boost::none)) { +boost::optional ReadBase::bitjoin(const boost::optional &bit1, const boost::optional &bit2) { + if ((bit1 == boost::none) || (bit2 == boost::none)) { return boost::none; } size_t bits = bit1 -> size() + bit2 -> size(); diff --git a/common/src/read.h b/common/src/read.h index 24d80d5e..ddd4bc30 100644 --- a/common/src/read.h +++ b/common/src/read.h @@ -192,7 +192,7 @@ class ReadBase { return bit; } static std::string bit_to_str(const BitSet &bits); - static boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2, const char& del); + static boost::optional bitjoin(const boost::optional &bit1, const boost::optional &bit2); static boost::optional reverse_complement(const std::string& str, int start, int length); virtual double avg_q_score(const size_t qual_offset = DEFAULT_QUAL_OFFSET) = 0; diff --git a/hts_SuperDeduper/src/hts_SuperDeduper.h b/hts_SuperDeduper/src/hts_SuperDeduper.h index a8fe67a9..8fb198af 100644 --- a/hts_SuperDeduper/src/hts_SuperDeduper.h +++ b/hts_SuperDeduper/src/hts_SuperDeduper.h @@ -123,7 +123,7 @@ class SuperDeduper: public MainTemplate { void load_map(InputReader &reader, SuperDeduperCounters& counters, std::shared_ptr pe, std::shared_ptr se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const char del = '\0'){ double tmpAvg; std::string umi_seq; - boost::optional> umi_bit; + boost::optional> bit; WriterHelper writer(pe, se, false); while(reader.has_next()) { @@ -136,13 +136,17 @@ class SuperDeduper: public MainTemplate { for (const auto &r : i -> get_reads()) { umi_seq += r -> get_umi(del); } - umi_bit = i -> str_to_bit(umi_seq); - }; + bit = i -> bitjoin(i -> str_to_bit(umi_seq), i -> get_key(start, length)); + + } else { + bit = i -> get_key(start, length); + } + //check for existance, store or compare quality and replace: if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored counters.increment_ignored(); - } else if (auto key=i->bitjoin(umi_bit, (i -> get_key(start, length)), del)) { // check for duplicate + } else if (auto key=bit) { // check for duplicate // find faster than count on some compilers, new key if(read_map.find(*key) == read_map.end()) { // first time the key is seen if ( tmpAvg >= avg_automatic_write ) { // if its greater than avg_automatic_write then write out