ExtractUMI bug fix, '--add-as-tag' parameter, and separator parameter…

…. SuperDeduper now supports umis in tag and had a column parameter
s4hts · Sep 29, 2024 · a94dd8a · a94dd8a
1 parent 36f2ea7
commit a94dd8a
Show file tree

Hide file tree

Showing 10 changed files with 230 additions and 116 deletions.
diff --git a/common/src/ioHandler.cpp b/common/src/ioHandler.cpp
@@ -206,6 +206,7 @@ std::vector<ReadPtr> TabReadImpl::load_read(std::istream *input) {
     return reads;
 }
 
+
 template <>
 InputReader<SingleEndRead, SingleEndReadFastqImpl>::value_type InputReader<SingleEndRead, SingleEndReadFastqImpl>::next() {
     return InputReader<SingleEndRead, SingleEndReadFastqImpl>::value_type(new SingleEndRead(load_read(input)));

diff --git a/common/src/read.h b/common/src/read.h
@@ -75,30 +75,6 @@ class Read {
     }
     const std::string get_id_first() const { return id; }
     const std::string get_id_second() const { return id2; }
-
-    const std::string get_umi(const char& del) {
-        size_t idx = id.rfind(del);
-        if (idx == std::string::npos) {
-            throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation");
-        }
-        std::string umi = id.substr(idx + 1);
-
-        // clean up DRAGEN format if present
-        idx = umi.rfind('+');
-        if (idx == std::string::npos) { 
-            return umi; 
-        } else {
-            std::vector<std::string> result;
-            boost::split(result, id, boost::is_any_of(":"));
-            if (result.size() < 8) {
-                throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format");
-            }
-            umi = result[7];
-            idx = umi.rfind('+');
-            umi.erase(idx);
-            return umi;
-        }
-    }
 
     std::vector<std::string> get_comment() const { return comments;}
     static char complement(char bp);
@@ -119,6 +95,62 @@ class Read {
                                             std::reverse(begin(q), end(q));
                                             return q;  }
 
+    const std::string get_umi(const char &del, const size_t &col, bool &both_reads) {
+
+        size_t idx = id.rfind(del);
+        if (idx == std::string::npos) {
+            throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation");
+        }
+
+        std::vector<std::string> result;
+        boost::split(result, id, boost::is_any_of(":"));
+
+        std::string umi;
+        if (col == 0) {
+            umi = result[result.size() - 1];
+        } else if (result.size() > col) {
+            throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns to extract UMI");
+        } else {
+            umi = result[col - 1];
+        }
+
+        for (const auto &s : DEL_OPTIONS) {
+
+            idx = umi.find(s);
+
+            if (idx != std::string::npos) {
+                both_reads = true;
+                umi.erase(idx, 1);
+                return umi;
+            }
+        }
+        return umi;
+    }
+
+    const std::string get_umi_tag(bool &both_reads) {
+
+        size_t idx;
+        for (const auto &c : get_comment()) {
+            idx = c.find("RX:Z:");
+            if (idx != std::string::npos) {
+                std::string umi = c.substr(idx + 5);
+
+                for (const auto &s : DEL_OPTIONS) {
+
+                    idx = umi.find(s);
+
+                    if (idx != std::string::npos) {
+                        both_reads = true;
+                        umi.erase(idx, 1);
+                        return umi;
+                    }
+                }
+                return umi;
+            }
+        }
+        throw HtsRuntimeException("No UMI Tag found in read header. Be sure hts_ExtractUMI is run prior to the current operation");
+    }
+
 
     void add_comment( const std::string& tag ) { if (tag != "") comments.push_back(tag);}
     void join_comment( const std::vector<std::string>& new_comments, size_t offset = 0) { comments.insert(comments.end(), new_comments.begin() + offset, new_comments.end()); }

diff --git a/common/src/typedefs.h b/common/src/typedefs.h
@@ -16,8 +16,8 @@ typedef std::vector<Vec> Mat;
 
 const uint64_t QUAL_MAX = 94;
 const uint64_t DEFAULT_QUAL_OFFSET = 33;
-const std::vector<char> READ_OPTIONS{'F', 'R', 'B'}; // possible read options
-const std::vector<char> DEL_OPTIONS{'-', '_', ':', ' '}; // possible char options (last one is to allow unset)
+const std::vector<char> READ_OPTIONS{'F', 'R', 'B', 'P'}; // possible read options
+const std::vector<char> DEL_OPTIONS{'-', '_', ':', '+', ' '}; // possible char options (last one is to allow unset)
 
 
 #endif
diff --git a/hts_ExtractUMI/src/hts_ExtractUMI.h b/hts_ExtractUMI/src/hts_ExtractUMI.h
@@ -28,8 +28,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
 
     // bases some parameters and allows passing UMIs between PE reads
     struct UMI {
-        std::string seq1;
-        std::string seq2;
+        std::string seq;
         std::string qual;
         size_t length;
         bool discard;
@@ -51,9 +50,10 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
 
     void add_extra_options(po::options_description &desc) {
         desc.add_options()
-        ("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE")
+        ("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both, P = Paired End), ignored if SE")
         ("umi-length,l", po::value<size_t>()->default_value(6)->notifier(boost::bind(&check_range<size_t>, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)")
-        ("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')")
+        ("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':'). Ignored if --add-as-tag is used")
+        ("separator,s", po::value<char>()->default_value('+')->notifier(boost::bind(&check_values<char>, "separator", _1, DEL_OPTIONS)), "Character to separate the UMI sequence when using Paired End mode (Possible options: '-', '_', ':').")
         ("qual-score,q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset")
         ("avg-qual-score,Q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset")
         ("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs")
@@ -108,44 +108,33 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
     }
 
 
-    void set_dragen(Read &r, const UMI &umi, const bool &se = false) {
-
-        std::string new_id;
-        if (se) {
-            new_id = umi.seq1;
-        } else {
-            new_id = umi.seq1 + "+" + umi.seq2;
-        }
-
+    void set_dragen(Read &r, const UMI &umi) {
         std::vector<std::string> result;
         boost::split(result, r.get_id_first(), boost::is_any_of(":"));
         if (result.size() < 7) {
             throw HtsIOException("DRAGEN read ID format not found. Must have at least 7 fields deliminated by a ':'");               
         } else if (result.size() == 7) {
-            result.push_back(new_id);
+            result.push_back(umi.seq);
         } else {
-            result[7] = new_id;
+            result[7] = umi.seq;
         }
 
         r.set_id_first(boost::algorithm::join(result, ":"));
     }
 
-    void set_tag(Read &r, const UMI &umi, const bool &se = false) {
-        std::string new_id;
-        if (se) {
-            new_id = umi.seq1;
-        } else {
-            new_id = umi.seq1 + "-" + umi.seq2;
-        }
-        r.add_comment("RX:Z:" + new_id);
+    void set_tag(Read &r, const UMI &umi) {
+        r.add_comment("RX:Z:" + umi.seq);
     }
 
+    void set_paired(Read &r, const UMI &umi, const char &del) {
+        r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID
+    }
 
-    void extract_umi(Read &r, UMI &umi, const char &del, const bool &dragen = false, const bool &both_reads = false) {
+    void extract_umi(Read &r, UMI &umi, const char &del, const char &sep, const bool &dragen = false, const bool &paired_reads = false) {
 
         std::string tmp_seq;
 
-        if ((umi.seq1.empty() || dragen) || both_reads ) {
+        if ((umi.seq.empty() || dragen) || paired_reads ) {
 
             tmp_seq = r.get_seq().substr(0, umi.length);
 
@@ -166,17 +155,16 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
                 r.setLCut(umi.length);
             }
 
-            if ((!umi.seq1.empty() && dragen) || (!umi.seq1.empty() && both_reads && umi.add_as_tag)) {
-                umi.seq2 = tmp_seq; // necessary copy?
+            if ((!umi.seq.empty() && dragen) || (!umi.seq.empty() && paired_reads)) {
+                umi.seq = umi.seq + sep + tmp_seq;
             } else {
-                umi.seq1 = tmp_seq; // necessary copy?
+                umi.seq = tmp_seq;
             }
-
         }
 
         if (!umi.discard) {
-            if (!dragen && !umi.add_as_tag) {
-                r.set_id_first(r.get_id_first() + del + umi.seq1); // append UMI to read ID
+            if (!dragen && !umi.add_as_tag && !paired_reads) {
+                r.set_id_first(r.get_id_first() + del + umi.seq); // append UMI to read ID
             }
         } else {
             r.setDiscard();
@@ -191,6 +179,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
         size_t umi_length = vm["umi-length"].as<size_t>();
         bool add_as_tag = vm["add-as-tag"].as<bool>();
         char del = vm["delimiter"].as<char>();
+        char sep = vm["separator"].as<char>();
         size_t qual_offset = vm["qual-offset"].as<size_t>();
         size_t qual_threshold = vm["qual-score"].as<size_t>();
         size_t avg_qual_threshold = vm["avg-qual-score"].as<size_t>();
@@ -199,15 +188,14 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
         bool dragen = vm["DRAGEN"].as<bool>();
 
 
-        if (dragen & (del != ':')) {
-            throw HtsIOException("Delimiter (--delimiter) must be ':' to be compatible with --DRAGEN parameter");    
+        if (dragen & (del != ':' || sep != '+')) {
+            throw HtsIOException("Delimiter (--delimiter) must be ':' and Separator (--separator) was be '+' to be compatible with --DRAGEN parameter");    
         }
 
 
         // init UMI struct
         UMI umi = {
-            "",                     // umi R1 sequence
-            "",                     // umi R2 sequence
+            "",                     // umi sequence
             "",                     // qual sequence
             umi_length,             // umi length
             false,                  // discard status (init)
@@ -227,9 +215,9 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
             if (dragen && (umi_length > 15)) {
                 throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads");    
             }
-            extract_umi( ser->non_const_read_one(), umi, del, dragen );
-            if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); }
-            if (add_as_tag) { set_tag( ser->non_const_read_one(), umi, true); }
+            extract_umi( ser->non_const_read_one(), umi, del, sep, dragen );
+            if (dragen) { set_dragen( ser->non_const_read_one(), umi ); }
+            if (add_as_tag) { set_tag( ser->non_const_read_one(), umi ); }
         },
         [&](PairedEndRead * per) {
             if (dragen && (umi_length > 8)) {
@@ -239,22 +227,39 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
                 throw HtsIOException("DRAGEN parameter (--DRAGEN) is not compatible with --add-as-tag parameter");    
             }
             if (read == 'F') {
-                extract_umi( per->non_const_read_one(), umi, del );
-                extract_umi( per->non_const_read_two(), umi, del );
+                extract_umi( per->non_const_read_one(), umi, del, sep );
+                extract_umi( per->non_const_read_two(), umi, del, sep );
+
             } else if (read == 'R') {
-                extract_umi( per->non_const_read_two(), umi, del );
-                extract_umi( per->non_const_read_one(), umi, del );
+                extract_umi( per->non_const_read_two(), umi, del, sep );
+                extract_umi( per->non_const_read_one(), umi, del, sep );
+
             } else if (read == 'B') {
-                extract_umi( per->non_const_read_one(), umi, del, dragen, true );
-                std::tie(umi.qual) = std::make_tuple(""); // reset umi struct
-                extract_umi( per->non_const_read_two(), umi, del, dragen, true );
-                if (dragen) {
-                    set_dragen( per->non_const_read_one(), umi );
-                    set_dragen( per->non_const_read_two(), umi );
-                } else if (add_as_tag) {
-                    set_tag( per->non_const_read_one(), umi );
-                    set_tag( per->non_const_read_two(), umi );
-                }
+                extract_umi( per->non_const_read_one(), umi, del, sep );
+                if (dragen) { set_dragen( per->non_const_read_one(), umi ); }
+                if (add_as_tag) { set_tag( per->non_const_read_one(), umi ); }
+                std::tie(umi.seq) = std::make_tuple(""); // reset umi struct
+                extract_umi( per->non_const_read_two(), umi, del, sep );
+                if (dragen) { set_dragen( per->non_const_read_two(), umi ); }
+                if (add_as_tag) { set_tag( per->non_const_read_two(), umi ); }
+                dragen = false;
+                add_as_tag = false; 
+
+            } else if (read == 'P') {
+                extract_umi( per->non_const_read_one(), umi, del, sep, dragen, true );
+                extract_umi( per->non_const_read_two(), umi, del, sep, dragen, true );
+                if (!dragen && !add_as_tag) {
+                    set_paired( per->non_const_read_one(), umi, del ); 
+                    set_paired( per->non_const_read_two(), umi, del ); 
+                }   
+            }
+            if (dragen) {
+                set_dragen( per->non_const_read_one(), umi );
+                set_dragen( per->non_const_read_two(), umi );
+            } 
+            if (add_as_tag) {
+                set_tag( per->non_const_read_one(), umi );
+                set_tag( per->non_const_read_two(), umi );
             }
         }
 );
@@ -263,7 +268,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
             auto i = reader.next();
             counters.input(*i);
             i->accept(read_visit);
-            std::tie(umi.seq1, umi.seq2, umi.qual, umi.discard) = std::make_tuple("", "", "", false); // reset umi struct
+            std::tie(umi.seq, umi.qual, umi.discard) = std::make_tuple("", "", false); // reset umi struct
             writer(*i);
             counters.output(*i);
         }