Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UMI Support for hts_SuperDeduper! #261

Merged
merged 20 commits into from
Apr 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions common/src/read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ std::string ReadBase::bit_to_str(const BitSet &bits) {
return out;
}

boost::optional<BitSet> ReadBase::bitjoin(const boost::optional<BitSet> &bit1, const boost::optional<BitSet> &bit2) {
if ((bit1 == boost::none) || (bit2 == boost::none)) {
return boost::none;
}
size_t bits = bit1 -> size() + bit2 -> size();
BitSet bittag(bits);
for(size_t i = 0; i < bit1 -> size(); i++) { bittag[i] = (int)(*bit1)[i]; }
for(size_t i = 0; i < bit2 -> size(); i++) { bittag[i + (bit1 -> size())] = (int)(*bit2)[i]; }
return bittag;
}

// Read
Read Read::subread(size_t _start, size_t _length){
return Read(seq.substr(_start, _length), qual.substr(_start,_length), id);
Expand Down
30 changes: 29 additions & 1 deletion common/src/read.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
#include <boost/dynamic_bitset.hpp>
#include <boost/optional.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/join.hpp>
#include <memory>
#include <iostream>
#include <sstream>
#include <unordered_map>
#include "typedefs.h"
#include "hts_exception.h"

typedef boost::dynamic_bitset<> BitSet;
std::string strjoin(const std::vector <std::string>& v, const std::string& delim);
Expand Down Expand Up @@ -69,6 +71,31 @@ class Read {
}
const std::string get_id_first() const { return id; }
const std::string get_id_second() const { return id2; }

const std::string get_umi(const char& del) {
size_t idx = id.rfind(del);
if (idx == std::string::npos) {
throw HtsRuntimeException("Did not detected extracted UMI. Be sure hts_ExtractUMI is run prior to the current operation");
}
std::string umi = id.substr(idx + 1);

// clean up DRAGEN format if present
idx = umi.rfind('+');
if (idx == std::string::npos) {
return umi;
} else {
std::vector<std::string> result;
boost::split(result, id, boost::is_any_of(":"));
if (result.size() < 8) {
throw HtsRuntimeException("Read ID misformated. Does not have appropriate number of \":\" delimited columns for DRAGEN UMI format");
}
umi = result[7];
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

result.size() < 7 should that be < 8 ? you are getting the 8th element here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the read ID can actually only have 7 fields, the 8th field does not exist on a lot reads and is optionally for the UMI, from my understanding. That is why we I check if there are less than 7, which I understood to be the minimum.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my point is you will get undefined behavior reading result[7] if the vector is size 7, so you need to check the size here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OH, you're totally right, I don't know why I thought the check would be < 7, it is definitely < 8 here. Illumina headers would definitely need 8 fields in this situation. Making the changes now.

idx = umi.rfind('+');
umi.erase(idx);
return umi;
}
}

std::vector<std::string> get_comment() const { return comments;}
static char complement(char bp);

Expand Down Expand Up @@ -115,7 +142,7 @@ class Read {
bool getDiscard() const { return discard; }
void setDiscard() { discard = true; }
size_t getLength() const { return length; }
size_t getLengthTrue() const { return cut_R <= cut_L ? 1 : cut_R - cut_L; }
size_t getLengthTrue() const { return cut_R <= cut_L ? 1 : cut_R - cut_L; }
// number of bp that are trimmed off left side
size_t getLTrim() const { return cut_L; }
// number of bp that are trimmed off right side
Expand Down Expand Up @@ -165,6 +192,7 @@ class ReadBase {
return bit;
}
static std::string bit_to_str(const BitSet &bits);
static boost::optional<BitSet> bitjoin(const boost::optional<BitSet> &bit1, const boost::optional<BitSet> &bit2);
static boost::optional<BitSet> reverse_complement(const std::string& str, int start, int length);
virtual double avg_q_score(const size_t qual_offset = DEFAULT_QUAL_OFFSET) = 0;

Expand Down
3 changes: 3 additions & 0 deletions common/src/typedefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@ typedef std::vector<Vec> Mat;

const uint64_t QUAL_MAX = 94;
const uint64_t DEFAULT_QUAL_OFFSET = 33;
const std::vector<char> READ_OPTIONS{'F', 'R', 'B'}; // possible read options
const std::vector<char> DEL_OPTIONS{'-', '_', ':', '\0'}; // possible char options (last one is to allow unset)


#endif
17 changes: 9 additions & 8 deletions hts_ExtractUMI/src/hts_ExtractUMI.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/join.hpp>


#include <algorithm>

extern template class InputReader<SingleEndRead, SingleEndReadFastqImpl>;
Expand All @@ -25,9 +26,6 @@ extern template class InputReader<ReadBase, TabReadImpl>;
class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
public:

std::vector<char> read_options{'F', 'R', 'B'}; // possible paramters for read options
std::vector<char> del_options{'-', '_', ':', '+'}; // possible paramters for read options

// bases some parameters and allows passing UMIs between PE reads
struct UMI {
std::string seq1;
Expand All @@ -52,9 +50,9 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {

void add_extra_options(po::options_description &desc) {
desc.add_options()
("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, read_options)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE")
("read,r", po::value<char>()->default_value('F')->notifier(boost::bind(&check_values<char>, "read", _1, READ_OPTIONS)), "Read from which to extract the UMI (F = Forward, R = Reverse, B = Both), ignored if SE")
("umi-length,l", po::value<size_t>()->default_value(6)->notifier(boost::bind(&check_range<size_t>, "umi_length", _1, 1, 36)), "Total length of UMI to extract (1, 36)")
("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, del_options)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':', '+')")
("delimiter,d", po::value<char>()->default_value('_')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), "Character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')")
("qual-score,q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "qual-score", _1, 0, 10000)), "Threshold for quality score for any base within a UMI (min 1, max 10000), read pairs are discarded, default is unset")
("avg-qual-score,Q", po::value<size_t>()->default_value(0)->notifier(boost::bind(&check_range<size_t>, "avg-qual-score", _1, 0, 10000)), "Threshold for quality score average of UMI (min 1, max 10000), read pairs are discarded, default is unset")
("homopolymer,p", po::bool_switch()->default_value(false), "Remove reads with homopolymer UMIs")
Expand Down Expand Up @@ -110,7 +108,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {

void set_dragen(Read &r, const UMI &umi, const bool &se = false) {

std::string new_id;
std::string new_id;
if (se) {
new_id = umi.seq1;
} else {
Expand All @@ -126,6 +124,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
} else {
result[7] = new_id;
}

r.set_id_first(boost::algorithm::join(result, ":"));
}

Expand Down Expand Up @@ -164,7 +163,9 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
}

if ((!umi.discard)) {
if (!dragen) { r.set_id_first(r.get_id_first() + del + umi.seq1); }
if (!dragen) {
r.set_id_first(r.get_id_first() + del + umi.seq1);
}
} else {
r.setDiscard();
}
Expand Down Expand Up @@ -213,7 +214,7 @@ class ExtractUMI: public MainTemplate<TrimmingCounters, ExtractUMI> {
throw HtsIOException("UMI length (--umi-length) greater than 15 is not compatible with --DRAGEN parameter for Single End Reads");
}
extract_umi( ser->non_const_read_one(), umi, del, dragen );
if (dragen) { set_dragen( ser->non_const_read_one(), umi, true); }
if (dragen) { set_dragen( ser->non_const_read_one(), umi, true ); }
},
[&](PairedEndRead * per) {
if (dragen && (umi_length > 8)) {
Expand Down
27 changes: 23 additions & 4 deletions hts_SuperDeduper/src/hts_SuperDeduper.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <unordered_map>
#include <boost/dynamic_bitset.hpp>
#include <boost/functional/hash.hpp>
#include <boost/optional/optional_io.hpp>

extern template class InputReader<SingleEndRead, SingleEndReadFastqImpl>;
extern template class InputReader<PairedEndRead, PairedEndReadFastqImpl>;
Expand Down Expand Up @@ -114,22 +115,38 @@ class SuperDeduper: public MainTemplate<SuperDeduperCounters, SuperDeduper> {
("length,l", po::value<size_t>()->default_value(10)->notifier(boost::bind(&check_range<size_t>, "length", _1, 1, 10000)), "Length of unique ID (min 1, max 10000)")
("avg-qual-score,q", po::value<double>()->default_value(30)->notifier(boost::bind(&check_range<double>, "avg-qual-score", _1, 1, 10000)), "Avg quality score to have the read written automatically (min 1, max 10000)")
("inform-avg-qual-score,a", po::value<double>()->default_value(5)->notifier(boost::bind(&check_range<double>, "inform-avg-qual-score", _1, 1, 10000)), "Avg quality score to consider a read informative (min 1, max 10000)") //I know this says user input is a int, but is actually a double
("log_freq,e", po::value<size_t>()->default_value(1000000)->notifier(boost::bind(&check_range<size_t>, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).");
("log_freq,e", po::value<size_t>()->default_value(1000000)->notifier(boost::bind(&check_range<size_t>, "log_freq", _1, 0, 1000000000)), "Frequency in which to log duplicates in reads, can be used to create a saturation plot (0 turns off).")
("umi-delimiter,d", po::value<char>()->default_value('\0')->notifier(boost::bind(&check_values<char>, "delimiter", _1, DEL_OPTIONS)), " Enables UMI mode and specifies character to separate the UMI sequence from other fields in the Read ID (Possible options: '-', '_', ':')");
}

template <class T, class Impl>
void load_map(InputReader<T, Impl> &reader, SuperDeduperCounters& counters, std::shared_ptr<OutputWriter> pe, std::shared_ptr<OutputWriter> se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET){
void load_map(InputReader<T, Impl> &reader, SuperDeduperCounters& counters, std::shared_ptr<OutputWriter> pe, std::shared_ptr<OutputWriter> se, double avg_automatic_write, double discard_qual, size_t start, size_t length, size_t log_freq, const size_t qual_offset = DEFAULT_QUAL_OFFSET, const char del = '\0'){
double tmpAvg;
std::string umi_seq;
boost::optional<boost::dynamic_bitset<>> bit;
WriterHelper writer(pe, se, false);

while(reader.has_next()) {
auto i = reader.next();
counters.input(*i, log_freq);
tmpAvg = i->avg_q_score(qual_offset);

if (del != '\0') {
umi_seq = "";
for (const auto &r : i -> get_reads()) {
umi_seq += r -> get_umi(del);
}
bit = i -> bitjoin(i -> str_to_bit(umi_seq), i -> get_key(start, length));

} else {
bit = i -> get_key(start, length);
}


//check for existance, store or compare quality and replace:
if ( tmpAvg < discard_qual ){ // averge qual must be less than discard_qual, ignored
counters.increment_ignored();
} else if (auto key=i->get_key(start, length)) { // check for duplicate
} else if (auto key=bit) { // check for duplicate
// find faster than count on some compilers, new key
if(read_map.find(*key) == read_map.end()) { // first time the key is seen
if ( tmpAvg >= avg_automatic_write ) { // if its greater than avg_automatic_write then write out
Expand Down Expand Up @@ -167,8 +184,10 @@ class SuperDeduper: public MainTemplate<SuperDeduperCounters, SuperDeduper> {
const size_t length = vm["length"].as<size_t>();
const size_t log_freq = vm["log_freq"].as<size_t>();
const size_t qual_offset = vm["qual-offset"].as<size_t>();
const char del = vm["umi-delimiter"].as<char>();

WriterHelper writer(pe, se, false, false);
load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset);
load_map(reader, counter, pe, se, avg_automatic_write, discard_qual, start, length, log_freq, qual_offset, del);
for(auto const &i : read_map) {
if (i.second.get() != nullptr) {
counter.output(*i.second.get());
Expand Down
27 changes: 27 additions & 0 deletions hts_SuperDeduper/test/hts_testSuperDeduper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ class SDTest : public ::testing::Test {
public:
po::variables_map vm;
const std::string readData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nBAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCCTTTTGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nCAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965 1:N:0:1\nCNNNNNNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\nAAAAAF5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n";
const std::string umiData = "@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CCTTTT 1:N:0:1\nGTTTGTATTGTTTCTCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n@M03610:15:000000000-ALE1U:1:1101:17702:1965:CNNNNN 1:N:0:1\nNNNNNNNNNNNNNNNCTTAAGCATATTTCTTAATTACTTAACTAACTACCTCAAATGAAATTTGTAAAAAATCCATCAATTTTAACCGCTTGTTTGCTTGCCAGTGTCCTCGGAGGCATTGGGACACTCGCCTACTCCCCTTTTG\n+\n5CFFFF6FGFGGGGGGHHGHHFBHFFHHHHHBEGHHHHHEGGHHHHHFGFGGFEFGHBFBHHHHHEEGFFEGHHFHHEBGHHHEGHHGEGGEHHHGHHHEHHFFG3GHHHHGCDEGGGHHDGEHHGHHHFEEGGFHHHHGHHHH1\n";


SuperDeduper sd;
};

Expand All @@ -36,3 +39,27 @@ TEST_F(SDTest, HashMapLoadTest) {


};


TEST_F(SDTest, UMITest) {
std::istringstream in1(umiData);
size_t start = 5;
size_t length = 5;
double avg_auto_write = 100;
BitMap read_map;
SuperDeduperCounters counter("hts_SuperDeduper", vm);

InputReader<SingleEndRead, SingleEndReadFastqImpl> ifp(in1);
std::shared_ptr<std::ostringstream> out1(new std::ostringstream);
std::shared_ptr<HtsOfstream> hts_of(new HtsOfstream(out1));
std::shared_ptr<OutputWriter> tab(new ReadBaseOutTab(hts_of));

sd.load_map(ifp, counter, tab, tab, avg_auto_write, 3, start, length, 100, 30, ':');
std::cout << read_map.size() << '\n';
ASSERT_EQ(sd.read_map.size(), 1u);
ASSERT_EQ(counter.TotalFragmentsInput, 4u);
ASSERT_EQ(counter.Duplicate, 2u);
ASSERT_EQ(counter.Ignored, 1u);


};
1 change: 1 addition & 0 deletions regression/chain.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"read2-input": [ "./test_r2.fastq.gz"],
"start": 10,
"stats-file": "chain.json",
"umi-delimiter": "\u0000",
"uncompressed": false
}
},
Expand Down
1 change: 1 addition & 0 deletions regression/hts_SuperDeduper.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"start": 10,
"stats-file": "hts_SuperDeduper.json",
"tab-output": "hts_SuperDeduper",
"umi-delimiter": "\u0000",
"uncompressed": false
}
},
Expand Down
Loading