diff --git a/src/colord/encoder.cpp b/src/colord/encoder.cpp
index 1415655..db7c08e 100644
--- a/src/colord/encoder.cpp
+++ b/src/colord/encoder.cpp
@@ -204,50 +204,192 @@ class CMmersHashMapLP
 		mmers.insert_fast(x);
 	}
 
-	//for HiFi
-	explicit CMmersHashMapLP(const read_t& read, uint32_t m, hash_set_type& include, CBloomFilter& bloom_mmers) :
-//		mmers(~0ull, static_cast<size_t>(include.size() / 0.4), 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{})
-		mmers(~0ull, 16, 0.8, std::equal_to<anchor_type>{}, MurMur64Hash{})
+	void GetIntersection(CMmersHashMapLP& inParam,
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& outThis,
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& outParam
+	)
 	{
+		outThis.clear();
+		outParam.clear();
+		hash_map_type& in_mmers_this = this->mmers;
+		hash_map_type& in_mmers_param = inParam.mmers;
+
+		hash_map_type* _smaller_input = &in_mmers_this;
+		hash_map_type* _bigger_input = &in_mmers_param;
+
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>* _smaller_output = &outThis;
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>* _bigger_output = &outParam;
+		if (in_mmers_this.size() > in_mmers_param.size())
+		{
+			std::swap(_smaller_input, _bigger_input);
+			std::swap(_smaller_output, _bigger_output);
+		}
+		hash_map_type& smaller_input = *_smaller_input;
+		hash_map_type& bigger_input = *_bigger_input;
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& smaller_output = *_smaller_output;
+		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& bigger_output = *_bigger_output;
+
+		smaller_output.reserve(smaller_input.size());
+		bigger_output.reserve(smaller_input.size());
+
+		if (smaller_input.size() == smaller_input.size_unique()) //only unique values, it may be done simpler
+		{
+			for (auto outerIt = smaller_input.begin(); outerIt != smaller_input.end(); ++outerIt)
+			{
+				auto key = outerIt->first;
+				if (auto it = bigger_input.find(key); it != bigger_input.local_end())
+				{
+					//auto smaller_it = smaller_input.find(key); //must be successful, but it seems to be unnecessary, maybe there is a better way
+/*					auto smaller_it = smaller_input.local_begin(outerIt); //it is guaranteed that local_begin works well in the case of unique records
+					smaller_output.emplace_back(key, std::vector<uint32_t>{});
+					for (; smaller_it != smaller_input.local_end(); ++smaller_it)
+						smaller_output.back().second.push_back(static_cast<uint32_t>(smaller_it->second));*/
+					auto smaller_it = smaller_input.local_value_begin(outerIt); //it is guaranteed that local_begin works well in the case of unique records
+					smaller_output.emplace_back(key, std::vector<uint32_t>(smaller_it, smaller_input.local_value_end()));
+
+					bigger_output.emplace_back(key, std::vector<uint32_t>{});
+					for (; it != bigger_input.local_end(); ++it)
+						bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+				}
+			}
+		}
+		else
+		{
+			//hash_map_type done(~0ull, (smaller_input.size()) / 0.4, 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{});
+
+			hash_set_type done(~0ull, static_cast<size_t>(smaller_input.size() / 0.4), 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{});
+			for (auto outerIt = smaller_input.begin(); outerIt != smaller_input.end(); ++outerIt)
+			{
+				auto key = outerIt->first;
+				if (done.check(key))
+					continue;
+				else
+//					done.insert(key);
+					done.insert_fast(key);
+				if (auto it = bigger_input.find(key); it != bigger_input.local_end())
+				{
+					auto smaller_it = smaller_input.find(key); //in case of non-unique values local_begin may produce erroneous results
+
+					smaller_output.emplace_back(key, std::vector<uint32_t>{});
+					for (; smaller_it != smaller_input.local_end(); ++smaller_it)
+						smaller_output.back().second.push_back(static_cast<uint32_t>(smaller_it->second));
+
+					bigger_output.emplace_back(key, std::vector<uint32_t>{});
+					for (; it != bigger_input.local_end(); ++it)
+						bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+				}
+			}
+		}
+	}
+};
+//the idea of this class is to keep m-mers in a hash table where the values are positions
+//but in some cases there are a lot of duplicated m-mers, which are all stored as separate entries in the HT
+//for this reason this class puts a limit on the number of stored duplicates
+//if there are more duplicates in the input data, the remaining positions are stored in a separate data structure (just a vector)
+//this is possible with the insert_up_to_n_duplicates method added to the hash table, which has some limitations (for example, there may be no HT restruct, to keep the results correct)
+class CMmersHashMapDuplicateOptimizedLP
+{
+	using hash_map_type = hash_map_lp<anchor_type, uint64_t, std::equal_to<anchor_type>, MurMur64Hash>;
+	using hash_set_type = hash_set_lp<anchor_type, std::equal_to<anchor_type>, MurMur64Hash>;
+	hash_map_type mmers;
+
+	const size_t insert_up_to = 20;
+
+	std::vector<std::vector<uint64_t>> vec_for_highly_duplicated_elems;
+
+	void insert_elem(anchor_type key, uint64_t val) {
+		uint64_t x = vec_for_highly_duplicated_elems.size();
+		auto y = mmers.insert_up_to_n_duplicates(key, val, x, insert_up_to);
+
+		if (y == insert_up_to) {
+			if (x == vec_for_highly_duplicated_elems.size())
+				vec_for_highly_duplicated_elems.emplace_back();
+			vec_for_highly_duplicated_elems[x].push_back(val);
+		}
+	}
+
+public:
+	uint32_t GetNMmers() {
+		return static_cast<uint32_t>(mmers.size_unique());
+	}
+
+	size_t Size() {
+		size_t x = 0;
+		for (auto p : vec_for_highly_duplicated_elems)
+			x += p.size();
+
+		// - vec_for_highly_duplicated_elems.size() because this is the number of entries in the HT that exist only to store indexes of the vectors
+		return mmers.size() - vec_for_highly_duplicated_elems.size() + x;
+	}
+
+	explicit CMmersHashMapDuplicateOptimizedLP(const read_t& read, uint32_t m) :
+		//+ 1 in the initial size is crucial because we use insert_up_to_n_duplicates, which does not allow restruct of the HT
+		mmers(~0ull, static_cast<size_t>((read_len(read) - m + 1) / 0.4) + 1, 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{})
+	{
 		if (read_len(read) < m)
 			return;
 
 		anchor_type mask = (1ull << (2 * m)) - 1;
-
-		anchor_type mmer{}, rev{};
+		anchor_type mmer{};
 		uint32_t pos = 0;
 		for (; pos < m - 1; ++pos)
 		{
 			assert(read[pos] < 4); //only ACGT
 			mmer <<= 2;
 			mmer += read[pos];
-
-			rev >>= 2;
-			rev += ((uint64_t)(3 - read[pos])) << (2 * (m - 1));
 		}
-
 		for (; pos < read_len(read); ++pos)
 		{
 			assert(read[pos] < 4); //only ACGT
 			mmer <<= 2;
 			mmer += read[pos];
 			mmer &= mask;
+			insert_elem(mmer, pos - m + 1);
+		}
+	}
-
-			rev >>= 2;
-			rev += ((uint64_t)(3 - read[pos])) << (2 * (m - 1));
-
-			auto can = mmer < rev ? mmer : rev;
+	explicit CMmersHashMapDuplicateOptimizedLP(const read_t& read, uint32_t m, CMmersHashMapDuplicateOptimizedLP& allowOnly, CBloomFilter& bloom_mmers) :
+		mmers(~0ull, 16, 0.8, std::equal_to<anchor_type>{}, MurMur64Hash{})
+	{
+		hash_map_type& include = allowOnly.mmers;
+		if (read_len(read) < m)
+			return;
-			if (include.check(can))
-			{
-				mmers.insert_fast(std::make_pair(mmer, pos - m + 1));
-			}
+		anchor_type mask = (1ull << (2 * m)) - 1;
+
+		anchor_type mmer{};
+		uint32_t pos = 0;
+		for (; pos < m - 1; ++pos)
+		{
+			assert(read[pos] < 4); //only ACGT
+			mmer <<= 2;
+			mmer += read[pos];
 		}
+		std::vector<std::pair<anchor_type, uint32_t>> v_mmers;
+		v_mmers.reserve(read_len(read) - m + 1);
+
+		for (; pos < read_len(read); ++pos)
+		{
+			assert(read[pos] < 4); //only ACGT
+			mmer <<= 2;
+			mmer += read[pos];
+			mmer &= mask;
+
+			if (bloom_mmers.test(mmer))
+				if (include.check(mmer))
+					v_mmers.emplace_back(mmer, pos - m + 1);
+		}
+		//+ 1 in the initial size is crucial because we use insert_up_to_n_duplicates, which does not allow restruct of the HT
+		mmers = hash_map_type(~0ull, static_cast<size_t>(v_mmers.size() / 0.4) + 1, 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{});
+		for (auto& x : v_mmers)
+			insert_elem(x.first, x.second);
 	}
-	void GetIntersection(CMmersHashMapLP& inParam,
+
+	void GetIntersection(CMmersHashMapDuplicateOptimizedLP& inParam,
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& outThis,
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& outParam
 	)
@@ -257,18 +399,27 @@ class CMmersHashMapLP
 		hash_map_type& in_mmers_this = this->mmers;
 		hash_map_type& in_mmers_param = inParam.mmers;
 
+		std::vector<std::vector<uint64_t>>& vec_for_highly_duplicated_elems_this = this->vec_for_highly_duplicated_elems;
+		std::vector<std::vector<uint64_t>>& vec_for_highly_duplicated_elems_param = inParam.vec_for_highly_duplicated_elems;
+
 		hash_map_type* _smaller_input = &in_mmers_this;
 		hash_map_type* _bigger_input = &in_mmers_param;
 
+		std::vector<std::vector<uint64_t>>* _vec_for_highly_duplicated_elems_smaller = &vec_for_highly_duplicated_elems_this;
+		std::vector<std::vector<uint64_t>>* _vec_for_highly_duplicated_elems_bigger = &vec_for_highly_duplicated_elems_param;
+
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>* _smaller_output = &outThis;
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>* _bigger_output = &outParam;
 		if (in_mmers_this.size() > in_mmers_param.size())
 		{
 			std::swap(_smaller_input, _bigger_input);
 			std::swap(_smaller_output, _bigger_output);
+			std::swap(_vec_for_highly_duplicated_elems_smaller, _vec_for_highly_duplicated_elems_bigger);
 		}
 		hash_map_type& smaller_input = *_smaller_input;
 		hash_map_type& bigger_input = *_bigger_input;
+		std::vector<std::vector<uint64_t>>& vec_for_highly_duplicated_elems_smaller = *_vec_for_highly_duplicated_elems_smaller;
+		std::vector<std::vector<uint64_t>>& vec_for_highly_duplicated_elems_bigger = *_vec_for_highly_duplicated_elems_bigger;
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& smaller_output = *_smaller_output;
 		std::vector<std::pair<anchor_type, std::vector<uint32_t>>>& bigger_output = *_bigger_output;
@@ -291,15 +442,19 @@ class CMmersHashMapLP
 					smaller_output.emplace_back(key, std::vector<uint32_t>(smaller_it, smaller_input.local_value_end()));
 
 					bigger_output.emplace_back(key, std::vector<uint32_t>{});
-					for (; it != bigger_input.local_end(); ++it)
-						bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+					for (size_t i = 0; it != bigger_input.local_end(); ++it, ++i) {
+						assert(i <= insert_up_to - 1);
+						if (i == insert_up_to - 1)
+							for (auto x : vec_for_highly_duplicated_elems_bigger[it->second])
+								bigger_output.back().second.push_back(static_cast<uint32_t>(x));
+						else
+							bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+					}
 				}
 			}
 		}
 		else
 		{
-			//hash_map_type done(~0ull, (smaller_input.size()) / 0.4, 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{});
-
 			hash_set_type done(~0ull, static_cast<size_t>(smaller_input.size() / 0.4), 0.4, std::equal_to<anchor_type>{}, MurMur64Hash{});
 			for (auto outerIt = smaller_input.begin(); outerIt != smaller_input.end(); ++outerIt)
 			{
@@ -307,27 +462,36 @@ class CMmersHashMapLP
 				if (done.check(key))
 					continue;
 				else
-//					done.insert(key);
 					done.insert_fast(key);
 				if (auto it = bigger_input.find(key); it != bigger_input.local_end())
 				{
-					auto smaller_it = smaller_input.find(key); //in case of non unique values local_begin may produce erroneous results
+					auto smaller_it = smaller_input.find(key); //in case of non-unique values local_begin may produce erroneous results
 
 					smaller_output.emplace_back(key, std::vector<uint32_t>{});
-					for (; smaller_it != smaller_input.local_end(); ++smaller_it)
-						smaller_output.back().second.push_back(static_cast<uint32_t>(smaller_it->second));
+					for (size_t i = 0; smaller_it != smaller_input.local_end(); ++smaller_it, ++i) {
+						assert(i <= insert_up_to - 1);
+						if (i == insert_up_to - 1)
+							for (auto x : vec_for_highly_duplicated_elems_smaller[smaller_it->second])
+								smaller_output.back().second.push_back(static_cast<uint32_t>(x));
+						else
+							smaller_output.back().second.push_back(static_cast<uint32_t>(smaller_it->second));
+					}
 
 					bigger_output.emplace_back(key, std::vector<uint32_t>{});
-					for (; it != bigger_input.local_end(); ++it)
-						bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+					for (size_t i = 0; it != bigger_input.local_end(); ++it, ++i) {
+						assert(i <= insert_up_to - 1);
+						if (i == insert_up_to - 1)
+							for (auto x : vec_for_highly_duplicated_elems_bigger[it->second])
+								bigger_output.back().second.push_back(static_cast<uint32_t>(x));
+						else
+							bigger_output.back().second.push_back(static_cast<uint32_t>(it->second));
+					}
 				}
 			}
 		}
 	}
 };
-
-
 class CKmersHashSetLP
 {
 	using hash_set_type = hash_set_lp<kmer_type, std::equal_to<kmer_type>, MurMur64Hash>;
diff --git a/src/colord/encoder.h b/src/colord/encoder.h
index 2277e6a..d0b675a 100644
--- a/src/colord/encoder.h
+++ b/src/colord/encoder.h
@@ -64,7 +64,9 @@ enum class AnalyseRefReadWithKmersRes { NoAnchors, CorespondingKmersIncompatibil
 class CMmersHashMapLP;
 class CKmersHashMapLP;
 class CKmersHashSetLP;
-using CMmers = CMmersHashMapLP;
+class CMmersHashMapDuplicateOptimizedLP;
+//using CMmers = CMmersHashMapLP;
+using CMmers = CMmersHashMapDuplicateOptimizedLP;
 using CKmers = CKmersHashMapLP;
 
 class CBloomFilter
diff --git a/src/colord/hm.h b/src/colord/hm.h
index 0fa3c0d..dc69acb 100644
--- a/src/colord/hm.h
+++ b/src/colord/hm.h
@@ -796,6 +796,63 @@ template
 
+	//returns the number of elements with this key after the insertion,
+	//so that the caller may check whether the value was added; if n is returned, it means the value was not added, and either
+	// - alt_val was added (if there were n - 1 occurrences of the key)
+	// - or nothing was added and alt_val is set to the value of the last occurrence
+	//warning: there will be no restruct, so the caller must ensure the HT has enough space from the point of its construction
+	size_t insert_up_to_n_duplicates(const Key_t& key, const Value_t& value, Value_t& alt_val, size_t n)
+	{
+		assert(!(no_elements >= size_when_restruct));
+
+		size_t h = hash(key) & allocated_mask;
+		size_t n_occ = 0;
+
+		size_t last_occ_pos = std::numeric_limits<size_t>::max();
+
+		if (!compare(data[h].first, empty_key))
+		{
+			if (compare(data[h].first, key)) {
+				++n_occ;
+				last_occ_pos = h;
+			}
+
+			do
+			{
+				h = (h + 1) & allocated_mask;
+				if (compare(data[h].first, key)) {
+					++n_occ;
+					last_occ_pos = h;
+				}
+			} while (data[h].first != empty_key);
+		}
+
+		if (!n_occ)
+			++no_elements_unique;
+		else if (n_occ == n - 1)
+		{
+			++no_elements;
+			data[h] = value_type(key, alt_val);
+			return n_occ + 1; // == n
+		}
+		else if (n_occ == n)
+		{
+			alt_val = data[last_occ_pos].second;
+			return n_occ; // == n
+		}
+
+		++no_elements;
+		data[h] = value_type(key, value);
+
+		return n_occ + 1;
+	}
+
 	local_iterator find(const key_type& key)
 	{
diff --git a/src/colord/utils.cpp b/src/colord/utils.cpp
index 3e9492b..0361397 100644
--- a/src/colord/utils.cpp
+++ b/src/colord/utils.cpp
@@ -71,6 +71,7 @@ bool isFastq(const std::string& path)
 		std::cerr << "Error: file " << path << " is empty\n";
 		exit(1);
 	}
+	gzclose(f);
 	return c == '@';
 }
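
To picture the layout used by CMmersHashMapDuplicateOptimizedLP without the project's hash_map_lp: each m-mer keeps at most a fixed number of positions directly in the table, and once that cap is reached the remaining positions spill into a side vector addressed by an index stored alongside the key. The sketch below is a minimal stand-in built on std::unordered_map; the name CappedPositionIndex, the cap of 4, and the int64_t overflow index are assumptions made for the example, not part of the patch.

    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    // Toy index: for each m-mer keep at most `cap` positions inline; once the cap
    // is reached, further positions go to an overflow vector referenced by index.
    struct CappedPositionIndex {
        static constexpr size_t cap = 4;                       // small cap for demonstration
        struct Entry {
            std::vector<uint32_t> inline_pos;                  // up to `cap` positions
            int64_t overflow = -1;                             // index into `overflow_pos`, -1 if unused
        };
        std::unordered_map<uint64_t, Entry> map;
        std::vector<std::vector<uint32_t>> overflow_pos;

        void insert(uint64_t mmer, uint32_t pos) {
            Entry& e = map[mmer];
            if (e.inline_pos.size() < cap) {
                e.inline_pos.push_back(pos);
                return;
            }
            if (e.overflow < 0) {                              // first overflowing duplicate
                e.overflow = static_cast<int64_t>(overflow_pos.size());
                overflow_pos.emplace_back();
            }
            overflow_pos[static_cast<size_t>(e.overflow)].push_back(pos);
        }

        std::vector<uint32_t> positions(uint64_t mmer) const {
            auto it = map.find(mmer);
            if (it == map.end())
                return {};
            std::vector<uint32_t> out = it->second.inline_pos;
            if (it->second.overflow >= 0) {
                const auto& extra = overflow_pos[static_cast<size_t>(it->second.overflow)];
                out.insert(out.end(), extra.begin(), extra.end());
            }
            return out;
        }
    };

    int main() {
        CappedPositionIndex idx;
        for (uint32_t p = 0; p < 10; ++p)
            idx.insert(42, p);                                 // 10 duplicates of the same m-mer
        for (uint32_t p : idx.positions(42))
            std::cout << p << ' ';
        std::cout << '\n';                                     // prints 0..9
    }

The real class stores the overflow index as the insert_up_to-th value inside the hash table itself (via insert_up_to_n_duplicates), and it must size the table as count / 0.4 + 1 up front, because that method forbids any restruct once such indexes have been stored.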
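
GetIntersection always iterates over the smaller of the two indexes and probes the bigger one, so the number of lookups is bounded by the smaller side; the pointer swapping at the top of the method only decides which operand plays which role. A rough equivalent with standard containers follows; the function name intersect_positions and the unordered_multimap representation are assumptions made for the sketch, and the expansion of capped duplicate lists from vec_for_highly_duplicated_elems is deliberately left out.

    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    using PositionIndex = std::unordered_multimap<uint64_t, uint32_t>;
    using CommonMmers = std::vector<std::pair<uint64_t, std::vector<uint32_t>>>;

    // For every m-mer present in both indexes, report its positions in each of them.
    // Iterating the smaller index and probing the bigger one bounds the work by the
    // smaller side, mirroring the smaller/bigger swap in GetIntersection.
    static void intersect_positions(const PositionIndex& a, const PositionIndex& b,
                                    CommonMmers& out_a, CommonMmers& out_b) {
        out_a.clear();
        out_b.clear();
        const PositionIndex* smaller = &a;
        const PositionIndex* bigger = &b;
        CommonMmers* out_smaller = &out_a;
        CommonMmers* out_bigger = &out_b;
        if (smaller->size() > bigger->size()) {
            std::swap(smaller, bigger);
            std::swap(out_smaller, out_bigger);
        }

        std::unordered_set<uint64_t> done;              // skip duplicated keys of the smaller index
        for (const auto& kv : *smaller) {
            uint64_t key = kv.first;
            if (!done.insert(key).second)
                continue;                               // key already handled
            auto range_b = bigger->equal_range(key);
            if (range_b.first == range_b.second)
                continue;                               // key absent from the bigger index

            auto range_s = smaller->equal_range(key);
            out_smaller->emplace_back(key, std::vector<uint32_t>{});
            for (auto it = range_s.first; it != range_s.second; ++it)
                out_smaller->back().second.push_back(it->second);

            out_bigger->emplace_back(key, std::vector<uint32_t>{});
            for (auto it = range_b.first; it != range_b.second; ++it)
                out_bigger->back().second.push_back(it->second);
        }
    }

    int main() {
        PositionIndex a{{5, 0}, {5, 7}, {9, 3}};
        PositionIndex b{{5, 11}, {8, 2}};
        CommonMmers out_a, out_b;
        intersect_positions(a, b, out_a, out_b);
        std::cout << out_a.size() << " shared m-mer(s)\n";  // 1 (key 5)
    }

The patched GetIntersection additionally switches to the handled-keys set only when the smaller index actually contains duplicates, and expands the spilled positions once it reaches the insert_up_to-th entry of a key.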
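
Both constructors build m-mers incrementally: each base (already coded as 0-3) is shifted into a 2m-bit integer and a mask keeps only the last m symbols, so every window is produced in O(1) from the previous one. A standalone sketch of that rolling encoding is below; the read is given as a plain vector of 2-bit symbols, which is an assumption standing in for the patch's read_t.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Slide an m-symbol window over a 2-bit-coded read; each m-mer is packed into
    // the low 2*m bits of a 64-bit integer, reusing the previous window's value.
    static std::vector<uint64_t> extract_mmers(const std::vector<uint8_t>& read, uint32_t m) {
        std::vector<uint64_t> out;
        if (m == 0 || m > 32 || read.size() < m)
            return out;

        const uint64_t mask = (m == 32) ? ~0ull : ((1ull << (2 * m)) - 1);
        uint64_t mmer = 0;
        uint32_t pos = 0;
        for (; pos < m - 1; ++pos) {          // fill the first m-1 symbols
            mmer <<= 2;
            mmer += read[pos];
        }
        for (; pos < read.size(); ++pos) {    // from here on, one m-mer per symbol
            mmer <<= 2;
            mmer += read[pos];
            mmer &= mask;
            out.push_back(mmer);              // m-mer starting at position pos - m + 1
        }
        return out;
    }

    int main() {
        std::vector<uint8_t> read = {0, 1, 2, 3, 0, 1};   // ACGTAC in 2-bit code
        for (uint64_t x : extract_mmers(read, 3))
            std::cout << x << ' ';                        // 6 = ACG, 27 = CGT, 44 = GTA, 49 = TAC
        std::cout << '\n';
    }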
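
The second constructor of the new class first tests each m-mer against a Bloom filter and only then queries the exact include table, so most absent m-mers are rejected by the cheap filter before the more expensive lookup. The sketch below shows that pattern with a deliberately tiny hand-rolled filter (two multiplicative hashes over a fixed bit array); it is not the CBloomFilter used in colord, only an illustration of the test-before-exact-check idea.

    #include <bitset>
    #include <cstdint>
    #include <iostream>
    #include <unordered_set>
    #include <vector>

    // Minimal Bloom filter: two cheap hash functions over a fixed-size bit array.
    // False positives are possible, false negatives are not.
    struct TinyBloom {
        static constexpr size_t bits = 1 << 16;
        std::bitset<bits> bf;
        static size_t h1(uint64_t x) { return (x * 0x9e3779b97f4a7c15ull) % bits; }
        static size_t h2(uint64_t x) { return (x * 0xc2b2ae3d27d4eb4full) % bits; }
        void insert(uint64_t x) { bf.set(h1(x)); bf.set(h2(x)); }
        bool test(uint64_t x) const { return bf.test(h1(x)) && bf.test(h2(x)); }
    };

    int main() {
        std::unordered_set<uint64_t> include = {11, 42, 1337};   // exact membership structure
        TinyBloom bloom;
        for (uint64_t x : include)
            bloom.insert(x);

        std::vector<uint64_t> candidates = {7, 42, 100, 1337};
        size_t exact_queries = 0, accepted = 0;
        for (uint64_t mmer : candidates) {
            if (!bloom.test(mmer))
                continue;                 // cheap rejection, no exact lookup needed
            ++exact_queries;              // only Bloom-positive m-mers reach the exact check
            if (include.count(mmer))
                ++accepted;
        }
        std::cout << accepted << " accepted after " << exact_queries << " exact lookups\n";
    }

A false positive only costs one extra exact lookup, so the filter never affects which m-mers end up in v_mmers, only how many exact checks are performed.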
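
The utils.cpp change closes the gzFile handle that isFastq previously leaked on its success path. A minimal sketch of the corrected flow is below; it assumes only the zlib C API (gzopen, gzgetc, gzclose) that the function already relies on, while the function name looks_like_fastq and the simplified error handling are illustrative.

    #include <cstdio>
    #include <cstdlib>
    #include <string>
    #include <zlib.h>

    // Peek at the first character of a (possibly gzipped) file to decide whether it
    // looks like FASTQ ('@').  The handle is closed on every path, which is what
    // the gzclose(f) added by the patch ensures.
    static bool looks_like_fastq(const std::string& path) {
        gzFile f = gzopen(path.c_str(), "rb");
        if (!f) {
            std::fprintf(stderr, "Error: cannot open file %s\n", path.c_str());
            std::exit(1);
        }
        int c = gzgetc(f);
        if (c == -1) {
            gzclose(f);
            std::fprintf(stderr, "Error: file %s is empty\n", path.c_str());
            std::exit(1);
        }
        gzclose(f);                 // previously leaked on the success path
        return c == '@';
    }

    int main(int argc, char** argv) {
        if (argc > 1)
            std::printf("%s\n", looks_like_fastq(argv[1]) ? "FASTQ" : "not FASTQ");
        return 0;
    }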