1+ //
2+ // HashRow<T> type for keeping track of rows of a distance matrix of T
3+ // (hashing them, to make it cheaper to determine which rows
4+ // are identical).
5+ //
6+ // DuplicateTaxa type, for reporting the row numbers of rows
7+ // that indicate identical taxa (or at least, taxa whose inter-taxa
8+ // differences are the same, and that are all distance 0 from each
9+ // other).
10+ //
11+ // Copyright James Barbetti (2021-22)
12+ //
13+
114#pragma once
215#ifndef hashrow_h
316#define hashrow_h
821
// Groups of row numbers: each inner vector lists the row numbers of
// one set of rows reported as duplicate (identical) taxa.
using DuplicateTaxa = std::vector< std::vector<intptr_t> >;
1023
24+ /* *
25+ * @brief A pair, of a hash, and contiguous block of
26+ * something (T) that is hashable and comparable
27+ * via both operator!= and operator<.
28+ * Ordering is hash primary, with lexicographic
29+ * block order to tie-break.
30+ * @tparam T something hashable (via std::hash) and
31+ * comparable.
32+ * @note if blocks are different sizes, and the contents
33+ * of the blocks compare equal (up to the size of the
34+ * smaller block), the shorter block is treated as
35+ * being less than the longer.
36+ */
1137template <class T > class HashRow {
1238public:
13- intptr_t row_num;
14- const T* row_data;
15- size_t row_length;
16- size_t row_hash;
39+ intptr_t row_num; // an identifying row number (not used for ordering)
40+ const T* row_data; // the data block for the row
41+ size_t row_length; // the size of the data block
42+ size_t row_hash; // the hash of the elements in the block
43+
1744 HashRow (): row_num(-1 ), row_data(nullptr ), row_length(0 ), row_hash(0 ) {}
1845 HashRow (intptr_t num, const T* row_start, size_t length)
1946 : row_num(num), row_data(row_start), row_length(length), row_hash(0 ) {
@@ -25,16 +52,31 @@ template <class T> class HashRow {
2552 HashRow (const HashRow& rhs) = default ;
2653 HashRow& operator = (const HashRow& rhs) = default ;
2754
28-
55+ /* *
56+ * @brief Compare *this with rhs
57+ * @param rhs
58+ * @return int -1 if *this compares less, +1 if rhs compares less,
59+ * and 0, if *this and rhs compare equal
60+ */
2961 int compare (const HashRow& rhs) const {
62+ // First compare hashes
3063 if (row_hash<rhs.row_hash ) { return -1 ; }
3164 if (rhs.row_hash <row_hash) { return 1 ; }
32- for ( size_t col=0 ; col<row_length ; ++col) {
65+ int count_diff = 0 ;
66+ size_t min_col_count = row_length;
67+ if (row_length != rhs.row_length ) {
68+ bool left_less = (row_length < rhs.row_length );
69+ count_diff = left_less ? -1 : 1 ;
70+ min_col_count = left_less ? row_length : rhs.row_length ;
71+ }
72+ // compare elements in the blocks, up to the size of
73+ // the smaller block.
74+ for ( size_t col=0 ; col<min_col_count ; ++col) {
3375 if (row_data[col]!=rhs.row_data [col]) {
3476 return (row_data[col]<rhs.row_data [col]) ? -1 : 1 ;
3577 }
3678 }
37- return 0 ;
79+ return count_diff ;
3880 }
3981 bool operator < (const HashRow& rhs) const {
4082 return compare (rhs)<0 ;
@@ -43,6 +85,19 @@ template <class T> class HashRow {
4385 return compare (rhs)==0 ;
4486 }
4587
88+ /* *
89+ * @brief Given a sorted vector of HashRow<T>, that represents the
90+ * rows of a distance matrix, determine which groups of rows
91+ * in the distance matrix are duplicates.
92+ * @param hashed_rows the hashed distance matrix rows, in sorted order
93+ * @param vvc reference to a DuplicateTaxa instance (a vector of
94+ * vectors of intptr_t). Each of the vectors will
95+ * contain the row numbers of the (duplicate) rows
96+ * in an equivalence class (in no particular order).
97+ * @note it is assumed (but not checked) that hashed_rows has been sorted
98+ * @note row numbers, within an equivalence class, appear in vvc
99+ * in their order of appearance in hashed_rows.
100+ */
46101 static void identifyDuplicateClusters (const std::vector< HashRow<T> >& hashed_rows,
47102 DuplicateTaxa& vvc) {
48103 std::vector< intptr_t > vc; // vector of cluster #s
@@ -51,7 +106,7 @@ template <class T> class HashRow {
51106 bool is_duplicate = hashed_rows[i].compare (hashed_rows[i-1 ])==0 ;
52107 if (is_duplicate) {
53108 intptr_t h = hashed_rows[i-1 ].row_num ;
54- is_duplicate = hashed_rows[i].row_data [h] ==0 ;
109+ is_duplicate = hashed_rows[i].row_data [h] == 0 ;
55110 }
56111 if (!is_duplicate) {
57112 // Not a duplicate of the previous row.
0 commit comments