1
+ //
2
+ // HashRow<T> type for keeping track of rows of a distance matrix of T
3
+ // (hashing them, to make it cheaper to determine which rows
4
+ // are identical).
5
+ //
6
+ // DuplicateTaxa type, for reporting the row numbers of rows
7
+ // that indicate identical taxa (or at least, taxa whose inter-taxa
8
+ // differences are the same, and that are all distance 0 from each
9
+ // other).
10
+ //
11
+ // Copyright James Barbetti (2021-22)
12
+ //
13
+
1
14
#pragma once
2
15
#ifndef hashrow_h
3
16
#define hashrow_h
8
21
9
22
// One inner vector per group of duplicate taxa; each holds the row numbers in that group.
typedef std::vector< std::vector< intptr_t > > DuplicateTaxa;
10
23
24
+ /* *
25
+ * @brief A pair, of a hash, and contiguous block of
26
+ * something (T) that is hashable and comparable
27
+ * via both operator!= and operator<.
28
+ * Ordering is hash primary, with lexicographic
29
+ * block order to tie-break.
30
+ * @tparam T something hashable (via std::hash) and
31
+ * comparable.
32
+ * @note if blocks are different sizes, and the contents
33
+ * of the blocks compare equal (up to the size of the
34
+ * smaller block), the shorter block is treated as
35
+ * being less than the longer.
36
+ */
11
37
template <class T > class HashRow {
12
38
public:
13
- intptr_t row_num;
14
- const T* row_data;
15
- size_t row_length;
16
- size_t row_hash;
39
+ intptr_t row_num; // an identifying row number (not used for ordering)
40
+ const T* row_data; // the data block for the row
41
+ size_t row_length; // the size of the data block
42
+ size_t row_hash; // the hash of the elements in the block
43
+
17
44
// Default state: no row number (-1), no data block, zero length, zero hash.
HashRow()
    : row_num(-1)
    , row_data(nullptr)
    , row_length(0)
    , row_hash(0)
{
}
18
45
HashRow (intptr_t num, const T* row_start, size_t length)
19
46
: row_num(num), row_data(row_start), row_length(length), row_hash(0 ) {
@@ -25,16 +52,31 @@ template <class T> class HashRow {
25
52
// Compiler-generated member-wise copy: row_data is copied as a pointer, not duplicated.
HashRow (const HashRow& rhs) = default ;
26
53
// Compiler-generated member-wise assignment: row_data is assigned as a pointer.
HashRow& operator = (const HashRow& rhs) = default ;
27
54
28
-
55
+ /* *
56
+ * @brief Compare *this with rhs
57
+ * @param rhs
58
+ * @return int -1 if *this compares less, +1 if rhs compares less,
59
+ * and 0, if *this and rhs compare equal
60
+ */
29
61
int compare (const HashRow& rhs) const {
62
+ // First compare hashes
30
63
if (row_hash<rhs.row_hash ) { return -1 ; }
31
64
if (rhs.row_hash <row_hash) { return 1 ; }
32
- for ( size_t col=0 ; col<row_length ; ++col) {
65
+ int count_diff = 0 ;
66
+ size_t min_col_count = row_length;
67
+ if (row_length != rhs.row_length ) {
68
+ bool left_less = (row_length < rhs.row_length );
69
+ count_diff = left_less ? -1 : 1 ;
70
+ min_col_count = left_less ? row_length : rhs.row_length ;
71
+ }
72
+ // compare elements in the blocks, up to the size of
73
+ // the smaller block.
74
+ for ( size_t col=0 ; col<min_col_count ; ++col) {
33
75
if (row_data[col]!=rhs.row_data [col]) {
34
76
return (row_data[col]<rhs.row_data [col]) ? -1 : 1 ;
35
77
}
36
78
}
37
- return 0 ;
79
+ return count_diff ;
38
80
}
39
81
bool operator < (const HashRow& rhs) const {
40
82
return compare (rhs)<0 ;
@@ -43,6 +85,19 @@ template <class T> class HashRow {
43
85
return compare (rhs)==0 ;
44
86
}
45
87
88
+ /* *
89
+ * @brief Given a sorted vector of HashRow<T>, that represents the
90
+ * rows of a distance matrix, determine which groups of rows
91
+ * in the distance matrix are duplicates.
92
+ * @param hashed_rows the hashed distance matrix rows, in sorted order
93
+ * @param vvc reference to a DuplicateTaxa instance (a vector of
94
+ * vectors of intptr_t). Each of the vectors will
95
+ * contain the row numbers of the (duplicate) rows
96
+ * in an equivalence class (in no particular order).
97
+ * @note it is assumed (but not checked) that hashed_rows has been sorted
98
+ * @note row numbers, within an equivalence class, appear in vvc
99
+ * in their order of appearance in hashed_rows.
100
+ */
46
101
static void identifyDuplicateClusters (const std::vector< HashRow<T> >& hashed_rows,
47
102
DuplicateTaxa& vvc) {
48
103
std::vector< intptr_t > vc; // vector of cluster #s
@@ -51,7 +106,7 @@ template <class T> class HashRow {
51
106
bool is_duplicate = hashed_rows[i].compare (hashed_rows[i-1 ])==0 ;
52
107
if (is_duplicate) {
53
108
intptr_t h = hashed_rows[i-1 ].row_num ;
54
- is_duplicate = hashed_rows[i].row_data [h] ==0 ;
109
+ is_duplicate = hashed_rows[i].row_data [h] == 0 ;
55
110
}
56
111
if (!is_duplicate) {
57
112
// Not a duplicate of the previous row.
0 commit comments