Skip to content

Commit 4661a37

Browse files
James BarbettiJames Barbetti
James Barbetti
authored and
James Barbetti
committed
Added comments to hashrow.h.
Also, added stuff to HashRow<T> so that it won't run off the end of the second row, if the first row (*this) in a comparison has fewer items in it (a lower row_count) than the second (rhs).
1 parent ba6991f commit 4661a37

File tree

1 file changed

+63
-8
lines changed

1 file changed

+63
-8
lines changed

hashrow.h

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
//
2+
// HashRow<T> type for keeping track of rows of a distance matrix of T
3+
// (hashing them, to make it cheaper to determine which rows
4+
// are identical).
5+
//
6+
// DuplicateTaxa type, for reporting the row numbers of rows
7+
// that indicate identical taxa (or at least, taxa whose inter-taxa
8+
// differences are the same, and that are all distance 0 from each
9+
// other).
10+
//
11+
// Copyright James Barbetti (2021-22)
12+
//
13+
114
#pragma once
215
#ifndef hashrow_h
316
#define hashrow_h
@@ -8,12 +21,26 @@
821

922
typedef std::vector< std::vector< intptr_t > > DuplicateTaxa;
1023

24+
/**
25+
* @brief A pair, of a hash, and contiguous block of
26+
* something (T) that is hashable and comparable
27+
* via both operator!= and operator<.
28+
* Ordering is hash primary, with lexicographic
29+
* block order to tie-break.
30+
* @tparam T something hashable (via std::hash) and
31+
* comparable.
32+
* @note if blocks are different sizes, and the contents
33+
* of the blocks compare equal (to the size of the
34+
* smaller block, the shorter block is treated as
35+
* being less than the longer).
36+
*/
1137
template <class T> class HashRow {
1238
public:
13-
intptr_t row_num;
14-
const T* row_data;
15-
size_t row_length;
16-
size_t row_hash;
39+
intptr_t row_num; //an identifying row number (not used for ordering)
40+
const T* row_data; //the data block for the row
41+
size_t row_length; //the size of the data block
42+
size_t row_hash; //the hash of the elements in the block
43+
1744
HashRow(): row_num(-1), row_data(nullptr), row_length(0), row_hash(0) {}
1845
HashRow(intptr_t num, const T* row_start, size_t length)
1946
: row_num(num), row_data(row_start), row_length(length), row_hash(0) {
@@ -25,16 +52,31 @@ template <class T> class HashRow {
2552
HashRow(const HashRow& rhs) = default;
2653
HashRow& operator= (const HashRow& rhs) = default;
2754

28-
55+
/**
56+
* @brief Compare *this with rhs
57+
* @param rhs
58+
* @return int -1 if *this compares less, +1 if rhs compares less,
59+
* and 0, if *this and rhs compare equal
60+
*/
2961
int compare(const HashRow& rhs) const {
62+
//First compare hashes
3063
if (row_hash<rhs.row_hash) { return -1; }
3164
if (rhs.row_hash<row_hash) { return 1; }
32-
for ( size_t col=0; col<row_length ; ++col) {
65+
int count_diff = 0;
66+
size_t min_col_count = row_length;
67+
if (row_length != rhs.row_length) {
68+
bool left_less = (row_length < rhs.row_length);
69+
count_diff = left_less ? -1 : 1;
70+
min_col_count = left_less ? row_length : rhs.row_length;
71+
}
72+
//compare elements in the blocks, up to the size of
73+
//the smaller block.
74+
for ( size_t col=0; col<min_col_count ; ++col) {
3375
if (row_data[col]!=rhs.row_data[col]) {
3476
return (row_data[col]<rhs.row_data[col]) ? -1 : 1;
3577
}
3678
}
37-
return 0;
79+
return count_diff;
3880
}
3981
bool operator< (const HashRow& rhs) const {
4082
return compare(rhs)<0;
@@ -43,6 +85,19 @@ template <class T> class HashRow {
4385
return compare(rhs)==0;
4486
}
4587

88+
/**
89+
* @brief Given a sorted vector of HashRow<T>, that represents the
90+
* rows of a distance matrix, determine which groups of rows
91+
* in the distance matrix are duplicates.
92+
* @param hashed_rows the hashed distance matrix rows, in sorted order
93+
* @param vvc reference to a DuplicateTaxa instance (a vector of
94+
* vectors of intptr_t). Each of the vectors will
95+
* contain the row numbers of the (duplicate) rows
96+
* in an equivalence class (in no particular order).
97+
* @note it is assumed (but not checked) that hashed_rows has been sorted
98+
* @note row numbers, within an equivalence class, appear in vvc
99+
* in their order of appearance in hashed_rows.
100+
*/
46101
static void identifyDuplicateClusters(const std::vector< HashRow<T> >& hashed_rows,
47102
DuplicateTaxa& vvc) {
48103
std::vector< intptr_t> vc; //vector of cluster #s
@@ -51,7 +106,7 @@ template <class T> class HashRow {
51106
bool is_duplicate = hashed_rows[i].compare(hashed_rows[i-1])==0;
52107
if (is_duplicate) {
53108
intptr_t h = hashed_rows[i-1].row_num;
54-
is_duplicate = hashed_rows[i].row_data[h] ==0;
109+
is_duplicate = hashed_rows[i].row_data[h] == 0 ;
55110
}
56111
if (!is_duplicate) {
57112
//Not a duplicate of the previous row.

0 commit comments

Comments
 (0)