Added comments to hashrow.h.

James Barbetti · James Barbetti · commit 4661a3741b51 · 2022-11-21T21:24:34.000+11:00
Also, added stuff to HashRow&lt;T&gt; so that it won't run off the end of
the second row, if the first row (*this) in a comparison has fewer
items in it (a lower row_count) than the than the second (rhs).
diff --git a/hashrow.h b/hashrow.h
@@ -1,3 +1,16 @@
+//
+//  HashRow<T> type for keeping track of rows of a distance matrix of T
+//  (hashing them, to make it cheaper to determine which rows
+//  are identical).
+//
+//  DuplicateTaxa type, for reporting the row numbers of rows
+//  that indicate identical taxa (or at least, taxa whose inter-taxa
+//  differences are the same, and that are all distance 0 from each 
+//  other).
+//
+//  Copyright James Barbetti (2021-22)
+//
+
 #pragma once
 #ifndef hashrow_h
 #define hashrow_h
@@ -8,12 +21,26 @@
 
 typedef std::vector< std::vector< intptr_t > > DuplicateTaxa;
 
+/**
+ * @brief  A pair, of a hash, and contiguous block of
+ *         something (T) that is hashable and comparable
+ *         via both operator!= and operator<.
+ *         Ordering is hash primary, with lexicographic 
+ *         block order to tie-break.
+ * @tparam T something hashable (via std::hash) and 
+ *         comparable.  
+ * @note   if blocks are different sizes, and the contents
+ *         of the blocks compare equal (to the size of the
+ *         smaller block, the shorter block is treated as
+ *         being less than the longer).
+ */
 template <class T> class HashRow {
 public:
-    intptr_t row_num;
-    const T* row_data;
-    size_t   row_length;
-    size_t   row_hash;
+    intptr_t row_num;    //an identifying row number (not used for ordering)
+    const T* row_data;   //the data block for the row
+    size_t   row_length; //the size of the data block
+    size_t   row_hash;   //the hash of the elements in the block
+
     HashRow(): row_num(-1), row_data(nullptr), row_length(0), row_hash(0) {}
     HashRow(intptr_t num, const T* row_start, size_t length)
         : row_num(num), row_data(row_start), row_length(length), row_hash(0) {
@@ -25,16 +52,31 @@ template <class T> class HashRow {
     HashRow(const HashRow& rhs) = default;
     HashRow& operator= (const HashRow& rhs) = default;
     
-
+    /**
+     * @brief  Compare *this with rhs
+     * @param  rhs 
+     * @return int -1 if *this compares less, +1 if rhs compares less,
+     *         and 0, if *this and rhs compare equal
+     */
     int compare(const HashRow& rhs) const {
+        //First compare hashes
         if (row_hash<rhs.row_hash) { return -1; }
         if (rhs.row_hash<row_hash) { return 1;  }
-        for ( size_t col=0; col<row_length ; ++col) {
+        int    count_diff    = 0;
+        size_t min_col_count = row_length; 
+        if (row_length != rhs.row_length) {
+            bool left_less = (row_length < rhs.row_length);
+            count_diff     = left_less ? -1 : 1;
+            min_col_count  = left_less ? row_length : rhs.row_length;
+        }
+        //compare elements in the blocks, up to the size of
+        //the smaller block.
+        for ( size_t col=0; col<min_col_count ; ++col) {
             if (row_data[col]!=rhs.row_data[col]) {
                 return (row_data[col]<rhs.row_data[col]) ? -1 : 1;
             }
         }
-        return 0;
+        return count_diff;
     }
     bool operator< (const HashRow& rhs) const {
         return compare(rhs)<0;
@@ -43,6 +85,19 @@ template <class T> class HashRow {
         return compare(rhs)==0;
     }
     
+    /**
+     * @brief  Given a sorted vector of HashRow<T>, that represents the
+     *         rows of a distance matrix, determine which groups of rows
+     *         in the distance matrix are duplicates.
+     * @param  hashed_rows the hashed distance matrix rows, in sorted order
+     * @param  vvc reference to a DuplicateTaxa instance (a vector of 
+     *             vectors of intptr_t).  Each of the vectors will 
+     *             contains the row numbers of the (duplicate) rows
+     *             in an equivalence class (in no particular order).
+     * @note   it assumed (but not checked) that hashed_rows has been sorted
+     * @note   row numbers, within an equivalence class, appear in vvc
+     *         in their order of appearance in hashed_rows.
+     */
     static void identifyDuplicateClusters(const std::vector< HashRow<T> >& hashed_rows,
                                           DuplicateTaxa& vvc) {
         std::vector< intptr_t> vc; //vector of cluster #s
@@ -51,7 +106,7 @@ template <class T> class HashRow {
             bool is_duplicate = hashed_rows[i].compare(hashed_rows[i-1])==0;
             if (is_duplicate) {
                 intptr_t   h = hashed_rows[i-1].row_num;
-                is_duplicate = hashed_rows[i].row_data[h] ==0;
+                is_duplicate = hashed_rows[i].row_data[h] == 0 ;
             }
             if (!is_duplicate) {
                 //Not a duplicate of the previous row.