Skip to content

Commit 81e8e40

Browse files
James BarbettiJames Barbetti
James Barbetti
authored and
James Barbetti
committed
Added doxygen-style comments to nj.h.
1 parent a18eafa commit 81e8e40

File tree

1 file changed

+190
-38
lines changed

1 file changed

+190
-38
lines changed

nj.h

Lines changed: 190 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,26 @@
7575

7676
namespace StartTree
7777
{
78+
79+
/**
80+
* @brief Neighbour Joining implementation
81+
* @tparam T the distance type
82+
* @note The main structure is a D matrix
83+
* (a matrix of distances) and a U vector
84+
* (a vector of row totals for that matrix)
85+
* @note The textbook implementation uses formulae that feature
86+
* row totals, and multiplies distances by (n-2). This
87+
* implementation calculates "scaled" row totals, which are
88+
* row totals, U', multiplied by (1/(n-2)), so simpler
89+
* formulae of the form Dij-U'i-U'j rather than (n-2)Dij-Ui-Uj,
90+
* can be used (since we're just comparing magnitudes to determine
91+
* which pair of clusters to join, it doesn't matter if those
92+
* magnitudes are all rescaled.
93+
* This trick saves a multiplication on every evaluation, at the
94+
* cost of row_count multiplications (to calculate U' from U)
95+
* per iteration.
96+
*/
7897
template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
79-
//NJMatrix is a D matrix (a matrix of distances).
8098
{
8199
public:
82100
typedef UPGMA_Matrix<T> super;
@@ -99,13 +117,15 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
99117
return "NJ";
100118
}
101119
protected:
120+
/**
121+
* @brief Calculate scaled row totals, U' (in scaledRowTotals)
122+
* by multiplying row totals, U (in rowTotals) by
123+
* (1/(n-2)). If U' is used, it isn't necessarily to
124+
* multiply distances (from the D matrix) by (n-2)
125+
* when comparing adjusted distances to determine which
126+
* cluster to join.
127+
*/
102128
virtual void calculateScaledRowTotals() const {
103-
//
104-
//Note: Rather than multiplying distances by (n-2)
105-
// repeatedly, it is cheaper to work with row
106-
// totals multiplied by (1/(T)(n-2)).
107-
// Better n multiplications than n*(n-1)/2.
108-
//
109129
scaledRowTotals.resize(row_count);
110130
T nless2 = (T)( row_count - 2 );
111131
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ((T)1.0 / nless2);
@@ -116,10 +136,29 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
116136
scaledRowTotals[r] = rowTotals[r] * tMultiplier;
117137
}
118138
}
139+
/**
140+
* @brief Override of calculateRowTotals() - that ensures that the
141+
* scaled row totals are also recalculated.
142+
*/
119143
virtual void calculateRowTotals() const override {
120144
super::calculateRowTotals();
121145
calculateScaledRowTotals();
122146
}
147+
/**
148+
* @brief determine, for each row, r, which column,row pair of
149+
* clusters (with column c less than row r), would be the
150+
* "best" choice for the next pair of clusters to join.
151+
* @note scaled row totals are looked up "around" the
152+
* scaledRowTotals vector via a "naked" array pointer,
153+
* tot to avoid range-checking overhead.
154+
* @note bestVrc is, initially, the "best" value of Drc - U'c
155+
* for the part of the current row in the lower-left
156+
* triangle (c between 0 and r-1 inclusive). U'r
157+
* is only subtracted *after* the minimum for (Drc - U'c)
158+
* has been found. We don't need to calculate
159+
* Drc - U'c - U'r except for the c for which Drc - U'c
160+
* was minimized.
161+
*/
123162
virtual void getRowMinima() const override {
124163
calculateScaledRowTotals();
125164
rowMinima.resize(row_count);
@@ -145,9 +184,30 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
145184
, getImbalance(row, bestColumn));
146185
}
147186
}
187+
/**
188+
* @brief Join two clusters (corresponding with column a and
189+
* row b in the matrix), and update row totals on the go.
190+
* @param a the column
191+
* @param b the row (a<b)
192+
* @note It is assumed throughout that 0<a<b<row_count.
193+
* @note Most implementations of Neighbour Joining periodically
194+
* recalculate row totals "from scratch" (the fear is that
195+
* rounding error will otherwise accumulate). Originally
196+
* I had code to do that, periodically. But it didn't
197+
* seem to make much real difference. -James B.
198+
* @note In the following c is the cluster that results by joining
199+
* the clusters corresponding to rows a and b. In practice,
200+
* cluster c will (by the time this function returns)
201+
* correspond to the (a)th row/column in the matrix.
202+
* And what *was* cluster (row_count-1) will correspond
203+
* to the (b)th row/column.
204+
* @note If we think of NJ as a "variant" of BIONJ, it is as
205+
* though we always have lambda=0.5. And mu (1-lambda)
206+
* is also 0.5. That's why those variable names are used.
207+
* @note Perhaps scaledRowTotals[i] ought to be updated here, too.
208+
* But then every subclass would have to do the same.
209+
*/
148210
virtual void cluster(intptr_t a, intptr_t b) override {
149-
//Cluster two active rows, identified by row indices a and b).
150-
//Assumed 0<=a<b<n
151211
T nless2 = (T)(row_count-2);
152212
T tMultiplier = (row_count<3) ? (T)0.0 : ((T)0.5 / nless2);
153213
T lambda = (T)0.5;
@@ -171,7 +231,6 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
171231
aRow[i] = Dci;
172232
rows[i][a] = Dci;
173233
rowTotals[i] += Dci - Dai - Dbi;
174-
//JB2020-06-18 Adjust row totals on fly
175234
cTotal += Dci;
176235
}
177236
}
@@ -182,6 +241,13 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
182241
rowToCluster[b] = rowToCluster[row_count-1];
183242
removeRowAndColumn(b);
184243
}
244+
/**
245+
* @brief Finish the tree, by joining the last two or three clusters.
246+
* (Conventionally there are three clusters, but if neighbour
247+
* joining is being used to cluster a subtree, when honouring
248+
* a constraint tree, depending on what the caller wants, the
249+
* "top" node of the tree might have degree 2 rather than 3).
250+
*/
185251
virtual void finishClustering() override {
186252
ASSERT( row_count == 2 || row_count == 3);
187253
T halfD01 = (T)0.5 * rows[0][1];
@@ -203,9 +269,18 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
203269
}
204270
};
205271

272+
/**
273+
* @brief Unweighted Neighbour Joining implementation
274+
* @tparam T the distance type.
275+
* @note This is based on NJMatrix<T> (and why not, when the only
276+
* difference between the two algorithms is how distances
277+
* between newly joined clusters, and other existing clusters
278+
* are calculated.
279+
*/
206280
template <class T=NJFloat> class UNJMatrix: public NJMatrix<T> {
207281
protected:
208-
intptr_t original_n;
282+
intptr_t original_n; //Needed in formulae used in distance formulae
283+
//during clutering.
209284
public:
210285
typedef NJMatrix<T> super;
211286
UNJMatrix(): super(), original_n(0) { }
@@ -225,24 +300,29 @@ template <class T=NJFloat> class UNJMatrix: public NJMatrix<T> {
225300
using super::rowToCluster;
226301
using super::removeRowAndColumn;
227302

303+
/**
304+
* @brief Join two clusters (those corresponding to rows a and b
305+
* in the working distance matrix).
306+
* @param a the lower-numbered row (or, if you prefer the column,
307+
* in the lower-left triangle of the matrix)
308+
* @param b the higher-numbered row (a<b)
309+
* @note It is assumed a<b.
310+
* @note The clusters are weighted in terms of the number of
311+
* taxa they contain (aCount and bCount), as per [GAS1997].
312+
* (Conceptually, the leaf nodes for the taxa are NOT weighted,
313+
* and standard NJ is "downweighting" each taxon, with a weight
314+
* of 1/aCount in the cluster for row a, and by a weight
315+
* of 1/bCount in the cluster for row b; but in practice,
316+
* "undoing the effect of the weighting" requires more
317+
* multiplication, and more time).
318+
* So UNJMatrix runs slower than NJMatrix! The End. -James B.
319+
* @note The greek letter lambda is given a different role
320+
* in [GAS1997], but I wanted to use it in a fashion
321+
* a bit more consistent with later BIONJ implementations
322+
* (since BIONJ is much more widely used). -James B.
323+
* @note mu is just (1-lambda).
324+
*/
228325
virtual void cluster(intptr_t a, intptr_t b) override {
229-
//Cluster two active rows, identified by row indices a and b).
230-
//Assumes: 0<=a<b<n
231-
//
232-
//The clusters are weighted in terms of the number of taxa they
233-
//contain (aCount and bCount), as per [GAS1997].
234-
//(Conceptually, the leaf nodes for the taxa are NOT weighted,
235-
// and standard NJ is "downweighting" each taxon, with a weight of
236-
// 1/aCount in the cluster for row a, and by a weight of
237-
// 1/bCount in the cluster for row b; but in practice, "undoing
238-
// the effect of the weighting" requires more multiplication, and
239-
// more time).
240-
//
241-
//Note: The greek letter lambda is given a different role
242-
// in [GAS1997], but I wanted to use it in a fashion
243-
// a bit more consistent with later BIONJ implementations.
244-
// -James B. 19-Oct-2020.
245-
//
246326
size_t aCount = clusters[rowToCluster[a]].countOfExteriorNodes;
247327
size_t bCount = clusters[rowToCluster[b]].countOfExteriorNodes;
248328
size_t cCount = aCount + bCount;
@@ -294,6 +374,19 @@ template <class T=NJFloat> class UNJMatrix: public NJMatrix<T> {
294374
}
295375
};
296376

377+
/**
378+
* @brief Implementation of the BIONJ neighbour joining variant
379+
* @tparam T the distance type
380+
* @note The big difference between BIONJ and NJ is that BIONJ
381+
* maintains an auxiliary V matrix, of "estimated variances"
382+
* of "positions" of clusters in a notional genetic "space".
383+
* The variance member is V. Row and column rearrangements
384+
* in V duplicate those in the primary distance matrix, D.
385+
* Values in the V matrix are used for determining lambda
386+
* (the "weight" to be assigned to the first cluster, when
387+
* merging two clusters, corresponding to rows a and b, a<b,
388+
* in the D matrix).
389+
*/
297390
template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
298391
public:
299392
typedef NJMatrix<T> super;
@@ -321,11 +414,32 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
321414
bool rc = super::loadMatrix(names, matrix);
322415
return rc;
323416
}
417+
/**
418+
* @brief Calculate lambda, the weight to give distances
419+
* to the cluster that currently corresponds to
420+
* row a of the distance matrix, if it is joined
421+
* with the custer that currently corresponds to
422+
* row b of the distance matrix (given row numbers
423+
* a and b, 0<=a<b<row_count).
424+
* @param a the lower row number (0<=a)
425+
* @param b the higher row number (a<b<row_count)
426+
* @param Vab the estimated variance of the genetic
427+
* distance between the clusters that currently
428+
* correspond to rows a and b of the distance matrix.
429+
* @return T lambda, the weight to give distances to the a cluster,
430+
* between 0 and 1. mu, the weight to give distances
431+
* to the b cluster, is always (1-lambda).
432+
* @note The code that adds for all i, and then subtracts
433+
* for a and b, runs faster than would code that,
434+
* for all i, checked that i!=a && i!=b before deciding
435+
* whether to add. Yes, there's a slight increase in
436+
* rounding error, but the performance benefit appears
437+
* to be worth it. -James B.
438+
*/
324439
inline T chooseLambda(intptr_t a, intptr_t b, T Vab) {
325-
//Assumed 0<=a<b<n
326440
T lambda = 0;
327441
if (Vab==0.0) {
328-
return 0.5;
442+
return 0.5; //special case (to avoid division by zero)
329443
}
330444
auto vRowA = variance.rows[a];
331445
auto vRowB = variance.rows[b];
@@ -343,6 +457,19 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
343457
if (lambda<0.0) lambda=(T)0.0;
344458
return lambda;
345459
}
460+
/**
461+
* @brief Join the clusters that currently correspond to rows
462+
* a and b of the distance matrix. Update row totals
463+
* (but not scaled row totals), and variances, too.
464+
* @param a the lower row number (0<=a<b)
465+
* @param b the higher row number (a<b<row_count)
466+
* @note The bits of this that differ, in a meaningful way, from
467+
* the corresponding code in NJMatrix are tagged with BIO.
468+
* In NJ, lambda is always 0.5, and elements in V for the new
469+
* cluster (which overwrite what is in row a in V), need to
470+
* be calcualted. And lastly, what is done to the D matrix
471+
* to shrink it, also has to be "echoed" on the V matrix.
472+
*/
346473
virtual void cluster(intptr_t a, intptr_t b) override {
347474
//Assumed 0<=a<b<n
348475
//Bits that differ from super::cluster tagged BIO
@@ -398,18 +525,27 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
398525
};
399526

400527
#if USE_VECTORCLASS_LIBRARY
528+
/**
529+
* @brief
530+
*
531+
* @tparam T the distance type
532+
* @tparam SUPER the superclass (in practice, NJMatrix<T>,
533+
* UNJMatrix<T>, or BIONJMatrix<T>) this class is to
534+
* vectorize.
535+
* @tparam V the vector type
536+
* @tparam VB the boolean vector type
537+
* @note This works by overriding getRowMinima() with
538+
* a vectorized version.
539+
* @note This can't subclass UPGMAMatrix<T> (so don't try
540+
* using it with SUPER=UPGMAMatrix<T>!), because
541+
* that has a rather different getRowMinima().
542+
* There's a separate class for vectorizing that.
543+
*/
401544
template <class T=NJFloat, class SUPER=BIONJMatrix<T>,
402545
class V=FloatVector, class VB=FloatBoolVector>
403546
class VectorizedMatrix: public SUPER
404547
{
405548
typedef SUPER super;
406-
//
407-
//Note: this is a first attempt at hand-vectorizing
408-
// BIONJMatrix::getRowMinima (via Agner Fog's
409-
// vectorclass library).
410-
// It can subclass either NJMatrix or BIONJMatrix.
411-
// It cannot subclass UPGMA_Matrix.
412-
//
413549
using super::rows;
414550
using super::row_count;
415551
using super::rowMinima;
@@ -426,12 +562,29 @@ class VectorizedMatrix: public SUPER
426562
virtual std::string getAlgorithmName() const {
427563
return "Vectorized-" + super::getAlgorithmName();
428564
}
565+
/**
566+
* @brief Override for calculateRowTotals().
567+
* Make sure the scaled total and row number
568+
* scratch vectors are large enough.
569+
*/
429570
virtual void calculateRowTotals() const {
430571
super::calculateRowTotals();
431572
size_t fluff = MATRIX_ALIGNMENT / sizeof(T);
432573
scratchTotals.resize(row_count + fluff, 0.0);
433574
scratchColumnNumbers.resize(row_count + fluff, 0.0);
434575
}
576+
/**
577+
* @brief Get the Row Minima object
578+
* @note tot is an *aligned* pointer to
579+
* elements in a "scratch" vector of rescaled
580+
* row totals, calculated each time this member
581+
* is called.
582+
* @note nums is an *aligned* pointer to elements in
583+
* a "scratch" vector, which is used to keep
584+
* track of *where* (which is to say, which)
585+
* column, each adjusted distance in the current
586+
* row comes from.
587+
*/
435588
virtual void getRowMinima() const {
436589
T nless2 = (T)( row_count - 2 );
437590
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ((T)1.0 / nless2);
@@ -498,5 +651,4 @@ class VectorizedMatrix: public SUPER
498651
typedef VectorizedMatrix<NJFloat, BIONJMatrix<NJFloat>> VectorBIONJ;
499652
#endif
500653
}
501-
502654
#endif /* nj_h */

0 commit comments

Comments
 (0)