75
75
76
76
namespace StartTree
77
77
{
78
+
79
+ /* *
80
+ * @brief Neighbour Joining implementation
81
+ * @tparam T the distance type
82
+ * @note The main structure is a D matrix
83
+ * (a matrix of distances) and a U vector
84
+ * (a vector of row totals for that matrix)
85
+ * @note The textbook implementation uses formulae that feature
86
+ * row totals, and multiplies distances by (n-2). This
87
+ * implementation calculates "scaled" row totals, which are
88
+ * row totals, U', multiplied by (1/(n-2)), so simpler
89
+ * formulae of the form Dij-U'i-U'j rather than (n-2)Dij-Ui-Uj,
90
+ * can be used (since we're just comparing magnitudes to determine
91
+ * which pair of clusters to join, it doesn't matter if those
92
+ * magnitudes are all rescaled.
93
+ * This trick saves a multiplication on every evaluation, at the
94
+ * cost of row_count multiplications (to calculate U' from U)
95
+ * per iteration.
96
+ */
78
97
template <class T =NJFloat> class NJMatrix : public UPGMA_Matrix <T>
79
- // NJMatrix is a D matrix (a matrix of distances).
80
98
{
81
99
public:
82
100
typedef UPGMA_Matrix<T> super;
@@ -99,13 +117,15 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
99
117
return " NJ" ;
100
118
}
101
119
protected:
120
+ /* *
121
+ * @brief Calculate scaled row totals, U' (in scaledRowTotals)
122
+ * by multiplying row totals, U (in rowTotals) by
123
+ * (1/(n-2)). If U' is used, it isn't necessarily to
124
+ * multiply distances (from the D matrix) by (n-2)
125
+ * when comparing adjusted distances to determine which
126
+ * cluster to join.
127
+ */
102
128
virtual void calculateScaledRowTotals () const {
103
- //
104
- // Note: Rather than multiplying distances by (n-2)
105
- // repeatedly, it is cheaper to work with row
106
- // totals multiplied by (1/(T)(n-2)).
107
- // Better n multiplications than n*(n-1)/2.
108
- //
109
129
scaledRowTotals.resize (row_count);
110
130
T nless2 = (T)( row_count - 2 );
111
131
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ((T)1.0 / nless2);
@@ -116,10 +136,29 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
116
136
scaledRowTotals[r] = rowTotals[r] * tMultiplier;
117
137
}
118
138
}
139
+ /* *
140
+ * @brief Override of calculateRowTotals() - that ensures that the
141
+ * scaled row totals are also recalculated.
142
+ */
119
143
virtual void calculateRowTotals () const override {
120
144
super::calculateRowTotals ();
121
145
calculateScaledRowTotals ();
122
146
}
147
+ /* *
148
+ * @brief determine, for each row, r, which column,row pair of
149
+ * clusters (with column c less than row r), would be the
150
+ * "best" choice for the next pair of clusters to join.
151
+ * @note scaled row totals are looked up "around" the
152
+ * scaledRowTotals vector via a "naked" array pointer,
153
+ * tot to avoid range-checking overhead.
154
+ * @note bestVrc is, initially, the "best" value of Drc - U'c
155
+ * for the part of the current row in the lower-left
156
+ * triangle (c between 0 and r-1 inclusive). U'r
157
+ * is only subtracted *after* the minimum for (Drc - U'c)
158
+ * has been found. We don't need to calculate
159
+ * Drc - U'c - U'r except for the c for which Drc - U'c
160
+ * was minimized.
161
+ */
123
162
virtual void getRowMinima () const override {
124
163
calculateScaledRowTotals ();
125
164
rowMinima.resize (row_count);
@@ -145,9 +184,30 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
145
184
, getImbalance (row, bestColumn));
146
185
}
147
186
}
187
+ /* *
188
+ * @brief Join two clusters (corresponding with column a and
189
+ * row b in the matrix), and update row totals on the go.
190
+ * @param a the column
191
+ * @param b the row (a<b)
192
+ * @note It is assumed throughout that 0<a<b<row_count.
193
+ * @note Most implementations of Neighbour Joining periodically
194
+ * recalculate row totals "from scratch" (the fear is that
195
+ * rounding error will otherwise accumulate). Originally
196
+ * I had code to do that, periodically. But it didn't
197
+ * seem to make much real difference. -James B.
198
+ * @note In the following c is the cluster that results by joining
199
+ * the clusters corresponding to rows a and b. In practice,
200
+ * cluster c will (by the time this function returns)
201
+ * correspond to the (a)th row/column in the matrix.
202
+ * And what *was* cluster (row_count-1) will correspond
203
+ * to the (b)th row/column.
204
+ * @note If we think of NJ as a "variant" of BIONJ, it is as
205
+ * though we always have lambda=0.5. And mu (1-lambda)
206
+ * is also 0.5. That's why those variable names are used.
207
+ * @note Perhaps scaledRowTotals[i] ought to be updated here, too.
208
+ * But then every subclass would have to do the same.
209
+ */
148
210
virtual void cluster (intptr_t a, intptr_t b) override {
149
- // Cluster two active rows, identified by row indices a and b).
150
- // Assumed 0<=a<b<n
151
211
T nless2 = (T)(row_count-2 );
152
212
T tMultiplier = (row_count<3 ) ? (T)0.0 : ((T)0.5 / nless2);
153
213
T lambda = (T)0.5 ;
@@ -171,7 +231,6 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
171
231
aRow[i] = Dci;
172
232
rows[i][a] = Dci;
173
233
rowTotals[i] += Dci - Dai - Dbi;
174
- // JB2020-06-18 Adjust row totals on fly
175
234
cTotal += Dci;
176
235
}
177
236
}
@@ -182,6 +241,13 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
182
241
rowToCluster[b] = rowToCluster[row_count-1 ];
183
242
removeRowAndColumn (b);
184
243
}
244
+ /* *
245
+ * @brief Finish the tree, by joining the last two or three clusters.
246
+ * (Conventionally there are three clusters, but if neighbour
247
+ * joining is being used to cluster a subtree, when honouring
248
+ * a constraint tree, depending on what the caller wants, the
249
+ * "top" node of the tree might have degree 2 rather than 3).
250
+ */
185
251
virtual void finishClustering () override {
186
252
ASSERT ( row_count == 2 || row_count == 3 );
187
253
T halfD01 = (T)0.5 * rows[0 ][1 ];
@@ -203,9 +269,18 @@ template <class T=NJFloat> class NJMatrix: public UPGMA_Matrix<T>
203
269
}
204
270
};
205
271
272
+ /* *
273
+ * @brief Unweighted Neighbour Joining implementation
274
+ * @tparam T the distance type.
275
+ * @note This is based on NJMatrix<T> (and why not, when the only
276
+ * difference between the two algorithms is how distances
277
+ * between newly joined clusters, and other existing clusters
278
+ * are calculated.
279
+ */
206
280
template <class T =NJFloat> class UNJMatrix : public NJMatrix <T> {
207
281
protected:
208
- intptr_t original_n;
282
+ intptr_t original_n; // Needed in formulae used in distance formulae
283
+ // during clutering.
209
284
public:
210
285
typedef NJMatrix<T> super;
211
286
UNJMatrix (): super(), original_n(0 ) { }
@@ -225,24 +300,29 @@ template <class T=NJFloat> class UNJMatrix: public NJMatrix<T> {
225
300
using super::rowToCluster;
226
301
using super::removeRowAndColumn;
227
302
303
+ /* *
304
+ * @brief Join two clusters (those corresponding to rows a and b
305
+ * in the working distance matrix).
306
+ * @param a the lower-numbered row (or, if you prefer the column,
307
+ * in the lower-left triangle of the matrix)
308
+ * @param b the higher-numbered row (a<b)
309
+ * @note It is assumed a<b.
310
+ * @note The clusters are weighted in terms of the number of
311
+ * taxa they contain (aCount and bCount), as per [GAS1997].
312
+ * (Conceptually, the leaf nodes for the taxa are NOT weighted,
313
+ * and standard NJ is "downweighting" each taxon, with a weight
314
+ * of 1/aCount in the cluster for row a, and by a weight
315
+ * of 1/bCount in the cluster for row b; but in practice,
316
+ * "undoing the effect of the weighting" requires more
317
+ * multiplication, and more time).
318
+ * So UNJMatrix runs slower than NJMatrix! The End. -James B.
319
+ * @note The greek letter lambda is given a different role
320
+ * in [GAS1997], but I wanted to use it in a fashion
321
+ * a bit more consistent with later BIONJ implementations
322
+ * (since BIONJ is much more widely used). -James B.
323
+ * @note mu is just (1-lambda).
324
+ */
228
325
virtual void cluster (intptr_t a, intptr_t b) override {
229
- // Cluster two active rows, identified by row indices a and b).
230
- // Assumes: 0<=a<b<n
231
- //
232
- // The clusters are weighted in terms of the number of taxa they
233
- // contain (aCount and bCount), as per [GAS1997].
234
- // (Conceptually, the leaf nodes for the taxa are NOT weighted,
235
- // and standard NJ is "downweighting" each taxon, with a weight of
236
- // 1/aCount in the cluster for row a, and by a weight of
237
- // 1/bCount in the cluster for row b; but in practice, "undoing
238
- // the effect of the weighting" requires more multiplication, and
239
- // more time).
240
- //
241
- // Note: The greek letter lambda is given a different role
242
- // in [GAS1997], but I wanted to use it in a fashion
243
- // a bit more consistent with later BIONJ implementations.
244
- // -James B. 19-Oct-2020.
245
- //
246
326
size_t aCount = clusters[rowToCluster[a]].countOfExteriorNodes ;
247
327
size_t bCount = clusters[rowToCluster[b]].countOfExteriorNodes ;
248
328
size_t cCount = aCount + bCount;
@@ -294,6 +374,19 @@ template <class T=NJFloat> class UNJMatrix: public NJMatrix<T> {
294
374
}
295
375
};
296
376
377
+ /* *
378
+ * @brief Implementation of the BIONJ neighbour joining variant
379
+ * @tparam T the distance type
380
+ * @note The big difference between BIONJ and NJ is that BIONJ
381
+ * maintains an auxiliary V matrix, of "estimated variances"
382
+ * of "positions" of clusters in a notional genetic "space".
383
+ * The variance member is V. Row and column rearrangements
384
+ * in V duplicate those in the primary distance matrix, D.
385
+ * Values in the V matrix are used for determining lambda
386
+ * (the "weight" to be assigned to the first cluster, when
387
+ * merging two clusters, corresponding to rows a and b, a<b,
388
+ * in the D matrix).
389
+ */
297
390
template <class T =NJFloat> class BIONJMatrix : public NJMatrix <T> {
298
391
public:
299
392
typedef NJMatrix<T> super;
@@ -321,11 +414,32 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
321
414
bool rc = super::loadMatrix (names, matrix);
322
415
return rc;
323
416
}
417
+ /* *
418
+ * @brief Calculate lambda, the weight to give distances
419
+ * to the cluster that currently corresponds to
420
+ * row a of the distance matrix, if it is joined
421
+ * with the custer that currently corresponds to
422
+ * row b of the distance matrix (given row numbers
423
+ * a and b, 0<=a<b<row_count).
424
+ * @param a the lower row number (0<=a)
425
+ * @param b the higher row number (a<b<row_count)
426
+ * @param Vab the estimated variance of the genetic
427
+ * distance between the clusters that currently
428
+ * correspond to rows a and b of the distance matrix.
429
+ * @return T lambda, the weight to give distances to the a cluster,
430
+ * between 0 and 1. mu, the weight to give distances
431
+ * to the b cluster, is always (1-lambda).
432
+ * @note The code that adds for all i, and then subtracts
433
+ * for a and b, runs faster than would code that,
434
+ * for all i, checked that i!=a && i!=b before deciding
435
+ * whether to add. Yes, there's a slight increase in
436
+ * rounding error, but the performance benefit appears
437
+ * to be worth it. -James B.
438
+ */
324
439
inline T chooseLambda (intptr_t a, intptr_t b, T Vab) {
325
- // Assumed 0<=a<b<n
326
440
T lambda = 0 ;
327
441
if (Vab==0.0 ) {
328
- return 0.5 ;
442
+ return 0.5 ; // special case (to avoid division by zero)
329
443
}
330
444
auto vRowA = variance.rows [a];
331
445
auto vRowB = variance.rows [b];
@@ -343,6 +457,19 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
343
457
if (lambda<0.0 ) lambda=(T)0.0 ;
344
458
return lambda;
345
459
}
460
+ /* *
461
+ * @brief Join the clusters that currently correspond to rows
462
+ * a and b of the distance matrix. Update row totals
463
+ * (but not scaled row totals), and variances, too.
464
+ * @param a the lower row number (0<=a<b)
465
+ * @param b the higher row number (a<b<row_count)
466
+ * @note The bits of this that differ, in a meaningful way, from
467
+ * the corresponding code in NJMatrix are tagged with BIO.
468
+ * In NJ, lambda is always 0.5, and elements in V for the new
469
+ * cluster (which overwrite what is in row a in V), need to
470
+ * be calcualted. And lastly, what is done to the D matrix
471
+ * to shrink it, also has to be "echoed" on the V matrix.
472
+ */
346
473
virtual void cluster (intptr_t a, intptr_t b) override {
347
474
// Assumed 0<=a<b<n
348
475
// Bits that differ from super::cluster tagged BIO
@@ -398,18 +525,27 @@ template <class T=NJFloat> class BIONJMatrix : public NJMatrix<T> {
398
525
};
399
526
400
527
#if USE_VECTORCLASS_LIBRARY
528
+ /* *
529
+ * @brief
530
+ *
531
+ * @tparam T the distance type
532
+ * @tparam SUPER the superclass (in practice, NJMatrix<T>,
533
+ * UNJMatrix<T>, or BIONJMatrix<T>) this class is to
534
+ * vectorize.
535
+ * @tparam V the vector type
536
+ * @tparam VB the boolean vector type
537
+ * @note This works by overriding getRowMinima() with
538
+ * a vectorized version.
539
+ * @note This can't subclass UPGMAMatrix<T> (so don't try
540
+ * using it with SUPER=UPGMAMatrix<T>!), because
541
+ * that has a rather different getRowMinima().
542
+ * There's a separate class for vectorizing that.
543
+ */
401
544
template <class T =NJFloat, class SUPER =BIONJMatrix<T>,
402
545
class V =FloatVector, class VB =FloatBoolVector>
403
546
class VectorizedMatrix : public SUPER
404
547
{
405
548
typedef SUPER super;
406
- //
407
- // Note: this is a first attempt at hand-vectorizing
408
- // BIONJMatrix::getRowMinima (via Agner Fog's
409
- // vectorclass library).
410
- // It can subclass either NJMatrix or BIONJMatrix.
411
- // It cannot subclass UPGMA_Matrix.
412
- //
413
549
using super::rows;
414
550
using super::row_count;
415
551
using super::rowMinima;
@@ -426,12 +562,29 @@ class VectorizedMatrix: public SUPER
426
562
virtual std::string getAlgorithmName () const {
427
563
return " Vectorized-" + super::getAlgorithmName ();
428
564
}
565
+ /* *
566
+ * @brief Override for calculateRowTotals().
567
+ * Make sure the scaled total and row number
568
+ * scratch vectors are large enough.
569
+ */
429
570
virtual void calculateRowTotals () const {
430
571
super::calculateRowTotals ();
431
572
size_t fluff = MATRIX_ALIGNMENT / sizeof (T);
432
573
scratchTotals.resize (row_count + fluff, 0.0 );
433
574
scratchColumnNumbers.resize (row_count + fluff, 0.0 );
434
575
}
576
+ /* *
577
+ * @brief Get the Row Minima object
578
+ * @note tot is an *aligned* pointer to
579
+ * elements in a "scratch" vector of rescaled
580
+ * row totals, calculated each time this member
581
+ * is called.
582
+ * @note nums is an *aligned* pointer to elements in
583
+ * a "scratch" vector, which is used to keep
584
+ * track of *where* (which is to say, which)
585
+ * column, each adjusted distance in the current
586
+ * row comes from.
587
+ */
435
588
virtual void getRowMinima () const {
436
589
T nless2 = (T)( row_count - 2 );
437
590
T tMultiplier = ( row_count <= 2 ) ? (T)0.0 : ((T)1.0 / nless2);
@@ -498,5 +651,4 @@ class VectorizedMatrix: public SUPER
498
651
typedef VectorizedMatrix<NJFloat, BIONJMatrix<NJFloat>> VectorBIONJ;
499
652
#endif
500
653
}
501
-
502
654
#endif /* nj_h */
0 commit comments