Skip to content

Commit ac84514

Browse files
James BarbettiJames Barbetti
James Barbetti
authored and
James Barbetti
committed
Copying over ("merging") recent changes from iqtree2's dev branch:
1. Added the files hashrow.h and fancyrapidnj.h. 2. Supporting changes, in other files, for the FancyRapidNJ class: (a) added ClusterTree::appendToLastCluster (b) advertising of BROKEN and BROKEN-V (temporary names for Fancy Rapid NJ) 3. Tweaked the omp parallel directives in BoundingMatrix, to squeeze a bit more speed out of BoundingMatrix (which implements NJ-R and BIONJ-R). 4. Added VectorizedBoundingMatrix 5. Supporting changes, in other files: (a) advertising of NJ-R-V (VectorizedBoundingMatrix) 6. Moved calculation of RMS of T (the distance matrix implied by the constructed tree) minus D (the input distance matrix) out of upgma.h, and into hashrow.h (the hashing!), and clustertree.h (calculateDistancesToLeaves and calculateRMSOfTMinusD). 7. Added DoubleArgument to utils/argument.h and utils/argument.cpp. 8. Fixed up definition of the "-dist" StringArgument (so it once again works), and added a "-max-dist" DoubleArgument (it's no longer "hard-coded", only defaulted as 10.0). Plus, supporting changes in: (a) sequence.h and sequence.cpp (extra parameters to correctedDistance and uncorrectedDistance, to indicate max distance), likewise to SequenceLoader'S constructor. 8. If compression is requested, via "-c" and "-gz", distance output format has ".gz" appended to it (so it will be compressed). Previously, newick tree files honoured "-c" and "-gz", but distance matrix files did not. 9. FlatMatrix::writeToDistanceFile now reports how much of the file it has written (via a progress_display instance). 10.Fix ups in pigzstream::open(). It now has a much better failure path (previously it claimed it had succeeded, in what it wrote to standard output!). In utils/gzstream.cpp. This works via a new member function, markAsFailed(), in progress_display (in utils/progress.h and .cpp). If it's been called, "done" is replaced with "failed" in what is written to standard output when done() is called. 11.MAJOR bug fix in utils/hammingdistance.h (the vectorized version of countBitsSetInEitherTemplate() didn't work properly at all) (W parameter is the size of V in uint64_t instances, not its size in bytes, so the j loop is (j=0; j<W; ++j)... not (j=0; j<W/8; ++j)...)
1 parent 28ac3c8 commit ac84514

21 files changed

+1725
-269
lines changed

auctionmatrix.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,10 @@ namespace StartTree {
3030
using super::entryToCluster;
3131
using super::entriesSorted;
3232
using super::purgeRow;
33-
virtual std::string getAlgorithmName() const {
33+
virtual std::string getAlgorithmName() const override {
3434
return "Auction" + M::getAlgorithmName();
3535
}
36-
StartTree::Position<T> getRowMinimum(size_t row, T maxTot, T qBest) const {
36+
virtual StartTree::Position<T> getRowMinimum(intptr_t row, T maxTot, T qBest) const override {
3737
StartTree::Position<T> pos(row, 0, infiniteDistance, 0);
3838
const T* rowData = entriesSorted.rows[row];
3939
const int* toCluster = entryToCluster.rows[row];

bionj2.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#include "nj.h"
5656
#include "rapidnj.h"
5757
#include "auctionmatrix.h"
58+
#include "fancyrapidnj.h"
5859

5960
namespace StartTree
6061
{
@@ -64,10 +65,22 @@ namespace StartTree
6465

6566
void addBioNJ2020TreeBuilders(Factory& f) {
6667
const char* defaultName = "RapidNJ";
67-
ADVERTISE(NJMatrix<NJFloat>, "NJ", "Neighbour Joining (Saitou, Nei [1987])");
68-
ADVERTISE(UNJMatrix<NJFloat>, "UNJ", "Unweighted Neighbour Joining (Gascel [1997])");
69-
ADVERTISE(RapidNJ, "NJ-R", "Rapid Neighbour Joining"
70-
" (Simonsen, Mailund, Pedersen [2011])");
68+
ADVERTISE(NJMatrix<NJFloat>, "NJ", "Neighbour Joining (Saitou, Nei [1987])");
69+
ADVERTISE(UNJMatrix<NJFloat>, "UNJ", "Unweighted Neighbour Joining (Gascel [1997])");
70+
ADVERTISE(RapidNJ, "NJ-R", "Rapid Neighbour Joining"
71+
" (Simonsen, Mailund, Pedersen [2011])");
72+
ADVERTISE(Vectorized_RapidNJ, "NJ-R-V", "Rapid Neighbour Joining (Vectorized)"
73+
" (Simonsen, Mailund, Pedersen [2011])");
74+
75+
ADVERTISE(RapidNJ, defaultName, "Rapid Neighbour Joining"
76+
" (Simonsen, Mailund, Pedersen [2011])");
77+
78+
ADVERTISE(FancyNJMatrix<NJFloat>,"BROKEN", "Rapid Neighbour Joining (Broken Version)"
79+
" (Simonsen, Mailund, Pedersen [2011])");
80+
81+
ADVERTISE(VectorizedFancyNJMatrix<NJFloat>, "BROKEN-V", "Rapid Neighbour Joining (Broken Version)"
82+
" (Simonsen, Mailund, Pedersen [2011]) (Vectorized)");
83+
7184
#ifdef USE_VECTORCLASS_LIBRARY
7285
ADVERTISE(VectorNJ, "NJ-V", "Vectorized Neighbour Joining (Saitou, Nei [1987])");
7386
#endif
@@ -82,7 +95,6 @@ void addBioNJ2020TreeBuilders(Factory& f) {
8295
ADVERTISE(VectorizedUPGMA_Matrix<NJFloat>, "UPGMA-V", "Vectorized UPGMA (Sokal, Michener [1958])");
8396
#endif
8497
ADVERTISE(BoundingMatrix<double>,"NJ-R-D", "Double precision Rapid Neighbour Joining");
85-
ADVERTISE(RapidNJ, defaultName, "Rapid Neighbour Joining (Simonsen, Mailund, Pedersen [2011]) (default)");
8698
f.setNameOfDefaultTreeBuilder(defaultName);
8799
ADVERTISE(DistanceAuctionMatrix, "AUCTION", "Auction Joining");
88100
}

clustertree.h

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <iostream> //for std::istream
3434
#include <sstream> //for std::stringstream
3535
#include <utils/progress.h> //for progress_display
36+
#include <utils/parallel_mergesort.h>//for MergeSorter
3637

3738
template <class T=double> struct Link {
3839
//
@@ -114,6 +115,108 @@ template <class T> class ClusterTree: public std::vector<Cluster<T>>
114115
//<< " is (" << a << ", " << b << ", " << c << ")" << std::endl;
115116
return cluster;
116117
}
118+
Cluster<T>& appendToLastCluster
119+
( size_t c, T length) {
120+
back().links.emplace_back(c, length);
121+
back().countOfExteriorNodes += at(c).countOfExteriorNodes;
122+
return back();
123+
}
124+
typedef std::pair<intptr_t /*taxon*/, T /*distance*/ > LeafDistance;
125+
typedef std::vector<LeafDistance> LeafDistanceVector;
126+
void calculateDistancesToLeaves(intptr_t top, double branch_length,
127+
LeafDistanceVector& ldv) {
128+
//Running time proportional to subtree size
129+
std::vector<LeafDistance> stack;
130+
stack.emplace_back(top, branch_length);
131+
while (!stack.empty()) {
132+
LeafDistance b = stack.back();
133+
stack.pop_back();
134+
Cluster<T>& node = at(b.first);
135+
if (node.links.empty()) {
136+
ldv.emplace_back(b);
137+
continue;
138+
}
139+
for (Link<T>& link : node.links ) {
140+
stack.emplace_back(link.clusterIndex,
141+
b.second + link.linkDistance );
142+
}
143+
}
144+
//
145+
//Hack: Reordering makes memory accesses "cache-friendlier"
146+
// for large clusters, in calculateRMSOfTMinusD().
147+
//Note: Single-threaded sorting is used, here, because
148+
// calculateRMSOfTMinusD() is already executing
149+
// calls to this function in parallel.
150+
//
151+
MergeSorter<LeafDistance> sorter;
152+
sorter.single_thread_sort(ldv.data(), ldv.size());
153+
}
154+
bool calculateRMSOfTMinusD(const double* matrix, intptr_t rank, double& rms) {
155+
//Assumes: rank is at least 3.
156+
//
157+
//Total running time: proportional to rank*(rank-1)/2.
158+
//(that's the number of additions to sum_of_squares).
159+
//
160+
//Total memory consumption: depends on thread count, X.
161+
//In theory, on the order of (X+1)*rank*(sizeof(intptr_t)+sizeof(T)).
162+
//2 * X * (maximum sum of sizes of leaf distance vectors).
163+
//The 2 is because of the use of a MergeSorter in
164+
//calculateDistancesToLeaves(). An in-place sort would
165+
//lower that to a 1.
166+
//
167+
intptr_t cluster_count = size();
168+
double sum_of_squares = 0.0;
169+
#ifdef _OPENMP
170+
#pragma omp parallel for reduction(+:sum_of_squares)
171+
#endif
172+
for (intptr_t h=rank; h<cluster_count; ++h) {
173+
//For each (non-leaf) cluster...
174+
//1. calculate distances to all the
175+
// leaves, for each contributing cluster.
176+
Cluster<T>& node = at(h);
177+
std::vector<LeafDistanceVector> subtrees;
178+
for (Link<T>& link : node.links ) {
179+
LeafDistanceVector ldv;
180+
calculateDistancesToLeaves(link.clusterIndex,
181+
link.linkDistance, ldv);
182+
subtrees.emplace_back(ldv);
183+
}
184+
//2. for each pair of LeafDistanceVectors, A, B,
185+
// (for separate contributing clusters)...
186+
size_t subtree_count = subtrees.size();
187+
for (size_t i=0; i+1<subtree_count; ++i) {
188+
for (size_t j=i+1; j<subtree_count; ++j) {
189+
//2b. for each leaf (a.first) from A
190+
for (LeafDistance& a: subtrees[i]) {
191+
auto row = matrix + a.first * rank;
192+
//2c. for each leaf (b.first) from B
193+
// calculate error; difference between
194+
// distance(a.first) + distance(b.first)
195+
// and D[a.first * rank + b.first].
196+
for (LeafDistance& b: subtrees[j]) {
197+
double diff = a.second + b.second
198+
- row[b.first];
199+
sum_of_squares += (diff * diff);
200+
#if (0)
201+
std::cout << a.first << " " << b.first << " "
202+
<< a.second + b.second << " "
203+
<< row[b.first] << " "
204+
<< diff << "\n";
205+
#endif
206+
}
207+
}
208+
}
209+
}
210+
//std::cout << "\n";
211+
}
212+
double double_rank = static_cast<double>(rank);
213+
rms = sqrt( sum_of_squares * 2.0 / double_rank / (double_rank - 1.0) );
214+
//std::cout << "rank " << (double_rank) << std::endl;
215+
//std::cout << "sum " << sum_of_squares << ","
216+
// << " divisor " << double_rank*(double_rank-1)*0.5 << std::endl;
217+
return true;
218+
}
219+
117220
template <class F>
118221
bool writeTreeToFile(int precision,
119222
const std::string &treeFilePath,

decenttree.cpp

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,19 @@ namespace {
5353
return answer;
5454
}
5555

56-
bool correcting_distances = true;
57-
bool is_DNA = true;
58-
bool numbered_names = false;
59-
bool filter_problem_sequences = false;
60-
char unknown_char = 'N';
61-
int precision = 8;
62-
int compression_level = 9;
56+
bool correcting_distances = true;
57+
bool is_DNA = true;
58+
bool numbered_names = false;
59+
bool filter_problem_sequences = false;
60+
char unknown_char = 'N';
61+
double max_distance = 10.0;
62+
int precision = 8;
63+
int compression_level = 9;
6364
std::string msaOutputPath; //write .msa formatted version of .fasta input here
6465
std::string alphabet; //defaults to ACGT
6566
std::string unknown_chars; //defaults to .~_-?N
66-
std::string format = "square.interleaved";
67-
bool interleaved_format = true;
67+
std::string format = "square.interleaved";
68+
bool interleaved_format = true;
6869

6970
std::string stripName; //characters to strip from names
7071
std::string nameReplace("_"); //characters to replace stripepd chars with, in names
@@ -116,7 +117,8 @@ void showUsage() {
116117
bool loadSequenceDistancesIntoMatrix(Sequences& sequences,
117118
const std::vector<char>& is_site_variant,
118119
bool report_progress, FlatMatrix& m) {
119-
SequenceLoader loader(unknown_char, is_DNA, sequences, correcting_distances,
120+
SequenceLoader loader(unknown_char, is_DNA, max_distance,
121+
sequences, correcting_distances,
120122
precision, compression_level, format,
121123
is_site_variant, report_progress);
122124
bool success = loader.loadSequenceDistances(m);
@@ -247,15 +249,17 @@ bool prepInput(const std::string& fastaFilePath,
247249
if (loadMatrix) {
248250
useNumberedNamesIfAskedTo(numbered_names, m);
249251
return m.writeToDistanceFile(format, precision,
250-
compression_level,
252+
compression_level, reportProgress,
251253
distanceOutputFilePath );
252254
}
253255
else {
254-
SequenceLoader loader(unknown_char, is_DNA, sequences,
256+
SequenceLoader loader(unknown_char, is_DNA,
257+
max_distance, sequences,
255258
correcting_distances, precision,
256259
compression_level, format,
257260
is_site_variant, reportProgress);
258-
bool success = loader.writeDistanceMatrixToFile(numbered_names, distanceOutputFilePath);
261+
bool success = loader.writeDistanceMatrixToFile(numbered_names,
262+
distanceOutputFilePath);
259263
return success;
260264
}
261265
}
@@ -265,7 +269,7 @@ bool prepInput(const std::string& fastaFilePath,
265269
fixUpSequenceNames(truncateName, stripName, nameReplace, m);
266270
if (!distanceOutputFilePath.empty()) {
267271
return m.writeToDistanceFile(format, precision,
268-
compression_level,
272+
compression_level, reportProgress,
269273
distanceOutputFilePath );
270274
}
271275
return true;
@@ -327,7 +331,7 @@ class DecentTreeOptions {
327331
arg_map << new StringArgument("-fasta", "fasta file path", fastaFilePath);
328332
arg_map << new StringArgument("-phylip", "phylip alignment file path", phylipFilePath);
329333
arg_map << new StringArgument("-in", "distance matrix file path", inputFilePath);
330-
arg_map << new StringArgument("-dist", "distance matrix file path", inputFilePath);
334+
arg_map << new StringArgument("-dist", "distance matrix file path", distanceOutputFilePath);
331335
arg_map << new IntArgument ("-c", "compression level between 1 and 9",
332336
compression_level);
333337
arg_map << new IntArgument ("-f", "precision level between 4 and 15",
@@ -349,6 +353,7 @@ class DecentTreeOptions {
349353
arg_map << new SwitchArgument("-gz", isOutputZipped, true);
350354
arg_map << new SwitchArgument("-no-banner", isBannerSuppressed, true);
351355
arg_map << new SwitchArgument("-uncorrected", correcting_distances, false);
356+
arg_map << new DoubleArgument("-max-dist", "maximum distance", max_distance);
352357
arg_map << new IntArgument ("-nt", "thread count", threadCount);
353358
arg_map << new SwitchArgument("-q", beSilent, true);
354359
arg_map << new SwitchArgument("-filter", filter_problem_sequences, true);
@@ -385,9 +390,17 @@ class DecentTreeOptions {
385390
break;
386391
}
387392
}
393+
if (max_distance<=0) {
394+
PROBLEM("Maximum distance (" << max_distance << ") too low");
395+
}
388396
range_restrict(0, 9, compression_level );
389397
range_restrict(1, 15, precision );
390398
format = string_to_lower(format);
399+
if (isOutputZipped &&
400+
format.find(".gz") == std::string::npos) {
401+
//Ensure that distance file will be compressed
402+
format += ".gz";
403+
}
391404
if (isOutputToStandardOutput) {
392405
outputFilePath = "STDOUT";
393406
}
@@ -502,7 +515,7 @@ int obeyCommandLineOptions(DecentTreeOptions& options) {
502515
else if (succeeded && options.isMatrixToBeLoaded) {
503516
succeeded = algorithm->constructTreeInMemory(m.getSequenceNames(),
504517
m.getDistanceMatrix(),
505-
options.outputFilePath);
518+
options.outputFilePath);
506519
}
507520
else if (!options.inputFilePath.empty()) {
508521
succeeded = algorithm->constructTree(options.inputFilePath,

decenttree_update.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ cd ../iqtree2/distancematrixtree
1414
cp decenttree.cpp flatmatrix.cpp flatmatrix.h starttree.cpp starttree.h ../../decenttree
1515
cp bionj.cpp bionj2.cpp bionj2.h upgma.h nj.h rapidnj.h auctionmatrix.h ../../decenttree
1616
cp clustertree.h distancematrix.h stitchup.cpp sequence.cpp sequence.h ../../decenttree
17+
cp hashrow.h fancyrapidnj.h ../../decenttree
1718
cd ../utils
1819
cp argument.cpp argument.h ../../decenttree/utils
1920
cp gzstream.cpp gzstream.h heapsort.h ../../decenttree/utils

0 commit comments

Comments
 (0)