Skip to content

Commit 59c74c3

Browse files
authored
Merge pull request #1023 from sooyoung-cha/mmseqsSET2
cluster with --set-mode 1 makes multimer-wise clustering possible
2 parents e007ce3 + 6b5ad9a commit 59c74c3

File tree

10 files changed

+388
-63
lines changed

10 files changed

+388
-63
lines changed

src/clustering/AlignmentSymmetry.cpp

Lines changed: 122 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ void AlignmentSymmetry::readInData(DBReader<unsigned int>*alnDbr, DBReader<unsig
103103
Debug(Debug::ERROR) << "Alignment format is not supported!\n";
104104
EXIT(EXIT_FAILURE);
105105
}
106-
107106
}
108107
if (currElement == UINT_MAX || currElement > seqDbr->getSize()) {
109108
Debug(Debug::ERROR) << "Element " << dbKey
@@ -120,6 +119,126 @@ void AlignmentSymmetry::readInData(DBReader<unsigned int>*alnDbr, DBReader<unsig
120119
}
121120
}
122121

122+
void AlignmentSymmetry::readInDataSet(DBReader<unsigned int>*alnDbr, DBReader<unsigned int>*seqDbr,
123+
unsigned int **elementLookupTable, unsigned short **elementScoreTable,
124+
int scoretype, size_t *offsets, size_t *sourceOffsets, unsigned int **sourceLookupTable, unsigned int *keyToSet, bool isfirst) {
125+
const int alnType = alnDbr->getDbtype();
126+
const size_t dbSize = seqDbr->getSize();
127+
const size_t flushSize = 1000000;
128+
Debug::Progress progress(dbSize);
129+
size_t iterations = static_cast<int>(ceil(static_cast<double>(dbSize)/static_cast<double>(flushSize)));
130+
for(size_t it = 0; it < iterations; it++) {
131+
size_t start = it * flushSize;
132+
size_t bucketSize = std::min(dbSize - (it * flushSize), flushSize);
133+
#pragma omp parallel
134+
{
135+
unsigned int thread_idx = 0;
136+
#ifdef OPENMP
137+
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
138+
#endif
139+
#pragma omp for schedule(dynamic, 100)
140+
141+
for (size_t i = start; i < (start + bucketSize); i++) {
142+
progress.updateProgress();
143+
// seqDbr is descending sorted by length
144+
// the assumption is that clustering is B -> B (not A -> B)
145+
const unsigned int clusterId = seqDbr->getDbKey(i);
146+
size_t start1 = sourceOffsets[clusterId];
147+
size_t end1 = sourceOffsets[clusterId+1];
148+
size_t len = end1 - start1;
149+
size_t isnull = 0;
150+
151+
size_t writePos = 0;
152+
std::vector<bool> bitFlags(dbSize, false);
153+
for (size_t j = 0; j < len; ++j) {
154+
unsigned int value = sourceLookupTable[clusterId][j];
155+
if (value != UINT_MAX) {
156+
const size_t alnId = alnDbr->getId(value);
157+
char *data = alnDbr->getData(alnId, thread_idx);
158+
if (*data == '\0') { // check if file contains entry
159+
isnull++;
160+
continue;
161+
}
162+
while (*data != '\0') {
163+
char similarity[255 + 1];
164+
char dbKey[255 + 1];
165+
Util::parseKey(data, dbKey);
166+
const size_t currElement = seqDbr->getId(keyToSet[(unsigned int) strtoul(dbKey, NULL, 10)]);
167+
if(bitFlags[currElement]==0){
168+
if (elementScoreTable != NULL) {
169+
if (Parameters::isEqualDbtype(alnType,Parameters::DBTYPE_ALIGNMENT_RES)) {
170+
if (scoretype == Parameters::APC_ALIGNMENTSCORE) {
171+
//column 1 = alignment score
172+
Util::parseByColumnNumber(data, similarity, 1);
173+
elementScoreTable[i][writePos] = (unsigned short) (atof(similarity));
174+
} else {
175+
//column 2 = sequence identity [0-1]
176+
Util::parseByColumnNumber(data, similarity, 2);
177+
elementScoreTable[i][writePos] = (unsigned short) (atof(similarity) * 1000.0f);
178+
}
179+
}
180+
else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_RES) ||
181+
Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_REV_RES)) {
182+
//column 1 = alignment score or sequence identity [0-100]
183+
Util::parseByColumnNumber(data, similarity, 1);
184+
short sim = atoi(similarity);
185+
elementScoreTable[i][writePos] = (unsigned short) (sim >0 ? sim : -sim);
186+
}
187+
else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_CLUSTER_RES)) {
188+
elementScoreTable[i][writePos] = (unsigned short) (USHRT_MAX);
189+
}
190+
else {
191+
Debug(Debug::ERROR) << "Alignment format is not supported!\n";
192+
EXIT(EXIT_FAILURE);
193+
}
194+
}
195+
if (currElement == UINT_MAX || currElement > seqDbr->getSize()) {
196+
Debug(Debug::ERROR) << "Element " << dbKey
197+
<< " contained in some alignment list, but not contained in the sequence database!\n";
198+
EXIT(EXIT_FAILURE);
199+
}
200+
elementLookupTable[i][writePos] = currElement;
201+
bitFlags[currElement] = 1;
202+
writePos++;
203+
}
204+
data = Util::skipLine(data);
205+
}
206+
}
207+
}
208+
if (isfirst) {
209+
offsets[i] = writePos;
210+
}
211+
if (isnull == len) {
212+
elementLookupTable[i][0] = seqDbr->getId(clusterId);
213+
if (elementScoreTable != NULL) {
214+
if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_ALIGNMENT_RES)) {
215+
if (scoretype == Parameters::APC_ALIGNMENTSCORE) {
216+
//column 1 = alignment score
217+
elementScoreTable[i][0] = (unsigned short) (USHRT_MAX);
218+
} else {
219+
//column 2 = sequence identity [0-1]
220+
elementScoreTable[i][0] = (unsigned short) (1.0 * 1000.0f);
221+
}
222+
} else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_RES) ||
223+
Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_REV_RES)) {
224+
//column 1 = alignment score or sequence identity [0-100]
225+
elementScoreTable[i][0] = (unsigned short) (USHRT_MAX);
226+
} else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_CLUSTER_RES)) {
227+
elementScoreTable[i][0] = (unsigned short) (USHRT_MAX);
228+
}
229+
}
230+
isnull = 0;
231+
if (isfirst) {
232+
offsets[i] = 1;
233+
}
234+
continue;
235+
}
236+
}
237+
}
238+
alnDbr->remapData();
239+
}
240+
}
241+
123242
size_t AlignmentSymmetry::findMissingLinks(unsigned int ** elementLookupTable, size_t * offsetTable, size_t dbSize, int threads) {
124243
// init memory for parallel merge
125244
unsigned int * tmpSize = new(std::nothrow) unsigned int[threads * dbSize];
@@ -138,9 +257,9 @@ size_t AlignmentSymmetry::findMissingLinks(unsigned int ** elementLookupTable, s
138257
const unsigned int currElm = elementLookupTable[setId][elementId];
139258
const unsigned int currElementSize = LEN(offsetTable, currElm);
140259
const bool elementFound = std::binary_search(elementLookupTable[currElm],
141-
elementLookupTable[currElm] + currElementSize, setId);
260+
elementLookupTable[currElm] + currElementSize, setId);
142261
// this is a new connection since setId is not contained in currentElementSet
143-
if (elementFound == false) {
262+
if (elementFound == false) {
144263
tmpSize[static_cast<size_t>(currElm) * static_cast<size_t>(threads) +
145264
static_cast<size_t>(thread_idx)] += 1;
146265
}
@@ -189,7 +308,6 @@ void AlignmentSymmetry::addMissingLinks(unsigned int **elementLookupTable,
189308
}
190309
const unsigned int oldCurrElementSize = LEN(offsetTableWithOutNewLinks, currElm);
191310
const unsigned int newCurrElementSize = LEN(offsetTableWithNewLinks, currElm);
192-
193311
bool found = false;
194312
// check if setId is already in set of currElm
195313
for(size_t pos = 0; pos < oldCurrElementSize && found == false; pos++){
@@ -202,7 +320,6 @@ void AlignmentSymmetry::addMissingLinks(unsigned int **elementLookupTable,
202320
while( pos < newCurrElementSize && elementLookupTable[currElm][pos] != UINT_MAX ){
203321
pos++;
204322
}
205-
206323
if(pos >= newCurrElementSize){
207324
Debug(Debug::ERROR) << "pos(" << pos << ") > newCurrElementSize(" << newCurrElementSize << "). This should not happen.\n";
208325
EXIT(EXIT_FAILURE);

src/clustering/AlignmentSymmetry.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
class AlignmentSymmetry {
1515
public:
1616
static void readInData(DBReader<unsigned int>*pReader, DBReader<unsigned int>*pDBReader, unsigned int **pInt,unsigned short**elementScoreTable, int scoretype, size_t *offsets);
17+
static void readInDataSet(DBReader<unsigned int>*alnDbr, DBReader<unsigned int>*seqDbr, unsigned int **elementLookupTable, unsigned short **elementScoreTable, int scoretype, size_t *offsets, size_t *sourceOffsets, unsigned int **sourceLookupTable, unsigned int *keyToSet, bool isfirst);
1718
template<typename T>
1819
static void computeOffsetFromCounts(T* elementSizes, size_t dbSize) {
1920
size_t prevElementLength = elementSizes[0];

src/clustering/Clustering.cpp

Lines changed: 138 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,30 @@
11
#include "Clustering.h"
22
#include "ClusteringAlgorithms.h"
3+
#include "AlignmentSymmetry.h"
34
#include "Debug.h"
45
#include "Util.h"
56
#include "itoa.h"
67
#include "Timer.h"
78
#include "SequenceWeights.h"
9+
#include <fstream>
810

911
Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
1012
const std::string &alnDB, const std::string &alnDBIndex,
1113
const std::string &outDB, const std::string &outDBIndex,
1214
const std::string &sequenceWeightFile,
13-
unsigned int maxIteration, int similarityScoreType, int threads, int compressed) : maxIteration(maxIteration),
15+
unsigned int maxIteration, int similarityScoreType, int threads, int compressed, bool needSET) : needSET(needSET),
16+
maxIteration(maxIteration),
1417
similarityScoreType(similarityScoreType),
1518
threads(threads),
1619
compressed(compressed),
1720
outDB(outDB),
1821
outDBIndex(outDBIndex) {
1922

2023
seqDbr = new DBReader<unsigned int>(seqDB.c_str(), seqDBIndex.c_str(), threads, DBReader<unsigned int>::USE_INDEX);
21-
24+
alnDbr = new DBReader<unsigned int>(alnDB.c_str(), alnDBIndex.c_str(), threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
25+
alnDbr->open(DBReader<unsigned int>::NOSORT);
2226
if (!sequenceWeightFile.empty()) {
23-
2427
seqDbr->open(DBReader<unsigned int>::SORT_BY_ID);
25-
2628
SequenceWeights *sequenceWeights = new SequenceWeights(sequenceWeightFile.c_str());
2729
float *localid2weight = new float[seqDbr->getSize()];
2830
for (size_t id = 0; id < seqDbr->getSize(); id++) {
@@ -33,29 +35,153 @@ Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
3335
delete[] localid2weight;
3436
delete sequenceWeights;
3537

36-
} else
37-
seqDbr->open(DBReader<unsigned int>::SORT_BY_LENGTH);
38+
} else {
39+
if (needSET == false) {
40+
seqDbr->open(DBReader<unsigned int>::SORT_BY_LENGTH);
41+
} else {
42+
DBReader<unsigned int> *originalseqDbr = new DBReader<unsigned int>(seqDB.c_str(), seqDBIndex.c_str(), threads, DBReader<unsigned int>::USE_INDEX);
43+
originalseqDbr->open(DBReader<unsigned int>::NOSORT);
44+
DBReader<unsigned int>::Index * seqIndex = originalseqDbr->getIndex();
45+
46+
unsigned int lastKey = originalseqDbr->getLastKey();
47+
keyToSet = new unsigned int[lastKey+1];
48+
std::vector<bool> keysInSeq(lastKey+1, false);
49+
std::map<unsigned int, unsigned int> setToLength;
50+
51+
std::ifstream mappingStream(seqDB + ".lookup");
52+
std::string line;
53+
unsigned int setkey = 0;
54+
while (std::getline(mappingStream, line)) {
55+
std::vector<std::string> split = Util::split(line, "\t");
56+
unsigned int key = strtoul(split[0].c_str(), NULL, 10);
57+
setkey = strtoul(split[2].c_str(), NULL, 10);
58+
keyToSet[key] = setkey;
59+
}
60+
for (size_t id = 0; id < originalseqDbr->getSize(); id++) {
61+
setToLength[keyToSet[seqIndex[id].id]] += seqIndex[id].length;
62+
keysInSeq[seqIndex[id].id] = 1;
63+
}
64+
unsigned int sourceLen = setkey + 1;
65+
seqnum = setToLength.size();
66+
sourceList = new(std::nothrow) unsigned int[lastKey];
67+
sourceOffsets = new(std::nothrow) size_t[sourceLen + 1];
68+
sourceLookupTable = new(std::nothrow) unsigned int *[sourceLen];
69+
70+
mappingStream.close();
71+
mappingStream.open(seqDB + ".lookup");
72+
line = "";
73+
unsigned int prevsetkey = UINT_MAX;
74+
size_t n = 0;
75+
size_t lookupOrder = 0;
76+
setkey = UINT_MAX;
77+
while (std::getline(mappingStream, line)) {
78+
std::vector<std::string> split = Util::split(line, "\t");
79+
unsigned int key = strtoul(split[0].c_str(), NULL, 10);
80+
setkey = strtoul(split[2].c_str(), NULL, 10);
81+
if(setkey != prevsetkey) {
82+
if (prevsetkey != UINT_MAX){
83+
sourceOffsets[prevsetkey] = n;
84+
for (size_t k = prevsetkey+1; k<setkey; k++) {
85+
sourceOffsets[k] = 0;
86+
}
87+
}
88+
prevsetkey = setkey;
89+
if(keysInSeq[key] == 1) {
90+
sourceKeyVec.emplace_back(setkey);
91+
}
92+
n = 0;
93+
}
94+
if(keysInSeq[key] == 1) {
95+
sourceList[lookupOrder] = key;
96+
} else {
97+
sourceList[lookupOrder] = UINT_MAX;
98+
}
99+
n++;
100+
lookupOrder++;
101+
}
102+
sourceOffsets[prevsetkey] = n;
103+
AlignmentSymmetry::computeOffsetFromCounts(sourceOffsets, sourceLen);
104+
AlignmentSymmetry::setupPointers<unsigned int>(sourceList, sourceLookupTable, sourceOffsets, sourceLen, lastKey);
105+
106+
char* data = (char*)malloc(
107+
sizeof(size_t) +
108+
sizeof(size_t) +
109+
sizeof(unsigned int) +
110+
sizeof(int) +
111+
sizeof(unsigned int) +
112+
sizeof(DBReader<unsigned int>::Index) * seqnum
113+
);
114+
115+
std::vector<DBReader<unsigned int>::Index*> indexStorage(seqnum);
116+
117+
n = 0;
118+
for (const auto& pairs : setToLength) {
119+
indexStorage[n] = new DBReader<unsigned int>::Index;
120+
indexStorage[n]->id = pairs.first;
121+
indexStorage[n]->length = pairs.second;
122+
indexStorage[n]->offset = 0;
123+
n++;
124+
}
125+
126+
char* p = data;
127+
*((size_t*)p) = seqnum;
128+
p += sizeof(size_t);
129+
*((size_t*)p) = 0;
130+
p += sizeof(size_t);
131+
*((unsigned int*)p) = indexStorage[seqnum-1]->id;
132+
p += sizeof(unsigned int);
133+
*((int*)p) = originalseqDbr->getDbtype();
134+
p += sizeof(int);
135+
*((unsigned int*)p) = indexStorage[0]->length;
136+
p += sizeof(unsigned int);
137+
for (size_t i = 0; i < seqnum; ++i) {
138+
memcpy(
139+
p + i * sizeof(DBReader<unsigned int>::Index),
140+
indexStorage[i],
141+
sizeof(DBReader<unsigned int>::Index)
142+
);
143+
}
144+
p += sizeof(DBReader<unsigned int>::Index) * seqnum;
145+
seqDbr = DBReader<unsigned int>::unserialize(data, threads);
146+
seqDbr->open(DBReader<unsigned int>::SORT_BY_LENGTH);
147+
for (auto* ptr : indexStorage) {
148+
delete ptr;
149+
}
150+
}
151+
}
38152

39-
alnDbr = new DBReader<unsigned int>(alnDB.c_str(), alnDBIndex.c_str(), threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
40-
alnDbr->open(DBReader<unsigned int>::NOSORT);
41153

42154
}
43155

44156
Clustering::~Clustering() {
45157
delete seqDbr;
46158
delete alnDbr;
159+
if(needSET){
160+
delete keyToSet;
161+
delete sourceOffsets;
162+
delete sourceList;
163+
delete[] sourceLookupTable;
164+
}
47165
}
48166

49167

50168
void Clustering::run(int mode) {
51169
Timer timer;
52-
DBWriter *dbw = new DBWriter(outDB.c_str(), outDBIndex.c_str(), 1, compressed, Parameters::DBTYPE_CLUSTER_RES);
170+
171+
unsigned int dbType = Parameters::DBTYPE_CLUSTER_RES;
172+
unsigned int dbTypeSet = DBReader<unsigned int>::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_SET);
173+
DBWriter *dbw;
174+
if(needSET) {
175+
dbw = new DBWriter(outDB.c_str(), outDBIndex.c_str(), 1, compressed, dbTypeSet);
176+
} else {
177+
dbw = new DBWriter(outDB.c_str(), outDBIndex.c_str(), 1, compressed, dbType);
178+
}
53179
dbw->open();
54180

55181
std::pair<unsigned int, unsigned int> * ret;
56182
ClusteringAlgorithms *algorithm = new ClusteringAlgorithms(seqDbr, alnDbr,
57183
threads, similarityScoreType,
58-
maxIteration);
184+
maxIteration, keyToSet, sourceOffsets, sourceLookupTable, sourceList, seqnum, needSET);
59185

60186
if (mode == Parameters::GREEDY) {
61187
Debug(Debug::INFO) << "Clustering mode: Greedy\n";
@@ -79,7 +205,7 @@ void Clustering::run(int mode) {
79205
size_t dbSize = alnDbr->getSize();
80206
size_t seqDbSize = seqDbr->getSize();
81207
size_t cluNum = (dbSize > 0) ? 1 : 0;
82-
for(size_t i = 1; i < dbSize; i++){
208+
for(size_t i = 1; i < seqDbSize; i++){
83209
cluNum += (ret[i].first != ret[i-1].first);
84210
}
85211
Debug(Debug::INFO) << "Total time: " << timer.lap() << "\n";
@@ -88,11 +214,10 @@ void Clustering::run(int mode) {
88214
Debug(Debug::INFO) << "Number of clusters: " << cluNum << "\n\n";
89215

90216
Debug(Debug::INFO) << "Writing results ";
91-
writeData(dbw, ret, dbSize);
217+
writeData(dbw, ret, seqDbSize);
92218
Debug(Debug::INFO) << timerWrite.lap() << "\n";
93219
delete [] ret;
94220
delete algorithm;
95-
96221
dbw->close(false, false);
97222
seqDbr->close();
98223
alnDbr->close();

0 commit comments

Comments
 (0)