Skip to content

Commit 88d14e5

Browse files
authored
Merge pull request #1027 from sooyoung-cha/mmseqsSET2
The order of entries in the .lookup file no longer matters in clust
2 parents 59c74c3 + 70c50ca commit 88d14e5

File tree

2 files changed: 26 additions (+26) and 29 deletions (-29)

2 files changed

+26
-29
lines changed

src/clustering/Clustering.cpp

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -51,58 +51,56 @@ Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
5151
std::ifstream mappingStream(seqDB + ".lookup");
5252
std::string line;
5353
unsigned int setkey = 0;
54+
unsigned int maxsetkey = 0;
5455
while (std::getline(mappingStream, line)) {
5556
std::vector<std::string> split = Util::split(line, "\t");
5657
unsigned int key = strtoul(split[0].c_str(), NULL, 10);
5758
setkey = strtoul(split[2].c_str(), NULL, 10);
5859
keyToSet[key] = setkey;
60+
if (maxsetkey < setkey) {
61+
maxsetkey = setkey;
62+
}
5963
}
6064
for (size_t id = 0; id < originalseqDbr->getSize(); id++) {
6165
setToLength[keyToSet[seqIndex[id].id]] += seqIndex[id].length;
6266
keysInSeq[seqIndex[id].id] = 1;
6367
}
64-
unsigned int sourceLen = setkey + 1;
68+
unsigned int sourceLen = maxsetkey + 1;
6569
seqnum = setToLength.size();
6670
sourceList = new(std::nothrow) unsigned int[lastKey];
67-
sourceOffsets = new(std::nothrow) size_t[sourceLen + 1];
71+
sourceOffsets = new(std::nothrow) size_t[sourceLen + 1]();
6872
sourceLookupTable = new(std::nothrow) unsigned int *[sourceLen];
73+
size_t * sourceOffsetsDecrease = new(std::nothrow) size_t[sourceLen + 1]();
6974

7075
mappingStream.close();
7176
mappingStream.open(seqDB + ".lookup");
77+
78+
line = "";
79+
while (std::getline(mappingStream, line)) {
80+
std::vector<std::string> split = Util::split(line, "\t");
81+
setkey = strtoul(split[2].c_str(), NULL, 10);
82+
sourceOffsets[setkey]++;
83+
sourceOffsetsDecrease[setkey]++;
84+
}
85+
AlignmentSymmetry::computeOffsetFromCounts(sourceOffsets, sourceLen);
86+
AlignmentSymmetry::setupPointers<unsigned int>(sourceList, sourceLookupTable, sourceOffsets, sourceLen, lastKey);
87+
88+
mappingStream.close();
89+
mappingStream.open(seqDB + ".lookup");
90+
7291
line = "";
73-
unsigned int prevsetkey = UINT_MAX;
74-
size_t n = 0;
75-
size_t lookupOrder = 0;
76-
setkey = UINT_MAX;
7792
while (std::getline(mappingStream, line)) {
7893
std::vector<std::string> split = Util::split(line, "\t");
7994
unsigned int key = strtoul(split[0].c_str(), NULL, 10);
8095
setkey = strtoul(split[2].c_str(), NULL, 10);
81-
if(setkey != prevsetkey) {
82-
if (prevsetkey != UINT_MAX){
83-
sourceOffsets[prevsetkey] = n;
84-
for (size_t k = prevsetkey+1; k<setkey; k++) {
85-
sourceOffsets[k] = 0;
86-
}
87-
}
88-
prevsetkey = setkey;
89-
if(keysInSeq[key] == 1) {
90-
sourceKeyVec.emplace_back(setkey);
91-
}
92-
n = 0;
93-
}
96+
size_t order = sourceOffsets[setkey + 1] - sourceOffsetsDecrease[setkey];
9497
if(keysInSeq[key] == 1) {
95-
sourceList[lookupOrder] = key;
98+
sourceList[order] = key;
9699
} else {
97-
sourceList[lookupOrder] = UINT_MAX;
100+
sourceList[order] = UINT_MAX;
98101
}
99-
n++;
100-
lookupOrder++;
102+
sourceOffsetsDecrease[setkey]--;
101103
}
102-
sourceOffsets[prevsetkey] = n;
103-
AlignmentSymmetry::computeOffsetFromCounts(sourceOffsets, sourceLen);
104-
AlignmentSymmetry::setupPointers<unsigned int>(sourceList, sourceLookupTable, sourceOffsets, sourceLen, lastKey);
105-
106104
char* data = (char*)malloc(
107105
sizeof(size_t) +
108106
sizeof(size_t) +
@@ -114,7 +112,7 @@ Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
114112

115113
std::vector<DBReader<unsigned int>::Index*> indexStorage(seqnum);
116114

117-
n = 0;
115+
size_t n = 0;
118116
for (const auto& pairs : setToLength) {
119117
indexStorage[n] = new DBReader<unsigned int>::Index;
120118
indexStorage[n]->id = pairs.first;

src/clustering/Clustering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ class Clustering {
4141
int compressed;
4242
std::string outDB;
4343
std::string outDBIndex;
44-
std::vector<unsigned int> sourceKeyVec;
4544
};
4645

4746
#endif

0 commit comments

Comments
 (0)