Skip to content

Commit 760a9e1

Browse files
Fix #1011
1 parent c8fbdf9 commit 760a9e1

File tree

1 file changed

+55
-16
lines changed

1 file changed

+55
-16
lines changed

src/util/diffseqdbs.cpp

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,33 @@
1-
// Computes either a PSSM or a MSA from clustering or alignment result
2-
// For PSSMs: MMseqs just stores the position specific score in 1 byte
3-
41
#include <cstdlib>
52
#include <fstream>
63
#include <sstream>
74
#include <algorithm>
85
#include <utility>
9-
6+
#include <climits>
107
#include "Parameters.h"
118

129
#include "DBReader.h"
1310
#include "DBWriter.h"
1411
#include "Util.h"
12+
#include "FastSort.h"
1513

1614
#ifdef OPENMP
1715
#include <omp.h>
1816
#endif
1917

18+
19+
struct compareSecondEntry {
20+
bool
21+
operator()(const std::pair<std::string, unsigned int> &lhs, const std::pair<std::string, unsigned int> &rhs) const {
22+
return (lhs.second < rhs.second);
23+
}
24+
};
25+
2026
struct compareFirstEntry {
2127
bool
2228
operator()(const std::pair<std::string, unsigned int> &lhs, const std::pair<std::string, unsigned int> &rhs) const {
23-
return (lhs.first.compare(rhs.first) <= 0);
29+
return (lhs.first < rhs.first) ||
30+
(lhs.first == rhs.first && lhs.second < rhs.second);
2431
}
2532
};
2633

@@ -101,17 +108,45 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
101108
}
102109

103110
//sort by header for binary search
104-
std::stable_sort(keysNew, keysNew + indexSizeNew, compareFirstEntry());
105-
111+
SORT_PARALLEL(keysNew, keysNew + indexSizeNew, compareFirstEntry());
112+
// remove duplicates in new DB by setting the dbkey to UINT_MAX
113+
for(size_t i = 0; i + 1 < indexSizeNew; ++i) {
114+
if(keysNew[i].first == keysNew[i+1].first) {
115+
keysNew[i+1].second = UINT_MAX;
116+
}
117+
}
106118
// default initialized with false
107119
bool* checkedNew = new bool[indexSizeNew]();
108120
// doesn't need to be initialized
109121
size_t *mappedIds = new size_t[indexSizeNew];
110-
111122
bool* deletedIds = new bool[indexSizeOld]();
112123

124+
// copy the orignal dbKey from keysOld to originalOldKeys
125+
unsigned int* originalOldKeys = new unsigned int[indexSizeOld]();
126+
for (size_t i = 0; i < indexSizeOld; ++i) {
127+
originalOldKeys[i] = keysOld[i].second;
128+
keysOld[i].second = i;
129+
}
130+
131+
// sorting should be the same as with orignal dbKeys since they are monotonically increasing
132+
SORT_PARALLEL(keysOld, keysOld + indexSizeOld, compareFirstEntry());
133+
for (size_t i = 0; i + 1 < indexSizeOld; ++i) {
134+
if(keysOld[i].first == keysOld[i+1].first) {
135+
deletedIds[keysOld[i+1].second] = true;
136+
}
137+
}
138+
for (size_t i = 0; i < indexSizeOld; ++i) {
139+
keysOld[i].second = originalOldKeys[keysOld[i].second];
140+
}
141+
delete [] originalOldKeys;
142+
// restore original order
143+
SORT_PARALLEL(keysOld, keysOld + indexSizeOld, compareSecondEntry());
144+
113145
#pragma omp parallel for schedule(dynamic, 10)
114146
for (size_t id = 0; id < indexSizeOld; ++id) {
147+
if (deletedIds[id]) {
148+
continue;
149+
}
115150
const std::string &keyToSearch = keysOld[id].first;
116151
std::pair<std::string, unsigned int> *mappedKey
117152
= std::lower_bound(keysNew, keysNew + indexSizeNew, keyToSearch, compareKeyToFirstEntry());
@@ -127,20 +162,25 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
127162
}
128163
}
129164

130-
for (size_t i = 0; i < indexSizeOld; ++i) {
131-
if(deletedIds[i]) {
132-
removedSeqDBWriter << keysOld[i].second << std::endl;
133-
}
134-
}
135-
removedSeqDBWriter.close();
136-
137165
for (size_t id = 0; id < indexSizeNew; ++id) {
166+
if (keysNew[id].second == UINT_MAX) {
167+
continue;
168+
}
138169
if (checkedNew[id]) {
139170
keptSeqDBWriter << keysOld[mappedIds[id]].second << "\t" << keysNew[id].second << std::endl;
140171
} else {
141172
newSeqDBWriter << keysNew[id].second << std::endl;
142173
}
143174
}
175+
176+
for (size_t i = 0; i < indexSizeOld; ++i) {
177+
if(deletedIds[i]) {
178+
removedSeqDBWriter << keysOld[i].second << std::endl;
179+
}
180+
}
181+
removedSeqDBWriter.close();
182+
183+
144184
newSeqDBWriter.close();
145185
keptSeqDBWriter.close();
146186

@@ -149,7 +189,6 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
149189
delete[] checkedNew;
150190
delete[] keysNew;
151191
delete[] keysOld;
152-
153192
newReader.close();
154193
oldReader.close();
155194

0 commit comments

Comments
 (0)