1
- // Computes either a PSSM or a MSA from clustering or alignment result
2
- // For PSSMs: MMseqs just stores the position specific score in 1 byte
3
-
4
1
#include < cstdlib>
5
2
#include < fstream>
6
3
#include < sstream>
7
4
#include < algorithm>
8
5
#include < utility>
9
-
6
+ # include < climits >
10
7
#include " Parameters.h"
11
8
12
9
#include " DBReader.h"
13
10
#include " DBWriter.h"
14
11
#include " Util.h"
12
+ #include " FastSort.h"
15
13
16
14
#ifdef OPENMP
17
15
#include < omp.h>
18
16
#endif
19
17
18
+
19
// Comparator for (string, number) pairs that looks only at the numeric
// member: returns true when lhs.second is strictly smaller than rhs.second.
// The string member takes no part in the ordering, so pairs with equal
// numbers compare as equivalent.
struct compareSecondEntry {
    bool operator()(const std::pair<std::string, unsigned int> &lhs,
                    const std::pair<std::string, unsigned int> &rhs) const {
        return lhs.second < rhs.second;
    }
};
25
+
20
26
// Comparator for (string, number) pairs: orders by the string member first
// and falls back to the numeric member when the strings are equal.
//
// The result is a strict ordering: an element never compares less than
// itself. (A `compare(...) <= 0` test would report equal elements as
// "less", which sort algorithms do not allow in a comparator.)
// Because ties are broken by the number, pairs sharing the same string end
// up next to each other in ascending numeric order after sorting.
struct compareFirstEntry {
    bool operator()(const std::pair<std::string, unsigned int> &lhs,
                    const std::pair<std::string, unsigned int> &rhs) const {
        // One three-way string comparison instead of a `<` followed by `==`.
        const int order = lhs.first.compare(rhs.first);
        if (order != 0) {
            return order < 0;
        }
        return lhs.second < rhs.second;
    }
};
26
33
@@ -101,17 +108,45 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
101
108
}
102
109
103
110
// sort by header for binary search
104
- std::stable_sort (keysNew, keysNew + indexSizeNew, compareFirstEntry ());
105
-
111
+ SORT_PARALLEL (keysNew, keysNew + indexSizeNew, compareFirstEntry ());
112
+ // remove duplicates in new DB by setting the dbkey to UINT_MAX
113
+ for (size_t i = 0 ; i + 1 < indexSizeNew; ++i) {
114
+ if (keysNew[i].first == keysNew[i+1 ].first ) {
115
+ keysNew[i+1 ].second = UINT_MAX;
116
+ }
117
+ }
106
118
// default initialized with false
107
119
bool * checkedNew = new bool [indexSizeNew]();
108
120
// doesn't need to be initialized
109
121
size_t *mappedIds = new size_t [indexSizeNew];
110
-
111
122
bool * deletedIds = new bool [indexSizeOld]();
112
123
124
+ // copy the original dbKey from keysOld to originalOldKeys
125
+ unsigned int * originalOldKeys = new unsigned int [indexSizeOld]();
126
+ for (size_t i = 0 ; i < indexSizeOld; ++i) {
127
+ originalOldKeys[i] = keysOld[i].second ;
128
+ keysOld[i].second = i;
129
+ }
130
+
131
+ // sorting should be the same as with original dbKeys since they are monotonically increasing
132
+ SORT_PARALLEL (keysOld, keysOld + indexSizeOld, compareFirstEntry ());
133
+ for (size_t i = 0 ; i + 1 < indexSizeOld; ++i) {
134
+ if (keysOld[i].first == keysOld[i+1 ].first ) {
135
+ deletedIds[keysOld[i+1 ].second ] = true ;
136
+ }
137
+ }
138
+ for (size_t i = 0 ; i < indexSizeOld; ++i) {
139
+ keysOld[i].second = originalOldKeys[keysOld[i].second ];
140
+ }
141
+ delete [] originalOldKeys;
142
+ // restore original order
143
+ SORT_PARALLEL (keysOld, keysOld + indexSizeOld, compareSecondEntry ());
144
+
113
145
#pragma omp parallel for schedule(dynamic, 10)
114
146
for (size_t id = 0 ; id < indexSizeOld; ++id) {
147
+ if (deletedIds[id]) {
148
+ continue ;
149
+ }
115
150
const std::string &keyToSearch = keysOld[id].first ;
116
151
std::pair<std::string, unsigned int > *mappedKey
117
152
= std::lower_bound (keysNew, keysNew + indexSizeNew, keyToSearch, compareKeyToFirstEntry ());
@@ -127,20 +162,25 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
127
162
}
128
163
}
129
164
130
- for (size_t i = 0 ; i < indexSizeOld; ++i) {
131
- if (deletedIds[i]) {
132
- removedSeqDBWriter << keysOld[i].second << std::endl;
133
- }
134
- }
135
- removedSeqDBWriter.close ();
136
-
137
165
for (size_t id = 0 ; id < indexSizeNew; ++id) {
166
+ if (keysNew[id].second == UINT_MAX) {
167
+ continue ;
168
+ }
138
169
if (checkedNew[id]) {
139
170
keptSeqDBWriter << keysOld[mappedIds[id]].second << " \t " << keysNew[id].second << std::endl;
140
171
} else {
141
172
newSeqDBWriter << keysNew[id].second << std::endl;
142
173
}
143
174
}
175
+
176
+ for (size_t i = 0 ; i < indexSizeOld; ++i) {
177
+ if (deletedIds[i]) {
178
+ removedSeqDBWriter << keysOld[i].second << std::endl;
179
+ }
180
+ }
181
+ removedSeqDBWriter.close ();
182
+
183
+
144
184
newSeqDBWriter.close ();
145
185
keptSeqDBWriter.close ();
146
186
@@ -149,7 +189,6 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
149
189
delete[] checkedNew;
150
190
delete[] keysNew;
151
191
delete[] keysOld;
152
-
153
192
newReader.close ();
154
193
oldReader.close ();
155
194
0 commit comments