1
1
#include " Clustering.h"
2
2
#include " ClusteringAlgorithms.h"
3
+ #include " AlignmentSymmetry.h"
3
4
#include " Debug.h"
4
5
#include " Util.h"
5
6
#include " itoa.h"
6
7
#include " Timer.h"
7
8
#include " SequenceWeights.h"
9
+ #include < fstream>
8
10
9
11
Clustering::Clustering (const std::string &seqDB, const std::string &seqDBIndex,
10
12
const std::string &alnDB, const std::string &alnDBIndex,
11
13
const std::string &outDB, const std::string &outDBIndex,
12
14
const std::string &sequenceWeightFile,
13
- unsigned int maxIteration, int similarityScoreType, int threads, int compressed) : maxIteration(maxIteration),
15
+ unsigned int maxIteration, int similarityScoreType, int threads, int compressed, bool needSET) : needSET(needSET),
16
+ maxIteration(maxIteration),
14
17
similarityScoreType(similarityScoreType),
15
18
threads(threads),
16
19
compressed(compressed),
17
20
outDB(outDB),
18
21
outDBIndex(outDBIndex) {
19
22
20
23
seqDbr = new DBReader<unsigned int >(seqDB.c_str (), seqDBIndex.c_str (), threads, DBReader<unsigned int >::USE_INDEX);
21
-
24
+ alnDbr = new DBReader<unsigned int >(alnDB.c_str (), alnDBIndex.c_str (), threads, DBReader<unsigned int >::USE_DATA|DBReader<unsigned int >::USE_INDEX);
25
+ alnDbr->open (DBReader<unsigned int >::NOSORT);
22
26
if (!sequenceWeightFile.empty ()) {
23
-
24
27
seqDbr->open (DBReader<unsigned int >::SORT_BY_ID);
25
-
26
28
SequenceWeights *sequenceWeights = new SequenceWeights (sequenceWeightFile.c_str ());
27
29
float *localid2weight = new float [seqDbr->getSize ()];
28
30
for (size_t id = 0 ; id < seqDbr->getSize (); id++) {
@@ -33,29 +35,153 @@ Clustering::Clustering(const std::string &seqDB, const std::string &seqDBIndex,
33
35
delete[] localid2weight;
34
36
delete sequenceWeights;
35
37
36
- } else
37
- seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
38
+ } else {
39
+ if (needSET == false ) {
40
+ seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
41
+ } else {
42
+ DBReader<unsigned int > *originalseqDbr = new DBReader<unsigned int >(seqDB.c_str (), seqDBIndex.c_str (), threads, DBReader<unsigned int >::USE_INDEX);
43
+ originalseqDbr->open (DBReader<unsigned int >::NOSORT);
44
+ DBReader<unsigned int >::Index * seqIndex = originalseqDbr->getIndex ();
45
+
46
+ unsigned int lastKey = originalseqDbr->getLastKey ();
47
+ keyToSet = new unsigned int [lastKey+1 ];
48
+ std::vector<bool > keysInSeq (lastKey+1 , false );
49
+ std::map<unsigned int , unsigned int > setToLength;
50
+
51
+ std::ifstream mappingStream (seqDB + " .lookup" );
52
+ std::string line;
53
+ unsigned int setkey = 0 ;
54
+ while (std::getline (mappingStream, line)) {
55
+ std::vector<std::string> split = Util::split (line, " \t " );
56
+ unsigned int key = strtoul (split[0 ].c_str (), NULL , 10 );
57
+ setkey = strtoul (split[2 ].c_str (), NULL , 10 );
58
+ keyToSet[key] = setkey;
59
+ }
60
+ for (size_t id = 0 ; id < originalseqDbr->getSize (); id++) {
61
+ setToLength[keyToSet[seqIndex[id].id ]] += seqIndex[id].length ;
62
+ keysInSeq[seqIndex[id].id ] = 1 ;
63
+ }
64
+ unsigned int sourceLen = setkey + 1 ;
65
+ seqnum = setToLength.size ();
66
+ sourceList = new (std::nothrow) unsigned int [lastKey];
67
+ sourceOffsets = new (std::nothrow) size_t [sourceLen + 1 ];
68
+ sourceLookupTable = new (std::nothrow) unsigned int *[sourceLen];
69
+
70
+ mappingStream.close ();
71
+ mappingStream.open (seqDB + " .lookup" );
72
+ line = " " ;
73
+ unsigned int prevsetkey = UINT_MAX;
74
+ size_t n = 0 ;
75
+ size_t lookupOrder = 0 ;
76
+ setkey = UINT_MAX;
77
+ while (std::getline (mappingStream, line)) {
78
+ std::vector<std::string> split = Util::split (line, " \t " );
79
+ unsigned int key = strtoul (split[0 ].c_str (), NULL , 10 );
80
+ setkey = strtoul (split[2 ].c_str (), NULL , 10 );
81
+ if (setkey != prevsetkey) {
82
+ if (prevsetkey != UINT_MAX){
83
+ sourceOffsets[prevsetkey] = n;
84
+ for (size_t k = prevsetkey+1 ; k<setkey; k++) {
85
+ sourceOffsets[k] = 0 ;
86
+ }
87
+ }
88
+ prevsetkey = setkey;
89
+ if (keysInSeq[key] == 1 ) {
90
+ sourceKeyVec.emplace_back (setkey);
91
+ }
92
+ n = 0 ;
93
+ }
94
+ if (keysInSeq[key] == 1 ) {
95
+ sourceList[lookupOrder] = key;
96
+ } else {
97
+ sourceList[lookupOrder] = UINT_MAX;
98
+ }
99
+ n++;
100
+ lookupOrder++;
101
+ }
102
+ sourceOffsets[prevsetkey] = n;
103
+ AlignmentSymmetry::computeOffsetFromCounts (sourceOffsets, sourceLen);
104
+ AlignmentSymmetry::setupPointers<unsigned int >(sourceList, sourceLookupTable, sourceOffsets, sourceLen, lastKey);
105
+
106
+ char * data = (char *)malloc (
107
+ sizeof (size_t ) +
108
+ sizeof (size_t ) +
109
+ sizeof (unsigned int ) +
110
+ sizeof (int ) +
111
+ sizeof (unsigned int ) +
112
+ sizeof (DBReader<unsigned int >::Index) * seqnum
113
+ );
114
+
115
+ std::vector<DBReader<unsigned int >::Index*> indexStorage (seqnum);
116
+
117
+ n = 0 ;
118
+ for (const auto & pairs : setToLength) {
119
+ indexStorage[n] = new DBReader<unsigned int >::Index;
120
+ indexStorage[n]->id = pairs.first ;
121
+ indexStorage[n]->length = pairs.second ;
122
+ indexStorage[n]->offset = 0 ;
123
+ n++;
124
+ }
125
+
126
+ char * p = data;
127
+ *((size_t *)p) = seqnum;
128
+ p += sizeof (size_t );
129
+ *((size_t *)p) = 0 ;
130
+ p += sizeof (size_t );
131
+ *((unsigned int *)p) = indexStorage[seqnum-1 ]->id ;
132
+ p += sizeof (unsigned int );
133
+ *((int *)p) = originalseqDbr->getDbtype ();
134
+ p += sizeof (int );
135
+ *((unsigned int *)p) = indexStorage[0 ]->length ;
136
+ p += sizeof (unsigned int );
137
+ for (size_t i = 0 ; i < seqnum; ++i) {
138
+ memcpy (
139
+ p + i * sizeof (DBReader<unsigned int >::Index),
140
+ indexStorage[i],
141
+ sizeof (DBReader<unsigned int >::Index)
142
+ );
143
+ }
144
+ p += sizeof (DBReader<unsigned int >::Index) * seqnum;
145
+ seqDbr = DBReader<unsigned int >::unserialize (data, threads);
146
+ seqDbr->open (DBReader<unsigned int >::SORT_BY_LENGTH);
147
+ for (auto * ptr : indexStorage) {
148
+ delete ptr;
149
+ }
150
+ }
151
+ }
38
152
39
- alnDbr = new DBReader<unsigned int >(alnDB.c_str (), alnDBIndex.c_str (), threads, DBReader<unsigned int >::USE_DATA|DBReader<unsigned int >::USE_INDEX);
40
- alnDbr->open (DBReader<unsigned int >::NOSORT);
41
153
42
154
}
43
155
44
156
/**
 * Destroys the clustering driver: deletes the sequence and alignment
 * readers and, when set mode (needSET) is enabled, frees the set lookup
 * arrays built by the constructor.
 */
Clustering::~Clustering() {
    delete seqDbr;
    delete alnDbr;
    if (needSET) {
        // keyToSet, sourceOffsets, sourceList and sourceLookupTable are all
        // allocated with array new (new T[n]) in the constructor, so each one
        // must be released with delete[]; scalar delete on them is undefined
        // behaviour.
        // NOTE(review): the constructor only allocates these in the branch
        // taken when no sequence weight file is given - confirm the members
        // are null-initialized for the needSET + weight-file combination.
        delete[] keyToSet;
        delete[] sourceOffsets;
        delete[] sourceList;
        delete[] sourceLookupTable;
    }
}
48
166
49
167
50
168
void Clustering::run (int mode) {
51
169
Timer timer;
52
- DBWriter *dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, Parameters::DBTYPE_CLUSTER_RES);
170
+
171
+ unsigned int dbType = Parameters::DBTYPE_CLUSTER_RES;
172
+ unsigned int dbTypeSet = DBReader<unsigned int >::setExtendedDbtype (dbType, Parameters::DBTYPE_EXTENDED_SET);
173
+ DBWriter *dbw;
174
+ if (needSET) {
175
+ dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, dbTypeSet);
176
+ } else {
177
+ dbw = new DBWriter (outDB.c_str (), outDBIndex.c_str (), 1 , compressed, dbType);
178
+ }
53
179
dbw->open ();
54
180
55
181
std::pair<unsigned int , unsigned int > * ret;
56
182
ClusteringAlgorithms *algorithm = new ClusteringAlgorithms (seqDbr, alnDbr,
57
183
threads, similarityScoreType,
58
- maxIteration);
184
+ maxIteration, keyToSet, sourceOffsets, sourceLookupTable, sourceList, seqnum, needSET );
59
185
60
186
if (mode == Parameters::GREEDY) {
61
187
Debug (Debug::INFO) << " Clustering mode: Greedy\n " ;
@@ -79,7 +205,7 @@ void Clustering::run(int mode) {
79
205
size_t dbSize = alnDbr->getSize ();
80
206
size_t seqDbSize = seqDbr->getSize ();
81
207
size_t cluNum = (dbSize > 0 ) ? 1 : 0 ;
82
- for (size_t i = 1 ; i < dbSize ; i++){
208
+ for (size_t i = 1 ; i < seqDbSize ; i++){
83
209
cluNum += (ret[i].first != ret[i-1 ].first );
84
210
}
85
211
Debug (Debug::INFO) << " Total time: " << timer.lap () << " \n " ;
@@ -88,11 +214,10 @@ void Clustering::run(int mode) {
88
214
Debug (Debug::INFO) << " Number of clusters: " << cluNum << " \n\n " ;
89
215
90
216
Debug (Debug::INFO) << " Writing results " ;
91
- writeData (dbw, ret, dbSize );
217
+ writeData (dbw, ret, seqDbSize );
92
218
Debug (Debug::INFO) << timerWrite.lap () << " \n " ;
93
219
delete [] ret;
94
220
delete algorithm;
95
-
96
221
dbw->close (false , false );
97
222
seqDbr->close ();
98
223
alnDbr->close ();
0 commit comments