Skip to content

Commit 176c2c9

Browse files
committed
first attempt at using hashing
1 parent 678b393 commit 176c2c9

File tree

3 files changed

+95
-74
lines changed

3 files changed

+95
-74
lines changed

include/bucket_buffer.h

Lines changed: 70 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -31,51 +31,77 @@ struct BufferEntry {
3131
}
3232
};
3333

34-
// class BucketBufferHashMap {
35-
// public:
36-
// // std::unordered_map<int, std::unordered_map<int, Bucket>> entries;
37-
// std::unordered_map<std::pair<int, int>, Bucket> entries;
38-
// size_t _capacity;
39-
// BucketBufferHashMap(size_t capacity): _capacity(capacity) {};
34+
class BucketBufferHashMap {
35+
public:
36+
// std::unordered_map<int, std::unordered_map<int, Bucket>> entries;
37+
std::unordered_map<uint32_t, Bucket> entries;
38+
size_t _capacity;
39+
40+
/**
 * Pack a (column, row) coordinate pair into the 32-bit key used by `entries`.
 *
 * The column occupies the upper 16 bits and the row the lower 16 bits, so
 * each coordinate must fit in 16 bits. Out-of-range values would otherwise be
 * truncated silently and collide with the key of a different bucket, so they
 * are rejected by assertion in debug builds.
 *
 * @param col_idx column index, expected in [0, 0xFFFF]
 * @param row_idx row index, expected in [0, 0xFFFF]
 * @return the packed key: (col_idx << 16) | row_idx
 */
uint32_t coords_to_key(int col_idx, int row_idx) {
    assert(col_idx >= 0 && col_idx <= 0xFFFF);
    assert(row_idx >= 0 && row_idx <= 0xFFFF);
    const uint32_t col_bits = static_cast<uint32_t>(col_idx) << 16;
    const uint32_t row_bits = static_cast<uint32_t>(static_cast<uint16_t>(row_idx));
    return col_bits | row_bits;
}
43+
44+
/**
 * Extract the row index from a packed key.
 *
 * @param key a packed 32-bit key
 * @return the low 16 bits of `key`, as an int
 */
int key_to_row(uint32_t key) {
    const uint32_t low_half = key & 0xFFFFu;
    return static_cast<int>(low_half);
}
47+
/**
 * Extract the column index from a packed key.
 *
 * The shift is performed on the unsigned key *before* converting to int.
 * Casting first (as `(int) key >> 16` does) turns any key with its top bit
 * set into a negative int, so columns >= 0x8000 would decode as negative
 * values.
 *
 * @param key a packed 32-bit key
 * @return the high 16 bits of `key`, as a non-negative int in [0, 0xFFFF]
 */
int key_to_col(uint32_t key) {
    return static_cast<int>(key >> 16);
}
50+
51+
/**
 * Construct an empty buffer with the given nominal capacity.
 *
 * The hash map is pre-sized for `capacity` elements up front.
 *
 * @param capacity value stored in `_capacity` and passed to `reserve`
 */
BucketBufferHashMap(size_t capacity): entries(), _capacity(capacity) {
    entries.reserve(capacity);
}
55+
56+
/**
 * Construct an empty buffer with the default nominal capacity of 128,
 * by delegating to the capacity-taking constructor.
 */
BucketBufferHashMap(): BucketBufferHashMap(128) {}
60+
61+
62+
bool over_capacity() const {
63+
return entries.size() >= _capacity / 2;
64+
}
65+
66+
size_t size() const {
67+
return entries.size();
68+
}
4069

41-
// bool insert(int col_idx, int row_idx, Bucket value) {
42-
// if (entries.size() >= _capacity) {
43-
// return false;
44-
// }
45-
// static constexpr Bucket zero_bucket = {0, 0};
46-
// entries.emplace(std::make_pair(col_idx, row_idx), zero_bucket);
47-
// entries[{col_idx, row_idx}] ^= value;
48-
// if (Bucket_Boruvka::is_empty(entries[{col_idx, row_idx}])) {
49-
// entries.erase({col_idx, row_idx});
50-
// }
51-
// return true;
52-
// }
53-
// bool merge(const BucketBufferHashMap &other) {
54-
// for (const auto &idx : other.entries) {
55-
// static constexpr Bucket zero_bucket = {0, 0};
56-
// entries.emplace(idx.first, zero_bucket);
57-
// entries[idx.first] ^= idx.second;
58-
// if (Bucket_Boruvka::is_empty(entries[idx.first])) {
59-
// entries.erase(idx.first);
60-
// }
61-
// }
62-
// // TODO - make this less gross
63-
// unlikely_if (entries.size() >= _capacity) {
64-
// // UNDO THE MERGE
65-
// for (const auto &idx : other.entries) {
66-
// static constexpr Bucket zero_bucket = {0, 0};
67-
// entries.emplace(idx.first, zero_bucket);
68-
// entries[idx.first] ^= idx.second;
69-
// if (Bucket_Boruvka::is_empty(entries[idx.first])) {
70-
// entries.erase(idx.first);
71-
// }
72-
// }
73-
// return false;
74-
// }
75-
// else
76-
// return true;
77-
// }
78-
// };
70+
/**
 * Remove every entry from the buffer. `_capacity` is left unchanged.
 */
void clear() { entries.clear(); }
73+
74+
75+
bool insert(int col_idx, int row_idx, Bucket value) {
76+
static constexpr Bucket zero_bucket = {0, 0};
77+
entries.emplace(std::make_pair(col_idx, row_idx), zero_bucket);
78+
entries[coords_to_key(col_idx, row_idx)] ^= value;
79+
if (Bucket_Boruvka::is_empty(entries[coords_to_key(col_idx, row_idx)])) {
80+
entries.erase(coords_to_key(col_idx, row_idx));
81+
}
82+
return over_capacity();
83+
}
84+
bool merge(const BucketBufferHashMap &other) {
85+
assert(size() + other.size() <= _capacity);
86+
for (const auto &idx : other.entries) {
87+
static constexpr Bucket zero_bucket = {0, 0};
88+
entries.emplace(idx.first, zero_bucket);
89+
entries[idx.first] ^= idx.second;
90+
if (Bucket_Boruvka::is_empty(entries[idx.first])) {
91+
entries.erase(idx.first);
92+
}
93+
}
94+
}
95+
96+
/**
 * Look up the buffered bucket at (col_idx, row_idx).
 *
 * Uses a single find() and reads through the returned iterator, so the map
 * is hashed once per call and a missing key is never inserted.
 *
 * @param col_idx column of the bucket to read
 * @param row_idx row of the bucket to read
 * @return a copy of the stored bucket, or the zero bucket {0, 0} when no
 *         entry exists for that coordinate
 */
Bucket get_bucket(int col_idx, int row_idx) {
    const auto found = entries.find(coords_to_key(col_idx, row_idx));
    if (found == entries.end()) {
        return {0, 0};
    }
    return found->second;
}
104+
};
79105

80106

81107
// note that we consider these to be

include/sketch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ class Sketch {
6262
// bucket data
6363
Bucket* buckets;
6464
// bucket coo buffer
65-
BucketBuffer bucket_buffer;
65+
BucketBufferHashMap bucket_buffer;
6666

6767
// flags
6868

src/sketch.cpp

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ Sketch::Sketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) :
2020
num_columns = num_samples * cols_per_sample;
2121
bkt_per_col = calc_bkt_per_col(vector_len);
2222
num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket
23-
bucket_buffer = BucketBuffer();
23+
// bucket_buffer = BucketBuffer();
24+
bucket_buffer = BucketBufferHashMap();
2425
#ifdef EAGER_BUCKET_CHECK
2526
buckets = (Bucket*) (new char[bucket_array_bytes()]);
2627
nonempty_buckets = (vec_t*) (buckets + num_buckets);
@@ -53,7 +54,7 @@ Sketch::Sketch(vec_t vector_len, uint64_t seed, bool compressed, std::istream &b
5354
// bkt_per_col = calc_bkt_per_col(vector_len);
5455
num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket
5556
// bucket_buffer = BucketBuffer(new BufferEntry[_cols * 2], _cols * 2);
56-
bucket_buffer = BucketBuffer();
57+
bucket_buffer = BucketBufferHashMap();
5758
buckets = (Bucket*) new char[bucket_array_bytes()];
5859
#ifdef EAGER_BUCKET_CHECK
5960
nonempty_buckets = (vec_t*) (buckets + num_buckets);
@@ -116,7 +117,7 @@ Sketch::Sketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t
116117
bkt_per_col = calc_bkt_per_col(vector_len);
117118
// bkt_per_col = 1;
118119
num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket
119-
bucket_buffer = BucketBuffer();
120+
bucket_buffer = BucketBufferHashMap();
120121
buckets = (Bucket*) new char[bucket_array_bytes()];
121122
#ifdef EAGER_BUCKET_CHECK
122123
nonempty_buckets = (vec_t*) (buckets + num_buckets);
@@ -132,7 +133,7 @@ Sketch::Sketch(const Sketch &s) : seed(s.seed) {
132133
bkt_per_col = s.bkt_per_col;
133134
num_buckets = s.num_buckets;
134135
// TODO - do this correctly in other places. Otherwise serialization is broken
135-
bucket_buffer = BucketBuffer();
136+
bucket_buffer = BucketBufferHashMap();
136137
buckets = (Bucket*) new char[bucket_array_bytes()];
137138
// buckets = new Bucket[num_buckets];
138139

@@ -195,21 +196,18 @@ Sketch::~Sketch() {
195196
* backwards until we reach the point where the columns are once again not
196197
* being stored
197198
*/
198-
// bucket_buffer.sort_and_compact();
199-
size_t buffer_size = bucket_buffer.size();
200-
// ACTUALLY - we dont need to sort. just need to partition
201-
size_t to_keep_sz = bucket_buffer.partition(bkt_per_col);
202-
int i = ((int) buffer_size)-1;
203-
// while (i >= 0 && bucket_buffer[i].row_idx < bkt_per_col) {
204-
while (i >= 0 && i >= to_keep_sz) {
205-
// update the bucket
206-
get_bucket(bucket_buffer[i].col_idx, bucket_buffer[i].row_idx) ^= bucket_buffer[i].value;
207-
i--;
208-
}
209-
bucket_buffer.entries.resize(to_keep_sz);
210-
// bucket_buffer.entries.resize(i+1);
211-
// if (buffer_size > 3)
212-
// std::cout << "Injected buffer buckets:" << buffer_size << " to " << i+1 << std::endl;
199+
auto it = bucket_buffer.entries.begin();
200+
while (it != bucket_buffer.entries.end()) {
201+
if (bucket_buffer.key_to_row(it->first) >= bkt_per_col) {
202+
get_bucket(
203+
bucket_buffer.key_to_col(it->first),
204+
bucket_buffer.key_to_row(it->first)
205+
) ^= it->second;
206+
it = bucket_buffer.entries.erase(it);
207+
} else {
208+
it++;
209+
}
210+
}
213211
}
214212

215213

@@ -322,18 +320,15 @@ SketchSample Sketch::sample() {
322320
}
323321
}
324322
// finally, check the deep buffer
325-
for (size_t i = 0; i < bucket_buffer.size(); i++) {
326-
const BufferEntry &entry = bucket_buffer[i];
327-
// TODO - optimize this check. THIS IS GONNA CAUSE REALLY POOR
328-
// PERFORMANCE UNTIL WE DO SOMETHING ABOUT IT
329-
if (entry.col_idx >= first_column && entry.col_idx < first_column + cols_per_sample) {
330-
if (Bucket_Boruvka::is_good(entry.value, checksum_seed())) {
331-
// std::cout << "Found a bucket in the buffer" << std::endl;
332-
assert(entry.row_idx >= bkt_per_col);
333-
return {entry.value.alpha, GOOD};
334-
}
323+
for (size_t col = first_column; col < first_column + cols_per_sample; ++col) {
324+
for (size_t row = bkt_per_col; row < bkt_per_col + 6; ++row) {
325+
Bucket bucket = bucket_buffer.get_bucket(col, row);
326+
// Bucket &bucket = bucket_buffer.get_bucket(col, row);
327+
if (Bucket_Boruvka::is_good(bucket, checksum_seed()))
328+
return {bucket.alpha, GOOD};
335329
}
336330
}
331+
337332
return {0, FAIL};
338333
}
339334

0 commit comments

Comments
 (0)