Skip to content

Commit 50d39ee

Browse files
committed
initial commit for sparse sketch work
1 parent e0b7da0 commit 50d39ee

File tree

12 files changed

+1112
-483
lines changed

12 files changed

+1112
-483
lines changed

CMakeLists.txt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,16 @@ endif()
7575
FetchContent_MakeAvailable(GutterTree StreamingUtilities)
7676

7777
# AVAILABLE COMPILATION DEFINITIONS:
78-
# VERIFY_SAMPLES_F Use a deterministic connected-components
79-
# algorithm to verify post-processing.
8078
# NO_EAGER_DSU Do not use the eager DSU query optimization
8179
# if this flag is present.
8280
# L0_SAMPLING Run the CubeSketch l0 sampling algorithm
8381
# to ensure that we sample uniformly.
8482
# Otherwise, run a support finding algorithm.
83+
# L0_FULLY_DENSE Fully allocate the sketch matrix at the beginning
84+
# of the program. If this flag is not used, sketches
85+
# are allocated dynamically.
86+
# VERIFY_SAMPLES_F Use a deterministic connected-components
87+
# algorithm to verify post-processing.
8588
#
8689
# Example:
8790
# cmake -DCMAKE_CXX_FLAGS="-DL0_SAMPLING" ..
@@ -91,7 +94,8 @@ add_library(GraphZeppelin
9194
src/return_types.cpp
9295
src/driver_configuration.cpp
9396
src/cc_alg_configuration.cpp
94-
src/sketch.cpp
97+
src/sparse_sketch.cpp
98+
src/dense_sketch.cpp
9599
src/util.cpp)
96100
add_dependencies(GraphZeppelin GutterTree StreamingUtilities)
97101
target_link_libraries(GraphZeppelin PUBLIC xxhash GutterTree StreamingUtilities)
@@ -105,7 +109,8 @@ add_library(GraphZeppelinVerifyCC
105109
src/return_types.cpp
106110
src/driver_configuration.cpp
107111
src/cc_alg_configuration.cpp
108-
src/sketch.cpp
112+
src/sparse_sketch.cpp
113+
src/dense_sketch.cpp
109114
src/util.cpp
110115
test/util/graph_verifier.cpp)
111116
add_dependencies(GraphZeppelinVerifyCC GutterTree StreamingUtilities)

include/bucket.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ struct Bucket {
99
vec_t alpha;
1010
vec_hash_t gamma;
1111
};
12+
struct SparseBucket {
13+
uint16_t position; // (col << 8) | row
14+
Bucket bkt;
15+
};
1216
#pragma pack(pop)
1317

1418
namespace Bucket_Boruvka {
@@ -34,13 +38,19 @@ namespace Bucket_Boruvka {
3438
inline static vec_hash_t get_index_hash(const vec_t index, const long sketch_seed);
3539

3640
/**
37-
* Checks whether a Bucket is good, assuming the Bucket contains all elements.
41+
* Checks whether a Bucket is good.
3842
* @param bucket The bucket to check
3943
* @param sketch_seed The seed of the Sketch this Bucket belongs to.
4044
* @return true if this Bucket is good, else false.
4145
*/
4246
inline static bool is_good(const Bucket &bucket, const long sketch_seed);
4347

48+
/**
49+
* Checks whether a Bucket is empty.
50+
* @return true if this Bucket is empty (alpha and gamma == 0), else false.
51+
*/
52+
inline static bool is_empty(const Bucket &bucket);
53+
4454
/**
4555
* Updates a Bucket with the given update index
4656
* @param bucket The bucket to update
@@ -66,6 +76,10 @@ inline bool Bucket_Boruvka::is_good(const Bucket &bucket, const long sketch_seed
6676
return bucket.gamma == get_index_hash(bucket.alpha, sketch_seed);
6777
}
6878

79+
inline bool Bucket_Boruvka::is_empty(const Bucket &bucket) {
80+
return bucket.alpha == 0 && bucket.gamma == 0;
81+
}
82+
6983
inline void Bucket_Boruvka::update(Bucket& bucket, const vec_t update_idx,
7084
const vec_hash_t update_hash) {
7185
bucket.alpha ^= update_idx;

include/dense_sketch.h

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
#pragma once
2+
#include <graph_zeppelin_common.h>
3+
#include <gtest/gtest_prod.h>
4+
#include <sys/mman.h>
5+
6+
#include <fstream>
7+
#include <unordered_set>
8+
#include <cmath>
9+
#include <cassert>
10+
#include <mutex>
11+
12+
#include "util.h"
13+
#include "bucket.h"
14+
#include "sketch_types.h"
15+
16+
/**
17+
* Sketch for graph processing, either CubeSketch or CameoSketch.
18+
* Sub-linear representation of a vector.
19+
*/
20+
class DenseSketch {
21+
private:
22+
const uint64_t seed; // seed for hash functions
23+
size_t num_samples; // number of samples we can perform
24+
size_t cols_per_sample; // number of columns to use on each sample
25+
size_t num_columns; // Total number of columns. (product of above 2)
26+
size_t bkt_per_col; // maximum number of buckets per column (max number of rows)
27+
size_t num_buckets; // number of total buckets product of above two
28+
size_t sample_idx = 0; // number of samples performed so far
29+
30+
// Allocated buckets
31+
Bucket* buckets;
32+
33+
inline Bucket& deterministic_bucket() {
34+
return buckets[0];
35+
}
36+
inline const Bucket& deterministic_bucket() const {
37+
return buckets[0];
38+
}
39+
40+
// return the bucket at a particular index in bucket array
41+
inline Bucket& bucket(size_t col, size_t row) {
42+
return buckets[col * bkt_per_col + row + 1];
43+
}
44+
inline const Bucket& bucket(size_t col, size_t row) const {
45+
return buckets[col * bkt_per_col + row + 1];
46+
}
47+
48+
public:
49+
/**
50+
* The below constructors use vector length as their input. However, in graph sketching our input
51+
* is the number of vertices. This function converts from number of graph vertices to vector
52+
* length.
53+
* @param num_vertices Number of graph vertices
54+
* @return The length of the vector to sketch
55+
*/
56+
static vec_t calc_vector_length(node_id_t num_vertices) {
57+
return ceil(double(num_vertices) * (num_vertices - 1) / 2);
58+
}
59+
60+
/**
61+
* This function computes the number of samples a Sketch should support in order to solve
62+
* connected components. Optionally, can increase or decrease the number of samples by a
63+
* multiplicative factor.
64+
* @param num_vertices Number of graph vertices
65+
* @param f Multiplicative sample factor
66+
* @return The number of samples
67+
*/
68+
static size_t calc_cc_samples(node_id_t num_vertices, double f) {
69+
return std::max(size_t(18), (size_t) ceil(f * log2(num_vertices) / num_samples_div));
70+
}
71+
72+
/**
73+
* Construct a sketch object
74+
* @param vector_len Length of the vector we are sketching
75+
* @param seed Random seed of the sketch
76+
* @param num_samples [Optional] Number of samples this sketch supports (default = 1)
77+
* @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1)
78+
*/
79+
DenseSketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1,
80+
size_t cols_per_sample = default_cols_per_sample);
81+
82+
/**
83+
* Construct a sketch from a serialized stream
84+
* @param vector_len Length of the vector we are sketching
85+
* @param seed Random seed of the sketch
86+
* @param binary_in Stream holding serialized sketch object
87+
* @param num_samples [Optional] Number of samples this sketch supports (default = 1)
88+
* @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1)
89+
*/
90+
DenseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1,
91+
size_t cols_per_sample = default_cols_per_sample);
92+
93+
/**
94+
* Sketch copy constructor
95+
* @param s The sketch to copy.
96+
*/
97+
DenseSketch(const DenseSketch& s);
98+
99+
~DenseSketch();
100+
101+
/**
102+
* Update a sketch based on information about one of its indices.
103+
* @param update the point update.
104+
*/
105+
void update(const vec_t update);
106+
107+
/**
108+
* Function to sample from the sketch.
109+
* cols_per_sample determines the number of columns we allocate to this query
110+
* @return A pair with the result index and a code indicating the type of result.
111+
*/
112+
SketchSample sample();
113+
114+
/**
115+
* Function to sample from the appropriate columns to return 1 or more non-zero indices
116+
* @return A pair with the result indices and a code indicating the type of result.
117+
*/
118+
ExhaustiveSketchSample exhaustive_sample();
119+
120+
std::mutex mutex; // lock the sketch for applying updates in multithreaded processing
121+
122+
/**
123+
* In-place merge function.
124+
* @param other Sketch to merge into caller
125+
*/
126+
void merge(const DenseSketch &other);
127+
128+
/**
129+
* In-place range merge function. Updates the caller Sketch.
130+
* The range merge only merges a contiguous range of samples (see start_sample and n_samples).
131+
* This function should only be used if you know what you're doing
132+
* @param other Sketch to merge into caller
133+
* @param start_sample Index of first sample to merge
134+
* @param n_samples Number of samples to merge
135+
*/
136+
void range_merge(const DenseSketch &other, size_t start_sample, size_t n_samples);
137+
138+
/**
139+
* Perform an in-place merge function without another Sketch and instead
140+
* use raw bucket memory.
141+
* We also allow for only a portion of the buckets to be merged at once
142+
* @param raw_buckets Raw bucket data to merge into this sketch
143+
*/
144+
void merge_raw_bucket_buffer(const Bucket *raw_buckets);
145+
146+
/**
147+
* Zero out all the buckets of a sketch.
148+
*/
149+
void zero_contents();
150+
151+
friend bool operator==(const DenseSketch& sketch1, const DenseSketch& sketch2);
152+
friend std::ostream& operator<<(std::ostream& os, const DenseSketch& sketch);
153+
154+
/**
155+
* Serialize the sketch to a binary output stream.
156+
* @param binary_out the stream to write to.
157+
*/
158+
void serialize(std::ostream& binary_out) const;
159+
160+
inline void reset_sample_state() {
161+
sample_idx = 0;
162+
}
163+
164+
// return the size of the sketching data structure in bytes (just the buckets, not the metadata)
165+
inline size_t bucket_array_bytes() const {
166+
return num_buckets * sizeof(Bucket);
167+
}
168+
169+
inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; }
170+
inline uint64_t get_seed() const { return seed; }
171+
inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; }
172+
inline size_t checksum_seed() const { return seed; }
173+
inline size_t get_columns() const { return num_columns; }
174+
inline size_t get_buckets() const { return num_buckets; }
175+
inline size_t get_num_samples() const { return num_samples; }
176+
177+
static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; }
178+
179+
static constexpr size_t default_cols_per_sample = 1;
180+
static constexpr double num_samples_div = 1 - log2(2 - 0.8);
181+
};

include/graph_sketch_driver.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,6 @@ class GraphSketchDriver {
206206
inline void batch_callback(int thr_id, node_id_t src_vertex,
207207
const std::vector<node_id_t> &dst_vertices) {
208208
total_updates += dst_vertices.size();
209-
return;
210209
sketching_alg->apply_update_batch(thr_id, src_vertex, dst_vertices);
211210
}
212211

0 commit comments

Comments
 (0)