From 140800573da6b3b2072d42dc8825e84c50a7a4a8 Mon Sep 17 00:00:00 2001 From: Evan West Date: Tue, 24 Sep 2024 14:22:51 -0400 Subject: [PATCH 01/14] improvements from gts repo --- CMakeLists.txt | 2 +- include/graph_sketch_driver.h | 12 ++++++++---- tools/process_stream.cpp | 3 ++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95e90896..29f622b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ FetchContent_Declare( GutterTree GIT_REPOSITORY https://github.com/GraphStreamingProject/GutterTree.git - GIT_TAG main + GIT_TAG better_pht ) # Get StreamingUtilities diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index e627e443..86e73cf3 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -147,6 +147,9 @@ class GraphSketchDriver { auto task = [&](int thr_id) { GraphStreamUpdate update_array[update_array_size]; + + // buffer of updates for gts + update_t gts_buffer[2 * update_array_size]; #ifdef VERIFY_SAMPLES_F GraphVerifier local_verifier(sketching_alg->get_num_vertices()); #endif @@ -159,6 +162,7 @@ class GraphSketchDriver { upd.type = static_cast(update_array[i].type); if (upd.type == BREAKPOINT) { // reached the breakpoint. Update verifier if applicable and return + gts->batch_insert(gts_buffer, 2 * i, thr_id); #ifdef VERIFY_SAMPLES_F std::lock_guard lk(verifier_mtx); verifier->combine(local_verifier); @@ -167,14 +171,14 @@ class GraphSketchDriver { } else { sketching_alg->pre_insert(upd, thr_id); - Edge edge = upd.edge; - gts->insert({edge.src, edge.dst}, thr_id); - gts->insert({edge.dst, edge.src}, thr_id); + gts_buffer[2 * i] = {upd.edge.src, upd.edge.dst}; + gts_buffer[2 * i + 1] = {upd.edge.dst, upd.edge.src}; #ifdef VERIFY_SAMPLES_F - local_verifier.edge_update(edge); + local_verifier.edge_update(upd.edge); #endif } } + gts->batch_insert(gts_buffer, 2 * updates, thr_id); } }; diff --git a/tools/process_stream.cpp b/tools/process_stream.cpp index 53c49586..19f7e7e3 100644 --- a/tools/process_stream.cpp +++ b/tools/process_stream.cpp @@ -85,7 +85,8 @@ int main(int argc, char **argv) { std::cout << std::endl; auto driver_config = DriverConfiguration().gutter_sys(CACHETREE).worker_threads(num_threads); - auto cc_config = CCAlgConfiguration().batch_factor(1); + driver_config.gutter_conf().wq_batch_per_elm(8); + auto cc_config = CCAlgConfiguration().batch_factor(2); CCSketchAlg cc_alg{num_nodes, get_seed(), cc_config}; GraphSketchDriver driver{&cc_alg, &stream, driver_config, reader_threads}; From 7b78f1fbbe21256db602b9615d87d4aabcfb41dc Mon Sep 17 00:00:00 2001 From: Evan West Date: Tue, 24 Sep 2024 16:00:58 -0400 Subject: [PATCH 02/14] better gts batch call --- include/graph_sketch_driver.h | 37 +++++++++++++++-------------------- tools/process_stream.cpp | 4 ++-- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index 86e73cf3..0006baed 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -147,38 +147,33 @@ class GraphSketchDriver { auto task = [&](int thr_id) { GraphStreamUpdate update_array[update_array_size]; - - // buffer of updates for gts - update_t gts_buffer[2 * update_array_size]; #ifdef VERIFY_SAMPLES_F GraphVerifier local_verifier(sketching_alg->get_num_vertices()); #endif while (true) { + bool got_breakpoint = false; size_t updates = stream->get_update_buffer(update_array, update_array_size); - for (size_t i = 0; i < updates; i++) { - GraphUpdate upd; - upd.edge = update_array[i].edge; - upd.type = static_cast(update_array[i].type); - if (upd.type == BREAKPOINT) { - // reached the breakpoint. Update verifier if applicable and return - gts->batch_insert(gts_buffer, 2 * i, thr_id); + + if (update_array[updates - 1].type == BREAKPOINT) { #ifdef VERIFY_SAMPLES_F - std::lock_guard lk(verifier_mtx); - verifier->combine(local_verifier); + std::lock_guard lk(verifier_mtx); + verifier->combine(local_verifier); #endif - return; - } - else { - sketching_alg->pre_insert(upd, thr_id); - gts_buffer[2 * i] = {upd.edge.src, upd.edge.dst}; - gts_buffer[2 * i + 1] = {upd.edge.dst, upd.edge.src}; + --updates; + got_breakpoint = true; + } + gts->process_stream_upd_batch(update_array, updates, thr_id); + + for (size_t i = 0; i < updates; i++) { + GraphUpdate upd = {update_array[i].edge, (UpdateType) update_array[i].type}; + sketching_alg->pre_insert(upd, thr_id); #ifdef VERIFY_SAMPLES_F - local_verifier.edge_update(upd.edge); + local_verifier.edge_update(upd.edge); #endif - } } - gts->batch_insert(gts_buffer, 2 * updates, thr_id); + + if (got_breakpoint) return; } }; diff --git a/tools/process_stream.cpp b/tools/process_stream.cpp index 19f7e7e3..ce129112 100644 --- a/tools/process_stream.cpp +++ b/tools/process_stream.cpp @@ -85,8 +85,8 @@ int main(int argc, char **argv) { std::cout << std::endl; auto driver_config = DriverConfiguration().gutter_sys(CACHETREE).worker_threads(num_threads); - driver_config.gutter_conf().wq_batch_per_elm(8); - auto cc_config = CCAlgConfiguration().batch_factor(2); + driver_config.gutter_conf().wq_batch_per_elm(4); + auto cc_config = CCAlgConfiguration().batch_factor(1); CCSketchAlg cc_alg{num_nodes, get_seed(), cc_config}; GraphSketchDriver driver{&cc_alg, &stream, driver_config, reader_threads}; From e0b7da0f3bd318cff0ace60dae94c8213e953f47 Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 27 Feb 2025 15:15:45 -0500 Subject: [PATCH 03/14] initial commit for sparse sketch --- include/graph_sketch_driver.h | 13 +++++++----- include/sketch.h | 37 +++++++++++++++++++++++++++++++++-- src/sketch.cpp | 7 +++++++ 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index 0006baed..a1a8610d 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -156,10 +156,6 @@ class GraphSketchDriver { size_t updates = stream->get_update_buffer(update_array, update_array_size); if (update_array[updates - 1].type == BREAKPOINT) { -#ifdef VERIFY_SAMPLES_F - std::lock_guard lk(verifier_mtx); - verifier->combine(local_verifier); -#endif --updates; got_breakpoint = true; } @@ -173,7 +169,13 @@ class GraphSketchDriver { #endif } - if (got_breakpoint) return; + if (got_breakpoint) { +#ifdef VERIFY_SAMPLES_F + std::lock_guard lk(verifier_mtx); + verifier->combine(local_verifier); +#endif + return; + } } }; @@ -204,6 +206,7 @@ class GraphSketchDriver { inline void batch_callback(int thr_id, node_id_t src_vertex, const std::vector &dst_vertices) { total_updates += dst_vertices.size(); + return; sketching_alg->apply_update_batch(thr_id, src_vertex, dst_vertices); } diff --git a/include/sketch.h b/include/sketch.h index 80473ecc..af55ac75 100644 --- a/include/sketch.h +++ b/include/sketch.h @@ -11,6 +11,19 @@ #include "util.h" #include "bucket.h" +// TODO: Do we want to use row major or column major order? +// TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()? +// These functions are nice for performance because we can skip serialization but aren't +// strictly necessary. +// TODO: It would be nice to preallocate the structure if we know how big its probably going to be. +// This would be helpful for delta sketches for example. +// TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of +// buckets array. Could also be upperbound on the size. + +// A strategy that could work well would be to allocate a chunk of memory some of which is given to +// the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region. +// 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position) + // enum SerialType { // FULL, // RANGE, @@ -43,14 +56,34 @@ class Sketch { size_t num_samples; // number of samples we can perform size_t cols_per_sample; // number of columns to use on each sample size_t num_columns; // Total number of columns. (product of above 2) - size_t bkt_per_col; // number of buckets per column + size_t bkt_per_col; // maximum number of buckets per column (max number of rows) size_t num_buckets; // number of total buckets (product of above 2) size_t sample_idx = 0; // number of samples performed so far - // bucket data + // bucket data, stored densely Bucket* buckets; +#ifndef L0_FULLY_DENSE + size_t num_dense_rows = 4; + + // sparse representation of lower levels of Matrix + // TODO: Evaluate if this is shit. It probably is + std::vector> bucket_buffer; + size_t number_of_sparse_buckets = 0; + size_t sparse_capacity = 2 * num_columns; // TODO: evaluate implications of this constant + + /** + * Reallocates the dense region of the sketch to have a different number of rows + * @param new_num_rows the new number of rows to store densely + */ + void reallocate_dense_region(size_t new_num_rows); +#endif + + inline Bucket& get_deterministic_bucket() { + // TODO: implement this + } + public: /** * The below constructors use vector length as their input. However, in graph sketching our input diff --git a/src/sketch.cpp b/src/sketch.cpp index ac674c5e..9da1c574 100644 --- a/src/sketch.cpp +++ b/src/sketch.cpp @@ -11,6 +11,11 @@ Sketch::Sketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : num_columns = num_samples * cols_per_sample; bkt_per_col = calc_bkt_per_col(vector_len); num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + +#ifndef L0_FULLY_DENSE + buckets = new Bucket[num_columns * num_dense_rows]; + // TODO: AHHHHHHHHH +#else buckets = new Bucket[num_buckets]; // initialize bucket values @@ -18,6 +23,8 @@ Sketch::Sketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : buckets[i].alpha = 0; buckets[i].gamma = 0; } +#endif + } Sketch::Sketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, From 50d39ee48c74e16f5bd464c06b2199a479decd18 Mon Sep 17 00:00:00 2001 From: Evan West Date: Tue, 4 Mar 2025 16:49:21 -0500 Subject: [PATCH 04/14] initial commit for sparse sketch work --- CMakeLists.txt | 13 +- include/bucket.h | 16 +- include/dense_sketch.h | 181 +++++++++++++++++ include/graph_sketch_driver.h | 1 - include/sketch.h | 238 +---------------------- include/sketch_types.h | 35 ++++ include/sparse_sketch.h | 244 +++++++++++++++++++++++ src/dense_sketch.cpp | 236 +++++++++++++++++++++++ src/sketch.cpp | 234 ---------------------- src/sparse_sketch.cpp | 354 ++++++++++++++++++++++++++++++++++ test/sketch_test.cpp | 42 +++- test/util/graph_verifier.cpp | 1 + 12 files changed, 1112 insertions(+), 483 deletions(-) create mode 100644 include/dense_sketch.h create mode 100644 include/sketch_types.h create mode 100644 include/sparse_sketch.h create mode 100644 src/dense_sketch.cpp delete mode 100644 src/sketch.cpp create mode 100644 src/sparse_sketch.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 29f622b8..cb54756b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,13 +75,16 @@ endif() FetchContent_MakeAvailable(GutterTree StreamingUtilities) # AVAILABLE COMPILATION DEFINITIONS: -# VERIFY_SAMPLES_F Use a deterministic connected-components -# algorithm to verify post-processing. # NO_EAGER_DSU Do not use the eager DSU query optimization # if this flag is present. # L0_SAMPLING Run the CubeSketch l0 sampling algorithm # to ensure that we sample uniformly. # Otherwise, run a support finding algorithm. +# L0_FULLY_DENSE Fully allocate the sketch matrix at the beginning +# of the program. If this flag is not used, sketches +# are allocated dynamically. +# VERIFY_SAMPLES_F Use a deterministic connected-components +# algorithm to verify post-processing. # # Example: # cmake -DCMAKE_CXX_FLAGS="-DL0_SAMPLING" .. @@ -91,7 +94,8 @@ add_library(GraphZeppelin src/return_types.cpp src/driver_configuration.cpp src/cc_alg_configuration.cpp - src/sketch.cpp + src/sparse_sketch.cpp + src/dense_sketch.cpp src/util.cpp) add_dependencies(GraphZeppelin GutterTree StreamingUtilities) target_link_libraries(GraphZeppelin PUBLIC xxhash GutterTree StreamingUtilities) @@ -105,7 +109,8 @@ add_library(GraphZeppelinVerifyCC src/return_types.cpp src/driver_configuration.cpp src/cc_alg_configuration.cpp - src/sketch.cpp + src/sparse_sketch.cpp + src/dense_sketch.cpp src/util.cpp test/util/graph_verifier.cpp) add_dependencies(GraphZeppelinVerifyCC GutterTree StreamingUtilities) diff --git a/include/bucket.h b/include/bucket.h index 5d6a6af6..5c5a4df6 100644 --- a/include/bucket.h +++ b/include/bucket.h @@ -9,6 +9,10 @@ struct Bucket { vec_t alpha; vec_hash_t gamma; }; +struct SparseBucket { + uint16_t position; // (col << 8) | row + Bucket bkt; +}; #pragma pack(pop) namespace Bucket_Boruvka { @@ -34,13 +38,19 @@ namespace Bucket_Boruvka { inline static vec_hash_t get_index_hash(const vec_t index, const long sketch_seed); /** - * Checks whether a Bucket is good, assuming the Bucket contains all elements. + * Checks whether a Bucket is good. * @param bucket The bucket to check * @param sketch_seed The seed of the Sketch this Bucket belongs to. * @return true if this Bucket is good, else false. */ inline static bool is_good(const Bucket &bucket, const long sketch_seed); + /** + * Checks whether a Bucket is empty. + * @return true if this Bucket is empty (alpha and gamma == 0), else false. + */ + inline static bool is_empty(const Bucket &bucket); + /** * Updates a Bucket with the given update index * @param bucket The bucket to update @@ -66,6 +76,10 @@ inline bool Bucket_Boruvka::is_good(const Bucket &bucket, const long sketch_seed return bucket.gamma == get_index_hash(bucket.alpha, sketch_seed); } +inline bool Bucket_Boruvka::is_empty(const Bucket &bucket) { + return bucket.alpha == 0 && bucket.gamma == 0; +} + inline void Bucket_Boruvka::update(Bucket& bucket, const vec_t update_idx, const vec_hash_t update_hash) { bucket.alpha ^= update_idx; diff --git a/include/dense_sketch.h b/include/dense_sketch.h new file mode 100644 index 00000000..2a2fd199 --- /dev/null +++ b/include/dense_sketch.h @@ -0,0 +1,181 @@ +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "util.h" +#include "bucket.h" +#include "sketch_types.h" + +/** + * Sketch for graph processing, either CubeSketch or CameoSketch. + * Sub-linear representation of a vector. + */ +class DenseSketch { + private: + const uint64_t seed; // seed for hash functions + size_t num_samples; // number of samples we can perform + size_t cols_per_sample; // number of columns to use on each sample + size_t num_columns; // Total number of columns. (product of above 2) + size_t bkt_per_col; // maximum number of buckets per column (max number of rows) + size_t num_buckets; // number of total buckets product of above two + size_t sample_idx = 0; // number of samples performed so far + + // Allocated buckets + Bucket* buckets; + + inline Bucket& deterministic_bucket() { + return buckets[0]; + } + inline const Bucket& deterministic_bucket() const { + return buckets[0]; + } + + // return the bucket at a particular index in bucket array + inline Bucket& bucket(size_t col, size_t row) { + return buckets[col * bkt_per_col + row + 1]; + } + inline const Bucket& bucket(size_t col, size_t row) const { + return buckets[col * bkt_per_col + row + 1]; + } + + public: + /** + * The below constructors use vector length as their input. However, in graph sketching our input + * is the number of vertices. This function converts from number of graph vertices to vector + * length. + * @param num_vertices Number of graph vertices + * @return The length of the vector to sketch + */ + static vec_t calc_vector_length(node_id_t num_vertices) { + return ceil(double(num_vertices) * (num_vertices - 1) / 2); + } + + /** + * This function computes the number of samples a Sketch should support in order to solve + * connected components. Optionally, can increase or decrease the number of samples by a + * multiplicative factor. + * @param num_vertices Number of graph vertices + * @param f Multiplicative sample factor + * @return The number of samples + */ + static size_t calc_cc_samples(node_id_t num_vertices, double f) { + return std::max(size_t(18), (size_t) ceil(f * log2(num_vertices) / num_samples_div)); + } + + /** + * Construct a sketch object + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + DenseSketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * Construct a sketch from a serialized stream + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param binary_in Stream holding serialized sketch object + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + DenseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * Sketch copy constructor + * @param s The sketch to copy. + */ + DenseSketch(const DenseSketch& s); + + ~DenseSketch(); + + /** + * Update a sketch based on information about one of its indices. + * @param update the point update. + */ + void update(const vec_t update); + + /** + * Function to sample from the sketch. + * cols_per_sample determines the number of columns we allocate to this query + * @return A pair with the result index and a code indicating the type of result. + */ + SketchSample sample(); + + /** + * Function to sample from the appropriate columns to return 1 or more non-zero indices + * @return A pair with the result indices and a code indicating the type of result. + */ + ExhaustiveSketchSample exhaustive_sample(); + + std::mutex mutex; // lock the sketch for applying updates in multithreaded processing + + /** + * In-place merge function. + * @param other Sketch to merge into caller + */ + void merge(const DenseSketch &other); + + /** + * In-place range merge function. Updates the caller Sketch. + * The range merge only merges some of the Sketches + * This function should only be used if you know what you're doing + * @param other Sketch to merge into caller + * @param start_sample Index of first sample to merge + * @param n_samples Number of samples to merge + */ + void range_merge(const DenseSketch &other, size_t start_sample, size_t n_samples); + + /** + * Perform an in-place merge function without another Sketch and instead + * use a raw bucket memory. + * We also allow for only a portion of the buckets to be merge at once + * @param raw_bucket Raw bucket data to merge into this sketch + */ + void merge_raw_bucket_buffer(const Bucket *raw_buckets); + + /** + * Zero out all the buckets of a sketch. + */ + void zero_contents(); + + friend bool operator==(const DenseSketch& sketch1, const DenseSketch& sketch2); + friend std::ostream& operator<<(std::ostream& os, const DenseSketch& sketch); + + /** + * Serialize the sketch to a binary output stream. + * @param binary_out the stream to write to. + */ + void serialize(std::ostream& binary_out) const; + + inline void reset_sample_state() { + sample_idx = 0; + } + + // return the size of the sketching datastructure in bytes (just the buckets, not the metadata) + inline size_t bucket_array_bytes() const { + return num_buckets * sizeof(Bucket); + } + + inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; } + inline uint64_t get_seed() const { return seed; } + inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; } + inline size_t checksum_seed() const { return seed; } + inline size_t get_columns() const { return num_columns; } + inline size_t get_buckets() const { return num_buckets; } + inline size_t get_num_samples() const { return num_samples; } + + static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } + + static constexpr size_t default_cols_per_sample = 1; + static constexpr double num_samples_div = 1 - log2(2 - 0.8); +}; diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index a1a8610d..c05df28a 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -206,7 +206,6 @@ class GraphSketchDriver { inline void batch_callback(int thr_id, node_id_t src_vertex, const std::vector &dst_vertices) { total_updates += dst_vertices.size(); - return; sketching_alg->apply_update_batch(thr_id, src_vertex, dst_vertices); } diff --git a/include/sketch.h b/include/sketch.h index af55ac75..97a208a0 100644 --- a/include/sketch.h +++ b/include/sketch.h @@ -1,237 +1,9 @@ #pragma once -#include -#include -#include +#include "dense_sketch.h" +#include "sparse_sketch.h" -#include -#include -#include -#include - -#include "util.h" -#include "bucket.h" - -// TODO: Do we want to use row major or column major order? -// TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()? -// These functions are nice for performance because we can skip serialization but aren't -// strictly necessary. -// TODO: It would be nice to preallocate the structure if we know how big its probably going to be. -// This would be helpful for delta sketches for example. -// TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of -// buckets array. Could also be upperbound on the size. - -// A strategy that could work well would be to allocate a chunk of memory some of which is given to -// the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region. -// 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position) - -// enum SerialType { -// FULL, -// RANGE, -// SPARSE, -// }; - -enum SampleResult { - GOOD, // sampling this sketch returned a single non-zero value - ZERO, // sampling this sketch returned that there are no non-zero values - FAIL // sampling this sketch failed to produce a single non-zero value -}; - -struct SketchSample { - vec_t idx; - SampleResult result; -}; - -struct ExhaustiveSketchSample { - std::unordered_set idxs; - SampleResult result; -}; - -/** - * Sketch for graph processing, either CubeSketch or CameoSketch. - * Sub-linear representation of a vector. - */ -class Sketch { - private: - const uint64_t seed; // seed for hash functions - size_t num_samples; // number of samples we can perform - size_t cols_per_sample; // number of columns to use on each sample - size_t num_columns; // Total number of columns. (product of above 2) - size_t bkt_per_col; // maximum number of buckets per column (max number of rows) - size_t num_buckets; // number of total buckets (product of above 2) - - size_t sample_idx = 0; // number of samples performed so far - - // bucket data, stored densely - Bucket* buckets; - -#ifndef L0_FULLY_DENSE - size_t num_dense_rows = 4; - - // sparse representation of lower levels of Matrix - // TODO: Evaluate if this is shit. It probably is - std::vector> bucket_buffer; - size_t number_of_sparse_buckets = 0; - size_t sparse_capacity = 2 * num_columns; // TODO: evaluate implications of this constant - - /** - * Reallocates the dense region of the sketch to have a different number of rows - * @param new_num_rows the new number of rows to store densely - */ - void reallocate_dense_region(size_t new_num_rows); -#endif - - inline Bucket& get_deterministic_bucket() { - // TODO: implement this - } - - public: - /** - * The below constructors use vector length as their input. However, in graph sketching our input - * is the number of vertices. This function converts from number of graph vertices to vector - * length. - * @param num_vertices Number of graph vertices - * @return The length of the vector to sketch - */ - static vec_t calc_vector_length(node_id_t num_vertices) { - return ceil(double(num_vertices) * (num_vertices - 1) / 2); - } - - /** - * This function computes the number of samples a Sketch should support in order to solve - * connected components. Optionally, can increase or decrease the number of samples by a - * multiplicative factor. - * @param num_vertices Number of graph vertices - * @param f Multiplicative sample factor - * @return The number of samples - */ - static size_t calc_cc_samples(node_id_t num_vertices, double f) { - return std::max(size_t(18), (size_t) ceil(f * log2(num_vertices) / num_samples_div)); - } - - /** - * Construct a sketch object - * @param vector_len Length of the vector we are sketching - * @param seed Random seed of the sketch - * @param num_samples [Optional] Number of samples this sketch supports (default = 1) - * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) - */ - Sketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, - size_t cols_per_sample = default_cols_per_sample); - - /** - * Construct a sketch from a serialized stream - * @param vector_len Length of the vector we are sketching - * @param seed Random seed of the sketch - * @param binary_in Stream holding serialized sketch object - * @param num_samples [Optional] Number of samples this sketch supports (default = 1) - * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) - */ - Sketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, - size_t cols_per_sample = default_cols_per_sample); - - /** - * Sketch copy constructor - * @param s The sketch to copy. - */ - Sketch(const Sketch& s); - - ~Sketch(); - - /** - * Update a sketch based on information about one of its indices. - * @param update the point update. - */ - void update(const vec_t update); - - /** - * Function to sample from the sketch. - * cols_per_sample determines the number of columns we allocate to this query - * @return A pair with the result index and a code indicating the type of result. - */ - SketchSample sample(); - - /** - * Function to sample from the appropriate columns to return 1 or more non-zero indices - * @return A pair with the result indices and a code indicating the type of result. - */ - ExhaustiveSketchSample exhaustive_sample(); - - std::mutex mutex; // lock the sketch for applying updates in multithreaded processing - - /** - * In-place merge function. - * @param other Sketch to merge into caller - */ - void merge(const Sketch &other); - - /** - * In-place range merge function. Updates the caller Sketch. - * The range merge only merges some of the Sketches - * This function should only be used if you know what you're doing - * @param other Sketch to merge into caller - * @param start_sample Index of first sample to merge - * @param n_samples Number of samples to merge - */ - void range_merge(const Sketch &other, size_t start_sample, size_t n_samples); - - /** - * Perform an in-place merge function without another Sketch and instead - * use a raw bucket memory. - * We also allow for only a portion of the buckets to be merge at once - * @param raw_bucket Raw bucket data to merge into this sketch - */ - void merge_raw_bucket_buffer(const Bucket *raw_buckets); - - /** - * Zero out all the buckets of a sketch. - */ - void zero_contents(); - - friend bool operator==(const Sketch& sketch1, const Sketch& sketch2); - friend std::ostream& operator<<(std::ostream& os, const Sketch& sketch); - - /** - * Serialize the sketch to a binary output stream. - * @param binary_out the stream to write to. - */ - void serialize(std::ostream& binary_out) const; - - inline void reset_sample_state() { - sample_idx = 0; - } - - // return the size of the sketching datastructure in bytes (just the buckets, not the metadata) - inline size_t bucket_array_bytes() const { return num_buckets * sizeof(Bucket); } - - inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; } - inline uint64_t get_seed() const { return seed; } - inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; } - inline size_t checksum_seed() const { return seed; } - inline size_t get_columns() const { return num_columns; } - inline size_t get_buckets() const { return num_buckets; } - inline size_t get_num_samples() const { return num_samples; } - - static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } - -#ifdef L0_SAMPLING - static constexpr size_t default_cols_per_sample = 7; - // NOTE: can improve this but leaving for comparison purposes - static constexpr double num_samples_div = log2(3) - 1; +#ifdef L0_FULLY_DENSE +typedef DenseSketch Sketch; #else - static constexpr size_t default_cols_per_sample = 1; - static constexpr double num_samples_div = 1 - log2(2 - 0.8); +typedef SparseSketch Sketch; #endif -}; - -class OutOfSamplesException : public std::exception { - private: - std::string err_msg; - public: - OutOfSamplesException(size_t seed, size_t num_samples, size_t sample_idx) - : err_msg("This sketch (seed=" + std::to_string(seed) + - ", max samples=" + std::to_string(num_samples) + - ") cannot be sampled more times (cur idx=" + std::to_string(sample_idx) + ")!") {} - virtual const char* what() const throw() { - return err_msg.c_str(); - } -}; diff --git a/include/sketch_types.h b/include/sketch_types.h new file mode 100644 index 00000000..725e7c68 --- /dev/null +++ b/include/sketch_types.h @@ -0,0 +1,35 @@ +#pragma once +// enum SerialType { +// FULL, +// RANGE, +// SPARSE, +// }; + +enum SampleResult { + GOOD, // sampling this sketch returned a single non-zero value + ZERO, // sampling this sketch returned that there are no non-zero values + FAIL // sampling this sketch failed to produce a single non-zero value +}; + +struct SketchSample { + vec_t idx; + SampleResult result; +}; + +struct ExhaustiveSketchSample { + std::unordered_set idxs; + SampleResult result; +}; + +class OutOfSamplesException : public std::exception { + private: + std::string err_msg; + public: + OutOfSamplesException(size_t seed, size_t num_samples, size_t sample_idx) + : err_msg("This sketch (seed=" + std::to_string(seed) + + ", max samples=" + std::to_string(num_samples) + + ") cannot be sampled more times (cur idx=" + std::to_string(sample_idx) + ")!") {} + virtual const char* what() const throw() { + return err_msg.c_str(); + } +}; diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h new file mode 100644 index 00000000..7a8630a7 --- /dev/null +++ b/include/sparse_sketch.h @@ -0,0 +1,244 @@ +#pragma once +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "util.h" +#include "bucket.h" +#include "sketch_types.h" + +// TODO: Do we want to use row major or column major order? +// So the advantage of row-major is that we can update faster. Most updates will only touch +// first few rows of data-structure. However, could slow down queries. (Although most query +// answers will probably be in sparse data-structure). OH! Also, range_merge is important here +// if column-major then the column we are merging is contig, if not, then not. +// A: Keep column-major for the moment, performance evaluation later. + + +// TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()? +// These functions are nice for performance because we can skip serialization but aren't +// strictly necessary. +// A: Make function to get size in bytes of bucket data and have the 'hash table' be contig with +// the bucket data. This way we can still use these functions. + + +// TODO: It would be nice to preallocate the structure if we know how big its probably going to be. +// This would be helpful for delta sketches for example. +// A: Yeah do this + + +// TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of +// buckets array. Could also be upperbound on the size. +// A: Need two variables. Both the current number of buckets (rows) allocated AND the maximum. + +// A strategy that could work well would be to allocate a chunk of memory some of which is given to +// the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region. +// 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position) + +/* Memory Allocation of a Sketch. Contiguous + _________________________________________________________________________________________ +| Dense | Sparse | +| Sketch | Bucket | +| Buckets | Region (hash-table) | +| log n * log z buckets | clog n buckets | +|__________________________________________________________|______________________________| +*/ + +/** + * Sketch for graph processing, either CubeSketch or CameoSketch. + * Sub-linear representation of a vector. + */ +class SparseSketch { + private: + const uint64_t seed; // seed for hash functions + size_t num_samples; // number of samples we can perform + size_t cols_per_sample; // number of columns to use on each sample + size_t num_columns; // Total number of columns. (product of above 2) + size_t bkt_per_col; // maximum number of buckets per column (max number of rows) + size_t num_buckets; // number of total buckets + // (either product of above two or col * dense_rows + sparse_capacity) + + size_t sample_idx = 0; // number of samples performed so far + + // Allocated buckets + Bucket* buckets; + + static constexpr size_t min_num_dense_rows = 4; + size_t num_dense_rows = min_num_dense_rows; + + // Variables for sparse representation of lower levels of bucket Matrix + // TODO: evaluate implications of this constant + static constexpr double sparse_bucket_constant = 3; // constant factor c (see above) + SparseBucket* sparse_buckets; // a pointer into the buckets array + size_t number_of_sparse_buckets = 0; // cur number of sparse buckets + size_t sparse_capacity = sparse_bucket_constant * num_columns; // max number of sparse buckets + + /** + * Reallocates the bucket array if necessary to either grow or shrink the dense region + */ + void reallocate_if_needed(); + + // This variable lets us know how many Buckets to allocate to make space for the SparseBuckets + // that will be using that space + size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket)); + + int update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum); + SketchSample sample_sparse(size_t column); + + inline Bucket& deterministic_bucket() { + return buckets[0]; + } + inline const Bucket& deterministic_bucket() const { + return buckets[0]; + } + + // return the bucket at a particular index in bucket array + inline Bucket& bucket(size_t col, size_t row) { + assert(row < num_dense_rows); + return buckets[col * num_dense_rows + row + 1]; + } + inline const Bucket& bucket(size_t col, size_t row) const { + assert(row < num_dense_rows); + return buckets[col * num_dense_rows + row + 1]; + } + + public: + /** + * The below constructors use vector length as their input. However, in graph sketching our input + * is the number of vertices. This function converts from number of graph vertices to vector + * length. + * @param num_vertices Number of graph vertices + * @return The length of the vector to sketch + */ + static vec_t calc_vector_length(node_id_t num_vertices) { + return ceil(double(num_vertices) * (num_vertices - 1) / 2); + } + + /** + * This function computes the number of samples a Sketch should support in order to solve + * connected components. Optionally, can increase or decrease the number of samples by a + * multiplicative factor. + * @param num_vertices Number of graph vertices + * @param f Multiplicative sample factor + * @return The number of samples + */ + static size_t calc_cc_samples(node_id_t num_vertices, double f) { + return std::max(size_t(18), (size_t) ceil(f * log2(num_vertices) / num_samples_div)); + } + + /** + * Construct a sketch object + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + SparseSketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * Construct a sketch from a serialized stream + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param binary_in Stream holding serialized sketch object + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + SparseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * SparseSketch copy constructor + * @param s The sketch to copy. + */ + SparseSketch(const SparseSketch& s); + + ~SparseSketch(); + + /** + * Update a sketch based on information about one of its indices. + * @param update the point update. + */ + void update(const vec_t update); + + /** + * Function to sample from the sketch. + * cols_per_sample determines the number of columns we allocate to this query + * @return A pair with the result index and a code indicating the type of result. + */ + SketchSample sample(); + + /** + * Function to sample from the appropriate columns to return 1 or more non-zero indices + * @return A pair with the result indices and a code indicating the type of result. + */ + ExhaustiveSketchSample exhaustive_sample(); + + std::mutex mutex; // lock the sketch for applying updates in multithreaded processing + + /** + * In-place merge function. + * @param other Sketch to merge into caller + */ + void merge(const SparseSketch &other); + + /** + * In-place range merge function. Updates the caller Sketch. + * The range merge only merges some of the Sketches + * This function should only be used if you know what you're doing + * @param other Sketch to merge into caller + * @param start_sample Index of first sample to merge + * @param n_samples Number of samples to merge + */ + void range_merge(const SparseSketch &other, size_t start_sample, size_t n_samples); + + /** + * Perform an in-place merge function without another Sketch and instead + * use a raw bucket memory. + * We also allow for only a portion of the buckets to be merge at once + * @param raw_bucket Raw bucket data to merge into this sketch + */ + void merge_raw_bucket_buffer(const Bucket *raw_buckets); + + /** + * Zero out all the buckets of a sketch. + */ + void zero_contents(); + + friend bool operator==(const SparseSketch& sketch1, const SparseSketch& sketch2); + friend std::ostream& operator<<(std::ostream& os, const SparseSketch& sketch); + + /** + * Serialize the sketch to a binary output stream. + * @param binary_out the stream to write to. + */ + void serialize(std::ostream& binary_out) const; + + inline void reset_sample_state() { + sample_idx = 0; + } + + // return the size of the sketching datastructure in bytes (just the buckets, not the metadata) + inline size_t bucket_array_bytes() const { + return num_buckets * sizeof(Bucket); + } + + inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; } + inline uint64_t get_seed() const { return seed; } + inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; } + inline size_t checksum_seed() const { return seed; } + inline size_t get_columns() const { return num_columns; } + inline size_t get_buckets() const { return num_buckets; } + inline size_t get_num_samples() const { return num_samples; } + + static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } + + static constexpr size_t default_cols_per_sample = 1; + static constexpr double num_samples_div = 1 - log2(2 - 0.8); +}; diff --git a/src/dense_sketch.cpp b/src/dense_sketch.cpp new file mode 100644 index 00000000..9c6c5f73 --- /dev/null +++ b/src/dense_sketch.cpp @@ -0,0 +1,236 @@ +#include "dense_sketch.h" + +#include +#include +#include +#include + +DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) + : seed(seed), + num_samples(_samples), + cols_per_sample(_cols), + num_columns(cols_per_sample * num_samples), + bkt_per_col(calc_bkt_per_col(vector_len)) { + + num_buckets = num_columns * bkt_per_col + 1; // plus 1, deterministic bucket + buckets = new Bucket[num_buckets]; + + // initialize bucket values + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } +} + +DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, + size_t _cols) + : seed(seed), + num_samples(_samples), + cols_per_sample(_cols), + num_columns(cols_per_sample * num_samples), + bkt_per_col(calc_bkt_per_col(vector_len)) { + num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + buckets = new Bucket[num_buckets]; + + // Read the serialized Sketch contents + binary_in.read((char *)buckets, bucket_array_bytes()); +} + +DenseSketch::DenseSketch(const DenseSketch &s) + : seed(s.seed), + num_samples(s.num_samples), + cols_per_sample(s.cols_per_sample), + num_columns(s.num_columns), + bkt_per_col(s.bkt_per_col) { + num_buckets = s.num_buckets; + buckets = new Bucket[num_buckets]; + + std::memcpy(buckets, s.buckets, bucket_array_bytes()); +} + +DenseSketch::~DenseSketch() { delete[] buckets; } + + +void DenseSketch::update(const vec_t update_idx) { + vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); + + // Update depth 0 bucket + Bucket_Boruvka::update(deterministic_bucket(), update_idx, checksum); + + // Update higher depth buckets + for (unsigned i = 0; i < num_columns; ++i) { + col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); + likely_if(depth < bkt_per_col) { + Bucket_Boruvka::update(bucket(i, depth), update_idx, checksum); + } + } +} + +static void is_empty(DenseSketch &skt) { + const Bucket* buckets = skt.get_readonly_bucket_ptr(); + for (size_t i = 0; i < skt.get_buckets(); i++) { + if (!Bucket_Boruvka::is_empty(buckets[i])) { + std::cerr << "FOUND NOT EMPTY BUCKET!" << std::endl; + } + } +} + +// TODO: Switch the L0_SAMPLING flag to instead affect query procedure. +// (Only use deepest bucket. We don't need the alternate update procedure in the code anymore.) + +void DenseSketch::zero_contents() { + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } + reset_sample_state(); +} + +SketchSample DenseSketch::sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(seed, num_samples, sample_idx); + } + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + // std::cout << "Sampling: " << first_column << ", " << first_column + cols_per_sample << std::endl; + + // std::cout << *this << std::endl; + + if (Bucket_Boruvka::is_empty(deterministic_bucket())) { + is_empty(*this); + return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return + } + + if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) + return {deterministic_bucket().alpha, GOOD}; + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < bkt_per_col; ++j) { + if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) + return {bucket(i + first_column, j).alpha, GOOD}; + } + } + return {0, FAIL}; +} + +ExhaustiveSketchSample DenseSketch::exhaustive_sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(seed, num_samples, sample_idx); + } + std::unordered_set ret; + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + unlikely_if (deterministic_bucket().alpha == 0 && deterministic_bucket().gamma == 0) + return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return + + unlikely_if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { + ret.insert(deterministic_bucket().alpha); + return {ret, GOOD}; + } + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < bkt_per_col; ++j) { + unlikely_if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) { + ret.insert(bucket(i + first_column, j).alpha); + } + } + } + + unlikely_if (ret.size() == 0) + return {ret, FAIL}; + return {ret, GOOD}; +} + +void DenseSketch::merge(const DenseSketch &other) { + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha ^= other.buckets[i].alpha; + buckets[i].gamma ^= other.buckets[i].gamma; + } +} + +void DenseSketch::range_merge(const DenseSketch &other, size_t start_sample, size_t n_samples) { + if (start_sample + n_samples > num_samples) { + assert(false); + sample_idx = num_samples; // sketch is in a fail state! + return; + } + + // std::cout << "MERGING THIS" << std::endl; + // std::cout << *this << std::endl; + // std::cout << "WITH THIS" << std::endl; + // std::cout << other << std::endl; + + // update sample idx to point at beginning of this range if before it + sample_idx = std::max(sample_idx, start_sample); + + // merge deterministic bucket + // TODO: I don't like this. Repeated calls to range_merge on same sketches will potentially cause us issues + deterministic_bucket().alpha ^= other.deterministic_bucket().alpha; + deterministic_bucket().gamma ^= other.deterministic_bucket().gamma; + + // merge other buckets + size_t start_column = start_sample * cols_per_sample; + size_t end_column = (start_sample + n_samples) * cols_per_sample; + + // std::cout << start_column << ", " << end_column << std::endl; + for (size_t i = start_column; i < end_column; i++) { + for (size_t j = 0; j < bkt_per_col; j++) { + bucket(i, j).alpha ^= other.bucket(i, j).alpha; + bucket(i, j).gamma ^= other.bucket(i, j).gamma; + } + } + + // std::cout << "RESULT" << std::endl; + // std::cout << *this << std::endl; +} + +void DenseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha ^= raw_buckets[i].alpha; + buckets[i].gamma ^= raw_buckets[i].gamma; + } +} + +void DenseSketch::serialize(std::ostream &binary_out) const { + binary_out.write((char*) buckets, bucket_array_bytes()); +} + +bool operator==(const DenseSketch &sketch1, const DenseSketch &sketch2) { + if (sketch1.num_buckets != sketch2.num_buckets || sketch1.seed != sketch2.seed) + return false; + + for (size_t i = 0; i < sketch1.num_buckets; ++i) { + if (sketch1.buckets[i].alpha != sketch2.buckets[i].alpha || + sketch1.buckets[i].gamma != sketch2.buckets[i].gamma) { + return false; + } + } + + return true; +} + +std::ostream &operator<<(std::ostream &os, const DenseSketch &sketch) { + Bucket bkt = sketch.buckets[sketch.num_buckets - 1]; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + + for (unsigned i = 0; i < sketch.num_columns; ++i) { + for (unsigned j = 0; j < sketch.bkt_per_col; ++j) { + Bucket bkt = sketch.bucket(i, j); + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + } + os << std::endl; + } + return os; +} diff --git a/src/sketch.cpp b/src/sketch.cpp deleted file mode 100644 index 9da1c574..00000000 --- a/src/sketch.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "sketch.h" - -#include -#include -#include -#include - -Sketch::Sketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : seed(seed) { - num_samples = _samples; - cols_per_sample = _cols; - num_columns = num_samples * cols_per_sample; - bkt_per_col = calc_bkt_per_col(vector_len); - num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket - -#ifndef L0_FULLY_DENSE - buckets = new Bucket[num_columns * num_dense_rows]; - // TODO: AHHHHHHHHH -#else - buckets = new Bucket[num_buckets]; - - // initialize bucket values - for (size_t i = 0; i < num_buckets; ++i) { - buckets[i].alpha = 0; - buckets[i].gamma = 0; - } -#endif - -} - -Sketch::Sketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, - size_t _cols) - : seed(seed) { - num_samples = _samples; - cols_per_sample = _cols; - num_columns = num_samples * cols_per_sample; - bkt_per_col = calc_bkt_per_col(vector_len); - num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket - buckets = new Bucket[num_buckets]; - - // Read the serialized Sketch contents - binary_in.read((char *)buckets, bucket_array_bytes()); -} - -Sketch::Sketch(const Sketch &s) : seed(s.seed) { - num_samples = s.num_samples; - cols_per_sample = s.cols_per_sample; - num_columns = s.num_columns; - bkt_per_col = s.bkt_per_col; - num_buckets = s.num_buckets; - buckets = new Bucket[num_buckets]; - - std::memcpy(buckets, s.buckets, bucket_array_bytes()); -} - -Sketch::~Sketch() { delete[] buckets; } - -#ifdef L0_SAMPLING -void Sketch::update(const vec_t update_idx) { - vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); - - // Update depth 0 bucket - Bucket_Boruvka::update(buckets[num_buckets - 1], update_idx, checksum); - - // Update higher depth buckets - for (unsigned i = 0; i < num_columns; ++i) { - col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); - likely_if(depth < bkt_per_col) { - for (col_hash_t j = 0; j <= depth; ++j) { - size_t bucket_id = i * bkt_per_col + j; - Bucket_Boruvka::update(buckets[bucket_id], update_idx, checksum); - } - } - } -} -#else // Use support finding algorithm instead. Faster but no guarantee of uniform sample. -void Sketch::update(const vec_t update_idx) { - vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); - - // Update depth 0 bucket - Bucket_Boruvka::update(buckets[num_buckets - 1], update_idx, checksum); - - // Update higher depth buckets - for (unsigned i = 0; i < num_columns; ++i) { - col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); - size_t bucket_id = i * bkt_per_col + depth; - likely_if(depth < bkt_per_col) { - Bucket_Boruvka::update(buckets[bucket_id], update_idx, checksum); - } - } -} -#endif - -void Sketch::zero_contents() { - for (size_t i = 0; i < num_buckets; i++) { - buckets[i].alpha = 0; - buckets[i].gamma = 0; - } - reset_sample_state(); -} - -SketchSample Sketch::sample() { - if (sample_idx >= num_samples) { - throw OutOfSamplesException(seed, num_samples, sample_idx); - } - - size_t idx = sample_idx++; - size_t first_column = idx * cols_per_sample; - - if (buckets[num_buckets - 1].alpha == 0 && buckets[num_buckets - 1].gamma == 0) - return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return - - if (Bucket_Boruvka::is_good(buckets[num_buckets - 1], checksum_seed())) - return {buckets[num_buckets - 1].alpha, GOOD}; - - for (size_t i = 0; i < cols_per_sample; ++i) { - for (size_t j = 0; j < bkt_per_col; ++j) { - size_t bucket_id = (i + first_column) * bkt_per_col + j; - if (Bucket_Boruvka::is_good(buckets[bucket_id], checksum_seed())) - return {buckets[bucket_id].alpha, GOOD}; - } - } - return {0, FAIL}; -} - -ExhaustiveSketchSample Sketch::exhaustive_sample() { - if (sample_idx >= num_samples) { - throw OutOfSamplesException(seed, num_samples, sample_idx); - } - std::unordered_set ret; - - size_t idx = sample_idx++; - size_t first_column = idx * cols_per_sample; - - unlikely_if (buckets[num_buckets - 1].alpha == 0 && buckets[num_buckets - 1].gamma == 0) - return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return - - unlikely_if (Bucket_Boruvka::is_good(buckets[num_buckets - 1], checksum_seed())) { - ret.insert(buckets[num_buckets - 1].alpha); - return {ret, GOOD}; - } - - for (size_t i = 0; i < cols_per_sample; ++i) { - for (size_t j = 0; j < bkt_per_col; ++j) { - size_t bucket_id = (i + first_column) * bkt_per_col + j; - unlikely_if (Bucket_Boruvka::is_good(buckets[bucket_id], checksum_seed())) { - ret.insert(buckets[bucket_id].alpha); - } - } - } - - unlikely_if (ret.size() == 0) - return {ret, FAIL}; - return {ret, GOOD}; -} - -void Sketch::merge(const Sketch &other) { - for (size_t i = 0; i < num_buckets; ++i) { - buckets[i].alpha ^= other.buckets[i].alpha; - buckets[i].gamma ^= other.buckets[i].gamma; - } -} - -void Sketch::range_merge(const Sketch &other, size_t start_sample, size_t n_samples) { - if (start_sample + n_samples > num_samples) { - assert(false); - sample_idx = num_samples; // sketch is in a fail state! - return; - } - - // update sample idx to point at beginning of this range if before it - sample_idx = std::max(sample_idx, start_sample); - - // merge deterministic buffer - buckets[num_buckets - 1].alpha ^= other.buckets[num_buckets - 1].alpha; - buckets[num_buckets - 1].gamma ^= other.buckets[num_buckets - 1].gamma; - - // merge other buckets - size_t start_bucket_id = start_sample * cols_per_sample * bkt_per_col; - size_t n_buckets = n_samples * cols_per_sample * bkt_per_col; - - for (size_t i = 0; i < n_buckets; i++) { - size_t bucket_id = start_bucket_id + i; - buckets[bucket_id].alpha ^= other.buckets[bucket_id].alpha; - buckets[bucket_id].gamma ^= other.buckets[bucket_id].gamma; - } -} - -void Sketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { - for (size_t i = 0; i < num_buckets; i++) { - buckets[i].alpha ^= raw_buckets[i].alpha; - buckets[i].gamma ^= raw_buckets[i].gamma; - } -} - -void Sketch::serialize(std::ostream &binary_out) const { - binary_out.write((char*) buckets, bucket_array_bytes()); -} - -bool operator==(const Sketch &sketch1, const Sketch &sketch2) { - if (sketch1.num_buckets != sketch2.num_buckets || sketch1.seed != sketch2.seed) - return false; - - for (size_t i = 0; i < sketch1.num_buckets; ++i) { - if (sketch1.buckets[i].alpha != sketch2.buckets[i].alpha || - sketch1.buckets[i].gamma != sketch2.buckets[i].gamma) { - return false; - } - } - - return true; -} - -std::ostream &operator<<(std::ostream &os, const Sketch &sketch) { - Bucket bkt = sketch.buckets[sketch.num_buckets - 1]; - bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); - vec_t a = bkt.alpha; - vec_hash_t c = bkt.gamma; - - os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; - - for (unsigned i = 0; i < sketch.num_columns; ++i) { - for (unsigned j = 0; j < sketch.bkt_per_col; ++j) { - unsigned bucket_id = i * sketch.bkt_per_col + j; - Bucket bkt = sketch.buckets[bucket_id]; - vec_t a = bkt.alpha; - vec_hash_t c = bkt.gamma; - bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); - - os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; - } - os << std::endl; - } - return os; -} diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp new file mode 100644 index 00000000..252e12be --- /dev/null +++ b/src/sparse_sketch.cpp @@ -0,0 +1,354 @@ +#include "sparse_sketch.h" + +#include +#include +#include +#include + +SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) + : seed(seed), + num_samples(_samples), + cols_per_sample(_cols), + num_columns(cols_per_sample * num_samples), + bkt_per_col(calc_bkt_per_col(vector_len)) { + + // plus 1, deterministic bucket + num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + buckets = new Bucket[num_buckets]; + sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 2]; + + // initialize bucket values + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } +} + +SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, + size_t _cols) + : seed(seed), + num_samples(_samples), + cols_per_sample(_cols), + num_columns(cols_per_sample * num_samples), + bkt_per_col(calc_bkt_per_col(vector_len)) { + + // TODO: Make this actually work for sparse-sketch + num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + buckets = new Bucket[num_buckets]; + + // Read the serialized Sketch contents + binary_in.read((char *)buckets, bucket_array_bytes()); // TODO: Figure out bucket_array_bytes() in this context +} + +SparseSketch::SparseSketch(const SparseSketch &s) + : seed(s.seed), + num_samples(s.num_samples), + cols_per_sample(s.cols_per_sample), + num_columns(s.num_columns), + bkt_per_col(s.bkt_per_col) { + num_buckets = s.num_buckets; + buckets = new Bucket[num_buckets]; + + std::memcpy(buckets, s.buckets, bucket_array_bytes()); +} + +SparseSketch::~SparseSketch() { delete[] buckets; } + + +// Helper functions for interfacing with SparseBuckets +void SparseSketch::reallocate_if_needed() { + if (number_of_sparse_buckets > num_columns && number_of_sparse_buckets < sparse_capacity) + return; // do not reallocate + else { + const size_t old_buckets = num_buckets; + Bucket *new_buckets; + + if (number_of_sparse_buckets < num_columns && num_dense_rows > min_num_dense_rows) { + // shrink dense region by 1 row + // 1. Scan over deepest row of dense region and add all those buckets to sparse + size_t depth = num_dense_rows - 1; + for (size_t c = 0; c < num_columns; c++) { + Bucket bkt = bucket(c, depth); + if (!Bucket_Boruvka::is_empty(bkt)) { + uint16_t sparse_position = (c << 8) + depth; + update_sparse(sparse_position, bkt.alpha, bkt.gamma); + } + } + + // 2. Allocate new memory + --num_dense_rows; + num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + new_buckets = new Bucket[num_buckets]; + } else { + // grow dense region by 1 row + // 1. Allocate new memory + ++num_dense_rows; + num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + new_buckets = new Bucket[num_buckets]; + + // 2. Skip + } + + // 3. Copy over content + size_t dense_buckets = num_columns * num_dense_rows + 1; + for (size_t i = 0; i < dense_buckets; i++) { + new_buckets[i] = buckets[i]; + } + for (size_t i = 0; i < sparse_capacity; i++) { + new_buckets[num_buckets - i] = buckets[old_buckets - i]; + } + + if (num_buckets > old_buckets) { + // 3.5. Scan sparse buckets and move all updates of depth num_dense_rows-1 + // to the new dense row + uint16_t depth_mask = 0xFFFF; + for (size_t i = 0; i < sparse_capacity; i++) { + if ((sparse_buckets[i].position & depth_mask) == num_dense_rows - 1) { + size_t column = sparse_buckets[i].position >> 8; + bucket(column, num_dense_rows - 1) = sparse_buckets[i].bkt; + sparse_buckets[i].position = uint16_t(-1); // tombstone + } + } + } + + // 4. Clean up + std::swap(buckets, new_buckets); + delete[] new_buckets; + } +} + +// Update a bucket value +// Returns 1 if we added a new bucket value +// 0 if the bucket was found and update (but not cleared) +// -1 if the bucket was found and cleared of all content +int SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum) { + SparseBucket *tombstone = nullptr; + uint16_t tombstone_pos = uint16_t(-1); + for (size_t i = 0; i < num_buckets; i++) { + auto &sparse_bucket = sparse_buckets[i]; + if (sparse_bucket.position == 0 || sparse_bucket.position == pos) { + // We apply our update here! + if (sparse_bucket.position == pos) { + // we update bucket + Bucket_Boruvka::update(sparse_bucket.bkt, update_idx, checksum); + + // did we clear it out? + if (Bucket_Boruvka::is_empty(sparse_bucket.bkt)) { + sparse_bucket.position = tombstone_pos; // mark it as tombstone + return -1; + } + } else { + if (tombstone != nullptr) { + // use the tombstone + tombstone->position = pos; + Bucket_Boruvka::update(tombstone->bkt, update_idx, checksum); + } else { + sparse_bucket.position = pos; + Bucket_Boruvka::update(sparse_bucket.bkt, update_idx, checksum); + } + + // we created a new sparse bucket + return 1; + } + } else if (sparse_bucket.position == tombstone_pos && tombstone == nullptr) { + tombstone = &sparse_bucket; + } + } + // this is an error! + throw std::runtime_error("update_sparse(): Failed to find update location!"); +} + +// sample a good bucket from the sparse region if one exists. +// Additionally, specify the column to query from +// TODO: Do we want to include this column thing? +SketchSample SparseSketch::sample_sparse(size_t column) { + for (size_t i = 0; i < sparse_capacity; i++) { + if (size_t(sparse_buckets[i].position >> 8) == column && + Bucket_Boruvka::is_good(sparse_buckets[i].bkt, checksum_seed())) { + return {sparse_buckets[i].bkt.alpha, GOOD}; + } + } + + // We could not find a good bucket + return {0, FAIL}; +} + + +void SparseSketch::update(const vec_t update_idx) { + vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); + + // Update depth 0 bucket + Bucket_Boruvka::update(deterministic_bucket(), update_idx, checksum); + + // Update higher depth buckets + for (unsigned i = 0; i < num_columns; ++i) { + col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); + likely_if(depth < bkt_per_col) { + likely_if(depth < num_dense_rows) { + Bucket_Boruvka::update(bucket(i, depth), update_idx, checksum); + } else { + number_of_sparse_buckets += update_sparse((i << 8) | depth, update_idx, checksum); + + // based upon this update to sparse matrix, check if we need to reallocate dense region + reallocate_if_needed(); + } + } + } +} + +// TODO: Switch the L0_SAMPLING flag to instead affect query procedure. +// (Only use deepest bucket. We don't need the alternate update procedure in the code anymore.) + +void SparseSketch::zero_contents() { + // TODO: Should we also set the size of this bucket back to an initial state? + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } + reset_sample_state(); +} + +SketchSample SparseSketch::sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(seed, num_samples, sample_idx); + } + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + if (Bucket_Boruvka::is_empty(deterministic_bucket())) + return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return + + if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) + return {deterministic_bucket().alpha, GOOD}; + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < num_dense_rows; ++j) { + if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) + return {bucket(i + first_column, j).alpha, GOOD}; + } + } + + // TODO: Sample sparse region! + return {0, FAIL}; +} + +ExhaustiveSketchSample SparseSketch::exhaustive_sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(seed, num_samples, sample_idx); + } + std::unordered_set ret; + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + unlikely_if (deterministic_bucket().alpha == 0 && deterministic_bucket().gamma == 0) + return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return + + unlikely_if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { + ret.insert(deterministic_bucket().alpha); + return {ret, GOOD}; + } + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < bkt_per_col; ++j) { + unlikely_if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) { + ret.insert(bucket(i + first_column, j).alpha); + } + } + } + + // TODO: Implement this with sparse! + + unlikely_if (ret.size() == 0) + return {ret, FAIL}; + return {ret, GOOD}; +} + +void SparseSketch::merge(const SparseSketch &other) { + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha ^= other.buckets[i].alpha; + buckets[i].gamma ^= other.buckets[i].gamma; + } + + // TODO: Handle sparse stuff! +} + +void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, size_t n_samples) { + if (start_sample + n_samples > num_samples) { + assert(false); + sample_idx = num_samples; // sketch is in a fail state! + return; + } + + // update sample idx to point at beginning of this range if before it + sample_idx = std::max(sample_idx, start_sample); + + // merge deterministic buffer + deterministic_bucket().alpha ^= other.deterministic_bucket().alpha; + deterministic_bucket().gamma ^= other.deterministic_bucket().gamma; + + // merge other buckets + size_t start_column = start_sample * cols_per_sample; + size_t end_column = (start_sample + n_samples) * cols_per_sample; + + for (size_t i = start_column; i < end_column; i++) { + for (size_t j = 0; j < bkt_per_col; j++) { + bucket(i, j).alpha ^= other.bucket(i, j).alpha; + bucket(i, j).gamma ^= other.bucket(i, j).gamma; + } + } +} + +void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha ^= raw_buckets[i].alpha; + buckets[i].gamma ^= raw_buckets[i].gamma; + } + + // TODO: Handle sparse +} + +void SparseSketch::serialize(std::ostream &binary_out) const { + binary_out.write((char*) buckets, bucket_array_bytes()); + + // TODO: Handle sparse +} + +bool operator==(const SparseSketch &sketch1, const SparseSketch &sketch2) { + if (sketch1.num_buckets != sketch2.num_buckets || sketch1.seed != sketch2.seed) + return false; + + for (size_t i = 0; i < sketch1.num_buckets; ++i) { + if (sketch1.buckets[i].alpha != sketch2.buckets[i].alpha || + sketch1.buckets[i].gamma != sketch2.buckets[i].gamma) { + return false; + } + } + + // TODO: Handle sparse + + return true; +} + +std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { + Bucket bkt = sketch.buckets[sketch.num_buckets - 1]; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + + for (unsigned i = 0; i < sketch.num_columns; ++i) { + for (unsigned j = 0; j < sketch.bkt_per_col; ++j) { + Bucket bkt = sketch.bucket(i, j); + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + } + os << std::endl; + } + return os; +} diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index e35f4aa2..f782e7c1 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -215,20 +215,42 @@ TEST(SketchTestSuite, TestSketchMerge) { } TEST(SketchTestSuite, TestSketchRangeMerge) { - Sketch skt1(2048, get_seed(), 10, 3); - Sketch skt2(2048, get_seed(), 10, 3); + size_t seed = get_seed(); + Sketch skt1(2048, seed, 10, 3); + Sketch skt2(2048, seed, 10, 3); - skt1.sample(); + for (vec_t i = 0; i < 1024; i++) { + skt1.update(i); + skt2.update(i + 256); + } + // allowed return values after merging are [0, 255] and [1024, 1279] + vec_t good_1 = 255; + vec_t good_2 = 1024; + vec_t good_3 = good_2 + 255; + + skt1.range_merge(skt2, 0, 1); + SketchSample sample = skt1.sample(); + if (sample.result == GOOD) { + ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); + } + skt1.range_merge(skt2, 1, 1); - - skt1.sample(); + sample = skt1.sample(); + if (sample.result == GOOD) { + ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); + } + skt1.range_merge(skt2, 2, 1); - - skt1.sample(); + sample = skt1.sample(); + if (sample.result == GOOD) { + ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); + } + skt1.range_merge(skt2, 3, 1); - - skt1.sample(); - skt1.range_merge(skt2, 4, 1); + sample = skt1.sample(); + if (sample.result == GOOD) { + ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); + } } /** diff --git a/test/util/graph_verifier.cpp b/test/util/graph_verifier.cpp index 4d35ec1d..3eb7a527 100644 --- a/test/util/graph_verifier.cpp +++ b/test/util/graph_verifier.cpp @@ -93,6 +93,7 @@ void GraphVerifier::verify_connected_components(const ConnectedComponents &cc) { // first check that the number of components is the same for both if (kruskal_ccs != cc.size()) { + std::cout << "expect: " << kruskal_ccs << ", got = " << cc.size() << std::endl; throw IncorrectCCException("Incorrect number of components!"); } From 2b9f78c7974dc8625f99864479e5199f17cb1460 Mon Sep 17 00:00:00 2001 From: Evan West Date: Tue, 4 Mar 2025 23:14:00 -0500 Subject: [PATCH 05/14] more progress --- include/bucket.h | 14 ++++ include/sparse_sketch.h | 12 ++-- src/sparse_sketch.cpp | 144 +++++++++++++++++++++++----------------- 3 files changed, 105 insertions(+), 65 deletions(-) diff --git a/include/bucket.h b/include/bucket.h index 5c5a4df6..95e5d656 100644 --- a/include/bucket.h +++ b/include/bucket.h @@ -12,6 +12,20 @@ struct Bucket { struct SparseBucket { uint16_t position; // (col << 8) | row Bucket bkt; + + // TODO: Use these functions and also maybe optimize + inline uint16_t col() const { + return position >> 8; + } + inline uint16_t row() const { + return position & 0xFFFF; + } + inline void set_col(uint16_t col) { + position = (col << 8) + row(); + } + inline void set_row(uint16_t row) { + position = (col() << 8) + row; + } }; #pragma pack(pop) diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 7a8630a7..9b397524 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -88,8 +88,8 @@ class SparseSketch { // that will be using that space size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket)); - int update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum); - SketchSample sample_sparse(size_t column); + void update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum); + SketchSample sample_sparse(size_t first_col, size_t end_col); inline Bucket& deterministic_bucket() { return buckets[0]; @@ -98,14 +98,18 @@ class SparseSketch { return buckets[0]; } + inline size_t position_func(size_t col, size_t row, size_t num_rows) const { + return col * num_rows + row + 1; + } + // return the bucket at a particular index in bucket array inline Bucket& bucket(size_t col, size_t row) { assert(row < num_dense_rows); - return buckets[col * num_dense_rows + row + 1]; + return buckets[position_func(col, row, num_dense_rows)]; } inline const Bucket& bucket(size_t col, size_t row) const { assert(row < num_dense_rows); - return buckets[col * num_dense_rows + row + 1]; + return buckets[position_func(col, row, num_dense_rows)]; } public: diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp index 252e12be..78f19d2f 100644 --- a/src/sparse_sketch.cpp +++ b/src/sparse_sketch.cpp @@ -15,7 +15,7 @@ SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, size_t _samples, siz // plus 1, deterministic bucket num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; buckets = new Bucket[num_buckets]; - sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 2]; + sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; // initialize bucket values for (size_t i = 0; i < num_buckets; ++i) { @@ -57,71 +57,80 @@ SparseSketch::~SparseSketch() { delete[] buckets; } // Helper functions for interfacing with SparseBuckets void SparseSketch::reallocate_if_needed() { + if (num_dense_rows <= min_num_dense_rows) return; // do not reallocate if (number_of_sparse_buckets > num_columns && number_of_sparse_buckets < sparse_capacity) return; // do not reallocate - else { - const size_t old_buckets = num_buckets; - Bucket *new_buckets; - - if (number_of_sparse_buckets < num_columns && num_dense_rows > min_num_dense_rows) { - // shrink dense region by 1 row - // 1. Scan over deepest row of dense region and add all those buckets to sparse - size_t depth = num_dense_rows - 1; - for (size_t c = 0; c < num_columns; c++) { - Bucket bkt = bucket(c, depth); - if (!Bucket_Boruvka::is_empty(bkt)) { - uint16_t sparse_position = (c << 8) + depth; - update_sparse(sparse_position, bkt.alpha, bkt.gamma); - } - } - // 2. Allocate new memory - --num_dense_rows; - num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; - new_buckets = new Bucket[num_buckets]; - } else { - // grow dense region by 1 row - // 1. Allocate new memory - ++num_dense_rows; - num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; - new_buckets = new Bucket[num_buckets]; - - // 2. Skip + // we are performing a reallocation + std::cout << "Reallocating!" << std::endl; + std::cout << "num_sparse: " << number_of_sparse_buckets << std::endl; + std::cout << "capacity: " << sparse_capacity << std::endl; + const size_t old_buckets = num_buckets; + const size_t old_rows = num_dense_rows; + SparseBucket *old_sparse_pointer = sparse_buckets; + Bucket *new_buckets; + + if (number_of_sparse_buckets < num_columns) { + // shrink dense region by 1 row + // Scan over deepest row of dense region and add all those buckets to sparse + size_t depth = num_dense_rows - 1; + for (size_t c = 0; c < num_columns; c++) { + Bucket bkt = bucket(c, depth); + if (!Bucket_Boruvka::is_empty(bkt)) { + uint16_t sparse_position = (c << 8) + depth; + update_sparse(sparse_position, bkt.alpha, bkt.gamma); + } } - // 3. Copy over content - size_t dense_buckets = num_columns * num_dense_rows + 1; - for (size_t i = 0; i < dense_buckets; i++) { - new_buckets[i] = buckets[i]; - } - for (size_t i = 0; i < sparse_capacity; i++) { - new_buckets[num_buckets - i] = buckets[old_buckets - i]; + // Allocate new memory + --num_dense_rows; + num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + new_buckets = new Bucket[num_buckets]; + } else { + // grow dense region by 1 row + // Allocate new memory + ++num_dense_rows; + num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + new_buckets = new Bucket[num_buckets]; + } + sparse_buckets = (SparseBucket *) &new_buckets[num_columns * num_dense_rows + 1]; + + // Copy dense content + for (size_t c = 0; c < num_columns; c++) { + for (size_t r = 0; r < std::min(num_dense_rows, old_rows); r++) { + new_buckets[position_func(c, r, num_dense_rows)] = buckets[position_func(c, r, old_rows)]; } + } + // sparse contents + memcpy(sparse_buckets, old_sparse_pointer, sparse_capacity * sizeof(SparseBucket)); - if (num_buckets > old_buckets) { - // 3.5. Scan sparse buckets and move all updates of depth num_dense_rows-1 - // to the new dense row - uint16_t depth_mask = 0xFFFF; - for (size_t i = 0; i < sparse_capacity; i++) { - if ((sparse_buckets[i].position & depth_mask) == num_dense_rows - 1) { - size_t column = sparse_buckets[i].position >> 8; - bucket(column, num_dense_rows - 1) = sparse_buckets[i].bkt; - sparse_buckets[i].position = uint16_t(-1); // tombstone - } + + if (num_buckets > old_buckets) { + // We shrinking + // Scan sparse buckets and move all updates of depth num_dense_rows-1 + // to the new dense row + uint16_t depth_mask = 0xFFFF; + for (size_t i = 0; i < sparse_capacity; i++) { + if ((sparse_buckets[i].position & depth_mask) == num_dense_rows - 1) { + size_t column = sparse_buckets[i].position >> 8; + bucket(column, num_dense_rows - 1) = sparse_buckets[i].bkt; + sparse_buckets[i].position = uint16_t(-1); // tombstone + number_of_sparse_buckets -= 1; } } - - // 4. Clean up - std::swap(buckets, new_buckets); - delete[] new_buckets; } + + // 4. Clean up + std::swap(buckets, new_buckets); + delete[] new_buckets; } // Update a bucket value -// Returns 1 if we added a new bucket value -// 0 if the bucket was found and update (but not cleared) -// -1 if the bucket was found and cleared of all content -int SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum) { +// Changes number_of_sparse_buckets as follows: +// +1 if we added a new bucket value +// 0 if the bucket was found and update (but not cleared) +// -1 if the bucket was found and cleared of all content +void SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum) { SparseBucket *tombstone = nullptr; uint16_t tombstone_pos = uint16_t(-1); for (size_t i = 0; i < num_buckets; i++) { @@ -135,8 +144,9 @@ int SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t check // did we clear it out? if (Bucket_Boruvka::is_empty(sparse_bucket.bkt)) { sparse_bucket.position = tombstone_pos; // mark it as tombstone - return -1; + number_of_sparse_buckets -= 1; } + return; } else { if (tombstone != nullptr) { // use the tombstone @@ -148,22 +158,28 @@ int SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t check } // we created a new sparse bucket - return 1; + number_of_sparse_buckets += 1; + return; } } else if (sparse_bucket.position == tombstone_pos && tombstone == nullptr) { tombstone = &sparse_bucket; + number_of_sparse_buckets += 1; + return; } } // this is an error! + std::cout << "num_sparse: " << number_of_sparse_buckets << std::endl; + std::cout << "capacity: " << sparse_capacity << std::endl; throw std::runtime_error("update_sparse(): Failed to find update location!"); } // sample a good bucket from the sparse region if one exists. // Additionally, specify the column to query from // TODO: Do we want to include this column thing? -SketchSample SparseSketch::sample_sparse(size_t column) { +SketchSample SparseSketch::sample_sparse(size_t first_col, size_t end_col) { for (size_t i = 0; i < sparse_capacity; i++) { - if (size_t(sparse_buckets[i].position >> 8) == column && + if (size_t(sparse_buckets[i].position >> 8) >= first_col && + size_t(sparse_buckets[i].position >> 8) < end_col && Bucket_Boruvka::is_good(sparse_buckets[i].bkt, checksum_seed())) { return {sparse_buckets[i].bkt.alpha, GOOD}; } @@ -187,7 +203,7 @@ void SparseSketch::update(const vec_t update_idx) { likely_if(depth < num_dense_rows) { Bucket_Boruvka::update(bucket(i, depth), update_idx, checksum); } else { - number_of_sparse_buckets += update_sparse((i << 8) | depth, update_idx, checksum); + update_sparse((i << 8) | depth, update_idx, checksum); // based upon this update to sparse matrix, check if we need to reallocate dense region reallocate_if_needed(); @@ -229,8 +245,8 @@ SketchSample SparseSketch::sample() { } } - // TODO: Sample sparse region! - return {0, FAIL}; + // Sample sparse region + return sample_sparse(first_column, first_column + cols_per_sample); } ExhaustiveSketchSample SparseSketch::exhaustive_sample() { @@ -258,7 +274,11 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { } } - // TODO: Implement this with sparse! + // TODO: How do we do exhaustive sampling properly here? + SketchSample sample = sample_sparse(first_column, first_column + cols_per_sample); + if (sample.result == GOOD) { + ret.insert(sample.idx); + } unlikely_if (ret.size() == 0) return {ret, FAIL}; @@ -298,6 +318,8 @@ void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, s bucket(i, j).gamma ^= other.bucket(i, j).gamma; } } + + // TODO: Handle sparse! } void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { From 2711d7fd5e2711e4a6d6c29b4e7c79250a555a64 Mon Sep 17 00:00:00 2001 From: Evan West Date: Mon, 10 Mar 2025 19:48:28 -0400 Subject: [PATCH 06/14] fix and make somewhat fast --- include/bucket.h | 2 +- include/cc_sketch_alg.h | 3 +- include/dense_sketch.h | 12 +- include/sparse_sketch.h | 16 +- src/cc_alg_configuration.cpp | 5 + src/cc_sketch_alg.cpp | 12 +- src/dense_sketch.cpp | 19 ++- src/sparse_sketch.cpp | 321 +++++++++++++++++++++++++---------- test/sketch_test.cpp | 10 +- 9 files changed, 284 insertions(+), 116 deletions(-) diff --git a/include/bucket.h b/include/bucket.h index 95e5d656..070695b1 100644 --- a/include/bucket.h +++ b/include/bucket.h @@ -18,7 +18,7 @@ struct SparseBucket { return position >> 8; } inline uint16_t row() const { - return position & 0xFFFF; + return position & 0xFF; } inline void set_col(uint16_t col) { position = (col << 8) + row(); diff --git a/include/cc_sketch_alg.h b/include/cc_sketch_alg.h index 9e9d3f8c..55408a4a 100644 --- a/include/cc_sketch_alg.h +++ b/include/cc_sketch_alg.h @@ -201,8 +201,9 @@ class CCSketchAlg { * Specifically, the delta is in the form of a pointer to raw bucket data. * @param src_vertex The vertex where the all edges originate. * @param raw_buckets Pointer to the array of buckets from the delta sketch + * @param num_buckets Size of raw_buckets array in number of buckets */ - void apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets); + void apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets, size_t num_buckets); /** * The function performs a direct update to the associated sketch. diff --git a/include/dense_sketch.h b/include/dense_sketch.h index 2a2fd199..afd271d7 100644 --- a/include/dense_sketch.h +++ b/include/dense_sketch.h @@ -77,18 +77,19 @@ class DenseSketch { * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) */ DenseSketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, - size_t cols_per_sample = default_cols_per_sample); + size_t cols_per_sample = default_cols_per_sample); /** * Construct a sketch from a serialized stream * @param vector_len Length of the vector we are sketching * @param seed Random seed of the sketch * @param binary_in Stream holding serialized sketch object + * @param num_buckets Number of buckets in serialized sketch * @param num_samples [Optional] Number of samples this sketch supports (default = 1) * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) */ - DenseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, - size_t cols_per_sample = default_cols_per_sample); + DenseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_buckets, + size_t num_samples = 1, size_t cols_per_sample = default_cols_per_sample); /** * Sketch copy constructor @@ -139,9 +140,10 @@ class DenseSketch { * Perform an in-place merge function without another Sketch and instead * use a raw bucket memory. * We also allow for only a portion of the buckets to be merge at once - * @param raw_bucket Raw bucket data to merge into this sketch + * @param raw_bucket Raw bucket data to merge into this sketch + * @param n_raw_buckets Size of raw_buckets in number of Bucket data-structures */ - void merge_raw_bucket_buffer(const Bucket *raw_buckets); + void merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_raw_buckets); /** * Zero out all the buckets of a sketch. diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 9b397524..70116a56 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -82,13 +82,14 @@ class SparseSketch { /** * Reallocates the bucket array if necessary to either grow or shrink the dense region */ - void reallocate_if_needed(); + void reallocate_if_needed(int delta); + void dense_realloc(size_t new_num_dense_rows); // This variable lets us know how many Buckets to allocate to make space for the SparseBuckets // that will be using that space size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket)); - void update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum); + void update_sparse(SparseBucket to_add, bool realloc_if_needed = true); SketchSample sample_sparse(size_t first_col, size_t end_col); inline Bucket& deterministic_bucket() { @@ -151,11 +152,12 @@ class SparseSketch { * @param vector_len Length of the vector we are sketching * @param seed Random seed of the sketch * @param binary_in Stream holding serialized sketch object + * @param num_buckets Number of buckets in serialized sketch (dense + sparse_capacity) * @param num_samples [Optional] Number of samples this sketch supports (default = 1) * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) */ - SparseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, - size_t cols_per_sample = default_cols_per_sample); + SparseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_buckets, + size_t num_samples = 1, size_t cols_per_sample = default_cols_per_sample); /** * SparseSketch copy constructor @@ -206,9 +208,10 @@ class SparseSketch { * Perform an in-place merge function without another Sketch and instead * use a raw bucket memory. * We also allow for only a portion of the buckets to be merge at once - * @param raw_bucket Raw bucket data to merge into this sketch + * @param raw_bucket Raw bucket data to merge into this sketch + * @param n_raw_buckets Size of raw_buckets in number of Bucket data-structures */ - void merge_raw_bucket_buffer(const Bucket *raw_buckets); + void merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_raw_buckets); /** * Zero out all the buckets of a sketch. @@ -240,6 +243,7 @@ class SparseSketch { inline size_t get_columns() const { return num_columns; } inline size_t get_buckets() const { return num_buckets; } inline size_t get_num_samples() const { return num_samples; } + inline size_t get_num_dense_rows() const { return num_dense_rows; } static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } diff --git a/src/cc_alg_configuration.cpp b/src/cc_alg_configuration.cpp index becbb7e5..2ce12af1 100644 --- a/src/cc_alg_configuration.cpp +++ b/src/cc_alg_configuration.cpp @@ -34,6 +34,11 @@ std::ostream& operator<< (std::ostream &out, const CCAlgConfiguration &conf) { #else out << " Sketching algorithm = CameoSketch" << std::endl; #endif +#ifdef L0_FULLY_DENSE + out << " Sketch storage = Dense Matrix" << std::endl; +#else + out << " Sketch storage = Hybrid Matrix" << std::endl; +#endif #ifdef NO_EAGER_DSU out << " Using Eager DSU = False" << std::endl; #else diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index a1e688db..f8e5649d 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -53,7 +53,10 @@ CCSketchAlg::CCSketchAlg(node_id_t num_vertices, size_t seed, std::ifstream &bin for (node_id_t i = 0; i < num_vertices; ++i) { representatives->insert(i); - sketches[i] = new Sketch(sketch_vec_len, seed, binary_stream, sketch_num_samples); + size_t num_bkts_in_sketch; + binary_stream.read((char *) &num_bkts_in_sketch, sizeof(num_bkts_in_sketch)); + sketches[i] = + new Sketch(sketch_vec_len, seed, binary_stream, num_bkts_in_sketch, sketch_num_samples); } binary_stream.close(); @@ -117,9 +120,10 @@ void CCSketchAlg::apply_update_batch(int thr_id, node_id_t src_vertex, sketches[src_vertex]->merge(delta_sketch); } -void CCSketchAlg::apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets) { +void CCSketchAlg::apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets, + size_t num_buckets) { std::lock_guard lk(sketches[src_vertex]->mutex); - sketches[src_vertex]->merge_raw_bucket_buffer(raw_buckets); + sketches[src_vertex]->merge_raw_bucket_buffer(raw_buckets, num_buckets); } // Note: for performance reasons route updates through the driver instead of calling this function @@ -617,6 +621,8 @@ void CCSketchAlg::write_binary(const std::string &filename) { binary_out.write((char *)&num_vertices, sizeof(num_vertices)); binary_out.write((char *)&config._sketches_factor, sizeof(config._sketches_factor)); for (node_id_t i = 0; i < num_vertices; ++i) { + size_t num_bkts_in_sketch = sketches[i]->get_buckets(); + binary_out.write((char*) &num_bkts_in_sketch, sizeof(num_bkts_in_sketch)); sketches[i]->serialize(binary_out); } binary_out.close(); diff --git a/src/dense_sketch.cpp b/src/dense_sketch.cpp index 9c6c5f73..39b4ddae 100644 --- a/src/dense_sketch.cpp +++ b/src/dense_sketch.cpp @@ -1,9 +1,10 @@ #include "dense_sketch.h" +#include #include +#include #include #include -#include DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : seed(seed), @@ -22,13 +23,17 @@ DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, size_t _samples, size_ } } -DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, - size_t _cols) +DenseSketch::DenseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, + size_t num_buckets, size_t _samples, size_t _cols) : seed(seed), num_samples(_samples), cols_per_sample(_cols), num_columns(cols_per_sample * num_samples), - bkt_per_col(calc_bkt_per_col(vector_len)) { + bkt_per_col(calc_bkt_per_col(vector_len)), + num_buckets(num_buckets) { + if (num_buckets != num_columns * bkt_per_col + 1) { + throw std::invalid_argument("Serial Constructor: Number of buckets does not match expectation"); + } num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket buckets = new Bucket[num_buckets]; @@ -188,7 +193,11 @@ void DenseSketch::range_merge(const DenseSketch &other, size_t start_sample, siz // std::cout << *this << std::endl; } -void DenseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { +void DenseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_raw_buckets) { + if (n_raw_buckets != num_buckets) { + throw std::invalid_argument("Raw bucket buffer is not the same size as DenseSketch"); + } + for (size_t i = 0; i < num_buckets; i++) { buckets[i].alpha ^= raw_buckets[i].alpha; buckets[i].gamma ^= raw_buckets[i].gamma; diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp index 78f19d2f..712be299 100644 --- a/src/sparse_sketch.cpp +++ b/src/sparse_sketch.cpp @@ -1,9 +1,9 @@ #include "sparse_sketch.h" +#include #include #include #include -#include SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : seed(seed), @@ -24,20 +24,20 @@ SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, size_t _samples, siz } } -SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, - size_t _cols) +SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, + size_t num_buckets, size_t _samples, size_t _cols) : seed(seed), num_samples(_samples), cols_per_sample(_cols), num_columns(cols_per_sample * num_samples), - bkt_per_col(calc_bkt_per_col(vector_len)) { - - // TODO: Make this actually work for sparse-sketch - num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + bkt_per_col(calc_bkt_per_col(vector_len)), + num_buckets(num_buckets) { buckets = new Bucket[num_buckets]; + num_dense_rows = (num_buckets - sparse_data_size) / num_columns; + sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; // Read the serialized Sketch contents - binary_in.read((char *)buckets, bucket_array_bytes()); // TODO: Figure out bucket_array_bytes() in this context + binary_in.read((char *)buckets, bucket_array_bytes()); } SparseSketch::SparseSketch(const SparseSketch &s) @@ -45,57 +45,72 @@ SparseSketch::SparseSketch(const SparseSketch &s) num_samples(s.num_samples), cols_per_sample(s.cols_per_sample), num_columns(s.num_columns), - bkt_per_col(s.bkt_per_col) { - num_buckets = s.num_buckets; + bkt_per_col(s.bkt_per_col), + num_buckets(s.num_buckets), + num_dense_rows(s.num_dense_rows) { buckets = new Bucket[num_buckets]; + sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; std::memcpy(buckets, s.buckets, bucket_array_bytes()); } -SparseSketch::~SparseSketch() { delete[] buckets; } +SparseSketch::~SparseSketch() { + // std::cout << "Deleting sketch! buckets = " << buckets << std::endl; + delete[] buckets; +} // Helper functions for interfacing with SparseBuckets -void SparseSketch::reallocate_if_needed() { - if (num_dense_rows <= min_num_dense_rows) return; // do not reallocate - if (number_of_sparse_buckets > num_columns && number_of_sparse_buckets < sparse_capacity) - return; // do not reallocate - +void SparseSketch::dense_realloc(size_t new_num_dense_rows) { // we are performing a reallocation - std::cout << "Reallocating!" << std::endl; - std::cout << "num_sparse: " << number_of_sparse_buckets << std::endl; - std::cout << "capacity: " << sparse_capacity << std::endl; - const size_t old_buckets = num_buckets; const size_t old_rows = num_dense_rows; SparseBucket *old_sparse_pointer = sparse_buckets; Bucket *new_buckets; - if (number_of_sparse_buckets < num_columns) { - // shrink dense region by 1 row - // Scan over deepest row of dense region and add all those buckets to sparse - size_t depth = num_dense_rows - 1; + if (new_num_dense_rows < min_num_dense_rows) { + throw std::runtime_error("new_num_dense_rows too small!"); + } + + if (new_num_dense_rows < num_dense_rows) { + // std::cout << "Shrinking to " << new_num_dense_rows << " from " << old_rows << std::endl; + // shrink dense region + // Scan over the rows we are removing and add all those buckets to sparse for (size_t c = 0; c < num_columns; c++) { - Bucket bkt = bucket(c, depth); - if (!Bucket_Boruvka::is_empty(bkt)) { - uint16_t sparse_position = (c << 8) + depth; - update_sparse(sparse_position, bkt.alpha, bkt.gamma); + for (size_t r = new_num_dense_rows; r < old_rows; r++) { + Bucket bkt = bucket(c, r); + if (!Bucket_Boruvka::is_empty(bkt)) { + SparseBucket new_sparse; + new_sparse.set_col(c); + new_sparse.set_row(r); + new_sparse.bkt = bkt; + update_sparse(new_sparse, false); + } } } // Allocate new memory - --num_dense_rows; + num_dense_rows = new_num_dense_rows; num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; new_buckets = new Bucket[num_buckets]; } else { + // std::cout << "Growing to " << new_num_dense_rows << " from " << old_rows << std::endl; // grow dense region by 1 row // Allocate new memory - ++num_dense_rows; + num_dense_rows = new_num_dense_rows; num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; new_buckets = new Bucket[num_buckets]; + + // initialize new rows to zero + for (size_t c = 0; c < num_columns; c++) { + for (size_t r = old_rows; r < num_dense_rows; r++) { + new_buckets[position_func(c, r, num_dense_rows)] = {0, 0}; + } + } } sparse_buckets = (SparseBucket *) &new_buckets[num_columns * num_dense_rows + 1]; // Copy dense content + new_buckets[0] = deterministic_bucket(); for (size_t c = 0; c < num_columns; c++) { for (size_t r = 0; r < std::min(num_dense_rows, old_rows); r++) { new_buckets[position_func(c, r, num_dense_rows)] = buckets[position_func(c, r, old_rows)]; @@ -105,17 +120,23 @@ void SparseSketch::reallocate_if_needed() { memcpy(sparse_buckets, old_sparse_pointer, sparse_capacity * sizeof(SparseBucket)); - if (num_buckets > old_buckets) { - // We shrinking + if (num_dense_rows > old_rows) { + // We growing // Scan sparse buckets and move all updates of depth num_dense_rows-1 // to the new dense row - uint16_t depth_mask = 0xFFFF; for (size_t i = 0; i < sparse_capacity; i++) { - if ((sparse_buckets[i].position & depth_mask) == num_dense_rows - 1) { - size_t column = sparse_buckets[i].position >> 8; - bucket(column, num_dense_rows - 1) = sparse_buckets[i].bkt; + // std::cout << "sparse_bucket = " << sparse_buckets[i].col() << ", " << sparse_buckets[i].row() + // << ": " << sparse_buckets[i].bkt.alpha << ", " << sparse_buckets[i].bkt.gamma + // << std::endl; + if (sparse_buckets[i].row() < num_dense_rows && sparse_buckets[i].position != 0) { + size_t col = sparse_buckets[i].col(); + size_t row = sparse_buckets[i].row(); + assert(Bucket_Boruvka::is_empty(new_buckets[position_func(col, row, num_dense_rows)])); + new_buckets[position_func(col, row, num_dense_rows)] = sparse_buckets[i].bkt; sparse_buckets[i].position = uint16_t(-1); // tombstone + sparse_buckets[i].bkt = {0, 0}; // clear out tombstone number_of_sparse_buckets -= 1; + // std::cout << "Moving to dense!" << std::endl; } } } @@ -125,48 +146,73 @@ void SparseSketch::reallocate_if_needed() { delete[] new_buckets; } +void SparseSketch::reallocate_if_needed(int delta) { + // if we're currently adding something, don't shrink + if (delta == 1 && number_of_sparse_buckets <= num_columns / 4) { + return; + } + + // while we need to reallocate, attempt to do so. If realloc doesn't solve problem. Do it again. + while ((delta == -1 && number_of_sparse_buckets <= num_columns / 4 && + num_dense_rows > min_num_dense_rows) || + (delta == 1 && number_of_sparse_buckets == sparse_capacity)) { + if (number_of_sparse_buckets >= sparse_capacity) { + dense_realloc(num_dense_rows + 1); + } else { + dense_realloc(num_dense_rows - 1); + } + } +} + // Update a bucket value // Changes number_of_sparse_buckets as follows: // +1 if we added a new bucket value // 0 if the bucket was found and update (but not cleared) // -1 if the bucket was found and cleared of all content -void SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t checksum) { +void SparseSketch::update_sparse(SparseBucket to_add, bool realloc_if_needed) { SparseBucket *tombstone = nullptr; uint16_t tombstone_pos = uint16_t(-1); - for (size_t i = 0; i < num_buckets; i++) { + for (size_t i = 0; i < sparse_capacity; i++) { auto &sparse_bucket = sparse_buckets[i]; - if (sparse_bucket.position == 0 || sparse_bucket.position == pos) { + if (sparse_bucket.position == 0 || sparse_bucket.position == to_add.position) { // We apply our update here! - if (sparse_bucket.position == pos) { + if (sparse_bucket.position == to_add.position) { // we update bucket - Bucket_Boruvka::update(sparse_bucket.bkt, update_idx, checksum); + sparse_bucket.bkt.alpha ^= to_add.bkt.alpha; + sparse_bucket.bkt.gamma ^= to_add.bkt.gamma; // did we clear it out? if (Bucket_Boruvka::is_empty(sparse_bucket.bkt)) { sparse_bucket.position = tombstone_pos; // mark it as tombstone number_of_sparse_buckets -= 1; + if (realloc_if_needed) reallocate_if_needed(-1); } return; } else { if (tombstone != nullptr) { // use the tombstone - tombstone->position = pos; - Bucket_Boruvka::update(tombstone->bkt, update_idx, checksum); + *tombstone = to_add; } else { - sparse_bucket.position = pos; - Bucket_Boruvka::update(sparse_bucket.bkt, update_idx, checksum); + sparse_bucket = to_add; } // we created a new sparse bucket number_of_sparse_buckets += 1; + if (realloc_if_needed) reallocate_if_needed(1); return; } } else if (sparse_bucket.position == tombstone_pos && tombstone == nullptr) { tombstone = &sparse_bucket; - number_of_sparse_buckets += 1; - return; } } + if (tombstone != nullptr) { + // use the tombstone + *tombstone = to_add; + number_of_sparse_buckets += 1; // we created a new sparse bucket + if (realloc_if_needed) reallocate_if_needed(1); + return; + } + // this is an error! std::cout << "num_sparse: " << number_of_sparse_buckets << std::endl; std::cout << "capacity: " << sparse_capacity << std::endl; @@ -178,14 +224,16 @@ void SparseSketch::update_sparse(uint16_t pos, vec_t update_idx, vec_hash_t chec // TODO: Do we want to include this column thing? SketchSample SparseSketch::sample_sparse(size_t first_col, size_t end_col) { for (size_t i = 0; i < sparse_capacity; i++) { - if (size_t(sparse_buckets[i].position >> 8) >= first_col && - size_t(sparse_buckets[i].position >> 8) < end_col && + if (size_t(sparse_buckets[i].col()) >= first_col && + size_t(sparse_buckets[i].col()) < end_col && Bucket_Boruvka::is_good(sparse_buckets[i].bkt, checksum_seed())) { + // std::cout << "Found GOOD sparse bucket" << std::endl; return {sparse_buckets[i].bkt.alpha, GOOD}; } } // We could not find a good bucket + // std::cout << "Sketch FAIL" << std::endl; return {0, FAIL}; } @@ -203,10 +251,7 @@ void SparseSketch::update(const vec_t update_idx) { likely_if(depth < num_dense_rows) { Bucket_Boruvka::update(bucket(i, depth), update_idx, checksum); } else { - update_sparse((i << 8) | depth, update_idx, checksum); - - // based upon this update to sparse matrix, check if we need to reallocate dense region - reallocate_if_needed(); + update_sparse({uint16_t((i << 8) | depth), {update_idx, checksum}}); } } } @@ -222,6 +267,7 @@ void SparseSketch::zero_contents() { buckets[i].gamma = 0; } reset_sample_state(); + number_of_sparse_buckets = 0; } SketchSample SparseSketch::sample() { @@ -232,21 +278,38 @@ SketchSample SparseSketch::sample() { size_t idx = sample_idx++; size_t first_column = idx * cols_per_sample; - if (Bucket_Boruvka::is_empty(deterministic_bucket())) + // std::cout << "Sampling sketch" << std::endl; + // std::cout << "first_col = " << first_column << std::endl; + // std::cout << "end_col = " << first_column + cols_per_sample << std::endl; + // std::cout << *this << std::endl; + + if (Bucket_Boruvka::is_empty(deterministic_bucket())) { + // std::cout << "ZERO!" << std::endl; return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return + } - if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) + if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { + // std::cout << "Deterministic GOOD" << std::endl; return {deterministic_bucket().alpha, GOOD}; + } + + // Sample sparse region + SketchSample sample = sample_sparse(first_column, first_column + cols_per_sample); + if (sample.result == GOOD) { + return sample; + } - for (size_t i = 0; i < cols_per_sample; ++i) { - for (size_t j = 0; j < num_dense_rows; ++j) { - if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) - return {bucket(i + first_column, j).alpha, GOOD}; + for (size_t c = 0; c < cols_per_sample; ++c) { + for (size_t r = 0; r < num_dense_rows; ++r) { + if (Bucket_Boruvka::is_good(bucket(c + first_column, r), checksum_seed())) { + // std::cout << "Found GOOD dense bucket" << std::endl; + return {bucket(c + first_column, r).alpha, GOOD}; + } } } // Sample sparse region - return sample_sparse(first_column, first_column + cols_per_sample); + return {0, FAIL}; } ExhaustiveSketchSample SparseSketch::exhaustive_sample() { @@ -266,10 +329,10 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { return {ret, GOOD}; } - for (size_t i = 0; i < cols_per_sample; ++i) { - for (size_t j = 0; j < bkt_per_col; ++j) { - unlikely_if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) { - ret.insert(bucket(i + first_column, j).alpha); + for (size_t c = 0; c < cols_per_sample; ++c) { + for (size_t r = 0; r < num_dense_rows; ++r) { + unlikely_if (Bucket_Boruvka::is_good(bucket(c + first_column, r), checksum_seed())) { + ret.insert(bucket(c + first_column, r).alpha); } } } @@ -286,12 +349,38 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { } void SparseSketch::merge(const SparseSketch &other) { - for (size_t i = 0; i < num_buckets; ++i) { - buckets[i].alpha ^= other.buckets[i].alpha; - buckets[i].gamma ^= other.buckets[i].gamma; + deterministic_bucket().alpha ^= other.deterministic_bucket().alpha; + deterministic_bucket().gamma ^= other.deterministic_bucket().gamma; + + // merge all dense buckets from other sketch into this one + for (size_t c = 0; c < num_columns; c++) { + for (size_t r = 0; r < other.num_dense_rows; ++r) { + if (r < num_dense_rows) { + bucket(c, r).alpha ^= other.bucket(c, r).alpha; + bucket(c, r).gamma ^= other.bucket(c, r).gamma; + } else if (!Bucket_Boruvka::is_empty(other.bucket(c, r))) { + SparseBucket sparse_bkt; + sparse_bkt.set_col(c); + sparse_bkt.set_row(r); + sparse_bkt.bkt = other.bucket(c, r); + update_sparse(sparse_bkt); + } + } } - // TODO: Handle sparse stuff! + // Merge all sparse buckets from other sketch into this one + for (size_t i = 0; i < other.sparse_capacity; i++) { + const auto &oth_sparse_bkt = other.sparse_buckets[i]; + if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0) { + if (oth_sparse_bkt.row() < num_dense_rows) { + auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); + bkt.alpha ^= oth_sparse_bkt.bkt.alpha; + bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + } else { + update_sparse(oth_sparse_bkt); + } + } + } } void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, size_t n_samples) { @@ -312,45 +401,85 @@ void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, s size_t start_column = start_sample * cols_per_sample; size_t end_column = (start_sample + n_samples) * cols_per_sample; - for (size_t i = start_column; i < end_column; i++) { - for (size_t j = 0; j < bkt_per_col; j++) { - bucket(i, j).alpha ^= other.bucket(i, j).alpha; - bucket(i, j).gamma ^= other.bucket(i, j).gamma; + // merge all their dense buckets into us + for (size_t c = start_column; c < end_column; c++) { + for (size_t r = 0; r < other.num_dense_rows; r++) { + if (r < num_dense_rows) { + bucket(c, r).alpha ^= other.bucket(c, r).alpha; + bucket(c, r).gamma ^= other.bucket(c, r).gamma; + } else if (!Bucket_Boruvka::is_empty(other.bucket(c, r))) { + SparseBucket sparse_bkt; + sparse_bkt.set_col(c); + sparse_bkt.set_row(r); + sparse_bkt.bkt = other.bucket(c, r); + update_sparse(sparse_bkt); + } } } - // TODO: Handle sparse! + // Merge all sparse buckets from other sketch's columns into this one + for (size_t i = 0; i < other.sparse_capacity; i++) { + const auto &oth_sparse_bkt = other.sparse_buckets[i]; + if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0 && + oth_sparse_bkt.col() >= start_column && oth_sparse_bkt.col() < end_column) { + if (oth_sparse_bkt.row() < num_dense_rows) { + auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); + bkt.alpha ^= oth_sparse_bkt.bkt.alpha; + bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + } else { + update_sparse(oth_sparse_bkt); + } + } + } } -void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { - for (size_t i = 0; i < num_buckets; i++) { - buckets[i].alpha ^= raw_buckets[i].alpha; - buckets[i].gamma ^= raw_buckets[i].gamma; +void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_raw_buckets) { + size_t num_merge_dense_rows = (n_raw_buckets - sparse_data_size - 1) / num_columns; + const SparseBucket *raw_sparse = + (const SparseBucket *) &raw_buckets[num_columns * num_merge_dense_rows + 1]; + + deterministic_bucket().alpha ^= raw_buckets[0].alpha; + deterministic_bucket().gamma ^= raw_buckets[0].gamma; + + for (size_t c = 0; c < num_columns; c++) { + for (size_t r = 0; r < num_merge_dense_rows; r++) { + if (r < num_dense_rows) { + bucket(c, r).alpha ^= raw_buckets[position_func(c, r, num_merge_dense_rows)].alpha; + bucket(c, r).gamma ^= raw_buckets[position_func(c, r, num_merge_dense_rows)].gamma; + } else if (!Bucket_Boruvka::is_empty( + raw_buckets[position_func(c, r, num_merge_dense_rows)])) { + SparseBucket sparse_bkt; + sparse_bkt.set_col(c); + sparse_bkt.set_row(r); + sparse_bkt.bkt = raw_buckets[position_func(c, r, num_merge_dense_rows)]; + update_sparse(sparse_bkt); + } + } } - // TODO: Handle sparse + for (size_t i = 0; i < sparse_capacity; i++) { + const auto &oth_sparse_bkt = raw_sparse[i]; + if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0) { + if (oth_sparse_bkt.row() < num_dense_rows) { + auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); + bkt.alpha ^= oth_sparse_bkt.bkt.alpha; + bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + } else { + update_sparse(oth_sparse_bkt); + } + } + } } void SparseSketch::serialize(std::ostream &binary_out) const { binary_out.write((char*) buckets, bucket_array_bytes()); - - // TODO: Handle sparse } bool operator==(const SparseSketch &sketch1, const SparseSketch &sketch2) { if (sketch1.num_buckets != sketch2.num_buckets || sketch1.seed != sketch2.seed) return false; - for (size_t i = 0; i < sketch1.num_buckets; ++i) { - if (sketch1.buckets[i].alpha != sketch2.buckets[i].alpha || - sketch1.buckets[i].gamma != sketch2.buckets[i].gamma) { - return false; - } - } - - // TODO: Handle sparse - - return true; + return memcmp(sketch1.buckets, sketch2.buckets, sketch1.bucket_array_bytes()) == 0; } std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { @@ -362,7 +491,7 @@ std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; for (unsigned i = 0; i < sketch.num_columns; ++i) { - for (unsigned j = 0; j < sketch.bkt_per_col; ++j) { + for (unsigned j = 0; j < sketch.num_dense_rows; ++j) { Bucket bkt = sketch.bucket(i, j); vec_t a = bkt.alpha; vec_hash_t c = bkt.gamma; @@ -372,5 +501,15 @@ std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { } os << std::endl; } + + os << "Sparse Buckets" << std::endl; + const auto sparse_buckets = sketch.sparse_buckets; + for (size_t i = 0; i < sketch.sparse_capacity; i++) { + bool good = Bucket_Boruvka::is_good(sparse_buckets[i].bkt, sketch.checksum_seed()); + os << " p:" << sparse_buckets[i].col() << ", " << sparse_buckets[i].row() + << ":= a:" << sparse_buckets[i].bkt.alpha << " c:" << sparse_buckets[i].bkt.gamma + << (good ? " good" : " bad") << std::endl; + } + os << std::endl; return os; } diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index f782e7c1..7ff15c37 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -102,7 +102,6 @@ void test_sketch_sample(unsigned long num_sketches, SampleResult ret_code = query_ret.result; if (ret_code == GOOD) { - //Multiple queries shouldn't happen, but if we do get here fail test ASSERT_LT(res_idx, vec_size) << "Sampled index out of bounds"; if (!test_vec.get_entry(res_idx)) { //Undetected sample error @@ -177,6 +176,7 @@ void test_sketch_merge(unsigned long num_sketches, ASSERT_LT(res_idx, vec_size) << "Sampled index out of bounds"; if (test_vec1.get_entry(res_idx) == test_vec2.get_entry(res_idx)) { sample_incorrect_failures++; + exit(EXIT_FAILURE); } } else if (ret_code == ZERO) { @@ -189,6 +189,8 @@ void test_sketch_merge(unsigned long num_sketches, } if (!vec_zero) { sample_incorrect_failures++; + std::cout << "GOT INCORRECT ZERO!" << std::endl; + exit(EXIT_FAILURE); } } else { // sketch failed @@ -330,7 +332,7 @@ TEST(SketchTestSuite, TestSerialization) { file.close(); auto in_file = std::fstream("./out_sketch.txt", std::ios::in | std::ios::binary); - Sketch reheated(vec_size, seed, in_file, 3, num_columns); + Sketch reheated(vec_size, seed, in_file, sketch.get_buckets(), 3, num_columns); ASSERT_EQ(sketch, reheated); } @@ -455,7 +457,7 @@ TEST(SketchTestSuite, TestRawBucketUpdate) { const Bucket *data = sk1.get_readonly_bucket_ptr(); - sk2.merge_raw_bucket_buffer(data); + sk2.merge_raw_bucket_buffer(data, sk1.get_buckets()); SketchSample sample = sk2.sample(); @@ -468,7 +470,7 @@ TEST(SketchTestSuite, TestRawBucketUpdate) { Bucket *copy_data = new Bucket[sk1.get_buckets()]; memcpy(copy_data, data, sk1.bucket_array_bytes()); - sk2.merge_raw_bucket_buffer(copy_data); + sk2.merge_raw_bucket_buffer(copy_data, sk1.get_buckets()); sk2.reset_sample_state(); sample = sk2.sample(); From 6931393830752ef271e7cac7a86967181c3eb78a Mon Sep 17 00:00:00 2001 From: Evan West Date: Sat, 15 Mar 2025 19:13:09 -0400 Subject: [PATCH 07/14] optimize and make work --- include/bucket.h | 18 --- include/sparse_sketch.h | 88 +++++----- src/sparse_sketch.cpp | 346 ++++++++++++++++++++++++---------------- test/sketch_test.cpp | 9 ++ 4 files changed, 264 insertions(+), 197 deletions(-) diff --git a/include/bucket.h b/include/bucket.h index 070695b1..cccf67fe 100644 --- a/include/bucket.h +++ b/include/bucket.h @@ -9,24 +9,6 @@ struct Bucket { vec_t alpha; vec_hash_t gamma; }; -struct SparseBucket { - uint16_t position; // (col << 8) | row - Bucket bkt; - - // TODO: Use these functions and also maybe optimize - inline uint16_t col() const { - return position >> 8; - } - inline uint16_t row() const { - return position & 0xFF; - } - inline void set_col(uint16_t col) { - position = (col << 8) + row(); - } - inline void set_row(uint16_t row) { - position = (col() << 8) + row; - } -}; #pragma pack(pop) namespace Bucket_Boruvka { diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 70116a56..636d6e1b 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -13,6 +13,14 @@ #include "bucket.h" #include "sketch_types.h" +#pragma pack(push,1) +struct SparseBucket { + uint8_t next; // index of next sparse bucket in this column + uint8_t row; // row of sparse bucket + Bucket bkt; // actual bucket content +}; +#pragma pack(pop) + // TODO: Do we want to use row major or column major order? // So the advantage of row-major is that we can update faster. Most updates will only touch // first few rows of data-structure. However, could slow down queries. (Although most query @@ -20,51 +28,30 @@ // if column-major then the column we are merging is contig, if not, then not. // A: Keep column-major for the moment, performance evaluation later. - -// TODO: How do we want to handle raw_bucket_merge() and get_readonly_bucket_ptr()? -// These functions are nice for performance because we can skip serialization but aren't -// strictly necessary. -// A: Make function to get size in bytes of bucket data and have the 'hash table' be contig with -// the bucket data. This way we can still use these functions. - - -// TODO: It would be nice to preallocate the structure if we know how big its probably going to be. -// This would be helpful for delta sketches for example. -// A: Yeah do this - - -// TODO: What are we doing with the num_buckets variable? Could be nice to just be the size of -// buckets array. Could also be upperbound on the size. -// A: Need two variables. Both the current number of buckets (rows) allocated AND the maximum. - -// A strategy that could work well would be to allocate a chunk of memory some of which is given to -// the dense region of the sketch and 3 * sizeof(uint64_t) are given to sparse region. -// 3 -> position, alpha, gamma (could save a little more space by using 16 bits for position) - -/* Memory Allocation of a Sketch. Contiguous - _________________________________________________________________________________________ -| Dense | Sparse | -| Sketch | Bucket | -| Buckets | Region (hash-table) | -| log n * log z buckets | clog n buckets | -|__________________________________________________________|______________________________| +/* Memory Allocation of a SparseSketch. Contiguous (only roughly to scale). + Where z is number of non-zero elements in vector we are sketching. + _________________________________________________________________________________________________ +| Dense | Sparse | Linked List | +| Bucket | Bucket | Metadata | +| Region | Region | for Sparse bkts | +| log n * log z buckets | clog n buckets | clogn/16 buckets | +|_________________________________________________|____________________________|__________________| */ /** - * Sketch for graph processing, either CubeSketch or CameoSketch. + * SparseSketch for graph processing * Sub-linear representation of a vector. */ class SparseSketch { private: - const uint64_t seed; // seed for hash functions - size_t num_samples; // number of samples we can perform - size_t cols_per_sample; // number of columns to use on each sample - size_t num_columns; // Total number of columns. (product of above 2) - size_t bkt_per_col; // maximum number of buckets per column (max number of rows) - size_t num_buckets; // number of total buckets - // (either product of above two or col * dense_rows + sparse_capacity) + const uint64_t seed; // seed for hash functions + const size_t num_samples; // number of samples we can perform + const size_t cols_per_sample; // number of columns to use on each sample + const size_t num_columns; // Total number of columns. (product of above 2) + const size_t bkt_per_col; // maximum number of buckets per column (max number of rows) - size_t sample_idx = 0; // number of samples performed so far + size_t num_buckets; // number of total buckets (col * dense_rows + sparse_capacity) + size_t sample_idx = 0; // number of samples performed so far // Allocated buckets Bucket* buckets; @@ -74,8 +61,9 @@ class SparseSketch { // Variables for sparse representation of lower levels of bucket Matrix // TODO: evaluate implications of this constant - static constexpr double sparse_bucket_constant = 3; // constant factor c (see above) + static constexpr double sparse_bucket_constant = 3; // constant factor c (see diagram) SparseBucket* sparse_buckets; // a pointer into the buckets array + uint8_t *ll_metadata; // pointer to heads of column LLs size_t number_of_sparse_buckets = 0; // cur number of sparse buckets size_t sparse_capacity = sparse_bucket_constant * num_columns; // max number of sparse buckets @@ -85,11 +73,12 @@ class SparseSketch { void reallocate_if_needed(int delta); void dense_realloc(size_t new_num_dense_rows); - // This variable lets us know how many Buckets to allocate to make space for the SparseBuckets - // that will be using that space + // These variables let us know how many Buckets to allocate to make space for the SparseBuckets + // and the LL metadata that will use that space size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket)); + size_t ll_metadata_size = ceil((double(num_columns) + 1) * sizeof(uint8_t) / sizeof(Bucket)); - void update_sparse(SparseBucket to_add, bool realloc_if_needed = true); + void update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed = true); SketchSample sample_sparse(size_t first_col, size_t end_col); inline Bucket& deterministic_bucket() { @@ -113,6 +102,23 @@ class SparseSketch { return buckets[position_func(col, row, num_dense_rows)]; } + size_t calc_num_buckets(size_t new_num_dense_rows) { + return num_columns * new_num_dense_rows + sparse_data_size + ll_metadata_size + 1; + } + + size_t calc_sparse_index(size_t rows) { + return num_columns * rows + 1; + } + + size_t calc_metadata_index(size_t rows) { + return num_columns * rows + sparse_data_size + 1; + } + + void upd_sparse_ptrs() { + sparse_buckets = (SparseBucket *) &buckets[calc_sparse_index(num_dense_rows)]; + ll_metadata = (uint8_t *) &buckets[calc_metadata_index(num_dense_rows)]; + } + public: /** * The below constructors use vector length as their input. However, in graph sketching our input diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp index 712be299..21de0673 100644 --- a/src/sparse_sketch.cpp +++ b/src/sparse_sketch.cpp @@ -13,15 +13,28 @@ SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, size_t _samples, siz bkt_per_col(calc_bkt_per_col(vector_len)) { // plus 1, deterministic bucket - num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; + num_buckets = calc_num_buckets(num_dense_rows); buckets = new Bucket[num_buckets]; - sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; + upd_sparse_ptrs(); // initialize bucket values for (size_t i = 0; i < num_buckets; ++i) { buckets[i].alpha = 0; buckets[i].gamma = 0; } + + // initialize sparse bucket linked lists + // every bucket is currently free, so each points to next + for (size_t i = 0; i < sparse_capacity; i++) { + sparse_buckets[i].next = i + 1; + } + sparse_buckets[sparse_capacity - 1].next = uint8_t(-1); + + // initialize LL metadata + for (size_t i = 0; i < num_columns; i++) { + ll_metadata[i] = uint8_t(-1); // head of each column points nowhere (empty) + } + ll_metadata[num_columns] = 0; // free list head } SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, @@ -34,7 +47,7 @@ SparseSketch::SparseSketch(vec_t vector_len, uint64_t seed, std::istream &binary num_buckets(num_buckets) { buckets = new Bucket[num_buckets]; num_dense_rows = (num_buckets - sparse_data_size) / num_columns; - sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; + upd_sparse_ptrs(); // Read the serialized Sketch contents binary_in.read((char *)buckets, bucket_array_bytes()); @@ -49,7 +62,7 @@ SparseSketch::SparseSketch(const SparseSketch &s) num_buckets(s.num_buckets), num_dense_rows(s.num_dense_rows) { buckets = new Bucket[num_buckets]; - sparse_buckets = (SparseBucket *) &buckets[num_columns * num_dense_rows + 1]; + upd_sparse_ptrs(); std::memcpy(buckets, s.buckets, bucket_array_bytes()); } @@ -65,14 +78,14 @@ void SparseSketch::dense_realloc(size_t new_num_dense_rows) { // we are performing a reallocation const size_t old_rows = num_dense_rows; SparseBucket *old_sparse_pointer = sparse_buckets; - Bucket *new_buckets; + Bucket *old_buckets = buckets; if (new_num_dense_rows < min_num_dense_rows) { throw std::runtime_error("new_num_dense_rows too small!"); } if (new_num_dense_rows < num_dense_rows) { - // std::cout << "Shrinking to " << new_num_dense_rows << " from " << old_rows << std::endl; + // std::cerr << "Shrinking to " << new_num_dense_rows << " from " << old_rows << std::endl; // shrink dense region // Scan over the rows we are removing and add all those buckets to sparse for (size_t c = 0; c < num_columns; c++) { @@ -80,70 +93,70 @@ void SparseSketch::dense_realloc(size_t new_num_dense_rows) { Bucket bkt = bucket(c, r); if (!Bucket_Boruvka::is_empty(bkt)) { SparseBucket new_sparse; - new_sparse.set_col(c); - new_sparse.set_row(r); + new_sparse.row = r; new_sparse.bkt = bkt; - update_sparse(new_sparse, false); + update_sparse(c, new_sparse, false); } } } // Allocate new memory num_dense_rows = new_num_dense_rows; - num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; - new_buckets = new Bucket[num_buckets]; + num_buckets = calc_num_buckets(num_dense_rows); + buckets = new Bucket[num_buckets]; } else { - // std::cout << "Growing to " << new_num_dense_rows << " from " << old_rows << std::endl; + // std::cerr << "Growing to " << new_num_dense_rows << " from " << old_rows << std::endl; // grow dense region by 1 row // Allocate new memory num_dense_rows = new_num_dense_rows; - num_buckets = num_columns * num_dense_rows + sparse_data_size + 1; - new_buckets = new Bucket[num_buckets]; + num_buckets = calc_num_buckets(num_dense_rows); + buckets = new Bucket[num_buckets]; // initialize new rows to zero for (size_t c = 0; c < num_columns; c++) { for (size_t r = old_rows; r < num_dense_rows; r++) { - new_buckets[position_func(c, r, num_dense_rows)] = {0, 0}; + buckets[position_func(c, r, num_dense_rows)] = {0, 0}; } } } - sparse_buckets = (SparseBucket *) &new_buckets[num_columns * num_dense_rows + 1]; + upd_sparse_ptrs(); // Copy dense content - new_buckets[0] = deterministic_bucket(); + buckets[0] = old_buckets[0]; for (size_t c = 0; c < num_columns; c++) { for (size_t r = 0; r < std::min(num_dense_rows, old_rows); r++) { - new_buckets[position_func(c, r, num_dense_rows)] = buckets[position_func(c, r, old_rows)]; + buckets[position_func(c, r, num_dense_rows)] = old_buckets[position_func(c, r, old_rows)]; } } // sparse contents - memcpy(sparse_buckets, old_sparse_pointer, sparse_capacity * sizeof(SparseBucket)); - + memcpy(sparse_buckets, old_sparse_pointer, + (sparse_data_size + ll_metadata_size) * sizeof(Bucket)); if (num_dense_rows > old_rows) { // We growing // Scan sparse buckets and move all updates of depth num_dense_rows-1 // to the new dense row - for (size_t i = 0; i < sparse_capacity; i++) { - // std::cout << "sparse_bucket = " << sparse_buckets[i].col() << ", " << sparse_buckets[i].row() - // << ": " << sparse_buckets[i].bkt.alpha << ", " << sparse_buckets[i].bkt.gamma - // << std::endl; - if (sparse_buckets[i].row() < num_dense_rows && sparse_buckets[i].position != 0) { - size_t col = sparse_buckets[i].col(); - size_t row = sparse_buckets[i].row(); - assert(Bucket_Boruvka::is_empty(new_buckets[position_func(col, row, num_dense_rows)])); - new_buckets[position_func(col, row, num_dense_rows)] = sparse_buckets[i].bkt; - sparse_buckets[i].position = uint16_t(-1); // tombstone - sparse_buckets[i].bkt = {0, 0}; // clear out tombstone + for (size_t c = 0; c < num_columns; c++) { + while (ll_metadata[c] != uint8_t(-1) && sparse_buckets[ll_metadata[c]].row < num_dense_rows) { + // remove this bucket from column ll + uint8_t idx = ll_metadata[c]; + ll_metadata[c] = sparse_buckets[ll_metadata[c]].next; number_of_sparse_buckets -= 1; - // std::cout << "Moving to dense!" << std::endl; + + // add this bucket to dense region + bucket(c, sparse_buckets[idx].row) = sparse_buckets[idx].bkt; + + // add this sparse_bucket to free list + sparse_buckets[idx].bkt = {0, 0}; + sparse_buckets[idx].row = 0; + sparse_buckets[idx].next = ll_metadata[num_columns]; + ll_metadata[num_columns] = idx; } } } // 4. Clean up - std::swap(buckets, new_buckets); - delete[] new_buckets; + delete[] old_buckets; } void SparseSketch::reallocate_if_needed(int delta) { @@ -169,66 +182,75 @@ void SparseSketch::reallocate_if_needed(int delta) { // +1 if we added a new bucket value // 0 if the bucket was found and update (but not cleared) // -1 if the bucket was found and cleared of all content -void SparseSketch::update_sparse(SparseBucket to_add, bool realloc_if_needed) { - SparseBucket *tombstone = nullptr; - uint16_t tombstone_pos = uint16_t(-1); - for (size_t i = 0; i < sparse_capacity; i++) { - auto &sparse_bucket = sparse_buckets[i]; - if (sparse_bucket.position == 0 || sparse_bucket.position == to_add.position) { - // We apply our update here! - if (sparse_bucket.position == to_add.position) { - // we update bucket - sparse_bucket.bkt.alpha ^= to_add.bkt.alpha; - sparse_bucket.bkt.gamma ^= to_add.bkt.gamma; - - // did we clear it out? - if (Bucket_Boruvka::is_empty(sparse_bucket.bkt)) { - sparse_bucket.position = tombstone_pos; // mark it as tombstone - number_of_sparse_buckets -= 1; - if (realloc_if_needed) reallocate_if_needed(-1); - } - return; - } else { - if (tombstone != nullptr) { - // use the tombstone - *tombstone = to_add; +void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed) { + uint8_t next_ptr = ll_metadata[col]; + uint8_t prev = uint8_t(-1); + while (next_ptr != uint8_t(-1)) { + if (sparse_buckets[next_ptr].row == to_add.row) { + sparse_buckets[next_ptr].bkt.alpha ^= to_add.bkt.alpha; + sparse_buckets[next_ptr].bkt.gamma ^= to_add.bkt.gamma; + if (Bucket_Boruvka::is_empty(sparse_buckets[next_ptr].bkt)) { + // remove this bucket from column list + if (prev == uint8_t(-1)) { + ll_metadata[col] = sparse_buckets[next_ptr].next; } else { - sparse_bucket = to_add; + sparse_buckets[prev].next = sparse_buckets[next_ptr].next; } + number_of_sparse_buckets -= 1; + + // add this bucket to free list + sparse_buckets[next_ptr].next = ll_metadata[num_columns]; + ll_metadata[num_columns] = next_ptr; - // we created a new sparse bucket - number_of_sparse_buckets += 1; - if (realloc_if_needed) reallocate_if_needed(1); - return; + if (realloc_if_needed) reallocate_if_needed(-1); } - } else if (sparse_bucket.position == tombstone_pos && tombstone == nullptr) { - tombstone = &sparse_bucket; + return; // we've done it! + } else if (sparse_buckets[next_ptr].row > to_add.row) { + break; } + prev = next_ptr; + next_ptr = sparse_buckets[next_ptr].next; } - if (tombstone != nullptr) { - // use the tombstone - *tombstone = to_add; - number_of_sparse_buckets += 1; // we created a new sparse bucket - if (realloc_if_needed) reallocate_if_needed(1); - return; + + // pull a bucket off the free list and set it equal to to_add + uint8_t free_bucket = ll_metadata[num_columns]; + // std::cerr << "free bucket = " << size_t(free_bucket) << std::endl; + // std::cerr << "next bucket = " << size_t(next_ptr) << std::endl; + if (free_bucket == uint8_t(-1)) { + throw std::runtime_error("Found invalid bucket index in LL"); + } + ll_metadata[num_columns] = sparse_buckets[free_bucket].next; + // std::cerr << "free head = " << size_t(ll_metadata[num_columns]) << std::endl; + + // update buffer + sparse_buckets[free_bucket] = to_add; + sparse_buckets[free_bucket].next = next_ptr; + number_of_sparse_buckets += 1; + // std::cerr << "new bucket " << size_t(sparse_buckets[free_bucket].row) << " n = " << size_t(sparse_buckets[free_bucket].next) << std::endl; + + // update column ll + if (prev == uint8_t(-1)) { + ll_metadata[col] = free_bucket; + // std::cerr << "Set column head to new bucket " << size_t(ll_metadata[col]) << std::endl; + } else { + sparse_buckets[prev].next = free_bucket; + // std::cerr << "Placed new bucket in column " << size_t(prev) << "->" << size_t(sparse_buckets[prev].next) << "->" << size_t(sparse_buckets[free_bucket].next) << std::endl; } - // this is an error! - std::cout << "num_sparse: " << number_of_sparse_buckets << std::endl; - std::cout << "capacity: " << sparse_capacity << std::endl; - throw std::runtime_error("update_sparse(): Failed to find update location!"); + if (realloc_if_needed) reallocate_if_needed(1); } // sample a good bucket from the sparse region if one exists. // Additionally, specify the column to query from -// TODO: Do we want to include this column thing? SketchSample SparseSketch::sample_sparse(size_t first_col, size_t end_col) { - for (size_t i = 0; i < sparse_capacity; i++) { - if (size_t(sparse_buckets[i].col()) >= first_col && - size_t(sparse_buckets[i].col()) < end_col && - Bucket_Boruvka::is_good(sparse_buckets[i].bkt, checksum_seed())) { - // std::cout << "Found GOOD sparse bucket" << std::endl; - return {sparse_buckets[i].bkt.alpha, GOOD}; + // std::cerr << "sample_sparse" << std::endl; + for (size_t c = first_col; c < end_col; c++) { + uint8_t idx = ll_metadata[c]; + while (idx != uint8_t(-1)) { + if (Bucket_Boruvka::is_good(sparse_buckets[idx].bkt, checksum_seed())) { + return {sparse_buckets[idx].bkt.alpha, GOOD}; + } + idx = sparse_buckets[idx].next; } } @@ -251,7 +273,7 @@ void SparseSketch::update(const vec_t update_idx) { likely_if(depth < num_dense_rows) { Bucket_Boruvka::update(bucket(i, depth), update_idx, checksum); } else { - update_sparse({uint16_t((i << 8) | depth), {update_idx, checksum}}); + update_sparse(i, {uint8_t(-1), uint8_t(depth), {update_idx, checksum}}); } } } @@ -266,6 +288,20 @@ void SparseSketch::zero_contents() { buckets[i].alpha = 0; buckets[i].gamma = 0; } + + // initialize sparse bucket linked lists + // every bucket is currently free, so each points to next + for (size_t i = 0; i < sparse_capacity; i++) { + sparse_buckets[i].next = i + 1; + } + sparse_buckets[sparse_capacity - 1].next = uint8_t(-1); + + // initialize LL metadata + for (size_t i = 0; i < num_columns; i++) { + ll_metadata[i] = uint8_t(-1); // head of each column points nowhere (empty) + } + ll_metadata[num_columns] = 0; // free list head + reset_sample_state(); number_of_sparse_buckets = 0; } @@ -300,7 +336,7 @@ SketchSample SparseSketch::sample() { } for (size_t c = 0; c < cols_per_sample; ++c) { - for (size_t r = 0; r < num_dense_rows; ++r) { + for (int r = num_dense_rows - 1; r >= 0; --r) { if (Bucket_Boruvka::is_good(bucket(c + first_column, r), checksum_seed())) { // std::cout << "Found GOOD dense bucket" << std::endl; return {bucket(c + first_column, r).alpha, GOOD}; @@ -309,6 +345,8 @@ SketchSample SparseSketch::sample() { } // Sample sparse region + // std::cout << "Sketch is bad" << std::endl; + // std::cout << *this << std::endl; return {0, FAIL}; } @@ -321,7 +359,7 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { size_t idx = sample_idx++; size_t first_column = idx * cols_per_sample; - unlikely_if (deterministic_bucket().alpha == 0 && deterministic_bucket().gamma == 0) + unlikely_if (Bucket_Boruvka::is_empty(deterministic_bucket())) return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return unlikely_if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { @@ -349,6 +387,13 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { } void SparseSketch::merge(const SparseSketch &other) { + // std::cerr << "PERFORMING A MERGE" << std::endl; + // std::cerr << *this << std::endl; + + // std::cerr << "MERGE SKETCH" << std::endl; + // std::cerr << other << std::endl; + + // merge the deterministic bucket deterministic_bucket().alpha ^= other.deterministic_bucket().alpha; deterministic_bucket().gamma ^= other.deterministic_bucket().gamma; @@ -360,25 +405,28 @@ void SparseSketch::merge(const SparseSketch &other) { bucket(c, r).gamma ^= other.bucket(c, r).gamma; } else if (!Bucket_Boruvka::is_empty(other.bucket(c, r))) { SparseBucket sparse_bkt; - sparse_bkt.set_col(c); - sparse_bkt.set_row(r); + sparse_bkt.row = r; sparse_bkt.bkt = other.bucket(c, r); - update_sparse(sparse_bkt); + update_sparse(c, sparse_bkt); } } } // Merge all sparse buckets from other sketch into this one - for (size_t i = 0; i < other.sparse_capacity; i++) { - const auto &oth_sparse_bkt = other.sparse_buckets[i]; - if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0) { - if (oth_sparse_bkt.row() < num_dense_rows) { - auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); - bkt.alpha ^= oth_sparse_bkt.bkt.alpha; - bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + for (size_t c = 0; c < num_columns; c++) { + uint8_t this_idx = ll_metadata[c]; + uint8_t oth_idx = other.ll_metadata[c]; + + while (oth_idx != uint8_t(-1)) { + if (other.sparse_buckets[oth_idx].row < num_dense_rows) { + auto &bkt = bucket(c, other.sparse_buckets[oth_idx].row); + bkt.alpha ^= other.sparse_buckets[oth_idx].bkt.alpha; + bkt.gamma ^= other.sparse_buckets[oth_idx].bkt.gamma; } else { - update_sparse(oth_sparse_bkt); + // TODO: This can be made faster by utilizing this_idx and performing a merge operation + update_sparse(c, other.sparse_buckets[oth_idx]); } + oth_idx = other.sparse_buckets[oth_idx].next; } } } @@ -389,18 +437,23 @@ void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, s sample_idx = num_samples; // sketch is in a fail state! return; } + // std::cerr << "SKETCH BEFORE MERGE" << std::endl; + // std::cerr << *this << std::endl; + + // std::cerr << "SKETCH WE MERGE WITH" << std::endl; + // std::cerr << other << std::endl; // update sample idx to point at beginning of this range if before it sample_idx = std::max(sample_idx, start_sample); + // Columns we be merging + size_t start_column = start_sample * cols_per_sample; + size_t end_column = (start_sample + n_samples) * cols_per_sample; + // merge deterministic buffer deterministic_bucket().alpha ^= other.deterministic_bucket().alpha; deterministic_bucket().gamma ^= other.deterministic_bucket().gamma; - // merge other buckets - size_t start_column = start_sample * cols_per_sample; - size_t end_column = (start_sample + n_samples) * cols_per_sample; - // merge all their dense buckets into us for (size_t c = start_column; c < end_column; c++) { for (size_t r = 0; r < other.num_dense_rows; r++) { @@ -409,64 +462,72 @@ void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, s bucket(c, r).gamma ^= other.bucket(c, r).gamma; } else if (!Bucket_Boruvka::is_empty(other.bucket(c, r))) { SparseBucket sparse_bkt; - sparse_bkt.set_col(c); - sparse_bkt.set_row(r); + sparse_bkt.row = r; sparse_bkt.bkt = other.bucket(c, r); - update_sparse(sparse_bkt); + update_sparse(c, sparse_bkt); } } } - // Merge all sparse buckets from other sketch's columns into this one - for (size_t i = 0; i < other.sparse_capacity; i++) { - const auto &oth_sparse_bkt = other.sparse_buckets[i]; - if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0 && - oth_sparse_bkt.col() >= start_column && oth_sparse_bkt.col() < end_column) { - if (oth_sparse_bkt.row() < num_dense_rows) { - auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); - bkt.alpha ^= oth_sparse_bkt.bkt.alpha; - bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + // Merge all sparse buckets from other sketch into this one + for (size_t c = start_column; c < end_column; c++) { + uint8_t this_idx = ll_metadata[c]; + uint8_t oth_idx = other.ll_metadata[c]; + + while (oth_idx != uint8_t(-1)) { + if (other.sparse_buckets[oth_idx].row < num_dense_rows) { + auto &bkt = bucket(c, other.sparse_buckets[oth_idx].row); + bkt.alpha ^= other.sparse_buckets[oth_idx].bkt.alpha; + bkt.gamma ^= other.sparse_buckets[oth_idx].bkt.gamma; } else { - update_sparse(oth_sparse_bkt); + // TODO: This can be made faster by utilizing this_idx and performing a merge operation + update_sparse(c, other.sparse_buckets[oth_idx]); } + oth_idx = other.sparse_buckets[oth_idx].next; } } + // std::cerr << "SKETCH AFTER MERGE" << std::endl; + // std::cerr << *this << std::endl; } void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_raw_buckets) { - size_t num_merge_dense_rows = (n_raw_buckets - sparse_data_size - 1) / num_columns; - const SparseBucket *raw_sparse = - (const SparseBucket *) &raw_buckets[num_columns * num_merge_dense_rows + 1]; + size_t raw_rows = (n_raw_buckets - sparse_data_size - ll_metadata_size - 1) / num_columns; + const SparseBucket *raw_sparse = (const SparseBucket *) &raw_buckets[calc_sparse_index(raw_rows)]; + const uint8_t *raw_metadata = (const uint8_t *) &raw_buckets[calc_metadata_index(raw_rows)]; deterministic_bucket().alpha ^= raw_buckets[0].alpha; deterministic_bucket().gamma ^= raw_buckets[0].gamma; for (size_t c = 0; c < num_columns; c++) { - for (size_t r = 0; r < num_merge_dense_rows; r++) { + for (size_t r = 0; r < raw_rows; r++) { if (r < num_dense_rows) { - bucket(c, r).alpha ^= raw_buckets[position_func(c, r, num_merge_dense_rows)].alpha; - bucket(c, r).gamma ^= raw_buckets[position_func(c, r, num_merge_dense_rows)].gamma; + bucket(c, r).alpha ^= raw_buckets[position_func(c, r, raw_rows)].alpha; + bucket(c, r).gamma ^= raw_buckets[position_func(c, r, raw_rows)].gamma; } else if (!Bucket_Boruvka::is_empty( - raw_buckets[position_func(c, r, num_merge_dense_rows)])) { + raw_buckets[position_func(c, r, raw_rows)])) { SparseBucket sparse_bkt; - sparse_bkt.set_col(c); - sparse_bkt.set_row(r); - sparse_bkt.bkt = raw_buckets[position_func(c, r, num_merge_dense_rows)]; - update_sparse(sparse_bkt); + sparse_bkt.row = r; + sparse_bkt.bkt = raw_buckets[position_func(c, r, raw_rows)]; + update_sparse(c, sparse_bkt); } } } - for (size_t i = 0; i < sparse_capacity; i++) { - const auto &oth_sparse_bkt = raw_sparse[i]; - if (oth_sparse_bkt.position != uint16_t(-1) && oth_sparse_bkt.position != 0) { - if (oth_sparse_bkt.row() < num_dense_rows) { - auto &bkt = bucket(oth_sparse_bkt.col(), oth_sparse_bkt.row()); - bkt.alpha ^= oth_sparse_bkt.bkt.alpha; - bkt.gamma ^= oth_sparse_bkt.bkt.gamma; + // Merge all sparse buckets from other sketch into this one + for (size_t c = 0; c < num_columns; c++) { + uint8_t this_idx = ll_metadata[c]; + uint8_t oth_idx = raw_metadata[c]; + + while (oth_idx != uint8_t(-1)) { + if (raw_sparse[oth_idx].row < num_dense_rows) { + auto &bkt = bucket(c, raw_sparse[oth_idx].row); + bkt.alpha ^= raw_sparse[oth_idx].bkt.alpha; + bkt.gamma ^= raw_sparse[oth_idx].bkt.gamma; } else { - update_sparse(oth_sparse_bkt); + // TODO: This can be made faster by utilizing this_idx and performing a merge operation + update_sparse(c, raw_sparse[oth_idx]); } + oth_idx = raw_sparse[oth_idx].next; } } } @@ -479,11 +540,12 @@ bool operator==(const SparseSketch &sketch1, const SparseSketch &sketch2) { if (sketch1.num_buckets != sketch2.num_buckets || sketch1.seed != sketch2.seed) return false; - return memcmp(sketch1.buckets, sketch2.buckets, sketch1.bucket_array_bytes()) == 0; + return memcmp(sketch1.buckets, sketch2.buckets, + sketch1.bucket_array_bytes() - sketch1.ll_metadata_size * sizeof(Bucket)) == 0; } std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { - Bucket bkt = sketch.buckets[sketch.num_buckets - 1]; + Bucket bkt = sketch.deterministic_bucket(); bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); vec_t a = bkt.alpha; vec_hash_t c = bkt.gamma; @@ -504,11 +566,19 @@ std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { os << "Sparse Buckets" << std::endl; const auto sparse_buckets = sketch.sparse_buckets; - for (size_t i = 0; i < sketch.sparse_capacity; i++) { - bool good = Bucket_Boruvka::is_good(sparse_buckets[i].bkt, sketch.checksum_seed()); - os << " p:" << sparse_buckets[i].col() << ", " << sparse_buckets[i].row() - << ":= a:" << sparse_buckets[i].bkt.alpha << " c:" << sparse_buckets[i].bkt.gamma - << (good ? " good" : " bad") << std::endl; + for (size_t c = 0; c < sketch.num_columns; c++) { + uint8_t idx = sketch.ll_metadata[c]; + while (idx != uint8_t(-1)) { + bool good = Bucket_Boruvka::is_good(sparse_buckets[idx].bkt, sketch.checksum_seed()); + os << "i: " << size_t(idx) << " n: " << size_t(sparse_buckets[idx].next) << " p:" << c << ", " + << size_t(sparse_buckets[idx].row) << " := a:" << sparse_buckets[idx].bkt.alpha + << " c:" << sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; + if (idx == sketch.sparse_buckets[idx].next) { + os << "LL error!" << std::endl; + return os; + } + idx = sketch.sparse_buckets[idx].next; + } } os << std::endl; return os; diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index 7ff15c37..80d2656b 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -220,6 +220,7 @@ TEST(SketchTestSuite, TestSketchRangeMerge) { size_t seed = get_seed(); Sketch skt1(2048, seed, 10, 3); Sketch skt2(2048, seed, 10, 3); + Sketch temp_skt(2048, seed, 10, 3); for (vec_t i = 0; i < 1024; i++) { skt1.update(i); @@ -230,23 +231,31 @@ TEST(SketchTestSuite, TestSketchRangeMerge) { vec_t good_2 = 1024; vec_t good_3 = good_2 + 255; + temp_skt.merge(skt1); + skt1.range_merge(skt2, 0, 1); SketchSample sample = skt1.sample(); if (sample.result == GOOD) { ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); } + skt1.zero_contents(); + skt1.merge(temp_skt); skt1.range_merge(skt2, 1, 1); sample = skt1.sample(); if (sample.result == GOOD) { ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); } + skt1.zero_contents(); + skt1.merge(temp_skt); skt1.range_merge(skt2, 2, 1); sample = skt1.sample(); if (sample.result == GOOD) { ASSERT_TRUE(sample.idx <= good_1 || (sample.idx >= good_2 && sample.idx <= good_3)); } + skt1.zero_contents(); + skt1.merge(temp_skt); skt1.range_merge(skt2, 3, 1); sample = skt1.sample(); From 0936d78cce0cf2d92b09d91426d1e87b4a853b45 Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 20 Mar 2025 16:56:06 -0400 Subject: [PATCH 08/14] optimizations, still debugging --- include/sketch_types.h | 12 ++- include/sparse_sketch.h | 48 ++++++++- src/cc_sketch_alg.cpp | 36 +++---- src/dense_sketch.cpp | 6 +- src/sparse_sketch.cpp | 234 +++++++++++++++++++++++++++++++--------- test/sketch_test.cpp | 16 +-- 6 files changed, 262 insertions(+), 90 deletions(-) diff --git a/include/sketch_types.h b/include/sketch_types.h index 725e7c68..19f67b9f 100644 --- a/include/sketch_types.h +++ b/include/sketch_types.h @@ -1,4 +1,9 @@ #pragma once + +#include +#include + +#include "types.h" // enum SerialType { // FULL, // RANGE, @@ -17,19 +22,18 @@ struct SketchSample { }; struct ExhaustiveSketchSample { - std::unordered_set idxs; + std::vector idxs; SampleResult result; }; class OutOfSamplesException : public std::exception { private: std::string err_msg; + public: OutOfSamplesException(size_t seed, size_t num_samples, size_t sample_idx) : err_msg("This sketch (seed=" + std::to_string(seed) + ", max samples=" + std::to_string(num_samples) + ") cannot be sampled more times (cur idx=" + std::to_string(sample_idx) + ")!") {} - virtual const char* what() const throw() { - return err_msg.c_str(); - } + virtual const char* what() const throw() { return err_msg.c_str(); } }; diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 636d6e1b..89539b95 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -56,7 +56,7 @@ class SparseSketch { // Allocated buckets Bucket* buckets; - static constexpr size_t min_num_dense_rows = 4; + static constexpr size_t min_num_dense_rows = 5; size_t num_dense_rows = min_num_dense_rows; // Variables for sparse representation of lower levels of bucket Matrix @@ -81,6 +81,47 @@ class SparseSketch { void update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed = true); SketchSample sample_sparse(size_t first_col, size_t end_col); + inline uint8_t remove_ll_head(size_t col) { + uint8_t temp = ll_metadata[col]; + ll_metadata[col] = sparse_buckets[ll_metadata[col]].next; + return temp; + } + inline uint8_t claim_free_bucket() { + assert(ll_metadata[num_columns] != uint8_t(-1)); + return remove_ll_head(num_columns); + } + inline void insert_to_ll_head(size_t col, uint8_t add_idx) { + sparse_buckets[add_idx].next = ll_metadata[col]; + ll_metadata[col] = add_idx; + } + inline void free_bucket(uint8_t bkt_idx) { + sparse_buckets[bkt_idx].row = 0; + sparse_buckets[bkt_idx].bkt = {0, 0}; + insert_to_ll_head(num_columns, bkt_idx); + } + inline void insert_to_ll(uint8_t add_idx, SparseBucket &prev) { + sparse_buckets[add_idx].next = prev.next; + prev.next = add_idx; + } + inline void remove_from_ll(SparseBucket& bkt_to_remove, SparseBucket &prev) { + prev.next = bkt_to_remove.next; + } + inline bool merge_sparse_bkt(uint8_t our_idx, SparseBucket& oth, uint8_t prev_idx, size_t col) { + SparseBucket &ours = sparse_buckets[our_idx]; + ours.bkt.alpha ^= oth.bkt.alpha; + ours.bkt.gamma ^= oth.bkt.gamma; + if (Bucket_Boruvka::is_empty(ours.bkt)) { + if (prev_idx == uint8_t(-1)) + remove_ll_head(col); + else + remove_from_ll(ours, sparse_buckets[prev_idx]); + + free_bucket(our_idx); + return true; + } + return false; + } + inline Bucket& deterministic_bucket() { return buckets[0]; } @@ -119,6 +160,11 @@ class SparseSketch { ll_metadata = (uint8_t *) &buckets[calc_metadata_index(num_dense_rows)]; } + // given another SparseSketch column, merge it into ours + void merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t *oth_ll_metadata, size_t col); + + void validate(); + public: /** * The below constructors use vector length as their input. However, in graph sketching our input diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index f8e5649d..adcde659 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -466,7 +466,7 @@ inline void CCSketchAlg::create_merge_instructions(std::vector &merg } void CCSketchAlg::boruvka_emulation() { - // auto start = std::chrono::steady_clock::now(); + auto start = std::chrono::steady_clock::now(); update_locked = true; cc_alg_start = std::chrono::steady_clock::now(); @@ -486,27 +486,27 @@ void CCSketchAlg::boruvka_emulation() { } size_t round_num = 0; bool modified = true; - // std::cout << std::endl; - // std::cout << " pre boruvka processing = " - // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - // << std::endl; + std::cout << std::endl; + std::cout << " pre boruvka processing = " + << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + << std::endl; while (true) { - // std::cout << " Round: " << round_num << std::endl; - // start = std::chrono::steady_clock::now(); + std::cout << " Round: " << round_num << std::endl; + start = std::chrono::steady_clock::now(); modified = perform_boruvka_round(round_num, merge_instr, global_merges); - // std::cout << " perform_boruvka_round = " - // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - // << std::endl; + std::cout << " perform_boruvka_round = " + << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + << std::endl; if (!modified) break; // calculate updated merge instructions for next round - // start = std::chrono::steady_clock::now(); + start = std::chrono::steady_clock::now(); create_merge_instructions(merge_instr); - // std::cout << " create_merge_instructions = " - // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - // << std::endl; + std::cout << " create_merge_instructions = " + << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + << std::endl; ++round_num; } last_query_rounds = round_num; @@ -534,11 +534,11 @@ ConnectedComponents CCSketchAlg::connected_components() { bool except = false; std::exception_ptr err; try { - // auto start = std::chrono::steady_clock::now(); + auto start = std::chrono::steady_clock::now(); boruvka_emulation(); - // std::cout << " boruvka's algorithm = " - // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - // << std::endl; + std::cout << " boruvka's algorithm = " + << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + << std::endl; } catch (...) { except = true; err = std::current_exception(); diff --git a/src/dense_sketch.cpp b/src/dense_sketch.cpp index 39b4ddae..1b41cc00 100644 --- a/src/dense_sketch.cpp +++ b/src/dense_sketch.cpp @@ -124,7 +124,7 @@ ExhaustiveSketchSample DenseSketch::exhaustive_sample() { if (sample_idx >= num_samples) { throw OutOfSamplesException(seed, num_samples, sample_idx); } - std::unordered_set ret; + std::vector ret; size_t idx = sample_idx++; size_t first_column = idx * cols_per_sample; @@ -133,14 +133,14 @@ ExhaustiveSketchSample DenseSketch::exhaustive_sample() { return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return unlikely_if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { - ret.insert(deterministic_bucket().alpha); + ret.push_back(deterministic_bucket().alpha); return {ret, GOOD}; } for (size_t i = 0; i < cols_per_sample; ++i) { for (size_t j = 0; j < bkt_per_col; ++j) { unlikely_if (Bucket_Boruvka::is_good(bucket(i + first_column, j), checksum_seed())) { - ret.insert(bucket(i + first_column, j).alpha); + ret.push_back(bucket(i + first_column, j).alpha); } } } diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp index 21de0673..340c457c 100644 --- a/src/sparse_sketch.cpp +++ b/src/sparse_sketch.cpp @@ -84,6 +84,8 @@ void SparseSketch::dense_realloc(size_t new_num_dense_rows) { throw std::runtime_error("new_num_dense_rows too small!"); } + // std::cerr << *this << std::endl; + if (new_num_dense_rows < num_dense_rows) { // std::cerr << "Shrinking to " << new_num_dense_rows << " from " << old_rows << std::endl; // shrink dense region @@ -92,10 +94,11 @@ void SparseSketch::dense_realloc(size_t new_num_dense_rows) { for (size_t r = new_num_dense_rows; r < old_rows; r++) { Bucket bkt = bucket(c, r); if (!Bucket_Boruvka::is_empty(bkt)) { - SparseBucket new_sparse; - new_sparse.row = r; - new_sparse.bkt = bkt; - update_sparse(c, new_sparse, false); + uint8_t free_idx = claim_free_bucket(); + sparse_buckets[free_idx].row = r; + sparse_buckets[free_idx].bkt = bkt; + insert_to_ll_head(c, free_idx); + number_of_sparse_buckets += 1; } } } @@ -139,22 +142,20 @@ void SparseSketch::dense_realloc(size_t new_num_dense_rows) { for (size_t c = 0; c < num_columns; c++) { while (ll_metadata[c] != uint8_t(-1) && sparse_buckets[ll_metadata[c]].row < num_dense_rows) { // remove this bucket from column ll - uint8_t idx = ll_metadata[c]; - ll_metadata[c] = sparse_buckets[ll_metadata[c]].next; + uint8_t idx = remove_ll_head(c); number_of_sparse_buckets -= 1; // add this bucket to dense region bucket(c, sparse_buckets[idx].row) = sparse_buckets[idx].bkt; // add this sparse_bucket to free list - sparse_buckets[idx].bkt = {0, 0}; - sparse_buckets[idx].row = 0; - sparse_buckets[idx].next = ll_metadata[num_columns]; - ll_metadata[num_columns] = idx; + free_bucket(idx); } } } + // std::cerr << *this << std::endl; + // 4. Clean up delete[] old_buckets; } @@ -187,24 +188,12 @@ void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_ uint8_t prev = uint8_t(-1); while (next_ptr != uint8_t(-1)) { if (sparse_buckets[next_ptr].row == to_add.row) { - sparse_buckets[next_ptr].bkt.alpha ^= to_add.bkt.alpha; - sparse_buckets[next_ptr].bkt.gamma ^= to_add.bkt.gamma; - if (Bucket_Boruvka::is_empty(sparse_buckets[next_ptr].bkt)) { - // remove this bucket from column list - if (prev == uint8_t(-1)) { - ll_metadata[col] = sparse_buckets[next_ptr].next; - } else { - sparse_buckets[prev].next = sparse_buckets[next_ptr].next; - } + bool removed = merge_sparse_bkt(next_ptr, to_add, prev, col); + if (removed) { number_of_sparse_buckets -= 1; - - // add this bucket to free list - sparse_buckets[next_ptr].next = ll_metadata[num_columns]; - ll_metadata[num_columns] = next_ptr; - if (realloc_if_needed) reallocate_if_needed(-1); } - return; // we've done it! + return; } else if (sparse_buckets[next_ptr].row > to_add.row) { break; } @@ -213,27 +202,22 @@ void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_ } // pull a bucket off the free list and set it equal to to_add - uint8_t free_bucket = ll_metadata[num_columns]; + uint8_t free_bucket = claim_free_bucket(); // std::cerr << "free bucket = " << size_t(free_bucket) << std::endl; // std::cerr << "next bucket = " << size_t(next_ptr) << std::endl; - if (free_bucket == uint8_t(-1)) { - throw std::runtime_error("Found invalid bucket index in LL"); - } - ll_metadata[num_columns] = sparse_buckets[free_bucket].next; // std::cerr << "free head = " << size_t(ll_metadata[num_columns]) << std::endl; - // update buffer + // update bucket sparse_buckets[free_bucket] = to_add; - sparse_buckets[free_bucket].next = next_ptr; number_of_sparse_buckets += 1; // std::cerr << "new bucket " << size_t(sparse_buckets[free_bucket].row) << " n = " << size_t(sparse_buckets[free_bucket].next) << std::endl; // update column ll if (prev == uint8_t(-1)) { - ll_metadata[col] = free_bucket; + insert_to_ll_head(col, free_bucket); // std::cerr << "Set column head to new bucket " << size_t(ll_metadata[col]) << std::endl; } else { - sparse_buckets[prev].next = free_bucket; + insert_to_ll(free_bucket, sparse_buckets[prev]); // std::cerr << "Placed new bucket in column " << size_t(prev) << "->" << size_t(sparse_buckets[prev].next) << "->" << size_t(sparse_buckets[free_bucket].next) << std::endl; } @@ -277,13 +261,14 @@ void SparseSketch::update(const vec_t update_idx) { } } } + + validate(); } // TODO: Switch the L0_SAMPLING flag to instead affect query procedure. // (Only use deepest bucket. We don't need the alternate update procedure in the code anymore.) void SparseSketch::zero_contents() { - // TODO: Should we also set the size of this bucket back to an initial state? for (size_t i = 0; i < num_buckets; i++) { buckets[i].alpha = 0; buckets[i].gamma = 0; @@ -304,6 +289,8 @@ void SparseSketch::zero_contents() { reset_sample_state(); number_of_sparse_buckets = 0; + // if (num_dense_rows > min_num_dense_rows + 4) + // dense_realloc(min_num_dense_rows); } SketchSample SparseSketch::sample() { @@ -354,7 +341,7 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { if (sample_idx >= num_samples) { throw OutOfSamplesException(seed, num_samples, sample_idx); } - std::unordered_set ret; + std::vector ret; size_t idx = sample_idx++; size_t first_column = idx * cols_per_sample; @@ -363,14 +350,14 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return unlikely_if (Bucket_Boruvka::is_good(deterministic_bucket(), checksum_seed())) { - ret.insert(deterministic_bucket().alpha); + ret.push_back(deterministic_bucket().alpha); return {ret, GOOD}; } for (size_t c = 0; c < cols_per_sample; ++c) { for (size_t r = 0; r < num_dense_rows; ++r) { unlikely_if (Bucket_Boruvka::is_good(bucket(c + first_column, r), checksum_seed())) { - ret.insert(bucket(c + first_column, r).alpha); + ret.push_back(bucket(c + first_column, r).alpha); } } } @@ -378,7 +365,7 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { // TODO: How do we do exhaustive sampling properly here? SketchSample sample = sample_sparse(first_column, first_column + cols_per_sample); if (sample.result == GOOD) { - ret.insert(sample.idx); + ret.push_back(sample.idx); } unlikely_if (ret.size() == 0) @@ -386,6 +373,144 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { return {ret, GOOD}; } +void SparseSketch::merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t *oth_ll_metadata, + size_t col) { + // std::cerr << "Merging sparse column: " << col << std::endl; + + // std::cerr << "Our column" << std::endl; + // uint8_t idx = ll_metadata[col]; + // while (idx != uint8_t(-1)) { + // bool good = Bucket_Boruvka::is_good(sparse_buckets[idx].bkt, checksum_seed()); + // std::cerr << "i: " << size_t(idx) << " n: " << size_t(sparse_buckets[idx].next) << " r:" + // << size_t(sparse_buckets[idx].row) << " := a:" << sparse_buckets[idx].bkt.alpha + // << " c:" << sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; + // idx = sparse_buckets[idx].next; + // } + + // std::cerr << "Oth column" << std::endl; + // idx = oth_ll_metadata[col]; + // while (idx != uint8_t(-1)) { + // bool good = Bucket_Boruvka::is_good(oth_sparse_buckets[idx].bkt, checksum_seed()); + // std::cerr << "i: " << size_t(idx) << " n: " << size_t(oth_sparse_buckets[idx].next) << " r:" + // << size_t(oth_sparse_buckets[idx].row) << " := a:" << oth_sparse_buckets[idx].bkt.alpha + // << " c:" << oth_sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; + // idx = oth_sparse_buckets[idx].next; + // } + + + uint8_t oth_idx = oth_ll_metadata[col]; + uint8_t our_idx = ll_metadata[col]; + uint8_t prev = uint8_t(-1); + + // merge column until one runs out + while (oth_idx != uint8_t(-1) && our_idx != uint8_t(-1)) { + SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; + SparseBucket& our_sparse = sparse_buckets[our_idx]; + + if (oth_sparse.row < num_dense_rows) { + // just merge into dense! + bucket(col, oth_sparse.row).alpha ^= oth_sparse.bkt.alpha; + bucket(col, oth_sparse.row).gamma ^= oth_sparse.bkt.gamma; + oth_idx = oth_sparse.next; + continue; + } + + if (oth_sparse.row > our_sparse.row) { + // skip our bucket, sparse doesn't have anything to match it + prev = our_idx; + our_idx = our_sparse.next; + } else if (oth_sparse.row < our_sparse.row) { + // oth has a bucket we don't have, insert it + uint8_t free_bucket = claim_free_bucket(); + // std::cerr << "ours = " << size_t(our_idx) << " free = " << size_t(free_bucket) << std::endl; + + sparse_buckets[free_bucket] = oth_sparse; + if (prev == uint8_t(-1)) { + insert_to_ll_head(col, free_bucket); + } else { + insert_to_ll(free_bucket, sparse_buckets[prev]); + } + number_of_sparse_buckets += 1; + reallocate_if_needed(1); + oth_idx = oth_sparse.next; + prev = free_bucket; + if (ll_metadata[col] == uint8_t(-1) || ll_metadata[col] == our_idx) prev = uint8_t(-1); + } else { + // they are equal, merge them! + uint8_t our_next = our_sparse.next; + uint8_t oth_next = oth_sparse.next; + bool removed = merge_sparse_bkt(our_idx, oth_sparse, prev, col); + if (removed) { + number_of_sparse_buckets -= 1; + reallocate_if_needed(-1); + } else { + prev = our_idx; + } + oth_idx = oth_next; + our_idx = our_next; + } + } + + // if there's more in the other column, merge that stuff in + while (oth_idx != uint8_t(-1)) { + SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; + uint8_t free_bucket = claim_free_bucket(); + sparse_buckets[free_bucket] = oth_sparse; + if (prev == uint8_t(-1)) { + insert_to_ll_head(col, free_bucket); + } else { + insert_to_ll(free_bucket, sparse_buckets[prev]); + } + number_of_sparse_buckets += 1; + reallocate_if_needed(1); // TODO: There could be an edge case where the sparse bucket we're looking at becomes dense + prev = free_bucket; + if (ll_metadata[col] == uint8_t(-1)) prev = uint8_t(-1); + oth_idx = oth_sparse.next; + } + + validate(); +} + +void SparseSketch::validate() { + size_t num_alloced = 0; + for (size_t c = 0; c < num_columns; c++) { + uint8_t idx = ll_metadata[c]; + while (idx != uint8_t(-1)) { + if (Bucket_Boruvka::is_empty(sparse_buckets[idx].bkt)) { + std::cerr << "ERROR: Empty bucket found in column " << c << std::endl; + std::cerr << *this << std::endl; + assert(false); + } else { + num_alloced += 1; + } + idx = sparse_buckets[idx].next; + } + } + size_t num_free = 0; + uint8_t idx = ll_metadata[num_columns]; + while (idx != uint8_t(-1)) { + if (!Bucket_Boruvka::is_empty(sparse_buckets[idx].bkt)) { + std::cerr << "ERROR: Non-empty bucket found in free list!" << std::endl; + std::cerr << *this << std::endl; + assert(false); + } else { + num_free += 1; + } + idx = sparse_buckets[idx].next; + } + + if (num_alloced != number_of_sparse_buckets) { + std::cerr << "ERROR: number of sparse buckets does not match expectation!" << std::endl; + std::cerr << *this << std::endl; + assert(false); + } + if (num_free != sparse_capacity - number_of_sparse_buckets) { + std::cerr << "ERROR; number of free buckets does not match expectation!" << std::endl; + std::cerr << *this << std::endl; + assert(num_free == sparse_capacity - number_of_sparse_buckets); + } +} + void SparseSketch::merge(const SparseSketch &other) { // std::cerr << "PERFORMING A MERGE" << std::endl; // std::cerr << *this << std::endl; @@ -407,6 +532,8 @@ void SparseSketch::merge(const SparseSketch &other) { SparseBucket sparse_bkt; sparse_bkt.row = r; sparse_bkt.bkt = other.bucket(c, r); + + // TODO: This could be made a push-front update_sparse(c, sparse_bkt); } } @@ -414,20 +541,7 @@ void SparseSketch::merge(const SparseSketch &other) { // Merge all sparse buckets from other sketch into this one for (size_t c = 0; c < num_columns; c++) { - uint8_t this_idx = ll_metadata[c]; - uint8_t oth_idx = other.ll_metadata[c]; - - while (oth_idx != uint8_t(-1)) { - if (other.sparse_buckets[oth_idx].row < num_dense_rows) { - auto &bkt = bucket(c, other.sparse_buckets[oth_idx].row); - bkt.alpha ^= other.sparse_buckets[oth_idx].bkt.alpha; - bkt.gamma ^= other.sparse_buckets[oth_idx].bkt.gamma; - } else { - // TODO: This can be made faster by utilizing this_idx and performing a merge operation - update_sparse(c, other.sparse_buckets[oth_idx]); - } - oth_idx = other.sparse_buckets[oth_idx].next; - } + merge_sparse_column(other.sparse_buckets, other.ll_metadata, c); } } @@ -580,6 +694,20 @@ std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { idx = sketch.sparse_buckets[idx].next; } } + os << "Free Buckets" << std::endl; + uint8_t idx = sketch.ll_metadata[sketch.num_columns]; + while (idx != uint8_t(-1)) { + bool good = Bucket_Boruvka::is_good(sparse_buckets[idx].bkt, sketch.checksum_seed()); + os << "i: " << size_t(idx) << " n: " << size_t(sparse_buckets[idx].next) << " r:" + << size_t(sparse_buckets[idx].row) << " := a:" << sparse_buckets[idx].bkt.alpha + << " c:" << sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; + if (idx == sketch.sparse_buckets[idx].next) { + os << "LL error!" << std::endl; + return os; + } + idx = sketch.sparse_buckets[idx].next; + } + os << std::endl; return os; } diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index 80d2656b..aa80b951 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -176,7 +176,7 @@ void test_sketch_merge(unsigned long num_sketches, ASSERT_LT(res_idx, vec_size) << "Sampled index out of bounds"; if (test_vec1.get_entry(res_idx) == test_vec2.get_entry(res_idx)) { sample_incorrect_failures++; - exit(EXIT_FAILURE); + std::cerr << "GOT A SAMPLE INCORRECT ERROR!" << std::endl; } } else if (ret_code == ZERO) { @@ -190,7 +190,6 @@ void test_sketch_merge(unsigned long num_sketches, if (!vec_zero) { sample_incorrect_failures++; std::cout << "GOT INCORRECT ZERO!" << std::endl; - exit(EXIT_FAILURE); } } else { // sketch failed @@ -211,9 +210,9 @@ void test_sketch_merge(unsigned long num_sketches, } TEST(SketchTestSuite, TestSketchMerge) { - test_sketch_merge(10000, 1e2, 100, 0.001, 0.03); - test_sketch_merge(1000, 1e3, 1000, 0.001, 0.03); - test_sketch_merge(1000, 1e4, 10000, 0.001, 0.03); + test_sketch_merge(10000, 1e2, 100, 0, 0.03); + test_sketch_merge(1000, 1e3, 1000, 0, 0.03); + test_sketch_merge(1000, 1e4, 10000, 0, 0.03); } TEST(SketchTestSuite, TestSketchRangeMerge) { @@ -381,16 +380,11 @@ TEST(SketchTestSuite, TestExhaustiveQuery) { ASSERT_EQ(query_ret.idxs.size(), 0) << query_ret.result; } - // assert everything returned is valid and <= 10 things - ASSERT_LE(query_ret.idxs.size(), 10); + // assert everything returned is valid for (vec_t non_zero : query_ret.idxs) { ASSERT_GT(non_zero, 0); ASSERT_LE(non_zero, 10); } - - // assert everything returned is unique - std::set unique_elms(query_ret.idxs.begin(), query_ret.idxs.end()); - ASSERT_EQ(unique_elms.size(), query_ret.idxs.size()); } } From 7fda92c3881740e4a4c8e220c2986e273f489f72 Mon Sep 17 00:00:00 2001 From: Evan West Date: Fri, 21 Mar 2025 12:51:48 -0400 Subject: [PATCH 09/14] tried to optimize with ll but didn't really work --- include/sparse_sketch.h | 11 ++-- src/sparse_sketch.cpp | 122 ++++++---------------------------------- test/sketch_test.cpp | 5 ++ 3 files changed, 28 insertions(+), 110 deletions(-) diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 89539b95..501f9fd1 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -78,7 +78,7 @@ class SparseSketch { size_t sparse_data_size = ceil(double(sparse_capacity) * sizeof(SparseBucket) / sizeof(Bucket)); size_t ll_metadata_size = ceil((double(num_columns) + 1) * sizeof(uint8_t) / sizeof(Bucket)); - void update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed = true); + void update_sparse(uint8_t col, const SparseBucket &to_add); SketchSample sample_sparse(size_t first_col, size_t end_col); inline uint8_t remove_ll_head(size_t col) { @@ -106,7 +106,8 @@ class SparseSketch { inline void remove_from_ll(SparseBucket& bkt_to_remove, SparseBucket &prev) { prev.next = bkt_to_remove.next; } - inline bool merge_sparse_bkt(uint8_t our_idx, SparseBucket& oth, uint8_t prev_idx, size_t col) { + inline bool merge_sparse_bkt(uint8_t our_idx, const SparseBucket& oth, uint8_t prev_idx, + size_t col) { SparseBucket &ours = sparse_buckets[our_idx]; ours.bkt.alpha ^= oth.bkt.alpha; ours.bkt.gamma ^= oth.bkt.gamma; @@ -161,10 +162,8 @@ class SparseSketch { } // given another SparseSketch column, merge it into ours - void merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t *oth_ll_metadata, size_t col); - - void validate(); - + void merge_sparse_column(const SparseBucket* oth_sparse_buckets, const uint8_t* oth_ll_metadata, + size_t col); public: /** * The below constructors use vector length as their input. However, in graph sketching our input diff --git a/src/sparse_sketch.cpp b/src/sparse_sketch.cpp index 340c457c..cc51bc6f 100644 --- a/src/sparse_sketch.cpp +++ b/src/sparse_sketch.cpp @@ -183,7 +183,7 @@ void SparseSketch::reallocate_if_needed(int delta) { // +1 if we added a new bucket value // 0 if the bucket was found and update (but not cleared) // -1 if the bucket was found and cleared of all content -void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_if_needed) { +void SparseSketch::update_sparse(uint8_t col, const SparseBucket &to_add) { uint8_t next_ptr = ll_metadata[col]; uint8_t prev = uint8_t(-1); while (next_ptr != uint8_t(-1)) { @@ -191,7 +191,7 @@ void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_ bool removed = merge_sparse_bkt(next_ptr, to_add, prev, col); if (removed) { number_of_sparse_buckets -= 1; - if (realloc_if_needed) reallocate_if_needed(-1); + reallocate_if_needed(-1); } return; } else if (sparse_buckets[next_ptr].row > to_add.row) { @@ -221,7 +221,7 @@ void SparseSketch::update_sparse(uint8_t col, SparseBucket to_add, bool realloc_ // std::cerr << "Placed new bucket in column " << size_t(prev) << "->" << size_t(sparse_buckets[prev].next) << "->" << size_t(sparse_buckets[free_bucket].next) << std::endl; } - if (realloc_if_needed) reallocate_if_needed(1); + reallocate_if_needed(1); } // sample a good bucket from the sparse region if one exists. @@ -261,8 +261,6 @@ void SparseSketch::update(const vec_t update_idx) { } } } - - validate(); } // TODO: Switch the L0_SAMPLING flag to instead affect query procedure. @@ -373,38 +371,16 @@ ExhaustiveSketchSample SparseSketch::exhaustive_sample() { return {ret, GOOD}; } -void SparseSketch::merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t *oth_ll_metadata, - size_t col) { +void SparseSketch::merge_sparse_column(const SparseBucket *oth_sparse_buckets, + const uint8_t *oth_ll_metadata, size_t col) { // std::cerr << "Merging sparse column: " << col << std::endl; - - // std::cerr << "Our column" << std::endl; - // uint8_t idx = ll_metadata[col]; - // while (idx != uint8_t(-1)) { - // bool good = Bucket_Boruvka::is_good(sparse_buckets[idx].bkt, checksum_seed()); - // std::cerr << "i: " << size_t(idx) << " n: " << size_t(sparse_buckets[idx].next) << " r:" - // << size_t(sparse_buckets[idx].row) << " := a:" << sparse_buckets[idx].bkt.alpha - // << " c:" << sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; - // idx = sparse_buckets[idx].next; - // } - - // std::cerr << "Oth column" << std::endl; - // idx = oth_ll_metadata[col]; - // while (idx != uint8_t(-1)) { - // bool good = Bucket_Boruvka::is_good(oth_sparse_buckets[idx].bkt, checksum_seed()); - // std::cerr << "i: " << size_t(idx) << " n: " << size_t(oth_sparse_buckets[idx].next) << " r:" - // << size_t(oth_sparse_buckets[idx].row) << " := a:" << oth_sparse_buckets[idx].bkt.alpha - // << " c:" << oth_sparse_buckets[idx].bkt.gamma << (good ? " good" : " bad") << std::endl; - // idx = oth_sparse_buckets[idx].next; - // } - - uint8_t oth_idx = oth_ll_metadata[col]; uint8_t our_idx = ll_metadata[col]; uint8_t prev = uint8_t(-1); // merge column until one runs out while (oth_idx != uint8_t(-1) && our_idx != uint8_t(-1)) { - SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; + const SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; SparseBucket& our_sparse = sparse_buckets[our_idx]; if (oth_sparse.row < num_dense_rows) { @@ -453,7 +429,14 @@ void SparseSketch::merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t // if there's more in the other column, merge that stuff in while (oth_idx != uint8_t(-1)) { - SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; + const SparseBucket& oth_sparse = oth_sparse_buckets[oth_idx]; + if (oth_sparse.row < num_dense_rows) { + bucket(col, oth_sparse.row).alpha ^= oth_sparse.bkt.alpha; + bucket(col, oth_sparse.row).gamma ^= oth_sparse.bkt.gamma; + oth_idx = oth_sparse.next; + continue; + } + uint8_t free_bucket = claim_free_bucket(); sparse_buckets[free_bucket] = oth_sparse; if (prev == uint8_t(-1)) { @@ -462,53 +445,11 @@ void SparseSketch::merge_sparse_column(SparseBucket *oth_sparse_buckets, uint8_t insert_to_ll(free_bucket, sparse_buckets[prev]); } number_of_sparse_buckets += 1; - reallocate_if_needed(1); // TODO: There could be an edge case where the sparse bucket we're looking at becomes dense + reallocate_if_needed(1); prev = free_bucket; if (ll_metadata[col] == uint8_t(-1)) prev = uint8_t(-1); oth_idx = oth_sparse.next; } - - validate(); -} - -void SparseSketch::validate() { - size_t num_alloced = 0; - for (size_t c = 0; c < num_columns; c++) { - uint8_t idx = ll_metadata[c]; - while (idx != uint8_t(-1)) { - if (Bucket_Boruvka::is_empty(sparse_buckets[idx].bkt)) { - std::cerr << "ERROR: Empty bucket found in column " << c << std::endl; - std::cerr << *this << std::endl; - assert(false); - } else { - num_alloced += 1; - } - idx = sparse_buckets[idx].next; - } - } - size_t num_free = 0; - uint8_t idx = ll_metadata[num_columns]; - while (idx != uint8_t(-1)) { - if (!Bucket_Boruvka::is_empty(sparse_buckets[idx].bkt)) { - std::cerr << "ERROR: Non-empty bucket found in free list!" << std::endl; - std::cerr << *this << std::endl; - assert(false); - } else { - num_free += 1; - } - idx = sparse_buckets[idx].next; - } - - if (num_alloced != number_of_sparse_buckets) { - std::cerr << "ERROR: number of sparse buckets does not match expectation!" << std::endl; - std::cerr << *this << std::endl; - assert(false); - } - if (num_free != sparse_capacity - number_of_sparse_buckets) { - std::cerr << "ERROR; number of free buckets does not match expectation!" << std::endl; - std::cerr << *this << std::endl; - assert(num_free == sparse_capacity - number_of_sparse_buckets); - } } void SparseSketch::merge(const SparseSketch &other) { @@ -532,8 +473,6 @@ void SparseSketch::merge(const SparseSketch &other) { SparseBucket sparse_bkt; sparse_bkt.row = r; sparse_bkt.bkt = other.bucket(c, r); - - // TODO: This could be made a push-front update_sparse(c, sparse_bkt); } } @@ -585,20 +524,7 @@ void SparseSketch::range_merge(const SparseSketch &other, size_t start_sample, s // Merge all sparse buckets from other sketch into this one for (size_t c = start_column; c < end_column; c++) { - uint8_t this_idx = ll_metadata[c]; - uint8_t oth_idx = other.ll_metadata[c]; - - while (oth_idx != uint8_t(-1)) { - if (other.sparse_buckets[oth_idx].row < num_dense_rows) { - auto &bkt = bucket(c, other.sparse_buckets[oth_idx].row); - bkt.alpha ^= other.sparse_buckets[oth_idx].bkt.alpha; - bkt.gamma ^= other.sparse_buckets[oth_idx].bkt.gamma; - } else { - // TODO: This can be made faster by utilizing this_idx and performing a merge operation - update_sparse(c, other.sparse_buckets[oth_idx]); - } - oth_idx = other.sparse_buckets[oth_idx].next; - } + merge_sparse_column(other.sparse_buckets, other.ll_metadata, c); } // std::cerr << "SKETCH AFTER MERGE" << std::endl; // std::cerr << *this << std::endl; @@ -629,20 +555,7 @@ void SparseSketch::merge_raw_bucket_buffer(const Bucket *raw_buckets, size_t n_r // Merge all sparse buckets from other sketch into this one for (size_t c = 0; c < num_columns; c++) { - uint8_t this_idx = ll_metadata[c]; - uint8_t oth_idx = raw_metadata[c]; - - while (oth_idx != uint8_t(-1)) { - if (raw_sparse[oth_idx].row < num_dense_rows) { - auto &bkt = bucket(c, raw_sparse[oth_idx].row); - bkt.alpha ^= raw_sparse[oth_idx].bkt.alpha; - bkt.gamma ^= raw_sparse[oth_idx].bkt.gamma; - } else { - // TODO: This can be made faster by utilizing this_idx and performing a merge operation - update_sparse(c, raw_sparse[oth_idx]); - } - oth_idx = raw_sparse[oth_idx].next; - } + merge_sparse_column(raw_sparse, raw_metadata, c); } } @@ -666,6 +579,7 @@ std::ostream &operator<<(std::ostream &os, const SparseSketch &sketch) { os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + os << "Number of dense rows = " << sketch.num_dense_rows << std::endl; for (unsigned i = 0; i < sketch.num_columns; ++i) { for (unsigned j = 0; j < sketch.num_dense_rows; ++j) { Bucket bkt = sketch.bucket(i, j); diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index aa80b951..efe223b4 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -167,6 +167,7 @@ void test_sketch_merge(unsigned long num_sketches, sketch2.update(test_vec2.get_update(j)); } sketch1.merge(sketch2); + Sketch backup(sketch1); try { SketchSample query_ret = sketch1.sample(); vec_t res_idx = query_ret.idx; @@ -177,6 +178,10 @@ void test_sketch_merge(unsigned long num_sketches, if (test_vec1.get_entry(res_idx) == test_vec2.get_entry(res_idx)) { sample_incorrect_failures++; std::cerr << "GOT A SAMPLE INCORRECT ERROR!" << std::endl; + std::cerr << "Got: " << res_idx << std::endl; + std::cerr << sketch1 << std::endl; + std::cerr << backup << std::endl; + std::cerr << sketch2 << std::endl; } } else if (ret_code == ZERO) { From 949dbe21c68ed520f49405272eba39b20915ac5e Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 1 May 2025 15:15:37 -0400 Subject: [PATCH 10/14] remove pht changes, turn off query print --- CMakeLists.txt | 2 +- include/graph_sketch_driver.h | 35 +++++++++++++++++------------------ src/cc_sketch_alg.cpp | 22 +++++++++++----------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb54756b..312e077c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ FetchContent_Declare( GutterTree GIT_REPOSITORY https://github.com/GraphStreamingProject/GutterTree.git - GIT_TAG better_pht + GIT_TAG main ) # Get StreamingUtilities diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index c05df28a..8e0a9d3f 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -152,29 +152,28 @@ class GraphSketchDriver { #endif while (true) { - bool got_breakpoint = false; size_t updates = stream->get_update_buffer(update_array, update_array_size); - - if (update_array[updates - 1].type == BREAKPOINT) { - --updates; - got_breakpoint = true; - } - gts->process_stream_upd_batch(update_array, updates, thr_id); - for (size_t i = 0; i < updates; i++) { - GraphUpdate upd = {update_array[i].edge, (UpdateType) update_array[i].type}; - sketching_alg->pre_insert(upd, thr_id); + GraphUpdate upd; + upd.edge = update_array[i].edge; + upd.type = static_cast(update_array[i].type); + if (upd.type == BREAKPOINT) { + // reached the breakpoint. Update verifier if applicable and return #ifdef VERIFY_SAMPLES_F - local_verifier.edge_update(upd.edge); + std::lock_guard lk(verifier_mtx); + verifier->combine(local_verifier); #endif - } - - if (got_breakpoint) { + return; + } + else { + sketching_alg->pre_insert(upd, thr_id); + Edge edge = upd.edge; + gts->insert({edge.src, edge.dst}, thr_id); + gts->insert({edge.dst, edge.src}, thr_id); #ifdef VERIFY_SAMPLES_F - std::lock_guard lk(verifier_mtx); - verifier->combine(local_verifier); + local_verifier.edge_update(edge); #endif - return; + } } } }; @@ -227,4 +226,4 @@ class GraphSketchDriver { // time hooks for experiments std::chrono::steady_clock::time_point flush_start; std::chrono::steady_clock::time_point flush_end; -}; +}; \ No newline at end of file diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index adcde659..b3860a35 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -486,27 +486,27 @@ void CCSketchAlg::boruvka_emulation() { } size_t round_num = 0; bool modified = true; - std::cout << std::endl; - std::cout << " pre boruvka processing = " - << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - << std::endl; + // std::cout << std::endl; + // std::cout << " pre boruvka processing = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; while (true) { - std::cout << " Round: " << round_num << std::endl; + // std::cout << " Round: " << round_num << std::endl; start = std::chrono::steady_clock::now(); modified = perform_boruvka_round(round_num, merge_instr, global_merges); - std::cout << " perform_boruvka_round = " - << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - << std::endl; + // std::cout << " perform_boruvka_round = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; if (!modified) break; // calculate updated merge instructions for next round start = std::chrono::steady_clock::now(); create_merge_instructions(merge_instr); - std::cout << " create_merge_instructions = " - << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - << std::endl; + // std::cout << " create_merge_instructions = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; ++round_num; } last_query_rounds = round_num; From 8d432ddfa04e1abcd2ab96ab4ffd82afe76ce35d Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 1 May 2025 15:18:11 -0400 Subject: [PATCH 11/14] update workflow --- .github/workflows/cmake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ca10a22b..e8512465 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, ubuntu-20.04] + os: [ubuntu-latest] flags: ['"-DL0_SAMPLING"', '"-DNO_EAGER_DSU"', '""'] steps: From 676b2e595ccefb090e1ac4f94fa412f2d484e373 Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 1 May 2025 15:57:08 -0400 Subject: [PATCH 12/14] adjust constant --- include/sparse_sketch.h | 2 +- src/cc_sketch_alg.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/sparse_sketch.h b/include/sparse_sketch.h index 501f9fd1..b17c6562 100644 --- a/include/sparse_sketch.h +++ b/include/sparse_sketch.h @@ -56,7 +56,7 @@ class SparseSketch { // Allocated buckets Bucket* buckets; - static constexpr size_t min_num_dense_rows = 5; + static constexpr size_t min_num_dense_rows = 6; size_t num_dense_rows = min_num_dense_rows; // Variables for sparse representation of lower levels of bucket Matrix diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index b3860a35..87b194aa 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -534,11 +534,11 @@ ConnectedComponents CCSketchAlg::connected_components() { bool except = false; std::exception_ptr err; try { - auto start = std::chrono::steady_clock::now(); + // auto start = std::chrono::steady_clock::now(); boruvka_emulation(); - std::cout << " boruvka's algorithm = " - << std::chrono::duration(std::chrono::steady_clock::now() - start).count() - << std::endl; + // std::cout << " boruvka's algorithm = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; } catch (...) { except = true; err = std::current_exception(); From 37b1da53c22d824fb142ca6094f7495b67a37f12 Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 1 May 2025 16:14:53 -0400 Subject: [PATCH 13/14] Update graph_sketch_driver.h --- include/graph_sketch_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h index 8e0a9d3f..e627e443 100644 --- a/include/graph_sketch_driver.h +++ b/include/graph_sketch_driver.h @@ -226,4 +226,4 @@ class GraphSketchDriver { // time hooks for experiments std::chrono::steady_clock::time_point flush_start; std::chrono::steady_clock::time_point flush_end; -}; \ No newline at end of file +}; From 107ceb49d36f04c64eae675ec344b5394c4a0a68 Mon Sep 17 00:00:00 2001 From: Evan West Date: Thu, 1 May 2025 16:19:44 -0400 Subject: [PATCH 14/14] clean up --- src/cc_sketch_alg.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index 87b194aa..f8e5649d 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -466,7 +466,7 @@ inline void CCSketchAlg::create_merge_instructions(std::vector &merg } void CCSketchAlg::boruvka_emulation() { - auto start = std::chrono::steady_clock::now(); + // auto start = std::chrono::steady_clock::now(); update_locked = true; cc_alg_start = std::chrono::steady_clock::now(); @@ -493,7 +493,7 @@ void CCSketchAlg::boruvka_emulation() { while (true) { // std::cout << " Round: " << round_num << std::endl; - start = std::chrono::steady_clock::now(); + // start = std::chrono::steady_clock::now(); modified = perform_boruvka_round(round_num, merge_instr, global_merges); // std::cout << " perform_boruvka_round = " // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() @@ -502,7 +502,7 @@ void CCSketchAlg::boruvka_emulation() { if (!modified) break; // calculate updated merge instructions for next round - start = std::chrono::steady_clock::now(); + // start = std::chrono::steady_clock::now(); create_merge_instructions(merge_instr); // std::cout << " create_merge_instructions = " // << std::chrono::duration(std::chrono::steady_clock::now() - start).count()