|
| 1 | +#pragma once |
| 2 | +#include <graph_zeppelin_common.h> |
| 3 | +#include <gtest/gtest_prod.h> |
| 4 | +#include <sys/mman.h> |
| 5 | + |
| 6 | +#include <fstream> |
| 7 | +#include <unordered_set> |
| 8 | +#include <cmath> |
| 9 | +#include <cassert> |
| 10 | +#include <mutex> |
| 11 | + |
| 12 | +#include "util.h" |
| 13 | +#include "bucket.h" |
| 14 | +#include "sketch_types.h" |
| 15 | + |
| 16 | +/** |
| 17 | + * Sketch for graph processing, either CubeSketch or CameoSketch. |
| 18 | + * Sub-linear representation of a vector. |
| 19 | + */ |
| 20 | +class DenseSketch { |
| 21 | + private: |
| 22 | + const uint64_t seed; // seed for hash functions |
| 23 | + size_t num_samples; // number of samples we can perform |
| 24 | + size_t cols_per_sample; // number of columns to use on each sample |
| 25 | + size_t num_columns; // Total number of columns. (product of above 2) |
| 26 | + size_t bkt_per_col; // maximum number of buckets per column (max number of rows) |
| 27 | + size_t num_buckets; // number of total buckets product of above two |
| 28 | + size_t sample_idx = 0; // number of samples performed so far |
| 29 | + |
| 30 | + // Allocated buckets |
| 31 | + Bucket* buckets; |
| 32 | + |
| 33 | + inline Bucket& deterministic_bucket() { |
| 34 | + return buckets[0]; |
| 35 | + } |
| 36 | + inline const Bucket& deterministic_bucket() const { |
| 37 | + return buckets[0]; |
| 38 | + } |
| 39 | + |
| 40 | + // return the bucket at a particular index in bucket array |
| 41 | + inline Bucket& bucket(size_t col, size_t row) { |
| 42 | + return buckets[col * bkt_per_col + row + 1]; |
| 43 | + } |
| 44 | + inline const Bucket& bucket(size_t col, size_t row) const { |
| 45 | + return buckets[col * bkt_per_col + row + 1]; |
| 46 | + } |
| 47 | + |
| 48 | + public: |
| 49 | + /** |
| 50 | + * The below constructors use vector length as their input. However, in graph sketching our input |
| 51 | + * is the number of vertices. This function converts from number of graph vertices to vector |
| 52 | + * length. |
| 53 | + * @param num_vertices Number of graph vertices |
| 54 | + * @return The length of the vector to sketch |
| 55 | + */ |
| 56 | + static vec_t calc_vector_length(node_id_t num_vertices) { |
| 57 | + return ceil(double(num_vertices) * (num_vertices - 1) / 2); |
| 58 | + } |
| 59 | + |
| 60 | + /** |
| 61 | + * This function computes the number of samples a Sketch should support in order to solve |
| 62 | + * connected components. Optionally, can increase or decrease the number of samples by a |
| 63 | + * multiplicative factor. |
| 64 | + * @param num_vertices Number of graph vertices |
| 65 | + * @param f Multiplicative sample factor |
| 66 | + * @return The number of samples |
| 67 | + */ |
| 68 | + static size_t calc_cc_samples(node_id_t num_vertices, double f) { |
| 69 | + return std::max(size_t(18), (size_t) ceil(f * log2(num_vertices) / num_samples_div)); |
| 70 | + } |
| 71 | + |
| 72 | + /** |
| 73 | + * Construct a sketch object |
| 74 | + * @param vector_len Length of the vector we are sketching |
| 75 | + * @param seed Random seed of the sketch |
| 76 | + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) |
| 77 | + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) |
| 78 | + */ |
| 79 | + DenseSketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, |
| 80 | + size_t cols_per_sample = default_cols_per_sample); |
| 81 | + |
| 82 | + /** |
| 83 | + * Construct a sketch from a serialized stream |
| 84 | + * @param vector_len Length of the vector we are sketching |
| 85 | + * @param seed Random seed of the sketch |
| 86 | + * @param binary_in Stream holding serialized sketch object |
| 87 | + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) |
| 88 | + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) |
| 89 | + */ |
| 90 | + DenseSketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, |
| 91 | + size_t cols_per_sample = default_cols_per_sample); |
| 92 | + |
| 93 | + /** |
| 94 | + * Sketch copy constructor |
| 95 | + * @param s The sketch to copy. |
| 96 | + */ |
| 97 | + DenseSketch(const DenseSketch& s); |
| 98 | + |
| 99 | + ~DenseSketch(); |
| 100 | + |
| 101 | + /** |
| 102 | + * Update a sketch based on information about one of its indices. |
| 103 | + * @param update the point update. |
| 104 | + */ |
| 105 | + void update(const vec_t update); |
| 106 | + |
| 107 | + /** |
| 108 | + * Function to sample from the sketch. |
| 109 | + * cols_per_sample determines the number of columns we allocate to this query |
| 110 | + * @return A pair with the result index and a code indicating the type of result. |
| 111 | + */ |
| 112 | + SketchSample sample(); |
| 113 | + |
| 114 | + /** |
| 115 | + * Function to sample from the appropriate columns to return 1 or more non-zero indices |
| 116 | + * @return A pair with the result indices and a code indicating the type of result. |
| 117 | + */ |
| 118 | + ExhaustiveSketchSample exhaustive_sample(); |
| 119 | + |
| 120 | + std::mutex mutex; // lock the sketch for applying updates in multithreaded processing |
| 121 | + |
| 122 | + /** |
| 123 | + * In-place merge function. |
| 124 | + * @param other Sketch to merge into caller |
| 125 | + */ |
| 126 | + void merge(const DenseSketch &other); |
| 127 | + |
| 128 | + /** |
| 129 | + * In-place range merge function. Updates the caller Sketch. |
| 130 | + * The range merge only merges some of the Sketches |
| 131 | + * This function should only be used if you know what you're doing |
| 132 | + * @param other Sketch to merge into caller |
| 133 | + * @param start_sample Index of first sample to merge |
| 134 | + * @param n_samples Number of samples to merge |
| 135 | + */ |
| 136 | + void range_merge(const DenseSketch &other, size_t start_sample, size_t n_samples); |
| 137 | + |
| 138 | + /** |
| 139 | + * Perform an in-place merge function without another Sketch and instead |
| 140 | + * use a raw bucket memory. |
| 141 | + * We also allow for only a portion of the buckets to be merge at once |
| 142 | + * @param raw_bucket Raw bucket data to merge into this sketch |
| 143 | + */ |
| 144 | + void merge_raw_bucket_buffer(const Bucket *raw_buckets); |
| 145 | + |
| 146 | + /** |
| 147 | + * Zero out all the buckets of a sketch. |
| 148 | + */ |
| 149 | + void zero_contents(); |
| 150 | + |
| 151 | + friend bool operator==(const DenseSketch& sketch1, const DenseSketch& sketch2); |
| 152 | + friend std::ostream& operator<<(std::ostream& os, const DenseSketch& sketch); |
| 153 | + |
| 154 | + /** |
| 155 | + * Serialize the sketch to a binary output stream. |
| 156 | + * @param binary_out the stream to write to. |
| 157 | + */ |
| 158 | + void serialize(std::ostream& binary_out) const; |
| 159 | + |
| 160 | + inline void reset_sample_state() { |
| 161 | + sample_idx = 0; |
| 162 | + } |
| 163 | + |
| 164 | + // return the size of the sketching datastructure in bytes (just the buckets, not the metadata) |
| 165 | + inline size_t bucket_array_bytes() const { |
| 166 | + return num_buckets * sizeof(Bucket); |
| 167 | + } |
| 168 | + |
| 169 | + inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; } |
| 170 | + inline uint64_t get_seed() const { return seed; } |
| 171 | + inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; } |
| 172 | + inline size_t checksum_seed() const { return seed; } |
| 173 | + inline size_t get_columns() const { return num_columns; } |
| 174 | + inline size_t get_buckets() const { return num_buckets; } |
| 175 | + inline size_t get_num_samples() const { return num_samples; } |
| 176 | + |
| 177 | + static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } |
| 178 | + |
| 179 | + static constexpr size_t default_cols_per_sample = 1; |
| 180 | + static constexpr double num_samples_div = 1 - log2(2 - 0.8); |
| 181 | +}; |
0 commit comments