From 5ed55091479cc0403c600833fbcac883c4420e96 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 20 Aug 2024 23:43:24 +0200 Subject: [PATCH 1/7] Created generic for hashers --- src/hyperloglog.rs | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index bfab7ce..a4ac77d 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -5,38 +5,30 @@ //! 1. https://github.com/crepererum/pdatastructs.rs/blob/3997ed50f6b6871c9e53c4c5e0f48f431405fc63/src/hyperloglog.rs //! 2. https://github.com/apache/arrow-datafusion/blob/f203d863f5c8bc9f133f6dd9b2e34e57ac3cdddc/datafusion/physical-expr/src/aggregate/hyperloglog.rs -use std::hash::Hash; - -use ahash::RandomState; +use std::hash::{Hash, Hasher}; +use ahash::AHasher; /// By default, we use 2**14 registers like redis const DEFAULT_P: usize = 14_usize; -/// Fixed seed -const SEED: RandomState = RandomState::with_seeds( - 0x355e438b4b1478c7_u64, - 0xd0e8453cd135b473_u64, - 0xf7b252066a57836a_u64, - 0xb8a829e3713c09bf_u64, -); - /// Note: We don't make HyperLogLog as static struct by keeping `PhantomData` /// Callers should take care of its hash function to be unchanged. /// P is the bucket number, must be [4, 18] /// Q = 64 - P /// Register num is 1 << P #[derive(Clone, Debug, Eq, PartialEq)] -pub struct HyperLogLog { +pub struct HyperLogLog { pub(crate) registers: Vec, + _hasher: std::marker::PhantomData, } -impl Default for HyperLogLog

{ +impl Default for HyperLogLog { fn default() -> Self { Self::new() } } -impl HyperLogLog

{ +impl HyperLogLog { /// note that this method should not be invoked in untrusted environment pub fn new() -> Self { assert!( @@ -47,13 +39,14 @@ impl HyperLogLog

{ Self { registers: vec![0; 1 << P], + _hasher: std::marker::PhantomData, } } pub fn with_registers(registers: Vec) -> Self { assert_eq!(registers.len(), Self::number_registers()); - Self { registers } + Self { registers, _hasher: std::marker::PhantomData } } /// Adds an hash to the HyperLogLog. @@ -68,7 +61,9 @@ impl HyperLogLog

{ /// Adds an object to the HyperLogLog. /// Though we could pass different types into this method, caller should notice that pub fn add_object(&mut self, obj: &T) { - let hash = SEED.hash_one(obj); + let mut hasher = H::default(); + obj.hash(&mut hasher); + let hash = hasher.finish(); self.add_hash(hash); } @@ -190,7 +185,7 @@ fn hll_tau(x: f64) -> f64 { #[cfg(test)] mod tests { - use crate::HyperLogLog; + use super::*; const P: usize = 14; const NUM_REGISTERS: usize = 1 << P; @@ -216,7 +211,7 @@ mod tests { macro_rules! sized_number_test { ($SIZE: expr, $T: tt) => {{ - let mut hll = HyperLogLog::

::new(); + let mut hll = HyperLogLog::::new(); for i in 0..$SIZE { hll.add_object(&(i as $T)); } @@ -245,13 +240,13 @@ mod tests { #[test] fn test_empty() { - let hll = HyperLogLog::

::new(); + let hll = HyperLogLog::::new(); assert_eq!(hll.count(), 0); } #[test] fn test_one() { - let mut hll = HyperLogLog::

::new(); + let mut hll = HyperLogLog::::new(); hll.add_hash(1); assert_eq!(hll.count(), 1); } @@ -283,19 +278,19 @@ mod tests { #[test] fn test_empty_merge() { - let mut hll = HyperLogLog::

::new(); - hll.merge(&HyperLogLog::

::new()); + let mut hll = HyperLogLog::::new(); + hll.merge(&HyperLogLog::::new()); assert_eq!(hll.count(), 0); } #[test] fn test_merge_overlapped() { - let mut hll = HyperLogLog::

::new(); + let mut hll = HyperLogLog::::new(); for i in 0..1000 { hll.add_object(&i); } - let other = HyperLogLog::

::new(); + let other = HyperLogLog::::new(); for i in 0..1000 { hll.add_object(&i); } @@ -306,7 +301,7 @@ mod tests { #[test] fn test_repetition() { - let mut hll = HyperLogLog::

::new(); + let mut hll = HyperLogLog::::new(); for i in 0..1_000_000 { hll.add_object(&(i % 1000)); } From 8c875c83bad429855af368635757e372a5856a91 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Wed, 21 Aug 2024 07:46:34 +0200 Subject: [PATCH 2/7] Added default value --- src/hyperloglog.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index b8ec152..7a78e4c 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -18,12 +18,12 @@ const DEFAULT_P: usize = 14_usize; /// Register num is 1 << P #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemSize, mem_dbg::MemDbg))] -pub struct HyperLogLog { +pub struct HyperLogLog { pub(crate) registers: Vec, _hasher: std::marker::PhantomData, } -impl Default for HyperLogLog { +impl Default for HyperLogLog { fn default() -> Self { Self::new() } From ab83d23bfac4e35ac087a85181a90a9f505e3a35 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Thu, 22 Aug 2024 14:49:50 +0200 Subject: [PATCH 3/7] Formatted code --- src/hyperloglog.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index 7a78e4c..2adf83e 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -5,8 +5,8 @@ //! 1. https://github.com/crepererum/pdatastructs.rs/blob/3997ed50f6b6871c9e53c4c5e0f48f431405fc63/src/hyperloglog.rs //! 2. https://github.com/apache/arrow-datafusion/blob/f203d863f5c8bc9f133f6dd9b2e34e57ac3cdddc/datafusion/physical-expr/src/aggregate/hyperloglog.rs -use std::hash::{Hash, Hasher}; use ahash::AHasher; +use std::hash::{Hash, Hasher}; /// By default, we use 2**14 registers like redis const DEFAULT_P: usize = 14_usize; @@ -18,12 +18,12 @@ const DEFAULT_P: usize = 14_usize; /// Register num is 1 << P #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemSize, mem_dbg::MemDbg))] -pub struct HyperLogLog { +pub struct HyperLogLog { pub(crate) registers: Vec, _hasher: std::marker::PhantomData, } -impl Default for HyperLogLog { +impl Default for HyperLogLog { fn default() -> Self { Self::new() } @@ -47,7 +47,10 @@ impl HyperLogLog { pub fn with_registers(registers: Vec) -> Self { assert_eq!(registers.len(), Self::number_registers()); - Self { registers, _hasher: std::marker::PhantomData } + Self { + registers, + _hasher: std::marker::PhantomData, + } } /// Adds an hash to the HyperLogLog. From e9aab75ac02297f85a211d9a0fde48a6bc9590d2 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 17 Sep 2024 18:51:07 +0200 Subject: [PATCH 4/7] Updated serde for variant with hasher generic --- src/serde.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/serde.rs b/src/serde.rs index 4ac27f3..062f0bb 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,4 +1,5 @@ use crate::HyperLogLog; +use core::hash::Hasher; #[derive(serde::Serialize, borsh::BorshSerialize)] enum HyperLogLogVariantRef<'a, const P: usize> { @@ -14,30 +15,33 @@ enum HyperLogLogVariant { Full(Vec), } -impl From> for HyperLogLog

{ +impl From> for HyperLogLog { fn from(value: HyperLogLogVariant

) -> Self { match value { - HyperLogLogVariant::Empty => HyperLogLog::

::new(), + HyperLogLogVariant::Empty => Self::new(), HyperLogLogVariant::Sparse { data } => { let mut registers = vec![0; 1 << P]; for (index, val) in data { registers[index as usize] = val; } - HyperLogLog::

{ registers } + Self::with_registers(registers) } - HyperLogLogVariant::Full(registers) => HyperLogLog::

{ registers }, + HyperLogLogVariant::Full(registers) => Self::with_registers(registers), } } } -impl<'a, const P: usize> From<&'a HyperLogLog

> for HyperLogLogVariantRef<'a, P> { - fn from(hll: &'a HyperLogLog

) -> Self { - let none_empty_registers = HyperLogLog::

::number_registers() - hll.num_empty_registers(); +impl<'a, H: Default + Hasher, const P: usize> From<&'a HyperLogLog> + for HyperLogLogVariantRef<'a, P> +{ + fn from(hll: &'a HyperLogLog) -> Self { + let none_empty_registers = + HyperLogLog::::number_registers() - hll.num_empty_registers(); if none_empty_registers == 0 { HyperLogLogVariantRef::Empty - } else if none_empty_registers * 3 <= HyperLogLog::

::number_registers() { + } else if none_empty_registers * 3 <= HyperLogLog::::number_registers() { // If the number of empty registers is larger enough, we can use sparse serialize to reduce the binary size // each register in sparse format will occupy 3 bytes, 2 for register index and 1 for register value. let sparse_data: Vec<(u16, u8)> = hll @@ -60,7 +64,7 @@ impl<'a, const P: usize> From<&'a HyperLogLog

> for HyperLogLogVariantRef<'a, } } -impl serde::Serialize for HyperLogLog

{ +impl serde::Serialize for HyperLogLog { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -70,7 +74,7 @@ impl serde::Serialize for HyperLogLog

{ } } -impl<'de, const P: usize> serde::Deserialize<'de> for HyperLogLog

{ +impl<'de, H: Default + Hasher, const P: usize> serde::Deserialize<'de> for HyperLogLog { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, @@ -80,14 +84,14 @@ impl<'de, const P: usize> serde::Deserialize<'de> for HyperLogLog

{ } } -impl borsh::BorshSerialize for HyperLogLog

{ +impl borsh::BorshSerialize for HyperLogLog { fn serialize(&self, writer: &mut W) -> std::io::Result<()> { let v: HyperLogLogVariantRef<'_, P> = self.into(); v.serialize(writer) } } -impl borsh::BorshDeserialize for HyperLogLog

{ +impl borsh::BorshDeserialize for HyperLogLog { fn deserialize_reader(reader: &mut R) -> std::io::Result { let v = HyperLogLogVariant::

::deserialize_reader(reader)?; Ok(v.into()) @@ -101,7 +105,7 @@ mod tests { const P: usize = 14; #[test] fn test_serde() { - let mut hll = HyperLogLog::

::new(); + let mut hll = HyperLogLog::::new(); json_serde_equal(&hll); for i in 0..100000 { @@ -109,7 +113,7 @@ mod tests { } json_serde_equal(&hll); - let hll = HyperLogLog::

::with_registers(vec![1; 1 << P]); + let hll = HyperLogLog::::with_registers(vec![1; 1 << P]); json_serde_equal(&hll); } From 2c9aa3a728b65b4e2990b7b5daffb11990c5d440 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 17 Sep 2024 18:54:22 +0200 Subject: [PATCH 5/7] Updated test suite for serde --- src/hyperloglog.rs | 10 +++++++++- src/serde.rs | 5 +++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index 2adf83e..2c6e121 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -16,7 +16,7 @@ const DEFAULT_P: usize = 14_usize; /// P is the bucket number, must be [4, 18] /// Q = 64 - P /// Register num is 1 << P -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug)] #[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemSize, mem_dbg::MemDbg))] pub struct HyperLogLog { pub(crate) registers: Vec, @@ -29,6 +29,14 @@ impl Default for HyperLogLog { } } +impl PartialEq for HyperLogLog { + fn eq(&self, other: &Self) -> bool { + self.registers == other.registers + } +} + +impl Eq for HyperLogLog {} + impl HyperLogLog { /// note that this method should not be invoked in untrusted environment pub fn new() -> Self { diff --git a/src/serde.rs b/src/serde.rs index 062f0bb..5be9530 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -101,11 +101,12 @@ impl borsh::BorshDeserialize for HyperLogLo #[cfg(test)] mod tests { use crate::HyperLogLog; + use ahash::AHasher; const P: usize = 14; #[test] fn test_serde() { - let mut hll = HyperLogLog::::new(); + let mut hll = HyperLogLog::::new(); json_serde_equal(&hll); for i in 0..100000 { @@ -113,7 +114,7 @@ mod tests { } json_serde_equal(&hll); - let hll = HyperLogLog::::with_registers(vec![1; 1 << P]); + let hll = HyperLogLog::::with_registers(vec![1; 1 << P]); json_serde_equal(&hll); } From 0e1a80562c3da132950b50de4e7471f90c3e44fc Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 17 Sep 2024 18:56:06 +0200 Subject: [PATCH 6/7] Resolved clippy code smell --- src/hyperloglog.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index 2c6e121..6eba46a 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -11,6 +11,8 @@ use std::hash::{Hash, Hasher}; /// By default, we use 2**14 registers like redis const DEFAULT_P: usize = 14_usize; +/// HyperLogLog is a probabilistic data structure used to estimate the cardinality of a multiset. +/// /// Note: We don't make HyperLogLog as static struct by keeping `PhantomData` /// Callers should take care of its hash function to be unchanged. /// P is the bucket number, must be [4, 18] From f72d3c8446934c16094d884a9c26794491f44295 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 17 Sep 2024 18:57:09 +0200 Subject: [PATCH 7/7] Formatted document --- src/hyperloglog.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index 6eba46a..3fa0084 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -12,7 +12,7 @@ use std::hash::{Hash, Hasher}; const DEFAULT_P: usize = 14_usize; /// HyperLogLog is a probabilistic data structure used to estimate the cardinality of a multiset. -/// +/// /// Note: We don't make HyperLogLog as static struct by keeping `PhantomData` /// Callers should take care of its hash function to be unchanged. /// P is the bucket number, must be [4, 18]