diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89392b3..7980a4c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,9 @@ jobs: - name: Run cargo check run: cargo check + - name: Run cargo check with serde + run: cargo check --features with_serde + test: name: Test Suite runs-on: ubuntu-latest @@ -31,6 +34,9 @@ jobs: toolchain: stable - name: Run cargo test + run: cargo test + + - name: Run cargo test with serde run: cargo test --features with_serde lints: @@ -50,4 +56,7 @@ jobs: run: cargo fmt --all -- --check - name: Run cargo clippy + run: cargo clippy -- -D warnings + + - name: Run cargo clippy with serde run: cargo clippy --features with_serde -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index 648a5b8..8c34529 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,27 +1,29 @@ [package] -name = "cardinality-estimator" -version = "1.0.2" +name = "cardinality-estimator-safe" +version = "2.1.0" edition = "2021" -authors = ["Alex Bocharov "] -description = "A crate for estimating the cardinality of distinct elements in a stream or dataset." 
-documentation = "https://docs.rs/cardinality-estimator" +authors = ["Alex Bocharov ", "phil"] +description = "Estimate the cardinality of distinct elements in a stream or dataset with no unsafe code" +documentation = "https://docs.rs/cardinality-estimator-safe" license = "Apache-2.0" readme = "README.md" -repository = "https://github.com/cloudflare/cardinality-estimator" +repository = "https://github.com/uniphil/cardinality-estimator-safe" keywords = ["cardinality", "distinct-count", "hyperloglog", "probabilistic", "sketch"] categories = ["algorithms", "data-structures"] [dependencies] enum_dispatch = "0.3.13" -serde = { version = "1.0", optional = true } +serde = { version = "1.0", features = ["derive"], optional = true } wyhash = "0.5.0" [dev-dependencies] amadeus-streaming = "0.4.3" +cardinality-estimator = "1.0.2" criterion = { version = "0.5.0", features = ["html_reports"] } dhat = "0.3.3" hyperloglog = "1.0.2" hyperloglogplus = "0.4.1" +postcard = { version = "1.1.1", features=["alloc"] } pprof = { version = "0.14.0", features = ["flamegraph", "criterion", "protobuf-codec"] } probabilistic-collections = "0.7.0" rand = "0.8.5" diff --git a/README.md b/README.md index 0e0d4c1..51f4fe1 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,28 @@ -# cardinality-estimator -![build](https://img.shields.io/github/actions/workflow/status/cloudflare/cardinality-estimator/ci.yml?branch=main) -[![docs.rs](https://docs.rs/cardinality-estimator/badge.svg)](https://docs.rs/cardinality-estimator) -[![crates.io](https://img.shields.io/crates/v/cardinality-estimator.svg)](https://crates.io/crates/cardinality-estimator) +# cardinality-estimator-safe +![build](https://img.shields.io/github/actions/workflow/status/uniphil/cardinality-estimator-safe/ci.yml?branch=main) +[![docs.rs](https://docs.rs/cardinality-estimator-safe/badge.svg)](https://docs.rs/cardinality-estimator-safe) 
+[![crates.io](https://img.shields.io/crates/v/cardinality-estimator-safe.svg)](https://crates.io/crates/cardinality-estimator-safe) [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE) +`cardinality-estimator-safe` is a fork of Cloudflare's `cardinality-estimator`, replacing its data representations with boring old owned data and eliminating all uses of `unsafe`. Its serialization formats are intended to be reasonable with `serde_json`, and efficient with `bincode`. + `cardinality-estimator` is a Rust crate designed to estimate the number of distinct elements in a stream or dataset in an efficient manner. -This library uses HyperLogLog++ with an optimized low memory footprint and high accuracy approach, suitable for large-scale data analysis tasks. +This library uses HyperLogLog++ ~~with an optimized low memory footprint~~ and high accuracy approach, suitable for large-scale data analysis tasks. We're using `cardinality-estimator` for large-scale machine learning, computing cardinality features across multiple dimensions of the request. ## Overview -Our `cardinality-estimator` is highly efficient in terms of memory usage, latency, and accuracy. +Our `cardinality-estimator` is highly efficient in terms of ~~memory usage, latency, and~~ accuracy. This is achieved by leveraging a combination of unique data structure design, efficient algorithms, and HyperLogLog++ for high cardinality ranges. 
## Getting Started -To use `cardinality-estimator`, add it to your `Cargo.toml` under `[dependencies]`: +To use `cardinality-estimator-safe`, add it to your `Cargo.toml` under `[dependencies]`: ```toml [dependencies] -cardinality-estimator = "1.0.0" +cardinality-estimator-safe = "2.1.0" ``` -Then, import `cardinality-estimator` in your Rust program: +Then, import `cardinality-estimator-safe` in your Rust program: ```rust -use cardinality_estimator::CardinalityEstimator; +use cardinality_estimator_safe::CardinalityEstimator; let mut estimator = CardinalityEstimator::<12, 6>::new(); estimator.insert("test"); @@ -37,15 +39,17 @@ The data is stored in three different representations - `Small`, `Array`, and `H For instance, for a cardinality of 0 to 2, only **8 bytes** of stack memory and 0 bytes of heap memory are used. ## Low latency -The crate offers low latency by using auto-vectorization for slice operations via compiler hints to use SIMD instructions. +~~The crate offers low latency by using auto-vectorization for slice operations via compiler hints to use SIMD instructions.~~ The number of zero registers and registers' harmonic sum are stored and updated dynamically as more data is inserted, resulting in fast estimate operations. ## High accuracy -The cardinality-estimator achieves high accuracy by using precise counting for small cardinality ranges and HyperLogLog++ with LogLog-Beta bias correction for larger ranges. +The cardinality-estimator-safe achieves high accuracy by using precise counting for small cardinality ranges and HyperLogLog++ with LogLog-Beta bias correction for larger ranges. This provides expected error rates as low as 0.02% for large cardinalities. ## Benchmarks +Benchmarks are added to directly compare Cloudflare's `cardinality-estimator` with `cardinality-estimator-safe`. They are included beside Cloudflare's original benchmarks for context. 
+ To run benchmarks you first need to install `cargo-criterion` binary: ```shell cargo install cargo-criterion @@ -73,6 +77,9 @@ Benchmarks presented below are executed on Linux laptop with `13th Gen Intel(R) ### Memory usage ![Cardinality Estimators Memory Usage](benches/memory_bytes.png) +#### vs. cardinality-estimator-safe +![Cardinality Estimators Memory Usage](benches/memory_bytes_safe.png) + Table below compares memory usage of different cardinality estimators. The number in each cell represents `stack memory bytes / heap memory bytes / heap memory blocks` at each measured cardinality. @@ -108,6 +115,10 @@ Note, that `hyperloglogplus` implementation has particularly high memory usage e ### Insert performance ![Cardinality Estimators Insert Time](benches/insert_time.png) +#### vs cardinality-estimator-safe + +![Cardinality Estimators Insert Time](benches/insert_time_safe.png) + Table below represents insert time in nanoseconds per element. Our `cardinality-estimator` demonstrates the lowest insert time for most of the cardinalities. @@ -140,6 +151,10 @@ Our `cardinality-estimator` demonstrates the lowest insert time for most of the ### Estimate performance ![Cardinality Estimators Estimate Time](benches/estimate_time.png) +#### vs cardinality-estimator-safe +![Cardinality Estimators Estimate Time](benches/estimate_time_safe.png) + + Table below represents estimate time in nanoseconds per call. Our `cardinality-estimator` shows the lowest estimate time for most of the cardinalities, especially smaller cardinalities up to 128. @@ -175,6 +190,9 @@ Implementations `probabilistic-collections`, `hyperloglogplus` and `hyperloglogp ### Error rate ![Cardinality Estimators Error Rate](benches/error_rate.png) +#### vs cardinality-estimator-safe +![Cardinality Estimators Error Rate](benches/error_rate_safe.png) + Table below represents average absolute relative error across 100 runs of estimator on random elements at given cardinality. 
Our `cardinality-estimator` performs on par well with `amadeus-streaming` and `hyperloglog` estimators, but has especially smaller low error rate for cardinalities up to 128. diff --git a/benches/analyze.py b/benches/analyze.py index cbefc21..13bcb2d 100644 --- a/benches/analyze.py +++ b/benches/analyze.py @@ -60,6 +60,7 @@ def render_comparison(bench_results_path, df, operation, metric, yscale, ylim=No colors = { 'cardinality-estimator': 'green', + 'cardinality-estimator-safe': 'blue', 'amadeus-streaming': 'blue', 'probabilistic-collections': 'red', 'hyperloglog': 'purple', diff --git a/benches/cardinality_estimator.rs b/benches/cardinality_estimator.rs index c626256..7cce6b6 100644 --- a/benches/cardinality_estimator.rs +++ b/benches/cardinality_estimator.rs @@ -4,6 +4,7 @@ static ALLOC: dhat::Alloc = dhat::Alloc; use std::hash::{BuildHasherDefault, Hash}; use cardinality_estimator::CardinalityEstimator; +use cardinality_estimator_safe::CardinalityEstimator as CardinalityEstimatorSafe; use criterion::measurement::WallTime; use criterion::{ black_box, criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion, Throughput, @@ -44,10 +45,11 @@ fn benchmark(c: &mut Criterion) { for &cardinality in &cardinalities { group.throughput(Throughput::Elements(cardinality.max(1) as u64)); bench_insert::(&mut group, cardinality); - bench_insert::(&mut group, cardinality); - bench_insert::(&mut group, cardinality); - bench_insert::(&mut group, cardinality); - bench_insert::(&mut group, cardinality); + bench_insert::(&mut group, cardinality); + // bench_insert::(&mut group, cardinality); + // bench_insert::(&mut group, cardinality); + // bench_insert::(&mut group, cardinality); + // bench_insert::(&mut group, cardinality); } group.finish(); @@ -55,10 +57,11 @@ fn benchmark(c: &mut Criterion) { group.throughput(Throughput::Elements(1)); for &cardinality in &cardinalities { bench_estimate::(&mut group, cardinality); - bench_estimate::(&mut group, cardinality); - 
bench_estimate::(&mut group, cardinality); - bench_estimate::(&mut group, cardinality); - bench_estimate::(&mut group, cardinality); + bench_estimate::(&mut group, cardinality); + // bench_estimate::(&mut group, cardinality); + // bench_estimate::(&mut group, cardinality); + // bench_estimate::(&mut group, cardinality); + // bench_estimate::(&mut group, cardinality); } group.finish(); @@ -67,10 +70,13 @@ fn benchmark(c: &mut Criterion) { .map(|&cardinality| StatRecord { cardinality, cardinality_estimator: measure_allocations::(cardinality), - amadeus_streaming: measure_allocations::(cardinality), - probabilistic_collections: measure_allocations::(cardinality), - hyperloglog: measure_allocations::(cardinality), - hyperloglogplus: measure_allocations::(cardinality), + cardinality_estimator_safe: measure_allocations::( + cardinality, + ), + // amadeus_streaming: measure_allocations::(cardinality), + // probabilistic_collections: measure_allocations::(cardinality), + // hyperloglog: measure_allocations::(cardinality), + // hyperloglogplus: measure_allocations::(cardinality), }) .collect(); @@ -86,10 +92,11 @@ fn benchmark(c: &mut Criterion) { .map(|&cardinality| StatRecord { cardinality, cardinality_estimator: measure_error::(cardinality), - amadeus_streaming: measure_error::(cardinality), - probabilistic_collections: measure_error::(cardinality), - hyperloglog: measure_error::(cardinality), - hyperloglogplus: measure_error::(cardinality), + cardinality_estimator_safe: measure_error::(cardinality), + // amadeus_streaming: measure_error::(cardinality), + // probabilistic_collections: measure_error::(cardinality), + // hyperloglog: measure_error::(cardinality), + // hyperloglogplus: measure_error::(cardinality), }) .collect(); @@ -189,10 +196,11 @@ fn measure_error>(cardinality: usize) -> Str struct StatRecord { cardinality: usize, cardinality_estimator: String, - amadeus_streaming: String, - probabilistic_collections: String, - hyperloglog: String, - hyperloglogplus: 
String, + cardinality_estimator_safe: String, + // amadeus_streaming: String, + // probabilistic_collections: String, + // hyperloglog: String, + // hyperloglogplus: String, } struct CardinalityEstimatorMut(CardinalityEstimator); @@ -219,6 +227,30 @@ impl CardinalityEstimatorTrait for CardinalityEstimatorMut { } } +struct CardinalityEstimatorSafeMut(CardinalityEstimatorSafe); + +impl CardinalityEstimatorTrait for CardinalityEstimatorSafeMut { + fn new() -> Self { + Self(CardinalityEstimatorSafe::new()) + } + + fn insert(&mut self, item: &usize) { + self.0.insert(item); + } + + fn estimate(&mut self) -> usize { + self.0.estimate() + } + + fn merge(&mut self, rhs: &Self) { + self.0.merge(&rhs.0); + } + + fn name() -> String { + "cardinality-estimator-safe".to_string() + } +} + struct AmadeusStreamingEstimator(amadeus_streaming::HyperLogLog); impl CardinalityEstimatorTrait for AmadeusStreamingEstimator { diff --git a/benches/error_rate_safe.png b/benches/error_rate_safe.png new file mode 100644 index 0000000..2876c60 Binary files /dev/null and b/benches/error_rate_safe.png differ diff --git a/benches/estimate_time_safe.png b/benches/estimate_time_safe.png new file mode 100644 index 0000000..3109a1c Binary files /dev/null and b/benches/estimate_time_safe.png differ diff --git a/benches/insert_time_safe.png b/benches/insert_time_safe.png new file mode 100644 index 0000000..009fbd1 Binary files /dev/null and b/benches/insert_time_safe.png differ diff --git a/benches/memory_bytes_safe.png b/benches/memory_bytes_safe.png new file mode 100644 index 0000000..6e714b0 Binary files /dev/null and b/benches/memory_bytes_safe.png differ diff --git a/examples/estimator.rs b/examples/estimator.rs index 38434e6..84e0def 100644 --- a/examples/estimator.rs +++ b/examples/estimator.rs @@ -1,4 +1,4 @@ -use cardinality_estimator::CardinalityEstimator; +use cardinality_estimator_safe::CardinalityEstimator; fn main() { let mut estimator1 = CardinalityEstimator::::new(); diff --git 
a/examples/json.rs b/examples/json.rs new file mode 100644 index 0000000..c9b3872 --- /dev/null +++ b/examples/json.rs @@ -0,0 +1,40 @@ +#[cfg(feature = "with_serde")] +fn main() { + let mut estimator = + cardinality_estimator_safe::CardinalityEstimator::::new(); + + println!( + "serialized empty estimator (small): {}", + serde_json::to_string_pretty(&estimator).unwrap() + ); + + estimator.insert(&0); + + println!( + "serialized with one insert (small): {}", + serde_json::to_string_pretty(&estimator).unwrap() + ); + + estimator.insert(&1); + estimator.insert(&2); + + println!( + "serialized with three inserts (array): {}", + serde_json::to_string_pretty(&estimator).unwrap() + ); + + for i in 3..1000 { + estimator.insert(&i); + } + + println!( + "serialized with many inserts (HLL): {}", + serde_json::to_string_pretty(&estimator).unwrap() + ); +} + +#[cfg(not(feature = "with_serde"))] +fn main() -> Result<(), u32> { + eprintln!("this example requires --features with_serde"); + Err(1) +} diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index ea1167a..e2b3551 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -10,6 +10,7 @@ cargo-fuzz = true [dependencies] cardinality-estimator = { path = "..", features = ["with_serde"] } libfuzzer-sys = "0.4" +postcard = { version = "1.1.1", features = ["alloc"] } serde_json = "1.0.115" wyhash = "0.5.0" @@ -26,3 +27,17 @@ path = "fuzz_targets/serde.rs" test = false doc = false bench = false + +[[bin]] +name = "serde_json_array" +path = "fuzz_targets/serde_json_array.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "serde_postcard" +path = "fuzz_targets/serde_postcard.rs" +test = false +doc = false +bench = false diff --git a/fuzz/fuzz_targets/estimator.rs b/fuzz/fuzz_targets/estimator.rs index 5afb559..5686790 100644 --- a/fuzz/fuzz_targets/estimator.rs +++ b/fuzz/fuzz_targets/estimator.rs @@ -1,6 +1,6 @@ #![no_main] -use cardinality_estimator::estimator::CardinalityEstimator; +use 
cardinality_estimator_safe::estimator::CardinalityEstimator; use libfuzzer_sys::fuzz_target; use wyhash::wyhash; diff --git a/fuzz/fuzz_targets/serde.rs b/fuzz/fuzz_targets/serde.rs index 5bd903b..04d09ef 100644 --- a/fuzz/fuzz_targets/serde.rs +++ b/fuzz/fuzz_targets/serde.rs @@ -1,6 +1,6 @@ #![no_main] -use cardinality_estimator::estimator::CardinalityEstimator; +use cardinality_estimator_safe::estimator::CardinalityEstimator; use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { diff --git a/fuzz/fuzz_targets/serde_json_array.rs b/fuzz/fuzz_targets/serde_json_array.rs new file mode 100644 index 0000000..9b77dc7 --- /dev/null +++ b/fuzz/fuzz_targets/serde_json_array.rs @@ -0,0 +1,19 @@ +#![no_main] + +use serde_json::Value; +use cardinality_estimator_safe::estimator::CardinalityEstimator; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + // pretty naive version, u8s directly into each number position + let json: serde_json::Value = match data.len() { + 0 => Value::Array(vec![]), + 1 => Value::Array(vec![data[0].into()]), + _ => Value::Array(vec![data[0].into(), + Value::Array(data[1..].iter().map(|n| (*n).into()).collect())]), + }; + if let Ok(mut estimator) = serde_json::from_value::>(json) { + estimator.insert(&1); + assert!(estimator.estimate() > 0); + } +}); diff --git a/fuzz/fuzz_targets/serde_postcard.rs b/fuzz/fuzz_targets/serde_postcard.rs new file mode 100644 index 0000000..ca7fbd9 --- /dev/null +++ b/fuzz/fuzz_targets/serde_postcard.rs @@ -0,0 +1,11 @@ +#![no_main] + +use cardinality_estimator_safe::estimator::CardinalityEstimator; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + if let Ok(mut estimator) = postcard::from_bytes::>(data) { + estimator.insert(&1); + assert!(estimator.estimate() > 0); + } +}); diff --git a/src/array.rs b/src/array.rs index 2951054..20e5a94 100644 --- a/src/array.rs +++ b/src/array.rs @@ -11,176 +11,88 @@ //! - data[N..]
- store zeros used for future hashes use std::fmt::{Debug, Formatter}; -use std::mem::{size_of, size_of_val}; +use std::mem::size_of_val; use std::ops::Deref; -use std::slice; use crate::hyperloglog::HyperLogLog; -use crate::representation::{RepresentationTrait, REPRESENTATION_ARRAY}; +use crate::representation::{Representation, RepresentationTrait}; +#[cfg(feature = "with_serde")] +use serde::{Deserialize, Serialize}; /// Maximum number of elements stored in array representation pub(crate) const MAX_CAPACITY: usize = 128; -/// Bit offset of the array's length -const LEN_OFFSET: usize = 56; -/// Mask used for accessing heap allocated data stored at the pointer in `data` field. -const PTR_MASK: usize = ((1 << LEN_OFFSET) - 1) & !3; /// Array representation container -pub(crate) struct Array<'a, const P: usize, const W: usize> { - /// Number of items stored in the array - len: usize, - /// Array of items. Not all items are used, only first `len` of them. - /// The length of the slice is actually the current capacity. - arr: &'a mut [u32], -} +#[cfg_attr(feature = "with_serde", derive(Serialize, Deserialize))] +pub(crate) struct Array(Vec); -impl<'a, const P: usize, const W: usize> Array<'a, P, W> { +impl Array { /// Insert encoded hash into `Array` representation /// Returns true on success, false otherwise. 
#[inline] pub(crate) fn insert(&mut self, h: u32) -> bool { - let cap = self.arr.len(); - let found = if cap == 4 { - contains_fixed_vectorized::<4>(self.arr.try_into().unwrap(), h) - } else if cap == 8 { - contains_fixed_vectorized::<8>(self.arr.try_into().unwrap(), h) - } else { - // calculate rounded up slice length for efficient look up in batches - let rlen = 16 * self.len.div_ceil(16); - // SAFETY: `rlen` guaranteed to be within `self.arr` boundaries - contains_vectorized::<16>(unsafe { self.arr.get_unchecked(..rlen) }, h) - }; - - if found { + if self.0.contains(&h) { return true; } - - if self.len < cap { - // if there are available slots in current array - append to it - self.arr[self.len] = h; - self.len += 1; + if self.0.len() < MAX_CAPACITY { + self.0.push(h); return true; } - - if cap < MAX_CAPACITY { - // double array capacity up to `MAX_CAPACITY` - let new_arr = Self::from_vec(vec![0; cap * 2], self.len + 1); - new_arr.arr[..self.len].copy_from_slice(self.arr); - new_arr.arr[self.len] = h; - unsafe { self.drop() }; - *self = new_arr; - return true; - }; - false } /// Create new instance of `Array` representation from vector #[inline] - pub(crate) fn from_vec(mut arr: Vec, len: usize) -> Array<'a, P, W> { - let cap = arr.len(); - let ptr = arr.as_mut_ptr(); - std::mem::forget(arr); - // SAFETY: valid pointer from vector being used to create slice reference - let arr = unsafe { slice::from_raw_parts_mut(ptr, cap) }; - Self { len, arr } + pub(crate) fn from_vec(arr: Vec, _len: usize) -> Array { + Self(arr) } } -impl RepresentationTrait for Array<'_, P, W> { +impl RepresentationTrait for Array { /// Insert encoded hash into `HyperLogLog` representation. 
#[inline] - fn insert_encoded_hash(&mut self, h: u32) -> usize { + fn insert_encoded_hash(&mut self, h: u32) -> Option> { if self.insert(h) { - self.to_data() + None } else { // upgrade from `Array` to `HyperLogLog` representation let mut hll = HyperLogLog::::new(self); - unsafe { self.drop() }; - hll.insert_encoded_hash(h) + hll.insert_encoded_hash(h); + Some(Representation::Hll(hll)) } } /// Return cardinality estimate of `Array` representation #[inline] fn estimate(&self) -> usize { - self.len + self.0.len() } /// Return memory size of `Array` representation #[inline] fn size_of(&self) -> usize { - size_of::() + size_of_val(self.arr) - } - - /// Free memory occupied by the `Array` representation - /// SAFETY: caller of this method must ensure that `self.arr` holds valid slice elements. - #[inline] - unsafe fn drop(&mut self) { - drop(Box::from_raw(self.arr)); - } - - /// Convert `Array` representation to `data` - #[inline] - fn to_data(&self) -> usize { - (self.len << LEN_OFFSET) | (PTR_MASK & self.arr.as_ptr() as usize) | REPRESENTATION_ARRAY + size_of_val(self) } } -impl Debug for Array<'_, P, W> { +impl Debug for Array { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(&self.to_string()) } } -impl PartialEq for Array<'_, P, W> { +impl PartialEq for Array { fn eq(&self, other: &Self) -> bool { self.deref() == other.deref() } } -impl From for Array<'_, P, W> { - /// Create new instance of `Array` from given `data` - #[inline] - fn from(data: usize) -> Self { - let ptr = (data & PTR_MASK) as *mut u32; - let len = data >> LEN_OFFSET; - let cap = len.next_power_of_two(); - let arr = unsafe { slice::from_raw_parts_mut(ptr, cap) }; - Self { len, arr } - } -} - -impl Deref for Array<'_, P, W> { +impl Deref for Array { type Target = [u32]; fn deref(&self) -> &Self::Target { - &self.arr[..self.len] - } -} - -/// Vectorized linear array search benefiting from SIMD instructions (e.g. AVX2). 
-/// -/// Input slice length assumed to be divisible by `N` to perform efficient -/// batch comparisons of slice elements to provided value `v`. -/// -/// Assembly output: https://godbolt.org/z/eb8Kob9fa -/// Background reading: https://tinyurl.com/2e4srh2d -#[inline] -fn contains_vectorized(a: &[u32], v: u32) -> bool { - debug_assert_eq!(a.len() % N, 0); - a.chunks_exact(N) - .any(|chunk| contains_fixed_vectorized::(chunk.try_into().unwrap(), v)) -} - -/// Vectorized linear fixed array search -#[inline] -fn contains_fixed_vectorized(a: [u32; N], v: u32) -> bool { - let mut res = false; - for x in a { - res |= x == v + &self.0 } - res } #[cfg(test)] diff --git a/src/estimator.rs b/src/estimator.rs index 6b0d543..8cf2bc1 100644 --- a/src/estimator.rs +++ b/src/estimator.rs @@ -6,19 +6,28 @@ use std::ops::Deref; use wyhash::WyHash; use crate::representation::{Representation, RepresentationTrait}; +#[cfg(feature = "with_serde")] +use serde::{Deserialize, Serialize}; /// Ensure that only 64-bit architecture is being used. 
#[cfg(target_pointer_width = "64")] +#[cfg_attr(feature = "with_serde", derive(Serialize, Deserialize))] +#[cfg_attr( + feature = "with_serde", + serde(from = "Representation", into = "Representation",) +)] pub struct CardinalityEstimator where T: Hash + ?Sized, H: Hasher + Default, { /// Data field represents tagged pointer with its format described in lib.rs - pub(crate) data: usize, + pub(crate) data: Representation, /// Zero-sized build hasher + #[cfg_attr(feature = "with_serde", serde(skip))] build_hasher: BuildHasherDefault, /// Zero-sized phantom data for type `T` + #[cfg_attr(feature = "with_serde", serde(skip))] _phantom_data: PhantomData, } @@ -35,7 +44,7 @@ where Self { // Start with empty small representation - data: 0, + data: Representation::Small(Default::default()), build_hasher: BuildHasherDefault::default(), _phantom_data: PhantomData, } @@ -57,7 +66,7 @@ where /// Merge cardinality estimators #[inline] pub fn merge(&mut self, rhs: &Self) { - match (self.representation(), rhs.representation()) { + match (&mut self.data, &rhs.data) { (_, Representation::Small(rhs_small)) => { for h in rhs_small.items() { if h != 0 { @@ -73,30 +82,32 @@ where (Representation::Small(lhs_small), Representation::Hll(rhs_hll)) => { let mut hll = rhs_hll.clone(); for h in lhs_small.items() { - if h != 0 { - hll.insert_encoded_hash(h); - } + if hll.insert_encoded_hash(h).is_some() { + panic!("inserting into hll rep must yield hll rep"); + }; } - self.data = hll.to_data(); + self.data = Representation::Hll(hll); } - (Representation::Array(mut lhs_arr), Representation::Hll(rhs_hll)) => { + (Representation::Array(lhs_arr), Representation::Hll(rhs_hll)) => { let mut hll = rhs_hll.clone(); - for &h in lhs_arr.deref() { - hll.insert_encoded_hash(h); + for &h in &**lhs_arr { + // todo: gross don't use deref + if hll.insert_encoded_hash(h).is_some() { + panic!("inserting into hll rep must yield hll rep"); + }; } - unsafe { lhs_arr.drop() }; - self.data = hll.to_data(); + 
self.data = Representation::Hll(hll); } - (Representation::Hll(mut lhs_hll), Representation::Hll(rhs_hll)) => { - lhs_hll.merge(&rhs_hll); + (Representation::Hll(ref mut lhs_hll), Representation::Hll(rhs_hll)) => { + lhs_hll.merge(rhs_hll); } } } /// Returns the representation type of `CardinalityEstimator`. #[inline] - pub(crate) fn representation(&self) -> Representation { - Representation::::from_data(self.data) + pub(crate) fn representation(&self) -> &Representation { + &self.data } /// Insert hash into `CardinalityEstimator` @@ -108,7 +119,7 @@ where /// Insert encoded hash into `CardinalityEstimator` #[inline] fn insert_encoded_hash(&mut self, h: u32) { - self.data = self.representation().insert_encoded_hash(h); + self.data.iec(h); } /// Compute the sparse encoding of the given hash @@ -148,18 +159,6 @@ where } } -impl Drop for CardinalityEstimator -where - T: Hash + ?Sized, - H: Hasher + Default, -{ - /// Free memory occupied by `CardinalityEstimator` - #[inline] - fn drop(&mut self) { - unsafe { self.representation().drop() }; - } -} - impl PartialEq for CardinalityEstimator where T: Hash + ?Sized, @@ -181,72 +180,100 @@ where } } +#[cfg(feature = "with_serde")] +impl From> + for CardinalityEstimator +where + T: Hash + ?Sized, + H: Hasher + Default, +{ + fn from(rep: Representation) -> Self { + Self { + data: rep, + build_hasher: BuildHasherDefault::default(), + _phantom_data: PhantomData, + } + } +} + +#[cfg(feature = "with_serde")] +impl From> + for Representation +where + T: Hash + ?Sized, + H: Hasher + Default, +{ + fn from(est: CardinalityEstimator) -> Self { + est.data + } +} + #[cfg(test)] pub mod tests { use super::*; use test_case::test_case; - #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] - #[test_case(1 => "representation: Small(estimate: 1, size: 8), avg_err: 0.0000")] - #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] - #[test_case(3 => "representation: Array(estimate: 3, 
size: 24), avg_err: 0.0000")] - #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] - #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] - #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] - #[test_case(17 => "representation: Array(estimate: 17, size: 136), avg_err: 0.0000")] - #[test_case(28 => "representation: Array(estimate: 28, size: 136), avg_err: 0.0000")] - #[test_case(29 => "representation: Array(estimate: 29, size: 136), avg_err: 0.0000")] - #[test_case(56 => "representation: Array(estimate: 56, size: 264), avg_err: 0.0000")] - #[test_case(57 => "representation: Array(estimate: 57, size: 264), avg_err: 0.0000")] - #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] - #[test_case(129 => "representation: Hll(estimate: 131, size: 660), avg_err: 0.0001")] - #[test_case(256 => "representation: Hll(estimate: 264, size: 660), avg_err: 0.0119")] - #[test_case(512 => "representation: Hll(estimate: 512, size: 660), avg_err: 0.0151")] - #[test_case(1024 => "representation: Hll(estimate: 1033, size: 660), avg_err: 0.0172")] - #[test_case(10_000 => "representation: Hll(estimate: 10417, size: 660), avg_err: 0.0281")] - #[test_case(100_000 => "representation: Hll(estimate: 93099, size: 660), avg_err: 0.0351")] + #[test_case(0 => "representation: Small(estimate: 0), avg_err: 0.0000")] + #[test_case(1 => "representation: Small(estimate: 1), avg_err: 0.0000")] + #[test_case(2 => "representation: Small(estimate: 2), avg_err: 0.0000")] + #[test_case(3 => "representation: Array(estimate: 3), avg_err: 0.0000")] + #[test_case(4 => "representation: Array(estimate: 4), avg_err: 0.0000")] + #[test_case(8 => "representation: Array(estimate: 8), avg_err: 0.0000")] + #[test_case(16 => "representation: Array(estimate: 16), avg_err: 0.0000")] + #[test_case(17 => "representation: Array(estimate: 17), avg_err: 0.0000")] + #[test_case(28 => "representation: 
Array(estimate: 28), avg_err: 0.0000")] + #[test_case(29 => "representation: Array(estimate: 29), avg_err: 0.0000")] + #[test_case(56 => "representation: Array(estimate: 56), avg_err: 0.0000")] + #[test_case(57 => "representation: Array(estimate: 57), avg_err: 0.0000")] + #[test_case(128 => "representation: Array(estimate: 128), avg_err: 0.0000")] + #[test_case(129 => "representation: Hll(estimate: 131), avg_err: 0.0001")] + #[test_case(256 => "representation: Hll(estimate: 264), avg_err: 0.0119")] + #[test_case(512 => "representation: Hll(estimate: 512), avg_err: 0.0151")] + #[test_case(1024 => "representation: Hll(estimate: 1033), avg_err: 0.0172")] + #[test_case(10_000 => "representation: Hll(estimate: 10417), avg_err: 0.0281")] + #[test_case(100_000 => "representation: Hll(estimate: 93099), avg_err: 0.0351")] fn test_estimator_p10_w5(n: usize) -> String { evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) } - #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] - #[test_case(1 => "representation: Small(estimate: 1, size: 8), avg_err: 0.0000")] - #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] - #[test_case(3 => "representation: Array(estimate: 3, size: 24), avg_err: 0.0000")] - #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] - #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] - #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] - #[test_case(32 => "representation: Array(estimate: 32, size: 136), avg_err: 0.0000")] - #[test_case(64 => "representation: Array(estimate: 64, size: 264), avg_err: 0.0000")] - #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] - #[test_case(129 => "representation: Hll(estimate: 130, size: 3092), avg_err: 0.0001")] - #[test_case(256 => "representation: Hll(estimate: 254, size: 3092), avg_err: 0.0029")] - #[test_case(512 => 
"representation: Hll(estimate: 498, size: 3092), avg_err: 0.0068")] - #[test_case(1024 => "representation: Hll(estimate: 1012, size: 3092), avg_err: 0.0130")] - #[test_case(4096 => "representation: Hll(estimate: 4105, size: 3092), avg_err: 0.0089")] - #[test_case(10_000 => "representation: Hll(estimate: 10068, size: 3092), avg_err: 0.0087")] - #[test_case(100_000 => "representation: Hll(estimate: 95628, size: 3092), avg_err: 0.0182")] + #[test_case(0 => "representation: Small(estimate: 0), avg_err: 0.0000")] + #[test_case(1 => "representation: Small(estimate: 1), avg_err: 0.0000")] + #[test_case(2 => "representation: Small(estimate: 2), avg_err: 0.0000")] + #[test_case(3 => "representation: Array(estimate: 3), avg_err: 0.0000")] + #[test_case(4 => "representation: Array(estimate: 4), avg_err: 0.0000")] + #[test_case(8 => "representation: Array(estimate: 8), avg_err: 0.0000")] + #[test_case(16 => "representation: Array(estimate: 16), avg_err: 0.0000")] + #[test_case(32 => "representation: Array(estimate: 32), avg_err: 0.0000")] + #[test_case(64 => "representation: Array(estimate: 64), avg_err: 0.0000")] + #[test_case(128 => "representation: Array(estimate: 128), avg_err: 0.0000")] + #[test_case(129 => "representation: Hll(estimate: 130), avg_err: 0.0001")] + #[test_case(256 => "representation: Hll(estimate: 254), avg_err: 0.0029")] + #[test_case(512 => "representation: Hll(estimate: 498), avg_err: 0.0068")] + #[test_case(1024 => "representation: Hll(estimate: 1012), avg_err: 0.0130")] + #[test_case(4096 => "representation: Hll(estimate: 4105), avg_err: 0.0089")] + #[test_case(10_000 => "representation: Hll(estimate: 10068), avg_err: 0.0087")] + #[test_case(100_000 => "representation: Hll(estimate: 95628), avg_err: 0.0182")] fn test_estimator_p12_w6(n: usize) -> String { evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) } - #[test_case(0 => "representation: Small(estimate: 0, size: 8), avg_err: 0.0000")] - #[test_case(1 => "representation: 
Small(estimate: 1, size: 8), avg_err: 0.0000")] - #[test_case(2 => "representation: Small(estimate: 2, size: 8), avg_err: 0.0000")] - #[test_case(3 => "representation: Array(estimate: 3, size: 24), avg_err: 0.0000")] - #[test_case(4 => "representation: Array(estimate: 4, size: 24), avg_err: 0.0000")] - #[test_case(8 => "representation: Array(estimate: 8, size: 40), avg_err: 0.0000")] - #[test_case(16 => "representation: Array(estimate: 16, size: 72), avg_err: 0.0000")] - #[test_case(32 => "representation: Array(estimate: 32, size: 136), avg_err: 0.0000")] - #[test_case(64 => "representation: Array(estimate: 64, size: 264), avg_err: 0.0000")] - #[test_case(128 => "representation: Array(estimate: 128, size: 520), avg_err: 0.0000")] - #[test_case(129 => "representation: Hll(estimate: 129, size: 196628), avg_err: 0.0000")] - #[test_case(256 => "representation: Hll(estimate: 256, size: 196628), avg_err: 0.0000")] - #[test_case(512 => "representation: Hll(estimate: 511, size: 196628), avg_err: 0.0004")] - #[test_case(1024 => "representation: Hll(estimate: 1022, size: 196628), avg_err: 0.0014")] - #[test_case(4096 => "representation: Hll(estimate: 4100, size: 196628), avg_err: 0.0009")] - #[test_case(10_000 => "representation: Hll(estimate: 10007, size: 196628), avg_err: 0.0008")] - #[test_case(100_000 => "representation: Hll(estimate: 100240, size: 196628), avg_err: 0.0011")] + #[test_case(0 => "representation: Small(estimate: 0), avg_err: 0.0000")] + #[test_case(1 => "representation: Small(estimate: 1), avg_err: 0.0000")] + #[test_case(2 => "representation: Small(estimate: 2), avg_err: 0.0000")] + #[test_case(3 => "representation: Array(estimate: 3), avg_err: 0.0000")] + #[test_case(4 => "representation: Array(estimate: 4), avg_err: 0.0000")] + #[test_case(8 => "representation: Array(estimate: 8), avg_err: 0.0000")] + #[test_case(16 => "representation: Array(estimate: 16), avg_err: 0.0000")] + #[test_case(32 => "representation: Array(estimate: 32), avg_err: 0.0000")] + 
#[test_case(64 => "representation: Array(estimate: 64), avg_err: 0.0000")] + #[test_case(128 => "representation: Array(estimate: 128), avg_err: 0.0000")] + #[test_case(129 => "representation: Hll(estimate: 129), avg_err: 0.0000")] + #[test_case(256 => "representation: Hll(estimate: 256), avg_err: 0.0000")] + #[test_case(512 => "representation: Hll(estimate: 511), avg_err: 0.0004")] + #[test_case(1024 => "representation: Hll(estimate: 1022), avg_err: 0.0014")] + #[test_case(4096 => "representation: Hll(estimate: 4100), avg_err: 0.0009")] + #[test_case(10_000 => "representation: Hll(estimate: 10007), avg_err: 0.0008")] + #[test_case(100_000 => "representation: Hll(estimate: 100240), avg_err: 0.0011")] fn test_estimator_p18_w6(n: usize) -> String { evaluate_cardinality_estimator(CardinalityEstimator::::new(), n) } @@ -284,38 +311,38 @@ pub mod tests { ) } - #[test_case(0, 0 => "Small(estimate: 0, size: 8)")] - #[test_case(0, 1 => "Small(estimate: 1, size: 8)")] - #[test_case(1, 0 => "Small(estimate: 1, size: 8)")] - #[test_case(1, 1 => "Small(estimate: 2, size: 8)")] - #[test_case(1, 2 => "Array(estimate: 3, size: 24)")] - #[test_case(2, 1 => "Array(estimate: 3, size: 24)")] - #[test_case(2, 2 => "Array(estimate: 4, size: 24)")] - #[test_case(2, 3 => "Array(estimate: 5, size: 40)")] - #[test_case(2, 4 => "Array(estimate: 6, size: 40)")] - #[test_case(4, 2 => "Array(estimate: 6, size: 40)")] - #[test_case(3, 2 => "Array(estimate: 5, size: 40)")] - #[test_case(3, 3 => "Array(estimate: 6, size: 40)")] - #[test_case(3, 4 => "Array(estimate: 7, size: 40)")] - #[test_case(4, 3 => "Array(estimate: 7, size: 40)")] - #[test_case(4, 4 => "Array(estimate: 8, size: 40)")] - #[test_case(4, 8 => "Array(estimate: 12, size: 72)")] - #[test_case(8, 4 => "Array(estimate: 12, size: 72)")] - #[test_case(4, 12 => "Array(estimate: 16, size: 72)")] - #[test_case(12, 4 => "Array(estimate: 16, size: 72)")] - #[test_case(1, 127 => "Array(estimate: 128, size: 520)")] - #[test_case(1, 128 => 
"Hll(estimate: 130, size: 3092)")] - #[test_case(127, 1 => "Array(estimate: 128, size: 520)")] - #[test_case(128, 1 => "Hll(estimate: 130, size: 3092)")] - #[test_case(128, 128 => "Hll(estimate: 254, size: 3092)")] - #[test_case(512, 512 => "Hll(estimate: 1012, size: 3092)")] - #[test_case(10000, 0 => "Hll(estimate: 10068, size: 3092)")] - #[test_case(0, 10000 => "Hll(estimate: 10068, size: 3092)")] - #[test_case(4, 10000 => "Hll(estimate: 10068, size: 3092)")] - #[test_case(10000, 4 => "Hll(estimate: 10068, size: 3092)")] - #[test_case(17, 10000 => "Hll(estimate: 10073, size: 3092)")] - #[test_case(10000, 17 => "Hll(estimate: 10073, size: 3092)")] - #[test_case(10000, 10000 => "Hll(estimate: 19974, size: 3092)")] + #[test_case(0, 0 => "Small(estimate: 0)")] + #[test_case(0, 1 => "Small(estimate: 1)")] + #[test_case(1, 0 => "Small(estimate: 1)")] + #[test_case(1, 1 => "Small(estimate: 2)")] + #[test_case(1, 2 => "Array(estimate: 3)")] + #[test_case(2, 1 => "Array(estimate: 3)")] + #[test_case(2, 2 => "Array(estimate: 4)")] + #[test_case(2, 3 => "Array(estimate: 5)")] + #[test_case(2, 4 => "Array(estimate: 6)")] + #[test_case(4, 2 => "Array(estimate: 6)")] + #[test_case(3, 2 => "Array(estimate: 5)")] + #[test_case(3, 3 => "Array(estimate: 6)")] + #[test_case(3, 4 => "Array(estimate: 7)")] + #[test_case(4, 3 => "Array(estimate: 7)")] + #[test_case(4, 4 => "Array(estimate: 8)")] + #[test_case(4, 8 => "Array(estimate: 12)")] + #[test_case(8, 4 => "Array(estimate: 12)")] + #[test_case(4, 12 => "Array(estimate: 16)")] + #[test_case(12, 4 => "Array(estimate: 16)")] + #[test_case(1, 127 => "Array(estimate: 128)")] + #[test_case(1, 128 => "Hll(estimate: 130)")] + #[test_case(127, 1 => "Array(estimate: 128)")] + #[test_case(128, 1 => "Hll(estimate: 130)")] + #[test_case(128, 128 => "Hll(estimate: 254)")] + #[test_case(512, 512 => "Hll(estimate: 1012)")] + #[test_case(10000, 0 => "Hll(estimate: 10068)")] + #[test_case(0, 10000 => "Hll(estimate: 10068)")] + #[test_case(4, 
10000 => "Hll(estimate: 10068)")] + #[test_case(10000, 4 => "Hll(estimate: 10068)")] + #[test_case(17, 10000 => "Hll(estimate: 10073)")] + #[test_case(10000, 17 => "Hll(estimate: 10073)")] + #[test_case(10000, 10000 => "Hll(estimate: 19974)")] fn test_merge(lhs_n: usize, rhs_n: usize) -> String { let mut lhs = CardinalityEstimator::::new(); for i in 0..lhs_n { diff --git a/src/hyperloglog.rs b/src/hyperloglog.rs index c6e8a28..972a9b4 100644 --- a/src/hyperloglog.rs +++ b/src/hyperloglog.rs @@ -14,20 +14,24 @@ //! - data[2..] - stores register ranks using `W` bits per each register. use std::fmt::{Debug, Formatter}; -use std::mem::{size_of, size_of_val}; -use std::slice; +use std::mem::size_of_val; -use crate::representation::{RepresentationTrait, REPRESENTATION_HLL}; +use crate::representation::{Representation, RepresentationTrait}; +#[cfg(feature = "with_serde")] +use serde::{Deserialize, Serialize}; -/// Mask used for accessing heap allocated data stored at the pointer in `data` field. 
-const PTR_MASK: usize = !3; - -#[derive(PartialEq)] -pub(crate) struct HyperLogLog<'a, const P: usize = 12, const W: usize = 6> { - pub(crate) data: &'a mut [u32], +#[derive(Clone, PartialEq)] +#[cfg_attr(feature = "with_serde", derive(Serialize, Deserialize))] +pub(crate) struct HyperLogLog { + #[cfg_attr(feature = "with_serde", serde(rename = "z"))] + zeros: u32, + #[cfg_attr(feature = "with_serde", serde(rename = "s"))] + harmonic_sum: f32, + #[cfg_attr(feature = "with_serde", serde(rename = "r"))] + pub(crate) registers: Vec, } -impl HyperLogLog<'_, P, W> { +impl HyperLogLog { /// Number of HyperLogLog registers const M: usize = 1 << P; /// HyperLogLog representation `u32` slice length based on #registers, stored zero registers, harmonic sum, and @@ -37,16 +41,17 @@ impl HyperLogLog<'_, P, W> { /// Create new instance of `HyperLogLog` representation from items #[inline] pub(crate) fn new(items: &[u32]) -> Self { - let mut hll_data = vec![0u32; Self::HLL_SLICE_LEN]; - let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; - std::mem::forget(hll_data); - let mut hll = Self::from(data); - - hll.data[0] = Self::M as u32; - hll.data[1] = (Self::M as f32).to_bits(); + // TODO: this is wrong, need to actually compute things + let mut hll = Self { + zeros: Self::M as u32, + harmonic_sum: Self::M as f32, + registers: vec![0; Self::HLL_SLICE_LEN], + }; for &h in items.iter() { - hll.insert_encoded_hash(h); + if hll.insert_encoded_hash(h).is_some() { + panic!("inserting into hll rep must yield none"); + }; } hll @@ -73,10 +78,12 @@ impl HyperLogLog<'_, P, W> { #[inline] fn get_register(&self, idx: u32) -> u32 { let bit_idx = (idx as usize) * W; - let u32_idx = (bit_idx / 32) + 2; + let u32_idx = bit_idx / 32; let bit_pos = bit_idx % 32; - // SAFETY: `self.data` is always guaranteed to have these elements. 
- let bits = unsafe { self.data.get_unchecked(u32_idx..u32_idx + 2) }; + let bits = self + .registers + .get(u32_idx..u32_idx + 2) + .expect("get_register: `self.registers` is always guaranteed to have these elements."); let bits_1 = W.min(32 - bit_pos); let bits_2 = W - bits_1; let mask_1 = (1 << bits_1) - 1; @@ -89,10 +96,12 @@ impl HyperLogLog<'_, P, W> { #[inline] fn set_register(&mut self, idx: u32, old_rank: u32, new_rank: u32) { let bit_idx = (idx as usize) * W; - let u32_idx = (bit_idx / 32) + 2; + let u32_idx = bit_idx / 32; let bit_pos = bit_idx % 32; - // SAFETY: `self.data` is always guaranteed to have these elements. - let bits = unsafe { self.data.get_unchecked_mut(u32_idx..u32_idx + 2) }; + let bits = self + .registers + .get_mut(u32_idx..u32_idx + 2) + .expect("set_register: `self.registers` is always guaranteed to have these elements."); let bits_1 = W.min(32 - bit_pos); let bits_2 = W - bits_1; let mask_1 = (1 << bits_1) - 1; @@ -105,14 +114,10 @@ impl HyperLogLog<'_, P, W> { bits[1] |= (new_rank >> bits_1) & mask_2; // Update HyperLogLog's number of zero registers and harmonic sum - // SAFETY: `self.data` is always guaranteed to have 0-th and 1-st elements. - let zeros_and_sum = unsafe { self.data.get_unchecked_mut(0..2) }; - zeros_and_sum[0] -= u32::from(old_rank == 0) & u32::from(zeros_and_sum[0] > 0); - let mut sum = f32::from_bits(zeros_and_sum[1]); - sum -= 1.0 / ((1u64 << u64::from(old_rank)) as f32); - sum += 1.0 / ((1u64 << u64::from(new_rank)) as f32); - zeros_and_sum[1] = sum.to_bits(); + self.zeros -= u32::from(old_rank == 0) & u32::from(self.zeros > 0); + self.harmonic_sum -= 1.0 / ((1u64 << u64::from(old_rank)) as f32); + self.harmonic_sum += 1.0 / ((1u64 << u64::from(new_rank)) as f32); } /// Merge two `HyperLogLog` representations. 
@@ -128,21 +133,20 @@ impl HyperLogLog<'_, P, W> { } } -impl RepresentationTrait for HyperLogLog<'_, P, W> { +impl RepresentationTrait for HyperLogLog { /// Insert encoded hash into `HyperLogLog` representation. #[inline] - fn insert_encoded_hash(&mut self, h: u32) -> usize { + fn insert_encoded_hash(&mut self, h: u32) -> Option> { let (idx, rank) = Self::decode_hash(h); self.update_rank(idx, rank); - self.to_data() + None } /// Return cardinality estimate of `HyperLogLog` representation #[inline] fn estimate(&self) -> usize { - // SAFETY: `self.data` is always guaranteed to have 0-th and 1-st elements. - let zeros = unsafe { *self.data.get_unchecked(0) }; - let sum = f64::from(f32::from_bits(unsafe { *self.data.get_unchecked(1) })); + let zeros = self.zeros; + let sum = f64::from(self.harmonic_sum); let estimate = alpha(Self::M) * ((Self::M * (Self::M - zeros as usize)) as f64) / (sum + beta_horner(f64::from(zeros), P)); (estimate + 0.5) as usize @@ -151,56 +155,19 @@ impl RepresentationTrait for HyperLogLog<'_, P, /// Return memory size of `HyperLogLog` #[inline] fn size_of(&self) -> usize { - size_of::() + size_of_val(self.data) - } - - /// Free memory occupied by the `HyperLogLog` representation - /// SAFETY: caller of this method must ensure that `self.data` holds valid slice elements. - #[inline] - unsafe fn drop(&mut self) { - drop(Box::from_raw(self.data)); - } - - /// Convert `HyperLogLog` representation to `data` - #[inline] - fn to_data(&self) -> usize { - (PTR_MASK & self.data.as_ptr() as usize) | REPRESENTATION_HLL + size_of_val(self) } } -impl From for HyperLogLog<'_, P, W> { - /// Create new instance of `HyperLogLog` from given `data` - #[inline] - fn from(data: usize) -> Self { - let ptr = (data & PTR_MASK) as *mut u32; - // SAFETY: caller of this method must ensure that `data` contains valid slice pointer. 
- let data = unsafe { slice::from_raw_parts_mut(ptr, Self::HLL_SLICE_LEN) }; - Self { data } - } -} - -impl From> for HyperLogLog<'_, P, W> { +impl From> for HyperLogLog { /// Create new instance of `HyperLogLog` from given `hll_data` #[inline] - fn from(mut hll_data: Vec) -> Self { - let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; - std::mem::forget(hll_data); - Self::from(data) - } -} - -impl Clone for HyperLogLog<'_, P, W> { - /// Clone `HyperLogLog` representation - #[inline] - fn clone(&self) -> Self { - let mut hll_data = self.data.to_vec(); - let data = (PTR_MASK & hll_data.as_mut_ptr() as usize) | 3; - std::mem::forget(hll_data); - Self::from(data) + fn from(hll_data: Vec) -> Self { + Self::new(&hll_data) } } -impl Debug for HyperLogLog<'_, P, W> { +impl Debug for HyperLogLog { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(&self.to_string()) } @@ -406,6 +373,6 @@ mod tests { #[test] fn hyerloglog_size() { - assert_eq!(std::mem::size_of::>(), 16); + assert_eq!(std::mem::size_of::>(), 32); } } diff --git a/src/representation.rs b/src/representation.rs index bc6e1f0..f3dca88 100644 --- a/src/representation.rs +++ b/src/representation.rs @@ -1,105 +1,47 @@ -use std::hash::{Hash, Hasher}; - use enum_dispatch::enum_dispatch; -use crate::array::{Array, MAX_CAPACITY}; +use crate::array::Array; use crate::hyperloglog::HyperLogLog; -use crate::representation::RepresentationError::*; use crate::small::Small; -use crate::CardinalityEstimator; - -/// Masks used for storing and retrieving representation type stored in lowest 2 bits of `data` field. 
-pub(crate) const REPRESENTATION_MASK: usize = 0x0000_0000_0000_0003; -pub(crate) const REPRESENTATION_SMALL: usize = 0x0000_0000_0000_0000; -pub(crate) const REPRESENTATION_ARRAY: usize = 0x0000_0000_0000_0001; -pub(crate) const REPRESENTATION_HLL: usize = 0x0000_0000_0000_0003; +#[cfg(feature = "with_serde")] +use serde::{Deserialize, Serialize}; /// Representation types supported by `CardinalityEstimator` #[repr(u8)] #[derive(Debug, PartialEq)] +#[cfg_attr(feature = "with_serde", derive(Serialize, Deserialize))] #[enum_dispatch] -pub(crate) enum Representation<'a, const P: usize, const W: usize> { +pub(crate) enum Representation { + #[cfg_attr(feature = "with_serde", serde(rename = "s"))] Small(Small), - Array(Array<'a, P, W>), - Hll(HyperLogLog<'a, P, W>), + #[cfg_attr(feature = "with_serde", serde(rename = "a"))] + Array(Array), + #[cfg_attr(feature = "with_serde", serde(rename = "h"))] + Hll(HyperLogLog), } /// Representation trait which must be implemented by all representations. #[enum_dispatch(Representation)] -pub(crate) trait RepresentationTrait { - fn insert_encoded_hash(&mut self, h: u32) -> usize; +pub(crate) trait RepresentationTrait { + fn insert_encoded_hash(&mut self, h: u32) -> Option>; fn estimate(&self) -> usize; fn size_of(&self) -> usize; - unsafe fn drop(&mut self); - fn to_data(&self) -> usize; fn to_string(&self) -> String { - format!("estimate: {}, size: {}", self.estimate(), self.size_of()) + format!("estimate: {}", self.estimate()) } } -/// Representation error -#[derive(Debug)] -pub enum RepresentationError { - InvalidRepresentation, - SmallRepresentationInvalid, - ArrayRepresentationInvalid, - HllRepresentationInvalid, -} - -impl Representation<'_, P, W> { - /// Returns the representation type of `CardinalityEstimator`. - /// - /// This method extracts the representation based on the lowest 2 bits of `data`. 
- /// - /// Valid encodings: - /// - `0` for `Small` representation - /// - `1` for `Array` representation - /// - `3` for `HLL` representation - /// - /// If `data` is not encoded as 0, 1, or 3, the function defaults to `Small` with value of 0 - /// as a safe fallback to handle unexpected conditions. - #[inline] - pub(crate) fn from_data(data: usize) -> Self { - match data & REPRESENTATION_MASK { - REPRESENTATION_SMALL => Representation::Small(Small::from(data)), - REPRESENTATION_ARRAY => Representation::Array(Array::from(data)), - REPRESENTATION_HLL => Representation::Hll(HyperLogLog::::from(data)), - _ => Representation::Small(Small::from(0)), +impl Representation { + pub fn iec(&mut self, h: u32) { + if let Some(mut upgraded) = self.insert_encoded_hash(h) { + std::mem::swap(self, &mut upgraded) } } +} - /// Create new cardinality estimator from data and optional vector - pub fn try_from( - data: usize, - opt_vec: Option>, - ) -> Result, RepresentationError> - where - T: Hash + ?Sized, - H: Hasher + Default, - { - let mut estimator = CardinalityEstimator::::new(); - estimator.data = match data & REPRESENTATION_MASK { - REPRESENTATION_SMALL if opt_vec.is_some() => return Err(SmallRepresentationInvalid), - REPRESENTATION_SMALL => Small::::from(data).to_data(), - REPRESENTATION_ARRAY => { - let vec = opt_vec.ok_or(ArrayRepresentationInvalid)?; - let len = vec.len(); - if len <= 2 || len > MAX_CAPACITY { - return Err(ArrayRepresentationInvalid); - } - Array::::from_vec(vec, len).to_data() - } - REPRESENTATION_HLL => { - let vec = opt_vec.ok_or(HllRepresentationInvalid)?; - if vec.len() != HyperLogLog::::HLL_SLICE_LEN { - return Err(HllRepresentationInvalid); - } - HyperLogLog::::from(vec).to_data() - } - _ => return Err(InvalidRepresentation), - }; - - Ok(estimator) +impl Default for Representation { + fn default() -> Self { + Representation::Small(Default::default()) } } @@ -109,6 +51,6 @@ mod tests { #[test] fn small_size() { - assert_eq!(std::mem::size_of::>(), 
32); + assert_eq!(std::mem::size_of::>(), 40); } } diff --git a/src/serde.rs b/src/serde.rs index 3b4e4bc..f97d095 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,87 +1,10 @@ //! # Serde module for CardinalityEstimator //! -//! This module provides serde-based (serialization and deserialization) features for -//! `CardinalityEstimator`. It uses `serde`'s custom serialization and deserialization mechanisms. -//! -//! `CardinalityEstimator` has a usize field, `data`, and an optional `Vec` hidden behind a -//! pointer within `data`. During serialization, these fields are converted into a tuple: -//! `(data, Option>)`. -//! -//! During deserialization, the tuple is converted back into the `CardinalityEstimator` struct, -//! handling the case where the `Vec` may be `None` (indicating a "small" estimator). -//! -//! This allows `CardinalityEstimator` to be easily serialized/deserialized, for storage, -//! transmission, and reconstruction. -//! -//! Refer to the serde documentation for more details on custom serialization and deserialization: -//! - [Serialization](https://serde.rs/impl-serialize.html) -//! - [Deserialization](https://serde.rs/impl-deserialize.html) -use std::hash::{Hash, Hasher}; -use std::ops::Deref; - -use serde::de::Error; -use serde::ser::SerializeTuple; -use serde::{Deserialize, Serialize}; - -use crate::estimator::CardinalityEstimator; -use crate::representation::Representation; - -impl Serialize for CardinalityEstimator -where - T: Hash + ?Sized, - H: Hasher + Default, -{ - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - // Begin a new serialized tuple with two elements. - let mut tup = serializer.serialize_tuple(2)?; - - // The first element is the data field of the estimator. - tup.serialize_element(&self.data)?; - match self.representation() { - Representation::Small(_) => { - // If the estimator is small, the second element is a None value. 
This indicates that - // the estimator is using the small data optimization and has no separate slice data. - tup.serialize_element(&None::>)?; - } - Representation::Array(arr) => { - // If the estimator is slice, the second element is a option containing slice data. - tup.serialize_element(&Some(arr.deref()))?; - } - Representation::Hll(hll) => { - // If the estimator is HLL, the second element is a option containing HLL data. - tup.serialize_element(&Some(hll.data))?; - } - } - - // Finalize the tuple. - tup.end() - } -} - -impl<'de, T, H, const P: usize, const W: usize> Deserialize<'de> - for CardinalityEstimator -where - T: Hash + ?Sized, - H: Hasher + Default, -{ - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - // Deserialize the tuple that was serialized by the serialize method. The first element - // of the tuple is the data field of the estimator, and the second element is an Option - // that contains the array data if the estimator is not small. - let (data, opt_vec): (usize, Option>) = Deserialize::deserialize(deserializer)?; - Representation::try_from(data, opt_vec).map_err(|e| Error::custom(format!("{:?}", e))) - } -} +//! This module now only provides basic tests for derived serialization and deserialization. 
#[cfg(test)] pub mod tests { - use super::*; + use crate::estimator::CardinalityEstimator; use test_case::test_case; #[test_case(0; "empty set")] @@ -110,6 +33,23 @@ pub mod tests { original_estimator.representation(), deserialized_estimator.representation() ); + + // run each case with postcard serialization as well + + let postcard_serialized = + postcard::to_allocvec(&original_estimator).expect("serialization failed"); + assert!( + !postcard_serialized.is_empty(), + "postcard_serialized bytes should not be empty" + ); + + let postcard_estimator: CardinalityEstimator = + postcard::from_bytes(&postcard_serialized).expect("deserialization failed"); + + assert_eq!( + original_estimator.representation(), + postcard_estimator.representation() + ); } #[test] @@ -130,5 +70,8 @@ pub mod tests { fn test_failed_deserialization(input: &[u8]) { let result: Result, _> = serde_json::from_slice(input); assert!(result.is_err()); + + let result: Result, _> = postcard::from_bytes(input); + assert!(result.is_err()); } } diff --git a/src/small.rs b/src/small.rs index acb15a0..c4f8a1f 100644 --- a/src/small.rs +++ b/src/small.rs @@ -9,14 +9,17 @@ use std::fmt::{Debug, Formatter}; use crate::array::Array; -use crate::representation::{RepresentationTrait, REPRESENTATION_SMALL}; +use crate::representation::{Representation, RepresentationTrait}; +#[cfg(feature = "with_serde")] +use serde::{Deserialize, Serialize}; /// Mask used for extracting hashes stored in small representation (31 bits) -const SMALL_MASK: usize = 0x0000_0000_7fff_ffff; +const SMALL_MASK: u64 = 0x0000_0000_7fff_ffff; /// Small representation container -#[derive(PartialEq)] -pub(crate) struct Small(usize); +#[derive(PartialEq, Default)] +#[cfg_attr(feature = "with_serde", derive(Serialize, Deserialize))] +pub(crate) struct Small(u64); impl Small { /// Insert encoded hash into `Small` representation. 
@@ -25,7 +28,7 @@ impl Small { pub(crate) fn insert(&mut self, h: u32) -> bool { let h1 = self.h1(); if h1 == 0 { - self.0 |= (h as usize) << 2; + self.0 |= u64::from(h) << 2; return true; } else if h1 == h { return true; @@ -33,7 +36,7 @@ impl Small { let h2 = self.h2(); if h2 == 0 { - self.0 |= (h as usize) << 33; + self.0 |= u64::from(h) << 33; return true; } else if h2 == h { return true; @@ -61,16 +64,16 @@ impl Small { } } -impl RepresentationTrait for Small { +impl RepresentationTrait for Small { /// Insert encoded hash into `Small` representation. - fn insert_encoded_hash(&mut self, h: u32) -> usize { + fn insert_encoded_hash(&mut self, h: u32) -> Option> { if self.insert(h) { - self.to_data() + None } else { // upgrade from `Small` to `Array` representation let items = self.items(); - let arr = Array::::from_vec(vec![items[0], items[1], h, 0], 3); - arr.to_data() + let arr = Array::::from_vec(vec![items[0], items[1], h], 3); + Some(Representation::Array(arr)) } } @@ -88,16 +91,6 @@ impl RepresentationTrait for Small { fn size_of(&self) -> usize { std::mem::size_of::() } - - /// Free memory occupied by the `Small` representation - #[inline] - unsafe fn drop(&mut self) {} - - /// Convert `Small` representation to `data` - #[inline] - fn to_data(&self) -> usize { - self.0 | REPRESENTATION_SMALL - } } impl Debug for Small { @@ -106,9 +99,9 @@ impl Debug for Small { } } -impl From for Small { +impl From for Small { /// Create new instance of `Small` from given `data` - fn from(data: usize) -> Self { + fn from(data: u64) -> Self { Self(data) } }