9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -18,6 +18,9 @@ jobs:
- name: Run cargo check
run: cargo check

- name: Run cargo check with serde
run: cargo check --features with_serde

test:
name: Test Suite
runs-on: ubuntu-latest
@@ -31,6 +34,9 @@
toolchain: stable

- name: Run cargo test
run: cargo test

- name: Run cargo test with serde
run: cargo test --features with_serde

lints:
@@ -50,4 +56,7 @@
run: cargo fmt --all -- --check

- name: Run cargo clippy
run: cargo clippy -- -D warnings

- name: Run cargo clippy with serde
run: cargo clippy --features with_serde -- -D warnings
16 changes: 9 additions & 7 deletions Cargo.toml
@@ -1,27 +1,29 @@
[package]
name = "cardinality-estimator"
version = "1.0.2"
name = "cardinality-estimator-safe"
version = "2.1.0"
edition = "2021"
authors = ["Alex Bocharov <bocharov.alexandr@gmail.com>"]
description = "A crate for estimating the cardinality of distinct elements in a stream or dataset."
documentation = "https://docs.rs/cardinality-estimator"
authors = ["Alex Bocharov <bocharov.alexandr@gmail.com>", "phil"]
description = "Estimate the cardinality of distinct elements in a stream or dataset with no unsafe code"
documentation = "https://docs.rs/cardinality-estimator-safe"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/cloudflare/cardinality-estimator"
repository = "https://github.com/uniphil/cardinality-estimator-safe"
keywords = ["cardinality", "distinct-count", "hyperloglog", "probabilistic", "sketch"]
categories = ["algorithms", "data-structures"]

[dependencies]
enum_dispatch = "0.3.13"
serde = { version = "1.0", optional = true }
serde = { version = "1.0", features = ["derive"], optional = true }
wyhash = "0.5.0"

[dev-dependencies]
amadeus-streaming = "0.4.3"
cardinality-estimator = "1.0.2"
criterion = { version = "0.5.0", features = ["html_reports"] }
dhat = "0.3.3"
hyperloglog = "1.0.2"
hyperloglogplus = "0.4.1"
postcard = { version = "1.1.1", features=["alloc"] }
pprof = { version = "0.14.0", features = ["flamegraph", "criterion", "protobuf-codec"] }
probabilistic-collections = "0.7.0"
rand = "0.8.5"
42 changes: 30 additions & 12 deletions README.md
@@ -1,26 +1,28 @@
# cardinality-estimator
![build](https://img.shields.io/github/actions/workflow/status/cloudflare/cardinality-estimator/ci.yml?branch=main)
[![docs.rs](https://docs.rs/cardinality-estimator/badge.svg)](https://docs.rs/cardinality-estimator)
[![crates.io](https://img.shields.io/crates/v/cardinality-estimator.svg)](https://crates.io/crates/cardinality-estimator)
# cardinality-estimator-safe
![build](https://img.shields.io/github/actions/workflow/status/uniphil/cardinality-estimator-safe/ci.yml?branch=main)
[![docs.rs](https://docs.rs/cardinality-estimator-safe/badge.svg)](https://docs.rs/cardinality-estimator-safe)
[![crates.io](https://img.shields.io/crates/v/cardinality-estimator-safe.svg)](https://crates.io/crates/cardinality-estimator-safe)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)

`cardinality-estimator-safe` is a fork of Cloudflare's `cardinality-estimator`, replacing its data representations with boring old owned data and eliminating all uses of `unsafe`. Its serialization formats are intended to be reasonable with `serde_json`, and efficient with `bincode`.
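
A minimal round-trip sketch of the serde integration (assuming the `with_serde` feature enables both `Serialize` and `Deserialize`, as the fuzz targets suggest, and using `serde_json` as the example codec):
```rust
use cardinality_estimator_safe::CardinalityEstimator;

let mut estimator = CardinalityEstimator::<usize>::new();
for i in 0..1000 {
    estimator.insert(&i);
}

// Serialize to human-readable JSON, then restore and compare estimates.
let json = serde_json::to_string(&estimator).unwrap();
let restored: CardinalityEstimator<usize> = serde_json::from_str(&json).unwrap();
assert_eq!(restored.estimate(), estimator.estimate());
```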

`cardinality-estimator` is a Rust crate designed to estimate the number of distinct elements in a stream or dataset in an efficient manner.
This library uses HyperLogLog++ with an optimized low memory footprint and high accuracy approach, suitable for large-scale data analysis tasks.
This library uses HyperLogLog++ ~~with an optimized low memory footprint~~ and a high-accuracy approach, suitable for large-scale data analysis tasks.
We're using `cardinality-estimator` for large-scale machine learning, computing cardinality features across multiple dimensions of the request.

## Overview
Our `cardinality-estimator` is highly efficient in terms of memory usage, latency, and accuracy.
Our `cardinality-estimator` is highly efficient in terms of ~~memory usage, latency, and~~ accuracy.
This is achieved by leveraging a combination of unique data structure design, efficient algorithms, and HyperLogLog++ for high cardinality ranges.

## Getting Started
To use `cardinality-estimator`, add it to your `Cargo.toml` under `[dependencies]`:
To use `cardinality-estimator-safe`, add it to your `Cargo.toml` under `[dependencies]`:
```toml
[dependencies]
cardinality-estimator = "1.0.0"
cardinality-estimator-safe = "2.1.0"
```
Then, import `cardinality-estimator` in your Rust program:
Then, import `cardinality-estimator-safe` in your Rust program:
```rust
use cardinality_estimator::CardinalityEstimator;
use cardinality_estimator_safe::CardinalityEstimator;

let mut estimator = CardinalityEstimator::<12, 6>::new();
estimator.insert("test");
@@ -37,15 +39,17 @@ The data is stored in three different representations - `Small`, `Array`, and `HLL`
For instance, for a cardinality of 0 to 2, only **8 bytes** of stack memory and 0 bytes of heap memory are used.
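
A sketch of those transitions, with thresholds mirrored from `examples/json.rs` (illustrative, not a specification):
```rust
use cardinality_estimator_safe::CardinalityEstimator;

let mut estimator = CardinalityEstimator::<usize>::new();
estimator.insert(&0); // `Small`: stored inline, no heap allocation
estimator.insert(&1); // still `Small` (covers cardinalities 0 to 2)
estimator.insert(&2); // third distinct item spills into the `Array` representation
for i in 3..1000 {
    estimator.insert(&i); // eventually promoted to the full `HLL` sketch
}
assert!(estimator.estimate() > 900); // HLL estimates are approximate
```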

## Low latency
The crate offers low latency by using auto-vectorization for slice operations via compiler hints to use SIMD instructions.
~~The crate offers low latency by using auto-vectorization for slice operations via compiler hints to use SIMD instructions.~~
The number of zero registers and registers' harmonic sum are stored and updated dynamically as more data is inserted, resulting in fast estimate operations.

## High accuracy
The cardinality-estimator achieves high accuracy by using precise counting for small cardinality ranges and HyperLogLog++ with LogLog-Beta bias correction for larger ranges.
`cardinality-estimator-safe` achieves high accuracy by using precise counting for small cardinality ranges and HyperLogLog++ with LogLog-Beta bias correction for larger ranges.
This provides expected error rates as low as 0.02% for large cardinalities.
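
For context, the LogLog-Beta estimate has roughly the following shape, where `m` is the register count, `z` the number of zero registers, and `beta` a precision-specific bias-correction polynomial in `ln(z + 1)`. This is a sketch of the published formula with `beta`'s coefficients omitted, not the crate's actual code:
```rust
/// Shape of the LogLog-Beta estimator; `beta` stands in for the
/// precision-specific correction polynomial (hypothetical placeholder).
fn estimate_loglog_beta(registers: &[u8], beta: impl Fn(f64) -> f64) -> f64 {
    let m = registers.len() as f64;
    let z = registers.iter().filter(|&&r| r == 0).count() as f64;
    // Harmonic sum of 2^(-register) across all registers; the crate keeps
    // this sum and the zero count updated incrementally on insert.
    let sum: f64 = registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
    let alpha = 0.7213 / (1.0 + 1.079 / m); // standard HLL alpha_m for large m
    alpha * m * (m - z) / (beta(z) + sum)
}
```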

## Benchmarks

Benchmarks are added to directly compare Cloudflare's `cardinality-estimator` with `cardinality-estimator-safe`. They are included beside Cloudflare's original benchmarks for context.

To run benchmarks you first need to install `cargo-criterion` binary:
```shell
cargo install cargo-criterion
@@ -73,6 +77,9 @@ Benchmarks presented below are executed on a Linux laptop with `13th Gen Intel(R)
### Memory usage
![Cardinality Estimators Memory Usage](benches/memory_bytes.png)

#### vs. cardinality-estimator-safe
![Cardinality Estimators Memory Usage](benches/memory_bytes_safe.png)

The table below compares the memory usage of different cardinality estimators.
The number in each cell represents `stack memory bytes / heap memory bytes / heap memory blocks` at each measured cardinality.

@@ -108,6 +115,10 @@ Note that `hyperloglogplus` implementation has particularly high memory usage e
### Insert performance
![Cardinality Estimators Insert Time](benches/insert_time.png)

#### vs cardinality-estimator-safe

![Cardinality Estimators Insert Time](benches/insert_time_safe.png)

The table below shows insert time in nanoseconds per element.

Our `cardinality-estimator` demonstrates the lowest insert time for most cardinalities.
@@ -140,6 +151,10 @@ Our `cardinality-estimator` demonstrates the lowest insert time for most of the
### Estimate performance
![Cardinality Estimators Estimate Time](benches/estimate_time.png)

#### vs cardinality-estimator-safe
![Cardinality Estimators Estimate Time](benches/estimate_time_safe.png)


The table below shows estimate time in nanoseconds per call.

Our `cardinality-estimator` shows the lowest estimate time for most cardinalities, especially small cardinalities up to 128.
@@ -175,6 +190,9 @@ Implementations `probabilistic-collections`, `hyperloglogplus` and `hyperloglogp
### Error rate
![Cardinality Estimators Error Rate](benches/error_rate.png)

#### vs cardinality-estimator-safe
![Cardinality Estimators Error Rate](benches/error_rate_safe.png)

The table below shows the average absolute relative error across 100 runs of the estimator on random elements at each cardinality.

Our `cardinality-estimator` performs on par with the `amadeus-streaming` and `hyperloglog` estimators, and has a notably lower error rate for cardinalities up to 128.
1 change: 1 addition & 0 deletions benches/analyze.py
@@ -60,6 +60,7 @@ def render_comparison(bench_results_path, df, operation, metric, yscale, ylim=No

colors = {
'cardinality-estimator': 'green',
'cardinality-estimator-safe': 'blue',
'amadeus-streaming': 'blue',
'probabilistic-collections': 'red',
'hyperloglog': 'purple',
72 changes: 52 additions & 20 deletions benches/cardinality_estimator.rs
@@ -4,6 +4,7 @@ static ALLOC: dhat::Alloc = dhat::Alloc;
use std::hash::{BuildHasherDefault, Hash};

use cardinality_estimator::CardinalityEstimator;
use cardinality_estimator_safe::CardinalityEstimator as CardinalityEstimatorSafe;
use criterion::measurement::WallTime;
use criterion::{
black_box, criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion, Throughput,
@@ -44,21 +45,23 @@ fn benchmark(c: &mut Criterion) {
for &cardinality in &cardinalities {
group.throughput(Throughput::Elements(cardinality.max(1) as u64));
bench_insert::<CardinalityEstimatorMut>(&mut group, cardinality);
bench_insert::<AmadeusStreamingEstimator>(&mut group, cardinality);
bench_insert::<ProbabilisticCollections>(&mut group, cardinality);
bench_insert::<HyperLogLog>(&mut group, cardinality);
bench_insert::<HyperLogLogPlus>(&mut group, cardinality);
bench_insert::<CardinalityEstimatorSafeMut>(&mut group, cardinality);
// bench_insert::<AmadeusStreamingEstimator>(&mut group, cardinality);
// bench_insert::<ProbabilisticCollections>(&mut group, cardinality);
// bench_insert::<HyperLogLog>(&mut group, cardinality);
// bench_insert::<HyperLogLogPlus>(&mut group, cardinality);
}
group.finish();

let mut group = c.benchmark_group("estimate");
group.throughput(Throughput::Elements(1));
for &cardinality in &cardinalities {
bench_estimate::<CardinalityEstimatorMut>(&mut group, cardinality);
bench_estimate::<AmadeusStreamingEstimator>(&mut group, cardinality);
bench_estimate::<ProbabilisticCollections>(&mut group, cardinality);
bench_estimate::<HyperLogLog>(&mut group, cardinality);
bench_estimate::<HyperLogLogPlus>(&mut group, cardinality);
bench_estimate::<CardinalityEstimatorSafeMut>(&mut group, cardinality);
// bench_estimate::<AmadeusStreamingEstimator>(&mut group, cardinality);
// bench_estimate::<ProbabilisticCollections>(&mut group, cardinality);
// bench_estimate::<HyperLogLog>(&mut group, cardinality);
// bench_estimate::<HyperLogLogPlus>(&mut group, cardinality);
}
group.finish();

@@ -67,10 +70,13 @@ fn benchmark(c: &mut Criterion) {
.map(|&cardinality| StatRecord {
cardinality,
cardinality_estimator: measure_allocations::<CardinalityEstimatorMut>(cardinality),
amadeus_streaming: measure_allocations::<AmadeusStreamingEstimator>(cardinality),
probabilistic_collections: measure_allocations::<ProbabilisticCollections>(cardinality),
hyperloglog: measure_allocations::<HyperLogLog>(cardinality),
hyperloglogplus: measure_allocations::<HyperLogLogPlus>(cardinality),
cardinality_estimator_safe: measure_allocations::<CardinalityEstimatorSafeMut>(
cardinality,
),
// amadeus_streaming: measure_allocations::<AmadeusStreamingEstimator>(cardinality),
// probabilistic_collections: measure_allocations::<ProbabilisticCollections>(cardinality),
// hyperloglog: measure_allocations::<HyperLogLog>(cardinality),
// hyperloglogplus: measure_allocations::<HyperLogLogPlus>(cardinality),
})
.collect();

@@ -86,10 +92,11 @@ fn benchmark(c: &mut Criterion) {
.map(|&cardinality| StatRecord {
cardinality,
cardinality_estimator: measure_error::<CardinalityEstimatorMut>(cardinality),
amadeus_streaming: measure_error::<AmadeusStreamingEstimator>(cardinality),
probabilistic_collections: measure_error::<ProbabilisticCollections>(cardinality),
hyperloglog: measure_error::<HyperLogLog>(cardinality),
hyperloglogplus: measure_error::<HyperLogLogPlus>(cardinality),
cardinality_estimator_safe: measure_error::<CardinalityEstimatorSafeMut>(cardinality),
// amadeus_streaming: measure_error::<AmadeusStreamingEstimator>(cardinality),
// probabilistic_collections: measure_error::<ProbabilisticCollections>(cardinality),
// hyperloglog: measure_error::<HyperLogLog>(cardinality),
// hyperloglogplus: measure_error::<HyperLogLogPlus>(cardinality),
})
.collect();

@@ -189,10 +196,11 @@ fn measure_error<E: CardinalityEstimatorTrait<usize>>(cardinality: usize) -> Str
struct StatRecord {
cardinality: usize,
cardinality_estimator: String,
amadeus_streaming: String,
probabilistic_collections: String,
hyperloglog: String,
hyperloglogplus: String,
cardinality_estimator_safe: String,
// amadeus_streaming: String,
// probabilistic_collections: String,
// hyperloglog: String,
// hyperloglogplus: String,
}

struct CardinalityEstimatorMut(CardinalityEstimator<usize>);
@@ -219,6 +227,30 @@ impl CardinalityEstimatorTrait<usize> for CardinalityEstimatorMut {
}
}

struct CardinalityEstimatorSafeMut(CardinalityEstimatorSafe<usize>);

impl CardinalityEstimatorTrait<usize> for CardinalityEstimatorSafeMut {
fn new() -> Self {
Self(CardinalityEstimatorSafe::new())
}

fn insert(&mut self, item: &usize) {
self.0.insert(item);
}

fn estimate(&mut self) -> usize {
self.0.estimate()
}

fn merge(&mut self, rhs: &Self) {
self.0.merge(&rhs.0);
}

fn name() -> String {
"cardinality-estimator-safe".to_string()
}
}

struct AmadeusStreamingEstimator(amadeus_streaming::HyperLogLog<usize>);

impl CardinalityEstimatorTrait<usize> for AmadeusStreamingEstimator {
Binary file added benches/error_rate_safe.png
Binary file added benches/estimate_time_safe.png
Binary file added benches/insert_time_safe.png
Binary file added benches/memory_bytes_safe.png
2 changes: 1 addition & 1 deletion examples/estimator.rs
@@ -1,4 +1,4 @@
use cardinality_estimator::CardinalityEstimator;
use cardinality_estimator_safe::CardinalityEstimator;

fn main() {
let mut estimator1 = CardinalityEstimator::<usize>::new();
40 changes: 40 additions & 0 deletions examples/json.rs
@@ -0,0 +1,40 @@
#[cfg(feature = "with_serde")]
fn main() {
let mut estimator =
cardinality_estimator_safe::CardinalityEstimator::<usize, wyhash::WyHash, 8, 5>::new();

println!(
"serialized empty estimator (small): {}",
serde_json::to_string_pretty(&estimator).unwrap()
);

estimator.insert(&0);

println!(
"serialized with one insert (small): {}",
serde_json::to_string_pretty(&estimator).unwrap()
);

estimator.insert(&1);
estimator.insert(&2);

println!(
"serialized with three inserts (array): {}",
serde_json::to_string_pretty(&estimator).unwrap()
);

for i in 3..1000 {
estimator.insert(&i);
}

println!(
"serialized with many inserts (HLL): {}",
serde_json::to_string_pretty(&estimator).unwrap()
);
}

#[cfg(not(feature = "with_serde"))]
fn main() -> Result<(), u32> {
eprintln!("this example requires --features with_serde");
Err(1)
}
15 changes: 15 additions & 0 deletions fuzz/Cargo.toml
@@ -10,6 +10,7 @@ cargo-fuzz = true
[dependencies]
cardinality-estimator = { path = "..", features = ["with_serde"] }
libfuzzer-sys = "0.4"
postcard = { version = "1.1.1", features = ["alloc"] }
serde_json = "1.0.115"
wyhash = "0.5.0"

@@ -26,3 +27,17 @@ path = "fuzz_targets/serde.rs"
test = false
doc = false
bench = false

[[bin]]
name = "serde_json_array"
path = "fuzz_targets/serde_json_array.rs"
test = false
doc = false
bench = false

[[bin]]
name = "serde_postcard"
path = "fuzz_targets/serde_postcard.rs"
test = false
doc = false
bench = false
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/estimator.rs
@@ -1,6 +1,6 @@
#![no_main]

use cardinality_estimator::estimator::CardinalityEstimator;
use cardinality_estimator_safe::estimator::CardinalityEstimator;
use libfuzzer_sys::fuzz_target;
use wyhash::wyhash;

2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/serde.rs
@@ -1,6 +1,6 @@
#![no_main]

use cardinality_estimator::estimator::CardinalityEstimator;
use cardinality_estimator_safe::estimator::CardinalityEstimator;
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {