Add: Hashing benchmarks

ashvardanian · ashvardanian · commit b906b91065a8 · 2024-12-06T21:10:39.000Z
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,9 +1,10 @@
 {
   "cSpell.words": [
-    "stringwars",
     "memchr",
     "memmem",
+    "rapidfuzz",
     "rfind",
+    "stringwars",
     "stringzilla"
   ]
 }
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,12 +6,14 @@ edition = "2018"
 [dependencies]
 rand = "0.8.5"
 criterion = "0.5.1"
-stringzilla = { version = "3.3.0" }
+# stringzilla = { version = "3.3.0" }
+stringzilla = { path = "../StringZilla-dev" }
 
 # Feature-based dependencies for benchmarks
 [features]
 bench_find = ["memchr"]
 bench_levenshtein = ["rapidfuzz"]
+bench_hash = ["ahash", "xxhash-rust"]
 
 [dependencies.memchr]
 version = "2.7.1"
@@ -22,6 +24,15 @@ optional = true
 version = "0.5.0"
 optional = true
 
+[dependencies.ahash]
+version = "0.8"
+optional = true
+
+[dependencies.xxhash-rust]
+version = "0.8"
+optional = true
+features = ["xxh3", "const_xxh3"]
+
 [[bench]]
 name = "bench_find"
 path = "bench_find.rs"
@@ -33,3 +44,9 @@ name = "bench_levenshtein"
 path = "bench_levenshtein.rs"
 harness = false
 required-features = ["bench_levenshtein"]
+
+[[bench]]
+name = "bench_hash"
+path = "bench_hash.rs"
+harness = false
+required-features = ["bench_hash"]
diff --git a/README.md b/README.md
@@ -66,10 +66,14 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable
 - Edit Distance:
 
     ```bash
-    STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
-    STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
+    STRINGWARS_MODE=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
+    STRINGWARS_MODE=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
     ```
 
+    Edit distance benchmarks compute the Levenshtein distance between consecutive pairs of whitespace-delimited words or newline-delimited lines.
+    They include byte-level and character-level operations and also run for the bounded case - when the maximum allowed distance is predefined.
+    By default, the maximum allowed distance is set to 15% of the length of the longer string in each pair.
+
 - Hashing:
 
     ```bash
diff --git a/bench_hash.rs b/bench_hash.rs
@@ -0,0 +1,110 @@
+use std::env;
+use std::fs;
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use std::hash::{BuildHasher, Hasher};
+
+use stringzilla::sz::{checksum as sz_checksum, hash as sz_hash};
+use stringzilla::StringZilla;
+
+use ahash::AHasher;
+use xxhash_rust::const_xxh3::xxh3_64 as const_xxh3;
+use xxhash_rust::xxh3::xxh3_64;
+
+// Mode: "lines", "words", "file"
+// STRINGWARS_MODE controls how we interpret the input data.
+/// Builds the Criterion configuration used by the hashing benchmark group.
+///
+/// Uses a sample size of 1000, a 10-second warm-up, and a 120-second
+/// measurement window.
+fn configure_bench() -> Criterion {
+    use std::time::Duration;
+
+    // A long warm-up lets CPU frequencies settle before measuring.
+    let warm_up = Duration::from_secs(10);
+    // Time budget for the actual measurement phase.
+    let measurement = Duration::from_secs(120);
+
+    let base = Criterion::default();
+    base.sample_size(1000)
+        .warm_up_time(warm_up)
+        .measurement_time(measurement)
+}
+
+/// Loads the dataset, splits it into hashing units, and runs the "hash" benchmark group.
+///
+/// Environment variables:
+/// - `STRINGWARS_DATASET`: path of the file to read (required).
+/// - `STRINGWARS_MODE`: how the content is split into units: `lines` (default),
+///   `words` (whitespace-delimited), or `file` (the whole content as one unit).
+///
+/// # Panics
+/// Panics if `STRINGWARS_DATASET` is not set, the file cannot be read as UTF-8 text,
+/// the mode is unknown, or the dataset yields no units.
+fn bench_hash(c: &mut Criterion) {
+    let dataset_path =
+        env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set");
+    let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string());
+
+    let content = fs::read_to_string(&dataset_path).expect("Could not read dataset");
+    let units: Vec<&str> = match mode.as_str() {
+        "lines" => content.lines().collect(),
+        "words" => content.split_whitespace().collect(),
+        // In "file" mode, treat the entire content as a single unit.
+        "file" => vec![content.as_str()],
+        other => panic!(
+            "Unknown STRINGWARS_MODE: {}. Use 'lines', 'words', or 'file'.",
+            other
+        ),
+    };
+
+    if units.is_empty() {
+        panic!("No data found for hashing in the provided dataset.");
+    }
+
+    // Criterion expects the number of bytes processed by ONE iteration. Every
+    // iteration hashes a single unit (cycling through `units`), so report the mean
+    // unit size instead of the size of the whole dataset; otherwise the reported
+    // throughput is inflated by a factor of `units.len()`.
+    let total_bytes: usize = units.iter().map(|u| u.len()).sum();
+    let bytes_per_iteration = (total_bytes / units.len()) as u64;
+
+    let mut g = c.benchmark_group("hash");
+    g.throughput(Throughput::Bytes(bytes_per_iteration));
+
+    perform_hashing_benchmarks(&mut g, &units);
+
+    g.finish();
+}
+
+/// Registers one benchmark per hash function in the group `g`.
+///
+/// Each iteration hashes a single element of `units`, advancing round-robin through
+/// the slice so successive iterations see different inputs. Inputs and results are
+/// passed through `black_box` so the optimizer cannot discard the hashing work,
+/// which it was otherwise free to do because the results were never used.
+///
+/// Does nothing when `units` is empty.
+fn perform_hashing_benchmarks(
+    g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
+    units: &[&str],
+) {
+    use std::hint::black_box;
+
+    // Indexing and the modulo below both require at least one unit.
+    if units.is_empty() {
+        return;
+    }
+
+    // Benchmark StringZilla checksums
+    let mut index = 0;
+    g.bench_function("stringzilla::checksum", |b| {
+        b.iter(|| {
+            let unit = black_box(units[index]);
+            index = (index + 1) % units.len();
+            black_box(sz_checksum(unit.as_bytes()))
+        })
+    });
+
+    // Benchmark StringZilla hashing
+    let mut index = 0;
+    g.bench_function("stringzilla::hash", |b| {
+        b.iter(|| {
+            let unit = black_box(units[index]);
+            index = (index + 1) % units.len();
+            black_box(sz_hash(unit.as_bytes()))
+        })
+    });
+
+    // Benchmark aHash; the builder is created once so only hashing is measured.
+    let mut index = 0;
+    let ahash_builder = ahash::RandomState::new();
+    g.bench_function("aHash", |b| {
+        b.iter(|| {
+            let unit = black_box(units[index]);
+            index = (index + 1) % units.len();
+            let mut hasher = ahash_builder.build_hasher();
+            hasher.write(unit.as_bytes());
+            black_box(hasher.finish())
+        })
+    });
+
+    // Benchmark xxHash (xxh3)
+    let mut index = 0;
+    g.bench_function("xxh3", |b| {
+        b.iter(|| {
+            let unit = black_box(units[index]);
+            index = (index + 1) % units.len();
+            black_box(xxh3_64(unit.as_bytes()))
+        })
+    });
+}
+
+// Declare the benchmark group `bench_hash_group`: it runs the single target
+// `bench_hash` using the custom configuration returned by `configure_bench()`.
+criterion_group! {
+    name = bench_hash_group;
+    config = configure_bench();
+    targets = bench_hash
+}
+// Hand the group to Criterion's runner macro (the bench target sets `harness = false`).
+criterion_main!(bench_hash_group);
diff --git a/bench_levenshtein.rs b/bench_levenshtein.rs

Original file line number	Diff line number	Diff line change
`@@ -1,9 +1,10 @@`
`1`	`1`	`{`
`2`	`2`	`"cSpell.words": [`
`3`		`- "stringwars",`
`4`	`3`	`"memchr",`
`5`	`4`	`"memmem",`
	`5`	`+ "rapidfuzz",`
`6`	`6`	`"rfind",`
	`7`	`+ "stringwars",`
`7`	`8`	`"stringzilla"`
`8`	`9`	`]`
`9`	`10`	`}`