Skip to content

Commit b906b91

Browse files
committed
Add: Hashing benchmarks
1 parent c842fe9 commit b906b91

File tree

6 files changed

+327
-38
lines changed

6 files changed

+327
-38
lines changed

.vscode/settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
{
22
"cSpell.words": [
3-
"stringwars",
43
"memchr",
54
"memmem",
5+
"rapidfuzz",
66
"rfind",
7+
"stringwars",
78
"stringzilla"
89
]
910
}

Cargo.lock

Lines changed: 48 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ edition = "2018"
66
[dependencies]
77
rand = "0.8.5"
88
criterion = "0.5.1"
9-
stringzilla = { version = "3.3.0" }
9+
# stringzilla = { version = "3.3.0" }
10+
stringzilla = { path = "../StringZilla-dev" }
1011

1112
# Feature-based dependencies for benchmarks
1213
[features]
1314
bench_find = ["memchr"]
1415
bench_levenshtein = ["rapidfuzz"]
16+
bench_hash = ["ahash", "xxhash-rust"]
1517

1618
[dependencies.memchr]
1719
version = "2.7.1"
@@ -22,6 +24,15 @@ optional = true
2224
version = "0.5.0"
2325
optional = true
2426

27+
[dependencies.ahash]
28+
version = "0.8"
29+
optional = true
30+
31+
[dependencies.xxhash-rust]
32+
version = "0.8"
33+
optional = true
34+
features = ["xxh3", "const_xxh3"]
35+
2536
[[bench]]
2637
name = "bench_find"
2738
path = "bench_find.rs"
@@ -33,3 +44,9 @@ name = "bench_levenshtein"
3344
path = "bench_levenshtein.rs"
3445
harness = false
3546
required-features = ["bench_levenshtein"]
47+
48+
[[bench]]
49+
name = "bench_hash"
50+
path = "bench_hash.rs"
51+
harness = false
52+
required-features = ["bench_hash"]

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,14 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable
6666
- Edit Distance:
6767

6868
```bash
69-
STRINGWARS_MODE=lines STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
70-
STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
69+
STRINGWARS_MODE=lines STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
70+
STRINGWARS_MODE=words STRINGWARS_ERROR_BOUND=15 STRINGWARS_DATASET=README.md cargo criterion --features bench_levenshtein bench_levenshtein --jobs 8
7171
```
7272

73+
Edit distance benchmarks compute the Levenshtein distance between consecutive pairs of whitespace-delimited words or newline-delimited lines.
74+
They cover both byte-level and character-level operations, and also the bounded case, where the maximum allowed distance is fixed in advance.
75+
By default, the maximum allowed distance is set to 15% of the length of the longer string in each pair.
76+
7377
- Hashing:
7478

7579
```bash

bench_hash.rs

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
use std::env;
2+
use std::fs;
3+
4+
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
5+
use std::hash::{BuildHasher, Hasher};
6+
7+
use stringzilla::sz::{checksum as sz_checksum, hash as sz_hash};
8+
use stringzilla::StringZilla;
9+
10+
use ahash::AHasher;
11+
use xxhash_rust::const_xxh3::xxh3_64 as const_xxh3;
12+
use xxhash_rust::xxh3::xxh3_64;
13+
14+
// Mode: "lines", "words", "file"
15+
// STRINGWARS_MODE controls how we interpret the input data.
16+
fn configure_bench() -> Criterion {
17+
Criterion::default()
18+
.sample_size(1000) // Number of iterations per benchmark.
19+
.warm_up_time(std::time::Duration::from_secs(10)) // Let CPU frequencies settle.
20+
.measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time.
21+
}
22+
23+
fn bench_hash(c: &mut Criterion) {
24+
let dataset_path =
25+
env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set");
26+
let mode = env::var("STRINGWARS_MODE").unwrap_or_else(|_| "lines".to_string());
27+
28+
let content = fs::read_to_string(&dataset_path).expect("Could not read dataset");
29+
let units: Vec<&str> = match mode.as_str() {
30+
"lines" => content.lines().collect(),
31+
"words" => content.split_whitespace().collect(),
32+
"file" => {
33+
// In "file" mode, treat the entire content as a single unit.
34+
vec![&content]
35+
}
36+
other => panic!(
37+
"Unknown STRINGWARS_MODE: {}. Use 'lines', 'words', or 'file'.",
38+
other
39+
),
40+
};
41+
42+
if units.is_empty() {
43+
panic!("No data found for hashing in the provided dataset.");
44+
}
45+
46+
// Calculate total bytes processed for throughput reporting
47+
let total_bytes: usize = units.iter().map(|u| u.len()).sum();
48+
49+
let mut g = c.benchmark_group("hash");
50+
g.throughput(Throughput::Bytes(total_bytes as u64));
51+
52+
perform_hashing_benchmarks(&mut g, &units);
53+
54+
g.finish();
55+
}
56+
57+
fn perform_hashing_benchmarks(
58+
g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>,
59+
units: &[&str],
60+
) {
61+
// Benchmark StringZilla checksums
62+
let mut index = 0;
63+
g.bench_function("stringzilla::checksum", |b| {
64+
b.iter(|| {
65+
let unit = units[index];
66+
let _hash = sz_checksum(unit.as_bytes());
67+
index = (index + 1) % units.len();
68+
})
69+
});
70+
71+
// Benchmark StringZilla hashing
72+
let mut index = 0;
73+
g.bench_function("stringzilla::hash", |b| {
74+
b.iter(|| {
75+
let unit = units[index];
76+
let _hash = sz_hash(unit.as_bytes());
77+
index = (index + 1) % units.len();
78+
})
79+
});
80+
81+
// Benchmark aHash
82+
let mut index = 0;
83+
let ahash_builder = ahash::RandomState::new();
84+
g.bench_function("aHash", |b| {
85+
b.iter(|| {
86+
let unit = units[index];
87+
let mut hasher = ahash_builder.build_hasher();
88+
hasher.write(unit.as_bytes());
89+
let _hash = hasher.finish();
90+
index = (index + 1) % units.len();
91+
})
92+
});
93+
94+
// Benchmark xxHash (xxh3)
95+
let mut index = 0;
96+
g.bench_function("xxh3", |b| {
97+
b.iter(|| {
98+
let unit = units[index];
99+
let _hash = xxh3_64(unit.as_bytes());
100+
index = (index + 1) % units.len();
101+
})
102+
});
103+
}
104+
105+
criterion_group! {
106+
name = bench_hash_group;
107+
config = configure_bench();
108+
targets = bench_hash
109+
}
110+
criterion_main!(bench_hash_group);

0 commit comments

Comments
 (0)