Skip to content

Commit bd23a21

Browse files
committed
Add: Placeholder for TF-IDF
ashvardanian/SimSIMD#239
1 parent b906b91 commit bd23a21

File tree

3 files changed

+63
-1
lines changed

3 files changed

+63
-1
lines changed

.vscode/settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"rapidfuzz",
66
"rfind",
77
"stringwars",
8-
"stringzilla"
8+
"stringzilla",
9+
"tfidf"
910
]
1011
}

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv
1313
- [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances.
1414
- [`aHash`](https://github.com/tkaitchuck/aHash) for hashing.
1515
- [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search.
16+
- [`tantivy`](https://github.com/quickwit-oss/tantivy) for document retrieval.
1617

1718
Of course, the functionality of the projects is different, as are the APIs and the usage patterns.
1819
So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations.
@@ -82,6 +83,15 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable
8283
STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8
8384
```
8485

86+
- Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf):
87+
88+
```bash
89+
STRINGWARS_DATASET=README.md cargo criterion --features bench_tfidf bench_tfidf --jobs 8
90+
```
91+
92+
The TF-IDF benchmarks compute the term frequency–inverse document frequency score of each word in the input file.
93+
These benchmarks rely on a hybrid of StringZilla and SimSIMD to achieve the best performance.
94+
8595
On Windows using PowerShell you'd need to set the environment variable differently:
8696
8797
```powershell

bench_tfidf.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
use std::env;
2+
use std::fs;
3+
4+
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
5+
6+
use memchr::memmem;
7+
use stringzilla::StringZilla;
8+
9+
/// Builds the Criterion configuration used by the benchmark group in this file.
///
/// Starts from `Criterion::default()` and overrides three settings: a sample
/// size of 1000, a 10-second warm-up, and a 120-second measurement window.
/// The long warm-up and measurement times make a full run take minutes.
fn configure_bench() -> Criterion {
10+
Criterion::default()
11+
.sample_size(1000) // Test this many needles.
12+
.warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle.
13+
.measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time.
14+
}
15+
16+
/// Criterion entry point for the TF-IDF benchmark placeholder.
///
/// Reads the file named by the `STRINGWARS_DATASET` environment variable,
/// splits its contents on whitespace into tokens, and runs two benchmark
/// groups over the raw bytes of the file, each reporting throughput in bytes.
///
/// # Panics
///
/// Panics if `STRINGWARS_DATASET` is not set, if the file cannot be read as
/// UTF-8 text, or if the file contains no whitespace-separated tokens.
///
/// NOTE(review): the group names (`search-forward`, `search-reverse`) and the
/// `perform_*_benchmarks` helpers are not defined in the visible part of this
/// file; they look carried over from a substring-search benchmark and
/// presumably will be replaced by actual TF-IDF workloads. Confirm.
fn bench_tfidf(c: &mut Criterion) {
17+
// Get the haystack path from the environment variable.
18+
let dataset_path =
19+
env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set");
20+
let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack");
21+
22+
// Tokenize the haystack content by white space.
// The tokens borrow from `haystack_content`, so no per-token allocation happens here.
23+
let needles: Vec<&str> = haystack_content.split_whitespace().collect();
24+
if needles.is_empty() {
25+
panic!("No tokens found in the haystack.");
26+
}
27+
28+
// View the same text as bytes; its length is used as the throughput unit below.
let haystack = haystack_content.as_bytes();
29+
let haystack_length = haystack.len();
30+
31+
// Benchmarks for forward search
32+
let mut g = c.benchmark_group("search-forward");
33+
g.throughput(Throughput::Bytes(haystack_length as u64));
34+
perform_forward_benchmarks(&mut g, &needles, haystack);
35+
g.finish();
36+
37+
// Benchmarks for reverse search
// `g` is shadowed by a new group after the previous one has been finished.
38+
let mut g = c.benchmark_group("search-reverse");
39+
g.throughput(Throughput::Bytes(haystack_length as u64));
40+
perform_reverse_benchmarks(&mut g, &needles, haystack);
41+
g.finish();
42+
}
43+
44+
...
45+
46+
// Declare the benchmark group `bench_tfidf_group`: it runs `bench_tfidf`
// using the configuration returned by `configure_bench()`.
criterion_group! {
47+
name = bench_tfidf_group;
48+
config = configure_bench();
49+
targets = bench_tfidf
50+
}
51+
// Pass the group to `criterion_main!`, Criterion's entry-point macro.
criterion_main!(bench_tfidf_group);

0 commit comments

Comments
 (0)