Add: Placeholder for TF-IDF

ashvardanian · ashvardanian · commit bd23a21d960a · 2024-12-08T20:22:19.000Z
ashvardanian/SimSIMD#239
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -5,6 +5,7 @@
     "rapidfuzz",
     "rfind",
     "stringwars",
-    "stringzilla"
+    "stringzilla",
+    "tfidf"
   ]
 }
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ So, to accelerate the development of the [`stringzilla`](https://github.com/ashv
 - [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances.
 - [`aHash`](https://github.com/tkaitchuck/aHash) for hashing.
 - [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search.
+- [`tantivy`](https://github.com/quickwit-oss/tantivy) for document retrieval.
 
 Of course, the functionality of the projects is different, as are the APIs and the usage patterns.
 So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations.
@@ -82,6 +83,15 @@ To run them on Linux and MacOS, pass the dataset path as an environment variable
     STRINGWARS_MODE=words STRINGWARS_DATASET=README.md cargo criterion --features bench_hash bench_hash --jobs 8
     ```
 
+- Document retrieval with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf):
+
+    ```bash
+    STRINGWARS_DATASET=README.md cargo criterion --features bench_tfidf bench_tfidf --jobs 8
+    ```
+
+    The TF-IDF benchmark computes the term frequency–inverse document frequency of each word in the input file.
+    It relies on a hybrid of StringZilla and SimSIMD to achieve the best performance.
+
 On Windows using PowerShell you'd need to set the environment variable differently:
 
 ```powershell
diff --git a/bench_tfidf.rs b/bench_tfidf.rs
@@ -0,0 +1,51 @@
+use std::env;
+use std::fs;
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+use memchr::memmem;
+use stringzilla::StringZilla;
+
+fn configure_bench() -> Criterion {
+    Criterion::default()
+        .sample_size(1000) // Samples Criterion collects per benchmark.
+        .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle.
+        .measurement_time(std::time::Duration::from_secs(120)) // Time budget for the measured phase.
+}
+
+fn bench_tfidf(c: &mut Criterion) {
+    // Read the file named by `STRINGWARS_DATASET`; both `expect` calls below panic on failure.
+    let dataset_path =
+        env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set");
+    let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack");
+
+    // Split the file contents on whitespace; the tokens are handed to the helpers as needles.
+    let needles: Vec<&str> = haystack_content.split_whitespace().collect();
+    if needles.is_empty() {
+        panic!("No tokens found in the haystack.");
+    }
+
+    let haystack = haystack_content.as_bytes();
+    let haystack_length = haystack.len();
+
+    // Group "search-forward": throughput is reported in bytes of the whole input file.
+    let mut g = c.benchmark_group("search-forward");
+    g.throughput(Throughput::Bytes(haystack_length as u64));
+    perform_forward_benchmarks(&mut g, &needles, haystack); // NOTE(review): helper not visible here.
+    g.finish();
+
+    // Group "search-reverse": same throughput setting, reverse helper.
+    let mut g = c.benchmark_group("search-reverse");
+    g.throughput(Throughput::Bytes(haystack_length as u64));
+    perform_reverse_benchmarks(&mut g, &needles, haystack); // NOTE(review): helper not visible here.
+    g.finish();
+}
+
+...
+
+criterion_group! {
+    name = bench_tfidf_group;
+    config = configure_bench(); // Custom sample size, warm-up time, and measurement time.
+    targets = bench_tfidf
+}
+criterion_main!(bench_tfidf_group); // Entry point that runs the group declared above.

Original file line number	Diff line number	Diff line change
`@@ -5,6 +5,7 @@`
`5`	`5`	`"rapidfuzz",`
`6`	`6`	`"rfind",`
`7`	`7`	`"stringwars",`
`8`		`- "stringzilla"`
	`8`	`+ "stringzilla",`
	`9`	`+ "tfidf"`
`9`	`10`	`]`
`10`	`11`	`}`