24 changes: 24 additions & 0 deletions crates/bpe-openai/Cargo.toml
@@ -0,0 +1,24 @@
[package]
name = "bpe-openai"
version = "0.1.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
42 changes: 42 additions & 0 deletions crates/bpe-openai/README.md
@@ -0,0 +1,42 @@
# OpenAI Byte Pair Encoders

Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
For convenience it re-exports the `bpe` crate, so that depending on this crate is enough to use these tokenizers.
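
Internally, each tokenizer is deserialized from an embedded dictionary the first time it is accessed; a rough sketch (mirroring the crate's `lib.rs`, with the file name and `expect` message as implementation details):

```rust
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

// Deserialize the prebuilt cl100k dictionary on first access and cache it
// for the lifetime of the process.
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});
```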

Supported token sets:

- r50k
- p50k
- cl100k
- o200k
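
To pick one of these at runtime, a small helper can map names to the corresponding accessor (the helper itself is hypothetical and not part of the crate's API; the `BytePairEncoding` path is assumed from the re-exported `bpe` crate):

```rust
use bpe_openai::byte_pair_encoding::BytePairEncoding;
use bpe_openai::{cl100k, o200k, p50k, r50k};

// Hypothetical helper: map a token-set name to the matching prebuilt encoder.
fn tokenizer_for(name: &str) -> Option<&'static BytePairEncoding> {
    match name {
        "r50k" => Some(r50k()),
        "p50k" => Some(p50k()),
        "cl100k" => Some(cl100k()),
        "o200k" => Some(o200k()),
        _ => None,
    }
}
```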

## Usage

Add a dependency by running

```sh
cargo add bpe-openai
```

or by adding the following to `Cargo.toml`

```toml
[dependencies]
bpe-openai = "0.1"
```

Counting tokens is as simple as:

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    let count = bpe.count("Hello, world!".as_bytes());
    println!("{count}");
}
```
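
Full token sequences can be produced as well; the sketch below uses the backtracking encoder from the underlying `bpe` crate (the method name is taken from that crate's benchmarks, so treat it as an assumption rather than a stable API guarantee):

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    // Encode the input bytes into a sequence of token ids.
    let tokens = bpe.encode_via_backtracking("Hello, world!".as_bytes());
    println!("{tokens:?}");
}
```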

For more detailed documentation, see the [bpe](https://crates.io/crates/bpe) crate.
51 changes: 51 additions & 0 deletions crates/bpe-openai/build.rs
@@ -0,0 +1,51 @@
use std::env;
use std::fs::File;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;
use tiktoken_rs::CoreBPE;

fn main() {
    serialize_tokens(
        "r50k",
        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
        50256,
        1,
    );
    serialize_tokens(
        "p50k",
        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
        50280,
        1,
    );
    serialize_tokens(
        "cl100k",
        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
        100256,
        17846336922010275747,
    );
    serialize_tokens(
        "o200k",
        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
        199998,
        17846336922010275747,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

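// Convert a tiktoken `CoreBPE` into the crate's `BytePairEncoding` and serialize it
// as MessagePack into OUT_DIR, where `lib.rs` embeds it via `include_bytes!`.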
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
    path.push(format!("bpe_{name}.dict"));
    let file = File::create(path).expect("can create output file");
    let mut serializer = rmp_serde::Serializer::new(file);
    let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
    bpe.serialize(&mut serializer)
        .expect("serialization succeeds");
}
66 changes: 66 additions & 0 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

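// Each tokenizer below is embedded as a MessagePack blob generated by build.rs and
// deserialized only once, on first access.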
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

pub use bpe::*;

pub fn r50k() -> &'static BytePairEncoding {
    &BPE_R50K
}

pub fn p50k() -> &'static BytePairEncoding {
    &BPE_P50K
}

pub fn cl100k() -> &'static BytePairEncoding {
    &BPE_CL100K
}

pub fn o200k() -> &'static BytePairEncoding {
    &BPE_O200K
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_load_r50k() {
        r50k().count("".as_bytes());
    }

    #[test]
    fn can_load_p50k() {
        p50k().count("".as_bytes());
    }

    #[test]
    fn can_load_cl100k() {
        cl100k().count("".as_bytes());
    }

    #[test]
    fn can_load_o200k() {
        o200k().count("".as_bytes());
    }
}
3 changes: 2 additions & 1 deletion crates/bpe/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "bpe"
version = "0.0.1"
version = "0.1.0"
edition = "2021"
description = "Fast byte-pair encoding implementation."
repository = "https://github.com/github/rust-gems"
@@ -16,6 +16,7 @@ bench = false
name = "performance"
path = "benches/performance.rs"
harness = false
test = false

[features]
rand = ["dep:rand"]
6 changes: 6 additions & 0 deletions crates/bpe/README.md
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac

![encoding runtime comparison](./benches/result/encoding-o200k.svg)

The graph below shows encoding results for input that is particularly challenging for tiktoken.
The input consists of random ranges taken from the continuous list of all Unicode code points, excluding whitespace.
This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.

![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
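
For reference, the worst-case input can be constructed roughly as follows (a sketch mirroring the benchmark code in `benches/performance.rs`):

```rust
// All Unicode scalar values except whitespace, concatenated into one long string.
// Without whitespace, tiktoken cannot pre-split the text into small pieces,
// so its quadratic BPE step dominates the runtime.
fn worst_case_input() -> String {
    ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect()
}
```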

### Incremental encoding

Incremental encoding tokenizes a text while appending bytes.
74 changes: 59 additions & 15 deletions crates/bpe/benches/performance.rs
@@ -10,21 +10,28 @@ use criterion::{
use rand::{thread_rng, Rng};
use tiktoken_rs::CoreBPE;

static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
    LazyLock::new(|| {
        [
            (
                "cl100k",
                BytePairEncoding::cl100k(),
                tiktoken_rs::cl100k_base().unwrap(),
            ),
            (
                "o200k",
                BytePairEncoding::o200k(),
                tiktoken_rs::o200k_base().unwrap(),
            ),
        ]
    });
static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
    [
        (
            "cl100k",
            BytePairEncoding::from_tiktoken(
                &tiktoken_rs::cl100k_base_singleton().lock(),
                100256,
                Some(17846336922010275747),
            ),
            tiktoken_rs::cl100k_base().unwrap(),
        ),
        (
            "o200k",
            BytePairEncoding::from_tiktoken(
                &tiktoken_rs::o200k_base_singleton().lock(),
                199998,
                Some(17846336922010275747),
            ),
            tiktoken_rs::o200k_base().unwrap(),
        ),
    ]
});

fn counting_benchmark(c: &mut Criterion) {
for (name, bpe, _) in TOKENIZERS.iter() {
@@ -160,6 +167,31 @@ fn appending_benchmark(c: &mut Criterion) {
}
}

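// Benchmark inputs that contain no whitespace: tiktoken cannot split such text into
// smaller pieces before applying BPE, which exposes its quadratic worst-case runtime.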
fn worstcase_benchmark(c: &mut Criterion) {
    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
        let input = text.as_bytes();

        let mut group = c.benchmark_group(format!("worstcase-{name}"));
        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
            group.throughput(criterion::Throughput::Bytes(bytes as u64));
            group.bench_with_input(
                BenchmarkId::new("backtracking", bytes),
                &bytes,
                |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))),
            );
            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
                b.iter_batched(
                    || select_test_bytes(input, *bytes),
                    |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()),
                    criterion::BatchSize::SmallInput,
                )
            });
        }
        group.finish();
    }
}

fn is_char_boundary(b: u8) -> bool {
// Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
// Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
@@ -188,12 +220,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
text
}

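// Select a random slice of roughly `bytes` bytes, widening both ends to the nearest
// UTF-8 character boundary so that the slice remains valid UTF-8.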
fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
    let mut start = thread_rng().gen_range(0..input.len() - bytes);
    while start > 0 && !is_char_boundary(input[start]) {
        start -= 1;
    }
    let mut end = start + bytes;
    while end < input.len() && !is_char_boundary(input[end]) {
        end += 1;
    }
    &input[start..end]
}

criterion_group!(
name = benches;
config = Criterion::default()
.warm_up_time(Duration::from_millis(500))
.measurement_time(Duration::from_millis(1000))
.measurement_time(Duration::from_millis(4000))
.nresamples(1000);
targets = counting_benchmark, encoding_benchmark, appending_benchmark
targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark
);
criterion_main!(benches);
20 changes: 10 additions & 10 deletions crates/bpe/benches/result/appending-o200k.svg