24 changes: 24 additions & 0 deletions crates/bpe-openai/Cargo.toml
@@ -0,0 +1,24 @@
[package]
name = "bpe-openai"
version = "0.1.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
license = "MIT"
keywords = ["tokenizer", "algorithm", "encoding", "bpe"]
categories = ["algorithms", "data-structures", "encoding", "science"]

[lib]
crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
42 changes: 42 additions & 0 deletions crates/bpe-openai/README.md
@@ -0,0 +1,42 @@
# OpenAI Byte Pair Encoders

Fast tokenizers for OpenAI token sets based on the [bpe](https://crates.io/crates/bpe) crate.
Serialized BPE instances are generated during build and lazily loaded at runtime as static values.
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
For convenience it re-exports the `bpe` crate, so that depending on this crate is enough to use these tokenizers.
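
Internally, each tokenizer is deserialized from an embedded dictionary the first time it is accessed; a rough sketch (mirroring the crate's `lib.rs`, with the file name and `expect` message as implementation details):

```rust
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

// Deserialize the prebuilt cl100k dictionary on first access and cache it
// for the lifetime of the process.
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});
```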

Supported token sets:

- r50k
- p50k
- cl100k
- o200k
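
To pick one of these at runtime, a small helper can map names to the corresponding accessor (the helper itself is hypothetical and not part of the crate's API; the `BytePairEncoding` path is assumed from the re-exported `bpe` crate):

```rust
use bpe_openai::byte_pair_encoding::BytePairEncoding;
use bpe_openai::{cl100k, o200k, p50k, r50k};

// Hypothetical helper: map a token-set name to the matching prebuilt encoder.
fn tokenizer_for(name: &str) -> Option<&'static BytePairEncoding> {
    match name {
        "r50k" => Some(r50k()),
        "p50k" => Some(p50k()),
        "cl100k" => Some(cl100k()),
        "o200k" => Some(o200k()),
        _ => None,
    }
}
```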

## Usage

Add a dependency by running

```sh
cargo add bpe-openai
```

or by adding the following to `Cargo.toml`

```toml
[dependencies]
bpe-openai = "0.1"
```

Counting tokens is as simple as:

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    let count = bpe.count("Hello, world!".as_bytes());
    println!("{count}");
}
```
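
Full token sequences can be produced as well; the sketch below uses the backtracking encoder from the underlying `bpe` crate (the method name is taken from that crate's benchmarks, so treat it as an assumption rather than a stable API guarantee):

```rust
use bpe_openai::cl100k;

fn main() {
    let bpe = cl100k();
    // Encode the input bytes into a sequence of token ids.
    let tokens = bpe.encode_via_backtracking("Hello, world!".as_bytes());
    println!("{tokens:?}");
}
```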

For more detailed documentation, see the [bpe](https://crates.io/crates/bpe) crate.
51 changes: 51 additions & 0 deletions crates/bpe-openai/build.rs
@@ -0,0 +1,51 @@
use std::env;
use std::fs::File;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use serde::Serialize;
use tiktoken_rs::CoreBPE;

fn main() {
    serialize_tokens(
        "r50k",
        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
        50256,
        1,
    );
    serialize_tokens(
        "p50k",
        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
        50280,
        1,
    );
    serialize_tokens(
        "cl100k",
        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
        100256,
        17846336922010275747,
    );
    serialize_tokens(
        "o200k",
        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
        199998,
        17846336922010275747,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

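// Convert a tiktoken `CoreBPE` into the crate's `BytePairEncoding` and serialize it
// as MessagePack into OUT_DIR, where `lib.rs` embeds it via `include_bytes!`.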
fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
    let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
    path.push(format!("bpe_{name}.dict"));
    let file = File::create(path).expect("can create output file");
    let mut serializer = rmp_serde::Serializer::new(file);
    let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
    bpe.serialize(&mut serializer)
        .expect("serialization succeeds");
}
66 changes: 66 additions & 0 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use std::sync::LazyLock;

use bpe::byte_pair_encoding::BytePairEncoding;

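// Each tokenizer below is embedded as a MessagePack blob generated by build.rs and
// deserialized only once, on first access.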
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
    rmp_serde::from_slice(bytes).expect("valid bpe data")
});

pub use bpe::*;

pub fn r50k() -> &'static BytePairEncoding {
    &BPE_R50K
}

pub fn p50k() -> &'static BytePairEncoding {
    &BPE_P50K
}

pub fn cl100k() -> &'static BytePairEncoding {
    &BPE_CL100K
}

pub fn o200k() -> &'static BytePairEncoding {
    &BPE_O200K
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_load_r50k() {
        r50k().count("".as_bytes());
    }

    #[test]
    fn can_load_p50k() {
        p50k().count("".as_bytes());
    }

    #[test]
    fn can_load_cl100k() {
        cl100k().count("".as_bytes());
    }

    #[test]
    fn can_load_o200k() {
        o200k().count("".as_bytes());
    }
}
3 changes: 2 additions & 1 deletion crates/bpe/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "bpe"
version = "0.0.1"
version = "0.1.0"
edition = "2021"
description = "Fast byte-pair encoding implementation."
repository = "https://github.com/github/rust-gems"
@@ -16,6 +16,7 @@ bench = false
name = "performance"
path = "benches/performance.rs"
harness = false
test = false

[features]
rand = ["dep:rand"]
6 changes: 6 additions & 0 deletions crates/bpe/README.md
@@ -227,6 +227,12 @@ If the requirement of correct BPE output can be relaxed, then the Greedy approac

![encoding runtime comparison](./benches/result/encoding-o200k.svg)

The graph below shows encoding results for input that is particularly challenging for tiktoken.
The input consists of random ranges taken from the continuous list of all Unicode code points, excluding whitespace.
This inhibits tiktoken's ability to split the input before applying BPE, revealing its quadratic runtime complexity.

![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
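
For reference, the worst-case input can be constructed roughly as follows (a sketch mirroring the benchmark code in `benches/performance.rs`):

```rust
// All Unicode scalar values except whitespace, concatenated into one long string.
// Without whitespace, tiktoken cannot pre-split the text into small pieces,
// so its quadratic BPE step dominates the runtime.
fn worst_case_input() -> String {
    ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect()
}
```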

### Incremental encoding

Incremental encoding tokenizes a text while appending bytes.
74 changes: 59 additions & 15 deletions crates/bpe/benches/performance.rs
@@ -10,21 +10,28 @@ use criterion::{
use rand::{thread_rng, Rng};
use tiktoken_rs::CoreBPE;

static TOKENIZERS: LazyLock<[(&'static str, &'static BytePairEncoding, CoreBPE); 2]> =
    LazyLock::new(|| {
        [
            (
                "cl100k",
                BytePairEncoding::cl100k(),
                tiktoken_rs::cl100k_base().unwrap(),
            ),
            (
                "o200k",
                BytePairEncoding::o200k(),
                tiktoken_rs::o200k_base().unwrap(),
            ),
        ]
    });
static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| {
    [
        (
            "cl100k",
            BytePairEncoding::from_tiktoken(
                &tiktoken_rs::cl100k_base_singleton().lock(),
                100256,
                Some(17846336922010275747),
            ),
            tiktoken_rs::cl100k_base().unwrap(),
        ),
        (
            "o200k",
            BytePairEncoding::from_tiktoken(
                &tiktoken_rs::o200k_base_singleton().lock(),
                199998,
                Some(17846336922010275747),
            ),
            tiktoken_rs::o200k_base().unwrap(),
        ),
    ]
});

fn counting_benchmark(c: &mut Criterion) {
for (name, bpe, _) in TOKENIZERS.iter() {
@@ -160,6 +167,31 @@ fn appending_benchmark(c: &mut Criterion) {
}
}

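// Benchmark inputs that contain no whitespace: tiktoken cannot split such text into
// smaller pieces before applying BPE, which exposes its quadratic worst-case runtime.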
fn worstcase_benchmark(c: &mut Criterion) {
    for (name, bpe, tiktoken) in TOKENIZERS.iter() {
        let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
        let input = text.as_bytes();

        let mut group = c.benchmark_group(format!("worstcase-{name}"));
        for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
            group.throughput(criterion::Throughput::Bytes(bytes as u64));
            group.bench_with_input(
                BenchmarkId::new("backtracking", bytes),
                &bytes,
                |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))),
            );
            group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| {
                b.iter_batched(
                    || select_test_bytes(input, *bytes),
                    |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()),
                    criterion::BatchSize::SmallInput,
                )
            });
        }
        group.finish();
    }
}

fn is_char_boundary(b: u8) -> bool {
// Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
// Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
@@ -188,12 +220,24 @@ fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
text
}

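// Select a random slice of roughly `bytes` bytes, widening both ends to the nearest
// UTF-8 character boundary so that the slice remains valid UTF-8.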
fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
    let mut start = thread_rng().gen_range(0..input.len() - bytes);
    while start > 0 && !is_char_boundary(input[start]) {
        start -= 1;
    }
    let mut end = start + bytes;
    while end < input.len() && !is_char_boundary(input[end]) {
        end += 1;
    }
    &input[start..end]
}

criterion_group!(
name = benches;
config = Criterion::default()
.warm_up_time(Duration::from_millis(500))
.measurement_time(Duration::from_millis(1000))
.measurement_time(Duration::from_millis(4000))
.nresamples(1000);
targets = counting_benchmark, encoding_benchmark, appending_benchmark
targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark
);
criterion_main!(benches);
20 changes: 10 additions & 10 deletions crates/bpe/benches/result/appending-o200k.svg