Skip to content

Commit 0b4cae9

Browse files
author
Hendrik van Antwerpen
committed
Add other token sets as well
1 parent fa4edb5 commit 0b4cae9

File tree

3 files changed

+48
-0
lines changed

3 files changed

+48
-0
lines changed

crates/bpe-openai/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ For convenience it re-exports the `bpe` crate so that depending on this crate i
77

88
Supported token sets:
99

10+
- r50k
11+
- p50k
1012
- cl100k
1113
- o200k
1214

crates/bpe-openai/build.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,24 @@ use serde::Serialize;
77
use tiktoken_rs::CoreBPE;
88

99
fn main() {
10+
serialize_tokens(
11+
"r50k",
12+
&tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
13+
50256,
14+
1,
15+
);
16+
serialize_tokens(
17+
"p50k",
18+
&tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
19+
50280,
20+
1,
21+
);
22+
serialize_tokens(
23+
"cl100k",
24+
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
25+
100256,
26+
17846336922010275747,
27+
);
1028
serialize_tokens(
1129
"cl100k",
1230
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),

crates/bpe-openai/src/lib.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@ use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
44

5+
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
6+
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
7+
rmp_serde::from_slice(bytes).expect("")
8+
});
9+
10+
static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
11+
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
12+
rmp_serde::from_slice(bytes).expect("")
13+
});
14+
515
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
616
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
717
rmp_serde::from_slice(bytes).expect("")
@@ -14,6 +24,14 @@ static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
1424

1525
pub use bpe::*;
1626

27+
pub fn r50k() -> &'static BytePairEncoding {
28+
&BPE_R50K
29+
}
30+
31+
pub fn p50k() -> &'static BytePairEncoding {
32+
&BPE_P50K
33+
}
34+
1735
pub fn cl100k() -> &'static BytePairEncoding {
1836
&BPE_CL100K
1937
}
@@ -26,6 +44,16 @@ pub fn o200k() -> &'static BytePairEncoding {
2644
mod tests {
2745
use super::*;
2846

47+
#[test]
48+
fn can_load_r50k() {
49+
r50k().count("".as_bytes());
50+
}
51+
52+
#[test]
53+
fn can_load_p50k() {
54+
p50k().count("".as_bytes());
55+
}
56+
2957
#[test]
3058
fn can_load_cl100k() {
3159
cl100k().count("".as_bytes());

0 commit comments

Comments
 (0)