From e51f8d239a1083d0854202afbccd931ea41a7895 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:24:55 -0400
Subject: [PATCH 1/8] Create append.rs
---
tokenizers/src/normalizers/append.rs | 40 ++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
create mode 100644 tokenizers/src/normalizers/append.rs
diff --git a/tokenizers/src/normalizers/append.rs b/tokenizers/src/normalizers/append.rs
new file mode 100644
index 000000000..e7c266224
--- /dev/null
+++ b/tokenizers/src/normalizers/append.rs
@@ -0,0 +1,40 @@
+use crate::tokenizer::{NormalizedString, Normalizer, Result};
+use serde::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+#[serde(tag = "type")]
+pub struct Append {
+ pub append: String,
+}
+
+impl Append {
+ pub fn new(append: String) -> Self {
+ Self { append }
+ }
+}
+
+impl Normalizer for Append {
+ /// Append the normalized string inplace
+ fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
+ if !normalized.is_empty() {
+ normalized.append(&self.append);
+ }
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_append() {
+ let original = "Hello";
+ let normalized = "Hello▁";
+ assert_ne!(original, normalized);
+ let mut n = NormalizedString::from(original);
+ let append = Append::new("▁".to_string());
+ append.normalize(&mut n).unwrap();
+ assert_eq!(&n.get(), &normalized);
+ }
+}
From 94a744ba8b59c46da73c5d6bee5bcdfe358c2511 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:28:13 -0400
Subject: [PATCH 2/8] Add Append
---
bindings/python/src/normalizers.rs | 29 ++++++++++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 3cd59a3c7..d1495bb48 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -8,7 +8,7 @@ use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::normalizers::{
- BertNormalizer, ByteLevel, Lowercase, Nmt, NormalizerWrapper, Precompiled, Prepend, Replace,
+ BertNormalizer, ByteLevel, Lowercase, Nmt, NormalizerWrapper, Precompiled, Append, Prepend, Replace,
Strip, StripAccents, NFC, NFD, NFKC, NFKD,
};
use tk::{NormalizedString, Normalizer};
@@ -82,6 +82,10 @@ impl PyNormalizer {
.into_pyobject(py)?
.into_any()
.into(),
+ NormalizerWrapper::Append(_) => Py::new(py, (PyAppend {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?
.into_pyobject(py)?
.into_any()
@@ -514,6 +518,28 @@ impl PyStrip {
}
}
+/// Append normalizer
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Append")]
+pub struct PyAppend {}
+#[pymethods]
+impl PyAppend {
+ #[getter]
+ fn get_append(self_: PyRef<Self>) -> String {
+ getter!(self_, Append, append)
+ }
+
+ #[setter]
+ fn set_append(self_: PyRef<Self>, append: String) {
+ setter!(self_, Append, append, append)
+ }
+
+ #[new]
+ #[pyo3(signature = (append="▁".to_string()), text_signature = "(self, append)")]
+ fn new(append: String) -> (Self, PyNormalizer) {
+ (PyAppend {}, Append::new(append).into())
+ }
+}
+
/// Prepend normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
pub struct PyPrepend {}
@@ -810,6 +836,7 @@ pub fn normalizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyLowercase>()?;
m.add_class::<PyStrip>()?;
m.add_class::<PyStripAccents>()?;
+ m.add_class::<PyAppend>()?;
m.add_class::<PyPrepend>()?;
m.add_class::<PyByteLevel>()?;
m.add_class::<PyNmt>()?;
From 891e34a8d1437f50a4dedd2e4d9921ff4af84aaf Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:31:00 -0400
Subject: [PATCH 3/8] Add Append
---
.../tokenizers/normalizers/__init__.pyi | 41 +++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
index 1f5555104..b86252d1c 100644
--- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
@@ -428,6 +428,47 @@ class Precompiled(Normalizer):
"""
pass
+class Append(Normalizer):
+ """
+ Append normalizer
+ """
+ def __init__(self, append):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
class Prepend(Normalizer):
"""
Prepend normalizer
From 1ef7fbabaecf234f091fa55edff8a109024b42f3 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:34:25 -0400
Subject: [PATCH 4/8] Add Append
---
bindings/node/src/normalizers.rs | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/bindings/node/src/normalizers.rs b/bindings/node/src/normalizers.rs
index e51154752..c744b1769 100644
--- a/bindings/node/src/normalizers.rs
+++ b/bindings/node/src/normalizers.rs
@@ -44,6 +44,15 @@ impl tk::Normalizer for Normalizer {
}
}
+#[napi]
+pub fn append_normalizer(append: String) -> Normalizer {
+ Normalizer {
+ normalizer: Some(Arc::new(RwLock::new(
+ tk::normalizers::append::Append::new(append).into(),
+ ))),
+ }
+}
+
#[napi]
pub fn prepend_normalizer(prepend: String) -> Normalizer {
Normalizer {
From b157c3e2d49e1e4626847fb5d72290fb35814f63 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:37:52 -0400
Subject: [PATCH 5/8] Add Append
---
tokenizers/src/normalizers/mod.rs | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/tokenizers/src/normalizers/mod.rs b/tokenizers/src/normalizers/mod.rs
index f400f13da..d33dc202a 100644
--- a/tokenizers/src/normalizers/mod.rs
+++ b/tokenizers/src/normalizers/mod.rs
@@ -1,6 +1,7 @@
pub mod bert;
pub mod byte_level;
pub mod precompiled;
+pub mod append;
pub mod prepend;
pub mod replace;
pub mod strip;
@@ -9,6 +10,7 @@ pub mod utils;
pub use crate::normalizers::bert::BertNormalizer;
pub use crate::normalizers::byte_level::ByteLevel;
pub use crate::normalizers::precompiled::Precompiled;
+pub use crate::normalizers::append::Append;
pub use crate::normalizers::prepend::Prepend;
pub use crate::normalizers::replace::Replace;
pub use crate::normalizers::strip::{Strip, StripAccents};
@@ -34,6 +36,7 @@ pub enum NormalizerWrapper {
Nmt(Nmt),
Precompiled(Precompiled),
Replace(Replace),
+ Append(Append),
Prepend(Prepend),
ByteLevel(ByteLevel),
}
@@ -64,6 +67,7 @@ impl<'de> Deserialize<'de> for NormalizerWrapper {
Nmt,
Precompiled,
Replace,
+ Append,
Prepend,
ByteLevel,
}
@@ -90,6 +94,7 @@ impl<'de> Deserialize<'de> for NormalizerWrapper {
Nmt(Nmt),
Precompiled(Precompiled),
Replace(Replace),
+ Append(Append),
Prepend(Prepend),
ByteLevel(ByteLevel),
}
@@ -145,6 +150,9 @@ impl<'de> Deserialize<'de> for NormalizerWrapper {
EnumType::Replace => NormalizerWrapper::Replace(
serde_json::from_value(values).map_err(serde::de::Error::custom)?,
),
+ EnumType::Append => NormalizerWrapper::Append(
+ serde_json::from_value(values).map_err(serde::de::Error::custom)?,
+ ),
EnumType::Prepend => NormalizerWrapper::Prepend(
serde_json::from_value(values).map_err(serde::de::Error::custom)?,
),
@@ -173,6 +181,7 @@ impl<'de> Deserialize<'de> for NormalizerWrapper {
NormalizerUntagged::Nmt(bpe) => NormalizerWrapper::Nmt(bpe),
NormalizerUntagged::Precompiled(bpe) => NormalizerWrapper::Precompiled(bpe),
NormalizerUntagged::Replace(bpe) => NormalizerWrapper::Replace(bpe),
+ NormalizerUntagged::Append(bpe) => NormalizerWrapper::Append(bpe),
NormalizerUntagged::Prepend(bpe) => NormalizerWrapper::Prepend(bpe),
NormalizerUntagged::ByteLevel(bpe) => NormalizerWrapper::ByteLevel(bpe),
}
@@ -196,6 +205,7 @@ impl Normalizer for NormalizerWrapper {
Self::Nmt(lc) => lc.normalize(normalized),
Self::Precompiled(lc) => lc.normalize(normalized),
Self::Replace(lc) => lc.normalize(normalized),
+ Self::Append(lc) => lc.normalize(normalized),
Self::Prepend(lc) => lc.normalize(normalized),
Self::ByteLevel(lc) => lc.normalize(normalized),
}
@@ -214,6 +224,7 @@ impl_enum_from!(Lowercase, NormalizerWrapper, Lowercase);
impl_enum_from!(Nmt, NormalizerWrapper, Nmt);
impl_enum_from!(Precompiled, NormalizerWrapper, Precompiled);
impl_enum_from!(Replace, NormalizerWrapper, Replace);
+impl_enum_from!(Append, NormalizerWrapper, Append);
impl_enum_from!(Prepend, NormalizerWrapper, Prepend);
impl_enum_from!(ByteLevel, NormalizerWrapper, ByteLevel);
@@ -239,6 +250,13 @@ mod tests {
_ => panic!("Expected an error here"),
}
+ let json = r#"{"append":"a"}"#;
+ let reconstructed = serde_json::from_str::<NormalizerWrapper>(json);
+ assert!(matches!(
+ reconstructed.unwrap(),
+ NormalizerWrapper::Append(_)
+ ));
+
let json = r#"{"prepend":"a"}"#;
let reconstructed = serde_json::from_str::<NormalizerWrapper>(json);
assert!(matches!(
From 11aa7a653bbebda595c36d834c28aefa4e704a60 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:48:35 -0400
Subject: [PATCH 6/8] Update README.md
---
README.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/README.md b/README.md
index dd5dbe41b..1f80666b5 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,14 @@
+
+# Key Feature
+This fork adds the Append normalization to the library.
+
Provides an implementation of today's most used tokenizers, with a focus on performance and
versatility.
+
## Main features:
- Train new vocabularies and tokenize, using today's most used tokenizers.
From d52c15dd55250e8377eb45a74b271dc5bd2c00eb Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:54:00 -0400
Subject: [PATCH 7/8] Add Append
---
bindings/python/py_src/tokenizers/normalizers/__init__.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.py b/bindings/python/py_src/tokenizers/normalizers/__init__.py
index 86d233bd2..9bb6e3ce4 100644
--- a/bindings/python/py_src/tokenizers/normalizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/normalizers/__init__.py
@@ -9,6 +9,7 @@
NFKC = normalizers.NFKC
Sequence = normalizers.Sequence
Lowercase = normalizers.Lowercase
+Append = normalizers.Append
Prepend = normalizers.Prepend
Strip = normalizers.Strip
StripAccents = normalizers.StripAccents
From 6e90602ff68c1b0dd82c7aeb0ab398902af75262 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Mon, 24 Mar 2025 14:58:47 -0400
Subject: [PATCH 8/8] Update README.md
---
README.md | 5 -----
1 file changed, 5 deletions(-)
diff --git a/README.md b/README.md
index 1f80666b5..dd5dbe41b 100644
--- a/README.md
+++ b/README.md
@@ -13,14 +13,9 @@
-
-# Key Feature
-This fork adds the Append normalization to the library.
-
Provides an implementation of today's most used tokenizers, with a focus on performance and
versatility.
-
## Main features:
- Train new vocabularies and tokenize, using today's most used tokenizers.