From 95b228101c9df6cf9272c0e24ffd3ae5f1283b25 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 8 Oct 2023 18:02:43 -0400
Subject: [PATCH 01/21] Update to edition 2018

Remove unneeded `extern crate`s
---
 Cargo.toml  |  2 ++
 README.md   |  6 ++----
 src/lib.rs  | 29 +++++++++--------------------
 src/simd.rs |  1 +
 4 files changed, 14 insertions(+), 24 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 4970861..c1e2fb7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,8 @@ documentation = "https://docs.rs/jetscii/"
 
 license = "MIT OR Apache-2.0"
 
+edition = "2018"
+
 [features]
 benchmarks = []
 pattern = []
diff --git a/README.md b/README.md
index ea3397d..2f1861c 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,7 @@ characters or byte slices for sets of bytes.
 ### Searching for a set of ASCII characters
 
 ```rust
-#[macro_use]
-extern crate jetscii;
+use jetscii::ascii_chars;
 
 fn main() {
     let part_number = "86-J52:rev1";
@@ -23,8 +22,7 @@ fn main() {
 ### Searching for a set of bytes
 
 ```rust
-#[macro_use]
-extern crate jetscii;
+use jetscii::bytes;
 
 fn main() {
     let raw_data = [0x00, 0x01, 0x10, 0xFF, 0x42];
diff --git a/src/lib.rs b/src/lib.rs
index 7c30098..7224cd1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,7 @@
 //! ### Searching for a set of ASCII characters
 //!
 //! ```rust
-//! #[macro_use]
-//! extern crate jetscii;
+//! use jetscii::ascii_chars;
 //!
 //! fn main() {
 //!     let part_number = "86-J52:rev1";
@@ -22,8 +21,7 @@
 //! ### Searching for a set of bytes
 //!
 //! ```rust
-//! #[macro_use]
-//! extern crate jetscii;
+//! use jetscii::bytes;
 //!
 //! fn main() {
 //!     let raw_data = [0x00, 0x01, 0x10, 0xFF, 0x42];
@@ -61,8 +59,7 @@
 //!
 //! ```
 //! # #[cfg(feature = "pattern")]
-//! #[macro_use]
-//! extern crate jetscii;
+//! use jetscii::ascii_chars;
 //!
 //! fn main() {
 //! # #[cfg(feature = "pattern")] {
@@ -141,16 +138,6 @@
 //! | **<code>Substring::new("xyzzy").find(s)</code>** | **11475 MB/s** |
 //! | <code>s.find("xyzzy")</code>                     | 5391 MB/s      |
 
-#[cfg(test)]
-#[macro_use]
-extern crate lazy_static;
-#[cfg(test)]
-extern crate memmap;
-#[cfg(test)]
-extern crate proptest;
-#[cfg(test)]
-extern crate region;
-
 use std::marker::PhantomData;
 
 include!(concat!(env!("OUT_DIR"), "/src/macros.rs"));
@@ -220,7 +207,7 @@ where
     /// intrinsics are not available. The closure **must** search for
     /// the same bytes as in the array.
     #[allow(unused_variables)]
-    pub /* const */ fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
+    pub fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
         Bytes {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::Bytes::new(bytes, len),
@@ -265,7 +252,7 @@ where
     /// ### Panics
     ///
     /// - If you provide a non-ASCII byte.
-    pub /* const */ fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
+    pub fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
         for &b in &chars {
             assert!(b < 128, "Cannot have non-ASCII bytes");
         }
@@ -292,7 +279,7 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub /* const */ fn new(needle: &'a [u8]) -> Self {
+    pub fn new(needle: &'a [u8]) -> Self {
         ByteSubstring {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::ByteSubstring::new(needle),
@@ -327,7 +314,7 @@ pub type ByteSubstringConst = ByteSubstring<'static>;
 pub struct Substring<'a>(ByteSubstring<'a>);
 
 impl<'a> Substring<'a> {
-    pub /* const */ fn new(needle: &'a str) -> Self {
+    pub fn new(needle: &'a str) -> Self {
         Substring(ByteSubstring::new(needle.as_bytes()))
     }
 
@@ -350,6 +337,8 @@ pub type SubstringConst = Substring<'static>;
 mod bench {
     extern crate test;
 
+    use lazy_static::lazy_static;
+
     use super::*;
 
     lazy_static! {
diff --git a/src/simd.rs b/src/simd.rs
index b03a7c0..6ba0d45 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -320,6 +320,7 @@ mod test {
     use std::{fmt, str};
     use memmap::MmapMut;
     use region::Protection;
+    use lazy_static::lazy_static;
 
     use super::*;
 

From 43961a5abc4fa32258e01f364ef8c41061cfc7d9 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 8 Oct 2023 18:17:50 -0400
Subject: [PATCH 02/21] Make functions const

---
 src/fallback.rs |  6 +++---
 src/lib.rs      | 14 ++++++++------
 src/simd.rs     | 20 +++++++++++++-------
 3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/fallback.rs b/src/fallback.rs
index 0f7b21b..9dc79bc 100644
--- a/src/fallback.rs
+++ b/src/fallback.rs
@@ -12,7 +12,7 @@ impl<F> Bytes<F>
 where
     F: Fn(u8) -> bool,
 {
-    pub /* const */ fn new(fallback: F) -> Self {
+    pub const fn new(fallback: F) -> Self {
         Bytes { fallback }
     }
 
@@ -26,12 +26,12 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub /* const */ fn new(needle: &'a[u8]) -> Self {
+    pub const fn new(needle: &'a[u8]) -> Self {
         ByteSubstring { needle }
     }
 
     #[cfg(feature = "pattern")]
-    pub fn needle_len(&self) -> usize {
+    pub const fn needle_len(&self) -> usize {
         self.needle.len()
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index 7224cd1..ef2fd96 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -207,7 +207,7 @@ where
     /// intrinsics are not available. The closure **must** search for
     /// the same bytes as in the array.
     #[allow(unused_variables)]
-    pub fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
+    pub const fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
         Bytes {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::Bytes::new(bytes, len),
@@ -252,9 +252,11 @@ where
     /// ### Panics
     ///
     /// - If you provide a non-ASCII byte.
-    pub fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
-        for &b in &chars {
-            assert!(b < 128, "Cannot have non-ASCII bytes");
+    pub const fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
+        let mut i = 0;
+        while i < chars.len() {
+            assert!(chars[i] < 128, "Cannot have non-ASCII bytes");
+            i += 1;
         }
         AsciiChars(Bytes::new(chars, len, fallback))
     }
@@ -279,7 +281,7 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub fn new(needle: &'a [u8]) -> Self {
+    pub const fn new(needle: &'a [u8]) -> Self {
         ByteSubstring {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::ByteSubstring::new(needle),
@@ -314,7 +316,7 @@ pub type ByteSubstringConst = ByteSubstring<'static>;
 pub struct Substring<'a>(ByteSubstring<'a>);
 
 impl<'a> Substring<'a> {
-    pub fn new(needle: &'a str) -> Self {
+    pub const fn new(needle: &'a str) -> Self {
         Substring(ByteSubstring::new(needle.as_bytes()))
     }
 
diff --git a/src/simd.rs b/src/simd.rs
index 6ba0d45..504543a 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -237,7 +237,7 @@ pub struct Bytes {
 }
 
 impl Bytes {
-    pub /* const */ fn new(bytes: [u8; 16], needle_len: i32) -> Self {
+    pub const fn new(bytes: [u8; 16], needle_len: i32) -> Self {
         Bytes {
             needle: unsafe { TransmuteToSimd { bytes }.simd },
             needle_len,
@@ -267,12 +267,18 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub /* const */ fn new(needle: &'a[u8]) -> Self {
-        use std::cmp;
-
+    pub const fn new(needle: &'a[u8]) -> Self {
         let mut simd_needle = [0; 16];
-        let len = cmp::min(simd_needle.len(), needle.len());
-        simd_needle[..len].copy_from_slice(&needle[..len]);
+        let len = if simd_needle.len() < needle.len() {
+            simd_needle.len()
+        } else {
+            needle.len()
+        };
+        let mut i = 0;
+        while i < len {
+            simd_needle[i] = needle[i];
+            i += 1;
+        }
         ByteSubstring {
             complete_needle: needle,
             needle: unsafe { TransmuteToSimd { bytes: simd_needle }.simd },
@@ -281,7 +287,7 @@ impl<'a> ByteSubstring<'a> {
     }
 
     #[cfg(feature = "pattern")]
-    pub fn needle_len(&self) -> usize {
+    pub const fn needle_len(&self) -> usize {
         self.complete_needle.len()
     }
 

From d3bda487ead04e31ffac284c521c38954b4afdeb Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Fri, 13 Oct 2023 18:05:52 -0400
Subject: [PATCH 03/21] Allow using `static` with the output of the
 `bytes`/`ascii_chars` macros only

---
 build.rs   |  4 ++--
 src/lib.rs | 44 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/build.rs b/build.rs
index 20a8d92..f21f763 100644
--- a/build.rs
+++ b/build.rs
@@ -58,7 +58,7 @@ fn macros_bytes(f: &mut File, base: &Path) {
             let closure = format!("|c| {}", closure_body.join(" || "));
 
             format!(
-                "({}) => ($crate::Bytes::new([{}], {}, {}));\n",
+                "({}) => ($crate::Bytes::new_const([{}], {}, {}));\n",
                 args, array, max, closure
             )
         })
@@ -93,7 +93,7 @@ fn macros_ascii_chars(f: &mut File, base: &Path) {
             let closure = format!("|c| {}", closure_body.join(" || "));
 
             format!(
-                "({}) => ($crate::AsciiChars::new([{}], {}, {}));\n",
+                "({}) => ($crate::AsciiChars::new_const([{}], {}, {}));\n",
                 args, array, max, closure
             )
         })
diff --git a/src/lib.rs b/src/lib.rs
index ef2fd96..5805a5c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -207,7 +207,7 @@ where
     /// intrinsics are not available. The closure **must** search for
     /// the same bytes as in the array.
     #[allow(unused_variables)]
-    pub const fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
+    pub fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
         Bytes {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::Bytes::new(bytes, len),
@@ -219,6 +219,28 @@ where
         }
     }
 
+    // This constructor is used by the `bytes!` macro, and shouldn't be relied on directly.
+    //
+    // To allow F to be dropped in a const context, we require F to be Copy
+    // (thus, have no destructor). If we _know_ we're using sse4.2, we don't use
+    // the fallback at all, so it will be dropped in this function.
+    #[doc(hidden)]
+    #[allow(unused_variables)]
+    pub const fn new_const(bytes: [u8; 16], len: i32, fallback: F) -> Self
+    where
+        F: Copy,
+    {
+        Self {
+            #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
+            simd: simd::Bytes::new(bytes, len),
+
+            #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+            fallback: fallback::Bytes::new(fallback),
+
+            _fallback: PhantomData,
+        }
+    }
+
     /// Searches the slice for the first matching byte in the set.
     #[inline]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
@@ -252,13 +274,29 @@ where
     /// ### Panics
     ///
     /// - If you provide a non-ASCII byte.
-    pub const fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
+    pub fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
+        for &b in &chars {
+            assert!(b < 128, "Cannot have non-ASCII bytes");
+        }
+        Self(Bytes::new(chars, len, fallback))
+    }
+
+    // This constructor is used by the `ascii_chars!` macro, and shouldn't be relied on directly.
+    //
+    // To allow F to be dropped in a const context, we require F to be Copy
+    // (thus, have no destructor). If we _know_ we're using sse4.2, we don't use
+    // the fallback at all, so it will be dropped in this function.
+    #[doc(hidden)]
+    pub const fn new_const(chars: [u8; 16], len: i32, fallback: F) -> Self
+    where
+        F: Copy,
+    {
         let mut i = 0;
         while i < chars.len() {
             assert!(chars[i] < 128, "Cannot have non-ASCII bytes");
             i += 1;
         }
-        AsciiChars(Bytes::new(chars, len, fallback))
+        Self(Bytes::new_const(chars, len, fallback))
     }
 
     /// Searches the string for the first matching ASCII byte in the set.

From 6bf8735af54da8cebd34227e15c3318ef06aef6d Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 8 Oct 2023 22:17:34 -0400
Subject: [PATCH 04/21] Port benchmarks to stable criterion

Also, add some benchmarks against memchr
---
 Cargo.toml            |   7 ++
 benches/benchmarks.rs | 143 ++++++++++++++++++++++++++++++++++++++
 src/lib.rs            | 158 ------------------------------------------
 3 files changed, 150 insertions(+), 158 deletions(-)
 create mode 100644 benches/benchmarks.rs

diff --git a/Cargo.toml b/Cargo.toml
index c1e2fb7..7fab73e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ license = "MIT OR Apache-2.0"
 edition = "2018"
 
 [features]
+# This feature is now a no-op, but we keep it around for backwards compatibility
 benchmarks = []
 pattern = []
 
@@ -23,3 +24,9 @@ proptest = "1.0.0"
 lazy_static = "1.0.0"
 region = "3.0.0"
 memmap = "0.7.0"
+criterion = "0.5.0"
+memchr = "2.0.0"
+
+[[bench]]
+name = "benchmarks"
+harness = false
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
new file mode 100644
index 0000000..4010fe6
--- /dev/null
+++ b/benches/benchmarks.rs
@@ -0,0 +1,143 @@
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use jetscii::{ascii_chars, AsciiCharsConst, SubstringConst};
+use std::hint::black_box;
+use std::sync::OnceLock;
+
+static SPACE: AsciiCharsConst = ascii_chars!(' ');
+static XML_DELIM_3: AsciiCharsConst = ascii_chars!('<', '>', '&');
+static XML_DELIM_5: AsciiCharsConst = ascii_chars!('<', '>', '&', '\'', '"');
+
+static SUBSTRING: SubstringConst = SubstringConst::new("xyzzy");
+
+fn prefix_string() -> String {
+    "a".repeat(5 * 1024 * 1024)
+}
+
+fn spaces(c: &mut Criterion) {
+    let mut haystack = prefix_string();
+    haystack.push(' ');
+    let haystack = black_box(haystack);
+
+    let mut group = c.benchmark_group("find_last_space");
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| SPACE.find(&haystack));
+    });
+    group.bench_function("stdlib_find_string", |b| {
+        b.iter(|| haystack.find(" "));
+    });
+    group.bench_function("stdlib_find_char", |b| {
+        b.iter(|| haystack.find(' '));
+    });
+    group.bench_function("stdlib_find_char_set", |b| {
+        b.iter(|| haystack.find(&[' '][..]));
+    });
+    group.bench_function("stdlib_find_closure", |b| {
+        b.iter(|| haystack.find(|c| c == ' '));
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| haystack.bytes().position(|c| c == b' '));
+    });
+    group.bench_function("memchr", |b| {
+        b.iter(|| memchr::memchr(b' ', haystack.as_bytes()));
+    });
+}
+
+fn xml3(c: &mut Criterion) {
+    let mut haystack = prefix_string();
+    haystack.push('&');
+    let haystack = black_box(haystack);
+
+    let mut group = c.benchmark_group("find_xml_3");
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| XML_DELIM_3.find(&haystack));
+    });
+    group.bench_function("stdlib_find_char_set", |b| {
+        b.iter(|| haystack.find(&['<', '>', '&'][..]));
+    });
+    group.bench_function("stdlib_find_closure", |b| {
+        b.iter(|| haystack.find(|c| c == '<' || c == '>' || c == '&'));
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            haystack
+                .bytes()
+                .position(|c| c == b'<' || c == b'>' || c == b'&')
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter(|| memchr::memchr3(b'<', b'>', b'&', haystack.as_bytes()));
+    });
+}
+
+fn xml5(c: &mut Criterion) {
+    let mut haystack = prefix_string();
+    haystack.push('"');
+    let haystack = black_box(haystack);
+
+    let mut group = c.benchmark_group("find_xml_5");
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| XML_DELIM_5.find(&haystack));
+    });
+    group.bench_function("stdlib_find_char_set", |b| {
+        b.iter(|| haystack.find(&['<', '>', '&', '\'', '"'][..]));
+    });
+    group.bench_function("stdlib_find_closure", |b| {
+        b.iter(|| haystack.find(|c| c == '<' || c == '>' || c == '&' || c == '\'' || c == '"'));
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            haystack
+                .bytes()
+                .position(|c| c == b'<' || c == b'>' || c == b'&' || c == b'\'' || c == b'"')
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter(|| {
+            let bytes = haystack.as_bytes();
+            let indexes = [
+                memchr::memchr3(b'<', b'>', b'&', bytes),
+                memchr::memchr2(b'\'', b'"', bytes),
+            ];
+            indexes.iter().copied().flatten().min()
+        });
+    });
+}
+
+fn memchr_finder() -> &'static memchr::memmem::Finder<'static> {
+    static FINDER: OnceLock<memchr::memmem::Finder> = OnceLock::new();
+    FINDER.get_or_init(|| memchr::memmem::Finder::new("xyzzy"))
+}
+
+fn substring() -> &'static SubstringConst {
+    static SUBSTRING: OnceLock<SubstringConst> = OnceLock::new();
+    SUBSTRING.get_or_init(|| SubstringConst::new("xyzzy"))
+}
+
+fn substr(c: &mut Criterion) {
+    let mut haystack = prefix_string();
+    haystack.push_str("xyzzy");
+    let haystack = black_box(haystack);
+
+    let mut group = c.benchmark_group("find_substring");
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    group.bench_function("substring", |b| {
+        b.iter(|| SUBSTRING.find(&haystack));
+    });
+    group.bench_function("stdlib_find_string", |b| {
+        b.iter(|| haystack.find("xyzzy"));
+    });
+    group.bench_function("memchr", |b| {
+        let finder = memchr::memmem::Finder::new(b"xyzzy");
+        b.iter(|| finder.find(haystack.as_bytes()));
+    });
+}
+
+criterion_group!(benches, spaces, xml3, xml5, substr);
+criterion_main!(benches);
diff --git a/src/lib.rs b/src/lib.rs
index 5805a5c..4751f61 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,4 @@
 #![cfg_attr(feature = "pattern", feature(pattern))]
-#![cfg_attr(feature = "benchmarks", feature(test))]
 
 //! A tiny library to efficiently search strings for sets of ASCII
 //! characters or byte slices for sets of bytes.
@@ -372,160 +371,3 @@ impl<'a> Substring<'a> {
 
 /// A convenience type that can be used in a constant or static.
 pub type SubstringConst = Substring<'static>;
-
-#[cfg(all(test, feature = "benchmarks"))]
-mod bench {
-    extern crate test;
-
-    use lazy_static::lazy_static;
-
-    use super::*;
-
-    lazy_static! {
-        static ref SPACE: AsciiCharsConst = ascii_chars!(' ');
-        static ref XML_DELIM_3: AsciiCharsConst = ascii_chars!('<', '>', '&');
-        static ref XML_DELIM_5: AsciiCharsConst = ascii_chars!('<', '>', '&', '\'', '"');
-    }
-
-    fn prefix_string() -> String {
-        "a".repeat(5 * 1024 * 1024)
-    }
-
-    fn bench_space<F>(b: &mut test::Bencher, f: F)
-    where
-        F: Fn(&str) -> Option<usize>,
-    {
-        let mut haystack = prefix_string();
-        haystack.push(' ');
-
-        b.iter(|| test::black_box(f(&haystack)));
-        b.bytes = haystack.len() as u64;
-    }
-
-    #[bench]
-    fn space_ascii_chars(b: &mut test::Bencher) {
-        bench_space(b, |hs| SPACE.find(hs))
-    }
-
-    #[bench]
-    fn space_stdlib_find_string(b: &mut test::Bencher) {
-        bench_space(b, |hs| hs.find(" "))
-    }
-
-    #[bench]
-    fn space_stdlib_find_char(b: &mut test::Bencher) {
-        bench_space(b, |hs| hs.find(' '))
-    }
-
-    #[bench]
-    fn space_stdlib_find_char_set(b: &mut test::Bencher) {
-        bench_space(b, |hs| hs.find(&[' '][..]))
-    }
-
-    #[bench]
-    fn space_stdlib_find_closure(b: &mut test::Bencher) {
-        bench_space(b, |hs| hs.find(|c| c == ' '))
-    }
-
-    #[bench]
-    fn space_stdlib_iterator_position(b: &mut test::Bencher) {
-        bench_space(b, |hs| hs.as_bytes().iter().position(|&v| v == b' '))
-    }
-
-    fn bench_xml_delim_3<F>(b: &mut test::Bencher, f: F)
-    where
-        F: Fn(&str) -> Option<usize>,
-    {
-        let mut haystack = prefix_string();
-        haystack.push('&');
-
-        b.iter(|| test::black_box(f(&haystack)));
-        b.bytes = haystack.len() as u64;
-    }
-
-    #[bench]
-    fn xml_delim_3_ascii_chars(b: &mut test::Bencher) {
-        bench_xml_delim_3(b, |hs| XML_DELIM_3.find(hs))
-    }
-
-    #[bench]
-    fn xml_delim_3_stdlib_find_char_set(b: &mut test::Bencher) {
-        bench_xml_delim_3(b, |hs| hs.find(&['<', '>', '&'][..]))
-    }
-
-    #[bench]
-    fn xml_delim_3_stdlib_find_char_closure(b: &mut test::Bencher) {
-        bench_xml_delim_3(b, |hs| hs.find(|c| c == '<' || c == '>' || c == '&'))
-    }
-
-    #[bench]
-    fn xml_delim_3_stdlib_iterator_position(b: &mut test::Bencher) {
-        bench_xml_delim_3(b, |hs| {
-            hs.as_bytes()
-                .iter()
-                .position(|&c| c == b'<' || c == b'>' || c == b'&')
-        })
-    }
-
-    fn bench_xml_delim_5<F>(b: &mut test::Bencher, f: F)
-    where
-        F: Fn(&str) -> Option<usize>,
-    {
-        let mut haystack = prefix_string();
-        haystack.push('"');
-
-        b.iter(|| test::black_box(f(&haystack)));
-        b.bytes = haystack.len() as u64;
-    }
-
-    #[bench]
-    fn xml_delim_5_ascii_chars(b: &mut test::Bencher) {
-        bench_xml_delim_5(b, |hs| XML_DELIM_5.find(hs))
-    }
-
-    #[bench]
-    fn xml_delim_5_stdlib_find_char_set(b: &mut test::Bencher) {
-        bench_xml_delim_5(b, |hs| hs.find(&['<', '>', '&', '\'', '"'][..]))
-    }
-
-    #[bench]
-    fn xml_delim_5_stdlib_find_char_closure(b: &mut test::Bencher) {
-        bench_xml_delim_5(b, |hs| {
-            hs.find(|c| c == '<' || c == '>' || c == '&' || c == '\'' || c == '"')
-        })
-    }
-
-    #[bench]
-    fn xml_delim_5_stdlib_iterator_position(b: &mut test::Bencher) {
-        bench_xml_delim_3(b, |hs| {
-            hs.as_bytes()
-                .iter()
-                .position(|&c| c == b'<' || c == b'>' || c == b'&' || c == b'\'' || c == b'"')
-        })
-    }
-
-    lazy_static! {
-        static ref XYZZY: Substring<'static> = Substring::new("xyzzy");
-    }
-
-    fn bench_substring<F>(b: &mut test::Bencher, f: F)
-    where
-        F: Fn(&str) -> Option<usize>,
-    {
-        let mut haystack = prefix_string();
-        haystack.push_str("xyzzy");
-
-        b.iter(|| test::black_box(f(&haystack)));
-        b.bytes = haystack.len() as u64;
-    }
-
-    #[bench]
-    fn substring_with_created_searcher(b: &mut test::Bencher) {
-        bench_substring(b, |hs| XYZZY.find(hs))
-    }
-
-    #[bench]
-    fn substring_stdlib_find(b: &mut test::Bencher) {
-        bench_substring(b, |hs| hs.find("xyzzy"))
-    }
-}

From 25e62c77722b8ec37432062ccaf3103e37c591d4 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 9 Oct 2023 00:48:32 -0400
Subject: [PATCH 05/21] Add a benchmark for a full 16 chars

---
 benches/benchmarks.rs | 89 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 9 deletions(-)

diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 4010fe6..419b003 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -1,11 +1,12 @@
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use jetscii::{ascii_chars, AsciiCharsConst, SubstringConst};
 use std::hint::black_box;
-use std::sync::OnceLock;
 
 static SPACE: AsciiCharsConst = ascii_chars!(' ');
 static XML_DELIM_3: AsciiCharsConst = ascii_chars!('<', '>', '&');
 static XML_DELIM_5: AsciiCharsConst = ascii_chars!('<', '>', '&', '\'', '"');
+static BIG_16: AsciiCharsConst =
+    ascii_chars!('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
 
 static SUBSTRING: SubstringConst = SubstringConst::new("xyzzy");
 
@@ -109,14 +110,84 @@ fn xml5(c: &mut Criterion) {
     });
 }
 
-fn memchr_finder() -> &'static memchr::memmem::Finder<'static> {
-    static FINDER: OnceLock<memchr::memmem::Finder> = OnceLock::new();
-    FINDER.get_or_init(|| memchr::memmem::Finder::new("xyzzy"))
-}
+fn big_16(c: &mut Criterion) {
+    let mut haystack = prefix_string();
+    haystack.push('P');
+    let haystack = black_box(haystack);
 
-fn substring() -> &'static SubstringConst {
-    static SUBSTRING: OnceLock<SubstringConst> = OnceLock::new();
-    SUBSTRING.get_or_init(|| SubstringConst::new("xyzzy"))
+    let mut group = c.benchmark_group("find_big_16");
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| BIG_16.find(&haystack));
+    });
+    group.bench_function("stdlib_find_char_set", |b| {
+        b.iter(|| {
+            haystack.find(
+                &[
+                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+                ][..],
+            )
+        });
+    });
+    group.bench_function("stdlib_find_closure", |b| {
+        b.iter(|| {
+            haystack.find(|c| {
+                c == 'A'
+                    || c == 'B'
+                    || c == 'C'
+                    || c == 'D'
+                    || c == 'E'
+                    || c == 'F'
+                    || c == 'G'
+                    || c == 'H'
+                    || c == 'I'
+                    || c == 'J'
+                    || c == 'K'
+                    || c == 'L'
+                    || c == 'M'
+                    || c == 'N'
+                    || c == 'O'
+                    || c == 'P'
+            })
+        });
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            haystack.bytes().position(|c| {
+                c == b'A'
+                    || c == b'B'
+                    || c == b'C'
+                    || c == b'D'
+                    || c == b'E'
+                    || c == b'F'
+                    || c == b'G'
+                    || c == b'H'
+                    || c == b'I'
+                    || c == b'J'
+                    || c == b'K'
+                    || c == b'L'
+                    || c == b'M'
+                    || c == b'N'
+                    || c == b'O'
+                    || c == b'P'
+            })
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter(|| {
+            let bytes = haystack.as_bytes();
+            let indexes = [
+                memchr::memchr3(b'A', b'B', b'C', bytes),
+                memchr::memchr3(b'D', b'E', b'F', bytes),
+                memchr::memchr3(b'G', b'H', b'I', bytes),
+                memchr::memchr3(b'J', b'K', b'L', bytes),
+                memchr::memchr3(b'M', b'N', b'O', bytes),
+                memchr::memchr(b'P', bytes),
+            ];
+            indexes.iter().copied().flatten().min()
+        })
+    });
 }
 
 fn substr(c: &mut Criterion) {
@@ -139,5 +210,5 @@ fn substr(c: &mut Criterion) {
     });
 }
 
-criterion_group!(benches, spaces, xml3, xml5, substr);
+criterion_group!(benches, spaces, xml3, xml5, big_16, substr);
 criterion_main!(benches);

From 0c426090254390f6731a50e4f182c30b15075f15 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Fri, 13 Oct 2023 17:55:32 -0400
Subject: [PATCH 06/21] Add benchmark for worst case for memchr

Add a test that looks for the first item in a long haystack
---
 benches/benchmarks.rs | 79 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 419b003..1f2124d 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -188,6 +188,85 @@ fn big_16(c: &mut Criterion) {
             indexes.iter().copied().flatten().min()
         })
     });
+
+    group.finish();
+
+    let mut haystack = prefix_string();
+    haystack.insert(0, 'P');
+    let haystack = black_box(haystack);
+    let mut group = c.benchmark_group("find_big_16_early_return");
+    group.throughput(Throughput::Bytes(1));
+
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| BIG_16.find(&haystack));
+    });
+    group.bench_function("stdlib_find_char_set", |b| {
+        b.iter(|| {
+            haystack.find(
+                &[
+                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+                ][..],
+            )
+        });
+    });
+    group.bench_function("stdlib_find_closure", |b| {
+        b.iter(|| {
+            haystack.find(|c| {
+                c == 'A'
+                    || c == 'B'
+                    || c == 'C'
+                    || c == 'D'
+                    || c == 'E'
+                    || c == 'F'
+                    || c == 'G'
+                    || c == 'H'
+                    || c == 'I'
+                    || c == 'J'
+                    || c == 'K'
+                    || c == 'L'
+                    || c == 'M'
+                    || c == 'N'
+                    || c == 'O'
+                    || c == 'P'
+            })
+        });
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            haystack.bytes().position(|c| {
+                c == b'A'
+                    || c == b'B'
+                    || c == b'C'
+                    || c == b'D'
+                    || c == b'E'
+                    || c == b'F'
+                    || c == b'G'
+                    || c == b'H'
+                    || c == b'I'
+                    || c == b'J'
+                    || c == b'K'
+                    || c == b'L'
+                    || c == b'M'
+                    || c == b'N'
+                    || c == b'O'
+                    || c == b'P'
+            })
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter(|| {
+            let bytes = haystack.as_bytes();
+            let indexes = [
+                memchr::memchr3(b'A', b'B', b'C', bytes),
+                memchr::memchr3(b'D', b'E', b'F', bytes),
+                memchr::memchr3(b'G', b'H', b'I', bytes),
+                memchr::memchr3(b'J', b'K', b'L', bytes),
+                memchr::memchr3(b'M', b'N', b'O', bytes),
+                memchr::memchr(b'P', bytes),
+            ];
+            indexes.iter().copied().flatten().min()
+        })
+    });
 }
 
 fn substr(c: &mut Criterion) {

From d85b2359bf72ef51a7ff512ec73dcfcc93b3da9c Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 8 Oct 2023 23:02:56 -0400
Subject: [PATCH 07/21] Use memmap2

The memmap crate is unmaintained, instead, use the maintained memmap2 crate
---
 Cargo.toml  | 2 +-
 src/simd.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 7fab73e..064f8c6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ pattern = []
 proptest = "1.0.0"
 lazy_static = "1.0.0"
 region = "3.0.0"
-memmap = "0.7.0"
+memmap2 = "0.9.0"
 criterion = "0.5.0"
 memchr = "2.0.0"
 
diff --git a/src/simd.rs b/src/simd.rs
index 504543a..8e07156 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -324,7 +324,7 @@ impl<'a, 'b> PackedCompareControl for &'b ByteSubstring<'a> {
 mod test {
     use proptest::prelude::*;
     use std::{fmt, str};
-    use memmap::MmapMut;
+    use memmap2::MmapMut;
     use region::Protection;
     use lazy_static::lazy_static;
 

From 0b1259dd9beb7a8e8c558744b1d2b9d9cc41150d Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 9 Oct 2023 00:33:24 -0400
Subject: [PATCH 08/21] Remove bounds on struct definitions

Structs don't need the bounds, only the implementations
---
 src/fallback.rs | 5 +----
 src/lib.rs      | 9 ++-------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/fallback.rs b/src/fallback.rs
index 9dc79bc..5049c7a 100644
--- a/src/fallback.rs
+++ b/src/fallback.rs
@@ -1,10 +1,7 @@
 // TODO: Try boxing the closure to see if we can hide the type
 // TODO: Or maybe use a function pointer?
 
-pub struct Bytes<F>
-where
-    F: Fn(u8) -> bool,
-{
+pub struct Bytes<F> {
     fallback: F,
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index 4751f61..af28294 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -180,10 +180,7 @@ macro_rules! dispatch {
 }
 
 /// Searches a slice for a set of bytes. Up to 16 bytes may be used.
-pub struct Bytes<F>
-where
-    F: Fn(u8) -> bool,
-{
+pub struct Bytes<F> {
     #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
     simd: simd::Bytes,
 
@@ -255,9 +252,7 @@ pub type BytesConst = Bytes<fn(u8) -> bool>;
 
 /// Searches a string for a set of ASCII characters. Up to 16
 /// characters may be used.
-pub struct AsciiChars<F>(Bytes<F>)
-where
-    F: Fn(u8) -> bool;
+pub struct AsciiChars<F>(Bytes<F>);
 
 impl<F> AsciiChars<F>
 where

From f7fd4290a0785c24c6b74fda2f39ed4725a0d16e Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 9 Oct 2023 01:20:22 -0400
Subject: [PATCH 09/21] Update changelog

---
 CHANGELOG.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ad8704..551b164 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,11 @@ to [Semantic Versioning][].
 
 ## [Unreleased]
 
+- Use `const fn`s where possible, which allows e.g.
+  `static SPACE: AsciiCharsConst = ascii_chars!(' ');`
+  and `static SUBSTRING: SubstringConst = SubstringConst::new("xyzzy");`
+- Trait bounds removed from generic types, bounds are only required for impls
+
 ## [0.5.3] - 2022-07-06
 
 - Fix buffer overflows in find. (#55)

From 8dc6fcae4db54e31f11b96011a14600052b5ae28 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 9 Oct 2023 20:45:10 -0400
Subject: [PATCH 10/21] Apply clippy suggesions

Mostly just adding #[must_use]
---
 src/fallback.rs |  2 +-
 src/lib.rs      | 12 ++++++++++++
 src/simd.rs     |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/fallback.rs b/src/fallback.rs
index 5049c7a..27466f1 100644
--- a/src/fallback.rs
+++ b/src/fallback.rs
@@ -14,7 +14,7 @@ where
     }
 
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
-        haystack.iter().cloned().position(&self.fallback)
+        haystack.iter().copied().position(&self.fallback)
     }
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index af28294..53b7b8b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -203,6 +203,7 @@ where
     /// intrinsics are not available. The closure **must** search for
     /// the same bytes as in the array.
     #[allow(unused_variables)]
+    #[must_use]
     pub fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
         Bytes {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
@@ -222,6 +223,7 @@ where
     // the fallback at all, so it will be dropped in this function.
     #[doc(hidden)]
     #[allow(unused_variables)]
+    #[must_use]
     pub const fn new_const(bytes: [u8; 16], len: i32, fallback: F) -> Self
     where
         F: Copy,
@@ -239,6 +241,7 @@ where
 
     /// Searches the slice for the first matching byte in the set.
     #[inline]
+    #[must_use]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
         dispatch! {
             simd: unsafe { self.simd.find(haystack) },
@@ -268,6 +271,7 @@ where
     /// ### Panics
     ///
     /// - If you provide a non-ASCII byte.
+    #[must_use]
     pub fn new(chars: [u8; 16], len: i32, fallback: F) -> Self {
         for &b in &chars {
             assert!(b < 128, "Cannot have non-ASCII bytes");
@@ -281,6 +285,7 @@ where
     // (thus, have no destructor). If we _know_ we're using sse4.2, we don't use
     // the fallback at all, so it will be dropped in this function.
     #[doc(hidden)]
+    #[must_use]
     pub const fn new_const(chars: [u8; 16], len: i32, fallback: F) -> Self
     where
         F: Copy,
@@ -295,6 +300,7 @@ where
 
     /// Searches the string for the first matching ASCII byte in the set.
     #[inline]
+    #[must_use]
     pub fn find(&self, haystack: &str) -> Option<usize> {
         self.0.find(haystack.as_bytes())
     }
@@ -313,6 +319,7 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
+    #[must_use]
     pub const fn new(needle: &'a [u8]) -> Self {
         ByteSubstring {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
@@ -324,6 +331,7 @@ impl<'a> ByteSubstring<'a> {
     }
 
     #[cfg(feature = "pattern")]
+    #[must_use]
     fn needle_len(&self) -> usize {
         dispatch! {
             simd: self.simd.needle_len(),
@@ -333,6 +341,7 @@ impl<'a> ByteSubstring<'a> {
 
     /// Searches the slice for the first occurence of the subslice.
     #[inline]
+    #[must_use]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
         dispatch! {
             simd: unsafe { self.simd.find(haystack) },
@@ -348,17 +357,20 @@ pub type ByteSubstringConst = ByteSubstring<'static>;
 pub struct Substring<'a>(ByteSubstring<'a>);
 
 impl<'a> Substring<'a> {
+    #[must_use]
     pub const fn new(needle: &'a str) -> Self {
         Substring(ByteSubstring::new(needle.as_bytes()))
     }
 
     #[cfg(feature = "pattern")]
+    #[must_use]
     fn needle_len(&self) -> usize {
         self.0.needle_len()
     }
 
     /// Searches the string for the first occurence of the substring.
     #[inline]
+    #[must_use]
     pub fn find(&self, haystack: &str) -> Option<usize> {
         self.0.find(haystack.as_bytes())
     }
diff --git a/src/simd.rs b/src/simd.rs
index 8e07156..f633e1d 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -41,7 +41,7 @@ where
     packed.cmpestri(haystack.as_ptr(), haystack.len() as i32)
 }
 
-/// The PCMPxSTRx instructions always read 16 bytes worth of
+/// The `PCMPxSTRx` instructions always read 16 bytes worth of
 /// data. Although the instructions handle unaligned memory access
 /// just fine, they might attempt to read off the end of a page
 /// and into a protected area.

From df92a1c0988f4c1cfc2105be094ea8e22fe81c09 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Fri, 13 Oct 2023 22:15:56 -0400
Subject: [PATCH 11/21] Add inline to some small functions used in sse impl

This speeds up the criteron benchmarks by almost 2x

I believe this is needed because e.g. Bytes::find is inlined, and calls `find`
generically, which will call PackedCompareControl methods. So the code calling
the methods will be inlined into the calling crate, but the implemetations of
the PackedCompareControl are not accessable to the code in the calling crate,
so they will end up as actual function calls. However these functions are
_super_ simple, and inlining them helps a LOT, so adding `#[inline]` to these
functions, and making their implementation available to calling crates has a
huge effect.

This was only seen when moving to criterion because previously, nightly
benchmarks were implemented in the library crate itself, and so these functions
were already elegable for inlining. Criteron results were actually more
accurate to what callers of the crate would actually see!
---
 src/simd.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/simd.rs b/src/simd.rs
index f633e1d..89db034 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -252,9 +252,12 @@ impl Bytes {
 }
 
 impl<'b> PackedCompareControl for &'b Bytes {
+    #[inline]
     fn needle(&self) -> __m128i {
         self.needle
     }
+
+    #[inline]
     fn needle_len(&self) -> i32 {
         self.needle_len
     }
@@ -312,9 +315,12 @@ impl<'a> ByteSubstring<'a> {
 }
 
 impl<'a, 'b> PackedCompareControl for &'b ByteSubstring<'a> {
+    #[inline]
     fn needle(&self) -> __m128i {
         self.needle
     }
+
+    #[inline]
     fn needle_len(&self) -> i32 {
         self.needle_len
     }

From 0cf5a366c75106d78ecc83e33a95aa94a41f1d0e Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sat, 14 Oct 2023 15:47:07 -0400
Subject: [PATCH 12/21] Add the "teddy" algorithm from aho-corasick

Per suggestion from @BurntSushi [here](https://github.com/tafia/quick-xml/pull/664#issuecomment-1763371610)

On my M1, tt appears to be slower but competitive with memchr up to memchr3,
then start being the from 5-16
---
 Cargo.toml            |  1 +
 benches/benchmarks.rs | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index 064f8c6..1bccce0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ benchmarks = []
 pattern = []
 
 [dev-dependencies]
+aho-corasick = "1.1.0"
 proptest = "1.0.0"
 lazy_static = "1.0.0"
 region = "3.0.0"
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 1f2124d..4e9a3ef 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -40,6 +40,10 @@ fn spaces(c: &mut Criterion) {
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| haystack.bytes().position(|c| c == b' '));
     });
+    group.bench_function("teddy", |b| {
+        let searcher = aho_corasick::packed::Searcher::new([" "]).unwrap();
+        b.iter(|| searcher.find(&haystack).map(|m| m.start()));
+    });
     group.bench_function("memchr", |b| {
         b.iter(|| memchr::memchr(b' ', haystack.as_bytes()));
     });
@@ -69,6 +73,10 @@ fn xml3(c: &mut Criterion) {
                 .position(|c| c == b'<' || c == b'>' || c == b'&')
         });
     });
+    group.bench_function("teddy", |b| {
+        let searcher = aho_corasick::packed::Searcher::new(["<", ">", "&"]).unwrap();
+        b.iter(|| searcher.find(&haystack).map(|m| m.start()));
+    });
     group.bench_function("memchr", |b| {
         b.iter(|| memchr::memchr3(b'<', b'>', b'&', haystack.as_bytes()));
     });
@@ -98,6 +106,10 @@ fn xml5(c: &mut Criterion) {
                 .position(|c| c == b'<' || c == b'>' || c == b'&' || c == b'\'' || c == b'"')
         });
     });
+    group.bench_function("teddy", |b| {
+        let searcher = aho_corasick::packed::Searcher::new(["<", ">", "&", "'", "\""]).unwrap();
+        b.iter(|| searcher.find(&haystack).map(|m| m.start()));
+    });
     group.bench_function("memchr", |b| {
         b.iter(|| {
             let bytes = haystack.as_bytes();
@@ -174,6 +186,10 @@ fn big_16(c: &mut Criterion) {
             })
         });
     });
+    group.bench_function("teddy", |b| {
+        let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
+        b.iter(|| searcher.find(&haystack).map(|m| m.start()));
+    });
     group.bench_function("memchr", |b| {
         b.iter(|| {
             let bytes = haystack.as_bytes();
@@ -253,6 +269,10 @@ fn big_16(c: &mut Criterion) {
             })
         });
     });
+    group.bench_function("teddy", |b| {
+        let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
+        b.iter(|| searcher.find(&haystack).map(|m| m.start()));
+    });
     group.bench_function("memchr", |b| {
         b.iter(|| {
             let bytes = haystack.as_bytes();

From 08ff586765e196f329cee4aed48a5f3295e26143 Mon Sep 17 00:00:00 2001
From: Jake Goulding <jake.goulding@gmail.com>
Date: Tue, 17 Oct 2023 18:47:57 -0400
Subject: [PATCH 13/21] Fix CI by setting up target configuration for ASAN

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8d63685..06ae85e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -78,5 +78,6 @@ jobs:
         uses: ./.github/actions/test
         with:
           toolchain: nightly
+          target: "x86_64-unknown-linux-gnu"
           rustflags: "-Z sanitizer=address"
           rustdocflags: "-Z sanitizer=address"

From 434c045ba75de9c37870962b91353ee9ab8b2ee7 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Tue, 17 Oct 2023 21:37:21 -0400
Subject: [PATCH 14/21] Don't make things const

We may not want to be stuck with const-constructable implementations
---
 CHANGELOG.md          |  3 ---
 benches/benchmarks.rs | 57 ++++++++++++++++++++++++++++++++-----------
 build.rs              |  4 +--
 src/fallback.rs       |  6 ++---
 src/lib.rs            | 46 ++--------------------------------
 src/simd.rs           |  6 ++---
 6 files changed, 53 insertions(+), 69 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 551b164..b572f42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,9 +10,6 @@ to [Semantic Versioning][].
 
 ## [Unreleased]
 
-- Use `const fn`s where possible, which allows e.g.
-  `static SPACE: AsciiCharsConst = ascii_chars!(' ');`
-  and `static SUBSTRING: SubstringConst = SubstringConst::new("xyzzy");`
 - Trait bounds removed from generic types, bounds are only required for impls
 
 ## [0.5.3] - 2022-07-06
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 4e9a3ef..ffafaa2 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -1,14 +1,37 @@
 use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 use jetscii::{ascii_chars, AsciiCharsConst, SubstringConst};
 use std::hint::black_box;
+use std::sync::OnceLock;
 
-static SPACE: AsciiCharsConst = ascii_chars!(' ');
-static XML_DELIM_3: AsciiCharsConst = ascii_chars!('<', '>', '&');
-static XML_DELIM_5: AsciiCharsConst = ascii_chars!('<', '>', '&', '\'', '"');
-static BIG_16: AsciiCharsConst =
-    ascii_chars!('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P');
+static SPACE: OnceLock<AsciiCharsConst> = OnceLock::new();
 
-static SUBSTRING: SubstringConst = SubstringConst::new("xyzzy");
+fn space() -> &'static AsciiCharsConst {
+    SPACE.get_or_init(|| ascii_chars!(' '))
+}
+
+static XML_DELIM_3: OnceLock<AsciiCharsConst> = OnceLock::new();
+
+fn xml_delim_3() -> &'static AsciiCharsConst {
+    XML_DELIM_3.get_or_init(|| ascii_chars!('<', '>', '&'))
+}
+
+static XML_DELIM_5: OnceLock<AsciiCharsConst> = OnceLock::new();
+
+fn xml_delim_5() -> &'static AsciiCharsConst {
+    XML_DELIM_5.get_or_init(|| ascii_chars!('<', '>', '&', '\'', '"'))
+}
+
+static BIG_16: OnceLock<AsciiCharsConst> = OnceLock::new();
+
+fn big_16() -> &'static AsciiCharsConst {
+    BIG_16.get_or_init(|| ascii_chars!('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'))
+}
+
+static SUBSTRING: OnceLock<SubstringConst> = OnceLock::new();
+
+fn substring() -> &'static SubstringConst {
+    SUBSTRING.get_or_init(|| SubstringConst::new("xyzzy"))
+}
 
 fn prefix_string() -> String {
     "a".repeat(5 * 1024 * 1024)
@@ -23,7 +46,8 @@ fn spaces(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
 
     group.bench_function("ascii_chars", |b| {
-        b.iter(|| SPACE.find(&haystack));
+        let space = space();
+        b.iter(|| space.find(&haystack));
     });
     group.bench_function("stdlib_find_string", |b| {
         b.iter(|| haystack.find(" "));
@@ -58,7 +82,8 @@ fn xml3(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
 
     group.bench_function("ascii_chars", |b| {
-        b.iter(|| XML_DELIM_3.find(&haystack));
+        let xml_delim_3 = xml_delim_3();
+        b.iter(|| xml_delim_3.find(&haystack));
     });
     group.bench_function("stdlib_find_char_set", |b| {
         b.iter(|| haystack.find(&['<', '>', '&'][..]));
@@ -91,7 +116,8 @@ fn xml5(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
 
     group.bench_function("ascii_chars", |b| {
-        b.iter(|| XML_DELIM_5.find(&haystack));
+        let xml_delim_5 = xml_delim_5();
+        b.iter(|| xml_delim_5.find(&haystack));
     });
     group.bench_function("stdlib_find_char_set", |b| {
         b.iter(|| haystack.find(&['<', '>', '&', '\'', '"'][..]));
@@ -122,7 +148,7 @@ fn xml5(c: &mut Criterion) {
     });
 }
 
-fn big_16(c: &mut Criterion) {
+fn big_16_benches(c: &mut Criterion) {
     let mut haystack = prefix_string();
     haystack.push('P');
     let haystack = black_box(haystack);
@@ -131,7 +157,8 @@ fn big_16(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
 
     group.bench_function("ascii_chars", |b| {
-        b.iter(|| BIG_16.find(&haystack));
+        let big_16 = big_16();
+        b.iter(|| big_16.find(&haystack));
     });
     group.bench_function("stdlib_find_char_set", |b| {
         b.iter(|| {
@@ -214,7 +241,8 @@ fn big_16(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(1));
 
     group.bench_function("ascii_chars", |b| {
-        b.iter(|| BIG_16.find(&haystack));
+        let big_16 = big_16();
+        b.iter(|| big_16.find(&haystack));
     });
     group.bench_function("stdlib_find_char_set", |b| {
         b.iter(|| {
@@ -298,7 +326,8 @@ fn substr(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
 
     group.bench_function("substring", |b| {
-        b.iter(|| SUBSTRING.find(&haystack));
+        let substring = substring();
+        b.iter(|| substring.find(&haystack));
     });
     group.bench_function("stdlib_find_string", |b| {
         b.iter(|| haystack.find("xyzzy"));
@@ -309,5 +338,5 @@ fn substr(c: &mut Criterion) {
     });
 }
 
-criterion_group!(benches, spaces, xml3, xml5, big_16, substr);
+criterion_group!(benches, spaces, xml3, xml5, big_16_benches, substr);
 criterion_main!(benches);
diff --git a/build.rs b/build.rs
index f21f763..20a8d92 100644
--- a/build.rs
+++ b/build.rs
@@ -58,7 +58,7 @@ fn macros_bytes(f: &mut File, base: &Path) {
             let closure = format!("|c| {}", closure_body.join(" || "));
 
             format!(
-                "({}) => ($crate::Bytes::new_const([{}], {}, {}));\n",
+                "({}) => ($crate::Bytes::new([{}], {}, {}));\n",
                 args, array, max, closure
             )
         })
@@ -93,7 +93,7 @@ fn macros_ascii_chars(f: &mut File, base: &Path) {
             let closure = format!("|c| {}", closure_body.join(" || "));
 
             format!(
-                "({}) => ($crate::AsciiChars::new_const([{}], {}, {}));\n",
+                "({}) => ($crate::AsciiChars::new([{}], {}, {}));\n",
                 args, array, max, closure
             )
         })
diff --git a/src/fallback.rs b/src/fallback.rs
index 27466f1..48d4b62 100644
--- a/src/fallback.rs
+++ b/src/fallback.rs
@@ -9,7 +9,7 @@ impl<F> Bytes<F>
 where
     F: Fn(u8) -> bool,
 {
-    pub const fn new(fallback: F) -> Self {
+    pub fn new(fallback: F) -> Self {
         Bytes { fallback }
     }
 
@@ -23,12 +23,12 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub const fn new(needle: &'a[u8]) -> Self {
+    pub fn new(needle: &'a[u8]) -> Self {
         ByteSubstring { needle }
     }
 
     #[cfg(feature = "pattern")]
-    pub const fn needle_len(&self) -> usize {
+    pub fn needle_len(&self) -> usize {
         self.needle.len()
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index 53b7b8b..f6b875c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -216,29 +216,6 @@ where
         }
     }
 
-    // This constructor is used by the `bytes!` macro, and shouldn't be relied on directly.
-    //
-    // To allow F to be dropped in a const context, we require F to be Copy
-    // (thus, have no destructor). If we _know_ we're using sse4.2, we don't use
-    // the fallback at all, so it will be dropped in this function.
-    #[doc(hidden)]
-    #[allow(unused_variables)]
-    #[must_use]
-    pub const fn new_const(bytes: [u8; 16], len: i32, fallback: F) -> Self
-    where
-        F: Copy,
-    {
-        Self {
-            #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
-            simd: simd::Bytes::new(bytes, len),
-
-            #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
-            fallback: fallback::Bytes::new(fallback),
-
-            _fallback: PhantomData,
-        }
-    }
-
     /// Searches the slice for the first matching byte in the set.
     #[inline]
     #[must_use]
@@ -279,25 +256,6 @@ where
         Self(Bytes::new(chars, len, fallback))
     }
 
-    // This constructor is used by the `ascii_chars!` macro, and shouldn't be relied on directly.
-    //
-    // To allow F to be dropped in a const context, we require F to be Copy
-    // (thus, have no destructor). If we _know_ we're using sse4.2, we don't use
-    // the fallback at all, so it will be dropped in this function.
-    #[doc(hidden)]
-    #[must_use]
-    pub const fn new_const(chars: [u8; 16], len: i32, fallback: F) -> Self
-    where
-        F: Copy,
-    {
-        let mut i = 0;
-        while i < chars.len() {
-            assert!(chars[i] < 128, "Cannot have non-ASCII bytes");
-            i += 1;
-        }
-        Self(Bytes::new_const(chars, len, fallback))
-    }
-
     /// Searches the string for the first matching ASCII byte in the set.
     #[inline]
     #[must_use]
@@ -320,7 +278,7 @@ pub struct ByteSubstring<'a> {
 
 impl<'a> ByteSubstring<'a> {
     #[must_use]
-    pub const fn new(needle: &'a [u8]) -> Self {
+    pub fn new(needle: &'a [u8]) -> Self {
         ByteSubstring {
             #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
             simd: simd::ByteSubstring::new(needle),
@@ -358,7 +316,7 @@ pub struct Substring<'a>(ByteSubstring<'a>);
 
 impl<'a> Substring<'a> {
     #[must_use]
-    pub const fn new(needle: &'a str) -> Self {
+    pub fn new(needle: &'a str) -> Self {
         Substring(ByteSubstring::new(needle.as_bytes()))
     }
 
diff --git a/src/simd.rs b/src/simd.rs
index 89db034..608171d 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -237,7 +237,7 @@ pub struct Bytes {
 }
 
 impl Bytes {
-    pub const fn new(bytes: [u8; 16], needle_len: i32) -> Self {
+    pub fn new(bytes: [u8; 16], needle_len: i32) -> Self {
         Bytes {
             needle: unsafe { TransmuteToSimd { bytes }.simd },
             needle_len,
@@ -270,7 +270,7 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub const fn new(needle: &'a[u8]) -> Self {
+    pub fn new(needle: &'a[u8]) -> Self {
         let mut simd_needle = [0; 16];
         let len = if simd_needle.len() < needle.len() {
             simd_needle.len()
@@ -290,7 +290,7 @@ impl<'a> ByteSubstring<'a> {
     }
 
     #[cfg(feature = "pattern")]
-    pub const fn needle_len(&self) -> usize {
+    pub fn needle_len(&self) -> usize {
         self.complete_needle.len()
     }
 

From be8622ed20763d70304bbed64d9be4cb6844e093 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 11 May 2025 22:24:52 -0400
Subject: [PATCH 15/21] configure unexpected_cfgs lint

---
 Cargo.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index 1bccce0..043e2b1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,3 +31,6 @@ memchr = "2.0.0"
 [[bench]]
 name = "benchmarks"
 harness = false
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(jetscii_sse4_2, values("no", "maybe", "yes"))'] }
\ No newline at end of file

From d48378d597688a19d8e85c179251673545c68731 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 19 May 2025 12:26:44 -0400
Subject: [PATCH 16/21] remove some extra benchmarks that don't make sense

---
 benches/benchmarks.rs | 127 ------------------------------------------
 1 file changed, 127 deletions(-)

diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index ffafaa2..2116ed8 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -49,21 +49,6 @@ fn spaces(c: &mut Criterion) {
         let space = space();
         b.iter(|| space.find(&haystack));
     });
-    group.bench_function("stdlib_find_string", |b| {
-        b.iter(|| haystack.find(" "));
-    });
-    group.bench_function("stdlib_find_char", |b| {
-        b.iter(|| haystack.find(' '));
-    });
-    group.bench_function("stdlib_find_char_set", |b| {
-        b.iter(|| haystack.find(&[' '][..]));
-    });
-    group.bench_function("stdlib_find_closure", |b| {
-        b.iter(|| haystack.find(|c| c == ' '));
-    });
-    group.bench_function("stdlib_iter_position", |b| {
-        b.iter(|| haystack.bytes().position(|c| c == b' '));
-    });
     group.bench_function("teddy", |b| {
         let searcher = aho_corasick::packed::Searcher::new([" "]).unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
@@ -85,12 +70,6 @@ fn xml3(c: &mut Criterion) {
         let xml_delim_3 = xml_delim_3();
         b.iter(|| xml_delim_3.find(&haystack));
     });
-    group.bench_function("stdlib_find_char_set", |b| {
-        b.iter(|| haystack.find(&['<', '>', '&'][..]));
-    });
-    group.bench_function("stdlib_find_closure", |b| {
-        b.iter(|| haystack.find(|c| c == '<' || c == '>' || c == '&'));
-    });
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| {
             haystack
@@ -119,12 +98,6 @@ fn xml5(c: &mut Criterion) {
         let xml_delim_5 = xml_delim_5();
         b.iter(|| xml_delim_5.find(&haystack));
     });
-    group.bench_function("stdlib_find_char_set", |b| {
-        b.iter(|| haystack.find(&['<', '>', '&', '\'', '"'][..]));
-    });
-    group.bench_function("stdlib_find_closure", |b| {
-        b.iter(|| haystack.find(|c| c == '<' || c == '>' || c == '&' || c == '\'' || c == '"'));
-    });
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| {
             haystack
@@ -136,16 +109,6 @@ fn xml5(c: &mut Criterion) {
         let searcher = aho_corasick::packed::Searcher::new(["<", ">", "&", "'", "\""]).unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
     });
-    group.bench_function("memchr", |b| {
-        b.iter(|| {
-            let bytes = haystack.as_bytes();
-            let indexes = [
-                memchr::memchr3(b'<', b'>', b'&', bytes),
-                memchr::memchr2(b'\'', b'"', bytes),
-            ];
-            indexes.iter().copied().flatten().min()
-        });
-    });
 }
 
 fn big_16_benches(c: &mut Criterion) {
@@ -160,37 +123,6 @@ fn big_16_benches(c: &mut Criterion) {
         let big_16 = big_16();
         b.iter(|| big_16.find(&haystack));
     });
-    group.bench_function("stdlib_find_char_set", |b| {
-        b.iter(|| {
-            haystack.find(
-                &[
-                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
-                ][..],
-            )
-        });
-    });
-    group.bench_function("stdlib_find_closure", |b| {
-        b.iter(|| {
-            haystack.find(|c| {
-                c == 'A'
-                    || c == 'B'
-                    || c == 'C'
-                    || c == 'D'
-                    || c == 'E'
-                    || c == 'F'
-                    || c == 'G'
-                    || c == 'H'
-                    || c == 'I'
-                    || c == 'J'
-                    || c == 'K'
-                    || c == 'L'
-                    || c == 'M'
-                    || c == 'N'
-                    || c == 'O'
-                    || c == 'P'
-            })
-        });
-    });
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| {
             haystack.bytes().position(|c| {
@@ -217,20 +149,6 @@ fn big_16_benches(c: &mut Criterion) {
         let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
     });
-    group.bench_function("memchr", |b| {
-        b.iter(|| {
-            let bytes = haystack.as_bytes();
-            let indexes = [
-                memchr::memchr3(b'A', b'B', b'C', bytes),
-                memchr::memchr3(b'D', b'E', b'F', bytes),
-                memchr::memchr3(b'G', b'H', b'I', bytes),
-                memchr::memchr3(b'J', b'K', b'L', bytes),
-                memchr::memchr3(b'M', b'N', b'O', bytes),
-                memchr::memchr(b'P', bytes),
-            ];
-            indexes.iter().copied().flatten().min()
-        })
-    });
 
     group.finish();
 
@@ -244,37 +162,6 @@ fn big_16_benches(c: &mut Criterion) {
         let big_16 = big_16();
         b.iter(|| big_16.find(&haystack));
     });
-    group.bench_function("stdlib_find_char_set", |b| {
-        b.iter(|| {
-            haystack.find(
-                &[
-                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
-                ][..],
-            )
-        });
-    });
-    group.bench_function("stdlib_find_closure", |b| {
-        b.iter(|| {
-            haystack.find(|c| {
-                c == 'A'
-                    || c == 'B'
-                    || c == 'C'
-                    || c == 'D'
-                    || c == 'E'
-                    || c == 'F'
-                    || c == 'G'
-                    || c == 'H'
-                    || c == 'I'
-                    || c == 'J'
-                    || c == 'K'
-                    || c == 'L'
-                    || c == 'M'
-                    || c == 'N'
-                    || c == 'O'
-                    || c == 'P'
-            })
-        });
-    });
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| {
             haystack.bytes().position(|c| {
@@ -301,20 +188,6 @@ fn big_16_benches(c: &mut Criterion) {
         let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
     });
-    group.bench_function("memchr", |b| {
-        b.iter(|| {
-            let bytes = haystack.as_bytes();
-            let indexes = [
-                memchr::memchr3(b'A', b'B', b'C', bytes),
-                memchr::memchr3(b'D', b'E', b'F', bytes),
-                memchr::memchr3(b'G', b'H', b'I', bytes),
-                memchr::memchr3(b'J', b'K', b'L', bytes),
-                memchr::memchr3(b'M', b'N', b'O', bytes),
-                memchr::memchr(b'P', bytes),
-            ];
-            indexes.iter().copied().flatten().min()
-        })
-    });
 }
 
 fn substr(c: &mut Criterion) {

From be3c68fdfde0ef0d0f0f9f4554a9cac5330611b7 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 18 May 2025 22:37:56 -0400
Subject: [PATCH 17/21] Add top level tests

Move the simd-only tests to the top level

This allows testing even when sse4.2 isn't enabled: when it is
available, it will still test the simd implementation, but will test the
fallback otherwise.
---
 src/simd.rs        | 436 --------------------------------------------
 tests/top_level.rs | 442 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 442 insertions(+), 436 deletions(-)
 create mode 100644 tests/top_level.rs

diff --git a/src/simd.rs b/src/simd.rs
index 608171d..694209e 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -325,439 +325,3 @@ impl<'a, 'b> PackedCompareControl for &'b ByteSubstring<'a> {
         self.needle_len
     }
 }
-
-#[cfg(test)]
-mod test {
-    use proptest::prelude::*;
-    use std::{fmt, str};
-    use memmap2::MmapMut;
-    use region::Protection;
-    use lazy_static::lazy_static;
-
-    use super::*;
-
-    lazy_static! {
-        static ref SPACE: Bytes = simd_bytes!(b' ');
-        static ref XML_DELIM_3: Bytes = simd_bytes!(b'<', b'>', b'&');
-        static ref XML_DELIM_5: Bytes = simd_bytes!(b'<', b'>', b'&', b'\'', b'"');
-    }
-
-    trait SliceFindPolyfill<T> {
-        fn find_any(&self, needles: &[T]) -> Option<usize>;
-        fn find_seq(&self, needle: &[T]) -> Option<usize>;
-    }
-
-    impl<T> SliceFindPolyfill<T> for [T]
-    where
-        T: PartialEq,
-    {
-        fn find_any(&self, needles: &[T]) -> Option<usize> {
-            self.iter().position(|c| needles.contains(c))
-        }
-
-        fn find_seq(&self, needle: &[T]) -> Option<usize> {
-            (0..self.len()).find(|&l| self[l..].starts_with(needle))
-        }
-    }
-
-    struct Haystack {
-        data: Vec<u8>,
-        start: usize,
-    }
-
-    impl Haystack {
-        fn without_start(&self) -> &[u8] {
-            &self.data
-        }
-
-        fn with_start(&self) -> &[u8] {
-            &self.data[self.start..]
-        }
-    }
-
-    // Knowing the address of the data can be important
-    impl fmt::Debug for Haystack {
-        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-            f.debug_struct("Haystack")
-                .field("data", &self.data)
-                .field("(addr)", &self.data.as_ptr())
-                .field("start", &self.start)
-                .finish()
-        }
-    }
-
-    /// Creates a set of bytes and an offset inside them. Allows
-    /// checking arbitrary memory offsets, not just where the
-    /// allocator placed a value.
-    fn haystack() -> BoxedStrategy<Haystack> {
-        any::<Vec<u8>>()
-            .prop_flat_map(|data| {
-                let len = 0..=data.len();
-                (Just(data), len)
-            })
-            .prop_map(|(data, start)| Haystack { data, start })
-            .boxed()
-    }
-
-    #[derive(Debug)]
-    struct Needle {
-        data: [u8; 16],
-        len: usize,
-    }
-
-    impl Needle {
-        fn as_slice(&self) -> &[u8] {
-            &self.data[..self.len]
-        }
-    }
-
-    /// Creates an array and the number of valid values
-    fn needle() -> BoxedStrategy<Needle> {
-        (any::<[u8; 16]>(), 0..=16_usize)
-            .prop_map(|(data, len)| Needle { data, len })
-            .boxed()
-    }
-
-    proptest! {
-        #[test]
-        fn works_as_find_does_for_up_to_and_including_16_bytes(
-            (haystack, needle) in (haystack(), needle())
-        ) {
-           let haystack = haystack.without_start();
-
-           let us = unsafe { Bytes::new(needle.data, needle.len as i32).find(haystack) };
-           let them = haystack.find_any(needle.as_slice());
-           assert_eq!(us, them);
-        }
-
-        #[test]
-        fn works_as_find_does_for_various_memory_offsets(
-            (needle, haystack) in (needle(), haystack())
-        ) {
-            let haystack = haystack.with_start();
-
-            let us = unsafe { Bytes::new(needle.data, needle.len as i32).find(haystack) };
-            let them = haystack.find_any(needle.as_slice());
-            assert_eq!(us, them);
-        }
-    }
-
-    #[test]
-    fn can_search_for_null_bytes() {
-        unsafe {
-            let null = simd_bytes!(b'\0');
-            assert_eq!(Some(1), null.find(b"a\0"));
-            assert_eq!(Some(0), null.find(b"\0"));
-            assert_eq!(None, null.find(b""));
-        }
-    }
-
-    #[test]
-    fn can_search_in_null_bytes() {
-        unsafe {
-            let a = simd_bytes!(b'a');
-            assert_eq!(Some(1), a.find(b"\0a"));
-            assert_eq!(None, a.find(b"\0"));
-        }
-    }
-
-    #[test]
-    fn space_is_found() {
-        unsafe {
-            // Since the algorithm operates on 16-byte chunks, it's
-            // important to cover tests around that boundary. Since 16
-            // isn't that big of a number, we might as well do all of
-            // them.
-
-            assert_eq!(Some(0), SPACE.find(b" "));
-            assert_eq!(Some(1), SPACE.find(b"0 "));
-            assert_eq!(Some(2), SPACE.find(b"01 "));
-            assert_eq!(Some(3), SPACE.find(b"012 "));
-            assert_eq!(Some(4), SPACE.find(b"0123 "));
-            assert_eq!(Some(5), SPACE.find(b"01234 "));
-            assert_eq!(Some(6), SPACE.find(b"012345 "));
-            assert_eq!(Some(7), SPACE.find(b"0123456 "));
-            assert_eq!(Some(8), SPACE.find(b"01234567 "));
-            assert_eq!(Some(9), SPACE.find(b"012345678 "));
-            assert_eq!(Some(10), SPACE.find(b"0123456789 "));
-            assert_eq!(Some(11), SPACE.find(b"0123456789A "));
-            assert_eq!(Some(12), SPACE.find(b"0123456789AB "));
-            assert_eq!(Some(13), SPACE.find(b"0123456789ABC "));
-            assert_eq!(Some(14), SPACE.find(b"0123456789ABCD "));
-            assert_eq!(Some(15), SPACE.find(b"0123456789ABCDE "));
-            assert_eq!(Some(16), SPACE.find(b"0123456789ABCDEF "));
-            assert_eq!(Some(17), SPACE.find(b"0123456789ABCDEFG "));
-        }
-    }
-
-    #[test]
-    fn space_not_found() {
-        unsafe {
-            // Since the algorithm operates on 16-byte chunks, it's
-            // important to cover tests around that boundary. Since 16
-            // isn't that big of a number, we might as well do all of
-            // them.
-
-            assert_eq!(None, SPACE.find(b""));
-            assert_eq!(None, SPACE.find(b"0"));
-            assert_eq!(None, SPACE.find(b"01"));
-            assert_eq!(None, SPACE.find(b"012"));
-            assert_eq!(None, SPACE.find(b"0123"));
-            assert_eq!(None, SPACE.find(b"01234"));
-            assert_eq!(None, SPACE.find(b"012345"));
-            assert_eq!(None, SPACE.find(b"0123456"));
-            assert_eq!(None, SPACE.find(b"01234567"));
-            assert_eq!(None, SPACE.find(b"012345678"));
-            assert_eq!(None, SPACE.find(b"0123456789"));
-            assert_eq!(None, SPACE.find(b"0123456789A"));
-            assert_eq!(None, SPACE.find(b"0123456789AB"));
-            assert_eq!(None, SPACE.find(b"0123456789ABC"));
-            assert_eq!(None, SPACE.find(b"0123456789ABCD"));
-            assert_eq!(None, SPACE.find(b"0123456789ABCDE"));
-            assert_eq!(None, SPACE.find(b"0123456789ABCDEF"));
-            assert_eq!(None, SPACE.find(b"0123456789ABCDEFG"));
-        }
-    }
-
-    #[test]
-    fn works_on_nonaligned_beginnings() {
-        unsafe {
-            // We have special code for strings that don't lie on 16-byte
-            // boundaries. Since allocation seems to happen on these
-            // boundaries by default, let's walk around a bit.
-
-            let s = b"0123456789ABCDEF ".to_vec();
-
-            assert_eq!(Some(16), SPACE.find(&s[0..]));
-            assert_eq!(Some(15), SPACE.find(&s[1..]));
-            assert_eq!(Some(14), SPACE.find(&s[2..]));
-            assert_eq!(Some(13), SPACE.find(&s[3..]));
-            assert_eq!(Some(12), SPACE.find(&s[4..]));
-            assert_eq!(Some(11), SPACE.find(&s[5..]));
-            assert_eq!(Some(10), SPACE.find(&s[6..]));
-            assert_eq!(Some(9), SPACE.find(&s[7..]));
-            assert_eq!(Some(8), SPACE.find(&s[8..]));
-            assert_eq!(Some(7), SPACE.find(&s[9..]));
-            assert_eq!(Some(6), SPACE.find(&s[10..]));
-            assert_eq!(Some(5), SPACE.find(&s[11..]));
-            assert_eq!(Some(4), SPACE.find(&s[12..]));
-            assert_eq!(Some(3), SPACE.find(&s[13..]));
-            assert_eq!(Some(2), SPACE.find(&s[14..]));
-            assert_eq!(Some(1), SPACE.find(&s[15..]));
-            assert_eq!(Some(0), SPACE.find(&s[16..]));
-            assert_eq!(None, SPACE.find(&s[17..]));
-        }
-    }
-
-    #[test]
-    fn misalignment_does_not_cause_a_false_positive_before_start() {
-        const AAAA: u8 = 0x01;
-
-        let needle = Needle {
-            data: [
-                AAAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-            ],
-            len: 1,
-        };
-        let haystack = Haystack {
-            data: vec![
-                AAAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                0x00, 0x00,
-            ],
-            start: 1,
-        };
-
-        let haystack = haystack.with_start();
-
-        // Needs to trigger the misalignment code
-        assert_ne!(0, (haystack.as_ptr() as usize) % 16);
-        // There are 64 bits in the mask and we check to make sure the
-        // result is less than the haystack
-        assert!(haystack.len() > 64);
-
-        let us = unsafe { Bytes::new(needle.data, needle.len as i32).find(haystack) };
-        assert_eq!(None, us);
-    }
-
-    #[test]
-    fn xml_delim_3_is_found() {
-        unsafe {
-            assert_eq!(Some(0), XML_DELIM_3.find(b"<"));
-            assert_eq!(Some(0), XML_DELIM_3.find(b">"));
-            assert_eq!(Some(0), XML_DELIM_3.find(b"&"));
-            assert_eq!(None, XML_DELIM_3.find(b""));
-        }
-    }
-
-    #[test]
-    fn xml_delim_5_is_found() {
-        unsafe {
-            assert_eq!(Some(0), XML_DELIM_5.find(b"<"));
-            assert_eq!(Some(0), XML_DELIM_5.find(b">"));
-            assert_eq!(Some(0), XML_DELIM_5.find(b"&"));
-            assert_eq!(Some(0), XML_DELIM_5.find(b"'"));
-            assert_eq!(Some(0), XML_DELIM_5.find(b"\""));
-            assert_eq!(None, XML_DELIM_5.find(b""));
-        }
-    }
-
-    proptest! {
-        #[test]
-        fn works_as_find_does_for_byte_substrings(
-            (needle, haystack) in (any::<Vec<u8>>(), any::<Vec<u8>>())
-        ) {
-            let us = unsafe {
-                let s = ByteSubstring::new(&needle);
-                s.find(&haystack)
-            };
-            let them = haystack.find_seq(&needle);
-            assert_eq!(us, them);
-        }
-    }
-
-    #[test]
-    fn byte_substring_is_found() {
-        unsafe {
-            let substr = ByteSubstring::new(b"zz");
-            assert_eq!(Some(0), substr.find(b"zz"));
-            assert_eq!(Some(1), substr.find(b"0zz"));
-            assert_eq!(Some(2), substr.find(b"01zz"));
-            assert_eq!(Some(3), substr.find(b"012zz"));
-            assert_eq!(Some(4), substr.find(b"0123zz"));
-            assert_eq!(Some(5), substr.find(b"01234zz"));
-            assert_eq!(Some(6), substr.find(b"012345zz"));
-            assert_eq!(Some(7), substr.find(b"0123456zz"));
-            assert_eq!(Some(8), substr.find(b"01234567zz"));
-            assert_eq!(Some(9), substr.find(b"012345678zz"));
-            assert_eq!(Some(10), substr.find(b"0123456789zz"));
-            assert_eq!(Some(11), substr.find(b"0123456789Azz"));
-            assert_eq!(Some(12), substr.find(b"0123456789ABzz"));
-            assert_eq!(Some(13), substr.find(b"0123456789ABCzz"));
-            assert_eq!(Some(14), substr.find(b"0123456789ABCDzz"));
-            assert_eq!(Some(15), substr.find(b"0123456789ABCDEzz"));
-            assert_eq!(Some(16), substr.find(b"0123456789ABCDEFzz"));
-            assert_eq!(Some(17), substr.find(b"0123456789ABCDEFGzz"));
-        }
-    }
-
-    #[test]
-    fn byte_substring_is_not_found() {
-        unsafe {
-            let substr = ByteSubstring::new(b"zz");
-            assert_eq!(None, substr.find(b""));
-            assert_eq!(None, substr.find(b"0"));
-            assert_eq!(None, substr.find(b"01"));
-            assert_eq!(None, substr.find(b"012"));
-            assert_eq!(None, substr.find(b"0123"));
-            assert_eq!(None, substr.find(b"01234"));
-            assert_eq!(None, substr.find(b"012345"));
-            assert_eq!(None, substr.find(b"0123456"));
-            assert_eq!(None, substr.find(b"01234567"));
-            assert_eq!(None, substr.find(b"012345678"));
-            assert_eq!(None, substr.find(b"0123456789"));
-            assert_eq!(None, substr.find(b"0123456789A"));
-            assert_eq!(None, substr.find(b"0123456789AB"));
-            assert_eq!(None, substr.find(b"0123456789ABC"));
-            assert_eq!(None, substr.find(b"0123456789ABCD"));
-            assert_eq!(None, substr.find(b"0123456789ABCDE"));
-            assert_eq!(None, substr.find(b"0123456789ABCDEF"));
-            assert_eq!(None, substr.find(b"0123456789ABCDEFG"));
-        }
-    }
-
-    #[test]
-    fn byte_substring_has_false_positive() {
-        unsafe {
-            // The PCMPESTRI instruction will mark the "a" before "ab" as
-            // a match because it cannot look beyond the 16 byte window
-            // of the haystack. We need to double-check any match to
-            // ensure it completely matches.
-
-            let substr = ByteSubstring::new(b"ab");
-            assert_eq!(Some(16), substr.find(b"aaaaaaaaaaaaaaaaab"))
-            //   this "a" is a false positive ~~~~~~~~~~~~~~~^
-        };
-    }
-
-    #[test]
-    fn byte_substring_needle_is_longer_than_16_bytes() {
-        unsafe {
-            let needle = b"0123456789abcdefg";
-            let haystack = b"0123456789abcdefgh";
-            assert_eq!(Some(0), ByteSubstring::new(needle).find(haystack));
-        }
-    }
-
-    fn with_guarded_string(value: &str, f: impl FnOnce(&str)) {
-        // Allocate a string that ends directly before a
-        // read-protected page.
-
-        let page_size = region::page::size();
-        assert!(value.len() <= page_size);
-
-        // Map two rw-accessible pages of anonymous memory
-        let mut mmap = MmapMut::map_anon(2 * page_size).unwrap();
-
-        let (first_page, second_page) = mmap.split_at_mut(page_size);
-
-        // Prohibit any access to the second page, so that any attempt
-        // to read or write it would trigger a segfault
-        unsafe {
-            region::protect(second_page.as_ptr(), page_size, Protection::NONE).unwrap();
-        }
-
-        // Copy bytes to the end of the first page
-        let dest = &mut first_page[page_size - value.len()..];
-        dest.copy_from_slice(value.as_bytes());
-        f(unsafe { str::from_utf8_unchecked(dest) });
-    }
-
-    #[test]
-    fn works_at_page_boundary() {
-        // PCMPxSTRx instructions are known to read 16 bytes at a
-        // time. This behaviour may cause accidental segfaults by
-        // reading past the page boundary.
-        //
-        // For now, this test failing crashes the whole test
-        // suite. This could be fixed by setting a custom signal
-        // handler, though Rust lacks such facilities at the moment.
-
-        // Allocate a 16-byte string at page boundary.  To verify this
-        // test, set protect=false to prevent segfaults.
-        with_guarded_string("0123456789abcdef", |text| {
-            // Will search for the last char
-            let needle = simd_bytes!(b'f');
-
-            // Check all suffixes of our 16-byte string
-            for offset in 0..text.len() {
-                let tail = &text[offset..];
-                unsafe {
-                    assert_eq!(Some(tail.len() - 1), needle.find(tail.as_bytes()));
-                }
-            }
-        });
-    }
-
-    #[test]
-    fn does_not_access_memory_after_haystack_when_haystack_is_multiple_of_16_bytes_and_no_match() {
-        // For now, this test failing crashes the whole test
-        // suite. This could be fixed by setting a custom signal
-        // handler, though Rust lacks such facilities at the moment.
-        with_guarded_string("0123456789abcdef", |text| {
-            // Will search for a char not present
-            let needle = simd_bytes!(b'z');
-
-            unsafe {
-                assert_eq!(None, needle.find(text.as_bytes()));
-            }
-        });
-    }
-}
diff --git a/tests/top_level.rs b/tests/top_level.rs
new file mode 100644
index 0000000..e453592
--- /dev/null
+++ b/tests/top_level.rs
@@ -0,0 +1,442 @@
+use jetscii::{bytes, ByteSubstring, Bytes, BytesConst};
+use lazy_static::lazy_static;
+use memmap2::MmapMut;
+use proptest::prelude::*;
+use region::Protection;
+use std::{fmt, str};
+
+lazy_static! {
+    static ref SPACE: BytesConst = bytes!(b' ');
+    static ref XML_DELIM_3: BytesConst = bytes!(b'<', b'>', b'&');
+    static ref XML_DELIM_5: BytesConst = bytes!(b'<', b'>', b'&', b'\'', b'"');
+}
+
+trait SliceFindPolyfill<T> {
+    fn find_any(&self, needles: &[T]) -> Option<usize>;
+    fn find_seq(&self, needle: &[T]) -> Option<usize>;
+}
+
+impl<T> SliceFindPolyfill<T> for [T]
+where
+    T: PartialEq,
+{
+    fn find_any(&self, needles: &[T]) -> Option<usize> {
+        self.iter().position(|c| needles.contains(c))
+    }
+
+    fn find_seq(&self, needle: &[T]) -> Option<usize> {
+        (0..self.len()).find(|&l| self[l..].starts_with(needle))
+    }
+}
+
+struct Haystack {
+    data: Vec<u8>,
+    start: usize,
+}
+
+impl Haystack {
+    fn without_start(&self) -> &[u8] {
+        &self.data
+    }
+
+    fn with_start(&self) -> &[u8] {
+        &self.data[self.start..]
+    }
+}
+
+// Knowing the address of the data can be important
+impl fmt::Debug for Haystack {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("Haystack")
+            .field("data", &self.data)
+            .field("(addr)", &self.data.as_ptr())
+            .field("start", &self.start)
+            .finish()
+    }
+}
+
+/// Creates a set of bytes and an offset inside them. Allows
+/// checking arbitrary memory offsets, not just where the
+/// allocator placed a value.
+fn haystack() -> BoxedStrategy<Haystack> {
+    any::<Vec<u8>>()
+        .prop_flat_map(|data| {
+            let len = 0..=data.len();
+            (Just(data), len)
+        })
+        .prop_map(|(data, start)| Haystack { data, start })
+        .boxed()
+}
+
+#[derive(Debug)]
+struct Needle {
+    data: [u8; 16],
+    len: usize,
+}
+
+impl Needle {
+    fn as_slice(&self) -> &[u8] {
+        &self.data[..self.len]
+    }
+}
+
+/// Creates an array and the number of valid values
+fn needle() -> BoxedStrategy<Needle> {
+    (any::<[u8; 16]>(), 0..=16_usize)
+        .prop_map(|(data, len)| Needle { data, len })
+        .boxed()
+}
+
+proptest! {
+    #[test]
+    fn works_as_find_does_for_up_to_and_including_16_bytes(
+        (haystack, needle) in (haystack(), needle())
+    ) {
+       let haystack = haystack.without_start();
+
+       let us = Bytes::new(needle.data, needle.len as i32, |b| needle.as_slice().contains(&b)).find(haystack);
+       let them = haystack.find_any(needle.as_slice());
+       assert_eq!(us, them);
+    }
+
+    #[test]
+    fn iter_works(
+        (haystack, needle) in (haystack(), needle())
+    ) {
+       let haystack = haystack.without_start();
+
+       let bytes = Bytes::new(needle.data, needle.len as i32, |b| needle.as_slice().contains(&b));
+       let mut us = bytes.iter(haystack);
+       let mut them = haystack.iter().enumerate().filter_map(|(i, b)| {
+            if needle.as_slice().contains(b) {
+                Some(i)
+            } else {
+                None
+            }
+       });
+       loop {
+           match (us.next(), them.next()) {
+                (Some(us), Some(them)) => { assert_eq!(us, them); }
+                (Some(_), None) => panic!("them iterator ended before us"),
+                (None, Some(_)) => panic!("us iterator ended before them"),
+                (None, None) => break,
+           }
+       }
+    }
+
+    #[test]
+    fn works_as_find_does_for_various_memory_offsets(
+        (needle, haystack) in (needle(), haystack())
+    ) {
+        let haystack = haystack.with_start();
+
+        let us = Bytes::new(needle.data, needle.len as i32, |b| needle.as_slice().contains(&b)).find(haystack);
+        let them = haystack.find_any(needle.as_slice());
+        assert_eq!(us, them);
+    }
+}
+
+#[test]
+fn can_search_for_null_bytes() {
+    let null = bytes!(b'\0');
+    assert_eq!(Some(1), null.find(b"a\0"));
+    assert_eq!(Some(0), null.find(b"\0"));
+    assert_eq!(None, null.find(b""));
+}
+
+#[test]
+fn can_search_in_null_bytes() {
+    let a = bytes!(b'a');
+    assert_eq!(Some(1), a.find(b"\0a"));
+    assert_eq!(None, a.find(b"\0"));
+}
+
+#[test]
+fn space_is_found() {
+    // Since the simd algorithm operates on 16-byte chunks, it's
+    // important to cover tests around that boundary. Since 16
+    // isn't that big of a number, we might as well do all of
+    // them.
+
+    assert_eq!(Some(0), SPACE.find(b" "));
+    assert_eq!(Some(1), SPACE.find(b"0 "));
+    assert_eq!(Some(2), SPACE.find(b"01 "));
+    assert_eq!(Some(3), SPACE.find(b"012 "));
+    assert_eq!(Some(4), SPACE.find(b"0123 "));
+    assert_eq!(Some(5), SPACE.find(b"01234 "));
+    assert_eq!(Some(6), SPACE.find(b"012345 "));
+    assert_eq!(Some(7), SPACE.find(b"0123456 "));
+    assert_eq!(Some(8), SPACE.find(b"01234567 "));
+    assert_eq!(Some(9), SPACE.find(b"012345678 "));
+    assert_eq!(Some(10), SPACE.find(b"0123456789 "));
+    assert_eq!(Some(11), SPACE.find(b"0123456789A "));
+    assert_eq!(Some(12), SPACE.find(b"0123456789AB "));
+    assert_eq!(Some(13), SPACE.find(b"0123456789ABC "));
+    assert_eq!(Some(14), SPACE.find(b"0123456789ABCD "));
+    assert_eq!(Some(15), SPACE.find(b"0123456789ABCDE "));
+    assert_eq!(Some(16), SPACE.find(b"0123456789ABCDEF "));
+    assert_eq!(Some(17), SPACE.find(b"0123456789ABCDEFG "));
+}
+
+#[test]
+fn space_not_found() {
+    // Since the simd algorithm operates on 16-byte chunks, it's
+    // important to cover tests around that boundary. Since 16
+    // isn't that big of a number, we might as well do all of
+    // them.
+
+    assert_eq!(None, SPACE.find(b""));
+    assert_eq!(None, SPACE.find(b"0"));
+    assert_eq!(None, SPACE.find(b"01"));
+    assert_eq!(None, SPACE.find(b"012"));
+    assert_eq!(None, SPACE.find(b"0123"));
+    assert_eq!(None, SPACE.find(b"01234"));
+    assert_eq!(None, SPACE.find(b"012345"));
+    assert_eq!(None, SPACE.find(b"0123456"));
+    assert_eq!(None, SPACE.find(b"01234567"));
+    assert_eq!(None, SPACE.find(b"012345678"));
+    assert_eq!(None, SPACE.find(b"0123456789"));
+    assert_eq!(None, SPACE.find(b"0123456789A"));
+    assert_eq!(None, SPACE.find(b"0123456789AB"));
+    assert_eq!(None, SPACE.find(b"0123456789ABC"));
+    assert_eq!(None, SPACE.find(b"0123456789ABCD"));
+    assert_eq!(None, SPACE.find(b"0123456789ABCDE"));
+    assert_eq!(None, SPACE.find(b"0123456789ABCDEF"));
+    assert_eq!(None, SPACE.find(b"0123456789ABCDEFG"));
+}
+
+#[test]
+fn works_on_nonaligned_beginnings() {
+    // We have special code for strings that don't lie on 16-byte
+    // boundaries. Since allocation seems to happen on these
+    // boundaries by default, let's walk around a bit.
+
+    let s = b"0123456789ABCDEF ".to_vec();
+
+    assert_eq!(Some(16), SPACE.find(&s[0..]));
+    assert_eq!(Some(15), SPACE.find(&s[1..]));
+    assert_eq!(Some(14), SPACE.find(&s[2..]));
+    assert_eq!(Some(13), SPACE.find(&s[3..]));
+    assert_eq!(Some(12), SPACE.find(&s[4..]));
+    assert_eq!(Some(11), SPACE.find(&s[5..]));
+    assert_eq!(Some(10), SPACE.find(&s[6..]));
+    assert_eq!(Some(9), SPACE.find(&s[7..]));
+    assert_eq!(Some(8), SPACE.find(&s[8..]));
+    assert_eq!(Some(7), SPACE.find(&s[9..]));
+    assert_eq!(Some(6), SPACE.find(&s[10..]));
+    assert_eq!(Some(5), SPACE.find(&s[11..]));
+    assert_eq!(Some(4), SPACE.find(&s[12..]));
+    assert_eq!(Some(3), SPACE.find(&s[13..]));
+    assert_eq!(Some(2), SPACE.find(&s[14..]));
+    assert_eq!(Some(1), SPACE.find(&s[15..]));
+    assert_eq!(Some(0), SPACE.find(&s[16..]));
+    assert_eq!(None, SPACE.find(&s[17..]));
+}
+
+#[test]
+fn misalignment_does_not_cause_a_false_positive_before_start() {
+    const AAAA: u8 = 0x01;
+
+    let needle = Needle {
+        data: [
+            AAAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00,
+        ],
+        len: 1,
+    };
+    let haystack = Haystack {
+        data: vec![
+            AAAA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ],
+        start: 1,
+    };
+
+    let haystack = haystack.with_start();
+
+    // Needs to trigger the misalignment code
+    assert_ne!(0, (haystack.as_ptr() as usize) % 16);
+    // There are 64 bits in the mask and we check to make sure the
+    // result is less than the haystack
+    assert!(haystack.len() > 64);
+
+    let us = Bytes::new(needle.data, needle.len as i32, |b| {
+        needle.as_slice().contains(&b)
+    })
+    .find(haystack);
+    assert_eq!(None, us);
+}
+
+#[test]
+fn xml_delim_3_is_found() {
+    assert_eq!(Some(0), XML_DELIM_3.find(b"<"));
+    assert_eq!(Some(0), XML_DELIM_3.find(b">"));
+    assert_eq!(Some(0), XML_DELIM_3.find(b"&"));
+    assert_eq!(None, XML_DELIM_3.find(b""));
+}
+
+#[test]
+fn xml_delim_5_is_found() {
+    assert_eq!(Some(0), XML_DELIM_5.find(b"<"));
+    assert_eq!(Some(0), XML_DELIM_5.find(b">"));
+    assert_eq!(Some(0), XML_DELIM_5.find(b"&"));
+    assert_eq!(Some(0), XML_DELIM_5.find(b"'"));
+    assert_eq!(Some(0), XML_DELIM_5.find(b"\""));
+    assert_eq!(None, XML_DELIM_5.find(b""));
+}
+
+#[test]
+fn do_not_find_zeros_after_end() {
+    // The simd algorithm will end up with a partial vector when the haystack isn't
+    // a multiple of 16 bytes. The rest of the vector will be filled with zeros.
+    // Ensure we don't find a match in the remainder of that vector, which would be considered
+    // after the end of the haystack.
+    let needle = bytes!(b'\0', b'*');
+    let haystack = b"123456";
+    assert_eq!(None, needle.find(haystack));
+}
+
+proptest! {
+    #[test]
+    fn works_as_find_does_for_byte_substrings(
+        (needle, haystack) in (any::<Vec<u8>>(), any::<Vec<u8>>())
+    ) {
+        if !needle.is_empty() {
+            let us = {
+                let s = ByteSubstring::new(&needle);
+                s.find(&haystack)
+            };
+            let them = haystack.find_seq(&needle);
+            assert_eq!(us, them);
+        }
+    }
+}
+
+#[test]
+fn byte_substring_is_found() {
+    let substr = ByteSubstring::new(b"zz");
+    assert_eq!(Some(0), substr.find(b"zz"));
+    assert_eq!(Some(1), substr.find(b"0zz"));
+    assert_eq!(Some(2), substr.find(b"01zz"));
+    assert_eq!(Some(3), substr.find(b"012zz"));
+    assert_eq!(Some(4), substr.find(b"0123zz"));
+    assert_eq!(Some(5), substr.find(b"01234zz"));
+    assert_eq!(Some(6), substr.find(b"012345zz"));
+    assert_eq!(Some(7), substr.find(b"0123456zz"));
+    assert_eq!(Some(8), substr.find(b"01234567zz"));
+    assert_eq!(Some(9), substr.find(b"012345678zz"));
+    assert_eq!(Some(10), substr.find(b"0123456789zz"));
+    assert_eq!(Some(11), substr.find(b"0123456789Azz"));
+    assert_eq!(Some(12), substr.find(b"0123456789ABzz"));
+    assert_eq!(Some(13), substr.find(b"0123456789ABCzz"));
+    assert_eq!(Some(14), substr.find(b"0123456789ABCDzz"));
+    assert_eq!(Some(15), substr.find(b"0123456789ABCDEzz"));
+    assert_eq!(Some(16), substr.find(b"0123456789ABCDEFzz"));
+    assert_eq!(Some(17), substr.find(b"0123456789ABCDEFGzz"));
+}
+
+#[test]
+fn byte_substring_is_not_found() {
+    let substr = ByteSubstring::new(b"zz");
+    assert_eq!(None, substr.find(b""));
+    assert_eq!(None, substr.find(b"0"));
+    assert_eq!(None, substr.find(b"01"));
+    assert_eq!(None, substr.find(b"012"));
+    assert_eq!(None, substr.find(b"0123"));
+    assert_eq!(None, substr.find(b"01234"));
+    assert_eq!(None, substr.find(b"012345"));
+    assert_eq!(None, substr.find(b"0123456"));
+    assert_eq!(None, substr.find(b"01234567"));
+    assert_eq!(None, substr.find(b"012345678"));
+    assert_eq!(None, substr.find(b"0123456789"));
+    assert_eq!(None, substr.find(b"0123456789A"));
+    assert_eq!(None, substr.find(b"0123456789AB"));
+    assert_eq!(None, substr.find(b"0123456789ABC"));
+    assert_eq!(None, substr.find(b"0123456789ABCD"));
+    assert_eq!(None, substr.find(b"0123456789ABCDE"));
+    assert_eq!(None, substr.find(b"0123456789ABCDEF"));
+    assert_eq!(None, substr.find(b"0123456789ABCDEFG"));
+}
+
+#[test]
+fn byte_substring_has_false_positive() {
+    // The PCMPESTRI instruction will mark the "a" before "ab" as
+    // a match because it cannot look beyond the 16 byte window
+    // of the haystack. We need to double-check any match to
+    // ensure it completely matches.
+
+    let substr = ByteSubstring::new(b"ab");
+    assert_eq!(Some(16), substr.find(b"aaaaaaaaaaaaaaaaab"))
+    //   this "a" is a false positive ~~~~~~~~~~~~~~~^
+}
+
+#[test]
+fn byte_substring_needle_is_longer_than_16_bytes() {
+    let needle = b"0123456789abcdefg";
+    let haystack = b"0123456789abcdefgh";
+    assert_eq!(Some(0), ByteSubstring::new(needle).find(haystack));
+}
+
+fn with_guarded_string(value: &str, f: impl FnOnce(&str)) {
+    // Allocate a string that ends directly before a
+    // read-protected page.
+
+    let page_size = region::page::size();
+    assert!(value.len() <= page_size);
+
+    // Map two rw-accessible pages of anonymous memory
+    let mut mmap = MmapMut::map_anon(2 * page_size).unwrap();
+
+    let (first_page, second_page) = mmap.split_at_mut(page_size);
+
+    // Prohibit any access to the second page, so that any attempt
+    // to read or write it would trigger a segfault
+    unsafe {
+        region::protect(second_page.as_ptr(), page_size, Protection::NONE).unwrap();
+    }
+
+    // Copy bytes to the end of the first page
+    let dest = &mut first_page[page_size - value.len()..];
+    dest.copy_from_slice(value.as_bytes());
+    f(unsafe { str::from_utf8_unchecked(dest) });
+}
+
+#[test]
+fn works_at_page_boundary() {
+    // PCMPxSTRx instructions are known to read 16 bytes at a
+    // time. This behaviour may cause accidental segfaults by
+    // reading past the page boundary.
+    //
+    // For now, this test failing crashes the whole test
+    // suite. This could be fixed by setting a custom signal
+    // handler, though Rust lacks such facilities at the moment.
+
+    // Allocate a 16-byte string at page boundary.  To verify this
+    // test, set protect=false to prevent segfaults.
+    with_guarded_string("0123456789abcdef", |text| {
+        // Will search for the last char
+        let needle = bytes!(b'f');
+
+        // Check all suffixes of our 16-byte string
+        for offset in 0..text.len() {
+            let tail = &text[offset..];
+            assert_eq!(Some(tail.len() - 1), needle.find(tail.as_bytes()));
+        }
+    });
+}
+
+#[test]
+fn does_not_access_memory_after_haystack_when_haystack_is_multiple_of_16_bytes_and_no_match() {
+    // For now, this test failing crashes the whole test
+    // suite. This could be fixed by setting a custom signal
+    // handler, though Rust lacks such facilities at the moment.
+    with_guarded_string("0123456789abcdef", |text| {
+        // Will search for a char not present
+        let needle = bytes!(b'z');
+
+        assert_eq!(None, needle.find(text.as_bytes()));
+    });
+}

From cf7a8f0f24c4a2a970c5d436e273cbf49a09b614 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 18 May 2025 22:37:56 -0400
Subject: [PATCH 18/21] Add aarch64 neon simd

This moves mentions of "simd" to be x86 specific. Also, do everything
with #[cfg], rather than requiring custom cfgs populated in the build.rs
---
 CHANGELOG.md                 |   1 +
 Cargo.toml                   |   5 +-
 build.rs                     |  19 +-----
 src/lib.rs                   | 127 ++++++++++++++++++++++++-----------
 src/simd/aarch64.rs          | 107 +++++++++++++++++++++++++++++
 src/simd/mod.rs              |   4 ++
 src/{simd.rs => simd/x86.rs} |  17 +++--
 7 files changed, 216 insertions(+), 64 deletions(-)
 create mode 100644 src/simd/aarch64.rs
 create mode 100644 src/simd/mod.rs
 rename src/{simd.rs => simd/x86.rs} (95%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b572f42..52b7402 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ to [Semantic Versioning][].
 ## [Unreleased]
 
 - Trait bounds removed from generic types, bounds are only required for impls
+- Added SIMD implementation of `Bytes`/`AsciiChars` for aarch64 neon
 
 ## [0.5.3] - 2022-07-06
 
diff --git a/Cargo.toml b/Cargo.toml
index 043e2b1..c28fc0c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,7 +30,4 @@ memchr = "2.0.0"
 
 [[bench]]
 name = "benchmarks"
-harness = false
-
-[lints.rust]
-unexpected_cfgs = { level = "warn", check-cfg = ['cfg(jetscii_sse4_2, values("no", "maybe", "yes"))'] }
\ No newline at end of file
+harness = false
\ No newline at end of file
diff --git a/build.rs b/build.rs
index 20a8d92..197952b 100644
--- a/build.rs
+++ b/build.rs
@@ -4,28 +4,11 @@ use std::io::prelude::*;
 use std::path::{Path, PathBuf};
 
 fn main() {
-    cfg();
     macros();
     simd_macros();
     println!("cargo:rerun-if-changed=build.rs");
 }
 
-fn cfg() {
-    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
-    let target_feature = env::var("CARGO_CFG_TARGET_FEATURE").unwrap_or_default();
-
-    let ok_arch = matches!(&*target_arch, "x86" | "x86_64");
-    let sse4_2_guaranteed = target_feature.split(',').any(|f| f == "sse4.2");
-
-    if sse4_2_guaranteed {
-        println!(r#"cargo:rustc-cfg=jetscii_sse4_2="yes""#);
-    } else if ok_arch {
-        println!(r#"cargo:rustc-cfg=jetscii_sse4_2="maybe""#);
-    } else {
-        println!(r#"cargo:rustc-cfg=jetscii_sse4_2="no""#);
-    }
-}
-
 fn macros() {
     let mut base: PathBuf = env::var_os("OUT_DIR").unwrap().into();
     base.push("src");
@@ -134,7 +117,7 @@ fn simd_macros() {
             let array = array.join(", ");
 
             format!(
-                "({}) => ($crate::simd::Bytes::new([{}], {}));\n",
+                "({}) => ($crate::simd::x86::Bytes::new([{}], {}));\n",
                 args, array, max
             )
         })
diff --git a/src/lib.rs b/src/lib.rs
index f6b875c..4ccf908 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -141,50 +141,78 @@ use std::marker::PhantomData;
 
 include!(concat!(env!("OUT_DIR"), "/src/macros.rs"));
 
-#[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
 mod simd;
 
-#[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+// This module may not be used if e.g. we statically know we have the sse4.2 target feature
+#[allow(unused)]
 mod fallback;
 
 #[cfg(feature = "pattern")]
 mod pattern;
 
 macro_rules! dispatch {
-    (simd: $simd:expr,fallback: $fallback:expr,) => {
-        // If we can tell at compile time that we have support,
-        // call the optimized code directly.
-        #[cfg(jetscii_sse4_2 = "yes")]
+    (x86: $x86:expr, aarch64: $aarch64:expr, fallback: $fallback:expr,) => ({
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
-            $simd
+            dispatch!(
+                target_feature: "sse4.2",
+                is_feature_enabled: is_x86_feature_detected,
+                simd: $x86,
+                fallback: $fallback,
+            )
+        }
+
+        #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+        {
+            dispatch!(
+                target_feature: "neon",
+                is_feature_enabled: is_aarch64_feature_detected,
+                simd: $aarch64,
+                fallback: $fallback,
+            )
         }
 
         // If we can tell at compile time that we will *never* have
         // support, call the fallback directly.
-        #[cfg(jetscii_sse4_2 = "no")]
+        #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64", target_arch = "arm64ec")))]
         {
             $fallback
         }
-
-        // Otherwise, we will be run on a machine with or without
-        // support, so we perform runtime detection.
-        #[cfg(jetscii_sse4_2 = "maybe")]
-        {
-            if is_x86_feature_detected!("sse4.2") {
+    });
+    (target_feature: $target_feature:tt, is_feature_enabled: $is_feature_enabled:ident, simd: $simd:expr, fallback: $fallback:expr,) => ({
+            // If we can tell at compile time that we have support,
+            // call the optimized code directly.
+            #[cfg(target_feature = $target_feature)]
+            {
                 $simd
-            } else {
-                $fallback
             }
-        }
-    };
+            // Otherwise, we will be run on a machine with or without
+            // support, so we perform runtime detection.
+            #[cfg(not(target_feature = $target_feature))]
+            {
+                if std::arch::$is_feature_enabled!($target_feature) {
+                    $simd
+                } else {
+                    $fallback
+                }
+            }
+    });
 }
 
 /// Searches a slice for a set of bytes. Up to 16 bytes may be used.
 pub struct Bytes<F> {
-    #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
-    simd: simd::Bytes,
-
-    #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    x86: simd::x86::Bytes,
+
+    #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+    aarch64: simd::aarch64::Bytes,
+
+    #[cfg(not(
+      any(
+        all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
+        all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
+      )
+    ))]
     fallback: fallback::Bytes<F>,
 
     // Since we might not use the fallback implementation, we add this
@@ -206,10 +234,18 @@ where
     #[must_use]
     pub fn new(bytes: [u8; 16], len: i32, fallback: F) -> Self {
         Bytes {
-            #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
-            simd: simd::Bytes::new(bytes, len),
-
-            #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            x86: simd::x86::Bytes::new(bytes, len),
+
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+            aarch64: simd::aarch64::Bytes::new(bytes, len),
+
+            #[cfg(not(
+                any(
+                    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
+                    all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
+                )
+            ))]
             fallback: fallback::Bytes::new(fallback),
 
             _fallback: PhantomData,
@@ -221,7 +257,8 @@ where
     #[must_use]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
         dispatch! {
-            simd: unsafe { self.simd.find(haystack) },
+            x86: unsafe { self.x86.find(haystack) },
+            aarch64: unsafe { self.aarch64.find(haystack) },
             fallback: self.fallback.find(haystack),
         }
     }
@@ -269,21 +306,33 @@ pub type AsciiCharsConst = AsciiChars<fn(u8) -> bool>;
 
 /// Searches a slice for the first occurence of the subslice.
 pub struct ByteSubstring<'a> {
-    #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
-    simd: simd::ByteSubstring<'a>,
-
-    #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    x86: simd::x86::ByteSubstring<'a>,
+
+    #[cfg(not(
+        any(
+            all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
+            // No aarch64 simd implementation of substring search
+            // all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
+        )
+    ))]
     fallback: fallback::ByteSubstring<'a>,
 }
 
 impl<'a> ByteSubstring<'a> {
     #[must_use]
     pub fn new(needle: &'a [u8]) -> Self {
-        ByteSubstring {
-            #[cfg(any(jetscii_sse4_2 = "yes", jetscii_sse4_2 = "maybe"))]
-            simd: simd::ByteSubstring::new(needle),
-
-            #[cfg(any(jetscii_sse4_2 = "maybe", jetscii_sse4_2 = "no"))]
+        Self {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            x86: simd::x86::ByteSubstring::new(needle),
+
+            #[cfg(not(
+                any(
+                    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
+                    // No aarch64 simd implementation of substring search
+                    // all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
+                )
+            ))]
             fallback: fallback::ByteSubstring::new(needle),
         }
     }
@@ -292,7 +341,8 @@ impl<'a> ByteSubstring<'a> {
     #[must_use]
     fn needle_len(&self) -> usize {
         dispatch! {
-            simd: self.simd.needle_len(),
+            x86: self.x86.needle_len(),
+            aarch64: self.fallback.needle_len(),
             fallback: self.fallback.needle_len(),
         }
     }
@@ -302,7 +352,8 @@ impl<'a> ByteSubstring<'a> {
     #[must_use]
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
         dispatch! {
-            simd: unsafe { self.simd.find(haystack) },
+            x86: unsafe { self.x86.find(haystack) },
+            aarch64: self.fallback.find(haystack),
             fallback: self.fallback.find(haystack),
         }
     }
diff --git a/src/simd/aarch64.rs b/src/simd/aarch64.rs
new file mode 100644
index 0000000..ca0217b
--- /dev/null
+++ b/src/simd/aarch64.rs
@@ -0,0 +1,107 @@
+use std::arch::aarch64::*;
+use std::mem::transmute;
+
+pub struct Bytes {
+    bitset: uint8x16x2_t,
+}
+
+type Vector = uint8x16_t;
+
+/// Mapping from a number `i` in 0..=7 to a bit mask with the `i`-th bit set.
+const N_TO_N_BITS_TABLE: uint8x16_t = unsafe {
+    let mut bits = [0u8; 16];
+    let mut i = 0u8;
+    while i < 8 {
+        bits[i as usize] = 1 << i;
+        i += 1;
+    }
+    transmute(bits)
+};
+
+impl Bytes {
+    pub fn new(bytes: [u8; 16], needle_len: i32) -> Self {
+        assert!((0..=16).contains(&needle_len));
+        let needle_len = needle_len as u8;
+
+        // Make a bitset from the bytes to search for
+        let mut bitset = [0u8; 256 / 8];
+        for i in 0..needle_len {
+            let i = usize::from(i);
+            let value = usize::from(bytes[i]);
+            let byte = value / 8;
+            let bit = value % 8;
+            let mask = 1 << bit;
+            bitset[byte] |= mask;
+        }
+        let bitset = unsafe { transmute(bitset) };
+
+        Bytes { bitset }
+    }
+
+    #[target_feature(enable = "neon")]
+    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+        let mut vectors = haystack.chunks_exact(size_of::<Vector>());
+
+        let mut offset = 0;
+        for vector in vectors.by_ref() {
+            let vector = unsafe { vld1q_u8(vector.as_ptr()) };
+            let result = self.locale_in_vector(vector);
+            if let Some(element) = first_element_set(result) {
+                return Some(offset + usize::from(element));
+            }
+            offset += size_of_val(&vector);
+        }
+        let remaining = vectors.remainder();
+        let mut fake_vector = [0; size_of::<Vector>()];
+        fake_vector[..remaining.len()].copy_from_slice(remaining);
+        let vector = unsafe { vld1q_u8(fake_vector.as_ptr()) };
+        let result = self.locale_in_vector(vector);
+        if let Some(element) = first_element_set(result) {
+            if element < remaining.len() as u8 {
+                return Some(offset + usize::from(element));
+            }
+        }
+
+        None
+    }
+
+    /// Given a vector of 16 bytes of input, return a vector of true/false values.
+    ///
+    /// Each element in the output will be 0/255 based on if the input byte at that position
+    /// is equal to one of the values being searched for.
+    #[inline]
+    #[target_feature(enable = "neon")]
+    fn locale_in_vector(&self, v: Vector) -> Vector {
+        let high_bits = vshrq_n_u8::<3>(v);
+        let low_bit_masks = vqtbl2q_u8(self.bitset, high_bits);
+        let low_bits = vandq_u8(v, vdupq_n_u8(0b0111));
+        let low_bits = vqtbl1q_u8(N_TO_N_BITS_TABLE, low_bits);
+        vtstq_u8(low_bits, low_bit_masks)
+    }
+}
+
+/// Convert a vector into a u64
+///
+/// Returns a value where the first 4 bits are the low 4 bits of the first element,
+/// the next 4 bits are the high 4 bits of the second element, and so on.
+///
+/// For a "bool" vector (where every element is either 255 or 0), the resulting u64
+/// will have groups of 4 bits from each element. e.g. the number of trailing zero bits
+/// is 4 times the number of trailing zero elements.
+#[target_feature(enable = "neon")]
+fn vector_to_bitset(vector: Vector) -> u64 {
+    let result_vector = vshrn_n_u16::<4>(vreinterpretq_u16_u8(vector));
+    vget_lane_u64::<0>(vreinterpret_u64_u8(result_vector))
+}
+
+/// Find the first element in a "bool" vector that is set or None if all elements are zero
+#[target_feature(enable = "neon")]
+fn first_element_set(vector: Vector) -> Option<u8> {
+    let bitset = vector_to_bitset(vector);
+    let first_bit = bitset.trailing_zeros() / 4;
+    if first_bit < 16 {
+        Some(first_bit as u8)
+    } else {
+        None
+    }
+}
diff --git a/src/simd/mod.rs b/src/simd/mod.rs
new file mode 100644
index 0000000..6359edd
--- /dev/null
+++ b/src/simd/mod.rs
@@ -0,0 +1,4 @@
+#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+pub mod aarch64;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub mod x86;
diff --git a/src/simd.rs b/src/simd/x86.rs
similarity index 95%
rename from src/simd.rs
rename to src/simd/x86.rs
index 694209e..d9a6258 100644
--- a/src/simd.rs
+++ b/src/simd/x86.rs
@@ -30,7 +30,10 @@ trait PackedCompareControl {
 
 #[inline]
 #[target_feature(enable = "sse4.2")]
-unsafe fn find_small<C, const CONTROL_BYTE: i32>(packed: PackedCompare<C, CONTROL_BYTE>, haystack: &[u8]) -> Option<usize>
+unsafe fn find_small<C, const CONTROL_BYTE: i32>(
+    packed: PackedCompare<C, CONTROL_BYTE>,
+    haystack: &[u8],
+) -> Option<usize>
 where
     C: PackedCompareControl,
 {
@@ -52,7 +55,10 @@ where
 /// beginning of the byte slice.
 #[inline]
 #[target_feature(enable = "sse4.2")]
-unsafe fn find<C, const CONTROL_BYTE: i32>(packed: PackedCompare<C, CONTROL_BYTE>, mut haystack: &[u8]) -> Option<usize>
+unsafe fn find<C, const CONTROL_BYTE: i32>(
+    packed: PackedCompare<C, CONTROL_BYTE>,
+    mut haystack: &[u8],
+) -> Option<usize>
 where
     C: PackedCompareControl,
 {
@@ -270,7 +276,7 @@ pub struct ByteSubstring<'a> {
 }
 
 impl<'a> ByteSubstring<'a> {
-    pub fn new(needle: &'a[u8]) -> Self {
+    pub fn new(needle: &'a [u8]) -> Self {
         let mut simd_needle = [0; 16];
         let len = if simd_needle.len() < needle.len() {
             simd_needle.len()
@@ -299,7 +305,10 @@ impl<'a> ByteSubstring<'a> {
     pub unsafe fn find(&self, haystack: &[u8]) -> Option<usize> {
         let mut offset = 0;
 
-        while let Some(idx) = find(PackedCompare::<_, _SIDD_CMP_EQUAL_ORDERED>(self), &haystack[offset..]) {
+        while let Some(idx) = find(
+            PackedCompare::<_, _SIDD_CMP_EQUAL_ORDERED>(self),
+            &haystack[offset..],
+        ) {
             let abs_offset = offset + idx;
             // Found a match, but is it really?
             if haystack[abs_offset..].starts_with(self.complete_needle) {

From 3b7081fd091adb533239d172b376e31d8d936da5 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Mon, 19 May 2025 14:20:44 -0400
Subject: [PATCH 19/21] add benchmarks iterating over all positions of the
 items

This includes pretty frequent instances
---
 benches/benchmarks.rs     | 117 ++++++++++++++-
 benches/plant_catalog.xml | 291 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 403 insertions(+), 5 deletions(-)
 create mode 100644 benches/plant_catalog.xml

diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 2116ed8..dfe3ee2 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -1,4 +1,4 @@
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput};
 use jetscii::{ascii_chars, AsciiCharsConst, SubstringConst};
 use std::hint::black_box;
 use std::sync::OnceLock;
@@ -24,7 +24,9 @@ fn xml_delim_5() -> &'static AsciiCharsConst {
 static BIG_16: OnceLock<AsciiCharsConst> = OnceLock::new();
 
 fn big_16() -> &'static AsciiCharsConst {
-    BIG_16.get_or_init(|| ascii_chars!('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P'))
+    BIG_16.get_or_init(|| {
+        ascii_chars!('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P')
+    })
 }
 
 static SUBSTRING: OnceLock<SubstringConst> = OnceLock::new();
@@ -146,7 +148,10 @@ fn big_16_benches(c: &mut Criterion) {
         });
     });
     group.bench_function("teddy", |b| {
-        let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
+        let searcher = aho_corasick::packed::Searcher::new(
+            b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b)),
+        )
+        .unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
     });
 
@@ -185,7 +190,10 @@ fn big_16_benches(c: &mut Criterion) {
         });
     });
     group.bench_function("teddy", |b| {
-        let searcher = aho_corasick::packed::Searcher::new(b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b))).unwrap();
+        let searcher = aho_corasick::packed::Searcher::new(
+            b"ABCDEFGHIJKLMNOP".iter().map(|b| std::array::from_ref(b)),
+        )
+        .unwrap();
         b.iter(|| searcher.find(&haystack).map(|m| m.start()));
     });
 }
@@ -211,5 +219,104 @@ fn substr(c: &mut Criterion) {
     });
 }
 
-criterion_group!(benches, spaces, xml3, xml5, big_16_benches, substr);
+fn iterate_xml_many_match(c: &mut Criterion) {
+    let haystack = black_box(include_str!("plant_catalog.xml"));
+    let mut group = c.benchmark_group("iterate_xml_3");
+
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+    group.bench_function("ascii_chars", |b| {
+        let xml_delim_3 = xml_delim_3();
+        b.iter(|| {
+            let mut haystack = &haystack[..];
+            let mut offset = 0;
+            while let Some(pos) = xml_delim_3.find(haystack) {
+                haystack = &haystack[pos + 1..];
+                offset += pos;
+                black_box(offset);
+            }
+        });
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            let mut haystack = &haystack[..];
+            let mut offset = 0;
+            while let Some(pos) = haystack
+                .bytes()
+                .position(|c| c == b'<' || c == b'>' || c == b'&')
+            {
+                haystack = &haystack[pos + 1..];
+                offset += pos;
+                black_box(offset);
+            }
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter_batched(
+            || memchr::memchr3_iter(b'<', b'>', b'&', haystack.as_bytes()),
+            |iter| {
+                for offset in iter {
+                    black_box(offset);
+                }
+            },
+            BatchSize::SmallInput,
+        );
+    });
+    group.finish();
+}
+
+fn iterate_few_match(c: &mut Criterion) {
+    let haystack = black_box(include_str!("plant_catalog.xml"));
+    let mut group = c.benchmark_group("iterate_few_matches");
+    let chars: AsciiCharsConst = ascii_chars!(b'?', b'-', b'\0');
+
+    group.throughput(Throughput::Bytes(haystack.len() as u64));
+    group.bench_function("ascii_chars", |b| {
+        b.iter(|| {
+            let mut haystack = &haystack[..];
+            let mut offset = 0;
+            while let Some(pos) = chars.find(haystack) {
+                haystack = &haystack[pos + 1..];
+                offset += pos;
+                black_box(offset);
+            }
+        });
+    });
+    group.bench_function("stdlib_iter_position", |b| {
+        b.iter(|| {
+            let mut haystack = &haystack[..];
+            let mut offset = 0;
+            while let Some(pos) = haystack
+                .bytes()
+                .position(|c| c == b'?' || c == b'-' || c == b'\0')
+            {
+                haystack = &haystack[pos + 1..];
+                offset += pos;
+                black_box(offset);
+            }
+        });
+    });
+    group.bench_function("memchr", |b| {
+        b.iter_batched(
+            || memchr::memchr3_iter(b'?', b'-', b'\0', haystack.as_bytes()),
+            |iter| {
+                for offset in iter {
+                    black_box(offset);
+                }
+            },
+            BatchSize::SmallInput,
+        );
+    });
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    spaces,
+    xml3,
+    xml5,
+    big_16_benches,
+    substr,
+    iterate_xml_many_match,
+    iterate_few_match,
+);
 criterion_main!(benches);
diff --git a/benches/plant_catalog.xml b/benches/plant_catalog.xml
new file mode 100644
index 0000000..4275db9
--- /dev/null
+++ b/benches/plant_catalog.xml
@@ -0,0 +1,291 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CATALOG>
+  <PLANT>
+    <COMMON>Bloodroot</COMMON>
+    <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$2.44</PRICE>
+    <AVAILABILITY>031599</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Columbine</COMMON>
+    <BOTANICAL>Aquilegia canadensis</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$9.37</PRICE>
+    <AVAILABILITY>030699</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Marsh Marigold</COMMON>
+    <BOTANICAL>Caltha palustris</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Sunny</LIGHT>
+    <PRICE>$6.81</PRICE>
+    <AVAILABILITY>051799</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Cowslip</COMMON>
+    <BOTANICAL>Caltha palustris</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$9.90</PRICE>
+    <AVAILABILITY>030699</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Dutchman's-Breeches</COMMON>
+    <BOTANICAL>Dicentra cucullaria</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$6.44</PRICE>
+    <AVAILABILITY>012099</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Ginger, Wild</COMMON>
+    <BOTANICAL>Asarum canadense</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$9.03</PRICE>
+    <AVAILABILITY>041899</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Hepatica</COMMON>
+    <BOTANICAL>Hepatica americana</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$4.45</PRICE>
+    <AVAILABILITY>012699</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Liverleaf</COMMON>
+    <BOTANICAL>Hepatica americana</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$3.99</PRICE>
+    <AVAILABILITY>010299</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Jack-In-The-Pulpit</COMMON>
+    <BOTANICAL>Arisaema triphyllum</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$3.23</PRICE>
+    <AVAILABILITY>020199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Mayapple</COMMON>
+    <BOTANICAL>Podophyllum peltatum</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$2.98</PRICE>
+    <AVAILABILITY>060599</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Phlox, Woodland</COMMON>
+    <BOTANICAL>Phlox divaricata</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$2.80</PRICE>
+    <AVAILABILITY>012299</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Phlox, Blue</COMMON>
+    <BOTANICAL>Phlox divaricata</BOTANICAL>
+    <ZONE>3</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$5.59</PRICE>
+    <AVAILABILITY>021699</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Spring-Beauty</COMMON>
+    <BOTANICAL>Claytonia Virginica</BOTANICAL>
+    <ZONE>7</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$6.59</PRICE>
+    <AVAILABILITY>020199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Trillium</COMMON>
+    <BOTANICAL>Trillium grandiflorum</BOTANICAL>
+    <ZONE>5</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$3.90</PRICE>
+    <AVAILABILITY>042999</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Wake Robin</COMMON>
+    <BOTANICAL>Trillium grandiflorum</BOTANICAL>
+    <ZONE>5</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$3.20</PRICE>
+    <AVAILABILITY>022199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Violet, Dog-Tooth</COMMON>
+    <BOTANICAL>Erythronium americanum</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$9.04</PRICE>
+    <AVAILABILITY>020199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Trout Lily</COMMON>
+    <BOTANICAL>Erythronium americanum</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$6.94</PRICE>
+    <AVAILABILITY>032499</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Adder's-Tongue</COMMON>
+    <BOTANICAL>Erythronium americanum</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$9.58</PRICE>
+    <AVAILABILITY>041399</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Anemone</COMMON>
+    <BOTANICAL>Anemone blanda</BOTANICAL>
+    <ZONE>6</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$8.86</PRICE>
+    <AVAILABILITY>122698</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Grecian Windflower</COMMON>
+    <BOTANICAL>Anemone blanda</BOTANICAL>
+    <ZONE>6</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$9.16</PRICE>
+    <AVAILABILITY>071099</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Bee Balm</COMMON>
+    <BOTANICAL>Monarda didyma</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$4.59</PRICE>
+    <AVAILABILITY>050399</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Bergamot</COMMON>
+    <BOTANICAL>Monarda didyma</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$7.16</PRICE>
+    <AVAILABILITY>042799</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Black-Eyed Susan</COMMON>
+    <BOTANICAL>Rudbeckia hirta</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Sunny</LIGHT>
+    <PRICE>$9.80</PRICE>
+    <AVAILABILITY>061899</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Buttercup</COMMON>
+    <BOTANICAL>Ranunculus</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$2.57</PRICE>
+    <AVAILABILITY>061099</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Crowfoot</COMMON>
+    <BOTANICAL>Ranunculus</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$9.34</PRICE>
+    <AVAILABILITY>040399</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Butterfly Weed</COMMON>
+    <BOTANICAL>Asclepias tuberosa</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Sunny</LIGHT>
+    <PRICE>$2.78</PRICE>
+    <AVAILABILITY>063099</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Cinquefoil</COMMON>
+    <BOTANICAL>Potentilla</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$7.06</PRICE>
+    <AVAILABILITY>052599</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Primrose</COMMON>
+    <BOTANICAL>Oenothera</BOTANICAL>
+    <ZONE>3 - 5</ZONE>
+    <LIGHT>Sunny</LIGHT>
+    <PRICE>$6.56</PRICE>
+    <AVAILABILITY>013099</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Gentian</COMMON>
+    <BOTANICAL>Gentiana</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$7.81</PRICE>
+    <AVAILABILITY>051899</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Blue Gentian</COMMON>
+    <BOTANICAL>Gentiana</BOTANICAL>
+    <ZONE>4</ZONE>
+    <LIGHT>Sun or Shade</LIGHT>
+    <PRICE>$8.56</PRICE>
+    <AVAILABILITY>050299</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Jacob's Ladder</COMMON>
+    <BOTANICAL>Polemonium caeruleum</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$9.26</PRICE>
+    <AVAILABILITY>022199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Greek Valerian</COMMON>
+    <BOTANICAL>Polemonium caeruleum</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$4.36</PRICE>
+    <AVAILABILITY>071499</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>California Poppy</COMMON>
+    <BOTANICAL>Eschscholzia californica</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Sun</LIGHT>
+    <PRICE>$7.89</PRICE>
+    <AVAILABILITY>032799</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Shooting Star</COMMON>
+    <BOTANICAL>Dodecatheon</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Mostly Shady</LIGHT>
+    <PRICE>$8.60</PRICE>
+    <AVAILABILITY>051399</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Snakeroot</COMMON>
+    <BOTANICAL>Cimicifuga</BOTANICAL>
+    <ZONE>Annual</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$5.63</PRICE>
+    <AVAILABILITY>071199</AVAILABILITY>
+  </PLANT>
+  <PLANT>
+    <COMMON>Cardinal Flower</COMMON>
+    <BOTANICAL>Lobelia cardinalis</BOTANICAL>
+    <ZONE>2</ZONE>
+    <LIGHT>Shade</LIGHT>
+    <PRICE>$3.02</PRICE>
+    <AVAILABILITY>022299</AVAILABILITY>
+  </PLANT>
+</CATALOG>

From 45404ad4ed7d449538e9154958f5f119706f7f57 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Sun, 18 May 2025 22:37:56 -0400
Subject: [PATCH 20/21] Add an `iter` function

For aarch64, we can do quite a bit better than just calling the `find`
function repeatedly: we build a bitset of 64 bits where we've already
found if they match the set of bits we're looking for. We can then
efficently iterate over those set bits.

It may be possible to do something similar in the x86 simd
implementation.
---
 CHANGELOG.md          |   1 +
 benches/benchmarks.rs |  18 ++---
 src/fallback.rs       |  37 +++++++++
 src/lib.rs            | 183 +++++++++++++++++++++++++++++++++++++++---
 src/simd/aarch64.rs   | 144 +++++++++++++++++++++++++++++++++
 src/simd/x86.rs       |  35 ++++++++
 6 files changed, 397 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52b7402..0d54ebe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ to [Semantic Versioning][].
 
 - Trait bounds removed from generic types, bounds are only required for impls
 - Added SIMD implementation of `Bytes`/`AsciiChars` for aarch64 neon
+- Added the the ability to efficiently iterate over the indexes of matching elements (`Bytes::iter`/`AsciiChars::iter`)
 
 ## [0.5.3] - 2022-07-06
 
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index dfe3ee2..5ced6fc 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -226,15 +226,15 @@ fn iterate_xml_many_match(c: &mut Criterion) {
     group.throughput(Throughput::Bytes(haystack.len() as u64));
     group.bench_function("ascii_chars", |b| {
         let xml_delim_3 = xml_delim_3();
-        b.iter(|| {
-            let mut haystack = &haystack[..];
-            let mut offset = 0;
-            while let Some(pos) = xml_delim_3.find(haystack) {
-                haystack = &haystack[pos + 1..];
-                offset += pos;
-                black_box(offset);
-            }
-        });
+        b.iter_batched(
+            || xml_delim_3.as_bytes().iter(haystack.as_bytes()),
+            |iter| {
+                for offset in iter {
+                    black_box(offset);
+                }
+            },
+            BatchSize::SmallInput,
+        );
     });
     group.bench_function("stdlib_iter_position", |b| {
         b.iter(|| {
diff --git a/src/fallback.rs b/src/fallback.rs
index 48d4b62..9201bbf 100644
--- a/src/fallback.rs
+++ b/src/fallback.rs
@@ -16,6 +16,14 @@ where
     pub fn find(&self, haystack: &[u8]) -> Option<usize> {
         haystack.iter().copied().position(&self.fallback)
     }
+
+    pub fn iter<'a>(&'a self, haystack: &'a [u8]) -> BytesIter<'a, F> {
+        BytesIter {
+            bytes: self,
+            haystack,
+            offset: 0,
+        }
+    }
 }
 
 pub struct ByteSubstring<'a> {
@@ -38,3 +46,32 @@ impl<'a> ByteSubstring<'a> {
             .position(|window| window == self.needle)
     }
 }
+
+pub struct BytesIter<'a, F> {
+    bytes: &'a Bytes<F>,
+    haystack: &'a [u8],
+    offset: usize,
+}
+impl<'a, F> Iterator for BytesIter<'a, F>
+where 
+    F: Fn(u8) -> bool,
+{
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let idx = self.bytes.find(self.haystack);
+        if let Some(idx) = idx {
+            self.haystack = &self.haystack[idx + 1..];
+            let result = self.offset + idx;
+            self.offset = result + 1;
+            Some(result)
+        } else {
+            self.haystack = &[];
+            None
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.haystack.len()))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 4ccf908..535dab4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -207,12 +207,16 @@ pub struct Bytes<F> {
     #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
     aarch64: simd::aarch64::Bytes,
 
-    #[cfg(not(
-      any(
-        all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
-        all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
-      )
-    ))]
+    #[cfg(not(any(
+        all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse4.2"
+        ),
+        all(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            target_feature = "neon"
+        ),
+    )))]
     fallback: fallback::Bytes<F>,
 
     // Since we might not use the fallback implementation, we add this
@@ -240,12 +244,16 @@ where
             #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
             aarch64: simd::aarch64::Bytes::new(bytes, len),
 
-            #[cfg(not(
-                any(
-                    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse4.2"),
-                    all(any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon"),
-                )
-            ))]
+            #[cfg(not(any(
+                all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "sse4.2"
+                ),
+                all(
+                    any(target_arch = "aarch64", target_arch = "arm64ec"),
+                    target_feature = "neon"
+                ),
+            )))]
             fallback: fallback::Bytes::new(fallback),
 
             _fallback: PhantomData,
@@ -262,11 +270,131 @@ where
             fallback: self.fallback.find(haystack),
         }
     }
+
+    pub fn iter<'a>(&'a self, haystack: &'a [u8]) -> BytesIter<'a, F> {
+        dispatch! {
+            x86: BytesIter::X86(self.x86.iter(haystack)),
+            aarch64: BytesIter::Aarch64(self.aarch64.iter(haystack)),
+            fallback: BytesIter::Fallback(self.fallback.iter(haystack)),
+        }
+    }
 }
 
 /// A convenience type that can be used in a constant or static.
 pub type BytesConst = Bytes<fn(u8) -> bool>;
 
+pub enum BytesIter<'a, F> {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    X86(simd::x86::BytesIter<'a>),
+
+    #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+    Aarch64(simd::aarch64::BytesIter<'a>),
+
+    #[cfg(not(any(
+        all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse4.2"
+        ),
+        all(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            target_feature = "neon"
+        ),
+    )))]
+    Fallback(fallback::BytesIter<'a, F>),
+    // Since we might not use the fallback implementation, we add this
+    // to avoid unused type parameters.
+    // This type is impossible to construct, but we still use the F parameter
+    _Impossible(std::convert::Infallible, PhantomData<F>),
+}
+
+impl<'a, F> Iterator for BytesIter<'a, F>
+where
+    F: Fn(u8) -> bool,
+{
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Self::X86(iter) => iter.next(),
+
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+            Self::Aarch64(iter) => iter.next(),
+
+            #[cfg(not(any(
+                all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "sse4.2"
+                ),
+                all(
+                    any(target_arch = "aarch64", target_arch = "arm64ec"),
+                    target_feature = "neon"
+                ),
+            )))]
+            Self::Fallback(iter) => iter.next(),
+            // Since we might not use the fallback implementation, we add this
+            // to avoid unused type parameters.
+            // This type is impossible to construct, but we still use the F parameter
+            &mut Self::_Impossible(infailable, _) => match infailable {},
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Self::X86(iter) => iter.size_hint(),
+
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+            Self::Aarch64(iter) => iter.size_hint(),
+
+            #[cfg(not(any(
+                all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "sse4.2"
+                ),
+                all(
+                    any(target_arch = "aarch64", target_arch = "arm64ec"),
+                    target_feature = "neon"
+                ),
+            )))]
+            Self::Fallback(iter) => iter.size_hint(),
+            // Since we might not use the fallback implementation, we add this
+            // to avoid unused type parameters.
+            // This type is impossible to construct, but we still use the F parameter
+            &Self::_Impossible(infailable, _) => match infailable {},
+        }
+    }
+
+    fn count(self) -> usize
+    where
+        Self: Sized,
+    {
+        match self {
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            Self::X86(iter) => iter.count(),
+
+            #[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+            Self::Aarch64(iter) => iter.count(),
+
+            #[cfg(not(any(
+                all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "sse4.2"
+                ),
+                all(
+                    any(target_arch = "aarch64", target_arch = "arm64ec"),
+                    target_feature = "neon"
+                ),
+            )))]
+            Self::Fallback(iter) => iter.count(),
+            // Since we might not use the fallback implementation, we add this
+            // to avoid unused type parameters.
+            // This type is impossible to construct, but we still use the F parameter
+            Self::_Impossible(infailable, _) => match infailable {},
+        }
+    }
+}
+
 /// Searches a string for a set of ASCII characters. Up to 16
 /// characters may be used.
 pub struct AsciiChars<F>(Bytes<F>);
@@ -299,11 +427,42 @@ where
     pub fn find(&self, haystack: &str) -> Option<usize> {
         self.0.find(haystack.as_bytes())
     }
+
+    /// Return an iterator over the indices of the specified characters in the haystack.
+    pub fn iter<'a>(&'a self, haystack: &'a str) -> AsciiCharsIter<'a, F> {
+        AsciiCharsIter(self.0.iter(haystack.as_bytes()))
+    }
+
+    /// Get a [`Bytes`] reference with the same ascii characters
+    pub fn as_bytes(&self) -> &Bytes<F> {
+        &self.0
+    }
 }
 
 /// A convenience type that can be used in a constant or static.
 pub type AsciiCharsConst = AsciiChars<fn(u8) -> bool>;
 
+pub struct AsciiCharsIter<'a, F>(BytesIter<'a, F>);
+
+impl<F> Iterator for AsciiCharsIter<'_, F>
+where
+    F: Fn(u8) -> bool,
+{
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.0.next()
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.0.size_hint()
+    }
+
+    fn count(self) -> usize {
+        self.0.count()
+    }
+}
+
 /// Searches a slice for the first occurence of the subslice.
 pub struct ByteSubstring<'a> {
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
diff --git a/src/simd/aarch64.rs b/src/simd/aarch64.rs
index ca0217b..46a8420 100644
--- a/src/simd/aarch64.rs
+++ b/src/simd/aarch64.rs
@@ -1,11 +1,13 @@
 use std::arch::aarch64::*;
 use std::mem::transmute;
 
+#[derive(Copy, Clone)]
 pub struct Bytes {
     bitset: uint8x16x2_t,
 }
 
 type Vector = uint8x16_t;
+type Chunk = uint8x16x4_t;
 
 /// Mapping from a number `i` in 0..=7 to a bit mask with the `i`-th bit set.
 const N_TO_N_BITS_TABLE: uint8x16_t = unsafe {
@@ -65,6 +67,10 @@ impl Bytes {
         None
     }
 
+    pub fn iter<'a>(self, haystack: &'a [u8]) -> BytesIter<'a> {
+        BytesIter::new(self, haystack)
+    }
+
     /// Given a vector of 16 bytes of input, return a vector of true/false values.
     ///
     /// Each element in the output will be 0/255 based on if the input byte at that position
@@ -78,6 +84,144 @@ impl Bytes {
         let low_bits = vqtbl1q_u8(N_TO_N_BITS_TABLE, low_bits);
         vtstq_u8(low_bits, low_bit_masks)
     }
+
+    /// Returns a u64 where each set bit indicates a match in the chunk.
+    ///
+    /// Takes a "deinterleaved" chunk of 4 vectors, each with 16 bytes.
+    /// The first element of the input is the first element of the first vector,
+    /// the second element of the input is the first element of the second vector,
+    /// the fifth element of the input is the second element of the first vector, etc.
+    #[inline]
+    #[target_feature(enable = "neon")]
+    fn locate_in_chunk(&self, chunk: Chunk) -> u64 {
+        let chunk_values = [chunk.0, chunk.1, chunk.2, chunk.3];
+        // Get 4 "bool" vectors indicating if each element in the chunk is a match
+        let matching_elements = chunk_values.map(|v| self.locale_in_vector(v));
+
+        // Pack bits from the 4 vectors into a single vector
+
+        // shift the second vector right by one, insert the top bit from the first vector
+        // The top two bits each element of temp0 are from the first and second vector
+        let temp0 = vsriq_n_u8::<1>(matching_elements[1], matching_elements[0]);
+
+        // shift the fourth vector right by one, insert the top bit from the third vector
+        // The top two bits each element of temp1 are from the third and fourth vector
+        let temp1 = vsriq_n_u8::<1>(matching_elements[3], matching_elements[2]);
+
+        // shift temp1 (the top two bits of which are from the third and fourth vector) right by 2,
+        // insert the top two bits from temp0 (the top two bits of which are from the first and
+        // second vector)
+        // The top four bits of each element of temp2 are from the first, second, third, and fourth
+        // vector
+        let temp2 = vsriq_n_u8::<2>(temp1, temp0);
+
+        // duplicate the top 4 bits into the bottom 4 bits of each element
+        let temp3 = vsriq_n_u8::<4>(temp2, temp2);
+
+        // The top/bottom 4 bits of each element are the same, so converting to a 64 bit bitset
+        // takes those 4 bits from each element and places them next to each other
+        vector_to_bitset(temp3)
+    }
+}
+
+pub struct BytesIter<'a> {
+    /// The bytes to search for
+    bytes: Bytes,
+    /// The remaining haystack (after the current bitset chunk)
+    haystack: &'a [u8],
+    /// The current offset (from the start of the original haystack)
+    offset: usize,
+    /// A reversed bitset of the the current chunk
+    ///
+    /// e.g. the most significant bit is set if the next byte matches one of the searched bytes
+    current_bitset: u64,
+}
+
+impl<'a> BytesIter<'a> {
+    fn new(bytes: Bytes, haystack: &'a [u8]) -> Self {
+        Self {
+            bytes,
+            haystack,
+            offset: 0,
+            current_bitset: 0,
+        }
+    }
+
+    #[target_feature(enable = "neon")]
+    fn fill_bitset(&mut self) {
+        while let Some((chunk, rest)) = self.haystack.split_at_checked(size_of::<Chunk>()) {
+            self.haystack = rest;
+            let chunk = unsafe { vld4q_u8(chunk.as_ptr()) };
+            let bitset = self.bytes.locate_in_chunk(chunk);
+            if bitset != 0 {
+                // aarch64 doesn't have a count trailing zeros instruction, so
+                // reverse the bits so we use leading_zeros instead
+                self.current_bitset = bitset.reverse_bits();
+                return;
+            }
+            self.offset += size_of_val(&chunk);
+        }
+        let mut fake_chunk = [0; size_of::<Chunk>()];
+        fake_chunk[..self.haystack.len()].copy_from_slice(self.haystack);
+        let chunk = unsafe { vld4q_u8(fake_chunk.as_ptr()) };
+        self.current_bitset = self.bytes.locate_in_chunk(chunk);
+        let mask = !(u64::MAX << self.haystack.len() as u64);
+        self.current_bitset &= mask;
+        // aarch64 doesn't have a count trailing zeros instruction, so
+        // reverse the bits so we use leading_zeros instead
+        self.current_bitset = self.current_bitset.reverse_bits();
+        self.haystack = &[];
+    }
+}
+
+impl<'a> Iterator for BytesIter<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut first_bit = self.current_bitset.leading_zeros();
+        if first_bit == 64 {
+            unsafe {
+                self.fill_bitset();
+            }
+            first_bit = self.current_bitset.leading_zeros();
+            if first_bit == 64 {
+                return None;
+            }
+        }
+        // toggle the highest bit
+        self.current_bitset ^= 1 << (63 - first_bit);
+        let result = self.offset + first_bit as usize;
+        if self.current_bitset == 0 {
+            self.offset += 64;
+        }
+        Some(result)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let min = self.current_bitset.count_ones() as usize;
+        let max = min.checked_add(self.haystack.len());
+        (min, max)
+    }
+
+    // We can be a little faster by avoiding iterating through the bits by counting bits directly
+    fn count(self) -> usize {
+        let mut count = self.current_bitset.count_ones() as usize;
+
+        let mut chunks = self.haystack.chunks_exact(size_of::<Chunk>());
+        for chunk in chunks.by_ref() {
+            let chunk = unsafe { vld4q_u8(chunk.as_ptr()) };
+            let result = unsafe { self.bytes.locate_in_chunk(chunk) };
+            count += result.count_ones() as usize;
+        }
+        let remaining = chunks.remainder();
+        let mut fake_chunk = [0; size_of::<Chunk>()];
+        fake_chunk[..remaining.len()].copy_from_slice(remaining);
+        let chunk = unsafe { vld4q_u8(fake_chunk.as_ptr()) };
+        let result = unsafe { self.bytes.locate_in_chunk(chunk) };
+        let mask = !(u64::MAX << self.haystack.len() as u64);
+        count += (result & mask).count_ones() as usize;
+        count
+    }
 }
 
 /// Convert a vector into a u64
diff --git a/src/simd/x86.rs b/src/simd/x86.rs
index d9a6258..2ff4fc9 100644
--- a/src/simd/x86.rs
+++ b/src/simd/x86.rs
@@ -237,6 +237,7 @@ impl<'a> Misalignment<'a> {
     }
 }
 
+#[derive(Copy, Clone)]
 pub struct Bytes {
     needle: __m128i,
     needle_len: i32,
@@ -255,6 +256,40 @@ impl Bytes {
     pub unsafe fn find(&self, haystack: &[u8]) -> Option<usize> {
         find(PackedCompare::<_, 0>(self), haystack)
     }
+
+    pub fn iter(self, haystack: &[u8]) -> BytesIter<'_> {
+        BytesIter {
+            bytes: self,
+            haystack,
+            offset: 0,
+        }
+    }
+}
+
+pub struct BytesIter<'a> {
+    bytes: Bytes,
+    haystack: &'a [u8],
+    offset: usize,
+}
+
+impl Iterator for BytesIter<'_> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let next_offset = unsafe { self.bytes.find(self.haystack) };
+        match next_offset {
+            Some(i) => {
+                self.haystack = &self.haystack[i + 1..];
+                let res = self.offset + i;
+                self.offset = res + 1;
+                Some(res)
+            }
+            None => {
+                self.haystack = &[];
+                None
+            }
+        }
+    }
 }
 
 impl<'b> PackedCompareControl for &'b Bytes {

From be472e1a7b630115fa473406b4da9b0630dfec52 Mon Sep 17 00:00:00 2001
From: Zachary Dremann <dremann@gmail.com>
Date: Wed, 21 May 2025 02:52:35 -0400
Subject: [PATCH 21/21] add overall algorithm comments and links

---
 src/simd/aarch64.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/simd/aarch64.rs b/src/simd/aarch64.rs
index 46a8420..7bdce74 100644
--- a/src/simd/aarch64.rs
+++ b/src/simd/aarch64.rs
@@ -1,3 +1,13 @@
+//! AArch64 NEON SIMD implementation
+//!
+//! Based on [this algorithm][1] simplified somewhat by aarch64 neon (by the ability to make a
+//! table of 32 bytes by combining two vectors), combined with 64 byte movemask via interleaving
+//! from [here][2], and the iteration idea (using a u64 bitset of known matches) from [here][3].
+//!
+//! [1]: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
+//! [2]: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+//! [3]: https://lemire.me/blog/2024/07/20/scan-html-even-faster-with-simd-instructions-c-and-c/
+
 use std::arch::aarch64::*;
 use std::mem::transmute;