From 5f7fb6e240343a15fa7d91a03f0e5565757b45fb Mon Sep 17 00:00:00 2001
From: Dan Mattheiss <dmattheiss@gmail.com>
Date: Thu, 4 Jun 2026 18:12:04 -0400
Subject: [PATCH 1/3] bpe-openai: use Regex::find instead of captures for
 pretokenization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Splits iterator only needs each match's extent and pattern id; it never reads a
capture group (the patterns contain none beyond the implicit whole match). Before this
change, next() still cleared a reusable Captures and called captures() for every piece,
which forces a capture-reporting search. find() returns the same overall match and can
use a faster DFA-based search that only reports the match start and end.

Output is identical — find and captures agree on the overall match — verified by the
tiktoken-equivalence tests (cl100k + o200k).

Adds a pretokenization benchmark that isolates Tokenizer::split, registered as one
pretokenization-{name} group per entry in TOKENIZERS (e.g. pretokenization-cl100k). The
existing comparison benchmark only measures full encode, where pretokenization is a
minority of the time on the random-token corpus. Measured with pretokenization-cl100k
(cargo bench, M1), split throughput improves at every input size, most on short inputs
where the per-piece engine overhead dominates:

    split/10     +57%
    split/100    +16%
    split/1000   +11%
    split/10000   +9%

Full Tokenizer::encode (comparison-cl100k) is unchanged within noise, since
pretokenization is a minority of encode time on that corpus.
---
 crates/bpe-openai/src/lib.rs         |  7 +------
 crates/bpe/benchmarks/performance.rs | 22 +++++++++++++++++++++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
index 51f514f8..7a0aad48 100644
--- a/crates/bpe-openai/src/lib.rs
+++ b/crates/bpe-openai/src/lib.rs
@@ -4,7 +4,6 @@ use bpe::byte_pair_encoding::BytePairEncoding;
 use either::Either;
 use regex_automata::{
     meta::{BuildError, Regex},
-    util::captures::Captures,
     Anchored, Input,
 };
 
@@ -177,7 +176,6 @@ impl Pretokenizer {
             lookahead: &self.lookahead,
             text,
             last: 0,
-            caps: Captures::matches(self.pat.group_info().clone()),
         }
     }
 }
@@ -195,7 +193,6 @@ struct Splits<'a> {
     lookahead: &'a [bool],
     text: &'a str,
     last: usize,
-    caps: Captures,
 }
 
 impl<'a> Iterator for Splits<'a> {
@@ -203,9 +200,7 @@ impl<'a> Iterator for Splits<'a> {
 
     fn next(&mut self) -> Option<Self::Item> {
         let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes);
-        self.caps.clear();
-        self.pat.captures(input, &mut self.caps);
-        let m = self.caps.get_match()?;
+        let m = self.pat.find(input)?;
         let start = self.last;
         let mut end = self.last + m.range().end;
         if self.lookahead[m.pattern().as_usize()] {
diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs
index 9b2a257a..f52941ca 100644
--- a/crates/bpe/benchmarks/performance.rs
+++ b/crates/bpe/benchmarks/performance.rs
@@ -244,12 +244,32 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
     }
 }
 
+fn pretokenization_benchmark(c: &mut Criterion) {
+    for (name, tok, _, _) in TOKENIZERS.iter() {
+        let text = create_test_string(&tok.bpe, 80_000);
+
+        let mut group = c.benchmark_group(format!("pretokenization-{name}"));
+        group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
+        for bytes in [10, 100, 1000, 10000] {
+            group.throughput(criterion::Throughput::Bytes(bytes as u64));
+            group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| {
+                b.iter_batched(
+                    || select_test_string(&text, *bytes),
+                    |text| tok.split(text).count(),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        group.finish();
+    }
+}
+
 criterion_group!(
     name = benches;
     config = Criterion::default()
                 .warm_up_time(Duration::from_millis(500))
                 .measurement_time(Duration::from_millis(4000))
                 .nresamples(1000);
-    targets = counting_benchmark, encoding_benchmark, appending_benchmark, comparison_benchmark, worstcase_comparison_benchmark
+    targets = counting_benchmark, encoding_benchmark, appending_benchmark, pretokenization_benchmark, comparison_benchmark, worstcase_comparison_benchmark
 );
 criterion_main!(benches);

From 4b3cda544bb92a33c7b9847a1eea355fda813f69 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck <aneubeck@github.com>
Date: Fri, 5 Jun 2026 11:39:44 +0200
Subject: [PATCH 2/3] Apply suggestion from @aneubeck

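Rename `tok` to `tokenizer` in pretokenization_benchmark. No functional change; this
commit is an attempt to re-trigger the CodeQL check that is blocking the merge.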
---
 crates/bpe/benchmarks/performance.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs
index f52941ca..6a71f3d9 100644
--- a/crates/bpe/benchmarks/performance.rs
+++ b/crates/bpe/benchmarks/performance.rs
@@ -245,8 +245,8 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
 }
 
 fn pretokenization_benchmark(c: &mut Criterion) {
-    for (name, tok, _, _) in TOKENIZERS.iter() {
-        let text = create_test_string(&tok.bpe, 80_000);
+    for (name, tokenizer, _, _) in TOKENIZERS.iter() {
+        let text = create_test_string(&tokenizer.bpe, 80_000);
 
         let mut group = c.benchmark_group(format!("pretokenization-{name}"));
         group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
@@ -255,7 +255,7 @@ fn pretokenization_benchmark(c: &mut Criterion) {
             group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| {
                 b.iter_batched(
                     || select_test_string(&text, *bytes),
-                    |text| tok.split(text).count(),
+                    |text| tokenizer.split(text).count(),
                     criterion::BatchSize::SmallInput,
                 )
             });

From 9c65f5ae8a13290ca2a8139deb966e81d0a3a819 Mon Sep 17 00:00:00 2001
From: Alexander Neubeck <aneubeck@github.com>
Date: Fri, 5 Jun 2026 11:46:21 +0200
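Subject: [PATCH 3/3] Apply suggestion from @aneubeck

Rename `tokenizer` back to `tok` in pretokenization_benchmark, restoring
performance.rs to its state after patch 1/3. No functional change.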
---
 crates/bpe/benchmarks/performance.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs
index 6a71f3d9..f52941ca 100644
--- a/crates/bpe/benchmarks/performance.rs
+++ b/crates/bpe/benchmarks/performance.rs
@@ -245,8 +245,8 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) {
 }
 
 fn pretokenization_benchmark(c: &mut Criterion) {
-    for (name, tokenizer, _, _) in TOKENIZERS.iter() {
-        let text = create_test_string(&tokenizer.bpe, 80_000);
+    for (name, tok, _, _) in TOKENIZERS.iter() {
+        let text = create_test_string(&tok.bpe, 80_000);
 
         let mut group = c.benchmark_group(format!("pretokenization-{name}"));
         group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic));
@@ -255,7 +255,7 @@ fn pretokenization_benchmark(c: &mut Criterion) {
             group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| {
                 b.iter_batched(
                     || select_test_string(&text, *bytes),
-                    |text| tokenizer.split(text).count(),
+                    |text| tok.split(text).count(),
                     criterion::BatchSize::SmallInput,
                 )
             });