From 5f7fb6e240343a15fa7d91a03f0e5565757b45fb Mon Sep 17 00:00:00 2001 From: Dan Mattheiss Date: Thu, 4 Jun 2026 18:12:04 -0400 Subject: [PATCH 1/3] bpe-openai: use Regex::find instead of captures for pretokenization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Splits iterator only needs each match's extent and pattern id; it never reads a capture group (the patterns contain none beyond the implicit whole match). It still called captures(), which clears and fills a Captures for every piece and forces a capture-reporting search. find() returns the same overall match and can use a faster DFA-based search that only reports the match start and end. Output is identical — find and captures agree on the overall match — verified by the tiktoken-equivalence tests (cl100k + o200k). Adds a pretokenization-cl100k benchmark that isolates Tokenizer::split (the existing comparison benchmark only measures full encode, where pretokenization is a minority of the time on the random-token corpus). Measured with it (cargo bench, M1), split throughput improves at every input size, most on short inputs where the per-piece engine overhead dominates: split/10 +57% split/100 +16% split/1000 +11% split/10000 +9% Full Tokenizer::encode (comparison-cl100k) is unchanged within noise, since pretokenization is a minority of encode time on that corpus. --- crates/bpe-openai/src/lib.rs | 7 +------ crates/bpe/benchmarks/performance.rs | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs index 51f514f8..7a0aad48 100644 --- a/crates/bpe-openai/src/lib.rs +++ b/crates/bpe-openai/src/lib.rs @@ -4,7 +4,6 @@ use bpe::byte_pair_encoding::BytePairEncoding; use either::Either; use regex_automata::{ meta::{BuildError, Regex}, - util::captures::Captures, Anchored, Input, }; @@ -177,7 +176,6 @@ impl Pretokenizer { lookahead: &self.lookahead, text, last: 0, - caps: Captures::matches(self.pat.group_info().clone()), } } } @@ -195,7 +193,6 @@ struct Splits<'a> { lookahead: &'a [bool], text: &'a str, last: usize, - caps: Captures, } impl<'a> Iterator for Splits<'a> { @@ -203,9 +200,7 @@ impl<'a> Iterator for Splits<'a> { fn next(&mut self) -> Option { let input = Input::new(&self.text[self.last..]).anchored(Anchored::Yes); - self.caps.clear(); - self.pat.captures(input, &mut self.caps); - let m = self.caps.get_match()?; + let m = self.pat.find(input)?; let start = self.last; let mut end = self.last + m.range().end; if self.lookahead[m.pattern().as_usize()] { diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs index 9b2a257a..f52941ca 100644 --- a/crates/bpe/benchmarks/performance.rs +++ b/crates/bpe/benchmarks/performance.rs @@ -244,12 +244,32 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) { } } +fn pretokenization_benchmark(c: &mut Criterion) { + for (name, tok, _, _) in TOKENIZERS.iter() { + let text = create_test_string(&tok.bpe, 80_000); + + let mut group = c.benchmark_group(format!("pretokenization-{name}")); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + for bytes in [10, 100, 1000, 10000] { + group.throughput(criterion::Throughput::Bytes(bytes as u64)); + group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| { + b.iter_batched( + || select_test_string(&text, *bytes), + |text| tok.split(text).count(), + criterion::BatchSize::SmallInput, + ) + }); + } + group.finish(); + } +} + criterion_group!( name = benches; config = Criterion::default() .warm_up_time(Duration::from_millis(500)) .measurement_time(Duration::from_millis(4000)) .nresamples(1000); - targets = counting_benchmark, encoding_benchmark, appending_benchmark, comparison_benchmark, worstcase_comparison_benchmark + targets = counting_benchmark, encoding_benchmark, appending_benchmark, pretokenization_benchmark, comparison_benchmark, worstcase_comparison_benchmark ); criterion_main!(benches); From 4b3cda544bb92a33c7b9847a1eea355fda813f69 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Fri, 5 Jun 2026 11:39:44 +0200 Subject: [PATCH 2/3] Apply suggestion from @aneubeck trying to trigger codeql which is blocking the merge :( --- crates/bpe/benchmarks/performance.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs index f52941ca..6a71f3d9 100644 --- a/crates/bpe/benchmarks/performance.rs +++ b/crates/bpe/benchmarks/performance.rs @@ -245,8 +245,8 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) { } fn pretokenization_benchmark(c: &mut Criterion) { - for (name, tok, _, _) in TOKENIZERS.iter() { - let text = create_test_string(&tok.bpe, 80_000); + for (name, tokenizer, _, _) in TOKENIZERS.iter() { + let text = create_test_string(&tokenizer.bpe, 80_000); let mut group = c.benchmark_group(format!("pretokenization-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -255,7 +255,7 @@ fn pretokenization_benchmark(c: &mut Criterion) { group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| { b.iter_batched( || select_test_string(&text, *bytes), - |text| tok.split(text).count(), + |text| tokenizer.split(text).count(), criterion::BatchSize::SmallInput, ) }); From 9c65f5ae8a13290ca2a8139deb966e81d0a3a819 Mon Sep 17 00:00:00 2001 From: Alexander Neubeck Date: Fri, 5 Jun 2026 11:46:21 +0200 Subject: [PATCH 3/3] Apply suggestion from @aneubeck --- crates/bpe/benchmarks/performance.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs index 6a71f3d9..f52941ca 100644 --- a/crates/bpe/benchmarks/performance.rs +++ b/crates/bpe/benchmarks/performance.rs @@ -245,8 +245,8 @@ fn worstcase_comparison_benchmark(c: &mut Criterion) { } fn pretokenization_benchmark(c: &mut Criterion) { - for (name, tokenizer, _, _) in TOKENIZERS.iter() { - let text = create_test_string(&tokenizer.bpe, 80_000); + for (name, tok, _, _) in TOKENIZERS.iter() { + let text = create_test_string(&tok.bpe, 80_000); let mut group = c.benchmark_group(format!("pretokenization-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -255,7 +255,7 @@ fn pretokenization_benchmark(c: &mut Criterion) { group.bench_with_input(BenchmarkId::new("split", bytes), &bytes, |b, bytes| { b.iter_batched( || select_test_string(&text, *bytes), - |text| tokenizer.split(text).count(), + |text| tok.split(text).count(), criterion::BatchSize::SmallInput, ) });