From 52d0ec4cd84893a08b0c7f3ea7133d1ecaa39be7 Mon Sep 17 00:00:00 2001 From: pakrym-oai Date: Thu, 20 Nov 2025 11:15:04 -0800 Subject: [PATCH] Delete tiktoken-rs (#7018) --- codex-rs/Cargo.lock | 51 +---- codex-rs/Cargo.toml | 4 - codex-rs/core/Cargo.toml | 1 - codex-rs/core/src/codex.rs | 4 - codex-rs/core/src/config/edit.rs | 4 - codex-rs/core/src/context_manager/history.rs | 31 ++-- codex-rs/core/tests/suite/compact.rs | 2 +- codex-rs/utils/tokenizer/Cargo.toml | 17 -- codex-rs/utils/tokenizer/src/lib.rs | 186 ------------------- 9 files changed, 17 insertions(+), 283 deletions(-) delete mode 100644 codex-rs/utils/tokenizer/Cargo.toml delete mode 100644 codex-rs/utils/tokenizer/src/lib.rs diff --git a/codex-rs/Cargo.lock b/codex-rs/Cargo.lock index 01abeee6a..fbbdb3d1f 100644 --- a/codex-rs/Cargo.lock +++ b/codex-rs/Cargo.lock @@ -260,7 +260,7 @@ dependencies = [ "memchr", "proc-macro2", "quote", - "rustc-hash 2.1.1", + "rustc-hash", "serde", "serde_derive", "syn 2.0.104", @@ -1108,7 +1108,6 @@ dependencies = [ "codex-utils-pty", "codex-utils-readiness", "codex-utils-string", - "codex-utils-tokenizer", "codex-windows-sandbox", "core-foundation 0.9.4", "core_test_support", @@ -1628,18 +1627,6 @@ dependencies = [ name = "codex-utils-string" version = "0.0.0" -[[package]] -name = "codex-utils-tokenizer" -version = "0.0.0" -dependencies = [ - "anyhow", - "codex-utils-cache", - "pretty_assertions", - "thiserror 2.0.17", - "tiktoken-rs", - "tokio", -] - [[package]] name = "codex-windows-sandbox" version = "0.1.0" @@ -2461,17 +2448,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "fancy-regex" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" -dependencies = [ - "bit-set", - "regex-automata", - "regex-syntax 0.8.5", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -4796,7 +4772,7 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "socket2 0.6.0", "thiserror 2.0.17", @@ -4816,7 +4792,7 @@ dependencies = [ "lru-slab", "rand 0.9.2", "ring", - "rustc-hash 2.1.1", + "rustc-hash", "rustls", "rustls-pki-types", "slab", @@ -5161,12 +5137,6 @@ version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -6387,21 +6357,6 @@ dependencies = [ "zune-jpeg", ] -[[package]] -name = "tiktoken-rs" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d" -dependencies = [ - "anyhow", - "base64", - "bstr", - "fancy-regex", - "lazy_static", - "regex", - "rustc-hash 1.1.0", -] - [[package]] name = "time" version = "0.3.44" diff --git a/codex-rs/Cargo.toml b/codex-rs/Cargo.toml index 61ab7749b..b34cd46b4 100644 --- a/codex-rs/Cargo.toml +++ b/codex-rs/Cargo.toml @@ -41,7 +41,6 @@ members = [ "utils/pty", "utils/readiness", "utils/string", - "utils/tokenizer", ] resolver = "2" @@ -90,7 +89,6 @@ codex-utils-json-to-toml = { path = "utils/json-to-toml" } codex-utils-pty = { path = "utils/pty" } codex-utils-readiness = { path = "utils/readiness" } codex-utils-string = { path = "utils/string" } -codex-utils-tokenizer = { path = "utils/tokenizer" } codex-windows-sandbox = { path = "windows-sandbox-rs" } core_test_support = { path = "core/tests/common" } mcp-types = { path = "mcp-types" } @@ -190,7 +188,6 @@ tempfile = "3.23.0" test-log = "0.2.18" textwrap = "0.16.2" thiserror = "2.0.17" -tiktoken-rs = "0.9" time = "0.3" tiny_http = "0.12" tokio = "1" @@ -268,7 +265,6 @@ ignored = [ "icu_provider", "openssl-sys", "codex-utils-readiness", - "codex-utils-tokenizer", ] [profile.release] diff --git a/codex-rs/core/Cargo.toml b/codex-rs/core/Cargo.toml index 565b9ddd5..669a9a63f 100644 --- a/codex-rs/core/Cargo.toml +++ b/codex-rs/core/Cargo.toml @@ -33,7 +33,6 @@ codex-rmcp-client = { workspace = true } codex-utils-pty = { workspace = true } codex-utils-readiness = { workspace = true } codex-utils-string = { workspace = true } -codex-utils-tokenizer = { workspace = true } codex-windows-sandbox = { package = "codex-windows-sandbox", path = "../windows-sandbox-rs" } dirs = { workspace = true } dunce = { workspace = true } diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index 55dfe1704..1cf4e923c 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -133,7 +133,6 @@ use codex_protocol::protocol::InitialHistory; use codex_protocol::user_input::UserInput; use codex_utils_readiness::Readiness; use codex_utils_readiness::ReadinessFlag; -use codex_utils_tokenizer::warm_model_cache; /// The high-level interface to the Codex system. /// It operates as a queue pair where you send submissions and receive events. @@ -555,9 +554,6 @@ impl Session { // Create the mutable state for the Session. let state = SessionState::new(session_configuration.clone()); - // Warm the tokenizer cache for the session model without blocking startup. - warm_model_cache(&session_configuration.model); - let services = SessionServices { mcp_connection_manager: Arc::new(RwLock::new(McpConnectionManager::default())), mcp_startup_cancellation_token: CancellationToken::new(), diff --git a/codex-rs/core/src/config/edit.rs b/codex-rs/core/src/config/edit.rs index bd7c16a0a..b8862fa5c 100644 --- a/codex-rs/core/src/config/edit.rs +++ b/codex-rs/core/src/config/edit.rs @@ -4,7 +4,6 @@ use crate::config::types::Notice; use anyhow::Context; use codex_protocol::config_types::ReasoningEffort; use codex_protocol::config_types::TrustLevel; -use codex_utils_tokenizer::warm_model_cache; use std::collections::BTreeMap; use std::path::Path; use std::path::PathBuf; @@ -231,9 +230,6 @@ impl ConfigDocument { fn apply(&mut self, edit: &ConfigEdit) -> anyhow::Result { match edit { ConfigEdit::SetModel { model, effort } => Ok({ - if let Some(model) = &model { - warm_model_cache(model) - } let mut mutated = false; mutated |= self.write_profile_value( &["model"], diff --git a/codex-rs/core/src/context_manager/history.rs b/codex-rs/core/src/context_manager/history.rs index 8575fe9f6..8eefcbf85 100644 --- a/codex-rs/core/src/context_manager/history.rs +++ b/codex-rs/core/src/context_manager/history.rs @@ -1,13 +1,13 @@ use crate::codex::TurnContext; use crate::context_manager::normalize; use crate::truncate::TruncationPolicy; +use crate::truncate::approx_token_count; use crate::truncate::truncate_function_output_items_with_policy; use crate::truncate::truncate_text; use codex_protocol::models::FunctionCallOutputPayload; use codex_protocol::models::ResponseItem; use codex_protocol::protocol::TokenUsage; use codex_protocol::protocol::TokenUsageInfo; -use codex_utils_tokenizer::Tokenizer; use std::ops::Deref; /// Transcript of conversation history @@ -74,26 +74,21 @@ impl ContextManager { history } - // Estimate the number of tokens in the history. Return None if no tokenizer - // is available. This does not consider the reasoning traces. - // /!\ The value is a lower bound estimate and does not represent the exact - // context length. + // Estimate token usage using byte-based heuristics from the truncation helpers. + // This is a coarse lower bound, not a tokenizer-accurate count. pub(crate) fn estimate_token_count(&self, turn_context: &TurnContext) -> Option { - let model = turn_context.client.get_model(); - let tokenizer = Tokenizer::for_model(model.as_str()).ok()?; let model_family = turn_context.client.get_model_family(); + let base_tokens = + i64::try_from(approx_token_count(model_family.base_instructions.as_str())) + .unwrap_or(i64::MAX); - Some( - self.items - .iter() - .map(|item| { - serde_json::to_string(&item) - .map(|item| tokenizer.count(&item)) - .unwrap_or_default() - }) - .sum::() - + tokenizer.count(model_family.base_instructions.as_str()), - ) + let items_tokens = self.items.iter().fold(0i64, |acc, item| { + let serialized = serde_json::to_string(item).unwrap_or_default(); + let item_tokens = i64::try_from(approx_token_count(&serialized)).unwrap_or(i64::MAX); + acc.saturating_add(item_tokens) + }); + + Some(base_tokens.saturating_add(items_tokens)) } pub(crate) fn remove_first_item(&mut self) { diff --git a/codex-rs/core/tests/suite/compact.rs b/codex-rs/core/tests/suite/compact.rs index 5359f3b1a..1324d3edb 100644 --- a/codex-rs/core/tests/suite/compact.rs +++ b/codex-rs/core/tests/suite/compact.rs @@ -384,7 +384,7 @@ async fn manual_compact_uses_custom_prompt() { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn manual_compact_emits_estimated_token_usage_event() { +async fn manual_compact_emits_api_and_local_token_usage_events() { skip_if_no_network!(); let server = start_mock_server().await; diff --git a/codex-rs/utils/tokenizer/Cargo.toml b/codex-rs/utils/tokenizer/Cargo.toml deleted file mode 100644 index 7669d6b98..000000000 --- a/codex-rs/utils/tokenizer/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -edition.workspace = true -name = "codex-utils-tokenizer" -version.workspace = true - -[lints] -workspace = true - -[dependencies] -anyhow = { workspace = true } -codex-utils-cache = { workspace = true } -thiserror = { workspace = true } -tiktoken-rs = { workspace = true } -tokio = { workspace = true } - -[dev-dependencies] -pretty_assertions = { workspace = true } diff --git a/codex-rs/utils/tokenizer/src/lib.rs b/codex-rs/utils/tokenizer/src/lib.rs deleted file mode 100644 index 1e151393a..000000000 --- a/codex-rs/utils/tokenizer/src/lib.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::fmt; -use std::num::NonZeroUsize; -use std::sync::OnceLock; - -use anyhow::Error as AnyhowError; -use codex_utils_cache::BlockingLruCache; -use thiserror::Error; -use tiktoken_rs::CoreBPE; - -/// Supported local encodings. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum EncodingKind { - O200kBase, - Cl100kBase, -} - -impl fmt::Display for EncodingKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::O200kBase => f.write_str("o200k_base"), - Self::Cl100kBase => f.write_str("cl100k_base"), - } - } -} - -/// Tokenizer error type. -#[derive(Debug, Error)] -pub enum TokenizerError { - #[error("failed to load encoding {kind}")] - LoadEncoding { - kind: EncodingKind, - #[source] - source: AnyhowError, - }, - #[error("failed to decode tokens")] - Decode { - #[source] - source: AnyhowError, - }, -} - -fn model_cache() -> &'static BlockingLruCache { - static MODEL_CACHE: OnceLock> = OnceLock::new(); - MODEL_CACHE - .get_or_init(|| BlockingLruCache::new(NonZeroUsize::new(64).unwrap_or(NonZeroUsize::MIN))) -} - -/// Fire-and-forget function used to pre-warm model tokenizer loading. This is done -/// on a best-effort basis, without any guarantee about the state of the cache -/// before or after. -/// Only working in Tokio runtimes -pub fn warm_model_cache(model: &str) { - if tokio::runtime::Handle::try_current().is_err() { - return; - } - let model = model.to_string(); - tokio::spawn(async move { - let _ = Tokenizer::for_model(&model); - }); -} - -/// Thin wrapper around a `tiktoken_rs::CoreBPE` tokenizer. -#[derive(Clone)] -pub struct Tokenizer { - inner: CoreBPE, -} - -impl Tokenizer { - /// Build a tokenizer for a specific encoding. - pub fn new(kind: EncodingKind) -> Result { - let loader: fn() -> anyhow::Result = match kind { - EncodingKind::O200kBase => tiktoken_rs::o200k_base, - EncodingKind::Cl100kBase => tiktoken_rs::cl100k_base, - }; - - let inner = loader().map_err(|source| TokenizerError::LoadEncoding { kind, source })?; - Ok(Self { inner }) - } - - /// Default to `O200kBase` - pub fn try_default() -> Result { - Self::new(EncodingKind::O200kBase) - } - - /// Build a tokenizer using an `OpenAI` model name (maps to an encoding). - /// Falls back to the `O200kBase` encoding when the model is unknown. - pub fn for_model(model: &str) -> Result { - let inner = model_cache().get_or_try_insert_with(model.to_owned(), || { - match tiktoken_rs::get_bpe_from_model(model) { - Ok(inner) => Ok(inner), - Err(_model_error) => Tokenizer::new(EncodingKind::O200kBase).map(|e| e.inner), - } - })?; - Ok(Self { inner }) - } - - /// Encode text to token IDs. If `with_special_tokens` is true, special - /// tokens are allowed and may appear in the result. - #[must_use] - pub fn encode(&self, text: &str, with_special_tokens: bool) -> Vec { - let raw = if with_special_tokens { - self.inner.encode_with_special_tokens(text) - } else { - self.inner.encode_ordinary(text) - }; - raw.into_iter().map(|t| t as i32).collect() - } - - /// Count tokens in `text` as a signed integer. - #[must_use] - pub fn count(&self, text: &str) -> i64 { - // Signed length to satisfy our style preference. - i64::try_from(self.inner.encode_ordinary(text).len()).unwrap_or(i64::MAX) - } - - /// Decode token IDs back to text. - pub fn decode(&self, tokens: &[i32]) -> Result { - let raw: Vec = tokens.iter().map(|t| *t as u32).collect(); - self.inner - .decode(raw) - .map_err(|source| TokenizerError::Decode { source }) - } -} - -impl fmt::Debug for Tokenizer { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "Tokenizer {{ inner: }}") - } -} -#[cfg(test)] -mod tests { - use super::*; - use pretty_assertions::assert_eq; - - #[test] - fn cl100k_base_roundtrip_simple() -> Result<(), TokenizerError> { - let tok = Tokenizer::new(EncodingKind::Cl100kBase)?; - let s = "hello world"; - let ids = tok.encode(s, false); - // Stable expectation for cl100k_base - assert_eq!(ids, vec![15339, 1917]); - let back = tok.decode(&ids)?; - assert_eq!(back, s); - Ok(()) - } - - #[test] - fn preserves_whitespace_and_special_tokens_flag() -> Result<(), TokenizerError> { - let tok = Tokenizer::new(EncodingKind::Cl100kBase)?; - let s = "This has multiple spaces"; - let ids_no_special = tok.encode(s, false); - let round = tok.decode(&ids_no_special)?; - assert_eq!(round, s); - - // With special tokens allowed, result may be identical for normal text, - // but the API should still function. - let ids_with_special = tok.encode(s, true); - let round2 = tok.decode(&ids_with_special)?; - assert_eq!(round2, s); - Ok(()) - } - - #[test] - fn model_mapping_builds_tokenizer() -> Result<(), TokenizerError> { - // Choose a long-standing model alias that maps to cl100k_base. - let tok = Tokenizer::for_model("gpt-5.1")?; - let ids = tok.encode("ok", false); - let back = tok.decode(&ids)?; - assert_eq!(back, "ok"); - Ok(()) - } - - #[test] - fn unknown_model_defaults_to_o200k_base() -> Result<(), TokenizerError> { - let fallback = Tokenizer::new(EncodingKind::O200kBase)?; - let tok = Tokenizer::for_model("does-not-exist")?; - let text = "fallback please"; - assert_eq!(tok.encode(text, false), fallback.encode(text, false)); - Ok(()) - } - - #[test] - fn warm_model_cache_without_runtime_is_noop() { - warm_model_cache("gpt-5"); - } -}