Delete tiktoken-rs (#7018)

2025-11-20 11:15:04 -08:00 · 2025-11-20 11:15:04 -08:00 · 52d0ec4cd8
commit 52d0ec4cd8
parent 397279d46e
9 changed files with 17 additions and 283 deletions
--- a/codex-rs/Cargo.lock
+++ b/codex-rs/Cargo.lock
@@ -260,7 +260,7 @@ dependencies = [
 "memchr",
 "proc-macro2",
 "quote",
- "rustc-hash 2.1.1",
+ "rustc-hash",
 "serde",
 "serde_derive",
 "syn 2.0.104",
@@ -1108,7 +1108,6 @@ dependencies = [
 "codex-utils-pty",
 "codex-utils-readiness",
 "codex-utils-string",
- "codex-utils-tokenizer",
 "codex-windows-sandbox",
 "core-foundation 0.9.4",
 "core_test_support",
@@ -1628,18 +1627,6 @@ dependencies = [
 name = "codex-utils-string"
 version = "0.0.0"

-[[package]]
-name = "codex-utils-tokenizer"
-version = "0.0.0"
-dependencies = [
- "anyhow",
- "codex-utils-cache",
- "pretty_assertions",
- "thiserror 2.0.17",
- "tiktoken-rs",
- "tokio",
-]
-
 [[package]]
 name = "codex-windows-sandbox"
 version = "0.1.0"
@@ -2461,17 +2448,6 @@ dependencies = [
 "once_cell",
 ]

-[[package]]
-name = "fancy-regex"
-version = "0.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
-dependencies = [
- "bit-set",
- "regex-automata",
- "regex-syntax 0.8.5",
-]
-
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -4796,7 +4772,7 @@ dependencies = [
 "pin-project-lite",
 "quinn-proto",
 "quinn-udp",
- "rustc-hash 2.1.1",
+ "rustc-hash",
 "rustls",
 "socket2 0.6.0",
 "thiserror 2.0.17",
@@ -4816,7 +4792,7 @@ dependencies = [
 "lru-slab",
 "rand 0.9.2",
 "ring",
- "rustc-hash 2.1.1",
+ "rustc-hash",
 "rustls",
 "rustls-pki-types",
 "slab",
@@ -5161,12 +5137,6 @@ version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"

-[[package]]
-name = "rustc-hash"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
-
 [[package]]
 name = "rustc-hash"
 version = "2.1.1"
@@ -6387,21 +6357,6 @@ dependencies = [
 "zune-jpeg",
 ]

-[[package]]
-name = "tiktoken-rs"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d"
-dependencies = [
- "anyhow",
- "base64",
- "bstr",
- "fancy-regex",
- "lazy_static",
- "regex",
- "rustc-hash 1.1.0",
-]
-
 [[package]]
 name = "time"
 version = "0.3.44"
--- a/codex-rs/Cargo.toml
+++ b/codex-rs/Cargo.toml
@@ -41,7 +41,6 @@ members = [
    "utils/pty",
    "utils/readiness",
    "utils/string",
-    "utils/tokenizer",
 ]
 resolver = "2"

@@ -90,7 +89,6 @@ codex-utils-json-to-toml = { path = "utils/json-to-toml" }
 codex-utils-pty = { path = "utils/pty" }
 codex-utils-readiness = { path = "utils/readiness" }
 codex-utils-string = { path = "utils/string" }
-codex-utils-tokenizer = { path = "utils/tokenizer" }
 codex-windows-sandbox = { path = "windows-sandbox-rs" }
 core_test_support = { path = "core/tests/common" }
 mcp-types = { path = "mcp-types" }
@@ -190,7 +188,6 @@ tempfile = "3.23.0"
 test-log = "0.2.18"
 textwrap = "0.16.2"
 thiserror = "2.0.17"
-tiktoken-rs = "0.9"
 time = "0.3"
 tiny_http = "0.12"
 tokio = "1"
@@ -268,7 +265,6 @@ ignored = [
    "icu_provider",
    "openssl-sys",
    "codex-utils-readiness",
-    "codex-utils-tokenizer",
 ]

 [profile.release]
--- a/codex-rs/core/Cargo.toml
+++ b/codex-rs/core/Cargo.toml
@@ -33,7 +33,6 @@ codex-rmcp-client = { workspace = true }
 codex-utils-pty = { workspace = true }
 codex-utils-readiness = { workspace = true }
 codex-utils-string = { workspace = true }
-codex-utils-tokenizer = { workspace = true }
 codex-windows-sandbox = { package = "codex-windows-sandbox", path = "../windows-sandbox-rs" }
 dirs = { workspace = true }
 dunce = { workspace = true }
--- a/codex-rs/core/src/codex.rs
+++ b/codex-rs/core/src/codex.rs
@@ -133,7 +133,6 @@ use codex_protocol::protocol::InitialHistory;
 use codex_protocol::user_input::UserInput;
 use codex_utils_readiness::Readiness;
 use codex_utils_readiness::ReadinessFlag;
-use codex_utils_tokenizer::warm_model_cache;

 /// The high-level interface to the Codex system.
 /// It operates as a queue pair where you send submissions and receive events.
@@ -555,9 +554,6 @@ impl Session {
        // Create the mutable state for the Session.
        let state = SessionState::new(session_configuration.clone());

-        // Warm the tokenizer cache for the session model without blocking startup.
-        warm_model_cache(&session_configuration.model);
-
        let services = SessionServices {
            mcp_connection_manager: Arc::new(RwLock::new(McpConnectionManager::default())),
            mcp_startup_cancellation_token: CancellationToken::new(),
--- a/codex-rs/core/src/config/edit.rs
+++ b/codex-rs/core/src/config/edit.rs
@@ -4,7 +4,6 @@ use crate::config::types::Notice;
 use anyhow::Context;
 use codex_protocol::config_types::ReasoningEffort;
 use codex_protocol::config_types::TrustLevel;
-use codex_utils_tokenizer::warm_model_cache;
 use std::collections::BTreeMap;
 use std::path::Path;
 use std::path::PathBuf;
@@ -231,9 +230,6 @@ impl ConfigDocument {
    fn apply(&mut self, edit: &ConfigEdit) -> anyhow::Result<bool> {
        match edit {
            ConfigEdit::SetModel { model, effort } => Ok({
-                if let Some(model) = &model {
-                    warm_model_cache(model)
-                }
                let mut mutated = false;
                mutated |= self.write_profile_value(
                    &["model"],
--- a/codex-rs/core/src/context_manager/history.rs
+++ b/codex-rs/core/src/context_manager/history.rs
@@ -1,13 +1,13 @@
 use crate::codex::TurnContext;
 use crate::context_manager::normalize;
 use crate::truncate::TruncationPolicy;
+use crate::truncate::approx_token_count;
 use crate::truncate::truncate_function_output_items_with_policy;
 use crate::truncate::truncate_text;
 use codex_protocol::models::FunctionCallOutputPayload;
 use codex_protocol::models::ResponseItem;
 use codex_protocol::protocol::TokenUsage;
 use codex_protocol::protocol::TokenUsageInfo;
-use codex_utils_tokenizer::Tokenizer;
 use std::ops::Deref;

 /// Transcript of conversation history
@@ -74,26 +74,21 @@ impl ContextManager {
        history
    }

-    // Estimate the number of tokens in the history. Return None if no tokenizer
-    // is available. This does not consider the reasoning traces.
-    // /!\ The value is a lower bound estimate and does not represent the exact
-    // context length.
+    // Estimate token usage using byte-based heuristics from the truncation helpers.
+    // This is a coarse lower bound, not a tokenizer-accurate count.
    pub(crate) fn estimate_token_count(&self, turn_context: &TurnContext) -> Option<i64> {
-        let model = turn_context.client.get_model();
-        let tokenizer = Tokenizer::for_model(model.as_str()).ok()?;
        let model_family = turn_context.client.get_model_family();
+        let base_tokens =
+            i64::try_from(approx_token_count(model_family.base_instructions.as_str()))
+                .unwrap_or(i64::MAX);

-        Some(
-            self.items
-                .iter()
-                .map(|item| {
-                    serde_json::to_string(&item)
-                        .map(|item| tokenizer.count(&item))
-                        .unwrap_or_default()
-                })
-                .sum::<i64>()
-                + tokenizer.count(model_family.base_instructions.as_str()),
-        )
+        let items_tokens = self.items.iter().fold(0i64, |acc, item| {
+            let serialized = serde_json::to_string(item).unwrap_or_default();
+            let item_tokens = i64::try_from(approx_token_count(&serialized)).unwrap_or(i64::MAX);
+            acc.saturating_add(item_tokens)
+        });
+
+        Some(base_tokens.saturating_add(items_tokens))
    }

    pub(crate) fn remove_first_item(&mut self) {
--- a/codex-rs/core/tests/suite/compact.rs
+++ b/codex-rs/core/tests/suite/compact.rs
@@ -384,7 +384,7 @@ async fn manual_compact_uses_custom_prompt() {
 }

 #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-async fn manual_compact_emits_estimated_token_usage_event() {
+async fn manual_compact_emits_api_and_local_token_usage_events() {
    skip_if_no_network!();

    let server = start_mock_server().await;
--- a/codex-rs/utils/tokenizer/Cargo.toml
+++ b/codex-rs/utils/tokenizer/Cargo.toml
@@ -1,17 +0,0 @@
-[package]
-edition.workspace = true
-name = "codex-utils-tokenizer"
-version.workspace = true
-
-[lints]
-workspace = true
-
-[dependencies]
-anyhow = { workspace = true }
-codex-utils-cache = { workspace = true }
-thiserror = { workspace = true }
-tiktoken-rs = { workspace = true }
-tokio = { workspace = true }
-
-[dev-dependencies]
-pretty_assertions = { workspace = true }
--- a/codex-rs/utils/tokenizer/src/lib.rs
+++ b/codex-rs/utils/tokenizer/src/lib.rs
@@ -1,186 +0,0 @@
-use std::fmt;
-use std::num::NonZeroUsize;
-use std::sync::OnceLock;
-
-use anyhow::Error as AnyhowError;
-use codex_utils_cache::BlockingLruCache;
-use thiserror::Error;
-use tiktoken_rs::CoreBPE;
-
-/// Supported local encodings.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum EncodingKind {
-    O200kBase,
-    Cl100kBase,
-}
-
-impl fmt::Display for EncodingKind {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Self::O200kBase => f.write_str("o200k_base"),
-            Self::Cl100kBase => f.write_str("cl100k_base"),
-        }
-    }
-}
-
-/// Tokenizer error type.
-#[derive(Debug, Error)]
-pub enum TokenizerError {
-    #[error("failed to load encoding {kind}")]
-    LoadEncoding {
-        kind: EncodingKind,
-        #[source]
-        source: AnyhowError,
-    },
-    #[error("failed to decode tokens")]
-    Decode {
-        #[source]
-        source: AnyhowError,
-    },
-}
-
-fn model_cache() -> &'static BlockingLruCache<String, CoreBPE> {
-    static MODEL_CACHE: OnceLock<BlockingLruCache<String, CoreBPE>> = OnceLock::new();
-    MODEL_CACHE
-        .get_or_init(|| BlockingLruCache::new(NonZeroUsize::new(64).unwrap_or(NonZeroUsize::MIN)))
-}
-
-/// Fire-and-forget function used to pre-warm model tokenizer loading. This is done
-/// on a best-effort basis, without any guarantee about the state of the cache
-/// before or after.
-/// Only working in Tokio runtimes
-pub fn warm_model_cache(model: &str) {
-    if tokio::runtime::Handle::try_current().is_err() {
-        return;
-    }
-    let model = model.to_string();
-    tokio::spawn(async move {
-        let _ = Tokenizer::for_model(&model);
-    });
-}
-
-/// Thin wrapper around a `tiktoken_rs::CoreBPE` tokenizer.
-#[derive(Clone)]
-pub struct Tokenizer {
-    inner: CoreBPE,
-}
-
-impl Tokenizer {
-    /// Build a tokenizer for a specific encoding.
-    pub fn new(kind: EncodingKind) -> Result<Self, TokenizerError> {
-        let loader: fn() -> anyhow::Result<CoreBPE> = match kind {
-            EncodingKind::O200kBase => tiktoken_rs::o200k_base,
-            EncodingKind::Cl100kBase => tiktoken_rs::cl100k_base,
-        };
-
-        let inner = loader().map_err(|source| TokenizerError::LoadEncoding { kind, source })?;
-        Ok(Self { inner })
-    }
-
-    /// Default to `O200kBase`
-    pub fn try_default() -> Result<Self, TokenizerError> {
-        Self::new(EncodingKind::O200kBase)
-    }
-
-    /// Build a tokenizer using an `OpenAI` model name (maps to an encoding).
-    /// Falls back to the `O200kBase` encoding when the model is unknown.
-    pub fn for_model(model: &str) -> Result<Self, TokenizerError> {
-        let inner = model_cache().get_or_try_insert_with(model.to_owned(), || {
-            match tiktoken_rs::get_bpe_from_model(model) {
-                Ok(inner) => Ok(inner),
-                Err(_model_error) => Tokenizer::new(EncodingKind::O200kBase).map(|e| e.inner),
-            }
-        })?;
-        Ok(Self { inner })
-    }
-
-    /// Encode text to token IDs. If `with_special_tokens` is true, special
-    /// tokens are allowed and may appear in the result.
-    #[must_use]
-    pub fn encode(&self, text: &str, with_special_tokens: bool) -> Vec<i32> {
-        let raw = if with_special_tokens {
-            self.inner.encode_with_special_tokens(text)
-        } else {
-            self.inner.encode_ordinary(text)
-        };
-        raw.into_iter().map(|t| t as i32).collect()
-    }
-
-    /// Count tokens in `text` as a signed integer.
-    #[must_use]
-    pub fn count(&self, text: &str) -> i64 {
-        // Signed length to satisfy our style preference.
-        i64::try_from(self.inner.encode_ordinary(text).len()).unwrap_or(i64::MAX)
-    }
-
-    /// Decode token IDs back to text.
-    pub fn decode(&self, tokens: &[i32]) -> Result<String, TokenizerError> {
-        let raw: Vec<u32> = tokens.iter().map(|t| *t as u32).collect();
-        self.inner
-            .decode(raw)
-            .map_err(|source| TokenizerError::Decode { source })
-    }
-}
-
-impl fmt::Debug for Tokenizer {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "Tokenizer {{ inner: <CoreBPE> }}")
-    }
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use pretty_assertions::assert_eq;
-
-    #[test]
-    fn cl100k_base_roundtrip_simple() -> Result<(), TokenizerError> {
-        let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
-        let s = "hello world";
-        let ids = tok.encode(s, false);
-        // Stable expectation for cl100k_base
-        assert_eq!(ids, vec![15339, 1917]);
-        let back = tok.decode(&ids)?;
-        assert_eq!(back, s);
-        Ok(())
-    }
-
-    #[test]
-    fn preserves_whitespace_and_special_tokens_flag() -> Result<(), TokenizerError> {
-        let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
-        let s = "This  has   multiple   spaces";
-        let ids_no_special = tok.encode(s, false);
-        let round = tok.decode(&ids_no_special)?;
-        assert_eq!(round, s);
-
-        // With special tokens allowed, result may be identical for normal text,
-        // but the API should still function.
-        let ids_with_special = tok.encode(s, true);
-        let round2 = tok.decode(&ids_with_special)?;
-        assert_eq!(round2, s);
-        Ok(())
-    }
-
-    #[test]
-    fn model_mapping_builds_tokenizer() -> Result<(), TokenizerError> {
-        // Choose a long-standing model alias that maps to cl100k_base.
-        let tok = Tokenizer::for_model("gpt-5.1")?;
-        let ids = tok.encode("ok", false);
-        let back = tok.decode(&ids)?;
-        assert_eq!(back, "ok");
-        Ok(())
-    }
-
-    #[test]
-    fn unknown_model_defaults_to_o200k_base() -> Result<(), TokenizerError> {
-        let fallback = Tokenizer::new(EncodingKind::O200kBase)?;
-        let tok = Tokenizer::for_model("does-not-exist")?;
-        let text = "fallback please";
-        assert_eq!(tok.encode(text, false), fallback.encode(text, false));
-        Ok(())
-    }
-
-    #[test]
-    fn warm_model_cache_without_runtime_is_noop() {
-        warm_model_cache("gpt-5");
-    }
-}