Delete tiktoken-rs (#7018)

This commit is contained in:
pakrym-oai 2025-11-20 11:15:04 -08:00 committed by GitHub
parent 397279d46e
commit 52d0ec4cd8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 17 additions and 283 deletions

51
codex-rs/Cargo.lock generated
View file

@ -260,7 +260,7 @@ dependencies = [
"memchr",
"proc-macro2",
"quote",
"rustc-hash 2.1.1",
"rustc-hash",
"serde",
"serde_derive",
"syn 2.0.104",
@ -1108,7 +1108,6 @@ dependencies = [
"codex-utils-pty",
"codex-utils-readiness",
"codex-utils-string",
"codex-utils-tokenizer",
"codex-windows-sandbox",
"core-foundation 0.9.4",
"core_test_support",
@ -1628,18 +1627,6 @@ dependencies = [
name = "codex-utils-string"
version = "0.0.0"
[[package]]
name = "codex-utils-tokenizer"
version = "0.0.0"
dependencies = [
"anyhow",
"codex-utils-cache",
"pretty_assertions",
"thiserror 2.0.17",
"tiktoken-rs",
"tokio",
]
[[package]]
name = "codex-windows-sandbox"
version = "0.1.0"
@ -2461,17 +2448,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "fancy-regex"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
dependencies = [
"bit-set",
"regex-automata",
"regex-syntax 0.8.5",
]
[[package]]
name = "fastrand"
version = "2.3.0"
@ -4796,7 +4772,7 @@ dependencies = [
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"socket2 0.6.0",
"thiserror 2.0.17",
@ -4816,7 +4792,7 @@ dependencies = [
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash 2.1.1",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
@ -5161,12 +5137,6 @@ version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
@ -6387,21 +6357,6 @@ dependencies = [
"zune-jpeg",
]
[[package]]
name = "tiktoken-rs"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d"
dependencies = [
"anyhow",
"base64",
"bstr",
"fancy-regex",
"lazy_static",
"regex",
"rustc-hash 1.1.0",
]
[[package]]
name = "time"
version = "0.3.44"

View file

@ -41,7 +41,6 @@ members = [
"utils/pty",
"utils/readiness",
"utils/string",
"utils/tokenizer",
]
resolver = "2"
@ -90,7 +89,6 @@ codex-utils-json-to-toml = { path = "utils/json-to-toml" }
codex-utils-pty = { path = "utils/pty" }
codex-utils-readiness = { path = "utils/readiness" }
codex-utils-string = { path = "utils/string" }
codex-utils-tokenizer = { path = "utils/tokenizer" }
codex-windows-sandbox = { path = "windows-sandbox-rs" }
core_test_support = { path = "core/tests/common" }
mcp-types = { path = "mcp-types" }
@ -190,7 +188,6 @@ tempfile = "3.23.0"
test-log = "0.2.18"
textwrap = "0.16.2"
thiserror = "2.0.17"
tiktoken-rs = "0.9"
time = "0.3"
tiny_http = "0.12"
tokio = "1"
@ -268,7 +265,6 @@ ignored = [
"icu_provider",
"openssl-sys",
"codex-utils-readiness",
"codex-utils-tokenizer",
]
[profile.release]

View file

@ -33,7 +33,6 @@ codex-rmcp-client = { workspace = true }
codex-utils-pty = { workspace = true }
codex-utils-readiness = { workspace = true }
codex-utils-string = { workspace = true }
codex-utils-tokenizer = { workspace = true }
codex-windows-sandbox = { package = "codex-windows-sandbox", path = "../windows-sandbox-rs" }
dirs = { workspace = true }
dunce = { workspace = true }

View file

@ -133,7 +133,6 @@ use codex_protocol::protocol::InitialHistory;
use codex_protocol::user_input::UserInput;
use codex_utils_readiness::Readiness;
use codex_utils_readiness::ReadinessFlag;
use codex_utils_tokenizer::warm_model_cache;
/// The high-level interface to the Codex system.
/// It operates as a queue pair where you send submissions and receive events.
@ -555,9 +554,6 @@ impl Session {
// Create the mutable state for the Session.
let state = SessionState::new(session_configuration.clone());
// Warm the tokenizer cache for the session model without blocking startup.
warm_model_cache(&session_configuration.model);
let services = SessionServices {
mcp_connection_manager: Arc::new(RwLock::new(McpConnectionManager::default())),
mcp_startup_cancellation_token: CancellationToken::new(),

View file

@ -4,7 +4,6 @@ use crate::config::types::Notice;
use anyhow::Context;
use codex_protocol::config_types::ReasoningEffort;
use codex_protocol::config_types::TrustLevel;
use codex_utils_tokenizer::warm_model_cache;
use std::collections::BTreeMap;
use std::path::Path;
use std::path::PathBuf;
@ -231,9 +230,6 @@ impl ConfigDocument {
fn apply(&mut self, edit: &ConfigEdit) -> anyhow::Result<bool> {
match edit {
ConfigEdit::SetModel { model, effort } => Ok({
if let Some(model) = &model {
warm_model_cache(model)
}
let mut mutated = false;
mutated |= self.write_profile_value(
&["model"],

View file

@ -1,13 +1,13 @@
use crate::codex::TurnContext;
use crate::context_manager::normalize;
use crate::truncate::TruncationPolicy;
use crate::truncate::approx_token_count;
use crate::truncate::truncate_function_output_items_with_policy;
use crate::truncate::truncate_text;
use codex_protocol::models::FunctionCallOutputPayload;
use codex_protocol::models::ResponseItem;
use codex_protocol::protocol::TokenUsage;
use codex_protocol::protocol::TokenUsageInfo;
use codex_utils_tokenizer::Tokenizer;
use std::ops::Deref;
/// Transcript of conversation history
@ -74,26 +74,21 @@ impl ContextManager {
history
}
// Estimate the number of tokens in the history. Return None if no tokenizer
// is available. This does not consider the reasoning traces.
// /!\ The value is a lower bound estimate and does not represent the exact
// context length.
// Estimate token usage using byte-based heuristics from the truncation helpers.
// This is a coarse lower bound, not a tokenizer-accurate count.
pub(crate) fn estimate_token_count(&self, turn_context: &TurnContext) -> Option<i64> {
let model = turn_context.client.get_model();
let tokenizer = Tokenizer::for_model(model.as_str()).ok()?;
let model_family = turn_context.client.get_model_family();
let base_tokens =
i64::try_from(approx_token_count(model_family.base_instructions.as_str()))
.unwrap_or(i64::MAX);
Some(
self.items
.iter()
.map(|item| {
serde_json::to_string(&item)
.map(|item| tokenizer.count(&item))
.unwrap_or_default()
})
.sum::<i64>()
+ tokenizer.count(model_family.base_instructions.as_str()),
)
let items_tokens = self.items.iter().fold(0i64, |acc, item| {
let serialized = serde_json::to_string(item).unwrap_or_default();
let item_tokens = i64::try_from(approx_token_count(&serialized)).unwrap_or(i64::MAX);
acc.saturating_add(item_tokens)
});
Some(base_tokens.saturating_add(items_tokens))
}
pub(crate) fn remove_first_item(&mut self) {

View file

@ -384,7 +384,7 @@ async fn manual_compact_uses_custom_prompt() {
}
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn manual_compact_emits_estimated_token_usage_event() {
async fn manual_compact_emits_api_and_local_token_usage_events() {
skip_if_no_network!();
let server = start_mock_server().await;

View file

@ -1,17 +0,0 @@
[package]
edition.workspace = true
name = "codex-utils-tokenizer"
version.workspace = true
[lints]
workspace = true
[dependencies]
anyhow = { workspace = true }
codex-utils-cache = { workspace = true }
thiserror = { workspace = true }
tiktoken-rs = { workspace = true }
tokio = { workspace = true }
[dev-dependencies]
pretty_assertions = { workspace = true }

View file

@ -1,186 +0,0 @@
use std::fmt;
use std::num::NonZeroUsize;
use std::sync::OnceLock;
use anyhow::Error as AnyhowError;
use codex_utils_cache::BlockingLruCache;
use thiserror::Error;
use tiktoken_rs::CoreBPE;
/// Local BPE encodings this crate knows how to load.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum EncodingKind {
    /// The `o200k_base` encoding.
    O200kBase,
    /// The `cl100k_base` encoding.
    Cl100kBase,
}

/// Writes the encoding's lowercase identifier (`o200k_base` / `cl100k_base`).
impl fmt::Display for EncodingKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Resolve the name first so there is a single write site.
        let name = match *self {
            EncodingKind::O200kBase => "o200k_base",
            EncodingKind::Cl100kBase => "cl100k_base",
        };
        f.write_str(name)
    }
}
/// Errors returned by [`Tokenizer`] construction and decoding.
#[derive(Debug, Error)]
pub enum TokenizerError {
/// The encoding tables for `kind` could not be loaded; returned by
/// `Tokenizer::new` when the underlying loader fails.
#[error("failed to load encoding {kind}")]
LoadEncoding {
// Encoding that was being loaded; interpolated into the error message.
kind: EncodingKind,
// Underlying loader failure, exposed as the error source.
#[source]
source: AnyhowError,
},
/// Token IDs could not be turned back into text; returned by
/// `Tokenizer::decode`.
#[error("failed to decode tokens")]
Decode {
// Underlying decode failure, exposed as the error source.
#[source]
source: AnyhowError,
},
}
/// Returns the process-wide tokenizer cache, keyed by model name.
///
/// The cache is created lazily on first access and constructed with a
/// capacity argument of 64.
fn model_cache() -> &'static BlockingLruCache<String, CoreBPE> {
    static MODEL_CACHE: OnceLock<BlockingLruCache<String, CoreBPE>> = OnceLock::new();
    MODEL_CACHE.get_or_init(|| {
        // 64 is non-zero, so the `MIN` fallback only exists to avoid an `unwrap`.
        let capacity = NonZeroUsize::new(64).unwrap_or(NonZeroUsize::MIN);
        BlockingLruCache::new(capacity)
    })
}
/// Fire-and-forget pre-warming of the tokenizer cache for `model`.
///
/// This is best-effort: nothing is guaranteed about the state of the cache
/// before or after the call, and any load error is discarded.
///
/// Only has an effect when called from within a Tokio runtime; outside of one
/// it returns immediately without doing anything.
pub fn warm_model_cache(model: &str) {
    let Ok(runtime) = tokio::runtime::Handle::try_current() else {
        return;
    };
    let model = model.to_owned();
    // `Tokenizer::for_model` is synchronous and goes through a blocking cache,
    // so run it on the blocking pool instead of tying up an async worker.
    runtime.spawn_blocking(move || {
        let _ = Tokenizer::for_model(&model);
    });
}
/// Thin wrapper around a `tiktoken_rs::CoreBPE` tokenizer.
///
/// Build one with `Tokenizer::new`, `Tokenizer::try_default`, or
/// `Tokenizer::for_model`.
#[derive(Clone)]
pub struct Tokenizer {
// The wrapped BPE implementation that does the actual encoding/decoding.
inner: CoreBPE,
}
impl Tokenizer {
    /// Loads the tokenizer for the given encoding.
    ///
    /// # Errors
    ///
    /// Returns [`TokenizerError::LoadEncoding`] when the encoding cannot be loaded.
    pub fn new(kind: EncodingKind) -> Result<Self, TokenizerError> {
        let loaded = match kind {
            EncodingKind::O200kBase => tiktoken_rs::o200k_base(),
            EncodingKind::Cl100kBase => tiktoken_rs::cl100k_base(),
        };
        match loaded {
            Ok(inner) => Ok(Self { inner }),
            Err(source) => Err(TokenizerError::LoadEncoding { kind, source }),
        }
    }

    /// Loads the default tokenizer, which uses the `O200kBase` encoding.
    ///
    /// # Errors
    ///
    /// Same as [`Tokenizer::new`].
    pub fn try_default() -> Result<Self, TokenizerError> {
        Self::new(EncodingKind::O200kBase)
    }

    /// Returns a tokenizer for an `OpenAI` model name, going through the
    /// shared model cache.
    ///
    /// Model names that cannot be mapped to an encoding fall back to
    /// `O200kBase`.
    ///
    /// # Errors
    ///
    /// Returns [`TokenizerError::LoadEncoding`] when the fallback encoding
    /// cannot be loaded.
    pub fn for_model(model: &str) -> Result<Self, TokenizerError> {
        let cache_key = model.to_owned();
        let inner = model_cache().get_or_try_insert_with(cache_key, || {
            // The lookup error is dropped on purpose: an unknown model is not a
            // failure, it just selects the fallback encoding.
            tiktoken_rs::get_bpe_from_model(model).or_else(|_unknown_model| {
                Self::new(EncodingKind::O200kBase).map(|fallback| fallback.inner)
            })
        })?;
        Ok(Self { inner })
    }

    /// Encodes `text` into token IDs.
    ///
    /// When `with_special_tokens` is true, special tokens are allowed and may
    /// appear in the result; otherwise the text is encoded as ordinary text.
    #[must_use]
    pub fn encode(&self, text: &str, with_special_tokens: bool) -> Vec<i32> {
        let ranks = if with_special_tokens {
            self.inner.encode_with_special_tokens(text)
        } else {
            self.inner.encode_ordinary(text)
        };
        let mut ids = Vec::with_capacity(ranks.len());
        for rank in ranks {
            // Plain `as` conversion into the signed ID type used by callers.
            ids.push(rank as i32);
        }
        ids
    }

    /// Returns the number of ordinary tokens in `text` as a signed integer,
    /// saturating at `i64::MAX`.
    #[must_use]
    pub fn count(&self, text: &str) -> i64 {
        let token_total = self.inner.encode_ordinary(text).len();
        i64::try_from(token_total).unwrap_or(i64::MAX)
    }

    /// Decodes token IDs back into text.
    ///
    /// # Errors
    ///
    /// Returns [`TokenizerError::Decode`] when the underlying decoder fails.
    pub fn decode(&self, tokens: &[i32]) -> Result<String, TokenizerError> {
        let ranks = tokens.iter().map(|&id| id as u32).collect::<Vec<u32>>();
        match self.inner.decode(ranks) {
            Ok(text) => Ok(text),
            Err(source) => Err(TokenizerError::Decode { source }),
        }
    }
}
/// Debug output is a fixed placeholder; the wrapped `CoreBPE` is not printed.
impl fmt::Debug for Tokenizer {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("Tokenizer { inner: <CoreBPE> }")
    }
}
// Unit tests for encoding/decoding round-trips, model-name resolution, and the
// cache warm-up helper.
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
// Encodes a short string with cl100k_base, pins the exact token IDs, and
// checks that decoding returns the original text.
#[test]
fn cl100k_base_roundtrip_simple() -> Result<(), TokenizerError> {
let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
let s = "hello world";
let ids = tok.encode(s, false);
// Stable expectation for cl100k_base
assert_eq!(ids, vec![15339, 1917]);
let back = tok.decode(&ids)?;
assert_eq!(back, s);
Ok(())
}
// Round-trips the same text through both encode modes (special tokens
// disallowed and allowed) and checks the decoded text is unchanged.
#[test]
fn preserves_whitespace_and_special_tokens_flag() -> Result<(), TokenizerError> {
let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
let s = "This has multiple spaces";
let ids_no_special = tok.encode(s, false);
let round = tok.decode(&ids_no_special)?;
assert_eq!(round, s);
// With special tokens allowed, result may be identical for normal text,
// but the API should still function.
let ids_with_special = tok.encode(s, true);
let round2 = tok.decode(&ids_with_special)?;
assert_eq!(round2, s);
Ok(())
}
// Resolves a tokenizer by model name and checks it can round-trip text.
#[test]
fn model_mapping_builds_tokenizer() -> Result<(), TokenizerError> {
// `for_model` falls back to o200k_base for names it cannot map, so this
// only proves that a usable tokenizer is returned for "gpt-5.1".
// NOTE(review): an earlier comment said this alias maps to cl100k_base;
// nothing in this test establishes which encoding is chosen -- confirm.
let tok = Tokenizer::for_model("gpt-5.1")?;
let ids = tok.encode("ok", false);
let back = tok.decode(&ids)?;
assert_eq!(back, "ok");
Ok(())
}
// An unrecognized model name must produce the same token IDs as an
// explicitly constructed o200k_base tokenizer.
#[test]
fn unknown_model_defaults_to_o200k_base() -> Result<(), TokenizerError> {
let fallback = Tokenizer::new(EncodingKind::O200kBase)?;
let tok = Tokenizer::for_model("does-not-exist")?;
let text = "fallback please";
assert_eq!(tok.encode(text, false), fallback.encode(text, false));
Ok(())
}
// A plain `#[test]` has no Tokio runtime, so `warm_model_cache` should
// return early; the test passes as long as the call does not panic.
#[test]
fn warm_model_cache_without_runtime_is_noop() {
warm_model_cache("gpt-5");
}
}