Delete tiktoken-rs (#7018)
This commit is contained in:
parent
397279d46e
commit
52d0ec4cd8
9 changed files with 17 additions and 283 deletions
51
codex-rs/Cargo.lock
generated
51
codex-rs/Cargo.lock
generated
|
|
@ -260,7 +260,7 @@ dependencies = [
|
|||
"memchr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"syn 2.0.104",
|
||||
|
|
@ -1108,7 +1108,6 @@ dependencies = [
|
|||
"codex-utils-pty",
|
||||
"codex-utils-readiness",
|
||||
"codex-utils-string",
|
||||
"codex-utils-tokenizer",
|
||||
"codex-windows-sandbox",
|
||||
"core-foundation 0.9.4",
|
||||
"core_test_support",
|
||||
|
|
@ -1628,18 +1627,6 @@ dependencies = [
|
|||
name = "codex-utils-string"
|
||||
version = "0.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "codex-utils-tokenizer"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"codex-utils-cache",
|
||||
"pretty_assertions",
|
||||
"thiserror 2.0.17",
|
||||
"tiktoken-rs",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "codex-windows-sandbox"
|
||||
version = "0.1.0"
|
||||
|
|
@ -2461,17 +2448,6 @@ dependencies = [
|
|||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fancy-regex"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2"
|
||||
dependencies = [
|
||||
"bit-set",
|
||||
"regex-automata",
|
||||
"regex-syntax 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
|
|
@ -4796,7 +4772,7 @@ dependencies = [
|
|||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"socket2 0.6.0",
|
||||
"thiserror 2.0.17",
|
||||
|
|
@ -4816,7 +4792,7 @@ dependencies = [
|
|||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
"rustc-hash 2.1.1",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"slab",
|
||||
|
|
@ -5161,12 +5137,6 @@ version = "0.1.25"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
|
|
@ -6387,21 +6357,6 @@ dependencies = [
|
|||
"zune-jpeg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiktoken-rs"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a19830747d9034cd9da43a60eaa8e552dfda7712424aebf187b7a60126bae0d"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64",
|
||||
"bstr",
|
||||
"fancy-regex",
|
||||
"lazy_static",
|
||||
"regex",
|
||||
"rustc-hash 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.44"
|
||||
|
|
|
|||
|
|
@ -41,7 +41,6 @@ members = [
|
|||
"utils/pty",
|
||||
"utils/readiness",
|
||||
"utils/string",
|
||||
"utils/tokenizer",
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
|
|
@ -90,7 +89,6 @@ codex-utils-json-to-toml = { path = "utils/json-to-toml" }
|
|||
codex-utils-pty = { path = "utils/pty" }
|
||||
codex-utils-readiness = { path = "utils/readiness" }
|
||||
codex-utils-string = { path = "utils/string" }
|
||||
codex-utils-tokenizer = { path = "utils/tokenizer" }
|
||||
codex-windows-sandbox = { path = "windows-sandbox-rs" }
|
||||
core_test_support = { path = "core/tests/common" }
|
||||
mcp-types = { path = "mcp-types" }
|
||||
|
|
@ -190,7 +188,6 @@ tempfile = "3.23.0"
|
|||
test-log = "0.2.18"
|
||||
textwrap = "0.16.2"
|
||||
thiserror = "2.0.17"
|
||||
tiktoken-rs = "0.9"
|
||||
time = "0.3"
|
||||
tiny_http = "0.12"
|
||||
tokio = "1"
|
||||
|
|
@ -268,7 +265,6 @@ ignored = [
|
|||
"icu_provider",
|
||||
"openssl-sys",
|
||||
"codex-utils-readiness",
|
||||
"codex-utils-tokenizer",
|
||||
]
|
||||
|
||||
[profile.release]
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ codex-rmcp-client = { workspace = true }
|
|||
codex-utils-pty = { workspace = true }
|
||||
codex-utils-readiness = { workspace = true }
|
||||
codex-utils-string = { workspace = true }
|
||||
codex-utils-tokenizer = { workspace = true }
|
||||
codex-windows-sandbox = { package = "codex-windows-sandbox", path = "../windows-sandbox-rs" }
|
||||
dirs = { workspace = true }
|
||||
dunce = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -133,7 +133,6 @@ use codex_protocol::protocol::InitialHistory;
|
|||
use codex_protocol::user_input::UserInput;
|
||||
use codex_utils_readiness::Readiness;
|
||||
use codex_utils_readiness::ReadinessFlag;
|
||||
use codex_utils_tokenizer::warm_model_cache;
|
||||
|
||||
/// The high-level interface to the Codex system.
|
||||
/// It operates as a queue pair where you send submissions and receive events.
|
||||
|
|
@ -555,9 +554,6 @@ impl Session {
|
|||
// Create the mutable state for the Session.
|
||||
let state = SessionState::new(session_configuration.clone());
|
||||
|
||||
// Warm the tokenizer cache for the session model without blocking startup.
|
||||
warm_model_cache(&session_configuration.model);
|
||||
|
||||
let services = SessionServices {
|
||||
mcp_connection_manager: Arc::new(RwLock::new(McpConnectionManager::default())),
|
||||
mcp_startup_cancellation_token: CancellationToken::new(),
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ use crate::config::types::Notice;
|
|||
use anyhow::Context;
|
||||
use codex_protocol::config_types::ReasoningEffort;
|
||||
use codex_protocol::config_types::TrustLevel;
|
||||
use codex_utils_tokenizer::warm_model_cache;
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
|
|
@ -231,9 +230,6 @@ impl ConfigDocument {
|
|||
fn apply(&mut self, edit: &ConfigEdit) -> anyhow::Result<bool> {
|
||||
match edit {
|
||||
ConfigEdit::SetModel { model, effort } => Ok({
|
||||
if let Some(model) = &model {
|
||||
warm_model_cache(model)
|
||||
}
|
||||
let mut mutated = false;
|
||||
mutated |= self.write_profile_value(
|
||||
&["model"],
|
||||
|
|
|
|||
|
|
@ -1,13 +1,13 @@
|
|||
use crate::codex::TurnContext;
|
||||
use crate::context_manager::normalize;
|
||||
use crate::truncate::TruncationPolicy;
|
||||
use crate::truncate::approx_token_count;
|
||||
use crate::truncate::truncate_function_output_items_with_policy;
|
||||
use crate::truncate::truncate_text;
|
||||
use codex_protocol::models::FunctionCallOutputPayload;
|
||||
use codex_protocol::models::ResponseItem;
|
||||
use codex_protocol::protocol::TokenUsage;
|
||||
use codex_protocol::protocol::TokenUsageInfo;
|
||||
use codex_utils_tokenizer::Tokenizer;
|
||||
use std::ops::Deref;
|
||||
|
||||
/// Transcript of conversation history
|
||||
|
|
@ -74,26 +74,21 @@ impl ContextManager {
|
|||
history
|
||||
}
|
||||
|
||||
// Estimate the number of tokens in the history. Return None if no tokenizer
|
||||
// is available. This does not consider the reasoning traces.
|
||||
// /!\ The value is a lower bound estimate and does not represent the exact
|
||||
// context length.
|
||||
// Estimate token usage using byte-based heuristics from the truncation helpers.
|
||||
// This is a coarse lower bound, not a tokenizer-accurate count.
|
||||
pub(crate) fn estimate_token_count(&self, turn_context: &TurnContext) -> Option<i64> {
|
||||
let model = turn_context.client.get_model();
|
||||
let tokenizer = Tokenizer::for_model(model.as_str()).ok()?;
|
||||
let model_family = turn_context.client.get_model_family();
|
||||
let base_tokens =
|
||||
i64::try_from(approx_token_count(model_family.base_instructions.as_str()))
|
||||
.unwrap_or(i64::MAX);
|
||||
|
||||
Some(
|
||||
self.items
|
||||
.iter()
|
||||
.map(|item| {
|
||||
serde_json::to_string(&item)
|
||||
.map(|item| tokenizer.count(&item))
|
||||
.unwrap_or_default()
|
||||
})
|
||||
.sum::<i64>()
|
||||
+ tokenizer.count(model_family.base_instructions.as_str()),
|
||||
)
|
||||
let items_tokens = self.items.iter().fold(0i64, |acc, item| {
|
||||
let serialized = serde_json::to_string(item).unwrap_or_default();
|
||||
let item_tokens = i64::try_from(approx_token_count(&serialized)).unwrap_or(i64::MAX);
|
||||
acc.saturating_add(item_tokens)
|
||||
});
|
||||
|
||||
Some(base_tokens.saturating_add(items_tokens))
|
||||
}
|
||||
|
||||
pub(crate) fn remove_first_item(&mut self) {
|
||||
|
|
|
|||
|
|
@ -384,7 +384,7 @@ async fn manual_compact_uses_custom_prompt() {
|
|||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn manual_compact_emits_estimated_token_usage_event() {
|
||||
async fn manual_compact_emits_api_and_local_token_usage_events() {
|
||||
skip_if_no_network!();
|
||||
|
||||
let server = start_mock_server().await;
|
||||
|
|
|
|||
|
|
@ -1,17 +0,0 @@
|
|||
[package]
|
||||
edition.workspace = true
|
||||
name = "codex-utils-tokenizer"
|
||||
version.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = { workspace = true }
|
||||
codex-utils-cache = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tiktoken-rs = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = { workspace = true }
|
||||
|
|
@ -1,186 +0,0 @@
|
|||
use std::fmt;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use anyhow::Error as AnyhowError;
|
||||
use codex_utils_cache::BlockingLruCache;
|
||||
use thiserror::Error;
|
||||
use tiktoken_rs::CoreBPE;
|
||||
|
||||
/// Supported local encodings.
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||
pub enum EncodingKind {
|
||||
O200kBase,
|
||||
Cl100kBase,
|
||||
}
|
||||
|
||||
impl fmt::Display for EncodingKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::O200kBase => f.write_str("o200k_base"),
|
||||
Self::Cl100kBase => f.write_str("cl100k_base"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tokenizer error type.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum TokenizerError {
|
||||
#[error("failed to load encoding {kind}")]
|
||||
LoadEncoding {
|
||||
kind: EncodingKind,
|
||||
#[source]
|
||||
source: AnyhowError,
|
||||
},
|
||||
#[error("failed to decode tokens")]
|
||||
Decode {
|
||||
#[source]
|
||||
source: AnyhowError,
|
||||
},
|
||||
}
|
||||
|
||||
fn model_cache() -> &'static BlockingLruCache<String, CoreBPE> {
|
||||
static MODEL_CACHE: OnceLock<BlockingLruCache<String, CoreBPE>> = OnceLock::new();
|
||||
MODEL_CACHE
|
||||
.get_or_init(|| BlockingLruCache::new(NonZeroUsize::new(64).unwrap_or(NonZeroUsize::MIN)))
|
||||
}
|
||||
|
||||
/// Fire-and-forget function used to pre-warm model tokenizer loading. This is done
|
||||
/// on a best-effort basis, without any guarantee about the state of the cache
|
||||
/// before or after.
|
||||
/// Only working in Tokio runtimes
|
||||
pub fn warm_model_cache(model: &str) {
|
||||
if tokio::runtime::Handle::try_current().is_err() {
|
||||
return;
|
||||
}
|
||||
let model = model.to_string();
|
||||
tokio::spawn(async move {
|
||||
let _ = Tokenizer::for_model(&model);
|
||||
});
|
||||
}
|
||||
|
||||
/// Thin wrapper around a `tiktoken_rs::CoreBPE` tokenizer.
|
||||
#[derive(Clone)]
|
||||
pub struct Tokenizer {
|
||||
inner: CoreBPE,
|
||||
}
|
||||
|
||||
impl Tokenizer {
|
||||
/// Build a tokenizer for a specific encoding.
|
||||
pub fn new(kind: EncodingKind) -> Result<Self, TokenizerError> {
|
||||
let loader: fn() -> anyhow::Result<CoreBPE> = match kind {
|
||||
EncodingKind::O200kBase => tiktoken_rs::o200k_base,
|
||||
EncodingKind::Cl100kBase => tiktoken_rs::cl100k_base,
|
||||
};
|
||||
|
||||
let inner = loader().map_err(|source| TokenizerError::LoadEncoding { kind, source })?;
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Default to `O200kBase`
|
||||
pub fn try_default() -> Result<Self, TokenizerError> {
|
||||
Self::new(EncodingKind::O200kBase)
|
||||
}
|
||||
|
||||
/// Build a tokenizer using an `OpenAI` model name (maps to an encoding).
|
||||
/// Falls back to the `O200kBase` encoding when the model is unknown.
|
||||
pub fn for_model(model: &str) -> Result<Self, TokenizerError> {
|
||||
let inner = model_cache().get_or_try_insert_with(model.to_owned(), || {
|
||||
match tiktoken_rs::get_bpe_from_model(model) {
|
||||
Ok(inner) => Ok(inner),
|
||||
Err(_model_error) => Tokenizer::new(EncodingKind::O200kBase).map(|e| e.inner),
|
||||
}
|
||||
})?;
|
||||
Ok(Self { inner })
|
||||
}
|
||||
|
||||
/// Encode text to token IDs. If `with_special_tokens` is true, special
|
||||
/// tokens are allowed and may appear in the result.
|
||||
#[must_use]
|
||||
pub fn encode(&self, text: &str, with_special_tokens: bool) -> Vec<i32> {
|
||||
let raw = if with_special_tokens {
|
||||
self.inner.encode_with_special_tokens(text)
|
||||
} else {
|
||||
self.inner.encode_ordinary(text)
|
||||
};
|
||||
raw.into_iter().map(|t| t as i32).collect()
|
||||
}
|
||||
|
||||
/// Count tokens in `text` as a signed integer.
|
||||
#[must_use]
|
||||
pub fn count(&self, text: &str) -> i64 {
|
||||
// Signed length to satisfy our style preference.
|
||||
i64::try_from(self.inner.encode_ordinary(text).len()).unwrap_or(i64::MAX)
|
||||
}
|
||||
|
||||
/// Decode token IDs back to text.
|
||||
pub fn decode(&self, tokens: &[i32]) -> Result<String, TokenizerError> {
|
||||
let raw: Vec<u32> = tokens.iter().map(|t| *t as u32).collect();
|
||||
self.inner
|
||||
.decode(raw)
|
||||
.map_err(|source| TokenizerError::Decode { source })
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Tokenizer {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Tokenizer {{ inner: <CoreBPE> }}")
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
#[test]
|
||||
fn cl100k_base_roundtrip_simple() -> Result<(), TokenizerError> {
|
||||
let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
|
||||
let s = "hello world";
|
||||
let ids = tok.encode(s, false);
|
||||
// Stable expectation for cl100k_base
|
||||
assert_eq!(ids, vec![15339, 1917]);
|
||||
let back = tok.decode(&ids)?;
|
||||
assert_eq!(back, s);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn preserves_whitespace_and_special_tokens_flag() -> Result<(), TokenizerError> {
|
||||
let tok = Tokenizer::new(EncodingKind::Cl100kBase)?;
|
||||
let s = "This has multiple spaces";
|
||||
let ids_no_special = tok.encode(s, false);
|
||||
let round = tok.decode(&ids_no_special)?;
|
||||
assert_eq!(round, s);
|
||||
|
||||
// With special tokens allowed, result may be identical for normal text,
|
||||
// but the API should still function.
|
||||
let ids_with_special = tok.encode(s, true);
|
||||
let round2 = tok.decode(&ids_with_special)?;
|
||||
assert_eq!(round2, s);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn model_mapping_builds_tokenizer() -> Result<(), TokenizerError> {
|
||||
// Choose a long-standing model alias that maps to cl100k_base.
|
||||
let tok = Tokenizer::for_model("gpt-5.1")?;
|
||||
let ids = tok.encode("ok", false);
|
||||
let back = tok.decode(&ids)?;
|
||||
assert_eq!(back, "ok");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unknown_model_defaults_to_o200k_base() -> Result<(), TokenizerError> {
|
||||
let fallback = Tokenizer::new(EncodingKind::O200kBase)?;
|
||||
let tok = Tokenizer::for_model("does-not-exist")?;
|
||||
let text = "fallback please";
|
||||
assert_eq!(tok.encode(text, false), fallback.encode(text, false));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn warm_model_cache_without_runtime_is_noop() {
|
||||
warm_model_cache("gpt-5");
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue