Standardize on u64 for token counts (#32869)

Previously we were using a mix of `u32` and `usize`, e.g. `max_tokens:
usize, max_output_tokens: Option<u32>` in the same `struct`.

Although [tiktoken](https://github.com/openai/tiktoken) uses `usize`,
it is not the right choice for token counts. Token counts should be
consistent across targets (e.g. the same model shouldn't suddenly get a
smaller context window just because you're compiling for wasm32), and
they could end up being serialized using a binary protocol.

I chose to standardize on `u64` over `u32` because we don't store many
of them (so the extra size should be insignificant) and future models
may exceed `u32::MAX` tokens.

Release Notes:

- N/A
This commit is contained in:
Richard Feldman 2025-06-17 10:43:07 -04:00 committed by GitHub
parent a391d67366
commit 5405c2c2d3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 191 additions and 192 deletions

View file

@ -35,18 +35,18 @@ impl Default for KeepAlive {
pub struct Model {
pub name: String,
pub display_name: Option<String>,
pub max_tokens: usize,
pub max_tokens: u64,
pub keep_alive: Option<KeepAlive>,
pub supports_tools: Option<bool>,
pub supports_vision: Option<bool>,
pub supports_thinking: Option<bool>,
}
fn get_max_tokens(name: &str) -> usize {
fn get_max_tokens(name: &str) -> u64 {
/// Default context length for unknown models.
const DEFAULT_TOKENS: usize = 4096;
const DEFAULT_TOKENS: u64 = 4096;
/// Magic number. Lets many Ollama models work with ~16GB of ram.
const MAXIMUM_TOKENS: usize = 16384;
const MAXIMUM_TOKENS: u64 = 16384;
match name.split(':').next().unwrap() {
"phi" | "tinyllama" | "granite-code" => 2048,
@ -67,7 +67,7 @@ impl Model {
pub fn new(
name: &str,
display_name: Option<&str>,
max_tokens: Option<usize>,
max_tokens: Option<u64>,
supports_tools: Option<bool>,
supports_vision: Option<bool>,
supports_thinking: Option<bool>,
@ -93,7 +93,7 @@ impl Model {
self.display_name.as_ref().unwrap_or(&self.name)
}
pub fn max_token_count(&self) -> usize {
pub fn max_token_count(&self) -> u64 {
self.max_tokens
}
}
@ -165,7 +165,7 @@ impl ChatRequest {
// https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
#[derive(Serialize, Default, Debug)]
pub struct ChatOptions {
pub num_ctx: Option<usize>,
pub num_ctx: Option<u64>,
pub num_predict: Option<isize>,
pub stop: Option<Vec<String>>,
pub temperature: Option<f32>,
@ -183,8 +183,8 @@ pub struct ChatResponseDelta {
pub done_reason: Option<String>,
#[allow(unused)]
pub done: bool,
pub prompt_eval_count: Option<u32>,
pub eval_count: Option<u32>,
pub prompt_eval_count: Option<u64>,
pub eval_count: Option<u64>,
}
#[derive(Serialize, Deserialize)]