Ollama max_tokens settings (#17025)
- Support `available_models` for Ollama - Clamp default max tokens (context length) to 16384. - Add documentation for ollama context configuration.
This commit is contained in:
parent
d401ab1efc
commit
b62e63349b
5 changed files with 92 additions and 35 deletions
|
@ -66,40 +66,37 @@ impl Default for KeepAlive {
|
|||
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
|
||||
pub struct Model {
|
||||
pub name: String,
|
||||
pub display_name: Option<String>,
|
||||
pub max_tokens: usize,
|
||||
pub keep_alive: Option<KeepAlive>,
|
||||
}
|
||||
|
||||
// This could be dynamically retrieved via the API (1 call per model)
|
||||
// curl -s http://localhost:11434/api/show -d '{"model": "llama3.1:latest"}' | jq '.model_info."llama.context_length"'
|
||||
fn get_max_tokens(name: &str) -> usize {
|
||||
match name {
|
||||
"dolphin-llama3:8b-256k" => 262144, // 256K
|
||||
_ => match name.split(':').next().unwrap() {
|
||||
"mistral-nemo" => 1024000, // 1M
|
||||
"deepseek-coder-v2" => 163840, // 160K
|
||||
"llama3.1" | "phi3" | "command-r" | "command-r-plus" => 131072, // 128K
|
||||
"codeqwen" => 65536, // 64K
|
||||
"mistral" | "mistral-large" | "dolphin-mistral" | "codestral" // 32K
|
||||
| "mistral-openorca" | "dolphin-mixtral" | "mixstral" | "llava"
|
||||
| "qwen" | "qwen2" | "wizardlm2" | "wizard-math" => 32768,
|
||||
"codellama" | "stable-code" | "deepseek-coder" | "starcoder2" // 16K
|
||||
| "wizardcoder" => 16384,
|
||||
"llama3" | "gemma2" | "gemma" | "codegemma" | "dolphin-llama3" // 8K
|
||||
| "llava-llama3" | "starcoder" | "openchat" | "aya" => 8192,
|
||||
"llama2" | "yi" | "llama2-chinese" | "vicuna" | "nous-hermes2" // 4K
|
||||
| "stablelm2" => 4096,
|
||||
"phi" | "orca-mini" | "tinyllama" | "granite-code" => 2048, // 2K
|
||||
_ => 2048, // 2K (default)
|
||||
},
|
||||
/// Default context length for unknown models.
|
||||
const DEFAULT_TOKENS: usize = 2048;
|
||||
/// Magic number. Lets many Ollama models work with ~16GB of ram.
|
||||
const MAXIMUM_TOKENS: usize = 16384;
|
||||
|
||||
match name.split(':').next().unwrap() {
|
||||
"phi" | "tinyllama" | "granite-code" => 2048,
|
||||
"llama2" | "yi" | "vicuna" | "stablelm2" => 4096,
|
||||
"llama3" | "gemma2" | "gemma" | "codegemma" | "starcoder" | "aya" => 8192,
|
||||
"codellama" | "starcoder2" => 16384,
|
||||
"mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "dolphin-mixtral" => 32768,
|
||||
"llama3.1" | "phi3" | "phi3.5" | "command-r" | "deepseek-coder-v2" => 128000,
|
||||
_ => DEFAULT_TOKENS,
|
||||
}
|
||||
.clamp(1, MAXIMUM_TOKENS)
|
||||
}
|
||||
|
||||
impl Model {
|
||||
pub fn new(name: &str) -> Self {
|
||||
pub fn new(name: &str, display_name: Option<&str>, max_tokens: Option<usize>) -> Self {
|
||||
Self {
|
||||
name: name.to_owned(),
|
||||
max_tokens: get_max_tokens(name),
|
||||
display_name: display_name
|
||||
.map(ToString::to_string)
|
||||
.or_else(|| name.strip_suffix(":latest").map(ToString::to_string)),
|
||||
max_tokens: max_tokens.unwrap_or_else(|| get_max_tokens(name)),
|
||||
keep_alive: Some(KeepAlive::indefinite()),
|
||||
}
|
||||
}
|
||||
|
@ -109,7 +106,7 @@ impl Model {
|
|||
}
|
||||
|
||||
pub fn display_name(&self) -> &str {
|
||||
&self.name
|
||||
self.display_name.as_ref().unwrap_or(&self.name)
|
||||
}
|
||||
|
||||
pub fn max_token_count(&self) -> usize {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue