Ollama max_tokens settings (#17025)

- Support `available_models` for Ollama
- Clamp default max tokens (context length) to 16384.
- Add documentation for ollama context configuration.
Peter Tripp 2024-08-30 12:52:00 +00:00 committed by GitHub
parent d401ab1efc
commit b62e63349b
5 changed files with 92 additions and 35 deletions
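With `available_models`, a model that Ollama does not report (or one that should use a larger context window than the clamped default) can be declared in Zed's `settings.json`. A minimal sketch, assuming the provider block lives under the `language_models` key read by `AllLanguageModelSettings`; the model name, display name, and token count below are illustrative:

{
  "language_models": {
    "ollama": {
      "api_url": "http://localhost:11434",
      "available_models": [
        {
          "name": "qwen2:latest",
          "display_name": "Qwen 2 (32K)",
          "max_tokens": 32768
        }
      ]
    }
  }
}

An explicit `max_tokens` supplied this way is used as-is, so it is not subject to the 16384 clamp applied to the built-in defaults below.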


@@ -135,6 +135,7 @@ impl AssistantSettingsContent {
Some(language_model::settings::OllamaSettingsContent {
api_url,
low_speed_timeout_in_seconds,
+ available_models: None,
});
}
},
@@ -295,7 +296,7 @@ impl AssistantSettingsContent {
_ => (None, None),
};
settings.provider = Some(AssistantProviderContentV1::Ollama {
- default_model: Some(ollama::Model::new(&model)),
+ default_model: Some(ollama::Model::new(&model, None, None)),
api_url,
low_speed_timeout_in_seconds,
});


@@ -6,8 +6,10 @@ use ollama::{
get_models, preload_model, stream_chat_completion, ChatMessage, ChatOptions, ChatRequest,
ChatResponseDelta, OllamaToolCall,
};
+ use schemars::JsonSchema;
+ use serde::{Deserialize, Serialize};
use settings::{Settings, SettingsStore};
- use std::{sync::Arc, time::Duration};
+ use std::{collections::BTreeMap, sync::Arc, time::Duration};
use ui::{prelude::*, ButtonLike, Indicator};
use util::ResultExt;
@@ -28,6 +30,17 @@ const PROVIDER_NAME: &str = "Ollama";
pub struct OllamaSettings {
pub api_url: String,
pub low_speed_timeout: Option<Duration>,
+ pub available_models: Vec<AvailableModel>,
}
+ #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, JsonSchema)]
+ pub struct AvailableModel {
+ /// The model name in the Ollama API (e.g. "llama3.1:latest")
+ pub name: String,
+ /// The model's name in Zed's UI, such as in the model selector dropdown menu in the assistant panel.
+ pub display_name: Option<String>,
+ /// The Context Length parameter to the model (aka num_ctx or n_ctx)
+ pub max_tokens: usize,
+ }
pub struct OllamaLanguageModelProvider {
@@ -61,7 +74,7 @@ impl State {
// indicating which models are embedding models,
// simply filter out models with "-embed" in their name
.filter(|model| !model.name.contains("-embed"))
- .map(|model| ollama::Model::new(&model.name))
+ .map(|model| ollama::Model::new(&model.name, None, None))
.collect();
models.sort_by(|a, b| a.name.cmp(&b.name));
@@ -123,10 +136,32 @@ impl LanguageModelProvider for OllamaLanguageModelProvider {
}
fn provided_models(&self, cx: &AppContext) -> Vec<Arc<dyn LanguageModel>> {
- self.state
- .read(cx)
+ let mut models: BTreeMap<String, ollama::Model> = BTreeMap::default();
+ // Add models from the Ollama API
+ for model in self.state.read(cx).available_models.iter() {
+ models.insert(model.name.clone(), model.clone());
+ }
+ // Override with available models from settings
+ for model in AllLanguageModelSettings::get_global(cx)
+ .ollama
.available_models
.iter()
+ {
+ models.insert(
+ model.name.clone(),
+ ollama::Model {
+ name: model.name.clone(),
+ display_name: model.display_name.clone(),
+ max_tokens: model.max_tokens,
+ keep_alive: None,
+ },
+ );
+ }
+ models
+ .into_values()
.map(|model| {
Arc::new(OllamaLanguageModel {
id: LanguageModelId::from(model.name.clone()),
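The two passes above make settings-declared models take precedence over models reported by the Ollama API, simply because `BTreeMap::insert` replaces any existing value for the same key. A standalone sketch of that ordering guarantee (the model name and token counts are illustrative, not taken from the diff):

use std::collections::BTreeMap;

fn main() {
    let mut models: BTreeMap<String, usize> = BTreeMap::default();
    // First pass: a model discovered via the Ollama API, with the built-in default.
    models.insert("llama3.1:latest".to_string(), 16384);
    // Second pass: the same model declared in settings wins because it is inserted later.
    models.insert("llama3.1:latest".to_string(), 32768);
    assert_eq!(models["llama3.1:latest"], 32768);
}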


@@ -152,6 +152,7 @@ pub struct AnthropicSettingsContentV1 {
pub struct OllamaSettingsContent {
pub api_url: Option<String>,
pub low_speed_timeout_in_seconds: Option<u64>,
+ pub available_models: Option<Vec<provider::ollama::AvailableModel>>,
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
@@ -276,6 +277,9 @@ impl settings::Settings for AllLanguageModelSettings {
anthropic.as_ref().and_then(|s| s.available_models.clone()),
);
+ // Ollama
+ let ollama = value.ollama.clone();
merge(
&mut settings.ollama.api_url,
value.ollama.as_ref().and_then(|s| s.api_url.clone()),
@@ -288,6 +292,10 @@ impl settings::Settings for AllLanguageModelSettings {
settings.ollama.low_speed_timeout =
Some(Duration::from_secs(low_speed_timeout_in_seconds));
}
+ merge(
+ &mut settings.ollama.available_models,
+ ollama.as_ref().and_then(|s| s.available_models.clone()),
+ );
// OpenAI
let (openai, upgraded) = match value.openai.clone().map(|s| s.upgrade()) {


@@ -66,40 +66,37 @@ impl Default for KeepAlive {
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
pub struct Model {
pub name: String,
+ pub display_name: Option<String>,
pub max_tokens: usize,
pub keep_alive: Option<KeepAlive>,
}
// This could be dynamically retrieved via the API (1 call per model)
// curl -s http://localhost:11434/api/show -d '{"model": "llama3.1:latest"}' | jq '.model_info."llama.context_length"'
fn get_max_tokens(name: &str) -> usize {
- match name {
- "dolphin-llama3:8b-256k" => 262144, // 256K
- _ => match name.split(':').next().unwrap() {
- "mistral-nemo" => 1024000, // 1M
- "deepseek-coder-v2" => 163840, // 160K
- "llama3.1" | "phi3" | "command-r" | "command-r-plus" => 131072, // 128K
- "codeqwen" => 65536, // 64K
- "mistral" | "mistral-large" | "dolphin-mistral" | "codestral" // 32K
- | "mistral-openorca" | "dolphin-mixtral" | "mixstral" | "llava"
- | "qwen" | "qwen2" | "wizardlm2" | "wizard-math" => 32768,
- "codellama" | "stable-code" | "deepseek-coder" | "starcoder2" // 16K
- | "wizardcoder" => 16384,
- "llama3" | "gemma2" | "gemma" | "codegemma" | "dolphin-llama3" // 8K
- | "llava-llama3" | "starcoder" | "openchat" | "aya" => 8192,
- "llama2" | "yi" | "llama2-chinese" | "vicuna" | "nous-hermes2" // 4K
- | "stablelm2" => 4096,
- "phi" | "orca-mini" | "tinyllama" | "granite-code" => 2048, // 2K
- _ => 2048, // 2K (default)
- },
+ /// Default context length for unknown models.
+ const DEFAULT_TOKENS: usize = 2048;
+ /// Magic number. Lets many Ollama models work with ~16GB of ram.
+ const MAXIMUM_TOKENS: usize = 16384;
+ match name.split(':').next().unwrap() {
+ "phi" | "tinyllama" | "granite-code" => 2048,
+ "llama2" | "yi" | "vicuna" | "stablelm2" => 4096,
+ "llama3" | "gemma2" | "gemma" | "codegemma" | "starcoder" | "aya" => 8192,
+ "codellama" | "starcoder2" => 16384,
+ "mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "dolphin-mixtral" => 32768,
+ "llama3.1" | "phi3" | "phi3.5" | "command-r" | "deepseek-coder-v2" => 128000,
+ _ => DEFAULT_TOKENS,
}
+ .clamp(1, MAXIMUM_TOKENS)
}
impl Model {
- pub fn new(name: &str) -> Self {
+ pub fn new(name: &str, display_name: Option<&str>, max_tokens: Option<usize>) -> Self {
Self {
name: name.to_owned(),
- max_tokens: get_max_tokens(name),
+ display_name: display_name
+ .map(ToString::to_string)
+ .or_else(|| name.strip_suffix(":latest").map(ToString::to_string)),
+ max_tokens: max_tokens.unwrap_or_else(|| get_max_tokens(name)),
keep_alive: Some(KeepAlive::indefinite()),
}
}
@@ -109,7 +106,7 @@ impl Model {
}
pub fn display_name(&self) -> &str {
- &self.name
+ self.display_name.as_ref().unwrap_or(&self.name)
}
pub fn max_token_count(&self) -> usize {
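Taken together, the per-model defaults and the clamp mean that any model whose known context window exceeds 16384 tokens now starts at 16384, while an explicit `max_tokens` from settings bypasses both the lookup and the clamp. A hypothetical test (not part of this commit) sketching the expected behavior of the functions above:

#[test]
fn ollama_context_length_defaults() {
    // "llama3.1" maps to 128000, but the default is clamped to MAXIMUM_TOKENS (16384).
    assert_eq!(get_max_tokens("llama3.1:latest"), 16384);
    // Unknown models fall back to DEFAULT_TOKENS (2048).
    assert_eq!(get_max_tokens("some-unknown-model"), 2048);

    // An explicit max_tokens (e.g. from `available_models` in settings) is used as-is.
    let model = Model::new("llama3.1:latest", None, Some(32768));
    assert_eq!(model.max_tokens, 32768);
    // Without an explicit display_name, a ":latest" suffix is stripped for the UI.
    assert_eq!(model.display_name(), "llama3.1");
}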