diff --git a/crates/assistant/src/assistant_settings.rs b/crates/assistant/src/assistant_settings.rs
index 9d89fe2920..478598edbe 100644
--- a/crates/assistant/src/assistant_settings.rs
+++ b/crates/assistant/src/assistant_settings.rs
@@ -135,6 +135,7 @@ impl AssistantSettingsContent {
                             Some(language_model::settings::OllamaSettingsContent {
                                 api_url,
                                 low_speed_timeout_in_seconds,
+                                available_models: None,
                             });
                     }
                 },
@@ -295,7 +296,7 @@ impl AssistantSettingsContent {
                     _ => (None, None),
                 };
                 settings.provider = Some(AssistantProviderContentV1::Ollama {
-                    default_model: Some(ollama::Model::new(&model)),
+                    default_model: Some(ollama::Model::new(&model, None, None)),
                     api_url,
                     low_speed_timeout_in_seconds,
                 });
diff --git a/crates/language_model/src/provider/ollama.rs b/crates/language_model/src/provider/ollama.rs
index 0ff3d70a6a..bdbbe9657a 100644
--- a/crates/language_model/src/provider/ollama.rs
+++ b/crates/language_model/src/provider/ollama.rs
@@ -6,8 +6,10 @@ use ollama::{
     get_models, preload_model, stream_chat_completion, ChatMessage, ChatOptions, ChatRequest,
     ChatResponseDelta, OllamaToolCall,
 };
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
 use settings::{Settings, SettingsStore};
-use std::{sync::Arc, time::Duration};
+use std::{collections::BTreeMap, sync::Arc, time::Duration};
 use ui::{prelude::*, ButtonLike, Indicator};
 use util::ResultExt;
 
@@ -28,6 +30,17 @@ const PROVIDER_NAME: &str = "Ollama";
 pub struct OllamaSettings {
     pub api_url: String,
     pub low_speed_timeout: Option<Duration>,
+    pub available_models: Vec<AvailableModel>,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, JsonSchema)]
+pub struct AvailableModel {
+    /// The model name in the Ollama API (e.g. "llama3.1:latest")
+    pub name: String,
+    /// The model's name in Zed's UI, such as in the model selector dropdown menu in the assistant panel.
+    pub display_name: Option<String>,
+    /// The Context Length parameter to the model (aka num_ctx or n_ctx)
+    pub max_tokens: usize,
 }
 
 pub struct OllamaLanguageModelProvider {
@@ -61,7 +74,7 @@ impl State {
                 // indicating which models are embedding models,
                 // simply filter out models with "-embed" in their name
                 .filter(|model| !model.name.contains("-embed"))
-                .map(|model| ollama::Model::new(&model.name))
+                .map(|model| ollama::Model::new(&model.name, None, None))
                 .collect();
 
             models.sort_by(|a, b| a.name.cmp(&b.name));
@@ -123,10 +136,32 @@ impl LanguageModelProvider for OllamaLanguageModelProvider {
     }
 
     fn provided_models(&self, cx: &AppContext) -> Vec<Arc<dyn LanguageModel>> {
-        self.state
-            .read(cx)
+        let mut models: BTreeMap<String, ollama::Model> = BTreeMap::default();
+
+        // Add models from the Ollama API
+        for model in self.state.read(cx).available_models.iter() {
+            models.insert(model.name.clone(), model.clone());
+        }
+
+        // Override with available models from settings
+        for model in AllLanguageModelSettings::get_global(cx)
+            .ollama
             .available_models
             .iter()
+        {
+            models.insert(
+                model.name.clone(),
+                ollama::Model {
+                    name: model.name.clone(),
+                    display_name: model.display_name.clone(),
+                    max_tokens: model.max_tokens,
+                    keep_alive: None,
+                },
+            );
+        }
+
+        models
+            .into_values()
             .map(|model| {
                 Arc::new(OllamaLanguageModel {
                     id: LanguageModelId::from(model.name.clone()),
diff --git a/crates/language_model/src/settings.rs b/crates/language_model/src/settings.rs
index 923ad5b5e9..8d3838d236 100644
--- a/crates/language_model/src/settings.rs
+++ b/crates/language_model/src/settings.rs
@@ -152,6 +152,7 @@ pub struct AnthropicSettingsContentV1 {
 pub struct OllamaSettingsContent {
     pub api_url: Option<String>,
     pub low_speed_timeout_in_seconds: Option<u64>,
+    pub available_models: Option<Vec<provider::ollama::AvailableModel>>,
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, JsonSchema)]
@@ -276,6 +277,9 @@ impl settings::Settings for AllLanguageModelSettings {
             anthropic.as_ref().and_then(|s| s.available_models.clone()),
         );
 
+        // Ollama
+        let ollama = value.ollama.clone();
+
         merge(
             &mut settings.ollama.api_url,
             value.ollama.as_ref().and_then(|s| s.api_url.clone()),
@@ -288,6 +292,10 @@
             settings.ollama.low_speed_timeout =
                 Some(Duration::from_secs(low_speed_timeout_in_seconds));
         }
+        merge(
+            &mut settings.ollama.available_models,
+            ollama.as_ref().and_then(|s| s.available_models.clone()),
+        );
 
         // OpenAI
         let (openai, upgraded) = match value.openai.clone().map(|s| s.upgrade()) {
diff --git a/crates/ollama/src/ollama.rs b/crates/ollama/src/ollama.rs
index 145adc36a3..c47fcb3d79 100644
--- a/crates/ollama/src/ollama.rs
+++ b/crates/ollama/src/ollama.rs
@@ -66,40 +66,37 @@ impl Default for KeepAlive {
 #[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]
 pub struct Model {
     pub name: String,
+    pub display_name: Option<String>,
     pub max_tokens: usize,
     pub keep_alive: Option<KeepAlive>,
 }
 
-// This could be dynamically retrieved via the API (1 call per model)
-// curl -s http://localhost:11434/api/show -d '{"model": "llama3.1:latest"}' | jq '.model_info."llama.context_length"'
 fn get_max_tokens(name: &str) -> usize {
-    match name {
-        "dolphin-llama3:8b-256k" => 262144, // 256K
-        _ => match name.split(':').next().unwrap() {
-            "mistral-nemo" => 1024000,                                      // 1M
-            "deepseek-coder-v2" => 163840,                                  // 160K
-            "llama3.1" | "phi3" | "command-r" | "command-r-plus" => 131072, // 128K
-            "codeqwen" => 65536,                                            // 64K
-            "mistral" | "mistral-large" | "dolphin-mistral" | "codestral"   // 32K
-            | "mistral-openorca" | "dolphin-mixtral" | "mixstral" | "llava"
-            | "qwen" | "qwen2" | "wizardlm2" | "wizard-math" => 32768,
"qwen2" | "wizardlm2" | "wizard-math" => 32768, - "codellama" | "stable-code" | "deepseek-coder" | "starcoder2" // 16K - | "wizardcoder" => 16384, - "llama3" | "gemma2" | "gemma" | "codegemma" | "dolphin-llama3" // 8K - | "llava-llama3" | "starcoder" | "openchat" | "aya" => 8192, - "llama2" | "yi" | "llama2-chinese" | "vicuna" | "nous-hermes2" // 4K - | "stablelm2" => 4096, - "phi" | "orca-mini" | "tinyllama" | "granite-code" => 2048, // 2K - _ => 2048, // 2K (default) - }, + /// Default context length for unknown models. + const DEFAULT_TOKENS: usize = 2048; + /// Magic number. Lets many Ollama models work with ~16GB of ram. + const MAXIMUM_TOKENS: usize = 16384; + + match name.split(':').next().unwrap() { + "phi" | "tinyllama" | "granite-code" => 2048, + "llama2" | "yi" | "vicuna" | "stablelm2" => 4096, + "llama3" | "gemma2" | "gemma" | "codegemma" | "starcoder" | "aya" => 8192, + "codellama" | "starcoder2" => 16384, + "mistral" | "codestral" | "mixstral" | "llava" | "qwen2" | "dolphin-mixtral" => 32768, + "llama3.1" | "phi3" | "phi3.5" | "command-r" | "deepseek-coder-v2" => 128000, + _ => DEFAULT_TOKENS, } + .clamp(1, MAXIMUM_TOKENS) } impl Model { - pub fn new(name: &str) -> Self { + pub fn new(name: &str, display_name: Option<&str>, max_tokens: Option) -> Self { Self { name: name.to_owned(), - max_tokens: get_max_tokens(name), + display_name: display_name + .map(ToString::to_string) + .or_else(|| name.strip_suffix(":latest").map(ToString::to_string)), + max_tokens: max_tokens.unwrap_or_else(|| get_max_tokens(name)), keep_alive: Some(KeepAlive::indefinite()), } } @@ -109,7 +106,7 @@ impl Model { } pub fn display_name(&self) -> &str { - &self.name + self.display_name.as_ref().unwrap_or(&self.name) } pub fn max_token_count(&self) -> usize { diff --git a/docs/src/assistant/configuration.md b/docs/src/assistant/configuration.md index 2b9541ded3..adac828567 100644 --- a/docs/src/assistant/configuration.md +++ b/docs/src/assistant/configuration.md @@ -108,33 +108,49 @@ Custom models will be listed in the model dropdown in the assistant panel. Download and install Ollama from [ollama.com/download](https://ollama.com/download) (Linux or macOS) and ensure it's running with `ollama --version`. -You can use Ollama with the Zed assistant by making Ollama appear as an OpenAPI endpoint. - -1. Download, for example, the `mistral` model with Ollama: +1. Download one of the [available models](https://ollama.com/models), for example, for `mistral`: ```sh ollama pull mistral ``` -2. Make sure that the Ollama server is running. You can start it either via running the Ollama app, or launching: +2. Make sure that the Ollama server is running. You can start it either via running Ollama.app (MacOS) or launching: ```sh ollama serve ``` 3. In the assistant panel, select one of the Ollama models using the model dropdown. -4. (Optional) If you want to change the default URL that is used to access the Ollama server, you can do so by adding the following settings: + +4. (Optional) Specify a [custom api_url](#custom-endpoint) or [custom `low_speed_timeout_in_seconds`](#provider-timeout) if required. + +#### Ollama Context Length {#ollama-context}} + +Zed has pre-configured maximum context lengths (`max_tokens`) to match the capabilities of common models. Zed API requests to Ollama include this as `num_ctx` parameter, but the default values do not exceed `16384` so users with ~16GB of ram are able to use most models out of the box. 
+
+**Note**: Token counts displayed in the assistant panel are only estimates and will differ from the model's native tokenizer.
+
+Depending on your hardware or use case, you may wish to limit or increase the context length for a specific model via `settings.json`:
 
 ```json
 {
   "language_models": {
     "ollama": {
-      "api_url": "http://localhost:11434"
+      "low_speed_timeout_in_seconds": 120,
+      "available_models": [
+        {
+          "provider": "ollama",
+          "name": "mistral:latest",
+          "max_tokens": 32768
+        }
+      ]
     }
   }
 }
 ```
 
+If you specify a context length that is too large for your hardware, Ollama will log an error. You can watch these logs by running `tail -f ~/.ollama/logs/ollama.log` (macOS) or `journalctl -u ollama -f` (Linux). Depending on the memory available on your machine, you may need to adjust the context length to a smaller value.
+
 ### OpenAI {#openai}
 
 1. Visit the OpenAI platform and [create an API key](https://platform.openai.com/account/api-keys)
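As a usage sketch (not part of the patch): combining the Ollama options this change introduces, a user-level `settings.json` entry could look like the following. The model name, display name, and token limit are placeholder values; `display_name` may be omitted, in which case the UI falls back to `name`.

```json
{
  "language_models": {
    "ollama": {
      "api_url": "http://localhost:11434",
      "available_models": [
        {
          "name": "llama3.1:latest",
          "display_name": "Llama 3.1 8B",
          "max_tokens": 16384
        }
      ]
    }
  }
}
```

An entry like this overrides whatever the Ollama API reports for that model: the assistant panel's model dropdown shows `display_name` (falling back to `name`), and `max_tokens` is passed to Ollama as `num_ctx` on each request.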