From 67f149a4bcbd3decc9a6c68a35ae3137095580a1 Mon Sep 17 00:00:00 2001
From: Peter Tripp
Date: Mon, 16 Sep 2024 18:47:25 -0400
Subject: [PATCH] Ollama: Specify keep_alive via settings (#17906)

---
 crates/language_model/src/provider/ollama.rs | 6 ++++--
 docs/src/assistant/configuration.md          | 2 ++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/crates/language_model/src/provider/ollama.rs b/crates/language_model/src/provider/ollama.rs
index cfcca1fb7a..6a3190dee7 100644
--- a/crates/language_model/src/provider/ollama.rs
+++ b/crates/language_model/src/provider/ollama.rs
@@ -4,7 +4,7 @@ use gpui::{AnyView, AppContext, AsyncAppContext, ModelContext, Subscription, Tas
 use http_client::HttpClient;
 use ollama::{
     get_models, preload_model, stream_chat_completion, ChatMessage, ChatOptions, ChatRequest,
-    ChatResponseDelta, OllamaToolCall,
+    ChatResponseDelta, KeepAlive, OllamaToolCall,
 };
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
@@ -42,6 +42,8 @@ pub struct AvailableModel {
     pub display_name: Option<String>,
     /// The Context Length parameter to the model (aka num_ctx or n_ctx)
     pub max_tokens: usize,
+    /// How long the model stays loaded in memory after the last request (seconds or a duration string)
+    pub keep_alive: Option<KeepAlive>,
 }
 
 pub struct OllamaLanguageModelProvider {
@@ -156,7 +158,7 @@ impl LanguageModelProvider for OllamaLanguageModelProvider {
                     name: model.name.clone(),
                     display_name: model.display_name.clone(),
                     max_tokens: model.max_tokens,
-                    keep_alive: None,
+                    keep_alive: model.keep_alive.clone(),
                 },
             );
         }
diff --git a/docs/src/assistant/configuration.md b/docs/src/assistant/configuration.md
index 4d9870e896..bcdf461e2c 100644
--- a/docs/src/assistant/configuration.md
+++ b/docs/src/assistant/configuration.md
@@ -152,6 +152,8 @@ Depending on your hardware or use-case you may wish to limit or increase the con
 If you specify a context length that is too large for your hardware, Ollama will log an error. You can watch these logs by running: `tail -f ~/.ollama/logs/ollama.log` (macOS) or `journalctl -u ollama -f` (Linux). Depending on the memory available on your machine, you may need to adjust the context length to a smaller value.
 
+You may also optionally specify a `keep_alive` value for each available model. This can be an integer (seconds) or a string duration like "5m", "10m", "1h", "1d", etc. For example, `"keep_alive": "120s"` will allow the remote server to unload the model (freeing up GPU VRAM) after 120 seconds.
+
 ### OpenAI {#openai}
 
 1. Visit the OpenAI platform and [create an API key](https://platform.openai.com/account/api-keys)
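
For context, here is a minimal sketch of the user-facing configuration this patch enables in Zed's `settings.json`. The `language_models.ollama.available_models` path mirrors the `AvailableModel` struct above; the model name, display name, and token count are illustrative assumptions, not values from this patch:

```json
{
  "language_models": {
    "ollama": {
      "available_models": [
        {
          // Illustrative; must match a model name served by your Ollama instance.
          "name": "llama3.1:8b",
          "display_name": "Llama 3.1 8B",
          // Context length (num_ctx); adjust to fit your hardware.
          "max_tokens": 8192,
          // Added by this patch: integer seconds or a duration string ("5m", "1h", ...).
          // "120s" lets the server unload the model (freeing GPU VRAM) after two minutes idle.
          "keep_alive": "120s"
        }
      ]
    }
  }
}
```

An integer also works, e.g. `"keep_alive": 300` for five minutes; omitting the field (serialized as `None`) should leave Ollama's default keep-alive behavior in place.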