Ollama Provider for Assistant (#12902)

Closes #4424.

A few design decisions that may need some rethinking or later PRs:

* Other providers have a check for authentication. I use this
opportunity to fetch the models which doubles as a way of finding out if
the Ollama server is running.
* Ollama has _no_ API for getting the max tokens per model
* Ollama has _no_ API for getting the current token count
https://github.com/ollama/ollama/issues/1716
* Ollama does allow setting the `num_ctx` so I've defaulted this to
4096. It can be overridden in settings.
* Ollama models will be "slow" to start inference because they're
loading the model into memory. It's faster after that. There's no UI
affordance to show that the model is being loaded.

Release Notes:

- Added an Ollama Provider for the assistant. If you have
[Ollama](https://ollama.com/) running locally on your machine, you can
enable it in your settings under:

```jsonc
"assistant": {
    "version": "1",
    "provider": {
      "name": "ollama",
      // Recommended setting to allow for model startup
      "low_speed_timeout_in_seconds": 30,
    }
}
```

Chat like usual

<img width="1840" alt="image"
src="https://github.com/zed-industries/zed/assets/836375/4e0af266-4c4f-4d9e-9d74-1a91f76a12fe">

Interact with any model from the [Ollama
Library](https://ollama.com/library)

<img width="587" alt="image"
src="https://github.com/zed-industries/zed/assets/836375/87433ac6-bf87-4a99-89e1-96a93bf8de8a">

Open up the terminal to download new models via `ollama pull`:


![image](https://github.com/zed-industries/zed/assets/836375/af7ec411-76bf-41c7-ba81-64bbaeea98a8)
This commit is contained in:
Kyle Kelley 2024-06-11 17:35:27 -07:00 committed by GitHub
parent 127b9ed857
commit 4cb8d6f40e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 624 additions and 1 deletions

View file

@ -12,7 +12,7 @@ mod streaming_diff;
pub use assistant_panel::AssistantPanel;
use assistant_settings::{AnthropicModel, AssistantSettings, CloudModel, OpenAiModel};
use assistant_settings::{AnthropicModel, AssistantSettings, CloudModel, OllamaModel, OpenAiModel};
use assistant_slash_command::SlashCommandRegistry;
use client::{proto, Client};
use command_palette_hooks::CommandPaletteFilter;
@ -91,6 +91,7 @@ pub enum LanguageModel {
Cloud(CloudModel),
OpenAi(OpenAiModel),
Anthropic(AnthropicModel),
Ollama(OllamaModel),
}
impl Default for LanguageModel {
@ -105,6 +106,7 @@ impl LanguageModel {
LanguageModel::OpenAi(model) => format!("openai/{}", model.id()),
LanguageModel::Anthropic(model) => format!("anthropic/{}", model.id()),
LanguageModel::Cloud(model) => format!("zed.dev/{}", model.id()),
LanguageModel::Ollama(model) => format!("ollama/{}", model.id()),
}
}
@ -113,6 +115,7 @@ impl LanguageModel {
LanguageModel::OpenAi(model) => model.display_name().into(),
LanguageModel::Anthropic(model) => model.display_name().into(),
LanguageModel::Cloud(model) => model.display_name().into(),
LanguageModel::Ollama(model) => model.display_name().into(),
}
}
@ -121,6 +124,7 @@ impl LanguageModel {
LanguageModel::OpenAi(model) => model.max_token_count(),
LanguageModel::Anthropic(model) => model.max_token_count(),
LanguageModel::Cloud(model) => model.max_token_count(),
LanguageModel::Ollama(model) => model.max_token_count(),
}
}
@ -129,6 +133,7 @@ impl LanguageModel {
LanguageModel::OpenAi(model) => model.id(),
LanguageModel::Anthropic(model) => model.id(),
LanguageModel::Cloud(model) => model.id(),
LanguageModel::Ollama(model) => model.id(),
}
}
}
@ -179,6 +184,7 @@ impl LanguageModelRequest {
match &self.model {
LanguageModel::OpenAi(_) => {}
LanguageModel::Anthropic(_) => {}
LanguageModel::Ollama(_) => {}
LanguageModel::Cloud(model) => match model {
CloudModel::Claude3Opus | CloudModel::Claude3Sonnet | CloudModel::Claude3Haiku => {
preprocess_anthropic_request(self);

View file

@ -2,6 +2,7 @@ use std::fmt;
pub use anthropic::Model as AnthropicModel;
use gpui::Pixels;
pub use ollama::Model as OllamaModel;
pub use open_ai::Model as OpenAiModel;
use schemars::{
schema::{InstanceType, Metadata, Schema, SchemaObject},
@ -168,6 +169,11 @@ pub enum AssistantProvider {
api_url: String,
low_speed_timeout_in_seconds: Option<u64>,
},
Ollama {
model: OllamaModel,
api_url: String,
low_speed_timeout_in_seconds: Option<u64>,
},
}
impl Default for AssistantProvider {
@ -197,6 +203,12 @@ pub enum AssistantProviderContent {
api_url: Option<String>,
low_speed_timeout_in_seconds: Option<u64>,
},
#[serde(rename = "ollama")]
Ollama {
default_model: Option<OllamaModel>,
api_url: Option<String>,
low_speed_timeout_in_seconds: Option<u64>,
},
}
#[derive(Debug, Default)]
@ -328,6 +340,13 @@ impl AssistantSettingsContent {
low_speed_timeout_in_seconds: None,
})
}
LanguageModel::Ollama(model) => {
*provider = Some(AssistantProviderContent::Ollama {
default_model: Some(model),
api_url: None,
low_speed_timeout_in_seconds: None,
})
}
},
},
},
@ -472,6 +491,27 @@ impl Settings for AssistantSettings {
Some(low_speed_timeout_in_seconds_override);
}
}
(
AssistantProvider::Ollama {
model,
api_url,
low_speed_timeout_in_seconds,
},
AssistantProviderContent::Ollama {
default_model: model_override,
api_url: api_url_override,
low_speed_timeout_in_seconds: low_speed_timeout_in_seconds_override,
},
) => {
merge(model, model_override);
merge(api_url, api_url_override);
if let Some(low_speed_timeout_in_seconds_override) =
low_speed_timeout_in_seconds_override
{
*low_speed_timeout_in_seconds =
Some(low_speed_timeout_in_seconds_override);
}
}
(
AssistantProvider::Anthropic {
model,
@ -519,6 +559,15 @@ impl Settings for AssistantSettings {
.unwrap_or_else(|| anthropic::ANTHROPIC_API_URL.into()),
low_speed_timeout_in_seconds,
},
AssistantProviderContent::Ollama {
default_model: model,
api_url,
low_speed_timeout_in_seconds,
} => AssistantProvider::Ollama {
model: model.unwrap_or_default(),
api_url: api_url.unwrap_or_else(|| ollama::OLLAMA_API_URL.into()),
low_speed_timeout_in_seconds,
},
};
}
}

View file

@ -2,12 +2,14 @@ mod anthropic;
mod cloud;
#[cfg(test)]
mod fake;
mod ollama;
mod open_ai;
pub use anthropic::*;
pub use cloud::*;
#[cfg(test)]
pub use fake::*;
pub use ollama::*;
pub use open_ai::*;
use crate::{
@ -50,6 +52,17 @@ pub fn init(client: Arc<Client>, cx: &mut AppContext) {
low_speed_timeout_in_seconds.map(Duration::from_secs),
settings_version,
)),
AssistantProvider::Ollama {
model,
api_url,
low_speed_timeout_in_seconds,
} => CompletionProvider::Ollama(OllamaCompletionProvider::new(
model.clone(),
api_url.clone(),
client.http_client(),
low_speed_timeout_in_seconds.map(Duration::from_secs),
settings_version,
)),
};
cx.set_global(provider);
@ -87,6 +100,23 @@ pub fn init(client: Arc<Client>, cx: &mut AppContext) {
settings_version,
);
}
(
CompletionProvider::Ollama(provider),
AssistantProvider::Ollama {
model,
api_url,
low_speed_timeout_in_seconds,
},
) => {
provider.update(
model.clone(),
api_url.clone(),
low_speed_timeout_in_seconds.map(Duration::from_secs),
settings_version,
);
}
(CompletionProvider::Cloud(provider), AssistantProvider::ZedDotDev { model }) => {
provider.update(model.clone(), settings_version);
}
@ -130,6 +160,22 @@ pub fn init(client: Arc<Client>, cx: &mut AppContext) {
settings_version,
));
}
(
_,
AssistantProvider::Ollama {
model,
api_url,
low_speed_timeout_in_seconds,
},
) => {
*provider = CompletionProvider::Ollama(OllamaCompletionProvider::new(
model.clone(),
api_url.clone(),
client.http_client(),
low_speed_timeout_in_seconds.map(Duration::from_secs),
settings_version,
));
}
}
})
})
@ -142,6 +188,7 @@ pub enum CompletionProvider {
Cloud(CloudCompletionProvider),
#[cfg(test)]
Fake(FakeCompletionProvider),
Ollama(OllamaCompletionProvider),
}
impl gpui::Global for CompletionProvider {}
@ -165,6 +212,10 @@ impl CompletionProvider {
.available_models()
.map(LanguageModel::Cloud)
.collect(),
CompletionProvider::Ollama(provider) => provider
.available_models()
.map(|model| LanguageModel::Ollama(model.clone()))
.collect(),
#[cfg(test)]
CompletionProvider::Fake(_) => unimplemented!(),
}
@ -175,6 +226,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.settings_version(),
CompletionProvider::Anthropic(provider) => provider.settings_version(),
CompletionProvider::Cloud(provider) => provider.settings_version(),
CompletionProvider::Ollama(provider) => provider.settings_version(),
#[cfg(test)]
CompletionProvider::Fake(_) => unimplemented!(),
}
@ -185,6 +237,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.is_authenticated(),
CompletionProvider::Anthropic(provider) => provider.is_authenticated(),
CompletionProvider::Cloud(provider) => provider.is_authenticated(),
CompletionProvider::Ollama(provider) => provider.is_authenticated(),
#[cfg(test)]
CompletionProvider::Fake(_) => true,
}
@ -195,6 +248,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.authenticate(cx),
CompletionProvider::Anthropic(provider) => provider.authenticate(cx),
CompletionProvider::Cloud(provider) => provider.authenticate(cx),
CompletionProvider::Ollama(provider) => provider.authenticate(cx),
#[cfg(test)]
CompletionProvider::Fake(_) => Task::ready(Ok(())),
}
@ -205,6 +259,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.authentication_prompt(cx),
CompletionProvider::Anthropic(provider) => provider.authentication_prompt(cx),
CompletionProvider::Cloud(provider) => provider.authentication_prompt(cx),
CompletionProvider::Ollama(provider) => provider.authentication_prompt(cx),
#[cfg(test)]
CompletionProvider::Fake(_) => unimplemented!(),
}
@ -215,6 +270,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.reset_credentials(cx),
CompletionProvider::Anthropic(provider) => provider.reset_credentials(cx),
CompletionProvider::Cloud(_) => Task::ready(Ok(())),
CompletionProvider::Ollama(provider) => provider.reset_credentials(cx),
#[cfg(test)]
CompletionProvider::Fake(_) => Task::ready(Ok(())),
}
@ -225,6 +281,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => LanguageModel::OpenAi(provider.model()),
CompletionProvider::Anthropic(provider) => LanguageModel::Anthropic(provider.model()),
CompletionProvider::Cloud(provider) => LanguageModel::Cloud(provider.model()),
CompletionProvider::Ollama(provider) => LanguageModel::Ollama(provider.model()),
#[cfg(test)]
CompletionProvider::Fake(_) => LanguageModel::default(),
}
@ -239,6 +296,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.count_tokens(request, cx),
CompletionProvider::Anthropic(provider) => provider.count_tokens(request, cx),
CompletionProvider::Cloud(provider) => provider.count_tokens(request, cx),
CompletionProvider::Ollama(provider) => provider.count_tokens(request, cx),
#[cfg(test)]
CompletionProvider::Fake(_) => futures::FutureExt::boxed(futures::future::ready(Ok(0))),
}
@ -252,6 +310,7 @@ impl CompletionProvider {
CompletionProvider::OpenAi(provider) => provider.complete(request),
CompletionProvider::Anthropic(provider) => provider.complete(request),
CompletionProvider::Cloud(provider) => provider.complete(request),
CompletionProvider::Ollama(provider) => provider.complete(request),
#[cfg(test)]
CompletionProvider::Fake(provider) => provider.complete(),
}

View file

@ -0,0 +1,246 @@
use crate::{
assistant_settings::OllamaModel, CompletionProvider, LanguageModel, LanguageModelRequest, Role,
};
use anyhow::Result;
use futures::StreamExt as _;
use futures::{future::BoxFuture, stream::BoxStream, FutureExt};
use gpui::{AnyView, AppContext, Task};
use http::HttpClient;
use ollama::{
get_models, stream_chat_completion, ChatMessage, ChatOptions, ChatRequest, Role as OllamaRole,
};
use std::sync::Arc;
use std::time::Duration;
use ui::{prelude::*, ButtonLike, ElevationIndex};
const OLLAMA_DOWNLOAD_URL: &str = "https://ollama.com/download";
pub struct OllamaCompletionProvider {
api_url: String,
model: OllamaModel,
http_client: Arc<dyn HttpClient>,
low_speed_timeout: Option<Duration>,
settings_version: usize,
available_models: Vec<OllamaModel>,
}
impl OllamaCompletionProvider {
pub fn new(
model: OllamaModel,
api_url: String,
http_client: Arc<dyn HttpClient>,
low_speed_timeout: Option<Duration>,
settings_version: usize,
) -> Self {
Self {
api_url,
model,
http_client,
low_speed_timeout,
settings_version,
available_models: Default::default(),
}
}
pub fn update(
&mut self,
model: OllamaModel,
api_url: String,
low_speed_timeout: Option<Duration>,
settings_version: usize,
) {
self.model = model;
self.api_url = api_url;
self.low_speed_timeout = low_speed_timeout;
self.settings_version = settings_version;
}
pub fn available_models(&self) -> impl Iterator<Item = &OllamaModel> {
self.available_models.iter()
}
pub fn settings_version(&self) -> usize {
self.settings_version
}
pub fn is_authenticated(&self) -> bool {
!self.available_models.is_empty()
}
pub fn authenticate(&self, cx: &AppContext) -> Task<Result<()>> {
if self.is_authenticated() {
Task::ready(Ok(()))
} else {
self.fetch_models(cx)
}
}
pub fn reset_credentials(&self, cx: &AppContext) -> Task<Result<()>> {
self.fetch_models(cx)
}
pub fn fetch_models(&self, cx: &AppContext) -> Task<Result<()>> {
let http_client = self.http_client.clone();
let api_url = self.api_url.clone();
// As a proxy for the server being "authenticated", we'll check if its up by fetching the models
cx.spawn(|mut cx| async move {
let models = get_models(http_client.as_ref(), &api_url, None).await?;
let mut models: Vec<OllamaModel> = models
.into_iter()
// Since there is no metadata from the Ollama API
// indicating which models are embedding models,
// simply filter out models with "-embed" in their name
.filter(|model| !model.name.contains("-embed"))
.map(|model| OllamaModel::new(&model.name, &model.details.parameter_size))
.collect();
models.sort_by(|a, b| a.name.cmp(&b.name));
cx.update_global::<CompletionProvider, _>(|provider, _cx| {
if let CompletionProvider::Ollama(provider) = provider {
provider.available_models = models;
}
})
})
}
pub fn authentication_prompt(&self, cx: &mut WindowContext) -> AnyView {
cx.new_view(|cx| DownloadOllamaMessage::new(cx)).into()
}
pub fn model(&self) -> OllamaModel {
self.model.clone()
}
pub fn count_tokens(
&self,
request: LanguageModelRequest,
_cx: &AppContext,
) -> BoxFuture<'static, Result<usize>> {
// There is no endpoint for this _yet_ in Ollama
// see: https://github.com/ollama/ollama/issues/1716 and https://github.com/ollama/ollama/issues/3582
let token_count = request
.messages
.iter()
.map(|msg| msg.content.chars().count())
.sum::<usize>()
/ 4;
async move { Ok(token_count) }.boxed()
}
pub fn complete(
&self,
request: LanguageModelRequest,
) -> BoxFuture<'static, Result<BoxStream<'static, Result<String>>>> {
let request = self.to_ollama_request(request);
let http_client = self.http_client.clone();
let api_url = self.api_url.clone();
let low_speed_timeout = self.low_speed_timeout;
async move {
let request =
stream_chat_completion(http_client.as_ref(), &api_url, request, low_speed_timeout);
let response = request.await?;
let stream = response
.filter_map(|response| async move {
match response {
Ok(delta) => {
let content = match delta.message {
ChatMessage::User { content } => content,
ChatMessage::Assistant { content } => content,
ChatMessage::System { content } => content,
};
Some(Ok(content))
}
Err(error) => Some(Err(error)),
}
})
.boxed();
Ok(stream)
}
.boxed()
}
fn to_ollama_request(&self, request: LanguageModelRequest) -> ChatRequest {
let model = match request.model {
LanguageModel::Ollama(model) => model,
_ => self.model(),
};
ChatRequest {
model: model.name,
messages: request
.messages
.into_iter()
.map(|msg| match msg.role {
Role::User => ChatMessage::User {
content: msg.content,
},
Role::Assistant => ChatMessage::Assistant {
content: msg.content,
},
Role::System => ChatMessage::System {
content: msg.content,
},
})
.collect(),
keep_alive: model.keep_alive,
stream: true,
options: Some(ChatOptions {
num_ctx: Some(model.max_tokens),
stop: Some(request.stop),
temperature: Some(request.temperature),
..Default::default()
}),
}
}
}
impl From<Role> for ollama::Role {
fn from(val: Role) -> Self {
match val {
Role::User => OllamaRole::User,
Role::Assistant => OllamaRole::Assistant,
Role::System => OllamaRole::System,
}
}
}
struct DownloadOllamaMessage {}
impl DownloadOllamaMessage {
pub fn new(_cx: &mut ViewContext<Self>) -> Self {
Self {}
}
fn render_download_button(&self, _cx: &mut ViewContext<Self>) -> impl IntoElement {
ButtonLike::new("download_ollama_button")
.style(ButtonStyle::Filled)
.size(ButtonSize::Large)
.layer(ElevationIndex::ModalSurface)
.child(Label::new("Get Ollama"))
.on_click(move |_, cx| cx.open_url(OLLAMA_DOWNLOAD_URL))
}
}
impl Render for DownloadOllamaMessage {
fn render(&mut self, cx: &mut ViewContext<Self>) -> impl IntoElement {
v_flex()
.p_4()
.size_full()
.child(Label::new("To use Ollama models via the assistant, Ollama must be running on your machine.").size(LabelSize::Large))
.child(
h_flex()
.w_full()
.p_4()
.justify_center()
.child(
self.render_download_button(cx)
)
)
.into_any()
}
}