language_models: Add images support for Ollama vision models (#31883)
Ollama vision models can process input images; this PR adds support for that capability. I have tested this with gemma3:4b and attached a screenshot of it working.

<img width="435" alt="image" src="https://github.com/user-attachments/assets/5f17d742-0a37-4e6c-b4d8-05b750a0a158" />

Release Notes:

- Add image support for [Ollama vision models](https://ollama.com/search?c=vision)
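For context, a sketch (not part of this commit) of the `/api/chat` request shape this change produces for a vision-capable model. Per Ollama's API, and as the new tests below confirm, `images` is an optional array of base64-encoded image data attached to a message; the model name and prompt here are illustrative.

```rust
use serde_json::json;

fn main() {
    // Minimal Ollama chat payload with an attached image.
    let request = json!({
        "model": "gemma3:4b",
        "stream": false,
        "messages": [{
            "role": "user",
            "content": "What do you see in this image?",
            // Truncated base64 payload, for illustration only.
            "images": ["iVBORw0KGgoAAAANSUhEUg..."]
        }]
    });
    println!("{}", serde_json::to_string_pretty(&request).unwrap());
}
```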
parent a60bea8a3d
commit 59686f1f44

3 changed files with 136 additions and 26 deletions
File 1 of 3 (agent settings):

```diff
@@ -372,6 +372,7 @@ impl AgentSettingsContent {
                 None,
                 None,
                 Some(language_model.supports_tools()),
+                Some(language_model.supports_images()),
                 None,
             )),
             api_url,
```
File 2 of 3 (Ollama language-model provider):

```diff
@@ -4,14 +4,11 @@ use futures::{Stream, TryFutureExt, stream};
 use gpui::{AnyView, App, AsyncApp, Context, Subscription, Task};
 use http_client::HttpClient;
 use language_model::{
-    AuthenticateError, LanguageModelCompletionError, LanguageModelCompletionEvent,
+    AuthenticateError, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
+    LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
+    LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest,
     LanguageModelRequestTool, LanguageModelToolChoice, LanguageModelToolUse,
-    LanguageModelToolUseId, MessageContent, StopReason,
-};
-use language_model::{
-    LanguageModel, LanguageModelId, LanguageModelName, LanguageModelProvider,
-    LanguageModelProviderId, LanguageModelProviderName, LanguageModelProviderState,
-    LanguageModelRequest, RateLimiter, Role,
+    LanguageModelToolUseId, MessageContent, RateLimiter, Role, StopReason,
 };
 use ollama::{
     ChatMessage, ChatOptions, ChatRequest, ChatResponseDelta, KeepAlive, OllamaFunctionTool,
```
```diff
@@ -54,6 +51,8 @@ pub struct AvailableModel {
     pub keep_alive: Option<KeepAlive>,
     /// Whether the model supports tools
     pub supports_tools: Option<bool>,
+    /// Whether the model supports vision
+    pub supports_images: Option<bool>,
     /// Whether to enable think mode
     pub supports_thinking: Option<bool>,
 }
```
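A sketch of how a user-supplied model entry with the new flag would deserialize. The field names mirror `AvailableModel` above, but the JSON shape is an assumption for illustration, not Zed's exact settings schema.

```rust
use serde::Deserialize;

// Trimmed stand-in for the AvailableModel struct in this diff.
#[derive(Debug, Deserialize)]
struct AvailableModel {
    name: String,
    max_tokens: usize,
    supports_tools: Option<bool>,
    supports_images: Option<bool>,
    supports_thinking: Option<bool>,
}

fn main() {
    let entry: AvailableModel = serde_json::from_str(
        r#"{
            "name": "gemma3:4b",
            "max_tokens": 16384,
            "supports_tools": false,
            "supports_images": true,
            "supports_thinking": false
        }"#,
    )
    .unwrap();
    assert_eq!(entry.supports_images, Some(true));
    println!("{entry:?}");
}
```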
```diff
@@ -101,6 +100,7 @@ impl State {
                 None,
                 None,
                 Some(capabilities.supports_tools()),
+                Some(capabilities.supports_vision()),
                 Some(capabilities.supports_thinking()),
             );
             Ok(ollama_model)
```
```diff
@@ -222,6 +222,7 @@ impl LanguageModelProvider for OllamaLanguageModelProvider {
                     max_tokens: model.max_tokens,
                     keep_alive: model.keep_alive.clone(),
                     supports_tools: model.supports_tools,
+                    supports_vision: model.supports_images,
                     supports_thinking: model.supports_thinking,
                 },
             );
```
```diff
@@ -277,30 +278,59 @@ pub struct OllamaLanguageModel {
 
 impl OllamaLanguageModel {
     fn to_ollama_request(&self, request: LanguageModelRequest) -> ChatRequest {
+        let supports_vision = self.model.supports_vision.unwrap_or(false);
+
         ChatRequest {
             model: self.model.name.clone(),
             messages: request
                 .messages
                 .into_iter()
-                .map(|msg| match msg.role {
-                    Role::User => ChatMessage::User {
-                        content: msg.string_contents(),
-                    },
-                    Role::Assistant => {
-                        let content = msg.string_contents();
-                        let thinking = msg.content.into_iter().find_map(|content| match content {
-                            MessageContent::Thinking { text, .. } if !text.is_empty() => Some(text),
-                            _ => None,
-                        });
-                        ChatMessage::Assistant {
-                            content,
-                            tool_calls: None,
-                            thinking,
-                        }
-                    }
-                    Role::System => ChatMessage::System {
-                        content: msg.string_contents(),
-                    },
+                .map(|msg| {
+                    let images = if supports_vision {
+                        msg.content
+                            .iter()
+                            .filter_map(|content| match content {
+                                MessageContent::Image(image) => Some(image.source.to_string()),
+                                _ => None,
+                            })
+                            .collect::<Vec<String>>()
+                    } else {
+                        vec![]
+                    };
+
+                    match msg.role {
+                        Role::User => ChatMessage::User {
+                            content: msg.string_contents(),
+                            images: if images.is_empty() {
+                                None
+                            } else {
+                                Some(images)
+                            },
+                        },
+                        Role::Assistant => {
+                            let content = msg.string_contents();
+                            let thinking =
+                                msg.content.into_iter().find_map(|content| match content {
+                                    MessageContent::Thinking { text, .. } if !text.is_empty() => {
+                                        Some(text)
+                                    }
+                                    _ => None,
+                                });
+                            ChatMessage::Assistant {
+                                content,
+                                tool_calls: None,
+                                images: if images.is_empty() {
+                                    None
+                                } else {
+                                    Some(images)
+                                },
+                                thinking,
+                            }
+                        }
+                        Role::System => ChatMessage::System {
+                            content: msg.string_contents(),
+                        },
+                    }
                 })
                 .collect(),
             keep_alive: self.model.keep_alive.clone().unwrap_or_default(),
```
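The heart of the provider change is that image-gathering step in `to_ollama_request`. A self-contained sketch of the same pattern with simplified stand-in types (the real `MessageContent::Image` in the `language_model` crate carries a typed source, assumed here to stringify to base64 data):

```rust
// Simplified stand-ins for the language_model types used above.
enum MessageContent {
    Text(String),
    Image { base64: String },
}

struct Message {
    content: Vec<MessageContent>,
}

// Same shape as the closure in to_ollama_request: collect image data only
// when the model is vision-capable, and map an empty list to None so serde
// omits the `images` field entirely.
fn collect_images(msg: &Message, supports_vision: bool) -> Option<Vec<String>> {
    let images: Vec<String> = if supports_vision {
        msg.content
            .iter()
            .filter_map(|c| match c {
                MessageContent::Image { base64 } => Some(base64.clone()),
                _ => None,
            })
            .collect()
    } else {
        Vec::new()
    };
    if images.is_empty() { None } else { Some(images) }
}

fn main() {
    let msg = Message {
        content: vec![
            MessageContent::Text("What is this?".into()),
            MessageContent::Image { base64: "iVBORw0KGgo...".into() },
        ],
    };
    assert!(collect_images(&msg, true).is_some());
    assert!(collect_images(&msg, false).is_none());
}
```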
```diff
@@ -339,7 +369,7 @@ impl LanguageModel for OllamaLanguageModel {
     }
 
     fn supports_images(&self) -> bool {
-        false
+        self.model.supports_vision.unwrap_or(false)
     }
 
     fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
```
```diff
@@ -437,7 +467,7 @@ fn map_to_language_model_completion_events(
         let mut events = Vec::new();
 
         match delta.message {
-            ChatMessage::User { content } => {
+            ChatMessage::User { content, images: _ } => {
                 events.push(Ok(LanguageModelCompletionEvent::Text(content)));
             }
             ChatMessage::System { content } => {
```
```diff
@@ -446,6 +476,7 @@ fn map_to_language_model_completion_events(
             ChatMessage::Assistant {
                 content,
                 tool_calls,
+                images: _,
                 thinking,
             } => {
                 if let Some(text) = thinking {
```
File 3 of 3 (ollama crate):

```diff
@@ -38,6 +38,7 @@ pub struct Model {
     pub max_tokens: usize,
     pub keep_alive: Option<KeepAlive>,
     pub supports_tools: Option<bool>,
+    pub supports_vision: Option<bool>,
     pub supports_thinking: Option<bool>,
 }
```
```diff
@@ -68,6 +69,7 @@ impl Model {
         display_name: Option<&str>,
         max_tokens: Option<usize>,
         supports_tools: Option<bool>,
+        supports_vision: Option<bool>,
         supports_thinking: Option<bool>,
     ) -> Self {
         Self {
```
```diff
@@ -78,6 +80,7 @@ impl Model {
             max_tokens: max_tokens.unwrap_or_else(|| get_max_tokens(name)),
             keep_alive: Some(KeepAlive::indefinite()),
             supports_tools,
+            supports_vision,
             supports_thinking,
         }
     }
```
```diff
@@ -101,10 +104,14 @@ pub enum ChatMessage {
     Assistant {
         content: String,
         tool_calls: Option<Vec<OllamaToolCall>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        images: Option<Vec<String>>,
         thinking: Option<String>,
     },
     User {
         content: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        images: Option<Vec<String>>,
     },
     System {
         content: String,
```
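The `skip_serializing_if` attribute is what keeps requests from non-vision models byte-for-byte unchanged, as the new tests below verify. A minimal demonstration (the struct and its shape are illustrative, not the crate's exact serde config):

```rust
use serde::Serialize;

#[derive(Serialize)]
struct UserMessage {
    content: String,
    // With skip_serializing_if, `images: None` leaves the JSON untouched.
    #[serde(skip_serializing_if = "Option::is_none")]
    images: Option<Vec<String>>,
}

fn main() {
    let without = UserMessage { content: "hi".into(), images: None };
    let with = UserMessage {
        content: "hi".into(),
        images: Some(vec!["iVBORw0KGgo...".into()]),
    };
    assert_eq!(serde_json::to_string(&without).unwrap(), r#"{"content":"hi"}"#);
    assert!(serde_json::to_string(&with).unwrap().contains("images"));
}
```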
```diff
@@ -221,6 +228,10 @@ impl ModelShow {
         self.capabilities.iter().any(|v| v == "tools")
     }
 
+    pub fn supports_vision(&self) -> bool {
+        self.capabilities.iter().any(|v| v == "vision")
+    }
+
     pub fn supports_thinking(&self) -> bool {
         self.capabilities.iter().any(|v| v == "thinking")
     }
```
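`supports_vision` follows the existing pattern of probing Ollama's `/api/show` response, which reports a `capabilities` array per model. A trimmed sketch of that probe, with the struct reduced to the one relevant field:

```rust
use serde::Deserialize;

// Trimmed stand-in for the crate's ModelShow type.
#[derive(Deserialize)]
struct ModelShow {
    #[serde(default)]
    capabilities: Vec<String>,
}

impl ModelShow {
    // Vision support is just the presence of the "vision" capability.
    fn supports_vision(&self) -> bool {
        self.capabilities.iter().any(|v| v == "vision")
    }
}

fn main() {
    let show: ModelShow =
        serde_json::from_str(r#"{"capabilities":["completion","vision"]}"#).unwrap();
    assert!(show.supports_vision());
}
```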
```diff
@@ -468,6 +479,7 @@ mod tests {
             ChatMessage::Assistant {
                 content,
                 tool_calls,
+                images: _,
                 thinking,
             } => {
                 assert!(content.is_empty());
```
```diff
@@ -534,4 +546,70 @@ mod tests {
         assert!(result.capabilities.contains(&"tools".to_string()));
         assert!(result.capabilities.contains(&"completion".to_string()));
     }
+
+    #[test]
+    fn serialize_chat_request_with_images() {
+        let base64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
+
+        let request = ChatRequest {
+            model: "llava".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "What do you see in this image?".to_string(),
+                images: Some(vec![base64_image.to_string()]),
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        assert!(serialized.contains("images"));
+        assert!(serialized.contains(base64_image));
+    }
+
+    #[test]
+    fn serialize_chat_request_without_images() {
+        let request = ChatRequest {
+            model: "llama3.2".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "Hello, world!".to_string(),
+                images: None,
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        assert!(!serialized.contains("images"));
+    }
+
+    #[test]
+    fn test_json_format_with_images() {
+        let base64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
+
+        let request = ChatRequest {
+            model: "llava".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "What do you see?".to_string(),
+                images: Some(vec![base64_image.to_string()]),
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+
+        let parsed: serde_json::Value = serde_json::from_str(&serialized).unwrap();
+        let message_images = parsed["messages"][0]["images"].as_array().unwrap();
+        assert_eq!(message_images.len(), 1);
+        assert_eq!(message_images[0].as_str().unwrap(), base64_image);
+    }
 }
```