language_models: Add image support for Ollama vision models (#31883)

Ollama supports vision models that accept input images. This PR adds support for
sending images to those models. I have tested this with gemma3:4b; a screenshot
of it working is attached below.

<img width="435" alt="image"
src="https://github.com/user-attachments/assets/5f17d742-0a37-4e6c-b4d8-05b750a0a158"
/>
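For context, Ollama's /api/chat endpoint accepts images as base64-encoded strings in an optional `images` array on each message, which is the field this change populates. Below is a minimal sketch of that payload shape; the model name, prompt, and truncated base64 string are placeholders, not values taken from this PR.

```rust
// Sketch of the request body shape Ollama expects for vision models.
// Assumption: images are sent as raw base64 strings (no data: URI prefix),
// per Ollama's API documentation.
use serde_json::json;

fn main() {
    let body = json!({
        "model": "gemma3:4b",
        "messages": [{
            "role": "user",
            "content": "What is in this image?",
            "images": ["iVBORw0KGgo..."] // base64-encoded image data (truncated placeholder)
        }],
        "stream": false
    });
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}
```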


Release Notes:

- Add image support for [Ollama vision models](https://ollama.com/search?c=vision)
Umesh Yadav 2025-06-03 14:42:59 +05:30 committed by GitHub
parent a60bea8a3d
commit 59686f1f44
3 changed files with 136 additions and 26 deletions

View file

@@ -372,6 +372,7 @@ impl AgentSettingsContent {
                     None,
                     None,
                     Some(language_model.supports_tools()),
+                    Some(language_model.supports_images()),
                     None,
                 )),
                 api_url,

View file

@@ -4,14 +4,11 @@ use futures::{Stream, TryFutureExt, stream};
 use gpui::{AnyView, App, AsyncApp, Context, Subscription, Task};
 use http_client::HttpClient;
 use language_model::{
-    AuthenticateError, LanguageModelCompletionError, LanguageModelCompletionEvent,
+    AuthenticateError, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
+    LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
+    LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest,
     LanguageModelRequestTool, LanguageModelToolChoice, LanguageModelToolUse,
-    LanguageModelToolUseId, MessageContent, StopReason,
-};
-use language_model::{
-    LanguageModel, LanguageModelId, LanguageModelName, LanguageModelProvider,
-    LanguageModelProviderId, LanguageModelProviderName, LanguageModelProviderState,
-    LanguageModelRequest, RateLimiter, Role,
+    LanguageModelToolUseId, MessageContent, RateLimiter, Role, StopReason,
 };
 use ollama::{
     ChatMessage, ChatOptions, ChatRequest, ChatResponseDelta, KeepAlive, OllamaFunctionTool,
@@ -54,6 +51,8 @@ pub struct AvailableModel {
     pub keep_alive: Option<KeepAlive>,
     /// Whether the model supports tools
     pub supports_tools: Option<bool>,
+    /// Whether the model supports vision
+    pub supports_images: Option<bool>,
     /// Whether to enable think mode
     pub supports_thinking: Option<bool>,
 }
@@ -101,6 +100,7 @@ impl State {
                 None,
                 None,
                 Some(capabilities.supports_tools()),
+                Some(capabilities.supports_vision()),
                 Some(capabilities.supports_thinking()),
             );
             Ok(ollama_model)
@@ -222,6 +222,7 @@ impl LanguageModelProvider for OllamaLanguageModelProvider {
                         max_tokens: model.max_tokens,
                         keep_alive: model.keep_alive.clone(),
                         supports_tools: model.supports_tools,
+                        supports_vision: model.supports_images,
                         supports_thinking: model.supports_thinking,
                     },
                 );
@@ -277,30 +278,59 @@ pub struct OllamaLanguageModel {
 
 impl OllamaLanguageModel {
     fn to_ollama_request(&self, request: LanguageModelRequest) -> ChatRequest {
+        let supports_vision = self.model.supports_vision.unwrap_or(false);
+
         ChatRequest {
             model: self.model.name.clone(),
             messages: request
                 .messages
                 .into_iter()
-                .map(|msg| match msg.role {
-                    Role::User => ChatMessage::User {
-                        content: msg.string_contents(),
-                    },
-                    Role::Assistant => {
-                        let content = msg.string_contents();
-                        let thinking = msg.content.into_iter().find_map(|content| match content {
-                            MessageContent::Thinking { text, .. } if !text.is_empty() => Some(text),
-                            _ => None,
-                        });
-                        ChatMessage::Assistant {
-                            content,
-                            tool_calls: None,
-                            thinking,
-                        }
-                    }
-                    Role::System => ChatMessage::System {
-                        content: msg.string_contents(),
-                    },
+                .map(|msg| {
+                    let images = if supports_vision {
+                        msg.content
+                            .iter()
+                            .filter_map(|content| match content {
+                                MessageContent::Image(image) => Some(image.source.to_string()),
+                                _ => None,
+                            })
+                            .collect::<Vec<String>>()
+                    } else {
+                        vec![]
+                    };
+
+                    match msg.role {
+                        Role::User => ChatMessage::User {
+                            content: msg.string_contents(),
+                            images: if images.is_empty() {
+                                None
+                            } else {
+                                Some(images)
+                            },
+                        },
+                        Role::Assistant => {
+                            let content = msg.string_contents();
+                            let thinking =
+                                msg.content.into_iter().find_map(|content| match content {
+                                    MessageContent::Thinking { text, .. } if !text.is_empty() => {
+                                        Some(text)
+                                    }
+                                    _ => None,
+                                });
+                            ChatMessage::Assistant {
+                                content,
+                                tool_calls: None,
+                                images: if images.is_empty() {
+                                    None
+                                } else {
+                                    Some(images)
+                                },
+                                thinking,
+                            }
+                        }
+                        Role::System => ChatMessage::System {
+                            content: msg.string_contents(),
+                        },
+                    }
                 })
                 .collect(),
             keep_alive: self.model.keep_alive.clone().unwrap_or_default(),
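The mapping above only forwards `MessageContent::Image` payloads when the model is marked vision-capable, and omits the `images` field entirely when there is nothing to send. A self-contained illustration of that pattern, using simplified stand-in types rather than Zed's real `MessageContent`/`ChatMessage`:

```rust
// Simplified stand-ins; not the actual Zed types.
enum Content {
    Text(String),
    Image(String), // base64 payload
}

// Collect image payloads only for vision-capable models; return None when
// there is nothing to attach so the field can be skipped during serialization.
fn extract_images(content: &[Content], supports_vision: bool) -> Option<Vec<String>> {
    if !supports_vision {
        return None;
    }
    let images: Vec<String> = content
        .iter()
        .filter_map(|c| match c {
            Content::Image(data) => Some(data.clone()),
            _ => None,
        })
        .collect();
    if images.is_empty() { None } else { Some(images) }
}

fn main() {
    let msg = vec![
        Content::Text("What is in this image?".into()),
        Content::Image("iVBORw0KGgo...".into()),
    ];
    assert_eq!(extract_images(&msg, true).map(|v| v.len()), Some(1));
    assert!(extract_images(&msg, false).is_none());
}
```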
@@ -339,7 +369,7 @@ impl LanguageModel for OllamaLanguageModel {
     }
 
     fn supports_images(&self) -> bool {
-        false
+        self.model.supports_vision.unwrap_or(false)
     }
 
     fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
@@ -437,7 +467,7 @@ fn map_to_language_model_completion_events(
                 let mut events = Vec::new();
                 match delta.message {
-                    ChatMessage::User { content } => {
+                    ChatMessage::User { content, images: _ } => {
                         events.push(Ok(LanguageModelCompletionEvent::Text(content)));
                     }
                     ChatMessage::System { content } => {
@@ -446,6 +476,7 @@ fn map_to_language_model_completion_events(
                     ChatMessage::Assistant {
                         content,
                         tool_calls,
+                        images: _,
                         thinking,
                     } => {
                         if let Some(text) = thinking {

View file

@@ -38,6 +38,7 @@ pub struct Model {
     pub max_tokens: usize,
     pub keep_alive: Option<KeepAlive>,
     pub supports_tools: Option<bool>,
+    pub supports_vision: Option<bool>,
     pub supports_thinking: Option<bool>,
 }
@@ -68,6 +69,7 @@ impl Model {
         display_name: Option<&str>,
         max_tokens: Option<usize>,
         supports_tools: Option<bool>,
+        supports_vision: Option<bool>,
         supports_thinking: Option<bool>,
     ) -> Self {
         Self {
@@ -78,6 +80,7 @@ impl Model {
             max_tokens: max_tokens.unwrap_or_else(|| get_max_tokens(name)),
             keep_alive: Some(KeepAlive::indefinite()),
             supports_tools,
+            supports_vision,
             supports_thinking,
         }
     }
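A hedged usage sketch of the updated constructor follows. Only the parameters visible in this diff are certain; the leading `name: &str` parameter is an assumption inferred from the `get_max_tokens(name)` call above.

```rust
// Sketch only: constructing a vision-capable model with the new signature.
let gemma = Model::new(
    "gemma3:4b", // name (assumed first parameter)
    None,        // display_name
    None,        // max_tokens; falls back to get_max_tokens(name)
    Some(false), // supports_tools
    Some(true),  // supports_vision
    Some(false), // supports_thinking
);
```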
@@ -101,10 +104,14 @@ pub enum ChatMessage {
     Assistant {
         content: String,
         tool_calls: Option<Vec<OllamaToolCall>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        images: Option<Vec<String>>,
         thinking: Option<String>,
     },
     User {
         content: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        images: Option<Vec<String>>,
     },
     System {
         content: String,
@@ -221,6 +228,10 @@ impl ModelShow {
         self.capabilities.iter().any(|v| v == "tools")
     }
 
+    pub fn supports_vision(&self) -> bool {
+        self.capabilities.iter().any(|v| v == "vision")
+    }
+
     pub fn supports_thinking(&self) -> bool {
         self.capabilities.iter().any(|v| v == "thinking")
     }
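This check relies on the `capabilities` array that Ollama's /api/show endpoint returns for a model (the existing tests below already assert it contains entries such as "tools" and "completion"). A stand-alone sketch of the same detection logic, using a simplified stand-in struct rather than the real `ModelShow`:

```rust
use serde::Deserialize;

// Simplified stand-in for the relevant part of ModelShow.
#[derive(Deserialize)]
struct Capabilities {
    #[serde(default)]
    capabilities: Vec<String>,
}

fn main() {
    // Example shape of an /api/show response fragment for a vision model.
    let raw = r#"{ "capabilities": ["completion", "vision"] }"#;
    let parsed: Capabilities = serde_json::from_str(raw).unwrap();
    let supports_vision = parsed.capabilities.iter().any(|v| v == "vision");
    assert!(supports_vision);
}
```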
@@ -468,6 +479,7 @@ mod tests {
             ChatMessage::Assistant {
                 content,
                 tool_calls,
+                images: _,
                 thinking,
             } => {
                 assert!(content.is_empty());
@@ -534,4 +546,70 @@ mod tests {
         assert!(result.capabilities.contains(&"tools".to_string()));
         assert!(result.capabilities.contains(&"completion".to_string()));
     }
+
+    #[test]
+    fn serialize_chat_request_with_images() {
+        let base64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
+
+        let request = ChatRequest {
+            model: "llava".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "What do you see in this image?".to_string(),
+                images: Some(vec![base64_image.to_string()]),
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        assert!(serialized.contains("images"));
+        assert!(serialized.contains(base64_image));
+    }
+
+    #[test]
+    fn serialize_chat_request_without_images() {
+        let request = ChatRequest {
+            model: "llama3.2".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "Hello, world!".to_string(),
+                images: None,
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        assert!(!serialized.contains("images"));
+    }
+
+    #[test]
+    fn test_json_format_with_images() {
+        let base64_image = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==";
+
+        let request = ChatRequest {
+            model: "llava".to_string(),
+            messages: vec![ChatMessage::User {
+                content: "What do you see?".to_string(),
+                images: Some(vec![base64_image.to_string()]),
+            }],
+            stream: false,
+            keep_alive: KeepAlive::default(),
+            options: None,
+            think: None,
+            tools: vec![],
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        let parsed: serde_json::Value = serde_json::from_str(&serialized).unwrap();
+
+        let message_images = parsed["messages"][0]["images"].as_array().unwrap();
+        assert_eq!(message_images.len(), 1);
+        assert_eq!(message_images[0].as_str().unwrap(), base64_image);
+    }
 }