diff --git a/crates/copilot/src/copilot_chat.rs b/crates/copilot/src/copilot_chat.rs index 536872b0d1..1d5baff286 100644 --- a/crates/copilot/src/copilot_chat.rs +++ b/crates/copilot/src/copilot_chat.rs @@ -96,6 +96,10 @@ struct ModelSupportedFeatures { streaming: bool, #[serde(default)] tool_calls: bool, + #[serde(default)] + parallel_tool_calls: bool, + #[serde(default)] + vision: bool, } #[derive(Clone, Copy, Serialize, Deserialize, Debug, Eq, PartialEq)] @@ -107,6 +111,20 @@ pub enum ModelVendor { Anthropic, } +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] +#[serde(tag = "type")] +pub enum ChatMessageContent { + #[serde(rename = "text")] + Text { text: String }, + #[serde(rename = "image_url")] + Image { image_url: ImageUrl }, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] +pub struct ImageUrl { + pub url: String, +} + impl Model { pub fn uses_streaming(&self) -> bool { self.capabilities.supports.streaming @@ -131,6 +149,14 @@ impl Model { pub fn vendor(&self) -> ModelVendor { self.vendor } + + pub fn supports_vision(&self) -> bool { + self.capabilities.supports.vision + } + + pub fn supports_parallel_tool_calls(&self) -> bool { + self.capabilities.supports.parallel_tool_calls + } } #[derive(Serialize, Deserialize)] @@ -177,7 +203,7 @@ pub enum ChatMessage { tool_calls: Vec, }, User { - content: String, + content: Vec, }, System { content: String, @@ -536,7 +562,8 @@ async fn stream_completion( ) .header("Authorization", format!("Bearer {}", api_key)) .header("Content-Type", "application/json") - .header("Copilot-Integration-Id", "vscode-chat"); + .header("Copilot-Integration-Id", "vscode-chat") + .header("Copilot-Vision-Request", "true"); let is_streaming = request.stream; diff --git a/crates/language_model/src/request.rs b/crates/language_model/src/request.rs index 55263b743c..11befb5101 100644 --- a/crates/language_model/src/request.rs +++ b/crates/language_model/src/request.rs @@ -104,6 +104,10 @@ impl LanguageModelImage { // so this method is more of a rough guess. (width * height) / 750 } + + pub fn to_base64_url(&self) -> String { + format!("data:image/png;base64,{}", self.source) + } } fn encode_as_base64(data: Arc, image: image::DynamicImage) -> Result> { diff --git a/crates/language_models/src/provider/copilot_chat.rs b/crates/language_models/src/provider/copilot_chat.rs index c33c3b1d9c..82a2501022 100644 --- a/crates/language_models/src/provider/copilot_chat.rs +++ b/crates/language_models/src/provider/copilot_chat.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use anyhow::{Result, anyhow}; use collections::HashMap; use copilot::copilot_chat::{ - ChatMessage, CopilotChat, Model as CopilotChatModel, ModelVendor, + ChatMessage, ChatMessageContent, CopilotChat, ImageUrl, Model as CopilotChatModel, ModelVendor, Request as CopilotChatRequest, ResponseEvent, Tool, ToolCall, }; use copilot::{Copilot, Status}; @@ -444,23 +444,6 @@ impl CopilotChatLanguageModel { let mut tool_called = false; let mut messages: Vec = Vec::new(); for message in request_messages { - let text_content = { - let mut buffer = String::new(); - for string in message.content.iter().filter_map(|content| match content { - MessageContent::Text(text) | MessageContent::Thinking { text, .. } => { - Some(text.as_str()) - } - MessageContent::ToolUse(_) - | MessageContent::RedactedThinking(_) - | MessageContent::ToolResult(_) - | MessageContent::Image(_) => None, - }) { - buffer.push_str(string); - } - - buffer - }; - match message.role { Role::User => { for content in &message.content { @@ -472,9 +455,36 @@ impl CopilotChatLanguageModel { } } - if !text_content.is_empty() { + let mut content_parts = Vec::new(); + for content in &message.content { + match content { + MessageContent::Text(text) | MessageContent::Thinking { text, .. } + if !text.is_empty() => + { + if let Some(ChatMessageContent::Text { text: text_content }) = + content_parts.last_mut() + { + text_content.push_str(text); + } else { + content_parts.push(ChatMessageContent::Text { + text: text.to_string(), + }); + } + } + MessageContent::Image(image) if self.model.supports_vision() => { + content_parts.push(ChatMessageContent::Image { + image_url: ImageUrl { + url: image.to_base64_url(), + }, + }); + } + _ => {} + } + } + + if !content_parts.is_empty() { messages.push(ChatMessage::User { - content: text_content, + content: content_parts, }); } } @@ -495,6 +505,23 @@ impl CopilotChatLanguageModel { } } + let text_content = { + let mut buffer = String::new(); + for string in message.content.iter().filter_map(|content| match content { + MessageContent::Text(text) | MessageContent::Thinking { text, .. } => { + Some(text.as_str()) + } + MessageContent::ToolUse(_) + | MessageContent::RedactedThinking(_) + | MessageContent::ToolResult(_) + | MessageContent::Image(_) => None, + }) { + buffer.push_str(string); + } + + buffer + }; + messages.push(ChatMessage::Assistant { content: if text_content.is_empty() { None