Have read_file support images (#30435)
This adds very basic image support to read_file. There are a number of other TODOs before this is really a first-class supported feature, so this PR does not add any release notes for it. For now, it just makes it so that if read_file tries to read a PNG (which has come up in practice), it at least correctly sends the image to Anthropic instead of messing up. This also lays the groundwork for future PRs that add more first-class support for images in tool calls across more image file formats and LLM providers.

Release Notes:

- N/A

---------

Co-authored-by: Agus Zubiaga <hi@aguz.me>
Co-authored-by: Agus Zubiaga <agus@zed.dev>
This commit is contained in:
parent
f01af006e1
commit
8fdf309a4a
30 changed files with 557 additions and 194 deletions
|
@ -1,6 +1,9 @@
|
|||
use crate::AllLanguageModelSettings;
|
||||
use crate::ui::InstructionListItem;
|
||||
use anthropic::{AnthropicError, AnthropicModelMode, ContentDelta, Event, ResponseContent, Usage};
|
||||
use anthropic::{
|
||||
AnthropicError, AnthropicModelMode, ContentDelta, Event, ResponseContent, ToolResultContent,
|
||||
ToolResultPart, Usage,
|
||||
};
|
||||
use anyhow::{Context as _, Result, anyhow};
|
||||
use collections::{BTreeMap, HashMap};
|
||||
use credentials_provider::CredentialsProvider;
|
||||
|
@ -15,8 +18,8 @@ use language_model::{
|
|||
AuthenticateError, LanguageModel, LanguageModelCacheConfiguration,
|
||||
LanguageModelCompletionError, LanguageModelId, LanguageModelKnownError, LanguageModelName,
|
||||
LanguageModelProvider, LanguageModelProviderId, LanguageModelProviderName,
|
||||
LanguageModelProviderState, LanguageModelRequest, LanguageModelToolChoice, MessageContent,
|
||||
RateLimiter, Role,
|
||||
LanguageModelProviderState, LanguageModelRequest, LanguageModelToolChoice,
|
||||
LanguageModelToolResultContent, MessageContent, RateLimiter, Role,
|
||||
};
|
||||
use language_model::{LanguageModelCompletionEvent, LanguageModelToolUse, StopReason};
|
||||
use schemars::JsonSchema;
|
||||
|
@ -346,9 +349,14 @@ pub fn count_anthropic_tokens(
|
|||
MessageContent::ToolUse(_tool_use) => {
|
||||
// TODO: Estimate token usage from tool uses.
|
||||
}
|
||||
MessageContent::ToolResult(tool_result) => {
|
||||
string_contents.push_str(&tool_result.content);
|
||||
}
|
||||
MessageContent::ToolResult(tool_result) => match &tool_result.content {
|
||||
LanguageModelToolResultContent::Text(txt) => {
|
||||
string_contents.push_str(txt);
|
||||
}
|
||||
LanguageModelToolResultContent::Image(image) => {
|
||||
tokens_from_images += image.estimate_tokens();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -421,6 +429,10 @@ impl LanguageModel for AnthropicModel {
|
|||
true
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto
|
||||
|
@ -575,7 +587,20 @@ pub fn into_anthropic(
|
|||
Some(anthropic::RequestContent::ToolResult {
|
||||
tool_use_id: tool_result.tool_use_id.to_string(),
|
||||
is_error: tool_result.is_error,
|
||||
content: tool_result.content.to_string(),
|
||||
content: match tool_result.content {
|
||||
LanguageModelToolResultContent::Text(text) => {
|
||||
ToolResultContent::JustText(text.to_string())
|
||||
}
|
||||
LanguageModelToolResultContent::Image(image) => {
|
||||
ToolResultContent::Multipart(vec![ToolResultPart::Image {
|
||||
source: anthropic::ImageSource {
|
||||
source_type: "base64".to_string(),
|
||||
media_type: "image/png".to_string(),
|
||||
data: image.source.to_string(),
|
||||
},
|
||||
}])
|
||||
}
|
||||
},
|
||||
cache_control,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -36,7 +36,8 @@ use language_model::{
|
|||
LanguageModelCompletionError, LanguageModelCompletionEvent, LanguageModelId, LanguageModelName,
|
||||
LanguageModelProvider, LanguageModelProviderId, LanguageModelProviderName,
|
||||
LanguageModelProviderState, LanguageModelRequest, LanguageModelToolChoice,
|
||||
LanguageModelToolUse, MessageContent, RateLimiter, Role, TokenUsage,
|
||||
LanguageModelToolResultContent, LanguageModelToolUse, MessageContent, RateLimiter, Role,
|
||||
TokenUsage,
|
||||
};
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
@ -490,6 +491,10 @@ impl LanguageModel for BedrockModel {
|
|||
self.model.supports_tool_use()
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto | LanguageModelToolChoice::Any => {
|
||||
|
@ -635,9 +640,17 @@ pub fn into_bedrock(
|
|||
MessageContent::ToolResult(tool_result) => {
|
||||
BedrockToolResultBlock::builder()
|
||||
.tool_use_id(tool_result.tool_use_id.to_string())
|
||||
.content(BedrockToolResultContentBlock::Text(
|
||||
tool_result.content.to_string(),
|
||||
))
|
||||
.content(match tool_result.content {
|
||||
LanguageModelToolResultContent::Text(text) => {
|
||||
BedrockToolResultContentBlock::Text(text.to_string())
|
||||
}
|
||||
LanguageModelToolResultContent::Image(_) => {
|
||||
BedrockToolResultContentBlock::Text(
|
||||
// TODO: Bedrock image support
|
||||
"[Tool responded with an image, but Zed doesn't support these in Bedrock models yet]".to_string()
|
||||
)
|
||||
}
|
||||
})
|
||||
.status({
|
||||
if tool_result.is_error {
|
||||
BedrockToolResultStatus::Error
|
||||
|
@ -762,9 +775,14 @@ pub fn get_bedrock_tokens(
|
|||
MessageContent::ToolUse(_tool_use) => {
|
||||
// TODO: Estimate token usage from tool uses.
|
||||
}
|
||||
MessageContent::ToolResult(tool_result) => {
|
||||
string_contents.push_str(&tool_result.content);
|
||||
}
|
||||
MessageContent::ToolResult(tool_result) => match tool_result.content {
|
||||
LanguageModelToolResultContent::Text(text) => {
|
||||
string_contents.push_str(&text);
|
||||
}
|
||||
LanguageModelToolResultContent::Image(image) => {
|
||||
tokens_from_images += image.estimate_tokens();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -686,6 +686,14 @@ impl LanguageModel for CloudLanguageModel {
|
|||
}
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
match self.model {
|
||||
CloudModel::Anthropic(_) => true,
|
||||
CloudModel::Google(_) => true,
|
||||
CloudModel::OpenAi(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto
|
||||
|
|
|
@ -5,8 +5,9 @@ use std::sync::Arc;
|
|||
use anyhow::{Result, anyhow};
|
||||
use collections::HashMap;
|
||||
use copilot::copilot_chat::{
|
||||
ChatMessage, ChatMessageContent, CopilotChat, ImageUrl, Model as CopilotChatModel, ModelVendor,
|
||||
Request as CopilotChatRequest, ResponseEvent, Tool, ToolCall,
|
||||
ChatMessage, ChatMessageContent, ChatMessagePart, CopilotChat, ImageUrl,
|
||||
Model as CopilotChatModel, ModelVendor, Request as CopilotChatRequest, ResponseEvent, Tool,
|
||||
ToolCall,
|
||||
};
|
||||
use copilot::{Copilot, Status};
|
||||
use futures::future::BoxFuture;
|
||||
|
@ -20,12 +21,14 @@ use language_model::{
|
|||
AuthenticateError, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
|
||||
LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
|
||||
LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest,
|
||||
LanguageModelRequestMessage, LanguageModelToolChoice, LanguageModelToolSchemaFormat,
|
||||
LanguageModelToolUse, MessageContent, RateLimiter, Role, StopReason,
|
||||
LanguageModelRequestMessage, LanguageModelToolChoice, LanguageModelToolResultContent,
|
||||
LanguageModelToolSchemaFormat, LanguageModelToolUse, MessageContent, RateLimiter, Role,
|
||||
StopReason,
|
||||
};
|
||||
use settings::SettingsStore;
|
||||
use std::time::Duration;
|
||||
use ui::prelude::*;
|
||||
use util::debug_panic;
|
||||
|
||||
use super::anthropic::count_anthropic_tokens;
|
||||
use super::google::count_google_tokens;
|
||||
|
@ -198,6 +201,10 @@ impl LanguageModel for CopilotChatLanguageModel {
|
|||
self.model.supports_tools()
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
self.model.supports_vision()
|
||||
}
|
||||
|
||||
fn tool_input_format(&self) -> LanguageModelToolSchemaFormat {
|
||||
match self.model.vendor() {
|
||||
ModelVendor::OpenAI | ModelVendor::Anthropic => {
|
||||
|
@ -447,9 +454,28 @@ fn into_copilot_chat(
|
|||
Role::User => {
|
||||
for content in &message.content {
|
||||
if let MessageContent::ToolResult(tool_result) = content {
|
||||
let content = match &tool_result.content {
|
||||
LanguageModelToolResultContent::Text(text) => text.to_string().into(),
|
||||
LanguageModelToolResultContent::Image(image) => {
|
||||
if model.supports_vision() {
|
||||
ChatMessageContent::Multipart(vec![ChatMessagePart::Image {
|
||||
image_url: ImageUrl {
|
||||
url: image.to_base64_url(),
|
||||
},
|
||||
}])
|
||||
} else {
|
||||
debug_panic!(
|
||||
"This should be caught at {} level",
|
||||
tool_result.tool_name
|
||||
);
|
||||
"[Tool responded with an image, but this model does not support vision]".to_string().into()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
messages.push(ChatMessage::Tool {
|
||||
tool_call_id: tool_result.tool_use_id.to_string(),
|
||||
content: tool_result.content.to_string(),
|
||||
content,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -460,18 +486,18 @@ fn into_copilot_chat(
|
|||
MessageContent::Text(text) | MessageContent::Thinking { text, .. }
|
||||
if !text.is_empty() =>
|
||||
{
|
||||
if let Some(ChatMessageContent::Text { text: text_content }) =
|
||||
if let Some(ChatMessagePart::Text { text: text_content }) =
|
||||
content_parts.last_mut()
|
||||
{
|
||||
text_content.push_str(text);
|
||||
} else {
|
||||
content_parts.push(ChatMessageContent::Text {
|
||||
content_parts.push(ChatMessagePart::Text {
|
||||
text: text.to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
MessageContent::Image(image) if model.supports_vision() => {
|
||||
content_parts.push(ChatMessageContent::Image {
|
||||
content_parts.push(ChatMessagePart::Image {
|
||||
image_url: ImageUrl {
|
||||
url: image.to_base64_url(),
|
||||
},
|
||||
|
@ -483,7 +509,7 @@ fn into_copilot_chat(
|
|||
|
||||
if !content_parts.is_empty() {
|
||||
messages.push(ChatMessage::User {
|
||||
content: content_parts,
|
||||
content: content_parts.into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -523,9 +549,9 @@ fn into_copilot_chat(
|
|||
|
||||
messages.push(ChatMessage::Assistant {
|
||||
content: if text_content.is_empty() {
|
||||
None
|
||||
ChatMessageContent::empty()
|
||||
} else {
|
||||
Some(text_content)
|
||||
text_content.into()
|
||||
},
|
||||
tool_calls,
|
||||
});
|
||||
|
|
|
@ -287,6 +287,10 @@ impl LanguageModel for DeepSeekLanguageModel {
|
|||
false
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn telemetry_id(&self) -> String {
|
||||
format!("deepseek/{}", self.model.id())
|
||||
}
|
||||
|
|
|
@ -313,6 +313,10 @@ impl LanguageModel for GoogleLanguageModel {
|
|||
true
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto
|
||||
|
|
|
@ -285,6 +285,10 @@ impl LanguageModel for LmStudioLanguageModel {
|
|||
false
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, _choice: LanguageModelToolChoice) -> bool {
|
||||
false
|
||||
}
|
||||
|
|
|
@ -303,6 +303,10 @@ impl LanguageModel for MistralLanguageModel {
|
|||
false
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, _choice: LanguageModelToolChoice) -> bool {
|
||||
false
|
||||
}
|
||||
|
|
|
@ -325,6 +325,10 @@ impl LanguageModel for OllamaLanguageModel {
|
|||
self.model.supports_tools.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto => false,
|
||||
|
|
|
@ -12,7 +12,8 @@ use language_model::{
|
|||
AuthenticateError, LanguageModel, LanguageModelCompletionError, LanguageModelCompletionEvent,
|
||||
LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
|
||||
LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest,
|
||||
LanguageModelToolChoice, LanguageModelToolUse, MessageContent, RateLimiter, Role, StopReason,
|
||||
LanguageModelToolChoice, LanguageModelToolResultContent, LanguageModelToolUse, MessageContent,
|
||||
RateLimiter, Role, StopReason,
|
||||
};
|
||||
use open_ai::{Model, ResponseStreamEvent, stream_completion};
|
||||
use schemars::JsonSchema;
|
||||
|
@ -295,6 +296,10 @@ impl LanguageModel for OpenAiLanguageModel {
|
|||
true
|
||||
}
|
||||
|
||||
fn supports_images(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn supports_tool_choice(&self, choice: LanguageModelToolChoice) -> bool {
|
||||
match choice {
|
||||
LanguageModelToolChoice::Auto => true,
|
||||
|
@ -392,8 +397,16 @@ pub fn into_open_ai(
|
|||
}
|
||||
}
|
||||
MessageContent::ToolResult(tool_result) => {
|
||||
let content = match &tool_result.content {
|
||||
LanguageModelToolResultContent::Text(text) => text.to_string(),
|
||||
LanguageModelToolResultContent::Image(_) => {
|
||||
// TODO: Open AI image support
|
||||
"[Tool responded with an image, but Zed doesn't support these in Open AI models yet]".to_string()
|
||||
}
|
||||
};
|
||||
|
||||
messages.push(open_ai::RequestMessage::Tool {
|
||||
content: tool_result.content.to_string(),
|
||||
content,
|
||||
tool_call_id: tool_result.tool_use_id.to_string(),
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue