Have read_file support images (#30435)

This is very basic support for them. There are a number of other TODOs
before this is really a first-class supported feature, so not adding any
release notes for it; for now, this PR just makes it so that if
read_file tries to read a PNG (which has come up in practice), it at
least correctly sends it to Anthropic instead of messing up.

This also lays the groundwork for future PRs for more first-class
support for images in tool calls across more image file formats and LLM
providers.

Release Notes:

- N/A

---------

Co-authored-by: Agus Zubiaga <hi@aguz.me>
Co-authored-by: Agus Zubiaga <agus@zed.dev>
This commit is contained in:
Richard Feldman 2025-05-13 10:58:00 +02:00 committed by GitHub
parent f01af006e1
commit 8fdf309a4a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
30 changed files with 557 additions and 194 deletions

View file

@ -157,6 +157,10 @@ impl LanguageModel for FakeLanguageModel {
false
}
fn supports_images(&self) -> bool {
false
}
fn telemetry_id(&self) -> String {
"fake".to_string()
}

View file

@ -243,6 +243,9 @@ pub trait LanguageModel: Send + Sync {
LanguageModelAvailability::Public
}
/// Whether this model supports images
fn supports_images(&self) -> bool;
/// Whether this model supports tools.
fn supports_tools(&self) -> bool;

View file

@ -21,6 +21,16 @@ pub struct LanguageModelImage {
size: Size<DevicePixels>,
}
impl LanguageModelImage {
pub fn len(&self) -> usize {
self.source.len()
}
pub fn is_empty(&self) -> bool {
self.source.is_empty()
}
}
impl std::fmt::Debug for LanguageModelImage {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("LanguageModelImage")
@ -134,10 +144,45 @@ pub struct LanguageModelToolResult {
pub tool_use_id: LanguageModelToolUseId,
pub tool_name: Arc<str>,
pub is_error: bool,
pub content: Arc<str>,
pub content: LanguageModelToolResultContent,
pub output: Option<serde_json::Value>,
}
#[derive(Debug, Clone, Deserialize, Serialize, Eq, PartialEq, Hash)]
#[serde(untagged)]
pub enum LanguageModelToolResultContent {
Text(Arc<str>),
Image(LanguageModelImage),
}
impl LanguageModelToolResultContent {
pub fn to_str(&self) -> Option<&str> {
match self {
Self::Text(text) => Some(&text),
Self::Image(_) => None,
}
}
pub fn is_empty(&self) -> bool {
match self {
Self::Text(text) => text.chars().all(|c| c.is_whitespace()),
Self::Image(_) => false,
}
}
}
impl From<&str> for LanguageModelToolResultContent {
fn from(value: &str) -> Self {
Self::Text(Arc::from(value))
}
}
impl From<String> for LanguageModelToolResultContent {
fn from(value: String) -> Self {
Self::Text(Arc::from(value))
}
}
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
pub enum MessageContent {
Text(String),
@ -151,6 +196,29 @@ pub enum MessageContent {
ToolResult(LanguageModelToolResult),
}
impl MessageContent {
pub fn to_str(&self) -> Option<&str> {
match self {
MessageContent::Text(text) => Some(text.as_str()),
MessageContent::Thinking { text, .. } => Some(text.as_str()),
MessageContent::RedactedThinking(_) => None,
MessageContent::ToolResult(tool_result) => tool_result.content.to_str(),
MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
}
}
pub fn is_empty(&self) -> bool {
match self {
MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
MessageContent::Thinking { text, .. } => text.chars().all(|c| c.is_whitespace()),
MessageContent::ToolResult(tool_result) => tool_result.content.is_empty(),
MessageContent::RedactedThinking(_)
| MessageContent::ToolUse(_)
| MessageContent::Image(_) => false,
}
}
}
impl From<String> for MessageContent {
fn from(value: String) -> Self {
MessageContent::Text(value)
@ -173,13 +241,7 @@ pub struct LanguageModelRequestMessage {
impl LanguageModelRequestMessage {
pub fn string_contents(&self) -> String {
let mut buffer = String::new();
for string in self.content.iter().filter_map(|content| match content {
MessageContent::Text(text) => Some(text.as_str()),
MessageContent::Thinking { text, .. } => Some(text.as_str()),
MessageContent::RedactedThinking(_) => None,
MessageContent::ToolResult(tool_result) => Some(tool_result.content.as_ref()),
MessageContent::ToolUse(_) | MessageContent::Image(_) => None,
}) {
for string in self.content.iter().filter_map(|content| content.to_str()) {
buffer.push_str(string);
}
@ -187,16 +249,7 @@ impl LanguageModelRequestMessage {
}
pub fn contents_empty(&self) -> bool {
self.content.iter().all(|content| match content {
MessageContent::Text(text) => text.chars().all(|c| c.is_whitespace()),
MessageContent::Thinking { text, .. } => text.chars().all(|c| c.is_whitespace()),
MessageContent::ToolResult(tool_result) => {
tool_result.content.chars().all(|c| c.is_whitespace())
}
MessageContent::RedactedThinking(_)
| MessageContent::ToolUse(_)
| MessageContent::Image(_) => false,
})
self.content.iter().all(|content| content.is_empty())
}
}