Copy/paste images into editors (Mac only) (#15782)
For future reference: WIP branch of copy/pasting a mixture of images and text: https://github.com/zed-industries/zed/tree/copy-paste-images - we'll come back to that one after landing this one.

Release Notes:

- You can now paste images into the Assistant Panel to include them as context. Currently works only on Mac, and with Anthropic models. Future support is planned for more models, operating systems, and image clipboard operations.

---------

Co-authored-by: Antonio <antonio@zed.dev>
Co-authored-by: Mikayla <mikayla@zed.dev>
Co-authored-by: Jason <jason@zed.dev>
Co-authored-by: Kyle <kylek@zed.dev>
parent e3b0de5dda
commit b1a581e81b
58 changed files with 2983 additions and 1708 deletions
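The shape of the change, in brief: a request message's `content` is no longer a single `String` but a `Vec<MessageContent>`, where each element is either text or a base64-encoded PNG. Below is a minimal sketch of a mixed-content message using the types this commit introduces; the module path and the `screenshot_question` helper are assumptions for illustration, not part of the diff.

```rust
use language_model::{LanguageModelImage, LanguageModelRequestMessage, MessageContent, Role};

// Hypothetical helper for illustration; in Zed the message is assembled from
// the Assistant Panel buffer plus any images pasted from the clipboard.
fn screenshot_question(image: LanguageModelImage) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role: Role::User,
        content: vec![
            MessageContent::Text("What's in this screenshot?".to_string()),
            // Produced from clipboard data by LanguageModelImage::from_image.
            MessageContent::Image(image),
        ],
    }
}
```

Providers without image support in this commit (OpenAI, Google, Ollama, Copilot Chat, as seen in the hunks below) flatten such a message back to a string via `string_contents()`, which skips the image parts.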
@@ -50,6 +50,9 @@ theme.workspace = true
 tiktoken-rs.workspace = true
 ui.workspace = true
 util.workspace = true
+base64.workspace = true
+image.workspace = true
+
 
 [dev-dependencies]
 ctor.workspace = true

@@ -221,24 +221,44 @@ pub fn count_anthropic_tokens(
 ) -> BoxFuture<'static, Result<usize>> {
     cx.background_executor()
         .spawn(async move {
-            let messages = request
-                .messages
-                .into_iter()
-                .map(|message| tiktoken_rs::ChatCompletionRequestMessage {
-                    role: match message.role {
-                        Role::User => "user".into(),
-                        Role::Assistant => "assistant".into(),
-                        Role::System => "system".into(),
-                    },
-                    content: Some(message.content),
-                    name: None,
-                    function_call: None,
-                })
-                .collect::<Vec<_>>();
+            let messages = request.messages;
+            let mut tokens_from_images = 0;
+            let mut string_messages = Vec::with_capacity(messages.len());
+
+            for message in messages {
+                use crate::MessageContent;
+
+                let mut string_contents = String::new();
+
+                for content in message.content {
+                    match content {
+                        MessageContent::Text(string) => {
+                            string_contents.push_str(&string);
+                        }
+                        MessageContent::Image(image) => {
+                            tokens_from_images += image.estimate_tokens();
+                        }
+                    }
+                }
+
+                if !string_contents.is_empty() {
+                    string_messages.push(tiktoken_rs::ChatCompletionRequestMessage {
+                        role: match message.role {
+                            Role::User => "user".into(),
+                            Role::Assistant => "assistant".into(),
+                            Role::System => "system".into(),
+                        },
+                        content: Some(string_contents),
+                        name: None,
+                        function_call: None,
+                    });
+                }
+            }
 
             // Tiktoken doesn't yet support these models, so we manually use the
             // same tokenizer as GPT-4.
-            tiktoken_rs::num_tokens_from_messages("gpt-4", &messages)
+            tiktoken_rs::num_tokens_from_messages("gpt-4", &string_messages)
+                .map(|tokens| tokens + tokens_from_images)
         })
         .boxed()
 }

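In other words, the Anthropic token count is now computed in two buckets: text parts are concatenated per message and run through tiktoken's GPT-4 tokenizer as a stand-in, while each image contributes its `estimate_tokens()` value. A rough worked example of the arithmetic, with an assumed text token count:

```rust
// Assumed: tiktoken reports 12 tokens for the text parts of the message.
let text_tokens = 12;
// One pasted 1024x768 image, per LanguageModelImage::estimate_tokens:
let image_tokens = (1024 * 768) / 750; // = 1048 (integer division)
assert_eq!(text_tokens + image_tokens, 1060);
```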
@@ -193,7 +193,7 @@ impl LanguageModel for CopilotChatLanguageModel {
         cx: &AsyncAppContext,
     ) -> BoxFuture<'static, Result<BoxStream<'static, Result<String>>>> {
         if let Some(message) = request.messages.last() {
-            if message.content.trim().is_empty() {
+            if message.contents_empty() {
                 const EMPTY_PROMPT_MSG: &str =
                     "Empty prompts aren't allowed. Please provide a non-empty prompt.";
                 return futures::future::ready(Err(anyhow::anyhow!(EMPTY_PROMPT_MSG))).boxed();

@@ -270,7 +270,7 @@ impl CopilotChatLanguageModel {
                     Role::Assistant => CopilotChatRole::Assistant,
                     Role::System => CopilotChatRole::System,
                 },
-                content: msg.content,
+                content: msg.string_contents(),
             })
             .collect(),
     )

@@ -182,14 +182,14 @@ impl OllamaLanguageModel {
             .into_iter()
             .map(|msg| match msg.role {
                 Role::User => ChatMessage::User {
-                    content: msg.content,
+                    content: msg.string_contents(),
                 },
                 Role::Assistant => ChatMessage::Assistant {
-                    content: msg.content,
+                    content: msg.string_contents(),
                     tool_calls: None,
                 },
                 Role::System => ChatMessage::System {
-                    content: msg.content,
+                    content: msg.string_contents(),
                 },
             })
             .collect(),

@@ -257,7 +257,7 @@ impl LanguageModel for OllamaLanguageModel {
         let token_count = request
             .messages
             .iter()
-            .map(|msg| msg.content.chars().count())
+            .map(|msg| msg.string_contents().chars().count())
             .sum::<usize>()
             / 4;
 

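Ollama still has no tokenizer hookup here, so the count stays the rough four-characters-per-token heuristic, now applied to the flattened text. For example:

```rust
// 13 characters / 4 = 3 tokens under the heuristic above.
let estimate = "Hello, world!".chars().count() / 4;
assert_eq!(estimate, 3);
```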
@@ -363,7 +363,7 @@ pub fn count_open_ai_tokens(
                 Role::Assistant => "assistant".into(),
                 Role::System => "system".into(),
             },
-            content: Some(message.content),
+            content: Some(message.string_contents()),
             name: None,
             function_call: None,
         })

@@ -1,10 +1,223 @@
+use std::io::{Cursor, Write};
+
 use crate::role::Role;
+use base64::write::EncoderWriter;
+use gpui::{point, size, AppContext, DevicePixels, Image, ObjectFit, RenderImage, Size, Task};
+use image::{codecs::png::PngEncoder, imageops::resize, DynamicImage, ImageDecoder};
 use serde::{Deserialize, Serialize};
+use ui::{px, SharedString};
+use util::ResultExt;
+
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize, Debug, Hash)]
+pub struct LanguageModelImage {
+    // A base64-encoded PNG image
+    pub source: SharedString,
+    size: Size<DevicePixels>,
+}
+
+const ANTHROPIC_SIZE_LIMT: f32 = 1568.0; // Anthropic wants uploaded images to be smaller than this in both dimensions
+
+impl LanguageModelImage {
+    pub fn from_image(data: Image, cx: &mut AppContext) -> Task<Option<Self>> {
+        cx.background_executor().spawn(async move {
+            match data.format() {
+                gpui::ImageFormat::Png
+                | gpui::ImageFormat::Jpeg
+                | gpui::ImageFormat::Webp
+                | gpui::ImageFormat::Gif => {}
+                _ => return None,
+            };
+
+            let image = image::codecs::png::PngDecoder::new(Cursor::new(data.bytes())).log_err()?;
+            let (width, height) = image.dimensions();
+            let image_size = size(DevicePixels(width as i32), DevicePixels(height as i32));
+
+            let mut base64_image = Vec::new();
+
+            {
+                let mut base64_encoder = EncoderWriter::new(
+                    Cursor::new(&mut base64_image),
+                    &base64::engine::general_purpose::STANDARD,
+                );
+
+                if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
+                    || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
+                {
+                    let new_bounds = ObjectFit::ScaleDown.get_bounds(
+                        gpui::Bounds {
+                            origin: point(px(0.0), px(0.0)),
+                            size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
+                        },
+                        image_size,
+                    );
+                    let image = DynamicImage::from_decoder(image).log_err()?.resize(
+                        new_bounds.size.width.0 as u32,
+                        new_bounds.size.height.0 as u32,
+                        image::imageops::FilterType::Triangle,
+                    );
+
+                    let mut png = Vec::new();
+                    image
+                        .write_with_encoder(PngEncoder::new(&mut png))
+                        .log_err()?;
+
+                    base64_encoder.write_all(png.as_slice()).log_err()?;
+                } else {
+                    base64_encoder.write_all(data.bytes()).log_err()?;
+                }
+            }
+
+            // SAFETY: The base64 encoder should not produce non-UTF8.
+            let source = unsafe { String::from_utf8_unchecked(base64_image) };
+
+            Some(LanguageModelImage {
+                size: image_size,
+                source: source.into(),
+            })
+        })
+    }
+
+    /// Resolves this image into an LLM-ready format (base64).
+    pub fn from_render_image(data: &RenderImage) -> Option<Self> {
+        let image_size = data.size(0);
+
+        let mut bytes = data.as_bytes(0).unwrap_or(&[]).to_vec();
+        // Convert from BGRA to RGBA.
+        for pixel in bytes.chunks_exact_mut(4) {
+            pixel.swap(2, 0);
+        }
+        let mut image = image::RgbaImage::from_vec(
+            image_size.width.0 as u32,
+            image_size.height.0 as u32,
+            bytes,
+        )
+        .expect("We already know this works");
+
+        // https://docs.anthropic.com/en/docs/build-with-claude/vision
+        if image_size.width.0 > ANTHROPIC_SIZE_LIMT as i32
+            || image_size.height.0 > ANTHROPIC_SIZE_LIMT as i32
+        {
+            let new_bounds = ObjectFit::ScaleDown.get_bounds(
+                gpui::Bounds {
+                    origin: point(px(0.0), px(0.0)),
+                    size: size(px(ANTHROPIC_SIZE_LIMT), px(ANTHROPIC_SIZE_LIMT)),
+                },
+                image_size,
+            );
+
+            image = resize(
+                &image,
+                new_bounds.size.width.0 as u32,
+                new_bounds.size.height.0 as u32,
+                image::imageops::FilterType::Triangle,
+            );
+        }
+
+        let mut png = Vec::new();
+
+        image
+            .write_with_encoder(PngEncoder::new(&mut png))
+            .log_err()?;
+
+        let mut base64_image = Vec::new();
+
+        {
+            let mut base64_encoder = EncoderWriter::new(
+                Cursor::new(&mut base64_image),
+                &base64::engine::general_purpose::STANDARD,
+            );
+
+            base64_encoder.write_all(png.as_slice()).log_err()?;
+        }
+
+        // SAFETY: The base64 encoder should not produce non-UTF8.
+        let source = unsafe { String::from_utf8_unchecked(base64_image) };
+
+        Some(LanguageModelImage {
+            size: image_size,
+            source: source.into(),
+        })
+    }
+
+    pub fn estimate_tokens(&self) -> usize {
+        let width = self.size.width.0.unsigned_abs() as usize;
+        let height = self.size.height.0.unsigned_abs() as usize;
+
+        // From https://docs.anthropic.com/en/docs/build-with-claude/vision#calculate-image-costs
+        // Note that there are a lot of conditions on Anthropic's API, and OpenAI doesn't use this,
+        // so this method is more of a rough guess.
+        (width * height) / 750
+    }
+}
+
+#[derive(Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
+pub enum MessageContent {
+    Text(String),
+    Image(LanguageModelImage),
+}
+
+impl std::fmt::Debug for MessageContent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            MessageContent::Text(t) => f.debug_struct("MessageContent").field("text", t).finish(),
+            MessageContent::Image(i) => f
+                .debug_struct("MessageContent")
+                .field("image", &i.source.len())
+                .finish(),
+        }
+    }
+}
+
+impl MessageContent {
+    pub fn as_string(&self) -> &str {
+        match self {
+            MessageContent::Text(s) => s.as_str(),
+            MessageContent::Image(_) => "",
+        }
+    }
+}
+
+impl From<String> for MessageContent {
+    fn from(value: String) -> Self {
+        MessageContent::Text(value)
+    }
+}
+
+impl From<&str> for MessageContent {
+    fn from(value: &str) -> Self {
+        MessageContent::Text(value.to_string())
+    }
+}
+
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Hash)]
 pub struct LanguageModelRequestMessage {
     pub role: Role,
-    pub content: String,
+    pub content: Vec<MessageContent>,
 }
 
+impl LanguageModelRequestMessage {
+    pub fn string_contents(&self) -> String {
+        let mut string_buffer = String::new();
+        for string in self.content.iter().filter_map(|content| match content {
+            MessageContent::Text(s) => Some(s),
+            MessageContent::Image(_) => None,
+        }) {
+            string_buffer.push_str(string.as_str())
+        }
+        string_buffer
+    }
+
+    pub fn contents_empty(&self) -> bool {
+        self.content.is_empty()
+            || self
+                .content
+                .get(0)
+                .map(|content| match content {
+                    MessageContent::Text(s) => s.is_empty(),
+                    MessageContent::Image(_) => true,
+                })
+                .unwrap_or(false)
+    }
+}
+
 #[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)]

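To put the `estimate_tokens` formula and the 1568-pixel cap in perspective: both `from_image` and `from_render_image` scale images down so that neither dimension exceeds 1568 pixels, which also bounds the per-image token cost of the formula above. A couple of worked values, using integer division as the code does (these illustrate only this method's rough estimate, not Anthropic's full pricing rules):

```rust
// Largest dimensions this code keeps without downscaling:
assert_eq!((1568 * 1568) / 750, 3278);
// A typical 800x600 screenshot:
assert_eq!((800 * 600) / 750, 640);
```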
@@ -23,14 +236,14 @@ impl LanguageModelRequest {
             .into_iter()
             .map(|msg| match msg.role {
                 Role::User => open_ai::RequestMessage::User {
-                    content: msg.content,
+                    content: msg.string_contents(),
                 },
                 Role::Assistant => open_ai::RequestMessage::Assistant {
-                    content: Some(msg.content),
+                    content: Some(msg.string_contents()),
                     tool_calls: Vec::new(),
                 },
                 Role::System => open_ai::RequestMessage::System {
-                    content: msg.content,
+                    content: msg.string_contents(),
                 },
             })
             .collect(),

@@ -51,7 +264,7 @@ impl LanguageModelRequest {
             .into_iter()
             .map(|msg| google_ai::Content {
                 parts: vec![google_ai::Part::TextPart(google_ai::TextPart {
-                    text: msg.content,
+                    text: msg.string_contents(),
                 })],
                 role: match msg.role {
                     Role::User => google_ai::Role::User,

@@ -77,7 +290,7 @@ impl LanguageModelRequest {
         let mut system_message = String::new();
 
         for message in self.messages {
-            if message.content.is_empty() {
+            if message.contents_empty() {
                 continue;
             }
 

@@ -85,8 +298,11 @@ impl LanguageModelRequest {
                 Role::User | Role::Assistant => {
                     if let Some(last_message) = new_messages.last_mut() {
                         if last_message.role == message.role {
-                            last_message.content.push_str("\n\n");
-                            last_message.content.push_str(&message.content);
+                            // TODO: is this append done properly?
+                            last_message.content.push(MessageContent::Text(format!(
+                                "\n\n{}",
+                                message.string_contents()
+                            )));
                             continue;
                         }
                     }

@@ -97,7 +313,7 @@ impl LanguageModelRequest {
                     if !system_message.is_empty() {
                         system_message.push_str("\n\n");
                     }
-                    system_message.push_str(&message.content);
+                    system_message.push_str(&message.string_contents());
                 }
             }
         }

@@ -113,9 +329,24 @@ impl LanguageModelRequest {
                     Role::Assistant => anthropic::Role::Assistant,
                     Role::System => return None,
                 },
-                content: vec![anthropic::Content::Text {
-                    text: message.content,
-                }],
+                content: message
+                    .content
+                    .into_iter()
+                    // TODO: filter out the empty messages in the message construction step
+                    .filter_map(|content| match content {
+                        MessageContent::Text(t) if !t.is_empty() => {
+                            Some(anthropic::Content::Text { text: t })
+                        }
+                        MessageContent::Image(i) => Some(anthropic::Content::Image {
+                            source: anthropic::ImageSource {
+                                source_type: "base64".to_string(),
+                                media_type: "image/png".to_string(),
+                                data: i.source.to_string(),
+                            },
+                        }),
+                        _ => None,
+                    })
+                    .collect(),
             })
         })
         .collect(),
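This `filter_map` is what finally puts a pasted image on the wire for Anthropic: each image part becomes a base64 `Content::Image` block alongside any non-empty text blocks. A sketch of the value it produces for a text-plus-image message, using the same `anthropic` types as the hunk above (the `base64_png` placeholder stands in for the real `LanguageModelImage::source` payload):

```rust
// Placeholder for the actual base64-encoded PNG data.
let base64_png = String::from("iVBORw0KGgo...");
let content = vec![
    anthropic::Content::Text {
        text: "What's in this screenshot?".to_string(),
    },
    anthropic::Content::Image {
        source: anthropic::ImageSource {
            source_type: "base64".to_string(),
            media_type: "image/png".to_string(),
            data: base64_png,
        },
    },
];
```

Serialized, this matches the `{"type": "image", "source": {...}}` content blocks described in Anthropic's vision documentation, which is why only Anthropic models receive the image parts in this commit.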