Code block evals (#29619)

Add a targeted eval for code block formatting, and revise the system prompt accordingly.

### Eval before, n=8

<img width="728" alt="eval before" src="https://github.com/user-attachments/assets/552b6146-3d26-4eaa-86f9-9fc36c0cadf2" />

### Eval after prompt change, n=8 (excluding the new evals, so just testing the prompt change)

<img width="717" alt="eval after" src="https://github.com/user-attachments/assets/c78c7a54-4c65-470c-b135-8691584cd73e" />

Release Notes:

- N/A
2025-04-29 18:52:09 -04:00 · 2025-04-29 18:52:09 -04:00 · d7004030b3
commit d7004030b3
parent 2508e491d5
10 changed files with 536 additions and 134 deletions
--- a/crates/eval/Cargo.toml
+++ b/crates/eval/Cargo.toml
@ -44,6 +44,7 @@ language_extension.workspace = true
 language_model.workspace = true
 language_models.workspace = true
 languages = { workspace = true, features = ["load-grammars"] }
+markdown.workspace = true
 node_runtime.workspace = true
 pathdiff.workspace = true
 paths.workspace = true
--- a/crates/eval/src/example.rs
+++ b/crates/eval/src/example.rs
@ -10,13 +10,13 @@ use crate::{
    ToolMetrics,
    assertions::{AssertionsReport, RanAssertion, RanAssertionResult},
 };
-use agent::{ContextLoadResult, ThreadEvent};
+use agent::{ContextLoadResult, Thread, ThreadEvent};
 use anyhow::{Result, anyhow};
 use async_trait::async_trait;
 use buffer_diff::DiffHunkStatus;
 use collections::HashMap;
 use futures::{FutureExt as _, StreamExt, channel::mpsc, select_biased};
-use gpui::{AppContext, AsyncApp, Entity};
+use gpui::{App, AppContext, AsyncApp, Entity};
 use language_model::{LanguageModel, Role, StopReason};

 pub const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
@ -314,7 +314,7 @@ impl ExampleContext {
            for message in thread.messages().skip(message_count_before) {
                messages.push(Message {
                    _role: message.role,
-                    _text: message.to_string(),
+                    text: message.to_string(),
                    tool_use: thread
                        .tool_uses_for_message(message.id, cx)
                        .into_iter()
@ -362,6 +362,90 @@ impl ExampleContext {
            })
            .unwrap()
    }
+
+    /// Returns a cloned handle to the agent thread entity held by this context.
+    pub fn agent_thread(&self) -> Entity<Thread> {
+        Entity::clone(&self.agent_thread)
+    }
+}
+
+/// Lets an `ExampleContext` be passed wherever a gpui `AppContext` is expected
+/// (for example as the `cx` argument of `Entity::update`).
+///
+/// Every method simply forwards to the same-named method on `self.app`; no
+/// additional logic is layered on top.
+impl AppContext for ExampleContext {
+    // All fallible operations surface as `anyhow::Result`.
+    type Result<T> = anyhow::Result<T>;
+
+    /// Creates a new entity by forwarding `build_entity` to `self.app`.
+    fn new<T: 'static>(
+        &mut self,
+        build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
+    ) -> Self::Result<Entity<T>> {
+        self.app.new(build_entity)
+    }
+
+    /// Reserves an entity slot via `self.app`.
+    fn reserve_entity<T: 'static>(&mut self) -> Self::Result<gpui::Reservation<T>> {
+        self.app.reserve_entity()
+    }
+
+    /// Fills a previously obtained `reservation` via `self.app`.
+    fn insert_entity<T: 'static>(
+        &mut self,
+        reservation: gpui::Reservation<T>,
+        build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
+    ) -> Self::Result<Entity<T>> {
+        self.app.insert_entity(reservation, build_entity)
+    }
+
+    /// Runs `update` against the entity behind `handle` via `self.app` and
+    /// returns the closure's result.
+    fn update_entity<T, R>(
+        &mut self,
+        handle: &Entity<T>,
+        update: impl FnOnce(&mut T, &mut gpui::Context<T>) -> R,
+    ) -> Self::Result<R>
+    where
+        T: 'static,
+    {
+        self.app.update_entity(handle, update)
+    }
+
+    /// Runs `read` against the entity behind `handle` via `self.app` and
+    /// returns the closure's result.
+    fn read_entity<T, R>(
+        &self,
+        handle: &Entity<T>,
+        read: impl FnOnce(&T, &App) -> R,
+    ) -> Self::Result<R>
+    where
+        T: 'static,
+    {
+        self.app.read_entity(handle, read)
+    }
+
+    /// Forwards a window update to `self.app`.
+    fn update_window<T, F>(&mut self, window: gpui::AnyWindowHandle, f: F) -> Result<T>
+    where
+        F: FnOnce(gpui::AnyView, &mut gpui::Window, &mut App) -> T,
+    {
+        self.app.update_window(window, f)
+    }
+
+    /// Forwards a window read to `self.app`.
+    fn read_window<T, R>(
+        &self,
+        window: &gpui::WindowHandle<T>,
+        read: impl FnOnce(Entity<T>, &App) -> R,
+    ) -> Result<R>
+    where
+        T: 'static,
+    {
+        self.app.read_window(window, read)
+    }
+
+    /// Hands `future` to `self.app.background_spawn` and returns the resulting task.
+    fn background_spawn<R>(
+        &self,
+        future: impl std::future::Future<Output = R> + Send + 'static,
+    ) -> gpui::Task<R>
+    where
+        R: Send + 'static,
+    {
+        self.app.background_spawn(future)
+    }
+
+    /// Reads the global of type `G` via `self.app` and returns the callback's result.
+    fn read_global<G, R>(&self, callback: impl FnOnce(&G, &App) -> R) -> Self::Result<R>
+    where
+        G: gpui::Global,
+    {
+        self.app.read_global(callback)
+    }
 }

 #[derive(Debug)]
@ -391,12 +475,16 @@ impl Response {
    pub fn tool_uses(&self) -> impl Iterator<Item = &ToolUse> {
        self.messages.iter().flat_map(|msg| &msg.tool_use)
    }
+
+    /// Yields an owned copy of every message's text, in message order.
+    pub fn texts(&self) -> impl Iterator<Item = String> {
+        let messages = self.messages.iter();
+        messages.map(|msg| msg.text.to_owned())
+    }
 }

 #[derive(Debug)]
 pub struct Message {
    _role: Role,
-    _text: String,
+    text: String,
    tool_use: Vec<ToolUse>,
 }

--- a/crates/eval/src/examples/code_block_citations.rs
+++ b/crates/eval/src/examples/code_block_citations.rs
@ -0,0 +1,191 @@
+use anyhow::Result;
+use async_trait::async_trait;
+use markdown::PathWithRange;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
+
+/// Eval example that asks the agent for code snippets and then checks how the
+/// fenced code blocks in its replies are cited.
+pub struct CodeBlockCitations;
+
+/// The three-backtick delimiter searched for when locating code blocks.
+const FENCE: &str = "```";
+
+#[async_trait(?Send)]
+impl Example for CodeBlockCitations {
+    /// Describes the eval: the repository and revision to run against and the
+    /// language server settings to use.
+    fn meta(&self) -> ExampleMetadata {
+        ExampleMetadata {
+            name: "code_block_citations".to_string(),
+            url: "https://github.com/zed-industries/zed.git".to_string(),
+            revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
+            language_server: Some(LanguageServer {
+                file_extension: "rs".to_string(),
+                allow_preexisting_diagnostics: false,
+            }),
+            max_assertions: None,
+        }
+    }
+
+    /// Asks the agent to show the `Tool` trait's method bodies, then scans every
+    /// response message for fenced code blocks and asserts that each one:
+    ///
+    /// * has a newline after the opening fence and a matching closing fence,
+    /// * is opened with a citation (e.g. `path/to/foo.txt#L123-456`) containing a `/`,
+    /// * cites a path that resolves inside the project,
+    /// * contains text that appears in the cited file, and
+    /// * when the citation carries a range, matches the file at that range.
+    ///
+    /// Failures are recorded through `cx` rather than returned; the only error
+    /// propagated is a failure of `run_to_end` itself.
+    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+        const FILENAME: &str = "assistant_tool.rs";
+        cx.push_user_message(format!(
+            r#"
+            Show me the method bodies of all the methods of the `Tool` trait in {FILENAME}.
+
+            Please show each method in a separate code snippet.
+            "#
+        ));
+
+        // Verify that the messages all have the correct formatting.
+        let texts: Vec<String> = cx.run_to_end().await?.texts().collect();
+        let closing_fence = format!("\n{FENCE}");
+
+        for text in texts.iter() {
+            let mut text = text.as_str();
+
+            while let Some(index) = text.find(FENCE) {
+                // Advance text past the opening backticks.
+                text = &text[index + FENCE.len()..];
+
+                // Index (within `text`) of the newline that starts the closing fence.
+                let closing_index = text.find(&closing_fence);
+
+                // Verify the citation format - e.g. ```path/to/foo.txt#L123-456
+                if let Some(citation_len) = text.find('\n') {
+                    let citation = &text[..citation_len];
+
+                    // The body runs from just after the citation's newline up to the
+                    // closing fence. `closing_index` is already an index into `text`,
+                    // so it is used as the end bound directly. An empty block (closing
+                    // fence right after the citation) would give start > end, hence
+                    // the checked `get`.
+                    let content = closing_index
+                        .map(|end| text.get(citation_len + 1..end).unwrap_or(""));
+
+                    if !check_code_block(cx, citation, content).await {
+                        break;
+                    }
+                } else {
+                    cx.assert(
+                        false,
+                        format!("Opening {FENCE} did not have a newline anywhere after it."),
+                    )
+                    .ok();
+                }
+
+                if let Some(closing_index) = closing_index {
+                    // Advance past the whole closing fence (newline + backticks), so no
+                    // stray backtick is left to confuse the next search.
+                    text = &text[closing_index + closing_fence.len()..];
+                } else {
+                    // There were no closing backticks associated with these opening backticks.
+                    cx.assert(
+                        false,
+                        "Code block opening had matching closing backticks.".to_string(),
+                    )
+                    .ok();
+
+                    // There are no more code blocks to parse, so we're done.
+                    break;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Assertions handed to the judge about the thread as a whole.
+    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
+        vec![
+            JudgeAssertion {
+                id: "trait method bodies are shown".to_string(),
+                description: "All method bodies of the Tool trait are shown.".to_string(),
+            },
+            JudgeAssertion {
+                id: "code blocks used".to_string(),
+                description: "All code snippets are rendered inside markdown code blocks (as opposed to any other formatting besides code blocks)."
+                    .to_string(),
+            },
+            JudgeAssertion {
+                id: "code blocks use backticks".to_string(),
+                description: format!(
+                    "All markdown code blocks use backtick fences ({FENCE}) rather than indentation."
+                ),
+            },
+        ]
+    }
+}
+
+/// Runs the citation assertions for a single fenced code block.
+///
+/// `citation` is the text on the opening fence line. `content` is the block's
+/// body, or `None` when the block had no closing fence (in which case only the
+/// citation/path/buffer checks run).
+///
+/// Returns `false` only when the agent thread could not be updated to open the
+/// cited buffer; the caller then stops scanning the current message. Every other
+/// failure is recorded as an assertion and returns `true`, so the caller can
+/// still step past this block's closing fence and resume parsing at the right
+/// position.
+async fn check_code_block(
+    cx: &mut ExampleContext,
+    citation: &str,
+    content: Option<&str>,
+) -> bool {
+    if cx
+        .assert(citation.contains("/"), format!("Slash in {citation:?}"))
+        .is_err()
+    {
+        return true;
+    }
+
+    let path_range = PathWithRange::new(citation);
+    let path = cx
+        .agent_thread()
+        .update(cx, |thread, cx| {
+            thread
+                .project()
+                .read(cx)
+                .find_project_path(path_range.path, cx)
+        })
+        .ok()
+        .flatten();
+
+    let Ok(path) = cx.assert_some(path, format!("Valid path: {citation:?}")) else {
+        return true;
+    };
+
+    let buffer = match cx.agent_thread().update(cx, |thread, cx| {
+        thread
+            .project()
+            .update(cx, |project, cx| project.open_buffer(path, cx))
+    }) {
+        Ok(buffer_task) => buffer_task.await.ok(),
+        Err(err) => {
+            cx.assert(false, format!("Expected Ok(buffer), not {err:?}"))
+                .ok();
+            return false;
+        }
+    };
+
+    let Ok(buffer_text) = cx.assert_some(
+        buffer.and_then(|buffer| buffer.read_with(cx, |buffer, _| buffer.text()).ok()),
+        "Reading buffer text succeeded",
+    ) else {
+        return true;
+    };
+
+    let Some(content) = content else {
+        return true;
+    };
+
+    cx.assert(
+        buffer_text.contains(content),
+        "Code block content was found in file",
+    )
+    .ok();
+
+    if let Some(range) = path_range.range {
+        let snippet = snippet_in_range(
+            &buffer_text,
+            range.start.line as usize,
+            range.end.line as usize,
+            range.start.col.map(|col| col as usize),
+            range.end.col.map(|col| col as usize),
+        );
+
+        match snippet {
+            Some(snippet) => {
+                cx.assert_eq(
+                    snippet.as_str(),
+                    content,
+                    "Code block snippet was at specified line/col",
+                )
+                .ok();
+            }
+            // The cited range does not address valid text in the file; record
+            // that as a failed assertion instead of panicking while slicing.
+            None => {
+                cx.assert(false, "Code block snippet was at specified line/col")
+                    .ok();
+            }
+        }
+    }
+
+    true
+}
+
+/// Extracts the text of `buffer_text` covered by a cited line/column range.
+///
+/// `start_line` and `end_line` are 1-based and inclusive. `start_col` and
+/// `end_col` are applied as byte offsets into the first and last selected line
+/// respectively (NOTE(review): this keeps the offset convention of the previous
+/// inline code; confirm it matches the columns `PathWithRange` produces).
+///
+/// Returns `None` when a column was given but the range selects no lines, or
+/// when the resulting offsets are out of bounds, inverted, or not on a character
+/// boundary.
+fn snippet_in_range(
+    buffer_text: &str,
+    start_line: usize,
+    end_line: usize,
+    start_col: Option<usize>,
+    end_col: Option<usize>,
+) -> Option<String> {
+    let start_line_index = start_line.saturating_sub(1);
+    let line_count = end_line.saturating_sub(start_line_index);
+    let lines: Vec<&str> = buffer_text
+        .lines()
+        .skip(start_line_index)
+        .take(line_count)
+        .collect();
+    let snippet = lines.join("\n");
+
+    // Resolve the end offset against the untrimmed last line. Trimming the start
+    // first would shift the end column whenever the range sits on a single line.
+    let end = match end_col {
+        Some(end_col) => snippet.len() - lines.last()?.len() + end_col,
+        None => snippet.len(),
+    };
+    let start = start_col.unwrap_or(0);
+
+    snippet.get(start..end).map(str::to_string)
+}
--- a/crates/eval/src/examples/mod.rs
+++ b/crates/eval/src/examples/mod.rs
@ -12,12 +12,14 @@ use util::serde::default_true;
 use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};

 mod add_arg_to_trait_method;
+mod code_block_citations;
 mod file_search;

 pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
    let mut threads: Vec<Rc<dyn Example>> = vec![
        Rc::new(file_search::FileSearchExample),
        Rc::new(add_arg_to_trait_method::AddArgToTraitMethod),
+        Rc::new(code_block_citations::CodeBlockCitations),
    ];

    for example_path in list_declarative_examples(examples_dir).unwrap() {