eval: Fine-grained assertions (#29246)

- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-22 23:58:58 -03:00 · 2025-04-22 23:58:58 -03:00 · ce1a674eba
commit ce1a674eba
parent 0d3fe474db
18 changed files with 1969 additions and 1229 deletions
--- a/crates/eval/src/examples/file_search.rs
+++ b/crates/eval/src/examples/file_search.rs
@ -0,0 +1,53 @@
+use anyhow::Result;
+use assistant_tools::PathSearchToolInput;
+use async_trait::async_trait;
+use regex::Regex;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata};
+
+/// Programmatic example that inspects the tool call made in response to a
+/// prompt which mentions a file only by its bare name.
+pub struct FileSearchExample;
+
+#[async_trait(?Send)]
+impl Example for FileSearchExample {
+    /// Describes the repository URL and revision this example runs against.
+    fn meta(&self) -> ExampleMetadata {
+        ExampleMetadata {
+            name: "file_search".to_string(),
+            url: "https://github.com/zed-industries/zed.git".to_string(),
+            revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
+            language_server: None,
+            max_assertions: Some(4),
+        }
+    }
+
+    /// Sends the prompt, runs one turn, and asserts on the glob passed to the
+    /// `path_search` tool use taken from the response.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by `run_turn`, `expect_tool`,
+    /// `expect_input`, or `cx.assert`.
+    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+        const FILENAME: &str = "find_replace_file_tool.rs";
+        cx.push_user_message(format!(
+                r#"
+        Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
+
+        The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
+        markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
+        background for lines that were added. We should have a div per diff line.
+        "#
+        ));
+
+        let response = cx.run_turn().await?;
+        let tool_use = response.expect_tool("path_search", cx)?;
+        let input = tool_use.expect_input::<PathSearchToolInput>(cx)?;
+
+        let glob = input.glob;
+        cx.assert(
+            glob.ends_with(FILENAME),
+            format!("glob ends with `{FILENAME}`"),
+        )?;
+
+        // Only the directory part of the glob is checked below.
+        let without_filename = glob.replace(FILENAME, "");
+        // Anchored with `^` so the check really is "starts with"; an unanchored
+        // pattern would also accept `**/` or `zed/` in the middle of the glob.
+        let matches = Regex::new("^(\\*\\*|zed)/(\\*\\*?/)?")
+            .expect("hard-coded glob-prefix regex is valid")
+            .is_match(&without_filename);
+
+        cx.assert(matches, "glob starts with either `**` or `zed`")?;
+
+        Ok(())
+    }
+}
--- a/crates/eval/src/examples/find_and_replace_diff_card.toml
+++ b/crates/eval/src/examples/find_and_replace_diff_card.toml
@ -0,0 +1,43 @@
+url = "https://github.com/zed-industries/zed.git"
+revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
+language_extension = "rs"
+
+prompt = """
+Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
+The card should implement the `Render` trait.
+
+The card should show a diff. It should be a beautifully presented diff.
+The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
+I want to see a red background for lines that were deleted and a green background for lines
+that were added. We should have a div per diff line.
+"""
+
+[diff_assertions]
+
+modify_find_and_replace_tool = """
+The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
+The struct should contain an `output` field that is the same as the task we were returning before,
+and a new `card` field that contains a view for the card.
+"""
+
+card_implementation = """
+The card should be a view that displays a diff.
+Each line in the diff should be colored according to whether it was added, removed or unchanged.
+"""
+
+[thread_assertions]
+
+path_search = """
+The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
+(*Not* grep, for example, or reading the file based on a guess at the path.)
+This is because we gave the model a filename and it needs to turn that into a real path.
+"""
+
+read_file_from_path_search = """
+After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
+"""
+
+symbol_search = """
+When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
+on what path the Render trait might be in.
+"""
--- a/crates/eval/src/examples/mod.rs
+++ b/crates/eval/src/examples/mod.rs
@ -0,0 +1,128 @@
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::Deserialize;
+use std::collections::BTreeMap;
+use std::fs;
+use std::{
+    path::{Path, PathBuf},
+    rc::Rc,
+};
+use util::serde::default_true;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
+
+mod file_search;
+
+/// Builds the full list of examples: the programmatic `file_search` example
+/// followed by one declarative example per `.toml` file in `examples_dir`.
+///
+/// # Panics
+///
+/// Panics if `examples_dir` cannot be listed or if any `.toml` file fails to
+/// load.
+pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
+    let programmatic: Rc<dyn Example> = Rc::new(file_search::FileSearchExample);
+    let declarative = list_declarative_examples(examples_dir)
+        .unwrap()
+        .into_iter()
+        .map(|toml_path| -> Rc<dyn Example> {
+            Rc::new(DeclarativeExample::load(&toml_path).unwrap())
+        });
+
+    std::iter::once(programmatic).chain(declarative).collect()
+}
+
+/// An example described by a `.toml` file: a prompt plus judge assertions on
+/// the resulting diff and thread.
+struct DeclarativeExample {
+    metadata: ExampleMetadata,
+    prompt: String,
+    diff_assertions: Vec<JudgeAssertion>,
+    thread_assertions: Vec<JudgeAssertion>,
+}
+
+impl DeclarativeExample {
+    /// Reads and parses the TOML file at `example_path` into an example.
+    ///
+    /// The example's name is the file stem of `example_path`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file cannot be read, if it does not
+    /// deserialize into `ExampleToml`, or if `require_lsp` is true and no
+    /// `language_extension` is given.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `example_path` has no file name (see `name_from_path`).
+    pub fn load(example_path: &Path) -> Result<Self> {
+        let name = Self::name_from_path(example_path);
+        let base: ExampleToml = toml::from_str(&fs::read_to_string(example_path)?)?;
+
+        let language_server = if base.require_lsp {
+            // A malformed example file is an input error, so report it through
+            // `Result` instead of panicking.
+            let file_extension = base.language_extension.ok_or_else(|| {
+                anyhow::anyhow!("Language extension is required when require_lsp = true")
+            })?;
+            Some(crate::example::LanguageServer {
+                file_extension,
+                allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
+            })
+        } else {
+            None
+        };
+
+        let metadata = ExampleMetadata {
+            name,
+            url: base.url,
+            revision: base.revision,
+            language_server,
+            max_assertions: None,
+        };
+
+        Ok(DeclarativeExample {
+            metadata,
+            prompt: base.prompt,
+            // Each `id = "description"` table entry becomes one judge assertion.
+            thread_assertions: base
+                .thread_assertions
+                .into_iter()
+                .map(|(id, description)| JudgeAssertion { id, description })
+                .collect(),
+            diff_assertions: base
+                .diff_assertions
+                .into_iter()
+                .map(|(id, description)| JudgeAssertion { id, description })
+                .collect(),
+        })
+    }
+
+    /// Returns the file stem of `path` (lossily converted to UTF-8) for use as
+    /// the example name.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` has no file name.
+    pub fn name_from_path(path: &Path) -> String {
+        path.file_stem()
+            .expect("example path must have a file name")
+            .to_string_lossy()
+            .into_owned()
+    }
+}
+
+/// Deserialized contents of a declarative example `.toml` file.
+#[derive(Clone, Debug, Deserialize)]
+pub struct ExampleToml {
+    /// Repository URL, copied into the example's metadata.
+    pub url: String,
+    /// Repository revision, copied into the example's metadata.
+    pub revision: String,
+    /// File extension handed to the language server configuration; required
+    /// when `require_lsp` is true.
+    pub language_extension: Option<String>,
+    // NOTE(review): not read anywhere in this file — confirm whether it is still needed.
+    pub insert_id: Option<String>,
+    /// Whether the example gets a language server configuration. When absent
+    /// from the file, the value comes from `default_true`.
+    #[serde(default = "default_true")]
+    pub require_lsp: bool,
+    /// Forwarded to the language server configuration; `false` when absent.
+    #[serde(default)]
+    pub allow_preexisting_diagnostics: bool,
+    /// User message sent at the start of the conversation.
+    pub prompt: String,
+    /// Judge assertions about the diff, keyed by assertion id; empty when absent.
+    #[serde(default)]
+    pub diff_assertions: BTreeMap<String, String>,
+    /// Judge assertions about the thread, keyed by assertion id; empty when absent.
+    #[serde(default)]
+    pub thread_assertions: BTreeMap<String, String>,
+}
+
+#[async_trait(?Send)]
+impl Example for DeclarativeExample {
+    /// Returns a clone of the stored metadata.
+    fn meta(&self) -> ExampleMetadata {
+        ExampleMetadata::clone(&self.metadata)
+    }
+
+    /// Sends the configured prompt and runs the conversation to its end.
+    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+        cx.push_user_message(&self.prompt);
+        // The outcome of the run is discarded and success is always reported.
+        // NOTE(review): presumably so the judge assertions are still evaluated
+        // after a failed run — confirm this is intentional.
+        let _ = cx.run_to_end().await;
+        Ok(())
+    }
+
+    /// Returns a copy of the judge assertions to apply to the diff.
+    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
+        self.diff_assertions.to_vec()
+    }
+
+    /// Returns a copy of the judge assertions to apply to the thread.
+    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
+        self.thread_assertions.to_vec()
+    }
+}
+
+/// Returns the paths of all `.toml` files directly inside `examples_dir`,
+/// sorted so that callers see the same order on every run.
+///
+/// # Errors
+///
+/// Returns an error if `examples_dir` cannot be canonicalized or read, or if
+/// any directory entry cannot be read.
+fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
+    let examples_dir = std::fs::canonicalize(examples_dir)?;
+    let mut result_paths = Vec::new();
+    for entry in std::fs::read_dir(examples_dir)? {
+        let path = entry?.path();
+        if path.extension() == Some("toml".as_ref()) {
+            result_paths.push(path);
+        }
+    }
+    // `read_dir` yields entries in a platform- and filesystem-dependent order.
+    result_paths.sort();
+    Ok(result_paths)
+}