eval: Fine-grained assertions (#29246)

- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-22 23:58:58 -03:00 · 2025-04-22 23:58:58 -03:00 · ce1a674eba
commit ce1a674eba
parent 0d3fe474db
18 changed files with 1969 additions and 1229 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4895,6 +4895,7 @@ dependencies = [
 "anyhow",
 "assistant_tool",
 "assistant_tools",
+ "async-trait",
 "async-watch",
 "chrono",
 "clap",
@ -4915,13 +4916,14 @@ dependencies = [
 "language_models",
 "languages",
 "node_runtime",
- "parking_lot",
 "paths",
 "project",
 "prompt_store",
+ "regex",
 "release_channel",
 "reqwest_client",
 "serde",
+ "serde_json",
 "settings",
 "shellexpand 2.1.2",
 "smol",
--- a/crates/agent/src/thread.rs
+++ b/crates/agent/src/thread.rs
@ -315,6 +315,7 @@ pub struct Thread {
    request_callback: Option<
        Box<dyn FnMut(&LanguageModelRequest, &[Result<LanguageModelCompletionEvent, String>])>,
    >,
+    remaining_turns: u32,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@ -368,6 +369,7 @@ impl Thread {
            message_feedback: HashMap::default(),
            last_auto_capture_at: None,
            request_callback: None,
+            remaining_turns: u32::MAX,
        }
    }

@ -442,6 +444,7 @@ impl Thread {
            message_feedback: HashMap::default(),
            last_auto_capture_at: None,
            request_callback: None,
+            remaining_turns: u32::MAX,
        }
    }

@ -522,7 +525,7 @@ impl Thread {
        self.messages.iter().find(|message| message.id == id)
    }

-    pub fn messages(&self) -> impl Iterator<Item = &Message> {
+    pub fn messages(&self) -> impl ExactSizeIterator<Item = &Message> {
        self.messages.iter()
    }

@ -958,7 +961,21 @@ impl Thread {
        })
    }

+    /// Returns how many more times `send_to_model` may issue a request.
+    ///
+    /// New threads start at `u32::MAX`; `send_to_model` decrements the counter
+    /// and becomes a no-op once it reaches zero.
+    pub fn remaining_turns(&self) -> u32 {
+        self.remaining_turns
+    }
+
+    /// Overwrites the number of turns `send_to_model` is still allowed to take.
+    ///
+    /// Setting this to `0` makes every subsequent `send_to_model` call return
+    /// immediately without sending a request.
+    pub fn set_remaining_turns(&mut self, remaining_turns: u32) {
+        self.remaining_turns = remaining_turns;
+    }
+
    pub fn send_to_model(&mut self, model: Arc<dyn LanguageModel>, cx: &mut Context<Self>) {
+        if self.remaining_turns == 0 {
+            return;
+        }
+
+        self.remaining_turns -= 1;
+
        let mut request = self.to_completion_request(cx);
        if model.supports_tools() {
            request.tools = {
--- a/crates/assistant_tools/src/assistant_tools.rs
+++ b/crates/assistant_tools/src/assistant_tools.rs
@ -56,6 +56,8 @@ use crate::symbol_info_tool::SymbolInfoTool;
 use crate::terminal_tool::TerminalTool;
 use crate::thinking_tool::ThinkingTool;

+pub use path_search_tool::PathSearchToolInput;
+
 pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
    assistant_tool::init(cx);

--- a/crates/eval/Cargo.toml
+++ b/crates/eval/Cargo.toml
@ -9,6 +9,7 @@ agent.workspace = true
 anyhow.workspace = true
 assistant_tool.workspace = true
 assistant_tools.workspace = true
+async-trait.workspace = true
 async-watch.workspace = true
 chrono.workspace = true
 clap.workspace = true
@ -29,13 +30,14 @@ language_model.workspace = true
 language_models.workspace = true
 languages = { workspace = true, features = ["load-grammars"] }
 node_runtime.workspace = true
-parking_lot.workspace = true
 paths.workspace = true
 project.workspace = true
 prompt_store.workspace = true
+regex.workspace = true
 release_channel.workspace = true
 reqwest_client.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 settings.workspace = true
 shellexpand.workspace = true
 smol.workspace = true
@ -45,7 +47,6 @@ unindent.workspace = true
 util.workspace = true
 uuid = { version = "1.6", features = ["v4"] }
 workspace-hack.workspace = true
-
 [[bin]]
 name = "eval"
 path = "src/eval.rs"
--- a/crates/eval/examples/find_and_replace_diff_card/base.toml
+++ b/crates/eval/examples/find_and_replace_diff_card/base.toml
@ -1,3 +0,0 @@
-url = "https://github.com/zed-industries/zed.git"
-revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
-language_extension = "rs"
--- a/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md
+++ b/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md
@ -1,2 +0,0 @@
- The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the task we were returning before, and a new `card` field that contains a view for the card.
- The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged.
--- a/crates/eval/examples/find_and_replace_diff_card/prompt.md
+++ b/crates/eval/examples/find_and_replace_diff_card/prompt.md
@ -1,3 +0,0 @@
-Look at the `find_replace_file_tool.rs`. I want to implement a card for it. The card should implement the `Render` trait.
-
-The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green background for lines that were added. We should have a div per diff line.
--- a/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md
+++ b/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md
@ -1,3 +0,0 @@
- The first tool call should be to path search including "find_replace_file_tool.rs" in the string. (*Not* grep, for example, or reading the file based on a guess at the path.) This is because we gave the model a filename and it needs to turn that into a real path.
- After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
- When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information on what path the Render trait might be in.
--- a/crates/eval/src/assertions.rs
+++ b/crates/eval/src/assertions.rs
@ -0,0 +1,157 @@
+use serde::{Deserialize, Serialize};
+use std::fmt::Write;
+use std::fmt::{self};
+
+/// The assertions that ran for one example, plus an optional expected count.
+#[derive(Default, Debug, Serialize, Deserialize, Clone)]
+pub struct AssertionsReport {
+    /// Assertions that were run.
+    pub ran: Vec<RanAssertion>,
+    /// Expected number of assertions, if known. `total_count` treats it as a
+    /// lower bound, so assertions that never ran still count against the score.
+    pub max: Option<usize>,
+}
+
+/// A single assertion together with its outcome.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RanAssertion {
+    /// Identifier shown (truncated) in the "Assertion" column of the table.
+    pub id: String,
+    /// `Ok` holds the verdict; `Err` holds an error message and is rendered
+    /// as "Judge Error" in the results table.
+    pub result: Result<RanAssertionResult, String>,
+}
+
+/// Verdict of an assertion that produced a result.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RanAssertionResult {
+    /// Optional free-form analysis text; not read anywhere in this module.
+    /// NOTE(review): presumably the judge's explanation — confirm at the producer.
+    pub analysis: Option<String>,
+    /// Whether the assertion passed.
+    pub passed: bool,
+}
+
+impl AssertionsReport {
+    /// Creates an empty report that expects `max` assertions, when known.
+    pub fn new(max: Option<usize>) -> Self {
+        Self {
+            max,
+            ran: Vec::new(),
+        }
+    }
+
+    /// Returns `true` when no assertion has been recorded.
+    pub fn is_empty(&self) -> bool {
+        self.run_count() == 0
+    }
+
+    /// Number of assertions to score against: the larger of the number that
+    /// ran and the expected `max` (treated as 0 when unset).
+    pub fn total_count(&self) -> usize {
+        let expected = self.max.unwrap_or(0);
+        std::cmp::max(self.run_count(), expected)
+    }
+
+    /// Number of assertions that actually ran.
+    pub fn run_count(&self) -> usize {
+        self.ran.len()
+    }
+
+    /// Number of assertions that ran without error and passed.
+    pub fn passed_count(&self) -> usize {
+        self.ran
+            .iter()
+            .filter(|assertion| matches!(&assertion.result, Ok(outcome) if outcome.passed))
+            .count()
+    }
+
+    /// Share of `total_count` that passed, in the range `0.0..=100.0`.
+    /// Returns `0.0` when there is nothing to score.
+    pub fn passed_percentage(&self) -> f32 {
+        match self.total_count() {
+            0 => 0.0,
+            total => (self.passed_count() as f32 / total as f32) * 100.0,
+        }
+    }
+}
+
+// Widths of the three columns of the assertions results table.
+// The "Round" column is exactly as wide as its header text.
+const ROUND_WIDTH: usize = "Round".len();
+const ASSERTIONS_WIDTH: usize = 42;
+const RESULTS_WIDTH: usize = 8;
+
+/// Prints the top border, the column titles and the separator line of the
+/// assertions table to stdout.
+pub fn print_table_header() {
+    let round_rule = "─".repeat(ROUND_WIDTH);
+    let assertions_rule = "─".repeat(ASSERTIONS_WIDTH);
+    let results_rule = "─".repeat(RESULTS_WIDTH);
+
+    println!(
+        "┌─{}─┬─{}─┬─{}─┐",
+        round_rule, assertions_rule, results_rule
+    );
+    println!(
+        "│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │",
+        "Round", "Assertion", "Result"
+    );
+    println!(
+        "├─{}─┼─{}─┼─{}─┤",
+        round_rule, assertions_rule, results_rule
+    );
+}
+
+/// Appends a table row for a round that failed to run.
+///
+/// The error message replaces the "Assertion" and "Result" cells with one
+/// merged cell, truncated so the row keeps the same width as regular rows.
+///
+/// * `f` - buffer the row is appended to.
+/// * `round` - repetition number shown in the "Round" column.
+/// * `error` - message to display.
+///
+/// Returns any formatting error reported by `writeln!`.
+pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result {
+    // Width of the " │ " separator that regular rows print between the two
+    // columns being merged here; without it the right border is misaligned.
+    const COLUMN_SEPARATOR_WIDTH: usize = 3;
+    let last_two_columns = ASSERTIONS_WIDTH + COLUMN_SEPARATOR_WIDTH + RESULTS_WIDTH;
+    writeln!(
+        f,
+        "│ {:^ROUND_WIDTH$} │ {:<last_two_columns$} │",
+        round,
+        truncate(&error, last_two_columns)
+    )
+}
+
+/// Appends one table row describing `assertion` for the given `round`.
+///
+/// The result cell is wrapped in ANSI color escapes: green for a pass, red
+/// for a failure or an error. Returns any formatting error from `writeln!`.
+pub fn display_table_row(f: &mut String, round: usize, assertion: &RanAssertion) -> fmt::Result {
+    let outcome = match assertion.result.as_ref().map(|result| result.passed) {
+        Ok(true) => "\x1b[32m✔︎ Passed\x1b[0m",
+        Ok(false) => "\x1b[31m✗ Failed\x1b[0m",
+        Err(_) => "\x1b[31m💥 Judge Error\x1b[0m",
+    };
+    let id = truncate(&assertion.id, ASSERTIONS_WIDTH);
+
+    writeln!(
+        f,
+        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
+        round, id, outcome
+    )
+}
+
+/// Prints a "total" row with the percentage of passed assertions summed over
+/// all `reports`.
+///
+/// * `round` - label for the "Round" column (a repetition number or "avg").
+/// * `reports` - reports whose passed and total counts are added together.
+///
+/// When the reports contain no assertions at all the row shows `0%`, matching
+/// `AssertionsReport::passed_percentage`, instead of dividing by zero.
+pub fn print_table_round_summary<'a>(
+    round: &str,
+    reports: impl Iterator<Item = &'a AssertionsReport>,
+) {
+    let (passed, total) = reports.fold((0usize, 0usize), |(passed, total), report| {
+        (
+            passed + report.passed_count(),
+            total + report.total_count(),
+        )
+    });
+
+    // 0.0 / 0.0 is NaN for floats, which would be printed as "NaN%".
+    let percentage = if total == 0 {
+        0.0
+    } else {
+        (passed as f32 / total as f32 * 100.0).floor()
+    };
+
+    println!(
+        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
+        round,
+        "total",
+        format!("{}%", percentage)
+    )
+}
+
+/// Prints the bottom border of the assertions table to stdout.
+pub fn print_table_footer() {
+    let round_rule = "─".repeat(ROUND_WIDTH);
+    let assertions_rule = "─".repeat(ASSERTIONS_WIDTH);
+    let results_rule = "─".repeat(RESULTS_WIDTH);
+
+    println!(
+        "└─{}─┴─{}─┴─{}─┘",
+        round_rule, assertions_rule, results_rule
+    )
+}
+
+/// Prints a horizontal separator line between sections of the assertions table.
+pub fn print_table_divider() {
+    let round_rule = "─".repeat(ROUND_WIDTH);
+    let assertions_rule = "─".repeat(ASSERTIONS_WIDTH);
+    let results_rule = "─".repeat(RESULTS_WIDTH);
+
+    println!(
+        "├─{}─┼─{}─┼─{}─┤",
+        round_rule, assertions_rule, results_rule
+    )
+}
+
+/// Shortens `assertion` so it occupies at most `max_width` characters.
+///
+/// Text that already fits is returned unchanged. Longer text is cut to
+/// `max_width - 1` characters followed by `…`. Lengths are measured in
+/// characters rather than bytes, because that is what the `{:<width$}` padding
+/// in the table rows counts; a byte-based check would truncate non-ASCII text
+/// that still fits. A `max_width` of 0 yields an empty string.
+fn truncate(assertion: &str, max_width: usize) -> String {
+    if assertion.chars().count() <= max_width {
+        return assertion.to_string();
+    }
+    if max_width == 0 {
+        return String::new();
+    }
+
+    let kept: String = assertion.chars().take(max_width - 1).collect();
+    format!("{kept}…")
+}
--- a/crates/eval/src/eval.rs
+++ b/crates/eval/src/eval.rs
@ -1,13 +1,16 @@
+mod assertions;
 mod example;
+mod examples;
 mod ids;
+mod instance;
 mod tool_metrics;

-pub(crate) use example::*;
-use parking_lot::Mutex;
+use assertions::display_error_row;
+use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 pub(crate) use tool_metrics::*;

 use ::fs::RealFs;
-use anyhow::{Result, anyhow};
+use anyhow::anyhow;
 use clap::Parser;
 use client::{Client, ProxySettings, UserStore};
 use collections::{HashMap, HashSet};
@ -25,18 +28,20 @@ use prompt_store::PromptBuilder;
 use release_channel::AppVersion;
 use reqwest_client::ReqwestClient;
 use settings::{Settings, SettingsStore};
+use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::env;
 use std::path::{Path, PathBuf};
+use std::rc::Rc;
 use std::sync::Arc;
 use util::ResultExt as _;

 #[derive(Parser, Debug)]
 #[command(name = "eval", disable_version_flag = true)]
 struct Args {
-    /// Runs all examples that contain these substrings. If unspecified, all examples are run.
+    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
    #[arg(value_name = "EXAMPLE_SUBSTRING")]
-    examples: Vec<String>,
+    filter: Vec<String>,
    /// Model to use (default: "claude-3-7-sonnet-latest")
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model: String,
@ -66,43 +71,30 @@ fn main() {
        .parent()
        .unwrap()
        .parent()
+        .unwrap()
+        .canonicalize()
        .unwrap();
-    let eval_crate_dir = root_dir.join("crates/eval");
+    let eval_crate_dir = root_dir.join("crates").join("eval");
    let repos_dir = eval_crate_dir.join("repos");
    let worktrees_dir = eval_crate_dir.join("worktrees");
-    let examples_dir = eval_crate_dir.join("examples");
-    let runs_dir = eval_crate_dir.join("runs");
-    let run_dir = runs_dir.join(format!("{}", run_timestamp));
+    let examples_dir = eval_crate_dir.join("src").join("examples");
+    let run_dir = eval_crate_dir
+        .join("runs")
+        .join(format!("{}", run_timestamp));
    std::fs::create_dir_all(&run_dir).unwrap();
    std::fs::create_dir_all(&repos_dir).unwrap();
    std::fs::create_dir_all(&worktrees_dir).unwrap();
    std::fs::create_dir_all(&examples_dir).unwrap();
    std::fs::create_dir_all(&paths::config_dir()).unwrap();

-    let zed_commit_sha = commit_sha_for_path(root_dir);
-    let zed_branch_name = git_branch_for_path(root_dir);
+    let zed_commit_sha = commit_sha_for_path(&root_dir);
+    let zed_branch_name = git_branch_for_path(&root_dir);
    let args = Args::parse();
-    let all_available_examples = list_all_examples(&examples_dir).unwrap();
-
-    let example_paths = all_available_examples
-        .iter()
-        .filter_map(|example_path| {
-            let name = example_path.file_name()?.to_string_lossy();
-            if args.examples.is_empty()
-                || args
-                    .examples
-                    .iter()
-                    .any(|name_substring| name.contains(name_substring))
-            {
-                Some(example_path.clone())
-            } else {
-                None
-            }
-        })
-        .collect::<Vec<_>>();
+    let languages: HashSet<String> = args.languages.into_iter().collect();

    let http_client = Arc::new(ReqwestClient::new());
    let app = Application::headless().with_http_client(http_client.clone());
+    let all_threads = examples::all(&examples_dir);

    app.run(move |cx| {
        let app_state = init(cx);
@ -163,28 +155,40 @@ fn main() {

            let mut skipped = Vec::new();

-            for example_path in &example_paths {
-                let example = Example::load_from_directory(
-                    example_path,
-                    &run_dir,
-                    &worktrees_dir,
-                    &repos_dir,
-                )?;
-
-                if !example
-                    .base
-                    .language_extension
-                    .as_ref()
-                    .map_or(false, |lang| args.languages.contains(lang))
+            for thread in all_threads {
+                let meta = thread.meta();
+                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
                {
-                    skipped.push(example.name);
+                    skipped.push(meta.name);
                    continue;
                }

-                examples.extend(example.repeat(args.repetitions));
+                if meta.language_server.map_or(false, |language| {
+                    !languages.contains(&language.file_extension)
+                }) {
+                    skipped.push(meta.name);
+                    continue;
+                }
+
+                // TODO: This creates a worktree per repetition. Ideally these examples should
+                // either be run sequentially on the same worktree, or reuse worktrees when there
+                // are more examples to run than the concurrency limit.
+                for repetition_number in 0..args.repetitions {
+                    let example_instance = ExampleInstance::new(
+                        thread.clone(),
+                        &repos_dir,
+                        &run_dir,
+                        &worktrees_dir,
+                        repetition_number,
+                    );
+
+                    examples.push(example_instance);
+                }
            }

-            println!("Skipped examples: {}\n", skipped.join(", "));
+            if !skipped.is_empty() {
+                println!("Skipped threads: {}", skipped.join(", "));
+            }

            if examples.is_empty() {
                eprintln!("Filter matched no examples");
@ -196,22 +200,23 @@ fn main() {

            let max_name_width = examples
                .iter()
-                .map(|e| e.repetition_name().len())
+                .map(|e| e.worktree_name().len())
                .max()
                .unwrap_or(0);
-            for (i, example) in examples.iter_mut().enumerate() {
+
+            for (i, example_instance) in examples.iter_mut().enumerate() {
                let color = COLORS[i % COLORS.len()].to_string();
-                example.set_log_prefix_style(&color, max_name_width);
+                example_instance.set_log_prefix_style(&color, max_name_width);

                println!(
                    "{}Logging to: {}",
-                    example.log_prefix,
-                    example.run_directory_path().display()
+                    example_instance.log_prefix,
+                    example_instance.run_directory.display()
                );

-                let repo_url = example.base.url.clone();
+                let repo_url = example_instance.repo_url();
                if repo_urls.insert(repo_url.clone()) {
-                    let repo_path = example.repo_path.clone();
+                    let repo_path = example_instance.repo_path.clone();

                    if !repo_path.join(".git").is_dir() {
                        println!(
@ -251,12 +256,12 @@ fn main() {

            future::join_all(clone_tasks).await;

-            for example in examples.iter_mut() {
-                example.fetch().await?;
+            for example_instance in examples.iter_mut() {
+                example_instance.fetch().await?;
            }

-            let examples = Arc::new(Mutex::new(VecDeque::from(examples)));
-            let results_by_example_name = Arc::new(Mutex::new(HashMap::default()));
+            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
+            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));

            future::join_all((0..args.concurrency).map(|_| {
                let app_state = app_state.clone();
@ -268,7 +273,7 @@ fn main() {
                let results = results_by_example_name.clone();
                cx.spawn(async move |cx| {
                    loop {
-                        let Some(mut example) = examples.lock().pop_front() else {
+                        let Some(mut example) = examples.borrow_mut().pop_front() else {
                            break;
                        };
                        let result = async {
@ -291,7 +296,7 @@ fn main() {
                        }
                        .await;
                        results
-                            .lock()
+                            .borrow_mut()
                            .entry(example.name.clone())
                            .or_insert(Vec::new())
                            .push((example.clone(), result));
@ -300,98 +305,156 @@ fn main() {
            }))
            .await;

-            println!("\n\n");
-            print_header("EVAL RESULTS");
+            print_h1("EVAL RESULTS");

            let mut diff_scores = Vec::new();
            let mut thread_scores = Vec::new();
+            let mut programmatic_scores = Vec::new();
            let mut error_count = 0;

-            for (example_name, results) in results_by_example_name.lock().iter_mut() {
-                print_header(&example_name);
+            for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
+                print_h2(&example_name);

                results.sort_unstable_by_key(|(example, _)| example.repetition);
                let mut example_cumulative_tool_metrics = ToolMetrics::default();

-                println!("┌───────┬──────┬────────┐");
-                println!("│ Round │ Diff │ Thread │");
-                println!("├───────┼──────┼────────┤");
-                for (example, result) in results {
-                    let run_dir_path = example.run_directory_path();
-                    let relative_run_dir_path = run_dir_path.strip_prefix(root_dir).unwrap();
+                let mut table_rows = String::new();

+                for (example, result) in results.iter() {
                    match result {
                        Err(err) => {
-                            println!(
-                                "|{:^7}│{:^6}│{:^8}│ {:?}{}",
+                            display_error_row(
+                                &mut table_rows,
                                example.repetition,
-                                "N/A",
-                                "N/A",
-                                err,
-                                relative_run_dir_path.display()
-                            );
+                                err.to_string(),
+                            )?;
                            error_count += 1;
                        }
-                        Ok((run_output, judge_result)) => {
+                        Ok((run_output, judge_output)) => {
                            cumulative_tool_metrics.merge(&run_output.tool_metrics);
                            example_cumulative_tool_metrics.merge(&run_output.tool_metrics);

-                            match judge_result {
-                                Ok(judge_output) => {
-                                    diff_scores.push(judge_output.diff.score());
-                                    thread_scores.push(judge_output.thread.score());
-                                    println!(
-                                        "|{:^7}│{:^6}│{:^8}│ {}",
+                            // Only score programmatic assertions when the example has any
+                            // (ran or expected). The previous `!count > 0` applied a bitwise
+                            // NOT to the count, which made this guard effectively always true
+                            // and pushed a 0% score for examples without such assertions.
+                            if run_output.programmatic_assertions.total_count() > 0 {
+                                for assertion in &run_output.programmatic_assertions.ran {
+                                    assertions::display_table_row(
+                                        &mut table_rows,
+                                        example.repetition,
+                                        assertion,
+                                    )?;
+                                }
+
+                                programmatic_scores
+                                    .push(run_output.programmatic_assertions.passed_percentage())
+                            }
+
+                            if !judge_output.diff.is_empty() {
+                                diff_scores.push(judge_output.diff.passed_percentage());
+
+                                for assertion in &judge_output.diff.ran {
+                                    assertions::display_table_row(
+                                        &mut table_rows,
                                        example.repetition,
-                                        "N/A",
-                                        "N/A",
-                                        err,
-                                        relative_run_dir_path.display()
-                                    );
+                                        assertion,
+                                    )?;
+                                }
+                            }
+
+                            if !judge_output.thread.is_empty() {
+                                thread_scores.push(judge_output.thread.passed_percentage());
+
+                                for assertion in &judge_output.thread.ran {
+                                    assertions::display_table_row(
+                                        &mut table_rows,
+                                        example.repetition,
+                                        assertion,
+                                    )?;
                                }
                            }
                        }
                    }
                }

-                println!("└───────┴──────┴────────┘");
-                println!("{}", example_cumulative_tool_metrics);
+                if !table_rows.is_empty() {
+                    assertions::print_table_header();
+                    print!("{}", table_rows);
+
+                    assertions::print_table_divider();
+
+                    for (example, result) in results.iter() {
+                        if let Ok((run_output, judge_output)) = result {
+                            assertions::print_table_round_summary(
+                                &example.repetition.to_string(),
+                                [
+                                    &run_output.programmatic_assertions,
+                                    &judge_output.diff,
+                                    &judge_output.thread,
+                                ]
+                                .into_iter(),
+                            )
+                        }
+                    }
+
+                    assertions::print_table_divider();
+
+                    assertions::print_table_round_summary(
+                        "avg",
+                        results.iter().flat_map(|(_, result)| {
+                            result.iter().flat_map(|(run_output, judge_output)| {
+                                [
+                                    &run_output.programmatic_assertions,
+                                    &judge_output.diff,
+                                    &judge_output.thread,
+                                ]
+                                .into_iter()
+                            })
+                        }),
+                    );
+
+                    assertions::print_table_footer();
+                }
+
+                if !example_cumulative_tool_metrics.is_empty() {
+                    println!("{}", &example_cumulative_tool_metrics);
+                }
            }

-            let diff_score_count = diff_scores.len();
-            let average_diff_score = diff_scores
-                .into_iter()
-                .map(|score| score as f32)
-                .sum::<f32>()
-                / (diff_score_count as f32);
+            if results_by_example_name.borrow().len() > 1 {
+                print_h1("AGGREGATE");

-            if error_count > 0 {
-                println!("\n{error_count} examples failed to run!");
+                if error_count > 0 {
+                    println!("\n{error_count} examples failed to run!");
+                }
+
+                let programmatic_score_count = programmatic_scores.len();
+                if programmatic_score_count > 0 {
+                    let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
+                        / (programmatic_score_count as f32))
+                        .floor();
+                    println!("Average programmatic score: {average_programmatic_score}%");
+                }
+
+                let diff_score_count = diff_scores.len();
+                if diff_score_count > 0 {
+                    let average_diff_score =
+                        (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
+                    println!("Average diff score: {average_diff_score}%");
+                }
+
+                let thread_score_count = thread_scores.len();
+
+                if thread_score_count > 0 {
+                    let average_thread_score = (thread_scores.into_iter().sum::<f32>()
+                        / (thread_score_count as f32))
+                        .floor();
+                    println!("Average thread score: {average_thread_score}%");
+                }
+
+                println!("");
+
+                print_h2("CUMULATIVE TOOL METRICS");
+                println!("{}", cumulative_tool_metrics);
            }

-            println!("\nAverage code diff score: {average_diff_score}");
-
-            let thread_score_count = thread_scores.len();
-            let average_thread_score = thread_scores
-                .into_iter()
-                .map(|score| score as f32)
-                .sum::<f32>()
-                / (thread_score_count as f32);
-
-            println!("\nAverage thread score: {average_thread_score}");
-
-            print_header("CUMULATIVE TOOL METRICS");
-            println!("{}", cumulative_tool_metrics);
-
            app_state.client.telemetry().flush_events().await;

            cx.update(|cx| cx.quit())
@ -400,20 +463,6 @@ fn main() {
    });
 }

-fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
-    let path = std::fs::canonicalize(examples_dir).unwrap();
-    let entries = std::fs::read_dir(path).unwrap();
-    let mut result_paths = Vec::new();
-    for entry in entries {
-        let entry = entry?;
-        let path = entry.path();
-        if path.is_dir() {
-            result_paths.push(path);
-        }
-    }
-    Ok(result_paths)
-}
-
 /// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
 pub struct AgentAppState {
    pub languages: Arc<LanguageRegistry>,
@ -570,7 +619,7 @@ pub fn git_branch_for_path(repo_path: &Path) -> String {
 }

 async fn judge_example(
-    example: Example,
+    example: ExampleInstance,
    model: Arc<dyn LanguageModel>,
    zed_commit_sha: &str,
    zed_branch_name: &str,
@ -578,19 +627,9 @@ async fn judge_example(
    run_output: &RunOutput,
    enable_telemetry: bool,
    cx: &AsyncApp,
-) -> Result<JudgeOutput> {
+) -> JudgeOutput {
    let judge_output = example.judge(model.clone(), &run_output, cx).await;

-    let diff_evaluation;
-    let thread_evaluation;
-    if let Ok(output) = judge_output.as_ref() {
-        diff_evaluation = Some(output.diff.clone());
-        thread_evaluation = Some(output.thread.clone());
-    } else {
-        diff_evaluation = None;
-        thread_evaluation = None;
-    }
-
    if enable_telemetry {
        telemetry::event!(
            "Agent Example Evaluated",
@ -599,15 +638,15 @@ async fn judge_example(
            run_id = run_id,
            example_name = example.name.clone(),
            example_repetition = example.repetition,
-            diff_evaluation = diff_evaluation,
-            thread_evaluation = thread_evaluation,
+            diff_evaluation = judge_output.diff.clone(),
+            thread_evaluation = judge_output.thread.clone(),
            tool_metrics = run_output.tool_metrics,
            response_count = run_output.response_count,
            token_usage = run_output.token_usage,
            model = model.telemetry_id(),
            model_provider = model.provider_id().to_string(),
-            repository_url = example.base.url.clone(),
-            repository_revision = example.base.revision.clone(),
+            repository_url = example.repo_url(),
+            repository_revision = example.revision(),
            diagnostic_summary_before = run_output.diagnostic_summary_before,
            diagnostic_summary_after = run_output.diagnostic_summary_after,
            diagnostics_before = run_output.diagnostics_before,
@ -618,8 +657,16 @@ async fn judge_example(
    judge_output
 }

-fn print_header(header: &str) {
-    println!("\n========================================");
-    println!("{:^40}", header);
-    println!("========================================\n");
+const HEADER_WIDTH: usize = 65;
+
+/// Prints a top-level section banner: `header` centered between two full-width `=` rules.
+fn print_h1(header: &str) {
+    let rule = "=".repeat(HEADER_WIDTH);
+    println!("\n\n{rule}");
+    println!("{header:^HEADER_WIDTH$}");
+    println!("{rule}\n");
+}
+
+/// Prints a sub-section banner: `header` centered between two full-width `-` rules.
+fn print_h2(header: &str) {
+    let rule = "-".repeat(HEADER_WIDTH);
+    println!("\n{rule}");
+    println!("{header:^HEADER_WIDTH$}");
+    println!("{rule}\n");
 }
--- a/crates/eval/src/example.rs
+++ b/crates/eval/src/example.rs
--- a/crates/eval/src/examples/file_search.rs
+++ b/crates/eval/src/examples/file_search.rs
@ -0,0 +1,53 @@
+use anyhow::Result;
+use assistant_tools::PathSearchToolInput;
+use async_trait::async_trait;
+use regex::Regex;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata};
+
+/// Programmatic example that runs a single turn and asserts the agent responds with a
+/// `path_search` tool call whose glob targets `find_replace_file_tool.rs`.
+pub struct FileSearchExample;
+
+#[async_trait(?Send)]
+impl Example for FileSearchExample {
+    /// Identifies the example and names the repository URL and revision it uses.
+    fn meta(&self) -> ExampleMetadata {
+        ExampleMetadata {
+            name: "file_search".to_string(),
+            url: "https://github.com/zed-industries/zed.git".to_string(),
+            revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
+            language_server: None,
+            max_assertions: Some(4),
+        }
+    }
+
+    /// Sends a prompt that names a file without its path, runs one turn, and checks that the
+    /// response uses `path_search` with a glob that ends in the file name and begins with
+    /// `**/` or `zed/`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by the turn, by the tool/input expectations, or by
+    /// `cx.assert`.
+    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+        const FILENAME: &str = "find_replace_file_tool.rs";
+        cx.push_user_message(format!(
+                r#"
+        Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
+
+        The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
+        markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
+        background for lines that were added. We should have a div per diff line.
+        "#
+        ));
+
+        let response = cx.run_turn().await?;
+        let tool_use = response.expect_tool("path_search", cx)?;
+        let input = tool_use.expect_input::<PathSearchToolInput>(cx)?;
+
+        let glob = input.glob;
+        cx.assert(
+            glob.ends_with(FILENAME),
+            format!("glob ends with `{FILENAME}`"),
+        )?;
+
+        let without_filename = glob.replace(FILENAME, "");
+        // Anchored with `^`: the assertion is about how the glob *starts*. Without the anchor,
+        // `is_match` would accept `**/` or `zed/` appearing anywhere in the glob.
+        let matches = Regex::new(r"^(\*\*|zed)/(\*\*?/)?")
+            .expect("hard-coded glob-prefix pattern is a valid regex")
+            .is_match(&without_filename);
+
+        cx.assert(matches, "glob starts with either `**` or `zed`")?;
+
+        Ok(())
+    }
+}
--- a/crates/eval/src/examples/find_and_replace_diff_card.toml
+++ b/crates/eval/src/examples/find_and_replace_diff_card.toml
@ -0,0 +1,43 @@
+url = "https://github.com/zed-industries/zed.git"
+revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
+language_extension = "rs"
+
+prompt = """
+Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
+The card should implement the `Render` trait.
+
+The card should show a diff. It should be a beautifully presented diff.
+The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
+I want to see a red background for lines that were deleted and a green background for lines
+that were added. We should have a div per diff line.
+"""
+
+[diff_assertions]
+
+modify_find_and_replace_tool = """
+The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
+The struct should contain an `output` field that is the same as the task we were returning before,
+and a new `card` field that contains a view for the card.
+"""
+
+card_implementation = """
+The card should be a view that displays a diff.
+Each line in the diff should be colored according to whether it was added, removed or unchanged.
+"""
+
+[thread_assertions]
+
+path_search = """
+The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
+(*Not* grep, for example, or reading the file based on a guess at the path.)
+This is because we gave the model a filename and it needs to turn that into a real path.
+"""
+
+read_file_from_path_search = """
+After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
+"""
+
+symbol_search = """
+When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
+on what path the Render trait might be in.
+"""
--- a/crates/eval/src/examples/mod.rs
+++ b/crates/eval/src/examples/mod.rs
@ -0,0 +1,128 @@
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::Deserialize;
+use std::collections::BTreeMap;
+use std::fs;
+use std::{
+    path::{Path, PathBuf},
+    rc::Rc,
+};
+use util::serde::default_true;
+
+use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
+
+mod file_search;
+
+/// Returns every eval example: the programmatic ones registered in this module, followed by
+/// one declarative example per `.toml` file found directly inside `examples_dir`.
+///
+/// # Panics
+///
+/// Panics if `examples_dir` cannot be listed or if any `.toml` file fails to load. Errors
+/// surfaced here are reported together with the path involved.
+pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
+    let mut examples: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];
+
+    let declarative_paths = list_declarative_examples(examples_dir).unwrap_or_else(|err| {
+        panic!(
+            "failed to list examples in {}: {err:#}",
+            examples_dir.display()
+        )
+    });
+
+    for example_path in declarative_paths {
+        let example = DeclarativeExample::load(&example_path).unwrap_or_else(|err| {
+            panic!("failed to load example {}: {err:#}", example_path.display())
+        });
+        examples.push(Rc::new(example));
+    }
+
+    examples
+}
+
+/// An example described by a `.toml` file: a single prompt plus judge assertions for the
+/// diff and for the thread.
+struct DeclarativeExample {
+    metadata: ExampleMetadata,
+    /// User message pushed at the start of the conversation.
+    prompt: String,
+    /// Assertions built from the file's `diff_assertions` table.
+    diff_assertions: Vec<JudgeAssertion>,
+    /// Assertions built from the file's `thread_assertions` table.
+    thread_assertions: Vec<JudgeAssertion>,
+}
+
+impl DeclarativeExample {
+    /// Loads a declarative example from the TOML file at `example_path`.
+    ///
+    /// The example is named after the file stem. A language server is configured when
+    /// `require_lsp` is true, which in turn requires `language_extension` to be set.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file cannot be read, cannot be deserialized into
+    /// [`ExampleToml`], or sets `require_lsp = true` without a `language_extension`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `example_path` has no file stem (see [`Self::name_from_path`]).
+    pub fn load(example_path: &Path) -> Result<Self> {
+        let name = Self::name_from_path(example_path);
+
+        let contents = fs::read_to_string(example_path).map_err(|err| {
+            anyhow::Error::from(err)
+                .context(format!("failed to read {}", example_path.display()))
+        })?;
+        let base: ExampleToml = toml::from_str(&contents).map_err(|err| {
+            anyhow::Error::from(err)
+                .context(format!("failed to parse {}", example_path.display()))
+        })?;
+
+        let language_server = if base.require_lsp {
+            // A missing extension is a mistake in the example file, so surface it as an
+            // error to the caller instead of panicking inside a fallible function.
+            let file_extension = base.language_extension.ok_or_else(|| {
+                anyhow::anyhow!("Language extension is required when require_lsp = true")
+            })?;
+            Some(crate::example::LanguageServer {
+                file_extension,
+                allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
+            })
+        } else {
+            None
+        };
+
+        let metadata = ExampleMetadata {
+            name,
+            url: base.url,
+            revision: base.revision,
+            language_server,
+            max_assertions: None,
+        };
+
+        Ok(DeclarativeExample {
+            metadata,
+            prompt: base.prompt,
+            thread_assertions: base
+                .thread_assertions
+                .into_iter()
+                .map(|(id, description)| JudgeAssertion { id, description })
+                .collect(),
+            diff_assertions: base
+                .diff_assertions
+                .into_iter()
+                .map(|(id, description)| JudgeAssertion { id, description })
+                .collect(),
+        })
+    }
+
+    /// Derives the example name from the file stem of `path` (lossily converted to UTF-8).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` has no file stem.
+    pub fn name_from_path(path: &Path) -> String {
+        path.file_stem().unwrap().to_string_lossy().into_owned()
+    }
+}
+
+/// Deserialized form of a declarative example `.toml` file.
+#[derive(Clone, Debug, Deserialize)]
+pub struct ExampleToml {
+    /// Copied into the example metadata as `url`.
+    pub url: String,
+    /// Copied into the example metadata as `revision`.
+    pub revision: String,
+    /// File extension for the language server config; required when `require_lsp` is true.
+    pub language_extension: Option<String>,
+    // NOTE(review): not read anywhere in this module — confirm whether it is still needed.
+    pub insert_id: Option<String>,
+    /// Whether a language server is configured. Defaults via `default_true` when omitted.
+    #[serde(default = "default_true")]
+    pub require_lsp: bool,
+    /// Forwarded to the language server config. Defaults to `false` when omitted.
+    #[serde(default)]
+    pub allow_preexisting_diagnostics: bool,
+    /// User message that starts the conversation.
+    pub prompt: String,
+    /// Assertion id -> description pairs for the diff. Empty when omitted.
+    #[serde(default)]
+    pub diff_assertions: BTreeMap<String, String>,
+    /// Assertion id -> description pairs for the thread. Empty when omitted.
+    #[serde(default)]
+    pub thread_assertions: BTreeMap<String, String>,
+}
+
+#[async_trait(?Send)]
+impl Example for DeclarativeExample {
+    /// Returns a clone of the metadata built when the example was loaded.
+    fn meta(&self) -> ExampleMetadata {
+        self.metadata.clone()
+    }
+
+    /// Pushes the configured prompt as a user message and runs the conversation to its end.
+    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
+        cx.push_user_message(&self.prompt);
+        // The outcome of the run is discarded, so this method always returns `Ok(())`.
+        // NOTE(review): presumably deliberate so judging still happens after a failed run —
+        // confirm that errors from `run_to_end` are reported elsewhere.
+        let _ = cx.run_to_end().await;
+        Ok(())
+    }
+
+    /// Returns a copy of the diff assertions loaded from the example file.
+    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
+        self.diff_assertions.clone()
+    }
+
+    /// Returns a copy of the thread assertions loaded from the example file.
+    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
+        self.thread_assertions.clone()
+    }
+}
+
+/// Lists the `.toml` files directly inside `examples_dir` (no recursion), as canonicalized
+/// paths sorted so the resulting example order does not depend on the filesystem.
+///
+/// # Errors
+///
+/// Returns an error if `examples_dir` cannot be canonicalized or read, or if reading any
+/// directory entry fails.
+fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
+    let examples_dir = std::fs::canonicalize(examples_dir)?;
+    let mut result_paths = Vec::new();
+    for entry in std::fs::read_dir(examples_dir)? {
+        let path = entry?.path();
+        if path.extension() == Some("toml".as_ref()) {
+            result_paths.push(path);
+        }
+    }
+    // `read_dir` yields entries in a platform- and filesystem-dependent order.
+    result_paths.sort();
+    Ok(result_paths)
+}
--- a/crates/eval/src/instance.rs
+++ b/crates/eval/src/instance.rs
--- a/crates/eval/src/judge_diff_prompt.hbs
+++ b/crates/eval/src/judge_diff_prompt.hbs
@ -1,5 +1,5 @@
-You are an expert software developer. Your task is to evaluate a diff produced by an AI agent in response to a prompt.
-Here is the prompt and the diff:
+You are an expert software developer. Your task is to evaluate a diff produced by an AI agent
+in response to a prompt. Here is the prompt and the diff:

 <prompt>
 {{{prompt}}}
@ -9,17 +9,17 @@ Here is the prompt and the diff:
 {{{repository_diff}}}
 </diff>

-Evaluate how many of the following criteria were satisfied by the diff:
+Evaluate whether or not the diff passes the following assertion:

-<criteria>
-{{criteria}}
- There are no changes unrelated to the prompt
-</criteria>
+<assertion>
+{{{assertion}}}
+</assertion>

 Analyze the diff hunk by hunk, and structure your answer in the following XML format:

 ```
 <analysis>{YOUR ANALYSIS HERE}</analysis>
-<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
-<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE DIFF}</passing_criteria>
+<passed>{PASSED_ASSERTION}</passed>
 ```
+
+Where `PASSED_ASSERTION` is either `true` or `false`.
--- a/crates/eval/src/judge_thread_prompt.hbs
+++ b/crates/eval/src/judge_thread_prompt.hbs
@ -1,19 +1,21 @@
-You are an expert software developer. Your task is to evaluate an AI agent's messages and tool calls in this conversation:
+You are an expert software developer.
+Your task is to evaluate an AI agent's messages and tool calls in this conversation:

 <messages>
 {{{messages}}}
 </messages>

-You must count how many of the following criteria were satisfied by the messages:
+Evaluate whether or not the sequence of messages passes the following assertion:

-<criteria>
-{{{criteria}}}
-</criteria>
+<assertion>
+{{{assertion}}}
+</assertion>

 Analyze the messages one by one, and structure your answer in the following XML format:

 ```
 <analysis>{YOUR ANALYSIS HERE}</analysis>
-<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
-<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE MESSAGES}</passing_criteria>
+<passed>{PASSED_ASSERTION}</passed>
 ```
+
+Where `PASSED_ASSERTION` is either `true` or `false`.
--- a/crates/eval/src/tool_metrics.rs
+++ b/crates/eval/src/tool_metrics.rs
@ -24,6 +24,10 @@ impl ToolMetrics {
            *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
        }
    }
+
+    /// Returns `true` when both `use_counts` and `failure_counts` contain no entries.
+    pub fn is_empty(&self) -> bool {
+        self.use_counts.is_empty() && self.failure_counts.is_empty()
+    }
 }

 impl Display for ToolMetrics {
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
            let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
            writeln!(
                f,
-                "│{:^30}│{:^10}│{:^10}│{:^10}│",
+                "│{:<30}│{:^10}│{:^10}│{:^10}│",
                tool_name,
                use_count,
                failure_count,