eval: Fine-grained assertions (#29246)

- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs)) - Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml)) - Run judge on individual assertions (previously called "criteria") - Report judge and programmatic assertions in one combined table Note: We still need to work on concept naming <img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5"> Release Notes: - N/A --------- Co-authored-by: Richard Feldman <oss@rtfeldman.com> Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com> Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-22 23:58:58 -03:00 · 2025-04-22 23:58:58 -03:00 · ce1a674eba
commit ce1a674eba
parent 0d3fe474db
18 changed files with 1969 additions and 1229 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4895,6 +4895,7 @@ dependencies = [
 "anyhow",
 "assistant_tool",
 "assistant_tools",
 "async-trait",
 "async-watch",
 "chrono",
 "clap",
@ -4915,13 +4916,14 @@ dependencies = [
 "language_models",
 "languages",
 "node_runtime",
 "parking_lot",
 "paths",
 "project",
 "prompt_store",
 "regex",
 "release_channel",
 "reqwest_client",
 "serde",
 "serde_json",
 "settings",
 "shellexpand 2.1.2",
 "smol",
--- a/crates/agent/src/thread.rs
+++ b/crates/agent/src/thread.rs
@ -315,6 +315,7 @@ pub struct Thread {
    request_callback: Option<
        Box<dyn FnMut(&LanguageModelRequest, &[Result<LanguageModelCompletionEvent, String>])>,
    >,
    remaining_turns: u32,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
@ -368,6 +369,7 @@ impl Thread {
            message_feedback: HashMap::default(),
            last_auto_capture_at: None,
            request_callback: None,
            remaining_turns: u32::MAX,
        }
    }
@ -442,6 +444,7 @@ impl Thread {
            message_feedback: HashMap::default(),
            last_auto_capture_at: None,
            request_callback: None,
            remaining_turns: u32::MAX,
        }
    }
@ -522,7 +525,7 @@ impl Thread {
        self.messages.iter().find(|message| message.id == id)
    }
-    pub fn messages(&self) -> impl Iterator<Item = &Message> {
+    pub fn messages(&self) -> impl ExactSizeIterator<Item = &Message> {
        self.messages.iter()
    }
@ -958,7 +961,21 @@ impl Thread {
        })
    }
    /// Returns the number of turns this thread may still send to the model.
    ///
    /// `send_to_model` returns early without sending when this is `0`, and
    /// decrements it by one otherwise.
    pub fn remaining_turns(&self) -> u32 {
        self.remaining_turns
    }
    /// Overwrites the number of turns this thread may still send to the model.
    ///
    /// Setting this to `0` makes subsequent `send_to_model` calls return
    /// without sending anything.
    pub fn set_remaining_turns(&mut self, remaining_turns: u32) {
        self.remaining_turns = remaining_turns;
    }
    pub fn send_to_model(&mut self, model: Arc<dyn LanguageModel>, cx: &mut Context<Self>) {
        if self.remaining_turns == 0 {
            return;
        }
        self.remaining_turns -= 1;
        let mut request = self.to_completion_request(cx);
        if model.supports_tools() {
            request.tools = {
--- a/crates/assistant_tools/src/assistant_tools.rs
+++ b/crates/assistant_tools/src/assistant_tools.rs
@ -56,6 +56,8 @@ use crate::symbol_info_tool::SymbolInfoTool;
 use crate::terminal_tool::TerminalTool;
 use crate::thinking_tool::ThinkingTool;
 pub use path_search_tool::PathSearchToolInput;
 pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
    assistant_tool::init(cx);
--- a/crates/eval/Cargo.toml
+++ b/crates/eval/Cargo.toml
@ -9,6 +9,7 @@ agent.workspace = true
 anyhow.workspace = true
 assistant_tool.workspace = true
 assistant_tools.workspace = true
 async-trait.workspace = true
 async-watch.workspace = true
 chrono.workspace = true
 clap.workspace = true
@ -29,13 +30,14 @@ language_model.workspace = true
 language_models.workspace = true
 languages = { workspace = true, features = ["load-grammars"] }
 node_runtime.workspace = true
 parking_lot.workspace = true
 paths.workspace = true
 project.workspace = true
 prompt_store.workspace = true
 regex.workspace = true
 release_channel.workspace = true
 reqwest_client.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 settings.workspace = true
 shellexpand.workspace = true
 smol.workspace = true
@ -45,7 +47,6 @@ unindent.workspace = true
 util.workspace = true
 uuid = { version = "1.6", features = ["v4"] }
 workspace-hack.workspace = true
 [[bin]]
 name = "eval"
 path = "src/eval.rs"
--- a/crates/eval/examples/find_and_replace_diff_card/base.toml
+++ b/crates/eval/examples/find_and_replace_diff_card/base.toml
@ -1,3 +0,0 @@
 url = "https://github.com/zed-industries/zed.git"
 revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
 language_extension = "rs"
--- a/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md
+++ b/crates/eval/examples/find_and_replace_diff_card/diff_criteria.md
@ -1,2 +0,0 @@
 - The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the task we were returning before, and a new `card` field that contains a view for the card.
 - The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged.
--- a/crates/eval/examples/find_and_replace_diff_card/prompt.md
+++ b/crates/eval/examples/find_and_replace_diff_card/prompt.md
@ -1,3 +0,0 @@
 Look at the `find_replace_file_tool.rs`. I want to implement a card for it. The card should implement the `Render` trait.
 The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green background for lines that were added. We should have a div per diff line.
--- a/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md
+++ b/crates/eval/examples/find_and_replace_diff_card/thread_criteria.md
@ -1,3 +0,0 @@
 - The first tool call should be to path search including "find_replace_file_tool.rs" in the string. (*Not* grep, for example, or reading the file based on a guess at the path.) This is because we gave the model a filename and it needs to turn that into a real path.
 - After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
 - When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information on what path the Render trait might be in.
--- a/crates/eval/src/assertions.rs
+++ b/crates/eval/src/assertions.rs
@ -0,0 +1,157 @@
 use serde::{Deserialize, Serialize};
 use std::fmt::Write;
 use std::fmt::{self};
 /// A collection of assertion outcomes plus an optional expected assertion count.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct AssertionsReport {
    /// The assertions that were evaluated.
    pub ran: Vec<RanAssertion>,
    /// Optional lower bound for `total_count`: the total is never reported as
    /// smaller than this, even if fewer assertions ran.
    pub max: Option<usize>,
 }
 /// A single evaluated assertion.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct RanAssertion {
    /// Text shown in the "Assertion" column of the results table.
    pub id: String,
    /// `Ok` with the outcome when the assertion could be evaluated; `Err` with a
    /// message otherwise (rendered as "Judge Error" by `display_table_row`).
    pub result: Result<RanAssertionResult, String>,
 }
 /// The outcome of an assertion that was successfully evaluated.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct RanAssertionResult {
    /// Optional free-form explanation attached to the outcome.
    // NOTE(review): presumably the judge's reasoning; confirm against the producer.
    pub analysis: Option<String>,
    /// Whether the assertion held.
    pub passed: bool,
 }
 impl AssertionsReport {
    pub fn new(max: Option<usize>) -> Self {
        AssertionsReport {
            ran: Vec::new(),
            max,
        }
    }
    pub fn is_empty(&self) -> bool {
        self.ran.is_empty()
    }
    pub fn total_count(&self) -> usize {
        self.run_count().max(self.max.unwrap_or(0))
    }
    pub fn run_count(&self) -> usize {
        self.ran.len()
    }
    pub fn passed_count(&self) -> usize {
        self.ran
            .iter()
            .filter(|a| a.result.as_ref().map_or(false, |result| result.passed))
            .count()
    }
    pub fn passed_percentage(&self) -> f32 {
        if self.total_count() == 0 {
            0.0
        } else {
            (self.passed_count() as f32 / self.total_count() as f32) * 100.0
        }
    }
 }
 /// Width of the "Round" column: exactly as wide as its header label.
 const ROUND_WIDTH: usize = "Round".len();
 /// Width of the "Assertion" column; longer ids are shortened by `truncate`.
 const ASSERTIONS_WIDTH: usize = 42;
 /// Width of the "Result" column.
 const RESULTS_WIDTH: usize = 8;
 /// Prints the top border, the column titles and the rule below them to stdout.
 pub fn print_table_header() {
    // The horizontal rules are identical in the top border and the title separator.
    let round_rule = "─".repeat(ROUND_WIDTH);
    let assertions_rule = "─".repeat(ASSERTIONS_WIDTH);
    let results_rule = "─".repeat(RESULTS_WIDTH);

    println!(
        "┌─{}─┬─{}─┬─{}─┐",
        round_rule, assertions_rule, results_rule
    );
    println!(
        "│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │",
        "Round", "Assertion", "Result"
    );
    println!(
        "├─{}─┼─{}─┼─{}─┤",
        round_rule, assertions_rule, results_rule
    );
 }
 /// Appends a table row for a round that failed to run.
 ///
 /// The error message occupies a single cell spanning both the "Assertion" and
 /// "Result" columns, and is shortened by `truncate` if it does not fit.
 ///
 /// # Errors
 ///
 /// Returns the error reported by `writeln!` when writing into `f` fails.
 pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result {
    // Regular rows separate the last two columns with " │ " (three characters).
    // The merged cell has to cover those as well, otherwise the right border of
    // an error row ends up three columns short of the rest of the table.
    const COLUMN_SEPARATOR_WIDTH: usize = 3;
    let last_two_columns = ASSERTIONS_WIDTH + COLUMN_SEPARATOR_WIDTH + RESULTS_WIDTH;
    writeln!(
        f,
        "│ {:^ROUND_WIDTH$} │ {:<last_two_columns$} │",
        round,
        truncate(&error, last_two_columns)
    )
 }
 /// Appends one table row describing the outcome of `assertion` in `round`.
 ///
 /// The result cell is colored with ANSI escape sequences: green for a pass,
 /// red for a failure or an evaluation error.
 ///
 /// NOTE: the escape sequences count toward the formatter's width, so every
 /// label is already longer than `RESULTS_WIDTH` and receives no padding.
 ///
 /// # Errors
 ///
 /// Returns the error reported by `writeln!` when writing into `f` fails.
 pub fn display_table_row(f: &mut String, round: usize, assertion: &RanAssertion) -> fmt::Result {
    let outcome_label = if let Ok(outcome) = &assertion.result {
        if outcome.passed {
            "\x1b[32m✔︎ Passed\x1b[0m"
        } else {
            "\x1b[31m✗ Failed\x1b[0m"
        }
    } else {
        "\x1b[31m💥 Judge Error\x1b[0m"
    };
    let assertion_cell = truncate(&assertion.id, ASSERTIONS_WIDTH);

    writeln!(
        f,
        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
        round, assertion_cell, outcome_label
    )
 }
 /// Prints a "total" row with the combined pass percentage of `reports`.
 ///
 /// `round` is the label shown in the first column (a round number or a label
 /// such as `"avg"`). The percentage is the number of passed assertions divided
 /// by the sum of `total_count()` over all reports, rounded down.
 ///
 /// When the reports contain no assertions at all the row shows `0%`, matching
 /// `AssertionsReport::passed_percentage`, instead of dividing zero by zero and
 /// printing `NaN%`.
 pub fn print_table_round_summary<'a>(
    round: &str,
    reports: impl Iterator<Item = &'a AssertionsReport>,
 ) {
    let mut passed = 0;
    let mut total = 0;
    for report in reports {
        passed += report.passed_count();
        total += report.total_count();
    }
    // Guard the empty case: 0.0 / 0.0 is NaN for floats.
    let percentage = if total == 0 {
        0.0
    } else {
        (passed as f32 / total as f32 * 100.0).floor()
    };
    println!(
        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
        round,
        "total",
        format!("{}%", percentage)
    )
 }
 /// Prints the bottom border of the results table to stdout.
 pub fn print_table_footer() {
    let [round_rule, assertions_rule, results_rule] =
        [ROUND_WIDTH, ASSERTIONS_WIDTH, RESULTS_WIDTH].map(|width| "─".repeat(width));
    println!(
        "└─{}─┴─{}─┴─{}─┘",
        round_rule, assertions_rule, results_rule
    );
 }
 /// Prints a horizontal rule that separates sections inside the results table.
 pub fn print_table_divider() {
    let [round_rule, assertions_rule, results_rule] =
        [ROUND_WIDTH, ASSERTIONS_WIDTH, RESULTS_WIDTH].map(|width| "─".repeat(width));
    println!(
        "├─{}─┼─{}─┼─{}─┤",
        round_rule, assertions_rule, results_rule
    );
 }
 /// Shortens `assertion` so that it is at most `max_width` characters long.
 ///
 /// Text that already fits is returned unchanged. Otherwise the first
 /// `max_width - 1` characters are kept and a trailing `…` marks the cut. A
 /// `max_width` of zero yields an empty string.
 ///
 /// Length is measured in `char`s rather than bytes because that is the unit the
 /// formatter uses when padding a cell to a fixed width. Measuring in bytes cut
 /// off non-ASCII text that would have fit.
 fn truncate(assertion: &str, max_width: usize) -> String {
    if assertion.chars().count() <= max_width {
        return assertion.to_string();
    }
    // Reserve one character for the ellipsis; with no room at all, emit nothing
    // rather than underflowing.
    let Some(kept_chars) = max_width.checked_sub(1) else {
        return String::new();
    };
    let mut truncated: String = assertion.chars().take(kept_chars).collect();
    truncated.push('…');
    truncated
 }
--- a/crates/eval/src/eval.rs
+++ b/crates/eval/src/eval.rs
@ -1,13 +1,16 @@
 mod assertions;
 mod example;
 mod examples;
 mod ids;
 mod instance;
 mod tool_metrics;
-pub(crate) use example::*;
+use assertions::display_error_row;
-use parking_lot::Mutex;
+use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 pub(crate) use tool_metrics::*;
 use ::fs::RealFs;
-use anyhow::{Result, anyhow};
+use anyhow::anyhow;
 use clap::Parser;
 use client::{Client, ProxySettings, UserStore};
 use collections::{HashMap, HashSet};
@ -25,18 +28,20 @@ use prompt_store::PromptBuilder;
 use release_channel::AppVersion;
 use reqwest_client::ReqwestClient;
 use settings::{Settings, SettingsStore};
 use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::env;
 use std::path::{Path, PathBuf};
 use std::rc::Rc;
 use std::sync::Arc;
 use util::ResultExt as _;
 #[derive(Parser, Debug)]
 #[command(name = "eval", disable_version_flag = true)]
 struct Args {
-    /// Runs all examples that contain these substrings. If unspecified, all examples are run.
+    /// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
    #[arg(value_name = "EXAMPLE_SUBSTRING")]
-    examples: Vec<String>,
+    filter: Vec<String>,
    /// Model to use (default: "claude-3-7-sonnet-latest")
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model: String,
@ -66,43 +71,30 @@ fn main() {
        .parent()
        .unwrap()
        .parent()
        .unwrap()
        .canonicalize()
        .unwrap();
-    let eval_crate_dir = root_dir.join("crates/eval");
+    let eval_crate_dir = root_dir.join("crates").join("eval");
    let repos_dir = eval_crate_dir.join("repos");
    let worktrees_dir = eval_crate_dir.join("worktrees");
-    let examples_dir = eval_crate_dir.join("examples");
+    let examples_dir = eval_crate_dir.join("src").join("examples");
-    let runs_dir = eval_crate_dir.join("runs");
+    let run_dir = eval_crate_dir
-    let run_dir = runs_dir.join(format!("{}", run_timestamp));
+        .join("runs")
        .join(format!("{}", run_timestamp));
    std::fs::create_dir_all(&run_dir).unwrap();
    std::fs::create_dir_all(&repos_dir).unwrap();
    std::fs::create_dir_all(&worktrees_dir).unwrap();
    std::fs::create_dir_all(&examples_dir).unwrap();
    std::fs::create_dir_all(&paths::config_dir()).unwrap();
-    let zed_commit_sha = commit_sha_for_path(root_dir);
+    let zed_commit_sha = commit_sha_for_path(&root_dir);
-    let zed_branch_name = git_branch_for_path(root_dir);
+    let zed_branch_name = git_branch_for_path(&root_dir);
    let args = Args::parse();
-    let all_available_examples = list_all_examples(&examples_dir).unwrap();
+    let languages: HashSet<String> = args.languages.into_iter().collect();
    let example_paths = all_available_examples
        .iter()
        .filter_map(|example_path| {
            let name = example_path.file_name()?.to_string_lossy();
            if args.examples.is_empty()
                || args
                    .examples
                    .iter()
                    .any(|name_substring| name.contains(name_substring))
            {
                Some(example_path.clone())
            } else {
                None
            }
        })
        .collect::<Vec<_>>();
    let http_client = Arc::new(ReqwestClient::new());
    let app = Application::headless().with_http_client(http_client.clone());
    let all_threads = examples::all(&examples_dir);
    app.run(move |cx| {
        let app_state = init(cx);
@ -163,28 +155,40 @@ fn main() {
            let mut skipped = Vec::new();
-            for example_path in &example_paths {
+            for thread in all_threads {
-                let example = Example::load_from_directory(
+                let meta = thread.meta();
-                    example_path,
+                if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
                    &run_dir,
                    &worktrees_dir,
                    &repos_dir,
                )?;
                if !example
                    .base
                    .language_extension
                    .as_ref()
                    .map_or(false, |lang| args.languages.contains(lang))
                {
-                    skipped.push(example.name);
+                    skipped.push(meta.name);
                    continue;
                }
-                examples.extend(example.repeat(args.repetitions));
+                if meta.language_server.map_or(false, |language| {
                    !languages.contains(&language.file_extension)
                }) {
                    skipped.push(meta.name);
                    continue;
                }
                // TODO: This creates a worktree per repetition. Ideally these examples should
                // either be run sequentially on the same worktree, or reuse worktrees when there
                // are more examples to run than the concurrency limit.
                for repetition_number in 0..args.repetitions {
                    let example_instance = ExampleInstance::new(
                        thread.clone(),
                        &repos_dir,
                        &run_dir,
                        &worktrees_dir,
                        repetition_number,
                    );
                    examples.push(example_instance);
                }
            }
-            println!("Skipped examples: {}\n", skipped.join(", "));
+            if !skipped.is_empty() {
                println!("Skipped threads: {}", skipped.join(", "));
            }
            if examples.is_empty() {
                eprintln!("Filter matched no examples");
@ -196,22 +200,23 @@ fn main() {
            let max_name_width = examples
                .iter()
-                .map(|e| e.repetition_name().len())
+                .map(|e| e.worktree_name().len())
                .max()
                .unwrap_or(0);
-            for (i, example) in examples.iter_mut().enumerate() {
+
            for (i, example_instance) in examples.iter_mut().enumerate() {
                let color = COLORS[i % COLORS.len()].to_string();
-                example.set_log_prefix_style(&color, max_name_width);
+                example_instance.set_log_prefix_style(&color, max_name_width);
                println!(
                    "{}Logging to: {}",
-                    example.log_prefix,
+                    example_instance.log_prefix,
-                    example.run_directory_path().display()
+                    example_instance.run_directory.display()
                );
-                let repo_url = example.base.url.clone();
+                let repo_url = example_instance.repo_url();
                if repo_urls.insert(repo_url.clone()) {
-                    let repo_path = example.repo_path.clone();
+                    let repo_path = example_instance.repo_path.clone();
                    if !repo_path.join(".git").is_dir() {
                        println!(
@ -251,12 +256,12 @@ fn main() {
            future::join_all(clone_tasks).await;
-            for example in examples.iter_mut() {
+            for example_instance in examples.iter_mut() {
-                example.fetch().await?;
+                example_instance.fetch().await?;
            }
-            let examples = Arc::new(Mutex::new(VecDeque::from(examples)));
+            let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
-            let results_by_example_name = Arc::new(Mutex::new(HashMap::default()));
+            let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
            future::join_all((0..args.concurrency).map(|_| {
                let app_state = app_state.clone();
@ -268,7 +273,7 @@ fn main() {
                let results = results_by_example_name.clone();
                cx.spawn(async move |cx| {
                    loop {
-                        let Some(mut example) = examples.lock().pop_front() else {
+                        let Some(mut example) = examples.borrow_mut().pop_front() else {
                            break;
                        };
                        let result = async {
@ -291,7 +296,7 @@ fn main() {
                        }
                        .await;
                        results
-                            .lock()
+                            .borrow_mut()
                            .entry(example.name.clone())
                            .or_insert(Vec::new())
                            .push((example.clone(), result));
@ -300,98 +305,156 @@ fn main() {
            }))
            .await;
-            println!("\n\n");
+            print_h1("EVAL RESULTS");
            print_header("EVAL RESULTS");
            let mut diff_scores = Vec::new();
            let mut thread_scores = Vec::new();
            let mut programmatic_scores = Vec::new();
            let mut error_count = 0;
-            for (example_name, results) in results_by_example_name.lock().iter_mut() {
+            for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
-                print_header(&example_name);
+                print_h2(&example_name);
                results.sort_unstable_by_key(|(example, _)| example.repetition);
                let mut example_cumulative_tool_metrics = ToolMetrics::default();
-                println!("┌───────┬──────┬────────┐");
+                let mut table_rows = String::new();
                println!("│ Round │ Diff │ Thread │");
                println!("├───────┼──────┼────────┤");
                for (example, result) in results {
                    let run_dir_path = example.run_directory_path();
                    let relative_run_dir_path = run_dir_path.strip_prefix(root_dir).unwrap();
                for (example, result) in results.iter() {
                    match result {
                        Err(err) => {
-                            println!(
+                            display_error_row(
-                                "|{:^7}│{:^6}│{:^8}│ {:?}{}",
+                                &mut table_rows,
                                example.repetition,
-                                "N/A",
+                                err.to_string(),
-                                "N/A",
+                            )?;
                                err,
                                relative_run_dir_path.display()
                            );
                            error_count += 1;
                        }
-                        Ok((run_output, judge_result)) => {
+                        Ok((run_output, judge_output)) => {
                            cumulative_tool_metrics.merge(&run_output.tool_metrics);
                            example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
-                            match judge_result {
+                            if !run_output.programmatic_assertions.total_count() > 0 {
-                                Ok(judge_output) => {
+                                for assertion in &run_output.programmatic_assertions.ran {
-                                    diff_scores.push(judge_output.diff.score());
+                                    assertions::display_table_row(
-                                    thread_scores.push(judge_output.thread.score());
+                                        &mut table_rows,
                                    println!(
                                        "|{:^7}│{:^6}│{:^8}│ {}",
                                        example.repetition,
-                                        format!("{}%", judge_output.diff.score()),
+                                        assertion,
-                                        format!("{}%", judge_output.thread.score()),
+                                    )?;
                                        relative_run_dir_path.display()
                                    );
                                }
-                                Err(err) => {
+
-                                    println!(
+                                programmatic_scores
-                                        "|{:^7}│{:^6}│{:^8}│{:?}│ {}",
+                                    .push(run_output.programmatic_assertions.passed_percentage())
                            }
                            if !judge_output.diff.is_empty() {
                                diff_scores.push(judge_output.diff.passed_percentage());
                                for assertion in &judge_output.diff.ran {
                                    assertions::display_table_row(
                                        &mut table_rows,
                                        example.repetition,
-                                        "N/A",
+                                        assertion,
-                                        "N/A",
+                                    )?;
-                                        err,
+                                }
-                                        relative_run_dir_path.display()
+                            }
-                                    );
+
                            if !judge_output.thread.is_empty() {
                                thread_scores.push(judge_output.thread.passed_percentage());
                                for assertion in &judge_output.thread.ran {
                                    assertions::display_table_row(
                                        &mut table_rows,
                                        example.repetition,
                                        assertion,
                                    )?;
                                }
                            }
                        }
                    }
                }
-                println!("└───────┴──────┴────────┘");
+                if !table_rows.is_empty() {
-                println!("{}", example_cumulative_tool_metrics);
+                    assertions::print_table_header();
                    print!("{}", table_rows);
                    assertions::print_table_divider();
                    for (example, result) in results.iter() {
                        if let Ok((run_output, judge_output)) = result {
                            assertions::print_table_round_summary(
                                &example.repetition.to_string(),
                                [
                                    &run_output.programmatic_assertions,
                                    &judge_output.diff,
                                    &judge_output.thread,
                                ]
                                .into_iter(),
                            )
                        }
                    }
                    assertions::print_table_divider();
                    assertions::print_table_round_summary(
                        "avg",
                        results.iter().flat_map(|(_, result)| {
                            result.iter().flat_map(|(run_output, judge_output)| {
                                [
                                    &run_output.programmatic_assertions,
                                    &judge_output.diff,
                                    &judge_output.thread,
                                ]
                                .into_iter()
                            })
                        }),
                    );
                    assertions::print_table_footer();
                }
                if !example_cumulative_tool_metrics.is_empty() {
                    println!("{}", &example_cumulative_tool_metrics);
                }
            }
-            let diff_score_count = diff_scores.len();
+            if results_by_example_name.borrow().len() > 1 {
-            let average_diff_score = diff_scores
+                print_h1("AGGREGATE");
                .into_iter()
                .map(|score| score as f32)
                .sum::<f32>()
                / (diff_score_count as f32);
-            if error_count > 0 {
+                if error_count > 0 {
-                println!("\n{error_count} examples failed to run!");
+                    println!("\n{error_count} examples failed to run!");
                }
                let programmatic_score_count = programmatic_scores.len();
                if programmatic_score_count > 0 {
                    let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
                        / (programmatic_score_count as f32))
                        .floor();
                    println!("Average programmatic score: {average_programmatic_score}%");
                }
                let diff_score_count = diff_scores.len();
                if diff_score_count > 0 {
                    let average_diff_score =
                        (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
                    println!("Average diff score: {average_diff_score}%");
                }
                let thread_score_count = thread_scores.len();
                if thread_score_count > 0 {
                    let average_thread_score = (thread_scores.into_iter().sum::<f32>()
                        / (thread_score_count as f32))
                        .floor();
                    println!("Average thread score: {average_thread_score}%");
                }
                println!("");
                print_h2("CUMULATIVE TOOL METRICS");
                println!("{}", cumulative_tool_metrics);
            }
            println!("\nAverage code diff score: {average_diff_score}");
            let thread_score_count = thread_scores.len();
            let average_thread_score = thread_scores
                .into_iter()
                .map(|score| score as f32)
                .sum::<f32>()
                / (thread_score_count as f32);
            println!("\nAverage thread score: {average_thread_score}");
            print_header("CUMULATIVE TOOL METRICS");
            println!("{}", cumulative_tool_metrics);
            app_state.client.telemetry().flush_events().await;
            cx.update(|cx| cx.quit())
@ -400,20 +463,6 @@ fn main() {
    });
 }
 /// Returns the paths of all directories directly inside `examples_dir`.
 ///
 /// `examples_dir` is canonicalized first, so the returned paths are absolute.
 /// Entries that are not directories are ignored.
 ///
 /// # Errors
 ///
 /// Returns an error when `examples_dir` cannot be canonicalized or read, or
 /// when reading one of its entries fails. Previously the first two cases
 /// panicked even though the function already returns a `Result`.
 fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    let path = std::fs::canonicalize(examples_dir)?;
    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(path)? {
        let path = entry?.path();
        if path.is_dir() {
            result_paths.push(path);
        }
    }
    Ok(result_paths)
 }
 /// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
 pub struct AgentAppState {
    pub languages: Arc<LanguageRegistry>,
@ -570,7 +619,7 @@ pub fn git_branch_for_path(repo_path: &Path) -> String {
 }
 async fn judge_example(
-    example: Example,
+    example: ExampleInstance,
    model: Arc<dyn LanguageModel>,
    zed_commit_sha: &str,
    zed_branch_name: &str,
@ -578,19 +627,9 @@ async fn judge_example(
    run_output: &RunOutput,
    enable_telemetry: bool,
    cx: &AsyncApp,
-) -> Result<JudgeOutput> {
+) -> JudgeOutput {
    let judge_output = example.judge(model.clone(), &run_output, cx).await;
    let diff_evaluation;
    let thread_evaluation;
    if let Ok(output) = judge_output.as_ref() {
        diff_evaluation = Some(output.diff.clone());
        thread_evaluation = Some(output.thread.clone());
    } else {
        diff_evaluation = None;
        thread_evaluation = None;
    }
    if enable_telemetry {
        telemetry::event!(
            "Agent Example Evaluated",
@ -599,15 +638,15 @@ async fn judge_example(
            run_id = run_id,
            example_name = example.name.clone(),
            example_repetition = example.repetition,
-            diff_evaluation = diff_evaluation,
+            diff_evaluation = judge_output.diff.clone(),
-            thread_evaluation = thread_evaluation,
+            thread_evaluation = judge_output.thread.clone(),
            tool_metrics = run_output.tool_metrics,
            response_count = run_output.response_count,
            token_usage = run_output.token_usage,
            model = model.telemetry_id(),
            model_provider = model.provider_id().to_string(),
-            repository_url = example.base.url.clone(),
+            repository_url = example.repo_url(),
-            repository_revision = example.base.revision.clone(),
+            repository_revision = example.revision(),
            diagnostic_summary_before = run_output.diagnostic_summary_before,
            diagnostic_summary_after = run_output.diagnostic_summary_after,
            diagnostics_before = run_output.diagnostics_before,
@ -618,8 +657,16 @@ async fn judge_example(
    judge_output
 }
-fn print_header(header: &str) {
+const HEADER_WIDTH: usize = 65;
-    println!("\n========================================");
+
-    println!("{:^40}", header);
+fn print_h1(header: &str) {
-    println!("========================================\n");
+    println!("\n\n{:=^HEADER_WIDTH$}", "");
    println!("{:^HEADER_WIDTH$}", header);
    println!("{:=^HEADER_WIDTH$}\n", "");
 }
 /// Prints `header` centered between two dashed rules of `HEADER_WIDTH`
 /// characters, with a blank line before and after the block.
 fn print_h2(header: &str) {
    let rule = "-".repeat(HEADER_WIDTH);
    println!("\n{}", rule);
    println!("{:^HEADER_WIDTH$}", header);
    println!("{}\n", rule);
 }
--- a/crates/eval/src/example.rs
+++ b/crates/eval/src/example.rs
--- a/crates/eval/src/examples/file_search.rs
+++ b/crates/eval/src/examples/file_search.rs
@ -0,0 +1,53 @@
 use anyhow::Result;
 use assistant_tools::PathSearchToolInput;
 use async_trait::async_trait;
 use regex::Regex;
 use crate::example::{Example, ExampleContext, ExampleMetadata};
 /// Programmatic example that hands the agent a bare filename and inspects the
 /// `path_search` tool call it makes in response.
 pub struct FileSearchExample;
 #[async_trait(?Send)]
 impl Example for FileSearchExample {
    /// Metadata for this example: a pinned repository revision, no language
    /// server, and `max_assertions` set to 4.
    fn meta(&self) -> ExampleMetadata {
        ExampleMetadata {
            name: "file_search".to_string(),
            url: "https://github.com/zed-industries/zed.git".to_string(),
            revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
            language_server: None,
            max_assertions: Some(4),
        }
    }
    /// Pushes the prompt, runs a single turn, and asserts on the glob passed
    /// to the `path_search` tool.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `run_turn`, by the `expect_tool` /
    /// `expect_input` helpers, or by `cx.assert`.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        const FILENAME: &str = "find_replace_file_tool.rs";
        cx.push_user_message(format!(
                r#"
        Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
        The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
        markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
        background for lines that were added. We should have a div per diff line.
        "#
        ));
        let response = cx.run_turn().await?;
        let tool_use = response.expect_tool("path_search", cx)?;
        let input = tool_use.expect_input::<PathSearchToolInput>(cx)?;
        let glob = input.glob;
        cx.assert(
            glob.ends_with(FILENAME),
            format!("glob ends with `{FILENAME}`"),
        )?;
        let without_filename = glob.replace(FILENAME, "");
        // `is_match` is an unanchored search, so the pattern is pinned to the
        // start with `^`: the directory part must *begin* with `**/` or `zed/`,
        // as the assertion message states, rather than merely contain it.
        let matches = Regex::new("^(\\*\\*|zed)/(\\*\\*?/)?")
            .unwrap()
            .is_match(&without_filename);
        cx.assert(matches, "glob starts with either `**` or `zed`")?;
        Ok(())
    }
 }
--- a/crates/eval/src/examples/find_and_replace_diff_card.toml
+++ b/crates/eval/src/examples/find_and_replace_diff_card.toml
@ -0,0 +1,43 @@
 url = "https://github.com/zed-industries/zed.git"
 revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
 language_extension = "rs"
 prompt = """
 Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
 The card should implement the `Render` trait.
 The card should show a diff. It should be a beautifully presented diff.
 The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
 I want to see a red background for lines that were deleted and a green background for lines
 that were added. We should have a div per diff line.
 """
 [diff_assertions]
 modify_find_and_replace_tool = """
 The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
 The struct should contain an `output` field that is the same as the task we were returning before,
 and a new `card` field that contains a view for the card.
 """
 card_implementation = """
 The card should be a view that displays a diff.
 Each line in the diff should be colored according to whether it was added, removed or unchanged.
 """
 [thread_assertions]
 path_search = """
 The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
 (*Not* grep, for example, or reading the file based on a guess at the path.)
 This is because we gave the model a filename and it needs to turn that into a real path.
 """
 read_file_from_path_search = """
 After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
 """
 symbol_search = """
 When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
 on what path the Render trait might be in.
 """
--- a/crates/eval/src/examples/mod.rs
+++ b/crates/eval/src/examples/mod.rs
@ -0,0 +1,128 @@
 use anyhow::Result;
 use async_trait::async_trait;
 use serde::Deserialize;
 use std::collections::BTreeMap;
 use std::fs;
 use std::{
    path::{Path, PathBuf},
    rc::Rc,
 };
 use util::serde::default_true;
 use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
 mod file_search;
 /// Collects every example to run: the built-in `FileSearchExample` first,
 /// followed by one `DeclarativeExample` per path returned by
 /// `list_declarative_examples(examples_dir)`.
 ///
 /// # Panics
 ///
 /// Panics if the directory listing fails or if any example file fails to load.
 pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
    let declarative_paths = list_declarative_examples(examples_dir).unwrap();
    let mut examples: Vec<Rc<dyn Example>> = Vec::with_capacity(declarative_paths.len() + 1);
    examples.push(Rc::new(file_search::FileSearchExample));
    examples.extend(declarative_paths.iter().map(|toml_path| {
        Rc::new(DeclarativeExample::load(toml_path).unwrap()) as Rc<dyn Example>
    }));
    examples
 }
 /// An example described by a `.toml` file rather than by Rust code.
 ///
 /// Built by `DeclarativeExample::load`; its `Example` impl pushes `prompt` as
 /// the user message and returns the two assertion lists unchanged.
 struct DeclarativeExample {
    /// Name (taken from the file stem), repository URL, revision and
    /// language-server settings.
    metadata: ExampleMetadata,
    /// User message pushed at the start of the conversation.
    prompt: String,
    /// Assertions built from the file's `diff_assertions` table.
    diff_assertions: Vec<JudgeAssertion>,
    /// Assertions built from the file's `thread_assertions` table.
    thread_assertions: Vec<JudgeAssertion>,
 }
 impl DeclarativeExample {
    /// Loads a declarative example from the TOML file at `example_path`.
    ///
    /// The example's name is the file stem; every other field comes from the
    /// deserialized `ExampleToml`. Each entry of the `thread_assertions` and
    /// `diff_assertions` tables becomes a `JudgeAssertion` whose `id` is the
    /// table key and whose `description` is the value.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read, if it does not
    /// deserialize as `ExampleToml`, or if `require_lsp` is true (its default)
    /// while `language_extension` is absent.
    pub fn load(example_path: &Path) -> Result<Self> {
        let name = Self::name_from_path(example_path);
        let base: ExampleToml = toml::from_str(&fs::read_to_string(example_path)?)?;
        let language_server = if base.require_lsp {
            // A missing extension is a mistake in a user-authored file, so it
            // is reported as an error naming that file instead of panicking.
            let file_extension = base.language_extension.ok_or_else(|| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    format!(
                        "{}: `language_extension` is required when `require_lsp = true`",
                        example_path.display()
                    ),
                )
            })?;
            Some(crate::example::LanguageServer {
                file_extension,
                allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
            })
        } else {
            None
        };
        let metadata = ExampleMetadata {
            name,
            url: base.url,
            revision: base.revision,
            language_server,
            max_assertions: None,
        };
        Ok(DeclarativeExample {
            metadata,
            prompt: base.prompt,
            thread_assertions: base
                .thread_assertions
                .into_iter()
                .map(|(id, description)| JudgeAssertion { id, description })
                .collect(),
            diff_assertions: base
                .diff_assertions
                .into_iter()
                .map(|(id, description)| JudgeAssertion { id, description })
                .collect(),
        })
    }
    /// Derives an example name from `path`: its file stem, lossily converted
    /// to UTF-8.
    ///
    /// # Panics
    ///
    /// Panics if `path` has no file stem (for example an empty path or one
    /// ending in `..`).
    pub fn name_from_path(path: &Path) -> String {
        path.file_stem().unwrap().to_string_lossy().to_string()
    }
 }
 /// Deserialized contents of a declarative example `.toml` file.
 #[derive(Clone, Debug, Deserialize)]
 pub struct ExampleToml {
    /// Copied into `ExampleMetadata::url` by `DeclarativeExample::load`.
    pub url: String,
    /// Copied into `ExampleMetadata::revision` by `DeclarativeExample::load`.
    pub revision: String,
    /// Becomes `LanguageServer::file_extension`; must be present whenever
    /// `require_lsp` is true.
    pub language_extension: Option<String>,
    // NOTE(review): not read anywhere in this module — confirm it is still needed.
    pub insert_id: Option<String>,
    /// When true, `load` builds a `LanguageServer` entry for the example.
    /// Falls back to `util::serde::default_true` when the key is omitted.
    #[serde(default = "default_true")]
    pub require_lsp: bool,
    /// Forwarded to `LanguageServer::allow_preexisting_diagnostics`;
    /// `false` when omitted.
    #[serde(default)]
    pub allow_preexisting_diagnostics: bool,
    /// User message sent at the start of the conversation.
    pub prompt: String,
    /// Assertion id -> description; empty when the table is omitted.
    #[serde(default)]
    pub diff_assertions: BTreeMap<String, String>,
    /// Assertion id -> description; empty when the table is omitted.
    #[serde(default)]
    pub thread_assertions: BTreeMap<String, String>,
 }
 #[async_trait(?Send)]
 impl Example for DeclarativeExample {
    /// Returns a clone of the metadata assembled by `DeclarativeExample::load`.
    fn meta(&self) -> ExampleMetadata {
        self.metadata.clone()
    }
    /// Pushes the configured prompt as the user message, then calls
    /// `run_to_end` on the context.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        cx.push_user_message(&self.prompt);
        // The result of the run is discarded, so this method always returns `Ok`.
        // NOTE(review): presumably deliberate so the judge assertions are still
        // evaluated after a failed run — confirm.
        let _ = cx.run_to_end().await;
        Ok(())
    }
    /// Returns a clone of the assertions loaded from `diff_assertions`.
    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
        self.diff_assertions.clone()
    }
    /// Returns a clone of the assertions loaded from `thread_assertions`.
    fn thread_assertions(&self) -> Vec<JudgeAssertion> {
        self.thread_assertions.clone()
    }
 }
 /// Lists the `.toml` files directly inside `examples_dir`.
 ///
 /// The directory is canonicalized first, so the returned paths are absolute.
 /// They are sorted so that examples are discovered in the same order on
 /// every platform; `read_dir` itself makes no ordering guarantee.
 ///
 /// # Errors
 ///
 /// Returns an error if `examples_dir` cannot be canonicalized or read, or if
 /// reading any directory entry fails.
 fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    let examples_dir = std::fs::canonicalize(examples_dir)?;
    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.extension() == Some("toml".as_ref()) {
            result_paths.push(path);
        }
    }
    result_paths.sort();
    Ok(result_paths)
 }
--- a/crates/eval/src/instance.rs
+++ b/crates/eval/src/instance.rs
--- a/crates/eval/src/judge_diff_prompt.hbs
+++ b/crates/eval/src/judge_diff_prompt.hbs
@ -1,5 +1,5 @@
-You are an expert software developer. Your task is to evaluate a diff produced by an AI agent in response to a prompt.
+You are an expert software developer. Your task is to evaluate a diff produced by an AI agent
-Here is the prompt and the diff:
+in response to a prompt. Here is the prompt and the diff:
 <prompt>
 {{{prompt}}}
@ -9,17 +9,17 @@ Here is the prompt and the diff:
 {{{repository_diff}}}
 </diff>
-Evaluate how many of the following criteria were satisfied by the diff:
+Evaluate whether or not the diff passes the following assertion:
-<criteria>
+<assertion>
-{{criteria}}
+{{assertion}}
- There are no changes unrelated to the prompt
+</assertion>
-</criteria>
 Analyze the diff hunk by hunk, and structure your answer in the following XML format:
 ```
 <analysis>{YOUR ANALYSIS HERE}</analysis>
-<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
+<passed>{PASSED_ASSERTION}</passed>
-<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE DIFF}</passing_criteria>
 ```
 Where `PASSED_ASSERTION` is either `true` or `false`.
--- a/crates/eval/src/judge_thread_prompt.hbs
+++ b/crates/eval/src/judge_thread_prompt.hbs
@ -1,19 +1,21 @@
-You are an expert software developer. Your task is to evaluate an AI agent's messages and tool calls in this conversation:
+You are an expert software developer.
 Your task is to evaluate an AI agent's messages and tool calls in this conversation:
 <messages>
 {{{messages}}}
 </messages>
-You must count how many of the following criteria were satisfied by the messages:
+Evaluate whether or not the sequence of messages passes the following assertion:
-<criteria>
+<assertion>
-{{{criteria}}}
+{{{assertion}}}
-</criteria>
+</assertion>
 Analyze the messages one by one, and structure your answer in the following XML format:
 ```
 <analysis>{YOUR ANALYSIS HERE}</analysis>
-<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
+<passed>{PASSED_ASSERTION}</passed>
-<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE MESSAGES}</passing_criteria>
 ```
 Where `PASSED_ASSERTION` is either `true` or `false`.
--- a/crates/eval/src/tool_metrics.rs
+++ b/crates/eval/src/tool_metrics.rs
@ -24,6 +24,10 @@ impl ToolMetrics {
            *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
        }
    }
    /// Returns `true` when both `use_counts` and `failure_counts` are empty.
    pub fn is_empty(&self) -> bool {
        let no_uses = self.use_counts.is_empty();
        no_uses && self.failure_counts.is_empty()
    }
 }
 impl Display for ToolMetrics {
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
            let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
            writeln!(
                f,
-                "│{:^30}│{:^10}│{:^10}│{:^10}│",
+                "│{:<30}│{:^10}│{:^10}│{:^10}│",
                tool_name,
                use_count,
                failure_count,
		`@ -1,2 +0,0 @@`
			- The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the task we were returning before, and a new `card` field that contains a view for the card.
			`- The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged.`
		`@ -1,3 +0,0 @@`
			Look at the `find_replace_file_tool.rs`. I want to implement a card for it. The card should implement the `Render` trait.

			The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green background for lines that were added. We should have a div per diff line.