Add new action to run agent eval (#29158)

The old one wasn't linking, and https://github.com/zed-industries/zed/pull/29081 has a bunch of merge conflicts. Wanted to start simple/small.

## Todo

* [x] Remove low-signal examples
* [x] Make the eval run on a cron, on main, and on any PR with the `run-eval` label
* [x] Noise in logs about failure to write settings

```
[2025-04-21T20:45:04Z ERROR settings] Failed to write settings to file "/home/runner/.config/zed/settings.json"

Caused by:
    No such file or directory (os error 2) at path "/home/runner/.config/zed/.tmpLewFEs"
```

* [x] `Agentic loop stalled` (https://github.com/zed-industries/zed/actions/runs/14581044243/job/40897622894)
* [x] Make sure that events are recorded in Snowflake
* [ ] Change judge criteria to be more explicit about meanings of scores

Release Notes:

- N/A

---------

Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Agus Zubiaga <hi@aguz.me>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-21 22:30:21 -06:00 · 2025-04-21 22:30:21 -06:00 · 458ffaa134
commit 458ffaa134
parent b14356d1d3
58 changed files with 291 additions and 385 deletions
--- a/crates/eval/src/eval.rs
+++ b/crates/eval/src/eval.rs
@ -24,13 +24,11 @@ use prompt_store::PromptBuilder;
 use release_channel::AppVersion;
 use reqwest_client::ReqwestClient;
 use settings::{Settings, SettingsStore};
+use std::env;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
-use std::usize;
 use util::ResultExt as _;

-pub const RUNS_DIR: &str = "./crates/eval/runs";
-
 #[derive(Parser, Debug)]
 #[command(name = "eval", disable_version_flag = true)]
 struct Args {
@ -57,8 +55,36 @@ struct Args {
 fn main() {
    env_logger::init();

+    let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
+    let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
+    let session_id = uuid::Uuid::new_v4().to_string();
+    let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
+    let run_id = match env::var("GITHUB_RUN_ID") {
+        Ok(run_id) => format!("github/{}", run_id),
+        Err(_) => format!("local/{}", run_timestamp),
+    };
+
+    let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .unwrap()
+        .parent()
+        .unwrap();
+    let eval_crate_dir = root_dir.join("crates/eval");
+    let repos_dir = eval_crate_dir.join("repos");
+    let worktrees_dir = eval_crate_dir.join("worktrees");
+    let examples_dir = eval_crate_dir.join("examples");
+    let runs_dir = eval_crate_dir.join("runs");
+    let run_dir = runs_dir.join(format!("{}", run_timestamp));
+    std::fs::create_dir_all(&run_dir).unwrap();
+    std::fs::create_dir_all(&repos_dir).unwrap();
+    std::fs::create_dir_all(&worktrees_dir).unwrap();
+    std::fs::create_dir_all(&examples_dir).unwrap();
+    std::fs::create_dir_all(&paths::config_dir()).unwrap();
+
+    let zed_commit_sha = commit_sha_for_path(root_dir);
+    let zed_branch_name = git_branch_for_path(root_dir);
    let args = Args::parse();
-    let all_available_examples = list_all_examples().unwrap();
+    let all_available_examples = list_all_examples(&examples_dir).unwrap();

    let example_paths = all_available_examples
        .iter()
@ -83,14 +109,20 @@ fn main() {
    app.run(move |cx| {
        let app_state = init(cx);

-        let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
-        let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
-        let session_id = uuid::Uuid::new_v4().to_string();
+        let telemetry = app_state.client.telemetry();
+        telemetry.start(system_id, installation_id, session_id, cx);

-        app_state
-            .client
-            .telemetry()
-            .start(system_id, installation_id, session_id, cx);
+        let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
+            && telemetry.has_checksum_seed();
+        if enable_telemetry {
+            println!("Telemetry enabled");
+            telemetry::event!(
+                "Agent Eval Started",
+                zed_commit_sha = zed_commit_sha,
+                zed_branch_name = zed_branch_name,
+                run_id = run_id,
+            );
+        }

        let mut cumulative_tool_metrics = ToolMetrics::default();

@ -114,15 +146,6 @@ fn main() {
        cx.spawn(async move |cx| {
            authenticate_task.await.unwrap();

-            std::fs::create_dir_all(REPOS_DIR)?;
-            std::fs::create_dir_all(WORKTREES_DIR)?;
-
-            let run_dir = Path::new(RUNS_DIR).join(format!(
-                "{}",
-                chrono::Local::now().format("%Y-%m-%d_%H-%M-%S")
-            ));
-            std::fs::create_dir_all(&run_dir)?;
-
            let mut examples = Vec::new();

            const COLORS: [&str; 12] = [
@ -144,7 +167,12 @@ fn main() {
            let mut skipped = Vec::new();

            for example_path in &example_paths {
-                let example = Example::load_from_directory(example_path, &run_dir)?;
+                let example = Example::load_from_directory(
+                    example_path,
+                    &run_dir,
+                    &worktrees_dir,
+                    &repos_dir,
+                )?;

                if !example
                    .base
@ -194,7 +222,7 @@ fn main() {

                let repo_url = example.base.url.clone();
                if repo_urls.insert(repo_url.clone()) {
-                    let repo_path = repo_path_for_url(&repo_url);
+                    let repo_path = example.repo_path.clone();

                    if !repo_path.join(".git").is_dir() {
                        println!(
@ -245,6 +273,9 @@ fn main() {
                let app_state = app_state.clone();
                let model = model.clone();
                let example = example.clone();
+                let zed_commit_sha = zed_commit_sha.clone();
+                let zed_branch_name = zed_branch_name.clone();
+                let run_id = run_id.clone();
                cx.spawn(async move |cx| {
                    let result = async {
                        let run_output = cx
@ -254,8 +285,12 @@ fn main() {
                            run_judge_repetition(
                                example.clone(),
                                model.clone(),
+                                &zed_commit_sha,
+                                &zed_branch_name,
+                                &run_id,
                                &run_output,
                                round,
+                                enable_telemetry,
                                cx,
                            )
                        });
@ -367,9 +402,7 @@ fn main() {
            print_header("CUMULATIVE TOOL METRICS");
            println!("{}", cumulative_tool_metrics);

-            std::thread::sleep(std::time::Duration::from_secs(2));
-
-            app_state.client.telemetry().flush_events();
+            app_state.client.telemetry().flush_events().await;

            cx.update(|cx| cx.quit())
        })
@ -377,8 +410,8 @@ fn main() {
    });
 }

-fn list_all_examples() -> Result<Vec<PathBuf>> {
-    let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
+fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
+    let path = std::fs::canonicalize(examples_dir).unwrap();
    let entries = std::fs::read_dir(path).unwrap();
    let mut result_paths = Vec::new();
    for entry in entries {
@ -532,79 +565,66 @@ pub fn find_model(
    Ok(model)
 }

-pub async fn get_current_commit_id(repo_path: &Path) -> Option<String> {
-    (run_git(repo_path, &["rev-parse", "HEAD"]).await).ok()
+pub fn commit_sha_for_path(repo_path: &Path) -> String {
+    futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
 }

-pub fn get_current_commit_id_sync(repo_path: &Path) -> String {
-    futures::executor::block_on(async {
-        get_current_commit_id(repo_path).await.unwrap_or_default()
-    })
+pub fn git_branch_for_path(repo_path: &Path) -> String {
+    match std::env::var("GITHUB_REF_NAME") {
+        Ok(branch) => branch,
+        Err(_) => {
+            futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
+                .unwrap_or_else(|_| "unknown".to_string())
+        }
+    }
 }

 async fn run_judge_repetition(
    example: Example,
    model: Arc<dyn LanguageModel>,
+    zed_commit_sha: &str,
+    zed_branch_name: &str,
+    run_id: &str,
    run_output: &RunOutput,
    round: u32,
+    enable_telemetry: bool,
    cx: &AsyncApp,
 ) -> Result<JudgeOutput> {
-    let judge_result = example.judge(model.clone(), &run_output, round, cx).await;
+    let judge_output = example.judge(model.clone(), &run_output, round, cx).await;

-    if let Ok(judge_output) = &judge_result {
-        let cohort_id = example
-            .run_directory_path
-            .file_name()
-            .map(|name| name.to_string_lossy().to_string())
-            .unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
-
-        let path = std::path::Path::new(".");
-        let commit_id = get_current_commit_id(path).await.unwrap_or_default();
-
-        if let Some(thread) = &judge_output.thread {
-            telemetry::event!(
-                "Agent Eval Completed",
-                cohort_id = cohort_id,
-                example_name = example.name.clone(),
-                round = round,
-                diff_score = judge_output.diff.score,
-                diff_analysis = judge_output.diff.analysis,
-                thread_score = thread.score,
-                thread_analysis = thread.analysis,
-                tool_metrics = run_output.tool_metrics,
-                response_count = run_output.response_count,
-                token_usage = run_output.token_usage,
-                model = model.telemetry_id(),
-                model_provider = model.provider_id().to_string(),
-                repository_url = example.base.url.clone(),
-                repository_revision = example.base.revision.clone(),
-                diagnostics_before = run_output.diagnostics_before,
-                diagnostics_after = run_output.diagnostics_after,
-                commit_id = commit_id
-            );
-        } else {
-            telemetry::event!(
-                "Agent Eval Completed",
-                cohort_id = cohort_id,
-                example_name = example.name.clone(),
-                round = round,
-                diff_score = judge_output.diff.score,
-                diff_analysis = judge_output.diff.analysis,
-                tool_metrics = run_output.tool_metrics,
-                response_count = run_output.response_count,
-                token_usage = run_output.token_usage,
-                model = model.telemetry_id(),
-                model_provider = model.provider_id().to_string(),
-                repository_url = example.base.url.clone(),
-                repository_revision = example.base.revision.clone(),
-                diagnostics_before = run_output.diagnostics_before,
-                diagnostics_after = run_output.diagnostics_after,
-                commit_id = commit_id
-            );
-        }
+    let diff_evaluation;
+    let thread_diff_evaluation;
+    if let Ok(output) = judge_output.as_ref() {
+        diff_evaluation = Some(output.diff.clone());
+        thread_diff_evaluation = output.thread.clone();
+    } else {
+        diff_evaluation = None;
+        thread_diff_evaluation = None;
    }

-    judge_result
+    if enable_telemetry {
+        telemetry::event!(
+            "Agent Example Evaluated",
+            zed_commit_sha = zed_commit_sha,
+            zed_branch_name = zed_branch_name,
+            run_id = run_id,
+            example_name = example.name.clone(),
+            round = round,
+            diff_evaluation = diff_evaluation,
+            thread_evaluation = thread_diff_evaluation,
+            tool_metrics = run_output.tool_metrics,
+            response_count = run_output.response_count,
+            token_usage = run_output.token_usage,
+            model = model.telemetry_id(),
+            model_provider = model.provider_id().to_string(),
+            repository_url = example.base.url.clone(),
+            repository_revision = example.base.revision.clone(),
+            diagnostics_before = run_output.diagnostics_before,
+            diagnostics_after = run_output.diagnostics_after,
+        );
+    }
+
+    judge_output
 }

 fn print_header(header: &str) {
--- a/crates/eval/src/example.rs
+++ b/crates/eval/src/example.rs
@ -31,10 +31,6 @@ use util::command::new_smol_command;
 use util::markdown::MarkdownString;
 use util::serde::default_true;

-pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
-pub const REPOS_DIR: &str = "./crates/eval/repos";
-pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
-
 const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);

 const ZED_REPO_URL: &str = "https://github.com/zed-industries/zed.git";
@ -77,6 +73,8 @@ pub struct Example {
    pub run_directory_path: PathBuf,
    /// Prefix used for logging that identifies this example
    pub log_prefix: String,
+    pub worktree_path: PathBuf,
+    pub repo_path: PathBuf,
 }

 #[derive(Debug, Serialize, Deserialize, Clone)]
@ -122,7 +120,12 @@ pub struct JudgeOutput {

 impl Example {
    /// Load an example from a directory containing base.toml, prompt.md, and criteria.md
-    pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> {
+    pub fn load_from_directory(
+        dir_path: &Path,
+        run_dir: &Path,
+        worktrees_dir: &Path,
+        repos_dir: &Path,
+    ) -> Result<Self> {
        let name = Self::name_from_path(dir_path);
        let base_path = dir_path.join("base.toml");
        let prompt_path = dir_path.join("prompt.md");
@ -134,13 +137,25 @@ impl Example {
            None
        };

+        let base: ExampleBase = toml::from_str(&fs::read_to_string(&base_path)?)?;
+
+        let repo_path = repo_path_for_url(repos_dir, &base.url);
+
+        let worktree_path = worktrees_dir
+            .canonicalize()
+            .unwrap()
+            .join(&name)
+            .join(&base.repo_name());
+
        Ok(Example {
            name: name.clone(),
-            base: toml::from_str(&fs::read_to_string(&base_path)?)?,
+            base,
            prompt: fs::read_to_string(prompt_path.clone())?,
            thread_criteria,
            diff_criteria: fs::read_to_string(diff_criteria_path.clone())?,
            run_directory_path: run_dir.to_path_buf(),
+            worktree_path,
+            repo_path,
            log_prefix: name,
        })
    }
@ -168,21 +183,10 @@ impl Example {
        path.file_name().unwrap().to_string_lossy().to_string()
    }

-    pub fn worktree_path(&self) -> PathBuf {
-        Path::new(WORKTREES_DIR)
-            .canonicalize()
-            .context(format!("No such directory {WORKTREES_DIR}"))
-            .unwrap()
-            .join(&self.name)
-            .join(self.base.repo_name())
-    }
-
    /// Set up the example by checking out the specified Git revision
    pub async fn setup(&mut self) -> Result<()> {
-        let repo_path = repo_path_for_url(&self.base.url);
-
        let revision_exists = run_git(
-            &repo_path,
+            &self.repo_path,
            &["rev-parse", &format!("{}^{{commit}}", self.base.revision)],
        )
        .await
@ -194,29 +198,27 @@ impl Example {
                self.log_prefix, &self.base.revision
            );
            run_git(
-                &repo_path,
+                &self.repo_path,
                &["fetch", "--depth", "1", "origin", &self.base.revision],
            )
            .await?;
        }

-        let worktree_path = self.worktree_path();
-
-        if worktree_path.is_dir() {
+        if self.worktree_path.is_dir() {
            println!("{}Resetting existing worktree", self.log_prefix);

            // TODO: consider including "-x" to remove ignored files. The downside of this is that
            // it will also remove build artifacts, and so prevent incremental reuse there.
-            run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
-            run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
-            run_git(&worktree_path, &["checkout", &self.base.revision]).await?;
+            run_git(&self.worktree_path, &["clean", "--force", "-d"]).await?;
+            run_git(&self.worktree_path, &["reset", "--hard", "HEAD"]).await?;
+            run_git(&self.worktree_path, &["checkout", &self.base.revision]).await?;
        } else {
            println!("{}Creating worktree", self.log_prefix);

-            let worktree_path_string = worktree_path.to_string_lossy().to_string();
+            let worktree_path_string = self.worktree_path.to_string_lossy().to_string();

            run_git(
-                &repo_path,
+                &self.repo_path,
                &[
                    "worktree",
                    "add",
@ -229,7 +231,7 @@ impl Example {
        }

        if self.base.url == ZED_REPO_URL {
-            std::fs::write(worktree_path.join(".rules"), std::fs::read(".rules")?)?;
+            std::fs::write(self.worktree_path.join(".rules"), std::fs::read(".rules")?)?;
        }

        std::fs::create_dir_all(self.example_output_directory())?;
@ -253,9 +255,8 @@ impl Example {
            cx,
        );

-        let worktree_path = self.worktree_path();
        let worktree = project.update(cx, |project, cx| {
-            project.create_worktree(&worktree_path, true, cx)
+            project.create_worktree(&self.worktree_path, true, cx)
        });

        let tools = cx.new(|_| ToolWorkingSet::default());
@ -460,6 +461,7 @@ impl Example {
                            ThreadEvent::SummaryChanged |
                            ThreadEvent::SummaryGenerated |
                            ThreadEvent::CheckpointChanged |
+                            ThreadEvent::ReceivedTextChunk |
                            ThreadEvent::UsageUpdated(_) => {
                                if std::env::var("ZED_EVAL_DEBUG").is_ok() {
                                    println!("{}Event: {:#?}", log_prefix, event);
@ -664,13 +666,12 @@ impl Example {
    }

    async fn repository_diff(&self) -> Result<String> {
-        let worktree_path = self.worktree_path();
-        run_git(&worktree_path, &["add", "."]).await?;
+        run_git(&self.worktree_path, &["add", "."]).await?;
        let mut diff_args = vec!["diff", "--staged"];
        if self.base.url == ZED_REPO_URL {
            diff_args.push(":(exclude).rules");
        }
-        run_git(&worktree_path, &diff_args).await
+        run_git(&self.worktree_path, &diff_args).await
    }
 }

@ -831,13 +832,13 @@ fn get_tag(name: &'static str, response: &str) -> Result<String> {
    anyhow::Ok(content)
 }

-pub fn repo_path_for_url(repo_url: &str) -> PathBuf {
+pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf {
    let repo_name = repo_url
        .trim_start_matches("https://")
        .replace(|c: char| !c.is_alphanumeric(), "-");
-    Path::new(REPOS_DIR)
+    Path::new(repos_dir)
        .canonicalize()
-        .context(format!("No such directory {REPOS_DIR}"))
+        .context(format!("No such directory {}", repos_dir.display()))
        .unwrap()
        .join(repo_name)
 }
--- a/crates/eval/src/ids.rs
+++ b/crates/eval/src/ids.rs
@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Result, anyhow};
 use std::fs;
 use std::path::{Path, PathBuf};
 use uuid::Uuid;
@ -11,6 +11,7 @@ pub fn get_or_create_id(path: &Path) -> Result<String> {
        }
    }
    let new_id = Uuid::new_v4().to_string();
+    fs::create_dir_all(path.parent().ok_or_else(|| anyhow!("invalid id path"))?)?;
    fs::write(path, &new_id)?;
    Ok(new_id)
 }