Add new action to run agent eval (#29158)

The old action wasn't linking, and
https://github.com/zed-industries/zed/pull/29081 has a bunch of merge
conflicts. I wanted to start with something simple and small.

## Todo

* [x] Remove low-signal examples
* [x] Make the eval run on a cron, on main, and on any PR with the
`run-eval` label
* [x] Noise in logs about failure to write settings
    ```
    [2025-04-21T20:45:04Z ERROR settings] Failed to write settings to file "/home/runner/.config/zed/settings.json"

    Caused by:
        No such file or directory (os error 2) at path "/home/runner/.config/zed/.tmpLewFEs"
    ```
* [x] `Agentic loop stalled`
(https://github.com/zed-industries/zed/actions/runs/14581044243/job/40897622894)
* [x] Make sure that events are recorded in Snowflake
* [ ] Change judge criteria to be more explicit about meanings of scores

Release Notes:

- N/A

---------

Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Agus Zubiaga <hi@aguz.me>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Nathan Sobo 2025-04-21 22:30:21 -06:00 committed by GitHub
parent b14356d1d3
commit 458ffaa134
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
58 changed files with 291 additions and 385 deletions

View file

@ -24,13 +24,11 @@ use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
use std::env;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::usize;
use util::ResultExt as _;
pub const RUNS_DIR: &str = "./crates/eval/runs";
#[derive(Parser, Debug)]
#[command(name = "eval", disable_version_flag = true)]
struct Args {
@ -57,8 +55,36 @@ struct Args {
fn main() {
env_logger::init();
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
let session_id = uuid::Uuid::new_v4().to_string();
let run_timestamp = chrono::Local::now().format("%Y-%m-%d_%H-%M-%S");
let run_id = match env::var("GITHUB_RUN_ID") {
Ok(run_id) => format!("github/{}", run_id),
Err(_) => format!("local/{}", run_timestamp),
};
let root_dir = Path::new(std::env!("CARGO_MANIFEST_DIR"))
.parent()
.unwrap()
.parent()
.unwrap();
let eval_crate_dir = root_dir.join("crates/eval");
let repos_dir = eval_crate_dir.join("repos");
let worktrees_dir = eval_crate_dir.join("worktrees");
let examples_dir = eval_crate_dir.join("examples");
let runs_dir = eval_crate_dir.join("runs");
let run_dir = runs_dir.join(format!("{}", run_timestamp));
std::fs::create_dir_all(&run_dir).unwrap();
std::fs::create_dir_all(&repos_dir).unwrap();
std::fs::create_dir_all(&worktrees_dir).unwrap();
std::fs::create_dir_all(&examples_dir).unwrap();
std::fs::create_dir_all(&paths::config_dir()).unwrap();
let zed_commit_sha = commit_sha_for_path(root_dir);
let zed_branch_name = git_branch_for_path(root_dir);
let args = Args::parse();
let all_available_examples = list_all_examples().unwrap();
let all_available_examples = list_all_examples(&examples_dir).unwrap();
let example_paths = all_available_examples
.iter()
@ -83,14 +109,20 @@ fn main() {
app.run(move |cx| {
let app_state = init(cx);
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
let session_id = uuid::Uuid::new_v4().to_string();
let telemetry = app_state.client.telemetry();
telemetry.start(system_id, installation_id, session_id, cx);
app_state
.client
.telemetry()
.start(system_id, installation_id, session_id, cx);
let enable_telemetry = env::var("ZED_EVAL_TELEMETRY").map_or(false, |value| value == "1")
&& telemetry.has_checksum_seed();
if enable_telemetry {
println!("Telemetry enabled");
telemetry::event!(
"Agent Eval Started",
zed_commit_sha = zed_commit_sha,
zed_branch_name = zed_branch_name,
run_id = run_id,
);
}
let mut cumulative_tool_metrics = ToolMetrics::default();
@ -114,15 +146,6 @@ fn main() {
cx.spawn(async move |cx| {
authenticate_task.await.unwrap();
std::fs::create_dir_all(REPOS_DIR)?;
std::fs::create_dir_all(WORKTREES_DIR)?;
let run_dir = Path::new(RUNS_DIR).join(format!(
"{}",
chrono::Local::now().format("%Y-%m-%d_%H-%M-%S")
));
std::fs::create_dir_all(&run_dir)?;
let mut examples = Vec::new();
const COLORS: [&str; 12] = [
@ -144,7 +167,12 @@ fn main() {
let mut skipped = Vec::new();
for example_path in &example_paths {
let example = Example::load_from_directory(example_path, &run_dir)?;
let example = Example::load_from_directory(
example_path,
&run_dir,
&worktrees_dir,
&repos_dir,
)?;
if !example
.base
@ -194,7 +222,7 @@ fn main() {
let repo_url = example.base.url.clone();
if repo_urls.insert(repo_url.clone()) {
let repo_path = repo_path_for_url(&repo_url);
let repo_path = example.repo_path.clone();
if !repo_path.join(".git").is_dir() {
println!(
@ -245,6 +273,9 @@ fn main() {
let app_state = app_state.clone();
let model = model.clone();
let example = example.clone();
let zed_commit_sha = zed_commit_sha.clone();
let zed_branch_name = zed_branch_name.clone();
let run_id = run_id.clone();
cx.spawn(async move |cx| {
let result = async {
let run_output = cx
@ -254,8 +285,12 @@ fn main() {
run_judge_repetition(
example.clone(),
model.clone(),
&zed_commit_sha,
&zed_branch_name,
&run_id,
&run_output,
round,
enable_telemetry,
cx,
)
});
@ -367,9 +402,7 @@ fn main() {
print_header("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
std::thread::sleep(std::time::Duration::from_secs(2));
app_state.client.telemetry().flush_events();
app_state.client.telemetry().flush_events().await;
cx.update(|cx| cx.quit())
})
@ -377,8 +410,8 @@ fn main() {
});
}
fn list_all_examples() -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(examples_dir).unwrap();
let entries = std::fs::read_dir(path).unwrap();
let mut result_paths = Vec::new();
for entry in entries {
@ -532,79 +565,66 @@ pub fn find_model(
Ok(model)
}
pub async fn get_current_commit_id(repo_path: &Path) -> Option<String> {
(run_git(repo_path, &["rev-parse", "HEAD"]).await).ok()
pub fn commit_sha_for_path(repo_path: &Path) -> String {
futures::executor::block_on(run_git(repo_path, &["rev-parse", "HEAD"])).unwrap()
}
pub fn get_current_commit_id_sync(repo_path: &Path) -> String {
futures::executor::block_on(async {
get_current_commit_id(repo_path).await.unwrap_or_default()
})
pub fn git_branch_for_path(repo_path: &Path) -> String {
match std::env::var("GITHUB_REF_NAME") {
Ok(branch) => branch,
Err(_) => {
futures::executor::block_on(run_git(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]))
.unwrap_or_else(|_| "unknown".to_string())
}
}
}
async fn run_judge_repetition(
example: Example,
model: Arc<dyn LanguageModel>,
zed_commit_sha: &str,
zed_branch_name: &str,
run_id: &str,
run_output: &RunOutput,
round: u32,
enable_telemetry: bool,
cx: &AsyncApp,
) -> Result<JudgeOutput> {
let judge_result = example.judge(model.clone(), &run_output, round, cx).await;
let judge_output = example.judge(model.clone(), &run_output, round, cx).await;
if let Ok(judge_output) = &judge_result {
let cohort_id = example
.run_directory_path
.file_name()
.map(|name| name.to_string_lossy().to_string())
.unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
let path = std::path::Path::new(".");
let commit_id = get_current_commit_id(path).await.unwrap_or_default();
if let Some(thread) = &judge_output.thread {
telemetry::event!(
"Agent Eval Completed",
cohort_id = cohort_id,
example_name = example.name.clone(),
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
thread_score = thread.score,
thread_analysis = thread.analysis,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
commit_id = commit_id
);
} else {
telemetry::event!(
"Agent Eval Completed",
cohort_id = cohort_id,
example_name = example.name.clone(),
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
commit_id = commit_id
);
}
let diff_evaluation;
let thread_diff_evaluation;
if let Ok(output) = judge_output.as_ref() {
diff_evaluation = Some(output.diff.clone());
thread_diff_evaluation = output.thread.clone();
} else {
diff_evaluation = None;
thread_diff_evaluation = None;
}
judge_result
if enable_telemetry {
telemetry::event!(
"Agent Example Evaluated",
zed_commit_sha = zed_commit_sha,
zed_branch_name = zed_branch_name,
run_id = run_id,
example_name = example.name.clone(),
round = round,
diff_evaluation = diff_evaluation,
thread_evaluation = thread_diff_evaluation,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_before = run_output.diagnostics_before,
diagnostics_after = run_output.diagnostics_after,
);
}
judge_output
}
fn print_header(header: &str) {

View file

@ -31,10 +31,6 @@ use util::command::new_smol_command;
use util::markdown::MarkdownString;
use util::serde::default_true;
pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
pub const REPOS_DIR: &str = "./crates/eval/repos";
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
const ZED_REPO_URL: &str = "https://github.com/zed-industries/zed.git";
@ -77,6 +73,8 @@ pub struct Example {
pub run_directory_path: PathBuf,
/// Prefix used for logging that identifies this example
pub log_prefix: String,
pub worktree_path: PathBuf,
pub repo_path: PathBuf,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
@ -122,7 +120,12 @@ pub struct JudgeOutput {
impl Example {
/// Load an example from a directory containing base.toml, prompt.md, and criteria.md
pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> {
pub fn load_from_directory(
dir_path: &Path,
run_dir: &Path,
worktrees_dir: &Path,
repos_dir: &Path,
) -> Result<Self> {
let name = Self::name_from_path(dir_path);
let base_path = dir_path.join("base.toml");
let prompt_path = dir_path.join("prompt.md");
@ -134,13 +137,25 @@ impl Example {
None
};
let base: ExampleBase = toml::from_str(&fs::read_to_string(&base_path)?)?;
let repo_path = repo_path_for_url(repos_dir, &base.url);
let worktree_path = worktrees_dir
.canonicalize()
.unwrap()
.join(&name)
.join(&base.repo_name());
Ok(Example {
name: name.clone(),
base: toml::from_str(&fs::read_to_string(&base_path)?)?,
base,
prompt: fs::read_to_string(prompt_path.clone())?,
thread_criteria,
diff_criteria: fs::read_to_string(diff_criteria_path.clone())?,
run_directory_path: run_dir.to_path_buf(),
worktree_path,
repo_path,
log_prefix: name,
})
}
@ -168,21 +183,10 @@ impl Example {
path.file_name().unwrap().to_string_lossy().to_string()
}
pub fn worktree_path(&self) -> PathBuf {
Path::new(WORKTREES_DIR)
.canonicalize()
.context(format!("No such directory {WORKTREES_DIR}"))
.unwrap()
.join(&self.name)
.join(self.base.repo_name())
}
/// Set up the example by checking out the specified Git revision
pub async fn setup(&mut self) -> Result<()> {
let repo_path = repo_path_for_url(&self.base.url);
let revision_exists = run_git(
&repo_path,
&self.repo_path,
&["rev-parse", &format!("{}^{{commit}}", self.base.revision)],
)
.await
@ -194,29 +198,27 @@ impl Example {
self.log_prefix, &self.base.revision
);
run_git(
&repo_path,
&self.repo_path,
&["fetch", "--depth", "1", "origin", &self.base.revision],
)
.await?;
}
let worktree_path = self.worktree_path();
if worktree_path.is_dir() {
if self.worktree_path.is_dir() {
println!("{}Resetting existing worktree", self.log_prefix);
// TODO: consider including "-x" to remove ignored files. The downside of this is that
// it will also remove build artifacts, and so prevent incremental reuse there.
run_git(&worktree_path, &["clean", "--force", "-d"]).await?;
run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
run_git(&worktree_path, &["checkout", &self.base.revision]).await?;
run_git(&self.worktree_path, &["clean", "--force", "-d"]).await?;
run_git(&self.worktree_path, &["reset", "--hard", "HEAD"]).await?;
run_git(&self.worktree_path, &["checkout", &self.base.revision]).await?;
} else {
println!("{}Creating worktree", self.log_prefix);
let worktree_path_string = worktree_path.to_string_lossy().to_string();
let worktree_path_string = self.worktree_path.to_string_lossy().to_string();
run_git(
&repo_path,
&self.repo_path,
&[
"worktree",
"add",
@ -229,7 +231,7 @@ impl Example {
}
if self.base.url == ZED_REPO_URL {
std::fs::write(worktree_path.join(".rules"), std::fs::read(".rules")?)?;
std::fs::write(self.worktree_path.join(".rules"), std::fs::read(".rules")?)?;
}
std::fs::create_dir_all(self.example_output_directory())?;
@ -253,9 +255,8 @@ impl Example {
cx,
);
let worktree_path = self.worktree_path();
let worktree = project.update(cx, |project, cx| {
project.create_worktree(&worktree_path, true, cx)
project.create_worktree(&self.worktree_path, true, cx)
});
let tools = cx.new(|_| ToolWorkingSet::default());
@ -460,6 +461,7 @@ impl Example {
ThreadEvent::SummaryChanged |
ThreadEvent::SummaryGenerated |
ThreadEvent::CheckpointChanged |
ThreadEvent::ReceivedTextChunk |
ThreadEvent::UsageUpdated(_) => {
if std::env::var("ZED_EVAL_DEBUG").is_ok() {
println!("{}Event: {:#?}", log_prefix, event);
@ -664,13 +666,12 @@ impl Example {
}
async fn repository_diff(&self) -> Result<String> {
let worktree_path = self.worktree_path();
run_git(&worktree_path, &["add", "."]).await?;
run_git(&self.worktree_path, &["add", "."]).await?;
let mut diff_args = vec!["diff", "--staged"];
if self.base.url == ZED_REPO_URL {
diff_args.push(":(exclude).rules");
}
run_git(&worktree_path, &diff_args).await
run_git(&self.worktree_path, &diff_args).await
}
}
@ -831,13 +832,13 @@ fn get_tag(name: &'static str, response: &str) -> Result<String> {
anyhow::Ok(content)
}
pub fn repo_path_for_url(repo_url: &str) -> PathBuf {
pub fn repo_path_for_url(repos_dir: &Path, repo_url: &str) -> PathBuf {
let repo_name = repo_url
.trim_start_matches("https://")
.replace(|c: char| !c.is_alphanumeric(), "-");
Path::new(REPOS_DIR)
Path::new(repos_dir)
.canonicalize()
.context(format!("No such directory {REPOS_DIR}"))
.context(format!("No such directory {}", repos_dir.display()))
.unwrap()
.join(repo_name)
}

View file

@ -1,4 +1,4 @@
use anyhow::Result;
use anyhow::{Result, anyhow};
use std::fs;
use std::path::{Path, PathBuf};
use uuid::Uuid;
@ -11,6 +11,7 @@ pub fn get_or_create_id(path: &Path) -> Result<String> {
}
}
let new_id = Uuid::new_v4().to_string();
fs::create_dir_all(path.parent().ok_or_else(|| anyhow!("invalid id path"))?)?;
fs::write(path, &new_id)?;
Ok(new_id)
}