agent: Add telemetry for eval runs (#28816)

Release Notes:

- N/A

---------

Co-authored-by: Joseph <joseph@zed.dev>
This commit is contained in:
Thomas Mickley-Doyle 2025-04-15 21:54:26 -05:00 committed by GitHub
parent 1eb948654a
commit 222d4a2546
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 89 additions and 6 deletions

3
Cargo.lock generated
View file

@ -4886,6 +4886,7 @@ dependencies = [
"collections", "collections",
"context_server", "context_server",
"dap", "dap",
"dirs 5.0.1",
"env_logger 0.11.8", "env_logger 0.11.8",
"extension", "extension",
"fs", "fs",
@ -4907,9 +4908,11 @@ dependencies = [
"serde", "serde",
"settings", "settings",
"shellexpand 2.1.2", "shellexpand 2.1.2",
"telemetry",
"toml 0.8.20", "toml 0.8.20",
"unindent", "unindent",
"util", "util",
"uuid",
"workspace-hack", "workspace-hack",
] ]

View file

@ -16,6 +16,7 @@ client.workspace = true
collections.workspace = true collections.workspace = true
context_server.workspace = true context_server.workspace = true
dap.workspace = true dap.workspace = true
dirs = "5.0"
env_logger.workspace = true env_logger.workspace = true
extension.workspace = true extension.workspace = true
fs.workspace = true fs.workspace = true
@ -37,9 +38,11 @@ reqwest_client.workspace = true
serde.workspace = true serde.workspace = true
settings.workspace = true settings.workspace = true
shellexpand.workspace = true shellexpand.workspace = true
telemetry.workspace = true
toml.workspace = true toml.workspace = true
unindent.workspace = true unindent.workspace = true
util.workspace = true util.workspace = true
uuid = { version = "1.6", features = ["v4"] }
workspace-hack.workspace = true workspace-hack.workspace = true
[[bin]] [[bin]]

View file

@ -1,7 +1,9 @@
mod example; mod example;
mod ids;
use client::{Client, ProxySettings, UserStore}; use client::{Client, ProxySettings, UserStore};
pub(crate) use example::*; pub(crate) use example::*;
use telemetry;
use ::fs::RealFs; use ::fs::RealFs;
use anyhow::{Result, anyhow}; use anyhow::{Result, anyhow};
@ -39,7 +41,6 @@ struct Args {
/// Model to use (default: "claude-3-7-sonnet-latest") /// Model to use (default: "claude-3-7-sonnet-latest")
#[arg(long, default_value = "claude-3-7-sonnet-latest")] #[arg(long, default_value = "claude-3-7-sonnet-latest")]
model: String, model: String,
/// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
#[arg(long, value_delimiter = ',')] #[arg(long, value_delimiter = ',')]
languages: Option<Vec<String>>, languages: Option<Vec<String>>,
/// How many times to run the judge on each example run. /// How many times to run the judge on each example run.
@ -77,6 +78,15 @@ fn main() {
app.run(move |cx| { app.run(move |cx| {
let app_state = init(cx); let app_state = init(cx);
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
let session_id = uuid::Uuid::new_v4().to_string();
app_state
.client
.telemetry()
.start(system_id, installation_id, session_id, cx);
let model = find_model("claude-3-7-sonnet-latest", cx).unwrap(); let model = find_model("claude-3-7-sonnet-latest", cx).unwrap();
LanguageModelRegistry::global(cx).update(cx, |registry, cx| { LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
@ -273,6 +283,11 @@ fn main() {
/ (score_count as f32); / (score_count as f32);
println!("\nAverage score: {average_score}"); println!("\nAverage score: {average_score}");
std::thread::sleep(std::time::Duration::from_secs(2));
// Flush telemetry events before exiting
app_state.client.telemetry().flush_events();
cx.update(|cx| cx.quit()) cx.update(|cx| cx.quit())
}) })
.detach_and_log_err(cx); .detach_and_log_err(cx);
@ -286,15 +301,49 @@ async fn run_example(
judge_repetitions: u32, judge_repetitions: u32,
cx: &mut AsyncApp, cx: &mut AsyncApp,
) -> Result<Vec<Result<JudgeOutput>>> { ) -> Result<Vec<Result<JudgeOutput>>> {
cx.update(|cx| example.run(model.clone(), app_state, cx))? let run_output = cx
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
.await?; .await?;
let diff = example.repository_diff().await?; let diff = example.repository_diff().await?;
let judge_tasks = (0..judge_repetitions) // Run judge for each repetition
.map(|round| example.judge(model.clone(), diff.clone(), round, cx)) let mut results = Vec::new();
.collect::<Vec<_>>(); for round in 0..judge_repetitions {
let judge_result = example.judge(model.clone(), diff.clone(), round, cx).await;
Ok(future::join_all(judge_tasks).await) // Log telemetry for this judge result
if let Ok(judge_output) = &judge_result {
let cohort_id = example
.output_file_path
.parent()
.and_then(|p| p.file_name())
.map(|name| name.to_string_lossy().to_string())
.unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
telemetry::event!(
"Agent Eval Completed",
cohort_id = cohort_id,
example_name = example.name.clone(),
round = round,
score = judge_output.score,
analysis = judge_output.analysis,
tool_use_counts = run_output.tool_use_counts,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
model_provider = model.provider_id().to_string(),
repository_url = example.base.url.clone(),
repository_revision = example.base.revision.clone(),
diagnostics_summary = run_output.diagnostics
);
}
results.push(judge_result);
}
app_state.client.telemetry().flush_events();
Ok(results)
} }
fn list_all_examples() -> Result<Vec<PathBuf>> { fn list_all_examples() -> Result<Vec<PathBuf>> {

28
crates/eval/src/ids.rs Normal file
View file

@ -0,0 +1,28 @@
use anyhow::Result;
use std::fs;
use std::path::{Path, PathBuf};
use uuid::Uuid;
/// Returns the identifier stored at `path`, generating and persisting a new one if needed.
///
/// The file's contents are trimmed of surrounding whitespace; a non-empty result is
/// returned unchanged. If the file cannot be read, or holds only whitespace, a fresh
/// v4 UUID is generated, written to `path`, and returned.
///
/// # Errors
///
/// Returns an error if the parent directory of `path` cannot be created or the new
/// identifier cannot be written to `path`.
pub fn get_or_create_id(path: &Path) -> Result<String> {
    if let Ok(contents) = fs::read_to_string(path) {
        let existing = contents.trim();
        if !existing.is_empty() {
            return Ok(existing.to_string());
        }
    }

    // The directory that should hold the id file may not exist yet; without creating
    // it first, the write below would fail and no id would be persisted.
    if let Some(parent) = path.parent().filter(|dir| !dir.as_os_str().is_empty()) {
        fs::create_dir_all(parent)?;
    }

    let new_id = Uuid::new_v4().to_string();
    fs::write(path, &new_id)?;
    Ok(new_id)
}
/// Builds the path of the file that holds the eval system id.
///
/// The file is named `zed-eval-system-id` and is placed in the directory returned by
/// `dirs::data_local_dir()`, or in `.` when that call returns `None`.
pub fn eval_system_id_path() -> PathBuf {
    let base_dir = match dirs::data_local_dir() {
        Some(dir) => dir,
        None => PathBuf::from("."),
    };
    base_dir.join("zed-eval-system-id")
}
/// Builds the path of the file that holds the eval installation id.
///
/// The file is named `zed-eval-installation-id` and is placed in the directory returned
/// by `dirs::data_local_dir()`, or in `.` when that call returns `None`.
pub fn eval_installation_id_path() -> PathBuf {
    let mut id_path = dirs::data_local_dir().unwrap_or_else(|| PathBuf::from("."));
    id_path.push("zed-eval-installation-id");
    id_path
}