agent: Add telemetry for eval runs (#28816)

Release Notes: - N/A --------- Co-authored-by: Joseph <joseph@zed.dev>
2025-04-15 21:54:26 -05:00 · 2025-04-15 21:54:26 -05:00 · 222d4a2546
commit 222d4a2546
parent 1eb948654a
4 changed files with 89 additions and 6 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4886,6 +4886,7 @@ dependencies = [
 "collections",
 "context_server",
 "dap",
+ "dirs 5.0.1",
 "env_logger 0.11.8",
 "extension",
 "fs",
@ -4907,9 +4908,11 @@ dependencies = [
 "serde",
 "settings",
 "shellexpand 2.1.2",
+ "telemetry",
 "toml 0.8.20",
 "unindent",
 "util",
+ "uuid",
 "workspace-hack",
 ]

--- a/crates/eval/Cargo.toml
+++ b/crates/eval/Cargo.toml
@ -16,6 +16,7 @@ client.workspace = true
 collections.workspace = true
 context_server.workspace = true
 dap.workspace = true
+dirs = "5.0"
 env_logger.workspace = true
 extension.workspace = true
 fs.workspace = true
@ -37,9 +38,11 @@ reqwest_client.workspace = true
 serde.workspace = true
 settings.workspace = true
 shellexpand.workspace = true
+telemetry.workspace = true
 toml.workspace = true
 unindent.workspace = true
 util.workspace = true
+uuid = { version = "1.6", features = ["v4"] }
 workspace-hack.workspace = true

 [[bin]]
--- a/crates/eval/src/eval.rs
+++ b/crates/eval/src/eval.rs
@ -1,7 +1,9 @@
 mod example;
+mod ids;

 use client::{Client, ProxySettings, UserStore};
 pub(crate) use example::*;
+use telemetry;

 use ::fs::RealFs;
 use anyhow::{Result, anyhow};
@ -39,7 +41,6 @@ struct Args {
    /// Model to use (default: "claude-3-7-sonnet-latest")
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model: String,
-    /// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
    #[arg(long, value_delimiter = ',')]
    languages: Option<Vec<String>>,
    /// How many times to run the judge on each example run.
@ -77,6 +78,15 @@ fn main() {
    app.run(move |cx| {
        let app_state = init(cx);

+        let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
+        let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
+        let session_id = uuid::Uuid::new_v4().to_string();
+
+        app_state
+            .client
+            .telemetry()
+            .start(system_id, installation_id, session_id, cx);
+
        let model = find_model("claude-3-7-sonnet-latest", cx).unwrap();

        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
@ -273,6 +283,11 @@ fn main() {
                / (score_count as f32);
            println!("\nAverage score: {average_score}");

+            std::thread::sleep(std::time::Duration::from_secs(2));
+
+            // Flush telemetry events before exiting
+            app_state.client.telemetry().flush_events();
+
            cx.update(|cx| cx.quit())
        })
        .detach_and_log_err(cx);
@ -286,15 +301,49 @@ async fn run_example(
    judge_repetitions: u32,
    cx: &mut AsyncApp,
 ) -> Result<Vec<Result<JudgeOutput>>> {
-    cx.update(|cx| example.run(model.clone(), app_state, cx))?
+    let run_output = cx
+        .update(|cx| example.run(model.clone(), app_state.clone(), cx))?
        .await?;
    let diff = example.repository_diff().await?;

-    let judge_tasks = (0..judge_repetitions)
-        .map(|round| example.judge(model.clone(), diff.clone(), round, cx))
-        .collect::<Vec<_>>();
+    // Run judge for each repetition
+    let mut results = Vec::new();
+    for round in 0..judge_repetitions {
+        let judge_result = example.judge(model.clone(), diff.clone(), round, cx).await;

-    Ok(future::join_all(judge_tasks).await)
+        // Log telemetry for this judge result
+        if let Ok(judge_output) = &judge_result {
+            let cohort_id = example
+                .output_file_path
+                .parent()
+                .and_then(|p| p.file_name())
+                .map(|name| name.to_string_lossy().to_string())
+                .unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
+
+            telemetry::event!(
+                "Agent Eval Completed",
+                cohort_id = cohort_id,
+                example_name = example.name.clone(),
+                round = round,
+                score = judge_output.score,
+                analysis = judge_output.analysis,
+                tool_use_counts = run_output.tool_use_counts,
+                response_count = run_output.response_count,
+                token_usage = run_output.token_usage,
+                model = model.telemetry_id(),
+                model_provider = model.provider_id().to_string(),
+                repository_url = example.base.url.clone(),
+                repository_revision = example.base.revision.clone(),
+                diagnostics_summary = run_output.diagnostics
+            );
+        }
+
+        results.push(judge_result);
+    }
+
+    app_state.client.telemetry().flush_events();
+
+    Ok(results)
 }

 fn list_all_examples() -> Result<Vec<PathBuf>> {
--- a/crates/eval/src/ids.rs
+++ b/crates/eval/src/ids.rs
@ -0,0 +1,28 @@
+use anyhow::Result;
+use std::fs;
+use std::path::{Path, PathBuf};
+use uuid::Uuid;
+
+/// Returns the identifier stored at `path`, generating and persisting a new one if needed.
+///
+/// If the file can be read and contains non-whitespace text, the trimmed contents are
+/// returned. Otherwise (the file is unreadable or blank) a fresh UUID v4 is generated,
+/// written to `path`, and returned.
+///
+/// # Errors
+///
+/// Returns an error if the parent directory cannot be created or the new identifier
+/// cannot be written to `path`.
+pub fn get_or_create_id(path: &Path) -> Result<String> {
+    if let Ok(contents) = fs::read_to_string(path) {
+        let existing = contents.trim();
+        if !existing.is_empty() {
+            return Ok(existing.to_string());
+        }
+    }
+
+    // The target directory may not exist yet; create it so the write below does not
+    // fail with `NotFound`. An empty parent (bare file name) needs no directory.
+    if let Some(parent) = path.parent().filter(|dir| !dir.as_os_str().is_empty()) {
+        fs::create_dir_all(parent)?;
+    }
+
+    let new_id = Uuid::new_v4().to_string();
+    fs::write(path, &new_id)?;
+    Ok(new_id)
+}
+
+/// Builds the path of the file holding the eval system ID.
+///
+/// The file is named `zed-eval-system-id` and is placed in the directory returned by
+/// `dirs::data_local_dir()`, or in `.` when that call returns `None`.
+pub fn eval_system_id_path() -> PathBuf {
+    let base_dir = match dirs::data_local_dir() {
+        Some(dir) => dir,
+        None => PathBuf::from("."),
+    };
+    base_dir.join("zed-eval-system-id")
+}
+
+/// Builds the path of the file holding the eval installation ID.
+///
+/// The file is named `zed-eval-installation-id` and is placed in the directory returned
+/// by `dirs::data_local_dir()`, or in `.` when that call returns `None`.
+pub fn eval_installation_id_path() -> PathBuf {
+    let base_dir = match dirs::data_local_dir() {
+        Some(dir) => dir,
+        None => PathBuf::from("."),
+    };
+    base_dir.join("zed-eval-installation-id")
+}