Start tracking tool failure rates in eval (#29122)

This pull request prints every tool used during an eval run along with its failure rate.
The goal is to minimize that failure rate.

@tmickleydoyle: this also changes the telemetry event to report
`tool_metrics` as opposed to `tool_use_counts`. Ideally I'd love to be
able to plot failure rates by tool and hopefully see that percentage go
down. Can we do that with the data we're tracking with this pull
request?
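
As a point of reference, the diff below only requires the new `ToolMetrics` type to provide `Default`, a `merge` method, and a `Display` impl. Here is a minimal sketch of what it might look like; the real definition lives in the new `tool_metrics.rs` (not shown in this hunk), so the field names and the rate calculation are assumptions:

```rust
use std::collections::HashMap;
use std::fmt;

/// Hypothetical sketch of per-tool counters; the actual type is defined in
/// `tool_metrics.rs`, so these field names are assumptions.
#[derive(Default, Clone, Debug)]
pub struct ToolMetrics {
    pub use_counts: HashMap<String, u32>,
    pub failure_counts: HashMap<String, u32>,
}

impl ToolMetrics {
    /// Fold another run's counters into this one (as `cumulative_tool_metrics` does).
    pub fn merge(&mut self, other: &ToolMetrics) {
        for (tool, count) in &other.use_counts {
            *self.use_counts.entry(tool.clone()).or_default() += count;
        }
        for (tool, count) in &other.failure_counts {
            *self.failure_counts.entry(tool.clone()).or_default() += count;
        }
    }
}

impl fmt::Display for ToolMetrics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for (tool, uses) in &self.use_counts {
            let failures = self.failure_counts.get(tool).copied().unwrap_or(0);
            let rate = 100.0 * failures as f64 / (*uses).max(1) as f64;
            writeln!(f, "{tool}: {uses} uses, {failures} failures ({rate:.1}% failure rate)")?;
        }
        Ok(())
    }
}
```

Keeping raw use and failure counts rather than a precomputed percentage makes the merge into `cumulative_tool_metrics` trivial, while the per-tool failure rate can still be derived downstream (e.g. for the plots mentioned above).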

Release Notes:

- N/A
Antonio Scandurra 2025-04-21 16:16:43 +02:00 committed by GitHub
parent 3a27e8c311
commit 97ab0980d1
3 changed files with 178 additions and 68 deletions


@@ -1,13 +1,15 @@
mod example;
mod ids;
mod tool_metrics;
use client::{Client, ProxySettings, UserStore};
pub(crate) use example::*;
use telemetry;
pub(crate) use tool_metrics::*;
use ::fs::RealFs;
use anyhow::{Result, anyhow};
use clap::Parser;
use client::{Client, ProxySettings, UserStore};
use collections::HashSet;
use extension::ExtensionHostProxy;
use futures::{StreamExt, future};
use gpui::http_client::{Uri, read_proxy_from_env};
@@ -22,7 +24,6 @@ use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::usize;
@@ -92,6 +93,8 @@ fn main() {
.telemetry()
.start(system_id, installation_id, session_id, cx);
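// Tool metrics accumulated across every example in this eval run; printed at the end.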
let mut cumulative_tool_metrics = ToolMetrics::default();
let model_registry = LanguageModelRegistry::read_global(cx);
let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap();
let model_provider_id = model.provider_id();
@@ -177,7 +180,7 @@ fn main() {
return cx.update(|cx| cx.quit());
}
let mut repo_urls = HashSet::new();
let mut repo_urls = HashSet::default();
let mut clone_tasks = Vec::new();
for (i, example) in examples.iter_mut().enumerate() {
@@ -244,9 +247,24 @@ fn main() {
let model = model.clone();
let example = example.clone();
cx.spawn(async move |cx| {
let result =
run_example(&example, model, app_state, judge_repetitions, cx).await;
(result, example)
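// Run the example once, then judge its output `judge_repetitions` times and collect the results.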
let result = async {
let run_output = cx
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
.await?;
let judge_tasks = (0..judge_repetitions).map(|round| {
run_judge_repetition(
example.clone(),
model.clone(),
&run_output,
round,
cx,
)
});
let judge_outputs = future::join_all(judge_tasks).await;
anyhow::Ok((run_output, judge_outputs))
}
.await;
(example, result)
})
});
@@ -256,52 +274,58 @@ fn main() {
.await;
println!("\n\n");
println!("========================================");
println!(" EVAL RESULTS ");
println!("========================================");
println!("");
print_header("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut error_count = 0;
for (result, example) in results {
for (example, result) in results {
print_header(&example.name);
match result {
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
error_count += 1;
}
Ok(judge_results) => {
for judge_result in judge_results {
Ok((run_output, judge_results)) => {
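// Fold this example's tool metrics into the cumulative totals.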
cumulative_tool_metrics.merge(&run_output.tool_metrics);
println!("┌───────┬──────┬────────┐");
println!("│ Judge │ Diff │ Thread │");
println!("├───────┼──────┼────────┤");
for (i, judge_result) in judge_results.iter().enumerate() {
match judge_result {
Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
let diff_score: u32 = judge_output.diff.score;
let score_index = (diff_score.min(5)) as usize;
let diff_score = judge_output.diff.score;
diff_scores.push(diff_score);
let thread_display = if let Some(thread) = &judge_output.thread
{
let thread_score = thread.score;
thread_scores.push(thread_score);
format!("{}", thread_score)
} else {
"N/A".to_string()
};
println!(
"{} {}{} (Diff)",
SCORES[score_index],
example.log_prefix,
judge_output.diff.score,
"|{:^7}│{:^6}│{:^8}│",
i + 1,
diff_score,
thread_display
);
diff_scores.push(judge_output.diff.score);
if let Some(thread) = judge_output.thread {
let process_score: u32 = thread.score;
let score_index = (process_score.min(5)) as usize;
println!(
"{} {}{} (Thread)",
SCORES[score_index], example.log_prefix, thread.score,
);
thread_scores.push(thread.score);
}
}
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
println!("|{:^7}{:^6}{:^8}{:?}", i + 1, "N/A", "N/A", err);
}
}
}
println!("└───────┴──────┴────────┘");
println!("{}", run_output.tool_metrics);
}
}
println!(
@@ -341,6 +365,9 @@ fn main() {
}
}
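// Print the tool metrics merged across all examples once the run is complete.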
print_header("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
std::thread::sleep(std::time::Duration::from_secs(2));
app_state.client.telemetry().flush_events();
@@ -351,27 +378,6 @@ fn main() {
});
}
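// run_example (below) is removed in this change: its body is inlined into main so that
// run_output (and its tool_metrics) stays available when printing the results table.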
async fn run_example(
example: &Example,
model: Arc<dyn LanguageModel>,
app_state: Arc<AgentAppState>,
judge_repetitions: u32,
cx: &mut AsyncApp,
) -> Result<Vec<Result<JudgeOutput>>> {
let run_output = cx
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
.await?;
let judge_tasks = (0..judge_repetitions)
.map(|round| run_judge_repetition(example.clone(), model.clone(), &run_output, round, cx));
let results = future::join_all(judge_tasks).await;
app_state.client.telemetry().flush_events();
Ok(results)
}
fn list_all_examples() -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
let entries = std::fs::read_dir(path).unwrap();
@@ -566,7 +572,7 @@ async fn run_judge_repetition(
diff_analysis = judge_output.diff.analysis,
thread_score = thread.score,
thread_analysis = thread.analysis,
tool_use_counts = run_output.tool_use_counts,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -585,7 +591,7 @@ async fn run_judge_repetition(
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
tool_use_counts = run_output.tool_use_counts,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -601,3 +607,9 @@ async fn run_judge_repetition(
judge_result
}
fn print_header(header: &str) {
println!("\n========================================");
println!("{:^40}", header);
println!("========================================\n");
}