Start tracking tool failure rates in eval (#29122)

This pull request prints every tool used during an eval run along with its failure rate.
The goal is to minimize that failure rate.

@tmickleydoyle: this also changes the telemetry event to report
`tool_metrics` as opposed to `tool_use_counts`. Ideally I'd love to be
able to plot failure rates by tool and hopefully see that percentage go
down. Can we do that with the data we're tracking with this pull
request?
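
As a point of reference, the diff below only requires the new `ToolMetrics` type to provide `Default`, a `merge` method, and a `Display` impl. Here is a minimal sketch of what it might look like; the real definition lives in the new `tool_metrics.rs` (not shown in this hunk), so the field names and the rate calculation are assumptions:

```rust
use std::collections::HashMap;
use std::fmt;

/// Hypothetical sketch of per-tool counters; the actual type is defined in
/// `tool_metrics.rs`, so these field names are assumptions.
#[derive(Default, Clone, Debug)]
pub struct ToolMetrics {
    pub use_counts: HashMap<String, u32>,
    pub failure_counts: HashMap<String, u32>,
}

impl ToolMetrics {
    /// Fold another run's counters into this one (as `cumulative_tool_metrics` does).
    pub fn merge(&mut self, other: &ToolMetrics) {
        for (tool, count) in &other.use_counts {
            *self.use_counts.entry(tool.clone()).or_default() += count;
        }
        for (tool, count) in &other.failure_counts {
            *self.failure_counts.entry(tool.clone()).or_default() += count;
        }
    }
}

impl fmt::Display for ToolMetrics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for (tool, uses) in &self.use_counts {
            let failures = self.failure_counts.get(tool).copied().unwrap_or(0);
            let rate = 100.0 * failures as f64 / (*uses).max(1) as f64;
            writeln!(f, "{tool}: {uses} uses, {failures} failures ({rate:.1}% failure rate)")?;
        }
        Ok(())
    }
}
```

Keeping raw use and failure counts rather than a precomputed percentage makes the merge into `cumulative_tool_metrics` trivial, while the per-tool failure rate can still be derived downstream (e.g. for the plots mentioned above).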

Release Notes:

- N/A
Antonio Scandurra 2025-04-21 16:16:43 +02:00 committed by GitHub
parent 3a27e8c311
commit 97ab0980d1
3 changed files with 178 additions and 68 deletions


@@ -1,13 +1,15 @@
mod example;
mod ids;
mod tool_metrics;
use client::{Client, ProxySettings, UserStore};
pub(crate) use example::*;
use telemetry;
pub(crate) use tool_metrics::*;
use ::fs::RealFs;
use anyhow::{Result, anyhow};
use clap::Parser;
use client::{Client, ProxySettings, UserStore};
use collections::HashSet;
use extension::ExtensionHostProxy;
use futures::{StreamExt, future};
use gpui::http_client::{Uri, read_proxy_from_env};
@@ -22,7 +24,6 @@ use prompt_store::PromptBuilder;
use release_channel::AppVersion;
use reqwest_client::ReqwestClient;
use settings::{Settings, SettingsStore};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::usize;
@@ -92,6 +93,8 @@ fn main() {
.telemetry()
.start(system_id, installation_id, session_id, cx);
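// Tool metrics accumulated across every example in this eval run; printed at the end.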
let mut cumulative_tool_metrics = ToolMetrics::default();
let model_registry = LanguageModelRegistry::read_global(cx);
let model = find_model("claude-3-7-sonnet-latest", model_registry, cx).unwrap();
let model_provider_id = model.provider_id();
@@ -177,7 +180,7 @@ fn main() {
return cx.update(|cx| cx.quit());
}
let mut repo_urls = HashSet::new();
let mut repo_urls = HashSet::default();
let mut clone_tasks = Vec::new();
for (i, example) in examples.iter_mut().enumerate() {
@@ -244,9 +247,24 @@ fn main() {
let model = model.clone();
let example = example.clone();
cx.spawn(async move |cx| {
let result =
run_example(&example, model, app_state, judge_repetitions, cx).await;
(result, example)
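// Run the example once, then judge its output `judge_repetitions` times and collect the results.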
let result = async {
let run_output = cx
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
.await?;
let judge_tasks = (0..judge_repetitions).map(|round| {
run_judge_repetition(
example.clone(),
model.clone(),
&run_output,
round,
cx,
)
});
let judge_outputs = future::join_all(judge_tasks).await;
anyhow::Ok((run_output, judge_outputs))
}
.await;
(example, result)
})
});
@@ -256,52 +274,58 @@ fn main() {
.await;
println!("\n\n");
println!("========================================");
println!(" EVAL RESULTS ");
println!("========================================");
println!("");
print_header("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut error_count = 0;
for (result, example) in results {
for (example, result) in results {
print_header(&example.name);
match result {
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
error_count += 1;
}
Ok(judge_results) => {
for judge_result in judge_results {
Ok((run_output, judge_results)) => {
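// Fold this example's tool metrics into the cumulative totals.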
cumulative_tool_metrics.merge(&run_output.tool_metrics);
println!("┌───────┬──────┬────────┐");
println!("│ Judge │ Diff │ Thread │");
println!("├───────┼──────┼────────┤");
for (i, judge_result) in judge_results.iter().enumerate() {
match judge_result {
Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
let diff_score: u32 = judge_output.diff.score;
let score_index = (diff_score.min(5)) as usize;
let diff_score = judge_output.diff.score;
diff_scores.push(diff_score);
let thread_display = if let Some(thread) = &judge_output.thread
{
let thread_score = thread.score;
thread_scores.push(thread_score);
format!("{}", thread_score)
} else {
"N/A".to_string()
};
println!(
"{} {}{} (Diff)",
SCORES[score_index],
example.log_prefix,
judge_output.diff.score,
"|{:^7}│{:^6}│{:^8}│",
i + 1,
diff_score,
thread_display
);
diff_scores.push(judge_output.diff.score);
if let Some(thread) = judge_output.thread {
let process_score: u32 = thread.score;
let score_index = (process_score.min(5)) as usize;
println!(
"{} {}{} (Thread)",
SCORES[score_index], example.log_prefix, thread.score,
);
thread_scores.push(thread.score);
}
}
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
println!("|{:^7}{:^6}{:^8}{:?}", i + 1, "N/A", "N/A", err);
}
}
}
println!("└───────┴──────┴────────┘");
println!("{}", run_output.tool_metrics);
}
}
println!(
@@ -341,6 +365,9 @@ fn main() {
}
}
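// Print the tool metrics merged across all examples once the run is complete.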
print_header("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
std::thread::sleep(std::time::Duration::from_secs(2));
app_state.client.telemetry().flush_events();
@@ -351,27 +378,6 @@ fn main() {
});
}
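// run_example (below) is removed in this change: its body is inlined into main so that
// run_output (and its tool_metrics) stays available when printing the results table.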
async fn run_example(
example: &Example,
model: Arc<dyn LanguageModel>,
app_state: Arc<AgentAppState>,
judge_repetitions: u32,
cx: &mut AsyncApp,
) -> Result<Vec<Result<JudgeOutput>>> {
let run_output = cx
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
.await?;
let judge_tasks = (0..judge_repetitions)
.map(|round| run_judge_repetition(example.clone(), model.clone(), &run_output, round, cx));
let results = future::join_all(judge_tasks).await;
app_state.client.telemetry().flush_events();
Ok(results)
}
fn list_all_examples() -> Result<Vec<PathBuf>> {
let path = std::fs::canonicalize(EXAMPLES_DIR).unwrap();
let entries = std::fs::read_dir(path).unwrap();
@@ -566,7 +572,7 @@ async fn run_judge_repetition(
diff_analysis = judge_output.diff.analysis,
thread_score = thread.score,
thread_analysis = thread.analysis,
tool_use_counts = run_output.tool_use_counts,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -585,7 +591,7 @@ async fn run_judge_repetition(
round = round,
diff_score = judge_output.diff.score,
diff_analysis = judge_output.diff.analysis,
tool_use_counts = run_output.tool_use_counts,
tool_metrics = run_output.tool_metrics,
response_count = run_output.response_count,
token_usage = run_output.token_usage,
model = model.telemetry_id(),
@@ -601,3 +607,9 @@ async fn run_judge_repetition(
judge_result
}
fn print_header(header: &str) {
println!("\n========================================");
println!("{:^40}", header);
println!("========================================\n");
}