Start tracking tool failure rates in eval (#29122)

This pull request will print all the used tools and their failure rates.
The objective goal should be to minimize that failure rate.

@tmickleydoyle: this also changes the telemetry event to report
`tool_metrics` as opposed to `tool_use_counts`. Ideally I'd love to be
able to plot failure rates by tool and hopefully see that percentage go
down. Can we do that with the data we're tracking with this pull
request?

Release Notes:

- N/A
This commit is contained in:
Antonio Scandurra 2025-04-21 16:16:43 +02:00 committed by GitHub
parent 3a27e8c311
commit 97ab0980d1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 178 additions and 68 deletions

View file

@ -0,0 +1,102 @@
use collections::HashMap;
use serde::{Deserialize, Serialize};
use std::{fmt::Display, sync::Arc};
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct ToolMetrics {
pub use_counts: HashMap<Arc<str>, u32>,
pub failure_counts: HashMap<Arc<str>, u32>,
}
impl ToolMetrics {
pub fn insert(&mut self, tool_name: Arc<str>, succeeded: bool) {
*self.use_counts.entry(tool_name.clone()).or_insert(0) += 1;
if !succeeded {
*self.failure_counts.entry(tool_name).or_insert(0) += 1;
}
}
pub fn merge(&mut self, other: &ToolMetrics) {
for (tool_name, use_count) in &other.use_counts {
*self.use_counts.entry(tool_name.clone()).or_insert(0) += use_count;
}
for (tool_name, failure_count) in &other.failure_counts {
*self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
}
}
}
impl Display for ToolMetrics {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut failure_rates: Vec<(Arc<str>, f64)> = Vec::new();
for (tool_name, use_count) in &self.use_counts {
let failure_count = self.failure_counts.get(tool_name).cloned().unwrap_or(0);
if *use_count > 0 {
let failure_rate = failure_count as f64 / *use_count as f64;
failure_rates.push((tool_name.clone(), failure_rate));
}
}
// Sort by failure rate descending
failure_rates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
// Table dimensions
let tool_width = 30;
let count_width = 10;
let rate_width = 10;
// Write table top border
writeln!(
f,
"┌{}┬{}┬{}┬{}┐",
"".repeat(tool_width),
"".repeat(count_width),
"".repeat(count_width),
"".repeat(rate_width)
)?;
// Write header row
writeln!(
f,
"│{:^30}│{:^10}│{:^10}│{:^10}│",
"Tool", "Uses", "Failures", "Rate"
)?;
// Write header-data separator
writeln!(
f,
"├{}┼{}┼{}┼{}┤",
"".repeat(tool_width),
"".repeat(count_width),
"".repeat(count_width),
"".repeat(rate_width)
)?;
// Write data rows
for (tool_name, failure_rate) in failure_rates {
let use_count = self.use_counts.get(&tool_name).cloned().unwrap_or(0);
let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
writeln!(
f,
"│{:^30}│{:^10}│{:^10}│{:^10}│",
tool_name,
use_count,
failure_count,
format!("{}%", (failure_rate * 100.0).round())
)?;
}
// Write table bottom border
writeln!(
f,
"└{}┴{}┴{}┴{}┘",
"".repeat(tool_width),
"".repeat(count_width),
"".repeat(count_width),
"".repeat(rate_width)
)?;
Ok(())
}
}