Start tracking tool failure rates in eval (#29122)

This pull request prints every tool that was used during an eval run along with its failure rate. The goal is to minimize that failure rate. @tmickleydoyle: this also changes the telemetry event to report `tool_metrics` instead of `tool_use_counts`. Ideally, I'd love to be able to plot failure rates by tool and, hopefully, see that percentage go down. Can we do that with the data we're tracking in this pull request? Release Notes: - N/A
2025-04-21 16:16:43 +02:00 · 2025-04-21 16:16:43 +02:00 · 97ab0980d1
commit 97ab0980d1
parent 3a27e8c311
3 changed files with 178 additions and 68 deletions
--- a/crates/eval/src/tool_metrics.rs
+++ b/crates/eval/src/tool_metrics.rs
@ -0,0 +1,102 @@
+use collections::HashMap;
+use serde::{Deserialize, Serialize};
+use std::{fmt::Display, sync::Arc};
+
+/// Per-tool usage and failure tallies, keyed by tool name.
+///
+/// Both maps are public and the type derives `Serialize`/`Deserialize`, so the
+/// counts can be inspected directly or round-tripped through serde.
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct ToolMetrics {
+    /// Number of recorded invocations for each tool.
+    pub use_counts: HashMap<Arc<str>, u32>,
+    /// Number of recorded failed invocations for each tool. `insert` only
+    /// creates an entry here on a tool's first failure, and the `Display`
+    /// impl treats a missing key as zero failures.
+    pub failure_counts: HashMap<Arc<str>, u32>,
+}
+
+impl ToolMetrics {
+    /// Records a single invocation of `tool_name`.
+    ///
+    /// The use count for the tool is always incremented; its failure count is
+    /// incremented only when `succeeded` is `false`.
+    pub fn insert(&mut self, tool_name: Arc<str>, succeeded: bool) {
+        let uses = self.use_counts.entry(Arc::clone(&tool_name)).or_default();
+        *uses += 1;
+
+        if succeeded {
+            return;
+        }
+
+        let failures = self.failure_counts.entry(tool_name).or_default();
+        *failures += 1;
+    }
+
+    /// Adds every tally from `other` into `self`, creating entries for tools
+    /// that `self` has not seen yet. `other` is left untouched.
+    pub fn merge(&mut self, other: &ToolMetrics) {
+        other.use_counts.iter().for_each(|(name, uses)| {
+            *self.use_counts.entry(Arc::clone(name)).or_default() += *uses;
+        });
+        other.failure_counts.iter().for_each(|(name, failures)| {
+            *self.failure_counts.entry(Arc::clone(name)).or_default() += *failures;
+        });
+    }
+}
+
+/// Human-readable report of the collected metrics.
+impl Display for ToolMetrics {
+    /// Writes a box-drawing table with one row per tool that was used at least
+    /// once, showing its use count, failure count and failure rate as a whole
+    /// percentage.
+    ///
+    /// Rows are ordered by failure rate, highest first; tools with the same
+    /// rate are ordered by name so the output does not depend on hash-map
+    /// iteration order. The tool column is at least 30 characters wide and
+    /// grows to fit longer names so the borders always line up.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by the underlying formatter.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Narrowest the tool column is ever drawn.
+        let min_tool_width: usize = 30;
+        let count_width: usize = 10;
+        let rate_width: usize = 10;
+
+        // One `(name, uses, failures, failure rate)` tuple per tool. Entries
+        // with zero uses are skipped, which also rules out a division by zero.
+        let mut rows: Vec<(&str, u32, u32, f64)> = self
+            .use_counts
+            .iter()
+            .filter(|&(_, &uses)| uses > 0)
+            .map(|(name, &uses)| {
+                let failures = self.failure_counts.get(name).copied().unwrap_or(0);
+                let rate = f64::from(failures) / f64::from(uses);
+                (name.as_ref(), uses, failures, rate)
+            })
+            .collect();
+
+        // Highest failure rate first; names are unique, so breaking ties on
+        // the name yields a total order and an unstable sort is safe.
+        rows.sort_unstable_by(|&(a_name, _, _, a_rate), &(b_name, _, _, b_rate)| {
+            b_rate.total_cmp(&a_rate).then_with(|| a_name.cmp(b_name))
+        });
+
+        // `{:^width$}` pads by character count, so measure names in chars and
+        // leave one space on either side of the longest one.
+        let tool_width = rows
+            .iter()
+            .map(|(name, ..)| name.chars().count() + 2)
+            .max()
+            .unwrap_or(0)
+            .max(min_tool_width);
+
+        // Builds one horizontal rule from its left corner, column joint and
+        // right corner, using the same widths as the content rows.
+        let border = |left: &str, joint: &str, right: &str| {
+            let segments = [tool_width, count_width, count_width, rate_width]
+                .map(|width| "─".repeat(width));
+            format!("{}{}{}", left, segments.join(joint), right)
+        };
+
+        writeln!(f, "{}", border("┌", "┬", "┐"))?;
+        writeln!(
+            f,
+            "│{:^tool_width$}│{:^count_width$}│{:^count_width$}│{:^rate_width$}│",
+            "Tool", "Uses", "Failures", "Rate"
+        )?;
+        writeln!(f, "{}", border("├", "┼", "┤"))?;
+
+        for (name, uses, failures, rate) in rows {
+            // Render the percentage first so the `%` sign is centred together
+            // with the number.
+            let percent = format!("{}%", (rate * 100.0).round());
+            writeln!(
+                f,
+                "│{:^tool_width$}│{:^count_width$}│{:^count_width$}│{:^rate_width$}│",
+                name, uses, failures, percent
+            )?;
+        }
+
+        writeln!(f, "{}", border("└", "┴", "┘"))
+    }
+}