eval: Fine-grained assertions (#29246)

- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-22 23:58:58 -03:00 · 2025-04-22 23:58:58 -03:00 · ce1a674eba
commit ce1a674eba
parent 0d3fe474db
18 changed files with 1969 additions and 1229 deletions
--- a/crates/eval/src/tool_metrics.rs
+++ b/crates/eval/src/tool_metrics.rs
@ -24,6 +24,10 @@ impl ToolMetrics {
            *self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
        }
    }
+
+    pub fn is_empty(&self) -> bool {
+        self.use_counts.is_empty() && self.failure_counts.is_empty()
+    }
 }

 impl Display for ToolMetrics {
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
            let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
            writeln!(
                f,
-                "│{:^30}│{:^10}│{:^10}│{:^10}│",
+                "│{:<30}│{:^10}│{:^10}│{:^10}│",
                tool_name,
                use_count,
                failure_count,