eval: Fine-grained assertions (#29246)

- Support programmatic examples
([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file
([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400
src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Agus Zubiaga 2025-04-22 23:58:58 -03:00 committed by GitHub
parent 0d3fe474db
commit ce1a674eba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1969 additions and 1229 deletions

View file

@ -24,6 +24,10 @@ impl ToolMetrics {
*self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
}
}
/// Returns `true` when both `use_counts` and `failure_counts` are empty.
pub fn is_empty(&self) -> bool {
    // Guard clause: any recorded use means the metrics are not empty.
    if !self.use_counts.is_empty() {
        return false;
    }
    self.failure_counts.is_empty()
}
}
impl Display for ToolMetrics {
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
writeln!(
f,
"│{:^30}│{:^10}│{:^10}│{:^10}│",
"│{:<30}│{:^10}│{:^10}│{:^10}│",
tool_name,
use_count,
failure_count,