eval: Fine-grained assertions (#29246)
- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
parent
0d3fe474db
commit
ce1a674eba
18 changed files with 1969 additions and 1229 deletions
|
@ -24,6 +24,10 @@ impl ToolMetrics {
|
|||
*self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` when both `use_counts` and `failure_counts` contain no entries.
pub fn is_empty(&self) -> bool {
|
||||
self.use_counts.is_empty() && self.failure_counts.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ToolMetrics {
|
||||
|
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
|
|||
let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
|
||||
writeln!(
|
||||
f,
|
||||
"│{:^30}│{:^10}│{:^10}│{:^10}│",
|
||||
"│{:<30}│{:^10}│{:^10}│{:^10}│",
|
||||
tool_name,
|
||||
use_count,
|
||||
failure_count,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue