ZIm/crates/eval/src/assertions.rs
Agus Zubiaga ce1a674eba
eval: Fine-grained assertions (#29246)
- Support programmatic examples
([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file
([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400
src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
2025-04-22 23:58:58 -03:00

157 lines
3.9 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use serde::{Deserialize, Serialize};
use std::fmt::Write;
use std::fmt::{self};
/// A set of assertion results together with an optional minimum for the
/// total assertion count.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct AssertionsReport {
/// Assertions that have been run.
pub ran: Vec<RanAssertion>,
/// Optional lower bound on the total: `total_count` never reports fewer
/// assertions than this, even when fewer have actually run.
pub max: Option<usize>,
}
/// One assertion that was run, identified by `id`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RanAssertion {
/// Identifier shown in the "Assertion" column of the results table.
pub id: String,
/// `Ok` holds the outcome of the assertion; `Err` holds an error message
/// and is rendered as "Judge Error" in the results table. An `Err` is
/// never counted as passed.
pub result: Result<RanAssertionResult, String>,
}
/// Outcome of an assertion that ran to completion.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RanAssertionResult {
/// Optional free-form text attached to the outcome; not read anywhere in
/// this file. NOTE(review): presumably the judge's explanation — confirm
/// against the code that produces it.
pub analysis: Option<String>,
/// Whether the assertion passed.
pub passed: bool,
}
impl AssertionsReport {
pub fn new(max: Option<usize>) -> Self {
AssertionsReport {
ran: Vec::new(),
max,
}
}
pub fn is_empty(&self) -> bool {
self.ran.is_empty()
}
pub fn total_count(&self) -> usize {
self.run_count().max(self.max.unwrap_or(0))
}
pub fn run_count(&self) -> usize {
self.ran.len()
}
pub fn passed_count(&self) -> usize {
self.ran
.iter()
.filter(|a| a.result.as_ref().map_or(false, |result| result.passed))
.count()
}
pub fn passed_percentage(&self) -> f32 {
if self.total_count() == 0 {
0.0
} else {
(self.passed_count() as f32 / self.total_count() as f32) * 100.0
}
}
}
/// Width of the "Round" column: the length of its title.
const ROUND_WIDTH: usize = "Round".len();
/// Width given to the formatter for the assertion-id column.
const ASSERTIONS_WIDTH: usize = 42;
/// Width given to the formatter for the result column.
const RESULTS_WIDTH: usize = 8;
/// Prints the top border, the column titles, and the line separating the
/// titles from the body of the results table to stdout.
///
/// Each border segment is one `─` per column character plus one on either
/// side, mirroring the single space of padding placed around every cell.
pub fn print_table_header() {
    println!(
        "┌─{}─┬─{}─┬─{}─┐",
        "─".repeat(ROUND_WIDTH),
        "─".repeat(ASSERTIONS_WIDTH),
        "─".repeat(RESULTS_WIDTH)
    );

    println!(
        "│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │",
        "Round", "Assertion", "Result"
    );

    println!(
        "├─{}─┼─{}─┼─{}─┤",
        "─".repeat(ROUND_WIDTH),
        "─".repeat(ASSERTIONS_WIDTH),
        "─".repeat(RESULTS_WIDTH)
    )
}
/// Appends a table row reporting `error` for the given `round` to `f`.
///
/// The message occupies the "Assertion" and "Result" columns as one merged
/// cell and is shortened with `truncate` when it does not fit.
///
/// Returns the result of the underlying `writeln!`.
pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result {
    // A regular row puts " │ " between the two columns; the merged cell must
    // absorb those three characters so the right border stays aligned.
    const COLUMN_SEPARATOR_WIDTH: usize = 3;
    let last_two_columns = ASSERTIONS_WIDTH + COLUMN_SEPARATOR_WIDTH + RESULTS_WIDTH;
    writeln!(
        f,
        "│ {:^ROUND_WIDTH$} │ {:<last_two_columns$} │",
        round,
        truncate(&error, last_two_columns)
    )
}
/// Appends one table row for `assertion` in the given `round` to `f`.
///
/// The result cell is a colored label: green "Passed", red "Failed", or red
/// "Judge Error" when the assertion's result is an `Err`. The labels already
/// contain ANSI color escapes, which count toward the formatted width, so the
/// `RESULTS_WIDTH` specifier never adds padding to them.
///
/// Returns the result of the underlying `writeln!`.
pub fn display_table_row(f: &mut String, round: usize, assertion: &RanAssertion) -> fmt::Result {
    let outcome = match assertion.result.as_ref().map(|ran| ran.passed) {
        Ok(true) => "\x1b[32m✔ Passed\x1b[0m",
        Ok(false) => "\x1b[31m✗ Failed\x1b[0m",
        Err(_) => "\x1b[31m💥 Judge Error\x1b[0m",
    };
    let name = truncate(&assertion.id, ASSERTIONS_WIDTH);

    writeln!(
        f,
        "│ {round:^ROUND_WIDTH$} │ {name:<ASSERTIONS_WIDTH$} │ {outcome:>RESULTS_WIDTH$} │"
    )
}
/// Prints the "total" row for a round to stdout: the percentage of passed
/// assertions summed over all `reports`, rounded down to a whole number.
///
/// `round` is the text shown in the "Round" column. When the reports contain
/// no assertions at all the row shows `0%`.
pub fn print_table_round_summary<'a>(
    round: &str,
    reports: impl Iterator<Item = &'a AssertionsReport>,
) {
    let (passed, total) = reports.fold((0usize, 0usize), |(passed, total), report| {
        (
            passed + report.passed_count(),
            total + report.total_count(),
        )
    });

    // Integer division floors exactly; `checked_div` covers the empty case,
    // where a float division would yield NaN.
    let percentage = (passed * 100).checked_div(total).unwrap_or(0);

    println!(
        "│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
        round,
        "total",
        format!("{percentage}%")
    )
}
/// Prints the bottom border of the results table to stdout.
///
/// Each segment is one `─` per column character plus one on either side,
/// mirroring the single space of padding placed around every cell.
pub fn print_table_footer() {
    println!(
        "└─{}─┴─{}─┴─{}─┘",
        "─".repeat(ROUND_WIDTH),
        "─".repeat(ASSERTIONS_WIDTH),
        "─".repeat(RESULTS_WIDTH)
    )
}
/// Prints a horizontal divider between groups of rows of the results table
/// to stdout.
///
/// Each segment is one `─` per column character plus one on either side,
/// mirroring the single space of padding placed around every cell.
pub fn print_table_divider() {
    println!(
        "├─{}─┼─{}─┼─{}─┤",
        "─".repeat(ROUND_WIDTH),
        "─".repeat(ASSERTIONS_WIDTH),
        "─".repeat(RESULTS_WIDTH)
    )
}
/// Shortens `assertion` so that it is at most `max_width` characters long.
///
/// Text that already fits is returned unchanged. Longer text is cut to
/// `max_width - 1` characters followed by `…`, so the result is exactly
/// `max_width` characters. A `max_width` of zero yields an empty string.
///
/// Length is measured in characters rather than bytes because the formatter
/// pads cells by character count; measuring bytes would cut non-ASCII text
/// that actually fits.
fn truncate(assertion: &str, max_width: usize) -> String {
    if assertion.chars().count() <= max_width {
        return assertion.to_string();
    }

    // Reserve one character for the ellipsis; with no room at all, emit nothing.
    let Some(kept) = max_width.checked_sub(1) else {
        return String::new();
    };

    let mut shortened: String = assertion.chars().take(kept).collect();
    shortened.push('…');
    shortened
}