eval: Count execution errors as failures (#30712)

- Evals returning an error (e.g., an LLM API format mismatch) were silently
  skipped in the aggregated results. Now we count them as a failure (0%
  success score); see the first sketch below.

- Setting the `VERBOSE` environment variable to any non-empty value now
  disables string truncation; see the second sketch below.
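
A minimal sketch of the new error handling, assuming simplified stand-ins for
the eval crate's `RanAssertion` and `AssertionsReport` types (the real
definitions are in the diff below); `passed_count` is a hypothetical helper
added only for this illustration. It shows an errored run being scored as
0 of 1 rather than dropped:

struct RanAssertion {
    id: String,
    result: Result<(), String>,
}

struct AssertionsReport {
    ran: Vec<RanAssertion>,
    max: Option<usize>,
}

impl AssertionsReport {
    // An errored eval becomes a single failed assertion, so it contributes
    // 0 / 1 to the aggregated success score instead of being skipped.
    fn error(msg: String) -> Self {
        AssertionsReport {
            ran: vec![RanAssertion {
                id: "no-unhandled-errors".into(),
                result: Err(msg),
            }],
            max: Some(1),
        }
    }

    // Hypothetical helper for this sketch only.
    fn passed_count(&self) -> usize {
        self.ran.iter().filter(|a| a.result.is_ok()).count()
    }
}

fn main() {
    let report = AssertionsReport::error("LLM API format mismatch".into());
    println!("failed assertion: {}", report.ran[0].id);
    assert_eq!(report.passed_count(), 0);
    assert_eq!(report.max, Some(1)); // 0% success, not a silent skip
}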
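
And a sketch of the `VERBOSE` behaviour, assuming a simplified `truncate`
helper with the same signature as the one changed below; any non-empty value
of `VERBOSE` skips truncation entirely:

fn truncate(assertion: &str, max_width: usize) -> String {
    // Any non-empty VERBOSE value bypasses truncation of table cells.
    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
    if assertion.len() <= max_width || is_verbose {
        assertion.to_string()
    } else {
        // Simplified ASCII-only shortening for this sketch; the real code
        // computes an end index before appending the ellipsis.
        format!("{}…", &assertion[..max_width - 1])
    }
}

fn main() {
    // Run the evals with e.g. VERBOSE=1 to keep full assertion labels;
    // with VERBOSE unset or empty, long labels are shortened to fit.
    println!("{}", truncate("assertion with a long descriptive label", 16));
}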

Release Notes:

- N/A
Oleksiy Syvokon 2025-05-14 20:44:19 +03:00 committed by GitHub
parent 83498ebf2b
commit 6420df3975
2 changed files with 39 additions and 24 deletions


@@ -28,6 +28,17 @@ impl AssertionsReport
         }
     }
 
+    pub fn error(msg: String) -> Self {
+        let assert = RanAssertion {
+            id: "no-unhandled-errors".into(),
+            result: Err(msg),
+        };
+        AssertionsReport {
+            ran: vec![assert],
+            max: Some(1),
+        }
+    }
+
     pub fn is_empty(&self) -> bool {
         self.ran.is_empty()
     }
@@ -145,7 +156,9 @@ pub fn print_table_divider() {
 }
 
 fn truncate(assertion: &str, max_width: usize) -> String {
-    if assertion.len() <= max_width {
+    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
+
+    if assertion.len() <= max_width || is_verbose {
         assertion.to_string()
     } else {
         let mut end_ix = max_width - 1;


@@ -6,7 +6,7 @@ mod ids;
 mod instance;
 mod tool_metrics;
 
-use assertions::display_error_row;
+use assertions::{AssertionsReport, display_error_row};
 use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 
 pub(crate) use tool_metrics::*;
@@ -467,11 +467,12 @@ pub fn find_model(
     match matching_models.as_slice() {
         [model] => Ok(model.clone()),
         [] => Err(anyhow!(
-            "No language model with ID {} was available. Available models: {}",
+            "No language model with ID {}/{} was available. Available models: {}",
+            provider_id,
             model_id,
             model_registry
                 .available_models(cx)
-                .map(|model| model.id().0.clone())
+                .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
                 .collect::<Vec<_>>()
                 .join(", ")
         )),
@@ -581,12 +582,15 @@ fn print_report(
             Err(err) => {
                 display_error_row(&mut table_rows, example.repetition, err.to_string())?;
                 error_count += 1;
+                programmatic_scores.push(0.0);
+                diff_scores.push(0.0);
+                thread_scores.push(0.0);
             }
             Ok((run_output, judge_output)) => {
                 cumulative_tool_metrics.merge(&run_output.tool_metrics);
                 example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
 
-                if !run_output.programmatic_assertions.total_count() > 0 {
+                if run_output.programmatic_assertions.total_count() > 0 {
                     for assertion in &run_output.programmatic_assertions.ran {
                         assertions::display_table_row(
                             &mut table_rows,
@@ -626,6 +630,8 @@ fn print_report(
         }
     }
 
+    let mut all_asserts = Vec::new();
+
     if !table_rows.is_empty() {
         assertions::print_table_header();
         print!("{}", table_rows);
@@ -634,33 +640,29 @@ fn print_report(
         for (example, result) in results.iter() {
             if let Ok((run_output, judge_output)) = result {
+                let asserts = [
+                    run_output.programmatic_assertions.clone(),
+                    judge_output.diff.clone(),
+                    judge_output.thread.clone(),
+                ];
+                all_asserts.extend_from_slice(&asserts);
                 assertions::print_table_round_summary(
                     &example.repetition.to_string(),
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter(),
+                    asserts.iter(),
+                )
+            } else if let Err(err) = result {
+                let assert = AssertionsReport::error(err.to_string());
+                all_asserts.push(assert.clone());
+                assertions::print_table_round_summary(
+                    &example.repetition.to_string(),
+                    [assert].iter(),
                 )
             }
         }
 
         assertions::print_table_divider();
-
-        assertions::print_table_round_summary(
-            "avg",
-            results.iter().flat_map(|(_, result)| {
-                result.iter().flat_map(|(run_output, judge_output)| {
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter()
-                })
-            }),
-        );
+        assertions::print_table_round_summary("avg", all_asserts.iter());
 
         assertions::print_table_footer();
     }