diff --git a/crates/eval/src/assertions.rs b/crates/eval/src/assertions.rs
index c021694401..489e4aa22e 100644
--- a/crates/eval/src/assertions.rs
+++ b/crates/eval/src/assertions.rs
@@ -28,6 +28,17 @@ impl AssertionsReport {
         }
     }
 
+    pub fn error(msg: String) -> Self {
+        let assert = RanAssertion {
+            id: "no-unhandled-errors".into(),
+            result: Err(msg),
+        };
+        AssertionsReport {
+            ran: vec![assert],
+            max: Some(1),
+        }
+    }
+
     pub fn is_empty(&self) -> bool {
         self.ran.is_empty()
     }
@@ -145,7 +156,9 @@ pub fn print_table_divider() {
 }
 
 fn truncate(assertion: &str, max_width: usize) -> String {
-    if assertion.len() <= max_width {
+    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
+
+    if assertion.len() <= max_width || is_verbose {
         assertion.to_string()
     } else {
         let mut end_ix = max_width - 1;
diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs
index d69ec5d9c9..789349116d 100644
--- a/crates/eval/src/eval.rs
+++ b/crates/eval/src/eval.rs
@@ -6,7 +6,7 @@ mod ids;
 mod instance;
 mod tool_metrics;
 
-use assertions::display_error_row;
+use assertions::{AssertionsReport, display_error_row};
 use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 pub(crate) use tool_metrics::*;
 
@@ -467,11 +467,12 @@ pub fn find_model(
     match matching_models.as_slice() {
         [model] => Ok(model.clone()),
         [] => Err(anyhow!(
-            "No language model with ID {} was available. Available models: {}",
+            "No language model with ID {}/{} was available. Available models: {}",
+            provider_id,
             model_id,
             model_registry
                 .available_models(cx)
-                .map(|model| model.id().0.clone())
+                .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
                 .collect::<Vec<_>>()
                 .join(", ")
         )),
@@ -581,12 +582,15 @@ fn print_report(
             Err(err) => {
                 display_error_row(&mut table_rows, example.repetition, err.to_string())?;
                 error_count += 1;
+                programmatic_scores.push(0.0);
+                diff_scores.push(0.0);
+                thread_scores.push(0.0);
             }
             Ok((run_output, judge_output)) => {
                 cumulative_tool_metrics.merge(&run_output.tool_metrics);
                 example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
 
-                if !run_output.programmatic_assertions.total_count() > 0 {
+                if run_output.programmatic_assertions.total_count() > 0 {
                     for assertion in &run_output.programmatic_assertions.ran {
                         assertions::display_table_row(
                             &mut table_rows,
@@ -626,6 +630,8 @@ fn print_report(
         }
     }
 
+    let mut all_asserts = Vec::new();
+
     if !table_rows.is_empty() {
         assertions::print_table_header();
         print!("{}", table_rows);
@@ -634,33 +640,29 @@ fn print_report(
 
         for (example, result) in results.iter() {
             if let Ok((run_output, judge_output)) = result {
+                let asserts = [
+                    run_output.programmatic_assertions.clone(),
+                    judge_output.diff.clone(),
+                    judge_output.thread.clone(),
+                ];
+                all_asserts.extend_from_slice(&asserts);
                 assertions::print_table_round_summary(
                     &example.repetition.to_string(),
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter(),
+                    asserts.iter(),
+                )
+            } else if let Err(err) = result {
+                let assert = AssertionsReport::error(err.to_string());
+                all_asserts.push(assert.clone());
+                assertions::print_table_round_summary(
+                    &example.repetition.to_string(),
+                    [assert].iter(),
                 )
             }
         }
 
         assertions::print_table_divider();
-        assertions::print_table_round_summary(
-            "avg",
-            results.iter().flat_map(|(_, result)| {
-                result.iter().flat_map(|(run_output, judge_output)| {
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter()
-                })
-            }),
-        );
+
+        assertions::print_table_round_summary("avg", all_asserts.iter());
         assertions::print_table_footer();
     }