eval: Count execution errors as failures (#30712)
- Evals returning an error (e.g., an LLM API format mismatch) were silently skipped in the aggregated results. Now we count them as a failure (0% success score).
- Setting the `VERBOSE` environment variable to something non-empty disables string truncation.

Release Notes:

- N/A
This commit is contained in:

parent 83498ebf2b
commit 6420df3975

2 changed files with 39 additions and 24 deletions
@@ -28,6 +28,17 @@ impl AssertionsReport {
         }
     }
 
+    pub fn error(msg: String) -> Self {
+        let assert = RanAssertion {
+            id: "no-unhandled-errors".into(),
+            result: Err(msg),
+        };
+        AssertionsReport {
+            ran: vec![assert],
+            max: Some(1),
+        }
+    }
+
     pub fn is_empty(&self) -> bool {
         self.ran.is_empty()
     }
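For context, the new constructor reduces any unhandled error to a single failing assertion, so the usual score math yields 0%. A minimal self-contained sketch of that idea, using hypothetical stand-ins for the crate's `RanAssertion` and `AssertionsReport` types and an assumed passed/max scoring rule:

// Hypothetical stand-ins for the crate's report types; fields mirror the diff.
struct RanAssertion {
    id: String,
    result: Result<(), String>,
}

struct AssertionsReport {
    ran: Vec<RanAssertion>,
    max: Option<usize>,
}

impl AssertionsReport {
    // As in the diff: one synthetic assertion that always fails.
    fn error(msg: String) -> Self {
        AssertionsReport {
            ran: vec![RanAssertion {
                id: "no-unhandled-errors".into(),
                result: Err(msg),
            }],
            max: Some(1),
        }
    }

    // Assumed scoring rule: passed / max, so an error report scores 0.0.
    fn success_ratio(&self) -> f64 {
        let passed = self.ran.iter().filter(|a| a.result.is_ok()).count();
        passed as f64 / self.max.unwrap_or(self.ran.len()).max(1) as f64
    }
}

fn main() {
    let report = AssertionsReport::error("LLM API format mismatch".into());
    println!(
        "{} -> {:.0}% success",
        report.ran[0].id,
        report.success_ratio() * 100.0
    );
}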
@@ -145,7 +156,9 @@ pub fn print_table_divider() {
 }
 
 fn truncate(assertion: &str, max_width: usize) -> String {
-    if assertion.len() <= max_width {
+    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());
+
+    if assertion.len() <= max_width || is_verbose {
         assertion.to_string()
     } else {
         let mut end_ix = max_width - 1;
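`Result::is_ok_and` makes the check one expression: an unset `VERBOSE` yields `Err`, an empty value fails the closure, and anything else skips truncation. A runnable sketch of the same gating, with the ellipsis handling assumed rather than copied from the rest of the function:

fn truncate(assertion: &str, max_width: usize) -> String {
    // Unset or empty VERBOSE -> truncate as usual; any non-empty value disables it.
    let is_verbose = std::env::var("VERBOSE").is_ok_and(|v| !v.is_empty());

    if assertion.len() <= max_width || is_verbose {
        assertion.to_string()
    } else {
        // Assumed tail behavior: keep max_width - 1 bytes, then append an ellipsis,
        // backing up to a char boundary so the slice stays valid UTF-8.
        let mut end_ix = max_width - 1;
        while !assertion.is_char_boundary(end_ix) {
            end_ix -= 1;
        }
        format!("{}…", &assertion[..end_ix])
    }
}

fn main() {
    // Try running with VERBOSE=1 to see the full string regardless of width.
    println!("{}", truncate("a fairly long assertion message", 12));
}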
@@ -6,7 +6,7 @@ mod ids;
 mod instance;
 mod tool_metrics;
 
-use assertions::display_error_row;
+use assertions::{AssertionsReport, display_error_row};
 use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
 pub(crate) use tool_metrics::*;
 
@@ -467,11 +467,12 @@ pub fn find_model(
     match matching_models.as_slice() {
         [model] => Ok(model.clone()),
         [] => Err(anyhow!(
-            "No language model with ID {} was available. Available models: {}",
+            "No language model with ID {}/{} was available. Available models: {}",
+            provider_id,
             model_id,
             model_registry
                 .available_models(cx)
-                .map(|model| model.id().0.clone())
+                .map(|model| format!("{}/{}", model.provider_id().0, model.id().0))
                 .collect::<Vec<_>>()
                 .join(", ")
         )),
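Since two providers can expose models with the same ID, the message now prints fully qualified `provider/model` pairs on both sides. A small sketch of just the formatting; the provider and model names here are made up:

fn main() {
    // Made-up (provider_id, model_id) pairs standing in for registry entries.
    let available = [("anthropic", "claude-sonnet"), ("openai", "gpt-4o")];
    let (provider_id, model_id) = ("anthropic", "missing-model");

    let listing = available
        .iter()
        .map(|(provider, model)| format!("{}/{}", provider, model))
        .collect::<Vec<_>>()
        .join(", ");

    eprintln!(
        "No language model with ID {}/{} was available. Available models: {}",
        provider_id, model_id, listing
    );
}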
@@ -581,12 +582,15 @@ fn print_report(
         Err(err) => {
             display_error_row(&mut table_rows, example.repetition, err.to_string())?;
             error_count += 1;
+            programmatic_scores.push(0.0);
+            diff_scores.push(0.0);
+            thread_scores.push(0.0);
         }
         Ok((run_output, judge_output)) => {
             cumulative_tool_metrics.merge(&run_output.tool_metrics);
             example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
 
-            if !run_output.programmatic_assertions.total_count() > 0 {
+            if run_output.programmatic_assertions.total_count() > 0 {
                 for assertion in &run_output.programmatic_assertions.ran {
                     assertions::display_table_row(
                         &mut table_rows,
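Pushing explicit 0.0 scores for the errored repetition is what fixes the aggregate: a skipped run previously left the mean computed over fewer samples, inflating it. A quick arithmetic sketch of the difference:

fn mean(scores: &[f64]) -> f64 {
    if scores.is_empty() {
        return 0.0;
    }
    scores.iter().sum::<f64>() / scores.len() as f64
}

fn main() {
    // Three repetitions succeed, a fourth errors out.
    let old_behavior = [1.0, 0.75, 1.0];      // errored run silently skipped
    let new_behavior = [1.0, 0.75, 1.0, 0.0]; // errored run counted as 0.0

    println!("old mean: {:.2}", mean(&old_behavior)); // 0.92
    println!("new mean: {:.2}", mean(&new_behavior)); // 0.69
}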
@@ -626,6 +630,8 @@ fn print_report(
         }
     }
 
+    let mut all_asserts = Vec::new();
+
     if !table_rows.is_empty() {
         assertions::print_table_header();
         print!("{}", table_rows);
@@ -634,33 +640,29 @@ fn print_report(
 
         for (example, result) in results.iter() {
             if let Ok((run_output, judge_output)) = result {
+                let asserts = [
+                    run_output.programmatic_assertions.clone(),
+                    judge_output.diff.clone(),
+                    judge_output.thread.clone(),
+                ];
+                all_asserts.extend_from_slice(&asserts);
                 assertions::print_table_round_summary(
                     &example.repetition.to_string(),
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter(),
+                    asserts.iter(),
+                )
+            } else if let Err(err) = result {
+                let assert = AssertionsReport::error(err.to_string());
+                all_asserts.push(assert.clone());
+                assertions::print_table_round_summary(
+                    &example.repetition.to_string(),
+                    [assert].iter(),
                 )
             }
         }
 
         assertions::print_table_divider();
 
-        assertions::print_table_round_summary(
-            "avg",
-            results.iter().flat_map(|(_, result)| {
-                result.iter().flat_map(|(run_output, judge_output)| {
-                    [
-                        &run_output.programmatic_assertions,
-                        &judge_output.diff,
-                        &judge_output.thread,
-                    ]
-                    .into_iter()
-                })
-            }),
-        );
+        assertions::print_table_round_summary("avg", all_asserts.iter());
 
         assertions::print_table_footer();
     }
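The round-summary refactor collects every report into one `all_asserts` vector, so the final "avg" row is computed from exactly the rows printed above it, errored rounds included. A condensed sketch of the control flow with simplified report types:

#[derive(Clone)]
struct Report {
    passed: usize,
    total: usize,
}

fn print_round_summary(label: &str, reports: &[Report]) {
    let passed: usize = reports.iter().map(|r| r.passed).sum();
    let total: usize = reports.iter().map(|r| r.total).sum();
    println!("{label}: {passed}/{total} passed");
}

fn main() {
    // One successful round and one errored round (simplified stand-ins).
    let results: Vec<Result<Report, String>> = vec![
        Ok(Report { passed: 3, total: 4 }),
        Err("LLM API format mismatch".into()),
    ];

    let mut all_asserts = Vec::new();
    for (round, result) in results.iter().enumerate() {
        match result {
            Ok(report) => {
                all_asserts.push(report.clone());
                print_round_summary(&round.to_string(), &[report.clone()]);
            }
            Err(_err) => {
                // Errored rounds contribute a synthetic 0-of-1 failing report.
                let failing = Report { passed: 0, total: 1 };
                all_asserts.push(failing.clone());
                print_round_summary(&round.to_string(), &[failing]);
            }
        }
    }
    print_round_summary("avg", &all_asserts);
}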