eval: Add HTML overview for evaluation runs (#29413)

This change makes eval runs generate a single, self-contained .html file that shows an
overview of evaluation threads in the browser. It's useful for:

- Quickly reviewing results
- Sharing evaluation runs
- Debugging
- Comparing models (TBD)

Features:

- Export thread JSON from the UI
- Keyboard navigation (j/k or Ctrl + ←/→)
- Toggle between compact and full views

Generating the overview:

- `cargo run -p eval` writes this file to the root of the run directory.
- Alternatively, `cargo run -p eval --bin explorer` generates it without
running evals (a rough sketch of this flow is shown below).
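
For illustration, here is a minimal sketch of that standalone flow. The argument
handling is an assumption (the real `explorer` binary's CLI may differ);
`generate_explorer_html` is the helper this change adds in `explorer.rs`.

```rust
// Sketch only: the CLI shape here is assumed, not the real `--bin explorer` interface.
use std::path::PathBuf;

mod explorer; // the module added in this change (explorer.rs)

fn main() {
    // Treat every argument except the last as a thread JSON file,
    // and the last one as the output overview .html path.
    let mut args: Vec<PathBuf> = std::env::args_os().skip(1).map(PathBuf::from).collect();
    let output = args
        .pop()
        .expect("usage: explorer <thread.json>... <overview.html>");
    // Same helper the eval binary calls at the end of a run (see the diff below).
    if let Err(err) = explorer::generate_explorer_html(&args, &output) {
        eprintln!("Failed to generate explorer HTML: {}", err);
    }
}
```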


Screenshot:

![image](https://github.com/user-attachments/assets/4ead71f6-da08-48ea-8fcb-2148d2e4b4db)


Release Notes:

- N/A
Oleksiy Syvokon 2025-04-25 17:49:05 +03:00 committed by GitHub
parent f106dfca42
commit 3389327df5
7 changed files with 1351 additions and 149 deletions

@@ -1,6 +1,7 @@
mod assertions;
mod example;
mod examples;
mod explorer;
mod ids;
mod instance;
mod tool_metrics;
@@ -305,155 +306,11 @@ fn main() {
}))
.await;
print_h1("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut programmatic_scores = Vec::new();
let mut error_count = 0;
for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
print_h2(&example_name);
results.sort_unstable_by_key(|(example, _)| example.repetition);
let mut example_cumulative_tool_metrics = ToolMetrics::default();
let mut table_rows = String::new();
for (example, result) in results.iter() {
match result {
Err(err) => {
display_error_row(
&mut table_rows,
example.repetition,
err.to_string(),
)?;
error_count += 1;
}
Ok((run_output, judge_output)) => {
cumulative_tool_metrics.merge(&run_output.tool_metrics);
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
if run_output.programmatic_assertions.total_count() > 0 {
for assertion in &run_output.programmatic_assertions.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
programmatic_scores
.push(run_output.programmatic_assertions.passed_percentage())
}
if !judge_output.diff.is_empty() {
diff_scores.push(judge_output.diff.passed_percentage());
for assertion in &judge_output.diff.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
if !judge_output.thread.is_empty() {
thread_scores.push(judge_output.thread.passed_percentage());
for assertion in &judge_output.thread.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
}
}
}
if !table_rows.is_empty() {
assertions::print_table_header();
print!("{}", table_rows);
assertions::print_table_divider();
for (example, result) in results.iter() {
if let Ok((run_output, judge_output)) = result {
assertions::print_table_round_summary(
&example.repetition.to_string(),
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter(),
)
}
}
assertions::print_table_divider();
assertions::print_table_round_summary(
"avg",
results.iter().flat_map(|(_, result)| {
result.iter().flat_map(|(run_output, judge_output)| {
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter()
})
}),
);
assertions::print_table_footer();
}
if !example_cumulative_tool_metrics.is_empty() {
println!("{}", &example_cumulative_tool_metrics);
}
}
if results_by_example_name.borrow().len() > 1 {
print_h1("AGGREGATE");
if error_count > 0 {
println!("\n{error_count} examples failed to run!");
}
let programmatic_score_count = programmatic_scores.len();
if programmatic_score_count > 0 {
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
/ (programmatic_score_count as f32))
.floor();
println!("Average programmatic score: {average_programmatic_score}%");
}
let diff_score_count = diff_scores.len();
if diff_score_count > 0 {
let average_diff_score =
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
println!("Average diff score: {average_diff_score}%");
}
let thread_score_count = thread_scores.len();
if thread_score_count > 0 {
let average_thread_score = (thread_scores.into_iter().sum::<f32>()
/ (thread_score_count as f32))
.floor();
println!("Average thread score: {average_thread_score}%");
}
println!("");
print_h2("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
}
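// Print per-example tables and aggregate scores, then write the HTML overview for this run.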
print_report(
&mut results_by_example_name.borrow_mut(),
&mut cumulative_tool_metrics,
&run_dir,
)?;
app_state.client.telemetry().flush_events().await;
@@ -670,3 +527,175 @@ fn print_h2(header: &str) {
println!("{:^HEADER_WIDTH$}", header);
println!("{:-^HEADER_WIDTH$}\n", "");
}
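/// Prints per-example assertion tables and aggregate scores, reports cumulative
/// tool metrics, and finally generates an HTML overview of the run in `run_dir`.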
fn print_report(
results_by_example_name: &mut HashMap<
String,
Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
>,
cumulative_tool_metrics: &mut ToolMetrics,
run_dir: &Path,
) -> anyhow::Result<()> {
print_h1("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut programmatic_scores = Vec::new();
let mut error_count = 0;
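// One section per example: a table of assertion results across its repetitions.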
for (example_name, results) in results_by_example_name.iter_mut() {
print_h2(example_name);
results.sort_unstable_by_key(|(example, _)| example.repetition);
let mut example_cumulative_tool_metrics = ToolMetrics::default();
let mut table_rows = String::new();
for (example, result) in results.iter() {
match result {
Err(err) => {
display_error_row(&mut table_rows, example.repetition, err.to_string())?;
error_count += 1;
}
Ok((run_output, judge_output)) => {
cumulative_tool_metrics.merge(&run_output.tool_metrics);
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
if run_output.programmatic_assertions.total_count() > 0 {
for assertion in &run_output.programmatic_assertions.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
programmatic_scores
.push(run_output.programmatic_assertions.passed_percentage())
}
if !judge_output.diff.is_empty() {
diff_scores.push(judge_output.diff.passed_percentage());
for assertion in &judge_output.diff.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
if !judge_output.thread.is_empty() {
thread_scores.push(judge_output.thread.passed_percentage());
for assertion in &judge_output.thread.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
}
}
}
if !table_rows.is_empty() {
assertions::print_table_header();
print!("{}", table_rows);
assertions::print_table_divider();
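// Per-repetition summary rows, followed by an overall average row.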
for (example, result) in results.iter() {
if let Ok((run_output, judge_output)) = result {
assertions::print_table_round_summary(
&example.repetition.to_string(),
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter(),
)
}
}
assertions::print_table_divider();
assertions::print_table_round_summary(
"avg",
results.iter().flat_map(|(_, result)| {
result.iter().flat_map(|(run_output, judge_output)| {
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter()
})
}),
);
assertions::print_table_footer();
}
if !example_cumulative_tool_metrics.is_empty() {
println!("{}", &example_cumulative_tool_metrics);
}
}
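// Print aggregate averages only when more than one example was evaluated.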
if results_by_example_name.len() > 1 {
print_h1("AGGREGATE");
if error_count > 0 {
println!("\n{error_count} examples failed to run!");
}
let programmatic_score_count = programmatic_scores.len();
if programmatic_score_count > 0 {
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
/ (programmatic_score_count as f32))
.floor();
println!("Average programmatic score: {average_programmatic_score}%");
}
let diff_score_count = diff_scores.len();
if diff_score_count > 0 {
let average_diff_score =
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
println!("Average diff score: {average_diff_score}%");
}
let thread_score_count = thread_scores.len();
if thread_score_count > 0 {
let average_thread_score =
(thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
println!("Average thread score: {average_thread_score}%");
}
println!("");
print_h2("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
}
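// Collect each example's last.messages.json (relative to the run directory where possible)
// and generate overview.html from them.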
let explorer_output_path = run_dir.join("overview.html");
let mut json_paths: Vec<PathBuf> = results_by_example_name
.values()
.flat_map(|results| {
results.iter().map(|(example, _)| {
let absolute_path = example.run_directory.join("last.messages.json");
pathdiff::diff_paths(&absolute_path, run_dir)
.unwrap_or_else(|| absolute_path.clone())
})
})
.collect::<Vec<_>>();
json_paths.sort();
if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
eprintln!("Failed to generate explorer HTML: {}", err);
}
Ok(())
}