eval: Add HTML overview for evaluation runs (#29413)
This update generates a single self-contained .html file that shows an overview of evaluation threads in the browser. It's useful for: - Quickly reviewing results - Sharing evaluation runs - Debugging - Comparing models (TBD) Features: - Export thread JSON from the UI - Keyboard navigation (j/k or Ctrl + ←/→) - Toggle between compact and full views Generating the overview: - `cargo run -p eval` will write this file in the run dir's root. - Or you can call `cargo run -p eval --bin explorer` to generate it without running evals. Screenshot:  Release Notes: - N/A
This commit is contained in:
parent
f106dfca42
commit
3389327df5
7 changed files with 1351 additions and 149 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -4983,6 +4983,7 @@ dependencies = [
|
|||
"language_models",
|
||||
"languages",
|
||||
"node_runtime",
|
||||
"pathdiff",
|
||||
"paths",
|
||||
"project",
|
||||
"prompt_store",
|
||||
|
|
|
@ -3,6 +3,7 @@ name = "eval"
|
|||
version = "0.1.0"
|
||||
publish.workspace = true
|
||||
edition.workspace = true
|
||||
default-run = "eval"
|
||||
|
||||
[dependencies]
|
||||
agent.workspace = true
|
||||
|
@ -31,6 +32,7 @@ language_model.workspace = true
|
|||
language_models.workspace = true
|
||||
languages = { workspace = true, features = ["load-grammars"] }
|
||||
node_runtime.workspace = true
|
||||
pathdiff = "0.2"
|
||||
paths.workspace = true
|
||||
project.workspace = true
|
||||
prompt_store.workspace = true
|
||||
|
@ -48,9 +50,14 @@ unindent.workspace = true
|
|||
util.workspace = true
|
||||
uuid = { version = "1.6", features = ["v4"] }
|
||||
workspace-hack.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "eval"
|
||||
path = "src/eval.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "explorer"
|
||||
path = "src/explorer.rs"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
|
|
@ -5,3 +5,21 @@ This eval assumes the working directory is the root of the repository. Run it wi
|
|||
```sh
|
||||
cargo run -p eval
|
||||
```
|
||||
|
||||
## Explorer Tool
|
||||
|
||||
The explorer tool generates a self-contained HTML view from one or more thread
JSON files. It provides a visual interface to explore the agent thread, including
tool calls and results. See [./docs/explorer.md](./docs/explorer.md) for more details.
|
||||
|
||||
### Usage
|
||||
|
||||
```sh
|
||||
cargo run -p eval --bin explorer -- --input <path-to-json-files> --output <output-html-path>
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```sh
|
||||
cargo run -p eval --bin explorer -- --input ./runs/2025-04-23_15-53-30/fastmcp_bugifx/*/last.messages.json --output /tmp/explorer.html
|
||||
```
|
||||
|
|
27
crates/eval/docs/explorer.md
Normal file
27
crates/eval/docs/explorer.md
Normal file
|
@ -0,0 +1,27 @@
|
|||
# Explorer
|
||||
|
||||
Threads Explorer is a single self-contained HTML file that gives an overview of
|
||||
evaluation runs, while allowing for some interactivity.
|
||||
|
||||
When you open a file, it gives you a _thread overview_, which looks like this:
|
||||
|
||||
| Turn | Text | Tool | Result |
|
||||
| ---- | ------------------------------------ | -------------------------------------------- | --------------------------------------------- |
|
||||
| 1 | [User]: | | |
|
||||
| | Fix the bug: kwargs not passed... | | |
|
||||
| 2 | I'll help you fix that bug. | **list_directory**(path="fastmcp") | `fastmcp/src [...]` |
|
||||
| | | | |
|
||||
| 3 | Let's examine the code. | **read_file**(path="fastmcp/main.py", [...]) | `def run_application(app, \*\*kwargs): [...]` |
|
||||
| 4 | I found the issue. | **edit_file**(path="fastmcp/core.py", [...]) | `Made edit to fastmcp/core.py` |
|
||||
| 5 | Let's check if there are any errors. | **diagnostics**() | `No errors found` |
|
||||
|
||||
### Implementation details
|
||||
|
||||
`src/explorer.html` contains the template. You can open this template in a
|
||||
browser as is, and it will show some dummy values. But the main use is to set
|
||||
the `threadsData` variable with real data, which then will be used instead of
|
||||
the dummy values.
|
||||
|
||||
`src/explorer.rs` takes one or more JSON files as generated by `cargo run -p
eval`, and outputs an HTML file for rendering these threads. Refer to the dummy
data in `explorer.html` for a sample of the expected format.
|
|
@ -1,6 +1,7 @@
|
|||
mod assertions;
|
||||
mod example;
|
||||
mod examples;
|
||||
mod explorer;
|
||||
mod ids;
|
||||
mod instance;
|
||||
mod tool_metrics;
|
||||
|
@ -305,155 +306,11 @@ fn main() {
|
|||
}))
|
||||
.await;
|
||||
|
||||
print_h1("EVAL RESULTS");
|
||||
|
||||
let mut diff_scores = Vec::new();
|
||||
let mut thread_scores = Vec::new();
|
||||
let mut programmatic_scores = Vec::new();
|
||||
let mut error_count = 0;
|
||||
|
||||
for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
|
||||
print_h2(&example_name);
|
||||
|
||||
results.sort_unstable_by_key(|(example, _)| example.repetition);
|
||||
let mut example_cumulative_tool_metrics = ToolMetrics::default();
|
||||
|
||||
let mut table_rows = String::new();
|
||||
|
||||
for (example, result) in results.iter() {
|
||||
match result {
|
||||
Err(err) => {
|
||||
display_error_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
err.to_string(),
|
||||
)?;
|
||||
error_count += 1;
|
||||
}
|
||||
Ok((run_output, judge_output)) => {
|
||||
cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||
|
||||
if !run_output.programmatic_assertions.total_count() > 0 {
|
||||
for assertion in &run_output.programmatic_assertions.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
|
||||
programmatic_scores
|
||||
.push(run_output.programmatic_assertions.passed_percentage())
|
||||
}
|
||||
|
||||
if !judge_output.diff.is_empty() {
|
||||
diff_scores.push(judge_output.diff.passed_percentage());
|
||||
|
||||
for assertion in &judge_output.diff.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !judge_output.thread.is_empty() {
|
||||
thread_scores.push(judge_output.thread.passed_percentage());
|
||||
|
||||
for assertion in &judge_output.thread.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !table_rows.is_empty() {
|
||||
assertions::print_table_header();
|
||||
print!("{}", table_rows);
|
||||
|
||||
assertions::print_table_divider();
|
||||
|
||||
for (example, result) in results.iter() {
|
||||
if let Ok((run_output, judge_output)) = result {
|
||||
assertions::print_table_round_summary(
|
||||
&example.repetition.to_string(),
|
||||
[
|
||||
&run_output.programmatic_assertions,
|
||||
&judge_output.diff,
|
||||
&judge_output.thread,
|
||||
]
|
||||
.into_iter(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assertions::print_table_divider();
|
||||
|
||||
assertions::print_table_round_summary(
|
||||
"avg",
|
||||
results.iter().flat_map(|(_, result)| {
|
||||
result.iter().flat_map(|(run_output, judge_output)| {
|
||||
[
|
||||
&run_output.programmatic_assertions,
|
||||
&judge_output.diff,
|
||||
&judge_output.thread,
|
||||
]
|
||||
.into_iter()
|
||||
})
|
||||
}),
|
||||
);
|
||||
|
||||
assertions::print_table_footer();
|
||||
}
|
||||
|
||||
if !example_cumulative_tool_metrics.is_empty() {
|
||||
println!("{}", &example_cumulative_tool_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
if results_by_example_name.borrow().len() > 1 {
|
||||
print_h1("AGGREGATE");
|
||||
|
||||
if error_count > 0 {
|
||||
println!("\n{error_count} examples failed to run!");
|
||||
}
|
||||
|
||||
let programmatic_score_count = programmatic_scores.len();
|
||||
if programmatic_score_count > 0 {
|
||||
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
|
||||
/ (programmatic_score_count as f32))
|
||||
.floor();
|
||||
println!("Average programmatic score: {average_programmatic_score}%");
|
||||
}
|
||||
|
||||
let diff_score_count = diff_scores.len();
|
||||
if diff_score_count > 0 {
|
||||
let average_diff_score =
|
||||
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
|
||||
println!("Average diff score: {average_diff_score}%");
|
||||
}
|
||||
|
||||
let thread_score_count = thread_scores.len();
|
||||
|
||||
if thread_score_count > 0 {
|
||||
let average_thread_score = (thread_scores.into_iter().sum::<f32>()
|
||||
/ (thread_score_count as f32))
|
||||
.floor();
|
||||
println!("Average thread score: {average_thread_score}%");
|
||||
}
|
||||
|
||||
println!("");
|
||||
|
||||
print_h2("CUMULATIVE TOOL METRICS");
|
||||
println!("{}", cumulative_tool_metrics);
|
||||
}
|
||||
print_report(
|
||||
&mut results_by_example_name.borrow_mut(),
|
||||
&mut cumulative_tool_metrics,
|
||||
&run_dir,
|
||||
)?;
|
||||
|
||||
app_state.client.telemetry().flush_events().await;
|
||||
|
||||
|
@ -670,3 +527,175 @@ fn print_h2(header: &str) {
|
|||
println!("{:^HEADER_WIDTH$}", header);
|
||||
println!("{:-^HEADER_WIDTH$}\n", "");
|
||||
}
|
||||
|
||||
fn print_report(
|
||||
results_by_example_name: &mut HashMap<
|
||||
String,
|
||||
Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
|
||||
>,
|
||||
cumulative_tool_metrics: &mut ToolMetrics,
|
||||
run_dir: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
print_h1("EVAL RESULTS");
|
||||
|
||||
let mut diff_scores = Vec::new();
|
||||
let mut thread_scores = Vec::new();
|
||||
let mut programmatic_scores = Vec::new();
|
||||
let mut error_count = 0;
|
||||
|
||||
for (example_name, results) in results_by_example_name.iter_mut() {
|
||||
print_h2(example_name);
|
||||
|
||||
results.sort_unstable_by_key(|(example, _)| example.repetition);
|
||||
let mut example_cumulative_tool_metrics = ToolMetrics::default();
|
||||
|
||||
let mut table_rows = String::new();
|
||||
|
||||
for (example, result) in results.iter() {
|
||||
match result {
|
||||
Err(err) => {
|
||||
display_error_row(&mut table_rows, example.repetition, err.to_string())?;
|
||||
error_count += 1;
|
||||
}
|
||||
Ok((run_output, judge_output)) => {
|
||||
cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||
|
||||
if !run_output.programmatic_assertions.total_count() > 0 {
|
||||
for assertion in &run_output.programmatic_assertions.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
|
||||
programmatic_scores
|
||||
.push(run_output.programmatic_assertions.passed_percentage())
|
||||
}
|
||||
|
||||
if !judge_output.diff.is_empty() {
|
||||
diff_scores.push(judge_output.diff.passed_percentage());
|
||||
|
||||
for assertion in &judge_output.diff.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !judge_output.thread.is_empty() {
|
||||
thread_scores.push(judge_output.thread.passed_percentage());
|
||||
|
||||
for assertion in &judge_output.thread.ran {
|
||||
assertions::display_table_row(
|
||||
&mut table_rows,
|
||||
example.repetition,
|
||||
assertion,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !table_rows.is_empty() {
|
||||
assertions::print_table_header();
|
||||
print!("{}", table_rows);
|
||||
|
||||
assertions::print_table_divider();
|
||||
|
||||
for (example, result) in results.iter() {
|
||||
if let Ok((run_output, judge_output)) = result {
|
||||
assertions::print_table_round_summary(
|
||||
&example.repetition.to_string(),
|
||||
[
|
||||
&run_output.programmatic_assertions,
|
||||
&judge_output.diff,
|
||||
&judge_output.thread,
|
||||
]
|
||||
.into_iter(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assertions::print_table_divider();
|
||||
|
||||
assertions::print_table_round_summary(
|
||||
"avg",
|
||||
results.iter().flat_map(|(_, result)| {
|
||||
result.iter().flat_map(|(run_output, judge_output)| {
|
||||
[
|
||||
&run_output.programmatic_assertions,
|
||||
&judge_output.diff,
|
||||
&judge_output.thread,
|
||||
]
|
||||
.into_iter()
|
||||
})
|
||||
}),
|
||||
);
|
||||
|
||||
assertions::print_table_footer();
|
||||
}
|
||||
|
||||
if !example_cumulative_tool_metrics.is_empty() {
|
||||
println!("{}", &example_cumulative_tool_metrics);
|
||||
}
|
||||
}
|
||||
|
||||
if results_by_example_name.len() > 1 {
|
||||
print_h1("AGGREGATE");
|
||||
|
||||
if error_count > 0 {
|
||||
println!("\n{error_count} examples failed to run!");
|
||||
}
|
||||
|
||||
let programmatic_score_count = programmatic_scores.len();
|
||||
if programmatic_score_count > 0 {
|
||||
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
|
||||
/ (programmatic_score_count as f32))
|
||||
.floor();
|
||||
println!("Average programmatic score: {average_programmatic_score}%");
|
||||
}
|
||||
|
||||
let diff_score_count = diff_scores.len();
|
||||
if diff_score_count > 0 {
|
||||
let average_diff_score =
|
||||
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
|
||||
println!("Average diff score: {average_diff_score}%");
|
||||
}
|
||||
|
||||
let thread_score_count = thread_scores.len();
|
||||
|
||||
if thread_score_count > 0 {
|
||||
let average_thread_score =
|
||||
(thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
|
||||
println!("Average thread score: {average_thread_score}%");
|
||||
}
|
||||
|
||||
println!("");
|
||||
|
||||
print_h2("CUMULATIVE TOOL METRICS");
|
||||
println!("{}", cumulative_tool_metrics);
|
||||
}
|
||||
|
||||
let explorer_output_path = run_dir.join("overview.html");
|
||||
let mut json_paths: Vec<PathBuf> = results_by_example_name
|
||||
.values()
|
||||
.flat_map(|results| {
|
||||
results.iter().map(|(example, _)| {
|
||||
let absolute_path = example.run_directory.join("last.messages.json");
|
||||
pathdiff::diff_paths(&absolute_path, run_dir)
|
||||
.unwrap_or_else(|| absolute_path.clone())
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
json_paths.sort();
|
||||
if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
|
||||
eprintln!("Failed to generate explorer HTML: {}", err);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
1045
crates/eval/src/explorer.html
Normal file
1045
crates/eval/src/explorer.html
Normal file
File diff suppressed because it is too large
Load diff
75
crates/eval/src/explorer.rs
Normal file
75
crates/eval/src/explorer.rs
Normal file
|
@ -0,0 +1,75 @@
|
|||
use anyhow::{Context, Result, anyhow};
|
||||
use clap::Parser;
|
||||
use serde_json::{Value, json};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
// CLI arguments for the standalone `explorer` binary
// (`cargo run -p eval --bin explorer -- --input <json>... --output <html>`).
// NOTE: the `///` field docs below double as clap help text; keep them in sync
// with the README usage section.
#[derive(Parser, Debug)]
#[clap(about = "Generate HTML explorer from JSON thread files")]
struct Args {
    /// Paths to JSON files containing thread data
    // `num_args = 1..` lets a single `--input` flag accept multiple paths,
    // e.g. an expanded shell glob like `runs/*/last.messages.json`.
    #[clap(long, required = true, num_args = 1..)]
    input: Vec<PathBuf>,

    /// Path where the HTML explorer file will be written
    #[clap(long)]
    output: PathBuf,
}
|
||||
|
||||
pub fn generate_explorer_html(inputs: &[PathBuf], output: &PathBuf) -> Result<String> {
|
||||
if let Some(parent) = output.parent() {
|
||||
if !parent.exists() {
|
||||
fs::create_dir_all(parent).context(format!(
|
||||
"Failed to create output directory: {}",
|
||||
parent.display()
|
||||
))?;
|
||||
}
|
||||
}
|
||||
|
||||
let template_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html");
|
||||
let template = fs::read_to_string(&template_path).context(format!(
|
||||
"Template file not found or couldn't be read: {}",
|
||||
template_path.display()
|
||||
))?;
|
||||
|
||||
let threads = inputs
|
||||
.iter()
|
||||
.map(|input_path| {
|
||||
let mut thread_data: Value = fs::read_to_string(input_path)
|
||||
.context(format!("Failed to read file: {}", input_path.display()))?
|
||||
.parse::<Value>()
|
||||
.context(format!("Failed to parse JSON: {}", input_path.display()))?;
|
||||
thread_data["filename"] = json!(input_path); // This will be shown in a thread heading
|
||||
Ok(thread_data)
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
let all_threads = json!({ "threads": threads });
|
||||
let html_content = inject_thread_data(template, all_threads)?;
|
||||
fs::write(&output, &html_content)
|
||||
.context(format!("Failed to write output: {}", output.display()))?;
|
||||
|
||||
println!("Saved {} thread(s) to {}", threads.len(), output.display());
|
||||
Ok(html_content)
|
||||
}
|
||||
|
||||
fn inject_thread_data(template: String, threads_data: Value) -> Result<String> {
|
||||
let injection_marker = "let threadsData = window.threadsData || { threads: [dummyThread] };";
|
||||
template
|
||||
.find(injection_marker)
|
||||
.ok_or_else(|| anyhow!("Could not find the thread injection point in the template"))?;
|
||||
|
||||
let threads_json = serde_json::to_string_pretty(&threads_data)
|
||||
.context("Failed to serialize threads data to JSON")?;
|
||||
let script_injection = format!("let threadsData = {};", threads_json);
|
||||
let final_html = template.replacen(injection_marker, &script_injection, 1);
|
||||
|
||||
Ok(final_html)
|
||||
}
|
||||
|
||||
#[cfg(not(any(test, doctest)))]
|
||||
#[allow(dead_code)]
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
generate_explorer_html(&args.input, &args.output).map(|_| ())
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue