eval: Add HTML overview for evaluation runs (#29413)

This update generates a single self-contained .html file that shows an
overview of evaluation threads in the browser. It's useful for:

- Quickly reviewing results
- Sharing evaluation runs
- Debugging
- Comparing models (TBD)

Features:

- Export thread JSON from the UI
- Keyboard navigation (j/k or Ctrl + ←/→)
- Toggle between compact and full views

Generating the overview:

- `cargo run -p eval` will write this file in the run dir's root.
- Or you can call `cargo run -p eval --bin explorer` to generate it
without running evals.


Screenshot:

![image](https://github.com/user-attachments/assets/4ead71f6-da08-48ea-8fcb-2148d2e4b4db)


Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2025-04-25 17:49:05 +03:00 committed by GitHub
parent f106dfca42
commit 3389327df5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 1351 additions and 149 deletions

1
Cargo.lock generated
View file

@ -4983,6 +4983,7 @@ dependencies = [
"language_models",
"languages",
"node_runtime",
"pathdiff",
"paths",
"project",
"prompt_store",

View file

@ -3,6 +3,7 @@ name = "eval"
version = "0.1.0"
publish.workspace = true
edition.workspace = true
default-run = "eval"
[dependencies]
agent.workspace = true
@ -31,6 +32,7 @@ language_model.workspace = true
language_models.workspace = true
languages = { workspace = true, features = ["load-grammars"] }
node_runtime.workspace = true
pathdiff = "0.2"
paths.workspace = true
project.workspace = true
prompt_store.workspace = true
@ -48,9 +50,14 @@ unindent.workspace = true
util.workspace = true
uuid = { version = "1.6", features = ["v4"] }
workspace-hack.workspace = true
[[bin]]
name = "eval"
path = "src/eval.rs"
[[bin]]
name = "explorer"
path = "src/explorer.rs"
[lints]
workspace = true

View file

@ -5,3 +5,21 @@ This eval assumes the working directory is the root of the repository. Run it wi
```sh
cargo run -p eval
```
## Explorer Tool
The explorer tool generates a self-contained HTML view from one or more thread
JSON files. It provides a visual interface to explore agent threads, including
tool calls and results. See [./docs/explorer.md](./docs/explorer.md) for more details.
### Usage
```sh
cargo run -p eval --bin explorer -- --input <path-to-json-files> --output <output-html-path>
```
Example:
```sh
cargo run -p eval --bin explorer -- --input ./runs/2025-04-23_15-53-30/fastmcp_bugifx/*/last.messages.json --output /tmp/explorer.html
```

View file

@ -0,0 +1,27 @@
# Explorer
Threads Explorer is a single self-contained HTML file that gives an overview of
evaluation runs, while allowing for some interactivity.
When you open a file, it gives you a _thread overview_, which looks like this:
| Turn | Text | Tool | Result |
| ---- | ------------------------------------ | -------------------------------------------- | --------------------------------------------- |
| 1 | [User]: | | |
| | Fix the bug: kwargs not passed... | | |
| 2 | I'll help you fix that bug. | **list_directory**(path="fastmcp") | `fastmcp/src [...]` |
| | | | |
| 3 | Let's examine the code. | **read_file**(path="fastmcp/main.py", [...]) | `def run_application(app, \*\*kwargs): [...]` |
| 4 | I found the issue. | **edit_file**(path="fastmcp/core.py", [...]) | `Made edit to fastmcp/core.py` |
| 5 | Let's check if there are any errors. | **diagnostics**() | `No errors found` |
### Implementation details
`src/explorer.html` contains the template. You can open this template in a
browser as is, and it will show some dummy values. But the main use is to set
the `threadsData` variable with real data, which then will be used instead of
the dummy values.
`src/explorer.rs` takes one or more JSON files as generated by `cargo run -p
eval`, and outputs an HTML file for rendering these threads. Refer to the dummy
data in `explorer.html` for a sample of the expected format.

View file

@ -1,6 +1,7 @@
mod assertions;
mod example;
mod examples;
mod explorer;
mod ids;
mod instance;
mod tool_metrics;
@ -305,155 +306,11 @@ fn main() {
}))
.await;
print_h1("EVAL RESULTS");
let mut diff_scores = Vec::new();
let mut thread_scores = Vec::new();
let mut programmatic_scores = Vec::new();
let mut error_count = 0;
for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
print_h2(&example_name);
results.sort_unstable_by_key(|(example, _)| example.repetition);
let mut example_cumulative_tool_metrics = ToolMetrics::default();
let mut table_rows = String::new();
for (example, result) in results.iter() {
match result {
Err(err) => {
display_error_row(
&mut table_rows,
example.repetition,
err.to_string(),
)?;
error_count += 1;
}
Ok((run_output, judge_output)) => {
cumulative_tool_metrics.merge(&run_output.tool_metrics);
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
if !run_output.programmatic_assertions.total_count() > 0 {
for assertion in &run_output.programmatic_assertions.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
programmatic_scores
.push(run_output.programmatic_assertions.passed_percentage())
}
if !judge_output.diff.is_empty() {
diff_scores.push(judge_output.diff.passed_percentage());
for assertion in &judge_output.diff.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
if !judge_output.thread.is_empty() {
thread_scores.push(judge_output.thread.passed_percentage());
for assertion in &judge_output.thread.ran {
assertions::display_table_row(
&mut table_rows,
example.repetition,
assertion,
)?;
}
}
}
}
}
if !table_rows.is_empty() {
assertions::print_table_header();
print!("{}", table_rows);
assertions::print_table_divider();
for (example, result) in results.iter() {
if let Ok((run_output, judge_output)) = result {
assertions::print_table_round_summary(
&example.repetition.to_string(),
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter(),
)
}
}
assertions::print_table_divider();
assertions::print_table_round_summary(
"avg",
results.iter().flat_map(|(_, result)| {
result.iter().flat_map(|(run_output, judge_output)| {
[
&run_output.programmatic_assertions,
&judge_output.diff,
&judge_output.thread,
]
.into_iter()
})
}),
);
assertions::print_table_footer();
}
if !example_cumulative_tool_metrics.is_empty() {
println!("{}", &example_cumulative_tool_metrics);
}
}
if results_by_example_name.borrow().len() > 1 {
print_h1("AGGREGATE");
if error_count > 0 {
println!("\n{error_count} examples failed to run!");
}
let programmatic_score_count = programmatic_scores.len();
if programmatic_score_count > 0 {
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
/ (programmatic_score_count as f32))
.floor();
println!("Average programmatic score: {average_programmatic_score}%");
}
let diff_score_count = diff_scores.len();
if diff_score_count > 0 {
let average_diff_score =
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
println!("Average diff score: {average_diff_score}%");
}
let thread_score_count = thread_scores.len();
if thread_score_count > 0 {
let average_thread_score = (thread_scores.into_iter().sum::<f32>()
/ (thread_score_count as f32))
.floor();
println!("Average thread score: {average_thread_score}%");
}
println!("");
print_h2("CUMULATIVE TOOL METRICS");
println!("{}", cumulative_tool_metrics);
}
print_report(
&mut results_by_example_name.borrow_mut(),
&mut cumulative_tool_metrics,
&run_dir,
)?;
app_state.client.telemetry().flush_events().await;
@ -670,3 +527,175 @@ fn print_h2(header: &str) {
println!("{:^HEADER_WIDTH$}", header);
println!("{:-^HEADER_WIDTH$}\n", "");
}
/// Prints the final evaluation report: a per-example assertion table and tool
/// metrics, aggregate scores across all examples, and finally an HTML overview
/// (`overview.html`) written into `run_dir` via the `explorer` module.
///
/// `cumulative_tool_metrics` is updated in place with the metrics of every
/// successful run. Returns an error only if formatting a table row fails;
/// failure to generate the HTML overview is reported to stderr but not fatal.
fn print_report(
    results_by_example_name: &mut HashMap<
        String,
        Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>,
    >,
    cumulative_tool_metrics: &mut ToolMetrics,
    run_dir: &Path,
) -> anyhow::Result<()> {
    print_h1("EVAL RESULTS");

    let mut diff_scores = Vec::new();
    let mut thread_scores = Vec::new();
    let mut programmatic_scores = Vec::new();
    let mut error_count = 0;

    for (example_name, results) in results_by_example_name.iter_mut() {
        print_h2(example_name);
        // Show repetitions of the same example in a stable order.
        results.sort_unstable_by_key(|(example, _)| example.repetition);
        let mut example_cumulative_tool_metrics = ToolMetrics::default();

        let mut table_rows = String::new();
        for (example, result) in results.iter() {
            match result {
                Err(err) => {
                    display_error_row(&mut table_rows, example.repetition, err.to_string())?;
                    error_count += 1;
                }
                Ok((run_output, judge_output)) => {
                    cumulative_tool_metrics.merge(&run_output.tool_metrics);
                    example_cumulative_tool_metrics.merge(&run_output.tool_metrics);

                    // Bug fix: this previously read
                    // `!run_output.programmatic_assertions.total_count() > 0`,
                    // which applies bitwise NOT to the count (`(!n) > 0`), so
                    // the branch was taken almost unconditionally.
                    if run_output.programmatic_assertions.total_count() > 0 {
                        for assertion in &run_output.programmatic_assertions.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }
                        programmatic_scores
                            .push(run_output.programmatic_assertions.passed_percentage())
                    }

                    if !judge_output.diff.is_empty() {
                        diff_scores.push(judge_output.diff.passed_percentage());
                        for assertion in &judge_output.diff.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }
                    }

                    if !judge_output.thread.is_empty() {
                        thread_scores.push(judge_output.thread.passed_percentage());
                        for assertion in &judge_output.thread.ran {
                            assertions::display_table_row(
                                &mut table_rows,
                                example.repetition,
                                assertion,
                            )?;
                        }
                    }
                }
            }
        }

        if !table_rows.is_empty() {
            assertions::print_table_header();
            print!("{}", table_rows);
            assertions::print_table_divider();

            // Per-repetition summary rows.
            for (example, result) in results.iter() {
                if let Ok((run_output, judge_output)) = result {
                    assertions::print_table_round_summary(
                        &example.repetition.to_string(),
                        [
                            &run_output.programmatic_assertions,
                            &judge_output.diff,
                            &judge_output.thread,
                        ]
                        .into_iter(),
                    )
                }
            }

            assertions::print_table_divider();

            // Average across all successful repetitions of this example.
            assertions::print_table_round_summary(
                "avg",
                results.iter().flat_map(|(_, result)| {
                    result.iter().flat_map(|(run_output, judge_output)| {
                        [
                            &run_output.programmatic_assertions,
                            &judge_output.diff,
                            &judge_output.thread,
                        ]
                        .into_iter()
                    })
                }),
            );

            assertions::print_table_footer();
        }

        if !example_cumulative_tool_metrics.is_empty() {
            println!("{}", &example_cumulative_tool_metrics);
        }
    }

    // Aggregate section is only meaningful with more than one example.
    if results_by_example_name.len() > 1 {
        print_h1("AGGREGATE");

        if error_count > 0 {
            println!("\n{error_count} examples failed to run!");
        }

        let programmatic_score_count = programmatic_scores.len();
        if programmatic_score_count > 0 {
            let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
                / (programmatic_score_count as f32))
                .floor();
            println!("Average programmatic score: {average_programmatic_score}%");
        }

        let diff_score_count = diff_scores.len();
        if diff_score_count > 0 {
            let average_diff_score =
                (diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
            println!("Average diff score: {average_diff_score}%");
        }

        let thread_score_count = thread_scores.len();
        if thread_score_count > 0 {
            let average_thread_score =
                (thread_scores.into_iter().sum::<f32>() / (thread_score_count as f32)).floor();
            println!("Average thread score: {average_thread_score}%");
        }

        println!();
        print_h2("CUMULATIVE TOOL METRICS");
        println!("{}", cumulative_tool_metrics);
    }

    // Collect every repetition's message log, preferring paths relative to the
    // run dir so the generated HTML stays portable alongside the run outputs.
    let explorer_output_path = run_dir.join("overview.html");
    let mut json_paths: Vec<PathBuf> = results_by_example_name
        .values()
        .flat_map(|results| {
            results.iter().map(|(example, _)| {
                let absolute_path = example.run_directory.join("last.messages.json");
                pathdiff::diff_paths(&absolute_path, run_dir).unwrap_or(absolute_path)
            })
        })
        .collect();
    json_paths.sort();

    // Best-effort: a broken overview should not fail the whole eval run.
    if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) {
        eprintln!("Failed to generate explorer HTML: {}", err);
    }

    Ok(())
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,75 @@
use anyhow::{Context, Result, anyhow};
use clap::Parser;
use serde_json::{Value, json};
use std::fs;
use std::path::{Path, PathBuf};
// Command-line arguments for the standalone `explorer` binary
// (`cargo run -p eval --bin explorer`). Parsed with clap's derive API;
// the `about` text is shown by `--help`.
#[derive(Parser, Debug)]
#[clap(about = "Generate HTML explorer from JSON thread files")]
struct Args {
    /// Paths to JSON files containing thread data
    #[clap(long, required = true, num_args = 1..)]
    input: Vec<PathBuf>,
    /// Path where the HTML explorer file will be written
    #[clap(long)]
    output: PathBuf,
}
pub fn generate_explorer_html(inputs: &[PathBuf], output: &PathBuf) -> Result<String> {
if let Some(parent) = output.parent() {
if !parent.exists() {
fs::create_dir_all(parent).context(format!(
"Failed to create output directory: {}",
parent.display()
))?;
}
}
let template_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html");
let template = fs::read_to_string(&template_path).context(format!(
"Template file not found or couldn't be read: {}",
template_path.display()
))?;
let threads = inputs
.iter()
.map(|input_path| {
let mut thread_data: Value = fs::read_to_string(input_path)
.context(format!("Failed to read file: {}", input_path.display()))?
.parse::<Value>()
.context(format!("Failed to parse JSON: {}", input_path.display()))?;
thread_data["filename"] = json!(input_path); // This will be shown in a thread heading
Ok(thread_data)
})
.collect::<Result<Vec<_>>>()?;
let all_threads = json!({ "threads": threads });
let html_content = inject_thread_data(template, all_threads)?;
fs::write(&output, &html_content)
.context(format!("Failed to write output: {}", output.display()))?;
println!("Saved {} thread(s) to {}", threads.len(), output.display());
Ok(html_content)
}
/// Splices `threads_data` into the HTML `template` by replacing the template's
/// dummy-data assignment with a `let threadsData = <json>;` statement.
///
/// # Errors
/// Fails if the template does not contain the expected injection marker, or if
/// the thread data can't be serialized to JSON.
fn inject_thread_data(template: String, threads_data: Value) -> Result<String> {
    // This exact line in explorer.html marks where real data gets injected.
    const MARKER: &str = "let threadsData = window.threadsData || { threads: [dummyThread] };";

    if !template.contains(MARKER) {
        return Err(anyhow!(
            "Could not find the thread injection point in the template"
        ));
    }

    let serialized = serde_json::to_string_pretty(&threads_data)
        .context("Failed to serialize threads data to JSON")?;
    let replacement = format!("let threadsData = {};", serialized);

    // Replace only the first occurrence, matching the original behavior.
    Ok(template.replacen(MARKER, &replacement, 1))
}
// Entry point for the `explorer` binary: parse CLI args and render the HTML.
// Compiled out under test/doctest builds; dead_code is allowed because the
// crate's library build never calls this.
#[cfg(not(any(test, doctest)))]
#[allow(dead_code)]
fn main() -> Result<()> {
    let args = Args::parse();
    // The generated HTML string is only needed by library callers; discard it.
    generate_explorer_html(&args.input, &args.output)?;
    Ok(())
}