diff --git a/Cargo.lock b/Cargo.lock index cc00d4b4df..f06b861591 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4983,6 +4983,7 @@ dependencies = [ "language_models", "languages", "node_runtime", + "pathdiff", "paths", "project", "prompt_store", diff --git a/crates/eval/Cargo.toml b/crates/eval/Cargo.toml index 0046ca50a9..6c754bf7ff 100644 --- a/crates/eval/Cargo.toml +++ b/crates/eval/Cargo.toml @@ -3,6 +3,7 @@ name = "eval" version = "0.1.0" publish.workspace = true edition.workspace = true +default-run = "eval" [dependencies] agent.workspace = true @@ -31,6 +32,7 @@ language_model.workspace = true language_models.workspace = true languages = { workspace = true, features = ["load-grammars"] } node_runtime.workspace = true +pathdiff = "0.2" paths.workspace = true project.workspace = true prompt_store.workspace = true @@ -48,9 +50,14 @@ unindent.workspace = true util.workspace = true uuid = { version = "1.6", features = ["v4"] } workspace-hack.workspace = true + [[bin]] name = "eval" path = "src/eval.rs" +[[bin]] +name = "explorer" +path = "src/explorer.rs" + [lints] workspace = true diff --git a/crates/eval/README.md b/crates/eval/README.md index b28806bebb..75fe7265fc 100644 --- a/crates/eval/README.md +++ b/crates/eval/README.md @@ -5,3 +5,21 @@ This eval assumes the working directory is the root of the repository. Run it wi ```sh cargo run -p eval ``` + +## Explorer Tool + +The explorer tool generates a self-contained HTML view from one or more thread +JSON file. It provides a visual interface to explore the agent thread, including +tool calls and results. See [./docs/explorer.md](./docs/explorer.md) for more details. 
+ +### Usage + +```sh +cargo run -p eval --bin explorer -- --input <thread JSON files> --output <output HTML path> +``` + +Example: + +```sh +cargo run -p eval --bin explorer -- --input ./runs/2025-04-23_15-53-30/fastmcp_bugifx/*/last.messages.json --output /tmp/explorer.html +``` diff --git a/crates/eval/docs/explorer.md b/crates/eval/docs/explorer.md new file mode 100644 index 0000000000..2ca3336a23 --- /dev/null +++ b/crates/eval/docs/explorer.md @@ -0,0 +1,27 @@ +# Explorer + +Threads Explorer is a single self-contained HTML file that gives an overview of +evaluation runs, while allowing for some interactivity. + +When you open a file, it gives you a _thread overview_, which looks like this: + +| Turn | Text | Tool | Result | +| ---- | ------------------------------------ | -------------------------------------------- | --------------------------------------------- | +| 1 | [User]: | | | +| | Fix the bug: kwargs not passed... | | | +| 2 | I'll help you fix that bug. | **list_directory**(path="fastmcp") | `fastmcp/src [...]` | +| | | | | +| 3 | Let's examine the code. | **read_file**(path="fastmcp/main.py", [...]) | `def run_application(app, \*\*kwargs): [...]` | +| 4 | I found the issue. | **edit_file**(path="fastmcp/core.py", [...]) | `Made edit to fastmcp/core.py` | +| 5 | Let's check if there are any errors. | **diagnostics**() | `No errors found` | + +### Implementation details + +`src/explorer.html` contains the template. You can open this template in a +browser as is, and it will show some dummy values. But the main use is to set +the `threadsData` variable with real data, which then will be used instead of +the dummy values. + +`src/explorer.rs` takes one or more JSON files as generated by `cargo run -p +eval`, and outputs an HTML file for rendering these threads. Refer to the dummy data +in `explorer.html` for a sample format. 
diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index 1873adbb61..dcc39592ed 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -1,6 +1,7 @@ mod assertions; mod example; mod examples; +mod explorer; mod ids; mod instance; mod tool_metrics; @@ -305,155 +306,11 @@ fn main() { })) .await; - print_h1("EVAL RESULTS"); - - let mut diff_scores = Vec::new(); - let mut thread_scores = Vec::new(); - let mut programmatic_scores = Vec::new(); - let mut error_count = 0; - - for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() { - print_h2(&example_name); - - results.sort_unstable_by_key(|(example, _)| example.repetition); - let mut example_cumulative_tool_metrics = ToolMetrics::default(); - - let mut table_rows = String::new(); - - for (example, result) in results.iter() { - match result { - Err(err) => { - display_error_row( - &mut table_rows, - example.repetition, - err.to_string(), - )?; - error_count += 1; - } - Ok((run_output, judge_output)) => { - cumulative_tool_metrics.merge(&run_output.tool_metrics); - example_cumulative_tool_metrics.merge(&run_output.tool_metrics); - - if !run_output.programmatic_assertions.total_count() > 0 { - for assertion in &run_output.programmatic_assertions.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - - programmatic_scores - .push(run_output.programmatic_assertions.passed_percentage()) - } - - if !judge_output.diff.is_empty() { - diff_scores.push(judge_output.diff.passed_percentage()); - - for assertion in &judge_output.diff.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - } - - if !judge_output.thread.is_empty() { - thread_scores.push(judge_output.thread.passed_percentage()); - - for assertion in &judge_output.thread.ran { - assertions::display_table_row( - &mut table_rows, - example.repetition, - assertion, - )?; - } - } - } - } - } - - if !table_rows.is_empty() { - 
assertions::print_table_header(); - print!("{}", table_rows); - - assertions::print_table_divider(); - - for (example, result) in results.iter() { - if let Ok((run_output, judge_output)) = result { - assertions::print_table_round_summary( - &example.repetition.to_string(), - [ - &run_output.programmatic_assertions, - &judge_output.diff, - &judge_output.thread, - ] - .into_iter(), - ) - } - } - - assertions::print_table_divider(); - - assertions::print_table_round_summary( - "avg", - results.iter().flat_map(|(_, result)| { - result.iter().flat_map(|(run_output, judge_output)| { - [ - &run_output.programmatic_assertions, - &judge_output.diff, - &judge_output.thread, - ] - .into_iter() - }) - }), - ); - - assertions::print_table_footer(); - } - - if !example_cumulative_tool_metrics.is_empty() { - println!("{}", &example_cumulative_tool_metrics); - } - } - - if results_by_example_name.borrow().len() > 1 { - print_h1("AGGREGATE"); - - if error_count > 0 { - println!("\n{error_count} examples failed to run!"); - } - - let programmatic_score_count = programmatic_scores.len(); - if programmatic_score_count > 0 { - let average_programmatic_score = (programmatic_scores.into_iter().sum::() - / (programmatic_score_count as f32)) - .floor(); - println!("Average programmatic score: {average_programmatic_score}%"); - } - - let diff_score_count = diff_scores.len(); - if diff_score_count > 0 { - let average_diff_score = - (diff_scores.into_iter().sum::() / (diff_score_count as f32)).floor(); - println!("Average diff score: {average_diff_score}%"); - } - - let thread_score_count = thread_scores.len(); - - if thread_score_count > 0 { - let average_thread_score = (thread_scores.into_iter().sum::() - / (thread_score_count as f32)) - .floor(); - println!("Average thread score: {average_thread_score}%"); - } - - println!(""); - - print_h2("CUMULATIVE TOOL METRICS"); - println!("{}", cumulative_tool_metrics); - } + print_report( + &mut results_by_example_name.borrow_mut(), + &mut 
cumulative_tool_metrics, + &run_dir, + )?; app_state.client.telemetry().flush_events().await; @@ -670,3 +527,175 @@ fn print_h2(header: &str) { println!("{:^HEADER_WIDTH$}", header); println!("{:-^HEADER_WIDTH$}\n", ""); } + +fn print_report( + results_by_example_name: &mut HashMap< + String, + Vec<(ExampleInstance, anyhow::Result<(RunOutput, JudgeOutput)>)>, + >, + cumulative_tool_metrics: &mut ToolMetrics, + run_dir: &Path, +) -> anyhow::Result<()> { + print_h1("EVAL RESULTS"); + + let mut diff_scores = Vec::new(); + let mut thread_scores = Vec::new(); + let mut programmatic_scores = Vec::new(); + let mut error_count = 0; + + for (example_name, results) in results_by_example_name.iter_mut() { + print_h2(example_name); + + results.sort_unstable_by_key(|(example, _)| example.repetition); + let mut example_cumulative_tool_metrics = ToolMetrics::default(); + + let mut table_rows = String::new(); + + for (example, result) in results.iter() { + match result { + Err(err) => { + display_error_row(&mut table_rows, example.repetition, err.to_string())?; + error_count += 1; + } + Ok((run_output, judge_output)) => { + cumulative_tool_metrics.merge(&run_output.tool_metrics); + example_cumulative_tool_metrics.merge(&run_output.tool_metrics); + + if !run_output.programmatic_assertions.total_count() > 0 { + for assertion in &run_output.programmatic_assertions.ran { + assertions::display_table_row( + &mut table_rows, + example.repetition, + assertion, + )?; + } + + programmatic_scores + .push(run_output.programmatic_assertions.passed_percentage()) + } + + if !judge_output.diff.is_empty() { + diff_scores.push(judge_output.diff.passed_percentage()); + + for assertion in &judge_output.diff.ran { + assertions::display_table_row( + &mut table_rows, + example.repetition, + assertion, + )?; + } + } + + if !judge_output.thread.is_empty() { + thread_scores.push(judge_output.thread.passed_percentage()); + + for assertion in &judge_output.thread.ran { + assertions::display_table_row( + &mut 
table_rows, + example.repetition, + assertion, + )?; + } + } + } + } + } + + if !table_rows.is_empty() { + assertions::print_table_header(); + print!("{}", table_rows); + + assertions::print_table_divider(); + + for (example, result) in results.iter() { + if let Ok((run_output, judge_output)) = result { + assertions::print_table_round_summary( + &example.repetition.to_string(), + [ + &run_output.programmatic_assertions, + &judge_output.diff, + &judge_output.thread, + ] + .into_iter(), + ) + } + } + + assertions::print_table_divider(); + + assertions::print_table_round_summary( + "avg", + results.iter().flat_map(|(_, result)| { + result.iter().flat_map(|(run_output, judge_output)| { + [ + &run_output.programmatic_assertions, + &judge_output.diff, + &judge_output.thread, + ] + .into_iter() + }) + }), + ); + + assertions::print_table_footer(); + } + + if !example_cumulative_tool_metrics.is_empty() { + println!("{}", &example_cumulative_tool_metrics); + } + } + + if results_by_example_name.len() > 1 { + print_h1("AGGREGATE"); + + if error_count > 0 { + println!("\n{error_count} examples failed to run!"); + } + + let programmatic_score_count = programmatic_scores.len(); + if programmatic_score_count > 0 { + let average_programmatic_score = (programmatic_scores.into_iter().sum::() + / (programmatic_score_count as f32)) + .floor(); + println!("Average programmatic score: {average_programmatic_score}%"); + } + + let diff_score_count = diff_scores.len(); + if diff_score_count > 0 { + let average_diff_score = + (diff_scores.into_iter().sum::() / (diff_score_count as f32)).floor(); + println!("Average diff score: {average_diff_score}%"); + } + + let thread_score_count = thread_scores.len(); + + if thread_score_count > 0 { + let average_thread_score = + (thread_scores.into_iter().sum::() / (thread_score_count as f32)).floor(); + println!("Average thread score: {average_thread_score}%"); + } + + println!(""); + + print_h2("CUMULATIVE TOOL METRICS"); + println!("{}", 
cumulative_tool_metrics); + } + + let explorer_output_path = run_dir.join("overview.html"); + let mut json_paths: Vec = results_by_example_name + .values() + .flat_map(|results| { + results.iter().map(|(example, _)| { + let absolute_path = example.run_directory.join("last.messages.json"); + pathdiff::diff_paths(&absolute_path, run_dir) + .unwrap_or_else(|| absolute_path.clone()) + }) + }) + .collect::>(); + json_paths.sort(); + if let Err(err) = explorer::generate_explorer_html(&json_paths, &explorer_output_path) { + eprintln!("Failed to generate explorer HTML: {}", err); + } + + Ok(()) +} diff --git a/crates/eval/src/explorer.html b/crates/eval/src/explorer.html new file mode 100644 index 0000000000..fec4597163 --- /dev/null +++ b/crates/eval/src/explorer.html @@ -0,0 +1,1045 @@ + + + + + + Eval Explorer + + + +

Thread Explorer

+
+ + + +
+ +
+
+
+ +
+ Thread 1 of + 1: + Default Thread +
+ +
+ + + + + + + + + + + + +
TurnTextToolResult
+ + + + diff --git a/crates/eval/src/explorer.rs b/crates/eval/src/explorer.rs new file mode 100644 index 0000000000..4be44392dd --- /dev/null +++ b/crates/eval/src/explorer.rs @@ -0,0 +1,75 @@ +use anyhow::{Context, Result, anyhow}; +use clap::Parser; +use serde_json::{Value, json}; +use std::fs; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[clap(about = "Generate HTML explorer from JSON thread files")] +struct Args { + /// Paths to JSON files containing thread data + #[clap(long, required = true, num_args = 1..)] + input: Vec, + + /// Path where the HTML explorer file will be written + #[clap(long)] + output: PathBuf, +} + +pub fn generate_explorer_html(inputs: &[PathBuf], output: &PathBuf) -> Result { + if let Some(parent) = output.parent() { + if !parent.exists() { + fs::create_dir_all(parent).context(format!( + "Failed to create output directory: {}", + parent.display() + ))?; + } + } + + let template_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/explorer.html"); + let template = fs::read_to_string(&template_path).context(format!( + "Template file not found or couldn't be read: {}", + template_path.display() + ))?; + + let threads = inputs + .iter() + .map(|input_path| { + let mut thread_data: Value = fs::read_to_string(input_path) + .context(format!("Failed to read file: {}", input_path.display()))? 
+ .parse::() + .context(format!("Failed to parse JSON: {}", input_path.display()))?; + thread_data["filename"] = json!(input_path); // This will be shown in a thread heading + Ok(thread_data) + }) + .collect::>>()?; + + let all_threads = json!({ "threads": threads }); + let html_content = inject_thread_data(template, all_threads)?; + fs::write(&output, &html_content) + .context(format!("Failed to write output: {}", output.display()))?; + + println!("Saved {} thread(s) to {}", threads.len(), output.display()); + Ok(html_content) +} + +fn inject_thread_data(template: String, threads_data: Value) -> Result { + let injection_marker = "let threadsData = window.threadsData || { threads: [dummyThread] };"; + template + .find(injection_marker) + .ok_or_else(|| anyhow!("Could not find the thread injection point in the template"))?; + + let threads_json = serde_json::to_string_pretty(&threads_data) + .context("Failed to serialize threads data to JSON")?; + let script_injection = format!("let threadsData = {};", threads_json); + let final_html = template.replacen(injection_marker, &script_injection, 1); + + Ok(final_html) +} + +#[cfg(not(any(test, doctest)))] +#[allow(dead_code)] +fn main() -> Result<()> { + let args = Args::parse(); + generate_explorer_html(&args.input, &args.output).map(|_| ()) +}