eval: Fine-grained assertions (#29246)
- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
parent
0d3fe474db
commit
ce1a674eba
18 changed files with 1969 additions and 1229 deletions
4
Cargo.lock
generated
4
Cargo.lock
generated
|
@ -4895,6 +4895,7 @@ dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"assistant_tool",
|
"assistant_tool",
|
||||||
"assistant_tools",
|
"assistant_tools",
|
||||||
|
"async-trait",
|
||||||
"async-watch",
|
"async-watch",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
|
@ -4915,13 +4916,14 @@ dependencies = [
|
||||||
"language_models",
|
"language_models",
|
||||||
"languages",
|
"languages",
|
||||||
"node_runtime",
|
"node_runtime",
|
||||||
"parking_lot",
|
|
||||||
"paths",
|
"paths",
|
||||||
"project",
|
"project",
|
||||||
"prompt_store",
|
"prompt_store",
|
||||||
|
"regex",
|
||||||
"release_channel",
|
"release_channel",
|
||||||
"reqwest_client",
|
"reqwest_client",
|
||||||
"serde",
|
"serde",
|
||||||
|
"serde_json",
|
||||||
"settings",
|
"settings",
|
||||||
"shellexpand 2.1.2",
|
"shellexpand 2.1.2",
|
||||||
"smol",
|
"smol",
|
||||||
|
|
|
@ -315,6 +315,7 @@ pub struct Thread {
|
||||||
request_callback: Option<
|
request_callback: Option<
|
||||||
Box<dyn FnMut(&LanguageModelRequest, &[Result<LanguageModelCompletionEvent, String>])>,
|
Box<dyn FnMut(&LanguageModelRequest, &[Result<LanguageModelCompletionEvent, String>])>,
|
||||||
>,
|
>,
|
||||||
|
remaining_turns: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
@ -368,6 +369,7 @@ impl Thread {
|
||||||
message_feedback: HashMap::default(),
|
message_feedback: HashMap::default(),
|
||||||
last_auto_capture_at: None,
|
last_auto_capture_at: None,
|
||||||
request_callback: None,
|
request_callback: None,
|
||||||
|
remaining_turns: u32::MAX,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -442,6 +444,7 @@ impl Thread {
|
||||||
message_feedback: HashMap::default(),
|
message_feedback: HashMap::default(),
|
||||||
last_auto_capture_at: None,
|
last_auto_capture_at: None,
|
||||||
request_callback: None,
|
request_callback: None,
|
||||||
|
remaining_turns: u32::MAX,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -522,7 +525,7 @@ impl Thread {
|
||||||
self.messages.iter().find(|message| message.id == id)
|
self.messages.iter().find(|message| message.id == id)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn messages(&self) -> impl Iterator<Item = &Message> {
|
pub fn messages(&self) -> impl ExactSizeIterator<Item = &Message> {
|
||||||
self.messages.iter()
|
self.messages.iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -958,7 +961,21 @@ impl Thread {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn remaining_turns(&self) -> u32 {
|
||||||
|
self.remaining_turns
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_remaining_turns(&mut self, remaining_turns: u32) {
|
||||||
|
self.remaining_turns = remaining_turns;
|
||||||
|
}
|
||||||
|
|
||||||
pub fn send_to_model(&mut self, model: Arc<dyn LanguageModel>, cx: &mut Context<Self>) {
|
pub fn send_to_model(&mut self, model: Arc<dyn LanguageModel>, cx: &mut Context<Self>) {
|
||||||
|
if self.remaining_turns == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.remaining_turns -= 1;
|
||||||
|
|
||||||
let mut request = self.to_completion_request(cx);
|
let mut request = self.to_completion_request(cx);
|
||||||
if model.supports_tools() {
|
if model.supports_tools() {
|
||||||
request.tools = {
|
request.tools = {
|
||||||
|
|
|
@ -56,6 +56,8 @@ use crate::symbol_info_tool::SymbolInfoTool;
|
||||||
use crate::terminal_tool::TerminalTool;
|
use crate::terminal_tool::TerminalTool;
|
||||||
use crate::thinking_tool::ThinkingTool;
|
use crate::thinking_tool::ThinkingTool;
|
||||||
|
|
||||||
|
pub use path_search_tool::PathSearchToolInput;
|
||||||
|
|
||||||
pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
|
pub fn init(http_client: Arc<HttpClientWithUrl>, cx: &mut App) {
|
||||||
assistant_tool::init(cx);
|
assistant_tool::init(cx);
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ agent.workspace = true
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
assistant_tool.workspace = true
|
assistant_tool.workspace = true
|
||||||
assistant_tools.workspace = true
|
assistant_tools.workspace = true
|
||||||
|
async-trait.workspace = true
|
||||||
async-watch.workspace = true
|
async-watch.workspace = true
|
||||||
chrono.workspace = true
|
chrono.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
|
@ -29,13 +30,14 @@ language_model.workspace = true
|
||||||
language_models.workspace = true
|
language_models.workspace = true
|
||||||
languages = { workspace = true, features = ["load-grammars"] }
|
languages = { workspace = true, features = ["load-grammars"] }
|
||||||
node_runtime.workspace = true
|
node_runtime.workspace = true
|
||||||
parking_lot.workspace = true
|
|
||||||
paths.workspace = true
|
paths.workspace = true
|
||||||
project.workspace = true
|
project.workspace = true
|
||||||
prompt_store.workspace = true
|
prompt_store.workspace = true
|
||||||
|
regex.workspace = true
|
||||||
release_channel.workspace = true
|
release_channel.workspace = true
|
||||||
reqwest_client.workspace = true
|
reqwest_client.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
|
serde_json.workspace = true
|
||||||
settings.workspace = true
|
settings.workspace = true
|
||||||
shellexpand.workspace = true
|
shellexpand.workspace = true
|
||||||
smol.workspace = true
|
smol.workspace = true
|
||||||
|
@ -45,7 +47,6 @@ unindent.workspace = true
|
||||||
util.workspace = true
|
util.workspace = true
|
||||||
uuid = { version = "1.6", features = ["v4"] }
|
uuid = { version = "1.6", features = ["v4"] }
|
||||||
workspace-hack.workspace = true
|
workspace-hack.workspace = true
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "eval"
|
name = "eval"
|
||||||
path = "src/eval.rs"
|
path = "src/eval.rs"
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
url = "https://github.com/zed-industries/zed.git"
|
|
||||||
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
|
|
||||||
language_extension = "rs"
|
|
|
@ -1,2 +0,0 @@
|
||||||
- The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct. The struct should contain an `output` field that is the same as the task we were returning before, and a new `card` field that contains a view for the card.
|
|
||||||
- The card should be a view that displays a diff. Each line in the diff should be colored according to whether it was added, removed or unchanged.
|
|
|
@ -1,3 +0,0 @@
|
||||||
Look at the `find_replace_file_tool.rs`. I want to implement a card for it. The card should implement the `Render` trait.
|
|
||||||
|
|
||||||
The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green background for lines that were added. We should have a div per diff line.
|
|
|
@ -1,3 +0,0 @@
|
||||||
- The first tool call should be to path search including "find_replace_file_tool.rs" in the string. (*Not* grep, for example, or reading the file based on a guess at the path.) This is because we gave the model a filename and it needs to turn that into a real path.
|
|
||||||
- After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
|
|
||||||
- When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information on what path the Render trait might be in.
|
|
157
crates/eval/src/assertions.rs
Normal file
157
crates/eval/src/assertions.rs
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::fmt::Write;
|
||||||
|
use std::fmt::{self};
|
||||||
|
|
||||||
|
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct AssertionsReport {
|
||||||
|
pub ran: Vec<RanAssertion>,
|
||||||
|
pub max: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct RanAssertion {
|
||||||
|
pub id: String,
|
||||||
|
pub result: Result<RanAssertionResult, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct RanAssertionResult {
|
||||||
|
pub analysis: Option<String>,
|
||||||
|
pub passed: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AssertionsReport {
|
||||||
|
pub fn new(max: Option<usize>) -> Self {
|
||||||
|
AssertionsReport {
|
||||||
|
ran: Vec::new(),
|
||||||
|
max,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.ran.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn total_count(&self) -> usize {
|
||||||
|
self.run_count().max(self.max.unwrap_or(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run_count(&self) -> usize {
|
||||||
|
self.ran.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn passed_count(&self) -> usize {
|
||||||
|
self.ran
|
||||||
|
.iter()
|
||||||
|
.filter(|a| a.result.as_ref().map_or(false, |result| result.passed))
|
||||||
|
.count()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn passed_percentage(&self) -> f32 {
|
||||||
|
if self.total_count() == 0 {
|
||||||
|
0.0
|
||||||
|
} else {
|
||||||
|
(self.passed_count() as f32 / self.total_count() as f32) * 100.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const ROUND_WIDTH: usize = "Round".len();
|
||||||
|
const ASSERTIONS_WIDTH: usize = 42;
|
||||||
|
const RESULTS_WIDTH: usize = 8;
|
||||||
|
|
||||||
|
pub fn print_table_header() {
|
||||||
|
println!(
|
||||||
|
"┌─{}─┬─{}─┬─{}─┐",
|
||||||
|
"─".repeat(ROUND_WIDTH),
|
||||||
|
"─".repeat(ASSERTIONS_WIDTH),
|
||||||
|
"─".repeat(RESULTS_WIDTH)
|
||||||
|
);
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"│ {:^ROUND_WIDTH$} │ {:^ASSERTIONS_WIDTH$} │ {:^RESULTS_WIDTH$} │",
|
||||||
|
"Round", "Assertion", "Result"
|
||||||
|
);
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"├─{}─┼─{}─┼─{}─┤",
|
||||||
|
"─".repeat(ROUND_WIDTH),
|
||||||
|
"─".repeat(ASSERTIONS_WIDTH),
|
||||||
|
"─".repeat(RESULTS_WIDTH)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn display_error_row(f: &mut String, round: usize, error: String) -> fmt::Result {
|
||||||
|
let last_two_columns = ASSERTIONS_WIDTH + RESULTS_WIDTH;
|
||||||
|
writeln!(
|
||||||
|
f,
|
||||||
|
"│ {:^ROUND_WIDTH$} │ {:<last_two_columns$} |",
|
||||||
|
round,
|
||||||
|
truncate(&error, last_two_columns)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Appends one table row for `assertion` to `f`: the round number, the assertion id
/// (truncated to `ASSERTIONS_WIDTH`), and a colored passed / failed / judge-error label.
///
/// Returns the `fmt::Error` from writing into `f`, if any.
///
/// NOTE(review): the label strings embed ANSI color escapes, and the formatter counts those
/// characters toward `RESULTS_WIDTH`, so the result column is never actually padded; the
/// "Judge Error" label is also visibly wider than the column.
pub fn display_table_row(f: &mut String, round: usize, assertion: &RanAssertion) -> fmt::Result {
|
||||||
|
let result = match &assertion.result {
|
||||||
|
Ok(result) if result.passed => "\x1b[32m✔︎ Passed\x1b[0m",
|
||||||
|
Ok(_) => "\x1b[31m✗ Failed\x1b[0m",
|
||||||
|
Err(_) => "\x1b[31m💥 Judge Error\x1b[0m",
|
||||||
|
};
|
||||||
|
|
||||||
|
writeln!(
|
||||||
|
f,
|
||||||
|
"│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
|
||||||
|
round,
|
||||||
|
truncate(&assertion.id, ASSERTIONS_WIDTH),
|
||||||
|
result
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn print_table_round_summary<'a>(
|
||||||
|
round: &str,
|
||||||
|
reports: impl Iterator<Item = &'a AssertionsReport>,
|
||||||
|
) {
|
||||||
|
let mut passed = 0;
|
||||||
|
let mut total = 0;
|
||||||
|
for report in reports {
|
||||||
|
passed += report.passed_count();
|
||||||
|
total += report.total_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"│ {:^ROUND_WIDTH$} │ {:<ASSERTIONS_WIDTH$} │ {:>RESULTS_WIDTH$} │",
|
||||||
|
round,
|
||||||
|
"total",
|
||||||
|
format!("{}%", (passed as f32 / total as f32 * 100.0).floor())
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn print_table_footer() {
|
||||||
|
println!(
|
||||||
|
"└─{}─┴─{}─┴─{}─┘",
|
||||||
|
"─".repeat(ROUND_WIDTH),
|
||||||
|
"─".repeat(ASSERTIONS_WIDTH),
|
||||||
|
"─".repeat(RESULTS_WIDTH)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn print_table_divider() {
|
||||||
|
println!(
|
||||||
|
"├─{}─┼─{}─┼─{}─┤",
|
||||||
|
"─".repeat(ROUND_WIDTH),
|
||||||
|
"─".repeat(ASSERTIONS_WIDTH),
|
||||||
|
"─".repeat(RESULTS_WIDTH)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shortens `assertion` so that it occupies at most `max_width` characters.
///
/// Text that already fits is returned unchanged. Otherwise the first `max_width - 1`
/// characters are kept and a single `…` is appended, giving exactly `max_width` characters.
/// With `max_width == 0` there is no room even for the ellipsis, so the result is empty.
///
/// Width is measured in `char`s rather than bytes because that is what the formatter's
/// width padding counts; measuring bytes would truncate multibyte text that actually fits.
fn truncate(assertion: &str, max_width: usize) -> String {
    // The text fits exactly when it has no character at index `max_width`.
    if assertion.chars().nth(max_width).is_none() {
        return assertion.to_string();
    }

    let Some(kept_chars) = max_width.checked_sub(1) else {
        return String::new();
    };

    // Byte offset of the first dropped character; always a char boundary.
    let end_ix = assertion
        .char_indices()
        .nth(kept_chars)
        .map_or(assertion.len(), |(ix, _)| ix);

    format!("{}…", &assertion[..end_ix])
}
|
|
@ -1,13 +1,16 @@
|
||||||
|
mod assertions;
|
||||||
mod example;
|
mod example;
|
||||||
|
mod examples;
|
||||||
mod ids;
|
mod ids;
|
||||||
|
mod instance;
|
||||||
mod tool_metrics;
|
mod tool_metrics;
|
||||||
|
|
||||||
pub(crate) use example::*;
|
use assertions::display_error_row;
|
||||||
use parking_lot::Mutex;
|
use instance::{ExampleInstance, JudgeOutput, RunOutput, run_git};
|
||||||
pub(crate) use tool_metrics::*;
|
pub(crate) use tool_metrics::*;
|
||||||
|
|
||||||
use ::fs::RealFs;
|
use ::fs::RealFs;
|
||||||
use anyhow::{Result, anyhow};
|
use anyhow::anyhow;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use client::{Client, ProxySettings, UserStore};
|
use client::{Client, ProxySettings, UserStore};
|
||||||
use collections::{HashMap, HashSet};
|
use collections::{HashMap, HashSet};
|
||||||
|
@ -25,18 +28,20 @@ use prompt_store::PromptBuilder;
|
||||||
use release_channel::AppVersion;
|
use release_channel::AppVersion;
|
||||||
use reqwest_client::ReqwestClient;
|
use reqwest_client::ReqwestClient;
|
||||||
use settings::{Settings, SettingsStore};
|
use settings::{Settings, SettingsStore};
|
||||||
|
use std::cell::RefCell;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::rc::Rc;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use util::ResultExt as _;
|
use util::ResultExt as _;
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(name = "eval", disable_version_flag = true)]
|
#[command(name = "eval", disable_version_flag = true)]
|
||||||
struct Args {
|
struct Args {
|
||||||
/// Runs all examples that contain these substrings. If unspecified, all examples are run.
|
/// Runs all examples and threads that contain these substrings. If unspecified, all examples and threads are run.
|
||||||
#[arg(value_name = "EXAMPLE_SUBSTRING")]
|
#[arg(value_name = "EXAMPLE_SUBSTRING")]
|
||||||
examples: Vec<String>,
|
filter: Vec<String>,
|
||||||
/// Model to use (default: "claude-3-7-sonnet-latest")
|
/// Model to use (default: "claude-3-7-sonnet-latest")
|
||||||
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
|
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
|
||||||
model: String,
|
model: String,
|
||||||
|
@ -66,43 +71,30 @@ fn main() {
|
||||||
.parent()
|
.parent()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.parent()
|
.parent()
|
||||||
|
.unwrap()
|
||||||
|
.canonicalize()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let eval_crate_dir = root_dir.join("crates/eval");
|
let eval_crate_dir = root_dir.join("crates").join("eval");
|
||||||
let repos_dir = eval_crate_dir.join("repos");
|
let repos_dir = eval_crate_dir.join("repos");
|
||||||
let worktrees_dir = eval_crate_dir.join("worktrees");
|
let worktrees_dir = eval_crate_dir.join("worktrees");
|
||||||
let examples_dir = eval_crate_dir.join("examples");
|
let examples_dir = eval_crate_dir.join("src").join("examples");
|
||||||
let runs_dir = eval_crate_dir.join("runs");
|
let run_dir = eval_crate_dir
|
||||||
let run_dir = runs_dir.join(format!("{}", run_timestamp));
|
.join("runs")
|
||||||
|
.join(format!("{}", run_timestamp));
|
||||||
std::fs::create_dir_all(&run_dir).unwrap();
|
std::fs::create_dir_all(&run_dir).unwrap();
|
||||||
std::fs::create_dir_all(&repos_dir).unwrap();
|
std::fs::create_dir_all(&repos_dir).unwrap();
|
||||||
std::fs::create_dir_all(&worktrees_dir).unwrap();
|
std::fs::create_dir_all(&worktrees_dir).unwrap();
|
||||||
std::fs::create_dir_all(&examples_dir).unwrap();
|
std::fs::create_dir_all(&examples_dir).unwrap();
|
||||||
std::fs::create_dir_all(&paths::config_dir()).unwrap();
|
std::fs::create_dir_all(&paths::config_dir()).unwrap();
|
||||||
|
|
||||||
let zed_commit_sha = commit_sha_for_path(root_dir);
|
let zed_commit_sha = commit_sha_for_path(&root_dir);
|
||||||
let zed_branch_name = git_branch_for_path(root_dir);
|
let zed_branch_name = git_branch_for_path(&root_dir);
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
let all_available_examples = list_all_examples(&examples_dir).unwrap();
|
let languages: HashSet<String> = args.languages.into_iter().collect();
|
||||||
|
|
||||||
let example_paths = all_available_examples
|
|
||||||
.iter()
|
|
||||||
.filter_map(|example_path| {
|
|
||||||
let name = example_path.file_name()?.to_string_lossy();
|
|
||||||
if args.examples.is_empty()
|
|
||||||
|| args
|
|
||||||
.examples
|
|
||||||
.iter()
|
|
||||||
.any(|name_substring| name.contains(name_substring))
|
|
||||||
{
|
|
||||||
Some(example_path.clone())
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let http_client = Arc::new(ReqwestClient::new());
|
let http_client = Arc::new(ReqwestClient::new());
|
||||||
let app = Application::headless().with_http_client(http_client.clone());
|
let app = Application::headless().with_http_client(http_client.clone());
|
||||||
|
let all_threads = examples::all(&examples_dir);
|
||||||
|
|
||||||
app.run(move |cx| {
|
app.run(move |cx| {
|
||||||
let app_state = init(cx);
|
let app_state = init(cx);
|
||||||
|
@ -163,28 +155,40 @@ fn main() {
|
||||||
|
|
||||||
let mut skipped = Vec::new();
|
let mut skipped = Vec::new();
|
||||||
|
|
||||||
for example_path in &example_paths {
|
for thread in all_threads {
|
||||||
let example = Example::load_from_directory(
|
let meta = thread.meta();
|
||||||
example_path,
|
if !args.filter.is_empty() && !args.filter.iter().any(|sub| meta.name.contains(sub))
|
||||||
&run_dir,
|
|
||||||
&worktrees_dir,
|
|
||||||
&repos_dir,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
if !example
|
|
||||||
.base
|
|
||||||
.language_extension
|
|
||||||
.as_ref()
|
|
||||||
.map_or(false, |lang| args.languages.contains(lang))
|
|
||||||
{
|
{
|
||||||
skipped.push(example.name);
|
skipped.push(meta.name);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
examples.extend(example.repeat(args.repetitions));
|
if meta.language_server.map_or(false, |language| {
|
||||||
|
!languages.contains(&language.file_extension)
|
||||||
|
}) {
|
||||||
|
skipped.push(meta.name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: This creates a worktree per repetition. Ideally these examples should
|
||||||
|
// either be run sequentially on the same worktree, or reuse worktrees when there
|
||||||
|
// are more examples to run than the concurrency limit.
|
||||||
|
for repetition_number in 0..args.repetitions {
|
||||||
|
let example_instance = ExampleInstance::new(
|
||||||
|
thread.clone(),
|
||||||
|
&repos_dir,
|
||||||
|
&run_dir,
|
||||||
|
&worktrees_dir,
|
||||||
|
repetition_number,
|
||||||
|
);
|
||||||
|
|
||||||
|
examples.push(example_instance);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("Skipped examples: {}\n", skipped.join(", "));
|
if !skipped.is_empty() {
|
||||||
|
println!("Skipped threads: {}", skipped.join(", "));
|
||||||
|
}
|
||||||
|
|
||||||
if examples.is_empty() {
|
if examples.is_empty() {
|
||||||
eprintln!("Filter matched no examples");
|
eprintln!("Filter matched no examples");
|
||||||
|
@ -196,22 +200,23 @@ fn main() {
|
||||||
|
|
||||||
let max_name_width = examples
|
let max_name_width = examples
|
||||||
.iter()
|
.iter()
|
||||||
.map(|e| e.repetition_name().len())
|
.map(|e| e.worktree_name().len())
|
||||||
.max()
|
.max()
|
||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
for (i, example) in examples.iter_mut().enumerate() {
|
|
||||||
|
for (i, example_instance) in examples.iter_mut().enumerate() {
|
||||||
let color = COLORS[i % COLORS.len()].to_string();
|
let color = COLORS[i % COLORS.len()].to_string();
|
||||||
example.set_log_prefix_style(&color, max_name_width);
|
example_instance.set_log_prefix_style(&color, max_name_width);
|
||||||
|
|
||||||
println!(
|
println!(
|
||||||
"{}Logging to: {}",
|
"{}Logging to: {}",
|
||||||
example.log_prefix,
|
example_instance.log_prefix,
|
||||||
example.run_directory_path().display()
|
example_instance.run_directory.display()
|
||||||
);
|
);
|
||||||
|
|
||||||
let repo_url = example.base.url.clone();
|
let repo_url = example_instance.repo_url();
|
||||||
if repo_urls.insert(repo_url.clone()) {
|
if repo_urls.insert(repo_url.clone()) {
|
||||||
let repo_path = example.repo_path.clone();
|
let repo_path = example_instance.repo_path.clone();
|
||||||
|
|
||||||
if !repo_path.join(".git").is_dir() {
|
if !repo_path.join(".git").is_dir() {
|
||||||
println!(
|
println!(
|
||||||
|
@ -251,12 +256,12 @@ fn main() {
|
||||||
|
|
||||||
future::join_all(clone_tasks).await;
|
future::join_all(clone_tasks).await;
|
||||||
|
|
||||||
for example in examples.iter_mut() {
|
for example_instance in examples.iter_mut() {
|
||||||
example.fetch().await?;
|
example_instance.fetch().await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let examples = Arc::new(Mutex::new(VecDeque::from(examples)));
|
let examples = Rc::new(RefCell::new(VecDeque::from(examples)));
|
||||||
let results_by_example_name = Arc::new(Mutex::new(HashMap::default()));
|
let results_by_example_name = Rc::new(RefCell::new(HashMap::default()));
|
||||||
|
|
||||||
future::join_all((0..args.concurrency).map(|_| {
|
future::join_all((0..args.concurrency).map(|_| {
|
||||||
let app_state = app_state.clone();
|
let app_state = app_state.clone();
|
||||||
|
@ -268,7 +273,7 @@ fn main() {
|
||||||
let results = results_by_example_name.clone();
|
let results = results_by_example_name.clone();
|
||||||
cx.spawn(async move |cx| {
|
cx.spawn(async move |cx| {
|
||||||
loop {
|
loop {
|
||||||
let Some(mut example) = examples.lock().pop_front() else {
|
let Some(mut example) = examples.borrow_mut().pop_front() else {
|
||||||
break;
|
break;
|
||||||
};
|
};
|
||||||
let result = async {
|
let result = async {
|
||||||
|
@ -291,7 +296,7 @@ fn main() {
|
||||||
}
|
}
|
||||||
.await;
|
.await;
|
||||||
results
|
results
|
||||||
.lock()
|
.borrow_mut()
|
||||||
.entry(example.name.clone())
|
.entry(example.name.clone())
|
||||||
.or_insert(Vec::new())
|
.or_insert(Vec::new())
|
||||||
.push((example.clone(), result));
|
.push((example.clone(), result));
|
||||||
|
@ -300,98 +305,156 @@ fn main() {
|
||||||
}))
|
}))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
println!("\n\n");
|
print_h1("EVAL RESULTS");
|
||||||
print_header("EVAL RESULTS");
|
|
||||||
|
|
||||||
let mut diff_scores = Vec::new();
|
let mut diff_scores = Vec::new();
|
||||||
let mut thread_scores = Vec::new();
|
let mut thread_scores = Vec::new();
|
||||||
|
let mut programmatic_scores = Vec::new();
|
||||||
let mut error_count = 0;
|
let mut error_count = 0;
|
||||||
|
|
||||||
for (example_name, results) in results_by_example_name.lock().iter_mut() {
|
for (example_name, results) in results_by_example_name.borrow_mut().iter_mut() {
|
||||||
print_header(&example_name);
|
print_h2(&example_name);
|
||||||
|
|
||||||
results.sort_unstable_by_key(|(example, _)| example.repetition);
|
results.sort_unstable_by_key(|(example, _)| example.repetition);
|
||||||
let mut example_cumulative_tool_metrics = ToolMetrics::default();
|
let mut example_cumulative_tool_metrics = ToolMetrics::default();
|
||||||
|
|
||||||
println!("┌───────┬──────┬────────┐");
|
let mut table_rows = String::new();
|
||||||
println!("│ Round │ Diff │ Thread │");
|
|
||||||
println!("├───────┼──────┼────────┤");
|
|
||||||
for (example, result) in results {
|
|
||||||
let run_dir_path = example.run_directory_path();
|
|
||||||
let relative_run_dir_path = run_dir_path.strip_prefix(root_dir).unwrap();
|
|
||||||
|
|
||||||
|
for (example, result) in results.iter() {
|
||||||
match result {
|
match result {
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
println!(
|
display_error_row(
|
||||||
"|{:^7}│{:^6}│{:^8}│ {:?}{}",
|
&mut table_rows,
|
||||||
example.repetition,
|
example.repetition,
|
||||||
"N/A",
|
err.to_string(),
|
||||||
"N/A",
|
)?;
|
||||||
err,
|
|
||||||
relative_run_dir_path.display()
|
|
||||||
);
|
|
||||||
error_count += 1;
|
error_count += 1;
|
||||||
}
|
}
|
||||||
Ok((run_output, judge_result)) => {
|
Ok((run_output, judge_output)) => {
|
||||||
cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||||
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
example_cumulative_tool_metrics.merge(&run_output.tool_metrics);
|
||||||
|
|
||||||
match judge_result {
|
if !run_output.programmatic_assertions.total_count() > 0 {
|
||||||
Ok(judge_output) => {
|
for assertion in &run_output.programmatic_assertions.ran {
|
||||||
diff_scores.push(judge_output.diff.score());
|
assertions::display_table_row(
|
||||||
thread_scores.push(judge_output.thread.score());
|
&mut table_rows,
|
||||||
println!(
|
|
||||||
"|{:^7}│{:^6}│{:^8}│ {}",
|
|
||||||
example.repetition,
|
example.repetition,
|
||||||
format!("{}%", judge_output.diff.score()),
|
assertion,
|
||||||
format!("{}%", judge_output.thread.score()),
|
)?;
|
||||||
relative_run_dir_path.display()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
Err(err) => {
|
|
||||||
println!(
|
programmatic_scores
|
||||||
"|{:^7}│{:^6}│{:^8}│{:?}│ {}",
|
.push(run_output.programmatic_assertions.passed_percentage())
|
||||||
|
}
|
||||||
|
|
||||||
|
if !judge_output.diff.is_empty() {
|
||||||
|
diff_scores.push(judge_output.diff.passed_percentage());
|
||||||
|
|
||||||
|
for assertion in &judge_output.diff.ran {
|
||||||
|
assertions::display_table_row(
|
||||||
|
&mut table_rows,
|
||||||
example.repetition,
|
example.repetition,
|
||||||
"N/A",
|
assertion,
|
||||||
"N/A",
|
)?;
|
||||||
err,
|
}
|
||||||
relative_run_dir_path.display()
|
}
|
||||||
);
|
|
||||||
|
if !judge_output.thread.is_empty() {
|
||||||
|
thread_scores.push(judge_output.thread.passed_percentage());
|
||||||
|
|
||||||
|
for assertion in &judge_output.thread.ran {
|
||||||
|
assertions::display_table_row(
|
||||||
|
&mut table_rows,
|
||||||
|
example.repetition,
|
||||||
|
assertion,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("└───────┴──────┴────────┘");
|
if !table_rows.is_empty() {
|
||||||
println!("{}", example_cumulative_tool_metrics);
|
assertions::print_table_header();
|
||||||
|
print!("{}", table_rows);
|
||||||
|
|
||||||
|
assertions::print_table_divider();
|
||||||
|
|
||||||
|
for (example, result) in results.iter() {
|
||||||
|
if let Ok((run_output, judge_output)) = result {
|
||||||
|
assertions::print_table_round_summary(
|
||||||
|
&example.repetition.to_string(),
|
||||||
|
[
|
||||||
|
&run_output.programmatic_assertions,
|
||||||
|
&judge_output.diff,
|
||||||
|
&judge_output.thread,
|
||||||
|
]
|
||||||
|
.into_iter(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assertions::print_table_divider();
|
||||||
|
|
||||||
|
assertions::print_table_round_summary(
|
||||||
|
"avg",
|
||||||
|
results.iter().flat_map(|(_, result)| {
|
||||||
|
result.iter().flat_map(|(run_output, judge_output)| {
|
||||||
|
[
|
||||||
|
&run_output.programmatic_assertions,
|
||||||
|
&judge_output.diff,
|
||||||
|
&judge_output.thread,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
})
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assertions::print_table_footer();
|
||||||
|
}
|
||||||
|
|
||||||
|
if !example_cumulative_tool_metrics.is_empty() {
|
||||||
|
println!("{}", &example_cumulative_tool_metrics);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let diff_score_count = diff_scores.len();
|
if results_by_example_name.borrow().len() > 1 {
|
||||||
let average_diff_score = diff_scores
|
print_h1("AGGREGATE");
|
||||||
.into_iter()
|
|
||||||
.map(|score| score as f32)
|
|
||||||
.sum::<f32>()
|
|
||||||
/ (diff_score_count as f32);
|
|
||||||
|
|
||||||
if error_count > 0 {
|
if error_count > 0 {
|
||||||
println!("\n{error_count} examples failed to run!");
|
println!("\n{error_count} examples failed to run!");
|
||||||
|
}
|
||||||
|
|
||||||
|
let programmatic_score_count = programmatic_scores.len();
|
||||||
|
if programmatic_score_count > 0 {
|
||||||
|
let average_programmatic_score = (programmatic_scores.into_iter().sum::<f32>()
|
||||||
|
/ (programmatic_score_count as f32))
|
||||||
|
.floor();
|
||||||
|
println!("Average programmatic score: {average_programmatic_score}%");
|
||||||
|
}
|
||||||
|
|
||||||
|
let diff_score_count = diff_scores.len();
|
||||||
|
if diff_score_count > 0 {
|
||||||
|
let average_diff_score =
|
||||||
|
(diff_scores.into_iter().sum::<f32>() / (diff_score_count as f32)).floor();
|
||||||
|
println!("Average diff score: {average_diff_score}%");
|
||||||
|
}
|
||||||
|
|
||||||
|
let thread_score_count = thread_scores.len();
|
||||||
|
|
||||||
|
if thread_score_count > 0 {
|
||||||
|
let average_thread_score = (thread_scores.into_iter().sum::<f32>()
|
||||||
|
/ (thread_score_count as f32))
|
||||||
|
.floor();
|
||||||
|
println!("Average thread score: {average_thread_score}%");
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("");
|
||||||
|
|
||||||
|
print_h2("CUMULATIVE TOOL METRICS");
|
||||||
|
println!("{}", cumulative_tool_metrics);
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("\nAverage code diff score: {average_diff_score}");
|
|
||||||
|
|
||||||
let thread_score_count = thread_scores.len();
|
|
||||||
let average_thread_score = thread_scores
|
|
||||||
.into_iter()
|
|
||||||
.map(|score| score as f32)
|
|
||||||
.sum::<f32>()
|
|
||||||
/ (thread_score_count as f32);
|
|
||||||
|
|
||||||
println!("\nAverage thread score: {average_thread_score}");
|
|
||||||
|
|
||||||
print_header("CUMULATIVE TOOL METRICS");
|
|
||||||
println!("{}", cumulative_tool_metrics);
|
|
||||||
|
|
||||||
app_state.client.telemetry().flush_events().await;
|
app_state.client.telemetry().flush_events().await;
|
||||||
|
|
||||||
cx.update(|cx| cx.quit())
|
cx.update(|cx| cx.quit())
|
||||||
|
@ -400,20 +463,6 @@ fn main() {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Lists the subdirectories found directly inside `examples_dir`.
///
/// The directory is canonicalized before it is read, so the returned paths are
/// built on the canonical form of `examples_dir`. Plain files are ignored.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or read, or if an
/// individual directory entry cannot be inspected.
fn list_all_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    // The signature already promises a `Result`, so propagate I/O failures
    // (e.g. a missing directory) instead of panicking.
    let examples_dir = std::fs::canonicalize(examples_dir)?;

    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.is_dir() {
            result_paths.push(path);
        }
    }

    // `read_dir` makes no ordering guarantee; sort so runs are reproducible.
    result_paths.sort();
    Ok(result_paths)
}
|
|
||||||
|
|
||||||
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
|
/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
|
||||||
pub struct AgentAppState {
|
pub struct AgentAppState {
|
||||||
pub languages: Arc<LanguageRegistry>,
|
pub languages: Arc<LanguageRegistry>,
|
||||||
|
@ -570,7 +619,7 @@ pub fn git_branch_for_path(repo_path: &Path) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn judge_example(
|
async fn judge_example(
|
||||||
example: Example,
|
example: ExampleInstance,
|
||||||
model: Arc<dyn LanguageModel>,
|
model: Arc<dyn LanguageModel>,
|
||||||
zed_commit_sha: &str,
|
zed_commit_sha: &str,
|
||||||
zed_branch_name: &str,
|
zed_branch_name: &str,
|
||||||
|
@ -578,19 +627,9 @@ async fn judge_example(
|
||||||
run_output: &RunOutput,
|
run_output: &RunOutput,
|
||||||
enable_telemetry: bool,
|
enable_telemetry: bool,
|
||||||
cx: &AsyncApp,
|
cx: &AsyncApp,
|
||||||
) -> Result<JudgeOutput> {
|
) -> JudgeOutput {
|
||||||
let judge_output = example.judge(model.clone(), &run_output, cx).await;
|
let judge_output = example.judge(model.clone(), &run_output, cx).await;
|
||||||
|
|
||||||
let diff_evaluation;
|
|
||||||
let thread_evaluation;
|
|
||||||
if let Ok(output) = judge_output.as_ref() {
|
|
||||||
diff_evaluation = Some(output.diff.clone());
|
|
||||||
thread_evaluation = Some(output.thread.clone());
|
|
||||||
} else {
|
|
||||||
diff_evaluation = None;
|
|
||||||
thread_evaluation = None;
|
|
||||||
}
|
|
||||||
|
|
||||||
if enable_telemetry {
|
if enable_telemetry {
|
||||||
telemetry::event!(
|
telemetry::event!(
|
||||||
"Agent Example Evaluated",
|
"Agent Example Evaluated",
|
||||||
|
@ -599,15 +638,15 @@ async fn judge_example(
|
||||||
run_id = run_id,
|
run_id = run_id,
|
||||||
example_name = example.name.clone(),
|
example_name = example.name.clone(),
|
||||||
example_repetition = example.repetition,
|
example_repetition = example.repetition,
|
||||||
diff_evaluation = diff_evaluation,
|
diff_evaluation = judge_output.diff.clone(),
|
||||||
thread_evaluation = thread_evaluation,
|
thread_evaluation = judge_output.thread.clone(),
|
||||||
tool_metrics = run_output.tool_metrics,
|
tool_metrics = run_output.tool_metrics,
|
||||||
response_count = run_output.response_count,
|
response_count = run_output.response_count,
|
||||||
token_usage = run_output.token_usage,
|
token_usage = run_output.token_usage,
|
||||||
model = model.telemetry_id(),
|
model = model.telemetry_id(),
|
||||||
model_provider = model.provider_id().to_string(),
|
model_provider = model.provider_id().to_string(),
|
||||||
repository_url = example.base.url.clone(),
|
repository_url = example.repo_url(),
|
||||||
repository_revision = example.base.revision.clone(),
|
repository_revision = example.revision(),
|
||||||
diagnostic_summary_before = run_output.diagnostic_summary_before,
|
diagnostic_summary_before = run_output.diagnostic_summary_before,
|
||||||
diagnostic_summary_after = run_output.diagnostic_summary_after,
|
diagnostic_summary_after = run_output.diagnostic_summary_after,
|
||||||
diagnostics_before = run_output.diagnostics_before,
|
diagnostics_before = run_output.diagnostics_before,
|
||||||
|
@ -618,8 +657,16 @@ async fn judge_example(
|
||||||
judge_output
|
judge_output
|
||||||
}
|
}
|
||||||
|
|
||||||
fn print_header(header: &str) {
|
const HEADER_WIDTH: usize = 65;
|
||||||
println!("\n========================================");
|
|
||||||
println!("{:^40}", header);
|
fn print_h1(header: &str) {
|
||||||
println!("========================================\n");
|
println!("\n\n{:=^HEADER_WIDTH$}", "");
|
||||||
|
println!("{:^HEADER_WIDTH$}", header);
|
||||||
|
println!("{:=^HEADER_WIDTH$}\n", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn print_h2(header: &str) {
|
||||||
|
println!("\n{:-^HEADER_WIDTH$}", "");
|
||||||
|
println!("{:^HEADER_WIDTH$}", header);
|
||||||
|
println!("{:-^HEADER_WIDTH$}\n", "");
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load diff
53
crates/eval/src/examples/file_search.rs
Normal file
53
crates/eval/src/examples/file_search.rs
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
use anyhow::Result;
|
||||||
|
use assistant_tools::PathSearchToolInput;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
use crate::example::{Example, ExampleContext, ExampleMetadata};
|
||||||
|
|
||||||
|
pub struct FileSearchExample;
|
||||||
|
|
||||||
|
#[async_trait(?Send)]
|
||||||
|
impl Example for FileSearchExample {
|
||||||
|
fn meta(&self) -> ExampleMetadata {
|
||||||
|
ExampleMetadata {
|
||||||
|
name: "file_search".to_string(),
|
||||||
|
url: "https://github.com/zed-industries/zed.git".to_string(),
|
||||||
|
revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
|
||||||
|
language_server: None,
|
||||||
|
max_assertions: Some(4),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||||
|
const FILENAME: &str = "find_replace_file_tool.rs";
|
||||||
|
cx.push_user_message(format!(
|
||||||
|
r#"
|
||||||
|
Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
|
||||||
|
|
||||||
|
The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
|
||||||
|
markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
|
||||||
|
background for lines that were added. We should have a div per diff line.
|
||||||
|
"#
|
||||||
|
));
|
||||||
|
|
||||||
|
let response = cx.run_turn().await?;
|
||||||
|
let tool_use = response.expect_tool("path_search", cx)?;
|
||||||
|
let input = tool_use.expect_input::<PathSearchToolInput>(cx)?;
|
||||||
|
|
||||||
|
let glob = input.glob;
|
||||||
|
cx.assert(
|
||||||
|
glob.ends_with(FILENAME),
|
||||||
|
format!("glob ends with `{FILENAME}`"),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let without_filename = glob.replace(FILENAME, "");
|
||||||
|
let matches = Regex::new("(\\*\\*|zed)/(\\*\\*?/)?")
|
||||||
|
.unwrap()
|
||||||
|
.is_match(&without_filename);
|
||||||
|
|
||||||
|
cx.assert(matches, "glob starts with either `**` or `zed`")?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
43
crates/eval/src/examples/find_and_replace_diff_card.toml
Normal file
43
crates/eval/src/examples/find_and_replace_diff_card.toml
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
url = "https://github.com/zed-industries/zed.git"
|
||||||
|
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
|
||||||
|
language_extension = "rs"
|
||||||
|
|
||||||
|
prompt = """
|
||||||
|
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
|
||||||
|
The card should implement the `Render` trait.
|
||||||
|
|
||||||
|
The card should show a diff. It should be a beautifully presented diff.
|
||||||
|
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
|
||||||
|
I want to see a red background for lines that were deleted and a green background for lines
|
||||||
|
that were added. We should have a div per diff line.
|
||||||
|
"""
|
||||||
|
|
||||||
|
[diff_assertions]
|
||||||
|
|
||||||
|
modify_find_and_replace_tool = """
|
||||||
|
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
|
||||||
|
The struct should contain an `output` field that is the same as the task we were returning before,
|
||||||
|
and a new `card` field that contains a view for the card.
|
||||||
|
"""
|
||||||
|
|
||||||
|
card_implementation = """
|
||||||
|
The card should be a view that displays a diff.
|
||||||
|
Each line in the diff should be colored according to whether it was added, removed or unchanged.
|
||||||
|
"""
|
||||||
|
|
||||||
|
[thread_assertions]
|
||||||
|
|
||||||
|
path_search = """
|
||||||
|
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
|
||||||
|
(*Not* grep, for example, or reading the file based on a guess at the path.)
|
||||||
|
This is because we gave the model a filename and it needs to turn that into a real path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
read_file_from_path_search = """
|
||||||
|
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
symbol_search = """
|
||||||
|
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
|
||||||
|
on what path the Render trait might be in.
|
||||||
|
"""
|
128
crates/eval/src/examples/mod.rs
Normal file
128
crates/eval/src/examples/mod.rs
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
use anyhow::Result;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::collections::BTreeMap;
|
||||||
|
use std::fs;
|
||||||
|
use std::{
|
||||||
|
path::{Path, PathBuf},
|
||||||
|
rc::Rc,
|
||||||
|
};
|
||||||
|
use util::serde::default_true;
|
||||||
|
|
||||||
|
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
|
||||||
|
|
||||||
|
mod file_search;
|
||||||
|
|
||||||
|
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
||||||
|
let mut threads: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];
|
||||||
|
|
||||||
|
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||||
|
threads.push(Rc::new(DeclarativeExample::load(&example_path).unwrap()));
|
||||||
|
}
|
||||||
|
|
||||||
|
threads
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DeclarativeExample {
|
||||||
|
metadata: ExampleMetadata,
|
||||||
|
prompt: String,
|
||||||
|
diff_assertions: Vec<JudgeAssertion>,
|
||||||
|
thread_assertions: Vec<JudgeAssertion>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeclarativeExample {
|
||||||
|
pub fn load(example_path: &Path) -> Result<Self> {
|
||||||
|
let name = Self::name_from_path(example_path);
|
||||||
|
let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
|
||||||
|
|
||||||
|
let language_server = if base.require_lsp {
|
||||||
|
Some(crate::example::LanguageServer {
|
||||||
|
file_extension: base
|
||||||
|
.language_extension
|
||||||
|
.expect("Language extension is required when require_lsp = true"),
|
||||||
|
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let metadata = ExampleMetadata {
|
||||||
|
name,
|
||||||
|
url: base.url,
|
||||||
|
revision: base.revision,
|
||||||
|
language_server,
|
||||||
|
max_assertions: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(DeclarativeExample {
|
||||||
|
metadata,
|
||||||
|
prompt: base.prompt,
|
||||||
|
thread_assertions: base
|
||||||
|
.thread_assertions
|
||||||
|
.into_iter()
|
||||||
|
.map(|(id, description)| JudgeAssertion { id, description })
|
||||||
|
.collect(),
|
||||||
|
diff_assertions: base
|
||||||
|
.diff_assertions
|
||||||
|
.into_iter()
|
||||||
|
.map(|(id, description)| JudgeAssertion { id, description })
|
||||||
|
.collect(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name_from_path(path: &Path) -> String {
|
||||||
|
path.file_stem().unwrap().to_string_lossy().to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
|
pub struct ExampleToml {
|
||||||
|
pub url: String,
|
||||||
|
pub revision: String,
|
||||||
|
pub language_extension: Option<String>,
|
||||||
|
pub insert_id: Option<String>,
|
||||||
|
#[serde(default = "default_true")]
|
||||||
|
pub require_lsp: bool,
|
||||||
|
#[serde(default)]
|
||||||
|
pub allow_preexisting_diagnostics: bool,
|
||||||
|
pub prompt: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub diff_assertions: BTreeMap<String, String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub thread_assertions: BTreeMap<String, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait(?Send)]
|
||||||
|
impl Example for DeclarativeExample {
|
||||||
|
fn meta(&self) -> ExampleMetadata {
|
||||||
|
self.metadata.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||||
|
cx.push_user_message(&self.prompt);
|
||||||
|
let _ = cx.run_to_end().await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
|
||||||
|
self.diff_assertions.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
|
||||||
|
self.thread_assertions.clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lists the `.toml` files found directly inside `examples_dir`.
///
/// The directory is canonicalized before it is read, so the returned paths are
/// built on the canonical form of `examples_dir`. Entries whose extension is
/// not exactly `toml` are ignored.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or read, or if an
/// individual directory entry cannot be inspected.
fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    // The signature already promises a `Result`, so propagate I/O failures
    // (e.g. a missing directory) instead of panicking.
    let examples_dir = std::fs::canonicalize(examples_dir)?;

    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.extension() == Some("toml".as_ref()) {
            result_paths.push(path);
        }
    }

    // `read_dir` makes no ordering guarantee; sort so the examples are always
    // returned in the same order.
    result_paths.sort();
    Ok(result_paths)
}
|
1023
crates/eval/src/instance.rs
Normal file
1023
crates/eval/src/instance.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
||||||
You are an expert software developer. Your task is to evaluate a diff produced by an AI agent in response to a prompt.
|
You are an expert software developer. Your task is to evaluate a diff produced by an AI agent
|
||||||
Here is the prompt and the diff:
|
in response to a prompt. Here is the prompt and the diff:
|
||||||
|
|
||||||
<prompt>
|
<prompt>
|
||||||
{{{prompt}}}
|
{{{prompt}}}
|
||||||
|
@ -9,17 +9,17 @@ Here is the prompt and the diff:
|
||||||
{{{repository_diff}}}
|
{{{repository_diff}}}
|
||||||
</diff>
|
</diff>
|
||||||
|
|
||||||
Evaluate how many of the following criteria were satisfied by the diff:
|
Evaluate whether or not the diff passes the following assertion:
|
||||||
|
|
||||||
<criteria>
|
<assertion>
|
||||||
{{criteria}}
|
{{assertion}}
|
||||||
- There are no changes unrelated to the prompt
|
</assertion>
|
||||||
</criteria>
|
|
||||||
|
|
||||||
Analyze the diff hunk by hunk, and structure your answer in the following XML format:
|
Analyze the diff hunk by hunk, and structure your answer in the following XML format:
|
||||||
|
|
||||||
```
|
```
|
||||||
<analysis>{YOUR ANALYSIS HERE}</analysis>
|
<analysis>{YOUR ANALYSIS HERE}</analysis>
|
||||||
<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
|
<passed>{PASSED_ASSERTION}</passed>
|
||||||
<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE DIFF}</passing_criteria>
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Where `PASSED_ASSERTION` is either `true` or `false`.
|
||||||
|
|
|
@ -1,19 +1,21 @@
|
||||||
You are an expert software developer. Your task is to evaluate an AI agent's messages and tool calls in this conversation:
|
You are an expert software developer.
|
||||||
|
Your task is to evaluate an AI agent's messages and tool calls in this conversation:
|
||||||
|
|
||||||
<messages>
|
<messages>
|
||||||
{{{messages}}}
|
{{{messages}}}
|
||||||
</messages>
|
</messages>
|
||||||
|
|
||||||
You must count how many of the following criteria were satisfied by the messages:
|
Evaluate whether or not the sequence of messages passes the following assertion:
|
||||||
|
|
||||||
<criteria>
|
<assertion>
|
||||||
{{{criteria}}}
|
{{{assertion}}}
|
||||||
</criteria>
|
</assertion>
|
||||||
|
|
||||||
Analyze the messages one by one, and structure your answer in the following XML format:
|
Analyze the messages one by one, and structure your answer in the following XML format:
|
||||||
|
|
||||||
```
|
```
|
||||||
<analysis>{YOUR ANALYSIS HERE}</analysis>
|
<analysis>{YOUR ANALYSIS HERE}</analysis>
|
||||||
<total_criteria>{THE TOTAL NUMBER OF CRITERIA THAT WERE LISTED}</total_criteria>
|
<passed>{PASSED_ASSERTION}</passed>
|
||||||
<passing_criteria>{THE NUMBER OF CRITERIA THAT ARE MET BY THE MESSAGES}</passing_criteria>
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Where `PASSED_ASSERTION` is either `true` or `false`.
|
||||||
|
|
|
@ -24,6 +24,10 @@ impl ToolMetrics {
|
||||||
*self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
|
*self.failure_counts.entry(tool_name.clone()).or_insert(0) += failure_count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.use_counts.is_empty() && self.failure_counts.is_empty()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Display for ToolMetrics {
|
impl Display for ToolMetrics {
|
||||||
|
@ -79,7 +83,7 @@ impl Display for ToolMetrics {
|
||||||
let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
|
let failure_count = self.failure_counts.get(&tool_name).cloned().unwrap_or(0);
|
||||||
writeln!(
|
writeln!(
|
||||||
f,
|
f,
|
||||||
"│{:^30}│{:^10}│{:^10}│{:^10}│",
|
"│{:<30}│{:^10}│{:^10}│{:^10}│",
|
||||||
tool_name,
|
tool_name,
|
||||||
use_count,
|
use_count,
|
||||||
failure_count,
|
failure_count,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue