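//! Evals for the edit agent. Each test replays a recorded conversation that
//! ends in an `edit_file` tool call, lets the agent produce the edit, and
//! scores the result. The tests are ignored unless the crate is built with
//! the `eval` feature (presumably run via something like
//! `cargo test --features eval`).
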
use super::*;
use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
    list_directory_tool::ListDirectoryToolInput,
};
use Role::*;
use assistant_tool::ToolRegistry;
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext, Timer};
use http_client::StatusCode;
use indoc::{formatdoc, indoc};
use language_model::{
    LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
    LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
};
use project::Project;
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    path::Path,
    str::FromStr,
    sync::mpsc,
    time::Duration,
};
use util::path;

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    // Test how well the agent generates multiple edit hunks.
    //
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 0.99 (2025-06-14)
    // claude-sonnet-4             | 0.97 (2025-06-14)
    // gemini-2.5-pro-06-05        | 0.98 (2025-06-16)
    // gemini-2.5-flash            | 0.11 (2025-05-22)
    // gpt-4.1                     | 1.00 (2025-05-22)

    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let possible_diffs = vec![
        include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
        include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
    ];
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.
                        Do not document the method and do not add any comments.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    // Model                       | Pass rate
    // ----------------------------|----------
    // claude-3.7-sonnet           | 1.0 (2025-06-14)
    // claude-sonnet-4             | 0.96 (2025-06-14)
    // gemini-2.5-pro-06-05        | 1.0 (2025-06-16)
    // gemini-2.5-flash            |
    // gpt-4.1                     |
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_eq(output_file_content),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_translate_doc_comments() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
    let edit_description = "Translate all doc comments to Italian";
    eval(
        200,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the {input_file_path} file and edit it (without overwriting it),
                        translating all the doc comments to italian.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.96 (2025-06-14)
    // claude-sonnet-4                | 0.11 (2025-06-14)
    // gemini-2.5-pro-preview-latest  | 0.99 (2025-06-16)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(formatdoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{{language_name}}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.59 (2025-07-14)
    // claude-sonnet-4                | 0.81 (2025-07-14)
    // gemini-2.5-pro                 | 0.95 (2025-07-14)
    // gemini-2.5-flash-preview-04-17 | 0.78 (2025-07-14)
    // gpt-4.1                        | 0.00 (2025-07-14) (follows edit_description too literally)
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    let possible_diffs = vec![
        include_str!("evals/fixtures/disable_cursor_blinking/possible-01.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-02.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-03.diff"),
        include_str!("evals/fixtures/disable_cursor_blinking/possible-04.diff"),
    ];
    eval(
        100,
        0.51,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comments everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::assert_diff_any(possible_diffs),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    // Results for 2025-06-13
    //
    // The outcome of this evaluation depends heavily on the LINE_HINT_TOLERANCE
    // value. Higher values improve the pass rate but may sometimes cause
    // edits to be misapplied. In the context of this eval, this means
    // the agent might add from_pixels tests in incorrect locations
    // (e.g., at the beginning of the file), yet the evaluation may still
    // rate it highly.
    //
    // Model                        | Date       | Pass rate
    // =========================================================
    // claude-4.0-sonnet            | 2025-06-14 | 0.99
    // claude-3.7-sonnet            | 2025-06-14 | 0.88
    // gemini-2.5-pro-preview-06-05 | 2025-06-16 | 0.98
    // gpt-4.1                      |
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        // For whatever reason, this eval produces more mismatched tags.
        // Increasing for now, let's see if we can bring this down.
        0.25,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_weight() {
                                // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                            }

                            #[test]
                            fn test_core_text_to_css_font_stretch() {
                                // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                            }

                            #[test]
                            fn test_css_to_core_text_font_stretch() {
                                // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        EditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                            mode: EditFileMode::Edit,
                        },
                    )],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_zode() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.0 (2025-06-14)
    // claude-sonnet-4                | 1.0 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.0 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 | 1.0 (2025-05-22)
    // gpt-4.1                        | 1.0 (2025-05-22)
    let input_file_path = "root/zode.py";
    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        50,
        1.,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
                    [
                        tool_use(
                            "tool_1",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: "root/eval/react_test.py".into(),
                                start_line: None,
                                end_line: None,
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [
                        tool_result(
                            "tool_1",
                            "read_file",
                            include_str!("evals/fixtures/zode/react.py"),
                        ),
                        tool_result(
                            "tool_2",
                            "read_file",
                            include_str!("evals/fixtures/zode/react_test.py"),
                        ),
                    ],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now that I understand what we need to build, I'll create the main Python script:",
                        ),
                        tool_use(
                            "tool_3",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Create,
                            },
                        ),
                    ],
                ),
            ],
            input_content,
            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
                    if sample.text_after.starts_with(start) {
                        message.push_str(&format!("The sample starts with a {:?}\n", start));
                        break;
                    }
                }
                // Remove trailing newline.
                message.pop();

                if message.is_empty() {
                    Ok(EvalAssertionOutcome {
                        score: 100,
                        message: None,
                    })
                } else {
                    Ok(EvalAssertionOutcome {
                        score: 0,
                        message: Some(message),
                    })
                }
            }),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_add_overwrite_test() {
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 0.65 (2025-06-14)
    // claude-sonnet-4                | 0.07 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 0.35 (2025-05-22)
    // gemini-2.5-flash-preview-04-17 |
    // gpt-4.1                        |
    let input_file_path = "root/action_log.rs";
    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
    let edit_description = "Add a new test for overwriting a file in action_log.rs";
    eval(
        200,
        0.5, // TODO: make this eval better
        0.05,
        EvalInput::from_conversation(
            vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new test in `action_log.rs` to test overwriting a file.
                        That is, a file already exists, but we call `buffer_created` as if the file were new.
                        Take inspiration from all the other tests in the file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        indoc! {"
                            pub struct ActionLog [L13-20]
                              tracked_buffers [L15]
                              edited_since_project_diagnostics_check [L17]
                              project [L19]
                            impl ActionLog [L22-498]
                              pub fn new [L24-30]
                              pub fn project [L32-34]
                              pub fn checked_project_diagnostics [L37-39]
                              pub fn has_edited_files_since_project_diagnostics_check [L42-44]
                              fn track_buffer_internal [L46-101]
                              fn handle_buffer_event [L103-116]
                              fn handle_buffer_edited [L118-123]
                              fn handle_buffer_file_changed [L125-158]
                              async fn maintain_diff [L160-264]
                              pub fn buffer_read [L267-269]
                              pub fn buffer_created [L272-276]
                              pub fn buffer_edited [L279-287]
                              pub fn will_delete_buffer [L289-304]
                              pub fn keep_edits_in_range [L306-364]
                              pub fn reject_edits_in_ranges [L366-459]
                              pub fn keep_all_edits [L461-473]
                              pub fn changed_buffers [L476-482]
                              pub fn stale_buffers [L485-497]
                            fn apply_non_conflicting_edits [L500-561]
                            fn diff_snapshots [L563-585]
                            fn point_to_row_edit [L587-614]
                            enum ChangeAuthor [L617-620]
                              User [L618]
                              Agent [L619]
                            enum TrackedBufferStatus [L623-627]
                              Created [L624]
                              Modified [L625]
                              Deleted [L626]
                            struct TrackedBuffer [L629-641]
                              buffer [L630]
                              base_text [L631]
                              unreviewed_changes [L632]
                              status [L633]
                              version [L634]
                              diff [L635]
                              snapshot [L636]
                              diff_update [L637]
                              _open_lsp_handle [L638]
                              _maintain_diff [L639]
                              _subscription [L640]
                            impl TrackedBuffer [L643-657]
                              fn has_changes [L644-650]
                              fn schedule_diff_update [L652-656]
                            pub struct ChangedBuffer [L659-661]
                              pub diff [L660]
                            mod tests [L664-1574]
                              fn init_logger [L678-682]
                              fn init_test [L684-691]
                              async fn test_keep_edits [L694-769]
                              async fn test_deletions [L772-854]
                              async fn test_overlapping_user_edits [L857-951]
                              async fn test_creating_files [L954-1010]
                              async fn test_deleting_files [L1013-1120]
                              async fn test_reject_edits [L1123-1255]
                              async fn test_reject_multiple_edits [L1258-1331]
                              async fn test_reject_deleted_file [L1334-1388]
                              async fn test_reject_created_file [L1391-1443]
                              async fn test_random_diffs [L1446-1535]
                                fn quiesce [L1510-1534]
                              struct HunkStatus [L1538-1542]
                                range [L1539]
                                diff_status [L1540]
                                old_text [L1541]
                              fn unreviewed_hunks [L1544-1573]

                            Showing symbols 1-69 (total symbols: 69)

                            Using the line numbers in this outline, you can call this tool again while specifying
                            the start_line and end_line fields to see the implementations of symbols in the outline.
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
                        ),
                        tool_use(
                            "tool_2",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(953),
                                end_line: Some(1010),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 953..1010),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(
                            "Now let's look at another relevant test to understand how they're structured:",
                        ),
                        tool_use(
                            "tool_3",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(1012),
                                end_line: Some(1120),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1012..1120),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text("Now let's look at how `buffer_created` is implemented:"),
                        tool_use(
                            "tool_4",
                            "read_file",
                            ReadFileToolInput {
                                path: input_file_path.into(),
                                start_line: Some(271),
                                end_line: Some(276),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "read_file",
                        lines(input_file_content, 271..276),
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(indoc! {"
                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:

                            1. Create a file
                            2. Open a buffer for it
                            3. Track it in the action log
                            4. Modify the buffer
                            5. Call `buffer_created` again as if the file were new
                            6. Check that the changes are properly tracked

                            Let's write the test:
                        "}),
                        tool_use(
                            "tool_5",
                            "edit_file",
                            EditFileToolInput {
                                display_description: edit_description.into(),
                                path: input_file_path.into(),
                                mode: EditFileMode::Edit,
                            },
                        ),
                    ],
                ),
            ],
            Some(input_file_content.into()),
            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
        ),
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_create_empty_file() {
    // Check that the Edit Agent can create a file without writing its
    // thoughts into it. This issue is not specific to empty files, but
    // it's easier to reproduce with them.
    //
    // Model                          | Pass rate
    // ============================================
    //
    // claude-3.7-sonnet              | 1.00 (2025-06-14)
    // claude-sonnet-4                | 1.00 (2025-06-14)
    // gemini-2.5-pro-preview-03-25   | 1.00 (2025-05-21)
    // gemini-2.5-flash-preview-04-17 | 1.00 (2025-05-21)
    // gpt-4.1                        | 1.00 (2025-05-21)
    //
    // TODO: gpt-4.1-mini errored 38 times:
    // "data did not match any variant of untagged enum ResponseStreamResult"
    //
    let input_file_content = None;
    let expected_output_content = String::new();
    eval(
        100,
        0.99,
        0.05,
        EvalInput::from_conversation(
            vec![
                message(User, [text("Create a second empty todo file ")]),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I'll help you create a second empty todo file.
                            First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
                        "}),
                        tool_use(
                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                            "list_directory",
                            ListDirectoryToolInput {
                                path: "root".to_string(),
                            },
                        ),
                    ],
                ),
                message(
                    User,
                    [tool_result(
                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
                        "list_directory",
                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
                    )],
                ),
                message(
                    Assistant,
                    [
                        text(formatdoc! {"
                            I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
                        "}),
                        tool_use(
                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
                            "edit_file",
                            EditFileToolInput {
                                display_description: "Create empty TODO3 file".to_string(),
                                mode: EditFileMode::Create,
                                path: "root/TODO3".into(),
                            },
                        ),
                    ],
                ),
            ],
            input_file_content,
            // Bad behavior is to write something like
            // "I'll create an empty TODO3 file as requested."
            EvalAssertion::assert_eq(expected_output_content),
        ),
    );
}

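/// Builds a `LanguageModelRequestMessage` with the given role and content
/// parts. `cache` starts out `false`; the eval driver flips it on for the
/// final message so subsequent iterations reuse the cached prompt prefix.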
fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

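/// Returns the given zero-based `range` of lines from `input`, joined with
/// newlines.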
fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

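/// Builds an assistant `ToolUse` content part with the given id, tool name,
/// and serializable input; the input is stored both pretty-printed as
/// `raw_input` and as a `serde_json::Value`.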
fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

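/// Builds a non-error `ToolResult` content part answering the tool use with
/// the given id and name, carrying `result` as plain text.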
fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: LanguageModelToolResultContent::Text(result.into()),
        output: None,
    })
}

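/// A single eval case: a recorded conversation that ends in an `edit_file`
/// tool call, the initial buffer contents (if any), and the assertion used
/// to score the resulting edit.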
#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
    assertion: EvalAssertion,
}

impl EvalInput {
    fn from_conversation(
        conversation: Vec<LanguageModelRequestMessage>,
        input_content: Option<String>,
        assertion: EvalAssertion,
    ) -> Self {
        let msg = conversation.last().expect("Conversation must not be empty");
        if msg.role != Role::Assistant {
            panic!("Conversation must end with an assistant message");
        }
        let tool_use = msg
            .content
            .iter()
            .flat_map(|content| match content {
                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
                    Some(tool_use)
                }
                _ => None,
            })
            .next()
            .expect("Conversation must end with an edit_file tool use")
            .clone();

        let edit_file_input: EditFileToolInput =
            serde_json::from_value(tool_use.input.clone()).unwrap();

        EvalInput {
            conversation,
            edit_file_input,
            input_content,
            assertion,
        }
    }
}

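/// The result of running one eval iteration: the buffer text before and after
/// the agent's edit, the raw agent output, and a unified diff of the change.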
#[derive(Clone)]
struct EvalSample {
    text_before: String,
    text_after: String,
    edit_output: EditAgentOutput,
    diff: String,
}

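/// Object-safe wrapper around an async assertion function, so assertions of
/// different shapes can be stored behind `Arc<dyn AssertionFn>`.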
trait AssertionFn: 'static + Send + Sync {
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>>;
}

impl<F> AssertionFn for F
where
    F: 'static
        + Send
        + Sync
        + AsyncFn(
            &EvalSample,
            Arc<dyn LanguageModel>,
            &mut TestAppContext,
        ) -> Result<EvalAssertionOutcome>,
{
    fn assert<'a>(
        &'a self,
        sample: &'a EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &'a mut TestAppContext,
    ) -> LocalBoxFuture<'a, Result<EvalAssertionOutcome>> {
        (self)(sample, judge_model, cx).boxed_local()
    }
}

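/// How an eval scores a sample: exact text comparison (`assert_eq`), a match
/// against any of several acceptable diffs (`assert_diff_any`), or an
/// LLM-judged rubric over the diff (`judge_diff`). Judges must respond with a
/// `<score>0-100</score>` tag.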
#[derive(Clone)]
struct EvalAssertion(Arc<dyn AssertionFn>);

impl EvalAssertion {
    fn new<F>(f: F) -> Self
    where
        F: 'static
            + Send
            + Sync
            + AsyncFn(
                &EvalSample,
                Arc<dyn LanguageModel>,
                &mut TestAppContext,
            ) -> Result<EvalAssertionOutcome>,
    {
        EvalAssertion(Arc::new(f))
    }

    fn assert_eq(expected: impl Into<String>) -> Self {
        let expected = expected.into();
        Self::new(async move |sample, _judge, _cx| {
            Ok(EvalAssertionOutcome {
                score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
                    100
                } else {
                    0
                },
                message: None,
            })
        })
    }

    fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
        let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
        Self::new(async move |sample, _judge, _cx| {
            let matches = expected_diffs.iter().any(|possible_diff| {
                let expected =
                    language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
                strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
            });

            Ok(EvalAssertionOutcome {
                score: if matches { 100 } else { 0 },
                message: None,
            })
        })
    }

    fn judge_diff(assertions: &'static str) -> Self {
        Self::new(async move |sample, judge, cx| {
            let prompt = DiffJudgeTemplate {
                diff: sample.diff.clone(),
                assertions,
            }
            .render(&Templates::new())
            .unwrap();

            let request = LanguageModelRequest {
                messages: vec![LanguageModelRequestMessage {
                    role: Role::User,
                    content: vec![prompt.into()],
                    cache: false,
                }],
                thinking_allowed: true,
                ..Default::default()
            };
            let mut response = retry_on_rate_limit(async || {
                Ok(judge
                    .stream_completion_text(request.clone(), &cx.to_async())
                    .await?)
            })
            .await?;
            let mut output = String::new();
            while let Some(chunk) = response.stream.next().await {
                let chunk = chunk?;
                output.push_str(&chunk);
            }

            // Parse the score from the response.
            let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
            if let Some(captures) = re.captures(&output) {
                if let Some(score_match) = captures.get(1) {
                    let score = score_match.as_str().parse().unwrap_or(0);
                    return Ok(EvalAssertionOutcome {
                        score,
                        message: Some(output),
                    });
                }
            }

            anyhow::bail!("No score found in response. Raw output: {output}");
        })
    }

    async fn run(
        &self,
        input: &EvalSample,
        judge_model: Arc<dyn LanguageModel>,
        cx: &mut TestAppContext,
    ) -> Result<EvalAssertionOutcome> {
        self.0.assert(input, judge_model, cx).await
    }
}

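/// Runs `eval` for `iterations` samples, up to 32 concurrently, and panics if
/// the pass ratio falls below `expected_pass_ratio` or the ratio of
/// mismatched edit-parser tags exceeds `mismatched_tag_threshold`. A sample
/// counts as failed when its assertion scores below 80.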
fn eval(
    iterations: usize,
    expected_pass_ratio: f32,
    mismatched_tag_threshold: f32,
    mut eval: EvalInput,
) {
    let mut evaluated_count = 0;
    let mut failed_count = 0;
    report_progress(evaluated_count, failed_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        let semaphore = semaphore.clone();
        executor
            .spawn(async move {
                let _guard = semaphore.acquire().await;
                run_eval(eval, tx)
            })
            .detach();
    }
    drop(tx);

    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.sample.text_after.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, failed_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > mismatched_tag_threshold {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

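/// Runs a single eval sample on a fresh `TestAppContext` with a random seed
/// and reports the outcome over `tx`.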
fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    sample: EvalSample,
    assertion: EvalAssertionOutcome,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.sample.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.sample.edit_output.parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, failed_count: usize, iterations: usize) {
    let passed_count = evaluated_count - failed_count;
    let passed_ratio = if evaluated_count == 0 {
        0.0
    } else {
        passed_count as f64 / evaluated_count as f64
    };
    print!(
        "\r\x1b[KEvaluated {}/{} ({:.2}% passed)",
        evaluated_count,
        iterations,
        passed_ratio * 100.0
    );
    std::io::stdout().flush().unwrap();
}

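/// Harness for a single eval run: the edit agent under test, a `FakeFs`-backed
/// project rooted at `/root`, and the judge model used by `judge_diff`
/// assertions. The agent and judge models come from the `ZED_AGENT_MODEL` and
/// `ZED_JUDGE_MODEL` environment variables (`provider/model`, defaulting to
/// `anthropic/claude-3-7-sonnet-latest`).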
struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();

        let fs = FakeFs::new(cx.executor().clone());
        cx.update(|cx| {
            settings::init(cx);
            gpui_tokio::init(cx);
            let http_client = Arc::new(ReqwestClient::user_agent("agent tests").unwrap());
            cx.set_http_client(http_client);

            client::init_settings(cx);
            let client = Client::production(cx);
            let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

            settings::init(cx);
            Project::init_settings(cx);
            language::init(cx);
            language_model::init(client.clone(), cx);
            language_models::init(user_store.clone(), client.clone(), cx);
            crate::init(client.http_client(), cx);
        });

        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let agent_model = SelectedModel::from_str(
            &std::env::var("ZED_AGENT_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let judge_model = SelectedModel::from_str(
            &std::env::var("ZED_JUDGE_MODEL")
                .unwrap_or("anthropic/claude-3-7-sonnet-latest".into()),
        )
        .unwrap();
        let (agent_model, judge_model) = cx
            .update(|cx| {
                cx.spawn(async move |cx| {
                    let agent_model = Self::load_model(&agent_model, cx).await;
                    let judge_model = Self::load_model(&judge_model, cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        let edit_format = EditFormat::from_env(agent_model.clone()).unwrap();

        Self {
            agent: EditAgent::new(
                agent_model,
                project.clone(),
                action_log,
                Templates::new(),
                edit_format,
            ),
            project,
            judge_model,
        }
    }

    async fn load_model(
        selected_model: &SelectedModel,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| {
                    model.provider_id() == selected_model.provider
                        && model.id() == selected_model.model
                })
                .expect("Model not found");
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        let tools = cx.update(|cx| {
            ToolRegistry::default_global(cx)
                .tools()
                .into_iter()
                .filter_map(|tool| {
                    let input_schema = tool
                        .input_schema(self.agent.model.tool_input_format())
                        .ok()?;
                    Some(LanguageModelRequestTool {
                        name: tool.name(),
                        description: tool.description(),
                        input_schema,
                    })
                })
                .collect::<Vec<_>>()
        });
        let tool_names = tools
            .iter()
            .map(|tool| tool.name.clone())
            .collect::<Vec<_>>();
        let worktrees = vec![WorktreeContext {
            root_name: "root".to_string(),
            abs_path: Path::new("/path/to/root").into(),
            rules_file: None,
        }];
        let prompt_builder = PromptBuilder::new(None)?;
        let project_context = ProjectContext::new(worktrees, Vec::default());
        let system_prompt = prompt_builder.generate_assistant_system_prompt(
            &project_context,
            &ModelContext {
                available_tools: tool_names,
            },
        )?;

        let has_system_prompt = eval
            .conversation
            .first()
            .map_or(false, |msg| msg.role == Role::System);
        let messages = if has_system_prompt {
            eval.conversation
        } else {
            [LanguageModelRequestMessage {
                role: Role::System,
                content: vec![MessageContent::Text(system_prompt)],
                cache: true,
            }]
            .into_iter()
            .chain(eval.conversation)
            .collect::<Vec<_>>()
        };

        let conversation = LanguageModelRequest {
            messages,
            tools,
            thinking_allowed: true,
            ..Default::default()
        };

        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
            if let Some(input_content) = eval.input_content.as_deref() {
                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
            }
            retry_on_rate_limit(async || {
                self.agent
                    .edit(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        } else {
            retry_on_rate_limit(async || {
                self.agent
                    .overwrite(
                        buffer.clone(),
                        eval.edit_file_input.display_description.clone(),
                        &conversation,
                        &mut cx.to_async(),
                    )
                    .0
                    .await
            })
            .await?
        };

        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let sample = EvalSample {
            edit_output,
            diff: language::unified_diff(
                eval.input_content.as_deref().unwrap_or_default(),
                &buffer_text,
            ),
            text_before: eval.input_content.unwrap_or_default(),
            text_after: buffer_text,
        };
        let assertion = eval
            .assertion
            .run(&sample, self.judge_model.clone(), cx)
            .await?;

        Ok(EvalOutput { assertion, sample })
    }
}

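/// Retries `request` up to 20 times: rate limits and overloaded servers are
/// retried after the server-provided `retry_after` (default 5s) plus random
/// jitter; retryable upstream and transport errors back off linearly with
/// jitter; anything else propagates immediately.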
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
    let mut attempt = 0;
    loop {
        attempt += 1;
        match request().await {
            Ok(result) => return Ok(result),
            Err(err) => {
                if attempt > 20 {
                    return Err(err);
                }

                match err.downcast::<LanguageModelCompletionError>() {
                    Ok(err) => match &err {
                        LanguageModelCompletionError::RateLimitExceeded { retry_after, .. }
                        | LanguageModelCompletionError::ServerOverloaded { retry_after, .. } => {
                            let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
                            // Wait for the duration supplied, with some jitter to avoid all requests being made at the same time.
                            let jitter =
                                retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                            eprintln!(
                                "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                            );
                            Timer::after(retry_after + jitter).await;
                            continue;
                        }
                        LanguageModelCompletionError::UpstreamProviderError {
                            status,
                            retry_after,
                            ..
                        } => {
                            // Only retry for specific status codes.
                            let should_retry = matches!(
                                *status,
                                StatusCode::TOO_MANY_REQUESTS | StatusCode::SERVICE_UNAVAILABLE
                            ) || status.as_u16() == 529;

                            if !should_retry {
                                return Err(err.into());
                            }

                            // Use the server-provided retry_after if available, otherwise use a default.
                            let retry_after = retry_after.unwrap_or(Duration::from_secs(5));
                            let jitter =
                                retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                            eprintln!(
                                "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                            );
                            Timer::after(retry_after + jitter).await;
                            continue;
                        }
                        LanguageModelCompletionError::ApiInternalServerError { .. }
                        | LanguageModelCompletionError::ApiReadResponseError { .. }
                        | LanguageModelCompletionError::DeserializeResponse { .. }
                        | LanguageModelCompletionError::HttpSend { .. } => {
                            let retry_after = Duration::from_secs(attempt);
                            let jitter =
                                retry_after.mul_f64(rand::thread_rng().gen_range(0.0..1.0));
                            eprintln!(
                                "Attempt #{attempt}: {err}. Retry after {retry_after:?} + jitter of {jitter:?}"
                            );
                            Timer::after(retry_after + jitter).await;
                            continue;
                        }
                        _ => return Err(err.into()),
                    },
                    Err(err) => return Err(err),
                }
            }
        }
    }
}

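/// An assertion's verdict for one sample: a score from 0 to 100 (below 80
/// counts as a failure) and an optional explanation from the judge.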
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

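/// Drops blank and whitespace-only lines so text comparisons tolerate
/// differences in empty-line placement.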
fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}