Add new editing eval scenario and improve it substantially (#29997)

This improves the new eval scenario by ~80% (`0.29` vs `0.525`) without decreasing performance in the other evals. Release Notes: - Improved the performance of the `edit_file` tool.
2025-05-06 14:22:42 +02:00 · 2025-05-06 14:22:42 +02:00 · 07e6e49583
commit 07e6e49583
parent 6e9f8f997e
4 changed files with 1811 additions and 7 deletions
--- a/crates/assistant_tools/src/edit_agent.rs
+++ b/crates/assistant_tools/src/edit_agent.rs
@ -652,14 +652,16 @@ impl EditAgent {
 }

 fn fuzzy_eq(left: &str, right: &str) -> bool {
+    const THRESHOLD: f64 = 0.8;
+
    let min_levenshtein = left.len().abs_diff(right.len());
    let min_normalized_levenshtein =
-        1. - (min_levenshtein as f32 / cmp::max(left.len(), right.len()) as f32);
-    if min_normalized_levenshtein < 0.8 {
+        1. - (min_levenshtein as f64 / cmp::max(left.len(), right.len()) as f64);
+    if min_normalized_levenshtein < THRESHOLD {
        return false;
    }

-    strsim::normalized_levenshtein(left, right) >= 0.8
+    strsim::normalized_levenshtein(left, right) >= THRESHOLD
 }

 #[derive(Copy, Clone, Debug)]
--- a/crates/assistant_tools/src/edit_agent/evals.rs
+++ b/crates/assistant_tools/src/edit_agent/evals.rs
@ -267,7 +267,7 @@ fn eval_disable_cursor_blinking() {
    let output_file_content = include_str!("evals/fixtures/disable_cursor_blinking/after.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
-        100,
+        200,
        0.6, // TODO: make this eval better
        EvalInput {
            conversation: vec![
@ -623,6 +623,230 @@ fn eval_zode() {
    );
 }

+#[test]
+#[cfg_attr(not(feature = "eval"), ignore)]
+fn eval_add_overwrite_test() {
+    let input_file_path = "root/action_log.rs";
+    let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
+    let edit_description = "Add a new test for overwriting a file in action_log.rs";
+    eval(
+        200,
+        0.5, // TODO: make this eval better
+        EvalInput {
+            conversation: vec![
+                message(
+                    User,
+                    [text(indoc! {"
+                        Introduce a new test in `action_log.rs` to test overwriting a file.
+                        That is, a file already exists, but we call `buffer_created` as if the file were new.
+                        Take inspiration from all the other tests in the file.
+                    "})],
+                ),
+                message(
+                    Assistant,
+                    [tool_use(
+                        "tool_1",
+                        "read_file",
+                        ReadFileToolInput {
+                            path: input_file_path.into(),
+                            start_line: None,
+                            end_line: None,
+                        },
+                    )],
+                ),
+                message(
+                    User,
+                    [tool_result(
+                        "tool_1",
+                        "read_file",
+                        indoc! {"
+                            pub struct ActionLog [L13-20]
+                             tracked_buffers [L15]
+                             edited_since_project_diagnostics_check [L17]
+                             project [L19]
+                            impl ActionLog [L22-498]
+                             pub fn new [L24-30]
+                             pub fn project [L32-34]
+                             pub fn checked_project_diagnostics [L37-39]
+                             pub fn has_edited_files_since_project_diagnostics_check [L42-44]
+                             fn track_buffer_internal [L46-101]
+                             fn handle_buffer_event [L103-116]
+                             fn handle_buffer_edited [L118-123]
+                             fn handle_buffer_file_changed [L125-158]
+                             async fn maintain_diff [L160-264]
+                             pub fn buffer_read [L267-269]
+                             pub fn buffer_created [L272-276]
+                             pub fn buffer_edited [L279-287]
+                             pub fn will_delete_buffer [L289-304]
+                             pub fn keep_edits_in_range [L306-364]
+                             pub fn reject_edits_in_ranges [L366-459]
+                             pub fn keep_all_edits [L461-473]
+                             pub fn changed_buffers [L476-482]
+                             pub fn stale_buffers [L485-497]
+                            fn apply_non_conflicting_edits [L500-561]
+                            fn diff_snapshots [L563-585]
+                            fn point_to_row_edit [L587-614]
+                            enum ChangeAuthor [L617-620]
+                             User [L618]
+                             Agent [L619]
+                            enum TrackedBufferStatus [L623-627]
+                             Created [L624]
+                             Modified [L625]
+                             Deleted [L626]
+                            struct TrackedBuffer [L629-641]
+                             buffer [L630]
+                             base_text [L631]
+                             unreviewed_changes [L632]
+                             status [L633]
+                             version [L634]
+                             diff [L635]
+                             snapshot [L636]
+                             diff_update [L637]
+                             _open_lsp_handle [L638]
+                             _maintain_diff [L639]
+                             _subscription [L640]
+                            impl TrackedBuffer [L643-657]
+                             fn has_changes [L644-650]
+                             fn schedule_diff_update [L652-656]
+                            pub struct ChangedBuffer [L659-661]
+                             pub diff [L660]
+                            mod tests [L664-1574]
+                             fn init_logger [L678-682]
+                             fn init_test [L684-691]
+                             async fn test_keep_edits [L694-769]
+                             async fn test_deletions [L772-854]
+                             async fn test_overlapping_user_edits [L857-951]
+                             async fn test_creating_files [L954-1010]
+                             async fn test_deleting_files [L1013-1120]
+                             async fn test_reject_edits [L1123-1255]
+                             async fn test_reject_multiple_edits [L1258-1331]
+                             async fn test_reject_deleted_file [L1334-1388]
+                             async fn test_reject_created_file [L1391-1443]
+                             async fn test_random_diffs [L1446-1535]
+                              fn quiesce [L1510-1534]
+                             struct HunkStatus [L1538-1542]
+                              range [L1539]
+                              diff_status [L1540]
+                              old_text [L1541]
+                             fn unreviewed_hunks [L1544-1573]
+
+                            Showing symbols 1-69 (total symbols: 69)
+
+                            Using the line numbers in this outline, you can call this tool again while specifying
+                            the start_line and end_line fields to see the implementations of symbols in the outline.
+                        "},
+                    )],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text(
+                            "Let's look at the existing test cases that are related to file creation to understand the pattern:",
+                        ),
+                        tool_use(
+                            "tool_2",
+                            "read_file",
+                            ReadFileToolInput {
+                                path: input_file_path.into(),
+                                start_line: Some(953),
+                                end_line: Some(1010),
+                            },
+                        ),
+                    ],
+                ),
+                message(
+                    User,
+                    [tool_result(
+                        "tool_2",
+                        "read_file",
+                        lines(input_file_content, 953..1010),
+                    )],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text(
+                            "Now let's look at another relevant test to understand how they're structured:",
+                        ),
+                        tool_use(
+                            "tool_3",
+                            "read_file",
+                            ReadFileToolInput {
+                                path: input_file_path.into(),
+                                start_line: Some(1012),
+                                end_line: Some(1120),
+                            },
+                        ),
+                    ],
+                ),
+                message(
+                    User,
+                    [tool_result(
+                        "tool_3",
+                        "read_file",
+                        lines(input_file_content, 1012..1120),
+                    )],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text("Now let's look at how `buffer_created` is implemented:"),
+                        tool_use(
+                            "tool_4",
+                            "read_file",
+                            ReadFileToolInput {
+                                path: input_file_path.into(),
+                                start_line: Some(271),
+                                end_line: Some(276),
+                            },
+                        ),
+                    ],
+                ),
+                message(
+                    User,
+                    [tool_result(
+                        "tool_4",
+                        "read_file",
+                        lines(input_file_content, 271..276),
+                    )],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text(indoc! {"
+                            Based on my understanding of the existing tests and code, I'll now create a new test for overwriting a file. In this test, we'll:
+
+                            1. Create a file
+                            2. Open a buffer for it
+                            3. Track it in the action log
+                            4. Modify the buffer
+                            5. Call `buffer_created` again as if the file were new
+                            6. Check that the changes are properly tracked
+
+                            Let's write the test:
+                        "}),
+                        tool_use(
+                            "tool_5",
+                            "edit_file",
+                            StreamingEditFileToolInput {
+                                display_description: edit_description.into(),
+                                path: input_file_path.into(),
+                                create_or_overwrite: false,
+                            },
+                        ),
+                    ],
+                ),
+            ],
+            input_path: input_file_path.into(),
+            input_content: Some(input_file_content.into()),
+            edit_description: edit_description.into(),
+            assertion: EvalAssertion::judge_diff(
+                "A new test for overwritten files was created, without changing any previous test",
+            ),
+        },
+    );
+}
+
 fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
@ -879,7 +1103,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
-    if mismatched_tag_ratio > 0.02 {
+    if mismatched_tag_ratio > 0.05 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
--- a/crates/assistant_tools/src/edit_agent/evals/fixtures/add_overwrite_test/before.rs
+++ b/crates/assistant_tools/src/edit_agent/evals/fixtures/add_overwrite_test/before.rs
--- a/crates/assistant_tools/src/templates/edit_file_prompt.hbs
+++ b/crates/assistant_tools/src/templates/edit_file_prompt.hbs
@ -31,8 +31,12 @@ NEW TEXT 3 HERE

 Rules for editing:

- `old_text` represents lines in the input file that will be replaced with `new_text`. `old_text` MUST exactly match the existing file content, character for character, including indentation.
- Always include enough context around the lines you want to replace in `old_text` such that it's impossible to mistake them for other lines.
+- `old_text` represents lines in the input file that will be replaced with `new_text`.
+- `old_text` MUST exactly match the existing file content, character for character, including indentation.
+- `old_text` MUST NEVER come from the outline, but from actual lines in the file.
+- Strive to be minimal in the lines you replace in `old_text`:
+  - If the lines you want to replace are unique, you MUST include just those in the `old_text`.
+  - If the lines you want to replace are NOT unique, you MUST include enough context around them in `old_text` to distinguish them from other lines.
 - If you want to replace many occurrences of the same text, repeat the same `old_text`/`new_text` pair multiple times and I will apply them sequentially, one occurrence at a time.
 - When reporting multiple edits, each edit assumes the previous one has already been applied! Therefore, you must ensure `old_text` doesn't reference text that has already been modified by a previous edit.
 - Don't explain the edits, just report them.