evals: Eval for creating an empty file (#31034)

This eval checks that Edit Agent can create an empty file without writing its thoughts into it. This issue is not specific to empty files, but it's easier to reproduce with them. For some mysterious reason, I could easily reproduce this issue roughly 90% of the time in actual Zed. However, once I extract the exact LLM request before the failure point and generate from that, the reproduction rate drops to 2%! Things I've tried to make sure it's not a fluke: disabling prompt caching, capturing the LLM request via a proxy server, running the prompt on Claude separately from evals. Every time it was mostly giving good outcomes, which doesn't match my actual experience in Zed. At some point I discovered that simply adding one insignificant space or a newline to the prompt suddenly results in an outcome I tried to reproduce almost perfectly. This weirdness happens even outside the Zed code base and even when using a different subscription. The result is the same: an extra newline or space changes the model behavior significantly enough, so that the pass rate drops from 99% to 0-3% I have no explanation to this. Release Notes: - N/A
2025-05-20 20:03:08 +03:00 · 2025-05-20 20:03:08 +03:00 · 5e5a124ae1
commit 5e5a124ae1
parent 65e751ca33
3 changed files with 206 additions and 68 deletions
--- a/crates/assistant_tools/src/edit_agent.rs
+++ b/crates/assistant_tools/src/edit_agent.rs
@ -24,6 +24,7 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::{cmp, iter, mem, ops::Range, path::PathBuf, sync::Arc, task::Poll};
 use streaming_diff::{CharOperation, StreamingDiff};
+use util::debug_panic;

 #[derive(Serialize)]
 struct CreateFilePromptTemplate {
@ -543,6 +544,11 @@ impl EditAgent {
                if last_message.content.is_empty() {
                    conversation.messages.pop();
                }
+            } else {
+                debug_panic!(
+                    "Last message must be an Assistant tool calling! Got {:?}",
+                    last_message.content
+                );
            }
        }

--- a/crates/assistant_tools/src/edit_agent/evals.rs
+++ b/crates/assistant_tools/src/edit_agent/evals.rs
@ -3,6 +3,7 @@ use crate::{
    ReadFileToolInput,
    edit_file_tool::{EditFileMode, EditFileToolInput},
    grep_tool::GrepToolInput,
+    list_directory_tool::ListDirectoryToolInput,
 };
 use Role::*;
 use anyhow::anyhow;
@ -40,8 +41,8 @@ fn eval_extract_handle_command_output() {
    eval(
        100,
        0.95,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(formatdoc! {"
@ -81,11 +82,9 @@ fn eval_extract_handle_command_output() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::assert_eq(output_file_content),
-        },
+            Some(input_file_content.into()),
+            EvalAssertion::assert_eq(output_file_content),
+        ),
    );
 }

@ -99,8 +98,8 @@ fn eval_delete_run_git_blame() {
    eval(
        100,
        0.95,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(formatdoc! {"
@ -137,11 +136,9 @@ fn eval_delete_run_git_blame() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::assert_eq(output_file_content),
-        },
+            Some(input_file_content.into()),
+            EvalAssertion::assert_eq(output_file_content),
+        ),
    );
 }

@ -154,8 +151,8 @@ fn eval_translate_doc_comments() {
    eval(
        200,
        1.,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(formatdoc! {"
@ -192,11 +189,9 @@ fn eval_translate_doc_comments() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::judge_diff("Doc comments were translated to Italian"),
-        },
+            Some(input_file_content.into()),
+            EvalAssertion::judge_diff("Doc comments were translated to Italian"),
+        ),
    );
 }

@ -210,8 +205,8 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    eval(
        100,
        0.95,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(formatdoc! {"
@ -307,14 +302,12 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::judge_diff(indoc! {"
+            Some(input_file_content.into()),
+            EvalAssertion::judge_diff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
-        },
+        ),
    );
 }

@ -325,10 +318,10 @@ fn eval_disable_cursor_blinking() {
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
-        200,
+        100,
        0.95,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(User, [text("Let's research how to cursor blinking works.")]),
                message(
                    Assistant,
@ -382,15 +375,13 @@ fn eval_disable_cursor_blinking() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::judge_diff(indoc! {"
+            Some(input_file_content.into()),
+            EvalAssertion::judge_diff(indoc! {"
                - Calls to BlinkManager in `observe_window_activation` were commented out
                - The call to `blink_manager.enable` above the call to show_cursor_names was commented out
                - All the edits have valid indentation
            "}),
-        },
+        ),
    );
 }

@ -403,8 +394,8 @@ fn eval_from_pixels_constructor() {
    eval(
        100,
        0.95,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(indoc! {"
@ -576,14 +567,12 @@ fn eval_from_pixels_constructor() {
                    )],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::judge_diff(indoc! {"
-                - The diff contains a new `from_pixels` constructor
-                - The diff contains new tests for the `from_pixels` constructor
-            "}),
-        },
+            Some(input_file_content.into()),
+            EvalAssertion::judge_diff(indoc! {"
+                    - The diff contains a new `from_pixels` constructor
+                    - The diff contains new tests for the `from_pixels` constructor
+                "}),
+        ),
    );
 }

@ -591,12 +580,13 @@ fn eval_from_pixels_constructor() {
 #[cfg_attr(not(feature = "eval"), ignore)]
 fn eval_zode() {
    let input_file_path = "root/zode.py";
+    let input_content = None;
    let edit_description = "Create the main Zode CLI script";
    eval(
        200,
        1.,
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]),
                message(
                    Assistant,
@ -654,10 +644,8 @@ fn eval_zode() {
                    ],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: None,
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::new(async move |sample, _, _cx| {
+            input_content,
+            EvalAssertion::new(async move |sample, _, _cx| {
                let invalid_starts = [' ', '`', '\n'];
                let mut message = String::new();
                for start in invalid_starts {
@ -681,7 +669,7 @@ fn eval_zode() {
                    })
                }
            }),
-        },
+        ),
    );
 }

@ -694,8 +682,8 @@ fn eval_add_overwrite_test() {
    eval(
        200,
        0.5, // TODO: make this eval better
-        EvalInput {
-            conversation: vec![
+        EvalInput::from_conversation(
+            vec![
                message(
                    User,
                    [text(indoc! {"
@ -899,13 +887,121 @@ fn eval_add_overwrite_test() {
                    ],
                ),
            ],
-            input_path: input_file_path.into(),
-            input_content: Some(input_file_content.into()),
-            edit_description: edit_description.into(),
-            assertion: EvalAssertion::judge_diff(
+            Some(input_file_content.into()),
+            EvalAssertion::judge_diff(
                "A new test for overwritten files was created, without changing any previous test",
            ),
-        },
+        ),
+    );
+}
+
+#[test]
+#[ignore] // until we figure out the mystery described in the comments
+// #[cfg_attr(not(feature = "eval"), ignore)]
+fn eval_create_empty_file() {
+    // Check that Edit Agent can create a file without writing its
+    // thoughts into it. This issue is not specific to empty files, but
+    // it's easier to reproduce with them.
+    //
+    // NOTE: For some mysterious reason, I could easily reproduce this
+    // issue roughly 90% of the time in actual Zed. However, once I
+    // extract the exact LLM request before the failure point and
+    // generate from that, the reproduction rate drops to 2%!
+    //
+    // Things I've tried to make sure it's not a fluke: disabling prompt
+    // caching, capturing the LLM request via a proxy server, running the
+    // prompt on Claude separately from evals. Every time it was mostly
+    // giving good outcomes, which doesn't match my actual experience in
+    // Zed.
+    //
+    // At some point I discovered that simply adding one insignificant
+    // space or a newline to the prompt suddenly results in an outcome I
+    // tried to reproduce almost perfectly.
+    //
+    // This weirdness happens even outside of the Zed code base and even
+    // when using a different subscription. The result is the same: an
+    // extra newline or space changes the model behavior significantly
+    // enough, so that the pass rate drops from 99% to 0-3%
+    //
+    // I have no explanation to this.
+    //
+    //
+    //  Model                          | Pass rate
+    // ============================================
+    //
+    // --------------------------------------------
+    //           Prompt version: 2025-05-19
+    // --------------------------------------------
+    //
+    //  claude-3.7-sonnet              |  0.98
+    //    + one extra space in prompt  |  0.00
+    //    + original prompt again      |  0.99
+    //    + extra newline              |  0.03
+    //  gemini-2.5-pro-preview-03-25   |  1.00
+    //  gemini-2.5-flash-preview-04-17 |  1.00
+    //    + one extra space            |  1.00
+    //  gpt-4.1                        |  1.00
+    //    + one extra space            |  1.00
+    //
+    //
+    // TODO: gpt-4.1-mini errored 38 times:
+    // "data did not match any variant of untagged enum ResponseStreamResult"
+    //
+    let input_file_content = None;
+    let expected_output_content = String::new();
+    eval(
+        1,
+        1.0,
+        EvalInput::from_conversation(
+            vec![
+                message(User, [text("Create a second empty todo file ")]),
+                message(
+                    Assistant,
+                    [
+                        text(formatdoc! {"
+                        I'll help you create a second empty todo file.
+                        First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one.
+                        "}),
+                        tool_use(
+                            "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+                            "list_directory",
+                            ListDirectoryToolInput {
+                                path: "root".to_string(),
+                            },
+                        ),
+                    ],
+                ),
+                message(
+                    User,
+                    [tool_result(
+                        "toolu_01GAF8TtsgpjKxCr8fgQLDgR",
+                        "list_directory",
+                        "root/TODO\nroot/TODO2\nroot/new.txt\n",
+                    )],
+                ),
+                message(
+                    Assistant,
+                    [
+                        text(formatdoc! {"
+                        I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory:
+                    "}),
+                        tool_use(
+                            "toolu_01Tb3iQ9griqSYMmVuykQPWU",
+                            "edit_file",
+                            EditFileToolInput {
+                                display_description: "Create empty TODO3 file".to_string(),
+                                mode: EditFileMode::Create,
+                                path: "root/TODO3".into(),
+                            },
+                        ),
+                    ],
+                ),
+            ],
+            input_file_content,
+            // Bad behavior is to write something like
+            // "I'll create an empty TODO3 file as requested."
+            EvalAssertion::assert_eq(expected_output_content),
+        ),
    );
 }

@ -964,12 +1060,46 @@ fn tool_result(
 #[derive(Clone)]
 struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
-    input_path: PathBuf,
+    edit_file_input: EditFileToolInput,
    input_content: Option<String>,
-    edit_description: String,
    assertion: EvalAssertion,
 }

+impl EvalInput {
+    fn from_conversation(
+        conversation: Vec<LanguageModelRequestMessage>,
+        input_content: Option<String>,
+        assertion: EvalAssertion,
+    ) -> Self {
+        let msg = conversation.last().expect("Conversation must not be empty");
+        if msg.role != Role::Assistant {
+            panic!("Conversation must end with an assistant message");
+        }
+        let tool_use = msg
+            .content
+            .iter()
+            .flat_map(|content| match content {
+                MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => {
+                    Some(tool_use)
+                }
+                _ => None,
+            })
+            .next()
+            .expect("Conversation must end with an edit_file tool use")
+            .clone();
+
+        let edit_file_input: EditFileToolInput =
+            serde_json::from_value(tool_use.input.clone()).unwrap();
+
+        EvalInput {
+            conversation,
+            edit_file_input,
+            input_content,
+            assertion,
+        }
+    }
+}
+
 #[derive(Clone)]
 struct EvalSample {
    text: String,
@ -1308,7 +1438,7 @@ impl EditAgentTest {
        let path = self
            .project
            .read_with(cx, |project, cx| {
-                project.find_project_path(eval.input_path, cx)
+                project.find_project_path(eval.edit_file_input.path, cx)
            })
            .unwrap();
        let buffer = self
@ -1336,11 +1466,13 @@ impl EditAgentTest {
            }),
            ..Default::default()
        };
-        let edit_output = if let Some(input_content) = eval.input_content.as_deref() {
-            buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
+        let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
+            if let Some(input_content) = eval.input_content.as_deref() {
+                buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
+            }
            let (edit_output, _) = self.agent.edit(
                buffer.clone(),
-                eval.edit_description,
+                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
@ -1348,7 +1480,7 @@ impl EditAgentTest {
        } else {
            let (edit_output, _) = self.agent.overwrite(
                buffer.clone(),
-                eval.edit_description,
+                eval.edit_file_input.display_description,
                &conversation,
                &mut cx.to_async(),
            );
--- a/crates/assistant_tools/src/edit_file_tool.rs
+++ b/crates/assistant_tools/src/edit_file_tool.rs
@ -38,7 +38,7 @@ use workspace::Workspace;

 pub struct EditFileTool;

-#[derive(Debug, Serialize, Deserialize, JsonSchema)]
+#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
 pub struct EditFileToolInput {
    /// A one-line, user-friendly markdown description of the edit. This will be
    /// shown in the UI and also passed to another model to perform the edit.