diff --git a/crates/assistant_tools/src/edit_agent.rs b/crates/assistant_tools/src/edit_agent.rs index e1856cd46e..4925f2c02e 100644 --- a/crates/assistant_tools/src/edit_agent.rs +++ b/crates/assistant_tools/src/edit_agent.rs @@ -24,6 +24,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::{cmp, iter, mem, ops::Range, path::PathBuf, sync::Arc, task::Poll}; use streaming_diff::{CharOperation, StreamingDiff}; +use util::debug_panic; #[derive(Serialize)] struct CreateFilePromptTemplate { @@ -543,6 +544,11 @@ impl EditAgent { if last_message.content.is_empty() { conversation.messages.pop(); } + } else { + debug_panic!( + "Last message must be an Assistant tool calling! Got {:?}", + last_message.content + ); } } diff --git a/crates/assistant_tools/src/edit_agent/evals.rs b/crates/assistant_tools/src/edit_agent/evals.rs index 9ad716aadb..ee6f828a0c 100644 --- a/crates/assistant_tools/src/edit_agent/evals.rs +++ b/crates/assistant_tools/src/edit_agent/evals.rs @@ -3,6 +3,7 @@ use crate::{ ReadFileToolInput, edit_file_tool::{EditFileMode, EditFileToolInput}, grep_tool::GrepToolInput, + list_directory_tool::ListDirectoryToolInput, }; use Role::*; use anyhow::anyhow; @@ -40,8 +41,8 @@ fn eval_extract_handle_command_output() { eval( 100, 0.95, - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(formatdoc! {" @@ -81,11 +82,9 @@ fn eval_extract_handle_command_output() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::assert_eq(output_file_content), - }, + Some(input_file_content.into()), + EvalAssertion::assert_eq(output_file_content), + ), ); } @@ -99,8 +98,8 @@ fn eval_delete_run_git_blame() { eval( 100, 0.95, - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(formatdoc! {" @@ -137,11 +136,9 @@ fn eval_delete_run_git_blame() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::assert_eq(output_file_content), - }, + Some(input_file_content.into()), + EvalAssertion::assert_eq(output_file_content), + ), ); } @@ -154,8 +151,8 @@ fn eval_translate_doc_comments() { eval( 200, 1., - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(formatdoc! {" @@ -192,11 +189,9 @@ fn eval_translate_doc_comments() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::judge_diff("Doc comments were translated to Italian"), - }, + Some(input_file_content.into()), + EvalAssertion::judge_diff("Doc comments were translated to Italian"), + ), ); } @@ -210,8 +205,8 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() { eval( 100, 0.95, - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(formatdoc! {" @@ -307,14 +302,12 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::judge_diff(indoc! {" + Some(input_file_content.into()), + EvalAssertion::judge_diff(indoc! {" - The compile_parser_to_wasm method has been changed to use wasi-sdk - ureq is used to download the SDK for current platform and architecture "}), - }, + ), ); } @@ -325,10 +318,10 @@ fn eval_disable_cursor_blinking() { let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs"); let edit_description = "Comment out the call to `BlinkManager::enable`"; eval( - 200, + 100, 0.95, - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message(User, [text("Let's research how to cursor blinking works.")]), message( Assistant, @@ -382,15 +375,13 @@ fn eval_disable_cursor_blinking() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::judge_diff(indoc! {" + Some(input_file_content.into()), + EvalAssertion::judge_diff(indoc! {" - Calls to BlinkManager in `observe_window_activation` were commented out - The call to `blink_manager.enable` above the call to show_cursor_names was commented out - All the edits have valid indentation "}), - }, + ), ); } @@ -403,8 +394,8 @@ fn eval_from_pixels_constructor() { eval( 100, 0.95, - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(indoc! {" @@ -576,14 +567,12 @@ fn eval_from_pixels_constructor() { )], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::judge_diff(indoc! {" - - The diff contains a new `from_pixels` constructor - - The diff contains new tests for the `from_pixels` constructor - "}), - }, + Some(input_file_content.into()), + EvalAssertion::judge_diff(indoc! {" + - The diff contains a new `from_pixels` constructor + - The diff contains new tests for the `from_pixels` constructor + "}), + ), ); } @@ -591,12 +580,13 @@ fn eval_from_pixels_constructor() { #[cfg_attr(not(feature = "eval"), ignore)] fn eval_zode() { let input_file_path = "root/zode.py"; + let input_content = None; let edit_description = "Create the main Zode CLI script"; eval( 200, 1., - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message(User, [text(include_str!("evals/fixtures/zode/prompt.md"))]), message( Assistant, @@ -654,10 +644,8 @@ fn eval_zode() { ], ), ], - input_path: input_file_path.into(), - input_content: None, - edit_description: edit_description.into(), - assertion: EvalAssertion::new(async move |sample, _, _cx| { + input_content, + EvalAssertion::new(async move |sample, _, _cx| { let invalid_starts = [' ', '`', '\n']; let mut message = String::new(); for start in invalid_starts { @@ -681,7 +669,7 @@ fn eval_zode() { }) } }), - }, + ), ); } @@ -694,8 +682,8 @@ fn eval_add_overwrite_test() { eval( 200, 0.5, // TODO: make this eval better - EvalInput { - conversation: vec![ + EvalInput::from_conversation( + vec![ message( User, [text(indoc! {" @@ -899,13 +887,121 @@ fn eval_add_overwrite_test() { ], ), ], - input_path: input_file_path.into(), - input_content: Some(input_file_content.into()), - edit_description: edit_description.into(), - assertion: EvalAssertion::judge_diff( + Some(input_file_content.into()), + EvalAssertion::judge_diff( "A new test for overwritten files was created, without changing any previous test", ), - }, + ), + ); +} + +#[test] +#[ignore] // until we figure out the mystery described in the comments +// #[cfg_attr(not(feature = "eval"), ignore)] +fn eval_create_empty_file() { + // Check that Edit Agent can create a file without writing its + // thoughts into it. This issue is not specific to empty files, but + // it's easier to reproduce with them. + // + // NOTE: For some mysterious reason, I could easily reproduce this + // issue roughly 90% of the time in actual Zed. However, once I + // extract the exact LLM request before the failure point and + // generate from that, the reproduction rate drops to 2%! + // + // Things I've tried to make sure it's not a fluke: disabling prompt + // caching, capturing the LLM request via a proxy server, running the + // prompt on Claude separately from evals. Every time it was mostly + // giving good outcomes, which doesn't match my actual experience in + // Zed. + // + // At some point I discovered that simply adding one insignificant + // space or a newline to the prompt suddenly results in an outcome I + // tried to reproduce almost perfectly. + // + // This weirdness happens even outside of the Zed code base and even + // when using a different subscription. The result is the same: an + // extra newline or space changes the model behavior significantly + // enough, so that the pass rate drops from 99% to 0-3% + // + // I have no explanation to this. + // + // + // Model | Pass rate + // ============================================ + // + // -------------------------------------------- + // Prompt version: 2025-05-19 + // -------------------------------------------- + // + // claude-3.7-sonnet | 0.98 + // + one extra space in prompt | 0.00 + // + original prompt again | 0.99 + // + extra newline | 0.03 + // gemini-2.5-pro-preview-03-25 | 1.00 + // gemini-2.5-flash-preview-04-17 | 1.00 + // + one extra space | 1.00 + // gpt-4.1 | 1.00 + // + one extra space | 1.00 + // + // + // TODO: gpt-4.1-mini errored 38 times: + // "data did not match any variant of untagged enum ResponseStreamResult" + // + let input_file_content = None; + let expected_output_content = String::new(); + eval( + 1, + 1.0, + EvalInput::from_conversation( + vec![ + message(User, [text("Create a second empty todo file ")]), + message( + Assistant, + [ + text(formatdoc! {" + I'll help you create a second empty todo file. + First, let me examine the project structure to see if there's already a todo file, which will help me determine the appropriate name and location for the second one. + "}), + tool_use( + "toolu_01GAF8TtsgpjKxCr8fgQLDgR", + "list_directory", + ListDirectoryToolInput { + path: "root".to_string(), + }, + ), + ], + ), + message( + User, + [tool_result( + "toolu_01GAF8TtsgpjKxCr8fgQLDgR", + "list_directory", + "root/TODO\nroot/TODO2\nroot/new.txt\n", + )], + ), + message( + Assistant, + [ + text(formatdoc! {" + I can see there's already a `TODO` file in the `root` directory. Let me create a second empty todo file called `TODO3` in the same directory: + "}), + tool_use( + "toolu_01Tb3iQ9griqSYMmVuykQPWU", + "edit_file", + EditFileToolInput { + display_description: "Create empty TODO3 file".to_string(), + mode: EditFileMode::Create, + path: "root/TODO3".into(), + }, + ), + ], + ), + ], + input_file_content, + // Bad behavior is to write something like + // "I'll create an empty TODO3 file as requested." + EvalAssertion::assert_eq(expected_output_content), + ), ); } @@ -964,12 +1060,46 @@ fn tool_result( #[derive(Clone)] struct EvalInput { conversation: Vec, - input_path: PathBuf, + edit_file_input: EditFileToolInput, input_content: Option, - edit_description: String, assertion: EvalAssertion, } +impl EvalInput { + fn from_conversation( + conversation: Vec, + input_content: Option, + assertion: EvalAssertion, + ) -> Self { + let msg = conversation.last().expect("Conversation must not be empty"); + if msg.role != Role::Assistant { + panic!("Conversation must end with an assistant message"); + } + let tool_use = msg + .content + .iter() + .flat_map(|content| match content { + MessageContent::ToolUse(tool_use) if tool_use.name == "edit_file".into() => { + Some(tool_use) + } + _ => None, + }) + .next() + .expect("Conversation must end with an edit_file tool use") + .clone(); + + let edit_file_input: EditFileToolInput = + serde_json::from_value(tool_use.input.clone()).unwrap(); + + EvalInput { + conversation, + edit_file_input, + input_content, + assertion, + } + } +} + #[derive(Clone)] struct EvalSample { text: String, @@ -1308,7 +1438,7 @@ impl EditAgentTest { let path = self .project .read_with(cx, |project, cx| { - project.find_project_path(eval.input_path, cx) + project.find_project_path(eval.edit_file_input.path, cx) }) .unwrap(); let buffer = self @@ -1336,11 +1466,13 @@ impl EditAgentTest { }), ..Default::default() }; - let edit_output = if let Some(input_content) = eval.input_content.as_deref() { - buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx)); + let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) { + if let Some(input_content) = eval.input_content.as_deref() { + buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx)); + } let (edit_output, _) = self.agent.edit( buffer.clone(), - eval.edit_description, + eval.edit_file_input.display_description, &conversation, &mut cx.to_async(), ); @@ -1348,7 +1480,7 @@ impl EditAgentTest { } else { let (edit_output, _) = self.agent.overwrite( buffer.clone(), - eval.edit_description, + eval.edit_file_input.display_description, &conversation, &mut cx.to_async(), ); diff --git a/crates/assistant_tools/src/edit_file_tool.rs b/crates/assistant_tools/src/edit_file_tool.rs index 3f4f55c600..19cfa9a6b3 100644 --- a/crates/assistant_tools/src/edit_file_tool.rs +++ b/crates/assistant_tools/src/edit_file_tool.rs @@ -38,7 +38,7 @@ use workspace::Workspace; pub struct EditFileTool; -#[derive(Debug, Serialize, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct EditFileToolInput { /// A one-line, user-friendly markdown description of the edit. This will be /// shown in the UI and also passed to another model to perform the edit.