From 91bc5aefa4d238b069295cd5e913a2ea17aee43f Mon Sep 17 00:00:00 2001 From: Oleksiy Syvokon Date: Wed, 21 May 2025 13:14:58 +0300 Subject: [PATCH] evals: Add system prompt to edit agent evals + fix edit agent (#31082) 1. Add system prompt: this is how it's called from threads. Previously, we were sending 2. Fix an issue with writing agent thought into a newly created empty file. Release Notes: - N/A --------- Co-authored-by: Ben Brandt Co-authored-by: Antonio Scandurra --- Cargo.lock | 1 + crates/assistant_tools/Cargo.toml | 1 + .../assistant_tools/src/edit_agent/evals.rs | 108 ++++++++++-------- .../src/templates/create_file_prompt.hbs | 11 +- 4 files changed, 66 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 745755662a..027c146609 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -688,6 +688,7 @@ dependencies = [ "portable-pty", "pretty_assertions", "project", + "prompt_store", "rand 0.8.5", "regex", "reqwest_client", diff --git a/crates/assistant_tools/Cargo.toml b/crates/assistant_tools/Cargo.toml index 3bf249c174..6d6baf2d54 100644 --- a/crates/assistant_tools/Cargo.toml +++ b/crates/assistant_tools/Cargo.toml @@ -41,6 +41,7 @@ open.workspace = true paths.workspace = true portable-pty.workspace = true project.workspace = true +prompt_store.workspace = true regex.workspace = true rust-embed.workspace = true schemars.workspace = true diff --git a/crates/assistant_tools/src/edit_agent/evals.rs b/crates/assistant_tools/src/edit_agent/evals.rs index c00a5e684a..19e72dd2ee 100644 --- a/crates/assistant_tools/src/edit_agent/evals.rs +++ b/crates/assistant_tools/src/edit_agent/evals.rs @@ -18,6 +18,7 @@ use language_model::{ LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel, }; use project::Project; +use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext}; use rand::prelude::*; use reqwest_client::ReqwestClient; use serde_json::json; @@ -895,52 +896,24 @@ fn eval_add_overwrite_test() { } #[test] -#[ignore] // until we figure out the mystery described in the comments -// #[cfg_attr(not(feature = "eval"), ignore)] +#[cfg_attr(not(feature = "eval"), ignore)] fn eval_create_empty_file() { // Check that Edit Agent can create a file without writing its // thoughts into it. This issue is not specific to empty files, but // it's easier to reproduce with them. // - // NOTE: For some mysterious reason, I could easily reproduce this - // issue roughly 90% of the time in actual Zed. However, once I - // extract the exact LLM request before the failure point and - // generate from that, the reproduction rate drops to 2%! - // - // Things I've tried to make sure it's not a fluke: disabling prompt - // caching, capturing the LLM request via a proxy server, running the - // prompt on Claude separately from evals. Every time it was mostly - // giving good outcomes, which doesn't match my actual experience in - // Zed. - // - // At some point I discovered that simply adding one insignificant - // space or a newline to the prompt suddenly results in an outcome I - // tried to reproduce almost perfectly. - // - // This weirdness happens even outside of the Zed code base and even - // when using a different subscription. The result is the same: an - // extra newline or space changes the model behavior significantly - // enough, so that the pass rate drops from 99% to 0-3% - // - // I have no explanation to this. - // // // Model | Pass rate // ============================================ // // -------------------------------------------- - // Prompt version: 2025-05-19 + // Prompt version: 2025-05-21 // -------------------------------------------- // - // claude-3.7-sonnet | 0.98 - // + one extra space in prompt | 0.00 - // + original prompt again | 0.99 - // + extra newline | 0.03 + // claude-3.7-sonnet | 1.00 // gemini-2.5-pro-preview-03-25 | 1.00 // gemini-2.5-flash-preview-04-17 | 1.00 - // + one extra space | 1.00 // gpt-4.1 | 1.00 - // + one extra space | 1.00 // // // TODO: gpt-4.1-mini errored 38 times: @@ -949,8 +922,8 @@ fn eval_create_empty_file() { let input_file_content = None; let expected_output_content = String::new(); eval( - 1, - 1.0, + 100, + 0.99, EvalInput::from_conversation( vec![ message(User, [text("Create a second empty todo file ")]), @@ -1442,24 +1415,59 @@ impl EditAgentTest { .update(cx, |project, cx| project.open_buffer(path, cx)) .await .unwrap(); - let conversation = LanguageModelRequest { - messages: eval.conversation, - tools: cx.update(|cx| { - ToolRegistry::default_global(cx) - .tools() - .into_iter() - .filter_map(|tool| { - let input_schema = tool - .input_schema(self.agent.model.tool_input_format()) - .ok()?; - Some(LanguageModelRequestTool { - name: tool.name(), - description: tool.description(), - input_schema, - }) + let tools = cx.update(|cx| { + ToolRegistry::default_global(cx) + .tools() + .into_iter() + .filter_map(|tool| { + let input_schema = tool + .input_schema(self.agent.model.tool_input_format()) + .ok()?; + Some(LanguageModelRequestTool { + name: tool.name(), + description: tool.description(), + input_schema, }) - .collect() - }), + }) + .collect::>() + }); + let tool_names = tools + .iter() + .map(|tool| tool.name.clone()) + .collect::>(); + let worktrees = vec![WorktreeContext { + root_name: "root".to_string(), + rules_file: None, + }]; + let prompt_builder = PromptBuilder::new(None)?; + let project_context = ProjectContext::new(worktrees, Vec::default()); + let system_prompt = prompt_builder.generate_assistant_system_prompt( + &project_context, + &ModelContext { + available_tools: tool_names, + }, + )?; + + let has_system_prompt = eval + .conversation + .first() + .map_or(false, |msg| msg.role == Role::System); + let messages = if has_system_prompt { + eval.conversation + } else { + [LanguageModelRequestMessage { + role: Role::System, + content: vec![MessageContent::Text(system_prompt)], + cache: true, + }] + .into_iter() + .chain(eval.conversation) + .collect::>() + }; + + let conversation = LanguageModelRequest { + messages, + tools, ..Default::default() }; let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) { diff --git a/crates/assistant_tools/src/templates/create_file_prompt.hbs b/crates/assistant_tools/src/templates/create_file_prompt.hbs index fb26af99a8..ffefee04ea 100644 --- a/crates/assistant_tools/src/templates/create_file_prompt.hbs +++ b/crates/assistant_tools/src/templates/create_file_prompt.hbs @@ -1,12 +1,13 @@ You are an expert engineer and your task is to write a new file from scratch. - +You MUST respond directly with the file's content, without explanations, additional text or triple backticks. +The text you output will be saved verbatim as the content of the file. +Tool calls have been disabled. You MUST start your response directly with the file's new content. + + {{path}} - + {{edit_description}} - -You MUST respond directly with the file's content, without explanations, additional text or triple backticks. -The text you output will be saved verbatim as the content of the file.