evals: Add system prompt to edit agent evals + fix edit agent (#31082)
1. Add system prompt: this is how it's called from threads. Previously, we were sending 2. Fix an issue with writing agent thought into a newly created empty file. Release Notes: - N/A --------- Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com> Co-authored-by: Antonio Scandurra <me@as-cii.com>
This commit is contained in:
parent
2f3564b85f
commit
91bc5aefa4
4 changed files with 66 additions and 55 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -688,6 +688,7 @@ dependencies = [
|
||||||
"portable-pty",
|
"portable-pty",
|
||||||
"pretty_assertions",
|
"pretty_assertions",
|
||||||
"project",
|
"project",
|
||||||
|
"prompt_store",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest_client",
|
"reqwest_client",
|
||||||
|
|
|
@ -41,6 +41,7 @@ open.workspace = true
|
||||||
paths.workspace = true
|
paths.workspace = true
|
||||||
portable-pty.workspace = true
|
portable-pty.workspace = true
|
||||||
project.workspace = true
|
project.workspace = true
|
||||||
|
prompt_store.workspace = true
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
rust-embed.workspace = true
|
rust-embed.workspace = true
|
||||||
schemars.workspace = true
|
schemars.workspace = true
|
||||||
|
|
|
@ -18,6 +18,7 @@ use language_model::{
|
||||||
LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
|
LanguageModelToolResultContent, LanguageModelToolUse, LanguageModelToolUseId, SelectedModel,
|
||||||
};
|
};
|
||||||
use project::Project;
|
use project::Project;
|
||||||
|
use prompt_store::{ModelContext, ProjectContext, PromptBuilder, WorktreeContext};
|
||||||
use rand::prelude::*;
|
use rand::prelude::*;
|
||||||
use reqwest_client::ReqwestClient;
|
use reqwest_client::ReqwestClient;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
@ -895,52 +896,24 @@ fn eval_add_overwrite_test() {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[ignore] // until we figure out the mystery described in the comments
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
// #[cfg_attr(not(feature = "eval"), ignore)]
|
|
||||||
fn eval_create_empty_file() {
|
fn eval_create_empty_file() {
|
||||||
// Check that Edit Agent can create a file without writing its
|
// Check that Edit Agent can create a file without writing its
|
||||||
// thoughts into it. This issue is not specific to empty files, but
|
// thoughts into it. This issue is not specific to empty files, but
|
||||||
// it's easier to reproduce with them.
|
// it's easier to reproduce with them.
|
||||||
//
|
//
|
||||||
// NOTE: For some mysterious reason, I could easily reproduce this
|
|
||||||
// issue roughly 90% of the time in actual Zed. However, once I
|
|
||||||
// extract the exact LLM request before the failure point and
|
|
||||||
// generate from that, the reproduction rate drops to 2%!
|
|
||||||
//
|
|
||||||
// Things I've tried to make sure it's not a fluke: disabling prompt
|
|
||||||
// caching, capturing the LLM request via a proxy server, running the
|
|
||||||
// prompt on Claude separately from evals. Every time it was mostly
|
|
||||||
// giving good outcomes, which doesn't match my actual experience in
|
|
||||||
// Zed.
|
|
||||||
//
|
|
||||||
// At some point I discovered that simply adding one insignificant
|
|
||||||
// space or a newline to the prompt suddenly results in an outcome I
|
|
||||||
// tried to reproduce almost perfectly.
|
|
||||||
//
|
|
||||||
// This weirdness happens even outside of the Zed code base and even
|
|
||||||
// when using a different subscription. The result is the same: an
|
|
||||||
// extra newline or space changes the model behavior significantly
|
|
||||||
// enough, so that the pass rate drops from 99% to 0-3%
|
|
||||||
//
|
|
||||||
// I have no explanation to this.
|
|
||||||
//
|
|
||||||
//
|
//
|
||||||
// Model | Pass rate
|
// Model | Pass rate
|
||||||
// ============================================
|
// ============================================
|
||||||
//
|
//
|
||||||
// --------------------------------------------
|
// --------------------------------------------
|
||||||
// Prompt version: 2025-05-19
|
// Prompt version: 2025-05-21
|
||||||
// --------------------------------------------
|
// --------------------------------------------
|
||||||
//
|
//
|
||||||
// claude-3.7-sonnet | 0.98
|
// claude-3.7-sonnet | 1.00
|
||||||
// + one extra space in prompt | 0.00
|
|
||||||
// + original prompt again | 0.99
|
|
||||||
// + extra newline | 0.03
|
|
||||||
// gemini-2.5-pro-preview-03-25 | 1.00
|
// gemini-2.5-pro-preview-03-25 | 1.00
|
||||||
// gemini-2.5-flash-preview-04-17 | 1.00
|
// gemini-2.5-flash-preview-04-17 | 1.00
|
||||||
// + one extra space | 1.00
|
|
||||||
// gpt-4.1 | 1.00
|
// gpt-4.1 | 1.00
|
||||||
// + one extra space | 1.00
|
|
||||||
//
|
//
|
||||||
//
|
//
|
||||||
// TODO: gpt-4.1-mini errored 38 times:
|
// TODO: gpt-4.1-mini errored 38 times:
|
||||||
|
@ -949,8 +922,8 @@ fn eval_create_empty_file() {
|
||||||
let input_file_content = None;
|
let input_file_content = None;
|
||||||
let expected_output_content = String::new();
|
let expected_output_content = String::new();
|
||||||
eval(
|
eval(
|
||||||
1,
|
100,
|
||||||
1.0,
|
0.99,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
message(User, [text("Create a second empty todo file ")]),
|
message(User, [text("Create a second empty todo file ")]),
|
||||||
|
@ -1442,24 +1415,59 @@ impl EditAgentTest {
|
||||||
.update(cx, |project, cx| project.open_buffer(path, cx))
|
.update(cx, |project, cx| project.open_buffer(path, cx))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let conversation = LanguageModelRequest {
|
let tools = cx.update(|cx| {
|
||||||
messages: eval.conversation,
|
ToolRegistry::default_global(cx)
|
||||||
tools: cx.update(|cx| {
|
.tools()
|
||||||
ToolRegistry::default_global(cx)
|
.into_iter()
|
||||||
.tools()
|
.filter_map(|tool| {
|
||||||
.into_iter()
|
let input_schema = tool
|
||||||
.filter_map(|tool| {
|
.input_schema(self.agent.model.tool_input_format())
|
||||||
let input_schema = tool
|
.ok()?;
|
||||||
.input_schema(self.agent.model.tool_input_format())
|
Some(LanguageModelRequestTool {
|
||||||
.ok()?;
|
name: tool.name(),
|
||||||
Some(LanguageModelRequestTool {
|
description: tool.description(),
|
||||||
name: tool.name(),
|
input_schema,
|
||||||
description: tool.description(),
|
|
||||||
input_schema,
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
.collect()
|
})
|
||||||
}),
|
.collect::<Vec<_>>()
|
||||||
|
});
|
||||||
|
let tool_names = tools
|
||||||
|
.iter()
|
||||||
|
.map(|tool| tool.name.clone())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let worktrees = vec![WorktreeContext {
|
||||||
|
root_name: "root".to_string(),
|
||||||
|
rules_file: None,
|
||||||
|
}];
|
||||||
|
let prompt_builder = PromptBuilder::new(None)?;
|
||||||
|
let project_context = ProjectContext::new(worktrees, Vec::default());
|
||||||
|
let system_prompt = prompt_builder.generate_assistant_system_prompt(
|
||||||
|
&project_context,
|
||||||
|
&ModelContext {
|
||||||
|
available_tools: tool_names,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let has_system_prompt = eval
|
||||||
|
.conversation
|
||||||
|
.first()
|
||||||
|
.map_or(false, |msg| msg.role == Role::System);
|
||||||
|
let messages = if has_system_prompt {
|
||||||
|
eval.conversation
|
||||||
|
} else {
|
||||||
|
[LanguageModelRequestMessage {
|
||||||
|
role: Role::System,
|
||||||
|
content: vec![MessageContent::Text(system_prompt)],
|
||||||
|
cache: true,
|
||||||
|
}]
|
||||||
|
.into_iter()
|
||||||
|
.chain(eval.conversation)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
};
|
||||||
|
|
||||||
|
let conversation = LanguageModelRequest {
|
||||||
|
messages,
|
||||||
|
tools,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
|
let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
You are an expert engineer and your task is to write a new file from scratch.
|
You are an expert engineer and your task is to write a new file from scratch.
|
||||||
|
|
||||||
<file_to_edit>
|
You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
|
||||||
|
The text you output will be saved verbatim as the content of the file.
|
||||||
|
Tool calls have been disabled. You MUST start your response directly with the file's new content.
|
||||||
|
|
||||||
|
<file_path>
|
||||||
{{path}}
|
{{path}}
|
||||||
</file_to_edit>
|
</file_path>
|
||||||
|
|
||||||
<edit_description>
|
<edit_description>
|
||||||
{{edit_description}}
|
{{edit_description}}
|
||||||
</edit_description>
|
</edit_description>
|
||||||
|
|
||||||
You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
|
|
||||||
The text you output will be saved verbatim as the content of the file.
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue