agent: Overwrite files more cautiously (#30649)
1. The `edit_file` tool tended to use `create_or_overwrite` a bit too often, leading to corruption of long files. This change replaces the boolean flag with an `EditFileMode` enum, which helps the Agent make a more deliberate choice when overwriting files. With this change, the pass rate of the new eval increased from 10% to 100%. 2. eval: Added the ability to run an eval on top of an existing thread. Threads can now be loaded from JSON files in the `SerializedThread` format, which makes it easy to use real threads as starting points for tests/evals. 3. Don't try to restore tool cards when running in headless or eval mode -- there is no window in which to do this properly. Release Notes: - N/A
This commit is contained in:
parent
22f76ac1a7
commit
255d8f7cf8
18 changed files with 425 additions and 37 deletions
49
crates/eval/src/examples/overwrite_file.rs
Normal file
49
crates/eval/src/examples/overwrite_file.rs
Normal file
|
@ -0,0 +1,49 @@
|
|||
use anyhow::Result;
|
||||
use assistant_settings::AgentProfileId;
|
||||
use assistant_tools::{EditFileMode, EditFileToolInput};
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata};
|
||||
|
||||
/// Eval example that resumes a previously recorded thread and checks that the
/// agent's `edit_file` tool call edits the file rather than creating or
/// overwriting it.
pub struct FileOverwriteExample;
|
||||
|
||||
/*
|
||||
This eval tests a fix for a destructive behavior of the `edit_file` tool.
|
||||
Previously, it would rewrite existing files too aggressively, which often
|
||||
resulted in content loss.
|
||||
|
||||
Pass rate before the fix: 10%
|
||||
Pass rate after the fix: 100%
|
||||
*/
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for FileOverwriteExample {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
let thread_json = include_str!("threads/overwrite-file.json");
|
||||
|
||||
ExampleMetadata {
|
||||
name: "file_overwrite".to_string(),
|
||||
url: "https://github.com/zed-industries/zed.git".to_string(),
|
||||
revision: "023a60806a8cc82e73bd8d88e63b4b07fc7a0040".to_string(),
|
||||
language_server: None,
|
||||
max_assertions: Some(1),
|
||||
profile_id: AgentProfileId::default(),
|
||||
existing_thread_json: Some(thread_json.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
let response = cx.run_turns(1).await?;
|
||||
let file_overwritten = if let Some(tool_use) = response.find_tool_call("edit_file") {
|
||||
let input = tool_use.parse_input::<EditFileToolInput>()?;
|
||||
match input.mode {
|
||||
EditFileMode::Edit => false,
|
||||
EditFileMode::Create | EditFileMode::Overwrite => true,
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
cx.assert(!file_overwritten, "File should be edited, not overwritten")
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue