agent: Overwrite files more cautiously (#30649)

1. The `edit_file` tool tended to set `create_or_overwrite` too often,
which corrupted long files. This change replaces the boolean flag with
an `EditFileMode` enum, which helps the agent make a more deliberate
choice when overwriting files.

With this change, the pass rate of the new eval increased from 10% to
100%.

2. eval: Added ability to run eval on top of an existing thread. Threads
can now be loaded from JSON files in the `SerializedThread` format,
which makes it easy to use real threads as starting points for
tests/evals.

3. Don't try to restore tool cards when running in headless or eval mode
-- we don't have a window to properly do this.

Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2025-05-14 10:40:44 +03:00 committed by GitHub
parent 22f76ac1a7
commit 255d8f7cf8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 425 additions and 37 deletions

View file

@ -42,7 +42,7 @@ use crate::list_directory_tool::ListDirectoryTool;
use crate::now_tool::NowTool;
use crate::thinking_tool::ThinkingTool;
pub use edit_file_tool::EditFileToolInput;
pub use edit_file_tool::{EditFileMode, EditFileToolInput};
pub use find_path_tool::FindPathToolInput;
pub use open_tool::OpenTool;
pub use read_file_tool::{ReadFileTool, ReadFileToolInput};

View file

@ -1,5 +1,9 @@
use super::*;
use crate::{ReadFileToolInput, edit_file_tool::EditFileToolInput, grep_tool::GrepToolInput};
use crate::{
ReadFileToolInput,
edit_file_tool::{EditFileMode, EditFileToolInput},
grep_tool::GrepToolInput,
};
use Role::*;
use anyhow::anyhow;
use assistant_tool::ToolRegistry;
@ -71,7 +75,7 @@ fn eval_extract_handle_command_output() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -127,7 +131,7 @@ fn eval_delete_run_git_blame() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -182,7 +186,7 @@ fn eval_translate_doc_comments() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -297,7 +301,7 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -372,7 +376,7 @@ fn eval_disable_cursor_blinking() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -566,7 +570,7 @@ fn eval_from_pixels_constructor() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
)],
),
@ -643,7 +647,7 @@ fn eval_zode() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: true,
mode: EditFileMode::Create,
},
),
],
@ -888,7 +892,7 @@ fn eval_add_overwrite_test() {
EditFileToolInput {
display_description: edit_description.into(),
path: input_file_path.into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
},
),
],

View file

@ -76,12 +76,22 @@ pub struct EditFileToolInput {
/// </example>
pub path: PathBuf,
/// If true, this tool will recreate the file from scratch.
/// If false, this tool will produce granular edits to an existing file.
/// The mode of operation on the file. Possible values:
/// - 'edit': Make granular edits to an existing file.
/// - 'create': Create a new file if it doesn't exist.
/// - 'overwrite': Replace the entire contents of an existing file.
///
/// When a file already exists or you just created it, always prefer editing
/// When a file already exists or you just created it, prefer editing
/// it as opposed to recreating it from scratch.
pub create_or_overwrite: bool,
pub mode: EditFileMode,
}
// Operation the `edit_file` tool performs on the target path; carried in
// `EditFileToolInput::mode`, where it replaces the former `create_or_overwrite` flag.
//
// `rename_all = "lowercase"` makes the variants (de)serialize as "edit", "create"
// and "overwrite".
//
// The tool's run path currently folds `Create` and `Overwrite` into the same
// overwrite branch; only `Edit` requires the file to already exist.
//
// NOTE(review): these are deliberately plain `//` comments. This enum derives
// `JsonSchema`, and schemars presumably copies `///` doc comments into the generated
// schema as descriptions -- confirm before converting these to doc comments.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum EditFileMode {
// Make granular edits to an existing file.
Edit,
// Create a new file if it doesn't exist.
Create,
// Replace the entire contents of an existing file.
Overwrite,
}
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
@ -195,7 +205,11 @@ impl Tool for EditFileTool {
.as_ref()
.map_or(false, |file| file.disk_state().exists())
})?;
if !input.create_or_overwrite && !exists {
let create_or_overwrite = match input.mode {
EditFileMode::Create | EditFileMode::Overwrite => true,
_ => false,
};
if !create_or_overwrite && !exists {
return Err(anyhow!("{} not found", input.path.display()));
}
@ -207,7 +221,7 @@ impl Tool for EditFileTool {
})
.await;
let (output, mut events) = if input.create_or_overwrite {
let (output, mut events) = if create_or_overwrite {
edit_agent.overwrite(
buffer.clone(),
input.display_description.clone(),
@ -876,7 +890,7 @@ mod tests {
let input = serde_json::to_value(EditFileToolInput {
display_description: "Some edit".into(),
path: "root/nonexistent_file.txt".into(),
create_or_overwrite: false,
mode: EditFileMode::Edit,
})
.unwrap();
Arc::new(EditFileTool)