agent: Overwrite files more cautiously (#30649)

1. The `edit_file` tool tended to use `create_or_overwrite` a bit too
often, leading to corruption of long files. This change replaces the
boolean flag with an `EditFileMode` enum, which helps Agent make a more
deliberate choice when overwriting files.

With this change, the pass rate of the new eval increased from 10% to
100%.

2. eval: Added ability to run eval on top of an existing thread. Threads
can now be loaded from JSON files in the `SerializedThread` format,
which makes it easy to use real threads as starting points for
tests/evals.

3. Don't try to restore tool cards when running in headless or eval mode
-- we don't have a window to properly do this.

Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2025-05-14 10:40:44 +03:00 committed by GitHub
parent 22f76ac1a7
commit 255d8f7cf8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 425 additions and 37 deletions

View file

@ -54,15 +54,19 @@ impl ToolUseState {
/// Constructs a [`ToolUseState`] from the given list of [`SerializedMessage`]s.
///
/// Accepts a function to filter the tools that should be used to populate the state.
///
/// If `window` is `None` (e.g., when in headless mode or when running evals),
/// tool cards won't be deserialized
pub fn from_serialized_messages(
tools: Entity<ToolWorkingSet>,
messages: &[SerializedMessage],
project: Entity<Project>,
window: &mut Window,
window: Option<&mut Window>, // None in headless mode
cx: &mut App,
) -> Self {
let mut this = Self::new(tools);
let mut tool_names_by_id = HashMap::default();
let mut window = window;
for message in messages {
match message.role {
@ -107,12 +111,17 @@ impl ToolUseState {
},
);
if let Some(tool) = this.tools.read(cx).tool(tool_use, cx) {
if let Some(output) = tool_result.output.clone() {
if let Some(card) =
tool.deserialize_card(output, project.clone(), window, cx)
{
this.tool_result_cards.insert(tool_use_id, card);
if let Some(window) = &mut window {
if let Some(tool) = this.tools.read(cx).tool(tool_use, cx) {
if let Some(output) = tool_result.output.clone() {
if let Some(card) = tool.deserialize_card(
output,
project.clone(),
window,
cx,
) {
this.tool_result_cards.insert(tool_use_id, card);
}
}
}
}