agent: Improve Gemini support in the edit_file tool (#31116)
This change improves `eval_extract_handle_command_output` results for all models:

| Model             | Pass rate before | Pass rate after |
|-------------------|------------------|-----------------|
| claude-3.7-sonnet | 0.96             | 0.98            |
| gemini-2.5-pro    | 0.35             | 0.86            |
| gpt-4.1           | 0.81             | 1.00            |

Part of this improvement comes from more robust evaluation, which now accepts multiple possible outcomes. Another part is from the prompt adaptation: addressing common Gemini failure modes, adding a few-shot example, and, in the final commit, auto-rewriting instructions for clarity and conciseness.

This change still needs validation from larger end-to-end evals.

Release Notes:

- N/A
This commit is contained in:
parent
71fb17c507
commit
ab017129d8
15 changed files with 307 additions and 398 deletions
|
@ -66,6 +66,7 @@ tree-sitter.workspace = true
|
|||
unicase = "2.6"
|
||||
util.workspace = true
|
||||
workspace-hack.workspace = true
|
||||
diffy = "0.4.2"
|
||||
|
||||
[dev-dependencies]
|
||||
collections = { workspace = true, features = ["test-support"] }
|
||||
|
|
|
@ -65,7 +65,9 @@ use std::{num::NonZeroU32, sync::OnceLock};
|
|||
use syntax_map::{QueryCursorHandle, SyntaxSnapshot};
|
||||
use task::RunnableTag;
|
||||
pub use task_context::{ContextProvider, RunnableRange};
|
||||
pub use text_diff::{DiffOptions, line_diff, text_diff, text_diff_with_options, unified_diff};
|
||||
pub use text_diff::{
|
||||
DiffOptions, apply_diff_patch, line_diff, text_diff, text_diff_with_options, unified_diff,
|
||||
};
|
||||
use theme::SyntaxTheme;
|
||||
pub use toolchain::{LanguageToolchainStore, Toolchain, ToolchainList, ToolchainLister};
|
||||
use tree_sitter::{self, Query, QueryCursor, WasmStore, wasmtime};
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use crate::{CharClassifier, CharKind, LanguageScope};
|
||||
use anyhow::{Context, anyhow};
|
||||
use imara_diff::{
|
||||
Algorithm, UnifiedDiffBuilder, diff,
|
||||
intern::{InternedInput, Token},
|
||||
|
@ -119,6 +120,12 @@ pub fn text_diff_with_options(
|
|||
edits
|
||||
}
|
||||
|
||||
/// Applies `patch` to `base_text` and returns the patched text.
///
/// The patch text is parsed with `diffy::Patch::from_str` and then applied to
/// `base_text` with `diffy::apply`.
///
/// # Errors
///
/// Returns an error carrying the context "Failed to parse patch" when `patch`
/// cannot be parsed, and the error reported by `diffy::apply` (wrapped in
/// `anyhow::Error`) when the parsed patch cannot be applied to `base_text`.
pub fn apply_diff_patch(base_text: &str, patch: &str) -> Result<String, anyhow::Error> {
|
||||
let patch = diffy::Patch::from_str(patch).context("Failed to parse patch")?;
|
||||
let result = diffy::apply(base_text, &patch);
|
||||
// Convert the apply error into `anyhow::Error` to match the return type.
result.map_err(|err| anyhow!(err))
|
||||
}
|
||||
|
||||
fn should_perform_word_diff_within_hunk(
|
||||
old_row_range: &Range<u32>,
|
||||
old_byte_range: &Range<usize>,
|
||||
|
@ -270,4 +277,12 @@ mod tests {
|
|||
]
|
||||
);
|
||||
}
|
||||
|
||||
// Round-trip check: a patch produced by `unified_diff(old_text, new_text)`,
// when applied to `old_text` via `apply_diff_patch`, must reproduce `new_text`.
// The fixture covers in-line word changes and an appended trailing line.
#[test]
|
||||
fn test_apply_diff_patch() {
|
||||
let old_text = "one two\nthree four five\nsix seven eight nine\nten\n";
|
||||
let new_text = "one two\nthree FOUR five\nsix SEVEN eight nine\nten\nELEVEN\n";
|
||||
let patch = unified_diff(old_text, new_text);
|
||||
assert_eq!(apply_diff_patch(old_text, &patch).unwrap(), new_text);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue