agent: Improve Gemini support in the edit_file tool (#31116)

This change improves `eval_extract_handle_command_output` results for
all models:

Model                       | Pass rate before | Pass rate after
----------------------------|------------------|----------------
claude-3.7-sonnet           |  0.96            | 0.98
gemini-2.5-pro              |  0.35            | 0.86
gpt-4.1                     |  0.81            | 1.00

Part of this improvement comes from more robust evaluation, which now
accepts multiple possible outcomes. Another part is from the prompt
adaptation: addressing common Gemini failure modes, adding a few-shot
example, and, in the final commit, auto-rewriting instructions for
clarity and conciseness.

This change still needs validation from larger end-to-end evals.


Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2025-05-22 12:01:43 +03:00 committed by GitHub
parent 71fb17c507
commit ab017129d8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 307 additions and 398 deletions

View file

@ -34,13 +34,30 @@ use util::path;
#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
// Test how well agent generates multiple edit hunks.
//
// Model | Pass rate
// ----------------------------|----------
// claude-3.7-sonnet | 0.98
// gemini-2.5-pro | 0.86
// gemini-2.5-flash | 0.11
// gpt-4.1 | 1.00
let input_file_path = "root/blame.rs";
let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
let possible_diffs = vec![
include_str!("evals/fixtures/extract_handle_command_output/possible-01.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-02.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-03.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-04.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-05.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-06.diff"),
include_str!("evals/fixtures/extract_handle_command_output/possible-07.diff"),
];
let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
eval(
100,
0.95,
0.7, // Taking the lower bar for Gemini
EvalInput::from_conversation(
vec![
message(
@ -49,6 +66,7 @@ fn eval_extract_handle_command_output() {
Read the `{input_file_path}` file and extract a method in
the final stanza of `run_git_blame` to deal with command failures,
call it `handle_command_output` and take the std::process::Output as the only parameter.
Do not document the method and do not add any comments.
Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
"})],
@ -83,7 +101,7 @@ fn eval_extract_handle_command_output() {
),
],
Some(input_file_content.into()),
EvalAssertion::assert_eq(output_file_content),
EvalAssertion::assert_diff_any(possible_diffs),
),
);
}
@ -649,7 +667,7 @@ fn eval_zode() {
let invalid_starts = [' ', '`', '\n'];
let mut message = String::new();
for start in invalid_starts {
if sample.text.starts_with(start) {
if sample.text_after.starts_with(start) {
message.push_str(&format!("The sample starts with a {:?}\n", start));
break;
}
@ -1074,7 +1092,8 @@ impl EvalInput {
#[derive(Clone)]
struct EvalSample {
text: String,
text_before: String,
text_after: String,
edit_output: EditAgentOutput,
diff: String,
}
@ -1131,7 +1150,7 @@ impl EvalAssertion {
let expected = expected.into();
Self::new(async move |sample, _judge, _cx| {
Ok(EvalAssertionOutcome {
score: if strip_empty_lines(&sample.text) == strip_empty_lines(&expected) {
score: if strip_empty_lines(&sample.text_after) == strip_empty_lines(&expected) {
100
} else {
0
@ -1141,6 +1160,22 @@ impl EvalAssertion {
})
}
fn assert_diff_any(expected_diffs: Vec<impl Into<String>>) -> Self {
let expected_diffs: Vec<String> = expected_diffs.into_iter().map(Into::into).collect();
Self::new(async move |sample, _judge, _cx| {
let matches = expected_diffs.iter().any(|possible_diff| {
let expected =
language::apply_diff_patch(&sample.text_before, possible_diff).unwrap();
strip_empty_lines(&expected) == strip_empty_lines(&sample.text_after)
});
Ok(EvalAssertionOutcome {
score: if matches { 100 } else { 0 },
message: None,
})
})
}
fn judge_diff(assertions: &'static str) -> Self {
Self::new(async move |sample, judge, cx| {
let prompt = DiffJudgeTemplate {
@ -1225,7 +1260,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
if output.assertion.score < 80 {
failed_count += 1;
failed_evals
.entry(output.sample.text.clone())
.entry(output.sample.text_after.clone())
.or_insert(Vec::new())
.push(output);
}
@ -1470,6 +1505,7 @@ impl EditAgentTest {
tools,
..Default::default()
};
let edit_output = if matches!(eval.edit_file_input.mode, EditFileMode::Edit) {
if let Some(input_content) = eval.input_content.as_deref() {
buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
@ -1498,7 +1534,8 @@ impl EditAgentTest {
eval.input_content.as_deref().unwrap_or_default(),
&buffer_text,
),
text: buffer_text,
text_before: eval.input_content.unwrap_or_default(),
text_after: buffer_text,
};
let assertion = eval
.assertion