agent: Improve Gemini support in the edit_file tool (#31116)
This change improves `eval_extract_handle_command_output` results for all models: Model | Pass rate before | Pass rate after ----------------------------|------------------|---------------- claude-3.7-sonnet | 0.96 | 0.98 gemini-2.5-pro | 0.35 | 0.86 gpt-4.1 | 0.81 | 1.00 Part of this improvement comes from more robust evaluation, which now accepts multiple possible outcomes. Another part is from the prompt adaptation: addressing common Gemini failure modes, adding a few-shot example, and, in the final commit, auto-rewriting instructions for clarity and conciseness. This change still needs validation from larger end-to-end evals. Release Notes: - N/A
This commit is contained in:
parent
71fb17c507
commit
ab017129d8
15 changed files with 307 additions and 398 deletions
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -4402,6 +4402,15 @@ version = "0.1.13"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
|
||||
|
||||
[[package]]
|
||||
name = "diffy"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b545b8c50194bdd008283985ab0b31dba153cfd5b3066a92770634fbc0d7d291"
|
||||
dependencies = [
|
||||
"nu-ansi-term 0.50.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
|
@ -8677,6 +8686,7 @@ dependencies = [
|
|||
"clock",
|
||||
"collections",
|
||||
"ctor",
|
||||
"diffy",
|
||||
"ec4rs",
|
||||
"env_logger 0.11.8",
|
||||
"fs",
|
||||
|
@ -10191,6 +10201,15 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.50.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399"
|
||||
dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.3"
|
||||
|
@ -16389,7 +16408,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"nu-ansi-term 0.46.0",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue