assistant: Partial fix for HTML entities in tools params (#32148)

This problem seems to be specific to Opus 4. Eval shows improvement from
89% to 97%.

Closes: https://github.com/zed-industries/zed/issues/32060

Release Notes:

- N/A

Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>
This commit is contained in:
Oleksiy Syvokon 2025-06-05 13:36:55 +03:00 committed by GitHub
parent 8af984ae70
commit 3884de937b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 64 additions and 3 deletions

View file

@ -0,0 +1,59 @@
use agent_settings::AgentProfileId;
use anyhow::Result;
use assistant_tools::GrepToolInput;
use async_trait::async_trait;
use crate::example::{Example, ExampleContext, ExampleMetadata};
/// Eval example that asks the model to search for `>` or `<` and then checks that the
/// `regex` parameter of the resulting `grep` tool call contains no HTML entities.
pub struct GrepParamsEscapementExample;
/*
This eval checks that the model doesn't HTML-escape characters like `<` and
`>` in tool parameters.

Eval results by model:

                   original   +system_prompt change   +tool description
claude-opus-4        89%              92%                   97%+
claude-sonnet-4     100%
gpt-4.1-mini        100%
gemini-2.5-pro       98%
*/
#[async_trait(?Send)]
impl Example for GrepParamsEscapementExample {
    /// Describes this example: its name, the repository URL and revision it refers to,
    /// and the assertion/turn limits (one assertion, two turns).
    fn meta(&self) -> ExampleMetadata {
        ExampleMetadata {
            name: "grep_params_escapement".to_owned(),
            url: "https://github.com/octocat/hello-world".to_owned(),
            revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_owned(),
            language_server: None,
            max_assertions: Some(1),
            profile_id: AgentProfileId::default(),
            existing_thread_json: None,
            max_turns: Some(2),
        }
    }

    /// Sends a single prompt that mentions `>` and `<`, then verifies that the `grep`
    /// tool was called and that its `regex` input contains no HTML entities.
    ///
    /// # Errors
    ///
    /// Propagates any error from running the turns or from the `assert_some` check;
    /// the result of the final assertion is returned as-is.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        cx.push_user_message("Search for files containing the characters `>` or `<`");
        let response = cx.run_turns(2).await?;

        // A `grep` call whose input fails to parse is treated the same as no call at all.
        let grep_call = response.find_tool_call("grep");
        let grep_input = grep_call.and_then(|call| call.parse_input::<GrepToolInput>().ok());
        cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;

        // The `?` above has already returned if `assert_some` reported an error.
        let regex_is_unescaped = !contains_html_entities(&grep_input.unwrap().regex);
        cx.assert(regex_is_unescaped, "Tool parameters should not be escaped")
    }
}
/// Returns `true` when `pattern` contains text shaped like an HTML character reference.
///
/// Three forms are recognized: named with ASCII letters only (`&lt;`), decimal
/// (`&#60;`), and hexadecimal with a lowercase `x` (`&#x3C;`).
///
/// # Panics
///
/// Panics if the hard-coded expression fails to compile.
fn contains_html_entities(pattern: &str) -> bool {
    const HTML_ENTITY_PATTERN: &str = r"&[a-zA-Z]+;|&#[0-9]+;|&#x[0-9a-fA-F]+;";

    let entity_matcher = regex::Regex::new(HTML_ENTITY_PATTERN).unwrap();
    entity_matcher.is_match(pattern)
}

View file

@ -16,6 +16,7 @@ mod add_arg_to_trait_method;
mod code_block_citations;
mod comment_translation;
mod file_search;
mod grep_params_escapement;
mod overwrite_file;
mod planets;
@ -27,6 +28,7 @@ pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
Rc::new(planets::Planets),
Rc::new(comment_translation::CommentTranslation),
Rc::new(overwrite_file::FileOverwriteExample),
Rc::new(grep_params_escapement::GrepParamsEscapementExample),
];
for example_path in list_declarative_examples(examples_dir).unwrap() {