assistant: Partial fix for HTML entities in tools params (#32148)
This problem seems to be specific to Opus 4. The eval shows an improvement from 89% to 97%.

Closes: https://github.com/zed-industries/zed/issues/32060

Release Notes:

- N/A

Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>
This commit is contained in:
parent
8af984ae70
commit
3884de937b
5 changed files with 64 additions and 3 deletions
|
@ -17,13 +17,13 @@ You are a highly skilled software engineer with extensive knowledge in many prog
|
|||
4. Use only the tools that are currently available.
|
||||
5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off.
|
||||
6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers.
|
||||
7. Avoid HTML entity escaping - use plain characters instead.
|
||||
|
||||
## Searching and Reading
|
||||
|
||||
If you are unsure how to fulfill the user's request, gather more information with tool calls and/or clarifying questions.
|
||||
|
||||
{{! TODO: If there are files, we should mention it but otherwise omit that fact }}
|
||||
{{#if has_tools}}
|
||||
If appropriate, use tool calls to explore the current project, which contains the following root directories:
|
||||
|
||||
{{#each worktrees}}
|
||||
|
@ -38,7 +38,6 @@ If appropriate, use tool calls to explore the current project, which contains th
|
|||
- As you learn about the structure of the project, use that information to scope `grep` searches to targeted subtrees of the project.
|
||||
- The user might specify a partial file path. If you don't know the full path, use `find_path` (not `grep`) before you read the file.
|
||||
{{/if}}
|
||||
{{/if}}
|
||||
{{else}}
|
||||
You are being tasked with providing a response, but you have no ability to use tools or to read or write any aspect of the user's system (other than any context the user might have provided to you).
|
||||
|
||||
|
|
|
@ -37,13 +37,13 @@ use crate::diagnostics_tool::DiagnosticsTool;
|
|||
use crate::edit_file_tool::EditFileTool;
|
||||
use crate::fetch_tool::FetchTool;
|
||||
use crate::find_path_tool::FindPathTool;
|
||||
use crate::grep_tool::GrepTool;
|
||||
use crate::list_directory_tool::ListDirectoryTool;
|
||||
use crate::now_tool::NowTool;
|
||||
use crate::thinking_tool::ThinkingTool;
|
||||
|
||||
pub use edit_file_tool::{EditFileMode, EditFileToolInput};
|
||||
pub use find_path_tool::FindPathToolInput;
|
||||
pub use grep_tool::{GrepTool, GrepToolInput};
|
||||
pub use open_tool::OpenTool;
|
||||
pub use read_file_tool::{ReadFileTool, ReadFileToolInput};
|
||||
pub use terminal_tool::TerminalTool;
|
||||
|
|
|
@ -6,3 +6,4 @@ Searches the contents of files in the project with a regular expression
|
|||
- Never use this tool to search for paths. Only search file contents with this tool.
|
||||
- Use this tool when you need to find files containing specific patterns
|
||||
- Results are paginated with 20 matches per page. Use the optional 'offset' parameter to request subsequent pages.
|
||||
- DO NOT use HTML entities solely to escape characters in the tool parameters.
|
||||
|
|
59
crates/eval/src/examples/grep_params_escapement.rs
Normal file
59
crates/eval/src/examples/grep_params_escapement.rs
Normal file
|
@ -0,0 +1,59 @@
|
|||
use agent_settings::AgentProfileId;
|
||||
use anyhow::Result;
|
||||
use assistant_tools::GrepToolInput;
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata};
|
||||
|
||||
/// Eval example that checks the model passes plain `<` / `>` characters, rather than
/// HTML entities, in the parameters of the `grep` tool.
pub struct GrepParamsEscapementExample;
|
||||
|
||||
/*
|
||||
|
||||
This eval checks that the model doesn't replace characters like `<` and `>` with
HTML entities (e.g. `&lt;`, `&gt;`) in tool parameters.
|
||||
|
||||
original +system_prompt change +tool description
|
||||
claude-opus-4 89% 92% 97%+
|
||||
claude-sonnet-4 100%
|
||||
gpt-4.1-mini 100%
|
||||
gemini-2.5-pro 98%
|
||||
|
||||
*/
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for GrepParamsEscapementExample {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
ExampleMetadata {
|
||||
name: "grep_params_escapement".to_string(),
|
||||
url: "https://github.com/octocat/hello-world".to_string(),
|
||||
revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
|
||||
language_server: None,
|
||||
max_assertions: Some(1),
|
||||
profile_id: AgentProfileId::default(),
|
||||
existing_thread_json: None,
|
||||
max_turns: Some(2),
|
||||
}
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
// cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precendence than `Editor` which is surprising to me, but might be how it works");
|
||||
cx.push_user_message("Search for files containing the characters `>` or `<`");
|
||||
let response = cx.run_turns(2).await?;
|
||||
let grep_input = response
|
||||
.find_tool_call("grep")
|
||||
.and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
|
||||
|
||||
cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
|
||||
|
||||
cx.assert(
|
||||
!contains_html_entities(&grep_input.unwrap().regex),
|
||||
"Tool parameters should not be escaped",
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether `pattern` contains something shaped like an HTML character reference.
///
/// Three forms are recognised, each of which must be terminated by `;`:
/// - named: `&` + an ASCII letter + any further ASCII letters/digits (`&lt;`, `&frac12;`)
/// - decimal: `&#` + one or more ASCII digits (`&#60;`)
/// - hexadecimal: `&#x` or `&#X` + one or more ASCII hex digits (`&#x3c;`, `&#X3C;`)
///
/// The check is purely syntactic: names are not validated against the list of entities
/// defined by HTML, so `&foo;` counts as a match.
fn contains_html_entities(pattern: &str) -> bool {
    /// True when `rest` starts with at least one char accepted by `is_allowed`,
    /// immediately followed by `;`.
    fn run_then_semicolon(rest: &str, is_allowed: impl Fn(char) -> bool) -> bool {
        let tail = rest.trim_start_matches(is_allowed);
        tail.len() < rest.len() && tail.starts_with(';')
    }

    // Every `&` is a potential start of a reference; examine what follows each one.
    pattern.match_indices('&').any(|(at, _)| {
        // `&` is a single byte, so `at + 1` is always a valid char boundary.
        let body = &pattern[at + 1..];
        if let Some(numeric) = body.strip_prefix('#') {
            match numeric.strip_prefix(['x', 'X']) {
                Some(hex) => run_then_semicolon(hex, |c| c.is_ascii_hexdigit()),
                None => run_then_semicolon(numeric, |c| c.is_ascii_digit()),
            }
        } else {
            // Named references start with a letter but may contain digits later on.
            body.starts_with(|c: char| c.is_ascii_alphabetic())
                && run_then_semicolon(body, |c| c.is_ascii_alphanumeric())
        }
    })
}
|
|
@ -16,6 +16,7 @@ mod add_arg_to_trait_method;
|
|||
mod code_block_citations;
|
||||
mod comment_translation;
|
||||
mod file_search;
|
||||
mod grep_params_escapement;
|
||||
mod overwrite_file;
|
||||
mod planets;
|
||||
|
||||
|
@ -27,6 +28,7 @@ pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
|||
Rc::new(planets::Planets),
|
||||
Rc::new(comment_translation::CommentTranslation),
|
||||
Rc::new(overwrite_file::FileOverwriteExample),
|
||||
Rc::new(grep_params_escapement::GrepParamsEscapementExample),
|
||||
];
|
||||
|
||||
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue