assistant: Partial fix for HTML entities in tools params (#32148)
This problem seems to be specific to Opus 4. Eval shows improvement from 89% to 97%. Closes: https://github.com/zed-industries/zed/issues/32060 Release Notes: - N/A Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>
This commit is contained in:
parent
8af984ae70
commit
3884de937b
5 changed files with 64 additions and 3 deletions
|
@ -17,13 +17,13 @@ You are a highly skilled software engineer with extensive knowledge in many prog
|
||||||
4. Use only the tools that are currently available.
|
4. Use only the tools that are currently available.
|
||||||
5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off.
|
5. DO NOT use a tool that is not available just because it appears in the conversation. This means the user turned it off.
|
||||||
6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers.
|
6. NEVER run commands that don't terminate on their own such as web servers (like `npm run start`, `npm run dev`, `python -m http.server`, etc) or file watchers.
|
||||||
|
7. Avoid HTML entity escaping - use plain characters instead.
|
||||||
|
|
||||||
## Searching and Reading
|
## Searching and Reading
|
||||||
|
|
||||||
If you are unsure how to fulfill the user's request, gather more information with tool calls and/or clarifying questions.
|
If you are unsure how to fulfill the user's request, gather more information with tool calls and/or clarifying questions.
|
||||||
|
|
||||||
{{! TODO: If there are files, we should mention it but otherwise omit that fact }}
|
{{! TODO: If there are files, we should mention it but otherwise omit that fact }}
|
||||||
{{#if has_tools}}
|
|
||||||
If appropriate, use tool calls to explore the current project, which contains the following root directories:
|
If appropriate, use tool calls to explore the current project, which contains the following root directories:
|
||||||
|
|
||||||
{{#each worktrees}}
|
{{#each worktrees}}
|
||||||
|
@ -38,7 +38,6 @@ If appropriate, use tool calls to explore the current project, which contains th
|
||||||
- As you learn about the structure of the project, use that information to scope `grep` searches to targeted subtrees of the project.
|
- As you learn about the structure of the project, use that information to scope `grep` searches to targeted subtrees of the project.
|
||||||
- The user might specify a partial file path. If you don't know the full path, use `find_path` (not `grep`) before you read the file.
|
- The user might specify a partial file path. If you don't know the full path, use `find_path` (not `grep`) before you read the file.
|
||||||
{{/if}}
|
{{/if}}
|
||||||
{{/if}}
|
|
||||||
{{else}}
|
{{else}}
|
||||||
You are being tasked with providing a response, but you have no ability to use tools or to read or write any aspect of the user's system (other than any context the user might have provided to you).
|
You are being tasked with providing a response, but you have no ability to use tools or to read or write any aspect of the user's system (other than any context the user might have provided to you).
|
||||||
|
|
||||||
|
|
|
@ -37,13 +37,13 @@ use crate::diagnostics_tool::DiagnosticsTool;
|
||||||
use crate::edit_file_tool::EditFileTool;
|
use crate::edit_file_tool::EditFileTool;
|
||||||
use crate::fetch_tool::FetchTool;
|
use crate::fetch_tool::FetchTool;
|
||||||
use crate::find_path_tool::FindPathTool;
|
use crate::find_path_tool::FindPathTool;
|
||||||
use crate::grep_tool::GrepTool;
|
|
||||||
use crate::list_directory_tool::ListDirectoryTool;
|
use crate::list_directory_tool::ListDirectoryTool;
|
||||||
use crate::now_tool::NowTool;
|
use crate::now_tool::NowTool;
|
||||||
use crate::thinking_tool::ThinkingTool;
|
use crate::thinking_tool::ThinkingTool;
|
||||||
|
|
||||||
pub use edit_file_tool::{EditFileMode, EditFileToolInput};
|
pub use edit_file_tool::{EditFileMode, EditFileToolInput};
|
||||||
pub use find_path_tool::FindPathToolInput;
|
pub use find_path_tool::FindPathToolInput;
|
||||||
|
pub use grep_tool::{GrepTool, GrepToolInput};
|
||||||
pub use open_tool::OpenTool;
|
pub use open_tool::OpenTool;
|
||||||
pub use read_file_tool::{ReadFileTool, ReadFileToolInput};
|
pub use read_file_tool::{ReadFileTool, ReadFileToolInput};
|
||||||
pub use terminal_tool::TerminalTool;
|
pub use terminal_tool::TerminalTool;
|
||||||
|
|
|
@ -6,3 +6,4 @@ Searches the contents of files in the project with a regular expression
|
||||||
- Never use this tool to search for paths. Only search file contents with this tool.
|
- Never use this tool to search for paths. Only search file contents with this tool.
|
||||||
- Use this tool when you need to find files containing specific patterns
|
- Use this tool when you need to find files containing specific patterns
|
||||||
- Results are paginated with 20 matches per page. Use the optional 'offset' parameter to request subsequent pages.
|
- Results are paginated with 20 matches per page. Use the optional 'offset' parameter to request subsequent pages.
|
||||||
|
- DO NOT use HTML entities solely to escape characters in the tool parameters.
|
||||||
|
|
59
crates/eval/src/examples/grep_params_escapement.rs
Normal file
59
crates/eval/src/examples/grep_params_escapement.rs
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
use agent_settings::AgentProfileId;
|
||||||
|
use anyhow::Result;
|
||||||
|
use assistant_tools::GrepToolInput;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
|
||||||
|
use crate::example::{Example, ExampleContext, ExampleMetadata};
|
||||||
|
|
||||||
|
/// Eval example that asks the model to search for the characters `>` or `<`
/// and checks that the `regex` it passes to the `grep` tool contains no HTML
/// entities.
pub struct GrepParamsEscapementExample;
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
This eval checks that the model doesn't use HTML escapement for characters like `<` and
|
||||||
|
`>` in tool parameters.
|
||||||
|
|
||||||
|
original +system_prompt change +tool description
|
||||||
|
claude-opus-4 89% 92% 97%+
|
||||||
|
claude-sonnet-4 100%
|
||||||
|
gpt-4.1-mini 100%
|
||||||
|
gemini-2.5-pro 98%
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#[async_trait(?Send)]
|
||||||
|
impl Example for GrepParamsEscapementExample {
|
||||||
|
fn meta(&self) -> ExampleMetadata {
|
||||||
|
ExampleMetadata {
|
||||||
|
name: "grep_params_escapement".to_string(),
|
||||||
|
url: "https://github.com/octocat/hello-world".to_string(),
|
||||||
|
revision: "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d".to_string(),
|
||||||
|
language_server: None,
|
||||||
|
max_assertions: Some(1),
|
||||||
|
profile_id: AgentProfileId::default(),
|
||||||
|
existing_thread_json: None,
|
||||||
|
max_turns: Some(2),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||||
|
// cx.push_user_message("How does the precedence/specificity work with Keymap contexts? I am seeing that `MessageEditor > Editor` is lower precendence than `Editor` which is surprising to me, but might be how it works");
|
||||||
|
cx.push_user_message("Search for files containing the characters `>` or `<`");
|
||||||
|
let response = cx.run_turns(2).await?;
|
||||||
|
let grep_input = response
|
||||||
|
.find_tool_call("grep")
|
||||||
|
.and_then(|tool_use| tool_use.parse_input::<GrepToolInput>().ok());
|
||||||
|
|
||||||
|
cx.assert_some(grep_input.as_ref(), "`grep` tool should be called")?;
|
||||||
|
|
||||||
|
cx.assert(
|
||||||
|
!contains_html_entities(&grep_input.unwrap().regex),
|
||||||
|
"Tool parameters should not be escaped",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if `pattern` contains something shaped like an HTML
/// character reference.
///
/// Three forms are recognized, each starting with `&` and ending with `;`:
/// - named references such as `&lt;` or `&frac12;` (an ASCII letter followed
///   by ASCII letters or digits),
/// - decimal references such as `&#60;`,
/// - hexadecimal references such as `&#x3C;` or `&#X3C;`.
///
/// The check is purely syntactic: names are not validated against the list of
/// entities that HTML actually defines.
fn contains_html_entities(pattern: &str) -> bool {
    // Every piece after the first begins right after an `&`. A reference body
    // can never contain `&`, so if a reference starts at that `&`, its body is
    // exactly the text up to the first `;` in the piece.
    pattern.split('&').skip(1).any(|after_ampersand| {
        after_ampersand
            .split_once(';')
            .is_some_and(|(body, _)| is_character_reference_body(body))
    })
}

/// Checks the text found between the `&` and the `;` of a candidate reference.
fn is_character_reference_body(body: &str) -> bool {
    if let Some(numeric) = body.strip_prefix('#') {
        let hex_digits = numeric
            .strip_prefix('x')
            .or_else(|| numeric.strip_prefix('X'));
        return match hex_digits {
            Some(hex) => !hex.is_empty() && hex.bytes().all(|b| b.is_ascii_hexdigit()),
            None => !numeric.is_empty() && numeric.bytes().all(|b| b.is_ascii_digit()),
        };
    }

    // Named reference: must start with a letter; digits may follow.
    let mut bytes = body.bytes();
    bytes.next().is_some_and(|first| first.is_ascii_alphabetic())
        && bytes.all(|b| b.is_ascii_alphanumeric())
}
|
|
@ -16,6 +16,7 @@ mod add_arg_to_trait_method;
|
||||||
mod code_block_citations;
|
mod code_block_citations;
|
||||||
mod comment_translation;
|
mod comment_translation;
|
||||||
mod file_search;
|
mod file_search;
|
||||||
|
mod grep_params_escapement;
|
||||||
mod overwrite_file;
|
mod overwrite_file;
|
||||||
mod planets;
|
mod planets;
|
||||||
|
|
||||||
|
@ -27,6 +28,7 @@ pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
||||||
Rc::new(planets::Planets),
|
Rc::new(planets::Planets),
|
||||||
Rc::new(comment_translation::CommentTranslation),
|
Rc::new(comment_translation::CommentTranslation),
|
||||||
Rc::new(overwrite_file::FileOverwriteExample),
|
Rc::new(overwrite_file::FileOverwriteExample),
|
||||||
|
Rc::new(grep_params_escapement::GrepParamsEscapementExample),
|
||||||
];
|
];
|
||||||
|
|
||||||
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue