From 49887d69347788e55d8848b762b49f98e9dae8aa Mon Sep 17 00:00:00 2001
From: Richard Feldman
Date: Mon, 12 May 2025 04:52:03 -0400
Subject: [PATCH] Add no_tools_enabled eval (#30537)

This is our first eval of the Minimal tool profile. Right now they're all passing; the value of having it is to catch regressions in the system prompt (which has special logic in it for the case where no tools are enabled).

Release Notes:

- N/A
---
 .../eval/src/examples/no_tools_enabled.toml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 crates/eval/src/examples/no_tools_enabled.toml

diff --git a/crates/eval/src/examples/no_tools_enabled.toml b/crates/eval/src/examples/no_tools_enabled.toml
new file mode 100644
index 0000000000..8f8f66244a
--- /dev/null
+++ b/crates/eval/src/examples/no_tools_enabled.toml
@@ -0,0 +1,19 @@
+url = "https://github.com/zed-industries/zed"
+revision = "main"
+require_lsp = false
+prompt = """
+I need to explore the codebase to understand what files are available in the project. What can you tell me about the structure of the codebase?
+
+Please find all uses of the 'find_path' function in the src directory.
+
+Also, can you tell me what the capital of France is? And how does garbage collection work in programming languages?
+"""
+
+profile_name = "minimal"
+
+[thread_assertions]
+no_hallucinated_tool_calls = """The agent should not hallucinate tool calls - for example, by writing markdown code blocks that simulate commands like `find`, `grep`, `ls`, etc. - since no tools are available. However, it is totally fine if the agent describes to the user what should be done, e.g. telling the user \"You can run `find` to...\" etc."""
+
+doesnt_hallucinate_file_paths = """The agent should not make up file paths or pretend to know the structure of the project when tools are not available."""
+
+correctly_answers_general_questions = """The agent should correctly answer general knowledge questions about the capital of France and garbage collection without asking for more context, demonstrating it can still be helpful with areas it knows about."""