Add eval for open_tool (#29801)

Also update the `open` tool's description to say it should only be used when the user requests it.

Release Notes:

- N/A
This commit is contained in:
Richard Feldman 2025-05-02 11:56:07 -04:00 committed by GitHub
parent e6f6b351b7
commit 9efc09c5a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 90 additions and 11 deletions

View file

@ -169,11 +169,14 @@ fn main() {
continue;
}
if meta.language_server.map_or(false, |language| {
!languages.contains(&language.file_extension)
}) {
skipped.push(meta.name);
continue;
if let Some(language) = meta.language_server {
if !languages.contains(&language.file_extension) {
panic!(
"Eval for {:?} could not be run because no language server was found for extension {:?}",
meta.name,
language.file_extension
);
}
}
// TODO: This creates a worktree per repetition. Ideally these examples should

View file

@ -14,12 +14,14 @@ use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
mod add_arg_to_trait_method;
mod code_block_citations;
mod file_search;
mod planets;
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
let mut threads: Vec<Rc<dyn Example>> = vec![
Rc::new(file_search::FileSearchExample),
Rc::new(add_arg_to_trait_method::AddArgToTraitMethod),
Rc::new(code_block_citations::CodeBlockCitations),
Rc::new(planets::Planets),
];
for example_path in list_declarative_examples(examples_dir).unwrap() {

View file

@ -0,0 +1,73 @@
use anyhow::Result;
use assistant_tool::Tool;
use assistant_tools::{OpenTool, TerminalTool};
use async_trait::async_trait;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
/// Eval example that asks for a plain JavaScript animated 3D solar system page
/// and asserts that the `open` and `terminal` tools go unused.
pub struct Planets;
// Eval: request a plain JavaScript solar system page and check that neither the
// `open` tool nor the `terminal` tool is used, since the prompt asks for neither.
#[async_trait(?Send)]
impl Example for Planets {
    /// Describes this eval: its name, the repository URL and revision it runs
    /// against, with no language server and no assertion limit configured.
    fn meta(&self) -> ExampleMetadata {
        // This commit in this repo is just the Apache2 license,
        // so effectively a blank project.
        let url = "https://github.com/roc-lang/roc".to_string();
        let revision = "59e49c75214f60b4dc4a45092292061c8c26ce27".to_string();

        ExampleMetadata {
            name: "planets".to_string(),
            url,
            revision,
            language_server: None,
            max_assertions: None,
        }
    }

    /// Pushes the solar-system prompt, runs the conversation to its end, then
    /// counts `open` and `terminal` tool uses and asserts both counts are zero.
    ///
    /// Returns an error only if running the conversation fails; the results of
    /// the two assertions are intentionally discarded via `.ok()`.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        // The raw string is kept flush-left so its contents stay exactly as written.
        let prompt = r#"
Make a plain JavaScript web page which renders an animated 3D solar system.
Let me drag to rotate the camera around.
Do not use npm.
"#
        .to_string();
        cx.push_user_message(prompt);

        let response = cx.run_to_end().await?;

        // Look the tool names up once instead of on every iteration.
        let open_tool_name = OpenTool.name();
        let terminal_tool_name = TerminalTool.name();

        let (mut open_count, mut terminal_count) = (0, 0);
        for tool_use in response.tool_uses() {
            if tool_use.name == open_tool_name {
                open_count += 1;
            } else if tool_use.name == terminal_tool_name {
                terminal_count += 1;
            }
        }

        // The open tool should only be used when requested, which it was not.
        cx.assert_eq(open_count, 0, "`open` tool was not used").ok();

        // No reason to use the terminal if not using npm.
        cx.assert_eq(terminal_count, 0, "`terminal` tool was not used")
            .ok();

        Ok(())
    }

    /// Lists the judge assertions applied to the resulting diff: the page is an
    /// animated solar system, the camera can be dragged, and no npm is involved.
    fn diff_assertions(&self) -> Vec<JudgeAssertion> {
        // Small local constructor so each assertion reads as an (id, description) pair.
        let assertion = |id: &str, description: &str| JudgeAssertion {
            id: id.to_string(),
            description: description.to_string(),
        };

        vec![
            assertion(
                "animated solar system",
                "This page should render a solar system, and it should be animated.",
            ),
            assertion(
                "drag to rotate camera",
                "The user can drag to rotate the camera around.",
            ),
            assertion(
                "plain JavaScript",
                "The code base uses plain JavaScript and no npm, along with HTML and CSS.",
            ),
        ]
    }
}