eval: Fine-grained assertions (#29246)
- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
parent
0d3fe474db
commit
ce1a674eba
18 changed files with 1969 additions and 1229 deletions
53
crates/eval/src/examples/file_search.rs
Normal file
53
crates/eval/src/examples/file_search.rs
Normal file
|
@ -0,0 +1,53 @@
|
|||
use anyhow::Result;
|
||||
use assistant_tools::PathSearchToolInput;
|
||||
use async_trait::async_trait;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata};
|
||||
|
||||
pub struct FileSearchExample;
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for FileSearchExample {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
ExampleMetadata {
|
||||
name: "file_search".to_string(),
|
||||
url: "https://github.com/zed-industries/zed.git".to_string(),
|
||||
revision: "03ecb88fe30794873f191ddb728f597935b3101c".to_string(),
|
||||
language_server: None,
|
||||
max_assertions: Some(4),
|
||||
}
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
const FILENAME: &str = "find_replace_file_tool.rs";
|
||||
cx.push_user_message(format!(
|
||||
r#"
|
||||
Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
|
||||
|
||||
The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
|
||||
markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
|
||||
background for lines that were added. We should have a div per diff line.
|
||||
"#
|
||||
));
|
||||
|
||||
let response = cx.run_turn().await?;
|
||||
let tool_use = response.expect_tool("path_search", cx)?;
|
||||
let input = tool_use.expect_input::<PathSearchToolInput>(cx)?;
|
||||
|
||||
let glob = input.glob;
|
||||
cx.assert(
|
||||
glob.ends_with(FILENAME),
|
||||
format!("glob ends with `{FILENAME}`"),
|
||||
)?;
|
||||
|
||||
let without_filename = glob.replace(FILENAME, "");
|
||||
let matches = Regex::new("(\\*\\*|zed)/(\\*\\*?/)?")
|
||||
.unwrap()
|
||||
.is_match(&without_filename);
|
||||
|
||||
cx.assert(matches, "glob starts with either `**` or `zed`")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
43
crates/eval/src/examples/find_and_replace_diff_card.toml
Normal file
43
crates/eval/src/examples/find_and_replace_diff_card.toml
Normal file
|
@ -0,0 +1,43 @@
|
|||
url = "https://github.com/zed-industries/zed.git"
|
||||
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
|
||||
language_extension = "rs"
|
||||
|
||||
prompt = """
|
||||
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
|
||||
The card should implement the `Render` trait.
|
||||
|
||||
The card should show a diff. It should be a beautifully presented diff.
|
||||
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
|
||||
I want to see a red background for lines that were deleted and a green background for lines
|
||||
that were added. We should have a div per diff line.
|
||||
"""
|
||||
|
||||
[diff_assertions]
|
||||
|
||||
modify_find_and_replace_tool = """
|
||||
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
|
||||
The struct should contain an `output` field that is the same as the task we were returning before,
|
||||
and a new `card` field that contains a view for the card.
|
||||
"""
|
||||
|
||||
card_implementation = """
|
||||
The card should be a view that displays a diff.
|
||||
Each line in the diff should be colored according to whether it was added, removed or unchanged.
|
||||
"""
|
||||
|
||||
[thread_assertions]
|
||||
|
||||
path_search = """
|
||||
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
|
||||
(*Not* grep, for example, or reading the file based on a guess at the path.)
|
||||
This is because we gave the model a filename and it needs to turn that into a real path.
|
||||
"""
|
||||
|
||||
read_file_from_path_search = """
|
||||
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
|
||||
"""
|
||||
|
||||
symbol_search = """
|
||||
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
|
||||
on what path the Render trait might be in.
|
||||
"""
|
128
crates/eval/src/examples/mod.rs
Normal file
128
crates/eval/src/examples/mod.rs
Normal file
|
@ -0,0 +1,128 @@
|
|||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use serde::Deserialize;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
rc::Rc,
|
||||
};
|
||||
use util::serde::default_true;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
|
||||
|
||||
mod file_search;
|
||||
|
||||
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
||||
let mut threads: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];
|
||||
|
||||
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||
threads.push(Rc::new(DeclarativeExample::load(&example_path).unwrap()));
|
||||
}
|
||||
|
||||
threads
|
||||
}
|
||||
|
||||
struct DeclarativeExample {
|
||||
metadata: ExampleMetadata,
|
||||
prompt: String,
|
||||
diff_assertions: Vec<JudgeAssertion>,
|
||||
thread_assertions: Vec<JudgeAssertion>,
|
||||
}
|
||||
|
||||
impl DeclarativeExample {
|
||||
pub fn load(example_path: &Path) -> Result<Self> {
|
||||
let name = Self::name_from_path(example_path);
|
||||
let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
|
||||
|
||||
let language_server = if base.require_lsp {
|
||||
Some(crate::example::LanguageServer {
|
||||
file_extension: base
|
||||
.language_extension
|
||||
.expect("Language extension is required when require_lsp = true"),
|
||||
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let metadata = ExampleMetadata {
|
||||
name,
|
||||
url: base.url,
|
||||
revision: base.revision,
|
||||
language_server,
|
||||
max_assertions: None,
|
||||
};
|
||||
|
||||
Ok(DeclarativeExample {
|
||||
metadata,
|
||||
prompt: base.prompt,
|
||||
thread_assertions: base
|
||||
.thread_assertions
|
||||
.into_iter()
|
||||
.map(|(id, description)| JudgeAssertion { id, description })
|
||||
.collect(),
|
||||
diff_assertions: base
|
||||
.diff_assertions
|
||||
.into_iter()
|
||||
.map(|(id, description)| JudgeAssertion { id, description })
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn name_from_path(path: &Path) -> String {
|
||||
path.file_stem().unwrap().to_string_lossy().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct ExampleToml {
|
||||
pub url: String,
|
||||
pub revision: String,
|
||||
pub language_extension: Option<String>,
|
||||
pub insert_id: Option<String>,
|
||||
#[serde(default = "default_true")]
|
||||
pub require_lsp: bool,
|
||||
#[serde(default)]
|
||||
pub allow_preexisting_diagnostics: bool,
|
||||
pub prompt: String,
|
||||
#[serde(default)]
|
||||
pub diff_assertions: BTreeMap<String, String>,
|
||||
#[serde(default)]
|
||||
pub thread_assertions: BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for DeclarativeExample {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
self.metadata.clone()
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
cx.push_user_message(&self.prompt);
|
||||
let _ = cx.run_to_end().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
self.diff_assertions.clone()
|
||||
}
|
||||
|
||||
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
self.thread_assertions.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Lists the `.toml` files that sit directly inside `examples_dir`.
///
/// The directory is canonicalized first, so the returned paths are absolute. Entries are not
/// searched recursively. The result is sorted because the order in which `read_dir` yields
/// entries is platform and filesystem dependent.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or read, or if reading any
/// directory entry fails.
fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    let examples_dir = std::fs::canonicalize(examples_dir)?;

    let mut example_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.extension() == Some("toml".as_ref()) {
            example_paths.push(path);
        }
    }

    example_paths.sort();
    Ok(example_paths)
}
|
Loading…
Add table
Add a link
Reference in a new issue