eval: Fine-grained assertions (#29246)

- Support programmatic examples
([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file
([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400
src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Agus Zubiaga 2025-04-22 23:58:58 -03:00 committed by GitHub
parent 0d3fe474db
commit ce1a674eba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1969 additions and 1229 deletions

View file

@ -0,0 +1,53 @@
use anyhow::Result;
use assistant_tools::PathSearchToolInput;
use async_trait::async_trait;
use regex::Regex;
use crate::example::{Example, ExampleContext, ExampleMetadata};
/// Programmatic eval example: the agent is given only a bare filename and is
/// expected to respond with a `path_search` tool call whose glob targets that
/// file. The checks themselves live in this type's `Example` implementation.
pub struct FileSearchExample;
#[async_trait(?Send)]
impl Example for FileSearchExample {
    /// Describes this example: its name, the repository URL and revision it
    /// runs against, and its assertion limit. No language server is requested.
    fn meta(&self) -> ExampleMetadata {
        let name = String::from("file_search");
        let url = String::from("https://github.com/zed-industries/zed.git");
        let revision = String::from("03ecb88fe30794873f191ddb728f597935b3101c");

        ExampleMetadata {
            name,
            url,
            revision,
            language_server: None,
            max_assertions: Some(4),
        }
    }

    /// Sends a single user message that mentions a file only by name, runs one
    /// turn, and then checks the `path_search` tool call found in the response:
    ///
    /// 1. its glob must end with the filename, and
    /// 2. the remainder of the glob must match the `**`/`zed` prefix pattern.
    ///
    /// Returns an error as soon as the turn, a tool expectation, or an
    /// assertion reports one.
    async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
        const FILENAME: &str = "find_replace_file_tool.rs";

        let prompt = format!(
            r#"
Look at the `{FILENAME}`. I want to implement a card for it. The card should implement the `Render` trait.
The card should show a diff. It should be a beautifully presented diff. The card "box" should look like what we show for
markdown codeblocks (look at `MarkdownElement`). I want to see a red background for lines that were deleted and a green
background for lines that were added. We should have a div per diff line.
"#
        );
        cx.push_user_message(prompt);

        let response = cx.run_turn().await?;
        let path_search_call = response.expect_tool("path_search", cx)?;
        let glob = path_search_call
            .expect_input::<PathSearchToolInput>(cx)?
            .glob;

        let targets_file = glob.ends_with(FILENAME);
        cx.assert(targets_file, format!("glob ends with `{FILENAME}`"))?;

        // NOTE(review): the pattern is not anchored, so `is_match` accepts the
        // prefix anywhere in the glob, while the assertion text says "starts
        // with". Confirm which of the two is intended before tightening it.
        let prefix_pattern = Regex::new("(\\*\\*|zed)/(\\*\\*?/)?").unwrap();
        let directory_part = glob.replace(FILENAME, "");
        cx.assert(
            prefix_pattern.is_match(&directory_part),
            "glob starts with either `**` or `zed`",
        )?;

        Ok(())
    }
}

View file

@ -0,0 +1,43 @@
# Declarative eval example. The loader derives the example's name from this
# file's stem and copies `url` / `revision` into the example metadata.
url = "https://github.com/zed-industries/zed.git"
revision = "38fcadf9481d018543c65f36ac3bafeba190179b"
# `require_lsp` is not set here, so the loader's serde default applies; the
# loader needs `language_extension` whenever `require_lsp` is true.
language_extension = "rs"
# Sent verbatim as the single user message of the conversation.
prompt = """
Look at the `find_replace_file_tool.rs`. I want to implement a card for it.
The card should implement the `Render` trait.
The card should show a diff. It should be a beautifully presented diff.
The card "box" should look like what we show for markdown codeblocks (look at `MarkdownElement`).
I want to see a red background for lines that were deleted and a green background for lines
that were added. We should have a div per diff line.
"""
# Each `key = """text"""` entry below becomes one judge assertion:
# the key is the assertion id and the text is its description.
[diff_assertions]
modify_find_and_replace_tool = """
The changes must replace the previous output returned by `FindReplaceFileTool` with the new `ToolResult` struct.
The struct should contain an `output` field that is the same as the task we were returning before,
and a new `card` field that contains a view for the card.
"""
card_implementation = """
The card should be a view that displays a diff.
Each line in the diff should be colored according to whether it was added, removed or unchanged.
"""
# Same id/description format as `[diff_assertions]`, kept in a separate table.
[thread_assertions]
path_search = """
The first tool call should be to path search including "find_replace_file_tool.rs" in the string.
(*Not* grep, for example, or reading the file based on a guess at the path.)
This is because we gave the model a filename and it needs to turn that into a real path.
"""
read_file_from_path_search = """
After obtaining the correct path of "zed/crates/assistant_tools/src/find_replace_file_tool.rs", it should read the contents of that path.
"""
symbol_search = """
When trying to find information about the Render trait, it should *not* begin with a path search, because it doesn't yet have any information
on what path the Render trait might be in.
"""

View file

@ -0,0 +1,128 @@
use anyhow::Result;
use async_trait::async_trait;
use serde::Deserialize;
use std::collections::BTreeMap;
use std::fs;
use std::{
path::{Path, PathBuf},
rc::Rc,
};
use util::serde::default_true;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
mod file_search;
/// Builds the complete list of eval examples.
///
/// The list starts with the programmatic `file_search` example and is followed
/// by one [`DeclarativeExample`] for every `.toml` file that
/// `list_declarative_examples` finds in `examples_dir`.
///
/// # Panics
///
/// Panics if `examples_dir` cannot be listed or if any declarative example
/// fails to load. The panic message names the offending directory or file and
/// includes the underlying error, so a broken example is easy to locate.
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
    let declarative_paths = list_declarative_examples(examples_dir).unwrap_or_else(|err| {
        panic!(
            "failed to list declarative examples in `{}`: {err:#}",
            examples_dir.display()
        )
    });

    let mut threads: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];
    threads.extend(declarative_paths.iter().map(|example_path| {
        let example = DeclarativeExample::load(example_path).unwrap_or_else(|err| {
            panic!(
                "failed to load example `{}`: {err:#}",
                example_path.display()
            )
        });
        Rc::new(example) as Rc<dyn Example>
    }));
    threads
}
/// An eval example described entirely by a `.toml` file (see [`ExampleToml`])
/// rather than by Rust code.
struct DeclarativeExample {
/// Metadata returned from `Example::meta`; assembled in `load`.
metadata: ExampleMetadata,
/// The single user message pushed at the start of the conversation.
prompt: String,
/// One entry per key of the file's `[diff_assertions]` table.
diff_assertions: Vec<JudgeAssertion>,
/// One entry per key of the file's `[thread_assertions]` table.
thread_assertions: Vec<JudgeAssertion>,
}
impl DeclarativeExample {
pub fn load(example_path: &Path) -> Result<Self> {
let name = Self::name_from_path(example_path);
let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
let language_server = if base.require_lsp {
Some(crate::example::LanguageServer {
file_extension: base
.language_extension
.expect("Language extension is required when require_lsp = true"),
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
})
} else {
None
};
let metadata = ExampleMetadata {
name,
url: base.url,
revision: base.revision,
language_server,
max_assertions: None,
};
Ok(DeclarativeExample {
metadata,
prompt: base.prompt,
thread_assertions: base
.thread_assertions
.into_iter()
.map(|(id, description)| JudgeAssertion { id, description })
.collect(),
diff_assertions: base
.diff_assertions
.into_iter()
.map(|(id, description)| JudgeAssertion { id, description })
.collect(),
})
}
pub fn name_from_path(path: &Path) -> String {
path.file_stem().unwrap().to_string_lossy().to_string()
}
}
/// Schema of a declarative example `.toml` file, deserialized with serde.
#[derive(Clone, Debug, Deserialize)]
pub struct ExampleToml {
/// Repository URL; copied into the example's metadata by the loader.
pub url: String,
/// Repository revision; copied into the example's metadata by the loader.
pub revision: String,
/// File extension passed to the language-server configuration. Optional in
/// the schema, but the loader needs it whenever `require_lsp` is true.
pub language_extension: Option<String>,
/// Optional key that the loader in this file does not read.
// NOTE(review): purpose not visible here — confirm whether it is still used.
pub insert_id: Option<String>,
/// Whether the loader configures a language server for the example.
/// When the key is absent, the value comes from `default_true`.
#[serde(default = "default_true")]
pub require_lsp: bool,
/// Forwarded unchanged to the language-server configuration.
/// `false` when the key is absent.
#[serde(default)]
pub allow_preexisting_diagnostics: bool,
/// The user message sent at the start of the conversation.
pub prompt: String,
/// Assertion id -> description; empty when the table is absent.
#[serde(default)]
pub diff_assertions: BTreeMap<String, String>,
/// Assertion id -> description; empty when the table is absent.
#[serde(default)]
pub thread_assertions: BTreeMap<String, String>,
}
#[async_trait(?Send)]
impl Example for DeclarativeExample {
fn meta(&self) -> ExampleMetadata {
self.metadata.clone()
}
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
cx.push_user_message(&self.prompt);
let _ = cx.run_to_end().await;
Ok(())
}
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
self.diff_assertions.clone()
}
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
self.thread_assertions.clone()
}
}
/// Lists the `.toml` files located directly inside `examples_dir`.
///
/// The directory is canonicalized first, so the returned paths are absolute.
/// Sub-directories are not searched. The result is sorted so that examples are
/// always returned in the same order.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or read, or if
/// reading any directory entry fails.
fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    let examples_dir = std::fs::canonicalize(examples_dir)?;

    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.extension().is_some_and(|ext| ext == "toml") {
            result_paths.push(path);
        }
    }

    // `read_dir` yields entries in a platform- and filesystem-dependent order.
    result_paths.sort();
    Ok(result_paths)
}