eval: Fine-grained assertions (#29246)

- Support programmatic examples
([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file
([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400
src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Agus Zubiaga 2025-04-22 23:58:58 -03:00 committed by GitHub
parent 0d3fe474db
commit ce1a674eba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1969 additions and 1229 deletions

View file

@ -0,0 +1,128 @@
use anyhow::Result;
use async_trait::async_trait;
use serde::Deserialize;
use std::collections::BTreeMap;
use std::fs;
use std::{
path::{Path, PathBuf},
rc::Rc,
};
use util::serde::default_true;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
mod file_search;
/// Builds the complete list of examples to evaluate.
///
/// The list starts with the programmatic example compiled into this crate
/// (`file_search::FileSearchExample`) and is followed by one
/// `DeclarativeExample` for every `.toml` file found in `examples_dir`.
///
/// # Panics
///
/// Panics if `examples_dir` cannot be listed or if any of its `.toml` files
/// fails to load. The signature has no way to report an error, so the panic
/// message names the offending path to make the failure actionable.
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
    let mut threads: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];

    let declarative_paths = list_declarative_examples(examples_dir).unwrap_or_else(|err| {
        panic!(
            "failed to list declarative examples in {}: {:?}",
            examples_dir.display(),
            err
        )
    });

    for example_path in declarative_paths {
        let example = DeclarativeExample::load(&example_path).unwrap_or_else(|err| {
            panic!(
                "failed to load declarative example {}: {:?}",
                example_path.display(),
                err
            )
        });
        threads.push(Rc::new(example));
    }

    threads
}
/// An example described by a `.toml` file rather than by Rust code.
///
/// Instances are built by `DeclarativeExample::load`.
struct DeclarativeExample {
    /// Name, repository URL, revision and language-server settings.
    metadata: ExampleMetadata,
    /// Text pushed as the user message when the conversation runs.
    prompt: String,
    /// Assertions returned from `Example::diff_assertions`.
    diff_assertions: Vec<JudgeAssertion>,
    /// Assertions returned from `Example::thread_assertions`.
    thread_assertions: Vec<JudgeAssertion>,
}
impl DeclarativeExample {
pub fn load(example_path: &Path) -> Result<Self> {
let name = Self::name_from_path(example_path);
let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
let language_server = if base.require_lsp {
Some(crate::example::LanguageServer {
file_extension: base
.language_extension
.expect("Language extension is required when require_lsp = true"),
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
})
} else {
None
};
let metadata = ExampleMetadata {
name,
url: base.url,
revision: base.revision,
language_server,
max_assertions: None,
};
Ok(DeclarativeExample {
metadata,
prompt: base.prompt,
thread_assertions: base
.thread_assertions
.into_iter()
.map(|(id, description)| JudgeAssertion { id, description })
.collect(),
diff_assertions: base
.diff_assertions
.into_iter()
.map(|(id, description)| JudgeAssertion { id, description })
.collect(),
})
}
pub fn name_from_path(path: &Path) -> String {
path.file_stem().unwrap().to_string_lossy().to_string()
}
}
/// On-disk schema of a declarative example's `.toml` file.
///
/// `DeclarativeExample::load` deserializes a file into this struct and then
/// converts it into a `DeclarativeExample`.
#[derive(Clone, Debug, Deserialize)]
pub struct ExampleToml {
    /// Copied into `ExampleMetadata::url`.
    pub url: String,
    /// Copied into `ExampleMetadata::revision`.
    pub revision: String,
    /// File extension handed to the language server configuration.
    /// `load` requires it whenever `require_lsp` is true.
    pub language_extension: Option<String>,
    /// NOTE(review): not read by `DeclarativeExample::load`; purpose is not
    /// visible from this file — confirm whether it is still needed.
    pub insert_id: Option<String>,
    /// Whether a language server configuration is attached to the example.
    /// When the key is absent the value comes from `default_true`.
    #[serde(default = "default_true")]
    pub require_lsp: bool,
    /// Forwarded to `LanguageServer::allow_preexisting_diagnostics`.
    /// Falls back to `bool::default()` (`false`) when the key is absent.
    #[serde(default)]
    pub allow_preexisting_diagnostics: bool,
    /// Text sent as the user message when the example runs.
    pub prompt: String,
    /// Assertion id -> description; each entry becomes one diff `JudgeAssertion`.
    /// An absent table yields an empty map.
    #[serde(default)]
    pub diff_assertions: BTreeMap<String, String>,
    /// Assertion id -> description; each entry becomes one thread `JudgeAssertion`.
    /// An absent table yields an empty map.
    #[serde(default)]
    pub thread_assertions: BTreeMap<String, String>,
}
#[async_trait(?Send)]
impl Example for DeclarativeExample {
fn meta(&self) -> ExampleMetadata {
self.metadata.clone()
}
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
cx.push_user_message(&self.prompt);
let _ = cx.run_to_end().await;
Ok(())
}
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
self.diff_assertions.clone()
}
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
self.thread_assertions.clone()
}
}
/// Lists the declarative example files in `examples_dir`.
///
/// `examples_dir` is canonicalized first, so the returned paths are built on
/// the canonical directory. Only direct entries whose extension is exactly
/// `toml` are returned; subdirectories are not descended into. The order is
/// whatever the directory iterator yields.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or read, or if
/// reading any individual directory entry fails.
fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    // Propagate I/O failures instead of panicking: the function already
    // returns `Result`, and a missing directory is an environment problem,
    // not a broken invariant.
    let examples_dir = std::fs::canonicalize(examples_dir)?;

    let mut result_paths = Vec::new();
    for entry in std::fs::read_dir(examples_dir)? {
        let path = entry?.path();
        if path.extension() == Some("toml".as_ref()) {
            result_paths.push(path);
        }
    }
    Ok(result_paths)
}