eval: Fine-grained assertions (#29246)
- Support programmatic examples ([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file ([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400 src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
parent
0d3fe474db
commit
ce1a674eba
18 changed files with 1969 additions and 1229 deletions
128
crates/eval/src/examples/mod.rs
Normal file
128
crates/eval/src/examples/mod.rs
Normal file
|
@ -0,0 +1,128 @@
|
|||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use serde::Deserialize;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
rc::Rc,
|
||||
};
|
||||
use util::serde::default_true;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
|
||||
|
||||
mod file_search;
|
||||
|
||||
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
||||
let mut threads: Vec<Rc<dyn Example>> = vec![Rc::new(file_search::FileSearchExample)];
|
||||
|
||||
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||
threads.push(Rc::new(DeclarativeExample::load(&example_path).unwrap()));
|
||||
}
|
||||
|
||||
threads
|
||||
}
|
||||
|
||||
struct DeclarativeExample {
|
||||
metadata: ExampleMetadata,
|
||||
prompt: String,
|
||||
diff_assertions: Vec<JudgeAssertion>,
|
||||
thread_assertions: Vec<JudgeAssertion>,
|
||||
}
|
||||
|
||||
impl DeclarativeExample {
|
||||
pub fn load(example_path: &Path) -> Result<Self> {
|
||||
let name = Self::name_from_path(example_path);
|
||||
let base: ExampleToml = toml::from_str(&fs::read_to_string(&example_path)?)?;
|
||||
|
||||
let language_server = if base.require_lsp {
|
||||
Some(crate::example::LanguageServer {
|
||||
file_extension: base
|
||||
.language_extension
|
||||
.expect("Language extension is required when require_lsp = true"),
|
||||
allow_preexisting_diagnostics: base.allow_preexisting_diagnostics,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let metadata = ExampleMetadata {
|
||||
name,
|
||||
url: base.url,
|
||||
revision: base.revision,
|
||||
language_server,
|
||||
max_assertions: None,
|
||||
};
|
||||
|
||||
Ok(DeclarativeExample {
|
||||
metadata,
|
||||
prompt: base.prompt,
|
||||
thread_assertions: base
|
||||
.thread_assertions
|
||||
.into_iter()
|
||||
.map(|(id, description)| JudgeAssertion { id, description })
|
||||
.collect(),
|
||||
diff_assertions: base
|
||||
.diff_assertions
|
||||
.into_iter()
|
||||
.map(|(id, description)| JudgeAssertion { id, description })
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn name_from_path(path: &Path) -> String {
|
||||
path.file_stem().unwrap().to_string_lossy().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct ExampleToml {
|
||||
pub url: String,
|
||||
pub revision: String,
|
||||
pub language_extension: Option<String>,
|
||||
pub insert_id: Option<String>,
|
||||
#[serde(default = "default_true")]
|
||||
pub require_lsp: bool,
|
||||
#[serde(default)]
|
||||
pub allow_preexisting_diagnostics: bool,
|
||||
pub prompt: String,
|
||||
#[serde(default)]
|
||||
pub diff_assertions: BTreeMap<String, String>,
|
||||
#[serde(default)]
|
||||
pub thread_assertions: BTreeMap<String, String>,
|
||||
}
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for DeclarativeExample {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
self.metadata.clone()
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
cx.push_user_message(&self.prompt);
|
||||
let _ = cx.run_to_end().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn diff_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
self.diff_assertions.clone()
|
||||
}
|
||||
|
||||
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
self.thread_assertions.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the paths of all entries directly inside `examples_dir` whose
/// extension is `toml`.
///
/// `examples_dir` is canonicalized first and the returned paths are built
/// from that canonical directory. Subdirectories are not descended into.
/// The result is sorted so the order does not depend on the platform.
///
/// # Errors
///
/// Returns an error if `examples_dir` cannot be canonicalized or listed, or
/// if reading any directory entry fails.
fn list_declarative_examples(examples_dir: &Path) -> Result<Vec<PathBuf>> {
    // Propagate I/O failures: this function already returns `Result`, so a
    // missing or unreadable directory should not panic.
    let path = std::fs::canonicalize(examples_dir)?;
    let entries = std::fs::read_dir(path)?;

    let mut result_paths = Vec::new();
    for entry in entries {
        let path = entry?.path();
        if path.extension() == Some("toml".as_ref()) {
            result_paths.push(path);
        }
    }

    // `read_dir` yields entries in a platform- and filesystem-dependent order.
    result_paths.sort();
    Ok(result_paths)
}
|
Loading…
Add table
Add a link
Reference in a new issue