eval: Fine-grained assertions (#29246)

- Support programmatic examples
([example](17feb260a0/crates/eval/src/examples/file_search.rs))
- Combine data-driven example declarations into a single `.toml` file
([example](17feb260a0/crates/eval/src/examples/find_and_replace_diff_card.toml))
- Run judge on individual assertions (previously called "criteria")
- Report judge and programmatic assertions in one combined table

Note: We still need to work on concept naming.

<img width=400
src="https://github.com/user-attachments/assets/fc719c93-467f-412b-8d47-68821bd8a5f5">

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Agus Zubiaga 2025-04-22 23:58:58 -03:00 committed by GitHub
parent 0d3fe474db
commit ce1a674eba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1969 additions and 1229 deletions

View file

@ -315,6 +315,7 @@ pub struct Thread {
request_callback: Option<
Box<dyn FnMut(&LanguageModelRequest, &[Result<LanguageModelCompletionEvent, String>])>,
>,
remaining_turns: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -368,6 +369,7 @@ impl Thread {
message_feedback: HashMap::default(),
last_auto_capture_at: None,
request_callback: None,
remaining_turns: u32::MAX,
}
}
@ -442,6 +444,7 @@ impl Thread {
message_feedback: HashMap::default(),
last_auto_capture_at: None,
request_callback: None,
remaining_turns: u32::MAX,
}
}
@ -522,7 +525,7 @@ impl Thread {
self.messages.iter().find(|message| message.id == id)
}
pub fn messages(&self) -> impl Iterator<Item = &Message> {
pub fn messages(&self) -> impl ExactSizeIterator<Item = &Message> {
self.messages.iter()
}
@ -958,7 +961,21 @@ impl Thread {
})
}
/// Returns the current value of the thread's remaining-turns counter.
///
/// `send_to_model` returns early without sending anything when this counter
/// is zero, and otherwise decrements it by one before building the request.
pub fn remaining_turns(&self) -> u32 {
self.remaining_turns
}
/// Overwrites the thread's remaining-turns counter with `remaining_turns`.
///
/// Because `send_to_model` returns early when the counter is zero, passing
/// `0` here makes the next `send_to_model` call a no-op.
pub fn set_remaining_turns(&mut self, remaining_turns: u32) {
self.remaining_turns = remaining_turns;
}
pub fn send_to_model(&mut self, model: Arc<dyn LanguageModel>, cx: &mut Context<Self>) {
if self.remaining_turns == 0 {
return;
}
self.remaining_turns -= 1;
let mut request = self.to_completion_request(cx);
if model.supports_tools() {
request.tools = {