Agent Eval: Fail example when there are no events in 2 minutes (#28725)

Release Notes:

- N/A
This commit is contained in:
Michael Sloan 2025-04-14 17:01:21 -06:00 committed by GitHub
parent d74f0735c2
commit 5f897b0e00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -4,8 +4,8 @@ use assistant_tool::ToolWorkingSet;
use client::proto::LspWorkProgress; use client::proto::LspWorkProgress;
use collections::HashMap; use collections::HashMap;
use dap::DapRegistry; use dap::DapRegistry;
use futures::channel::{mpsc, oneshot}; use futures::channel::mpsc;
use futures::{FutureExt, StreamExt as _}; use futures::{FutureExt, StreamExt as _, select_biased};
use gpui::{App, AsyncApp, Entity, Task}; use gpui::{App, AsyncApp, Entity, Task};
use handlebars::Handlebars; use handlebars::Handlebars;
use language::{DiagnosticSeverity, OffsetRangeExt}; use language::{DiagnosticSeverity, OffsetRangeExt};
@@ -35,6 +35,8 @@ pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
pub const REPOS_DIR: &str = "./crates/eval/repos"; pub const REPOS_DIR: &str = "./crates/eval/repos";
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees"; pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Deserialize)]
pub struct ExampleBase { pub struct ExampleBase {
pub url: String, pub url: String,
@@ -277,39 +279,46 @@ impl Example {
let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> = let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
Mutex::new(HashMap::default()).into(); Mutex::new(HashMap::default()).into();
let (tx, rx) = oneshot::channel(); let (thread_event_tx, mut thread_event_rx) = mpsc::unbounded();
let mut tx = Some(tx);
let subscription = cx.subscribe(&thread, { let subscription = cx.subscribe(&thread, move |_thread, event: &ThreadEvent, _cx| {
thread_event_tx.unbounded_send(event.clone()).log_err();
});
let event_handler_task = cx.spawn({
let log_file = this.log_file.clone(); let log_file = this.log_file.clone();
let name = this.name.clone(); let name = this.name.clone();
let tool_use_counts = tool_use_counts.clone(); let tool_use_counts = tool_use_counts.clone();
move |thread, event: &ThreadEvent, cx| { let thread = thread.downgrade();
async move |cx| {
loop {
let event = select_biased! {
event = thread_event_rx.next() => event,
_ = cx.background_executor().timer(THREAD_EVENT_TIMEOUT).fuse() => {
return Err(anyhow!("Agentic loop stalled - waited {:?} without any events", THREAD_EVENT_TIMEOUT));
}
};
let Some(event) = event else {
return Err(anyhow!("ThreadEvent channel ended early"));
};
let mut log_file = log_file.lock().unwrap(); let mut log_file = log_file.lock().unwrap();
match event { match event {
ThreadEvent::Stopped(reason) => match reason { ThreadEvent::Stopped(reason) => match reason {
Ok(StopReason::EndTurn) => { Ok(StopReason::EndTurn) => {
if let Some(tx) = tx.take() { return Ok(());
tx.send(Ok(())).ok();
}
} }
Ok(StopReason::MaxTokens) => { Ok(StopReason::MaxTokens) => {
if let Some(tx) = tx.take() { return Err(anyhow!("Exceeded maximum tokens"));
tx.send(Err(anyhow!("Exceeded maximum tokens"))).ok();
}
} }
Ok(StopReason::ToolUse) => {} Ok(StopReason::ToolUse) => {}
Err(error) => { Err(error) => {
if let Some(tx) = tx.take() { return Err(anyhow!(error.clone()));
tx.send(Err(anyhow!(error.clone()))).ok();
}
} }
}, },
ThreadEvent::ShowError(thread_error) => { ThreadEvent::ShowError(thread_error) => {
if let Some(tx) = tx.take() { break Err(anyhow!(thread_error.clone()));
tx.send(Err(anyhow!(thread_error.clone()))).ok();
}
} }
ThreadEvent::StreamedAssistantText(_, chunk) => { ThreadEvent::StreamedAssistantText(_, chunk) => {
write!(&mut log_file, "{}", chunk).log_err(); write!(&mut log_file, "{}", chunk).log_err();
@@ -334,20 +343,23 @@ impl Example {
println!("{name}> {message}"); println!("{name}> {message}");
writeln!(&mut log_file, "\n{}", message).log_err(); writeln!(&mut log_file, "\n{}", message).log_err();
} }
if let Some(tool_result) = thread.read(cx).tool_result(tool_use_id) { thread.update(cx, |thread, _cx| {
if let Some(tool_result) = thread.tool_result(&tool_use_id) {
writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err(); writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err();
let mut tool_use_counts = tool_use_counts.lock().unwrap(); let mut tool_use_counts = tool_use_counts.lock().unwrap();
*tool_use_counts *tool_use_counts
.entry(tool_result.tool_name.clone()) .entry(tool_result.tool_name.clone())
.or_insert(0) += 1; .or_insert(0) += 1;
} }
})?;
} }
_ => {} _ => {}
} }
log_file.flush().log_err(); log_file.flush().log_err();
} }
})?; }
});
thread.update(cx, |thread, cx| { thread.update(cx, |thread, cx| {
let context = vec![]; let context = vec![];
@@ -355,7 +367,7 @@ impl Example {
thread.send_to_model(model, RequestKind::Chat, cx); thread.send_to_model(model, RequestKind::Chat, cx);
})?; })?;
rx.await??; event_handler_task.await?;
if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() { if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() {
wait_for_lang_server(lsp_store, this.name.clone(), cx).await?; wait_for_lang_server(lsp_store, this.name.clone(), cx).await?;