Agent Eval: Fail example when there are no events in 2 minutes (#28725)
Release Notes: - N/A
This commit is contained in:
parent
d74f0735c2
commit
5f897b0e00
1 changed file with 74 additions and 62 deletions
|
@ -4,8 +4,8 @@ use assistant_tool::ToolWorkingSet;
|
||||||
use client::proto::LspWorkProgress;
|
use client::proto::LspWorkProgress;
|
||||||
use collections::HashMap;
|
use collections::HashMap;
|
||||||
use dap::DapRegistry;
|
use dap::DapRegistry;
|
||||||
use futures::channel::{mpsc, oneshot};
|
use futures::channel::mpsc;
|
||||||
use futures::{FutureExt, StreamExt as _};
|
use futures::{FutureExt, StreamExt as _, select_biased};
|
||||||
use gpui::{App, AsyncApp, Entity, Task};
|
use gpui::{App, AsyncApp, Entity, Task};
|
||||||
use handlebars::Handlebars;
|
use handlebars::Handlebars;
|
||||||
use language::{DiagnosticSeverity, OffsetRangeExt};
|
use language::{DiagnosticSeverity, OffsetRangeExt};
|
||||||
|
@ -35,6 +35,8 @@ pub const EXAMPLES_DIR: &str = "./crates/eval/examples";
|
||||||
pub const REPOS_DIR: &str = "./crates/eval/repos";
|
pub const REPOS_DIR: &str = "./crates/eval/repos";
|
||||||
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
|
pub const WORKTREES_DIR: &str = "./crates/eval/worktrees";
|
||||||
|
|
||||||
|
const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize)]
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
pub struct ExampleBase {
|
pub struct ExampleBase {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
|
@ -277,39 +279,46 @@ impl Example {
|
||||||
let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
|
let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
|
||||||
Mutex::new(HashMap::default()).into();
|
Mutex::new(HashMap::default()).into();
|
||||||
|
|
||||||
let (tx, rx) = oneshot::channel();
|
let (thread_event_tx, mut thread_event_rx) = mpsc::unbounded();
|
||||||
let mut tx = Some(tx);
|
|
||||||
|
|
||||||
let subscription = cx.subscribe(&thread, {
|
let subscription = cx.subscribe(&thread, move |_thread, event: &ThreadEvent, _cx| {
|
||||||
|
thread_event_tx.unbounded_send(event.clone()).log_err();
|
||||||
|
});
|
||||||
|
|
||||||
|
let event_handler_task = cx.spawn({
|
||||||
let log_file = this.log_file.clone();
|
let log_file = this.log_file.clone();
|
||||||
let name = this.name.clone();
|
let name = this.name.clone();
|
||||||
let tool_use_counts = tool_use_counts.clone();
|
let tool_use_counts = tool_use_counts.clone();
|
||||||
move |thread, event: &ThreadEvent, cx| {
|
let thread = thread.downgrade();
|
||||||
|
async move |cx| {
|
||||||
|
loop {
|
||||||
|
let event = select_biased! {
|
||||||
|
event = thread_event_rx.next() => event,
|
||||||
|
_ = cx.background_executor().timer(THREAD_EVENT_TIMEOUT).fuse() => {
|
||||||
|
return Err(anyhow!("Agentic loop stalled - waited {:?} without any events", THREAD_EVENT_TIMEOUT));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let Some(event) = event else {
|
||||||
|
return Err(anyhow!("ThreadEvent channel ended early"));
|
||||||
|
};
|
||||||
|
|
||||||
let mut log_file = log_file.lock().unwrap();
|
let mut log_file = log_file.lock().unwrap();
|
||||||
|
|
||||||
match event {
|
match event {
|
||||||
ThreadEvent::Stopped(reason) => match reason {
|
ThreadEvent::Stopped(reason) => match reason {
|
||||||
Ok(StopReason::EndTurn) => {
|
Ok(StopReason::EndTurn) => {
|
||||||
if let Some(tx) = tx.take() {
|
return Ok(());
|
||||||
tx.send(Ok(())).ok();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(StopReason::MaxTokens) => {
|
Ok(StopReason::MaxTokens) => {
|
||||||
if let Some(tx) = tx.take() {
|
return Err(anyhow!("Exceeded maximum tokens"));
|
||||||
tx.send(Err(anyhow!("Exceeded maximum tokens"))).ok();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(StopReason::ToolUse) => {}
|
Ok(StopReason::ToolUse) => {}
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
if let Some(tx) = tx.take() {
|
return Err(anyhow!(error.clone()));
|
||||||
tx.send(Err(anyhow!(error.clone()))).ok();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
ThreadEvent::ShowError(thread_error) => {
|
ThreadEvent::ShowError(thread_error) => {
|
||||||
if let Some(tx) = tx.take() {
|
break Err(anyhow!(thread_error.clone()));
|
||||||
tx.send(Err(anyhow!(thread_error.clone()))).ok();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
ThreadEvent::StreamedAssistantText(_, chunk) => {
|
ThreadEvent::StreamedAssistantText(_, chunk) => {
|
||||||
write!(&mut log_file, "{}", chunk).log_err();
|
write!(&mut log_file, "{}", chunk).log_err();
|
||||||
|
@ -334,20 +343,23 @@ impl Example {
|
||||||
println!("{name}> {message}");
|
println!("{name}> {message}");
|
||||||
writeln!(&mut log_file, "\n{}", message).log_err();
|
writeln!(&mut log_file, "\n{}", message).log_err();
|
||||||
}
|
}
|
||||||
if let Some(tool_result) = thread.read(cx).tool_result(tool_use_id) {
|
thread.update(cx, |thread, _cx| {
|
||||||
|
if let Some(tool_result) = thread.tool_result(&tool_use_id) {
|
||||||
writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err();
|
writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err();
|
||||||
let mut tool_use_counts = tool_use_counts.lock().unwrap();
|
let mut tool_use_counts = tool_use_counts.lock().unwrap();
|
||||||
*tool_use_counts
|
*tool_use_counts
|
||||||
.entry(tool_result.tool_name.clone())
|
.entry(tool_result.tool_name.clone())
|
||||||
.or_insert(0) += 1;
|
.or_insert(0) += 1;
|
||||||
}
|
}
|
||||||
|
})?;
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
log_file.flush().log_err();
|
log_file.flush().log_err();
|
||||||
}
|
}
|
||||||
})?;
|
}
|
||||||
|
});
|
||||||
|
|
||||||
thread.update(cx, |thread, cx| {
|
thread.update(cx, |thread, cx| {
|
||||||
let context = vec![];
|
let context = vec![];
|
||||||
|
@ -355,7 +367,7 @@ impl Example {
|
||||||
thread.send_to_model(model, RequestKind::Chat, cx);
|
thread.send_to_model(model, RequestKind::Chat, cx);
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
rx.await??;
|
event_handler_task.await?;
|
||||||
|
|
||||||
if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() {
|
if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() {
|
||||||
wait_for_lang_server(lsp_store, this.name.clone(), cx).await?;
|
wait_for_lang_server(lsp_store, this.name.clone(), cx).await?;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue