More resilient eval (#32257)

Bubbles up rate limit information so that we can retry after a certain
duration if needed higher up in the stack.

Also caps the number of concurrent evals running at once to also help.

Release Notes:

- N/A
This commit is contained in:
Ben Brandt 2025-06-09 20:07:22 +02:00 committed by GitHub
parent fa54fa80d0
commit e4bd115a63
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 147 additions and 56 deletions

View file

@ -80,6 +80,7 @@ rand.workspace = true
pretty_assertions.workspace = true
reqwest_client.workspace = true
settings = { workspace = true, features = ["test-support"] }
smol.workspace = true
task = { workspace = true, features = ["test-support"]}
tempfile.workspace = true
theme.workspace = true

View file

@ -11,7 +11,7 @@ use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use futures::{FutureExt, future::LocalBoxFuture};
use gpui::{AppContext, TestAppContext};
use gpui::{AppContext, TestAppContext, Timer};
use indoc::{formatdoc, indoc};
use language_model::{
LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
@ -1255,9 +1255,12 @@ impl EvalAssertion {
}],
..Default::default()
};
let mut response = judge
.stream_completion_text(request, &cx.to_async())
.await?;
let mut response = retry_on_rate_limit(async || {
Ok(judge
.stream_completion_text(request.clone(), &cx.to_async())
.await?)
})
.await?;
let mut output = String::new();
while let Some(chunk) = response.stream.next().await {
let chunk = chunk?;
@ -1308,10 +1311,17 @@ fn eval(
run_eval(eval.clone(), tx.clone());
let executor = gpui::background_executor();
let semaphore = Arc::new(smol::lock::Semaphore::new(32));
for _ in 1..iterations {
let eval = eval.clone();
let tx = tx.clone();
executor.spawn(async move { run_eval(eval, tx) }).detach();
let semaphore = semaphore.clone();
executor
.spawn(async move {
let _guard = semaphore.acquire().await;
run_eval(eval, tx)
})
.detach();
}
drop(tx);
@ -1577,21 +1587,31 @@ impl EditAgentTest {
if let Some(input_content) = eval.input_content.as_deref() {
buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
}
let (edit_output, _) = self.agent.edit(
buffer.clone(),
eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);
edit_output.await?
retry_on_rate_limit(async || {
self.agent
.edit(
buffer.clone(),
eval.edit_file_input.display_description.clone(),
&conversation,
&mut cx.to_async(),
)
.0
.await
})
.await?
} else {
let (edit_output, _) = self.agent.overwrite(
buffer.clone(),
eval.edit_file_input.display_description,
&conversation,
&mut cx.to_async(),
);
edit_output.await?
retry_on_rate_limit(async || {
self.agent
.overwrite(
buffer.clone(),
eval.edit_file_input.display_description.clone(),
&conversation,
&mut cx.to_async(),
)
.0
.await
})
.await?
};
let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
@ -1613,6 +1633,26 @@ impl EditAgentTest {
}
}
async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
loop {
match request().await {
Ok(result) => return Ok(result),
Err(err) => match err.downcast::<LanguageModelCompletionError>() {
Ok(err) => match err {
LanguageModelCompletionError::RateLimit(duration) => {
// Wait until after we are allowed to try again
eprintln!("Rate limit exceeded. Waiting for {duration:?}...",);
Timer::after(duration).await;
continue;
}
_ => return Err(err.into()),
},
Err(err) => return Err(err),
},
}
}
}
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionOutcome {
score: usize,