More resilient eval (#32257)
Bubbles up rate limit information so that, higher up in the stack, we can retry after the indicated duration when needed. Also caps the number of evals running concurrently, which further reduces rate-limit pressure.

Release Notes:

- N/A
parent fa54fa80d0
commit e4bd115a63

22 changed files with 147 additions and 56 deletions
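Purely as an illustration of the "bubbles up rate limit information" part of the message above: the provider-side changes live in files not shown in the hunks below, and every name in this sketch is assumed, but the essential idea is to turn a 429 response's Retry-After value into the Duration that travels up with the error.

    use std::time::Duration;

    // Hypothetical sketch only (names assumed): turn a 429 response's Retry-After
    // header, in its seconds form, into the wait duration that gets attached to
    // the rate-limit error and bubbled up to callers.
    fn retry_after_seconds(header_value: &str) -> Option<Duration> {
        header_value.trim().parse::<u64>().ok().map(Duration::from_secs)
    }

    fn main() {
        assert_eq!(retry_after_seconds("30"), Some(Duration::from_secs(30)));
        assert_eq!(retry_after_seconds("soon"), None);
    }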
@@ -11,7 +11,7 @@ use client::{Client, UserStore};
 use collections::HashMap;
 use fs::FakeFs;
 use futures::{FutureExt, future::LocalBoxFuture};
-use gpui::{AppContext, TestAppContext};
+use gpui::{AppContext, TestAppContext, Timer};
 use indoc::{formatdoc, indoc};
 use language_model::{
     LanguageModelRegistry, LanguageModelRequestTool, LanguageModelToolResult,
@@ -1255,9 +1255,12 @@ impl EvalAssertion {
             }],
             ..Default::default()
         };
-        let mut response = judge
-            .stream_completion_text(request, &cx.to_async())
-            .await?;
+        let mut response = retry_on_rate_limit(async || {
+            Ok(judge
+                .stream_completion_text(request.clone(), &cx.to_async())
+                .await?)
+        })
+        .await?;
         let mut output = String::new();
         while let Some(chunk) = response.stream.next().await {
             let chunk = chunk?;
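A note on the `request.clone()` above: the closure handed to `retry_on_rate_limit` is an `AsyncFnMut`, so it may run more than once, and anything the underlying call consumes has to be rebuilt or cloned on every attempt. A minimal standalone sketch of that property, using `smol` as the executor (the helper name here is made up):

    use smol::block_on;

    // Calls an async closure twice, which works because AsyncFnMut closures can
    // be invoked repeatedly (requires Rust 1.85+ for async closures).
    async fn call_twice<R>(mut f: impl AsyncFnMut() -> R) -> (R, R) {
        let first = f().await;
        let second = f().await;
        (first, second)
    }

    fn main() {
        block_on(async {
            let request = String::from("prompt");
            // Because the closure can run again, it clones `request` on each
            // attempt instead of moving it out.
            let (a, b) = call_twice(async || request.clone()).await;
            assert_eq!(a, b);
        });
    }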
@@ -1308,10 +1311,17 @@ fn eval(
     run_eval(eval.clone(), tx.clone());

     let executor = gpui::background_executor();
+    let semaphore = Arc::new(smol::lock::Semaphore::new(32));
     for _ in 1..iterations {
         let eval = eval.clone();
         let tx = tx.clone();
-        executor.spawn(async move { run_eval(eval, tx) }).detach();
+        let semaphore = semaphore.clone();
+        executor
+            .spawn(async move {
+                let _guard = semaphore.acquire().await;
+                run_eval(eval, tx)
+            })
+            .detach();
     }
     drop(tx);

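With the semaphore in place, every iteration is still spawned up front, but only 32 of them can be inside `run_eval` at any moment; the guard is released when the task finishes. A self-contained sketch of the same pattern, using plain `smol` tasks instead of gpui's background executor and a smaller cap, purely for illustration:

    use std::sync::Arc;

    fn main() {
        // Cap of 4 here just for the example; the eval runner uses 32.
        let semaphore = Arc::new(smol::lock::Semaphore::new(4));

        smol::block_on(async {
            let mut tasks = Vec::new();
            for i in 0..16 {
                let semaphore = semaphore.clone();
                tasks.push(smol::spawn(async move {
                    // Held for the duration of the work; released when dropped.
                    let _guard = semaphore.acquire().await;
                    println!("running eval {i}");
                }));
            }
            for task in tasks {
                task.await;
            }
        });
    }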
@@ -1577,21 +1587,31 @@ impl EditAgentTest {
             if let Some(input_content) = eval.input_content.as_deref() {
                 buffer.update(cx, |buffer, cx| buffer.set_text(input_content, cx));
             }
-            let (edit_output, _) = self.agent.edit(
-                buffer.clone(),
-                eval.edit_file_input.display_description,
-                &conversation,
-                &mut cx.to_async(),
-            );
-            edit_output.await?
+            retry_on_rate_limit(async || {
+                self.agent
+                    .edit(
+                        buffer.clone(),
+                        eval.edit_file_input.display_description.clone(),
+                        &conversation,
+                        &mut cx.to_async(),
+                    )
+                    .0
+                    .await
+            })
+            .await?
         } else {
-            let (edit_output, _) = self.agent.overwrite(
-                buffer.clone(),
-                eval.edit_file_input.display_description,
-                &conversation,
-                &mut cx.to_async(),
-            );
-            edit_output.await?
+            retry_on_rate_limit(async || {
+                self.agent
+                    .overwrite(
+                        buffer.clone(),
+                        eval.edit_file_input.display_description.clone(),
+                        &conversation,
+                        &mut cx.to_async(),
+                    )
+                    .0
+                    .await
+            })
+            .await?
         };

         let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
@@ -1613,6 +1633,26 @@ impl EditAgentTest {
     }
 }

+async fn retry_on_rate_limit<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
+    loop {
+        match request().await {
+            Ok(result) => return Ok(result),
+            Err(err) => match err.downcast::<LanguageModelCompletionError>() {
+                Ok(err) => match err {
+                    LanguageModelCompletionError::RateLimit(duration) => {
+                        // Wait until after we are allowed to try again
+                        eprintln!("Rate limit exceeded. Waiting for {duration:?}...",);
+                        Timer::after(duration).await;
+                        continue;
+                    }
+                    _ => return Err(err.into()),
+                },
+                Err(err) => return Err(err),
+            },
+        }
+    }
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 struct EvalAssertionOutcome {
     score: usize,
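For reference, here is a self-contained analogue of `retry_on_rate_limit` that runs outside the Zed codebase. It swaps `LanguageModelCompletionError` for a made-up `Backoff` error and gpui's `Timer` for `smol::Timer`, but the mechanics are the same: downcast the `anyhow` error, sleep for the advertised duration, and try again; anything else is returned to the caller.

    use std::time::Duration;

    use anyhow::Result;

    // Stand-in for LanguageModelCompletionError::RateLimit(duration); name assumed.
    #[derive(Debug)]
    struct Backoff(Duration);

    impl std::fmt::Display for Backoff {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            write!(f, "rate limited, retry after {:?}", self.0)
        }
    }

    impl std::error::Error for Backoff {}

    async fn retry_on_backoff<R>(mut request: impl AsyncFnMut() -> Result<R>) -> Result<R> {
        loop {
            match request().await {
                Ok(result) => return Ok(result),
                Err(err) => match err.downcast::<Backoff>() {
                    // Rate-limited: wait for the duration carried by the error, then retry.
                    Ok(Backoff(duration)) => {
                        eprintln!("Rate limit exceeded. Waiting for {duration:?}...");
                        smol::Timer::after(duration).await;
                    }
                    // Any other error is bubbled up unchanged.
                    Err(err) => return Err(err),
                },
            }
        }
    }

    fn main() -> Result<()> {
        smol::block_on(async {
            let mut attempts = 0;
            let value = retry_on_backoff(async || {
                attempts += 1;
                if attempts < 3 {
                    Err(anyhow::Error::from(Backoff(Duration::from_millis(10))))
                } else {
                    Ok(attempts)
                }
            })
            .await?;
            assert_eq!(value, 3);
            Ok(())
        })
    }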