From 2a9355a3d25ddd887556741a30bbafd5d6ff58f0 Mon Sep 17 00:00:00 2001 From: Richard Feldman Date: Thu, 24 Jul 2025 11:11:26 -0400 Subject: [PATCH] Don't auto-retry in certain circumstances (#35037) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Someone encountered this in production, which should not happen: Screenshot 2025-07-24 at 10 38
40 AM This moves certain errors into the category of "never retry" and reduces the number of retries for some others. It also adds some diagnostic logging for retry policy. It's not a complete fix for the above, because the underlying issue is that the server is sending an HTTP 403 response and although we were already treating 403s as "do not retry" it was deciding to retry with 2 attempts anyway. So further debugging is needed to figure out why it wasn't going down the 403 branch by the time the request got here. Release Notes: - N/A --- crates/agent/src/thread.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/crates/agent/src/thread.rs b/crates/agent/src/thread.rs index 1b3b022ab2..1af27ca8a7 100644 --- a/crates/agent/src/thread.rs +++ b/crates/agent/src/thread.rs @@ -2037,6 +2037,12 @@ impl Thread { if let Some(retry_strategy) = Thread::get_retry_strategy(completion_error) { + log::info!( + "Retrying with {:?} for language model completion error {:?}", + retry_strategy, + completion_error + ); + retry_scheduled = thread .handle_retryable_error_with_delay( &completion_error, @@ -2246,15 +2252,14 @@ impl Thread { .. } | AuthenticationError { .. } - | PermissionError { .. } => None, - // These errors might be transient, so retry them - SerializeRequest { .. } - | BuildRequestBody { .. } - | PromptTooLarge { .. } + | PermissionError { .. } + | NoApiKey { .. } | ApiEndpointNotFound { .. } - | NoApiKey { .. } => Some(RetryStrategy::Fixed { + | PromptTooLarge { .. } => None, + // These errors might be transient, so retry them + SerializeRequest { .. } | BuildRequestBody { .. } => Some(RetryStrategy::Fixed { delay: BASE_RETRY_DELAY, - max_attempts: 2, + max_attempts: 1, }), // Retry all other 4xx and 5xx errors once. HttpResponseError { status_code, .. }