agent: Extract usage information from response headers (#29002)

This PR updates the Agent to extract the usage information from the
response headers, if they are present.
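
Concretely, the usage is parsed from the response headers of a successful completion response (see `RequestUsage::from_headers` in the diff below). Here is a minimal sketch of what that parsing can look like, assuming a hypothetical header name and a single illustrative `amount` field rather than the real `RequestUsage` definition:

```rust
use anyhow::{Context as _, Result};
use http::HeaderMap;

/// Usage reported by the LLM service via response headers.
/// The field and header names below are illustrative assumptions.
#[derive(Debug, Clone, Copy)]
pub struct RequestUsage {
    pub amount: u32,
}

impl RequestUsage {
    /// Parses usage out of the response headers; callers treat a parse
    /// failure as "no usage available" via `.ok()`.
    pub fn from_headers(headers: &HeaderMap) -> Result<Self> {
        let amount = headers
            .get("x-usage-amount") // hypothetical header name
            .context("missing usage header")?
            .to_str()?
            .parse()?;
        Ok(RequestUsage { amount })
    }
}
```

Returning a `Result` and letting the call site downgrade it with `.ok()` keeps a missing or malformed header from failing the request itself.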

For now we just log the information, but we'll be using this soon to
populate some UI.

Release Notes:

- N/A
Marshall Bowers 2025-04-17 16:11:07 -04:00 committed by GitHub
parent b402007de6
commit d93141bded
7 changed files with 141 additions and 22 deletions

@@ -13,7 +13,7 @@ use language_model::{
     AuthenticateError, CloudModel, LanguageModel, LanguageModelCacheConfiguration, LanguageModelId,
     LanguageModelKnownError, LanguageModelName, LanguageModelProviderId, LanguageModelProviderName,
     LanguageModelProviderState, LanguageModelProviderTosView, LanguageModelRequest,
-    LanguageModelToolSchemaFormat, ModelRequestLimitReachedError, RateLimiter,
+    LanguageModelToolSchemaFormat, ModelRequestLimitReachedError, RateLimiter, RequestUsage,
     ZED_CLOUD_PROVIDER_ID,
 };
 use language_model::{
@@ -518,7 +518,7 @@ impl CloudLanguageModel {
         client: Arc<Client>,
         llm_api_token: LlmApiToken,
         body: CompletionBody,
-    ) -> Result<Response<AsyncBody>> {
+    ) -> Result<(Response<AsyncBody>, Option<RequestUsage>)> {
         let http_client = &client.http_client();
 
         let mut token = llm_api_token.acquire(&client).await?;
@@ -540,7 +540,9 @@ impl CloudLanguageModel {
         let mut response = http_client.send(request).await?;
         let status = response.status();
         if status.is_success() {
-            return Ok(response);
+            let usage = RequestUsage::from_headers(response.headers()).ok();
+
+            return Ok((response, usage));
         } else if response
             .headers()
             .get(EXPIRED_LLM_TOKEN_HEADER_NAME)
@@ -708,8 +710,24 @@ impl LanguageModel for CloudLanguageModel {
     fn stream_completion(
         &self,
         request: LanguageModelRequest,
-        _cx: &AsyncApp,
+        cx: &AsyncApp,
     ) -> BoxFuture<'static, Result<BoxStream<'static, Result<LanguageModelCompletionEvent>>>> {
+        self.stream_completion_with_usage(request, cx)
+            .map(|result| result.map(|(stream, _)| stream))
+            .boxed()
+    }
+
+    fn stream_completion_with_usage(
+        &self,
+        request: LanguageModelRequest,
+        _cx: &AsyncApp,
+    ) -> BoxFuture<
+        'static,
+        Result<(
+            BoxStream<'static, Result<LanguageModelCompletionEvent>>,
+            Option<RequestUsage>,
+        )>,
+    > {
         match &self.model {
             CloudModel::Anthropic(model) => {
                 let request = into_anthropic(
@@ -721,8 +739,8 @@ impl LanguageModel for CloudLanguageModel {
                 );
                 let client = self.client.clone();
                 let llm_api_token = self.llm_api_token.clone();
-                let future = self.request_limiter.stream(async move {
-                    let response = Self::perform_llm_completion(
+                let future = self.request_limiter.stream_with_usage(async move {
+                    let (response, usage) = Self::perform_llm_completion(
                         client.clone(),
                         llm_api_token,
                         CompletionBody {
@@ -748,20 +766,25 @@ impl LanguageModel for CloudLanguageModel {
                             Err(err) => anyhow!(err),
                         })?;
-                    Ok(
+                    Ok((
                         crate::provider::anthropic::map_to_language_model_completion_events(
                             Box::pin(response_lines(response).map_err(AnthropicError::Other)),
                         ),
-                    )
+                        usage,
+                    ))
                 });
-                async move { Ok(future.await?.boxed()) }.boxed()
+                async move {
+                    let (stream, usage) = future.await?;
+                    Ok((stream.boxed(), usage))
+                }
+                .boxed()
             }
             CloudModel::OpenAi(model) => {
                 let client = self.client.clone();
                 let request = into_open_ai(request, model, model.max_output_tokens());
                 let llm_api_token = self.llm_api_token.clone();
-                let future = self.request_limiter.stream(async move {
-                    let response = Self::perform_llm_completion(
+                let future = self.request_limiter.stream_with_usage(async move {
+                    let (response, usage) = Self::perform_llm_completion(
                         client.clone(),
                         llm_api_token,
                         CompletionBody {
@@ -771,20 +794,25 @@ impl LanguageModel for CloudLanguageModel {
                         },
                     )
                     .await?;
-                    Ok(
+                    Ok((
                         crate::provider::open_ai::map_to_language_model_completion_events(
                             Box::pin(response_lines(response)),
                         ),
-                    )
+                        usage,
+                    ))
                 });
-                async move { Ok(future.await?.boxed()) }.boxed()
+                async move {
+                    let (stream, usage) = future.await?;
+                    Ok((stream.boxed(), usage))
+                }
+                .boxed()
             }
             CloudModel::Google(model) => {
                 let client = self.client.clone();
                 let request = into_google(request, model.id().into());
                 let llm_api_token = self.llm_api_token.clone();
-                let future = self.request_limiter.stream(async move {
-                    let response = Self::perform_llm_completion(
+                let future = self.request_limiter.stream_with_usage(async move {
+                    let (response, usage) = Self::perform_llm_completion(
                         client.clone(),
                         llm_api_token,
                         CompletionBody {
@@ -794,13 +822,18 @@ impl LanguageModel for CloudLanguageModel {
                         },
                     )
                     .await?;
-                    Ok(
+                    Ok((
                         crate::provider::google::map_to_language_model_completion_events(Box::pin(
                             response_lines(response),
                         )),
-                    )
+                        usage,
+                    ))
                 });
-                async move { Ok(future.await?.boxed()) }.boxed()
+                async move {
+                    let (stream, usage) = future.await?;
+                    Ok((stream.boxed(), usage))
+                }
+                .boxed()
             }
         }
     }
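
For reference, the consumer side of the new `stream_completion_with_usage` API might look like the sketch below. This is a hypothetical call site: the `run_completion` helper, the `gpui::AsyncApp` import path, and the `log::info!` call are assumptions; only the method signature comes from the diff above.

```rust
use anyhow::Result;
use futures::StreamExt as _;
use gpui::AsyncApp;
use language_model::{LanguageModel, LanguageModelRequest};

// Hypothetical consumer of the API added in this commit: pull the usage
// out of the tuple and log it, then drain the completion stream as before.
async fn run_completion(
    model: &dyn LanguageModel,
    request: LanguageModelRequest,
    cx: &AsyncApp,
) -> Result<()> {
    let (mut events, usage) = model.stream_completion_with_usage(request, cx).await?;

    if let Some(usage) = usage {
        // For now the usage is only logged; per the PR description it
        // will populate UI later.
        log::info!("request usage: {usage:?}");
    }

    while let Some(event) = events.next().await {
        let _event = event?; // handle each LanguageModelCompletionEvent
    }
    Ok(())
}
```

Callers that don't care about usage keep the old `stream_completion` signature, which now delegates to the new method and drops the usage half of the tuple.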