agent: Extract usage information from response headers (#29002)

This PR updates the Agent to extract the usage information from the
response headers, if they are present.

For now we just log the information, but we'll be using this soon to
populate some UI.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2025-04-17 16:11:07 -04:00 committed by GitHub
parent b402007de6
commit d93141bded
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 141 additions and 22 deletions

View file

@ -40,6 +40,7 @@ telemetry_events.workspace = true
thiserror.workspace = true
util.workspace = true
workspace-hack.workspace = true
zed_llm_client.workspace = true
[dev-dependencies]
gpui = { workspace = true, features = ["test-support"] }

View file

@ -8,11 +8,12 @@ mod telemetry;
#[cfg(any(test, feature = "test-support"))]
pub mod fake_provider;
use anyhow::Result;
use anyhow::{Result, anyhow};
use client::Client;
use futures::FutureExt;
use futures::{StreamExt, future::BoxFuture, stream::BoxStream};
use gpui::{AnyElement, AnyView, App, AsyncApp, SharedString, Task, Window};
use http_client::http::{HeaderMap, HeaderValue};
use icons::IconName;
use parking_lot::Mutex;
use proto::Plan;
@ -20,9 +21,13 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize, de::DeserializeOwned};
use std::fmt;
use std::ops::{Add, Sub};
use std::str::FromStr as _;
use std::sync::Arc;
use thiserror::Error;
use util::serde::is_default;
use zed_llm_client::{
MODEL_REQUESTS_USAGE_AMOUNT_HEADER_NAME, MODEL_REQUESTS_USAGE_LIMIT_HEADER_NAME, UsageLimit,
};
pub use crate::model::*;
pub use crate::rate_limiter::*;
@ -83,6 +88,28 @@ pub enum StopReason {
ToolUse,
}
/// Model-request usage reported by the LLM service via response headers.
///
/// Populated by [`RequestUsage::from_headers`] from the
/// `MODEL_REQUESTS_USAGE_LIMIT_HEADER_NAME` and
/// `MODEL_REQUESTS_USAGE_AMOUNT_HEADER_NAME` headers.
#[derive(Debug, Clone, Copy)]
pub struct RequestUsage {
    /// The usage limit parsed from the usage-limit header.
    pub limit: UsageLimit,
    /// The amount consumed so far, parsed from the usage-amount header.
    pub amount: i32,
}
impl RequestUsage {
    /// Reads the model-request usage limit and amount from `headers`.
    ///
    /// # Errors
    ///
    /// Returns an error if either usage header is absent, contains
    /// non-visible-ASCII bytes, or fails to parse into its expected type.
    pub fn from_headers(headers: &HeaderMap<HeaderValue>) -> Result<Self> {
        let limit = {
            let raw = headers
                .get(MODEL_REQUESTS_USAGE_LIMIT_HEADER_NAME)
                .ok_or_else(|| {
                    anyhow!("missing {MODEL_REQUESTS_USAGE_LIMIT_HEADER_NAME:?} header")
                })?;
            UsageLimit::from_str(raw.to_str()?)?
        };
        let amount = {
            let raw = headers
                .get(MODEL_REQUESTS_USAGE_AMOUNT_HEADER_NAME)
                .ok_or_else(|| {
                    anyhow!("missing {MODEL_REQUESTS_USAGE_AMOUNT_HEADER_NAME:?} header")
                })?;
            raw.to_str()?.parse::<i32>()?
        };
        Ok(Self { limit, amount })
    }
}
#[derive(Debug, PartialEq, Clone, Copy, Serialize, Deserialize, Default)]
pub struct TokenUsage {
#[serde(default, skip_serializing_if = "is_default")]
@ -214,6 +241,22 @@ pub trait LanguageModel: Send + Sync {
cx: &AsyncApp,
) -> BoxFuture<'static, Result<BoxStream<'static, Result<LanguageModelCompletionEvent>>>>;
fn stream_completion_with_usage(
&self,
request: LanguageModelRequest,
cx: &AsyncApp,
) -> BoxFuture<
'static,
Result<(
BoxStream<'static, Result<LanguageModelCompletionEvent>>,
Option<RequestUsage>,
)>,
> {
self.stream_completion(request, cx)
.map(|result| result.map(|stream| (stream, None)))
.boxed()
}
fn stream_completion_text(
&self,
request: LanguageModelRequest,

View file

@ -8,6 +8,8 @@ use std::{
task::{Context, Poll},
};
use crate::RequestUsage;
#[derive(Clone)]
pub struct RateLimiter {
semaphore: Arc<Semaphore>,
@ -67,4 +69,32 @@ impl RateLimiter {
})
}
}
/// Runs `future` under the rate limiter and returns the stream it produces
/// together with any [`RequestUsage`] the request reported.
///
/// A semaphore permit is acquired before `future` is awaited, bounding the
/// number of concurrent in-flight requests. The permit is moved into the
/// returned [`RateLimitGuard`]-wrapped stream, so it is released only when
/// the caller drops the stream — not when this future resolves.
pub fn stream_with_usage<'a, Fut, T>(
    &self,
    future: Fut,
) -> impl 'a
+ Future<
    Output = Result<(
        // `use<Fut, T>` restricts the opaque stream's captures to the two
        // type parameters (not `'a` or `&self`).
        impl Stream<Item = T::Item> + use<Fut, T>,
        Option<RequestUsage>,
    )>,
>
where
    Fut: 'a + Future<Output = Result<(T, Option<RequestUsage>)>>,
    T: Stream,
{
    // Start acquiring outside the async block; the actual wait happens on
    // the first poll of the returned future.
    let guard = self.semaphore.acquire_arc();
    async move {
        let guard = guard.await;
        // Any error from the request future propagates before a stream is
        // handed out; the permit is dropped (released) in that case.
        let (inner, usage) = future.await?;
        Ok((
            RateLimitGuard {
                inner,
                // Holds the permit for the stream's lifetime.
                _guard: guard,
            },
            usage,
        ))
    }
}
}