Add tracing needed for LLM rate limit dashboards (#16388)
Release Notes: - N/A --------- Co-authored-by: Marshall <marshall@zed.dev>
This commit is contained in:
parent
9ef3306f55
commit
1b1070e0f7
6 changed files with 227 additions and 29 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -223,6 +223,7 @@ name = "anthropic"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"chrono",
|
||||||
"futures 0.3.30",
|
"futures 0.3.30",
|
||||||
"http_client",
|
"http_client",
|
||||||
"isahc",
|
"isahc",
|
||||||
|
@ -232,6 +233,7 @@ dependencies = [
|
||||||
"strum",
|
"strum",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"util",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -17,6 +17,7 @@ path = "src/anthropic.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
chrono.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
http_client.workspace = true
|
http_client.workspace = true
|
||||||
isahc.workspace = true
|
isahc.workspace = true
|
||||||
|
@ -25,6 +26,7 @@ serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
strum.workspace = true
|
strum.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
|
util.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
|
|
|
@ -1,14 +1,17 @@
|
||||||
mod supported_countries;
|
mod supported_countries;
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
use futures::{io::BufReader, stream::BoxStream, AsyncBufReadExt, AsyncReadExt, Stream, StreamExt};
|
use futures::{io::BufReader, stream::BoxStream, AsyncBufReadExt, AsyncReadExt, Stream, StreamExt};
|
||||||
use http_client::{AsyncBody, HttpClient, Method, Request as HttpRequest};
|
use http_client::{AsyncBody, HttpClient, Method, Request as HttpRequest};
|
||||||
use isahc::config::Configurable;
|
use isahc::config::Configurable;
|
||||||
|
use isahc::http::{HeaderMap, HeaderValue};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use std::{pin::Pin, str::FromStr};
|
use std::{pin::Pin, str::FromStr};
|
||||||
use strum::{EnumIter, EnumString};
|
use strum::{EnumIter, EnumString};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
use util::ResultExt as _;
|
||||||
|
|
||||||
pub use supported_countries::*;
|
pub use supported_countries::*;
|
||||||
|
|
||||||
|
@ -195,6 +198,66 @@ pub async fn stream_completion(
|
||||||
request: Request,
|
request: Request,
|
||||||
low_speed_timeout: Option<Duration>,
|
low_speed_timeout: Option<Duration>,
|
||||||
) -> Result<BoxStream<'static, Result<Event, AnthropicError>>, AnthropicError> {
|
) -> Result<BoxStream<'static, Result<Event, AnthropicError>>, AnthropicError> {
|
||||||
|
stream_completion_with_rate_limit_info(client, api_url, api_key, request, low_speed_timeout)
|
||||||
|
.await
|
||||||
|
.map(|output| output.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// https://docs.anthropic.com/en/api/rate-limits#response-headers
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct RateLimitInfo {
|
||||||
|
pub requests_limit: usize,
|
||||||
|
pub requests_remaining: usize,
|
||||||
|
pub requests_reset: DateTime<Utc>,
|
||||||
|
pub tokens_limit: usize,
|
||||||
|
pub tokens_remaining: usize,
|
||||||
|
pub tokens_reset: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RateLimitInfo {
|
||||||
|
fn from_headers(headers: &HeaderMap<HeaderValue>) -> Result<Self> {
|
||||||
|
let tokens_limit = get_header("anthropic-ratelimit-tokens-limit", headers)?.parse()?;
|
||||||
|
let requests_limit = get_header("anthropic-ratelimit-requests-limit", headers)?.parse()?;
|
||||||
|
let tokens_remaining =
|
||||||
|
get_header("anthropic-ratelimit-tokens-remaining", headers)?.parse()?;
|
||||||
|
let requests_remaining =
|
||||||
|
get_header("anthropic-ratelimit-requests-remaining", headers)?.parse()?;
|
||||||
|
let requests_reset = get_header("anthropic-ratelimit-requests-reset", headers)?;
|
||||||
|
let tokens_reset = get_header("anthropic-ratelimit-tokens-reset", headers)?;
|
||||||
|
let requests_reset = DateTime::parse_from_rfc3339(requests_reset)?.to_utc();
|
||||||
|
let tokens_reset = DateTime::parse_from_rfc3339(tokens_reset)?.to_utc();
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
requests_limit,
|
||||||
|
tokens_limit,
|
||||||
|
requests_remaining,
|
||||||
|
tokens_remaining,
|
||||||
|
requests_reset,
|
||||||
|
tokens_reset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_header<'a>(key: &str, headers: &'a HeaderMap) -> Result<&'a str, anyhow::Error> {
|
||||||
|
Ok(headers
|
||||||
|
.get(key)
|
||||||
|
.ok_or_else(|| anyhow!("missing header `{key}`"))?
|
||||||
|
.to_str()?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn stream_completion_with_rate_limit_info(
|
||||||
|
client: &dyn HttpClient,
|
||||||
|
api_url: &str,
|
||||||
|
api_key: &str,
|
||||||
|
request: Request,
|
||||||
|
low_speed_timeout: Option<Duration>,
|
||||||
|
) -> Result<
|
||||||
|
(
|
||||||
|
BoxStream<'static, Result<Event, AnthropicError>>,
|
||||||
|
Option<RateLimitInfo>,
|
||||||
|
),
|
||||||
|
AnthropicError,
|
||||||
|
> {
|
||||||
let request = StreamingRequest {
|
let request = StreamingRequest {
|
||||||
base: request,
|
base: request,
|
||||||
stream: true,
|
stream: true,
|
||||||
|
@ -224,8 +287,9 @@ pub async fn stream_completion(
|
||||||
.await
|
.await
|
||||||
.context("failed to send request to Anthropic")?;
|
.context("failed to send request to Anthropic")?;
|
||||||
if response.status().is_success() {
|
if response.status().is_success() {
|
||||||
|
let rate_limits = RateLimitInfo::from_headers(response.headers());
|
||||||
let reader = BufReader::new(response.into_body());
|
let reader = BufReader::new(response.into_body());
|
||||||
Ok(reader
|
let stream = reader
|
||||||
.lines()
|
.lines()
|
||||||
.filter_map(|line| async move {
|
.filter_map(|line| async move {
|
||||||
match line {
|
match line {
|
||||||
|
@ -239,7 +303,8 @@ pub async fn stream_completion(
|
||||||
Err(error) => Some(Err(AnthropicError::Other(anyhow!(error)))),
|
Err(error) => Some(Err(AnthropicError::Other(anyhow!(error)))),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.boxed())
|
.boxed();
|
||||||
|
Ok((stream, rate_limits.log_err()))
|
||||||
} else {
|
} else {
|
||||||
let mut body = Vec::new();
|
let mut body = Vec::new();
|
||||||
response
|
response
|
||||||
|
|
|
@ -217,7 +217,7 @@ async fn perform_completion(
|
||||||
_ => request.model,
|
_ => request.model,
|
||||||
};
|
};
|
||||||
|
|
||||||
let chunks = anthropic::stream_completion(
|
let (chunks, rate_limit_info) = anthropic::stream_completion_with_rate_limit_info(
|
||||||
&state.http_client,
|
&state.http_client,
|
||||||
anthropic::ANTHROPIC_API_URL,
|
anthropic::ANTHROPIC_API_URL,
|
||||||
api_key,
|
api_key,
|
||||||
|
@ -245,6 +245,18 @@ async fn perform_completion(
|
||||||
anthropic::AnthropicError::Other(err) => Error::Internal(err),
|
anthropic::AnthropicError::Other(err) => Error::Internal(err),
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
if let Some(rate_limit_info) = rate_limit_info {
|
||||||
|
tracing::info!(
|
||||||
|
target: "upstream rate limit",
|
||||||
|
provider = params.provider.to_string(),
|
||||||
|
model = model,
|
||||||
|
tokens_remaining = rate_limit_info.tokens_remaining,
|
||||||
|
requests_remaining = rate_limit_info.requests_remaining,
|
||||||
|
requests_reset = ?rate_limit_info.requests_reset,
|
||||||
|
tokens_reset = ?rate_limit_info.tokens_reset,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
chunks
|
chunks
|
||||||
.map(move |event| {
|
.map(move |event| {
|
||||||
let chunk = event?;
|
let chunk = event?;
|
||||||
|
@ -540,33 +552,74 @@ impl<S> Drop for TokenCountingStream<S> {
|
||||||
.await
|
.await
|
||||||
.log_err();
|
.log_err();
|
||||||
|
|
||||||
if let Some((clickhouse_client, usage)) = state.clickhouse_client.as_ref().zip(usage) {
|
if let Some(usage) = usage {
|
||||||
report_llm_usage(
|
tracing::info!(
|
||||||
clickhouse_client,
|
target: "user usage",
|
||||||
LlmUsageEventRow {
|
user_id = claims.user_id,
|
||||||
time: Utc::now().timestamp_millis(),
|
login = claims.github_user_login,
|
||||||
user_id: claims.user_id as i32,
|
authn.jti = claims.jti,
|
||||||
is_staff: claims.is_staff,
|
requests_this_minute = usage.requests_this_minute,
|
||||||
plan: match claims.plan {
|
tokens_this_minute = usage.tokens_this_minute,
|
||||||
Plan::Free => "free".to_string(),
|
);
|
||||||
Plan::ZedPro => "zed_pro".to_string(),
|
|
||||||
|
if let Some(clickhouse_client) = state.clickhouse_client.as_ref() {
|
||||||
|
report_llm_usage(
|
||||||
|
clickhouse_client,
|
||||||
|
LlmUsageEventRow {
|
||||||
|
time: Utc::now().timestamp_millis(),
|
||||||
|
user_id: claims.user_id as i32,
|
||||||
|
is_staff: claims.is_staff,
|
||||||
|
plan: match claims.plan {
|
||||||
|
Plan::Free => "free".to_string(),
|
||||||
|
Plan::ZedPro => "zed_pro".to_string(),
|
||||||
|
},
|
||||||
|
model,
|
||||||
|
provider: provider.to_string(),
|
||||||
|
input_token_count: input_token_count as u64,
|
||||||
|
output_token_count: output_token_count as u64,
|
||||||
|
requests_this_minute: usage.requests_this_minute as u64,
|
||||||
|
tokens_this_minute: usage.tokens_this_minute as u64,
|
||||||
|
tokens_this_day: usage.tokens_this_day as u64,
|
||||||
|
input_tokens_this_month: usage.input_tokens_this_month as u64,
|
||||||
|
output_tokens_this_month: usage.output_tokens_this_month as u64,
|
||||||
|
spending_this_month: usage.spending_this_month as u64,
|
||||||
|
lifetime_spending: usage.lifetime_spending as u64,
|
||||||
},
|
},
|
||||||
model,
|
)
|
||||||
provider: provider.to_string(),
|
.await
|
||||||
input_token_count: input_token_count as u64,
|
.log_err();
|
||||||
output_token_count: output_token_count as u64,
|
}
|
||||||
requests_this_minute: usage.requests_this_minute as u64,
|
|
||||||
tokens_this_minute: usage.tokens_this_minute as u64,
|
|
||||||
tokens_this_day: usage.tokens_this_day as u64,
|
|
||||||
input_tokens_this_month: usage.input_tokens_this_month as u64,
|
|
||||||
output_tokens_this_month: usage.output_tokens_this_month as u64,
|
|
||||||
spending_this_month: usage.spending_this_month as u64,
|
|
||||||
lifetime_spending: usage.lifetime_spending as u64,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.log_err();
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn log_usage_periodically(state: Arc<LlmState>) {
|
||||||
|
state.executor.clone().spawn_detached(async move {
|
||||||
|
loop {
|
||||||
|
state
|
||||||
|
.executor
|
||||||
|
.sleep(std::time::Duration::from_secs(30))
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let Some(usages) = state
|
||||||
|
.db
|
||||||
|
.get_application_wide_usages_by_model(Utc::now())
|
||||||
|
.await
|
||||||
|
.log_err()
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
for usage in usages {
|
||||||
|
tracing::info!(
|
||||||
|
target: "computed usage",
|
||||||
|
provider = usage.provider.to_string(),
|
||||||
|
model = usage.model,
|
||||||
|
requests_this_minute = usage.requests_this_minute,
|
||||||
|
tokens_this_minute = usage.tokens_this_minute,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
use crate::db::UserId;
|
use crate::db::UserId;
|
||||||
use chrono::Duration;
|
use chrono::Duration;
|
||||||
|
use futures::StreamExt as _;
|
||||||
use rpc::LanguageModelProvider;
|
use rpc::LanguageModelProvider;
|
||||||
use sea_orm::QuerySelect;
|
use sea_orm::QuerySelect;
|
||||||
use std::{iter, str::FromStr};
|
use std::{iter, str::FromStr};
|
||||||
|
@ -18,6 +19,14 @@ pub struct Usage {
|
||||||
pub lifetime_spending: usize,
|
pub lifetime_spending: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq, Clone)]
|
||||||
|
pub struct ApplicationWideUsage {
|
||||||
|
pub provider: LanguageModelProvider,
|
||||||
|
pub model: String,
|
||||||
|
pub requests_this_minute: usize,
|
||||||
|
pub tokens_this_minute: usize,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, Default)]
|
#[derive(Clone, Copy, Debug, Default)]
|
||||||
pub struct ActiveUserCount {
|
pub struct ActiveUserCount {
|
||||||
pub users_in_recent_minutes: usize,
|
pub users_in_recent_minutes: usize,
|
||||||
|
@ -63,6 +72,71 @@ impl LlmDatabase {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_application_wide_usages_by_model(
|
||||||
|
&self,
|
||||||
|
now: DateTimeUtc,
|
||||||
|
) -> Result<Vec<ApplicationWideUsage>> {
|
||||||
|
self.transaction(|tx| async move {
|
||||||
|
let past_minute = now - Duration::minutes(1);
|
||||||
|
let requests_per_minute = self.usage_measure_ids[&UsageMeasure::RequestsPerMinute];
|
||||||
|
let tokens_per_minute = self.usage_measure_ids[&UsageMeasure::TokensPerMinute];
|
||||||
|
|
||||||
|
let mut results = Vec::new();
|
||||||
|
for (provider, model) in self.models.keys().cloned() {
|
||||||
|
let mut usages = usage::Entity::find()
|
||||||
|
.filter(
|
||||||
|
usage::Column::Timestamp
|
||||||
|
.gte(past_minute.naive_utc())
|
||||||
|
.and(usage::Column::IsStaff.eq(false))
|
||||||
|
.and(
|
||||||
|
usage::Column::MeasureId
|
||||||
|
.eq(requests_per_minute)
|
||||||
|
.or(usage::Column::MeasureId.eq(tokens_per_minute)),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.stream(&*tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut requests_this_minute = 0;
|
||||||
|
let mut tokens_this_minute = 0;
|
||||||
|
while let Some(usage) = usages.next().await {
|
||||||
|
let usage = usage?;
|
||||||
|
if usage.measure_id == requests_per_minute {
|
||||||
|
requests_this_minute += Self::get_live_buckets(
|
||||||
|
&usage,
|
||||||
|
now.naive_utc(),
|
||||||
|
UsageMeasure::RequestsPerMinute,
|
||||||
|
)
|
||||||
|
.0
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.sum::<i64>() as usize;
|
||||||
|
} else if usage.measure_id == tokens_per_minute {
|
||||||
|
tokens_this_minute += Self::get_live_buckets(
|
||||||
|
&usage,
|
||||||
|
now.naive_utc(),
|
||||||
|
UsageMeasure::TokensPerMinute,
|
||||||
|
)
|
||||||
|
.0
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.sum::<i64>() as usize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push(ApplicationWideUsage {
|
||||||
|
provider,
|
||||||
|
model,
|
||||||
|
requests_this_minute,
|
||||||
|
tokens_this_minute,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn get_usage(
|
pub async fn get_usage(
|
||||||
&self,
|
&self,
|
||||||
user_id: UserId,
|
user_id: UserId,
|
||||||
|
|
|
@ -5,7 +5,7 @@ use axum::{
|
||||||
routing::get,
|
routing::get,
|
||||||
Extension, Router,
|
Extension, Router,
|
||||||
};
|
};
|
||||||
use collab::llm::db::LlmDatabase;
|
use collab::llm::{db::LlmDatabase, log_usage_periodically};
|
||||||
use collab::migrations::run_database_migrations;
|
use collab::migrations::run_database_migrations;
|
||||||
use collab::{api::billing::poll_stripe_events_periodically, llm::LlmState, ServiceMode};
|
use collab::{api::billing::poll_stripe_events_periodically, llm::LlmState, ServiceMode};
|
||||||
use collab::{
|
use collab::{
|
||||||
|
@ -95,6 +95,8 @@ async fn main() -> Result<()> {
|
||||||
|
|
||||||
let state = LlmState::new(config.clone(), Executor::Production).await?;
|
let state = LlmState::new(config.clone(), Executor::Production).await?;
|
||||||
|
|
||||||
|
log_usage_periodically(state.clone());
|
||||||
|
|
||||||
app = app
|
app = app
|
||||||
.merge(collab::llm::routes())
|
.merge(collab::llm::routes())
|
||||||
.layer(Extension(state.clone()));
|
.layer(Extension(state.clone()));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue