collab: Remove unused POST /predict_edits endpoint from LLM service (#23997)

This PR removes the `POST /predict_edits` endpoint from the LLM service,
as it has been superseded by the corresponding endpoint running in
Cloudflare Workers.

All traffic is already routed to Cloudflare Workers via the Workers
route, so nothing hits this endpoint in the LLM service anymore.
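
The Worker itself lives outside this repo. For context only, a Cloudflare Worker serving the same path via the `workers-rs` crate might look roughly like the sketch below; the handler body and response shape are hypothetical, not the actual Worker implementation:

```rust
use serde_json::json;
use worker::*;

#[event(fetch)]
async fn fetch(req: Request, env: Env, _ctx: Context) -> Result<Response> {
    Router::new()
        // The Workers route now owns the path the LLM service used to serve.
        .post_async("/predict_edits", |mut req, _ctx| async move {
            // Hypothetical body: parse the request and reply with the same
            // JSON shape the old endpoint returned.
            let _params: serde_json::Value = req.json().await?;
            Response::from_json(&json!({ "output_excerpt": "" }))
        })
        .run(req, env)
        .await
}
```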

The graph below shows the drop-off in requests to this endpoint once
the Workers route was added:

<img width="472" alt="Screenshot 2025-01-30 at 9 18 04 PM"
src="https://github.com/user-attachments/assets/fa60f7c8-2737-4329-88a3-17093bdb5a29"
/>

We also don't use the `fireworks` crate anymore in this repo, so it has
been removed.

Release Notes:

- N/A
Commit 8be73bf187 (parent 35fbe1ef3d)
Marshall Bowers, 2025-01-30 22:21:40 -05:00, committed by GitHub
8 changed files with 2 additions and 380 deletions

crates/collab/Cargo.toml

@@ -34,7 +34,6 @@ collections.workspace = true
 dashmap.workspace = true
 derive_more.workspace = true
 envy = "0.4.2"
-fireworks.workspace = true
 futures.workspace = true
 google_ai.workspace = true
 hex.workspace = true

crates/collab/src/llm.rs

@@ -21,15 +21,12 @@ use chrono::{DateTime, Duration, Utc};
 use collections::HashMap;
 use db::TokenUsage;
 use db::{usage_measure::UsageMeasure, ActiveUserCount, LlmDatabase};
-use futures::{FutureExt, Stream, StreamExt as _};
+use futures::{Stream, StreamExt as _};
 use reqwest_client::ReqwestClient;
 use rpc::{
     proto::Plan, LanguageModelProvider, PerformCompletionParams, EXPIRED_LLM_TOKEN_HEADER_NAME,
 };
-use rpc::{
-    ListModelsResponse, PredictEditsParams, PredictEditsResponse,
-    MAX_LLM_MONTHLY_SPEND_REACHED_HEADER_NAME,
-};
+use rpc::{ListModelsResponse, MAX_LLM_MONTHLY_SPEND_REACHED_HEADER_NAME};
 use serde_json::json;
 use std::{
     pin::Pin,
@@ -44,9 +41,6 @@ pub use token::*;
 const ACTIVE_USER_COUNT_CACHE_DURATION: Duration = Duration::seconds(30);
-/// Output token limit. A copy of this constant is also in `crates/zeta/src/zeta.rs`.
-const MAX_OUTPUT_TOKENS: u32 = 2048;
 pub struct LlmState {
     pub config: Config,
     pub executor: Executor,
@@ -123,7 +117,6 @@ pub fn routes() -> Router<(), Body> {
     Router::new()
         .route("/models", get(list_models))
        .route("/completion", post(perform_completion))
-        .route("/predict_edits", post(predict_edits))
         .layer(middleware::from_fn(validate_api_token))
 }
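
As the hunk above shows, every remaining route still sits behind `validate_api_token`, so removing the endpoint doesn't change the auth story. For readers unfamiliar with axum's `middleware::from_fn`, here is a minimal, self-contained sketch of that layering pattern (axum 0.6-style signatures to match `Router<(), Body>`; the token check is a simplified stand-in, not the real `validate_api_token`):

```rust
use axum::{
    body::Body,
    http::{Request, StatusCode},
    middleware::{self, Next},
    response::Response,
    routing::get,
    Router,
};

// Simplified stand-in for validate_api_token, in the shape from_fn expects.
async fn require_bearer_token(
    req: Request<Body>,
    next: Next<Body>,
) -> Result<Response, StatusCode> {
    let authorized = req
        .headers()
        .get("authorization")
        .and_then(|value| value.to_str().ok())
        .map_or(false, |value| value.starts_with("Bearer "));
    if authorized {
        // Hand the request on to the matched route handler.
        Ok(next.run(req).await)
    } else {
        Err(StatusCode::UNAUTHORIZED)
    }
}

fn routes() -> Router<(), Body> {
    Router::new()
        .route("/models", get(|| async { "…" }))
        // from_fn wraps every route registered above it on this router.
        .layer(middleware::from_fn(require_bearer_token))
}
```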
@@ -437,156 +430,6 @@ fn normalize_model_name(known_models: Vec<String>, name: String) -> String {
     }
 }
-async fn predict_edits(
-    Extension(state): Extension<Arc<LlmState>>,
-    Extension(claims): Extension<LlmTokenClaims>,
-    _country_code_header: Option<TypedHeader<CloudflareIpCountryHeader>>,
-    Json(params): Json<PredictEditsParams>,
-) -> Result<impl IntoResponse> {
-    if !claims.is_staff && !claims.has_predict_edits_feature_flag {
-        return Err(Error::http(
-            StatusCode::FORBIDDEN,
-            "no access to Zed's edit prediction feature".to_string(),
-        ));
-    }
-    let should_sample = claims.is_staff || params.can_collect_data;
-    let api_url = state
-        .config
-        .prediction_api_url
-        .as_ref()
-        .context("no PREDICTION_API_URL configured on the server")?;
-    let api_key = state
-        .config
-        .prediction_api_key
-        .as_ref()
-        .context("no PREDICTION_API_KEY configured on the server")?;
-    let model = state
-        .config
-        .prediction_model
-        .as_ref()
-        .context("no PREDICTION_MODEL configured on the server")?;
-    let outline_prefix = params
-        .outline
-        .as_ref()
-        .map(|outline| format!("### Outline for current file:\n{}\n", outline))
-        .unwrap_or_default();
-    let prompt = include_str!("./llm/prediction_prompt.md")
-        .replace("<outline>", &outline_prefix)
-        .replace("<events>", &params.input_events)
-        .replace("<excerpt>", &params.input_excerpt);
-    let request_start = std::time::Instant::now();
-    let timeout = state
-        .executor
-        .sleep(std::time::Duration::from_secs(2))
-        .fuse();
-    let response = fireworks::complete(
-        &state.http_client,
-        api_url,
-        api_key,
-        fireworks::CompletionRequest {
-            model: model.to_string(),
-            prompt: prompt.clone(),
-            max_tokens: MAX_OUTPUT_TOKENS,
-            temperature: 0.,
-            prediction: Some(fireworks::Prediction::Content {
-                content: params.input_excerpt.clone(),
-            }),
-            rewrite_speculation: Some(true),
-        },
-    )
-    .fuse();
-    futures::pin_mut!(timeout);
-    futures::pin_mut!(response);
-    futures::select! {
-        _ = timeout => {
-            state.executor.spawn_detached({
-                let kinesis_client = state.kinesis_client.clone();
-                let kinesis_stream = state.config.kinesis_stream.clone();
-                let model = model.clone();
-                async move {
-                    SnowflakeRow::new(
-                        "Fireworks Completion Timeout",
-                        claims.metrics_id,
-                        claims.is_staff,
-                        claims.system_id.clone(),
-                        json!({
-                            "model": model.to_string(),
-                            "prompt": prompt,
-                        }),
-                    )
-                    .write(&kinesis_client, &kinesis_stream)
-                    .await
-                    .log_err();
-                }
-            });
-            Err(anyhow!("request timed out"))?
-        },
-        response = response => {
-            let duration = request_start.elapsed();
-            let mut response = response?;
-            let choice = response
-                .completion
-                .choices
-                .pop()
-                .context("no output from completion response")?;
-            state.executor.spawn_detached({
-                let kinesis_client = state.kinesis_client.clone();
-                let kinesis_stream = state.config.kinesis_stream.clone();
-                let model = model.clone();
-                let output = choice.text.clone();
-                async move {
-                    let properties = if should_sample {
-                        json!({
-                            "model": model.to_string(),
-                            "headers": response.headers,
-                            "usage": response.completion.usage,
-                            "duration": duration.as_secs_f64(),
-                            "prompt": prompt,
-                            "input_excerpt": params.input_excerpt,
-                            "input_events": params.input_events,
-                            "outline": params.outline,
-                            "output": output,
-                            "is_sampled": true,
-                        })
-                    } else {
-                        json!({
-                            "model": model.to_string(),
-                            "headers": response.headers,
-                            "usage": response.completion.usage,
-                            "duration": duration.as_secs_f64(),
-                            "is_sampled": false,
-                        })
-                    };
-                    SnowflakeRow::new(
-                        "Fireworks Completion Requested",
-                        claims.metrics_id,
-                        claims.is_staff,
-                        claims.system_id.clone(),
-                        properties,
-                    )
-                    .write(&kinesis_client, &kinesis_stream)
-                    .await
-                    .log_err();
-                }
-            });
-            Ok(Json(PredictEditsResponse {
-                output_excerpt: choice.text,
-            }))
-        },
-    }
-}
 /// The maximum monthly spending an individual user can reach on the free tier
 /// before they have to pay.
 pub const FREE_TIER_MONTHLY_SPENDING_LIMIT: Cents = Cents::from_dollars(10);
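
The deleted handler raced the Fireworks request against a two-second timeout using `fuse()`, `pin_mut!`, and `futures::select!`, logging a timeout event if the sleep won. A minimal standalone sketch of that race, assuming a tokio runtime in place of the service's own `Executor`:

```rust
use futures::FutureExt as _;
use std::time::Duration;

// Race `work` against a fixed timeout, mirroring the deleted handler's
// timeout/response select. Assumes tokio; the real code slept on the
// service's custom Executor instead.
async fn with_timeout<T>(
    work: impl std::future::Future<Output = T>,
    limit: Duration,
) -> Result<T, &'static str> {
    let timeout = tokio::time::sleep(limit).fuse();
    let work = work.fuse();
    futures::pin_mut!(timeout);
    futures::pin_mut!(work);
    // select! requires fused, pinned futures; whichever finishes first wins.
    futures::select! {
        _ = timeout => Err("request timed out"),
        value = work => Ok(value),
    }
}

#[tokio::main]
async fn main() {
    let fast = with_timeout(async { 42 }, Duration::from_secs(2)).await;
    assert_eq!(fast, Ok(42));
}
```

Note also that the removed request enabled Fireworks' speculative rewriting (`rewrite_speculation`) with the input excerpt as the predicted output, which speeds up generation when the model largely echoes the excerpt back.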

crates/collab/src/llm/prediction_prompt.md

@@ -1,13 +0,0 @@
-<outline>## Task
-Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
-
-### Events:
-<events>
-
-### Input:
-<excerpt>
-
-### Response: