Implement simpler logic for edit predictions prompt byte limits (#23983)

Realized that the logic in #23814 was more complex than needed and harder to
maintain. Something like that could make sense if we were using the tokenizer
and wanted to hit a token limit precisely. For edit predictions, though,
prompt size is more of a latency-and-expense vs. capability tradeoff, so that
level of precision is unnecessary.
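
As a rough illustration of the byte-limit idea, here is a minimal sketch; the constant name, limit value, and function are hypothetical and not the actual zeta implementation:

```rust
// Hypothetical byte budget; the real limit used by zeta may differ.
const MAX_PROMPT_BYTES: usize = 32 * 1024;

/// Accumulate snippets (assumed to already be ordered by relevance) until
/// the byte budget is exhausted. Byte length is a cheap proxy for token
/// count, so no tokenizer round-trip is needed.
fn build_prompt(snippets: &[String]) -> String {
    let mut prompt = String::new();
    for snippet in snippets {
        // Stop once the next snippet would push the prompt over the budget.
        if prompt.len() + snippet.len() > MAX_PROMPT_BYTES {
            break;
        }
        prompt.push_str(snippet);
    }
    prompt
}
```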

Happily, this change didn't require much extra work; copying and modifying
parts of that change was sufficient.

Release Notes:

- N/A
Michael Sloan 2025-01-30 15:27:42 -07:00 committed by GitHub
parent 9d6c0e57a0
commit 87b0f62041
2 changed files with 156 additions and 240 deletions

@@ -42,6 +42,11 @@ use util::ResultExt;
 pub use token::*;
 
+const ACTIVE_USER_COUNT_CACHE_DURATION: Duration = Duration::seconds(30);
+
+/// Output token limit. A copy of this constant is also in `crates/zeta/src/zeta.rs`.
+const MAX_OUTPUT_TOKENS: u32 = 2048;
+
 pub struct LlmState {
     pub config: Config,
     pub executor: Executor,
@@ -52,8 +57,6 @@ pub struct LlmState {
         RwLock<HashMap<(LanguageModelProvider, String), (DateTime<Utc>, ActiveUserCount)>>,
 }
 
-const ACTIVE_USER_COUNT_CACHE_DURATION: Duration = Duration::seconds(30);
-
 impl LlmState {
     pub async fn new(config: Config, executor: Executor) -> Result<Arc<Self>> {
         let database_url = config
@@ -488,7 +491,7 @@ async fn predict_edits(
         fireworks::CompletionRequest {
             model: model.to_string(),
             prompt: prompt.clone(),
-            max_tokens: 2048,
+            max_tokens: MAX_OUTPUT_TOKENS,
             temperature: 0.,
             prediction: Some(fireworks::Prediction::Content {
                 content: params.input_excerpt.clone(),