Implement simpler logic for edit predictions prompt byte limits (#23983)

Realized that the logic in #23814 was more complex than needed and harder to
maintain. Something like that could make sense if we were using the tokenizer
and wanted to hit a token limit precisely. For edit predictions, though,
prompt size is more of a latency-and-expense vs. capability tradeoff, so that
level of precision is unnecessary.
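
As a rough illustration of the byte-limit idea, here is a minimal sketch; the constant name, limit value, and function are hypothetical and not the actual zeta implementation:

```rust
// Hypothetical byte budget; the real limit used by zeta may differ.
const MAX_PROMPT_BYTES: usize = 32 * 1024;

/// Accumulate snippets (assumed to already be ordered by relevance) until
/// the byte budget is exhausted. Byte length is a cheap proxy for token
/// count, so no tokenizer round-trip is needed.
fn build_prompt(snippets: &[String]) -> String {
    let mut prompt = String::new();
    for snippet in snippets {
        // Stop once the next snippet would push the prompt over the budget.
        if prompt.len() + snippet.len() > MAX_PROMPT_BYTES {
            break;
        }
        prompt.push_str(snippet);
    }
    prompt
}
```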

Happily, this change didn't require much extra work; copying and modifying
parts of that change was sufficient.

Release Notes:

- N/A
Michael Sloan 2025-01-30 15:27:42 -07:00 committed by GitHub
parent 9d6c0e57a0
commit 87b0f62041
2 changed files with 156 additions and 240 deletions

@@ -42,6 +42,11 @@ use util::ResultExt;
 pub use token::*;
 
+const ACTIVE_USER_COUNT_CACHE_DURATION: Duration = Duration::seconds(30);
+
+/// Output token limit. A copy of this constant is also in `crates/zeta/src/zeta.rs`.
+const MAX_OUTPUT_TOKENS: u32 = 2048;
+
 pub struct LlmState {
     pub config: Config,
     pub executor: Executor,
@@ -52,8 +57,6 @@ pub struct LlmState {
         RwLock<HashMap<(LanguageModelProvider, String), (DateTime<Utc>, ActiveUserCount)>>,
 }
 
-const ACTIVE_USER_COUNT_CACHE_DURATION: Duration = Duration::seconds(30);
-
 impl LlmState {
     pub async fn new(config: Config, executor: Executor) -> Result<Arc<Self>> {
         let database_url = config
@@ -488,7 +491,7 @@ async fn predict_edits(
         fireworks::CompletionRequest {
             model: model.to_string(),
             prompt: prompt.clone(),
-            max_tokens: 2048,
+            max_tokens: MAX_OUTPUT_TOKENS,
             temperature: 0.,
             prediction: Some(fireworks::Prediction::Content {
                 content: params.input_excerpt.clone(),