Add thinking budget for Gemini custom models (#31251)

Closes #31243

As described in the linked issue, Gemini chooses the [thinking
budget](https://ai.google.dev/gemini-api/docs/thinking) automatically
unless it is explicitly set. To get fast responses (e.g. for the inline
assistant), I prefer to set it to 0.
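
For reference, a custom model entry in settings would then carry a `mode` field. Below is a minimal, hypothetical sketch of the expected shape (types redeclared locally so the snippet compiles standalone, assuming `serde` and `serde_json` as dependencies; the model name and token limit are placeholders):

```rust
use serde::Deserialize;

// Local stand-ins matching the types added in this PR.
#[derive(Debug, Deserialize)]
#[serde(tag = "type", rename_all = "lowercase")]
enum ModelMode {
    Default,
    Thinking { budget_tokens: Option<u32> },
}

#[derive(Debug, Deserialize)]
struct AvailableModel {
    name: String,
    display_name: Option<String>,
    max_tokens: usize,
    mode: Option<ModelMode>,
}

fn main() {
    // A budget of 0 disables thinking entirely, for fast inline-assistant replies.
    let entry = r#"{
        "name": "gemini-2.5-flash-preview",
        "display_name": "Gemini Flash (no thinking)",
        "max_tokens": 1000000,
        "mode": { "type": "thinking", "budget_tokens": 0 }
    }"#;
    let model: AvailableModel = serde_json::from_str(entry).unwrap();
    println!("{model:?}");
}
```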

Release Notes:

- ai: Added `thinking` mode for custom Google models with configurable
token budget

---------

Co-authored-by: Ben Brandt <benjamin.j.brandt@gmail.com>

3 changed files with 82 additions and 5 deletions

```diff
@@ -4,7 +4,8 @@ use credentials_provider::CredentialsProvider;
 use editor::{Editor, EditorElement, EditorStyle};
 use futures::{FutureExt, Stream, StreamExt, future::BoxFuture};
 use google_ai::{
-    FunctionDeclaration, GenerateContentResponse, Part, SystemInstruction, UsageMetadata,
+    FunctionDeclaration, GenerateContentResponse, GoogleModelMode, Part, SystemInstruction,
+    ThinkingConfig, UsageMetadata,
 };
 use gpui::{
     AnyView, App, AsyncApp, Context, Entity, FontStyle, Subscription, Task, TextStyle, WhiteSpace,
```
```diff
@@ -45,11 +46,41 @@ pub struct GoogleSettings {
     pub available_models: Vec<AvailableModel>,
 }
 
+#[derive(Clone, Copy, Debug, Default, PartialEq, Serialize, Deserialize, JsonSchema)]
+#[serde(tag = "type", rename_all = "lowercase")]
+pub enum ModelMode {
+    #[default]
+    Default,
+    Thinking {
+        /// The maximum number of tokens to use for reasoning. Must be lower than the model's `max_output_tokens`.
+        budget_tokens: Option<u32>,
+    },
+}
+
+impl From<ModelMode> for GoogleModelMode {
+    fn from(value: ModelMode) -> Self {
+        match value {
+            ModelMode::Default => GoogleModelMode::Default,
+            ModelMode::Thinking { budget_tokens } => GoogleModelMode::Thinking { budget_tokens },
+        }
+    }
+}
+
+impl From<GoogleModelMode> for ModelMode {
+    fn from(value: GoogleModelMode) -> Self {
+        match value {
+            GoogleModelMode::Default => ModelMode::Default,
+            GoogleModelMode::Thinking { budget_tokens } => ModelMode::Thinking { budget_tokens },
+        }
+    }
+}
+
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, JsonSchema)]
 pub struct AvailableModel {
     name: String,
     display_name: Option<String>,
     max_tokens: usize,
+    mode: Option<ModelMode>,
 }
 
 pub struct GoogleLanguageModelProvider {
```
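
The two `From` impls make the settings-level `ModelMode` and the API-level `google_ai::GoogleModelMode` interchangeable. A standalone sketch of how the next hunk's `model.mode.unwrap_or_default().into()` resolves (both enums redeclared locally for the example):

```rust
#[derive(Clone, Copy, Debug, Default, PartialEq)]
enum ModelMode {
    #[default]
    Default,
    Thinking { budget_tokens: Option<u32> },
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum GoogleModelMode {
    Default,
    Thinking { budget_tokens: Option<u32> },
}

impl From<ModelMode> for GoogleModelMode {
    fn from(value: ModelMode) -> Self {
        match value {
            ModelMode::Default => GoogleModelMode::Default,
            ModelMode::Thinking { budget_tokens } => GoogleModelMode::Thinking { budget_tokens },
        }
    }
}

fn main() {
    // A model with no `mode` in settings falls back to ModelMode::Default
    // before converting, so the request carries no thinking config.
    let configured: Option<ModelMode> = None;
    let mode: GoogleModelMode = configured.unwrap_or_default().into();
    assert_eq!(mode, GoogleModelMode::Default);

    // An explicit thinking budget passes straight through.
    let fast: GoogleModelMode = ModelMode::Thinking { budget_tokens: Some(0) }.into();
    assert_eq!(fast, GoogleModelMode::Thinking { budget_tokens: Some(0) });
}
```
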
```diff
@@ -216,6 +247,7 @@ impl LanguageModelProvider for GoogleLanguageModelProvider {
                     name: model.name.clone(),
                     display_name: model.display_name.clone(),
                     max_tokens: model.max_tokens,
+                    mode: model.mode.unwrap_or_default().into(),
                 },
             );
         }
```
```diff
@@ -343,7 +375,7 @@ impl LanguageModel for GoogleLanguageModel {
         cx: &App,
     ) -> BoxFuture<'static, Result<usize>> {
         let model_id = self.model.id().to_string();
-        let request = into_google(request, model_id.clone());
+        let request = into_google(request, model_id.clone(), self.model.mode());
         let http_client = self.http_client.clone();
         let api_key = self.state.read(cx).api_key.clone();
 
```
```diff
@@ -379,7 +411,7 @@ impl LanguageModel for GoogleLanguageModel {
             >,
         >,
     > {
-        let request = into_google(request, self.model.id().to_string());
+        let request = into_google(request, self.model.id().to_string(), self.model.mode());
         let request = self.stream_completion(request, cx);
         let future = self.request_limiter.stream(async move {
             let response = request
```
```diff
@@ -394,6 +426,7 @@ impl LanguageModel for GoogleLanguageModel {
 pub fn into_google(
     mut request: LanguageModelRequest,
     model_id: String,
+    mode: GoogleModelMode,
 ) -> google_ai::GenerateContentRequest {
     fn map_content(content: Vec<MessageContent>) -> Vec<Part> {
         content
```
```diff
@@ -504,6 +537,12 @@ pub fn into_google(
         stop_sequences: Some(request.stop),
         max_output_tokens: None,
         temperature: request.temperature.map(|t| t as f64).or(Some(1.0)),
+        thinking_config: match mode {
+            GoogleModelMode::Thinking { budget_tokens } => {
+                budget_tokens.map(|thinking_budget| ThinkingConfig { thinking_budget })
+            }
+            GoogleModelMode::Default => None,
+        },
         top_p: None,
         top_k: None,
     }),
```
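
In effect, only `Thinking` with an explicit budget emits a `ThinkingConfig`; `Thinking` without a budget and `Default` both omit it, leaving Gemini to pick the budget automatically. A standalone sketch of that mapping (local stand-ins for the `google_ai` types, whose fields beyond `thinking_budget` are not shown in this diff):

```rust
#[derive(Clone, Copy, Debug)]
enum GoogleModelMode {
    Default,
    Thinking { budget_tokens: Option<u32> },
}

#[derive(Debug, PartialEq)]
struct ThinkingConfig {
    thinking_budget: u32,
}

fn thinking_config(mode: GoogleModelMode) -> Option<ThinkingConfig> {
    match mode {
        GoogleModelMode::Thinking { budget_tokens } => {
            budget_tokens.map(|thinking_budget| ThinkingConfig { thinking_budget })
        }
        GoogleModelMode::Default => None,
    }
}

fn main() {
    // A budget of 0 asks Gemini not to think at all (fast inline responses)...
    assert_eq!(
        thinking_config(GoogleModelMode::Thinking { budget_tokens: Some(0) }),
        Some(ThinkingConfig { thinking_budget: 0 })
    );
    // ...while omitting the budget (or using Default mode) sends nothing,
    // so Gemini keeps choosing the budget on its own.
    assert!(thinking_config(GoogleModelMode::Thinking { budget_tokens: None }).is_none());
    assert!(thinking_config(GoogleModelMode::Default).is_none());
}
```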