agent: Disable thinking when using inline assistant/edit file tool (#34141)

This introduces a new field `thinking_allowed` on `LanguageModelRequest`
which lets us control whether thinking should be enabled if the model
supports it.
We disable thinking in the Inline Assistant, Edit File tool and the Git
Commit message generator; this should make generation faster when using
a thinking model, e.g. `claude-sonnet-4-thinking`

Release Notes:

- N/A
This commit is contained in:
Bennet Bo Fenner 2025-07-09 20:05:39 +02:00 committed by GitHub
parent 96ff6d86a3
commit 41fe2a2ab4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 37 additions and 8 deletions

View file

@ -663,7 +663,9 @@ pub fn into_anthropic(
} else {
Some(anthropic::StringOrContents::String(system_message))
},
thinking: if let AnthropicModelMode::Thinking { budget_tokens } = mode {
thinking: if request.thinking_allowed
&& let AnthropicModelMode::Thinking { budget_tokens } = mode
{
Some(anthropic::Thinking::Enabled { budget_tokens })
} else {
None
@ -1108,6 +1110,7 @@ mod tests {
temperature: None,
tools: vec![],
tool_choice: None,
thinking_allowed: true,
};
let anthropic_request = into_anthropic(

View file

@ -799,7 +799,9 @@ pub fn into_bedrock(
max_tokens: max_output_tokens,
system: Some(system_message),
tools: Some(tool_config),
thinking: if let BedrockModelMode::Thinking { budget_tokens } = mode {
thinking: if request.thinking_allowed
&& let BedrockModelMode::Thinking { budget_tokens } = mode
{
Some(bedrock::Thinking::Enabled { budget_tokens })
} else {
None

View file

@ -849,6 +849,7 @@ impl LanguageModel for CloudLanguageModel {
let use_cloud = cx
.update(|cx| cx.has_flag::<ZedCloudFeatureFlag>())
.unwrap_or(false);
let thinking_allowed = request.thinking_allowed;
match self.model.provider {
zed_llm_client::LanguageModelProvider::Anthropic => {
let request = into_anthropic(
@ -856,7 +857,7 @@ impl LanguageModel for CloudLanguageModel {
self.model.id.to_string(),
1.0,
self.model.max_output_tokens as u64,
if self.model.id.0.ends_with("-thinking") {
if thinking_allowed && self.model.id.0.ends_with("-thinking") {
AnthropicModelMode::Thinking {
budget_tokens: Some(4_096),
}

View file

@ -559,11 +559,11 @@ pub fn into_google(
stop_sequences: Some(request.stop),
max_output_tokens: None,
temperature: request.temperature.map(|t| t as f64).or(Some(1.0)),
thinking_config: match mode {
GoogleModelMode::Thinking { budget_tokens } => {
thinking_config: match (request.thinking_allowed, mode) {
(true, GoogleModelMode::Thinking { budget_tokens }) => {
budget_tokens.map(|thinking_budget| ThinkingConfig { thinking_budget })
}
GoogleModelMode::Default => None,
_ => None,
},
top_p: None,
top_k: None,

View file

@ -911,6 +911,7 @@ mod tests {
intent: None,
mode: None,
stop: vec![],
thinking_allowed: true,
};
let mistral_request = into_mistral(request, "mistral-small-latest".into(), None);
@ -943,6 +944,7 @@ mod tests {
intent: None,
mode: None,
stop: vec![],
thinking_allowed: true,
};
let mistral_request = into_mistral(request, "pixtral-12b-latest".into(), None);

View file

@ -334,7 +334,10 @@ impl OllamaLanguageModel {
temperature: request.temperature.or(Some(1.0)),
..Default::default()
}),
think: self.model.supports_thinking,
think: self
.model
.supports_thinking
.map(|supports_thinking| supports_thinking && request.thinking_allowed),
tools: request.tools.into_iter().map(tool_into_ollama).collect(),
}
}

View file

@ -999,6 +999,7 @@ mod tests {
tool_choice: None,
stop: vec![],
temperature: None,
thinking_allowed: true,
};
// Validate that all models are supported by tiktoken-rs

View file

@ -523,7 +523,9 @@ pub fn into_open_router(
None
},
usage: open_router::RequestUsage { include: true },
reasoning: if let OpenRouterModelMode::Thinking { budget_tokens } = model.mode {
reasoning: if request.thinking_allowed
&& let OpenRouterModelMode::Thinking { budget_tokens } = model.mode
{
Some(open_router::Reasoning {
effort: None,
max_tokens: budget_tokens,