agent: Disable thinking when using inline assistant/edit file tool (#34141)

This introduces a new field `thinking_allowed` on `LanguageModelRequest`
which lets us control whether thinking should be enabled if the model
supports it.
We disable thinking in the Inline Assistant, the Edit File tool, and the Git
Commit message generator; this should make generation faster when using
a thinking model, e.g. `claude-sonnet-4-thinking`.
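
A minimal sketch (not part of this commit's diff) of how a call site opts out through the new field: callers that want fast, non-thinking output set `thinking_allowed: false`, while agentic callers pass `true`. The `messages` value is a placeholder, and the remaining fields are assumed to come from `LanguageModelRequest`'s `Default` implementation, which the test hunks below also rely on.

```rust
// Hypothetical call site, sketched from the fields visible in this diff.
let request = LanguageModelRequest {
    messages, // placeholder for whatever prompt the feature assembles
    temperature: None,
    // Skip extended thinking even when the selected model supports it,
    // as the Inline Assistant, Edit File tool, and commit-message generator now do.
    thinking_allowed: false,
    ..Default::default()
};
```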

Release Notes:

- N/A
Authored by Bennet Bo Fenner on 2025-07-09 20:05:39 +02:00; committed by GitHub
commit 41fe2a2ab4 (parent 96ff6d86a3)
21 changed files with 37 additions and 8 deletions

@@ -1284,6 +1284,7 @@ impl Thread {
 tool_choice: None,
 stop: Vec::new(),
 temperature: AgentSettings::temperature_for_model(&model, cx),
+thinking_allowed: true,
 };
 let available_tools = self.available_tools(cx, model.clone());
@@ -1449,6 +1450,7 @@ impl Thread {
 tool_choice: None,
 stop: Vec::new(),
 temperature: AgentSettings::temperature_for_model(model, cx),
+thinking_allowed: false,
 };
 for message in &self.messages {

@@ -1461,6 +1461,7 @@ impl ActiveThread {
 &configured_model.model,
 cx,
 ),
+thinking_allowed: true,
 };
 Some(configured_model.model.count_tokens(request, cx))

@@ -475,6 +475,7 @@ impl CodegenAlternative {
 stop: Vec::new(),
 temperature,
 messages: vec![request_message],
+thinking_allowed: false,
 }
 }))
 }

@@ -1454,6 +1454,7 @@ impl MessageEditor {
 tool_choice: None,
 stop: vec![],
 temperature: AgentSettings::temperature_for_model(&model.model, cx),
+thinking_allowed: true,
 };
 Some(model.model.count_tokens(request, cx))

@@ -297,6 +297,7 @@ impl TerminalInlineAssistant {
 tool_choice: None,
 stop: Vec::new(),
 temperature,
+thinking_allowed: false,
 }
 }))
 }

@@ -2293,6 +2293,7 @@ impl AssistantContext {
 tool_choice: None,
 stop: Vec::new(),
 temperature: model.and_then(|model| AgentSettings::temperature_for_model(model, cx)),
+thinking_allowed: true,
 };
 for message in self.messages(cx) {
 if message.status != MessageStatus::Done {

@@ -719,6 +719,7 @@ impl EditAgent {
 tools,
 stop: Vec::new(),
 temperature: None,
+thinking_allowed: false,
 };
 Ok(self.model.stream_completion_text(request, cx).await?.stream)

@@ -1263,6 +1263,7 @@ impl EvalAssertion {
 content: vec![prompt.into()],
 cache: false,
 }],
+thinking_allowed: true,
 ..Default::default()
 };
 let mut response = retry_on_rate_limit(async || {
@@ -1599,6 +1600,7 @@ impl EditAgentTest {
 let conversation = LanguageModelRequest {
 messages,
 tools,
+thinking_allowed: true,
 ..Default::default()
 };

@@ -594,6 +594,7 @@ impl ExampleInstance {
 tools: Vec::new(),
 tool_choice: None,
 stop: Vec::new(),
+thinking_allowed: true,
 };
 let model = model.clone();

@@ -1830,6 +1830,7 @@ impl GitPanel {
 tool_choice: None,
 stop: Vec::new(),
 temperature,
+thinking_allowed: false,
 };
 let stream = model.stream_completion_text(request, &cx);

@@ -391,6 +391,7 @@ pub struct LanguageModelRequest {
 pub tool_choice: Option<LanguageModelToolChoice>,
 pub stop: Vec<String>,
 pub temperature: Option<f32>,
+pub thinking_allowed: bool,
 }
 #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]

@@ -663,7 +663,9 @@ pub fn into_anthropic(
 } else {
 Some(anthropic::StringOrContents::String(system_message))
 },
-thinking: if let AnthropicModelMode::Thinking { budget_tokens } = mode {
+thinking: if request.thinking_allowed
+    && let AnthropicModelMode::Thinking { budget_tokens } = mode
+{
 Some(anthropic::Thinking::Enabled { budget_tokens })
 } else {
 None
@@ -1108,6 +1110,7 @@ mod tests {
 temperature: None,
 tools: vec![],
 tool_choice: None,
+thinking_allowed: true,
 };
 let anthropic_request = into_anthropic(

@@ -799,7 +799,9 @@ pub fn into_bedrock(
 max_tokens: max_output_tokens,
 system: Some(system_message),
 tools: Some(tool_config),
-thinking: if let BedrockModelMode::Thinking { budget_tokens } = mode {
+thinking: if request.thinking_allowed
+    && let BedrockModelMode::Thinking { budget_tokens } = mode
+{
 Some(bedrock::Thinking::Enabled { budget_tokens })
 } else {
 None

@@ -849,6 +849,7 @@ impl LanguageModel for CloudLanguageModel {
 let use_cloud = cx
 .update(|cx| cx.has_flag::<ZedCloudFeatureFlag>())
 .unwrap_or(false);
+let thinking_allowed = request.thinking_allowed;
 match self.model.provider {
 zed_llm_client::LanguageModelProvider::Anthropic => {
 let request = into_anthropic(
@@ -856,7 +857,7 @@ impl LanguageModel for CloudLanguageModel {
 self.model.id.to_string(),
 1.0,
 self.model.max_output_tokens as u64,
-if self.model.id.0.ends_with("-thinking") {
+if thinking_allowed && self.model.id.0.ends_with("-thinking") {
 AnthropicModelMode::Thinking {
 budget_tokens: Some(4_096),
 }

@@ -559,11 +559,11 @@ pub fn into_google(
 stop_sequences: Some(request.stop),
 max_output_tokens: None,
 temperature: request.temperature.map(|t| t as f64).or(Some(1.0)),
-thinking_config: match mode {
-GoogleModelMode::Thinking { budget_tokens } => {
+thinking_config: match (request.thinking_allowed, mode) {
+(true, GoogleModelMode::Thinking { budget_tokens }) => {
 budget_tokens.map(|thinking_budget| ThinkingConfig { thinking_budget })
 }
-GoogleModelMode::Default => None,
+_ => None,
 },
 top_p: None,
 top_k: None,

@@ -911,6 +911,7 @@ mod tests {
 intent: None,
 mode: None,
 stop: vec![],
+thinking_allowed: true,
 };
 let mistral_request = into_mistral(request, "mistral-small-latest".into(), None);
@@ -943,6 +944,7 @@ mod tests {
 intent: None,
 mode: None,
 stop: vec![],
+thinking_allowed: true,
 };
 let mistral_request = into_mistral(request, "pixtral-12b-latest".into(), None);

@@ -334,7 +334,10 @@ impl OllamaLanguageModel {
 temperature: request.temperature.or(Some(1.0)),
 ..Default::default()
 }),
-think: self.model.supports_thinking,
+think: self
+    .model
+    .supports_thinking
+    .map(|supports_thinking| supports_thinking && request.thinking_allowed),
 tools: request.tools.into_iter().map(tool_into_ollama).collect(),
 }
 }

@@ -999,6 +999,7 @@ mod tests {
 tool_choice: None,
 stop: vec![],
 temperature: None,
+thinking_allowed: true,
 };
 // Validate that all models are supported by tiktoken-rs

@@ -523,7 +523,9 @@ pub fn into_open_router(
 None
 },
 usage: open_router::RequestUsage { include: true },
-reasoning: if let OpenRouterModelMode::Thinking { budget_tokens } = model.mode {
+reasoning: if request.thinking_allowed
+    && let OpenRouterModelMode::Thinking { budget_tokens } = model.mode
+{
 Some(open_router::Reasoning {
 effort: None,
 max_tokens: budget_tokens,

@@ -981,6 +981,7 @@ impl RulesLibrary {
 tool_choice: None,
 stop: Vec::new(),
 temperature: None,
+thinking_allowed: true,
 },
 cx,
 )

@@ -570,6 +570,7 @@ impl SummaryIndex {
 tool_choice: None,
 stop: Vec::new(),
 temperature: None,
+thinking_allowed: true,
 };
 let code_len = code.len();