From edc7d7364384a36ee5ec7fa016ffdf01eefa12c0 Mon Sep 17 00:00:00 2001
From: Richard Feldman
Date: Thu, 27 Mar 2025 18:32:50 -0400
Subject: [PATCH] Set cache breakpoint on second-to-last message (#27632)

Here's a sample `dbg!` of token usage after this change, for a small agent
thread:

```
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 5354,
    output_tokens: 184,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 132,
    cache_creation_input_tokens: 5518,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 113,
    cache_creation_input_tokens: 166,
    cache_read_input_tokens: 5518,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 291,
    output_tokens: 181,
    cache_creation_input_tokens: 147,
    cache_read_input_tokens: 5684,
}
```

Release Notes:

- N/A
---
 crates/assistant2/src/thread.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/crates/assistant2/src/thread.rs b/crates/assistant2/src/thread.rs
index c75ead9dd3..5bcf6714ba 100644
--- a/crates/assistant2/src/thread.rs
+++ b/crates/assistant2/src/thread.rs
@@ -857,6 +857,13 @@ impl Thread {
             request.messages.push(request_message);
         }
 
+        // Set a cache breakpoint at the second-to-last message.
+        // https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
+        let breakpoint_index = request.messages.len() - 2;
+        for (index, message) in request.messages.iter_mut().enumerate() {
+            message.cache = index == breakpoint_index;
+        }
+
         if !referenced_context_ids.is_empty() {
             let mut context_message = LanguageModelRequestMessage {
                 role: Role::User,
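
For context, here is a minimal, self-contained sketch of the breakpoint-selection logic in the hunk above. It uses a hypothetical `Message` struct rather than Zed's actual `LanguageModelRequestMessage`, and it guards against threads with fewer than two messages via `checked_sub`, whereas the patch itself relies on at least two messages being present at that call site.

```rust
/// Hypothetical stand-in for the request message type; only the field
/// relevant to the cache-breakpoint logic is included.
struct Message {
    cache: bool,
}

/// Mark the second-to-last message as the prompt-caching breakpoint and
/// clear the flag on every other message. If there are fewer than two
/// messages, no breakpoint is set.
fn set_cache_breakpoint(messages: &mut [Message]) {
    let breakpoint_index = messages.len().checked_sub(2);
    for (index, message) in messages.iter_mut().enumerate() {
        message.cache = Some(index) == breakpoint_index;
    }
}

fn main() {
    let mut messages = vec![
        Message { cache: false },
        Message { cache: false },
        Message { cache: false },
    ];
    set_cache_breakpoint(&mut messages);
    // With three messages, the breakpoint lands on index 1.
    assert_eq!(
        messages.iter().map(|m| m.cache).collect::<Vec<_>>(),
        vec![false, true, false]
    );
    println!("breakpoint set on second-to-last message");
}
```

The effect of the breakpoint is visible in the `dbg!` output above: the first request reports `cache_creation_input_tokens` for the prompt prefix, and subsequent requests report most of those tokens as `cache_read_input_tokens` instead.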