Set cache breakpoint on second-to-last message (#27632)
Here's a sample `dbg!` of token usage after this change, for a small agent thread:

```
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 5354,
    output_tokens: 184,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 132,
    cache_creation_input_tokens: 5518,
    cache_read_input_tokens: 0,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 54,
    output_tokens: 113,
    cache_creation_input_tokens: 166,
    cache_read_input_tokens: 5518,
}
[crates/assistant2/src/thread.rs:1092:25] &usage = TokenUsage {
    input_tokens: 291,
    output_tokens: 181,
    cache_creation_input_tokens: 147,
    cache_read_input_tokens: 5684,
}
```

Release Notes:

- N/A
This commit is contained in:
parent
4839195003
commit
edc7d73643
1 changed file with 7 additions and 0 deletions
|
```diff
@@ -857,6 +857,13 @@ impl Thread {
                 request.messages.push(request_message);
             }
         }

+        // Set a cache breakpoint at the second-to-last message.
+        // https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
+        let breakpoint_index = request.messages.len() - 2;
+        for (index, message) in request.messages.iter_mut().enumerate() {
+            message.cache = index == breakpoint_index;
+        }
+
         if !referenced_context_ids.is_empty() {
             let mut context_message = LanguageModelRequestMessage {
                 role: Role::User,
```
|
|
Loading…
Add table
Add a link
Reference in a new issue