Implement Anthropic prompt caching (#16274)
Release Notes:

- Adds support for Prompt Caching in Anthropic. For models that support it, this can dramatically lower cost while improving performance.
parent 09b6e3f2a6
commit 46fb917e02

11 changed files with 338 additions and 70 deletions
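For context, the change rides on Anthropic's prompt caching API: content blocks marked with a cache_control breakpoint are written to, and later read from, a provider-side cache. Below is a minimal sketch of that request shape, built with serde_json from Anthropic's publicly documented fields; it is not code from this commit, and the model name and prompt text are placeholders.

use serde_json::json;

// Sketch only: the JSON an Anthropic Messages API request carries when opting a
// stable prefix into prompt caching. Everything up to the last block that has
// `cache_control` can be reused by later requests sharing the same prefix.
fn example_cached_request() -> serde_json::Value {
    json!({
        "model": "claude-3-5-sonnet-20240620",
        "max_tokens": 1024,
        "messages": [{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "<large, stable context such as project files>",
                    "cache_control": { "type": "ephemeral" }
                },
                {
                    "type": "text",
                    "text": "What does this function do?"
                }
            ]
        }]
    })
}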
@@ -20,7 +20,7 @@ pub use registry::*;
 pub use request::*;
 pub use role::*;
 use schemars::JsonSchema;
-use serde::de::DeserializeOwned;
+use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{future::Future, sync::Arc};
 use ui::IconName;

@@ -43,6 +43,14 @@ pub enum LanguageModelAvailability {
     RequiresPlan(Plan),
 }

+/// Configuration for caching language model messages.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, JsonSchema)]
+pub struct LanguageModelCacheConfiguration {
+    pub max_cache_anchors: usize,
+    pub should_speculate: bool,
+    pub min_total_token: usize,
+}
+
 pub trait LanguageModel: Send + Sync {
     fn id(&self) -> LanguageModelId;
     fn name(&self) -> LanguageModelName;
@@ -78,6 +86,10 @@ pub trait LanguageModel: Send + Sync {
         cx: &AsyncAppContext,
     ) -> BoxFuture<'static, Result<BoxStream<'static, Result<String>>>>;

+    fn cache_configuration(&self) -> Option<LanguageModelCacheConfiguration> {
+        None
+    }
+
     #[cfg(any(test, feature = "test-support"))]
     fn as_fake(&self) -> &provider::fake::FakeLanguageModel {
         unimplemented!()
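LanguageModelCacheConfiguration is the provider-agnostic description of a model's caching limits. Judging by how the rest of this commit uses it, the fields read as: max_cache_anchors, the number of cache breakpoints the provider accepts per request; should_speculate, whether callers should cache speculatively before a reply is requested; and min_total_token, the smallest prompt for which caching pays off. A hedged sketch of how downstream code can branch on the new default trait method follows; the helper functions are hypothetical, only cache_configuration() itself comes from this diff.

// Hypothetical helpers: providers that do not override cache_configuration()
// inherit the default above and simply report no caching support.
fn supports_prompt_caching(model: &dyn LanguageModel) -> bool {
    model.cache_configuration().is_some()
}

fn max_anchors_for(model: &dyn LanguageModel) -> usize {
    model
        .cache_configuration()
        .map(|config| config.max_cache_anchors)
        .unwrap_or(0)
}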
@@ -1,7 +1,7 @@
 use crate::{
-    settings::AllLanguageModelSettings, LanguageModel, LanguageModelId, LanguageModelName,
-    LanguageModelProvider, LanguageModelProviderId, LanguageModelProviderName,
-    LanguageModelProviderState, LanguageModelRequest, RateLimiter, Role,
+    settings::AllLanguageModelSettings, LanguageModel, LanguageModelCacheConfiguration,
+    LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
+    LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest, RateLimiter, Role,
 };
 use anthropic::AnthropicError;
 use anyhow::{anyhow, Context as _, Result};
@@ -38,6 +38,7 @@ pub struct AvailableModel {
     pub name: String,
     pub max_tokens: usize,
     pub tool_override: Option<String>,
+    pub cache_configuration: Option<LanguageModelCacheConfiguration>,
 }

 pub struct AnthropicLanguageModelProvider {
@@ -171,6 +172,13 @@ impl LanguageModelProvider for AnthropicLanguageModelProvider {
                     name: model.name.clone(),
                     max_tokens: model.max_tokens,
                     tool_override: model.tool_override.clone(),
+                    cache_configuration: model.cache_configuration.as_ref().map(|config| {
+                        anthropic::AnthropicModelCacheConfiguration {
+                            max_cache_anchors: config.max_cache_anchors,
+                            should_speculate: config.should_speculate,
+                            min_total_token: config.min_total_token,
+                        }
+                    }),
                 },
             );
         }
@@ -351,6 +359,16 @@ impl LanguageModel for AnthropicModel {
             .boxed()
     }

+    fn cache_configuration(&self) -> Option<LanguageModelCacheConfiguration> {
+        self.model
+            .cache_configuration()
+            .map(|config| LanguageModelCacheConfiguration {
+                max_cache_anchors: config.max_cache_anchors,
+                should_speculate: config.should_speculate,
+                min_total_token: config.min_total_token,
+            })
+    }
+
     fn use_any_tool(
         &self,
         request: LanguageModelRequest,
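With the new field on AvailableModel, a custom Anthropic model declared in settings can carry its own cache limits, which the provider then translates into anthropic::AnthropicModelCacheConfiguration as shown above. A hypothetical registration follows; the field names match the structs in this diff, while the model name and numbers are placeholders rather than defaults shipped by this commit.

// Hypothetical custom model entry with prompt caching enabled.
fn example_custom_model() -> AvailableModel {
    AvailableModel {
        name: "claude-3-5-sonnet-20240620".to_string(),
        max_tokens: 200_000,
        tool_override: None,
        cache_configuration: Some(LanguageModelCacheConfiguration {
            max_cache_anchors: 4,     // illustrative anchor budget
            should_speculate: true,   // cache speculatively
            min_total_token: 10_000,  // skip caching for short prompts
        }),
    }
}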
@@ -1,7 +1,7 @@
 use super::open_ai::count_open_ai_tokens;
 use crate::{
-    settings::AllLanguageModelSettings, CloudModel, LanguageModel, LanguageModelId,
-    LanguageModelName, LanguageModelProviderId, LanguageModelProviderName,
+    settings::AllLanguageModelSettings, CloudModel, LanguageModel, LanguageModelCacheConfiguration,
+    LanguageModelId, LanguageModelName, LanguageModelProviderId, LanguageModelProviderName,
     LanguageModelProviderState, LanguageModelRequest, RateLimiter, ZedModel,
 };
 use anthropic::AnthropicError;
@@ -56,6 +56,7 @@ pub struct AvailableModel {
     name: String,
     max_tokens: usize,
     tool_override: Option<String>,
+    cache_configuration: Option<LanguageModelCacheConfiguration>,
 }

 pub struct CloudLanguageModelProvider {
@@ -202,6 +203,13 @@ impl LanguageModelProvider for CloudLanguageModelProvider {
                 name: model.name.clone(),
                 max_tokens: model.max_tokens,
                 tool_override: model.tool_override.clone(),
+                cache_configuration: model.cache_configuration.as_ref().map(|config| {
+                    anthropic::AnthropicModelCacheConfiguration {
+                        max_cache_anchors: config.max_cache_anchors,
+                        should_speculate: config.should_speculate,
+                        min_total_token: config.min_total_token,
+                    }
+                }),
             })
         }
         AvailableProvider::OpenAi => CloudModel::OpenAi(open_ai::Model::Custom {
@@ -193,6 +193,7 @@ impl From<&str> for MessageContent {
 pub struct LanguageModelRequestMessage {
     pub role: Role,
     pub content: Vec<MessageContent>,
+    pub cache: bool,
 }

 impl LanguageModelRequestMessage {
@@ -213,7 +214,7 @@ impl LanguageModelRequestMessage {
             .content
             .get(0)
             .map(|content| match content {
-                MessageContent::Text(s) => s.is_empty(),
+                MessageContent::Text(s) => s.trim().is_empty(),
                 MessageContent::Image(_) => true,
             })
             .unwrap_or(false)
@@ -286,7 +287,7 @@ impl LanguageModelRequest {
     }

     pub fn into_anthropic(self, model: String) -> anthropic::Request {
-        let mut new_messages: Vec<LanguageModelRequestMessage> = Vec::new();
+        let mut new_messages: Vec<anthropic::Message> = Vec::new();
         let mut system_message = String::new();

         for message in self.messages {
@@ -296,18 +297,50 @@ impl LanguageModelRequest {

             match message.role {
                 Role::User | Role::Assistant => {
+                    let cache_control = if message.cache {
+                        Some(anthropic::CacheControl {
+                            cache_type: anthropic::CacheControlType::Ephemeral,
+                        })
+                    } else {
+                        None
+                    };
+                    let anthropic_message_content: Vec<anthropic::Content> = message
+                        .content
+                        .into_iter()
+                        // TODO: filter out the empty messages in the message construction step
+                        .filter_map(|content| match content {
+                            MessageContent::Text(t) if !t.is_empty() => {
+                                Some(anthropic::Content::Text {
+                                    text: t,
+                                    cache_control,
+                                })
+                            }
+                            MessageContent::Image(i) => Some(anthropic::Content::Image {
+                                source: anthropic::ImageSource {
+                                    source_type: "base64".to_string(),
+                                    media_type: "image/png".to_string(),
+                                    data: i.source.to_string(),
+                                },
+                                cache_control,
+                            }),
+                            _ => None,
+                        })
+                        .collect();
+                    let anthropic_role = match message.role {
+                        Role::User => anthropic::Role::User,
+                        Role::Assistant => anthropic::Role::Assistant,
+                        Role::System => unreachable!("System role should never occur here"),
+                    };
                     if let Some(last_message) = new_messages.last_mut() {
-                        if last_message.role == message.role {
-                            // TODO: is this append done properly?
-                            last_message.content.push(MessageContent::Text(format!(
-                                "\n\n{}",
-                                message.string_contents()
-                            )));
+                        if last_message.role == anthropic_role {
+                            last_message.content.extend(anthropic_message_content);
                             continue;
                         }
                     }

-                    new_messages.push(message);
+                    new_messages.push(anthropic::Message {
+                        role: anthropic_role,
+                        content: anthropic_message_content,
+                    });
                 }
                 Role::System => {
                     if !system_message.is_empty() {
@@ -320,36 +353,7 @@ impl LanguageModelRequest {

         anthropic::Request {
             model,
-            messages: new_messages
-                .into_iter()
-                .filter_map(|message| {
-                    Some(anthropic::Message {
-                        role: match message.role {
-                            Role::User => anthropic::Role::User,
-                            Role::Assistant => anthropic::Role::Assistant,
-                            Role::System => return None,
-                        },
-                        content: message
-                            .content
-                            .into_iter()
-                            // TODO: filter out the empty messages in the message construction step
-                            .filter_map(|content| match content {
-                                MessageContent::Text(t) if !t.is_empty() => {
-                                    Some(anthropic::Content::Text { text: t })
-                                }
-                                MessageContent::Image(i) => Some(anthropic::Content::Image {
-                                    source: anthropic::ImageSource {
-                                        source_type: "base64".to_string(),
-                                        media_type: "image/png".to_string(),
-                                        data: i.source.to_string(),
-                                    },
-                                }),
-                                _ => None,
-                            })
-                            .collect(),
-                    })
-                })
-                .collect(),
+            messages: new_messages,
             max_tokens: 4092,
             system: Some(system_message),
             tools: Vec::new(),
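The request-side change is the new cache flag on LanguageModelRequestMessage: in into_anthropic, every content block of a flagged message gets an ephemeral cache_control, and consecutive same-role messages are merged as anthropic::Content values instead of being flattened into one string. How callers decide which messages to flag is outside these hunks; below is a speculative sketch of one such policy, where mark_cache_anchors and its inputs are assumptions, and only the cache field, cache_configuration(), and the configuration fields come from this commit.

// Speculative sketch: place cache anchors on the most recent messages,
// respecting the provider's advertised limits.
fn mark_cache_anchors(
    model: &dyn LanguageModel,
    total_tokens: usize,
    messages: &mut [LanguageModelRequestMessage],
) {
    let Some(config) = model.cache_configuration() else {
        return; // provider reports no prompt-caching support
    };
    if total_tokens < config.min_total_token {
        return; // prompt too small for caching to pay off
    }
    // Flag at most max_cache_anchors messages, newest first.
    let anchors = config.max_cache_anchors.min(messages.len());
    for message in messages.iter_mut().rev().take(anchors) {
        message.cache = true;
    }
}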
@@ -7,14 +7,17 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use settings::{update_settings_file, Settings, SettingsSources};

-use crate::provider::{
-    self,
-    anthropic::AnthropicSettings,
-    cloud::{self, ZedDotDevSettings},
-    copilot_chat::CopilotChatSettings,
-    google::GoogleSettings,
-    ollama::OllamaSettings,
-    open_ai::OpenAiSettings,
+use crate::{
+    provider::{
+        self,
+        anthropic::AnthropicSettings,
+        cloud::{self, ZedDotDevSettings},
+        copilot_chat::CopilotChatSettings,
+        google::GoogleSettings,
+        ollama::OllamaSettings,
+        open_ai::OpenAiSettings,
+    },
+    LanguageModelCacheConfiguration,
 };

 /// Initializes the language model settings.
@@ -93,10 +96,18 @@ impl AnthropicSettingsContent {
                         name,
                         max_tokens,
                         tool_override,
+                        cache_configuration,
                     } => Some(provider::anthropic::AvailableModel {
                         name,
                         max_tokens,
                         tool_override,
+                        cache_configuration: cache_configuration.as_ref().map(
+                            |config| LanguageModelCacheConfiguration {
+                                max_cache_anchors: config.max_cache_anchors,
+                                should_speculate: config.should_speculate,
+                                min_total_token: config.min_total_token,
+                            },
+                        ),
                     }),
                     _ => None,
                 })
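On the settings side, AnthropicSettingsContent now forwards an optional cache_configuration into the provider's AvailableModel. Because LanguageModelCacheConfiguration derives Serialize, Deserialize, and JsonSchema, a user-supplied model entry carries exactly these field names. A small parsing sketch follows; the JSON value is hypothetical and would normally come from the user's settings file rather than being inlined.

use serde_json::json;

// Sketch: parsing the cache section of a hypothetical custom-model settings
// entry into the struct introduced earlier in this commit.
fn parse_cache_settings() -> serde_json::Result<LanguageModelCacheConfiguration> {
    serde_json::from_value(json!({
        "max_cache_anchors": 4,
        "should_speculate": true,
        "min_total_token": 10000
    }))
}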