Implement Anthropic prompt caching (#16274)

Release Notes: - Added support for prompt caching with Anthropic models. For models that support it, this can dramatically lower cost while improving performance.
2024-08-15 23:21:06 -04:00 · 2024-08-15 23:21:06 -04:00 · 46fb917e02
commit 46fb917e02
parent 09b6e3f2a6
11 changed files with 338 additions and 70 deletions
--- a/crates/language_model/src/language_model.rs
+++ b/crates/language_model/src/language_model.rs
@ -20,7 +20,7 @@ pub use registry::*;
 pub use request::*;
 pub use role::*;
 use schemars::JsonSchema;
-use serde::de::DeserializeOwned;
+use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{future::Future, sync::Arc};
 use ui::IconName;

@ -43,6 +43,14 @@ pub enum LanguageModelAvailability {
    RequiresPlan(Plan),
 }

+/// Configuration for caching language model messages.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, JsonSchema)]
+pub struct LanguageModelCacheConfiguration {
+    pub max_cache_anchors: usize,
+    pub should_speculate: bool,
+    pub min_total_token: usize,
+}
+
 pub trait LanguageModel: Send + Sync {
    fn id(&self) -> LanguageModelId;
    fn name(&self) -> LanguageModelName;
@ -78,6 +86,10 @@ pub trait LanguageModel: Send + Sync {
        cx: &AsyncAppContext,
    ) -> BoxFuture<'static, Result<BoxStream<'static, Result<String>>>>;

+    fn cache_configuration(&self) -> Option<LanguageModelCacheConfiguration> {
+        None
+    }
+
    #[cfg(any(test, feature = "test-support"))]
    fn as_fake(&self) -> &provider::fake::FakeLanguageModel {
        unimplemented!()
--- a/crates/language_model/src/provider/anthropic.rs
+++ b/crates/language_model/src/provider/anthropic.rs
@ -1,7 +1,7 @@
 use crate::{
-    settings::AllLanguageModelSettings, LanguageModel, LanguageModelId, LanguageModelName,
-    LanguageModelProvider, LanguageModelProviderId, LanguageModelProviderName,
-    LanguageModelProviderState, LanguageModelRequest, RateLimiter, Role,
+    settings::AllLanguageModelSettings, LanguageModel, LanguageModelCacheConfiguration,
+    LanguageModelId, LanguageModelName, LanguageModelProvider, LanguageModelProviderId,
+    LanguageModelProviderName, LanguageModelProviderState, LanguageModelRequest, RateLimiter, Role,
 };
 use anthropic::AnthropicError;
 use anyhow::{anyhow, Context as _, Result};
@ -38,6 +38,7 @@ pub struct AvailableModel {
    pub name: String,
    pub max_tokens: usize,
    pub tool_override: Option<String>,
+    pub cache_configuration: Option<LanguageModelCacheConfiguration>,
 }

 pub struct AnthropicLanguageModelProvider {
@ -171,6 +172,13 @@ impl LanguageModelProvider for AnthropicLanguageModelProvider {
                    name: model.name.clone(),
                    max_tokens: model.max_tokens,
                    tool_override: model.tool_override.clone(),
+                    cache_configuration: model.cache_configuration.as_ref().map(|config| {
+                        anthropic::AnthropicModelCacheConfiguration {
+                            max_cache_anchors: config.max_cache_anchors,
+                            should_speculate: config.should_speculate,
+                            min_total_token: config.min_total_token,
+                        }
+                    }),
                },
            );
        }
@ -351,6 +359,16 @@ impl LanguageModel for AnthropicModel {
        .boxed()
    }

+    fn cache_configuration(&self) -> Option<LanguageModelCacheConfiguration> {
+        self.model
+            .cache_configuration()
+            .map(|config| LanguageModelCacheConfiguration {
+                max_cache_anchors: config.max_cache_anchors,
+                should_speculate: config.should_speculate,
+                min_total_token: config.min_total_token,
+            })
+    }
+
    fn use_any_tool(
        &self,
        request: LanguageModelRequest,
--- a/crates/language_model/src/provider/cloud.rs
+++ b/crates/language_model/src/provider/cloud.rs
@ -1,7 +1,7 @@
 use super::open_ai::count_open_ai_tokens;
 use crate::{
-    settings::AllLanguageModelSettings, CloudModel, LanguageModel, LanguageModelId,
-    LanguageModelName, LanguageModelProviderId, LanguageModelProviderName,
+    settings::AllLanguageModelSettings, CloudModel, LanguageModel, LanguageModelCacheConfiguration,
+    LanguageModelId, LanguageModelName, LanguageModelProviderId, LanguageModelProviderName,
    LanguageModelProviderState, LanguageModelRequest, RateLimiter, ZedModel,
 };
 use anthropic::AnthropicError;
@ -56,6 +56,7 @@ pub struct AvailableModel {
    name: String,
    max_tokens: usize,
    tool_override: Option<String>,
+    cache_configuration: Option<LanguageModelCacheConfiguration>,
 }

 pub struct CloudLanguageModelProvider {
@ -202,6 +203,13 @@ impl LanguageModelProvider for CloudLanguageModelProvider {
                            name: model.name.clone(),
                            max_tokens: model.max_tokens,
                            tool_override: model.tool_override.clone(),
+                            cache_configuration: model.cache_configuration.as_ref().map(|config| {
+                                anthropic::AnthropicModelCacheConfiguration {
+                                    max_cache_anchors: config.max_cache_anchors,
+                                    should_speculate: config.should_speculate,
+                                    min_total_token: config.min_total_token,
+                                }
+                            }),
                        })
                    }
                    AvailableProvider::OpenAi => CloudModel::OpenAi(open_ai::Model::Custom {
--- a/crates/language_model/src/request.rs
+++ b/crates/language_model/src/request.rs
@ -193,6 +193,7 @@ impl From<&str> for MessageContent {
 pub struct LanguageModelRequestMessage {
    pub role: Role,
    pub content: Vec<MessageContent>,
+    pub cache: bool,
 }

 impl LanguageModelRequestMessage {
@ -213,7 +214,7 @@ impl LanguageModelRequestMessage {
                .content
                .get(0)
                .map(|content| match content {
-                    MessageContent::Text(s) => s.is_empty(),
+                    MessageContent::Text(s) => s.trim().is_empty(),
                    MessageContent::Image(_) => true,
                })
                .unwrap_or(false)
@ -286,7 +287,7 @@ impl LanguageModelRequest {
    }

    pub fn into_anthropic(self, model: String) -> anthropic::Request {
-        let mut new_messages: Vec<LanguageModelRequestMessage> = Vec::new();
+        let mut new_messages: Vec<anthropic::Message> = Vec::new();
        let mut system_message = String::new();

        for message in self.messages {
@ -296,18 +297,50 @@ impl LanguageModelRequest {

            match message.role {
                Role::User | Role::Assistant => {
+                    let cache_control = if message.cache {
+                        Some(anthropic::CacheControl {
+                            cache_type: anthropic::CacheControlType::Ephemeral,
+                        })
+                    } else {
+                        None
+                    };
+                    let anthropic_message_content: Vec<anthropic::Content> = message
+                        .content
+                        .into_iter()
+                        // TODO: filter out the empty messages in the message construction step
+                        .filter_map(|content| match content {
+                            MessageContent::Text(t) if !t.is_empty() => {
+                                Some(anthropic::Content::Text {
+                                    text: t,
+                                    cache_control,
+                                })
+                            }
+                            MessageContent::Image(i) => Some(anthropic::Content::Image {
+                                source: anthropic::ImageSource {
+                                    source_type: "base64".to_string(),
+                                    media_type: "image/png".to_string(),
+                                    data: i.source.to_string(),
+                                },
+                                cache_control,
+                            }),
+                            _ => None,
+                        })
+                        .collect();
+                    let anthropic_role = match message.role {
+                        Role::User => anthropic::Role::User,
+                        Role::Assistant => anthropic::Role::Assistant,
+                        Role::System => unreachable!("System role should never occur here"),
+                    };
                    if let Some(last_message) = new_messages.last_mut() {
-                        if last_message.role == message.role {
-                            // TODO: is this append done properly?
-                            last_message.content.push(MessageContent::Text(format!(
-                                "\n\n{}",
-                                message.string_contents()
-                            )));
+                        if last_message.role == anthropic_role {
+                            last_message.content.extend(anthropic_message_content);
                            continue;
                        }
                    }
-
-                    new_messages.push(message);
+                    new_messages.push(anthropic::Message {
+                        role: anthropic_role,
+                        content: anthropic_message_content,
+                    });
                }
                Role::System => {
                    if !system_message.is_empty() {
@ -320,36 +353,7 @@ impl LanguageModelRequest {

        anthropic::Request {
            model,
-            messages: new_messages
-                .into_iter()
-                .filter_map(|message| {
-                    Some(anthropic::Message {
-                        role: match message.role {
-                            Role::User => anthropic::Role::User,
-                            Role::Assistant => anthropic::Role::Assistant,
-                            Role::System => return None,
-                        },
-                        content: message
-                            .content
-                            .into_iter()
-                            // TODO: filter out the empty messages in the message construction step
-                            .filter_map(|content| match content {
-                                MessageContent::Text(t) if !t.is_empty() => {
-                                    Some(anthropic::Content::Text { text: t })
-                                }
-                                MessageContent::Image(i) => Some(anthropic::Content::Image {
-                                    source: anthropic::ImageSource {
-                                        source_type: "base64".to_string(),
-                                        media_type: "image/png".to_string(),
-                                        data: i.source.to_string(),
-                                    },
-                                }),
-                                _ => None,
-                            })
-                            .collect(),
-                    })
-                })
-                .collect(),
+            messages: new_messages,
            max_tokens: 4092,
            system: Some(system_message),
            tools: Vec::new(),
--- a/crates/language_model/src/settings.rs
+++ b/crates/language_model/src/settings.rs
@ -7,14 +7,17 @@ use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use settings::{update_settings_file, Settings, SettingsSources};

-use crate::provider::{
-    self,
-    anthropic::AnthropicSettings,
-    cloud::{self, ZedDotDevSettings},
-    copilot_chat::CopilotChatSettings,
-    google::GoogleSettings,
-    ollama::OllamaSettings,
-    open_ai::OpenAiSettings,
+use crate::{
+    provider::{
+        self,
+        anthropic::AnthropicSettings,
+        cloud::{self, ZedDotDevSettings},
+        copilot_chat::CopilotChatSettings,
+        google::GoogleSettings,
+        ollama::OllamaSettings,
+        open_ai::OpenAiSettings,
+    },
+    LanguageModelCacheConfiguration,
 };

 /// Initializes the language model settings.
@ -93,10 +96,18 @@ impl AnthropicSettingsContent {
                                    name,
                                    max_tokens,
                                    tool_override,
+                                    cache_configuration,
                                } => Some(provider::anthropic::AvailableModel {
                                    name,
                                    max_tokens,
                                    tool_override,
+                                    cache_configuration: cache_configuration.as_ref().map(
+                                        |config| LanguageModelCacheConfiguration {
+                                            max_cache_anchors: config.max_cache_anchors,
+                                            should_speculate: config.should_speculate,
+                                            min_total_token: config.min_total_token,
+                                        },
+                                    ),
                                }),
                                _ => None,
                            })