From 298c2213a0afa68f1dbaf04dc8b90420303743a9 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 11 Jul 2023 12:03:56 -0400 Subject: [PATCH] added opt-in default settings for vector store --- Cargo.lock | 2 ++ assets/settings/default.json | 6 ++++ crates/vector_store/Cargo.toml | 3 ++ crates/vector_store/src/db.rs | 7 +--- crates/vector_store/src/vector_store.rs | 29 ++++++++++------- .../vector_store/src/vector_store_settings.rs | 32 +++++++++++++++++++ crates/vector_store/src/vector_store_tests.rs | 10 +++++- 7 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 crates/vector_store/src/vector_store_settings.rs diff --git a/Cargo.lock b/Cargo.lock index 22df4083fd..cd92d0003a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8503,8 +8503,10 @@ dependencies = [ "rand 0.8.5", "rpc", "rusqlite", + "schemars", "serde", "serde_json", + "settings", "smol", "tempdir", "theme", diff --git a/assets/settings/default.json b/assets/settings/default.json index 9ae5c916b5..cf8f630dfb 100644 --- a/assets/settings/default.json +++ b/assets/settings/default.json @@ -291,6 +291,12 @@ // the terminal will default to matching the buffer's font family. // "font_family": "Zed Mono" }, + // Different settings for vector_store + "vector_store": { + "enable": false, + "reindexing_delay_seconds": 600, + "embedding_batch_size": 150 + }, // Different settings for specific languages. 
"languages": { "Plain Text": { diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 35a6a689ae..40bff8b95c 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -18,6 +18,7 @@ picker = { path = "../picker" } theme = { path = "../theme" } editor = { path = "../editor" } rpc = { path = "../rpc" } +settings = { path = "../settings" } anyhow.workspace = true futures.workspace = true smol.workspace = true @@ -33,6 +34,7 @@ bincode = "1.3.3" matrixmultiply = "0.3.7" tiktoken-rs = "0.5.0" rand.workspace = true +schemars.workspace = true [dev-dependencies] gpui = { path = "../gpui", features = ["test-support"] } @@ -40,6 +42,7 @@ language = { path = "../language", features = ["test-support"] } project = { path = "../project", features = ["test-support"] } rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } +settings = { path = "../settings", features = ["test-support"]} tree-sitter-rust = "*" rand.workspace = true unindent.workspace = true diff --git a/crates/vector_store/src/db.rs b/crates/vector_store/src/db.rs index 197e7d5696..79d90e87bf 100644 --- a/crates/vector_store/src/db.rs +++ b/crates/vector_store/src/db.rs @@ -204,8 +204,6 @@ impl VectorDatabase { ) -> Result> { let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1); self.for_each_document(&worktree_ids, |id, embedding| { - eprintln!("document {id} {embedding:?}"); - let similarity = dot(&embedding, &query_embedding); let ix = match results .binary_search_by(|(_, s)| similarity.partial_cmp(&s).unwrap_or(Ordering::Equal)) @@ -243,10 +241,7 @@ impl VectorDatabase { Ok((row.get(0)?, row.get::<_, Embedding>(1)?)) })? 
.filter_map(|row| row.ok()) - .for_each(|(id, embedding)| { - dbg!("id"); - f(id, embedding.0) - }); + .for_each(|(id, embedding)| f(id, embedding.0)); Ok(()) } diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 9b21073998..4b5f6b636f 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -2,22 +2,25 @@ mod db; mod embedding; mod modal; mod parsing; +mod vector_store_settings; #[cfg(test)] mod vector_store_tests; +use crate::vector_store_settings::VectorStoreSettings; use anyhow::{anyhow, Result}; use db::VectorDatabase; use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use futures::{channel::oneshot, Future}; use gpui::{ - AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext, - WeakModelHandle, + AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Subscription, Task, + ViewContext, WeakModelHandle, }; use language::{Language, LanguageRegistry}; use modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; use parsing::{CodeContextRetriever, ParsedFile}; use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId}; +use settings::SettingsStore; use smol::channel; use std::{ collections::HashMap, @@ -34,9 +37,6 @@ use util::{ }; use workspace::{Workspace, WorkspaceCreated}; -const REINDEXING_DELAY_SECONDS: u64 = 3; -const EMBEDDINGS_BATCH_SIZE: usize = 150; - pub fn init( fs: Arc, http_client: Arc, @@ -47,6 +47,12 @@ pub fn init( return; } + settings::register::(cx); + + if !settings::get::(cx).enable { + return; + } + let db_file_path = EMBEDDINGS_DIR .join(Path::new(RELEASE_CHANNEL_NAME.as_str())) .join("embeddings_db"); @@ -83,6 +89,7 @@ pub fn init( .detach(); cx.add_action({ + // "semantic search: Toggle" move |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext| { let vector_store = vector_store.clone(); workspace.toggle_modal(cx, |workspace, cx| { @@ -274,7 +281,6 @@ impl VectorStore { worktree_id, 
indexed_file, } => { - log::info!("Inserting Data for {:?}", &indexed_file.path); db.insert_file(worktree_id, indexed_file).log_err(); } DbOperation::Delete { worktree_id, path } => { @@ -347,6 +353,7 @@ impl VectorStore { }); // batch_tx/rx: Batch Files to Send for Embeddings + let batch_size = settings::get::(cx).embedding_batch_size; let (batch_files_tx, batch_files_rx) = channel::unbounded::(); let _batch_files_task = cx.background().spawn(async move { let mut queue_len = 0; @@ -361,7 +368,7 @@ impl VectorStore { } => { queue_len += &document_spans.len(); embeddings_queue.push((worktree_id, parsed_file, document_spans)); - queue_len >= EMBEDDINGS_BATCH_SIZE + queue_len >= batch_size } EmbeddingJob::Flush => true, }; @@ -387,8 +394,6 @@ impl VectorStore { let cursor = QueryCursor::new(); let mut retriever = CodeContextRetriever { parser, cursor, fs }; while let Ok(pending_file) = parsing_files_rx.recv().await { - log::info!("Parsing File: {:?}", &pending_file.relative_path); - if let Some((indexed_file, document_spans)) = retriever.parse_file(pending_file.clone()).await.log_err() { @@ -476,11 +481,9 @@ impl VectorStore { let parsing_files_tx = self.parsing_files_tx.clone(); cx.spawn(|this, mut cx| async move { - let t0 = Instant::now(); futures::future::join_all(worktree_scans_complete).await; let worktree_db_ids = futures::future::join_all(worktree_db_ids).await; - log::info!("Worktree Scanning Done in {:?}", t0.elapsed().as_millis()); if let Some(db_directory) = database_url.parent() { fs.create_dir(db_directory).await.log_err(); @@ -665,6 +668,8 @@ impl VectorStore { cx: &mut ModelContext<'_, VectorStore>, worktree_id: &WorktreeId, ) -> Option<()> { + let reindexing_delay = settings::get::(cx).reindexing_delay_seconds; + let worktree = project .read(cx) .worktree_for_id(worktree_id.clone(), cx)? 
@@ -725,7 +730,7 @@ impl VectorStore { if !already_stored { this.update(&mut cx, |this, _| { let reindex_time = modified_time - + Duration::from_secs(REINDEXING_DELAY_SECONDS); + + Duration::from_secs(reindexing_delay as u64); let project_state = this.projects.get_mut(&project.downgrade())?; diff --git a/crates/vector_store/src/vector_store_settings.rs b/crates/vector_store/src/vector_store_settings.rs new file mode 100644 index 0000000000..0bde07dd65 --- /dev/null +++ b/crates/vector_store/src/vector_store_settings.rs @@ -0,0 +1,32 @@ +use anyhow; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use settings::Setting; + +#[derive(Deserialize, Debug)] +pub struct VectorStoreSettings { + pub enable: bool, + pub reindexing_delay_seconds: usize, + pub embedding_batch_size: usize, +} + +#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)] +pub struct VectorStoreSettingsContent { + pub enable: Option, + pub reindexing_delay_seconds: Option, + pub embedding_batch_size: Option, +} + +impl Setting for VectorStoreSettings { + const KEY: Option<&'static str> = Some("vector_store"); + + type FileContent = VectorStoreSettingsContent; + + fn load( + default_value: &Self::FileContent, + user_values: &[&Self::FileContent], + _: &gpui::AppContext, + ) -> anyhow::Result { + Self::load_via_json_merge(default_value, user_values) + } +} diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index b1756b7964..a3a40722ea 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -1,4 +1,6 @@ -use crate::{db::dot, embedding::EmbeddingProvider, VectorStore}; +use crate::{ + db::dot, embedding::EmbeddingProvider, vector_store_settings::VectorStoreSettings, VectorStore, +}; use anyhow::Result; use async_trait::async_trait; use gpui::{Task, TestAppContext}; @@ -6,11 +8,17 @@ use language::{Language, LanguageConfig, LanguageRegistry}; use 
project::{FakeFs, Project}; use rand::Rng; use serde_json::json; +use settings::SettingsStore; use std::sync::Arc; use unindent::Unindent; #[gpui::test] async fn test_vector_store(cx: &mut TestAppContext) { + cx.update(|cx| { + cx.set_global(SettingsStore::test(cx)); + settings::register::(cx); + }); + let fs = FakeFs::new(cx.background()); fs.insert_tree( "/the-root",