added opt-in default settings for vector store

This commit is contained in:
KCaverly 2023-07-11 12:03:56 -04:00
parent f5fec55930
commit 298c2213a0
7 changed files with 70 additions and 19 deletions

2
Cargo.lock generated
View file

@ -8503,8 +8503,10 @@ dependencies = [
"rand 0.8.5",
"rpc",
"rusqlite",
"schemars",
"serde",
"serde_json",
"settings",
"smol",
"tempdir",
"theme",

View file

@ -291,6 +291,12 @@
// the terminal will default to matching the buffer's font family.
// "font_family": "Zed Mono"
},
// Settings for vector_store
"vector_store": {
"enable": false,
"reindexing_delay_seconds": 600,
"embedding_batch_size": 150
},
// Different settings for specific languages.
"languages": {
"Plain Text": {

View file

@ -18,6 +18,7 @@ picker = { path = "../picker" }
theme = { path = "../theme" }
editor = { path = "../editor" }
rpc = { path = "../rpc" }
settings = { path = "../settings" }
anyhow.workspace = true
futures.workspace = true
smol.workspace = true
@ -33,6 +34,7 @@ bincode = "1.3.3"
matrixmultiply = "0.3.7"
tiktoken-rs = "0.5.0"
rand.workspace = true
schemars.workspace = true
[dev-dependencies]
gpui = { path = "../gpui", features = ["test-support"] }
@ -40,6 +42,7 @@ language = { path = "../language", features = ["test-support"] }
project = { path = "../project", features = ["test-support"] }
rpc = { path = "../rpc", features = ["test-support"] }
workspace = { path = "../workspace", features = ["test-support"] }
settings = { path = "../settings", features = ["test-support"]}
tree-sitter-rust = "*"
rand.workspace = true
unindent.workspace = true

View file

@ -204,8 +204,6 @@ impl VectorDatabase {
) -> Result<Vec<(i64, PathBuf, usize, String)>> {
let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1);
self.for_each_document(&worktree_ids, |id, embedding| {
eprintln!("document {id} {embedding:?}");
let similarity = dot(&embedding, &query_embedding);
let ix = match results
.binary_search_by(|(_, s)| similarity.partial_cmp(&s).unwrap_or(Ordering::Equal))
@ -243,10 +241,7 @@ impl VectorDatabase {
Ok((row.get(0)?, row.get::<_, Embedding>(1)?))
})?
.filter_map(|row| row.ok())
.for_each(|(id, embedding)| {
dbg!("id");
f(id, embedding.0)
});
.for_each(|(id, embedding)| f(id, embedding.0));
Ok(())
}

View file

@ -2,22 +2,25 @@ mod db;
mod embedding;
mod modal;
mod parsing;
mod vector_store_settings;
#[cfg(test)]
mod vector_store_tests;
use crate::vector_store_settings::VectorStoreSettings;
use anyhow::{anyhow, Result};
use db::VectorDatabase;
use embedding::{EmbeddingProvider, OpenAIEmbeddings};
use futures::{channel::oneshot, Future};
use gpui::{
AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext,
WeakModelHandle,
AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Subscription, Task,
ViewContext, WeakModelHandle,
};
use language::{Language, LanguageRegistry};
use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
use parsing::{CodeContextRetriever, ParsedFile};
use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId};
use settings::SettingsStore;
use smol::channel;
use std::{
collections::HashMap,
@ -34,9 +37,6 @@ use util::{
};
use workspace::{Workspace, WorkspaceCreated};
const REINDEXING_DELAY_SECONDS: u64 = 3;
const EMBEDDINGS_BATCH_SIZE: usize = 150;
pub fn init(
fs: Arc<dyn Fs>,
http_client: Arc<dyn HttpClient>,
@ -47,6 +47,12 @@ pub fn init(
return;
}
settings::register::<VectorStoreSettings>(cx);
if !settings::get::<VectorStoreSettings>(cx).enable {
return;
}
let db_file_path = EMBEDDINGS_DIR
.join(Path::new(RELEASE_CHANNEL_NAME.as_str()))
.join("embeddings_db");
@ -83,6 +89,7 @@ pub fn init(
.detach();
cx.add_action({
// "semantic search: Toggle"
move |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext<Workspace>| {
let vector_store = vector_store.clone();
workspace.toggle_modal(cx, |workspace, cx| {
@ -274,7 +281,6 @@ impl VectorStore {
worktree_id,
indexed_file,
} => {
log::info!("Inserting Data for {:?}", &indexed_file.path);
db.insert_file(worktree_id, indexed_file).log_err();
}
DbOperation::Delete { worktree_id, path } => {
@ -347,6 +353,7 @@ impl VectorStore {
});
// batch_tx/rx: Batch Files to Send for Embeddings
let batch_size = settings::get::<VectorStoreSettings>(cx).embedding_batch_size;
let (batch_files_tx, batch_files_rx) = channel::unbounded::<EmbeddingJob>();
let _batch_files_task = cx.background().spawn(async move {
let mut queue_len = 0;
@ -361,7 +368,7 @@ impl VectorStore {
} => {
queue_len += &document_spans.len();
embeddings_queue.push((worktree_id, parsed_file, document_spans));
queue_len >= EMBEDDINGS_BATCH_SIZE
queue_len >= batch_size
}
EmbeddingJob::Flush => true,
};
@ -387,8 +394,6 @@ impl VectorStore {
let cursor = QueryCursor::new();
let mut retriever = CodeContextRetriever { parser, cursor, fs };
while let Ok(pending_file) = parsing_files_rx.recv().await {
log::info!("Parsing File: {:?}", &pending_file.relative_path);
if let Some((indexed_file, document_spans)) =
retriever.parse_file(pending_file.clone()).await.log_err()
{
@ -476,11 +481,9 @@ impl VectorStore {
let parsing_files_tx = self.parsing_files_tx.clone();
cx.spawn(|this, mut cx| async move {
let t0 = Instant::now();
futures::future::join_all(worktree_scans_complete).await;
let worktree_db_ids = futures::future::join_all(worktree_db_ids).await;
log::info!("Worktree Scanning Done in {:?}", t0.elapsed().as_millis());
if let Some(db_directory) = database_url.parent() {
fs.create_dir(db_directory).await.log_err();
@ -665,6 +668,8 @@ impl VectorStore {
cx: &mut ModelContext<'_, VectorStore>,
worktree_id: &WorktreeId,
) -> Option<()> {
let reindexing_delay = settings::get::<VectorStoreSettings>(cx).reindexing_delay_seconds;
let worktree = project
.read(cx)
.worktree_for_id(worktree_id.clone(), cx)?
@ -725,7 +730,7 @@ impl VectorStore {
if !already_stored {
this.update(&mut cx, |this, _| {
let reindex_time = modified_time
+ Duration::from_secs(REINDEXING_DELAY_SECONDS);
+ Duration::from_secs(reindexing_delay as u64);
let project_state =
this.projects.get_mut(&project.downgrade())?;

View file

@ -0,0 +1,32 @@
use anyhow;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use settings::Setting;
/// Resolved vector store settings, as read by the rest of the crate via
/// `settings::get::<VectorStoreSettings>(cx)`.
///
/// Every field is non-optional here; the optional, file-facing counterpart is
/// `VectorStoreSettingsContent`.
#[derive(Debug, Deserialize)]
pub struct VectorStoreSettings {
    /// Opt-in switch for the vector store. When this is `false`, `init`
    /// returns right after registering the settings.
    pub enable: bool,
    /// Number of seconds added to a file's modified time to compute the time
    /// at which that file is reindexed.
    pub reindexing_delay_seconds: usize,
    /// Threshold compared against the number of queued document spans in the
    /// embedding batching task.
    // NOTE(review): reaching this threshold presumably flushes the batch to the
    // embedding provider — confirm against the batching loop.
    pub embedding_batch_size: usize,
}
/// Settings for the vector store, as written under the `"vector_store"` key
/// of a settings file.
///
/// Every field is optional so that a settings file may specify only the
/// values it wants to override.
#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
pub struct VectorStoreSettingsContent {
    /// Whether the vector store is enabled.
    ///
    /// Default: false
    pub enable: Option<bool>,
    /// How long, in seconds, to wait after a file was last modified before
    /// reindexing it.
    ///
    /// Default: 600
    pub reindexing_delay_seconds: Option<usize>,
    /// How many document spans to queue before a batch is considered full.
    ///
    /// Default: 150
    pub embedding_batch_size: Option<usize>,
}
/// Hooks `VectorStoreSettings` into the settings system under the
/// `"vector_store"` key.
impl Setting for VectorStoreSettings {
    /// Shape of this setting as it appears in a settings file.
    type FileContent = VectorStoreSettingsContent;

    /// Key in the settings file under which these values live.
    const KEY: Option<&'static str> = Some("vector_store");

    /// Builds the resolved settings from the default content and the
    /// user-provided contents by delegating to `load_via_json_merge`.
    ///
    /// The application context is not used.
    fn load(
        defaults: &Self::FileContent,
        overrides: &[&Self::FileContent],
        _cx: &gpui::AppContext,
    ) -> anyhow::Result<Self> {
        Self::load_via_json_merge(defaults, overrides)
    }
}

View file

@ -1,4 +1,6 @@
use crate::{db::dot, embedding::EmbeddingProvider, VectorStore};
use crate::{
db::dot, embedding::EmbeddingProvider, vector_store_settings::VectorStoreSettings, VectorStore,
};
use anyhow::Result;
use async_trait::async_trait;
use gpui::{Task, TestAppContext};
@ -6,11 +8,17 @@ use language::{Language, LanguageConfig, LanguageRegistry};
use project::{FakeFs, Project};
use rand::Rng;
use serde_json::json;
use settings::SettingsStore;
use std::sync::Arc;
use unindent::Unindent;
#[gpui::test]
async fn test_vector_store(cx: &mut TestAppContext) {
cx.update(|cx| {
cx.set_global(SettingsStore::test(cx));
settings::register::<VectorStoreSettings>(cx);
});
let fs = FakeFs::new(cx.background());
fs.insert_tree(
"/the-root",