added opt-in default settings for vector store

parent f5fec55930 · commit 298c2213a0
7 changed files with 70 additions and 19 deletions
Cargo.lock (generated, 2 changes)

@@ -8503,8 +8503,10 @@ dependencies = [
  "rand 0.8.5",
  "rpc",
  "rusqlite",
+ "schemars",
  "serde",
  "serde_json",
+ "settings",
  "smol",
  "tempdir",
  "theme",
assets/settings/default.json

@@ -291,6 +291,12 @@
     // the terminal will default to matching the buffer's font family.
     // "font_family": "Zed Mono"
   },
+  // Default settings for the vector store
+  "vector_store": {
+    "enable": false,
+    "reindexing_delay_seconds": 600,
+    "embedding_batch_size": 150
+  },
   // Different settings for specific languages.
   "languages": {
     "Plain Text": {
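Since "enable" defaults to false, semantic search stays opt-in; a user turns it on from their own settings file. A minimal override might look like the following (a sketch assuming the usual user settings.json; only the keys being changed need to appear, because user values are JSON-merged over these defaults):

    {
        "vector_store": {
            "enable": true
        }
    }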
crates/vector_store/Cargo.toml

@@ -18,6 +18,7 @@ picker = { path = "../picker" }
 theme = { path = "../theme" }
 editor = { path = "../editor" }
 rpc = { path = "../rpc" }
+settings = { path = "../settings" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
@@ -33,6 +34,7 @@ bincode = "1.3.3"
 matrixmultiply = "0.3.7"
 tiktoken-rs = "0.5.0"
 rand.workspace = true
+schemars.workspace = true

 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
@@ -40,6 +42,7 @@ language = { path = "../language", features = ["test-support"] }
 project = { path = "../project", features = ["test-support"] }
 rpc = { path = "../rpc", features = ["test-support"] }
 workspace = { path = "../workspace", features = ["test-support"] }
+settings = { path = "../settings", features = ["test-support"] }
 tree-sitter-rust = "*"
 rand.workspace = true
 unindent.workspace = true
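The new settings dependency provides the Setting trait implemented by the file added below, and schemars supplies the JsonSchema derive used there; the matching Cargo.lock entries above presumably reflect exactly these two additions.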
crates/vector_store/src/db.rs

@@ -204,8 +204,6 @@ impl VectorDatabase {
     ) -> Result<Vec<(i64, PathBuf, usize, String)>> {
         let mut results = Vec::<(i64, f32)>::with_capacity(limit + 1);
         self.for_each_document(&worktree_ids, |id, embedding| {
-            eprintln!("document {id} {embedding:?}");
-
             let similarity = dot(&embedding, &query_embedding);
             let ix = match results
                 .binary_search_by(|(_, s)| similarity.partial_cmp(&s).unwrap_or(Ordering::Equal))
@@ -243,10 +241,7 @@ impl VectorDatabase {
             Ok((row.get(0)?, row.get::<_, Embedding>(1)?))
         })?
        .filter_map(|row| row.ok())
-       .for_each(|(id, embedding)| {
-           dbg!("id");
-           f(id, embedding.0)
-       });
+       .for_each(|(id, embedding)| f(id, embedding.0));
        Ok(())
    }

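Around the removed eprintln!, this function keeps the `limit` highest-similarity documents in a `results` vector sorted in descending order: each candidate's position is found with binary_search_by and the vector is truncated after insertion. A minimal self-contained sketch of that pattern (illustrative names, not the crate's API):

    use std::cmp::Ordering;

    /// Insert (id, similarity) into `results`, which is kept sorted from
    /// highest to lowest similarity, retaining at most `limit` entries.
    fn insert_top_k(results: &mut Vec<(i64, f32)>, id: i64, similarity: f32, limit: usize) {
        // Comparing `similarity` against each element (rather than the reverse)
        // yields the descending order used above.
        let ix = match results
            .binary_search_by(|(_, s)| similarity.partial_cmp(s).unwrap_or(Ordering::Equal))
        {
            Ok(ix) | Err(ix) => ix,
        };
        results.insert(ix, (id, similarity));
        results.truncate(limit);
    }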
crates/vector_store/src/vector_store.rs

@@ -2,22 +2,25 @@ mod db;
 mod embedding;
 mod modal;
 mod parsing;
+mod vector_store_settings;

 #[cfg(test)]
 mod vector_store_tests;

+use crate::vector_store_settings::VectorStoreSettings;
 use anyhow::{anyhow, Result};
 use db::VectorDatabase;
 use embedding::{EmbeddingProvider, OpenAIEmbeddings};
 use futures::{channel::oneshot, Future};
 use gpui::{
-    AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext,
-    WeakModelHandle,
+    AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Subscription, Task,
+    ViewContext, WeakModelHandle,
 };
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
 use parsing::{CodeContextRetriever, ParsedFile};
 use project::{Fs, PathChange, Project, ProjectEntryId, WorktreeId};
+use settings::SettingsStore;
 use smol::channel;
 use std::{
     collections::HashMap,
@@ -34,9 +37,6 @@ use util::{
 };
 use workspace::{Workspace, WorkspaceCreated};

-const REINDEXING_DELAY_SECONDS: u64 = 3;
-const EMBEDDINGS_BATCH_SIZE: usize = 150;
-
 pub fn init(
     fs: Arc<dyn Fs>,
     http_client: Arc<dyn HttpClient>,
@@ -47,6 +47,12 @@ pub fn init(
         return;
     }

+    settings::register::<VectorStoreSettings>(cx);
+
+    if !settings::get::<VectorStoreSettings>(cx).enable {
+        return;
+    }
+
     let db_file_path = EMBEDDINGS_DIR
         .join(Path::new(RELEASE_CHANNEL_NAME.as_str()))
         .join("embeddings_db");
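Note that the enable flag is read once in init, so flipping it in settings only takes effect after a restart. The newly imported SettingsStore and gpui Subscription hint at reacting to settings changes; a hedged sketch of that pattern, assuming gpui's observe_global (not something this commit wires up):

    // Illustrative only: re-read the settings whenever the global store changes.
    // The returned Subscription must be kept alive for the observer to fire.
    let _subscription = cx.observe_global::<SettingsStore, _>(|cx| {
        let enabled = settings::get::<VectorStoreSettings>(cx).enable;
        log::info!("vector_store enabled: {}", enabled);
    });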
@@ -83,6 +89,7 @@ pub fn init(
     .detach();

     cx.add_action({
+        // "semantic search: Toggle"
         move |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext<Workspace>| {
             let vector_store = vector_store.clone();
             workspace.toggle_modal(cx, |workspace, cx| {
@@ -274,7 +281,6 @@ impl VectorStore {
                 worktree_id,
                 indexed_file,
             } => {
-                log::info!("Inserting Data for {:?}", &indexed_file.path);
                 db.insert_file(worktree_id, indexed_file).log_err();
             }
             DbOperation::Delete { worktree_id, path } => {
@@ -347,6 +353,7 @@ impl VectorStore {
         });

         // batch_tx/rx: Batch Files to Send for Embeddings
+        let batch_size = settings::get::<VectorStoreSettings>(cx).embedding_batch_size;
         let (batch_files_tx, batch_files_rx) = channel::unbounded::<EmbeddingJob>();
         let _batch_files_task = cx.background().spawn(async move {
             let mut queue_len = 0;
@@ -361,7 +368,7 @@ impl VectorStore {
                     } => {
                         queue_len += &document_spans.len();
                         embeddings_queue.push((worktree_id, parsed_file, document_spans));
-                        queue_len >= EMBEDDINGS_BATCH_SIZE
+                        queue_len >= batch_size
                     }
                     EmbeddingJob::Flush => true,
                 };
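The comparison above implements size-based batching: spans accumulate in embeddings_queue until their count reaches the configured embedding_batch_size, or an explicit Flush job arrives, and only then is the batch handed off. The same accumulate-then-flush loop in isolation (hypothetical names, not the crate's types):

    enum Job {
        Enqueue { spans: Vec<String> },
        Flush,
    }

    fn run_batcher(jobs: Vec<Job>, batch_size: usize) {
        let mut queue: Vec<String> = Vec::new();
        for job in jobs {
            let should_flush = match job {
                Job::Enqueue { spans } => {
                    queue.extend(spans);
                    queue.len() >= batch_size
                }
                Job::Flush => true,
            };
            if should_flush && !queue.is_empty() {
                // Hand the accumulated batch off, e.g. to an embeddings API.
                println!("submitting batch of {} spans", queue.len());
                queue.clear();
            }
        }
    }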
@@ -387,8 +394,6 @@ impl VectorStore {
         let cursor = QueryCursor::new();
         let mut retriever = CodeContextRetriever { parser, cursor, fs };
         while let Ok(pending_file) = parsing_files_rx.recv().await {
-            log::info!("Parsing File: {:?}", &pending_file.relative_path);
-
             if let Some((indexed_file, document_spans)) =
                 retriever.parse_file(pending_file.clone()).await.log_err()
             {
@@ -476,11 +481,9 @@ impl VectorStore {
         let parsing_files_tx = self.parsing_files_tx.clone();

         cx.spawn(|this, mut cx| async move {
-            let t0 = Instant::now();
             futures::future::join_all(worktree_scans_complete).await;

             let worktree_db_ids = futures::future::join_all(worktree_db_ids).await;
-            log::info!("Worktree Scanning Done in {:?}", t0.elapsed().as_millis());

             if let Some(db_directory) = database_url.parent() {
                 fs.create_dir(db_directory).await.log_err();
@@ -665,6 +668,8 @@ impl VectorStore {
         cx: &mut ModelContext<'_, VectorStore>,
         worktree_id: &WorktreeId,
     ) -> Option<()> {
+        let reindexing_delay = settings::get::<VectorStoreSettings>(cx).reindexing_delay_seconds;
+
         let worktree = project
             .read(cx)
             .worktree_for_id(worktree_id.clone(), cx)?
@@ -725,7 +730,7 @@ impl VectorStore {
                 if !already_stored {
                     this.update(&mut cx, |this, _| {
                         let reindex_time = modified_time
-                            + Duration::from_secs(REINDEXING_DELAY_SECONDS);
+                            + Duration::from_secs(reindexing_delay as u64);

                         let project_state =
                             this.projects.get_mut(&project.downgrade())?;
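Two details worth noting in this hunk: the default debounce rises from the hard-coded 3 seconds to the setting's 600, so a changed file is rescheduled for modified_time plus the delay and rapid successive edits get re-indexed once rather than per edit; and the `as u64` cast is needed because reindexing_delay_seconds is declared as usize while Duration::from_secs takes a u64.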
crates/vector_store/src/vector_store_settings.rs (new file, 32 lines)

@@ -0,0 +1,32 @@
+use anyhow;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use settings::Setting;
+
+#[derive(Deserialize, Debug)]
+pub struct VectorStoreSettings {
+    pub enable: bool,
+    pub reindexing_delay_seconds: usize,
+    pub embedding_batch_size: usize,
+}
+
+#[derive(Clone, Default, Serialize, Deserialize, JsonSchema, Debug)]
+pub struct VectorStoreSettingsContent {
+    pub enable: Option<bool>,
+    pub reindexing_delay_seconds: Option<usize>,
+    pub embedding_batch_size: Option<usize>,
+}
+
+impl Setting for VectorStoreSettings {
+    const KEY: Option<&'static str> = Some("vector_store");
+
+    type FileContent = VectorStoreSettingsContent;
+
+    fn load(
+        default_value: &Self::FileContent,
+        user_values: &[&Self::FileContent],
+        _: &gpui::AppContext,
+    ) -> anyhow::Result<Self> {
+        Self::load_via_json_merge(default_value, user_values)
+    }
+}
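The two-struct split is the settings crate's standard shape: VectorStoreSettingsContent (every field an Option) is what appears in JSON files, and load_via_json_merge layers user values over the default value to produce the fully-populated VectorStoreSettings. Conceptually the merge behaves like this sketch (plain serde_json, not the crate's actual implementation):

    use serde_json::{json, Value};

    // Overlay non-null user keys onto the defaults, key by key.
    fn merge(base: &mut Value, user: Value) {
        if let (Value::Object(base), Value::Object(user)) = (base, user) {
            for (key, value) in user {
                if !value.is_null() {
                    base.insert(key, value);
                }
            }
        }
    }

    fn main() {
        let mut merged = json!({
            "enable": false,
            "reindexing_delay_seconds": 600,
            "embedding_batch_size": 150
        });
        merge(&mut merged, json!({ "enable": true }));
        assert_eq!(merged["enable"], json!(true));
        // Unspecified keys keep their defaults.
        assert_eq!(merged["embedding_batch_size"], json!(150));
    }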
crates/vector_store/src/vector_store_tests.rs

@@ -1,4 +1,6 @@
-use crate::{db::dot, embedding::EmbeddingProvider, VectorStore};
+use crate::{
+    db::dot, embedding::EmbeddingProvider, vector_store_settings::VectorStoreSettings, VectorStore,
+};
 use anyhow::Result;
 use async_trait::async_trait;
 use gpui::{Task, TestAppContext};
@@ -6,11 +8,17 @@ use language::{Language, LanguageConfig, LanguageRegistry};
 use project::{FakeFs, Project};
 use rand::Rng;
 use serde_json::json;
+use settings::SettingsStore;
 use std::sync::Arc;
 use unindent::Unindent;

 #[gpui::test]
 async fn test_vector_store(cx: &mut TestAppContext) {
+    cx.update(|cx| {
+        cx.set_global(SettingsStore::test(cx));
+        settings::register::<VectorStoreSettings>(cx);
+    });
+
     let fs = FakeFs::new(cx.background());
     fs.insert_tree(
         "/the-root",
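Because VectorStore now consults VectorStoreSettings during startup, the test mirrors the production init sequence: install a test SettingsStore global and register the setting type before anything calls settings::get, which otherwise has no registered defaults to read.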