WIP: work towards wiring up an embeddings_for_digest hashmap that is stored for all indexed files
parent 50cfb067e7
commit afa59abbcd
2 changed files with 104 additions and 23 deletions
crates/semantic_index/src/db.rs
@@ -9,6 +9,7 @@ use gpui::executor;
 use project::{search::PathMatcher, Fs};
 use rpc::proto::Timestamp;
 use rusqlite::params;
+use rusqlite::types::Value;
 use std::{
     cmp::Ordering,
     collections::HashMap,
@@ -283,6 +284,41 @@ impl VectorDatabase {
         })
     }

+    pub fn embeddings_for_files(
+        &self,
+        worktree_id_file_paths: Vec<(i64, PathBuf)>,
+    ) -> impl Future<Output = Result<HashMap<DocumentDigest, Embedding>>> {
+        todo!();
+        // The remainder of the code is wired up.
+        // I'm having a bit of trouble figuring out the rusqlite syntax for a WHERE (files.worktree_id, files.relative_path) IN (VALUES (?, ?), (?, ?)) query
+        async { Ok(HashMap::new()) }
+        // let mut embeddings_by_digest = HashMap::new();
+        // self.transact(move |db| {
+
+        //     let worktree_ids: Rc<Vec<Value>> = Rc::new(
+        //         worktree_id_file_paths
+        //             .iter()
+        //             .map(|(id, _)| Value::from(*id))
+        //             .collect(),
+        //     );
+        //     let file_paths: Rc<Vec<Value>> = Rc::new(worktree_id_file_paths
+        //         .iter()
+        //         .map(|(_, path)| Value::from(path.to_string_lossy().to_string()))
+        //         .collect());
+
+        //     let mut query = db.prepare("SELECT digest, embedding FROM documents LEFT JOIN files ON files.id = documents.file_id WHERE (files.worktree_id, files.relative_path) IN (VALUES (rarray = (?1), rarray = (?2))")?;
+
+        //     for row in query.query_map(params![worktree_ids, file_paths], |row| {
+        //         Ok((row.get::<_, DocumentDigest>(0)?, row.get::<_, Embedding>(1)?))
+        //     })? {
+        //         if let Ok(row) = row {
+        //             embeddings_by_digest.insert(row.0, row.1);
+        //         }
+        //     }
+        //     Ok(embeddings_by_digest)
+        // })
+    }
+
     pub fn find_or_create_worktree(
         &self,
         worktree_root_path: PathBuf,
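The WIP comment above marks the sticking point: rusqlite's rarray() binds a one-column table, so two independent rarray parameters cannot express matching on (worktree_id, relative_path) pairs. One workable approach is to generate one "(?, ?)" placeholder group per pair (SQLite supports row values) and flatten the parameters with rusqlite::params_from_iter. The sketch below is a hedged, synchronous illustration, not the committed code: the name embeddings_for_files_sync and the Vec<u8> column types are stand-ins for the crate's DocumentDigest and Embedding types, which implement FromSql in the real code.

use anyhow::Result;
use rusqlite::{params_from_iter, types::Value, Connection};
use std::{collections::HashMap, path::PathBuf};

// Hypothetical helper, not the committed API: batch-fetch embeddings keyed by digest.
fn embeddings_for_files_sync(
    db: &Connection,
    worktree_id_file_paths: &[(i64, PathBuf)],
) -> Result<HashMap<Vec<u8>, Vec<u8>>> {
    if worktree_id_file_paths.is_empty() {
        // An empty VALUES list is a syntax error, so bail out early.
        return Ok(HashMap::new());
    }

    // Build one "(?, ?)" group per (worktree_id, relative_path) pair.
    let placeholders = vec!["(?, ?)"; worktree_id_file_paths.len()].join(", ");
    let sql = format!(
        "SELECT digest, embedding FROM documents \
         LEFT JOIN files ON files.id = documents.file_id \
         WHERE (files.worktree_id, files.relative_path) IN (VALUES {})",
        placeholders
    );

    // Flatten the pairs into a single parameter list: id, path, id, path, ...
    let params = worktree_id_file_paths.iter().flat_map(|(id, path)| {
        [
            Value::from(*id),
            Value::from(path.to_string_lossy().to_string()),
        ]
    });

    let mut statement = db.prepare(&sql)?;
    let mut embeddings_by_digest = HashMap::new();
    for row in statement.query_map(params_from_iter(params), |row| {
        Ok((row.get::<_, Vec<u8>>(0)?, row.get::<_, Vec<u8>>(1)?))
    })? {
        let (digest, embedding) = row?;
        embeddings_by_digest.insert(digest, embedding);
    }
    Ok(embeddings_by_digest)
}

SQLite caps the number of bound parameters (999 by default), so a production version would likely chunk worktree_id_file_paths before building the query.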
crates/semantic_index/src/semantic_index.rs
@@ -10,12 +10,12 @@ mod semantic_index_tests;
 use crate::semantic_index_settings::SemanticIndexSettings;
 use anyhow::{anyhow, Result};
 use db::VectorDatabase;
-use embedding::{EmbeddingProvider, OpenAIEmbeddings};
+use embedding::{Embedding, EmbeddingProvider, OpenAIEmbeddings};
 use embedding_queue::{EmbeddingQueue, FileToEmbed};
 use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle};
 use language::{Anchor, Buffer, Language, LanguageRegistry};
 use parking_lot::Mutex;
-use parsing::{CodeContextRetriever, PARSEABLE_ENTIRE_FILE_TYPES};
+use parsing::{CodeContextRetriever, DocumentDigest, PARSEABLE_ENTIRE_FILE_TYPES};
 use postage::watch;
 use project::{
     search::PathMatcher, Fs, PathChange, Project, ProjectEntryId, ProjectPath, Worktree, WorktreeId,
@@ -103,7 +103,7 @@ pub struct SemanticIndex {
     db: VectorDatabase,
     embedding_provider: Arc<dyn EmbeddingProvider>,
     language_registry: Arc<LanguageRegistry>,
-    parsing_files_tx: channel::Sender<PendingFile>,
+    parsing_files_tx: channel::Sender<(Arc<HashMap<DocumentDigest, Embedding>>, PendingFile)>,
     _embedding_task: Task<()>,
     _parsing_files_tasks: Vec<Task<()>>,
     projects: HashMap<WeakModelHandle<Project>, ProjectState>,
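This sender type change is the core of the commit: each queued PendingFile now travels with an Arc-shared snapshot of the digest-to-embedding map, so parser tasks can reuse prior embeddings without a per-file database query. Below is a minimal, self-contained sketch of that hand-off pattern; std::sync::mpsc, the Digest and Embedding aliases, and the literal digest are illustrative stand-ins (the real code uses an async channel and the crate's DocumentDigest and Embedding types, and hashes the parsed document).

use std::{
    collections::HashMap,
    path::PathBuf,
    sync::{mpsc, Arc},
    thread,
};

type Digest = [u8; 20]; // stand-in for parsing::DocumentDigest
type Embedding = Vec<f32>; // stand-in for embedding::Embedding

struct PendingFile {
    relative_path: PathBuf,
}

fn main() {
    // Each message pairs the shared cache snapshot with one file to parse.
    let (tx, rx) = mpsc::channel::<(Arc<HashMap<Digest, Embedding>>, PendingFile)>();

    let worker = thread::spawn(move || {
        // Mirrors the new `while let Ok((embeddings_for_digest, pending_file))` loop.
        while let Ok((embeddings_for_digest, pending_file)) = rx.recv() {
            let digest: Digest = [0; 20]; // the real code derives this from file contents
            match embeddings_for_digest.get(&digest) {
                Some(embedding) => println!(
                    "reuse cached embedding ({} dims) for {:?}",
                    embedding.len(),
                    pending_file.relative_path
                ),
                None => println!("re-embed {:?}", pending_file.relative_path),
            }
        }
    });

    let cache: Arc<HashMap<Digest, Embedding>> = Arc::new(HashMap::new());
    // Cloning the Arc copies a pointer, not the map, so fanning out to many
    // parser tasks stays cheap.
    tx.send((
        cache.clone(),
        PendingFile {
            relative_path: "src/lib.rs".into(),
        },
    ))
    .ok();
    drop(tx); // closing the channel lets the worker loop exit
    worker.join().ok();
}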
@@ -247,7 +247,8 @@ impl SemanticIndex {
         });

         // Parse files into embeddable documents.
-        let (parsing_files_tx, parsing_files_rx) = channel::unbounded::<PendingFile>();
+        let (parsing_files_tx, parsing_files_rx) =
+            channel::unbounded::<(Arc<HashMap<DocumentDigest, Embedding>>, PendingFile)>();
         let embedding_queue = Arc::new(Mutex::new(embedding_queue));
         let mut _parsing_files_tasks = Vec::new();
         for _ in 0..cx.background().num_cpus() {
@@ -258,14 +259,16 @@ impl SemanticIndex {
             let db = db.clone();
             _parsing_files_tasks.push(cx.background().spawn(async move {
                 let mut retriever = CodeContextRetriever::new(embedding_provider.clone());
-                while let Ok(pending_file) = parsing_files_rx.recv().await {
+                while let Ok((embeddings_for_digest, pending_file)) =
+                    parsing_files_rx.recv().await
+                {
                     Self::parse_file(
                         &fs,
                         pending_file,
                         &mut retriever,
                         &embedding_queue,
                         &parsing_files_rx,
-                        &db,
+                        &embeddings_for_digest,
                     )
                     .await;
                 }
@@ -294,8 +297,11 @@ impl SemanticIndex {
         pending_file: PendingFile,
         retriever: &mut CodeContextRetriever,
         embedding_queue: &Arc<Mutex<EmbeddingQueue>>,
-        parsing_files_rx: &channel::Receiver<PendingFile>,
-        db: &VectorDatabase,
+        parsing_files_rx: &channel::Receiver<(
+            Arc<HashMap<DocumentDigest, Embedding>>,
+            PendingFile,
+        )>,
+        embeddings_for_digest: &HashMap<DocumentDigest, Embedding>,
     ) {
         let Some(language) = pending_file.language else {
             return;
@@ -312,18 +318,9 @@ impl SemanticIndex {
             documents.len()
         );

-        if let Some(sha_to_embeddings) = db
-            .embeddings_for_file(
-                pending_file.worktree_db_id,
-                pending_file.relative_path.clone(),
-            )
-            .await
-            .log_err()
-        {
-            for document in documents.iter_mut() {
-                if let Some(embedding) = sha_to_embeddings.get(&document.digest) {
-                    document.embedding = Some(embedding.to_owned());
-                }
+        for document in documents.iter_mut() {
+            if let Some(embedding) = embeddings_for_digest.get(&document.digest) {
+                document.embedding = Some(embedding.to_owned());
             }
         }
@@ -381,6 +378,17 @@ impl SemanticIndex {
             return;
         };

+        let embeddings_for_digest = {
+            let mut worktree_id_file_paths = Vec::new();
+            for (path, _) in &project_state.changed_paths {
+                if let Some(worktree_db_id) = project_state.db_id_for_worktree_id(path.worktree_id)
+                {
+                    worktree_id_file_paths.push((worktree_db_id, path.path.to_path_buf()));
+                }
+            }
+            self.db.embeddings_for_files(worktree_id_file_paths)
+        };
+
         let worktree = worktree.read(cx);
         let change_time = Instant::now();
         for (path, entry_id, change) in changes.iter() {
@@ -405,9 +413,18 @@ impl SemanticIndex {
         }

         cx.spawn_weak(|this, mut cx| async move {
+            let embeddings_for_digest = embeddings_for_digest.await.log_err().unwrap_or_default();
+
             cx.background().timer(BACKGROUND_INDEXING_DELAY).await;
             if let Some((this, project)) = this.upgrade(&cx).zip(project.upgrade(&cx)) {
-                Self::reindex_changed_paths(this, project, Some(change_time), &mut cx).await;
+                Self::reindex_changed_paths(
+                    this,
+                    project,
+                    Some(change_time),
+                    &mut cx,
+                    Arc::new(embeddings_for_digest),
+                )
+                .await;
             }
         })
         .detach();
@@ -561,7 +578,32 @@ impl SemanticIndex {
         cx: &mut ModelContext<Self>,
     ) -> Task<Result<(usize, watch::Receiver<usize>)>> {
         cx.spawn(|this, mut cx| async move {
-            Self::reindex_changed_paths(this.clone(), project.clone(), None, &mut cx).await;
+            let embeddings_for_digest = this.read_with(&cx, |this, cx| {
+                if let Some(state) = this.projects.get(&project.downgrade()) {
+                    let mut worktree_id_file_paths = Vec::new();
+                    for (path, _) in &state.changed_paths {
+                        if let Some(worktree_db_id) = state.db_id_for_worktree_id(path.worktree_id)
+                        {
+                            worktree_id_file_paths.push((worktree_db_id, path.path.to_path_buf()));
+                        }
+                    }
+
+                    Ok(this.db.embeddings_for_files(worktree_id_file_paths))
+                } else {
+                    Err(anyhow!("Project not yet initialized"))
+                }
+            })?;
+
+            let embeddings_for_digest = Arc::new(embeddings_for_digest.await?);
+
+            Self::reindex_changed_paths(
+                this.clone(),
+                project.clone(),
+                None,
+                &mut cx,
+                embeddings_for_digest,
+            )
+            .await;
+
             this.update(&mut cx, |this, _cx| {
                 let Some(state) = this.projects.get(&project.downgrade()) else {
@@ -726,6 +768,7 @@ impl SemanticIndex {
         project: ModelHandle<Project>,
         last_changed_before: Option<Instant>,
         cx: &mut AsyncAppContext,
+        embeddings_for_digest: Arc<HashMap<DocumentDigest, Embedding>>,
     ) {
         let mut pending_files = Vec::new();
         let mut files_to_delete = Vec::new();
@@ -805,7 +848,9 @@ impl SemanticIndex {
                 }
                 pending_file.language = Some(language);
             }
-            parsing_files_tx.try_send(pending_file).ok();
+            parsing_files_tx
+                .try_send((embeddings_for_digest.clone(), pending_file))
+                .ok();
         }
     }
 }