Semantic index rough edges (#2871)
Small PR aimed at improving a few edge cases for semantic indexing of large projects. Release Notes (Preview-only). - Batched large files with a number of documents greater than EMBEDDINGS_BATCH_SIZE. - Ensured that the job handle counting mechanism is consistent with inner file batching. - Updated tab content names for semantic search, to match text/regex searches.
This commit is contained in:
commit
bbe6d3b261
3 changed files with 131 additions and 33 deletions
|
@ -499,10 +499,14 @@ impl Item for ProjectSearchView {
|
||||||
.with_margin_right(tab_theme.spacing),
|
.with_margin_right(tab_theme.spacing),
|
||||||
)
|
)
|
||||||
.with_child({
|
.with_child({
|
||||||
let tab_name: Option<Cow<_>> =
|
let tab_name: Option<Cow<_>> = self
|
||||||
self.model.read(cx).active_query.as_ref().map(|query| {
|
.model
|
||||||
let query_text =
|
.read(cx)
|
||||||
util::truncate_and_trailoff(query.as_str(), MAX_TAB_TITLE_LEN);
|
.search_history
|
||||||
|
.current()
|
||||||
|
.as_ref()
|
||||||
|
.map(|query| {
|
||||||
|
let query_text = util::truncate_and_trailoff(query, MAX_TAB_TITLE_LEN);
|
||||||
query_text.into()
|
query_text.into()
|
||||||
});
|
});
|
||||||
Label::new(
|
Label::new(
|
||||||
|
|
|
@ -156,25 +156,27 @@ impl VectorDatabase {
|
||||||
mtime: SystemTime,
|
mtime: SystemTime,
|
||||||
documents: Vec<Document>,
|
documents: Vec<Document>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// Write to files table, and return generated id.
|
// Return the existing ID, if both the file and mtime match
|
||||||
|
let mtime = Timestamp::from(mtime);
|
||||||
|
let mut existing_id_query = self.db.prepare("SELECT id FROM files WHERE worktree_id = ?1 AND relative_path = ?2 AND mtime_seconds = ?3 AND mtime_nanos = ?4")?;
|
||||||
|
let existing_id = existing_id_query
|
||||||
|
.query_row(
|
||||||
|
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
|
||||||
|
|row| Ok(row.get::<_, i64>(0)?),
|
||||||
|
)
|
||||||
|
.map_err(|err| anyhow!(err));
|
||||||
|
let file_id = if existing_id.is_ok() {
|
||||||
|
// If already exists, just return the existing id
|
||||||
|
existing_id.unwrap()
|
||||||
|
} else {
|
||||||
|
// Delete Existing Row
|
||||||
self.db.execute(
|
self.db.execute(
|
||||||
"
|
"DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;",
|
||||||
DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
|
|
||||||
",
|
|
||||||
params![worktree_id, path.to_str()],
|
params![worktree_id, path.to_str()],
|
||||||
)?;
|
)?;
|
||||||
let mtime = Timestamp::from(mtime);
|
self.db.execute("INSERT INTO files (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES (?1, ?2, ?3, ?4);", params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos])?;
|
||||||
self.db.execute(
|
self.db.last_insert_rowid()
|
||||||
"
|
};
|
||||||
INSERT INTO files
|
|
||||||
(worktree_id, relative_path, mtime_seconds, mtime_nanos)
|
|
||||||
VALUES
|
|
||||||
(?1, ?2, $3, $4);
|
|
||||||
",
|
|
||||||
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let file_id = self.db.last_insert_rowid();
|
|
||||||
|
|
||||||
// Currently inserting at approximately 3400 documents a second
|
// Currently inserting at approximately 3400 documents a second
|
||||||
// I imagine we can speed this up with a bulk insert of some kind.
|
// I imagine we can speed this up with a bulk insert of some kind.
|
||||||
|
|
|
@ -96,10 +96,21 @@ struct ProjectState {
|
||||||
_outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
|
_outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
struct JobHandle {
|
struct JobHandle {
|
||||||
tx: Weak<Mutex<watch::Sender<usize>>>,
|
/// The outer Arc is here to count the clones of a JobHandle instance;
|
||||||
|
/// when the last handle to a given job is dropped, we decrement a counter (just once).
|
||||||
|
tx: Arc<Weak<Mutex<watch::Sender<usize>>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl JobHandle {
|
||||||
|
fn new(tx: &Arc<Mutex<watch::Sender<usize>>>) -> Self {
|
||||||
|
*tx.lock().borrow_mut() += 1;
|
||||||
|
Self {
|
||||||
|
tx: Arc::new(Arc::downgrade(&tx)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
impl ProjectState {
|
impl ProjectState {
|
||||||
fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option<i64> {
|
fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option<i64> {
|
||||||
self.worktree_db_ids
|
self.worktree_db_ids
|
||||||
|
@ -380,6 +391,20 @@ impl SemanticIndex {
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Insert the file in spite of failure so that future attempts to index it do not take place (unless the file is changed).
|
||||||
|
for (worktree_id, _, path, mtime, job_handle) in embeddings_queue.into_iter() {
|
||||||
|
db_update_tx
|
||||||
|
.send(DbOperation::InsertFile {
|
||||||
|
worktree_id,
|
||||||
|
documents: vec![],
|
||||||
|
path,
|
||||||
|
mtime,
|
||||||
|
job_handle,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -389,6 +414,7 @@ impl SemanticIndex {
|
||||||
embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
|
embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
|
||||||
embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
|
embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
|
||||||
) {
|
) {
|
||||||
|
// Handle edge case where individual file has more documents than max batch size
|
||||||
let should_flush = match job {
|
let should_flush = match job {
|
||||||
EmbeddingJob::Enqueue {
|
EmbeddingJob::Enqueue {
|
||||||
documents,
|
documents,
|
||||||
|
@ -397,10 +423,44 @@ impl SemanticIndex {
|
||||||
mtime,
|
mtime,
|
||||||
job_handle,
|
job_handle,
|
||||||
} => {
|
} => {
|
||||||
|
// If documents is greater than embeddings batch size, recursively batch existing rows.
|
||||||
|
if &documents.len() > &EMBEDDINGS_BATCH_SIZE {
|
||||||
|
let first_job = EmbeddingJob::Enqueue {
|
||||||
|
documents: documents[..EMBEDDINGS_BATCH_SIZE].to_vec(),
|
||||||
|
worktree_id,
|
||||||
|
path: path.clone(),
|
||||||
|
mtime,
|
||||||
|
job_handle: job_handle.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
Self::enqueue_documents_to_embed(
|
||||||
|
first_job,
|
||||||
|
queue_len,
|
||||||
|
embeddings_queue,
|
||||||
|
embed_batch_tx,
|
||||||
|
);
|
||||||
|
|
||||||
|
let second_job = EmbeddingJob::Enqueue {
|
||||||
|
documents: documents[EMBEDDINGS_BATCH_SIZE..].to_vec(),
|
||||||
|
worktree_id,
|
||||||
|
path: path.clone(),
|
||||||
|
mtime,
|
||||||
|
job_handle: job_handle.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
Self::enqueue_documents_to_embed(
|
||||||
|
second_job,
|
||||||
|
queue_len,
|
||||||
|
embeddings_queue,
|
||||||
|
embed_batch_tx,
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
*queue_len += &documents.len();
|
*queue_len += &documents.len();
|
||||||
embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
|
embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
|
||||||
*queue_len >= EMBEDDINGS_BATCH_SIZE
|
*queue_len >= EMBEDDINGS_BATCH_SIZE
|
||||||
}
|
}
|
||||||
|
}
|
||||||
EmbeddingJob::Flush => true,
|
EmbeddingJob::Flush => true,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -613,10 +673,8 @@ impl SemanticIndex {
|
||||||
|
|
||||||
if !already_stored {
|
if !already_stored {
|
||||||
count += 1;
|
count += 1;
|
||||||
*job_count_tx.lock().borrow_mut() += 1;
|
|
||||||
let job_handle = JobHandle {
|
let job_handle = JobHandle::new(&job_count_tx);
|
||||||
tx: Arc::downgrade(&job_count_tx),
|
|
||||||
};
|
|
||||||
parsing_files_tx
|
parsing_files_tx
|
||||||
.try_send(PendingFile {
|
.try_send(PendingFile {
|
||||||
worktree_db_id: db_ids_by_worktree_id[&worktree.id()],
|
worktree_db_id: db_ids_by_worktree_id[&worktree.id()],
|
||||||
|
@ -690,6 +748,7 @@ impl SemanticIndex {
|
||||||
let database_url = self.database_url.clone();
|
let database_url = self.database_url.clone();
|
||||||
let fs = self.fs.clone();
|
let fs = self.fs.clone();
|
||||||
cx.spawn(|this, mut cx| async move {
|
cx.spawn(|this, mut cx| async move {
|
||||||
|
let t0 = Instant::now();
|
||||||
let database = VectorDatabase::new(fs.clone(), database_url.clone()).await?;
|
let database = VectorDatabase::new(fs.clone(), database_url.clone()).await?;
|
||||||
|
|
||||||
let phrase_embedding = embedding_provider
|
let phrase_embedding = embedding_provider
|
||||||
|
@ -699,6 +758,11 @@ impl SemanticIndex {
|
||||||
.next()
|
.next()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
log::trace!(
|
||||||
|
"Embedding search phrase took: {:?} milliseconds",
|
||||||
|
t0.elapsed().as_millis()
|
||||||
|
);
|
||||||
|
|
||||||
let file_ids =
|
let file_ids =
|
||||||
database.retrieve_included_file_ids(&worktree_db_ids, &includes, &excludes)?;
|
database.retrieve_included_file_ids(&worktree_db_ids, &includes, &excludes)?;
|
||||||
|
|
||||||
|
@ -773,6 +837,11 @@ impl SemanticIndex {
|
||||||
|
|
||||||
let buffers = futures::future::join_all(tasks).await;
|
let buffers = futures::future::join_all(tasks).await;
|
||||||
|
|
||||||
|
log::trace!(
|
||||||
|
"Semantic Searching took: {:?} milliseconds in total",
|
||||||
|
t0.elapsed().as_millis()
|
||||||
|
);
|
||||||
|
|
||||||
Ok(buffers
|
Ok(buffers
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.zip(ranges)
|
.zip(ranges)
|
||||||
|
@ -794,9 +863,32 @@ impl Entity for SemanticIndex {
|
||||||
|
|
||||||
impl Drop for JobHandle {
|
impl Drop for JobHandle {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if let Some(tx) = self.tx.upgrade() {
|
if let Some(inner) = Arc::get_mut(&mut self.tx) {
|
||||||
|
// This is the last instance of the JobHandle (regardless of its origin - whether it was cloned or not)
|
||||||
|
if let Some(tx) = inner.upgrade() {
|
||||||
let mut tx = tx.lock();
|
let mut tx = tx.lock();
|
||||||
*tx.borrow_mut() -= 1;
|
*tx.borrow_mut() -= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
#[test]
|
||||||
|
fn test_job_handle() {
|
||||||
|
let (job_count_tx, job_count_rx) = watch::channel_with(0);
|
||||||
|
let tx = Arc::new(Mutex::new(job_count_tx));
|
||||||
|
let job_handle = JobHandle::new(&tx);
|
||||||
|
|
||||||
|
assert_eq!(1, *job_count_rx.borrow());
|
||||||
|
let new_job_handle = job_handle.clone();
|
||||||
|
assert_eq!(1, *job_count_rx.borrow());
|
||||||
|
drop(job_handle);
|
||||||
|
assert_eq!(1, *job_count_rx.borrow());
|
||||||
|
drop(new_job_handle);
|
||||||
|
assert_eq!(0, *job_count_rx.borrow());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue