Semantic index rough edges (#2871)

Small PR aimed at improving a few edge cases for semantic indexing large projects. Release Notes (Preview-only). - Batched large files with a number of documents greater than EMBEDDINGS_BATCH_SIZE. - Ensured that the job handle counting mechanism is consistent with inner file batching. - Updated tab content names for semantic search, to match text/regex searches.
2023-08-21 13:43:08 +02:00 · 2023-08-21 13:43:08 +02:00 · bbe6d3b261
commit bbe6d3b261
parent 267c0b9a3f c68b518aec
3 changed files with 131 additions and 33 deletions
--- a/crates/search/src/project_search.rs
+++ b/crates/search/src/project_search.rs
@ -499,10 +499,14 @@ impl Item for ProjectSearchView {
                    .with_margin_right(tab_theme.spacing),
            )
            .with_child({
-                let tab_name: Option<Cow<_>> =
-                    self.model.read(cx).active_query.as_ref().map(|query| {
-                        let query_text =
-                            util::truncate_and_trailoff(query.as_str(), MAX_TAB_TITLE_LEN);
+                let tab_name: Option<Cow<_>> = self
+                    .model
+                    .read(cx)
+                    .search_history
+                    .current()
+                    .as_ref()
+                    .map(|query| {
+                        let query_text = util::truncate_and_trailoff(query, MAX_TAB_TITLE_LEN);
                        query_text.into()
                    });
                Label::new(
--- a/crates/semantic_index/src/db.rs
+++ b/crates/semantic_index/src/db.rs
@ -156,25 +156,27 @@ impl VectorDatabase {
        mtime: SystemTime,
        documents: Vec<Document>,
    ) -> Result<()> {
-        // Write to files table, and return generated id.
+        // Return the existing ID, if both the file and mtime match
+        let mtime = Timestamp::from(mtime);
+        let mut existing_id_query = self.db.prepare("SELECT id FROM files WHERE worktree_id = ?1 AND relative_path = ?2 AND mtime_seconds = ?3 AND mtime_nanos = ?4")?;
+        let existing_id = existing_id_query
+            .query_row(
+                params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
+                |row| Ok(row.get::<_, i64>(0)?),
+            )
+            .map_err(|err| anyhow!(err));
+        let file_id = if existing_id.is_ok() {
+            // If already exists, just return the existing id
+            existing_id.unwrap()
+        } else {
+            // Delete Existing Row
            self.db.execute(
-            "
-            DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
-            ",
+                "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;",
                params![worktree_id, path.to_str()],
            )?;
-        let mtime = Timestamp::from(mtime);
-        self.db.execute(
-            "
-            INSERT INTO files
-            (worktree_id, relative_path, mtime_seconds, mtime_nanos)
-            VALUES
-            (?1, ?2, $3, $4);
-            ",
-            params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
-        )?;
-
-        let file_id = self.db.last_insert_rowid();
+            self.db.execute("INSERT INTO files (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES (?1, ?2, ?3, ?4);", params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos])?;
+            self.db.last_insert_rowid()
+        };

        // Currently inserting at approximately 3400 documents a second
        // I imagine we can speed this up with a bulk insert of some kind.
--- a/crates/semantic_index/src/semantic_index.rs
+++ b/crates/semantic_index/src/semantic_index.rs
@ -96,10 +96,21 @@ struct ProjectState {
    _outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
 }

+#[derive(Clone)]
 struct JobHandle {
-    tx: Weak<Mutex<watch::Sender<usize>>>,
+    /// The outer Arc is here to count the clones of a JobHandle instance;
+    /// when the last handle to a given job is dropped, we decrement a counter (just once).
+    tx: Arc<Weak<Mutex<watch::Sender<usize>>>>,
 }

+impl JobHandle {
+    fn new(tx: &Arc<Mutex<watch::Sender<usize>>>) -> Self {
+        *tx.lock().borrow_mut() += 1;
+        Self {
+            tx: Arc::new(Arc::downgrade(&tx)),
+        }
+    }
+}
 impl ProjectState {
    fn db_id_for_worktree_id(&self, id: WorktreeId) -> Option<i64> {
        self.worktree_db_ids
@ -380,6 +391,20 @@ impl SemanticIndex {
                    .await
                    .unwrap();
            }
+        } else {
+            // Insert the file in spite of failure so that future attempts to index it do not take place (unless the file is changed).
+            for (worktree_id, _, path, mtime, job_handle) in embeddings_queue.into_iter() {
+                db_update_tx
+                    .send(DbOperation::InsertFile {
+                        worktree_id,
+                        documents: vec![],
+                        path,
+                        mtime,
+                        job_handle,
+                    })
+                    .await
+                    .unwrap();
+            }
        }
    }

@ -389,6 +414,7 @@ impl SemanticIndex {
        embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
        embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
    ) {
+        // Handle edge case where individual file has more documents than max batch size
        let should_flush = match job {
            EmbeddingJob::Enqueue {
                documents,
@ -397,10 +423,44 @@ impl SemanticIndex {
                mtime,
                job_handle,
            } => {
+                // If documents is greater than embeddings batch size, recursively batch existing rows.
+                if &documents.len() > &EMBEDDINGS_BATCH_SIZE {
+                    let first_job = EmbeddingJob::Enqueue {
+                        documents: documents[..EMBEDDINGS_BATCH_SIZE].to_vec(),
+                        worktree_id,
+                        path: path.clone(),
+                        mtime,
+                        job_handle: job_handle.clone(),
+                    };
+
+                    Self::enqueue_documents_to_embed(
+                        first_job,
+                        queue_len,
+                        embeddings_queue,
+                        embed_batch_tx,
+                    );
+
+                    let second_job = EmbeddingJob::Enqueue {
+                        documents: documents[EMBEDDINGS_BATCH_SIZE..].to_vec(),
+                        worktree_id,
+                        path: path.clone(),
+                        mtime,
+                        job_handle: job_handle.clone(),
+                    };
+
+                    Self::enqueue_documents_to_embed(
+                        second_job,
+                        queue_len,
+                        embeddings_queue,
+                        embed_batch_tx,
+                    );
+                    return;
+                } else {
                    *queue_len += &documents.len();
                    embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
                    *queue_len >= EMBEDDINGS_BATCH_SIZE
                }
+            }
            EmbeddingJob::Flush => true,
        };

@ -613,10 +673,8 @@ impl SemanticIndex {

                                if !already_stored {
                                    count += 1;
-                                    *job_count_tx.lock().borrow_mut() += 1;
-                                    let job_handle = JobHandle {
-                                        tx: Arc::downgrade(&job_count_tx),
-                                    };
+
+                                    let job_handle = JobHandle::new(&job_count_tx);
                                    parsing_files_tx
                                        .try_send(PendingFile {
                                            worktree_db_id: db_ids_by_worktree_id[&worktree.id()],
@ -690,6 +748,7 @@ impl SemanticIndex {
        let database_url = self.database_url.clone();
        let fs = self.fs.clone();
        cx.spawn(|this, mut cx| async move {
+            let t0 = Instant::now();
            let database = VectorDatabase::new(fs.clone(), database_url.clone()).await?;

            let phrase_embedding = embedding_provider
@ -699,6 +758,11 @@ impl SemanticIndex {
                .next()
                .unwrap();

+            log::trace!(
+                "Embedding search phrase took: {:?} milliseconds",
+                t0.elapsed().as_millis()
+            );
+
            let file_ids =
                database.retrieve_included_file_ids(&worktree_db_ids, &includes, &excludes)?;

@ -773,6 +837,11 @@ impl SemanticIndex {

            let buffers = futures::future::join_all(tasks).await;

+            log::trace!(
+                "Semantic Searching took: {:?} milliseconds in total",
+                t0.elapsed().as_millis()
+            );
+
            Ok(buffers
                .into_iter()
                .zip(ranges)
@ -794,9 +863,32 @@ impl Entity for SemanticIndex {

 impl Drop for JobHandle {
    fn drop(&mut self) {
-        if let Some(tx) = self.tx.upgrade() {
+        if let Some(inner) = Arc::get_mut(&mut self.tx) {
+            // This is the last instance of the JobHandle (regardless of it's origin - whether it was cloned or not)
+            if let Some(tx) = inner.upgrade() {
                let mut tx = tx.lock();
                *tx.borrow_mut() -= 1;
            }
        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    #[test]
+    fn test_job_handle() {
+        let (job_count_tx, job_count_rx) = watch::channel_with(0);
+        let tx = Arc::new(Mutex::new(job_count_tx));
+        let job_handle = JobHandle::new(&tx);
+
+        assert_eq!(1, *job_count_rx.borrow());
+        let new_job_handle = job_handle.clone();
+        assert_eq!(1, *job_count_rx.borrow());
+        drop(job_handle);
+        assert_eq!(1, *job_count_rx.borrow());
+        drop(new_job_handle);
+        assert_eq!(0, *job_count_rx.borrow());
+    }
 }