diff --git a/crates/semantic_index/src/db.rs b/crates/semantic_index/src/db.rs
index 81b05720d2..375934e7fe 100644
--- a/crates/semantic_index/src/db.rs
+++ b/crates/semantic_index/src/db.rs
@@ -1,4 +1,8 @@
-use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
+use crate::{
+    embedding::Embedding,
+    parsing::{Document, DocumentDigest},
+    SEMANTIC_INDEX_VERSION,
+};
 use anyhow::{anyhow, Context, Result};
 use futures::channel::oneshot;
 use gpui::executor;
@@ -165,7 +169,7 @@ impl VectorDatabase {
             end_byte INTEGER NOT NULL,
             name VARCHAR NOT NULL,
             embedding BLOB NOT NULL,
-            sha1 BLOB NOT NULL,
+            digest BLOB NOT NULL,
             FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
         )",
             [],
@@ -225,14 +229,14 @@ impl VectorDatabase {
         // I imagine we can speed this up with a bulk insert of some kind.
         for document in documents {
             db.execute(
-                "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
                 params![
                     file_id,
                     document.range.start.to_string(),
                     document.range.end.to_string(),
                     document.name,
                     document.embedding,
-                    document.sha1
+                    document.digest
                 ],
             )?;
         }
diff --git a/crates/semantic_index/src/parsing.rs b/crates/semantic_index/src/parsing.rs
index 2b67f41714..c0a94c6b73 100644
--- a/crates/semantic_index/src/parsing.rs
+++ b/crates/semantic_index/src/parsing.rs
@@ -1,11 +1,11 @@
-use crate::embedding::{EmbeddingProvider, Embedding};
+use crate::embedding::{Embedding, EmbeddingProvider};
 use anyhow::{anyhow, Result};
 use language::{Grammar, Language};
 use rusqlite::{
     types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
     ToSql,
 };
-use sha1::Digest;
+use sha1::{Digest, Sha1};
 use std::{
     cmp::{self, Reverse},
     collections::HashSet,
@@ -15,10 +15,10 @@ use std::{
 };
 use tree_sitter::{Parser, QueryCursor};
 
-#[derive(Debug, PartialEq, Clone)]
-pub struct Sha1([u8; 20]);
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct DocumentDigest([u8; 20]);
 
-impl FromSql for Sha1 {
+impl FromSql for DocumentDigest {
     fn column_result(value: ValueRef) -> FromSqlResult<Self> {
         let blob = value.as_blob()?;
         let bytes =
@@ -27,19 +27,19 @@ impl FromSql for Sha1 {
                 expected_size: 20,
                 blob_size: blob.len(),
             })?;
-        return Ok(Sha1(bytes));
+        return Ok(DocumentDigest(bytes));
     }
 }
 
-impl ToSql for Sha1 {
+impl ToSql for DocumentDigest {
     fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
         self.0.to_sql()
     }
 }
 
-impl From<&'_ str> for Sha1 {
+impl From<&'_ str> for DocumentDigest {
     fn from(value: &'_ str) -> Self {
-        let mut sha1 = sha1::Sha1::new();
+        let mut sha1 = Sha1::new();
         sha1.update(value);
         Self(sha1.finalize().into())
     }
@@ -51,7 +51,7 @@ pub struct Document {
     pub range: Range<usize>,
     pub content: String,
     pub embedding: Option<Embedding>,
-    pub sha1: Sha1,
+    pub digest: DocumentDigest,
     pub token_count: usize,
 }
 
@@ -102,17 +102,14 @@ impl CodeContextRetriever {
             .replace("<path>", relative_path.to_string_lossy().as_ref())
             .replace("<language>", language_name.as_ref())
             .replace("<item>", &content);
-
-        let sha1 = Sha1::from(document_span.as_str());
-
+        let digest = DocumentDigest::from(document_span.as_str());
         let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
-
         Ok(vec![Document {
             range: 0..content.len(),
             content: document_span,
             embedding: Default::default(),
             name: language_name.to_string(),
-            sha1,
+            digest,
             token_count,
         }])
     }
@@ -121,14 +118,14 @@ impl CodeContextRetriever {
         let document_span = MARKDOWN_CONTEXT_TEMPLATE
             .replace("<path>", relative_path.to_string_lossy().as_ref())
             .replace("<item>", &content);
-        let sha1 = Sha1::from(document_span.as_str());
+        let digest = DocumentDigest::from(document_span.as_str());
         let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
         Ok(vec![Document {
             range: 0..content.len(),
             content: document_span,
             embedding: None,
             name: "Markdown".to_string(),
-            sha1,
+            digest,
             token_count,
         }])
     }
@@ -308,13 +305,13 @@ impl CodeContextRetriever {
                 );
             }
 
-            let sha1 = Sha1::from(document_content.as_str());
+            let digest = DocumentDigest::from(document_content.as_str());
             documents.push(Document {
                 name,
                 content: document_content,
                 range: item_range.clone(),
                 embedding: None,
-                sha1,
+                digest,
                 token_count: 0,
             })
         }
diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs
index 7a0985b273..0a9a808a64 100644
--- a/crates/semantic_index/src/semantic_index.rs
+++ b/crates/semantic_index/src/semantic_index.rs
@@ -37,7 +37,7 @@ use util::{
 };
 use workspace::WorkspaceCreated;
 
-const SEMANTIC_INDEX_VERSION: usize = 7;
+const SEMANTIC_INDEX_VERSION: usize = 8;
 const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
 
 pub fn init(
diff --git a/crates/semantic_index/src/semantic_index_tests.rs b/crates/semantic_index/src/semantic_index_tests.rs
index 75232eb4d2..e65bc04412 100644
--- a/crates/semantic_index/src/semantic_index_tests.rs
+++ b/crates/semantic_index/src/semantic_index_tests.rs
@@ -1,7 +1,7 @@
 use crate::{
     embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
     embedding_queue::EmbeddingQueue,
-    parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
+    parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
     semantic_index_settings::SemanticIndexSettings,
     FileToEmbed, JobHandle, SearchResult, SemanticIndex,
 };
@@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
                 .with_simple_text()
                 .take(content_len)
                 .collect::<String>();
-            let sha1 = Sha1::from(content.as_str());
+            let digest = DocumentDigest::from(content.as_str());
             Document {
                 range: 0..10,
                 embedding: None,
                 name: format!("document {document_ix}"),
                 content,
-                sha1,
+                digest,
                 token_count: rng.gen_range(10..30),
             }
         })