Rename Sha1 to DocumentDigest
Co-Authored-By: Kyle Caverly <kyle@zed.dev>
This commit is contained in:
parent
3001a46f69
commit
2503d54d19
4 changed files with 28 additions and 27 deletions
|
@ -1,4 +1,8 @@
|
||||||
use crate::{embedding::Embedding, parsing::Document, SEMANTIC_INDEX_VERSION};
|
use crate::{
|
||||||
|
embedding::Embedding,
|
||||||
|
parsing::{Document, DocumentDigest},
|
||||||
|
SEMANTIC_INDEX_VERSION,
|
||||||
|
};
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use futures::channel::oneshot;
|
use futures::channel::oneshot;
|
||||||
use gpui::executor;
|
use gpui::executor;
|
||||||
|
@ -165,7 +169,7 @@ impl VectorDatabase {
|
||||||
end_byte INTEGER NOT NULL,
|
end_byte INTEGER NOT NULL,
|
||||||
name VARCHAR NOT NULL,
|
name VARCHAR NOT NULL,
|
||||||
embedding BLOB NOT NULL,
|
embedding BLOB NOT NULL,
|
||||||
sha1 BLOB NOT NULL,
|
digest BLOB NOT NULL,
|
||||||
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
|
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
|
||||||
)",
|
)",
|
||||||
[],
|
[],
|
||||||
|
@ -225,14 +229,14 @@ impl VectorDatabase {
|
||||||
// I imagine we can speed this up with a bulk insert of some kind.
|
// I imagine we can speed this up with a bulk insert of some kind.
|
||||||
for document in documents {
|
for document in documents {
|
||||||
db.execute(
|
db.execute(
|
||||||
"INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, sha1) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
"INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
||||||
params![
|
params![
|
||||||
file_id,
|
file_id,
|
||||||
document.range.start.to_string(),
|
document.range.start.to_string(),
|
||||||
document.range.end.to_string(),
|
document.range.end.to_string(),
|
||||||
document.name,
|
document.name,
|
||||||
document.embedding,
|
document.embedding,
|
||||||
document.sha1
|
document.digest
|
||||||
],
|
],
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
use crate::embedding::{EmbeddingProvider, Embedding};
|
use crate::embedding::{Embedding, EmbeddingProvider};
|
||||||
use anyhow::{anyhow, Result};
|
use anyhow::{anyhow, Result};
|
||||||
use language::{Grammar, Language};
|
use language::{Grammar, Language};
|
||||||
use rusqlite::{
|
use rusqlite::{
|
||||||
types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
|
types::{FromSql, FromSqlResult, ToSqlOutput, ValueRef},
|
||||||
ToSql,
|
ToSql,
|
||||||
};
|
};
|
||||||
use sha1::Digest;
|
use sha1::{Digest, Sha1};
|
||||||
use std::{
|
use std::{
|
||||||
cmp::{self, Reverse},
|
cmp::{self, Reverse},
|
||||||
collections::HashSet,
|
collections::HashSet,
|
||||||
|
@ -15,10 +15,10 @@ use std::{
|
||||||
};
|
};
|
||||||
use tree_sitter::{Parser, QueryCursor};
|
use tree_sitter::{Parser, QueryCursor};
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Clone)]
|
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||||
pub struct Sha1([u8; 20]);
|
pub struct DocumentDigest([u8; 20]);
|
||||||
|
|
||||||
impl FromSql for Sha1 {
|
impl FromSql for DocumentDigest {
|
||||||
fn column_result(value: ValueRef) -> FromSqlResult<Self> {
|
fn column_result(value: ValueRef) -> FromSqlResult<Self> {
|
||||||
let blob = value.as_blob()?;
|
let blob = value.as_blob()?;
|
||||||
let bytes =
|
let bytes =
|
||||||
|
@ -27,19 +27,19 @@ impl FromSql for Sha1 {
|
||||||
expected_size: 20,
|
expected_size: 20,
|
||||||
blob_size: blob.len(),
|
blob_size: blob.len(),
|
||||||
})?;
|
})?;
|
||||||
return Ok(Sha1(bytes));
|
return Ok(DocumentDigest(bytes));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ToSql for Sha1 {
|
impl ToSql for DocumentDigest {
|
||||||
fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
|
fn to_sql(&self) -> rusqlite::Result<ToSqlOutput> {
|
||||||
self.0.to_sql()
|
self.0.to_sql()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&'_ str> for Sha1 {
|
impl From<&'_ str> for DocumentDigest {
|
||||||
fn from(value: &'_ str) -> Self {
|
fn from(value: &'_ str) -> Self {
|
||||||
let mut sha1 = sha1::Sha1::new();
|
let mut sha1 = Sha1::new();
|
||||||
sha1.update(value);
|
sha1.update(value);
|
||||||
Self(sha1.finalize().into())
|
Self(sha1.finalize().into())
|
||||||
}
|
}
|
||||||
|
@ -51,7 +51,7 @@ pub struct Document {
|
||||||
pub range: Range<usize>,
|
pub range: Range<usize>,
|
||||||
pub content: String,
|
pub content: String,
|
||||||
pub embedding: Option<Embedding>,
|
pub embedding: Option<Embedding>,
|
||||||
pub sha1: Sha1,
|
pub digest: DocumentDigest,
|
||||||
pub token_count: usize,
|
pub token_count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,17 +102,14 @@ impl CodeContextRetriever {
|
||||||
.replace("<path>", relative_path.to_string_lossy().as_ref())
|
.replace("<path>", relative_path.to_string_lossy().as_ref())
|
||||||
.replace("<language>", language_name.as_ref())
|
.replace("<language>", language_name.as_ref())
|
||||||
.replace("<item>", &content);
|
.replace("<item>", &content);
|
||||||
|
let digest = DocumentDigest::from(document_span.as_str());
|
||||||
let sha1 = Sha1::from(document_span.as_str());
|
|
||||||
|
|
||||||
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
|
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
|
||||||
|
|
||||||
Ok(vec![Document {
|
Ok(vec![Document {
|
||||||
range: 0..content.len(),
|
range: 0..content.len(),
|
||||||
content: document_span,
|
content: document_span,
|
||||||
embedding: Default::default(),
|
embedding: Default::default(),
|
||||||
name: language_name.to_string(),
|
name: language_name.to_string(),
|
||||||
sha1,
|
digest,
|
||||||
token_count,
|
token_count,
|
||||||
}])
|
}])
|
||||||
}
|
}
|
||||||
|
@ -121,14 +118,14 @@ impl CodeContextRetriever {
|
||||||
let document_span = MARKDOWN_CONTEXT_TEMPLATE
|
let document_span = MARKDOWN_CONTEXT_TEMPLATE
|
||||||
.replace("<path>", relative_path.to_string_lossy().as_ref())
|
.replace("<path>", relative_path.to_string_lossy().as_ref())
|
||||||
.replace("<item>", &content);
|
.replace("<item>", &content);
|
||||||
let sha1 = Sha1::from(document_span.as_str());
|
let digest = DocumentDigest::from(document_span.as_str());
|
||||||
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
|
let (document_span, token_count) = self.embedding_provider.truncate(&document_span);
|
||||||
Ok(vec![Document {
|
Ok(vec![Document {
|
||||||
range: 0..content.len(),
|
range: 0..content.len(),
|
||||||
content: document_span,
|
content: document_span,
|
||||||
embedding: None,
|
embedding: None,
|
||||||
name: "Markdown".to_string(),
|
name: "Markdown".to_string(),
|
||||||
sha1,
|
digest,
|
||||||
token_count,
|
token_count,
|
||||||
}])
|
}])
|
||||||
}
|
}
|
||||||
|
@ -308,13 +305,13 @@ impl CodeContextRetriever {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let sha1 = Sha1::from(document_content.as_str());
|
let sha1 = DocumentDigest::from(document_content.as_str());
|
||||||
documents.push(Document {
|
documents.push(Document {
|
||||||
name,
|
name,
|
||||||
content: document_content,
|
content: document_content,
|
||||||
range: item_range.clone(),
|
range: item_range.clone(),
|
||||||
embedding: None,
|
embedding: None,
|
||||||
sha1,
|
digest: sha1,
|
||||||
token_count: 0,
|
token_count: 0,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,7 +37,7 @@ use util::{
|
||||||
};
|
};
|
||||||
use workspace::WorkspaceCreated;
|
use workspace::WorkspaceCreated;
|
||||||
|
|
||||||
const SEMANTIC_INDEX_VERSION: usize = 7;
|
const SEMANTIC_INDEX_VERSION: usize = 8;
|
||||||
const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
|
const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
|
||||||
|
|
||||||
pub fn init(
|
pub fn init(
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
use crate::{
|
use crate::{
|
||||||
embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
|
embedding::{DummyEmbeddings, Embedding, EmbeddingProvider},
|
||||||
embedding_queue::EmbeddingQueue,
|
embedding_queue::EmbeddingQueue,
|
||||||
parsing::{subtract_ranges, CodeContextRetriever, Document, Sha1},
|
parsing::{subtract_ranges, CodeContextRetriever, Document, DocumentDigest},
|
||||||
semantic_index_settings::SemanticIndexSettings,
|
semantic_index_settings::SemanticIndexSettings,
|
||||||
FileToEmbed, JobHandle, SearchResult, SemanticIndex,
|
FileToEmbed, JobHandle, SearchResult, SemanticIndex,
|
||||||
};
|
};
|
||||||
|
@ -220,13 +220,13 @@ async fn test_embedding_batching(cx: &mut TestAppContext, mut rng: StdRng) {
|
||||||
.with_simple_text()
|
.with_simple_text()
|
||||||
.take(content_len)
|
.take(content_len)
|
||||||
.collect::<String>();
|
.collect::<String>();
|
||||||
let sha1 = Sha1::from(content.as_str());
|
let digest = DocumentDigest::from(content.as_str());
|
||||||
Document {
|
Document {
|
||||||
range: 0..10,
|
range: 0..10,
|
||||||
embedding: None,
|
embedding: None,
|
||||||
name: format!("document {document_ix}"),
|
name: format!("document {document_ix}"),
|
||||||
content,
|
content,
|
||||||
sha1,
|
digest,
|
||||||
token_count: rng.gen_range(10..30),
|
token_count: rng.gen_range(10..30),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue