More fixes to the semantic index's chunking (#11376)
This fixes a tricky intermittent issue I was seeing, where we failed to chunk certain files correctly because of the way we reuse Tree-sitter `Parser` instances across parses. I've also accounted for leading comments in chunk boundaries, so that items are grouped with their leading comments whenever possible when chunking. Finally, we've changed the `debug project index` action so that it opens a simple debug view in a pane instead of printing paths to the console. This lets you click into a path and see how it was chunked.

Release Notes:

- N/A

---------

Co-authored-by: Marshall <marshall@zed.dev>
This commit is contained in:
parent 335c307b93
commit 6964302d89

11 changed files with 532 additions and 171 deletions
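Before the diff, the parser-reuse bug called out in the commit message is worth illustrating. The following is a minimal sketch of the general hazard, not Zed's actual fix (which lives in `chunking.rs`, outside this excerpt): a pooled `tree_sitter::Parser` keeps state between uses (included ranges, an in-progress parse), so a defensive reset avoids it leaking into the next parse. The crate versions in the comment are assumptions for the sketch.

```rust
// Assumed Cargo.toml for this sketch:
//   tree-sitter = "0.20"
//   tree-sitter-rust = "0.20"
use tree_sitter::Parser;

fn parse_fresh(parser: &mut Parser, text: &str) -> Option<tree_sitter::Tree> {
    // A pooled parser can carry state from its previous use. Clear what we
    // can, and avoid passing a stale `old_tree`, which would make Tree-sitter
    // attempt an incremental reparse against unrelated text.
    parser.reset();
    parser
        .set_included_ranges(&[])
        .expect("empty ranges are always valid");
    parser.parse(text, None)
}

fn main() {
    let mut parser = Parser::new();
    parser
        .set_language(tree_sitter_rust::language())
        .expect("grammar version matches tree-sitter");
    let tree = parse_fresh(&mut parser, "fn main() {}").unwrap();
    assert_eq!(tree.root_node().kind(), "source_file");
}
```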
@@ -1,5 +1,6 @@
 mod chunking;
 mod embedding;
+mod project_index_debug_view;
 
 use anyhow::{anyhow, Context as _, Result};
 use chunking::{chunk_text, Chunk};
@@ -31,6 +32,8 @@ use std::{
 use util::ResultExt;
 use worktree::LocalSnapshot;
 
+pub use project_index_debug_view::ProjectIndexDebugView;
+
 pub struct SemanticIndex {
     embedding_provider: Arc<dyn EmbeddingProvider>,
     db_connection: heed::Env,
@@ -397,26 +400,35 @@ impl ProjectIndex {
         Ok(result)
     }
 
-    pub fn debug(&self, cx: &mut ModelContext<Self>) -> Task<Result<()>> {
-        let indices = self
+    pub(crate) fn worktree_index(
+        &self,
+        worktree_id: WorktreeId,
+        cx: &AppContext,
+    ) -> Option<Model<WorktreeIndex>> {
+        for index in self.worktree_indices.values() {
+            if let WorktreeIndexHandle::Loaded { index, .. } = index {
+                if index.read(cx).worktree.read(cx).id() == worktree_id {
+                    return Some(index.clone());
+                }
+            }
+        }
+        None
+    }
+
+    pub(crate) fn worktree_indices(&self, cx: &AppContext) -> Vec<Model<WorktreeIndex>> {
+        let mut result = self
             .worktree_indices
             .values()
-            .filter_map(|worktree_index| {
-                if let WorktreeIndexHandle::Loaded { index, .. } = worktree_index {
+            .filter_map(|index| {
+                if let WorktreeIndexHandle::Loaded { index, .. } = index {
                     Some(index.clone())
                 } else {
                     None
                 }
             })
             .collect::<Vec<_>>();
-
-        cx.spawn(|_, mut cx| async move {
-            eprintln!("semantic index contents:");
-            for index in indices {
-                index.update(&mut cx, |index, cx| index.debug(cx))?.await?
-            }
-            Ok(())
-        })
+        result.sort_by_key(|index| index.read(cx).worktree.read(cx).id());
+        result
     }
 }
 
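For illustration, here is a simplified, self-contained analogue of the new `worktree_indices` accessor: filter a map of handles down to the loaded ones, then sort for a stable order. The real code works over gpui `Model<WorktreeIndex>` handles and sorts by worktree id; the types below are stand-ins.

```rust
use std::collections::HashMap;

// Simplified stand-in for WorktreeIndexHandle: indices may still be loading.
enum IndexHandle {
    Loading,
    Loaded { id: usize },
}

// Mirrors the shape of `worktree_indices`: keep only loaded indices, sorted,
// so a consumer like the debug view renders them in a stable order.
fn loaded_ids(handles: &HashMap<String, IndexHandle>) -> Vec<usize> {
    let mut result: Vec<usize> = handles
        .values()
        .filter_map(|handle| match handle {
            IndexHandle::Loaded { id } => Some(*id),
            IndexHandle::Loading => None,
        })
        .collect();
    result.sort_unstable();
    result
}

fn main() {
    let mut handles = HashMap::new();
    handles.insert("a".into(), IndexHandle::Loaded { id: 2 });
    handles.insert("b".into(), IndexHandle::Loading);
    handles.insert("c".into(), IndexHandle::Loaded { id: 1 });
    assert_eq!(loaded_ids(&handles), vec![1, 2]);
}
```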
@@ -726,10 +738,8 @@ impl WorktreeIndex {
                         .language_for_file_path(&entry.path)
                         .await
                         .ok();
-                    let grammar =
-                        language.as_ref().and_then(|language| language.grammar());
                     let chunked_file = ChunkedFile {
-                        chunks: chunk_text(&text, grammar),
+                        chunks: chunk_text(&text, language.as_ref(), &entry.path),
                         handle,
                         path: entry.path,
                         mtime: entry.mtime,
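The new `chunk_text(&text, language.as_ref(), &entry.path)` signature passes the whole `Language` (or `None`) plus the path, rather than a pre-extracted grammar. One thing this presumably enables is a plain fallback for files with no grammar. Here is a hedged, self-contained sketch of a line-based fallback chunker; `CHUNK_CAPACITY` and the simplified `Chunk` are stand-ins, not Zed's actual values.

```rust
use std::ops::Range;

// Hypothetical target size, for illustration only.
const CHUNK_CAPACITY: usize = 1024;

struct Chunk {
    range: Range<usize>,
}

// Fallback sketch: split on line boundaries, flushing a chunk whenever adding
// the next line would exceed the target size. A single oversized line still
// becomes its own (oversized) chunk.
fn chunk_lines(text: &str) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut start = 0;
    let mut end = 0;
    for line in text.split_inclusive('\n') {
        if end + line.len() - start > CHUNK_CAPACITY && end > start {
            chunks.push(Chunk { range: start..end });
            start = end;
        }
        end += line.len();
    }
    if end > start {
        chunks.push(Chunk { range: start..end });
    }
    chunks
}

fn main() {
    let text = "line one\nline two\n";
    let chunks = chunk_lines(text);
    assert_eq!(chunks.len(), 1);
    assert_eq!(chunks[0].range, 0..text.len());
}
```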
@@ -861,7 +871,6 @@ impl WorktreeIndex {
                 db.put(&mut txn, &key, file)?;
             }
             txn.commit()?;
-            eprintln!("committed {:?}", embedded_files.len());
 
             drop(embedded_files);
             log::debug!("committed");
@@ -871,18 +880,38 @@ impl WorktreeIndex {
         })
     }
 
-    fn debug(&mut self, cx: &mut ModelContext<Self>) -> Task<Result<()>> {
+    fn paths(&self, cx: &AppContext) -> Task<Result<Vec<Arc<Path>>>> {
         let connection = self.db_connection.clone();
         let db = self.db;
         cx.background_executor().spawn(async move {
             let tx = connection
                 .read_txn()
                 .context("failed to create read transaction")?;
-            for record in db.iter(&tx)? {
-                let (key, _) = record?;
-                eprintln!("{}", path_for_db_key(key));
-            }
-            Ok(())
+            let result = db
+                .iter(&tx)?
+                .map(|entry| Ok(entry?.1.path.clone()))
+                .collect::<Result<Vec<Arc<Path>>>>();
+            drop(tx);
+            result
         })
     }
 
+    fn chunks_for_path(
+        &self,
+        path: Arc<Path>,
+        cx: &AppContext,
+    ) -> Task<Result<Vec<EmbeddedChunk>>> {
+        let connection = self.db_connection.clone();
+        let db = self.db;
+        cx.background_executor().spawn(async move {
+            let tx = connection
+                .read_txn()
+                .context("failed to create read transaction")?;
+            Ok(db
+                .get(&tx, &db_key_for_path(&path))?
+                .ok_or_else(|| anyhow!("no such path"))?
+                .chunks
+                .clone())
+        })
+    }
+
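A side note on the `.map(|entry| Ok(entry?.1.path.clone()))` line in `paths`: collecting an iterator of `Result`s into `Result<Vec<_>>` short-circuits on the first failed record, so one corrupt row fails the whole read instead of being silently skipped. A minimal standalone demonstration of the same pattern:

```rust
fn main() {
    // Stand-ins for database records: (key, value) pairs that may fail to decode.
    let records: Vec<Result<(String, u32), String>> = vec![
        Ok(("a.rs".into(), 1)),
        Ok(("b.rs".into(), 2)),
    ];
    let paths: Result<Vec<String>, String> = records
        .into_iter()
        .map(|entry| Ok(entry?.0))
        .collect();
    assert_eq!(paths, Ok(vec!["a.rs".to_string(), "b.rs".to_string()]));

    // An Err anywhere in the stream surfaces as the collection's Err.
    let bad: Vec<Result<(String, u32), String>> =
        vec![Ok(("a.rs".into(), 1)), Err("decode failed".into())];
    let collected: Result<Vec<String>, String> =
        bad.into_iter().map(|entry| Ok(entry?.0)).collect();
    assert_eq!(collected, Err("decode failed".to_string()));
}
```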
@@ -927,7 +956,7 @@ struct EmbeddedFile {
     chunks: Vec<EmbeddedChunk>,
 }
 
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 struct EmbeddedChunk {
     chunk: Chunk,
     embedding: Embedding,
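The `Clone` derive above is what lets the new `chunks_for_path` return `.chunks.clone()`: the chunks are copied out of the record while the read transaction is still open, yielding an owned value that outlives it. A tiny stand-in (the real field types are `Chunk` and `Embedding`):

```rust
#[derive(Clone, Debug, PartialEq)]
struct EmbeddedChunk {
    chunk: String,       // stand-in for Chunk
    embedding: Vec<f32>, // stand-in for Embedding
}

fn main() {
    let stored = vec![EmbeddedChunk {
        chunk: "fn main() {}".into(),
        embedding: vec![0.1, 0.2],
    }];
    // Cloning yields owned data that can be returned after the source
    // (e.g. a borrowed LMDB record) goes away.
    let owned = stored.clone();
    drop(stored);
    assert_eq!(owned[0].chunk, "fn main() {}");
}
```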
@@ -981,10 +1010,6 @@ fn db_key_for_path(path: &Arc<Path>) -> String {
     path.to_string_lossy().replace('/', "\0")
 }
 
-fn path_for_db_key(key: &str) -> String {
-    key.replace('\0', "/")
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
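The removed `path_for_db_key` helper is no longer needed because `paths` now reads the path straight off the stored `EmbeddedFile` record (`entry?.1.path`) rather than reconstructing it from the key. The key encoding itself is unchanged; here is a quick round-trip check of `db_key_for_path` (the grouping rationale in the comment is an assumption, since the commit doesn't state it):

```rust
use std::path::Path;
use std::sync::Arc;

// Copied from the diff above: encode '/' as NUL in LMDB keys.
fn db_key_for_path(path: &Arc<Path>) -> String {
    path.to_string_lossy().replace('/', "\0")
}

fn main() {
    let path: Arc<Path> = Arc::from(Path::new("crates/semantic_index/src/chunking.rs"));
    let key = db_key_for_path(&path);
    assert_eq!(key, "crates\0semantic_index\0src\0chunking.rs");
    // NUL sorts before every printable byte, which presumably keeps a
    // directory's descendants in one contiguous, prefix-addressable key range.
    // The inverse transform (the deleted path_for_db_key) still round-trips:
    assert_eq!(key.replace('\0', "/"), path.to_string_lossy());
}
```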