More fixes to the semantic index's chunking (#11376)
This fixes a tricky intermittent issue I was seeing, where we failed to chunk certain files correctly because of the way we reuse Tree-sitter `Parser` instances across parses. I've also accounted for leading comments in chunk boundaries, so that items are grouped with their leading comments whenever possible when chunking. Finally, we've changed the `debug project index` action so that it opens a simple debug view in a pane, instead of printing paths to the console. This lets you click into a path and see how it was chunked. Release Notes: - N/A --------- Co-authored-by: Marshall <marshall@zed.dev>
This commit is contained in:
parent
335c307b93
commit
6964302d89
11 changed files with 532 additions and 171 deletions
|
@ -1,9 +1,10 @@
|
|||
use language::{with_parser, with_query_cursor, Grammar};
|
||||
use language::{with_parser, with_query_cursor, Language};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::{
|
||||
cmp::{self, Reverse},
|
||||
ops::Range,
|
||||
path::Path,
|
||||
sync::Arc,
|
||||
};
|
||||
use tree_sitter::QueryCapture;
|
||||
|
@ -26,52 +27,95 @@ pub struct Chunk {
|
|||
pub digest: [u8; 32],
|
||||
}
|
||||
|
||||
pub fn chunk_text(text: &str, grammar: Option<&Arc<Grammar>>) -> Vec<Chunk> {
|
||||
chunk_text_with_size_range(text, grammar, CHUNK_SIZE_RANGE)
|
||||
pub fn chunk_text(text: &str, language: Option<&Arc<Language>>, path: &Path) -> Vec<Chunk> {
|
||||
chunk_text_with_size_range(text, language, path, CHUNK_SIZE_RANGE)
|
||||
}
|
||||
|
||||
fn chunk_text_with_size_range(
|
||||
text: &str,
|
||||
grammar: Option<&Arc<Grammar>>,
|
||||
language: Option<&Arc<Language>>,
|
||||
path: &Path,
|
||||
size_config: ChunkSizeRange,
|
||||
) -> Vec<Chunk> {
|
||||
let mut syntactic_ranges = Vec::new();
|
||||
let ranges = syntactic_ranges(text, language, path).unwrap_or_default();
|
||||
chunk_text_with_syntactic_ranges(text, &ranges, size_config)
|
||||
}
|
||||
|
||||
if let Some(grammar) = grammar {
|
||||
if let Some(outline) = grammar.outline_config.as_ref() {
|
||||
let tree = with_parser(|parser| {
|
||||
parser.set_language(&grammar.ts_language).log_err()?;
|
||||
parser.parse(&text, None)
|
||||
});
|
||||
fn syntactic_ranges(
|
||||
text: &str,
|
||||
language: Option<&Arc<Language>>,
|
||||
path: &Path,
|
||||
) -> Option<Vec<Range<usize>>> {
|
||||
let language = language?;
|
||||
let grammar = language.grammar()?;
|
||||
let outline = grammar.outline_config.as_ref()?;
|
||||
let tree = with_parser(|parser| {
|
||||
parser.set_language(&grammar.ts_language).log_err()?;
|
||||
parser.parse(&text, None)
|
||||
});
|
||||
|
||||
if let Some(tree) = tree {
|
||||
with_query_cursor(|cursor| {
|
||||
// Retrieve a list of ranges of outline items (types, functions, etc) in the document.
|
||||
// Omit single-line outline items (e.g. struct fields, constant declarations), because
|
||||
// we'll already be attempting to split on lines.
|
||||
syntactic_ranges = cursor
|
||||
.matches(&outline.query, tree.root_node(), text.as_bytes())
|
||||
.filter_map(|mat| {
|
||||
mat.captures
|
||||
.iter()
|
||||
.find_map(|QueryCapture { node, index }| {
|
||||
if *index == outline.item_capture_ix {
|
||||
if node.end_position().row > node.start_position().row {
|
||||
return Some(node.byte_range());
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
syntactic_ranges
|
||||
.sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
|
||||
});
|
||||
}
|
||||
}
|
||||
let Some(tree) = tree else {
|
||||
log::error!("failed to parse file {path:?} for chunking");
|
||||
return None;
|
||||
};
|
||||
|
||||
struct RowInfo {
|
||||
offset: usize,
|
||||
is_comment: bool,
|
||||
}
|
||||
|
||||
chunk_text_with_syntactic_ranges(text, &syntactic_ranges, size_config)
|
||||
let scope = language.default_scope();
|
||||
let line_comment_prefixes = scope.line_comment_prefixes();
|
||||
let row_infos = text
|
||||
.split('\n')
|
||||
.map({
|
||||
let mut offset = 0;
|
||||
move |line| {
|
||||
let line = line.trim_start();
|
||||
let is_comment = line_comment_prefixes
|
||||
.iter()
|
||||
.any(|prefix| line.starts_with(prefix.as_ref()));
|
||||
let result = RowInfo { offset, is_comment };
|
||||
offset += line.len() + 1;
|
||||
result
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Retrieve a list of ranges of outline items (types, functions, etc) in the document.
|
||||
// Omit single-line outline items (e.g. struct fields, constant declarations), because
|
||||
// we'll already be attempting to split on lines.
|
||||
let mut ranges = with_query_cursor(|cursor| {
|
||||
cursor
|
||||
.matches(&outline.query, tree.root_node(), text.as_bytes())
|
||||
.filter_map(|mat| {
|
||||
mat.captures
|
||||
.iter()
|
||||
.find_map(|QueryCapture { node, index }| {
|
||||
if *index == outline.item_capture_ix {
|
||||
let mut start_offset = node.start_byte();
|
||||
let mut start_row = node.start_position().row;
|
||||
let end_offset = node.end_byte();
|
||||
let end_row = node.end_position().row;
|
||||
|
||||
// Expand the range to include any preceding comments.
|
||||
while start_row > 0 && row_infos[start_row - 1].is_comment {
|
||||
start_offset = row_infos[start_row - 1].offset;
|
||||
start_row -= 1;
|
||||
}
|
||||
|
||||
if end_row > start_row {
|
||||
return Some(start_offset..end_offset);
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
ranges.sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
|
||||
Some(ranges)
|
||||
}
|
||||
|
||||
fn chunk_text_with_syntactic_ranges(
|
||||
|
@ -148,7 +192,7 @@ fn chunk_text_with_syntactic_ranges(
|
|||
if !range.is_empty() {
|
||||
chunks.push(Chunk {
|
||||
range: range.clone(),
|
||||
digest: Sha256::digest(&text[range.clone()]).into(),
|
||||
digest: Sha256::digest(&text[range]).into(),
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -177,6 +221,8 @@ mod tests {
|
|||
Self { first_name, last_name, age }
|
||||
}
|
||||
|
||||
/// Returns the first name
|
||||
/// something something something
|
||||
fn first_name(&self) -> &str {
|
||||
&self.first_name
|
||||
}
|
||||
|
@ -185,8 +231,8 @@ mod tests {
|
|||
&self.last_name
|
||||
}
|
||||
|
||||
fn age(&self) -> usize {
|
||||
self.ages
|
||||
fn age(&self) -> u32 {
|
||||
self.age
|
||||
}
|
||||
}
|
||||
"
|
||||
|
@ -194,7 +240,8 @@ mod tests {
|
|||
|
||||
let chunks = chunk_text_with_size_range(
|
||||
&text,
|
||||
language.grammar(),
|
||||
Some(&language),
|
||||
Path::new("lib.rs"),
|
||||
ChunkSizeRange {
|
||||
min: text.find('}').unwrap(),
|
||||
max: text.find("Self {").unwrap(),
|
||||
|
@ -209,8 +256,8 @@ mod tests {
|
|||
&[
|
||||
"struct Person {", // ...
|
||||
"impl Person {",
|
||||
" fn first_name",
|
||||
" fn age",
|
||||
" /// Returns the first name",
|
||||
" fn last_name",
|
||||
],
|
||||
);
|
||||
|
||||
|
@ -227,7 +274,8 @@ mod tests {
|
|||
|
||||
let chunks = chunk_text_with_size_range(
|
||||
&text,
|
||||
language.grammar(),
|
||||
Some(&language),
|
||||
Path::new("lib.rs"),
|
||||
ChunkSizeRange {
|
||||
min: text.find('{').unwrap(),
|
||||
max: text.find('V').unwrap(),
|
||||
|
@ -263,7 +311,8 @@ mod tests {
|
|||
|
||||
let chunks = chunk_text_with_size_range(
|
||||
&text,
|
||||
language.grammar(),
|
||||
Some(&language),
|
||||
Path::new("lib.rs"),
|
||||
ChunkSizeRange { min: 32, max: 64 },
|
||||
);
|
||||
|
||||
|
@ -331,33 +380,35 @@ mod tests {
|
|||
#[test]
|
||||
fn test_chunk_text() {
|
||||
let text = "a\n".repeat(1000);
|
||||
let chunks = chunk_text(&text, None);
|
||||
let chunks = chunk_text(&text, None, Path::new("lib.rs"));
|
||||
assert_eq!(
|
||||
chunks.len(),
|
||||
((2000_f64) / (CHUNK_SIZE_RANGE.max as f64)).ceil() as usize
|
||||
);
|
||||
}
|
||||
|
||||
fn rust_language() -> Language {
|
||||
Language::new(
|
||||
LanguageConfig {
|
||||
name: "Rust".into(),
|
||||
matcher: LanguageMatcher {
|
||||
path_suffixes: vec!["rs".to_string()],
|
||||
fn rust_language() -> Arc<Language> {
|
||||
Arc::new(
|
||||
Language::new(
|
||||
LanguageConfig {
|
||||
name: "Rust".into(),
|
||||
matcher: LanguageMatcher {
|
||||
path_suffixes: vec!["rs".to_string()],
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
},
|
||||
Some(tree_sitter_rust::language()),
|
||||
)
|
||||
.with_outline_query(
|
||||
"
|
||||
Some(tree_sitter_rust::language()),
|
||||
)
|
||||
.with_outline_query(
|
||||
"
|
||||
(function_item name: (_) @name) @item
|
||||
(impl_item type: (_) @name) @item
|
||||
(struct_item name: (_) @name) @item
|
||||
(field_declaration name: (_) @name) @item
|
||||
",
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue