ZIm/crates/semantic_index/src/chunking.rs
Liam Murphy 72e1947025
Update tree-sitter to 0.24 (#24492)
I didn't update it to 0.25 because its Wasm support seems to be
partially broken due to
https://github.com/tree-sitter/tree-sitter/pull/3938: it didn't
introduce a check that the Wasm module's ABI is new enough to include
supertype info while parsing it, and so in the case where it isn't it
ends up interpreting random bytes as the number of supertypes, causing
out-of-bounds memory accesses.

Closes #24489

Release Notes:

- Fixed a rare crash during syntax highlighting
2025-02-10 10:52:27 -08:00

415 lines
12 KiB
Rust

use language::{with_parser, with_query_cursor, Language};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::{
cmp::{self, Reverse},
ops::Range,
path::Path,
sync::Arc,
};
use streaming_iterator::StreamingIterator;
use tree_sitter::QueryCapture;
use util::ResultExt as _;
#[derive(Copy, Clone)]
struct ChunkSizeRange {
min: usize,
max: usize,
}
const CHUNK_SIZE_RANGE: ChunkSizeRange = ChunkSizeRange {
min: 1024,
max: 8192,
};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
pub range: Range<usize>,
pub digest: [u8; 32],
}
pub fn chunk_text(text: &str, language: Option<&Arc<Language>>, path: &Path) -> Vec<Chunk> {
chunk_text_with_size_range(text, language, path, CHUNK_SIZE_RANGE)
}
fn chunk_text_with_size_range(
text: &str,
language: Option<&Arc<Language>>,
path: &Path,
size_config: ChunkSizeRange,
) -> Vec<Chunk> {
let ranges = syntactic_ranges(text, language, path).unwrap_or_default();
chunk_text_with_syntactic_ranges(text, &ranges, size_config)
}
fn syntactic_ranges(
text: &str,
language: Option<&Arc<Language>>,
path: &Path,
) -> Option<Vec<Range<usize>>> {
let language = language?;
let grammar = language.grammar()?;
let outline = grammar.outline_config.as_ref()?;
let tree = with_parser(|parser| {
parser.set_language(&grammar.ts_language).log_err()?;
parser.parse(text, None)
});
let Some(tree) = tree else {
log::error!("failed to parse file {path:?} for chunking");
return None;
};
struct RowInfo {
offset: usize,
is_comment: bool,
}
let scope = language.default_scope();
let line_comment_prefixes = scope.line_comment_prefixes();
let row_infos = text
.split('\n')
.map({
let mut offset = 0;
move |line| {
let line = line.trim_start();
let is_comment = line_comment_prefixes
.iter()
.any(|prefix| line.starts_with(prefix.as_ref()));
let result = RowInfo { offset, is_comment };
offset += line.len() + 1;
result
}
})
.collect::<Vec<_>>();
// Retrieve a list of ranges of outline items (types, functions, etc) in the document.
// Omit single-line outline items (e.g. struct fields, constant declarations), because
// we'll already be attempting to split on lines.
let mut ranges = with_query_cursor(|cursor| {
cursor
.matches(&outline.query, tree.root_node(), text.as_bytes())
.filter_map_deref(|mat| {
mat.captures
.iter()
.find_map(|QueryCapture { node, index }| {
if *index == outline.item_capture_ix {
let mut start_offset = node.start_byte();
let mut start_row = node.start_position().row;
let end_offset = node.end_byte();
let end_row = node.end_position().row;
// Expand the range to include any preceding comments.
while start_row > 0 && row_infos[start_row - 1].is_comment {
start_offset = row_infos[start_row - 1].offset;
start_row -= 1;
}
if end_row > start_row {
return Some(start_offset..end_offset);
}
}
None
})
})
.collect::<Vec<_>>()
});
ranges.sort_unstable_by_key(|range| (range.start, Reverse(range.end)));
Some(ranges)
}
fn chunk_text_with_syntactic_ranges(
text: &str,
mut syntactic_ranges: &[Range<usize>],
size_config: ChunkSizeRange,
) -> Vec<Chunk> {
let mut chunks = Vec::new();
let mut range = 0..0;
let mut range_end_nesting_depth = 0;
// Try to split the text at line boundaries.
let mut line_ixs = text
.match_indices('\n')
.map(|(ix, _)| ix + 1)
.chain(if text.ends_with('\n') {
None
} else {
Some(text.len())
})
.peekable();
while let Some(&line_ix) = line_ixs.peek() {
// If the current position is beyond the maximum chunk size, then
// start a new chunk.
if line_ix - range.start > size_config.max {
if range.is_empty() {
range.end = cmp::min(range.start + size_config.max, line_ix);
while !text.is_char_boundary(range.end) {
range.end -= 1;
}
}
chunks.push(Chunk {
range: range.clone(),
digest: Sha256::digest(&text[range.clone()]).into(),
});
range_end_nesting_depth = 0;
range.start = range.end;
continue;
}
// Discard any syntactic ranges that end before the current position.
while let Some(first_item) = syntactic_ranges.first() {
if first_item.end < line_ix {
syntactic_ranges = &syntactic_ranges[1..];
continue;
} else {
break;
}
}
// Count how many syntactic ranges contain the current position.
let mut nesting_depth = 0;
for range in syntactic_ranges {
if range.start > line_ix {
break;
}
if range.start < line_ix && range.end > line_ix {
nesting_depth += 1;
}
}
// Extend the current range to this position, unless an earlier candidate
// end position was less nested syntactically.
if range.len() < size_config.min || nesting_depth <= range_end_nesting_depth {
range.end = line_ix;
range_end_nesting_depth = nesting_depth;
}
line_ixs.next();
}
if !range.is_empty() {
chunks.push(Chunk {
range: range.clone(),
digest: Sha256::digest(&text[range]).into(),
});
}
chunks
}
#[cfg(test)]
mod tests {
use super::*;
use language::{tree_sitter_rust, Language, LanguageConfig, LanguageMatcher};
use unindent::Unindent as _;
#[test]
fn test_chunk_text_with_syntax() {
let language = rust_language();
let text = "
struct Person {
first_name: String,
last_name: String,
age: u32,
}
impl Person {
fn new(first_name: String, last_name: String, age: u32) -> Self {
Self { first_name, last_name, age }
}
/// Returns the first name
/// something something something
fn first_name(&self) -> &str {
&self.first_name
}
fn last_name(&self) -> &str {
&self.last_name
}
fn age(&self) -> u32 {
self.age
}
}
"
.unindent();
let chunks = chunk_text_with_size_range(
&text,
Some(&language),
Path::new("lib.rs"),
ChunkSizeRange {
min: text.find('}').unwrap(),
max: text.find("Self {").unwrap(),
},
);
// The entire impl cannot fit in a chunk, so it is split.
// Within the impl, two methods can fit in a chunk.
assert_chunks(
&text,
&chunks,
&[
"struct Person {", // ...
"impl Person {",
" /// Returns the first name",
" fn last_name",
],
);
let text = "
struct T {}
struct U {}
struct V {}
struct W {
a: T,
b: U,
}
"
.unindent();
let chunks = chunk_text_with_size_range(
&text,
Some(&language),
Path::new("lib.rs"),
ChunkSizeRange {
min: text.find('{').unwrap(),
max: text.find('V').unwrap(),
},
);
// Two single-line structs can fit in a chunk.
// The last struct cannot fit in a chunk together
// with the previous single-line struct.
assert_chunks(
&text,
&chunks,
&[
"struct T", // ...
"struct V", // ...
"struct W", // ...
"}",
],
);
}
#[test]
fn test_chunk_with_long_lines() {
let language = rust_language();
let text = "
struct S { a: u32 }
struct T { a: u64 }
struct U { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 }
struct W { a: u64, b: u64, c: u64, d: u64, e: u64, f: u64, g: u64, h: u64, i: u64, j: u64 }
"
.unindent();
let chunks = chunk_text_with_size_range(
&text,
Some(&language),
Path::new("lib.rs"),
ChunkSizeRange { min: 32, max: 64 },
);
// The line is too long to fit in one chunk
assert_chunks(
&text,
&chunks,
&[
"struct S {", // ...
"struct U",
"4, h: u64, i: u64", // ...
"struct W",
"4, h: u64, i: u64", // ...
],
);
}
#[track_caller]
fn assert_chunks(text: &str, chunks: &[Chunk], expected_chunk_text_prefixes: &[&str]) {
check_chunk_invariants(text, chunks);
assert_eq!(
chunks.len(),
expected_chunk_text_prefixes.len(),
"unexpected number of chunks: {chunks:?}",
);
let mut prev_chunk_end = 0;
for (ix, chunk) in chunks.iter().enumerate() {
let expected_prefix = expected_chunk_text_prefixes[ix];
let chunk_text = &text[chunk.range.clone()];
if !chunk_text.starts_with(expected_prefix) {
let chunk_prefix_offset = text[prev_chunk_end..].find(expected_prefix);
if let Some(chunk_prefix_offset) = chunk_prefix_offset {
panic!(
"chunk {ix} starts at unexpected offset {}. expected {}",
chunk.range.start,
chunk_prefix_offset + prev_chunk_end
);
} else {
panic!("invalid expected chunk prefix {ix}: {expected_prefix:?}");
}
}
prev_chunk_end = chunk.range.end;
}
}
#[track_caller]
fn check_chunk_invariants(text: &str, chunks: &[Chunk]) {
for (ix, chunk) in chunks.iter().enumerate() {
if ix > 0 && chunk.range.start != chunks[ix - 1].range.end {
panic!("chunk ranges are not contiguous: {:?}", chunks);
}
}
if text.is_empty() {
assert!(chunks.is_empty())
} else if chunks.first().unwrap().range.start != 0
|| chunks.last().unwrap().range.end != text.len()
{
panic!("chunks don't cover entire text {:?}", chunks);
}
}
#[test]
fn test_chunk_text() {
let text = "a\n".repeat(1000);
let chunks = chunk_text(&text, None, Path::new("lib.rs"));
assert_eq!(
chunks.len(),
((2000_f64) / (CHUNK_SIZE_RANGE.max as f64)).ceil() as usize
);
}
fn rust_language() -> Arc<Language> {
Arc::new(
Language::new(
LanguageConfig {
name: "Rust".into(),
matcher: LanguageMatcher {
path_suffixes: vec!["rs".to_string()],
..Default::default()
},
..Default::default()
},
Some(tree_sitter_rust::LANGUAGE.into()),
)
.with_outline_query(
"
(function_item name: (_) @name) @item
(impl_item type: (_) @name) @item
(struct_item name: (_) @name) @item
(field_declaration name: (_) @name) @item
",
)
.unwrap(),
)
}
}