updated vector store indexing to only use languages with an embedding.scm treesitter query

Co-authored-by: maxbrunsfeld <max@zed.dev>
This commit is contained in:
KCaverly 2023-06-30 16:14:11 -04:00
parent 0db0876289
commit 36907bb4dc
5 changed files with 98 additions and 7 deletions

View file

@ -350,6 +350,7 @@ pub struct LanguageQueries {
pub brackets: Option<Cow<'static, str>>, pub brackets: Option<Cow<'static, str>>,
pub indents: Option<Cow<'static, str>>, pub indents: Option<Cow<'static, str>>,
pub outline: Option<Cow<'static, str>>, pub outline: Option<Cow<'static, str>>,
pub embedding: Option<Cow<'static, str>>,
pub injections: Option<Cow<'static, str>>, pub injections: Option<Cow<'static, str>>,
pub overrides: Option<Cow<'static, str>>, pub overrides: Option<Cow<'static, str>>,
} }
@ -495,6 +496,7 @@ pub struct Grammar {
pub(crate) brackets_config: Option<BracketConfig>, pub(crate) brackets_config: Option<BracketConfig>,
pub(crate) indents_config: Option<IndentConfig>, pub(crate) indents_config: Option<IndentConfig>,
pub outline_config: Option<OutlineConfig>, pub outline_config: Option<OutlineConfig>,
pub embedding_config: Option<EmbeddingConfig>,
pub(crate) injection_config: Option<InjectionConfig>, pub(crate) injection_config: Option<InjectionConfig>,
pub(crate) override_config: Option<OverrideConfig>, pub(crate) override_config: Option<OverrideConfig>,
pub(crate) highlight_map: Mutex<HighlightMap>, pub(crate) highlight_map: Mutex<HighlightMap>,
@ -516,6 +518,15 @@ pub struct OutlineConfig {
pub extra_context_capture_ix: Option<u32>, pub extra_context_capture_ix: Option<u32>,
} }
/// Per-grammar settings derived from a language's embedding query.
///
/// Built by `Language::with_embedding_query`, which only creates this value
/// when the query defines both an `item` and a `name` capture.
#[derive(Debug)]
pub struct EmbeddingConfig {
/// The compiled tree-sitter query.
pub query: Query,
/// Index of the query's `item` capture.
pub item_capture_ix: u32,
/// Index of the query's `name` capture.
pub name_capture_ix: u32,
/// Index of the query's `context` capture, if the query defines one.
pub context_capture_ix: Option<u32>,
/// Index of the query's `context.extra` capture, if the query defines one.
pub extra_context_capture_ix: Option<u32>,
}
struct InjectionConfig { struct InjectionConfig {
query: Query, query: Query,
content_capture_ix: u32, content_capture_ix: u32,
@ -1145,6 +1156,7 @@ impl Language {
highlights_query: None, highlights_query: None,
brackets_config: None, brackets_config: None,
outline_config: None, outline_config: None,
embedding_config: None,
indents_config: None, indents_config: None,
injection_config: None, injection_config: None,
override_config: None, override_config: None,
@ -1181,6 +1193,9 @@ impl Language {
if let Some(query) = queries.outline { if let Some(query) = queries.outline {
self = self.with_outline_query(query.as_ref())?; self = self.with_outline_query(query.as_ref())?;
} }
if let Some(query) = queries.embedding {
self = self.with_embedding_query(query.as_ref())?;
}
if let Some(query) = queries.injections { if let Some(query) = queries.injections {
self = self.with_injection_query(query.as_ref())?; self = self.with_injection_query(query.as_ref())?;
} }
@ -1189,6 +1204,7 @@ impl Language {
} }
Ok(self) Ok(self)
} }
pub fn with_highlights_query(mut self, source: &str) -> Result<Self> { pub fn with_highlights_query(mut self, source: &str) -> Result<Self> {
let grammar = self.grammar_mut(); let grammar = self.grammar_mut();
grammar.highlights_query = Some(Query::new(grammar.ts_language, source)?); grammar.highlights_query = Some(Query::new(grammar.ts_language, source)?);
@ -1223,6 +1239,34 @@ impl Language {
Ok(self) Ok(self)
} }
/// Compiles `source` as this language's embedding query and records it on the grammar.
///
/// The capture names `item`, `name`, `context` and `context.extra` are passed to
/// `get_capture_indices` together with a slot for each index. An
/// [`EmbeddingConfig`] is stored only when both the `item` and the `name` slot
/// were filled; otherwise the grammar's `embedding_config` is left untouched.
///
/// # Errors
///
/// Returns an error if `Query::new` rejects `source`.
pub fn with_embedding_query(mut self, source: &str) -> Result<Self> {
    let grammar = self.grammar_mut();
    let query = Query::new(grammar.ts_language, source)?;

    // One slot per capture name the embedding query may define.
    let (mut item_ix, mut name_ix) = (None, None);
    let (mut context_ix, mut extra_context_ix) = (None, None);
    get_capture_indices(
        &query,
        &mut [
            ("item", &mut item_ix),
            ("name", &mut name_ix),
            ("context", &mut context_ix),
            ("context.extra", &mut extra_context_ix),
        ],
    );

    // `item` and `name` are mandatory; the two context captures are optional.
    if let (Some(item_capture_ix), Some(name_capture_ix)) = (item_ix, name_ix) {
        grammar.embedding_config = Some(EmbeddingConfig {
            query,
            item_capture_ix,
            name_capture_ix,
            context_capture_ix: context_ix,
            extra_context_capture_ix: extra_context_ix,
        });
    }

    Ok(self)
}
pub fn with_brackets_query(mut self, source: &str) -> Result<Self> { pub fn with_brackets_query(mut self, source: &str) -> Result<Self> {
let grammar = self.grammar_mut(); let grammar = self.grammar_mut();
let query = Query::new(grammar.ts_language, source)?; let query = Query::new(grammar.ts_language, source)?;

View file

@ -136,8 +136,8 @@ impl VectorStore {
content: String, content: String,
) -> Result<IndexedFile> { ) -> Result<IndexedFile> {
let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?; let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
let outline_config = grammar let embedding_config = grammar
.outline_config .embedding_config
.as_ref() .as_ref()
.ok_or_else(|| anyhow!("no outline query"))?; .ok_or_else(|| anyhow!("no outline query"))?;
@ -148,13 +148,17 @@ impl VectorStore {
let mut documents = Vec::new(); let mut documents = Vec::new();
let mut context_spans = Vec::new(); let mut context_spans = Vec::new();
for mat in cursor.matches(&outline_config.query, tree.root_node(), content.as_bytes()) { for mat in cursor.matches(
&embedding_config.query,
tree.root_node(),
content.as_bytes(),
) {
let mut item_range = None; let mut item_range = None;
let mut name_range = None; let mut name_range = None;
for capture in mat.captures { for capture in mat.captures {
if capture.index == outline_config.item_capture_ix { if capture.index == embedding_config.item_capture_ix {
item_range = Some(capture.node.byte_range()); item_range = Some(capture.node.byte_range());
} else if capture.index == outline_config.name_capture_ix { } else if capture.index == embedding_config.name_capture_ix {
name_range = Some(capture.node.byte_range()); name_range = Some(capture.node.byte_range());
} }
} }
@ -266,7 +270,11 @@ impl VectorStore {
.language_for_file(&absolute_path, None) .language_for_file(&absolute_path, None)
.await .await
{ {
if language.name().as_ref() != "Rust" { if language
.grammar()
.and_then(|grammar| grammar.embedding_config.as_ref())
.is_none()
{
continue; continue;
} }
@ -359,6 +367,8 @@ impl VectorStore {
this.worktree_db_ids.extend(worktree_db_ids); this.worktree_db_ids.extend(worktree_db_ids);
}); });
log::info!("Semantic Indexing Complete!");
anyhow::Ok(()) anyhow::Ok(())
}) })
} }

View file

@ -46,7 +46,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
}, },
Some(tree_sitter_rust::language()), Some(tree_sitter_rust::language()),
) )
.with_outline_query( .with_embedding_query(
r#" r#"
(function_item (function_item
name: (identifier) @name name: (identifier) @name

View file

@ -170,6 +170,7 @@ fn load_queries(name: &str) -> LanguageQueries {
brackets: load_query(name, "/brackets"), brackets: load_query(name, "/brackets"),
indents: load_query(name, "/indents"), indents: load_query(name, "/indents"),
outline: load_query(name, "/outline"), outline: load_query(name, "/outline"),
embedding: load_query(name, "/embedding"),
injections: load_query(name, "/injections"), injections: load_query(name, "/injections"),
overrides: load_query(name, "/overrides"), overrides: load_query(name, "/overrides"),
} }

View file

@ -0,0 +1,36 @@
; Embedding query for Rust.
;
; Every pattern tags the whole matched node as @item and its name as @name.
; Keywords and modifiers that precede the name are tagged as @context.
; A trailing `?` marks a child as optional.

; Struct definitions.
(struct_item
(visibility_modifier)? @context
"struct" @context
name: (_) @name) @item
; Enum definitions.
(enum_item
(visibility_modifier)? @context
"enum" @context
name: (_) @name) @item
; Impl blocks: both the optional trait and the implementing type are captured as @name.
(impl_item
"impl" @context
trait: (_)? @name
"for"? @context
type: (_) @name) @item
; Trait definitions.
(trait_item
(visibility_modifier)? @context
"trait" @context
name: (_) @name) @item
; Function items.
(function_item
(visibility_modifier)? @context
(function_modifiers)? @context
"fn" @context
name: (_) @name) @item
; Function signature items.
(function_signature_item
(visibility_modifier)? @context
(function_modifiers)? @context
"fn" @context
name: (_) @name) @item
; Macro definitions; the leading `.` anchors "macro_rules!" as the first child.
(macro_definition
. "macro_rules!" @context
name: (_) @name) @item