diff --git a/Cargo.lock b/Cargo.lock index b6049e611e..afd40fd308 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8134,6 +8134,16 @@ dependencies = [ "tree-sitter", ] +[[package]] +name = "tree-sitter-toml" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "tree-sitter-typescript" version = "0.20.2" @@ -8508,8 +8518,8 @@ dependencies = [ "theme", "tiktoken-rs 0.5.0", "tree-sitter", - "tree-sitter-javascript", "tree-sitter-rust", + "tree-sitter-toml 0.20.0", "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)", "unindent", "util", @@ -9560,7 +9570,7 @@ dependencies = [ "tree-sitter-ruby", "tree-sitter-rust", "tree-sitter-scheme", - "tree-sitter-toml", + "tree-sitter-toml 0.5.1", "tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)", "tree-sitter-yaml", "unindent", diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index 6b2e77e904..31119a1ba6 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -51,6 +51,6 @@ tempdir.workspace = true ctor.workspace = true env_logger.workspace = true -tree-sitter-javascript = "*" tree-sitter-typescript = "*" tree-sitter-rust = "*" +tree-sitter-toml = "*" diff --git a/crates/vector_store/src/parsing.rs b/crates/vector_store/src/parsing.rs index 4ce8b6763a..216ef1b5e1 100644 --- a/crates/vector_store/src/parsing.rs +++ b/crates/vector_store/src/parsing.rs @@ -13,6 +13,9 @@ pub struct Document { const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; +const ENTIRE_FILE_TEMPLATE: &str = + "The below snippet is from file ''\n\n```\n\n```"; +pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"]; pub struct CodeContextRetriever { pub parser: Parser, @@ -27,12 +30,35 @@ impl CodeContextRetriever { } } + fn _parse_entire_file( + &self, + relative_path: &Path, + language_name: Arc, + content: &str, + ) -> Result> { + let document_span = ENTIRE_FILE_TEMPLATE + .replace("", relative_path.to_string_lossy().as_ref()) + .replace("", language_name.as_ref()) + .replace("item", &content); + + Ok(vec![Document { + range: 0..content.len(), + content: document_span, + embedding: Vec::new(), + name: language_name.to_string(), + }]) + } + pub fn parse_file( &mut self, relative_path: &Path, content: &str, language: Arc, ) -> Result> { + if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) { + return self._parse_entire_file(relative_path, language.name(), &content); + } + let grammar = language .grammar() .ok_or_else(|| anyhow!("no grammar for language"))?; diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 3f7ab5c6cd..0f55bd9e63 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -19,7 +19,7 @@ use gpui::{ use language::{Language, LanguageRegistry}; use modal::{SemanticSearch, SemanticSearchDelegate, Toggle}; use parking_lot::Mutex; -use parsing::{CodeContextRetriever, Document}; +use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES}; use project::{Fs, Project, WorktreeId}; use smol::channel; use std::{ @@ -537,10 +537,11 @@ impl VectorStore { .language_for_file(&absolute_path, None) .await { - if language - .grammar() - .and_then(|grammar| grammar.embedding_config.as_ref()) - .is_none() + if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) + && language + .grammar() + .and_then(|grammar| grammar.embedding_config.as_ref()) + .is_none() { continue; } diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index 76465b1aaf..84c9962493 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) { println!(\"bbbb!\"); } ".unindent(), + "file3.toml": " + ZZZZZZZ = 5 + ".unindent(), } }), ) @@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) { let languages = Arc::new(LanguageRegistry::new(Task::ready(()))); let rust_language = rust_lang(); + let toml_language = toml_lang(); languages.add(rust_language); + languages.add(toml_language); let db_dir = tempdir::TempDir::new("vector-store").unwrap(); let db_path = db_dir.path().join("db.sqlite"); @@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { .update(cx, |store, cx| store.index_project(project.clone(), cx)) .await .unwrap(); - assert_eq!(file_count, 2); + assert_eq!(file_count, 3); cx.foreground().run_until_parked(); store.update(cx, |store, _cx| { assert_eq!( @@ -578,3 +583,14 @@ fn rust_lang() -> Arc { .unwrap(), ) } + +fn toml_lang() -> Arc { + Arc::new(Language::new( + LanguageConfig { + name: "TOML".into(), + path_suffixes: vec!["toml".into()], + ..Default::default() + }, + Some(tree_sitter_toml::language()), + )) +}