Update vector_store to accommodate full-file parsing for JSON, TOML, and YAML files

This commit is contained in:
KCaverly 2023-07-17 10:04:32 -04:00
parent 4bece54655
commit cf0dd09b5c
5 changed files with 62 additions and 9 deletions

14
Cargo.lock generated
View file

@ -8134,6 +8134,16 @@ dependencies = [
"tree-sitter",
]
[[package]]
name = "tree-sitter-toml"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-typescript"
version = "0.20.2"
@ -8508,8 +8518,8 @@ dependencies = [
"theme",
"tiktoken-rs 0.5.0",
"tree-sitter",
"tree-sitter-javascript",
"tree-sitter-rust",
"tree-sitter-toml 0.20.0",
"tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unindent",
"util",
@ -9560,7 +9570,7 @@ dependencies = [
"tree-sitter-ruby",
"tree-sitter-rust",
"tree-sitter-scheme",
"tree-sitter-toml",
"tree-sitter-toml 0.5.1",
"tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)",
"tree-sitter-yaml",
"unindent",

View file

@ -51,6 +51,6 @@ tempdir.workspace = true
ctor.workspace = true
env_logger.workspace = true
tree-sitter-javascript = "*"
tree-sitter-typescript = "*"
tree-sitter-rust = "*"
tree-sitter-toml = "*"

View file

@ -13,6 +13,9 @@ pub struct Document {
// Template text with `<path>`, `<language>` and `<item>` placeholders, describing a code
// snippet taken from a file.
const CODE_CONTEXT_TEMPLATE: &str =
"The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
// Template with the same placeholders, used by `_parse_entire_file` when a whole file is
// turned into a single document.
const ENTIRE_FILE_TEMPLATE: &str =
"The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
// Language names that `parse_file` hands to `_parse_entire_file` instead of using the
// language's grammar.
pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"];
pub struct CodeContextRetriever {
pub parser: Parser,
@ -27,12 +30,35 @@ impl CodeContextRetriever {
}
}
/// Wraps the complete text of a file in a single [`Document`].
///
/// The document's `content` is `ENTIRE_FILE_TEMPLATE` with its `<path>`, `<language>` and
/// `<item>` placeholders filled from `relative_path`, `language_name` and `content`
/// respectively. Its `range` spans the whole of `content` (`0..content.len()`), its
/// `embedding` starts out empty, and its `name` is the language name.
///
/// Always returns `Ok` with exactly one document.
fn _parse_entire_file(
    &self,
    relative_path: &Path,
    language_name: Arc<str>,
    content: &str,
) -> Result<Vec<Document>> {
    // Replace the full `<item>` placeholder, angle brackets included. Matching only the
    // bare word `item` would leave stray `<` and `>` wrapped around the file content.
    // The file content is substituted last so that placeholder-like text inside the file
    // is never rescanned.
    let document_span = ENTIRE_FILE_TEMPLATE
        .replace("<path>", relative_path.to_string_lossy().as_ref())
        .replace("<language>", language_name.as_ref())
        .replace("<item>", content);

    Ok(vec![Document {
        range: 0..content.len(),
        content: document_span,
        embedding: Vec::new(),
        name: language_name.to_string(),
    }])
}
pub fn parse_file(
&mut self,
relative_path: &Path,
content: &str,
language: Arc<Language>,
) -> Result<Vec<Document>> {
if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
return self._parse_entire_file(relative_path, language.name(), &content);
}
let grammar = language
.grammar()
.ok_or_else(|| anyhow!("no grammar for language"))?;

View file

@ -19,7 +19,7 @@ use gpui::{
use language::{Language, LanguageRegistry};
use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
use parking_lot::Mutex;
use parsing::{CodeContextRetriever, Document};
use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
use project::{Fs, Project, WorktreeId};
use smol::channel;
use std::{
@ -537,10 +537,11 @@ impl VectorStore {
.language_for_file(&absolute_path, None)
.await
{
if language
.grammar()
.and_then(|grammar| grammar.embedding_config.as_ref())
.is_none()
if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref())
&& language
.grammar()
.and_then(|grammar| grammar.embedding_config.as_ref())
.is_none()
{
continue;
}

View file

@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
println!(\"bbbb!\");
}
".unindent(),
"file3.toml": "
ZZZZZZZ = 5
".unindent(),
}
}),
)
@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
let rust_language = rust_lang();
let toml_language = toml_lang();
languages.add(rust_language);
languages.add(toml_language);
let db_dir = tempdir::TempDir::new("vector-store").unwrap();
let db_path = db_dir.path().join("db.sqlite");
@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
.update(cx, |store, cx| store.index_project(project.clone(), cx))
.await
.unwrap();
assert_eq!(file_count, 2);
assert_eq!(file_count, 3);
cx.foreground().run_until_parked();
store.update(cx, |store, _cx| {
assert_eq!(
@ -578,3 +583,14 @@ fn rust_lang() -> Arc<Language> {
.unwrap(),
)
}
/// Builds the TOML language used by the tests: named "TOML", associated with the `toml`
/// path suffix, and constructed with the `tree_sitter_toml` grammar.
fn toml_lang() -> Arc<Language> {
    let config = LanguageConfig {
        name: "TOML".into(),
        path_suffixes: vec!["toml".into()],
        ..Default::default()
    };
    let grammar = tree_sitter_toml::language();

    Arc::new(Language::new(config, Some(grammar)))
}