Update vector_store to accommodate full-file parsing for JSON, TOML and YAML files
This commit is contained in:
parent
4bece54655
commit
cf0dd09b5c
5 changed files with 62 additions and 9 deletions
14
Cargo.lock
generated
14
Cargo.lock
generated
|
@ -8134,6 +8134,16 @@ dependencies = [
|
|||
"tree-sitter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-toml"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-typescript"
|
||||
version = "0.20.2"
|
||||
|
@ -8508,8 +8518,8 @@ dependencies = [
|
|||
"theme",
|
||||
"tiktoken-rs 0.5.0",
|
||||
"tree-sitter",
|
||||
"tree-sitter-javascript",
|
||||
"tree-sitter-rust",
|
||||
"tree-sitter-toml 0.20.0",
|
||||
"tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unindent",
|
||||
"util",
|
||||
|
@ -9560,7 +9570,7 @@ dependencies = [
|
|||
"tree-sitter-ruby",
|
||||
"tree-sitter-rust",
|
||||
"tree-sitter-scheme",
|
||||
"tree-sitter-toml",
|
||||
"tree-sitter-toml 0.5.1",
|
||||
"tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)",
|
||||
"tree-sitter-yaml",
|
||||
"unindent",
|
||||
|
|
|
@ -51,6 +51,6 @@ tempdir.workspace = true
|
|||
ctor.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
tree-sitter-javascript = "*"
|
||||
tree-sitter-typescript = "*"
|
||||
tree-sitter-rust = "*"
|
||||
tree-sitter-toml = "*"
|
||||
|
|
|
@ -13,6 +13,9 @@ pub struct Document {
|
|||
|
||||
const CODE_CONTEXT_TEMPLATE: &str =
|
||||
"The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
|
||||
/// Template for a document that wraps an entire file rather than a single code item.
/// `<path>`, `<language>` and `<item>` are placeholders meant to be substituted with the
/// file's relative path, its language name and its content.
const ENTIRE_FILE_TEMPLATE: &str =
|
||||
"The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
|
||||
/// Language names whose files are turned into one whole-file document; `parse_file`
/// checks `language.name()` against this list before looking for a grammar.
pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"];
|
||||
|
||||
pub struct CodeContextRetriever {
|
||||
pub parser: Parser,
|
||||
|
@ -27,12 +30,35 @@ impl CodeContextRetriever {
|
|||
}
|
||||
}
|
||||
|
||||
fn _parse_entire_file(
|
||||
&self,
|
||||
relative_path: &Path,
|
||||
language_name: Arc<str>,
|
||||
content: &str,
|
||||
) -> Result<Vec<Document>> {
|
||||
let document_span = ENTIRE_FILE_TEMPLATE
|
||||
.replace("<path>", relative_path.to_string_lossy().as_ref())
|
||||
.replace("<language>", language_name.as_ref())
|
||||
.replace("item", &content);
|
||||
|
||||
Ok(vec![Document {
|
||||
range: 0..content.len(),
|
||||
content: document_span,
|
||||
embedding: Vec::new(),
|
||||
name: language_name.to_string(),
|
||||
}])
|
||||
}
|
||||
|
||||
pub fn parse_file(
|
||||
&mut self,
|
||||
relative_path: &Path,
|
||||
content: &str,
|
||||
language: Arc<Language>,
|
||||
) -> Result<Vec<Document>> {
|
||||
if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
|
||||
return self._parse_entire_file(relative_path, language.name(), &content);
|
||||
}
|
||||
|
||||
let grammar = language
|
||||
.grammar()
|
||||
.ok_or_else(|| anyhow!("no grammar for language"))?;
|
||||
|
|
|
@ -19,7 +19,7 @@ use gpui::{
|
|||
use language::{Language, LanguageRegistry};
|
||||
use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
|
||||
use parking_lot::Mutex;
|
||||
use parsing::{CodeContextRetriever, Document};
|
||||
use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
|
||||
use project::{Fs, Project, WorktreeId};
|
||||
use smol::channel;
|
||||
use std::{
|
||||
|
@ -537,10 +537,11 @@ impl VectorStore {
|
|||
.language_for_file(&absolute_path, None)
|
||||
.await
|
||||
{
|
||||
if language
|
||||
.grammar()
|
||||
.and_then(|grammar| grammar.embedding_config.as_ref())
|
||||
.is_none()
|
||||
if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref())
|
||||
&& language
|
||||
.grammar()
|
||||
.and_then(|grammar| grammar.embedding_config.as_ref())
|
||||
.is_none()
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
|
|||
println!(\"bbbb!\");
|
||||
}
|
||||
".unindent(),
|
||||
"file3.toml": "
|
||||
ZZZZZZZ = 5
|
||||
".unindent(),
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
|
|||
|
||||
let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
|
||||
let rust_language = rust_lang();
|
||||
let toml_language = toml_lang();
|
||||
languages.add(rust_language);
|
||||
languages.add(toml_language);
|
||||
|
||||
let db_dir = tempdir::TempDir::new("vector-store").unwrap();
|
||||
let db_path = db_dir.path().join("db.sqlite");
|
||||
|
@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
|
|||
.update(cx, |store, cx| store.index_project(project.clone(), cx))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(file_count, 2);
|
||||
assert_eq!(file_count, 3);
|
||||
cx.foreground().run_until_parked();
|
||||
store.update(cx, |store, _cx| {
|
||||
assert_eq!(
|
||||
|
@ -578,3 +583,14 @@ fn rust_lang() -> Arc<Language> {
|
|||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
fn toml_lang() -> Arc<Language> {
|
||||
Arc::new(Language::new(
|
||||
LanguageConfig {
|
||||
name: "TOML".into(),
|
||||
path_suffixes: vec!["toml".into()],
|
||||
..Default::default()
|
||||
},
|
||||
Some(tree_sitter_toml::language()),
|
||||
))
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue