update vector_store to accommodate full file parsing for JSON, TOML and YAML files
parent 4bece54655
commit cf0dd09b5c

5 changed files with 62 additions and 9 deletions

Cargo.lock (generated, 14 changes)

@@ -8134,6 +8134,16 @@ dependencies = [
  "tree-sitter",
 ]
 
+[[package]]
+name = "tree-sitter-toml"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca517f578a98b23d20780247cc2688407fa81effad5b627a5a364ec3339b53e8"
+dependencies = [
+ "cc",
+ "tree-sitter",
+]
+
 [[package]]
 name = "tree-sitter-typescript"
 version = "0.20.2"
@@ -8508,8 +8518,8 @@ dependencies = [
  "theme",
  "tiktoken-rs 0.5.0",
  "tree-sitter",
- "tree-sitter-javascript",
  "tree-sitter-rust",
+ "tree-sitter-toml 0.20.0",
  "tree-sitter-typescript 0.20.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "unindent",
  "util",
@@ -9560,7 +9570,7 @@ dependencies = [
  "tree-sitter-ruby",
  "tree-sitter-rust",
  "tree-sitter-scheme",
- "tree-sitter-toml",
+ "tree-sitter-toml 0.5.1",
  "tree-sitter-typescript 0.20.2 (git+https://github.com/tree-sitter/tree-sitter-typescript?rev=5d20856f34315b068c41edaee2ac8a100081d259)",
  "tree-sitter-yaml",
  "unindent",

@@ -51,6 +51,6 @@ tempdir.workspace = true
 ctor.workspace = true
 env_logger.workspace = true
 
-tree-sitter-javascript = "*"
 tree-sitter-typescript = "*"
 tree-sitter-rust = "*"
+tree-sitter-toml = "*"

@@ -13,6 +13,9 @@ pub struct Document {
 
 const CODE_CONTEXT_TEMPLATE: &str =
     "The below code snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+const ENTIRE_FILE_TEMPLATE: &str =
+    "The below snippet is from file '<path>'\n\n```<language>\n<item>\n```";
+pub const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"];
 
 pub struct CodeContextRetriever {
     pub parser: Parser,
@@ -27,12 +30,35 @@ impl CodeContextRetriever {
         }
     }
 
+    fn _parse_entire_file(
+        &self,
+        relative_path: &Path,
+        language_name: Arc<str>,
+        content: &str,
+    ) -> Result<Vec<Document>> {
+        let document_span = ENTIRE_FILE_TEMPLATE
+            .replace("<path>", relative_path.to_string_lossy().as_ref())
+            .replace("<language>", language_name.as_ref())
+            .replace("item", &content);
+
+        Ok(vec![Document {
+            range: 0..content.len(),
+            content: document_span,
+            embedding: Vec::new(),
+            name: language_name.to_string(),
+        }])
+    }
+
     pub fn parse_file(
         &mut self,
         relative_path: &Path,
         content: &str,
         language: Arc<Language>,
     ) -> Result<Vec<Document>> {
+        if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) {
+            return self._parse_entire_file(relative_path, language.name(), &content);
+        }
+
         let grammar = language
             .grammar()
             .ok_or_else(|| anyhow!("no grammar for language"))?;
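
For orientation, here is a rough, self-contained sketch of what the new whole-file path is meant to produce. The stripped-down Document struct, the parse_entire_file helper, and the format! call below are illustrative stand-ins only, not the crate's actual code:

    use std::ops::Range;

    // Hypothetical stand-in for the crate's Document type (same field names as above).
    struct Document {
        range: Range<usize>,
        content: String,
        embedding: Vec<f32>,
        name: String,
    }

    // Sketch of the whole-file path: the entire file becomes a single document
    // whose range spans 0..content.len(), instead of one document per parsed item.
    fn parse_entire_file(path: &str, language_name: &str, content: &str) -> Vec<Document> {
        let document_span = format!(
            "The below snippet is from file '{path}'\n\n```{language_name}\n{content}\n```"
        );
        vec![Document {
            range: 0..content.len(),
            content: document_span,
            embedding: Vec::new(),
            name: language_name.to_string(),
        }]
    }

    fn main() {
        let docs = parse_entire_file("file3.toml", "TOML", "ZZZZZZZ = 5");
        assert_eq!(docs.len(), 1);
        println!("{}", docs[0].content);
    }

Run against the test's file3.toml contents, this yields one snippet framed by the entire-file template, with the document range covering the whole file.
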
@@ -19,7 +19,7 @@ use gpui::{
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
 use parking_lot::Mutex;
-use parsing::{CodeContextRetriever, Document};
+use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
@@ -537,7 +537,8 @@ impl VectorStore {
                 .language_for_file(&absolute_path, None)
                 .await
             {
-                if language
+                if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref())
+                    && language
                         .grammar()
                         .and_then(|grammar| grammar.embedding_config.as_ref())
                         .is_none()
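
Read in isolation, the reworked guard above fires only when a file's language is neither one of the whole-file types nor backed by a grammar with an embedding config. A minimal restatement, assuming a hypothetical has_embedding_config flag in place of the grammar().embedding_config lookup:

    // Languages whose files are indexed whole rather than split by tree-sitter items.
    const PARSEABLE_ENTIRE_FILE_TYPES: [&str; 3] = ["TOML", "YAML", "JSON"];

    // Mirrors the new condition: true only when the language is not a whole-file
    // type AND its grammar exposes no embedding config. Both names here are
    // illustrative, not part of the codebase.
    fn lacks_embedding_support(language_name: &str, has_embedding_config: bool) -> bool {
        !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name) && !has_embedding_config
    }

    fn main() {
        assert!(!lacks_embedding_support("TOML", false)); // whole-file type, no longer caught
        assert!(lacks_embedding_support("Plain Text", false)); // neither path applies
        assert!(!lacks_embedding_support("Rust", true)); // grammar-driven path still applies
    }
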
@@ -56,6 +56,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
                     println!(\"bbbb!\");
                 }
             ".unindent(),
+            "file3.toml": "
+                ZZZZZZZ = 5
+            ".unindent(),
         }
     }),
 )
@@ -63,7 +66,9 @@ async fn test_vector_store(cx: &mut TestAppContext) {
 
     let languages = Arc::new(LanguageRegistry::new(Task::ready(())));
     let rust_language = rust_lang();
+    let toml_language = toml_lang();
     languages.add(rust_language);
+    languages.add(toml_language);
 
     let db_dir = tempdir::TempDir::new("vector-store").unwrap();
     let db_path = db_dir.path().join("db.sqlite");
@@ -87,7 +92,7 @@ async fn test_vector_store(cx: &mut TestAppContext) {
         .update(cx, |store, cx| store.index_project(project.clone(), cx))
         .await
         .unwrap();
-    assert_eq!(file_count, 2);
+    assert_eq!(file_count, 3);
     cx.foreground().run_until_parked();
     store.update(cx, |store, _cx| {
         assert_eq!(
@@ -578,3 +583,14 @@ fn rust_lang() -> Arc<Language> {
         .unwrap(),
     )
 }
+
+fn toml_lang() -> Arc<Language> {
+    Arc::new(Language::new(
+        LanguageConfig {
+            name: "TOML".into(),
+            path_suffixes: vec!["toml".into()],
+            ..Default::default()
+        },
+        Some(tree_sitter_toml::language()),
+    ))
+}