Pulled Tree-sitter parsing out into its own file for ease of testing and management

parent 3f5667b101
commit 0189742497

3 changed files with 110 additions and 109 deletions
crates/vector_store/src/db.rs

@@ -7,7 +7,7 @@ use std::{
 
 use anyhow::{anyhow, Result};
 
-use crate::IndexedFile;
+use crate::parsing::ParsedFile;
 use rpc::proto::Timestamp;
 use rusqlite::{
     params,
@@ -109,7 +109,7 @@ impl VectorDatabase {
         Ok(())
     }
 
-    pub fn insert_file(&self, worktree_id: i64, indexed_file: IndexedFile) -> Result<()> {
+    pub fn insert_file(&self, worktree_id: i64, indexed_file: ParsedFile) -> Result<()> {
         // Write to files table, and return generated id.
         self.db.execute(
             "
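The only behavioral surface here is the type rename. For context, a minimal sketch of how insert_file is reached — the DbWrite handler loop is assumed, since this diff references the enum but does not show the loop that drains it:

    // Sketch only: the write-loop match is an assumption, not shown in this commit.
    match db_write {
        DbWrite::InsertFile { worktree_id, indexed_file } => {
            // `indexed_file` is now a parsing::ParsedFile rather than IndexedFile.
            db.insert_file(worktree_id, indexed_file)?;
        }
        DbWrite::Delete { worktree_id, .. } => {
            // deletion path elided
        }
    }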
crates/vector_store/src/parsing.rs (new file, 94 lines)
@@ -0,0 +1,94 @@
+use std::{ops::Range, path::PathBuf, sync::Arc, time::SystemTime};
+
+use anyhow::{anyhow, Ok, Result};
+use project::Fs;
+use tree_sitter::{Parser, QueryCursor};
+
+use crate::PendingFile;
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct Document {
+    pub offset: usize,
+    pub name: String,
+    pub embedding: Vec<f32>,
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct ParsedFile {
+    pub path: PathBuf,
+    pub mtime: SystemTime,
+    pub documents: Vec<Document>,
+}
+
+pub struct CodeContextRetriever {
+    pub parser: Parser,
+    pub cursor: QueryCursor,
+    pub fs: Arc<dyn Fs>,
+}
+
+impl CodeContextRetriever {
+    pub async fn parse_file(
+        &mut self,
+        pending_file: PendingFile,
+    ) -> Result<(ParsedFile, Vec<String>)> {
+        let grammar = pending_file
+            .language
+            .grammar()
+            .ok_or_else(|| anyhow!("no grammar for language"))?;
+        let embedding_config = grammar
+            .embedding_config
+            .as_ref()
+            .ok_or_else(|| anyhow!("no embedding queries"))?;
+
+        let content = self.fs.load(&pending_file.absolute_path).await?;
+
+        self.parser.set_language(grammar.ts_language).unwrap();
+
+        let tree = self
+            .parser
+            .parse(&content, None)
+            .ok_or_else(|| anyhow!("parsing failed"))?;
+
+        let mut documents = Vec::new();
+        let mut context_spans = Vec::new();
+
+        // Iterate through query matches
+        for mat in self.cursor.matches(
+            &embedding_config.query,
+            tree.root_node(),
+            content.as_bytes(),
+        ) {
+            let mut item_range: Option<Range<usize>> = None;
+            let mut name_range: Option<Range<usize>> = None;
+            for capture in mat.captures {
+                if capture.index == embedding_config.item_capture_ix {
+                    item_range = Some(capture.node.byte_range());
+                } else if capture.index == embedding_config.name_capture_ix {
+                    name_range = Some(capture.node.byte_range());
+                }
+            }
+
+            if let Some((item_range, name_range)) = item_range.zip(name_range) {
+                if let Some((item, name)) =
+                    content.get(item_range.clone()).zip(content.get(name_range))
+                {
+                    context_spans.push(item.to_string());
+                    documents.push(Document {
+                        name: name.to_string(),
+                        offset: item_range.start,
+                        embedding: Vec::new(),
+                    });
+                }
+            }
+        }
+
+        return Ok((
+            ParsedFile {
+                path: pending_file.relative_path,
+                mtime: pending_file.modified_time,
+                documents,
+            },
+            context_spans,
+        ));
+    }
+}
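The intended usage, mirroring the call site rewritten later in this commit: each background parsing task owns one retriever, so the Parser and QueryCursor are reused across files instead of being rebuilt per file. A sketch with channel setup elided:

    let mut retriever = CodeContextRetriever {
        parser: Parser::new(),
        cursor: QueryCursor::new(),
        fs: fs.clone(),
    };
    while let Ok(pending_file) = parsing_files_rx.recv().await {
        if let Some((parsed_file, document_spans)) =
            retriever.parse_file(pending_file).await.log_err()
        {
            // parsed_file.documents still carry empty embeddings here;
            // document_spans hold the raw text that will be embedded downstream.
        }
    }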
crates/vector_store/src/vector_store.rs

@@ -1,6 +1,7 @@
 mod db;
 mod embedding;
 mod modal;
+mod parsing;
 
 #[cfg(test)]
 mod vector_store_tests;
@@ -15,6 +16,7 @@ use gpui::{
 };
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
+use parsing::{CodeContextRetriever, ParsedFile};
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
@@ -38,13 +40,6 @@ use workspace::{Workspace, WorkspaceCreated};
 const REINDEXING_DELAY_SECONDS: u64 = 3;
 const EMBEDDINGS_BATCH_SIZE: usize = 150;
 
-#[derive(Debug, Clone)]
-pub struct Document {
-    pub offset: usize,
-    pub name: String,
-    pub embedding: Vec<f32>,
-}
-
 pub fn init(
     fs: Arc<dyn Fs>,
     http_client: Arc<dyn HttpClient>,
@@ -113,13 +108,6 @@ pub fn init(
     .detach();
 }
 
-#[derive(Debug, Clone)]
-pub struct IndexedFile {
-    path: PathBuf,
-    mtime: SystemTime,
-    documents: Vec<Document>,
-}
-
 pub struct VectorStore {
     fs: Arc<dyn Fs>,
     database_url: Arc<PathBuf>,
@@ -182,7 +170,7 @@ impl ProjectState {
 }
 
 #[derive(Clone, Debug)]
-struct PendingFile {
+pub struct PendingFile {
     worktree_db_id: i64,
     relative_path: PathBuf,
     absolute_path: PathBuf,
@@ -201,7 +189,7 @@ pub struct SearchResult {
 enum DbWrite {
     InsertFile {
         worktree_id: i64,
-        indexed_file: IndexedFile,
+        indexed_file: ParsedFile,
     },
     Delete {
         worktree_id: i64,
@@ -267,7 +255,7 @@ impl VectorStore {
 
         // embed_tx/rx: Embed Batch and Send to Database
        let (embed_batch_tx, embed_batch_rx) =
-            channel::unbounded::<Vec<(i64, IndexedFile, Vec<String>)>>();
+            channel::unbounded::<Vec<(i64, ParsedFile, Vec<String>)>>();
         let mut _embed_batch_task = Vec::new();
         for _ in 0..1 {
             //cx.background().num_cpus() {
@@ -324,13 +312,14 @@ impl VectorStore {
 
         // batch_tx/rx: Batch Files to Send for Embeddings
         let (batch_files_tx, batch_files_rx) =
-            channel::unbounded::<(i64, IndexedFile, Vec<String>)>();
+            channel::unbounded::<(i64, ParsedFile, Vec<String>)>();
         let _batch_files_task = cx.background().spawn(async move {
             let mut queue_len = 0;
             let mut embeddings_queue = vec![];
             while let Ok((worktree_id, indexed_file, document_spans)) =
                 batch_files_rx.recv().await
             {
+                dbg!("Batching in while loop");
                 queue_len += &document_spans.len();
                 embeddings_queue.push((worktree_id, indexed_file, document_spans));
                 if queue_len >= EMBEDDINGS_BATCH_SIZE {
@@ -339,6 +328,7 @@ impl VectorStore {
                     queue_len = 0;
                 }
             }
+            // TODO: This is never getting called. We've got to work out how to clear the embedding batch if it's less than the necessary batch size.
             if queue_len > 0 {
                 embed_batch_tx.try_send(embeddings_queue).unwrap();
             }
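One way to resolve the TODO above is to flush whenever the channel momentarily runs dry, so a partial batch is never stranded waiting for the loop to exit. A minimal sketch, assuming the is_empty() method on smol's re-exported async-channel Receiver:

    use std::mem;

    // Sketch: flush early when no more parsed files are currently queued,
    // in addition to flushing whenever a full batch accumulates.
    while let Ok((worktree_id, parsed_file, document_spans)) =
        batch_files_rx.recv().await
    {
        queue_len += document_spans.len();
        embeddings_queue.push((worktree_id, parsed_file, document_spans));
        if queue_len >= EMBEDDINGS_BATCH_SIZE || batch_files_rx.is_empty() {
            // mem::take leaves an empty Vec behind for the next batch.
            embed_batch_tx
                .try_send(mem::take(&mut embeddings_queue))
                .unwrap();
            queue_len = 0;
        }
    }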
@@ -353,21 +343,14 @@ impl VectorStore {
                 let parsing_files_rx = parsing_files_rx.clone();
                 let batch_files_tx = batch_files_tx.clone();
                 _parsing_files_tasks.push(cx.background().spawn(async move {
-                    let mut parser = Parser::new();
-                    let mut cursor = QueryCursor::new();
+                    let parser = Parser::new();
+                    let cursor = QueryCursor::new();
+                    let mut retriever = CodeContextRetriever { parser, cursor, fs };
                     while let Ok(pending_file) = parsing_files_rx.recv().await {
                         log::info!("Parsing File: {:?}", &pending_file.relative_path);
-                        if let Some((indexed_file, document_spans)) = Self::index_file(
-                            &mut cursor,
-                            &mut parser,
-                            &fs,
-                            pending_file.language,
-                            pending_file.relative_path.clone(),
-                            pending_file.absolute_path.clone(),
-                            pending_file.modified_time,
-                        )
-                        .await
-                        .log_err()
+                        if let Some((indexed_file, document_spans)) =
+                            retriever.parse_file(pending_file.clone()).await.log_err()
                         {
                             batch_files_tx
                                 .try_send((
@@ -397,82 +380,6 @@ impl VectorStore {
             }))
         }
 
-    async fn index_file(
-        cursor: &mut QueryCursor,
-        parser: &mut Parser,
-        fs: &Arc<dyn Fs>,
-        language: Arc<Language>,
-        relative_file_path: PathBuf,
-        absolute_file_path: PathBuf,
-        mtime: SystemTime,
-    ) -> Result<(IndexedFile, Vec<String>)> {
-        let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
-        let embedding_config = grammar
-            .embedding_config
-            .as_ref()
-            .ok_or_else(|| anyhow!("no outline query"))?;
-
-        let content = fs.load(&absolute_file_path).await?;
-
-        parser.set_language(grammar.ts_language).unwrap();
-        let tree = parser
-            .parse(&content, None)
-            .ok_or_else(|| anyhow!("parsing failed"))?;
-
-        let mut documents = Vec::new();
-        let mut context_spans = Vec::new();
-        for mat in cursor.matches(
-            &embedding_config.query,
-            tree.root_node(),
-            content.as_bytes(),
-        ) {
-            let mut item_range = None;
-            let mut name_range = None;
-            let mut context_range = None;
-            for capture in mat.captures {
-                if capture.index == embedding_config.item_capture_ix {
-                    item_range = Some(capture.node.byte_range());
-                } else if capture.index == embedding_config.name_capture_ix {
-                    name_range = Some(capture.node.byte_range());
-                }
-                if let Some(context_capture_ix) = embedding_config.context_capture_ix {
-                    if capture.index == context_capture_ix {
-                        context_range = Some(capture.node.byte_range());
-                    }
-                }
-            }
-
-            if let Some((item_range, name_range)) = item_range.zip(name_range) {
-                let mut context_data = String::new();
-                if let Some(context_range) = context_range {
-                    if let Some(context) = content.get(context_range.clone()) {
-                        context_data.push_str(context);
-                    }
-                }
-
-                if let Some((item, name)) =
-                    content.get(item_range.clone()).zip(content.get(name_range))
-                {
-                    context_spans.push(item.to_string());
-                    documents.push(Document {
-                        name: format!("{} {}", context_data.to_string(), name.to_string()),
-                        offset: item_range.start,
-                        embedding: Vec::new(),
-                    });
-                }
-            }
-        }
-
-        return Ok((
-            IndexedFile {
-                path: relative_file_path,
-                mtime,
-                documents,
-            },
-            context_spans,
-        ));
-    }
-
     fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
         let (tx, rx) = oneshot::channel();
         self.db_update_tx
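With parsing isolated in its own module, a unit test can now drive CodeContextRetriever directly instead of going through VectorStore, which is the "ease of testing" the commit message refers to. A hypothetical sketch — the FakeFs in-memory file system is assumed to implement the Fs trait, and pending_file_for() / rust_lang() are invented helpers, not part of this commit:

    #[gpui::test]
    async fn test_parse_file(cx: &mut gpui::TestAppContext) {
        // In-memory stand-in for the real file system (assumed API).
        let fs = project::FakeFs::new(cx.background());
        fs.insert_file("/root/lib.rs", "fn aaa() {}\nfn bbb() {}".to_string())
            .await;

        let mut retriever = CodeContextRetriever {
            parser: Parser::new(),
            cursor: QueryCursor::new(),
            fs,
        };

        // pending_file_for() would build a PendingFile pointing at /root/lib.rs
        // with a Rust language whose grammar carries embedding queries.
        let (parsed_file, spans) = retriever
            .parse_file(pending_file_for("/root/lib.rs", rust_lang()))
            .await
            .unwrap();

        // Assuming an item/name embedding query over function definitions,
        // each top-level fn yields one Document plus one raw span to embed.
        assert_eq!(parsed_file.documents.len(), 2);
        assert_eq!(spans.len(), 2);
    }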