pulled treesitter parsing to own file for ease of testing and management

KCaverly 2023-07-10 10:06:07 -04:00
parent 3f5667b101
commit 0189742497
3 changed files with 110 additions and 109 deletions

db.rs
@@ -7,7 +7,7 @@ use std::{
 use anyhow::{anyhow, Result};
-use crate::IndexedFile;
+use crate::parsing::ParsedFile;
 use rpc::proto::Timestamp;
 use rusqlite::{
     params,
@@ -109,7 +109,7 @@ impl VectorDatabase {
         Ok(())
     }
 
-    pub fn insert_file(&self, worktree_id: i64, indexed_file: IndexedFile) -> Result<()> {
+    pub fn insert_file(&self, worktree_id: i64, indexed_file: ParsedFile) -> Result<()> {
         // Write to files table, and return generated id.
         self.db.execute(
             "

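insert_file writes a row and then reads back the id SQLite generated for it, per the comment above. For readers unfamiliar with rusqlite, here is a minimal sketch of that insert-then-read-back pattern; the files table and its columns are assumptions for illustration, not the real schema:

use rusqlite::{params, Connection, Result};

// Insert a row and return the rowid SQLite generated for it. The `files`
// table and its columns here are illustrative, not the actual schema.
fn insert_file_row(db: &Connection, worktree_id: i64, relative_path: &str) -> Result<i64> {
    db.execute(
        "INSERT INTO files (worktree_id, relative_path) VALUES (?1, ?2)",
        params![worktree_id, relative_path],
    )?;
    // last_insert_rowid reports the id produced by the INSERT above.
    Ok(db.last_insert_rowid())
}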
parsing.rs (new file)
@@ -0,0 +1,94 @@
+use std::{ops::Range, path::PathBuf, sync::Arc, time::SystemTime};
+
+use anyhow::{anyhow, Ok, Result};
+use project::Fs;
+use tree_sitter::{Parser, QueryCursor};
+
+use crate::PendingFile;
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct Document {
+    pub offset: usize,
+    pub name: String,
+    pub embedding: Vec<f32>,
+}
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct ParsedFile {
+    pub path: PathBuf,
+    pub mtime: SystemTime,
+    pub documents: Vec<Document>,
+}
+
+pub struct CodeContextRetriever {
+    pub parser: Parser,
+    pub cursor: QueryCursor,
+    pub fs: Arc<dyn Fs>,
+}
+
+impl CodeContextRetriever {
+    pub async fn parse_file(
+        &mut self,
+        pending_file: PendingFile,
+    ) -> Result<(ParsedFile, Vec<String>)> {
+        let grammar = pending_file
+            .language
+            .grammar()
+            .ok_or_else(|| anyhow!("no grammar for language"))?;
+        let embedding_config = grammar
+            .embedding_config
+            .as_ref()
+            .ok_or_else(|| anyhow!("no embedding queries"))?;
+
+        let content = self.fs.load(&pending_file.absolute_path).await?;
+
+        self.parser.set_language(grammar.ts_language).unwrap();
+
+        let tree = self
+            .parser
+            .parse(&content, None)
+            .ok_or_else(|| anyhow!("parsing failed"))?;
+
+        let mut documents = Vec::new();
+        let mut context_spans = Vec::new();
+
+        // Iterate through query matches
+        for mat in self.cursor.matches(
+            &embedding_config.query,
+            tree.root_node(),
+            content.as_bytes(),
+        ) {
+            let mut item_range: Option<Range<usize>> = None;
+            let mut name_range: Option<Range<usize>> = None;
+            for capture in mat.captures {
+                if capture.index == embedding_config.item_capture_ix {
+                    item_range = Some(capture.node.byte_range());
+                } else if capture.index == embedding_config.name_capture_ix {
+                    name_range = Some(capture.node.byte_range());
+                }
+            }
+
+            if let Some((item_range, name_range)) = item_range.zip(name_range) {
+                if let Some((item, name)) =
+                    content.get(item_range.clone()).zip(content.get(name_range))
+                {
+                    context_spans.push(item.to_string());
+                    documents.push(Document {
+                        name: name.to_string(),
+                        offset: item_range.start,
+                        embedding: Vec::new(),
+                    });
+                }
+            }
+        }
+
+        return Ok((
+            ParsedFile {
+                path: pending_file.relative_path,
+                mtime: pending_file.modified_time,
+                documents,
+            },
+            context_spans,
+        ));
+    }
+}
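Moving this logic behind CodeContextRetriever is what makes it testable in isolation, per the commit message. As a rough sketch of the kind of unit test that becomes possible, written against the tree_sitter crate directly — the query string, capture names, and the tree_sitter_rust dev-dependency are illustrative assumptions, not Zed's shipped embedding query:

use std::ops::Range;

use tree_sitter::{Parser, Query, QueryCursor};

#[test]
fn extracts_item_and_name_captures() {
    let language = tree_sitter_rust::language();

    // An illustrative query that mirrors the item/name capture pair
    // parse_file reads out of grammar.embedding_config.
    let query =
        Query::new(language, "(function_item name: (identifier) @name) @item").unwrap();
    let item_ix = query.capture_index_for_name("item").unwrap();
    let name_ix = query.capture_index_for_name("name").unwrap();

    let source = "fn add(a: i32, b: i32) -> i32 { a + b }";

    let mut parser = Parser::new();
    parser.set_language(language).unwrap();
    let tree = parser.parse(source, None).unwrap();

    // Same extraction loop as parse_file: pair each @item byte range
    // with its @name byte range and record (name, offset).
    let mut cursor = QueryCursor::new();
    let mut extracted: Vec<(String, usize)> = Vec::new();
    for mat in cursor.matches(&query, tree.root_node(), source.as_bytes()) {
        let mut item_range: Option<Range<usize>> = None;
        let mut name_range: Option<Range<usize>> = None;
        for capture in mat.captures {
            if capture.index == item_ix {
                item_range = Some(capture.node.byte_range());
            } else if capture.index == name_ix {
                name_range = Some(capture.node.byte_range());
            }
        }
        if let Some((item, name)) = item_range.zip(name_range) {
            extracted.push((source[name].to_string(), item.start));
        }
    }

    assert_eq!(extracted, vec![("add".to_string(), 0)]);
}

Each (name, offset) pair corresponds to one Document above, with the embedding vector filled in later by the embedding pipeline.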

vector_store.rs
@@ -1,6 +1,7 @@
 mod db;
 mod embedding;
 mod modal;
+mod parsing;
 
 #[cfg(test)]
 mod vector_store_tests;
@@ -15,6 +16,7 @@ use gpui::{
 };
 use language::{Language, LanguageRegistry};
 use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
+use parsing::{CodeContextRetriever, ParsedFile};
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
@@ -38,13 +40,6 @@ use workspace::{Workspace, WorkspaceCreated};
 const REINDEXING_DELAY_SECONDS: u64 = 3;
 const EMBEDDINGS_BATCH_SIZE: usize = 150;
 
-#[derive(Debug, Clone)]
-pub struct Document {
-    pub offset: usize,
-    pub name: String,
-    pub embedding: Vec<f32>,
-}
-
 pub fn init(
     fs: Arc<dyn Fs>,
     http_client: Arc<dyn HttpClient>,
@@ -113,13 +108,6 @@ pub fn init(
     .detach();
 }
 
-#[derive(Debug, Clone)]
-pub struct IndexedFile {
-    path: PathBuf,
-    mtime: SystemTime,
-    documents: Vec<Document>,
-}
-
 pub struct VectorStore {
     fs: Arc<dyn Fs>,
     database_url: Arc<PathBuf>,
@@ -182,7 +170,7 @@ impl ProjectState {
 }
 
 #[derive(Clone, Debug)]
-struct PendingFile {
+pub struct PendingFile {
     worktree_db_id: i64,
     relative_path: PathBuf,
     absolute_path: PathBuf,
@@ -201,7 +189,7 @@ pub struct SearchResult {
 enum DbWrite {
     InsertFile {
         worktree_id: i64,
-        indexed_file: IndexedFile,
+        indexed_file: ParsedFile,
     },
     Delete {
         worktree_id: i64,
@@ -267,7 +255,7 @@ impl VectorStore {
         // embed_tx/rx: Embed Batch and Send to Database
         let (embed_batch_tx, embed_batch_rx) =
-            channel::unbounded::<Vec<(i64, IndexedFile, Vec<String>)>>();
+            channel::unbounded::<Vec<(i64, ParsedFile, Vec<String>)>>();
         let mut _embed_batch_task = Vec::new();
         for _ in 0..1 {
             //cx.background().num_cpus() {
@@ -324,13 +312,14 @@ impl VectorStore {
         // batch_tx/rx: Batch Files to Send for Embeddings
         let (batch_files_tx, batch_files_rx) =
-            channel::unbounded::<(i64, IndexedFile, Vec<String>)>();
+            channel::unbounded::<(i64, ParsedFile, Vec<String>)>();
         let _batch_files_task = cx.background().spawn(async move {
             let mut queue_len = 0;
             let mut embeddings_queue = vec![];
             while let Ok((worktree_id, indexed_file, document_spans)) =
                 batch_files_rx.recv().await
             {
+                dbg!("Batching in while loop");
                 queue_len += &document_spans.len();
                 embeddings_queue.push((worktree_id, indexed_file, document_spans));
                 if queue_len >= EMBEDDINGS_BATCH_SIZE {
@@ -339,6 +328,7 @@ impl VectorStore {
                     queue_len = 0;
                 }
             }
+            // TODO: This is never getting called. We still need a way to flush the embeddings queue when it holds fewer spans than a full batch.
             if queue_len > 0 {
                 embed_batch_tx.try_send(embeddings_queue).unwrap();
             }
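On the TODO above: because recv().await blocks until the next file arrives, a batch smaller than EMBEDDINGS_BATCH_SIZE sits in the queue until the channel closes. One common fix is to race each receive against a short idle timer and flush whatever has accumulated when the timer wins. A sketch of that pattern, assuming futures_lite::future::race and smol::Timer, with a simplified item type and a count of items rather than of document spans:

use std::time::Duration;

use futures_lite::future;
use smol::{channel, Timer};

// Stand-in for the real (worktree_id, ParsedFile, document_spans) tuple.
type Item = (i64, String);

const BATCH_SIZE: usize = 150;
const IDLE_FLUSH: Duration = Duration::from_millis(250);

async fn batch_loop(rx: channel::Receiver<Item>, flush: impl Fn(Vec<Item>)) {
    let mut queue: Vec<Item> = Vec::new();
    loop {
        // Race the next message against an idle timer. Both a timeout and
        // a closed channel yield None, which flushes the partial batch.
        let next = future::race(
            async { rx.recv().await.ok() },
            async {
                Timer::after(IDLE_FLUSH).await;
                None
            },
        )
        .await;
        match next {
            Some(item) => {
                queue.push(item);
                if queue.len() >= BATCH_SIZE {
                    flush(std::mem::take(&mut queue));
                }
            }
            None => {
                if !queue.is_empty() {
                    flush(std::mem::take(&mut queue));
                }
                // Stop once the channel is closed and fully drained.
                if rx.is_closed() && rx.is_empty() {
                    break;
                }
            }
        }
    }
}

This trades a bounded delay (here 250 ms) for the guarantee that the tail of a worktree's files gets embedded even when it never fills a full batch.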
@@ -353,21 +343,14 @@ impl VectorStore {
             let parsing_files_rx = parsing_files_rx.clone();
             let batch_files_tx = batch_files_tx.clone();
             _parsing_files_tasks.push(cx.background().spawn(async move {
-                let mut parser = Parser::new();
-                let mut cursor = QueryCursor::new();
+                let parser = Parser::new();
+                let cursor = QueryCursor::new();
+                let mut retriever = CodeContextRetriever { parser, cursor, fs };
 
                 while let Ok(pending_file) = parsing_files_rx.recv().await {
                     log::info!("Parsing File: {:?}", &pending_file.relative_path);
-                    if let Some((indexed_file, document_spans)) = Self::index_file(
-                        &mut cursor,
-                        &mut parser,
-                        &fs,
-                        pending_file.language,
-                        pending_file.relative_path.clone(),
-                        pending_file.absolute_path.clone(),
-                        pending_file.modified_time,
-                    )
-                    .await
-                    .log_err()
+
+                    if let Some((indexed_file, document_spans)) =
+                        retriever.parse_file(pending_file.clone()).await.log_err()
                     {
                         batch_files_tx
                             .try_send((
@@ -397,82 +380,6 @@ impl VectorStore {
         }))
     }
 
-    async fn index_file(
-        cursor: &mut QueryCursor,
-        parser: &mut Parser,
-        fs: &Arc<dyn Fs>,
-        language: Arc<Language>,
-        relative_file_path: PathBuf,
-        absolute_file_path: PathBuf,
-        mtime: SystemTime,
-    ) -> Result<(IndexedFile, Vec<String>)> {
-        let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?;
-        let embedding_config = grammar
-            .embedding_config
-            .as_ref()
-            .ok_or_else(|| anyhow!("no outline query"))?;
-
-        let content = fs.load(&absolute_file_path).await?;
-
-        parser.set_language(grammar.ts_language).unwrap();
-        let tree = parser
-            .parse(&content, None)
-            .ok_or_else(|| anyhow!("parsing failed"))?;
-
-        let mut documents = Vec::new();
-        let mut context_spans = Vec::new();
-        for mat in cursor.matches(
-            &embedding_config.query,
-            tree.root_node(),
-            content.as_bytes(),
-        ) {
-            let mut item_range = None;
-            let mut name_range = None;
-            let mut context_range = None;
-            for capture in mat.captures {
-                if capture.index == embedding_config.item_capture_ix {
-                    item_range = Some(capture.node.byte_range());
-                } else if capture.index == embedding_config.name_capture_ix {
-                    name_range = Some(capture.node.byte_range());
-                }
-                if let Some(context_capture_ix) = embedding_config.context_capture_ix {
-                    if capture.index == context_capture_ix {
-                        context_range = Some(capture.node.byte_range());
-                    }
-                }
-            }
-
-            if let Some((item_range, name_range)) = item_range.zip(name_range) {
-                let mut context_data = String::new();
-                if let Some(context_range) = context_range {
-                    if let Some(context) = content.get(context_range.clone()) {
-                        context_data.push_str(context);
-                    }
-                }
-                if let Some((item, name)) =
-                    content.get(item_range.clone()).zip(content.get(name_range))
-                {
-                    context_spans.push(item.to_string());
-                    documents.push(Document {
-                        name: format!("{} {}", context_data.to_string(), name.to_string()),
-                        offset: item_range.start,
-                        embedding: Vec::new(),
-                    });
-                }
-            }
-        }
-
-        return Ok((
-            IndexedFile {
-                path: relative_file_path,
-                mtime,
-                documents,
-            },
-            context_spans,
-        ));
-    }
-
     fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
         let (tx, rx) = oneshot::channel();
         self.db_update_tx