Rename Document to Span

This commit is contained in:
Antonio Scandurra 2023-09-06 16:48:53 +02:00
parent de0f53b39f
commit ce62173534
5 changed files with 109 additions and 112 deletions

View file

@ -1,4 +1,4 @@
use crate::{embedding::EmbeddingProvider, parsing::Document, JobHandle};
use crate::{embedding::EmbeddingProvider, parsing::Span, JobHandle};
use gpui::executor::Background;
use parking_lot::Mutex;
use smol::channel;
@ -9,7 +9,7 @@ pub struct FileToEmbed {
pub worktree_id: i64,
pub path: Arc<Path>,
pub mtime: SystemTime,
pub documents: Vec<Document>,
pub spans: Vec<Span>,
pub job_handle: JobHandle,
}
@ -19,7 +19,7 @@ impl std::fmt::Debug for FileToEmbed {
.field("worktree_id", &self.worktree_id)
.field("path", &self.path)
.field("mtime", &self.mtime)
.field("document", &self.documents)
.field("spans", &self.spans)
.finish_non_exhaustive()
}
}
@ -29,13 +29,13 @@ impl PartialEq for FileToEmbed {
self.worktree_id == other.worktree_id
&& self.path == other.path
&& self.mtime == other.mtime
&& self.documents == other.documents
&& self.spans == other.spans
}
}
pub struct EmbeddingQueue {
embedding_provider: Arc<dyn EmbeddingProvider>,
pending_batch: Vec<FileToEmbedFragment>,
pending_batch: Vec<FileFragmentToEmbed>,
executor: Arc<Background>,
pending_batch_token_count: usize,
finished_files_tx: channel::Sender<FileToEmbed>,
@ -43,9 +43,9 @@ pub struct EmbeddingQueue {
}
#[derive(Clone)]
pub struct FileToEmbedFragment {
pub struct FileFragmentToEmbed {
file: Arc<Mutex<FileToEmbed>>,
document_range: Range<usize>,
span_range: Range<usize>,
}
impl EmbeddingQueue {
@ -62,41 +62,41 @@ impl EmbeddingQueue {
}
pub fn push(&mut self, file: FileToEmbed) {
if file.documents.is_empty() {
if file.spans.is_empty() {
self.finished_files_tx.try_send(file).unwrap();
return;
}
let file = Arc::new(Mutex::new(file));
self.pending_batch.push(FileToEmbedFragment {
self.pending_batch.push(FileFragmentToEmbed {
file: file.clone(),
document_range: 0..0,
span_range: 0..0,
});
let mut fragment_range = &mut self.pending_batch.last_mut().unwrap().document_range;
let mut fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
let mut saved_tokens = 0;
for (ix, document) in file.lock().documents.iter().enumerate() {
let document_token_count = if document.embedding.is_none() {
document.token_count
for (ix, span) in file.lock().spans.iter().enumerate() {
let span_token_count = if span.embedding.is_none() {
span.token_count
} else {
saved_tokens += document.token_count;
saved_tokens += span.token_count;
0
};
let next_token_count = self.pending_batch_token_count + document_token_count;
let next_token_count = self.pending_batch_token_count + span_token_count;
if next_token_count > self.embedding_provider.max_tokens_per_batch() {
let range_end = fragment_range.end;
self.flush();
self.pending_batch.push(FileToEmbedFragment {
self.pending_batch.push(FileFragmentToEmbed {
file: file.clone(),
document_range: range_end..range_end,
span_range: range_end..range_end,
});
fragment_range = &mut self.pending_batch.last_mut().unwrap().document_range;
fragment_range = &mut self.pending_batch.last_mut().unwrap().span_range;
}
fragment_range.end = ix + 1;
self.pending_batch_token_count += document_token_count;
self.pending_batch_token_count += span_token_count;
}
log::trace!("Saved Tokens: {:?}", saved_tokens);
}
@ -113,20 +113,20 @@ impl EmbeddingQueue {
self.executor.spawn(async move {
let mut spans = Vec::new();
let mut document_count = 0;
let mut span_count = 0;
for fragment in &batch {
let file = fragment.file.lock();
document_count += file.documents[fragment.document_range.clone()].len();
span_count += file.spans[fragment.span_range.clone()].len();
spans.extend(
{
file.documents[fragment.document_range.clone()]
file.spans[fragment.span_range.clone()]
.iter().filter(|d| d.embedding.is_none())
.map(|d| d.content.clone())
}
);
}
log::trace!("Documents Length: {:?}", document_count);
log::trace!("Documents Length: {:?}", span_count);
log::trace!("Span Length: {:?}", spans.clone().len());
// If spans is 0, just send the fragment to the finished files if its the last one.
@ -143,11 +143,11 @@ impl EmbeddingQueue {
Ok(embeddings) => {
let mut embeddings = embeddings.into_iter();
for fragment in batch {
for document in
&mut fragment.file.lock().documents[fragment.document_range.clone()].iter_mut().filter(|d| d.embedding.is_none())
for span in
&mut fragment.file.lock().spans[fragment.span_range.clone()].iter_mut().filter(|d| d.embedding.is_none())
{
if let Some(embedding) = embeddings.next() {
document.embedding = Some(embedding);
span.embedding = Some(embedding);
} else {
//
log::error!("number of embeddings returned different from number of documents");