Start work on exposing semantic search via project search view
Co-authored-by: Kyle <kyle@zed.dev>
parent d83c4ffb07
commit afc4c10ec1
9 changed files with 397 additions and 423 deletions
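
In sum: the semantic-search modal (modal.rs) is deleted, SemanticIndex::global and index_project become public, and per-project job tracking moves from a HashSet of job ids to a postage::watch counter, so a caller can kick off indexing and watch the number of outstanding files drain to zero. A rough sketch of how a project search view might drive this API (hypothetical glue code, not part of this commit; assumes gpui's ModelContext::spawn and postage::prelude::Stream in scope for recv):

    // Hypothetical consumer of the API this commit makes public.
    fn semantic_search(
        project: ModelHandle<Project>,
        query: String,
        cx: &mut AppContext,
    ) -> Option<Task<Result<Vec<SearchResult>>>> {
        // `SemanticIndex::global` is added in this commit.
        let semantic_index = SemanticIndex::global(cx)?;
        Some(semantic_index.update(cx, |index, cx| {
            // `index_project` is now `pub` and returns the file count plus
            // a watch::Receiver counting jobs still in flight.
            let index_task = index.index_project(project.clone(), cx);
            cx.spawn(|index, mut cx| async move {
                let (_file_count, mut outstanding) = index_task.await?;
                // Wait for the outstanding-job counter to drain to zero.
                while outstanding.recv().await.map_or(false, |count| count > 0) {}
                index
                    .update(&mut cx, |index, cx| {
                        index.search_project(project, query, 10, cx)
                    })
                    .await
            })
        }))
    }
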
crates/semantic_index/Cargo.toml

@@ -20,6 +20,7 @@ editor = { path = "../editor" }
 rpc = { path = "../rpc" }
 settings = { path = "../settings" }
 anyhow.workspace = true
+postage.workspace = true
 futures.workspace = true
 smol.workspace = true
 rusqlite = { version = "0.27.0", features = ["blob", "array", "modern_sqlite"] }

crates/semantic_index/src/db.rs

@@ -1,5 +1,5 @@
 use crate::{parsing::Document, SEMANTIC_INDEX_VERSION};
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, Context, Result};
 use project::Fs;
 use rpc::proto::Timestamp;
 use rusqlite::{
@@ -76,14 +76,14 @@ impl VectorDatabase {
         self.db
             .execute(
                 "
-                DROP TABLE semantic_index_config;
-                DROP TABLE worktrees;
-                DROP TABLE files;
-                DROP TABLE documents;
+                DROP TABLE IF EXISTS documents;
+                DROP TABLE IF EXISTS files;
+                DROP TABLE IF EXISTS worktrees;
+                DROP TABLE IF EXISTS semantic_index_config;
                 ",
                 [],
             )
-            .ok();
+            .context("failed to drop tables")?;
 
         // Initialize Vector Databasing Tables
         self.db.execute(
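
Two things change in the reset above: IF EXISTS makes it idempotent when a table was never created (and the drops now run child-first, documents before the tables they reference), and replacing .ok() with .context(...)? stops a failed reset from being silently swallowed. A minimal standalone sketch of the new behavior, outside this codebase (using rusqlite's execute_batch, which unlike execute accepts multiple statements):

    use anyhow::{Context, Result};
    use rusqlite::Connection;

    // Sketch: propagate a failed reset instead of discarding it with `.ok()`.
    fn reset(db: &Connection) -> Result<()> {
        db.execute_batch(
            "
            DROP TABLE IF EXISTS documents;
            DROP TABLE IF EXISTS files;
            DROP TABLE IF EXISTS worktrees;
            DROP TABLE IF EXISTS semantic_index_config;
            ",
        )
        // With `.ok()` the error was discarded and execution continued on a
        // half-reset database; `?` plus context surfaces it to the caller.
        .context("failed to drop tables")?;
        Ok(())
    }
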
crates/semantic_index/src/embedding.rs

@@ -86,6 +86,7 @@ impl OpenAIEmbeddings {
     async fn send_request(&self, api_key: &str, spans: Vec<&str>) -> Result<Response<AsyncBody>> {
         let request = Request::post("https://api.openai.com/v1/embeddings")
             .redirect_policy(isahc::config::RedirectPolicy::Follow)
+            .timeout(Duration::from_secs(4))
             .header("Content-Type", "application/json")
             .header("Authorization", format!("Bearer {}", api_key))
             .body(
@@ -133,7 +134,11 @@ impl EmbeddingProvider for OpenAIEmbeddings {
                     self.executor.timer(delay).await;
                 }
                 StatusCode::BAD_REQUEST => {
-                    log::info!("BAD REQUEST: {:?}", &response.status());
+                    log::info!(
+                        "BAD REQUEST: {:?} {:?}",
+                        &response.status(),
+                        response.body()
+                    );
                     // Don't worry about delaying bad request, as we can assume
                     // we haven't been rate limited yet.
                     for span in spans.iter_mut() {
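
For context on the branch edited above: send_request is retried in a loop that sleeps (self.executor.timer(delay).await) when rate limited, but deliberately not on BAD_REQUEST, which per the comment cannot be a rate limit. The general shape of such a loop, as a detached sketch (hypothetical helper, not this crate's code):

    use std::time::Duration;

    // Hypothetical shape of a retry loop like the one around send_request:
    // back off on rate limiting, fail fast on anything else.
    async fn with_backoff<F, Fut, T>(mut attempt: F, max_retries: usize) -> anyhow::Result<T>
    where
        F: FnMut() -> Fut,
        Fut: std::future::Future<Output = anyhow::Result<(u16, T)>>,
    {
        let mut delay = Duration::from_millis(500);
        for _ in 0..max_retries {
            match attempt().await? {
                (200, body) => return Ok(body),
                (429, _) => {
                    // Rate limited: wait, then retry with a longer delay.
                    smol::Timer::after(delay).await;
                    delay *= 2;
                }
                (status, _) => anyhow::bail!("unexpected status: {}", status),
            }
        }
        anyhow::bail!("retry budget exhausted")
    }
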
crates/semantic_index/src/modal.rs (deleted)

@@ -1,172 +0,0 @@
-use crate::{SearchResult, SemanticIndex};
-use editor::{scroll::autoscroll::Autoscroll, Editor};
-use gpui::{
-    actions, elements::*, AnyElement, AppContext, ModelHandle, MouseState, Task, ViewContext,
-    WeakViewHandle,
-};
-use picker::{Picker, PickerDelegate, PickerEvent};
-use project::{Project, ProjectPath};
-use std::{collections::HashMap, sync::Arc, time::Duration};
-use util::ResultExt;
-use workspace::Workspace;
-
-const MIN_QUERY_LEN: usize = 5;
-const EMBEDDING_DEBOUNCE_INTERVAL: Duration = Duration::from_millis(500);
-
-actions!(semantic_search, [Toggle]);
-
-pub type SemanticSearch = Picker<SemanticSearchDelegate>;
-
-pub struct SemanticSearchDelegate {
-    workspace: WeakViewHandle<Workspace>,
-    project: ModelHandle<Project>,
-    semantic_index: ModelHandle<SemanticIndex>,
-    selected_match_index: usize,
-    matches: Vec<SearchResult>,
-    history: HashMap<String, Vec<SearchResult>>,
-}
-
-impl SemanticSearchDelegate {
-    // This is currently searching on every keystroke,
-    // This is wildly overkill, and has the potential to get expensive
-    // We will need to update this to throttle searching
-    pub fn new(
-        workspace: WeakViewHandle<Workspace>,
-        project: ModelHandle<Project>,
-        semantic_index: ModelHandle<SemanticIndex>,
-    ) -> Self {
-        Self {
-            workspace,
-            project,
-            semantic_index,
-            selected_match_index: 0,
-            matches: vec![],
-            history: HashMap::new(),
-        }
-    }
-}
-
-impl PickerDelegate for SemanticSearchDelegate {
-    fn placeholder_text(&self) -> Arc<str> {
-        "Search repository in natural language...".into()
-    }
-
-    fn confirm(&mut self, cx: &mut ViewContext<SemanticSearch>) {
-        if let Some(search_result) = self.matches.get(self.selected_match_index) {
-            // Open Buffer
-            let search_result = search_result.clone();
-            let buffer = self.project.update(cx, |project, cx| {
-                project.open_buffer(
-                    ProjectPath {
-                        worktree_id: search_result.worktree_id,
-                        path: search_result.file_path.clone().into(),
-                    },
-                    cx,
-                )
-            });
-
-            let workspace = self.workspace.clone();
-            let position = search_result.clone().byte_range.start;
-            cx.spawn(|_, mut cx| async move {
-                let buffer = buffer.await?;
-                workspace.update(&mut cx, |workspace, cx| {
-                    let editor = workspace.open_project_item::<Editor>(buffer, cx);
-                    editor.update(cx, |editor, cx| {
-                        editor.change_selections(Some(Autoscroll::center()), cx, |s| {
-                            s.select_ranges([position..position])
-                        });
-                    });
-                })?;
-                Ok::<_, anyhow::Error>(())
-            })
-            .detach_and_log_err(cx);
-            cx.emit(PickerEvent::Dismiss);
-        }
-    }
-
-    fn dismissed(&mut self, _cx: &mut ViewContext<SemanticSearch>) {}
-
-    fn match_count(&self) -> usize {
-        self.matches.len()
-    }
-
-    fn selected_index(&self) -> usize {
-        self.selected_match_index
-    }
-
-    fn set_selected_index(&mut self, ix: usize, _cx: &mut ViewContext<SemanticSearch>) {
-        self.selected_match_index = ix;
-    }
-
-    fn update_matches(&mut self, query: String, cx: &mut ViewContext<SemanticSearch>) -> Task<()> {
-        log::info!("Searching for {:?}...", query);
-        if query.len() < MIN_QUERY_LEN {
-            log::info!("Query below minimum length");
-            return Task::ready(());
-        }
-
-        let semantic_index = self.semantic_index.clone();
-        let project = self.project.clone();
-        cx.spawn(|this, mut cx| async move {
-            cx.background().timer(EMBEDDING_DEBOUNCE_INTERVAL).await;
-
-            let retrieved_cached = this.update(&mut cx, |this, _| {
-                let delegate = this.delegate_mut();
-                if delegate.history.contains_key(&query) {
-                    let historic_results = delegate.history.get(&query).unwrap().to_owned();
-                    delegate.matches = historic_results.clone();
-                    true
-                } else {
-                    false
-                }
-            });
-
-            if let Some(retrieved) = retrieved_cached.log_err() {
-                if !retrieved {
-                    let task = semantic_index.update(&mut cx, |store, cx| {
-                        store.search_project(project.clone(), query.to_string(), 10, cx)
-                    });
-
-                    if let Some(results) = task.await.log_err() {
-                        log::info!("Not queried previously, searching...");
-                        this.update(&mut cx, |this, _| {
-                            let delegate = this.delegate_mut();
-                            delegate.matches = results.clone();
-                            delegate.history.insert(query, results);
-                        })
-                        .ok();
-                    }
-                } else {
-                    log::info!("Already queried, retrieved directly from cached history");
-                }
-            }
-        })
-    }
-
-    fn render_match(
-        &self,
-        ix: usize,
-        mouse_state: &mut MouseState,
-        selected: bool,
-        cx: &AppContext,
-    ) -> AnyElement<Picker<Self>> {
-        let theme = theme::current(cx);
-        let style = &theme.picker.item;
-        let current_style = style.in_state(selected).style_for(mouse_state);
-
-        let search_result = &self.matches[ix];
-
-        let path = search_result.file_path.to_string_lossy();
-        let name = search_result.name.clone();
-
-        Flex::column()
-            .with_child(Text::new(name, current_style.label.text.clone()).with_soft_wrap(false))
-            .with_child(Label::new(
-                path.to_string(),
-                style.inactive_state().default.label.clone(),
-            ))
-            .contained()
-            .with_style(current_style.container)
-            .into_any()
-    }
-}

crates/semantic_index/src/semantic_index.rs

@@ -1,6 +1,5 @@
 mod db;
 mod embedding;
-mod modal;
 mod parsing;
 mod semantic_index_settings;
 
@@ -12,25 +11,20 @@ use anyhow::{anyhow, Result};
 use db::VectorDatabase;
 use embedding::{EmbeddingProvider, OpenAIEmbeddings};
 use futures::{channel::oneshot, Future};
-use gpui::{
-    AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, ViewContext,
-    WeakModelHandle,
-};
+use gpui::{AppContext, AsyncAppContext, Entity, ModelContext, ModelHandle, Task, WeakModelHandle};
 use language::{Language, LanguageRegistry};
-use modal::{SemanticSearch, SemanticSearchDelegate, Toggle};
 use parking_lot::Mutex;
 use parsing::{CodeContextRetriever, Document, PARSEABLE_ENTIRE_FILE_TYPES};
+use postage::watch;
 use project::{Fs, Project, WorktreeId};
 use smol::channel;
 use std::{
-    collections::{HashMap, HashSet},
+    collections::HashMap,
+    mem,
     ops::Range,
     path::{Path, PathBuf},
-    sync::{
-        atomic::{self, AtomicUsize},
-        Arc, Weak,
-    },
-    time::{Instant, SystemTime},
+    sync::{Arc, Weak},
+    time::SystemTime,
 };
 use util::{
     channel::{ReleaseChannel, RELEASE_CHANNEL, RELEASE_CHANNEL_NAME},
@@ -38,9 +32,8 @@ use util::{
     paths::EMBEDDINGS_DIR,
     ResultExt,
 };
 use workspace::{Workspace, WorkspaceCreated};
 
-const SEMANTIC_INDEX_VERSION: usize = 1;
+const SEMANTIC_INDEX_VERSION: usize = 3;
 const EMBEDDINGS_BATCH_SIZE: usize = 150;
 
 pub fn init(
@@ -55,25 +48,6 @@ pub fn init(
         .join(Path::new(RELEASE_CHANNEL_NAME.as_str()))
         .join("embeddings_db");
 
-    SemanticSearch::init(cx);
-    cx.add_action(
-        |workspace: &mut Workspace, _: &Toggle, cx: &mut ViewContext<Workspace>| {
-            if cx.has_global::<ModelHandle<SemanticIndex>>() {
-                let semantic_index = cx.global::<ModelHandle<SemanticIndex>>().clone();
-                workspace.toggle_modal(cx, |workspace, cx| {
-                    let project = workspace.project().clone();
-                    let workspace = cx.weak_handle();
-                    cx.add_view(|cx| {
-                        SemanticSearch::new(
-                            SemanticSearchDelegate::new(workspace, project, semantic_index),
-                            cx,
-                        )
-                    })
-                });
-            }
-        },
-    );
-
     if *RELEASE_CHANNEL == ReleaseChannel::Stable
         || !settings::get::<SemanticIndexSettings>(cx).enabled
     {
@@ -95,21 +69,6 @@ pub fn init(
 
         cx.update(|cx| {
             cx.set_global(semantic_index.clone());
-            cx.subscribe_global::<WorkspaceCreated, _>({
-                let semantic_index = semantic_index.clone();
-                move |event, cx| {
-                    let workspace = &event.0;
-                    if let Some(workspace) = workspace.upgrade(cx) {
-                        let project = workspace.read(cx).project().clone();
-                        if project.read(cx).is_local() {
-                            semantic_index.update(cx, |store, cx| {
-                                store.index_project(project, cx).detach();
-                            });
-                        }
-                    }
-                }
-            })
-            .detach();
         });
 
         anyhow::Ok(())
@@ -128,20 +87,17 @@ pub struct SemanticIndex {
     _embed_batch_task: Task<()>,
     _batch_files_task: Task<()>,
     _parsing_files_tasks: Vec<Task<()>>,
-    next_job_id: Arc<AtomicUsize>,
     projects: HashMap<WeakModelHandle<Project>, ProjectState>,
 }
 
 struct ProjectState {
     worktree_db_ids: Vec<(WorktreeId, i64)>,
-    outstanding_jobs: Arc<Mutex<HashSet<JobId>>>,
+    outstanding_job_count_rx: watch::Receiver<usize>,
+    outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
 }
 
-type JobId = usize;
-
 struct JobHandle {
-    id: JobId,
-    set: Weak<Mutex<HashSet<JobId>>>,
+    tx: Weak<Mutex<watch::Sender<usize>>>,
 }
 
 impl ProjectState {
@@ -221,6 +177,14 @@ enum EmbeddingJob {
 }
 
 impl SemanticIndex {
+    pub fn global(cx: &AppContext) -> Option<ModelHandle<SemanticIndex>> {
+        if cx.has_global::<ModelHandle<Self>>() {
+            Some(cx.global::<ModelHandle<SemanticIndex>>().clone())
+        } else {
+            None
+        }
+    }
+
     async fn new(
         fs: Arc<dyn Fs>,
         database_url: PathBuf,
@@ -236,184 +200,69 @@ impl SemanticIndex {
         .await?;
 
         Ok(cx.add_model(|cx| {
-            // paths_tx -> embeddings_tx -> db_update_tx
-
-            //db_update_tx/rx: Updating Database
+            // Perform database operations
             let (db_update_tx, db_update_rx) = channel::unbounded();
-            let _db_update_task = cx.background().spawn(async move {
-                while let Ok(job) = db_update_rx.recv().await {
-                    match job {
-                        DbOperation::InsertFile {
-                            worktree_id,
-                            documents,
-                            path,
-                            mtime,
-                            job_handle,
-                        } => {
-                            db.insert_file(worktree_id, path, mtime, documents)
-                                .log_err();
-                            drop(job_handle)
-                        }
-                        DbOperation::Delete { worktree_id, path } => {
-                            db.delete_file(worktree_id, path).log_err();
-                        }
-                        DbOperation::FindOrCreateWorktree { path, sender } => {
-                            let id = db.find_or_create_worktree(&path);
-                            sender.send(id).ok();
-                        }
-                        DbOperation::FileMTimes {
-                            worktree_id: worktree_db_id,
-                            sender,
-                        } => {
-                            let file_mtimes = db.get_file_mtimes(worktree_db_id);
-                            sender.send(file_mtimes).ok();
-                        }
+            let _db_update_task = cx.background().spawn({
+                async move {
+                    while let Ok(job) = db_update_rx.recv().await {
+                        Self::run_db_operation(&db, job)
                     }
                 }
             });
 
-            // embed_tx/rx: Embed Batch and Send to Database
+            // Group documents into batches and send them to the embedding provider.
            let (embed_batch_tx, embed_batch_rx) =
                channel::unbounded::<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>();
            let _embed_batch_task = cx.background().spawn({
                let db_update_tx = db_update_tx.clone();
                let embedding_provider = embedding_provider.clone();
                async move {
-                    while let Ok(mut embeddings_queue) = embed_batch_rx.recv().await {
-                        // Construct Batch
-                        let mut batch_documents = vec![];
-                        for (_, documents, _, _, _) in embeddings_queue.iter() {
-                            batch_documents
-                                .extend(documents.iter().map(|document| document.content.as_str()));
-                        }
-
-                        if let Ok(embeddings) =
-                            embedding_provider.embed_batch(batch_documents).await
-                        {
-                            log::trace!(
-                                "created {} embeddings for {} files",
-                                embeddings.len(),
-                                embeddings_queue.len(),
-                            );
-
-                            let mut i = 0;
-                            let mut j = 0;
-
-                            for embedding in embeddings.iter() {
-                                while embeddings_queue[i].1.len() == j {
-                                    i += 1;
-                                    j = 0;
-                                }
-
-                                embeddings_queue[i].1[j].embedding = embedding.to_owned();
-                                j += 1;
-                            }
-
-                            for (worktree_id, documents, path, mtime, job_handle) in
-                                embeddings_queue.into_iter()
-                            {
-                                for document in documents.iter() {
-                                    // TODO: Update this so it doesn't panic
-                                    assert!(
-                                        document.embedding.len() > 0,
-                                        "Document Embedding Not Complete"
-                                    );
-                                }
-
-                                db_update_tx
-                                    .send(DbOperation::InsertFile {
-                                        worktree_id,
-                                        documents,
-                                        path,
-                                        mtime,
-                                        job_handle,
-                                    })
-                                    .await
-                                    .unwrap();
-                            }
-                        }
+                    while let Ok(embeddings_queue) = embed_batch_rx.recv().await {
+                        Self::compute_embeddings_for_batch(
+                            embeddings_queue,
+                            &embedding_provider,
+                            &db_update_tx,
+                        )
+                        .await;
                     }
                 }
             });
 
-            // batch_tx/rx: Batch Files to Send for Embeddings
+            // Group documents into batches and send them to the embedding provider.
             let (batch_files_tx, batch_files_rx) = channel::unbounded::<EmbeddingJob>();
             let _batch_files_task = cx.background().spawn(async move {
                 let mut queue_len = 0;
                 let mut embeddings_queue = vec![];
 
                 while let Ok(job) = batch_files_rx.recv().await {
-                    let should_flush = match job {
-                        EmbeddingJob::Enqueue {
-                            documents,
-                            worktree_id,
-                            path,
-                            mtime,
-                            job_handle,
-                        } => {
-                            queue_len += &documents.len();
-                            embeddings_queue.push((
-                                worktree_id,
-                                documents,
-                                path,
-                                mtime,
-                                job_handle,
-                            ));
-                            queue_len >= EMBEDDINGS_BATCH_SIZE
-                        }
-                        EmbeddingJob::Flush => true,
-                    };
-
-                    if should_flush {
-                        embed_batch_tx.try_send(embeddings_queue).unwrap();
-                        embeddings_queue = vec![];
-                        queue_len = 0;
-                    }
+                    Self::enqueue_documents_to_embed(
+                        job,
+                        &mut queue_len,
+                        &mut embeddings_queue,
+                        &embed_batch_tx,
+                    );
                 }
             });
 
-            // parsing_files_tx/rx: Parsing Files to Embeddable Documents
+            // Parse files into embeddable documents.
             let (parsing_files_tx, parsing_files_rx) = channel::unbounded::<PendingFile>();
-
             let mut _parsing_files_tasks = Vec::new();
             for _ in 0..cx.background().num_cpus() {
                 let fs = fs.clone();
                 let parsing_files_rx = parsing_files_rx.clone();
                 let batch_files_tx = batch_files_tx.clone();
+                let db_update_tx = db_update_tx.clone();
                 _parsing_files_tasks.push(cx.background().spawn(async move {
                     let mut retriever = CodeContextRetriever::new();
                     while let Ok(pending_file) = parsing_files_rx.recv().await {
-                        if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err()
-                        {
-                            if let Some(documents) = retriever
-                                .parse_file(
-                                    &pending_file.relative_path,
-                                    &content,
-                                    pending_file.language,
-                                )
-                                .log_err()
-                            {
-                                log::trace!(
-                                    "parsed path {:?}: {} documents",
-                                    pending_file.relative_path,
-                                    documents.len()
-                                );
-
-                                batch_files_tx
-                                    .try_send(EmbeddingJob::Enqueue {
-                                        worktree_id: pending_file.worktree_db_id,
-                                        path: pending_file.relative_path,
-                                        mtime: pending_file.modified_time,
-                                        job_handle: pending_file.job_handle,
-                                        documents,
-                                    })
-                                    .unwrap();
-                            }
-                        }
-
-                        if parsing_files_rx.len() == 0 {
-                            batch_files_tx.try_send(EmbeddingJob::Flush).unwrap();
-                        }
+                        Self::parse_file(
+                            &fs,
+                            pending_file,
+                            &mut retriever,
+                            &batch_files_tx,
+                            &parsing_files_rx,
+                            &db_update_tx,
+                        )
+                        .await;
                     }
                 }));
             }
@@ -424,7 +273,6 @@ impl SemanticIndex {
             embedding_provider,
             language_registry,
             db_update_tx,
-            next_job_id: Default::default(),
             parsing_files_tx,
             _db_update_task,
             _embed_batch_task,
@@ -435,6 +283,167 @@ impl SemanticIndex {
         }))
     }
 
+    fn run_db_operation(db: &VectorDatabase, job: DbOperation) {
+        match job {
+            DbOperation::InsertFile {
+                worktree_id,
+                documents,
+                path,
+                mtime,
+                job_handle,
+            } => {
+                db.insert_file(worktree_id, path, mtime, documents)
+                    .log_err();
+                drop(job_handle)
+            }
+            DbOperation::Delete { worktree_id, path } => {
+                db.delete_file(worktree_id, path).log_err();
+            }
+            DbOperation::FindOrCreateWorktree { path, sender } => {
+                let id = db.find_or_create_worktree(&path);
+                sender.send(id).ok();
+            }
+            DbOperation::FileMTimes {
+                worktree_id: worktree_db_id,
+                sender,
+            } => {
+                let file_mtimes = db.get_file_mtimes(worktree_db_id);
+                sender.send(file_mtimes).ok();
+            }
+        }
+    }
+
+    async fn compute_embeddings_for_batch(
+        mut embeddings_queue: Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
+        embedding_provider: &Arc<dyn EmbeddingProvider>,
+        db_update_tx: &channel::Sender<DbOperation>,
+    ) {
+        let mut batch_documents = vec![];
+        for (_, documents, _, _, _) in embeddings_queue.iter() {
+            batch_documents.extend(documents.iter().map(|document| document.content.as_str()));
+        }
+
+        if let Ok(embeddings) = embedding_provider.embed_batch(batch_documents).await {
+            log::trace!(
+                "created {} embeddings for {} files",
+                embeddings.len(),
+                embeddings_queue.len(),
+            );
+
+            let mut i = 0;
+            let mut j = 0;
+
+            for embedding in embeddings.iter() {
+                while embeddings_queue[i].1.len() == j {
+                    i += 1;
+                    j = 0;
+                }
+
+                embeddings_queue[i].1[j].embedding = embedding.to_owned();
+                j += 1;
+            }
+
+            for (worktree_id, documents, path, mtime, job_handle) in embeddings_queue.into_iter() {
+                // for document in documents.iter() {
+                //     // TODO: Update this so it doesn't panic
+                //     assert!(
+                //         document.embedding.len() > 0,
+                //         "Document Embedding Not Complete"
+                //     );
+                // }
+
+                db_update_tx
+                    .send(DbOperation::InsertFile {
+                        worktree_id,
+                        documents,
+                        path,
+                        mtime,
+                        job_handle,
+                    })
+                    .await
+                    .unwrap();
+            }
+        }
+    }
+
+    fn enqueue_documents_to_embed(
+        job: EmbeddingJob,
+        queue_len: &mut usize,
+        embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
+        embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
+    ) {
+        let should_flush = match job {
+            EmbeddingJob::Enqueue {
+                documents,
+                worktree_id,
+                path,
+                mtime,
+                job_handle,
+            } => {
+                *queue_len += &documents.len();
+                embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
+                *queue_len >= EMBEDDINGS_BATCH_SIZE
+            }
+            EmbeddingJob::Flush => true,
+        };
+
+        if should_flush {
+            embed_batch_tx
+                .try_send(mem::take(embeddings_queue))
+                .unwrap();
+            *queue_len = 0;
+        }
+    }
+
+    async fn parse_file(
+        fs: &Arc<dyn Fs>,
+        pending_file: PendingFile,
+        retriever: &mut CodeContextRetriever,
+        batch_files_tx: &channel::Sender<EmbeddingJob>,
+        parsing_files_rx: &channel::Receiver<PendingFile>,
+        db_update_tx: &channel::Sender<DbOperation>,
+    ) {
+        if let Some(content) = fs.load(&pending_file.absolute_path).await.log_err() {
+            if let Some(documents) = retriever
+                .parse_file(&pending_file.relative_path, &content, pending_file.language)
+                .log_err()
+            {
+                log::trace!(
+                    "parsed path {:?}: {} documents",
+                    pending_file.relative_path,
+                    documents.len()
+                );
+
+                if documents.len() == 0 {
+                    db_update_tx
+                        .send(DbOperation::InsertFile {
+                            worktree_id: pending_file.worktree_db_id,
+                            documents,
+                            path: pending_file.relative_path,
+                            mtime: pending_file.modified_time,
+                            job_handle: pending_file.job_handle,
+                        })
+                        .await
+                        .unwrap();
+                } else {
+                    batch_files_tx
+                        .try_send(EmbeddingJob::Enqueue {
+                            worktree_id: pending_file.worktree_db_id,
+                            path: pending_file.relative_path,
+                            mtime: pending_file.modified_time,
+                            job_handle: pending_file.job_handle,
+                            documents,
+                        })
+                        .unwrap();
+                }
+            }
+        }
+
+        if parsing_files_rx.len() == 0 {
+            batch_files_tx.try_send(EmbeddingJob::Flush).unwrap();
+        }
+    }
+
     fn find_or_create_worktree(&self, path: PathBuf) -> impl Future<Output = Result<i64>> {
         let (tx, rx) = oneshot::channel();
         self.db_update_tx
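
The i/j walk in compute_embeddings_for_batch above depends on the provider returning exactly one embedding per input, in order: it re-slices the flat result back onto the per-file document lists, skipping past files that contributed zero documents. The same cursor logic on plain data (a standalone sketch, detached from the real types):

    // `flat` holds one embedding per document, in the order the documents
    // were flattened out of `queue`; redistribute them in place.
    fn redistribute(queue: &mut Vec<(&str, Vec<Vec<f32>>)>, flat: Vec<Vec<f32>>) {
        let mut i = 0; // which file
        let mut j = 0; // which document within that file
        for embedding in flat {
            // Advance past files whose documents are all assigned,
            // including files with zero documents.
            while queue[i].1.len() == j {
                i += 1;
                j = 0;
            }
            queue[i].1[j] = embedding;
            j += 1;
        }
    }

    fn main() {
        let mut queue = vec![
            ("a.rs", vec![vec![]; 2]), // two documents
            ("b.rs", vec![]),          // zero documents
            ("c.rs", vec![vec![]; 1]), // one document
        ];
        let flat = vec![vec![0.1], vec![0.2], vec![0.3]];
        redistribute(&mut queue, flat);
        assert_eq!(queue[0].1, vec![vec![0.1], vec![0.2]]);
        assert_eq!(queue[2].1, vec![vec![0.3]]);
    }
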
@@ -457,11 +466,11 @@ impl SemanticIndex {
         async move { rx.await? }
     }
 
-    fn index_project(
+    pub fn index_project(
         &mut self,
         project: ModelHandle<Project>,
         cx: &mut ModelContext<Self>,
-    ) -> Task<Result<usize>> {
+    ) -> Task<Result<(usize, watch::Receiver<usize>)>> {
         let worktree_scans_complete = project
             .read(cx)
             .worktrees(cx)
@@ -483,7 +492,6 @@ impl SemanticIndex {
         let language_registry = self.language_registry.clone();
         let db_update_tx = self.db_update_tx.clone();
         let parsing_files_tx = self.parsing_files_tx.clone();
-        let next_job_id = self.next_job_id.clone();
 
         cx.spawn(|this, mut cx| async move {
             futures::future::join_all(worktree_scans_complete).await;
@@ -509,8 +517,8 @@ impl SemanticIndex {
                 );
             }
 
-            // let mut pending_files: Vec<(PathBuf, ((i64, PathBuf, Arc<Language>, SystemTime), SystemTime))> = vec![];
-            let outstanding_jobs = Arc::new(Mutex::new(HashSet::new()));
+            let (job_count_tx, job_count_rx) = watch::channel_with(0);
+            let job_count_tx = Arc::new(Mutex::new(job_count_tx));
             this.update(&mut cx, |this, _| {
                 this.projects.insert(
                     project.downgrade(),
@@ -519,7 +527,8 @@ impl SemanticIndex {
                         .iter()
                         .map(|(a, b)| (*a, *b))
                         .collect(),
-                    outstanding_jobs: outstanding_jobs.clone(),
+                    outstanding_job_count_rx: job_count_rx.clone(),
+                    outstanding_job_count_tx: job_count_tx.clone(),
                 },
             );
         });
@@ -527,7 +536,6 @@ impl SemanticIndex {
             cx.background()
                 .spawn(async move {
                     let mut count = 0;
-                    let t0 = Instant::now();
                     for worktree in worktrees.into_iter() {
                         let mut file_mtimes = worktree_file_mtimes.remove(&worktree.id()).unwrap();
                         for file in worktree.files(false, 0) {
@@ -552,14 +560,11 @@ impl SemanticIndex {
                                 .map_or(false, |existing_mtime| existing_mtime == file.mtime);
 
                             if !already_stored {
                                 log::trace!("sending for parsing: {:?}", path_buf);
                                 count += 1;
-                                let job_id = next_job_id.fetch_add(1, atomic::Ordering::SeqCst);
+                                *job_count_tx.lock().borrow_mut() += 1;
                                 let job_handle = JobHandle {
-                                    id: job_id,
-                                    set: Arc::downgrade(&outstanding_jobs),
+                                    tx: Arc::downgrade(&job_count_tx),
                                 };
-                                outstanding_jobs.lock().insert(job_id);
                                 parsing_files_tx
                                     .try_send(PendingFile {
                                         worktree_db_id: db_ids_by_worktree_id[&worktree.id()],
@@ -582,27 +587,22 @@ impl SemanticIndex {
                                     .unwrap();
                             }
                         }
-                        log::trace!(
-                            "parsing worktree completed in {:?}",
-                            t0.elapsed().as_millis()
-                        );
 
-                    Ok(count)
+                    anyhow::Ok((count, job_count_rx))
                 })
                 .await
         })
     }
 
-    pub fn remaining_files_to_index_for_project(
+    pub fn outstanding_job_count_rx(
         &self,
         project: &ModelHandle<Project>,
-    ) -> Option<usize> {
+    ) -> Option<watch::Receiver<usize>> {
         Some(
             self.projects
                 .get(&project.downgrade())?
-                .outstanding_jobs
-                .lock()
-                .len(),
+                .outstanding_job_count_rx
+                .clone(),
         )
     }
 
@@ -678,8 +678,9 @@ impl Entity for SemanticIndex {
 
 impl Drop for JobHandle {
     fn drop(&mut self) {
-        if let Some(set) = self.set.upgrade() {
-            set.lock().remove(&self.id);
+        if let Some(tx) = self.tx.upgrade() {
+            let mut tx = tx.lock();
+            *tx.borrow_mut() -= 1;
         }
     }
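
The Drop change above completes the switch from set-based to counter-based accounting: each file queued for parsing bumps the watch counter and carries a JobHandle, and the count falls when the handle is dropped, wherever in the pipeline that happens. The pattern in isolation (a standalone sketch using the same crates):

    use std::sync::{Arc, Weak};

    use parking_lot::Mutex;
    use postage::{prelude::Stream, watch};

    // Same idea as JobHandle: an RAII guard that decrements a shared
    // watch counter when dropped.
    struct Job {
        tx: Weak<Mutex<watch::Sender<usize>>>,
    }

    impl Drop for Job {
        fn drop(&mut self) {
            if let Some(tx) = self.tx.upgrade() {
                *tx.lock().borrow_mut() -= 1;
            }
        }
    }

    fn main() {
        smol::block_on(async {
            let (tx, mut rx) = watch::channel_with(0usize);
            let tx = Arc::new(Mutex::new(tx));

            // "Enqueue" three jobs: bump the counter and hand out guards.
            let jobs: Vec<Job> = (0..3)
                .map(|_| {
                    *tx.lock().borrow_mut() += 1;
                    Job { tx: Arc::downgrade(&tx) }
                })
                .collect();

            // Finishing a job is just dropping its guard.
            drop(jobs);

            // Observers see the latest count; wait for it to reach zero.
            while let Some(count) = rx.recv().await {
                if count == 0 {
                    break;
                }
            }
        });
    }
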
crates/semantic_index/src/semantic_index_tests.rs

@@ -88,18 +88,13 @@ async fn test_semantic_index(cx: &mut TestAppContext) {
     let worktree_id = project.read_with(cx, |project, cx| {
         project.worktrees(cx).next().unwrap().read(cx).id()
     });
-    let file_count = store
+    let (file_count, outstanding_file_count) = store
         .update(cx, |store, cx| store.index_project(project.clone(), cx))
         .await
         .unwrap();
     assert_eq!(file_count, 3);
     cx.foreground().run_until_parked();
-    store.update(cx, |store, _cx| {
-        assert_eq!(
-            store.remaining_files_to_index_for_project(&project),
-            Some(0)
-        );
-    });
+    assert_eq!(*outstanding_file_count.borrow(), 0);
 
     let search_results = store
         .update(cx, |store, cx| {
@@ -128,19 +123,14 @@ async fn test_semantic_index(cx: &mut TestAppContext) {
     cx.foreground().run_until_parked();
 
     let prev_embedding_count = embedding_provider.embedding_count();
-    let file_count = store
+    let (file_count, outstanding_file_count) = store
         .update(cx, |store, cx| store.index_project(project.clone(), cx))
         .await
         .unwrap();
     assert_eq!(file_count, 1);
 
     cx.foreground().run_until_parked();
-    store.update(cx, |store, _cx| {
-        assert_eq!(
-            store.remaining_files_to_index_for_project(&project),
-            Some(0)
-        );
-    });
+    assert_eq!(*outstanding_file_count.borrow(), 0);
 
     assert_eq!(
         embedding_provider.embedding_count() - prev_embedding_count,