/auto (#16696)
Add `/auto` behind a feature flag that's disabled for now, even for staff.

We've decided on a different design for context inference, but there are parts of /auto that will be useful for that, so we want them in the code base even if they're unused for now.

Release Notes:

- N/A

---------

Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Marshall Bowers <elliott.codes@gmail.com>
This commit is contained in:
parent 93a3e8bc94
commit 91ffa02e2c

42 changed files with 2776 additions and 1054 deletions
523  crates/semantic_index/src/project_index.rs  Normal file

@@ -0,0 +1,523 @@
use crate::{
    embedding::{EmbeddingProvider, TextToEmbed},
    summary_index::FileSummary,
    worktree_index::{WorktreeIndex, WorktreeIndexHandle},
};
use anyhow::{anyhow, Context, Result};
use collections::HashMap;
use fs::Fs;
use futures::{stream::StreamExt, FutureExt};
use gpui::{
    AppContext, Entity, EntityId, EventEmitter, Model, ModelContext, Subscription, Task, WeakModel,
};
use language::LanguageRegistry;
use log;
use project::{Project, Worktree, WorktreeId};
use serde::{Deserialize, Serialize};
use smol::channel;
use std::{cmp::Ordering, future::Future, num::NonZeroUsize, ops::Range, path::Path, sync::Arc};
use util::ResultExt;

#[derive(Debug)]
pub struct SearchResult {
    pub worktree: Model<Worktree>,
    pub path: Arc<Path>,
    pub range: Range<usize>,
    pub score: f32,
}

pub struct WorktreeSearchResult {
    pub worktree_id: WorktreeId,
    pub path: Arc<Path>,
    pub range: Range<usize>,
    pub score: f32,
}

#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum Status {
    Idle,
    Loading,
    Scanning { remaining_count: NonZeroUsize },
}

pub struct ProjectIndex {
    db_connection: heed::Env,
    project: WeakModel<Project>,
    worktree_indices: HashMap<EntityId, WorktreeIndexHandle>,
    language_registry: Arc<LanguageRegistry>,
    fs: Arc<dyn Fs>,
    last_status: Status,
    status_tx: channel::Sender<()>,
    embedding_provider: Arc<dyn EmbeddingProvider>,
    _maintain_status: Task<()>,
    _subscription: Subscription,
}
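// `ProjectIndex` owns one `WorktreeIndexHandle` per visible local worktree.
// Each handle starts out `Loading` (holding a shared future, so concurrent
// callers can all await the same load) and is swapped to `Loaded` once the
// underlying `WorktreeIndex` is ready. Indexing progress is reported to
// subscribers via the `Status` events emitted below.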
impl ProjectIndex {
    pub fn new(
        project: Model<Project>,
        db_connection: heed::Env,
        embedding_provider: Arc<dyn EmbeddingProvider>,
        cx: &mut ModelContext<Self>,
    ) -> Self {
        let language_registry = project.read(cx).languages().clone();
        let fs = project.read(cx).fs().clone();
        let (status_tx, mut status_rx) = channel::unbounded();
        let mut this = ProjectIndex {
            db_connection,
            project: project.downgrade(),
            worktree_indices: HashMap::default(),
            language_registry,
            fs,
            status_tx,
            last_status: Status::Idle,
            embedding_provider,
            _subscription: cx.subscribe(&project, Self::handle_project_event),
            _maintain_status: cx.spawn(|this, mut cx| async move {
                while status_rx.next().await.is_some() {
                    if this
                        .update(&mut cx, |this, cx| this.update_status(cx))
                        .is_err()
                    {
                        break;
                    }
                }
            }),
        };
        this.update_worktree_indices(cx);
        this
    }
    pub fn status(&self) -> Status {
        self.last_status
    }

    pub fn project(&self) -> WeakModel<Project> {
        self.project.clone()
    }

    pub fn fs(&self) -> Arc<dyn Fs> {
        self.fs.clone()
    }

    fn handle_project_event(
        &mut self,
        _: Model<Project>,
        event: &project::Event,
        cx: &mut ModelContext<Self>,
    ) {
        match event {
            project::Event::WorktreeAdded | project::Event::WorktreeRemoved(_) => {
                self.update_worktree_indices(cx);
            }
            _ => {}
        }
    }
    fn update_worktree_indices(&mut self, cx: &mut ModelContext<Self>) {
        let Some(project) = self.project.upgrade() else {
            return;
        };

        let worktrees = project
            .read(cx)
            .visible_worktrees(cx)
            .filter_map(|worktree| {
                if worktree.read(cx).is_local() {
                    Some((worktree.entity_id(), worktree))
                } else {
                    None
                }
            })
            .collect::<HashMap<_, _>>();

        // Drop indices for worktrees that are no longer part of the project.
        self.worktree_indices
            .retain(|worktree_id, _| worktrees.contains_key(worktree_id));
        for (worktree_id, worktree) in worktrees {
            self.worktree_indices.entry(worktree_id).or_insert_with(|| {
                let worktree_index = WorktreeIndex::load(
                    worktree.clone(),
                    self.db_connection.clone(),
                    self.language_registry.clone(),
                    self.fs.clone(),
                    self.status_tx.clone(),
                    self.embedding_provider.clone(),
                    cx,
                );

                let load_worktree = cx.spawn(|this, mut cx| async move {
                    let result = match worktree_index.await {
                        Ok(worktree_index) => {
                            // Swap the handle from `Loading` to `Loaded` now
                            // that the index is available.
                            this.update(&mut cx, |this, _| {
                                this.worktree_indices.insert(
                                    worktree_id,
                                    WorktreeIndexHandle::Loaded {
                                        index: worktree_index.clone(),
                                    },
                                );
                            })?;
                            Ok(worktree_index)
                        }
                        Err(error) => {
                            this.update(&mut cx, |this, _cx| {
                                this.worktree_indices.remove(&worktree_id)
                            })?;
                            Err(Arc::new(error))
                        }
                    };

                    this.update(&mut cx, |this, cx| this.update_status(cx))?;

                    result
                });

                WorktreeIndexHandle::Loading {
                    index: load_worktree.shared(),
                }
            });
        }

        self.update_status(cx);
    }
    fn update_status(&mut self, cx: &mut ModelContext<Self>) {
        let mut indexing_count = 0;
        let mut any_loading = false;

        for index in self.worktree_indices.values_mut() {
            match index {
                WorktreeIndexHandle::Loading { .. } => {
                    any_loading = true;
                    break;
                }
                WorktreeIndexHandle::Loaded { index, .. } => {
                    indexing_count += index.read(cx).entry_ids_being_indexed().len();
                }
            }
        }

        // `Loading` takes precedence; once every index is loaded, report how
        // many entries are still being indexed.
        let status = if any_loading {
            Status::Loading
        } else if let Some(remaining_count) = NonZeroUsize::new(indexing_count) {
            Status::Scanning { remaining_count }
        } else {
            Status::Idle
        };

        // Only emit an event when the status actually changes.
        if status != self.last_status {
            self.last_status = status;
            cx.emit(status);
        }
    }
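    // Other parts of the app can observe these transitions by subscribing to
    // the `Status` events this model emits (see the `EventEmitter<Status>`
    // impl at the bottom of this file). A minimal, hypothetical subscriber
    // sketch (the subscribing model and variable names are assumed, not part
    // of this change):
    //
    //     let _subscription = cx.subscribe(
    //         &project_index,
    //         |_this, _index, status: &Status, _cx| {
    //             log::info!("project index status: {:?}", status);
    //         },
    //     );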
    pub fn search(
        &self,
        query: String,
        limit: usize,
        cx: &AppContext,
    ) -> Task<Result<Vec<SearchResult>>> {
        let (chunks_tx, chunks_rx) = channel::bounded(1024);
        let mut worktree_scan_tasks = Vec::new();
        for worktree_index in self.worktree_indices.values() {
            let worktree_index = worktree_index.clone();
            let chunks_tx = chunks_tx.clone();
            worktree_scan_tasks.push(cx.spawn(|cx| async move {
                let index = match worktree_index {
                    WorktreeIndexHandle::Loading { index } => {
                        index.clone().await.map_err(|error| anyhow!(error))?
                    }
                    WorktreeIndexHandle::Loaded { index } => index.clone(),
                };

                index
                    .read_with(&cx, |index, cx| {
                        let worktree_id = index.worktree().read(cx).id();
                        let db_connection = index.db_connection().clone();
                        let db = *index.embedding_index().db();
                        cx.background_executor().spawn(async move {
                            let txn = db_connection
                                .read_txn()
                                .context("failed to create read transaction")?;
                            let db_entries = db.iter(&txn).context("failed to iterate database")?;
                            for db_entry in db_entries {
                                let (_key, db_embedded_file) = db_entry?;
                                for chunk in db_embedded_file.chunks {
                                    chunks_tx
                                        .send((worktree_id, db_embedded_file.path.clone(), chunk))
                                        .await?;
                                }
                            }
                            anyhow::Ok(())
                        })
                    })?
                    .await
            }));
        }
        // Drop the original sender so the channel closes once every scan task
        // has finished sending, which lets the workers below terminate.
        drop(chunks_tx);

        let project = self.project.clone();
        let embedding_provider = self.embedding_provider.clone();
        cx.spawn(|cx| async move {
            #[cfg(debug_assertions)]
            let embedding_query_start = std::time::Instant::now();
            log::info!("Searching for {query}");

            let query_embeddings = embedding_provider
                .embed(&[TextToEmbed::new(&query)])
                .await?;
            let query_embedding = query_embeddings
                .into_iter()
                .next()
                .ok_or_else(|| anyhow!("no embedding for query"))?;

            let mut results_by_worker = Vec::new();
            for _ in 0..cx.background_executor().num_cpus() {
                results_by_worker.push(Vec::<WorktreeSearchResult>::new());
            }

            #[cfg(debug_assertions)]
            let search_start = std::time::Instant::now();

            cx.background_executor()
                .scoped(|cx| {
                    for results in results_by_worker.iter_mut() {
                        cx.spawn(async {
                            while let Ok((worktree_id, path, chunk)) = chunks_rx.recv().await {
                                let score = chunk.embedding.similarity(&query_embedding);
                                // Keep each worker's list sorted by descending
                                // score, truncated to the top `limit`.
                                let ix = match results.binary_search_by(|probe| {
                                    score.partial_cmp(&probe.score).unwrap_or(Ordering::Equal)
                                }) {
                                    Ok(ix) | Err(ix) => ix,
                                };
                                results.insert(
                                    ix,
                                    WorktreeSearchResult {
                                        worktree_id,
                                        path: path.clone(),
                                        range: chunk.chunk.range.clone(),
                                        score,
                                    },
                                );
                                results.truncate(limit);
                            }
                        });
                    }
                })
                .await;

            for scan_task in futures::future::join_all(worktree_scan_tasks).await {
                scan_task.log_err();
            }

            project.read_with(&cx, |project, cx| {
                // Merge the per-worker lists, resolve worktree ids back to live
                // worktree handles, and keep the global top `limit` by score.
                let mut search_results = Vec::with_capacity(results_by_worker.len() * limit);
                for worker_results in results_by_worker {
                    search_results.extend(worker_results.into_iter().filter_map(|result| {
                        Some(SearchResult {
                            worktree: project.worktree_for_id(result.worktree_id, cx)?,
                            path: result.path,
                            range: result.range,
                            score: result.score,
                        })
                    }));
                }
                search_results.sort_unstable_by(|a, b| {
                    b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal)
                });
                search_results.truncate(limit);

                #[cfg(debug_assertions)]
                {
                    let search_elapsed = search_start.elapsed();
                    log::debug!(
                        "searched {} entries in {:?}",
                        search_results.len(),
                        search_elapsed
                    );
                    let embedding_query_elapsed = embedding_query_start.elapsed();
                    log::debug!("embedding query took {:?}", embedding_query_elapsed);
                }

                search_results
            })
        })
    }
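    // A minimal, hypothetical caller sketch for `search` (the `project_index`
    // handle and the surrounding async context are assumed, not part of this
    // change):
    //
    //     let search = project_index.read(cx).search("vector database".into(), 8, cx);
    //     cx.spawn(|_cx| async move {
    //         for hit in search.await? {
    //             println!("{:?} ({:.3})", hit.path, hit.score);
    //         }
    //         anyhow::Ok(())
    //     })
    //     .detach();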
    #[cfg(test)]
    pub fn path_count(&self, cx: &AppContext) -> Result<u64> {
        let mut result = 0;
        for worktree_index in self.worktree_indices.values() {
            if let WorktreeIndexHandle::Loaded { index, .. } = worktree_index {
                result += index.read(cx).path_count()?;
            }
        }
        Ok(result)
    }
    pub(crate) fn worktree_index(
        &self,
        worktree_id: WorktreeId,
        cx: &AppContext,
    ) -> Option<Model<WorktreeIndex>> {
        for index in self.worktree_indices.values() {
            if let WorktreeIndexHandle::Loaded { index, .. } = index {
                if index.read(cx).worktree().read(cx).id() == worktree_id {
                    return Some(index.clone());
                }
            }
        }
        None
    }

    pub(crate) fn worktree_indices(&self, cx: &AppContext) -> Vec<Model<WorktreeIndex>> {
        let mut result = self
            .worktree_indices
            .values()
            .filter_map(|index| {
                if let WorktreeIndexHandle::Loaded { index, .. } = index {
                    Some(index.clone())
                } else {
                    None
                }
            })
            .collect::<Vec<_>>();
        result.sort_by_key(|index| index.read(cx).worktree().read(cx).id());
        result
    }
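    // `all_summaries` follows the same fan-out/fan-in shape as `search` above:
    // one scan task per worktree feeds a bounded channel, and one collector
    // task per CPU drains it; unlike `search`, no scoring or ranking is needed.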
    pub fn all_summaries(&self, cx: &AppContext) -> Task<Result<Vec<FileSummary>>> {
        let (summaries_tx, summaries_rx) = channel::bounded(1024);
        let mut worktree_scan_tasks = Vec::new();
        for worktree_index in self.worktree_indices.values() {
            let worktree_index = worktree_index.clone();
            let summaries_tx: channel::Sender<(String, String)> = summaries_tx.clone();
            worktree_scan_tasks.push(cx.spawn(|cx| async move {
                let index = match worktree_index {
                    WorktreeIndexHandle::Loading { index } => {
                        index.clone().await.map_err(|error| anyhow!(error))?
                    }
                    WorktreeIndexHandle::Loaded { index } => index.clone(),
                };

                index
                    .read_with(&cx, |index, cx| {
                        let db_connection = index.db_connection().clone();
                        let summary_index = index.summary_index();
                        let file_digest_db = summary_index.file_digest_db();
                        let summary_db = summary_index.summary_db();

                        cx.background_executor().spawn(async move {
                            let txn = db_connection
                                .read_txn()
                                .context("failed to create db read transaction")?;
                            let db_entries = file_digest_db
                                .iter(&txn)
                                .context("failed to iterate database")?;
                            for db_entry in db_entries {
                                let (file_path, db_file) = db_entry?;

                                match summary_db.get(&txn, &db_file.digest) {
                                    Ok(opt_summary) => {
                                        // Currently, we only use summaries we already have. If the
                                        // file hasn't been summarized yet, we skip it and don't
                                        // include it in the inferred context. If we want to do
                                        // just-in-time summarization, this would be the place to do it!
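                                        //
                                        // A hypothetical just-in-time version might look roughly
                                        // like the following (the `summarize_file` helper and the
                                        // write-transaction plumbing are assumed, not part of this
                                        // change):
                                        //
                                        //     let summary = summarize_file(&file_path).await?;
                                        //     summary_db.put(&mut wtxn, &db_file.digest, &summary)?;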
                                        if let Some(summary) = opt_summary {
                                            summaries_tx
                                                .send((file_path.to_string(), summary.to_string()))
                                                .await?;
                                        } else {
                                            log::warn!("No summary found for {:?}", &db_file);
                                        }
                                    }
                                    Err(err) => {
                                        log::error!(
                                            "Error reading from summary database: {:?}",
                                            err
                                        );
                                    }
                                }
                            }
                            anyhow::Ok(())
                        })
                    })?
                    .await
            }));
        }
        drop(summaries_tx);

        let project = self.project.clone();
        cx.spawn(|cx| async move {
            let mut results_by_worker = Vec::new();
            for _ in 0..cx.background_executor().num_cpus() {
                results_by_worker.push(Vec::<FileSummary>::new());
            }

            cx.background_executor()
                .scoped(|cx| {
                    for results in results_by_worker.iter_mut() {
                        cx.spawn(async {
                            while let Ok((filename, summary)) = summaries_rx.recv().await {
                                results.push(FileSummary { filename, summary });
                            }
                        });
                    }
                })
                .await;

            for scan_task in futures::future::join_all(worktree_scan_tasks).await {
                scan_task.log_err();
            }

            project.read_with(&cx, |_project, _cx| {
                results_by_worker.into_iter().flatten().collect()
            })
        })
    }
    /// Empty out the backlogs of all the worktrees in the project
    pub fn flush_summary_backlogs(&self, cx: &AppContext) -> impl Future<Output = ()> {
        let flush_start = std::time::Instant::now();

        futures::future::join_all(self.worktree_indices.values().map(|worktree_index| {
            let worktree_index = worktree_index.clone();

            cx.spawn(|cx| async move {
                let index = match worktree_index {
                    WorktreeIndexHandle::Loading { index } => {
                        index.clone().await.map_err(|error| anyhow!(error))?
                    }
                    WorktreeIndexHandle::Loaded { index } => index.clone(),
                };
                let worktree_abs_path =
                    cx.update(|cx| index.read(cx).worktree().read(cx).abs_path())?;

                index
                    .read_with(&cx, |index, cx| {
                        cx.background_executor()
                            .spawn(index.summary_index().flush_backlog(worktree_abs_path, cx))
                    })?
                    .await
            })
        }))
        .map(move |results| {
            // Log any errors, but don't block the user. These summaries are
            // supposed to improve quality by providing extra context, but they
            // aren't hard requirements!
            for result in results {
                if let Err(err) = result {
                    log::error!("Error flushing summary backlog: {:?}", err);
                }
            }

            log::info!("Summary backlog flushed in {:?}", flush_start.elapsed());
        })
    }
    pub fn remaining_summaries(&self, cx: &mut ModelContext<Self>) -> usize {
        self.worktree_indices(cx)
            .iter()
            .map(|index| index.read(cx).summary_index().backlog_len())
            .sum()
    }
}

impl EventEmitter<Status> for ProjectIndex {}
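One plausible way to combine the summary methods above: flush the backlogs so pending summaries get produced, then collect everything for context inference. A minimal sketch, assuming a `project_index: Model<ProjectIndex>` handle and an async gpui context (the names are illustrative, not part of this diff):

    cx.spawn(|mut cx| async move {
        let flush = project_index.update(&mut cx, |index, cx| index.flush_summary_backlogs(cx))?;
        flush.await;
        let summaries = project_index
            .update(&mut cx, |index, cx| index.all_summaries(cx))?
            .await?;
        log::info!("{} file summaries available for context inference", summaries.len());
        anyhow::Ok(())
    })
    .detach();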