diff --git a/Cargo.lock b/Cargo.lock index 3bf0a568a2..beb84e04bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1389,6 +1389,15 @@ dependencies = [ "theme", ] +[[package]] +name = "conv" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299" +dependencies = [ + "custom_derive", +] + [[package]] name = "copilot" version = "0.1.0" @@ -1766,6 +1775,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + [[package]] name = "cxx" version = "1.0.94" @@ -7882,11 +7897,15 @@ name = "vector_store" version = "0.1.0" dependencies = [ "anyhow", + "async-compat", + "conv", "futures 0.3.28", "gpui", "language", "project", + "rand 0.8.5", "smol", + "sqlx", "util", "workspace", ] diff --git a/crates/vector_store/Cargo.toml b/crates/vector_store/Cargo.toml index c33a35bcad..74ad23740e 100644 --- a/crates/vector_store/Cargo.toml +++ b/crates/vector_store/Cargo.toml @@ -17,6 +17,10 @@ util = { path = "../util" } anyhow.workspace = true futures.workspace = true smol.workspace = true +sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] } +async-compat = "0.2.1" +conv = "0.3.3" +rand.workspace = true [dev-dependencies] gpui = { path = "../gpui", features = ["test-support"] } diff --git a/crates/vector_store/src/db.rs b/crates/vector_store/src/db.rs new file mode 100644 index 0000000000..dfa85044d6 --- /dev/null +++ b/crates/vector_store/src/db.rs @@ -0,0 +1,107 @@ +use anyhow::Result; +use async_compat::{Compat, CompatExt}; +use conv::ValueFrom; +use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool}; +use std::time::{Duration, Instant}; + +use crate::IndexedFile; + +// This is saving to a local database store within the users dev zed path +// Where do we want this to sit? +// Assuming near where the workspace DB sits. +const VECTOR_DB_URL: &str = "embeddings_db"; + +pub struct VectorDatabase {} + +impl VectorDatabase { + pub async fn initialize_database() -> Result<()> { + // If database doesnt exist create database + if !Sqlite::database_exists(VECTOR_DB_URL) + .compat() + .await + .unwrap_or(false) + { + Sqlite::create_database(VECTOR_DB_URL).compat().await?; + } + + let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?; + + // Initialize Vector Databasing Tables + // We may be able to skip this assuming the database is never created + // without creating the tables at the same time. + sqlx::query( + "CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + path NVARCHAR(100) NOT NULL, + sha1 NVARCHAR(40) NOT NULL + )", + ) + .execute(&db) + .compat() + .await?; + + sqlx::query( + "CREATE TABLE IF NOT EXISTS documents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER NOT NULL, + offset INTEGER NOT NULL, + name NVARCHAR(100) NOT NULL, + embedding BLOB NOT NULL, + FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE + )", + ) + .execute(&db) + .compat() + .await?; + + Ok(()) + } + + pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> { + // Write to files table, and return generated id. + let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?; + + let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)") + .bind(indexed_file.path.to_str()) + .bind(indexed_file.sha1) + .execute(&db) + .compat() + .await?; + + let inserted_id = files_insert.last_insert_rowid(); + + // I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again + // I imagine there is a better way to serialize to/from blob + fn get_binary_from_values(values: Vec) -> String { + let bits: Vec<_> = values.iter().map(|v| v.to_bits().to_string()).collect(); + bits.join(";") + } + + fn get_values_from_binary(bin: &str) -> Vec { + (0..bin.len() / 32) + .map(|i| { + let start = i * 32; + let end = start + 32; + f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap()) + }) + .collect() + } + + // Currently inserting at approximately 3400 documents a second + // I imagine we can speed this up with a bulk insert of some kind. + for document in indexed_file.documents { + sqlx::query( + "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)", + ) + .bind(inserted_id) + .bind(document.offset.to_string()) + .bind(document.name) + .bind(get_binary_from_values(document.embedding)) + .execute(&db) + .compat() + .await?; + } + + Ok(()) + } +} diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 1556df7ebe..93f9fbe06d 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -1,9 +1,12 @@ -use anyhow::{anyhow, Result}; +mod db; +use anyhow::Result; +use db::VectorDatabase; use gpui::{AppContext, Entity, ModelContext, ModelHandle}; use language::LanguageRegistry; use project::{Fs, Project}; +use rand::Rng; use smol::channel; -use std::{path::PathBuf, sync::Arc}; +use std::{path::PathBuf, sync::Arc, time::Instant}; use util::ResultExt; use workspace::WorkspaceCreated; @@ -27,13 +30,15 @@ pub fn init(fs: Arc, language_registry: Arc, cx: &mut .detach(); } +#[derive(Debug, sqlx::FromRow)] struct Document { offset: usize, name: String, embedding: Vec, } -struct IndexedFile { +#[derive(Debug, sqlx::FromRow)] +pub struct IndexedFile { path: PathBuf, sha1: String, documents: Vec, @@ -64,9 +69,24 @@ impl VectorStore { language_registry: &Arc, file_path: PathBuf, ) -> Result { - eprintln!("indexing file {file_path:?}"); - Err(anyhow!("not implemented")) - // todo!(); + // This is creating dummy documents to test the database writes. + let mut documents = vec![]; + let mut rng = rand::thread_rng(); + let rand_num_of_documents: u8 = rng.gen_range(0..200); + for _ in 0..rand_num_of_documents { + let doc = Document { + offset: 0, + name: "test symbol".to_string(), + embedding: vec![0.32 as f32; 768], + }; + documents.push(doc); + } + + return Ok(IndexedFile { + path: file_path, + sha1: "asdfasdfasdf".to_string(), + documents, + }); } fn add_project(&mut self, project: ModelHandle, cx: &mut ModelContext) { @@ -100,13 +120,17 @@ impl VectorStore { } }) .detach(); + cx.background() .spawn(async move { + // Initialize Database, creates database and tables if not exists + VectorDatabase::initialize_database().await.log_err(); while let Ok(indexed_file) = indexed_files_rx.recv().await { - // write document to database + VectorDatabase::insert_file(indexed_file).await.log_err(); } }) .detach(); + cx.background() .scoped(|scope| { for _ in 0..cx.background().num_cpus() {