WIP: started DB creating and naive inserts
This commit is contained in:
parent 80a894b829
commit d4a4db42aa

4 changed files with 161 additions and 7 deletions
Cargo.lock (generated): 19 changes
@@ -1389,6 +1389,15 @@ dependencies = [
  "theme",
 ]
 
+[[package]]
+name = "conv"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ff10625fd0ac447827aa30ea8b861fead473bb60aeb73af6c1c58caf0d1299"
+dependencies = [
+ "custom_derive",
+]
+
 [[package]]
 name = "copilot"
 version = "0.1.0"
@@ -1766,6 +1775,12 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "custom_derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
+
 [[package]]
 name = "cxx"
 version = "1.0.94"
@@ -7882,11 +7897,15 @@ name = "vector_store"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "async-compat",
+ "conv",
  "futures 0.3.28",
  "gpui",
  "language",
  "project",
+ "rand 0.8.5",
  "smol",
+ "sqlx",
  "util",
  "workspace",
 ]

crates/vector_store/Cargo.toml

@@ -17,6 +17,10 @@ util = { path = "../util" }
 anyhow.workspace = true
 futures.workspace = true
 smol.workspace = true
+sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
+async-compat = "0.2.1"
+conv = "0.3.3"
+rand.workspace = true
 
 [dev-dependencies]
 gpui = { path = "../gpui", features = ["test-support"] }
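
A note on the runtime pairing here (an inference, not stated in the diff): sqlx is built with the runtime-tokio-rustls feature while Zed's background executor is smol-based, which is presumably what async-compat is for — every sqlx future in db.rs below is awaited through .compat(), which stands up a Tokio context around each poll. The pattern in isolation:

use async_compat::CompatExt;
use sqlx::SqlitePool;

// Compat adapts a Tokio-flavored future so a smol-style executor can poll it.
async fn connect(url: &str) -> sqlx::Result<SqlitePool> {
    SqlitePool::connect(url).compat().await
}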
crates/vector_store/src/db.rs (new file): 107 additions

@@ -0,0 +1,107 @@
+use anyhow::Result;
+use async_compat::{Compat, CompatExt};
+use conv::ValueFrom;
+use sqlx::{migrate::MigrateDatabase, Pool, Sqlite, SqlitePool};
+use std::time::{Duration, Instant};
+
+use crate::IndexedFile;
+
+// This is saving to a local database store within the user's dev zed path.
+// Where do we want this to sit?
+// Assuming near where the workspace DB sits.
+const VECTOR_DB_URL: &str = "embeddings_db";
+
+pub struct VectorDatabase {}
+
+impl VectorDatabase {
+    pub async fn initialize_database() -> Result<()> {
+        // If the database doesn't exist, create it.
+        if !Sqlite::database_exists(VECTOR_DB_URL)
+            .compat()
+            .await
+            .unwrap_or(false)
+        {
+            Sqlite::create_database(VECTOR_DB_URL).compat().await?;
+        }
+
+        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+        // Initialize vector database tables.
+        // We may be able to skip this, assuming the database is never created
+        // without creating the tables at the same time.
+        sqlx::query(
+            "CREATE TABLE IF NOT EXISTS files (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                path NVARCHAR(100) NOT NULL,
+                sha1 NVARCHAR(40) NOT NULL
+            )",
+        )
+        .execute(&db)
+        .compat()
+        .await?;
+
+        sqlx::query(
+            "CREATE TABLE IF NOT EXISTS documents (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                file_id INTEGER NOT NULL,
+                offset INTEGER NOT NULL,
+                name NVARCHAR(100) NOT NULL,
+                embedding BLOB NOT NULL,
+                FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
+            )",
+        )
+        .execute(&db)
+        .compat()
+        .await?;
+
+        Ok(())
+    }
+
+    pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
+        // Write to the files table, and return the generated id.
+        let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
+
+        let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)")
+            .bind(indexed_file.path.to_str())
+            .bind(indexed_file.sha1)
+            .execute(&db)
+            .compat()
+            .await?;
+
+        let inserted_id = files_insert.last_insert_rowid();
+
+        // Adapted from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again
+        // I imagine there is a better way to serialize to/from a blob.
+        // Each value is written as a fixed-width, 32-character binary string so
+        // that get_values_from_binary can slice it back out 32 bits at a time.
+        fn get_binary_from_values(values: Vec<f32>) -> String {
+            values
+                .iter()
+                .map(|v| format!("{:032b}", v.to_bits()))
+                .collect()
+        }
+
+        fn get_values_from_binary(bin: &str) -> Vec<f32> {
+            (0..bin.len() / 32)
+                .map(|i| {
+                    let start = i * 32;
+                    let end = start + 32;
+                    f32::from_bits(u32::from_str_radix(&bin[start..end], 2).unwrap())
+                })
+                .collect()
+        }
+
+        // Currently inserting at approximately 3400 documents a second.
+        // I imagine we can speed this up with a bulk insert of some kind.
+        for document in indexed_file.documents {
+            sqlx::query(
+                "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
+            )
+            .bind(inserted_id)
+            .bind(document.offset.to_string())
+            .bind(document.name)
+            .bind(get_binary_from_values(document.embedding))
+            .execute(&db)
+            .compat()
+            .await?;
+        }
+
+        Ok(())
+    }
+}
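
The string encoding above stores each f32 as 32 characters, so a 768-dimension embedding becomes roughly 24 KB of text per row. A denser option, sketched here under the assumption that raw bytes go straight into the BLOB column (helper names are hypothetical, not part of this commit):

// Hypothetical alternative: 4 little-endian bytes per f32 instead of 32 characters.
fn embedding_to_blob(values: &[f32]) -> Vec<u8> {
    values.iter().flat_map(|v| v.to_le_bytes()).collect()
}

fn blob_to_embedding(blob: &[u8]) -> Vec<f32> {
    blob.chunks_exact(4)
        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
        .collect()
}

sqlx can bind a Vec<u8> directly to a BLOB parameter, so the two string helpers could be swapped out wholesale.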
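On the bulk-insert comment: a first step that typically helps is wrapping the per-document loop in one transaction, so SQLite commits once per file rather than once per row. A sketch under two assumptions that are not in the commit — the pool is passed in, and Document plus get_binary_from_values are visible at this scope:

use anyhow::Result;
use async_compat::CompatExt;
use sqlx::SqlitePool;

// One BEGIN/COMMIT around all document rows for a file.
async fn insert_documents_batched(
    db: &SqlitePool,
    file_id: i64,
    documents: Vec<Document>,
) -> Result<()> {
    let mut tx = db.begin().compat().await?;
    for document in documents {
        sqlx::query(
            "INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)",
        )
        .bind(file_id)
        .bind(document.offset.to_string())
        .bind(document.name)
        .bind(get_binary_from_values(document.embedding))
        .execute(&mut tx)
        .compat()
        .await?;
    }
    tx.commit().compat().await?;
    Ok(())
}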
crates/vector_store/src/vector_store.rs

@@ -1,9 +1,12 @@
-use anyhow::{anyhow, Result};
+mod db;
+use anyhow::Result;
+use db::VectorDatabase;
 use gpui::{AppContext, Entity, ModelContext, ModelHandle};
 use language::LanguageRegistry;
 use project::{Fs, Project};
+use rand::Rng;
 use smol::channel;
-use std::{path::PathBuf, sync::Arc};
+use std::{path::PathBuf, sync::Arc, time::Instant};
 use util::ResultExt;
 use workspace::WorkspaceCreated;
 
@@ -27,13 +30,15 @@ pub fn init(fs: Arc<dyn Fs>, language_registry: Arc<LanguageRegistry>, cx: &mut
     .detach();
 }
 
+#[derive(Debug, sqlx::FromRow)]
 struct Document {
     offset: usize,
     name: String,
     embedding: Vec<f32>,
 }
 
-struct IndexedFile {
+#[derive(Debug, sqlx::FromRow)]
+pub struct IndexedFile {
     path: PathBuf,
     sha1: String,
     documents: Vec<Document>,
@@ -64,9 +69,24 @@ impl VectorStore {
         language_registry: &Arc<LanguageRegistry>,
         file_path: PathBuf,
     ) -> Result<IndexedFile> {
-        eprintln!("indexing file {file_path:?}");
-        Err(anyhow!("not implemented"))
-        // todo!();
+        // This is creating dummy documents to test the database writes.
+        let mut documents = vec![];
+        let mut rng = rand::thread_rng();
+        let rand_num_of_documents: u8 = rng.gen_range(0..200);
+        for _ in 0..rand_num_of_documents {
+            let doc = Document {
+                offset: 0,
+                name: "test symbol".to_string(),
+                embedding: vec![0.32 as f32; 768],
+            };
+            documents.push(doc);
+        }
+
+        return Ok(IndexedFile {
+            path: file_path,
+            sha1: "asdfasdfasdf".to_string(),
+            documents,
+        });
     }
 
     fn add_project(&mut self, project: ModelHandle<Project>, cx: &mut ModelContext<Self>) {
@@ -100,13 +120,17 @@ impl VectorStore {
                 }
             })
             .detach();
 
         cx.background()
             .spawn(async move {
+                // Initialize Database, creates database and tables if not exists
+                VectorDatabase::initialize_database().await.log_err();
                 while let Ok(indexed_file) = indexed_files_rx.recv().await {
-                    // write document to database
+                    VectorDatabase::insert_file(indexed_file).await.log_err();
                 }
             })
             .detach();
 
         cx.background()
             .scoped(|scope| {
                 for _ in 0..cx.background().num_cpus() {
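
One last observation on db.rs: insert_file opens a new SqlitePool for every file pulled off the channel above. A hypothetical refactor — this signature is not in the commit — would connect once in the writer task and reuse the pool:

use anyhow::Result;
use async_compat::CompatExt;
use smol::channel::Receiver;
use sqlx::SqlitePool;

// Connect once, then hand the pool to each insert; assumes insert_file is
// changed to accept `&SqlitePool` instead of connecting internally.
async fn write_files(indexed_files_rx: Receiver<IndexedFile>) -> Result<()> {
    let db = SqlitePool::connect("embeddings_db").compat().await?;
    while let Ok(indexed_file) = indexed_files_rx.recv().await {
        insert_file(&db, indexed_file).await?; // hypothetical pooled variant
    }
    Ok(())
}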