Removed the tokio and sqlx dependencies; added a dummy embeddings provider to save on OpenAI costs when testing

This commit is contained in:
KCaverly 2023-06-23 10:25:12 -04:00
parent dd309070eb
commit c071b271be
5 changed files with 45 additions and 54 deletions

2
Cargo.lock generated
View file

@ -7912,7 +7912,6 @@ name = "vector_store"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-compat",
"async-trait", "async-trait",
"futures 0.3.28", "futures 0.3.28",
"gpui", "gpui",
@ -7925,7 +7924,6 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"smol", "smol",
"sqlx",
"tree-sitter", "tree-sitter",
"util", "util",
"workspace", "workspace",

View file

@ -17,8 +17,6 @@ util = { path = "../util" }
anyhow.workspace = true anyhow.workspace = true
futures.workspace = true futures.workspace = true
smol.workspace = true smol.workspace = true
sqlx = { version = "0.6", features = ["sqlite","runtime-tokio-rustls"] }
async-compat = "0.2.1"
rusqlite = "0.27.0" rusqlite = "0.27.0"
isahc.workspace = true isahc.workspace = true
log.workspace = true log.workspace = true

View file

@ -1,6 +1,5 @@
use anyhow::Result; use anyhow::Result;
use async_compat::{Compat, CompatExt}; use rusqlite::params;
use sqlx::{migrate::MigrateDatabase, Sqlite, SqlitePool};
use crate::IndexedFile; use crate::IndexedFile;
@ -13,32 +12,20 @@ pub struct VectorDatabase {}
impl VectorDatabase { impl VectorDatabase {
pub async fn initialize_database() -> Result<()> { pub async fn initialize_database() -> Result<()> {
// If database doesn't exist create database // This will create the database if it doesn't exist
if !Sqlite::database_exists(VECTOR_DB_URL) let db = rusqlite::Connection::open(VECTOR_DB_URL)?;
.compat()
.await
.unwrap_or(false)
{
Sqlite::create_database(VECTOR_DB_URL).compat().await?;
}
let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?;
// Initialize Vector Databasing Tables // Initialize Vector Databasing Tables
// We may be able to skip this assuming the database is never created db.execute(
// without creating the tables at the same time.
sqlx::query(
"CREATE TABLE IF NOT EXISTS files ( "CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
path NVARCHAR(100) NOT NULL, path NVARCHAR(100) NOT NULL,
sha1 NVARCHAR(40) NOT NULL sha1 NVARCHAR(40) NOT NULL
)", )",
) [],
.execute(&db) )?;
.compat()
.await?;
sqlx::query( db.execute(
"CREATE TABLE IF NOT EXISTS documents ( "CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL, file_id INTEGER NOT NULL,
@ -47,26 +34,22 @@ impl VectorDatabase {
embedding BLOB NOT NULL, embedding BLOB NOT NULL,
FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE FOREIGN KEY(file_id) REFERENCES files(id) ON DELETE CASCADE
)", )",
) [],
.execute(&db) )?;
.compat()
.await?;
Ok(()) Ok(())
} }
pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> { pub async fn insert_file(indexed_file: IndexedFile) -> Result<()> {
// Write to files table, and return generated id. // Write to files table, and return generated id.
let db = SqlitePool::connect(VECTOR_DB_URL).compat().await?; let db = rusqlite::Connection::open(VECTOR_DB_URL)?;
let files_insert = sqlx::query("INSERT INTO files (path, sha1) VALUES ($1, $2)") let files_insert = db.execute(
.bind(indexed_file.path.to_str()) "INSERT INTO files (path, sha1) VALUES (?1, ?2)",
.bind(indexed_file.sha1) params![indexed_file.path.to_str(), indexed_file.sha1],
.execute(&db) )?;
.compat()
.await?;
let inserted_id = files_insert.last_insert_rowid(); let inserted_id = db.last_insert_rowid();
// I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again // I stole this from https://stackoverflow.com/questions/71829931/how-do-i-convert-a-negative-f32-value-to-binary-string-and-back-again
// I imagine there is a better way to serialize to/from blob // I imagine there is a better way to serialize to/from blob
@ -88,16 +71,15 @@ impl VectorDatabase {
// Currently inserting at approximately 3400 documents a second // Currently inserting at approximately 3400 documents a second
// I imagine we can speed this up with a bulk insert of some kind. // I imagine we can speed this up with a bulk insert of some kind.
for document in indexed_file.documents { for document in indexed_file.documents {
sqlx::query( db.execute(
"INSERT INTO documents (file_id, offset, name, embedding) VALUES ($1, $2, $3, $4)", "INSERT INTO documents (file_id, offset, name, embedding) VALUES (?1, ?2, ?3, ?4)",
) params![
.bind(inserted_id) inserted_id,
.bind(document.offset.to_string()) document.offset.to_string(),
.bind(document.name) document.name,
.bind(get_binary_from_values(document.embedding)) get_binary_from_values(document.embedding)
.execute(&db) ],
.compat() )?;
.await?;
} }
Ok(()) Ok(())

View file

@ -47,6 +47,18 @@ pub trait EmbeddingProvider: Sync {
async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>>; async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>>;
} }
/// Embedding provider that fabricates constant vectors instead of calling a
/// remote service, so that testing does not incur OpenAI costs.
pub struct DummyEmbeddings {}

#[async_trait]
impl EmbeddingProvider for DummyEmbeddings {
    /// Produces one placeholder embedding per entry in `spans`.
    ///
    /// The text of each span is never inspected; only the number of spans
    /// determines the shape of the result. This implementation always
    /// returns `Ok`.
    async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {
        // 1024 is the OpenAI embeddings size for ada models, the model we
        // will likely be starting with.
        // NOTE(review): confirm this dimension against the model actually
        // used in production before relying on it for storage sizing.
        let placeholder: Vec<f32> = vec![0.32_f32; 1024];

        // Every span receives its own copy of the same placeholder vector.
        let batch: Vec<Vec<f32>> = std::iter::repeat(placeholder)
            .take(spans.len())
            .collect();

        Ok(batch)
    }
}
#[async_trait] #[async_trait]
impl EmbeddingProvider for OpenAIEmbeddings { impl EmbeddingProvider for OpenAIEmbeddings {
async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> { async fn embed_batch(&self, spans: Vec<&str>) -> Result<Vec<Vec<f32>>> {

View file

@ -3,7 +3,7 @@ mod embedding;
use anyhow::{anyhow, Result}; use anyhow::{anyhow, Result};
use db::VectorDatabase; use db::VectorDatabase;
use embedding::{EmbeddingProvider, OpenAIEmbeddings}; use embedding::{DummyEmbeddings, EmbeddingProvider, OpenAIEmbeddings};
use gpui::{AppContext, Entity, ModelContext, ModelHandle}; use gpui::{AppContext, Entity, ModelContext, ModelHandle};
use language::LanguageRegistry; use language::LanguageRegistry;
use project::{Fs, Project}; use project::{Fs, Project};
@ -38,14 +38,14 @@ pub fn init(
.detach(); .detach();
} }
#[derive(Debug, sqlx::FromRow)] #[derive(Debug)]
struct Document { struct Document {
offset: usize, offset: usize,
name: String, name: String,
embedding: Vec<f32>, embedding: Vec<f32>,
} }
#[derive(Debug, sqlx::FromRow)] #[derive(Debug)]
pub struct IndexedFile { pub struct IndexedFile {
path: PathBuf, path: PathBuf,
sha1: String, sha1: String,
@ -188,7 +188,8 @@ impl VectorStore {
}) })
.detach(); .detach();
let provider = OpenAIEmbeddings { client }; // let provider = OpenAIEmbeddings { client };
let provider = DummyEmbeddings {};
let t0 = Instant::now(); let t0 = Instant::now();