From 66c967da8837b572b6f07c1965d020a0779f439f Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 12 Sep 2023 16:25:31 -0400 Subject: [PATCH 01/14] start work on eval script for semantic_index --- Cargo.lock | 19 ++++ crates/semantic_index/Cargo.toml | 4 + crates/semantic_index/eval/tree-sitter.json | 10 +++ crates/semantic_index/examples/eval.rs | 97 +++++++++++++++++++++ script/evaluate_semantic_index | 3 + 5 files changed, 133 insertions(+) create mode 100644 crates/semantic_index/eval/tree-sitter.json create mode 100644 crates/semantic_index/examples/eval.rs create mode 100755 script/evaluate_semantic_index diff --git a/Cargo.lock b/Cargo.lock index 775e1d2b8e..a66391ed07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3061,6 +3061,8 @@ dependencies = [ "libc", "libgit2-sys", "log", + "openssl-probe", + "openssl-sys", "url", ] @@ -4015,7 +4017,9 @@ checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" dependencies = [ "cc", "libc", + "libssh2-sys", "libz-sys", + "openssl-sys", "pkg-config", ] @@ -4056,6 +4060,20 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libssh2-sys" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b094a36eb4b8b8c8a7b4b8ae43b2944502be3e59cd87687595cf6b0a71b3f4ca" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + [[package]] name = "libz-sys" version = "1.1.12" @@ -6731,6 +6749,7 @@ dependencies = [ "editor", "env_logger 0.9.3", "futures 0.3.28", + "git2", "globset", "gpui", "isahc", diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 72a36efd50..b5537dd2fa 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -50,6 +50,7 @@ project = { path = "../project", features = ["test-support"] } rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = 
["test-support"]} +git2 = { version = "0.15"} pretty_assertions.workspace = true rand.workspace = true @@ -67,3 +68,6 @@ tree-sitter-elixir.workspace = true tree-sitter-lua.workspace = true tree-sitter-ruby.workspace = true tree-sitter-php.workspace = true + +[[example]] +name = "eval" diff --git a/crates/semantic_index/eval/tree-sitter.json b/crates/semantic_index/eval/tree-sitter.json new file mode 100644 index 0000000000..a469543cf4 --- /dev/null +++ b/crates/semantic_index/eval/tree-sitter.json @@ -0,0 +1,10 @@ +{ + "repo": "https://github.com/tree-sitter/tree-sitter.git", + "commit": "46af27796a76c72d8466627d499f2bca4af958ee", + "assertions": [ + { + "query": "", + "matches": [] + } + ] +} diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs new file mode 100644 index 0000000000..c3950757ce --- /dev/null +++ b/crates/semantic_index/examples/eval.rs @@ -0,0 +1,97 @@ +use git2::{Object, Oid, Repository}; +use serde::Deserialize; +use std::path::{Path, PathBuf}; +use std::{env, fs}; + +#[derive(Deserialize, Clone)] +struct QueryMatches { + query: String, + matches: Vec, +} + +#[derive(Deserialize, Clone)] +struct RepoEval { + repo: String, + commit: String, + assertions: Vec, +} + +const TMP_REPO_PATH: &str = "./target/eval_repos"; + +fn parse_eval() -> anyhow::Result> { + let eval_folder = env::current_dir()? + .as_path() + .parent() + .unwrap() + .join("crates/semantic_index/eval"); + + let mut repo_evals: Vec = Vec::new(); + for entry in fs::read_dir(eval_folder)? 
{ + let file_path = entry.unwrap().path(); + if let Some(extension) = file_path.extension() { + if extension == "json" { + if let Ok(file) = fs::read_to_string(file_path) { + let repo_eval = serde_json::from_str(file.as_str()); + + match repo_eval { + Ok(repo_eval) => { + repo_evals.push(repo_eval); + } + Err(err) => { + println!("Err: {:?}", err); + } + } + } + } + } + } + + Ok(repo_evals) +} + +fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { + let repo_name = Path::new(repo_eval.repo.as_str()) + .file_name() + .unwrap() + .to_str() + .unwrap() + .to_owned() + .replace(".git", ""); + let clone_path = Path::new(TMP_REPO_PATH).join(&repo_name).to_path_buf(); + + // Delete Clone Path if already exists + let _ = fs::remove_dir_all(&clone_path); + + // Clone in Repo + git2::build::RepoBuilder::new() + // .branch(repo_eval.sha.as_str()) + .clone(repo_eval.repo.as_str(), clone_path.as_path())?; + + let repo: Repository = Repository::open(clone_path.clone())?; + let obj: Object = repo + .find_commit(Oid::from_str(repo_eval.commit.as_str())?)? 
+ .into_object(); + repo.checkout_tree(&obj, None)?; + repo.set_head_detached(obj.id())?; + + Ok(clone_path) +} + +fn main() { + if let Ok(repo_evals) = parse_eval() { + for repo in repo_evals { + let cloned = clone_repo(repo.clone()); + match cloned { + Ok(clone_path) => { + println!( + "Cloned {:?} @ {:?} into {:?}", + repo.repo, repo.commit, clone_path + ); + } + Err(err) => { + println!("Error Cloning: {:?}", err); + } + } + } + } +} diff --git a/script/evaluate_semantic_index b/script/evaluate_semantic_index new file mode 100755 index 0000000000..e9a96a02b4 --- /dev/null +++ b/script/evaluate_semantic_index @@ -0,0 +1,3 @@ +#!/bin/bash + +cargo run -p semantic_index --example eval From 0d14bbbf5b14ae4045cea65a68e3d6341f48f79c Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 12 Sep 2023 20:36:06 -0400 Subject: [PATCH 02/14] add eval values for tree-sitter --- crates/semantic_index/README.md | 39 +++----- crates/semantic_index/eval/tree-sitter.json | 98 ++++++++++++++++++++- 2 files changed, 110 insertions(+), 27 deletions(-) diff --git a/crates/semantic_index/README.md b/crates/semantic_index/README.md index 86e68dc414..85f83af121 100644 --- a/crates/semantic_index/README.md +++ b/crates/semantic_index/README.md @@ -1,31 +1,20 @@ -WIP: Sample SQL Queries -/* +# Semantic Index -create table "files" ( -"id" INTEGER PRIMARY KEY, -"path" VARCHAR, -"sha1" VARCHAR, -); +## Evaluation -create table symbols ( -"file_id" INTEGER REFERENCES("files", "id") ON CASCADE DELETE, -"offset" INTEGER, -"embedding" VECTOR, -); +### Metrics -insert into "files" ("path", "sha1") values ("src/main.rs", "sha1") return id; -insert into symbols ( -"file_id", -"start", -"end", -"embedding" -) values ( -(id,), -(id,), -(id,), -(id,), -) +nDCG@k: +- "The value of NDCG is determined by comparing the relevance of the items returned by the search engine to the relevance of the item that a hypothetical "ideal" search engine would return. 
+- "The relevance of result is represented by a score (also known as a 'grade') that is assigned to the search query. The scores of these results are then discounted based on their position in the search results -- did they get recommended first or last?" +MRR@k: +- "Mean reciprocal rank quantifies the rank of the first relevant item found in the recommendation list." -*/ +MAP@k: +- "Mean average precision averages the precision@k metric at each relevant item position in the recommendation list." + +Resources: +- [Evaluating recommendation metrics](https://www.shaped.ai/blog/evaluating-recommendation-systems-map-mmr-ndcg) +- [Math Walkthrough](https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0) diff --git a/crates/semantic_index/eval/tree-sitter.json b/crates/semantic_index/eval/tree-sitter.json index a469543cf4..4f2edfb063 100644 --- a/crates/semantic_index/eval/tree-sitter.json +++ b/crates/semantic_index/eval/tree-sitter.json @@ -3,8 +3,102 @@ "commit": "46af27796a76c72d8466627d499f2bca4af958ee", "assertions": [ { - "query": "", - "matches": [] + "query": "What attributes are available for the tags configuration struct?", + "matches": [ + "tags/src/lib.rs:24" + ] + }, + { + "query": "create a new tag configuration", + "matches": [ + "tags/src/lib.rs:119" + ] + }, + { + "query": "generate tags based on config", + "matches": [ + "tags/src/lib.rs:261", + ] + }, + { + "query": "match on ts quantifier in rust", + "matches": [ + "lib/binding_rust/lib.rs:139" + ] + }, + { + "query": "cli command to generate tags", + "matches": [ + "cli/src/tags.rs:10" + ] + }, + { + "query": "what version of the tree-sitter-tags package is active?", + "matches": [ + "tags/Cargo.toml:4" + ] + }, + { + "query": "Insert a new parse state", + "matches": [ + "cli/src/generate/build_tables/build_parse_table.rs:153" + ] + }, + { + "query": "Handle conflict when numerous actions occur on the same symbol", + "matches": [ + "cli/src/generate/build_tables/build_parse_table.rs:363", 
"cli/src/generate/build_tables/build_parse_table.rs:442", + ] + }, + { + "query": "Match based on associativity of actions", + "matches": [ + "cri/src/generate/build_tables/build_parse_table.rs:542", + ] + }, + { + "query": "Format token set display", + "matches": [ + "cli/src/generate/build_tables/item.rs:246", + ] + }, + { + "query": "extract choices from rule", + "matches": [ + "cli/src/generate/prepare_grammar/flatten_grammar.rs:124" + ] + }, + { + "query": "How do we identify if a symbol is being used?", + "matches": [ + "cli/src/generate/prepare_grammar/flatten_grammar.rs:175" + ] + }, + { + "query": "How do we launch the playground?", + "matches": [ + "cli/src/playground.rs:46" + ] + }, + { + "query": "How do we test treesitter query matches in rust?", + "matches": [ + "cli/src/query_testing.rs:152", + "cli/src/tests/query_test.rs:781", + "cli/src/tests/query_test.rs:2163", + "cli/src/tests/query_test.rs:3781", + "cli/src/tests/query_test.rs:887" + ] + }, + { + "query": "What does the CLI do?", + "matches": [ + "cli/README.md:10", + "cli/loader/README.md:3", + "docs/section-5-implementation.md:14", + "docs/section-5-implementation.md:18" + ] } ] } From d4fbe990520fd079dc99e8120a0d08ff1076ef69 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 12 Sep 2023 21:27:35 -0400 Subject: [PATCH 03/14] add eval for gpt-engineer --- crates/semantic_index/eval/gpt-engineer.json | 114 +++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 crates/semantic_index/eval/gpt-engineer.json diff --git a/crates/semantic_index/eval/gpt-engineer.json b/crates/semantic_index/eval/gpt-engineer.json new file mode 100644 index 0000000000..d7c08cd505 --- /dev/null +++ b/crates/semantic_index/eval/gpt-engineer.json @@ -0,0 +1,114 @@ +{ + "repo": "https://github.com/AntonOsika/gpt-engineer.git", + "commit": "7735a6445bae3611c62f521e6464c67c957f87c2", + "assertions": [ + { + "query": "How do I contribute to this project?", + "matches": [ + ".github/CONTRIBUTING.md:1", + 
"ROADMAP.md:48" + ] + }, + { + "query": "What version of the openai package is active?", + "matches": [ + "pyproject.toml:14" + ] + }, + { + "query": "Ask user for clarification", + "matches": [ + "gpt-engineer/steps.py:69" + ] + }, + { + "query": "generate tests for python code", + "matches": [ + "gpt-engineer/steps.py:153" + ] + }, + { + "query": "get item from database based on key", + "matches": [ + "gpt-engineer/db.py:42", + "gpt-engineer/db.py:68" + ] + }, + { + "query": "prompt user to select files", + "matches": [ + "gpt-engineer/file_selector.py:171", + "gpt-engineer/file_selector.py:306", + "gpt-engineer/file_selector.py:289", + "gpt-engineer/file_selector.py:234" + ] + }, + { + "query": "send to rudderstack", + "matches": [ + "gpt-engineer/collect.py:11", + "gpt-engineer/collect.py:38" + ] + }, + { + "query": "parse code blocks from chat messages", + "matches": [ + "gpt-engineer/chat_to_files.py:10", + "docs/intro/chat_parsing.md:1" + ] + }, + { + "query": "how do I use the docker cli?", + "matches": [ + "docker/README.md:1" + ] + }, + { + "query": "ask the user if the code ran successfully?", + "matches": [ + "gpt-engineer/learning.py:54" + ] + }, + { + "query": "how is consent granted by the user?", + "matches": [ + "gpt-engineer/learning.py:107", + "gpt-engineer/learning.py:130", + "gpt-engineer/learning.py:152" + ] + }, + { + "query": "what are all the different steps the agent can take?", + "matches": [ + "docs/intro/steps_module.md:1", + "gpt-engineer/steps.py:391" + ] + }, + { + "query": "ask the user for clarification?", + "matches": [ + "gpt-engineer/steps.py:69" + ] + }, + { + "query": "what models are available?", + "matches": [ + "gpt-engineer/ai.py:315", + "gpt-engineer/ai.py:341", + "docs/open-models.md:1" + ] + }, + { + "query": "what is the current focus of the project?", + "matches": [ + "ROADMAP.md:11" + ] + }, + { + "query": "does the agent know how to fix code?", + "matches": [ + "gpt-engineer/steps.py:367" + ] + } + ] +} From 
6f29582fb064e709056236ceb732335d63bbbfe4 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 13 Sep 2023 10:32:36 -0400 Subject: [PATCH 04/14] progress on eval --- crates/semantic_index/eval/gpt-engineer.json | 2 +- crates/semantic_index/eval/tree-sitter.json | 2 +- crates/semantic_index/examples/eval.rs | 80 +++++++++++++++++++- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/crates/semantic_index/eval/gpt-engineer.json b/crates/semantic_index/eval/gpt-engineer.json index d7c08cd505..64322e8384 100644 --- a/crates/semantic_index/eval/gpt-engineer.json +++ b/crates/semantic_index/eval/gpt-engineer.json @@ -12,7 +12,7 @@ { "query": "What version of the openai package is active?", "matches": [ - "pyproject.toml:14" + "pyprojet.toml:14" ] }, { diff --git a/crates/semantic_index/eval/tree-sitter.json b/crates/semantic_index/eval/tree-sitter.json index 4f2edfb063..52d1e9df16 100644 --- a/crates/semantic_index/eval/tree-sitter.json +++ b/crates/semantic_index/eval/tree-sitter.json @@ -48,7 +48,7 @@ "query": "Handle conflict when numerous actions occur on the same symbol", "matches": [ "cli/src/generate/build_tables/build_parse_table.rs:363", - "cli/src/generate/build_tables/build_parse_table.rs:442", + "cli/src/generate/build_tables/build_parse_table.rs:442" ] }, { diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index c3950757ce..f666f5c281 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -1,19 +1,36 @@ use git2::{Object, Oid, Repository}; +use semantic_index::SearchResult; use serde::Deserialize; use std::path::{Path, PathBuf}; use std::{env, fs}; #[derive(Deserialize, Clone)] -struct QueryMatches { +struct EvaluationQuery { query: String, matches: Vec, } +impl EvaluationQuery { + fn match_pairs(&self) -> Vec<(PathBuf, usize)> { + let mut pairs = Vec::new(); + for match_identifier in self.matches { + let match_parts = match_identifier.split(":"); + + if let 
Some(file_path) = match_parts.next() { + if let Some(row_number) = match_parts.next() { + pairs.push((PathBuf::from(file_path), from_str::(row_number))); + } + } + + pairs + } +} + #[derive(Deserialize, Clone)] struct RepoEval { repo: String, commit: String, - assertions: Vec, + assertions: Vec, } const TMP_REPO_PATH: &str = "./target/eval_repos"; @@ -77,7 +94,60 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { Ok(clone_path) } +fn dcg(hits: Vec) -> f32 { + let mut result = 0.0; + for (idx, hit) in hits.iter().enumerate() { + result += *hit as f32 / (2.0 + idx as f32).log2(); + } + + println!("DCG: {:?}", result); + result +} + +fn evaluate_ndcg(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 { + + // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of + // items returned by the search engine relative to the hypothetical ideal. + // Relevance is represented as a series of booleans, in which each search result returned + // is identified as being inside the test set of matches (1) or not (0). + + // For example, if result 1, 3 and 5 match the 3 relevant results provided + // actual dcg is calculated against a vector of [1, 0, 1, 0, 1] + // whereas ideal dcg is calculated against a vector of [1, 1, 1, 0, 0] + // as this ideal vector assumes the 3 relevant results provided were returned first + // normalized dcg is then calculated as actual dcg / ideal dcg. + + // NDCG ranges from 0 to 1, which higher values indicating better performance + // Commonly NDCG is expressed as NDCG@k, in which k represents the metric calculated + // including only the top k values returned. + // The @k metrics can help you identify, at what point does the relevant results start to fall off. + // Ie. a NDCG@1 of 0.9 and a NDCG@3 of 0.5 may indicate that the first result returned in usually + // very high quality, whereas rank results quickly drop off after the first result. 
+ + let ideal = vec![1; cmp::min(eval_query.matches.len(), k)]; + + return dcg(hits) / dcg(ideal); +} + +fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 { + +} + +fn evaluate_repo(repo_eval: RepoEval, clone_path: PathBuf) { + + // Launch new repo as a new Zed workspace/project + // Index the project + // Search each eval_query + // Calculate Statistics + +} + fn main() { + + // zed/main.rs + // creating an app and running it, gives you the context. + // create a project, find_or_create_local_worktree. + if let Ok(repo_evals) = parse_eval() { for repo in repo_evals { let cloned = clone_repo(repo.clone()); @@ -85,8 +155,12 @@ fn main() { Ok(clone_path) => { println!( "Cloned {:?} @ {:?} into {:?}", - repo.repo, repo.commit, clone_path + repo.repo, repo.commit, &clone_path ); + + // Evaluate Repo + evaluate_repo(repo, clone_path); + } Err(err) => { println!("Error Cloning: {:?}", err); From eff44f9aa4412399c1c642eb271c4e5ec8297cec Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 13 Sep 2023 20:02:15 -0400 Subject: [PATCH 05/14] semantic index eval, indexing appropriately --- Cargo.lock | 4 + crates/semantic_index/Cargo.toml | 4 + crates/semantic_index/eval/tree-sitter.json | 6 +- crates/semantic_index/examples/eval.rs | 194 ++++++++++++++++---- crates/semantic_index/src/semantic_index.rs | 6 +- 5 files changed, 168 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a66391ed07..b0f46a90d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6744,6 +6744,7 @@ dependencies = [ "anyhow", "async-trait", "bincode", + "client", "collections", "ctor", "editor", @@ -6757,6 +6758,7 @@ dependencies = [ "lazy_static", "log", "matrixmultiply", + "node_runtime", "parking_lot 0.11.2", "parse_duration", "picker", @@ -6766,6 +6768,7 @@ dependencies = [ "rand 0.8.5", "rpc", "rusqlite", + "rust-embed", "schemars", "serde", "serde_json", @@ -6788,6 +6791,7 @@ dependencies = [ "unindent", "util", "workspace", + "zed", ] [[package]] diff 
--git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index b5537dd2fa..a20f29fd68 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -51,6 +51,10 @@ rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"]} git2 = { version = "0.15"} +rust-embed = { version = "8.0", features = ["include-exclude"] } +client = { path = "../client" } +zed = { path = "../zed"} +node_runtime = { path = "../node_runtime"} pretty_assertions.workspace = true rand.workspace = true diff --git a/crates/semantic_index/eval/tree-sitter.json b/crates/semantic_index/eval/tree-sitter.json index 52d1e9df16..d3dcc86937 100644 --- a/crates/semantic_index/eval/tree-sitter.json +++ b/crates/semantic_index/eval/tree-sitter.json @@ -17,7 +17,7 @@ { "query": "generate tags based on config", "matches": [ - "tags/src/lib.rs:261", + "tags/src/lib.rs:261" ] }, { @@ -54,13 +54,13 @@ { "query": "Match based on associativity of actions", "matches": [ - "cri/src/generate/build_tables/build_parse_table.rs:542", + "cri/src/generate/build_tables/build_parse_table.rs:542" ] }, { "query": "Format token set display", "matches": [ - "cli/src/generate/build_tables/item.rs:246", + "cli/src/generate/build_tables/item.rs:246" ] }, { diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index f666f5c281..67ee52e28c 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -1,8 +1,46 @@ +use anyhow::{anyhow, Result}; +use client::{self, UserStore}; use git2::{Object, Oid, Repository}; -use semantic_index::SearchResult; +use gpui::{AppContext, AssetSource, ModelHandle, Task}; +use language::LanguageRegistry; +use node_runtime::RealNodeRuntime; +use project::{Fs, Project, RealFs}; +use rust_embed::RustEmbed; +use semantic_index::embedding::OpenAIEmbeddings; +use 
semantic_index::semantic_index_settings::SemanticIndexSettings; +use semantic_index::{SearchResult, SemanticIndex}; use serde::Deserialize; -use std::path::{Path, PathBuf}; -use std::{env, fs}; +use settings::{default_settings, handle_settings_file_changes, watch_config_file, SettingsStore}; +use std::path::{self, Path, PathBuf}; +use std::sync::Arc; +use std::time::Duration; +use std::{cmp, env, fs}; +use util::channel::{RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}; +use util::http::{self, HttpClient}; +use util::paths::{self, EMBEDDINGS_DIR}; +use zed::languages; + +#[derive(RustEmbed)] +#[folder = "../../assets"] +#[include = "fonts/**/*"] +#[include = "icons/**/*"] +#[include = "themes/**/*"] +#[include = "sounds/**/*"] +#[include = "*.md"] +#[exclude = "*.DS_Store"] +pub struct Assets; + +impl AssetSource for Assets { + fn load(&self, path: &str) -> Result> { + Self::get(path) + .map(|f| f.data) + .ok_or_else(|| anyhow!("could not find asset at path \"{}\"", path)) + } + + fn list(&self, path: &str) -> Vec> { + Self::iter().filter(|p| p.starts_with(path)).collect() + } +} #[derive(Deserialize, Clone)] struct EvaluationQuery { @@ -13,15 +51,18 @@ struct EvaluationQuery { impl EvaluationQuery { fn match_pairs(&self) -> Vec<(PathBuf, usize)> { let mut pairs = Vec::new(); - for match_identifier in self.matches { - let match_parts = match_identifier.split(":"); + for match_identifier in self.matches.iter() { + let mut match_parts = match_identifier.split(":"); if let Some(file_path) = match_parts.next() { if let Some(row_number) = match_parts.next() { - pairs.push((PathBuf::from(file_path), from_str::(row_number))); + pairs.push(( + PathBuf::from(file_path), + row_number.parse::().unwrap(), + )); } } - + } pairs } } @@ -33,7 +74,7 @@ struct RepoEval { assertions: Vec, } -const TMP_REPO_PATH: &str = "./target/eval_repos"; +const TMP_REPO_PATH: &str = "eval_repos"; fn parse_eval() -> anyhow::Result> { let eval_folder = env::current_dir()? 
@@ -74,7 +115,12 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { .unwrap() .to_owned() .replace(".git", ""); - let clone_path = Path::new(TMP_REPO_PATH).join(&repo_name).to_path_buf(); + + let clone_path = fs::canonicalize(env::current_dir()?)? + .parent() + .ok_or(anyhow!("path canonicalization failed"))? + .join(TMP_REPO_PATH) + .join(&repo_name); // Delete Clone Path if already exists let _ = fs::remove_dir_all(&clone_path); @@ -105,7 +151,6 @@ fn dcg(hits: Vec) -> f32 { } fn evaluate_ndcg(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 { - // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of // items returned by the search engine relative to the hypothetical ideal. // Relevance is represented as a series of booleans, in which each search result returned @@ -125,47 +170,118 @@ fn evaluate_ndcg(eval_query: EvaluationQuery, search_results: Vec, // very high quality, whereas rank results quickly drop off after the first result. let ideal = vec![1; cmp::min(eval_query.matches.len(), k)]; + let hits = vec![1]; return dcg(hits) / dcg(ideal); } -fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 { - -} - -fn evaluate_repo(repo_eval: RepoEval, clone_path: PathBuf) { - - // Launch new repo as a new Zed workspace/project - // Index the project - // Search each eval_query - // Calculate Statistics +// fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 {} +fn init_logger() { + env_logger::init(); } fn main() { + // Launch new repo as a new Zed workspace/project + let app = gpui::App::new(Assets).unwrap(); + let fs = Arc::new(RealFs); + let http = http::client(); + let user_settings_file_rx = + watch_config_file(app.background(), fs.clone(), paths::SETTINGS.clone()); + let http_client = http::client(); + init_logger(); - // zed/main.rs - // creating an app and running it, gives you the context. 
- // create a project, find_or_create_local_worktree. + app.run(move |cx| { + cx.set_global(*RELEASE_CHANNEL); - if let Ok(repo_evals) = parse_eval() { - for repo in repo_evals { - let cloned = clone_repo(repo.clone()); - match cloned { - Ok(clone_path) => { - println!( - "Cloned {:?} @ {:?} into {:?}", - repo.repo, repo.commit, &clone_path - ); + let client = client::Client::new(http.clone(), cx); + let user_store = cx.add_model(|cx| UserStore::new(client.clone(), http_client.clone(), cx)); - // Evaluate Repo - evaluate_repo(repo, clone_path); + // Initialize Settings + let mut store = SettingsStore::default(); + store + .set_default_settings(default_settings().as_ref(), cx) + .unwrap(); + cx.set_global(store); + handle_settings_file_changes(user_settings_file_rx, cx); - } - Err(err) => { - println!("Error Cloning: {:?}", err); + // Initialize Languages + let login_shell_env_loaded = Task::ready(()); + let mut languages = LanguageRegistry::new(login_shell_env_loaded); + languages.set_executor(cx.background().clone()); + let languages = Arc::new(languages); + + let node_runtime = RealNodeRuntime::new(http.clone()); + languages::init(languages.clone(), node_runtime.clone()); + + project::Project::init(&client, cx); + semantic_index::init(fs.clone(), http.clone(), languages.clone(), cx); + + settings::register::(cx); + + let db_file_path = EMBEDDINGS_DIR + .join(Path::new(RELEASE_CHANNEL_NAME.as_str())) + .join("embeddings_db"); + + let languages = languages.clone(); + let fs = fs.clone(); + cx.spawn(|mut cx| async move { + let semantic_index = SemanticIndex::new( + fs.clone(), + db_file_path, + Arc::new(OpenAIEmbeddings::new(http_client, cx.background())), + languages.clone(), + cx.clone(), + ) + .await?; + + if let Ok(repo_evals) = parse_eval() { + for repo in repo_evals { + let cloned = clone_repo(repo.clone()); + match cloned { + Ok(clone_path) => { + log::trace!( + "Cloned {:?} @ {:?} into {:?}", + repo.repo, + repo.commit, + &clone_path + ); + + // Create 
Project + let project = cx.update(|cx| { + Project::local( + client.clone(), + user_store.clone(), + languages.clone(), + fs.clone(), + cx, + ) + }); + + // Register Worktree + let _ = project + .update(&mut cx, |project, cx| { + println!( + "Creating worktree in project: {:?}", + clone_path.clone() + ); + project.find_or_create_local_worktree(clone_path, true, cx) + }) + .await; + + let _ = semantic_index + .update(&mut cx, |index, cx| index.index_project(project, cx)) + .await; + } + Err(err) => { + log::trace!("Error cloning: {:?}", err); + } + } } } - } - } + + anyhow::Ok(()) + }) + .detach(); + }); } diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index 115bf5d7a8..63bcc900f2 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -1,5 +1,5 @@ mod db; -mod embedding; +pub mod embedding; mod embedding_queue; mod parsing; pub mod semantic_index_settings; @@ -301,7 +301,7 @@ impl SemanticIndex { } } - async fn new( + pub async fn new( fs: Arc, database_path: PathBuf, embedding_provider: Arc, @@ -837,8 +837,6 @@ impl SemanticIndex { cx: &mut ModelContext, ) -> Task> { if !self.projects.contains_key(&project.downgrade()) { - log::trace!("Registering Project for Semantic Index"); - let subscription = cx.subscribe(&project, |this, project, event, cx| match event { project::Event::WorktreeAdded | project::Event::WorktreeRemoved(_) => { this.project_worktrees_changed(project.clone(), cx); From 0c1b2e5aa6a83b75d218a82676d2523147180a10 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Wed, 13 Sep 2023 20:04:53 -0400 Subject: [PATCH 06/14] cleaned up warnings --- crates/semantic_index/examples/eval.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 67ee52e28c..27ee93d093 100644 --- a/crates/semantic_index/examples/eval.rs +++ 
b/crates/semantic_index/examples/eval.rs @@ -1,22 +1,21 @@ use anyhow::{anyhow, Result}; use client::{self, UserStore}; use git2::{Object, Oid, Repository}; -use gpui::{AppContext, AssetSource, ModelHandle, Task}; +use gpui::{AssetSource, Task}; use language::LanguageRegistry; use node_runtime::RealNodeRuntime; -use project::{Fs, Project, RealFs}; +use project::{Project, RealFs}; use rust_embed::RustEmbed; use semantic_index::embedding::OpenAIEmbeddings; use semantic_index::semantic_index_settings::SemanticIndexSettings; use semantic_index::{SearchResult, SemanticIndex}; use serde::Deserialize; use settings::{default_settings, handle_settings_file_changes, watch_config_file, SettingsStore}; -use std::path::{self, Path, PathBuf}; +use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::time::Duration; use std::{cmp, env, fs}; use util::channel::{RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}; -use util::http::{self, HttpClient}; +use util::http::{self}; use util::paths::{self, EMBEDDINGS_DIR}; use zed::languages; From 137dda3ee656036ccba6c3554f33e4135c2ba12e Mon Sep 17 00:00:00 2001 From: KCaverly Date: Thu, 14 Sep 2023 09:30:19 -0400 Subject: [PATCH 07/14] wip eval framework for semantic index --- crates/semantic_index/examples/eval.rs | 63 ++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 27ee93d093..4935b160a4 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -1,7 +1,8 @@ use anyhow::{anyhow, Result}; use client::{self, UserStore}; +use collections::HashMap; use git2::{Object, Oid, Repository}; -use gpui::{AssetSource, Task}; +use gpui::{AssetSource, AsyncAppContext, ModelHandle, Task}; use language::LanguageRegistry; use node_runtime::RealNodeRuntime; use project::{Project, RealFs}; @@ -13,6 +14,7 @@ use serde::Deserialize; use settings::{default_settings, handle_settings_file_changes, 
watch_config_file, SettingsStore}; use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::time::{Duration, Instant}; use std::{cmp, env, fs}; use util::channel::{RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}; use util::http::{self}; @@ -73,6 +75,15 @@ struct RepoEval { assertions: Vec, } +struct EvaluationResults { + token_count: usize, + span_count: usize, + time_to_index: Duration, + time_to_search: Vec, + ndcg: HashMap, + map: HashMap, +} + const TMP_REPO_PATH: &str = "eval_repos"; fn parse_eval() -> anyhow::Result> { @@ -180,6 +191,42 @@ fn init_logger() { env_logger::init(); } +async fn evaluate_repo( + index: ModelHandle, + project: ModelHandle, + query_matches: Vec, + cx: &mut AsyncAppContext, +) -> Result<()> { + // Index Project + let index_t0 = Instant::now(); + index + .update(cx, |index, cx| index.index_project(project.clone(), cx)) + .await?; + let index_time = index_t0.elapsed(); + println!("Time to Index: {:?}", index_time.as_secs()); + + for query in query_matches { + // Query each match in order + let search_t0 = Instant::now(); + let search_results = index + .update(cx, |index, mut cx| { + index.search_project(project.clone(), query.query, 10, vec![], vec![], cx) + }) + .await?; + let search_time = search_t0.elapsed(); + println!("Time to Search: {:?}", search_time.as_secs()); + + // Evaluate ndcg@k, for k = 1, 3, 5, 10 + // Evaluate map@k, for k = 1, 3, 5, 10 + // Evaluate span count + // Evaluate token count + // Evaluate time to index + // Evaluate time to search + } + + anyhow::Ok(()) +} + fn main() { // Launch new repo as a new Zed workspace/project let app = gpui::App::new(Assets).unwrap(); @@ -260,17 +307,17 @@ fn main() { // Register Worktree let _ = project .update(&mut cx, |project, cx| { - println!( - "Creating worktree in project: {:?}", - clone_path.clone() - ); project.find_or_create_local_worktree(clone_path, true, cx) }) .await; - let _ = semantic_index - .update(&mut cx, |index, cx| index.index_project(project, cx)) - .await; + 
evaluate_repo( + semantic_index.clone(), + project, + repo.assertions, + &mut cx, + ) + .await?; } Err(err) => { log::trace!("Error cloning: {:?}", err); From 04bd107ada2417599fe9b2de1974f75c38646832 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 15 Sep 2023 10:36:21 -0400 Subject: [PATCH 08/14] add ndcg@k to evaluate metrics --- crates/semantic_index/eval/gpt-engineer.json | 44 +++++------ crates/semantic_index/examples/eval.rs | 78 +++++++++++++++++--- 2 files changed, 88 insertions(+), 34 deletions(-) diff --git a/crates/semantic_index/eval/gpt-engineer.json b/crates/semantic_index/eval/gpt-engineer.json index 64322e8384..4a96e9f5ff 100644 --- a/crates/semantic_index/eval/gpt-engineer.json +++ b/crates/semantic_index/eval/gpt-engineer.json @@ -1,5 +1,5 @@ { - "repo": "https://github.com/AntonOsika/gpt-engineer.git", + "repo": "https://github.com/AntonOsika/gpt_engineer.git", "commit": "7735a6445bae3611c62f521e6464c67c957f87c2", "assertions": [ { @@ -12,48 +12,48 @@ { "query": "What version of the openai package is active?", "matches": [ - "pyprojet.toml:14" + "pyproject.toml:14" ] }, { "query": "Ask user for clarification", "matches": [ - "gpt-engineer/steps.py:69" + "gpt_engineer/steps.py:69" ] }, { "query": "generate tests for python code", "matches": [ - "gpt-engineer/steps.py:153" + "gpt_engineer/steps.py:153" ] }, { "query": "get item from database based on key", "matches": [ - "gpt-engineer/db.py:42", - "gpt-engineer/db.py:68" + "gpt_engineer/db.py:42", + "gpt_engineer/db.py:68" ] }, { "query": "prompt user to select files", "matches": [ - "gpt-engineer/file_selector.py:171", - "gpt-engineer/file_selector.py:306", - "gpt-engineer/file_selector.py:289", - "gpt-engineer/file_selector.py:234" + "gpt_engineer/file_selector.py:171", + "gpt_engineer/file_selector.py:306", + "gpt_engineer/file_selector.py:289", + "gpt_engineer/file_selector.py:234" ] }, { "query": "send to rudderstack", "matches": [ - "gpt-engineer/collect.py:11", - 
"gpt-engineer/collect.py:38" + "gpt_engineer/collect.py:11", + "gpt_engineer/collect.py:38" ] }, { "query": "parse code blocks from chat messages", "matches": [ - "gpt-engineer/chat_to_files.py:10", + "gpt_engineer/chat_to_files.py:10", "docs/intro/chat_parsing.md:1" ] }, @@ -66,35 +66,35 @@ { "query": "ask the user if the code ran successfully?", "matches": [ - "gpt-engineer/learning.py:54" + "gpt_engineer/learning.py:54" ] }, { "query": "how is consent granted by the user?", "matches": [ - "gpt-engineer/learning.py:107", - "gpt-engineer/learning.py:130", - "gpt-engineer/learning.py:152" + "gpt_engineer/learning.py:107", + "gpt_engineer/learning.py:130", + "gpt_engineer/learning.py:152" ] }, { "query": "what are all the different steps the agent can take?", "matches": [ "docs/intro/steps_module.md:1", - "gpt-engineer/steps.py:391" + "gpt_engineer/steps.py:391" ] }, { "query": "ask the user for clarification?", "matches": [ - "gpt-engineer/steps.py:69" + "gpt_engineer/steps.py:69" ] }, { "query": "what models are available?", "matches": [ - "gpt-engineer/ai.py:315", - "gpt-engineer/ai.py:341", + "gpt_engineer/ai.py:315", + "gpt_engineer/ai.py:341", "docs/open-models.md:1" ] }, @@ -107,7 +107,7 @@ { "query": "does the agent know how to fix code?", "matches": [ - "gpt-engineer/steps.py:367" + "gpt_engineer/steps.py:367" ] } ] diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 4935b160a4..0a13623b79 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Result}; use client::{self, UserStore}; use collections::HashMap; use git2::{Object, Oid, Repository}; -use gpui::{AssetSource, AsyncAppContext, ModelHandle, Task}; +use gpui::{AppContext, AssetSource, AsyncAppContext, ModelHandle, Task}; use language::LanguageRegistry; use node_runtime::RealNodeRuntime; use project::{Project, RealFs}; @@ -50,17 +50,14 @@ struct EvaluationQuery { } impl 
EvaluationQuery { - fn match_pairs(&self) -> Vec<(PathBuf, usize)> { + fn match_pairs(&self) -> Vec<(PathBuf, u32)> { let mut pairs = Vec::new(); for match_identifier in self.matches.iter() { let mut match_parts = match_identifier.split(":"); if let Some(file_path) = match_parts.next() { if let Some(row_number) = match_parts.next() { - pairs.push(( - PathBuf::from(file_path), - row_number.parse::().unwrap(), - )); + pairs.push((PathBuf::from(file_path), row_number.parse::().unwrap())); } } } @@ -156,11 +153,15 @@ fn dcg(hits: Vec) -> f32 { result += *hit as f32 / (2.0 + idx as f32).log2(); } - println!("DCG: {:?}", result); result } -fn evaluate_ndcg(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 { +fn evaluate_ndcg( + eval_query: EvaluationQuery, + search_results: Vec, + k: usize, + cx: &AsyncAppContext, +) -> Vec { // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of // items returned by the search engine relative to the hypothetical ideal. // Relevance is represented as a series of booleans, in which each search result returned @@ -180,9 +181,58 @@ fn evaluate_ndcg(eval_query: EvaluationQuery, search_results: Vec, // very high quality, whereas rank results quickly drop off after the first result. 
let ideal = vec![1; cmp::min(eval_query.matches.len(), k)]; - let hits = vec![1]; - return dcg(hits) / dcg(ideal); + let mut hits = Vec::new(); + for result in search_results { + let (path, start_row, end_row) = result.buffer.read_with(cx, |buffer, cx| { + let path = buffer.file().unwrap().path().to_path_buf(); + let start_row = buffer.offset_to_point(result.range.start.offset).row; + let end_row = buffer.offset_to_point(result.range.end.offset).row; + (path, start_row, end_row) + }); + + let match_pairs = eval_query.match_pairs(); + let mut found = 0; + for (match_path, match_row) in match_pairs { + if match_path == path { + if match_row >= start_row && match_row <= end_row { + found = 1; + break; + } + } + } + + hits.push(found); + } + + // For now, we are calculating ideal_hits a bit different, as technically + // with overlapping ranges, one match can result in more than result. + let mut ideal_hits = hits.clone(); + ideal_hits.retain(|x| x == &1); + + let ideal = if ideal.len() > ideal_hits.len() { + ideal + } else { + ideal_hits + }; + + // Fill ideal to 10 length + let mut filled_ideal = [0; 10]; + for (idx, i) in ideal.to_vec().into_iter().enumerate() { + filled_ideal[idx] = i; + } + + let mut ndcg = Vec::new(); + for idx in 1..(hits.len() + 1) { + let hits_at_k = hits[0..idx].to_vec(); + let ideal_at_k = filled_ideal[0..idx].to_vec(); + + let at_k = dcg(hits_at_k.clone()) / dcg(ideal_at_k.clone()); + + ndcg.push(at_k); + } + + ndcg } // fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 {} @@ -209,14 +259,17 @@ async fn evaluate_repo( // Query each match in order let search_t0 = Instant::now(); let search_results = index - .update(cx, |index, mut cx| { - index.search_project(project.clone(), query.query, 10, vec![], vec![], cx) + .update(cx, |index, cx| { + index.search_project(project.clone(), query.clone().query, 10, vec![], vec![], cx) }) .await?; let search_time = search_t0.elapsed(); println!("Time to Search: {:?}", 
search_time.as_secs()); // Evaluate ndcg@k, for k = 1, 3, 5, 10 + let ndcg = evaluate_ndcg(query, search_results, 10, cx); + println!("NDCG: {:?}", ndcg); + // Evaluate map@k, for k = 1, 3, 5, 10 // Evaluate span count // Evaluate token count @@ -259,6 +312,7 @@ fn main() { let node_runtime = RealNodeRuntime::new(http.clone()); languages::init(languages.clone(), node_runtime.clone()); + language::init(cx); project::Project::init(&client, cx); semantic_index::init(fs.clone(), http.clone(), languages.clone(), cx); From 566bb9f71b8beebd723f07b7c83536d80613d7a2 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 18 Sep 2023 09:57:52 -0400 Subject: [PATCH 09/14] add map to evaluation suite for semantic_index --- crates/semantic_index/examples/eval.rs | 82 +++++++++++++++++--------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 0a13623b79..f0243b8b12 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -126,6 +126,8 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { let clone_path = fs::canonicalize(env::current_dir()?)? .parent() .ok_or(anyhow!("path canonicalization failed"))? + .parent() + .unwrap() .join(TMP_REPO_PATH) .join(&repo_name); @@ -156,30 +158,12 @@ fn dcg(hits: Vec) -> f32 { result } -fn evaluate_ndcg( +fn get_hits( eval_query: EvaluationQuery, search_results: Vec, k: usize, cx: &AsyncAppContext, -) -> Vec { - // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of - // items returned by the search engine relative to the hypothetical ideal. - // Relevance is represented as a series of booleans, in which each search result returned - // is identified as being inside the test set of matches (1) or not (0). 
- - // For example, if result 1, 3 and 5 match the 3 relevant results provided - // actual dcg is calculated against a vector of [1, 0, 1, 0, 1] - // whereas ideal dcg is calculated against a vector of [1, 1, 1, 0, 0] - // as this ideal vector assumes the 3 relevant results provided were returned first - // normalized dcg is then calculated as actual dcg / ideal dcg. - - // NDCG ranges from 0 to 1, which higher values indicating better performance - // Commonly NDCG is expressed as NDCG@k, in which k represents the metric calculated - // including only the top k values returned. - // The @k metrics can help you identify, at what point does the relevant results start to fall off. - // Ie. a NDCG@1 of 0.9 and a NDCG@3 of 0.5 may indicate that the first result returned in usually - // very high quality, whereas rank results quickly drop off after the first result. - +) -> (Vec, Vec) { let ideal = vec![1; cmp::min(eval_query.matches.len(), k)]; let mut hits = Vec::new(); @@ -222,10 +206,32 @@ fn evaluate_ndcg( filled_ideal[idx] = i; } + (filled_ideal.to_vec(), hits) +} + +fn evaluate_ndcg(hits: Vec, ideal: Vec) -> Vec { + // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of + // items returned by the search engine relative to the hypothetical ideal. + // Relevance is represented as a series of booleans, in which each search result returned + // is identified as being inside the test set of matches (1) or not (0). + + // For example, if result 1, 3 and 5 match the 3 relevant results provided + // actual dcg is calculated against a vector of [1, 0, 1, 0, 1] + // whereas ideal dcg is calculated against a vector of [1, 1, 1, 0, 0] + // as this ideal vector assumes the 3 relevant results provided were returned first + // normalized dcg is then calculated as actual dcg / ideal dcg. 
+ + // NDCG ranges from 0 to 1, which higher values indicating better performance + // Commonly NDCG is expressed as NDCG@k, in which k represents the metric calculated + // including only the top k values returned. + // The @k metrics can help you identify, at what point does the relevant results start to fall off. + // Ie. a NDCG@1 of 0.9 and a NDCG@3 of 0.5 may indicate that the first result returned in usually + // very high quality, whereas rank results quickly drop off after the first result. + let mut ndcg = Vec::new(); for idx in 1..(hits.len() + 1) { let hits_at_k = hits[0..idx].to_vec(); - let ideal_at_k = filled_ideal[0..idx].to_vec(); + let ideal_at_k = ideal[0..idx].to_vec(); let at_k = dcg(hits_at_k.clone()) / dcg(ideal_at_k.clone()); @@ -235,7 +241,24 @@ fn evaluate_ndcg( ndcg } -// fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec, k: usize) -> f32 {} +fn evaluate_map(hits: Vec) -> Vec { + let mut map_at_k = Vec::new(); + + let non_zero = hits.iter().sum::() as f32; + if non_zero == 0.0 { + return vec![0.0; hits.len()]; + } + + let mut rolling_non_zero = 0.0; + let mut rolling_map = 0.0; + for (idx, h) in hits.into_iter().enumerate() { + rolling_non_zero += h as f32; + rolling_map += rolling_non_zero / (idx + 1) as f32; + map_at_k.push(rolling_map / non_zero); + } + + map_at_k +} fn init_logger() { env_logger::init(); @@ -253,7 +276,7 @@ async fn evaluate_repo( .update(cx, |index, cx| index.index_project(project.clone(), cx)) .await?; let index_time = index_t0.elapsed(); - println!("Time to Index: {:?}", index_time.as_secs()); + println!("Time to Index: {:?}", index_time.as_millis()); for query in query_matches { // Query each match in order @@ -264,17 +287,22 @@ async fn evaluate_repo( }) .await?; let search_time = search_t0.elapsed(); - println!("Time to Search: {:?}", search_time.as_secs()); + println!("Time to Search: {:?}", search_time.as_millis()); + + // Get Hits/Ideal + let k = 10; + let (ideal, hits) = self::get_hits(query, 
search_results, k, cx); // Evaluate ndcg@k, for k = 1, 3, 5, 10 - let ndcg = evaluate_ndcg(query, search_results, 10, cx); + let ndcg = evaluate_ndcg(hits.clone(), ideal); println!("NDCG: {:?}", ndcg); // Evaluate map@k, for k = 1, 3, 5, 10 + let map = evaluate_map(hits); + println!("MAP: {:?}", map); + // Evaluate span count // Evaluate token count - // Evaluate time to index - // Evaluate time to search } anyhow::Ok(()) From 25bd35742615ee627903b245a3d6c8e1b4ba3f4d Mon Sep 17 00:00:00 2001 From: KCaverly Date: Mon, 18 Sep 2023 18:25:02 -0400 Subject: [PATCH 10/14] add recall and precision to semantic index --- crates/semantic_index/eval/gpt-engineer.json | 2 +- crates/semantic_index/examples/eval.rs | 212 ++++++++++++++++--- 2 files changed, 179 insertions(+), 35 deletions(-) diff --git a/crates/semantic_index/eval/gpt-engineer.json b/crates/semantic_index/eval/gpt-engineer.json index 4a96e9f5ff..d008cc65d1 100644 --- a/crates/semantic_index/eval/gpt-engineer.json +++ b/crates/semantic_index/eval/gpt-engineer.json @@ -1,5 +1,5 @@ { - "repo": "https://github.com/AntonOsika/gpt_engineer.git", + "repo": "https://github.com/AntonOsika/gpt-engineer.git", "commit": "7735a6445bae3611c62f521e6464c67c957f87c2", "assertions": [ { diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index f0243b8b12..546071c6f1 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -10,7 +10,7 @@ use rust_embed::RustEmbed; use semantic_index::embedding::OpenAIEmbeddings; use semantic_index::semantic_index_settings::SemanticIndexSettings; use semantic_index::{SearchResult, SemanticIndex}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use settings::{default_settings, handle_settings_file_changes, watch_config_file, SettingsStore}; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -43,7 +43,7 @@ impl AssetSource for Assets { } } -#[derive(Deserialize, Clone)] +#[derive(Deserialize, Clone, 
Serialize)] struct EvaluationQuery { query: String, matches: Vec, @@ -72,15 +72,6 @@ struct RepoEval { assertions: Vec, } -struct EvaluationResults { - token_count: usize, - span_count: usize, - time_to_index: Duration, - time_to_search: Vec, - ndcg: HashMap, - map: HashMap, -} - const TMP_REPO_PATH: &str = "eval_repos"; fn parse_eval() -> anyhow::Result> { @@ -114,7 +105,7 @@ fn parse_eval() -> anyhow::Result> { Ok(repo_evals) } -fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { +fn clone_repo(repo_eval: RepoEval) -> anyhow::Result<(String, PathBuf)> { let repo_name = Path::new(repo_eval.repo.as_str()) .file_name() .unwrap() @@ -146,7 +137,7 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result { repo.checkout_tree(&obj, None)?; repo.set_head_detached(obj.id())?; - Ok(clone_path) + Ok((repo_name, clone_path)) } fn dcg(hits: Vec) -> f32 { @@ -253,30 +244,165 @@ fn evaluate_map(hits: Vec) -> Vec { let mut rolling_map = 0.0; for (idx, h) in hits.into_iter().enumerate() { rolling_non_zero += h as f32; - rolling_map += rolling_non_zero / (idx + 1) as f32; + if h == 1 { + rolling_map += rolling_non_zero / (idx + 1) as f32; + } map_at_k.push(rolling_map / non_zero); } map_at_k } +fn evaluate_mrr(hits: Vec) -> f32 { + for (idx, h) in hits.into_iter().enumerate() { + if h == 1 { + return 1.0 / (idx + 1) as f32; + } + } + + return 0.0; +} + fn init_logger() { env_logger::init(); } +#[derive(Serialize)] +struct QueryMetrics { + query: EvaluationQuery, + millis_to_search: Duration, + ndcg: Vec, + map: Vec, + mrr: f32, + hits: Vec, + precision: Vec, + recall: Vec, +} + +#[derive(Serialize)] +struct SummaryMetrics { + millis_to_search: f32, + ndcg: Vec, + map: Vec, + mrr: f32, + precision: Vec, + recall: Vec, +} + +#[derive(Serialize)] +struct RepoEvaluationMetrics { + millis_to_index: Duration, + query_metrics: Vec, + repo_metrics: Option, +} + +impl RepoEvaluationMetrics { + fn new(millis_to_index: Duration) -> Self { + RepoEvaluationMetrics { + millis_to_index, + 
query_metrics: Vec::new(), + repo_metrics: None, + } + } + + fn save(&self, repo_name: String) -> Result<()> { + let results_string = serde_json::to_string(&self)?; + fs::write(format!("./{}_evaluation.json", repo_name), results_string) + .expect("Unable to write file"); + Ok(()) + } + + fn summarize(&mut self) { + let l = self.query_metrics.len() as f32; + let millis_to_search: f32 = self + .query_metrics + .iter() + .map(|metrics| metrics.millis_to_search.as_millis()) + .sum::() as f32 + / l; + + let mut ndcg_sum = vec![0.0; 10]; + let mut map_sum = vec![0.0; 10]; + let mut precision_sum = vec![0.0; 10]; + let mut recall_sum = vec![0.0; 10]; + let mut mmr_sum = 0.0; + + for query_metric in self.query_metrics.iter() { + for (ndcg, query_ndcg) in ndcg_sum.iter_mut().zip(query_metric.ndcg.clone()) { + *ndcg += query_ndcg; + } + + for (mapp, query_map) in map_sum.iter_mut().zip(query_metric.map.clone()) { + *mapp += query_map; + } + + for (pre, query_pre) in precision_sum.iter_mut().zip(query_metric.precision.clone()) { + *pre += query_pre; + } + + for (rec, query_rec) in recall_sum.iter_mut().zip(query_metric.recall.clone()) { + *rec += query_rec; + } + + mmr_sum += query_metric.mrr; + } + + let ndcg = ndcg_sum.iter().map(|val| val / l).collect::>(); + let map = map_sum.iter().map(|val| val / l).collect::>(); + let precision = precision_sum + .iter() + .map(|val| val / l) + .collect::>(); + let recall = recall_sum.iter().map(|val| val / l).collect::>(); + let mrr = mmr_sum / l; + + self.repo_metrics = Some(SummaryMetrics { + millis_to_search, + ndcg, + map, + mrr, + precision, + recall, + }) + } +} + +fn evaluate_precision(hits: Vec) -> Vec { + let mut rolling_hit: f32 = 0.0; + let mut precision = Vec::new(); + for (idx, hit) in hits.into_iter().enumerate() { + rolling_hit += hit as f32; + precision.push(rolling_hit / ((idx as f32) + 1.0)); + } + + precision +} + +fn evaluate_recall(hits: Vec, ideal: Vec) -> Vec { + let total_relevant = ideal.iter().sum::() as f32; 
+ let mut recall = Vec::new(); + let mut rolling_hit: f32 = 0.0; + for hit in hits { + rolling_hit += hit as f32; + recall.push(rolling_hit / total_relevant); + } + + recall +} + async fn evaluate_repo( + repo_name: String, index: ModelHandle, project: ModelHandle, query_matches: Vec, cx: &mut AsyncAppContext, -) -> Result<()> { +) -> Result { // Index Project let index_t0 = Instant::now(); index .update(cx, |index, cx| index.index_project(project.clone(), cx)) .await?; - let index_time = index_t0.elapsed(); - println!("Time to Index: {:?}", index_time.as_millis()); + let mut repo_metrics = RepoEvaluationMetrics::new(index_t0.elapsed()); for query in query_matches { // Query each match in order @@ -286,26 +412,45 @@ async fn evaluate_repo( index.search_project(project.clone(), query.clone().query, 10, vec![], vec![], cx) }) .await?; - let search_time = search_t0.elapsed(); - println!("Time to Search: {:?}", search_time.as_millis()); + let millis_to_search = search_t0.elapsed(); // Get Hits/Ideal let k = 10; - let (ideal, hits) = self::get_hits(query, search_results, k, cx); + let (ideal, hits) = self::get_hits(query.clone(), search_results, k, cx); // Evaluate ndcg@k, for k = 1, 3, 5, 10 - let ndcg = evaluate_ndcg(hits.clone(), ideal); - println!("NDCG: {:?}", ndcg); + let ndcg = evaluate_ndcg(hits.clone(), ideal.clone()); // Evaluate map@k, for k = 1, 3, 5, 10 - let map = evaluate_map(hits); - println!("MAP: {:?}", map); + let map = evaluate_map(hits.clone()); - // Evaluate span count - // Evaluate token count + // Evaluate mrr + let mrr = evaluate_mrr(hits.clone()); + + // Evaluate precision + let precision = evaluate_precision(hits.clone()); + + // Evaluate Recall + let recall = evaluate_recall(hits.clone(), ideal); + + let query_metrics = QueryMetrics { + query, + millis_to_search, + ndcg, + map, + mrr, + hits, + precision, + recall, + }; + + repo_metrics.query_metrics.push(query_metrics); } - anyhow::Ok(()) + repo_metrics.summarize(); + 
repo_metrics.save(repo_name); + + anyhow::Ok(repo_metrics) } fn main() { @@ -367,12 +512,10 @@ fn main() { for repo in repo_evals { let cloned = clone_repo(repo.clone()); match cloned { - Ok(clone_path) => { - log::trace!( + Ok((repo_name, clone_path)) => { + println!( "Cloned {:?} @ {:?} into {:?}", - repo.repo, - repo.commit, - &clone_path + repo.repo, repo.commit, &clone_path ); // Create Project @@ -393,7 +536,8 @@ fn main() { }) .await; - evaluate_repo( + let repo_metrics = evaluate_repo( + repo_name, semantic_index.clone(), project, repo.assertions, @@ -402,7 +546,7 @@ fn main() { .await?; } Err(err) => { - log::trace!("Error cloning: {:?}", err); + println!("Error cloning: {:?}", err); } } } From d85acceeecc21046397972c10e69586a3fa6b5b7 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 19 Sep 2023 16:13:47 -0400 Subject: [PATCH 11/14] move git2 to workspace dependency globally --- Cargo.lock | 18 --------------- Cargo.toml | 1 + crates/fs/Cargo.toml | 2 +- crates/git/Cargo.toml | 2 +- crates/project/Cargo.toml | 2 +- crates/semantic_index/Cargo.toml | 2 +- crates/semantic_index/examples/eval.rs | 31 ++------------------------ crates/util/Cargo.toml | 4 ++-- 8 files changed, 9 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6753629177..506b104fc3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3059,8 +3059,6 @@ dependencies = [ "libc", "libgit2-sys", "log", - "openssl-probe", - "openssl-sys", "url", ] @@ -4023,9 +4021,7 @@ checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" dependencies = [ "cc", "libc", - "libssh2-sys", "libz-sys", - "openssl-sys", "pkg-config", ] @@ -4066,20 +4062,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "libssh2-sys" -version = "0.2.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b094a36eb4b8b8c8a7b4b8ae43b2944502be3e59cd87687595cf6b0a71b3f4ca" -dependencies = [ - "cc", - "libc", - "libz-sys", - "openssl-sys", - "pkg-config", - "vcpkg", -] - 
[[package]] name = "libz-sys" version = "1.1.12" diff --git a/Cargo.toml b/Cargo.toml index 96070658b9..b8fa79a4e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,6 +116,7 @@ toml = { version = "0.5" } tree-sitter = "0.20" unindent = { version = "0.1.7" } pretty_assertions = "1.3.0" +git2 = { version = "0.15", default-features = false } tree-sitter-bash = { git = "https://github.com/tree-sitter/tree-sitter-bash", rev = "1b0321ee85701d5036c334a6f04761cdc672e64c" } tree-sitter-c = "0.20.1" diff --git a/crates/fs/Cargo.toml b/crates/fs/Cargo.toml index 7584dec21a..78146c3a9d 100644 --- a/crates/fs/Cargo.toml +++ b/crates/fs/Cargo.toml @@ -26,7 +26,7 @@ lazy_static.workspace = true parking_lot.workspace = true smol.workspace = true regex.workspace = true -git2 = { version = "0.15", default-features = false } +git2.workspace = true serde.workspace = true serde_derive.workspace = true serde_json.workspace = true diff --git a/crates/git/Cargo.toml b/crates/git/Cargo.toml index 8b91ee5373..72668ba766 100644 --- a/crates/git/Cargo.toml +++ b/crates/git/Cargo.toml @@ -20,7 +20,7 @@ smol.workspace = true parking_lot.workspace = true async-trait.workspace = true futures.workspace = true -git2 = { version = "0.15", default-features = false } +git2.workspace = true [dev-dependencies] unindent.workspace = true diff --git a/crates/project/Cargo.toml b/crates/project/Cargo.toml index 0dc76ed54a..ffea6646e9 100644 --- a/crates/project/Cargo.toml +++ b/crates/project/Cargo.toml @@ -75,6 +75,6 @@ lsp = { path = "../lsp", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"] } util = { path = "../util", features = ["test-support"] } rpc = { path = "../rpc", features = ["test-support"] } -git2 = { version = "0.15", default-features = false } +git2.workspace = true tempdir.workspace = true unindent.workspace = true diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 0c31f89b62..2997f5aa0b 100644 --- 
a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -51,11 +51,11 @@ project = { path = "../project", features = ["test-support"] } rpc = { path = "../rpc", features = ["test-support"] } workspace = { path = "../workspace", features = ["test-support"] } settings = { path = "../settings", features = ["test-support"]} -git2 = { version = "0.15"} rust-embed = { version = "8.0", features = ["include-exclude"] } client = { path = "../client" } zed = { path = "../zed"} node_runtime = { path = "../node_runtime"} +git2.workspace = true pretty_assertions.workspace = true rand.workspace = true diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 546071c6f1..37da380b89 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -1,12 +1,10 @@ use anyhow::{anyhow, Result}; use client::{self, UserStore}; -use collections::HashMap; use git2::{Object, Oid, Repository}; -use gpui::{AppContext, AssetSource, AsyncAppContext, ModelHandle, Task}; +use gpui::{AsyncAppContext, ModelHandle, Task}; use language::LanguageRegistry; use node_runtime::RealNodeRuntime; use project::{Project, RealFs}; -use rust_embed::RustEmbed; use semantic_index::embedding::OpenAIEmbeddings; use semantic_index::semantic_index_settings::SemanticIndexSettings; use semantic_index::{SearchResult, SemanticIndex}; @@ -21,28 +19,6 @@ use util::http::{self}; use util::paths::{self, EMBEDDINGS_DIR}; use zed::languages; -#[derive(RustEmbed)] -#[folder = "../../assets"] -#[include = "fonts/**/*"] -#[include = "icons/**/*"] -#[include = "themes/**/*"] -#[include = "sounds/**/*"] -#[include = "*.md"] -#[exclude = "*.DS_Store"] -pub struct Assets; - -impl AssetSource for Assets { - fn load(&self, path: &str) -> Result> { - Self::get(path) - .map(|f| f.data) - .ok_or_else(|| anyhow!("could not find asset at path \"{}\"", path)) - } - - fn list(&self, path: &str) -> Vec> { - Self::iter().filter(|p| 
p.starts_with(path)).collect() - } -} - #[derive(Deserialize, Clone, Serialize)] struct EvaluationQuery { query: String, @@ -455,11 +431,9 @@ async fn evaluate_repo( fn main() { // Launch new repo as a new Zed workspace/project - let app = gpui::App::new(Assets).unwrap(); + let app = gpui::App::new(()).unwrap(); let fs = Arc::new(RealFs); let http = http::client(); - let user_settings_file_rx = - watch_config_file(app.background(), fs.clone(), paths::SETTINGS.clone()); let http_client = http::client(); init_logger(); @@ -475,7 +449,6 @@ fn main() { .set_default_settings(default_settings().as_ref(), cx) .unwrap(); cx.set_global(store); - handle_settings_file_changes(user_settings_file_rx, cx); // Initialize Languages let login_shell_env_loaded = Task::ready(()); diff --git a/crates/util/Cargo.toml b/crates/util/Cargo.toml index 8d9594fbeb..6ab76b0850 100644 --- a/crates/util/Cargo.toml +++ b/crates/util/Cargo.toml @@ -25,10 +25,10 @@ rust-embed.workspace = true tempdir = { workspace = true, optional = true } serde.workspace = true serde_json.workspace = true -git2 = { version = "0.15", default-features = false, optional = true } +git2 = { workspace = true, optional = true } dirs = "3.0" take-until = "0.2.0" [dev-dependencies] tempdir.workspace = true -git2 = { version = "0.15", default-features = false } +git2.workspace = true From b57b5c0b3374071342f48b9dea1ee0e3c47edec7 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 19 Sep 2023 16:36:51 -0400 Subject: [PATCH 12/14] updated git2 to use ssl --- Cargo.lock | 18 ++++++++++++++++++ Cargo.toml | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 506b104fc3..6753629177 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3059,6 +3059,8 @@ dependencies = [ "libc", "libgit2-sys", "log", + "openssl-probe", + "openssl-sys", "url", ] @@ -4021,7 +4023,9 @@ checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" dependencies = [ "cc", "libc", + "libssh2-sys", 
"libz-sys", + "openssl-sys", "pkg-config", ] @@ -4062,6 +4066,20 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libssh2-sys" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b094a36eb4b8b8c8a7b4b8ae43b2944502be3e59cd87687595cf6b0a71b3f4ca" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + [[package]] name = "libz-sys" version = "1.1.12" diff --git a/Cargo.toml b/Cargo.toml index b8fa79a4e3..3299986d2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,7 +116,7 @@ toml = { version = "0.5" } tree-sitter = "0.20" unindent = { version = "0.1.7" } pretty_assertions = "1.3.0" -git2 = { version = "0.15", default-features = false } +git2 = { version = "0.15" } tree-sitter-bash = { git = "https://github.com/tree-sitter/tree-sitter-bash", rev = "1b0321ee85701d5036c334a6f04761cdc672e64c" } tree-sitter-c = "0.20.1" From 25cb79e475deb1bfe0a3d5a6d46c9c9d06b44898 Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 19 Sep 2023 18:55:15 -0400 Subject: [PATCH 13/14] remove git2 dependency for repository cloning in semantic_index eval --- Cargo.lock | 19 ------------------ Cargo.toml | 2 +- crates/semantic_index/Cargo.toml | 1 - crates/semantic_index/examples/eval.rs | 27 +++++++++++++------------- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6753629177..e6dbada745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3059,8 +3059,6 @@ dependencies = [ "libc", "libgit2-sys", "log", - "openssl-probe", - "openssl-sys", "url", ] @@ -4023,9 +4021,7 @@ checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" dependencies = [ "cc", "libc", - "libssh2-sys", "libz-sys", - "openssl-sys", "pkg-config", ] @@ -4066,20 +4062,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "libssh2-sys" -version = "0.2.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b094a36eb4b8b8c8a7b4b8ae43b2944502be3e59cd87687595cf6b0a71b3f4ca" -dependencies = [ - "cc", - "libc", - "libz-sys", - "openssl-sys", - "pkg-config", - "vcpkg", -] - [[package]] name = "libz-sys" version = "1.1.12" @@ -6756,7 +6738,6 @@ dependencies = [ "editor", "env_logger 0.9.3", "futures 0.3.28", - "git2", "globset", "gpui", "isahc", diff --git a/Cargo.toml b/Cargo.toml index 3299986d2c..c1876434ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,7 +116,7 @@ toml = { version = "0.5" } tree-sitter = "0.20" unindent = { version = "0.1.7" } pretty_assertions = "1.3.0" -git2 = { version = "0.15" } +git2 = { version = "0.15", default-features = false} tree-sitter-bash = { git = "https://github.com/tree-sitter/tree-sitter-bash", rev = "1b0321ee85701d5036c334a6f04761cdc672e64c" } tree-sitter-c = "0.20.1" diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index 2997f5aa0b..44afecb0c2 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -55,7 +55,6 @@ rust-embed = { version = "8.0", features = ["include-exclude"] } client = { path = "../client" } zed = { path = "../zed"} node_runtime = { path = "../node_runtime"} -git2.workspace = true pretty_assertions.workspace = true rand.workspace = true diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index 37da380b89..be2a1e8a52 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -1,6 +1,5 @@ use anyhow::{anyhow, Result}; use client::{self, UserStore}; -use git2::{Object, Oid, Repository}; use gpui::{AsyncAppContext, ModelHandle, Task}; use language::LanguageRegistry; use node_runtime::RealNodeRuntime; @@ -11,6 +10,7 @@ use semantic_index::{SearchResult, SemanticIndex}; use serde::{Deserialize, Serialize}; use settings::{default_settings, handle_settings_file_changes, watch_config_file, SettingsStore}; use std::path::{Path, PathBuf}; +use std::process::Command; use 
std::sync::Arc; use std::time::{Duration, Instant}; use std::{cmp, env, fs}; @@ -95,23 +95,22 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result<(String, PathBuf)> { .ok_or(anyhow!("path canonicalization failed"))? .parent() .unwrap() - .join(TMP_REPO_PATH) - .join(&repo_name); + .join(TMP_REPO_PATH); // Delete Clone Path if already exists let _ = fs::remove_dir_all(&clone_path); + let _ = fs::create_dir(&clone_path); - // Clone in Repo - git2::build::RepoBuilder::new() - // .branch(repo_eval.sha.as_str()) - .clone(repo_eval.repo.as_str(), clone_path.as_path())?; - - let repo: Repository = Repository::open(clone_path.clone())?; - let obj: Object = repo - .find_commit(Oid::from_str(repo_eval.commit.as_str())?)? - .into_object(); - repo.checkout_tree(&obj, None)?; - repo.set_head_detached(obj.id())?; + let _ = Command::new("git") + .args(["clone", repo_eval.repo.as_str()]) + .current_dir(clone_path.clone()) + .output()?; + // Update clone path to be new directory housing the repo. + let clone_path = clone_path.join(repo_name.clone()); + let _ = Command::new("git") + .args(["checkout", repo_eval.commit.as_str()]) + .current_dir(clone_path.clone()) + .output()?; Ok((repo_name, clone_path)) } From 11b3bfdc99a9955b9fce62cbc44cec273f987bcc Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 19 Sep 2023 19:05:26 -0400 Subject: [PATCH 14/14] fix warnings --- crates/semantic_index/examples/eval.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/semantic_index/examples/eval.rs b/crates/semantic_index/examples/eval.rs index be2a1e8a52..15406cf63e 100644 --- a/crates/semantic_index/examples/eval.rs +++ b/crates/semantic_index/examples/eval.rs @@ -8,7 +8,7 @@ use semantic_index::embedding::OpenAIEmbeddings; use semantic_index::semantic_index_settings::SemanticIndexSettings; use semantic_index::{SearchResult, SemanticIndex}; use serde::{Deserialize, Serialize}; -use settings::{default_settings, handle_settings_file_changes, 
watch_config_file, SettingsStore}; +use settings::{default_settings, SettingsStore}; use std::path::{Path, PathBuf}; use std::process::Command; use std::sync::Arc; @@ -16,7 +16,7 @@ use std::time::{Duration, Instant}; use std::{cmp, env, fs}; use util::channel::{RELEASE_CHANNEL, RELEASE_CHANNEL_NAME}; use util::http::{self}; -use util::paths::{self, EMBEDDINGS_DIR}; +use util::paths::EMBEDDINGS_DIR; use zed::languages; #[derive(Deserialize, Clone, Serialize)] @@ -134,7 +134,7 @@ fn get_hits( let mut hits = Vec::new(); for result in search_results { - let (path, start_row, end_row) = result.buffer.read_with(cx, |buffer, cx| { + let (path, start_row, end_row) = result.buffer.read_with(cx, |buffer, _cx| { let path = buffer.file().unwrap().path().to_path_buf(); let start_row = buffer.offset_to_point(result.range.start.offset).row; let end_row = buffer.offset_to_point(result.range.end.offset).row; @@ -423,7 +423,7 @@ async fn evaluate_repo( } repo_metrics.summarize(); - repo_metrics.save(repo_name); + let _ = repo_metrics.save(repo_name); anyhow::Ok(repo_metrics) } @@ -508,7 +508,7 @@ fn main() { }) .await; - let repo_metrics = evaluate_repo( + let _ = evaluate_repo( repo_name, semantic_index.clone(), project,