add map to evaluation suite for semantic_index

KCaverly 2023-09-18 09:57:52 -04:00
parent 04bd107ada
commit 566bb9f71b


@@ -126,6 +126,8 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result<PathBuf> {
     let clone_path = fs::canonicalize(env::current_dir()?)?
         .parent()
         .ok_or(anyhow!("path canonicalization failed"))?
+        .parent()
+        .unwrap()
         .join(TMP_REPO_PATH)
         .join(&repo_name);
@@ -156,30 +158,12 @@ fn dcg(hits: Vec<usize>) -> f32 {
     result
 }
-fn evaluate_ndcg(
+fn get_hits(
     eval_query: EvaluationQuery,
     search_results: Vec<SearchResult>,
     k: usize,
     cx: &AsyncAppContext,
-) -> Vec<f32> {
-    // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of
-    // items returned by the search engine relative to the hypothetical ideal.
-    // Relevance is represented as a series of booleans, in which each search result returned
-    // is identified as being inside the test set of matches (1) or not (0).
-    // For example, if result 1, 3 and 5 match the 3 relevant results provided
-    // actual dcg is calculated against a vector of [1, 0, 1, 0, 1]
-    // whereas ideal dcg is calculated against a vector of [1, 1, 1, 0, 0]
-    // as this ideal vector assumes the 3 relevant results provided were returned first
-    // normalized dcg is then calculated as actual dcg / ideal dcg.
-    // NDCG ranges from 0 to 1, which higher values indicating better performance
-    // Commonly NDCG is expressed as NDCG@k, in which k represents the metric calculated
-    // including only the top k values returned.
-    // The @k metrics can help you identify, at what point does the relevant results start to fall off.
-    // Ie. a NDCG@1 of 0.9 and a NDCG@3 of 0.5 may indicate that the first result returned in usually
-    // very high quality, whereas rank results quickly drop off after the first result.
+) -> (Vec<usize>, Vec<usize>) {
     let ideal = vec![1; cmp::min(eval_query.matches.len(), k)];
     let mut hits = Vec::new();
@@ -222,10 +206,32 @@ fn evaluate_ndcg(
         filled_ideal[idx] = i;
     }
+    (filled_ideal.to_vec(), hits)
+}
+fn evaluate_ndcg(hits: Vec<usize>, ideal: Vec<usize>) -> Vec<f32> {
+    // NDCG, or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of
+    // items returned by the search engine relative to the hypothetical ideal.
+    // Relevance is represented as a series of booleans, in which each search result returned
+    // is identified as being inside the test set of matches (1) or not (0).
+    // For example, if results 1, 3 and 5 match the 3 relevant results provided,
+    // the actual DCG is calculated against a vector of [1, 0, 1, 0, 1],
+    // whereas the ideal DCG is calculated against a vector of [1, 1, 1, 0, 0],
+    // as this ideal vector assumes the 3 relevant results provided were returned first.
+    // Normalized DCG is then calculated as actual DCG / ideal DCG.
+    // NDCG ranges from 0 to 1, with higher values indicating better performance.
+    // Commonly NDCG is expressed as NDCG@k, in which the metric is calculated
+    // over only the top k results returned.
+    // The @k metrics help identify at what point the relevant results start to fall off:
+    // e.g. an NDCG@1 of 0.9 and an NDCG@3 of 0.5 may indicate that the first result returned is usually
+    // very high quality, whereas result quality drops off quickly after the first result.
     let mut ndcg = Vec::new();
     for idx in 1..(hits.len() + 1) {
         let hits_at_k = hits[0..idx].to_vec();
-        let ideal_at_k = filled_ideal[0..idx].to_vec();
+        let ideal_at_k = ideal[0..idx].to_vec();
         let at_k = dcg(hits_at_k.clone()) / dcg(ideal_at_k.clone());
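
As a sanity check on the arithmetic in the comment above, here is a minimal standalone sketch of the NDCG@5 computation for that worked example. The commit does not show the body of dcg, so this assumes the conventional log2 rank discount; names and values are illustrative only.

fn dcg(hits: &[usize]) -> f32 {
    // Each result contributes rel / log2(rank + 1), with 1-based ranks.
    hits.iter()
        .enumerate()
        .map(|(i, &rel)| rel as f32 / ((i + 2) as f32).log2())
        .sum()
}

fn main() {
    let actual = [1, 0, 1, 0, 1]; // results 1, 3 and 5 were relevant
    let ideal = [1, 1, 1, 0, 0]; // the 3 relevant results ranked first
    println!("NDCG@5 = {:.3}", dcg(&actual) / dcg(&ideal)); // prints ~0.885
}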
@@ -235,7 +241,24 @@ fn evaluate_ndcg(
     ndcg
 }
-// fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec<SearchResult>, k: usize) -> f32 {}
+fn evaluate_map(hits: Vec<usize>) -> Vec<f32> {
+    let mut map_at_k = Vec::new();
+    let non_zero = hits.iter().sum::<usize>() as f32;
+    if non_zero == 0.0 {
+        return vec![0.0; hits.len()];
+    }
+    let mut rolling_non_zero = 0.0;
+    let mut rolling_map = 0.0;
+    for (idx, h) in hits.into_iter().enumerate() {
+        // Precision counts toward average precision only at ranks holding a relevant result.
+        if h > 0 {
+            rolling_non_zero += h as f32;
+            rolling_map += rolling_non_zero / (idx + 1) as f32;
+        }
+        map_at_k.push(rolling_map / non_zero);
+    }
+    map_at_k
+}
 fn init_logger() {
     env_logger::init();
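
For intuition, tracing evaluate_map on the same hit vector used in the NDCG comment (relevant results at ranks 1, 3 and 5) yields a running MAP@k per rank. The expected values below are hand-computed for this sketch, not output from the eval suite.

let map = evaluate_map(vec![1, 0, 1, 0, 1]);
// Precision at the hit ranks is 1/1, 2/3 and 3/5, and there are 3 hits total,
// so each prefix averages the precision accumulated so far over those 3 hits:
// MAP@1..=5 = [0.333, 0.333, 0.556, 0.556, 0.756]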
@@ -253,7 +276,7 @@ async fn evaluate_repo(
         .update(cx, |index, cx| index.index_project(project.clone(), cx))
         .await?;
     let index_time = index_t0.elapsed();
-    println!("Time to Index: {:?}", index_time.as_secs());
+    println!("Time to Index: {:?}", index_time.as_millis());
     for query in query_matches {
         // Query each match in order
@@ -264,17 +287,22 @@ async fn evaluate_repo(
             })
             .await?;
         let search_time = search_t0.elapsed();
-        println!("Time to Search: {:?}", search_time.as_secs());
+        println!("Time to Search: {:?}", search_time.as_millis());
+        // Get Hits/Ideal
+        let k = 10;
+        let (ideal, hits) = self::get_hits(query, search_results, k, cx);
         // Evaluate ndcg@k, for k = 1, 3, 5, 10
-        let ndcg = evaluate_ndcg(query, search_results, 10, cx);
+        let ndcg = evaluate_ndcg(hits.clone(), ideal);
         println!("NDCG: {:?}", ndcg);
+        // Evaluate map@k, for k = 1, 3, 5, 10
+        let map = evaluate_map(hits);
+        println!("MAP: {:?}", map);
         // Evaluate span count
         // Evaluate token count
         // Evaluate time to index
         // Evaluate time to search
     }
     anyhow::Ok(())
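
Taken together, the restructure splits each per-query evaluation into one bookkeeping step and two scoring steps. A rough sketch of the resulting flow, with placeholder values rather than the crate's real query and search types:

// hits marks which of the top k results are relevant; ideal is the best-case ordering.
let (ideal, hits) = get_hits(query, search_results, 10, cx);
let ndcg = evaluate_ndcg(hits.clone(), ideal); // NDCG@1 through NDCG@k
let map = evaluate_map(hits); // MAP@1 through MAP@k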