add map to evaluation suite for semantic_index

KCaverly 2023-09-18 09:57:52 -04:00
parent 04bd107ada
commit 566bb9f71b


@@ -126,6 +126,8 @@ fn clone_repo(repo_eval: RepoEval) -> anyhow::Result<PathBuf> {
     let clone_path = fs::canonicalize(env::current_dir()?)?
         .parent()
         .ok_or(anyhow!("path canonicalization failed"))?
+        .parent()
+        .unwrap()
         .join(TMP_REPO_PATH)
         .join(&repo_name);
@@ -156,30 +158,12 @@ fn dcg(hits: Vec<usize>) -> f32 {
     result
 }
-fn evaluate_ndcg(
+fn get_hits(
     eval_query: EvaluationQuery,
     search_results: Vec<SearchResult>,
     k: usize,
     cx: &AsyncAppContext,
-) -> Vec<f32> {
-    // NDCG or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of
-    // items returned by the search engine relative to the hypothetical ideal.
-    // Relevance is represented as a series of booleans, in which each search result returned
-    // is identified as being inside the test set of matches (1) or not (0).
-    // For example, if result 1, 3 and 5 match the 3 relevant results provided
-    // actual dcg is calculated against a vector of [1, 0, 1, 0, 1]
-    // whereas ideal dcg is calculated against a vector of [1, 1, 1, 0, 0]
-    // as this ideal vector assumes the 3 relevant results provided were returned first
-    // normalized dcg is then calculated as actual dcg / ideal dcg.
-    // NDCG ranges from 0 to 1, which higher values indicating better performance
-    // Commonly NDCG is expressed as NDCG@k, in which k represents the metric calculated
-    // including only the top k values returned.
-    // The @k metrics can help you identify, at what point does the relevant results start to fall off.
-    // Ie. a NDCG@1 of 0.9 and a NDCG@3 of 0.5 may indicate that the first result returned in usually
-    // very high quality, whereas rank results quickly drop off after the first result.
+) -> (Vec<usize>, Vec<usize>) {
     let ideal = vec![1; cmp::min(eval_query.matches.len(), k)];
     let mut hits = Vec::new();
@@ -222,10 +206,32 @@ fn evaluate_ndcg(
         filled_ideal[idx] = i;
     }
+    (filled_ideal.to_vec(), hits)
+}
+fn evaluate_ndcg(hits: Vec<usize>, ideal: Vec<usize>) -> Vec<f32> {
+    // NDCG, or Normalized Discounted Cumulative Gain, is determined by comparing the relevance of
+    // items returned by the search engine relative to the hypothetical ideal.
+    // Relevance is represented as a series of booleans, in which each search result returned
+    // is identified as being inside the test set of matches (1) or not (0).
+    // For example, if results 1, 3 and 5 match the 3 relevant results provided,
+    // the actual DCG is calculated against a vector of [1, 0, 1, 0, 1],
+    // whereas the ideal DCG is calculated against a vector of [1, 1, 1, 0, 0],
+    // as this ideal vector assumes the 3 relevant results provided were returned first.
+    // Normalized DCG is then calculated as actual DCG / ideal DCG.
+    // NDCG ranges from 0 to 1, with higher values indicating better performance.
+    // Commonly NDCG is expressed as NDCG@k, in which the metric is calculated
+    // over only the top k results returned.
+    // The @k metrics help identify at what point the relevant results start to fall off:
+    // e.g. an NDCG@1 of 0.9 and an NDCG@3 of 0.5 may indicate that the first result returned is usually
+    // very high quality, whereas result quality drops off quickly after the first result.
     let mut ndcg = Vec::new();
     for idx in 1..(hits.len() + 1) {
         let hits_at_k = hits[0..idx].to_vec();
-        let ideal_at_k = filled_ideal[0..idx].to_vec();
+        let ideal_at_k = ideal[0..idx].to_vec();
         let at_k = dcg(hits_at_k.clone()) / dcg(ideal_at_k.clone());
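
As a sanity check on the arithmetic in the comment above, here is a minimal standalone sketch of the NDCG@5 computation for that worked example. The commit does not show the body of dcg, so this assumes the conventional log2 rank discount; names and values are illustrative only.

fn dcg(hits: &[usize]) -> f32 {
    // Each result contributes rel / log2(rank + 1), with 1-based ranks.
    hits.iter()
        .enumerate()
        .map(|(i, &rel)| rel as f32 / ((i + 2) as f32).log2())
        .sum()
}

fn main() {
    let actual = [1, 0, 1, 0, 1]; // results 1, 3 and 5 were relevant
    let ideal = [1, 1, 1, 0, 0]; // the 3 relevant results ranked first
    println!("NDCG@5 = {:.3}", dcg(&actual) / dcg(&ideal)); // prints ~0.885
}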
@@ -235,7 +241,24 @@ fn evaluate_ndcg(
     ndcg
 }
-// fn evaluate_map(eval_query: EvaluationQuery, search_results: Vec<SearchResult>, k: usize) -> f32 {}
+fn evaluate_map(hits: Vec<usize>) -> Vec<f32> {
+    let mut map_at_k = Vec::new();
+    let non_zero = hits.iter().sum::<usize>() as f32;
+    if non_zero == 0.0 {
+        return vec![0.0; hits.len()];
+    }
+    let mut rolling_non_zero = 0.0;
+    let mut rolling_map = 0.0;
+    for (idx, h) in hits.into_iter().enumerate() {
+        // Precision counts toward average precision only at ranks holding a relevant result.
+        if h > 0 {
+            rolling_non_zero += h as f32;
+            rolling_map += rolling_non_zero / (idx + 1) as f32;
+        }
+        map_at_k.push(rolling_map / non_zero);
+    }
+    map_at_k
+}
 fn init_logger() {
     env_logger::init();
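
For intuition, tracing evaluate_map on the same hit vector used in the NDCG comment (relevant results at ranks 1, 3 and 5) yields a running MAP@k per rank. The expected values below are hand-computed for this sketch, not output from the eval suite.

let map = evaluate_map(vec![1, 0, 1, 0, 1]);
// Precision at the hit ranks is 1/1, 2/3 and 3/5, and there are 3 hits total,
// so each prefix averages the precision accumulated so far over those 3 hits:
// MAP@1..=5 = [0.333, 0.333, 0.556, 0.556, 0.756]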
@@ -253,7 +276,7 @@ async fn evaluate_repo(
         .update(cx, |index, cx| index.index_project(project.clone(), cx))
         .await?;
     let index_time = index_t0.elapsed();
-    println!("Time to Index: {:?}", index_time.as_secs());
+    println!("Time to Index: {:?}", index_time.as_millis());
     for query in query_matches {
         // Query each match in order
@@ -264,17 +287,22 @@ async fn evaluate_repo(
             })
             .await?;
         let search_time = search_t0.elapsed();
-        println!("Time to Search: {:?}", search_time.as_secs());
+        println!("Time to Search: {:?}", search_time.as_millis());
+        // Get Hits/Ideal
+        let k = 10;
+        let (ideal, hits) = self::get_hits(query, search_results, k, cx);
         // Evaluate ndcg@k, for k = 1, 3, 5, 10
-        let ndcg = evaluate_ndcg(query, search_results, 10, cx);
+        let ndcg = evaluate_ndcg(hits.clone(), ideal);
         println!("NDCG: {:?}", ndcg);
+        // Evaluate map@k, for k = 1, 3, 5, 10
+        let map = evaluate_map(hits);
+        println!("MAP: {:?}", map);
         // Evaluate span count
         // Evaluate token count
         // Evaluate time to index
         // Evaluate time to search
     }
     anyhow::Ok(())
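
Taken together, the restructure splits each per-query evaluation into one bookkeeping step and two scoring steps. A rough sketch of the resulting flow, with placeholder values rather than the crate's real query and search types:

// hits marks which of the top k results are relevant; ideal is the best-case ordering.
let (ideal, hits) = get_hits(query, search_results, 10, cx);
let ndcg = evaluate_ndcg(hits.clone(), ideal); // NDCG@1 through NDCG@k
let map = evaluate_map(hits); // MAP@1 through MAP@k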