Add support for judge repetitions in eval (#28811)

Release Notes:

- N/A

---------

Co-authored-by: Thomas <thomas@zed.dev>
This commit is contained in:
Michael Sloan 2025-04-15 17:18:02 -06:00 committed by GitHub
parent 5d3718df2d
commit 102ea6ac79
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 51 additions and 20 deletions

View file

@@ -42,6 +42,9 @@ struct Args {
/// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
#[arg(long, value_delimiter = ',')]
languages: Option<Vec<String>>,
/// How many times to run the judge on each example run.
#[arg(long, default_value = "3")]
judge_repetitions: u32,
}
fn main() {
@@ -203,18 +206,23 @@ fn main() {
example.setup().await?;
}
let judge_repetitions = args.judge_repetitions;
let tasks = examples
.into_iter()
.map(|example| {
let app_state = app_state.clone();
let model = model.clone();
cx.spawn(async move |cx| {
(run_example(&example, model, app_state, cx).await, example)
(
run_example(&example, model, app_state, judge_repetitions, cx).await,
example,
)
})
})
.collect::<Vec<_>>();
let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
let results: Vec<(Result<Vec<Result<JudgeOutput>>>, Example)> =
future::join_all(tasks).await;
println!("\n\n");
println!("========================================");
@@ -229,16 +237,25 @@ fn main() {
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
}
Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
Ok(judge_results) => {
for judge_result in judge_results {
match judge_result {
Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
println!(
"{} {}{}",
SCORES[judge_output.score.min(5) as usize],
example.log_prefix,
judge_output.score,
);
judge_scores.push(judge_output.score);
println!(
"{} {}{}",
SCORES[judge_output.score.min(5) as usize],
example.log_prefix,
judge_output.score,
);
judge_scores.push(judge_output.score);
}
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
}
}
}
}
}
println!(
@@ -266,12 +283,18 @@ async fn run_example(
example: &Example,
model: Arc<dyn LanguageModel>,
app_state: Arc<AgentAppState>,
judge_repetitions: u32,
cx: &mut AsyncApp,
) -> Result<JudgeOutput> {
) -> Result<Vec<Result<JudgeOutput>>> {
cx.update(|cx| example.run(model.clone(), app_state, cx))?
.await?;
let diff = example.repository_diff().await?;
example.judge(model, diff, cx).await
let judge_tasks = (0..judge_repetitions)
.map(|round| example.judge(model.clone(), diff.clone(), round, cx))
.collect::<Vec<_>>();
Ok(future::join_all(judge_tasks).await)
}
fn list_all_examples() -> Result<Vec<PathBuf>> {

View file

@@ -58,6 +58,8 @@ pub struct Example {
pub criteria: String,
/// Markdown output file to append to
pub output_file: Option<Arc<Mutex<File>>>,
/// Path to the output run directory.
pub run_dir: PathBuf,
/// Path to markdown output file
pub output_file_path: PathBuf,
/// Prefix used for logging that identifies this example
@@ -103,6 +105,7 @@ impl Example {
base: toml::from_str(&fs::read_to_string(&base_path)?)?,
prompt: fs::read_to_string(prompt_path.clone())?,
criteria: fs::read_to_string(criteria_path.clone())?,
run_dir: run_dir.to_path_buf(),
output_file: None,
output_file_path,
log_prefix: name,
@@ -425,6 +428,10 @@ impl Example {
println!("{}Getting repository diff", this.log_prefix);
let repository_diff = this.repository_diff().await?;
let repository_diff_path = this.run_dir.join(format!("{}.diff", this.name));
let mut repository_diff_output_file = File::create(&repository_diff_path)?;
writeln!(&mut repository_diff_output_file, "{}", &repository_diff).log_err();
println!("{}Getting diagnostics", this.log_prefix);
let diagnostics = cx
.update(move |cx| {
@@ -456,6 +463,7 @@ impl Example {
&self,
model: Arc<dyn LanguageModel>,
repository_diff: String,
judge_repetitions: u32,
cx: &AsyncApp,
) -> Result<JudgeOutput> {
let judge_prompt = include_str!("judge_prompt.hbs");
@@ -483,14 +491,14 @@ impl Example {
let response = send_language_model_request(model, request, cx).await?;
let output_file_ref = self.output_file();
let mut output_file = output_file_ref.lock().unwrap();
let judge_file_path = self.run_dir.join(format!(
"{}_judge_{}.md",
self.name, // This is the eval_name
judge_repetitions
));
writeln!(&mut output_file, "\n\n").log_err();
writeln!(&mut output_file, "========================================").log_err();
writeln!(&mut output_file, " JUDGE OUTPUT ").log_err();
writeln!(&mut output_file, "========================================").log_err();
writeln!(&mut output_file, "\n{}", &response).log_err();
let mut judge_output_file = File::create(&judge_file_path)?;
writeln!(&mut judge_output_file, "{}", &response).log_err();
parse_judge_output(&response)
}