diff --git a/crates/eval/src/eval.rs b/crates/eval/src/eval.rs index 7831707af2..cf3792343e 100644 --- a/crates/eval/src/eval.rs +++ b/crates/eval/src/eval.rs @@ -42,6 +42,9 @@ struct Args { /// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run. #[arg(long, value_delimiter = ',')] languages: Option<Vec<String>>, + /// How many times to run the judge on each example run. + #[arg(long, default_value = "3")] + judge_repetitions: u32, } fn main() { @@ -203,18 +206,23 @@ fn main() { example.setup().await?; } + let judge_repetitions = args.judge_repetitions; let tasks = examples .into_iter() .map(|example| { let app_state = app_state.clone(); let model = model.clone(); cx.spawn(async move |cx| { - (run_example(&example, model, app_state, cx).await, example) + ( + run_example(&example, model, app_state, judge_repetitions, cx).await, + example, + ) }) }) .collect::<Vec<_>>(); - let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await; + let results: Vec<(Result<Vec<Result<JudgeOutput>>>, Example)> = + future::join_all(tasks).await; println!("\n\n"); println!("========================================"); @@ -229,16 +237,25 @@ fn main() { Err(err) => { println!("💥 {}{:?}", example.log_prefix, err); } - Ok(judge_output) => { - const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; + Ok(judge_results) => { + for judge_result in judge_results { + match judge_result { + Ok(judge_output) => { + const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; - println!( - "{} {}{}", - SCORES[judge_output.score.min(5) as usize], - example.log_prefix, - judge_output.score, - ); - judge_scores.push(judge_output.score); + println!( + "{} {}{}", + SCORES[judge_output.score.min(5) as usize], + example.log_prefix, + judge_output.score, + ); + judge_scores.push(judge_output.score); + } + Err(err) => { + println!("💥 {}{:?}", example.log_prefix, err); + } + } + } } } println!( @@ -266,12 +283,18 @@ async fn run_example( example: &Example, model: Arc<dyn LanguageModel>, app_state: Arc<AgentAppState>, +
judge_repetitions: u32, cx: &mut AsyncApp, -) -> Result<JudgeOutput> { +) -> Result<Vec<Result<JudgeOutput>>> { cx.update(|cx| example.run(model.clone(), app_state, cx))? .await?; let diff = example.repository_diff().await?; - example.judge(model, diff, cx).await + + let judge_tasks = (0..judge_repetitions) + .map(|round| example.judge(model.clone(), diff.clone(), round, cx)) + .collect::<Vec<_>>(); + + Ok(future::join_all(judge_tasks).await) } fn list_all_examples() -> Result<Vec<String>> { diff --git a/crates/eval/src/example.rs b/crates/eval/src/example.rs index bbd00bb449..dce961504b 100644 --- a/crates/eval/src/example.rs +++ b/crates/eval/src/example.rs @@ -58,6 +58,8 @@ pub struct Example { pub criteria: String, /// Markdown output file to append to pub output_file: Option<Arc<Mutex<File>>>, + /// Path to the output run directory. + pub run_dir: PathBuf, /// Path to markdown output file pub output_file_path: PathBuf, /// Prefix used for logging that identifies this example @@ -103,6 +105,7 @@ impl Example { base: toml::from_str(&fs::read_to_string(&base_path)?)?, prompt: fs::read_to_string(prompt_path.clone())?, criteria: fs::read_to_string(criteria_path.clone())?, + run_dir: run_dir.to_path_buf(), output_file: None, output_file_path, log_prefix: name, @@ -425,6 +428,10 @@ impl Example { println!("{}Getting repository diff", this.log_prefix); let repository_diff = this.repository_diff().await?; + let repository_diff_path = this.run_dir.join(format!("{}.diff", this.name)); + let mut repository_diff_output_file = File::create(&repository_diff_path)?; + writeln!(&mut repository_diff_output_file, "{}", &repository_diff).log_err(); + println!("{}Getting diagnostics", this.log_prefix); let diagnostics = cx .update(move |cx| { @@ -456,6 +463,7 @@ impl Example { &self, model: Arc<dyn LanguageModel>, repository_diff: String, + judge_repetitions: u32, cx: &AsyncApp, ) -> Result<JudgeOutput> { let judge_prompt = include_str!("judge_prompt.hbs"); @@ -483,14 +491,14 @@ impl Example { let response = send_language_model_request(model, request, cx).await?; - let
output_file_ref = self.output_file(); - let mut output_file = output_file_ref.lock().unwrap(); + let judge_file_path = self.run_dir.join(format!( + "{}_judge_{}.md", + self.name, // This is the eval_name + judge_repetitions + )); - writeln!(&mut output_file, "\n\n").log_err(); - writeln!(&mut output_file, "========================================").log_err(); - writeln!(&mut output_file, " JUDGE OUTPUT ").log_err(); - writeln!(&mut output_file, "========================================").log_err(); - writeln!(&mut output_file, "\n{}", &response).log_err(); + let mut judge_output_file = File::create(&judge_file_path)?; + writeln!(&mut judge_output_file, "{}", &response).log_err(); parse_judge_output(&response) }