Add support for judge repetitions in eval (#28811)

Release Notes:

- N/A

---------

Co-authored-by: Thomas <thomas@zed.dev>
This commit is contained in:
Michael Sloan 2025-04-15 17:18:02 -06:00 committed by GitHub
parent 5d3718df2d
commit 102ea6ac79
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 51 additions and 20 deletions

View file

@ -42,6 +42,9 @@ struct Args {
/// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run. /// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
#[arg(long, value_delimiter = ',')] #[arg(long, value_delimiter = ',')]
languages: Option<Vec<String>>, languages: Option<Vec<String>>,
/// How many times to run the judge on each example run.
#[arg(long, default_value = "3")]
judge_repetitions: u32,
} }
fn main() { fn main() {
@ -203,18 +206,23 @@ fn main() {
example.setup().await?; example.setup().await?;
} }
let judge_repetitions = args.judge_repetitions;
let tasks = examples let tasks = examples
.into_iter() .into_iter()
.map(|example| { .map(|example| {
let app_state = app_state.clone(); let app_state = app_state.clone();
let model = model.clone(); let model = model.clone();
cx.spawn(async move |cx| { cx.spawn(async move |cx| {
(run_example(&example, model, app_state, cx).await, example) (
run_example(&example, model, app_state, judge_repetitions, cx).await,
example,
)
}) })
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await; let results: Vec<(Result<Vec<Result<JudgeOutput>>>, Example)> =
future::join_all(tasks).await;
println!("\n\n"); println!("\n\n");
println!("========================================"); println!("========================================");
@ -229,6 +237,9 @@ fn main() {
Err(err) => { Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err); println!("💥 {}{:?}", example.log_prefix, err);
} }
Ok(judge_results) => {
for judge_result in judge_results {
match judge_result {
Ok(judge_output) => { Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
@ -240,6 +251,12 @@ fn main() {
); );
judge_scores.push(judge_output.score); judge_scores.push(judge_output.score);
} }
Err(err) => {
println!("💥 {}{:?}", example.log_prefix, err);
}
}
}
}
} }
println!( println!(
"{} > {}", "{} > {}",
@ -266,12 +283,18 @@ async fn run_example(
example: &Example, example: &Example,
model: Arc<dyn LanguageModel>, model: Arc<dyn LanguageModel>,
app_state: Arc<AgentAppState>, app_state: Arc<AgentAppState>,
judge_repetitions: u32,
cx: &mut AsyncApp, cx: &mut AsyncApp,
) -> Result<JudgeOutput> { ) -> Result<Vec<Result<JudgeOutput>>> {
cx.update(|cx| example.run(model.clone(), app_state, cx))? cx.update(|cx| example.run(model.clone(), app_state, cx))?
.await?; .await?;
let diff = example.repository_diff().await?; let diff = example.repository_diff().await?;
example.judge(model, diff, cx).await
let judge_tasks = (0..judge_repetitions)
.map(|round| example.judge(model.clone(), diff.clone(), round, cx))
.collect::<Vec<_>>();
Ok(future::join_all(judge_tasks).await)
} }
fn list_all_examples() -> Result<Vec<PathBuf>> { fn list_all_examples() -> Result<Vec<PathBuf>> {

View file

@ -58,6 +58,8 @@ pub struct Example {
pub criteria: String, pub criteria: String,
/// Markdown output file to append to /// Markdown output file to append to
pub output_file: Option<Arc<Mutex<File>>>, pub output_file: Option<Arc<Mutex<File>>>,
/// Path to the output run directory.
pub run_dir: PathBuf,
/// Path to markdown output file /// Path to markdown output file
pub output_file_path: PathBuf, pub output_file_path: PathBuf,
/// Prefix used for logging that identifies this example /// Prefix used for logging that identifies this example
@ -103,6 +105,7 @@ impl Example {
base: toml::from_str(&fs::read_to_string(&base_path)?)?, base: toml::from_str(&fs::read_to_string(&base_path)?)?,
prompt: fs::read_to_string(prompt_path.clone())?, prompt: fs::read_to_string(prompt_path.clone())?,
criteria: fs::read_to_string(criteria_path.clone())?, criteria: fs::read_to_string(criteria_path.clone())?,
run_dir: run_dir.to_path_buf(),
output_file: None, output_file: None,
output_file_path, output_file_path,
log_prefix: name, log_prefix: name,
@ -425,6 +428,10 @@ impl Example {
println!("{}Getting repository diff", this.log_prefix); println!("{}Getting repository diff", this.log_prefix);
let repository_diff = this.repository_diff().await?; let repository_diff = this.repository_diff().await?;
let repository_diff_path = this.run_dir.join(format!("{}.diff", this.name));
let mut repository_diff_output_file = File::create(&repository_diff_path)?;
writeln!(&mut repository_diff_output_file, "{}", &repository_diff).log_err();
println!("{}Getting diagnostics", this.log_prefix); println!("{}Getting diagnostics", this.log_prefix);
let diagnostics = cx let diagnostics = cx
.update(move |cx| { .update(move |cx| {
@ -456,6 +463,7 @@ impl Example {
&self, &self,
model: Arc<dyn LanguageModel>, model: Arc<dyn LanguageModel>,
repository_diff: String, repository_diff: String,
judge_repetitions: u32,
cx: &AsyncApp, cx: &AsyncApp,
) -> Result<JudgeOutput> { ) -> Result<JudgeOutput> {
let judge_prompt = include_str!("judge_prompt.hbs"); let judge_prompt = include_str!("judge_prompt.hbs");
@ -483,14 +491,14 @@ impl Example {
let response = send_language_model_request(model, request, cx).await?; let response = send_language_model_request(model, request, cx).await?;
let output_file_ref = self.output_file(); let judge_file_path = self.run_dir.join(format!(
let mut output_file = output_file_ref.lock().unwrap(); "{}_judge_{}.md",
self.name, // This is the eval_name
judge_repetitions
));
writeln!(&mut output_file, "\n\n").log_err(); let mut judge_output_file = File::create(&judge_file_path)?;
writeln!(&mut output_file, "========================================").log_err(); writeln!(&mut judge_output_file, "{}", &response).log_err();
writeln!(&mut output_file, " JUDGE OUTPUT ").log_err();
writeln!(&mut output_file, "========================================").log_err();
writeln!(&mut output_file, "\n{}", &response).log_err();
parse_judge_output(&response) parse_judge_output(&response)
} }