Agent Eval: Initial support for running examples repeatedly (#28844)

This is not ideal, as it creates a separate worktree for each repetition.

Release Notes:

- N/A
This commit is contained in:
Michael Sloan 2025-04-16 00:35:55 -06:00 committed by GitHub
parent 609895d95f
commit 9a9f2e71ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 25 additions and 10 deletions

View file

@ -44,6 +44,10 @@ struct Args {
model: String,
#[arg(long, value_delimiter = ',')]
languages: Option<Vec<String>>,
/// How many times to run each example. Note that this is currently not very efficient as N
/// worktrees will be created for the examples.
#[arg(long, default_value = "1")]
repetitions: u32,
/// How many times to run the judge on each example run.
#[arg(long, default_value = "3")]
judge_repetitions: u32,
@ -146,12 +150,20 @@ fn main() {
continue;
}
let name_len = example.name.len();
if name_len > max_name_width {
max_name_width = example.name.len();
}
// TODO: This creates a worktree per repetition. Ideally these examples should
// either be run sequentially on the same worktree, or reuse worktrees when there
// are more examples to run than the concurrency limit.
for repetition_number in 0..args.repetitions {
let mut example = example.clone();
example.set_repetition_number(repetition_number);
examples.push(example);
let name_len = example.name.len();
if name_len > max_name_width {
max_name_width = example.name.len();
}
examples.push(example);
}
}
println!("Skipped examples: {}\n", skipped.join(", "));