Agent Eval: Initial support for running examples repeatedly (#28844)

This is not ideal, as it creates a separate worktree for each repetition.

Release Notes:

- N/A
This commit is contained in:
Michael Sloan 2025-04-16 00:35:55 -06:00 committed by GitHub
parent 609895d95f
commit 9a9f2e71ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 25 additions and 10 deletions

View file

@ -44,6 +44,10 @@ struct Args {
model: String, model: String,
#[arg(long, value_delimiter = ',')] #[arg(long, value_delimiter = ',')]
languages: Option<Vec<String>>, languages: Option<Vec<String>>,
/// How many times to run each example. Note that this is currently not very efficient as N
/// worktrees will be created for the examples.
#[arg(long, default_value = "1")]
repetitions: u32,
/// How many times to run the judge on each example run. /// How many times to run the judge on each example run.
#[arg(long, default_value = "3")] #[arg(long, default_value = "3")]
judge_repetitions: u32, judge_repetitions: u32,
@ -146,6 +150,13 @@ fn main() {
continue; continue;
} }
// TODO: This creates a worktree per repetition. Ideally these examples should
// either be run sequentially on the same worktree, or reuse worktrees when there
// are more examples to run than the concurrency limit.
for repetition_number in 0..args.repetitions {
let mut example = example.clone();
example.set_repetition_number(repetition_number);
let name_len = example.name.len(); let name_len = example.name.len();
if name_len > max_name_width { if name_len > max_name_width {
max_name_width = example.name.len(); max_name_width = example.name.len();
@ -153,6 +164,7 @@ fn main() {
examples.push(example); examples.push(example);
} }
}
println!("Skipped examples: {}\n", skipped.join(", ")); println!("Skipped examples: {}\n", skipped.join(", "));

View file

@ -94,11 +94,7 @@ impl Example {
let base_path = dir_path.join("base.toml"); let base_path = dir_path.join("base.toml");
let prompt_path = dir_path.join("prompt.md"); let prompt_path = dir_path.join("prompt.md");
let criteria_path = dir_path.join("criteria.md"); let criteria_path = dir_path.join("criteria.md");
let output_file_path = run_dir.join(format!("{}.md", name));
let output_file_path = run_dir.join(format!(
"{}.md",
dir_path.file_name().unwrap().to_str().unwrap()
));
Ok(Example { Ok(Example {
name: name.clone(), name: name.clone(),
@ -112,6 +108,13 @@ impl Example {
}) })
} }
/// Tags this example with the index of the repetition it belongs to.
///
/// Repetition `0` leaves the example untouched. For any later repetition the index is
/// appended to `name` as a `-<n>` suffix, and `output_file_path` is recomputed as
/// `<suffixed name>.md` inside `run_dir`.
pub fn set_repetition_number(&mut self, repetition_number: u32) {
    // The first repetition keeps the original name and output path.
    if repetition_number == 0 {
        return;
    }

    let suffixed_name = format!("{}-{}", self.name, repetition_number);
    self.output_file_path = self.run_dir.join(format!("{}.md", suffixed_name));
    self.name = suffixed_name;
}
pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) { pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) {
self.log_prefix = format!( self.log_prefix = format!(
"{}{:<width$}\x1b[0m | ", "{}{:<width$}\x1b[0m | ",