Agent eval: output paths to log files at the end (#28724)

Release Notes:

- N/A
This commit is contained in:
Michael Sloan 2025-04-14 17:04:07 -06:00 committed by GitHub
parent 5f897b0e00
commit 0d6e455bf6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 17 additions and 15 deletions

View file

@@ -110,13 +110,15 @@ fn main() {
continue; continue;
} }
examples.push((example_path, example)); println!("{}> Logging to {:?}", example.name, example.log_file_path);
examples.push(example);
} }
let mut repo_urls = HashSet::new(); let mut repo_urls = HashSet::new();
let mut clone_tasks = Vec::new(); let mut clone_tasks = Vec::new();
for (_, example) in examples.iter() { for example in examples.iter() {
let repo_url = example.base.url.clone(); let repo_url = example.base.url.clone();
if repo_urls.insert(repo_url.clone()) { if repo_urls.insert(repo_url.clone()) {
let repo_path = repo_path_for_url(&repo_url); let repo_path = repo_path_for_url(&repo_url);
@@ -149,25 +151,22 @@ fn main() {
future::join_all(clone_tasks).await; future::join_all(clone_tasks).await;
for (_, example) in examples.iter() { for example in examples.iter() {
example.setup().await?; example.setup().await?;
} }
let tasks = examples let tasks = examples
.into_iter() .into_iter()
.map(|(example_path, example)| { .map(|example| {
let app_state = app_state.clone(); let app_state = app_state.clone();
let model = model.clone(); let model = model.clone();
cx.spawn(async move |cx| { cx.spawn(async move |cx| {
( (run_example(&example, model, app_state, cx).await, example)
example_path,
run_example(example, model, app_state, cx).await,
)
}) })
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let results: Vec<(PathBuf, Result<JudgeOutput>)> = future::join_all(tasks).await; let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
println!("\n\n"); println!("\n\n");
println!("========================================"); println!("========================================");
@@ -177,11 +176,11 @@ fn main() {
let mut judge_scores = Vec::new(); let mut judge_scores = Vec::new();
for (example_path, result) in results { for (result, example) in results {
let example_name = example_path.file_name().unwrap().to_string_lossy(); println!("📜 {:<30}: {:?}", example.name, example.log_file_path);
match result { match result {
Err(err) => { Err(err) => {
println!("💥 {:<30}: {:?}", example_name, err); println!("💥 {:<30}: {:?}", example.name, err);
} }
Ok(judge_output) => { Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
@@ -189,7 +188,7 @@ fn main() {
println!( println!(
"{} {:<30}: {}", "{} {:<30}: {}",
SCORES[judge_output.score.min(5) as usize], SCORES[judge_output.score.min(5) as usize],
example_name, example.name,
judge_output.score, judge_output.score,
); );
judge_scores.push(judge_output.score); judge_scores.push(judge_output.score);
@@ -212,7 +211,7 @@ fn main() {
} }
async fn run_example( async fn run_example(
mut example: Example, example: &Example,
model: Arc<dyn LanguageModel>, model: Arc<dyn LanguageModel>,
app_state: Arc<AgentAppState>, app_state: Arc<AgentAppState>,
cx: &mut AsyncApp, cx: &mut AsyncApp,

View file

@@ -58,6 +58,8 @@ pub struct Example {
pub criteria: String, pub criteria: String,
/// Markdown log file to append to /// Markdown log file to append to
pub log_file: Arc<Mutex<File>>, pub log_file: Arc<Mutex<File>>,
/// Path to markdown log file
pub log_file_path: PathBuf,
} }
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -102,6 +104,7 @@ impl Example {
prompt: fs::read_to_string(prompt_path.clone())?, prompt: fs::read_to_string(prompt_path.clone())?,
criteria: fs::read_to_string(criteria_path.clone())?, criteria: fs::read_to_string(criteria_path.clone())?,
log_file, log_file,
log_file_path,
}) })
} }
@@ -400,7 +403,7 @@ impl Example {
} }
pub async fn judge( pub async fn judge(
&mut self, &self,
model: Arc<dyn LanguageModel>, model: Arc<dyn LanguageModel>,
repository_diff: String, repository_diff: String,
cx: &AsyncApp, cx: &AsyncApp,