Agent eval: output paths to log files at the end (#28724)
Release Notes: - N/A
This commit is contained in:
parent
5f897b0e00
commit
0d6e455bf6
2 changed files with 17 additions and 15 deletions
|
@@ -110,13 +110,15 @@ fn main() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
examples.push((example_path, example));
|
println!("{}> Logging to {:?}", example.name, example.log_file_path);
|
||||||
|
|
||||||
|
examples.push(example);
|
||||||
}
|
}
|
||||||
let mut repo_urls = HashSet::new();
|
let mut repo_urls = HashSet::new();
|
||||||
|
|
||||||
let mut clone_tasks = Vec::new();
|
let mut clone_tasks = Vec::new();
|
||||||
|
|
||||||
for (_, example) in examples.iter() {
|
for example in examples.iter() {
|
||||||
let repo_url = example.base.url.clone();
|
let repo_url = example.base.url.clone();
|
||||||
if repo_urls.insert(repo_url.clone()) {
|
if repo_urls.insert(repo_url.clone()) {
|
||||||
let repo_path = repo_path_for_url(&repo_url);
|
let repo_path = repo_path_for_url(&repo_url);
|
||||||
|
@@ -149,25 +151,22 @@ fn main() {
|
||||||
|
|
||||||
future::join_all(clone_tasks).await;
|
future::join_all(clone_tasks).await;
|
||||||
|
|
||||||
for (_, example) in examples.iter() {
|
for example in examples.iter() {
|
||||||
example.setup().await?;
|
example.setup().await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let tasks = examples
|
let tasks = examples
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(example_path, example)| {
|
.map(|example| {
|
||||||
let app_state = app_state.clone();
|
let app_state = app_state.clone();
|
||||||
let model = model.clone();
|
let model = model.clone();
|
||||||
cx.spawn(async move |cx| {
|
cx.spawn(async move |cx| {
|
||||||
(
|
(run_example(&example, model, app_state, cx).await, example)
|
||||||
example_path,
|
|
||||||
run_example(example, model, app_state, cx).await,
|
|
||||||
)
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
let results: Vec<(PathBuf, Result<JudgeOutput>)> = future::join_all(tasks).await;
|
let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
|
||||||
|
|
||||||
println!("\n\n");
|
println!("\n\n");
|
||||||
println!("========================================");
|
println!("========================================");
|
||||||
|
@@ -177,11 +176,11 @@ fn main() {
|
||||||
|
|
||||||
let mut judge_scores = Vec::new();
|
let mut judge_scores = Vec::new();
|
||||||
|
|
||||||
for (example_path, result) in results {
|
for (result, example) in results {
|
||||||
let example_name = example_path.file_name().unwrap().to_string_lossy();
|
println!("📜 {:<30}: {:?}", example.name, example.log_file_path);
|
||||||
match result {
|
match result {
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
println!("💥 {:<30}: {:?}", example_name, err);
|
println!("💥 {:<30}: {:?}", example.name, err);
|
||||||
}
|
}
|
||||||
Ok(judge_output) => {
|
Ok(judge_output) => {
|
||||||
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
|
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
|
||||||
|
@@ -189,7 +188,7 @@ fn main() {
|
||||||
println!(
|
println!(
|
||||||
"{} {:<30}: {}",
|
"{} {:<30}: {}",
|
||||||
SCORES[judge_output.score.min(5) as usize],
|
SCORES[judge_output.score.min(5) as usize],
|
||||||
example_name,
|
example.name,
|
||||||
judge_output.score,
|
judge_output.score,
|
||||||
);
|
);
|
||||||
judge_scores.push(judge_output.score);
|
judge_scores.push(judge_output.score);
|
||||||
|
@@ -212,7 +211,7 @@ fn main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn run_example(
|
async fn run_example(
|
||||||
mut example: Example,
|
example: &Example,
|
||||||
model: Arc<dyn LanguageModel>,
|
model: Arc<dyn LanguageModel>,
|
||||||
app_state: Arc<AgentAppState>,
|
app_state: Arc<AgentAppState>,
|
||||||
cx: &mut AsyncApp,
|
cx: &mut AsyncApp,
|
||||||
|
|
|
@@ -58,6 +58,8 @@ pub struct Example {
|
||||||
pub criteria: String,
|
pub criteria: String,
|
||||||
/// Markdown log file to append to
|
/// Markdown log file to append to
|
||||||
pub log_file: Arc<Mutex<File>>,
|
pub log_file: Arc<Mutex<File>>,
|
||||||
|
/// Path to markdown log file
|
||||||
|
pub log_file_path: PathBuf,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
|
@@ -102,6 +104,7 @@ impl Example {
|
||||||
prompt: fs::read_to_string(prompt_path.clone())?,
|
prompt: fs::read_to_string(prompt_path.clone())?,
|
||||||
criteria: fs::read_to_string(criteria_path.clone())?,
|
criteria: fs::read_to_string(criteria_path.clone())?,
|
||||||
log_file,
|
log_file,
|
||||||
|
log_file_path,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -400,7 +403,7 @@ impl Example {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn judge(
|
pub async fn judge(
|
||||||
&mut self,
|
&self,
|
||||||
model: Arc<dyn LanguageModel>,
|
model: Arc<dyn LanguageModel>,
|
||||||
repository_diff: String,
|
repository_diff: String,
|
||||||
cx: &AsyncApp,
|
cx: &AsyncApp,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue