Add support for judge repetitions in eval (#28811)
Release Notes:

- N/A

---------

Co-authored-by: Thomas <thomas@zed.dev>
This commit is contained in:
parent
5d3718df2d
commit
102ea6ac79
2 changed files with 51 additions and 20 deletions
|
@ -42,6 +42,9 @@ struct Args {
|
|||
/// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
|
||||
#[arg(long, value_delimiter = ',')]
|
||||
languages: Option<Vec<String>>,
|
||||
/// How many times to run the judge on each example run.
|
||||
#[arg(long, default_value = "3")]
|
||||
judge_repetitions: u32,
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
@ -203,18 +206,23 @@ fn main() {
|
|||
example.setup().await?;
|
||||
}
|
||||
|
||||
let judge_repetitions = args.judge_repetitions;
|
||||
let tasks = examples
|
||||
.into_iter()
|
||||
.map(|example| {
|
||||
let app_state = app_state.clone();
|
||||
let model = model.clone();
|
||||
cx.spawn(async move |cx| {
|
||||
(run_example(&example, model, app_state, cx).await, example)
|
||||
(
|
||||
run_example(&example, model, app_state, judge_repetitions, cx).await,
|
||||
example,
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let results: Vec<(Result<JudgeOutput>, Example)> = future::join_all(tasks).await;
|
||||
let results: Vec<(Result<Vec<Result<JudgeOutput>>>, Example)> =
|
||||
future::join_all(tasks).await;
|
||||
|
||||
println!("\n\n");
|
||||
println!("========================================");
|
||||
|
@ -229,16 +237,25 @@ fn main() {
|
|||
Err(err) => {
|
||||
println!("💥 {}{:?}", example.log_prefix, err);
|
||||
}
|
||||
Ok(judge_output) => {
|
||||
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
|
||||
Ok(judge_results) => {
|
||||
for judge_result in judge_results {
|
||||
match judge_result {
|
||||
Ok(judge_output) => {
|
||||
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
|
||||
|
||||
println!(
|
||||
"{} {}{}",
|
||||
SCORES[judge_output.score.min(5) as usize],
|
||||
example.log_prefix,
|
||||
judge_output.score,
|
||||
);
|
||||
judge_scores.push(judge_output.score);
|
||||
println!(
|
||||
"{} {}{}",
|
||||
SCORES[judge_output.score.min(5) as usize],
|
||||
example.log_prefix,
|
||||
judge_output.score,
|
||||
);
|
||||
judge_scores.push(judge_output.score);
|
||||
}
|
||||
Err(err) => {
|
||||
println!("💥 {}{:?}", example.log_prefix, err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
println!(
|
||||
|
@ -266,12 +283,18 @@ async fn run_example(
|
|||
example: &Example,
|
||||
model: Arc<dyn LanguageModel>,
|
||||
app_state: Arc<AgentAppState>,
|
||||
judge_repetitions: u32,
|
||||
cx: &mut AsyncApp,
|
||||
) -> Result<JudgeOutput> {
|
||||
) -> Result<Vec<Result<JudgeOutput>>> {
|
||||
cx.update(|cx| example.run(model.clone(), app_state, cx))?
|
||||
.await?;
|
||||
let diff = example.repository_diff().await?;
|
||||
example.judge(model, diff, cx).await
|
||||
|
||||
let judge_tasks = (0..judge_repetitions)
|
||||
.map(|round| example.judge(model.clone(), diff.clone(), round, cx))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Ok(future::join_all(judge_tasks).await)
|
||||
}
|
||||
|
||||
fn list_all_examples() -> Result<Vec<PathBuf>> {
|
||||
|
|
|
@ -58,6 +58,8 @@ pub struct Example {
|
|||
pub criteria: String,
|
||||
/// Markdown output file to append to
|
||||
pub output_file: Option<Arc<Mutex<File>>>,
|
||||
/// Path to the output run directory.
|
||||
pub run_dir: PathBuf,
|
||||
/// Path to markdown output file
|
||||
pub output_file_path: PathBuf,
|
||||
/// Prefix used for logging that identifies this example
|
||||
|
@ -103,6 +105,7 @@ impl Example {
|
|||
base: toml::from_str(&fs::read_to_string(&base_path)?)?,
|
||||
prompt: fs::read_to_string(prompt_path.clone())?,
|
||||
criteria: fs::read_to_string(criteria_path.clone())?,
|
||||
run_dir: run_dir.to_path_buf(),
|
||||
output_file: None,
|
||||
output_file_path,
|
||||
log_prefix: name,
|
||||
|
@ -425,6 +428,10 @@ impl Example {
|
|||
println!("{}Getting repository diff", this.log_prefix);
|
||||
let repository_diff = this.repository_diff().await?;
|
||||
|
||||
let repository_diff_path = this.run_dir.join(format!("{}.diff", this.name));
|
||||
let mut repository_diff_output_file = File::create(&repository_diff_path)?;
|
||||
writeln!(&mut repository_diff_output_file, "{}", &repository_diff).log_err();
|
||||
|
||||
println!("{}Getting diagnostics", this.log_prefix);
|
||||
let diagnostics = cx
|
||||
.update(move |cx| {
|
||||
|
@ -456,6 +463,7 @@ impl Example {
|
|||
&self,
|
||||
model: Arc<dyn LanguageModel>,
|
||||
repository_diff: String,
|
||||
judge_repetitions: u32,
|
||||
cx: &AsyncApp,
|
||||
) -> Result<JudgeOutput> {
|
||||
let judge_prompt = include_str!("judge_prompt.hbs");
|
||||
|
@ -483,14 +491,14 @@ impl Example {
|
|||
|
||||
let response = send_language_model_request(model, request, cx).await?;
|
||||
|
||||
let output_file_ref = self.output_file();
|
||||
let mut output_file = output_file_ref.lock().unwrap();
|
||||
let judge_file_path = self.run_dir.join(format!(
|
||||
"{}_judge_{}.md",
|
||||
self.name, // This is the eval_name
|
||||
judge_repetitions
|
||||
));
|
||||
|
||||
writeln!(&mut output_file, "\n\n").log_err();
|
||||
writeln!(&mut output_file, "========================================").log_err();
|
||||
writeln!(&mut output_file, " JUDGE OUTPUT ").log_err();
|
||||
writeln!(&mut output_file, "========================================").log_err();
|
||||
writeln!(&mut output_file, "\n{}", &response).log_err();
|
||||
let mut judge_output_file = File::create(&judge_file_path)?;
|
||||
writeln!(&mut judge_output_file, "{}", &response).log_err();
|
||||
|
||||
parse_judge_output(&response)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue