eval: Improve readability with colors and alignment (#28761)

![CleanShot 2025-04-15 at 10 35 39@2x](https://github.com/user-attachments/assets/495d96fb-fe2f-478b-a9d6-678c1184db9a)


Release Notes:

- N/A
This commit is contained in:
Agus Zubiaga 2025-04-15 07:50:01 -06:00 committed by GitHub
parent 2b89b97cd1
commit e4cf7fe8f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 56 deletions

View file

@ -26,6 +26,7 @@ use settings::{Settings, SettingsStore};
use std::collections::HashSet; use std::collections::HashSet;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::Arc; use std::sync::Arc;
use std::usize;
use util::ResultExt as _; use util::ResultExt as _;
pub const RUNS_DIR: &str = "./crates/eval/runs"; pub const RUNS_DIR: &str = "./crates/eval/runs";
@ -97,8 +98,27 @@ fn main() {
std::fs::create_dir_all(&run_dir)?; std::fs::create_dir_all(&run_dir)?;
let mut examples = Vec::new(); let mut examples = Vec::new();
for example_path in example_paths {
let example = Example::load_from_directory(&example_path, &run_dir)?; const COLORS: [&str; 12] = [
"\x1b[31m", // Red
"\x1b[32m", // Green
"\x1b[33m", // Yellow
"\x1b[34m", // Blue
"\x1b[35m", // Magenta
"\x1b[36m", // Cyan
"\x1b[91m", // Bright Red
"\x1b[92m", // Bright Green
"\x1b[93m", // Bright Yellow
"\x1b[94m", // Bright Blue
"\x1b[95m", // Bright Magenta
"\x1b[96m", // Bright Cyan
];
let mut max_name_width = 0;
let mut skipped = Vec::new();
for example_path in &example_paths {
let example = Example::load_from_directory(example_path, &run_dir)?;
if !example if !example
.base .base
@ -106,25 +126,49 @@ fn main() {
.as_ref() .as_ref()
.map_or(false, |lang| languages.contains(lang)) .map_or(false, |lang| languages.contains(lang))
{ {
println!("Skipping {}", example.name); skipped.push(example.name);
continue; continue;
} }
println!("{}> Logging to {:?}", example.name, example.log_file_path); let name_len = example.name.len();
if name_len > max_name_width {
max_name_width = example.name.len();
}
examples.push(example); examples.push(example);
} }
let mut repo_urls = HashSet::new();
println!("Skipped examples: {}\n", skipped.join(", "));
if examples.is_empty() {
eprintln!("Filter matched no examples");
return cx.update(|cx| cx.quit());
}
let mut repo_urls = HashSet::new();
let mut clone_tasks = Vec::new(); let mut clone_tasks = Vec::new();
for example in examples.iter() { for (i, example) in examples.iter_mut().enumerate() {
let color = COLORS[i % COLORS.len()].to_string();
example.set_log_prefix_style(&color, max_name_width);
println!(
"{}Logging to: {}",
example.log_prefix,
example.output_file_path.display()
);
let repo_url = example.base.url.clone(); let repo_url = example.base.url.clone();
if repo_urls.insert(repo_url.clone()) { if repo_urls.insert(repo_url.clone()) {
let repo_path = repo_path_for_url(&repo_url); let repo_path = repo_path_for_url(&repo_url);
if !repo_path.join(".git").is_dir() { if !repo_path.join(".git").is_dir() {
println!("Cloning: {}", repo_url); println!(
"{:<width$} < {}",
"↓ Cloning",
repo_url,
width = max_name_width
);
let git_task = cx.spawn(async move |_cx| { let git_task = cx.spawn(async move |_cx| {
std::fs::create_dir_all(&repo_path)?; std::fs::create_dir_all(&repo_path)?;
@ -134,7 +178,12 @@ fn main() {
clone_tasks.push(git_task); clone_tasks.push(git_task);
} else { } else {
println!("Already cloned: {}", repo_url); println!(
"{:<width$} < {}",
"✔︎ Already cloned",
repo_url,
width = max_name_width
);
let actual_origin = let actual_origin =
run_git(&repo_path, &["remote", "get-url", "origin"]).await?; run_git(&repo_path, &["remote", "get-url", "origin"]).await?;
@ -177,23 +226,27 @@ fn main() {
let mut judge_scores = Vec::new(); let mut judge_scores = Vec::new();
for (result, example) in results { for (result, example) in results {
println!("📜 {:<30}: {:?}", example.name, example.log_file_path);
match result { match result {
Err(err) => { Err(err) => {
println!("💥 {:<30}: {:?}", example.name, err); println!("💥 {}{:?}", example.log_prefix, err);
} }
Ok(judge_output) => { Ok(judge_output) => {
const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"]; const SCORES: [&str; 6] = ["💀", "😭", "😔", "😐", "🙂", "🤩"];
println!( println!(
"{} {:<30}: {}", "{} {}{}",
SCORES[judge_output.score.min(5) as usize], SCORES[judge_output.score.min(5) as usize],
example.name, example.log_prefix,
judge_output.score, judge_output.score,
); );
judge_scores.push(judge_output.score); judge_scores.push(judge_output.score);
} }
} }
println!(
"{} > {}",
" ".repeat(max_name_width),
example.output_file_path.display()
);
} }
let score_count = judge_scores.len(); let score_count = judge_scores.len();

View file

@ -56,10 +56,12 @@ pub struct Example {
pub prompt: String, pub prompt: String,
/// Content of `criteria.md` /// Content of `criteria.md`
pub criteria: String, pub criteria: String,
/// Markdown log file to append to /// Markdown output file to append to
pub log_file: Arc<Mutex<File>>, pub output_file: Arc<Mutex<File>>,
/// Path to markdown log file /// Path to markdown output file
pub log_file_path: PathBuf, pub output_file_path: PathBuf,
/// Prefix used for logging that identifies this example
pub log_prefix: String,
} }
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
@ -86,28 +88,41 @@ pub struct JudgeOutput {
impl Example { impl Example {
/// Load an example from a directory containing base.toml, prompt.md, and criteria.md /// Load an example from a directory containing base.toml, prompt.md, and criteria.md
pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> { pub fn load_from_directory(dir_path: &Path, run_dir: &Path) -> Result<Self> {
let name = dir_path.file_name().unwrap().to_string_lossy().to_string(); let name = Self::name_from_path(dir_path);
let base_path = dir_path.join("base.toml"); let base_path = dir_path.join("base.toml");
let prompt_path = dir_path.join("prompt.md"); let prompt_path = dir_path.join("prompt.md");
let criteria_path = dir_path.join("criteria.md"); let criteria_path = dir_path.join("criteria.md");
let log_file_path = run_dir.join(format!( let output_file_path = run_dir.join(format!(
"{}.md", "{}.md",
dir_path.file_name().unwrap().to_str().unwrap() dir_path.file_name().unwrap().to_str().unwrap()
)); ));
let log_file = Arc::new(Mutex::new(File::create(&log_file_path).unwrap())); let output_file = Arc::new(Mutex::new(File::create(&output_file_path).unwrap()));
println!("{}> Logging to {:?}", name, log_file_path);
Ok(Example { Ok(Example {
name, name: name.clone(),
base: toml::from_str(&fs::read_to_string(&base_path)?)?, base: toml::from_str(&fs::read_to_string(&base_path)?)?,
prompt: fs::read_to_string(prompt_path.clone())?, prompt: fs::read_to_string(prompt_path.clone())?,
criteria: fs::read_to_string(criteria_path.clone())?, criteria: fs::read_to_string(criteria_path.clone())?,
log_file, output_file,
log_file_path, output_file_path,
log_prefix: name,
}) })
} }
/// Rebuilds `log_prefix` from this example's name.
///
/// The resulting prefix is `color`, then the name left-aligned and
/// space-padded to at least `name_width` characters, then the ANSI reset
/// sequence (`\x1b[0m`) and a `" | "` separator.
///
/// `color` is written verbatim in front of the name, e.g. an ANSI color
/// escape sequence.
pub fn set_log_prefix_style(&mut self, color: &str, name_width: usize) {
    let name = &self.name;
    self.log_prefix = format!("{color}{name:<name_width$}\x1b[0m | ");
}
/// Derives an example name from a path: the final path component,
/// converted lossily to UTF-8.
///
/// # Panics
///
/// Panics if `path` has no final component, i.e. `Path::file_name`
/// returns `None`.
pub fn name_from_path(path: &Path) -> String {
    let last_component = path.file_name().unwrap();
    last_component.to_string_lossy().into_owned()
}
pub fn worktree_path(&self) -> PathBuf { pub fn worktree_path(&self) -> PathBuf {
Path::new(WORKTREES_DIR) Path::new(WORKTREES_DIR)
.canonicalize() .canonicalize()
@ -120,7 +135,7 @@ impl Example {
pub async fn setup(&self) -> Result<()> { pub async fn setup(&self) -> Result<()> {
let repo_path = repo_path_for_url(&self.base.url); let repo_path = repo_path_for_url(&self.base.url);
println!("{}> Fetching", self.name); println!("{}Fetching", self.log_prefix);
run_git( run_git(
&repo_path, &repo_path,
@ -131,7 +146,7 @@ impl Example {
let worktree_path = self.worktree_path(); let worktree_path = self.worktree_path();
if worktree_path.is_dir() { if worktree_path.is_dir() {
println!("{}> Resetting existing worktree", self.name); println!("{}Resetting existing worktree", self.log_prefix);
// TODO: consider including "-x" to remove ignored files. The downside of this is that // TODO: consider including "-x" to remove ignored files. The downside of this is that
// it will also remove build artifacts, and so prevent incremental reuse there. // it will also remove build artifacts, and so prevent incremental reuse there.
@ -139,7 +154,7 @@ impl Example {
run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?; run_git(&worktree_path, &["reset", "--hard", "HEAD"]).await?;
run_git(&worktree_path, &["checkout", &self.base.revision]).await?; run_git(&worktree_path, &["checkout", &self.base.revision]).await?;
} else { } else {
println!("{}> Creating worktree", self.name); println!("{}Creating worktree", self.log_prefix);
let worktree_path_string = worktree_path.to_string_lossy().to_string(); let worktree_path_string = worktree_path.to_string_lossy().to_string();
@ -235,7 +250,7 @@ impl Example {
// TODO: remove this once the diagnostics tool waits for new diagnostics // TODO: remove this once the diagnostics tool waits for new diagnostics
cx.background_executor().timer(Duration::new(5, 0)).await; cx.background_executor().timer(Duration::new(5, 0)).await;
wait_for_lang_server(&lsp_store, this.name.clone(), cx).await?; wait_for_lang_server(&lsp_store, this.log_prefix.clone(), cx).await?;
lsp_store.update(cx, |lsp_store, cx| { lsp_store.update(cx, |lsp_store, cx| {
lsp_open_handle.update(cx, |buffer, cx| { lsp_open_handle.update(cx, |buffer, cx| {
@ -272,11 +287,11 @@ impl Example {
thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx))?; thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx))?;
{ {
let mut log_file = this.log_file.lock().unwrap(); let mut output_file = this.output_file.lock().unwrap();
writeln!(&mut log_file, "👤 USER:").log_err(); writeln!(&mut output_file, "👤 USER:").log_err();
writeln!(&mut log_file, "{}", this.prompt).log_err(); writeln!(&mut output_file, "{}", this.prompt).log_err();
writeln!(&mut log_file, "🤖 ASSISTANT:").log_err(); writeln!(&mut output_file, "🤖 ASSISTANT:").log_err();
log_file.flush().log_err(); output_file.flush().log_err();
} }
let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> = let tool_use_counts: Arc<Mutex<HashMap<Arc<str>, u32>>> =
@ -289,8 +304,8 @@ impl Example {
}); });
let event_handler_task = cx.spawn({ let event_handler_task = cx.spawn({
let log_file = this.log_file.clone(); let output_file = this.output_file.clone();
let name = this.name.clone(); let log_prefix = this.log_prefix.clone();
let tool_use_counts = tool_use_counts.clone(); let tool_use_counts = tool_use_counts.clone();
let thread = thread.downgrade(); let thread = thread.downgrade();
async move |cx| { async move |cx| {
@ -305,7 +320,7 @@ impl Example {
return Err(anyhow!("ThreadEvent channel ended early")); return Err(anyhow!("ThreadEvent channel ended early"));
}; };
let mut log_file = log_file.lock().unwrap(); let mut output_file = output_file.lock().unwrap();
match event { match event {
ThreadEvent::Stopped(reason) => match reason { ThreadEvent::Stopped(reason) => match reason {
@ -324,15 +339,15 @@ impl Example {
break Err(anyhow!(thread_error.clone())); break Err(anyhow!(thread_error.clone()));
} }
ThreadEvent::StreamedAssistantText(_, chunk) => { ThreadEvent::StreamedAssistantText(_, chunk) => {
write!(&mut log_file, "{}", chunk).log_err(); write!(&mut output_file, "{}", chunk).log_err();
} }
ThreadEvent::StreamedAssistantThinking(_, chunk) => { ThreadEvent::StreamedAssistantThinking(_, chunk) => {
write!(&mut log_file, "{}", chunk).log_err(); write!(&mut output_file, "{}", chunk).log_err();
} }
ThreadEvent::UsePendingTools { tool_uses } => { ThreadEvent::UsePendingTools { tool_uses } => {
writeln!(&mut log_file, "\n\nUSING TOOLS:").log_err(); writeln!(&mut output_file, "\n\nUSING TOOLS:").log_err();
for tool_use in tool_uses { for tool_use in tool_uses {
writeln!(&mut log_file, "{}: {}", tool_use.name, tool_use.input) writeln!(&mut output_file, "{}: {}", tool_use.name, tool_use.input)
.log_err(); .log_err();
} }
} }
@ -343,12 +358,12 @@ impl Example {
} => { } => {
if let Some(tool_use) = pending_tool_use { if let Some(tool_use) = pending_tool_use {
let message = format!("TOOL FINISHED: {}", tool_use.name); let message = format!("TOOL FINISHED: {}", tool_use.name);
println!("{name}> {message}"); println!("{}{message}", log_prefix);
writeln!(&mut log_file, "\n{}", message).log_err(); writeln!(&mut output_file, "\n{}", message).log_err();
} }
thread.update(cx, |thread, _cx| { thread.update(cx, |thread, _cx| {
if let Some(tool_result) = thread.tool_result(&tool_use_id) { if let Some(tool_result) = thread.tool_result(&tool_use_id) {
writeln!(&mut log_file, "\n{}\n", tool_result.content).log_err(); writeln!(&mut output_file, "\n{}\n", tool_result.content).log_err();
let mut tool_use_counts = tool_use_counts.lock().unwrap(); let mut tool_use_counts = tool_use_counts.lock().unwrap();
*tool_use_counts *tool_use_counts
.entry(tool_result.tool_name.clone()) .entry(tool_result.tool_name.clone())
@ -359,7 +374,7 @@ impl Example {
_ => {} _ => {}
} }
log_file.flush().log_err(); output_file.flush().log_err();
} }
} }
}); });
@ -373,7 +388,7 @@ impl Example {
event_handler_task.await?; event_handler_task.await?;
if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() { if let Some((_, lsp_store)) = lsp_open_handle_and_store.as_ref() {
wait_for_lang_server(lsp_store, this.name.clone(), cx).await?; wait_for_lang_server(lsp_store, this.log_prefix.clone(), cx).await?;
} }
let repository_diff = this.repository_diff().await?; let repository_diff = this.repository_diff().await?;
@ -433,13 +448,13 @@ impl Example {
let response = send_language_model_request(model, request, cx).await?; let response = send_language_model_request(model, request, cx).await?;
let mut log_file = self.log_file.lock().unwrap(); let mut output_file = self.output_file.lock().unwrap();
writeln!(&mut log_file, "\n\n").log_err(); writeln!(&mut output_file, "\n\n").log_err();
writeln!(&mut log_file, "========================================").log_err(); writeln!(&mut output_file, "========================================").log_err();
writeln!(&mut log_file, " JUDGE OUTPUT ").log_err(); writeln!(&mut output_file, " JUDGE OUTPUT ").log_err();
writeln!(&mut log_file, "========================================").log_err(); writeln!(&mut output_file, "========================================").log_err();
writeln!(&mut log_file, "\n{}", &response).log_err(); writeln!(&mut output_file, "\n{}", &response).log_err();
parse_judge_output(&response) parse_judge_output(&response)
} }
@ -453,7 +468,7 @@ impl Example {
fn wait_for_lang_server( fn wait_for_lang_server(
lsp_store: &Entity<LspStore>, lsp_store: &Entity<LspStore>,
name: String, log_prefix: String,
cx: &mut AsyncApp, cx: &mut AsyncApp,
) -> Task<Result<()>> { ) -> Task<Result<()>> {
if cx if cx
@ -464,13 +479,13 @@ fn wait_for_lang_server(
return Task::ready(anyhow::Ok(())); return Task::ready(anyhow::Ok(()));
} }
println!("{}> ⏵ Waiting for language server", name); println!("{}⏵ Waiting for language server", log_prefix);
let (mut tx, mut rx) = mpsc::channel(1); let (mut tx, mut rx) = mpsc::channel(1);
let subscription = let subscription =
cx.subscribe(&lsp_store, { cx.subscribe(&lsp_store, {
let name = name.clone(); let log_prefix = log_prefix.clone();
move |lsp_store, event, cx| { move |lsp_store, event, cx| {
match event { match event {
project::LspStoreEvent::LanguageServerUpdate { project::LspStoreEvent::LanguageServerUpdate {
@ -482,7 +497,7 @@ fn wait_for_lang_server(
}, },
), ),
.. ..
} => println!("{name}> {message}"), } => println!("{}{message}", log_prefix),
_ => {} _ => {}
} }
@ -496,7 +511,7 @@ fn wait_for_lang_server(
let timeout = cx.background_executor().timer(Duration::new(60 * 5, 0)); let timeout = cx.background_executor().timer(Duration::new(60 * 5, 0));
let result = futures::select! { let result = futures::select! {
_ = rx.next() => { _ = rx.next() => {
println!("{}> ⚑ Language server idle", name); println!("{}⚑ Language server idle", log_prefix);
anyhow::Ok(()) anyhow::Ok(())
}, },
_ = timeout.fuse() => { _ = timeout.fuse() => {
@ -623,7 +638,6 @@ pub async fn send_language_model_request(
while let Some(chunk_result) = stream.stream.next().await { while let Some(chunk_result) = stream.stream.next().await {
match chunk_result { match chunk_result {
Ok(chunk_str) => { Ok(chunk_str) => {
print!("{}", &chunk_str);
full_response.push_str(&chunk_str); full_response.push_str(&chunk_str);
} }
Err(err) => { Err(err) => {