Move assistant_evals to agent_evals and remove Judge logic (#28233)

Release Notes:
- N/A

parent 500d8f2943
commit f3274851d9

13 changed files with 73 additions and 638 deletions
crates/agent_eval/src/eval.rs  +384  (new file)
@@ -0,0 +1,384 @@
use crate::git_commands::{run_git, setup_temp_repo};
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
use crate::{get_exercise_language, get_exercise_name};
use agent::RequestKind;
use anyhow::{Result, anyhow};
use collections::HashMap;
use gpui::{App, Task};
use language_model::{LanguageModel, TokenUsage};
use serde::{Deserialize, Serialize};
use std::{
    fs,
    io::Write,
    path::{Path, PathBuf},
    sync::Arc,
    time::{Duration, SystemTime},
};

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EvalResult {
    pub exercise_name: String,
    pub diff: String,
    pub assistant_response: String,
    pub elapsed_time_ms: u128,
    pub timestamp: u128,
    // Token usage fields
    pub input_tokens: usize,
    pub output_tokens: usize,
    pub total_tokens: usize,
    pub tool_use_counts: usize,
}

pub struct EvalOutput {
    pub diff: String,
    pub last_message: String,
    pub elapsed_time: Duration,
    pub assistant_response_count: usize,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub token_usage: TokenUsage,
}

#[derive(Deserialize)]
pub struct EvalSetup {
    pub url: String,
    pub base_sha: String,
}

pub struct Eval {
    pub repo_path: PathBuf,
    pub eval_setup: EvalSetup,
    pub user_prompt: String,
}

impl Eval {
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub async fn load(_name: String, path: PathBuf, repos_dir: &Path) -> Result<Self> {
        let prompt_path = path.join("prompt.txt");
        let user_prompt = smol::unblock(|| std::fs::read_to_string(prompt_path)).await?;
        let setup_path = path.join("setup.json");
        let setup_contents = smol::unblock(|| std::fs::read_to_string(setup_path)).await?;
        let eval_setup = serde_json_lenient::from_str_lenient::<EvalSetup>(&setup_contents)?;

        // This internal function lives inside `load` since it's only used here.
        fn repo_dir_name(url: &str) -> String {
            url.trim_start_matches("https://")
                .replace(|c: char| !c.is_alphanumeric(), "_")
        }

        let repo_path = repos_dir.join(repo_dir_name(&eval_setup.url));

        Ok(Eval {
            repo_path,
            eval_setup,
            user_prompt,
        })
    }
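The load method above reads two files per eval: prompt.txt for the user prompt and setup.json for the repo metadata. A minimal sketch of what that implies, with a made-up URL and SHA; the test copies the nested helper so it can run standalone:

#[cfg(test)]
mod repo_dir_name_sketch {
    // Copy of the helper nested in `Eval::load`, for illustration only.
    fn repo_dir_name(url: &str) -> String {
        url.trim_start_matches("https://")
            .replace(|c: char| !c.is_alphanumeric(), "_")
    }

    // setup.json deserializes into `EvalSetup`, e.g. (values made up):
    //   { "url": "https://github.com/octocat/hello-world", "base_sha": "abc123" }
    #[test]
    fn flattens_repo_url_into_directory_name() {
        assert_eq!(
            repo_dir_name("https://github.com/octocat/hello-world"),
            "github_com_octocat_hello_world"
        );
    }
}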
    pub fn run(
        self,
        app_state: Arc<HeadlessAppState>,
        model: Arc<dyn LanguageModel>,
        cx: &mut App,
    ) -> Task<Result<EvalOutput>> {
        cx.spawn(async move |cx| {
            run_git(&self.repo_path, &["checkout", &self.eval_setup.base_sha]).await?;

            let (assistant, done_rx) =
                cx.update(|cx| HeadlessAssistant::new(app_state.clone(), cx))??;

            let _worktree = assistant
                .update(cx, |assistant, cx| {
                    assistant.project.update(cx, |project, cx| {
                        project.create_worktree(&self.repo_path, true, cx)
                    })
                })?
                .await?;

            let start_time = std::time::SystemTime::now();

            let (system_prompt_context, load_error) = cx
                .update(|cx| {
                    assistant
                        .read(cx)
                        .thread
                        .read(cx)
                        .load_system_prompt_context(cx)
                })?
                .await;

            if let Some(load_error) = load_error {
                return Err(anyhow!("{:?}", load_error));
            };

            assistant.update(cx, |assistant, cx| {
                assistant.thread.update(cx, |thread, cx| {
                    let context = vec![];
                    thread.insert_user_message(self.user_prompt.clone(), context, None, cx);
                    thread.set_system_prompt_context(system_prompt_context);
                    thread.send_to_model(model, RequestKind::Chat, cx);
                });
            })?;

            done_rx.recv().await??;

            // Check for untracked files so new files show up in the diff
            println!("Checking for untracked files:");
            let untracked = run_git(
                &self.repo_path,
                &["ls-files", "--others", "--exclude-standard"],
            )
            .await?;
            if untracked.is_empty() {
                println!("No untracked files found");
            } else {
                // Add all files to git so they appear in the diff
                println!("Adding untracked files to git");
                run_git(&self.repo_path, &["add", "."]).await?;
            }

            // get git status
            let _status = run_git(&self.repo_path, &["status", "--short"]).await?;

            let elapsed_time = start_time.elapsed()?;

            // Get diff of staged changes (the files we just added)
            let staged_diff = run_git(&self.repo_path, &["diff", "--staged"]).await?;

            // Get diff of unstaged changes
            let unstaged_diff = run_git(&self.repo_path, &["diff"]).await?;

            // Combine both diffs
            let diff = if unstaged_diff.is_empty() {
                staged_diff
            } else if staged_diff.is_empty() {
                unstaged_diff
            } else {
                format!(
                    "# Staged changes\n{}\n\n# Unstaged changes\n{}",
                    staged_diff, unstaged_diff
                )
            };

            assistant.update(cx, |assistant, cx| {
                let thread = assistant.thread.read(cx);
                let last_message = thread.messages().last().unwrap();
                if last_message.role != language_model::Role::Assistant {
                    return Err(anyhow!("Last message is not from assistant"));
                }
                let assistant_response_count = thread
                    .messages()
                    .filter(|message| message.role == language_model::Role::Assistant)
                    .count();
                Ok(EvalOutput {
                    diff,
                    last_message: last_message.to_string(),
                    elapsed_time,
                    assistant_response_count,
                    tool_use_counts: assistant.tool_use_counts.clone(),
                    token_usage: thread.cumulative_token_usage(),
                })
            })?
        })
    }
}
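When the checkout has both staged and unstaged changes, the diff string assembled in run combines them under two headers; a comment-only sketch (the hunks are placeholders, not from the commit):

// # Staged changes
// diff --git a/src/lib.rs b/src/lib.rs
// ...
//
// # Unstaged changes
// diff --git a/README.md b/README.md
// ...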
impl EvalOutput {
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub fn save_to_directory(&self, output_dir: &Path, eval_output_value: String) -> Result<()> {
        // Create the output directory if it doesn't exist
        fs::create_dir_all(&output_dir)?;

        // Save the diff to a file
        let diff_path = output_dir.join("diff.patch");
        let mut diff_file = fs::File::create(&diff_path)?;
        diff_file.write_all(self.diff.as_bytes())?;

        // Save the last message to a file
        let message_path = output_dir.join("assistant_response.txt");
        let mut message_file = fs::File::create(&message_path)?;
        message_file.write_all(self.last_message.as_bytes())?;

        // Current metrics for this run
        let current_metrics = serde_json::json!({
            "elapsed_time_ms": self.elapsed_time.as_millis(),
            "assistant_response_count": self.assistant_response_count,
            "tool_use_counts": self.tool_use_counts,
            "token_usage": self.token_usage,
            "eval_output_value": eval_output_value,
        });

        // Get current timestamp in milliseconds
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)?
            .as_millis()
            .to_string();

        // Path to metrics file
        let metrics_path = output_dir.join("metrics.json");

        // Load existing metrics if the file exists, or create a new object
        let mut historical_metrics = if metrics_path.exists() {
            let metrics_content = fs::read_to_string(&metrics_path)?;
            serde_json::from_str::<serde_json::Value>(&metrics_content)
                .unwrap_or_else(|_| serde_json::json!({}))
        } else {
            serde_json::json!({})
        };

        // Add new run with timestamp as key
        if let serde_json::Value::Object(ref mut map) = historical_metrics {
            map.insert(timestamp, current_metrics);
        }

        // Write updated metrics back to file
        let metrics_json = serde_json::to_string_pretty(&historical_metrics)?;
        let mut metrics_file = fs::File::create(&metrics_path)?;
        metrics_file.write_all(metrics_json.as_bytes())?;

        Ok(())
    }
}
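For reference, save_to_directory accumulates one entry per run in metrics.json, keyed by the millisecond timestamp. A comment sketch of the resulting shape; all values are illustrative, and the token_usage object follows TokenUsage's serialization, so it may carry more fields than shown here:

// metrics.json after one run (values made up):
// {
//   "1743012345678": {
//     "elapsed_time_ms": 42000,
//     "assistant_response_count": 3,
//     "tool_use_counts": { "read_file": 2 },
//     "token_usage": { "input_tokens": 1200, "output_tokens": 800 },
//     "eval_output_value": "..."
//   }
// }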
pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
    let instructions_path = exercise_path.join(".docs").join("instructions.md");
    println!("Reading instructions from: {}", instructions_path.display());
    let instructions = smol::unblock(move || std::fs::read_to_string(&instructions_path)).await?;
    Ok(instructions)
}

pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
    let eval_dir = exercise_path.join("evaluation");
    fs::create_dir_all(&eval_dir)?;

    let eval_file = eval_dir.join("evals.json");

    println!("Saving evaluation results to: {}", eval_file.display());
    println!(
        "Results to save: {} evaluations for exercise path: {}",
        results.len(),
        exercise_path.display()
    );

    // Check file existence before reading/writing
    if eval_file.exists() {
        println!("Existing evals.json file found, will update it");
    } else {
        println!("No existing evals.json file found, will create new one");
    }

    // Structure to organize evaluations by test name and timestamp
    let mut eval_data: serde_json::Value = if eval_file.exists() {
        let content = fs::read_to_string(&eval_file)?;
        serde_json::from_str(&content).unwrap_or_else(|_| serde_json::json!({}))
    } else {
        serde_json::json!({})
    };

    // Get current timestamp for this batch of results
    let timestamp = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)?
        .as_millis()
        .to_string();

    // Group the new results by test name (exercise name)
    for result in results {
        let exercise_name = &result.exercise_name;

        println!("Adding result: exercise={}", exercise_name);

        // Ensure the exercise entry exists
        if eval_data.get(exercise_name).is_none() {
            eval_data[exercise_name] = serde_json::json!({});
        }

        // Ensure the timestamp entry exists as an object
        if eval_data[exercise_name].get(&timestamp).is_none() {
            eval_data[exercise_name][&timestamp] = serde_json::json!({});
        }

        // Store this result under the timestamp key
        eval_data[exercise_name][&timestamp] = serde_json::to_value(&result)?;
    }

    // Write back to file with pretty formatting
    let json_content = serde_json::to_string_pretty(&eval_data)?;
    match fs::write(&eval_file, json_content) {
        Ok(_) => println!("✓ Successfully saved results to {}", eval_file.display()),
        Err(e) => println!("✗ Failed to write results file: {}", e),
    }

    Ok(())
}
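save_eval_results nests results by exercise name and then by batch timestamp, with the serialized EvalResult stored directly under the timestamp key. A comment sketch with made-up values; the field names come from the EvalResult struct above:

// evals.json shape (values made up):
// {
//   "anagram": {
//     "1743012345678": {
//       "exercise_name": "anagram",
//       "diff": "...",
//       "assistant_response": "...",
//       "elapsed_time_ms": 42000,
//       "timestamp": 1743012345678,
//       "input_tokens": 1200,
//       "output_tokens": 800,
//       "total_tokens": 2000,
//       "tool_use_counts": 2
//     }
//   }
// }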
pub async fn run_exercise_eval(
    exercise_path: PathBuf,
    model: Arc<dyn LanguageModel>,
    app_state: Arc<HeadlessAppState>,
    base_sha: String,
    _framework_path: PathBuf,
    cx: gpui::AsyncApp,
) -> Result<EvalResult> {
    let exercise_name = get_exercise_name(&exercise_path);
    let language = get_exercise_language(&exercise_path)?;
    let mut instructions = read_instructions(&exercise_path).await?;
    instructions.push_str(&format!(
        "\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
        language
    ));

    println!("Running evaluation for exercise: {}", exercise_name);

    // Create temporary directory with exercise files
    let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
    let temp_path = temp_dir.path().to_path_buf();

    let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;

    let start_time = SystemTime::now();

    // Create a basic eval struct to work with the existing system
    let eval = Eval {
        repo_path: temp_path.clone(),
        eval_setup: EvalSetup {
            url: format!("file://{}", temp_path.display()),
            base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
        },
        user_prompt: instructions.clone(),
    };

    // Run the evaluation
    let eval_output = cx
        .update(|cx| eval.run(app_state.clone(), model.clone(), cx))?
        .await?;

    // Get diff from git
    let diff = eval_output.diff.clone();

    let elapsed_time = start_time.elapsed()?;

    // Calculate total tokens as the sum of input and output tokens
    let input_tokens = eval_output.token_usage.input_tokens;
    let output_tokens = eval_output.token_usage.output_tokens;
    let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
    let total_tokens = input_tokens + output_tokens;

    // Save results to evaluation directory
    let result = EvalResult {
        exercise_name: exercise_name.clone(),
        diff,
        assistant_response: eval_output.last_message.clone(),
        elapsed_time_ms: elapsed_time.as_millis(),
        timestamp: SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis(),
        // Convert u32 token counts to usize
        input_tokens: input_tokens.try_into().unwrap(),
        output_tokens: output_tokens.try_into().unwrap(),
        total_tokens: total_tokens.try_into().unwrap(),
        tool_use_counts: tool_use_counts.try_into().unwrap(),
    };

    Ok(result)
}
crates/agent_eval/src/get_exercise.rs  +149  (new file)
@@ -0,0 +1,149 @@
use anyhow::{Result, anyhow};
use std::{
    fs,
    path::{Path, PathBuf},
};

pub fn get_exercise_name(exercise_path: &Path) -> String {
    exercise_path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string()
}

pub fn get_exercise_language(exercise_path: &Path) -> Result<String> {
    // Extract the language from path (data/python/exercises/... => python)
    let parts: Vec<_> = exercise_path.components().collect();

    for (i, part) in parts.iter().enumerate() {
        if i > 0 && part.as_os_str() == "eval_code" {
            if i + 1 < parts.len() {
                let language = parts[i + 1].as_os_str().to_string_lossy().to_string();
                return Ok(language);
            }
        }
    }

    Err(anyhow!(
        "Could not determine language from path: {:?}",
        exercise_path
    ))
}
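A small sketch of the extraction: the path component immediately after eval_code is taken as the language, and the directory's own name is the exercise name. The path below is hypothetical:

#[cfg(test)]
mod exercise_path_sketch {
    use super::{get_exercise_language, get_exercise_name};
    use std::path::Path;

    #[test]
    fn extracts_language_and_name() {
        // Hypothetical exercise path inside a zed-ace-framework checkout.
        let path = Path::new("../zed-ace-framework/eval_code/python/exercises/practice/anagram");
        assert_eq!(get_exercise_language(path).unwrap(), "python");
        assert_eq!(get_exercise_name(path), "anagram");
    }
}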
pub fn find_exercises(
    framework_path: &Path,
    languages: &[&str],
    max_per_language: Option<usize>,
) -> Result<Vec<PathBuf>> {
    let mut all_exercises = Vec::new();

    println!("Searching for exercises in languages: {:?}", languages);

    for language in languages {
        let language_dir = framework_path
            .join("eval_code")
            .join(language)
            .join("exercises")
            .join("practice");

        println!("Checking language directory: {:?}", language_dir);
        if !language_dir.exists() {
            println!("Warning: Language directory not found: {:?}", language_dir);
            continue;
        }

        let mut exercises = Vec::new();
        match fs::read_dir(&language_dir) {
            Ok(entries) => {
                for entry_result in entries {
                    match entry_result {
                        Ok(entry) => {
                            let path = entry.path();

                            if path.is_dir() {
                                // Special handling for "internal" directory
                                if *language == "internal" {
                                    // Check for repo_info.json to validate it's an internal exercise
                                    let repo_info_path = path.join(".meta").join("repo_info.json");
                                    let instructions_path =
                                        path.join(".docs").join("instructions.md");

                                    if repo_info_path.exists() && instructions_path.exists() {
                                        exercises.push(path);
                                    }
                                } else {
                                    // Map the language to the file extension - original code
                                    let language_extension = match *language {
                                        "python" => "py",
                                        "go" => "go",
                                        "rust" => "rs",
                                        "typescript" => "ts",
                                        "javascript" => "js",
                                        "ruby" => "rb",
                                        "php" => "php",
                                        "bash" => "sh",
                                        "multi" => "diff",
                                        _ => continue, // Skip unsupported languages
                                    };

                                    // Check if this is a valid exercise with instructions and example
                                    let instructions_path =
                                        path.join(".docs").join("instructions.md");
                                    let has_instructions = instructions_path.exists();
                                    let example_path = path
                                        .join(".meta")
                                        .join(format!("example.{}", language_extension));
                                    let has_example = example_path.exists();

                                    if has_instructions && has_example {
                                        exercises.push(path);
                                    }
                                }
                            }
                        }
                        Err(err) => println!("Error reading directory entry: {}", err),
                    }
                }
            }
            Err(err) => println!(
                "Error reading directory {}: {}",
                language_dir.display(),
                err
            ),
        }

        // Sort exercises by name for consistent selection
        exercises.sort_by(|a, b| {
            let a_name = a.file_name().unwrap_or_default().to_string_lossy();
            let b_name = b.file_name().unwrap_or_default().to_string_lossy();
            a_name.cmp(&b_name)
        });

        // Apply the limit if specified
        if let Some(limit) = max_per_language {
            if exercises.len() > limit {
                println!(
                    "Limiting {} exercises to {} for language {}",
                    exercises.len(),
                    limit,
                    language
                );
                exercises.truncate(limit);
            }
        }

        println!(
            "Found {} exercises for language {}: {:?}",
            exercises.len(),
            language,
            exercises
                .iter()
                .map(|p| p.file_name().unwrap_or_default().to_string_lossy())
                .collect::<Vec<_>>()
        );
        all_exercises.extend(exercises);
    }

    Ok(all_exercises)
}
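For orientation, a comment sketch of the directory layout find_exercises scans; the exercise names are illustrative, while the required files follow from the checks above:

// zed-ace-framework/
//   eval_code/
//     python/exercises/practice/
//       anagram/
//         .docs/instructions.md   <- required
//         .meta/example.py        <- required for non-"internal" languages
//     internal/exercises/practice/
//       some-task/
//         .docs/instructions.md   <- required
//         .meta/repo_info.json    <- required for "internal"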
crates/agent_eval/src/git_commands.rs  +125  (new file)
@@ -0,0 +1,125 @@
use anyhow::{Result, anyhow};
use serde::Deserialize;
use std::{fs, path::Path};
use tempfile::TempDir;
use util::command::new_smol_command;
use walkdir::WalkDir;

#[derive(Debug, Deserialize)]
pub struct SetupConfig {
    #[serde(rename = "base.sha")]
    pub base_sha: String,
}

#[derive(Debug, Deserialize)]
pub struct RepoInfo {
    pub remote_url: String,
    pub head_sha: String,
}

pub async fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
    let output = new_smol_command("git")
        .current_dir(repo_path)
        .args(args)
        .output()
        .await?;

    if output.status.success() {
        Ok(String::from_utf8(output.stdout)?.trim().to_string())
    } else {
        Err(anyhow!(
            "Git command failed: {} with status: {}",
            args.join(" "),
            output.status
        ))
    }
}

pub async fn read_base_sha(framework_path: &Path) -> Result<String> {
    let setup_path = framework_path.join("setup.json");
    let setup_content = smol::unblock(move || std::fs::read_to_string(&setup_path)).await?;
    let setup_config: SetupConfig = serde_json_lenient::from_str_lenient(&setup_content)?;
    Ok(setup_config.base_sha)
}
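Note the serde rename above: the framework-level setup.json uses a dotted key. A sketch of the file read_base_sha parses; the SHA value is made up:

// Illustrative setup.json for `SetupConfig` (SHA made up):
const EXAMPLE_SETUP_JSON: &str =
    r#"{ "base.sha": "0123456789abcdef0123456789abcdef01234567" }"#;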
pub async fn read_repo_info(exercise_path: &Path) -> Result<RepoInfo> {
    let repo_info_path = exercise_path.join(".meta").join("repo_info.json");
    println!("Reading repo info from: {}", repo_info_path.display());
    let repo_info_content = smol::unblock(move || std::fs::read_to_string(&repo_info_path)).await?;
    let repo_info: RepoInfo = serde_json_lenient::from_str_lenient(&repo_info_content)?;

    // Remove any quotes from the strings
    let remote_url = repo_info.remote_url.trim_matches('"').to_string();
    let head_sha = repo_info.head_sha.trim_matches('"').to_string();

    Ok(RepoInfo {
        remote_url,
        head_sha,
    })
}

pub async fn setup_temp_repo(exercise_path: &Path, _base_sha: &str) -> Result<TempDir> {
    let temp_dir = TempDir::new()?;

    // Check if this is an internal exercise by looking for repo_info.json
    let repo_info_path = exercise_path.join(".meta").join("repo_info.json");
    if repo_info_path.exists() {
        // This is an internal exercise, handle it differently
        let repo_info = read_repo_info(exercise_path).await?;

        // Clone the repository to the temp directory
        let url = repo_info.remote_url;
        let clone_path = temp_dir.path();
        println!(
            "Cloning repository from {} to {}",
            url,
            clone_path.display()
        );
        run_git(
            &std::env::current_dir()?,
            &["clone", &url, &clone_path.to_string_lossy()],
        )
        .await?;

        // Checkout the specified commit
        println!("Checking out commit: {}", repo_info.head_sha);
        run_git(temp_dir.path(), &["checkout", &repo_info.head_sha]).await?;

        println!("Successfully set up internal repository");
    } else {
        // Original code for regular exercises
        // Copy the exercise files to the temp directory, excluding .docs and .meta
        for entry in WalkDir::new(exercise_path).min_depth(0).max_depth(10) {
            let entry = entry?;
            let source_path = entry.path();

            // Skip .docs and .meta directories completely
            if source_path.starts_with(exercise_path.join(".docs"))
                || source_path.starts_with(exercise_path.join(".meta"))
            {
                continue;
            }

            if source_path.is_file() {
                let relative_path = source_path.strip_prefix(exercise_path)?;
                let dest_path = temp_dir.path().join(relative_path);

                // Make sure parent directories exist
                if let Some(parent) = dest_path.parent() {
                    fs::create_dir_all(parent)?;
                }

                fs::copy(source_path, dest_path)?;
            }
        }

        // Initialize git repo in the temp directory
        run_git(temp_dir.path(), &["init"]).await?;
        run_git(temp_dir.path(), &["add", "."]).await?;
        run_git(temp_dir.path(), &["commit", "-m", "Initial commit"]).await?;

        println!("Created temp repo without .docs and .meta directories");
    }

    Ok(temp_dir)
}
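An "internal" exercise carries its source repository in .meta/repo_info.json, which read_repo_info deserializes into RepoInfo above. A sketch with made-up values:

// Illustrative .meta/repo_info.json (URL and SHA made up):
const EXAMPLE_REPO_INFO_JSON: &str = r#"{
    "remote_url": "https://github.com/octocat/hello-world",
    "head_sha": "0123456789abcdef0123456789abcdef01234567"
}"#;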
crates/agent_eval/src/headless_assistant.rs  +246  (new file)
@@ -0,0 +1,246 @@
use agent::{RequestKind, Thread, ThreadEvent, ThreadStore};
use anyhow::anyhow;
use assistant_tool::ToolWorkingSet;
use client::{Client, UserStore};
use collections::HashMap;
use dap::DapRegistry;
use gpui::{App, Entity, SemanticVersion, Subscription, Task, prelude::*};
use language::LanguageRegistry;
use language_model::{
    AuthenticateError, LanguageModel, LanguageModelProviderId, LanguageModelRegistry,
};
use node_runtime::NodeRuntime;
use project::{Project, RealFs};
use prompt_store::PromptBuilder;
use settings::SettingsStore;
use smol::channel;
use std::sync::Arc;

/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
pub struct HeadlessAppState {
    pub languages: Arc<LanguageRegistry>,
    pub client: Arc<Client>,
    pub user_store: Entity<UserStore>,
    pub fs: Arc<dyn fs::Fs>,
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    pub prompt_builder: Arc<PromptBuilder>,
}

pub struct HeadlessAssistant {
    pub thread: Entity<Thread>,
    pub project: Entity<Project>,
    #[allow(dead_code)]
    pub thread_store: Entity<ThreadStore>,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub done_tx: channel::Sender<anyhow::Result<()>>,
    _subscription: Subscription,
}

impl HeadlessAssistant {
    pub fn new(
        app_state: Arc<HeadlessAppState>,
        cx: &mut App,
    ) -> anyhow::Result<(Entity<Self>, channel::Receiver<anyhow::Result<()>>)> {
        let env = None;
        let project = Project::local(
            app_state.client.clone(),
            app_state.node_runtime.clone(),
            app_state.user_store.clone(),
            app_state.languages.clone(),
            Arc::new(DapRegistry::default()),
            app_state.fs.clone(),
            env,
            cx,
        );

        let tools = Arc::new(ToolWorkingSet::default());
        let thread_store =
            ThreadStore::new(project.clone(), tools, app_state.prompt_builder.clone(), cx)?;

        let thread = thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx));

        let (done_tx, done_rx) = channel::unbounded::<anyhow::Result<()>>();

        let headless_thread = cx.new(move |cx| Self {
            _subscription: cx.subscribe(&thread, Self::handle_thread_event),
            thread,
            project,
            thread_store,
            tool_use_counts: HashMap::default(),
            done_tx,
        });

        Ok((headless_thread, done_rx))
    }

    fn handle_thread_event(
        &mut self,
        thread: Entity<Thread>,
        event: &ThreadEvent,
        cx: &mut Context<Self>,
    ) {
        match event {
            ThreadEvent::ShowError(err) => self
                .done_tx
                .send_blocking(Err(anyhow!("{:?}", err)))
                .unwrap(),
            ThreadEvent::DoneStreaming => {
                let thread = thread.read(cx);
                if let Some(message) = thread.messages().last() {
                    println!("Message: {}", message.to_string());
                }
                if thread.all_tools_finished() {
                    self.done_tx.send_blocking(Ok(())).unwrap()
                }
            }
            ThreadEvent::UsePendingTools => {
                thread.update(cx, |thread, cx| {
                    thread.use_pending_tools(cx);
                });
            }
            ThreadEvent::ToolConfirmationNeeded => {
                // Automatically approve all tools that need confirmation in headless mode
                println!("Tool confirmation needed - automatically approving in headless mode");

                // Get the tools needing confirmation
                let tools_needing_confirmation: Vec<_> = thread
                    .read(cx)
                    .tools_needing_confirmation()
                    .cloned()
                    .collect();

                // Run each tool that needs confirmation
                for tool_use in tools_needing_confirmation {
                    if let Some(tool) = thread.read(cx).tools().tool(&tool_use.name, cx) {
                        thread.update(cx, |thread, cx| {
                            println!("Auto-approving tool: {}", tool_use.name);

                            // Create a request to send to the tool
                            let request = thread.to_completion_request(RequestKind::Chat, cx);
                            let messages = Arc::new(request.messages);

                            // Run the tool
                            thread.run_tool(
                                tool_use.id.clone(),
                                tool_use.ui_text.clone(),
                                tool_use.input.clone(),
                                &messages,
                                tool,
                                cx,
                            );
                        });
                    }
                }
            }
            ThreadEvent::ToolFinished {
                tool_use_id,
                pending_tool_use,
                ..
            } => {
                if let Some(pending_tool_use) = pending_tool_use {
                    println!(
                        "Used tool {} with input: {}",
                        pending_tool_use.name, pending_tool_use.input
                    );
                    *self
                        .tool_use_counts
                        .entry(pending_tool_use.name.clone())
                        .or_insert(0) += 1;
                }
                if let Some(tool_result) = thread.read(cx).tool_result(tool_use_id) {
                    println!("Tool result: {:?}", tool_result);
                }
                if thread.read(cx).all_tools_finished() {
                    let model_registry = LanguageModelRegistry::read_global(cx);
                    if let Some(model) = model_registry.default_model() {
                        thread.update(cx, |thread, cx| {
                            thread.attach_tool_results(cx);
                            thread.send_to_model(model.model, RequestKind::Chat, cx);
                        });
                    } else {
                        println!(
                            "Warning: No active language model available to continue conversation"
                        );
                    }
                }
            }
            _ => {}
        }
    }
}

pub fn init(cx: &mut App) -> Arc<HeadlessAppState> {
    release_channel::init(SemanticVersion::default(), cx);
    gpui_tokio::init(cx);

    let mut settings_store = SettingsStore::new(cx);
    settings_store
        .set_default_settings(settings::default_settings().as_ref(), cx)
        .unwrap();
    cx.set_global(settings_store);
    client::init_settings(cx);
    Project::init_settings(cx);

    let client = Client::production(cx);
    cx.set_http_client(client.http_client().clone());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let languages = Arc::new(LanguageRegistry::new(cx.background_executor().clone()));

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    language::init(cx);
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
    assistant_tools::init(client.http_client().clone(), cx);
    context_server::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent::init(fs.clone(), client.clone(), prompt_builder.clone(), cx);

    Arc::new(HeadlessAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime: NodeRuntime::unavailable(),
        prompt_builder,
    })
}

pub fn find_model(model_name: &str, cx: &App) -> anyhow::Result<Arc<dyn LanguageModel>> {
    let model_registry = LanguageModelRegistry::read_global(cx);
    let model = model_registry
        .available_models(cx)
        .find(|model| model.id().0 == model_name);

    let Some(model) = model else {
        return Err(anyhow!(
            "No language model named {} was available. Available models: {}",
            model_name,
            model_registry
                .available_models(cx)
                .map(|model| model.id().0.clone())
                .collect::<Vec<_>>()
                .join(", ")
        ));
    };

    Ok(model)
}

pub fn authenticate_model_provider(
    provider_id: LanguageModelProviderId,
    cx: &mut App,
) -> Task<std::result::Result<(), AuthenticateError>> {
    let model_registry = LanguageModelRegistry::read_global(cx);
    let model_provider = model_registry.provider(&provider_id).unwrap();
    model_provider.authenticate(cx)
}
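A minimal sketch of how a caller drives this type, mirroring Eval::run in eval.rs; it assumes the caller already holds an app state and a model, and it omits the system-prompt setup that eval.rs performs:

// Illustrative driver, not part of the commit; assumes the same imports as
// eval.rs (agent::RequestKind, gpui::AsyncApp, std::sync::Arc).
async fn drive_once(
    app_state: Arc<HeadlessAppState>,
    model: Arc<dyn LanguageModel>,
    cx: &mut gpui::AsyncApp,
) -> anyhow::Result<()> {
    let (assistant, done_rx) = cx.update(|cx| HeadlessAssistant::new(app_state, cx))??;
    assistant.update(cx, |assistant, cx| {
        assistant.thread.update(cx, |thread, cx| {
            thread.insert_user_message("Hello".to_string(), vec![], None, cx);
            thread.send_to_model(model, RequestKind::Chat, cx);
        });
    })?;
    // `handle_thread_event` completes the channel after DoneStreaming once all
    // tools have finished, or immediately on ShowError.
    done_rx.recv().await??;
    Ok(())
}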
crates/agent_eval/src/main.rs  +205  (new file)
@@ -0,0 +1,205 @@
mod eval;
mod get_exercise;
mod git_commands;
mod headless_assistant;

use clap::Parser;
use eval::{run_exercise_eval, save_eval_results};
use futures::stream::{self, StreamExt};
use get_exercise::{find_exercises, get_exercise_language, get_exercise_name};
use git_commands::read_base_sha;
use gpui::Application;
use headless_assistant::{authenticate_model_provider, find_model};
use language_model::LanguageModelRegistry;
use reqwest_client::ReqwestClient;
use std::{path::PathBuf, sync::Arc};

#[derive(Parser, Debug)]
#[command(
    name = "agent_eval",
    disable_version_flag = true,
    before_help = "Tool eval runner"
)]
struct Args {
    /// Match the names of evals to run.
    #[arg(long)]
    exercise_names: Vec<String>,
    /// Runs all exercises (ignored when exercise_names is given).
    #[arg(long)]
    all: bool,
    /// Supported language types to evaluate (default: internal).
    /// Internal is data generated from the agent panel.
    #[arg(long, default_value = "internal")]
    languages: String,
    /// Name of the model (default: "claude-3-7-sonnet-latest").
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model_name: String,
    /// Name of the editor model (default: value of `--model_name`).
    #[arg(long)]
    editor_model_name: Option<String>,
    /// Number of evaluations to run concurrently (default: 5).
    #[arg(short, long, default_value = "5")]
    concurrency: usize,
    /// Maximum number of exercises to evaluate per language.
    #[arg(long)]
    max_exercises_per_language: Option<usize>,
}

fn main() {
    env_logger::init();
    let args = Args::parse();
    let http_client = Arc::new(ReqwestClient::new());
    let app = Application::headless().with_http_client(http_client.clone());

    // Path to the zed-ace-framework repo
    let framework_path = PathBuf::from("../zed-ace-framework")
        .canonicalize()
        .unwrap();

    // Fix the 'languages' lifetime issue by creating owned Strings instead of slices
    let languages: Vec<String> = args.languages.split(',').map(|s| s.to_string()).collect();

    println!("Using zed-ace-framework at: {:?}", framework_path);
    println!("Evaluating languages: {:?}", languages);

    app.run(move |cx| {
        let app_state = headless_assistant::init(cx);

        let model = find_model(&args.model_name, cx).unwrap();
        let editor_model = if let Some(model_name) = &args.editor_model_name {
            find_model(model_name, cx).unwrap()
        } else {
            model.clone()
        };

        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
            registry.set_default_model(Some(model.clone()), cx);
        });

        let model_provider_id = model.provider_id();
        let editor_model_provider_id = editor_model.provider_id();

        let framework_path_clone = framework_path.clone();
        let languages_clone = languages.clone();
        let exercise_names = args.exercise_names.clone();
        let all_flag = args.all;

        cx.spawn(async move |cx| {
            // Authenticate all model providers first
            cx.update(|cx| authenticate_model_provider(model_provider_id.clone(), cx))
                .unwrap()
                .await
                .unwrap();
            cx.update(|cx| authenticate_model_provider(editor_model_provider_id.clone(), cx))
                .unwrap()
                .await
                .unwrap();

            println!("framework path: {}", framework_path_clone.display());

            let base_sha = read_base_sha(&framework_path_clone).await.unwrap();

            println!("base sha: {}", base_sha);

            let all_exercises = find_exercises(
                &framework_path_clone,
                &languages_clone
                    .iter()
                    .map(|s| s.as_str())
                    .collect::<Vec<_>>(),
                args.max_exercises_per_language,
            )
            .unwrap();
            println!("Found {} exercises total", all_exercises.len());

            // Filter exercises if specific ones were requested
            let exercises_to_run = if !exercise_names.is_empty() {
                // If exercise names are specified, filter by them regardless of --all flag
                all_exercises
                    .into_iter()
                    .filter(|path| {
                        let name = get_exercise_name(path);
                        exercise_names.iter().any(|filter| name.contains(filter))
                    })
                    .collect()
            } else if all_flag {
                // Only use all_flag if no exercise names are specified
                all_exercises
            } else {
                // Default behavior (no filters)
                all_exercises
            };

            println!("Will run {} exercises", exercises_to_run.len());

            // Create exercise eval tasks - each exercise is a single task that will run templates sequentially
            let exercise_tasks: Vec<_> = exercises_to_run
                .into_iter()
                .map(|exercise_path| {
                    let exercise_name = get_exercise_name(&exercise_path);
                    let model_clone = model.clone();
                    let app_state_clone = app_state.clone();
                    let base_sha_clone = base_sha.clone();
                    let framework_path_clone = framework_path_clone.clone();
                    let cx_clone = cx.clone();

                    async move {
                        println!("Processing exercise: {}", exercise_name);
                        let mut exercise_results = Vec::new();

                        match run_exercise_eval(
                            exercise_path.clone(),
                            model_clone.clone(),
                            app_state_clone.clone(),
                            base_sha_clone.clone(),
                            framework_path_clone.clone(),
                            cx_clone.clone(),
                        )
                        .await
                        {
                            Ok(result) => {
                                println!("Completed {}", exercise_name);
                                exercise_results.push(result);
                            }
                            Err(err) => {
                                println!("Error running {}: {}", exercise_name, err);
                            }
                        }

                        // Save results for this exercise
                        if !exercise_results.is_empty() {
                            if let Err(err) =
                                save_eval_results(&exercise_path, exercise_results.clone()).await
                            {
                                println!("Error saving results for {}: {}", exercise_name, err);
                            } else {
                                println!("Saved results for {}", exercise_name);
                            }
                        }

                        exercise_results
                    }
                })
                .collect();

            println!(
                "Running {} exercises with concurrency: {}",
                exercise_tasks.len(),
                args.concurrency
            );

            // Run exercises concurrently, with each exercise running its templates sequentially
            let all_results = stream::iter(exercise_tasks)
                .buffer_unordered(args.concurrency)
                .flat_map(stream::iter)
                .collect::<Vec<_>>()
                .await;

            println!("Completed {} evaluation runs", all_results.len());
            cx.update(|cx| cx.quit()).unwrap();
        })
        .detach();
    });

    println!("Done running evals");
}
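Assuming a sibling checkout of zed-ace-framework (the hard-coded ../zed-ace-framework path above), an invocation might look like `cargo run -p agent_eval -- --languages python,rust --exercise-names anagram --concurrency 3`, using clap's default kebab-case flag names derived from the snake_case fields.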