Move assistant_evals to agent_evals and remove Judge logic (#28233)

Release Notes:
- N/A

parent 500d8f2943
commit f3274851d9

13 changed files with 73 additions and 638 deletions
crates/agent_eval/src/eval.rs  +384  (new file)
@@ -0,0 +1,384 @@
use crate::git_commands::{run_git, setup_temp_repo};
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
use crate::{get_exercise_language, get_exercise_name};
use agent::RequestKind;
use anyhow::{Result, anyhow};
use collections::HashMap;
use gpui::{App, Task};
use language_model::{LanguageModel, TokenUsage};
use serde::{Deserialize, Serialize};
use std::{
    fs,
    io::Write,
    path::{Path, PathBuf},
    sync::Arc,
    time::{Duration, SystemTime},
};

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EvalResult {
    pub exercise_name: String,
    pub diff: String,
    pub assistant_response: String,
    pub elapsed_time_ms: u128,
    pub timestamp: u128,
    // Token usage fields
    pub input_tokens: usize,
    pub output_tokens: usize,
    pub total_tokens: usize,
    pub tool_use_counts: usize,
}

pub struct EvalOutput {
    pub diff: String,
    pub last_message: String,
    pub elapsed_time: Duration,
    pub assistant_response_count: usize,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub token_usage: TokenUsage,
}

#[derive(Deserialize)]
pub struct EvalSetup {
    pub url: String,
    pub base_sha: String,
}

pub struct Eval {
    pub repo_path: PathBuf,
    pub eval_setup: EvalSetup,
    pub user_prompt: String,
}

impl Eval {
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub async fn load(_name: String, path: PathBuf, repos_dir: &Path) -> Result<Self> {
        let prompt_path = path.join("prompt.txt");
        let user_prompt = smol::unblock(|| std::fs::read_to_string(prompt_path)).await?;
        let setup_path = path.join("setup.json");
        let setup_contents = smol::unblock(|| std::fs::read_to_string(setup_path)).await?;
        let eval_setup = serde_json_lenient::from_str_lenient::<EvalSetup>(&setup_contents)?;

        // This internal function lives inside `load` since it's only used here.
        fn repo_dir_name(url: &str) -> String {
            url.trim_start_matches("https://")
                .replace(|c: char| !c.is_alphanumeric(), "_")
        }

        let repo_path = repos_dir.join(repo_dir_name(&eval_setup.url));

        Ok(Eval {
            repo_path,
            eval_setup,
            user_prompt,
        })
    }
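The load method above reads two files per eval: prompt.txt for the user prompt and setup.json for the repo metadata. A minimal sketch of what that implies, with a made-up URL and SHA; the test copies the nested helper so it can run standalone:

#[cfg(test)]
mod repo_dir_name_sketch {
    // Copy of the helper nested in `Eval::load`, for illustration only.
    fn repo_dir_name(url: &str) -> String {
        url.trim_start_matches("https://")
            .replace(|c: char| !c.is_alphanumeric(), "_")
    }

    // setup.json deserializes into `EvalSetup`, e.g. (values made up):
    //   { "url": "https://github.com/octocat/hello-world", "base_sha": "abc123" }
    #[test]
    fn flattens_repo_url_into_directory_name() {
        assert_eq!(
            repo_dir_name("https://github.com/octocat/hello-world"),
            "github_com_octocat_hello_world"
        );
    }
}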
    pub fn run(
        self,
        app_state: Arc<HeadlessAppState>,
        model: Arc<dyn LanguageModel>,
        cx: &mut App,
    ) -> Task<Result<EvalOutput>> {
        cx.spawn(async move |cx| {
            run_git(&self.repo_path, &["checkout", &self.eval_setup.base_sha]).await?;

            let (assistant, done_rx) =
                cx.update(|cx| HeadlessAssistant::new(app_state.clone(), cx))??;

            let _worktree = assistant
                .update(cx, |assistant, cx| {
                    assistant.project.update(cx, |project, cx| {
                        project.create_worktree(&self.repo_path, true, cx)
                    })
                })?
                .await?;

            let start_time = std::time::SystemTime::now();

            let (system_prompt_context, load_error) = cx
                .update(|cx| {
                    assistant
                        .read(cx)
                        .thread
                        .read(cx)
                        .load_system_prompt_context(cx)
                })?
                .await;

            if let Some(load_error) = load_error {
                return Err(anyhow!("{:?}", load_error));
            };

            assistant.update(cx, |assistant, cx| {
                assistant.thread.update(cx, |thread, cx| {
                    let context = vec![];
                    thread.insert_user_message(self.user_prompt.clone(), context, None, cx);
                    thread.set_system_prompt_context(system_prompt_context);
                    thread.send_to_model(model, RequestKind::Chat, cx);
                });
            })?;

            done_rx.recv().await??;

            // Check for untracked files so new files show up in the diff
            println!("Checking for untracked files:");
            let untracked = run_git(
                &self.repo_path,
                &["ls-files", "--others", "--exclude-standard"],
            )
            .await?;
            if untracked.is_empty() {
                println!("No untracked files found");
            } else {
                // Add all files to git so they appear in the diff
                println!("Adding untracked files to git");
                run_git(&self.repo_path, &["add", "."]).await?;
            }

            // get git status
            let _status = run_git(&self.repo_path, &["status", "--short"]).await?;

            let elapsed_time = start_time.elapsed()?;

            // Get diff of staged changes (the files we just added)
            let staged_diff = run_git(&self.repo_path, &["diff", "--staged"]).await?;

            // Get diff of unstaged changes
            let unstaged_diff = run_git(&self.repo_path, &["diff"]).await?;

            // Combine both diffs
            let diff = if unstaged_diff.is_empty() {
                staged_diff
            } else if staged_diff.is_empty() {
                unstaged_diff
            } else {
                format!(
                    "# Staged changes\n{}\n\n# Unstaged changes\n{}",
                    staged_diff, unstaged_diff
                )
            };

            assistant.update(cx, |assistant, cx| {
                let thread = assistant.thread.read(cx);
                let last_message = thread.messages().last().unwrap();
                if last_message.role != language_model::Role::Assistant {
                    return Err(anyhow!("Last message is not from assistant"));
                }
                let assistant_response_count = thread
                    .messages()
                    .filter(|message| message.role == language_model::Role::Assistant)
                    .count();
                Ok(EvalOutput {
                    diff,
                    last_message: last_message.to_string(),
                    elapsed_time,
                    assistant_response_count,
                    tool_use_counts: assistant.tool_use_counts.clone(),
                    token_usage: thread.cumulative_token_usage(),
                })
            })?
        })
    }
}
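When the checkout has both staged and unstaged changes, the diff string assembled in run combines them under two headers; a comment-only sketch (the hunks are placeholders, not from the commit):

// # Staged changes
// diff --git a/src/lib.rs b/src/lib.rs
// ...
//
// # Unstaged changes
// diff --git a/README.md b/README.md
// ...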
impl EvalOutput {
    // Keep this method for potential future use, but mark it as intentionally unused
    #[allow(dead_code)]
    pub fn save_to_directory(&self, output_dir: &Path, eval_output_value: String) -> Result<()> {
        // Create the output directory if it doesn't exist
        fs::create_dir_all(&output_dir)?;

        // Save the diff to a file
        let diff_path = output_dir.join("diff.patch");
        let mut diff_file = fs::File::create(&diff_path)?;
        diff_file.write_all(self.diff.as_bytes())?;

        // Save the last message to a file
        let message_path = output_dir.join("assistant_response.txt");
        let mut message_file = fs::File::create(&message_path)?;
        message_file.write_all(self.last_message.as_bytes())?;

        // Current metrics for this run
        let current_metrics = serde_json::json!({
            "elapsed_time_ms": self.elapsed_time.as_millis(),
            "assistant_response_count": self.assistant_response_count,
            "tool_use_counts": self.tool_use_counts,
            "token_usage": self.token_usage,
            "eval_output_value": eval_output_value,
        });

        // Get current timestamp in milliseconds
        let timestamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)?
            .as_millis()
            .to_string();

        // Path to metrics file
        let metrics_path = output_dir.join("metrics.json");

        // Load existing metrics if the file exists, or create a new object
        let mut historical_metrics = if metrics_path.exists() {
            let metrics_content = fs::read_to_string(&metrics_path)?;
            serde_json::from_str::<serde_json::Value>(&metrics_content)
                .unwrap_or_else(|_| serde_json::json!({}))
        } else {
            serde_json::json!({})
        };

        // Add new run with timestamp as key
        if let serde_json::Value::Object(ref mut map) = historical_metrics {
            map.insert(timestamp, current_metrics);
        }

        // Write updated metrics back to file
        let metrics_json = serde_json::to_string_pretty(&historical_metrics)?;
        let mut metrics_file = fs::File::create(&metrics_path)?;
        metrics_file.write_all(metrics_json.as_bytes())?;

        Ok(())
    }
}
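For reference, save_to_directory accumulates one entry per run in metrics.json, keyed by the millisecond timestamp. A comment sketch of the resulting shape; all values are illustrative, and the token_usage object follows TokenUsage's serialization, so it may carry more fields than shown here:

// metrics.json after one run (values made up):
// {
//   "1743012345678": {
//     "elapsed_time_ms": 42000,
//     "assistant_response_count": 3,
//     "tool_use_counts": { "read_file": 2 },
//     "token_usage": { "input_tokens": 1200, "output_tokens": 800 },
//     "eval_output_value": "..."
//   }
// }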
pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
    let instructions_path = exercise_path.join(".docs").join("instructions.md");
    println!("Reading instructions from: {}", instructions_path.display());
    let instructions = smol::unblock(move || std::fs::read_to_string(&instructions_path)).await?;
    Ok(instructions)
}

pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
    let eval_dir = exercise_path.join("evaluation");
    fs::create_dir_all(&eval_dir)?;

    let eval_file = eval_dir.join("evals.json");

    println!("Saving evaluation results to: {}", eval_file.display());
    println!(
        "Results to save: {} evaluations for exercise path: {}",
        results.len(),
        exercise_path.display()
    );

    // Check file existence before reading/writing
    if eval_file.exists() {
        println!("Existing evals.json file found, will update it");
    } else {
        println!("No existing evals.json file found, will create new one");
    }

    // Structure to organize evaluations by test name and timestamp
    let mut eval_data: serde_json::Value = if eval_file.exists() {
        let content = fs::read_to_string(&eval_file)?;
        serde_json::from_str(&content).unwrap_or_else(|_| serde_json::json!({}))
    } else {
        serde_json::json!({})
    };

    // Get current timestamp for this batch of results
    let timestamp = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)?
        .as_millis()
        .to_string();

    // Group the new results by test name (exercise name)
    for result in results {
        let exercise_name = &result.exercise_name;

        println!("Adding result: exercise={}", exercise_name);

        // Ensure the exercise entry exists
        if eval_data.get(exercise_name).is_none() {
            eval_data[exercise_name] = serde_json::json!({});
        }

        // Ensure the timestamp entry exists as an object
        if eval_data[exercise_name].get(&timestamp).is_none() {
            eval_data[exercise_name][&timestamp] = serde_json::json!({});
        }

        // Store this result under the timestamp key
        eval_data[exercise_name][&timestamp] = serde_json::to_value(&result)?;
    }

    // Write back to file with pretty formatting
    let json_content = serde_json::to_string_pretty(&eval_data)?;
    match fs::write(&eval_file, json_content) {
        Ok(_) => println!("✓ Successfully saved results to {}", eval_file.display()),
        Err(e) => println!("✗ Failed to write results file: {}", e),
    }

    Ok(())
}
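save_eval_results nests results by exercise name and then by batch timestamp, with the serialized EvalResult stored directly under the timestamp key. A comment sketch with made-up values; the field names come from the EvalResult struct above:

// evals.json shape (values made up):
// {
//   "anagram": {
//     "1743012345678": {
//       "exercise_name": "anagram",
//       "diff": "...",
//       "assistant_response": "...",
//       "elapsed_time_ms": 42000,
//       "timestamp": 1743012345678,
//       "input_tokens": 1200,
//       "output_tokens": 800,
//       "total_tokens": 2000,
//       "tool_use_counts": 2
//     }
//   }
// }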
pub async fn run_exercise_eval(
    exercise_path: PathBuf,
    model: Arc<dyn LanguageModel>,
    app_state: Arc<HeadlessAppState>,
    base_sha: String,
    _framework_path: PathBuf,
    cx: gpui::AsyncApp,
) -> Result<EvalResult> {
    let exercise_name = get_exercise_name(&exercise_path);
    let language = get_exercise_language(&exercise_path)?;
    let mut instructions = read_instructions(&exercise_path).await?;
    instructions.push_str(&format!(
        "\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
        language
    ));

    println!("Running evaluation for exercise: {}", exercise_name);

    // Create temporary directory with exercise files
    let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
    let temp_path = temp_dir.path().to_path_buf();

    let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;

    let start_time = SystemTime::now();

    // Create a basic eval struct to work with the existing system
    let eval = Eval {
        repo_path: temp_path.clone(),
        eval_setup: EvalSetup {
            url: format!("file://{}", temp_path.display()),
            base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
        },
        user_prompt: instructions.clone(),
    };

    // Run the evaluation
    let eval_output = cx
        .update(|cx| eval.run(app_state.clone(), model.clone(), cx))?
        .await?;

    // Get diff from git
    let diff = eval_output.diff.clone();

    let elapsed_time = start_time.elapsed()?;

    // Calculate total tokens as the sum of input and output tokens
    let input_tokens = eval_output.token_usage.input_tokens;
    let output_tokens = eval_output.token_usage.output_tokens;
    let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
    let total_tokens = input_tokens + output_tokens;

    // Save results to evaluation directory
    let result = EvalResult {
        exercise_name: exercise_name.clone(),
        diff,
        assistant_response: eval_output.last_message.clone(),
        elapsed_time_ms: elapsed_time.as_millis(),
        timestamp: SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)?
            .as_millis(),
        // Convert u32 token counts to usize
        input_tokens: input_tokens.try_into().unwrap(),
        output_tokens: output_tokens.try_into().unwrap(),
        total_tokens: total_tokens.try_into().unwrap(),
        tool_use_counts: tool_use_counts.try_into().unwrap(),
    };

    Ok(result)
}
crates/agent_eval/src/get_exercise.rs  +149  (new file)
@@ -0,0 +1,149 @@
use anyhow::{Result, anyhow};
use std::{
    fs,
    path::{Path, PathBuf},
};

pub fn get_exercise_name(exercise_path: &Path) -> String {
    exercise_path
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string()
}

pub fn get_exercise_language(exercise_path: &Path) -> Result<String> {
    // Extract the language from path (data/python/exercises/... => python)
    let parts: Vec<_> = exercise_path.components().collect();

    for (i, part) in parts.iter().enumerate() {
        if i > 0 && part.as_os_str() == "eval_code" {
            if i + 1 < parts.len() {
                let language = parts[i + 1].as_os_str().to_string_lossy().to_string();
                return Ok(language);
            }
        }
    }

    Err(anyhow!(
        "Could not determine language from path: {:?}",
        exercise_path
    ))
}
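A small sketch of the extraction: the path component immediately after eval_code is taken as the language, and the directory's own name is the exercise name. The path below is hypothetical:

#[cfg(test)]
mod exercise_path_sketch {
    use super::{get_exercise_language, get_exercise_name};
    use std::path::Path;

    #[test]
    fn extracts_language_and_name() {
        // Hypothetical exercise path inside a zed-ace-framework checkout.
        let path = Path::new("../zed-ace-framework/eval_code/python/exercises/practice/anagram");
        assert_eq!(get_exercise_language(path).unwrap(), "python");
        assert_eq!(get_exercise_name(path), "anagram");
    }
}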
pub fn find_exercises(
    framework_path: &Path,
    languages: &[&str],
    max_per_language: Option<usize>,
) -> Result<Vec<PathBuf>> {
    let mut all_exercises = Vec::new();

    println!("Searching for exercises in languages: {:?}", languages);

    for language in languages {
        let language_dir = framework_path
            .join("eval_code")
            .join(language)
            .join("exercises")
            .join("practice");

        println!("Checking language directory: {:?}", language_dir);
        if !language_dir.exists() {
            println!("Warning: Language directory not found: {:?}", language_dir);
            continue;
        }

        let mut exercises = Vec::new();
        match fs::read_dir(&language_dir) {
            Ok(entries) => {
                for entry_result in entries {
                    match entry_result {
                        Ok(entry) => {
                            let path = entry.path();

                            if path.is_dir() {
                                // Special handling for "internal" directory
                                if *language == "internal" {
                                    // Check for repo_info.json to validate it's an internal exercise
                                    let repo_info_path = path.join(".meta").join("repo_info.json");
                                    let instructions_path =
                                        path.join(".docs").join("instructions.md");

                                    if repo_info_path.exists() && instructions_path.exists() {
                                        exercises.push(path);
                                    }
                                } else {
                                    // Map the language to the file extension - original code
                                    let language_extension = match *language {
                                        "python" => "py",
                                        "go" => "go",
                                        "rust" => "rs",
                                        "typescript" => "ts",
                                        "javascript" => "js",
                                        "ruby" => "rb",
                                        "php" => "php",
                                        "bash" => "sh",
                                        "multi" => "diff",
                                        _ => continue, // Skip unsupported languages
                                    };

                                    // Check if this is a valid exercise with instructions and example
                                    let instructions_path =
                                        path.join(".docs").join("instructions.md");
                                    let has_instructions = instructions_path.exists();
                                    let example_path = path
                                        .join(".meta")
                                        .join(format!("example.{}", language_extension));
                                    let has_example = example_path.exists();

                                    if has_instructions && has_example {
                                        exercises.push(path);
                                    }
                                }
                            }
                        }
                        Err(err) => println!("Error reading directory entry: {}", err),
                    }
                }
            }
            Err(err) => println!(
                "Error reading directory {}: {}",
                language_dir.display(),
                err
            ),
        }

        // Sort exercises by name for consistent selection
        exercises.sort_by(|a, b| {
            let a_name = a.file_name().unwrap_or_default().to_string_lossy();
            let b_name = b.file_name().unwrap_or_default().to_string_lossy();
            a_name.cmp(&b_name)
        });

        // Apply the limit if specified
        if let Some(limit) = max_per_language {
            if exercises.len() > limit {
                println!(
                    "Limiting {} exercises to {} for language {}",
                    exercises.len(),
                    limit,
                    language
                );
                exercises.truncate(limit);
            }
        }

        println!(
            "Found {} exercises for language {}: {:?}",
            exercises.len(),
            language,
            exercises
                .iter()
                .map(|p| p.file_name().unwrap_or_default().to_string_lossy())
                .collect::<Vec<_>>()
        );
        all_exercises.extend(exercises);
    }

    Ok(all_exercises)
}
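For orientation, a comment sketch of the directory layout find_exercises scans; the exercise names are illustrative, while the required files follow from the checks above:

// zed-ace-framework/
//   eval_code/
//     python/exercises/practice/
//       anagram/
//         .docs/instructions.md   <- required
//         .meta/example.py        <- required for non-"internal" languages
//     internal/exercises/practice/
//       some-task/
//         .docs/instructions.md   <- required
//         .meta/repo_info.json    <- required for "internal"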
crates/agent_eval/src/git_commands.rs  +125  (new file)
@@ -0,0 +1,125 @@
use anyhow::{Result, anyhow};
use serde::Deserialize;
use std::{fs, path::Path};
use tempfile::TempDir;
use util::command::new_smol_command;
use walkdir::WalkDir;

#[derive(Debug, Deserialize)]
pub struct SetupConfig {
    #[serde(rename = "base.sha")]
    pub base_sha: String,
}

#[derive(Debug, Deserialize)]
pub struct RepoInfo {
    pub remote_url: String,
    pub head_sha: String,
}

pub async fn run_git(repo_path: &Path, args: &[&str]) -> Result<String> {
    let output = new_smol_command("git")
        .current_dir(repo_path)
        .args(args)
        .output()
        .await?;

    if output.status.success() {
        Ok(String::from_utf8(output.stdout)?.trim().to_string())
    } else {
        Err(anyhow!(
            "Git command failed: {} with status: {}",
            args.join(" "),
            output.status
        ))
    }
}

pub async fn read_base_sha(framework_path: &Path) -> Result<String> {
    let setup_path = framework_path.join("setup.json");
    let setup_content = smol::unblock(move || std::fs::read_to_string(&setup_path)).await?;
    let setup_config: SetupConfig = serde_json_lenient::from_str_lenient(&setup_content)?;
    Ok(setup_config.base_sha)
}
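Note the serde rename above: the framework-level setup.json uses a dotted key. A sketch of the file read_base_sha parses; the SHA value is made up:

// Illustrative setup.json for `SetupConfig` (SHA made up):
const EXAMPLE_SETUP_JSON: &str =
    r#"{ "base.sha": "0123456789abcdef0123456789abcdef01234567" }"#;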
pub async fn read_repo_info(exercise_path: &Path) -> Result<RepoInfo> {
    let repo_info_path = exercise_path.join(".meta").join("repo_info.json");
    println!("Reading repo info from: {}", repo_info_path.display());
    let repo_info_content = smol::unblock(move || std::fs::read_to_string(&repo_info_path)).await?;
    let repo_info: RepoInfo = serde_json_lenient::from_str_lenient(&repo_info_content)?;

    // Remove any quotes from the strings
    let remote_url = repo_info.remote_url.trim_matches('"').to_string();
    let head_sha = repo_info.head_sha.trim_matches('"').to_string();

    Ok(RepoInfo {
        remote_url,
        head_sha,
    })
}

pub async fn setup_temp_repo(exercise_path: &Path, _base_sha: &str) -> Result<TempDir> {
    let temp_dir = TempDir::new()?;

    // Check if this is an internal exercise by looking for repo_info.json
    let repo_info_path = exercise_path.join(".meta").join("repo_info.json");
    if repo_info_path.exists() {
        // This is an internal exercise, handle it differently
        let repo_info = read_repo_info(exercise_path).await?;

        // Clone the repository to the temp directory
        let url = repo_info.remote_url;
        let clone_path = temp_dir.path();
        println!(
            "Cloning repository from {} to {}",
            url,
            clone_path.display()
        );
        run_git(
            &std::env::current_dir()?,
            &["clone", &url, &clone_path.to_string_lossy()],
        )
        .await?;

        // Checkout the specified commit
        println!("Checking out commit: {}", repo_info.head_sha);
        run_git(temp_dir.path(), &["checkout", &repo_info.head_sha]).await?;

        println!("Successfully set up internal repository");
    } else {
        // Original code for regular exercises
        // Copy the exercise files to the temp directory, excluding .docs and .meta
        for entry in WalkDir::new(exercise_path).min_depth(0).max_depth(10) {
            let entry = entry?;
            let source_path = entry.path();

            // Skip .docs and .meta directories completely
            if source_path.starts_with(exercise_path.join(".docs"))
                || source_path.starts_with(exercise_path.join(".meta"))
            {
                continue;
            }

            if source_path.is_file() {
                let relative_path = source_path.strip_prefix(exercise_path)?;
                let dest_path = temp_dir.path().join(relative_path);

                // Make sure parent directories exist
                if let Some(parent) = dest_path.parent() {
                    fs::create_dir_all(parent)?;
                }

                fs::copy(source_path, dest_path)?;
            }
        }

        // Initialize git repo in the temp directory
        run_git(temp_dir.path(), &["init"]).await?;
        run_git(temp_dir.path(), &["add", "."]).await?;
        run_git(temp_dir.path(), &["commit", "-m", "Initial commit"]).await?;

        println!("Created temp repo without .docs and .meta directories");
    }

    Ok(temp_dir)
}
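An "internal" exercise carries its source repository in .meta/repo_info.json, which read_repo_info deserializes into RepoInfo above. A sketch with made-up values:

// Illustrative .meta/repo_info.json (URL and SHA made up):
const EXAMPLE_REPO_INFO_JSON: &str = r#"{
    "remote_url": "https://github.com/octocat/hello-world",
    "head_sha": "0123456789abcdef0123456789abcdef01234567"
}"#;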
crates/agent_eval/src/headless_assistant.rs  +246  (new file)
@@ -0,0 +1,246 @@
use agent::{RequestKind, Thread, ThreadEvent, ThreadStore};
use anyhow::anyhow;
use assistant_tool::ToolWorkingSet;
use client::{Client, UserStore};
use collections::HashMap;
use dap::DapRegistry;
use gpui::{App, Entity, SemanticVersion, Subscription, Task, prelude::*};
use language::LanguageRegistry;
use language_model::{
    AuthenticateError, LanguageModel, LanguageModelProviderId, LanguageModelRegistry,
};
use node_runtime::NodeRuntime;
use project::{Project, RealFs};
use prompt_store::PromptBuilder;
use settings::SettingsStore;
use smol::channel;
use std::sync::Arc;

/// Subset of `workspace::AppState` needed by `HeadlessAssistant`, with additional fields.
pub struct HeadlessAppState {
    pub languages: Arc<LanguageRegistry>,
    pub client: Arc<Client>,
    pub user_store: Entity<UserStore>,
    pub fs: Arc<dyn fs::Fs>,
    pub node_runtime: NodeRuntime,

    // Additional fields not present in `workspace::AppState`.
    pub prompt_builder: Arc<PromptBuilder>,
}

pub struct HeadlessAssistant {
    pub thread: Entity<Thread>,
    pub project: Entity<Project>,
    #[allow(dead_code)]
    pub thread_store: Entity<ThreadStore>,
    pub tool_use_counts: HashMap<Arc<str>, u32>,
    pub done_tx: channel::Sender<anyhow::Result<()>>,
    _subscription: Subscription,
}

impl HeadlessAssistant {
    pub fn new(
        app_state: Arc<HeadlessAppState>,
        cx: &mut App,
    ) -> anyhow::Result<(Entity<Self>, channel::Receiver<anyhow::Result<()>>)> {
        let env = None;
        let project = Project::local(
            app_state.client.clone(),
            app_state.node_runtime.clone(),
            app_state.user_store.clone(),
            app_state.languages.clone(),
            Arc::new(DapRegistry::default()),
            app_state.fs.clone(),
            env,
            cx,
        );

        let tools = Arc::new(ToolWorkingSet::default());
        let thread_store =
            ThreadStore::new(project.clone(), tools, app_state.prompt_builder.clone(), cx)?;

        let thread = thread_store.update(cx, |thread_store, cx| thread_store.create_thread(cx));

        let (done_tx, done_rx) = channel::unbounded::<anyhow::Result<()>>();

        let headless_thread = cx.new(move |cx| Self {
            _subscription: cx.subscribe(&thread, Self::handle_thread_event),
            thread,
            project,
            thread_store,
            tool_use_counts: HashMap::default(),
            done_tx,
        });

        Ok((headless_thread, done_rx))
    }

    fn handle_thread_event(
        &mut self,
        thread: Entity<Thread>,
        event: &ThreadEvent,
        cx: &mut Context<Self>,
    ) {
        match event {
            ThreadEvent::ShowError(err) => self
                .done_tx
                .send_blocking(Err(anyhow!("{:?}", err)))
                .unwrap(),
            ThreadEvent::DoneStreaming => {
                let thread = thread.read(cx);
                if let Some(message) = thread.messages().last() {
                    println!("Message: {}", message.to_string());
                }
                if thread.all_tools_finished() {
                    self.done_tx.send_blocking(Ok(())).unwrap()
                }
            }
            ThreadEvent::UsePendingTools => {
                thread.update(cx, |thread, cx| {
                    thread.use_pending_tools(cx);
                });
            }
            ThreadEvent::ToolConfirmationNeeded => {
                // Automatically approve all tools that need confirmation in headless mode
                println!("Tool confirmation needed - automatically approving in headless mode");

                // Get the tools needing confirmation
                let tools_needing_confirmation: Vec<_> = thread
                    .read(cx)
                    .tools_needing_confirmation()
                    .cloned()
                    .collect();

                // Run each tool that needs confirmation
                for tool_use in tools_needing_confirmation {
                    if let Some(tool) = thread.read(cx).tools().tool(&tool_use.name, cx) {
                        thread.update(cx, |thread, cx| {
                            println!("Auto-approving tool: {}", tool_use.name);

                            // Create a request to send to the tool
                            let request = thread.to_completion_request(RequestKind::Chat, cx);
                            let messages = Arc::new(request.messages);

                            // Run the tool
                            thread.run_tool(
                                tool_use.id.clone(),
                                tool_use.ui_text.clone(),
                                tool_use.input.clone(),
                                &messages,
                                tool,
                                cx,
                            );
                        });
                    }
                }
            }
            ThreadEvent::ToolFinished {
                tool_use_id,
                pending_tool_use,
                ..
            } => {
                if let Some(pending_tool_use) = pending_tool_use {
                    println!(
                        "Used tool {} with input: {}",
                        pending_tool_use.name, pending_tool_use.input
                    );
                    *self
                        .tool_use_counts
                        .entry(pending_tool_use.name.clone())
                        .or_insert(0) += 1;
                }
                if let Some(tool_result) = thread.read(cx).tool_result(tool_use_id) {
                    println!("Tool result: {:?}", tool_result);
                }
                if thread.read(cx).all_tools_finished() {
                    let model_registry = LanguageModelRegistry::read_global(cx);
                    if let Some(model) = model_registry.default_model() {
                        thread.update(cx, |thread, cx| {
                            thread.attach_tool_results(cx);
                            thread.send_to_model(model.model, RequestKind::Chat, cx);
                        });
                    } else {
                        println!(
                            "Warning: No active language model available to continue conversation"
                        );
                    }
                }
            }
            _ => {}
        }
    }
}

pub fn init(cx: &mut App) -> Arc<HeadlessAppState> {
    release_channel::init(SemanticVersion::default(), cx);
    gpui_tokio::init(cx);

    let mut settings_store = SettingsStore::new(cx);
    settings_store
        .set_default_settings(settings::default_settings().as_ref(), cx)
        .unwrap();
    cx.set_global(settings_store);
    client::init_settings(cx);
    Project::init_settings(cx);

    let client = Client::production(cx);
    cx.set_http_client(client.http_client().clone());

    let git_binary_path = None;
    let fs = Arc::new(RealFs::new(
        git_binary_path,
        cx.background_executor().clone(),
    ));

    let languages = Arc::new(LanguageRegistry::new(cx.background_executor().clone()));

    let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));

    language::init(cx);
    language_model::init(client.clone(), cx);
    language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);
    assistant_tools::init(client.http_client().clone(), cx);
    context_server::init(cx);
    let stdout_is_a_pty = false;
    let prompt_builder = PromptBuilder::load(fs.clone(), stdout_is_a_pty, cx);
    agent::init(fs.clone(), client.clone(), prompt_builder.clone(), cx);

    Arc::new(HeadlessAppState {
        languages,
        client,
        user_store,
        fs,
        node_runtime: NodeRuntime::unavailable(),
        prompt_builder,
    })
}

pub fn find_model(model_name: &str, cx: &App) -> anyhow::Result<Arc<dyn LanguageModel>> {
    let model_registry = LanguageModelRegistry::read_global(cx);
    let model = model_registry
        .available_models(cx)
        .find(|model| model.id().0 == model_name);

    let Some(model) = model else {
        return Err(anyhow!(
            "No language model named {} was available. Available models: {}",
            model_name,
            model_registry
                .available_models(cx)
                .map(|model| model.id().0.clone())
                .collect::<Vec<_>>()
                .join(", ")
        ));
    };

    Ok(model)
}

pub fn authenticate_model_provider(
    provider_id: LanguageModelProviderId,
    cx: &mut App,
) -> Task<std::result::Result<(), AuthenticateError>> {
    let model_registry = LanguageModelRegistry::read_global(cx);
    let model_provider = model_registry.provider(&provider_id).unwrap();
    model_provider.authenticate(cx)
}
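A minimal sketch of how a caller drives this type, mirroring Eval::run in eval.rs; it assumes the caller already holds an app state and a model, and it omits the system-prompt setup that eval.rs performs:

// Illustrative driver, not part of the commit; assumes the same imports as
// eval.rs (agent::RequestKind, gpui::AsyncApp, std::sync::Arc).
async fn drive_once(
    app_state: Arc<HeadlessAppState>,
    model: Arc<dyn LanguageModel>,
    cx: &mut gpui::AsyncApp,
) -> anyhow::Result<()> {
    let (assistant, done_rx) = cx.update(|cx| HeadlessAssistant::new(app_state, cx))??;
    assistant.update(cx, |assistant, cx| {
        assistant.thread.update(cx, |thread, cx| {
            thread.insert_user_message("Hello".to_string(), vec![], None, cx);
            thread.send_to_model(model, RequestKind::Chat, cx);
        });
    })?;
    // `handle_thread_event` completes the channel after DoneStreaming once all
    // tools have finished, or immediately on ShowError.
    done_rx.recv().await??;
    Ok(())
}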
crates/agent_eval/src/main.rs  +205  (new file)
@@ -0,0 +1,205 @@
mod eval;
mod get_exercise;
mod git_commands;
mod headless_assistant;

use clap::Parser;
use eval::{run_exercise_eval, save_eval_results};
use futures::stream::{self, StreamExt};
use get_exercise::{find_exercises, get_exercise_language, get_exercise_name};
use git_commands::read_base_sha;
use gpui::Application;
use headless_assistant::{authenticate_model_provider, find_model};
use language_model::LanguageModelRegistry;
use reqwest_client::ReqwestClient;
use std::{path::PathBuf, sync::Arc};

#[derive(Parser, Debug)]
#[command(
    name = "agent_eval",
    disable_version_flag = true,
    before_help = "Tool eval runner"
)]
struct Args {
    /// Match the names of evals to run.
    #[arg(long)]
    exercise_names: Vec<String>,
    /// Runs all exercises (ignored when exercise_names is given).
    #[arg(long)]
    all: bool,
    /// Supported language types to evaluate (default: internal).
    /// Internal is data generated from the agent panel.
    #[arg(long, default_value = "internal")]
    languages: String,
    /// Name of the model (default: "claude-3-7-sonnet-latest").
    #[arg(long, default_value = "claude-3-7-sonnet-latest")]
    model_name: String,
    /// Name of the editor model (default: value of `--model_name`).
    #[arg(long)]
    editor_model_name: Option<String>,
    /// Number of evaluations to run concurrently (default: 5).
    #[arg(short, long, default_value = "5")]
    concurrency: usize,
    /// Maximum number of exercises to evaluate per language.
    #[arg(long)]
    max_exercises_per_language: Option<usize>,
}

fn main() {
    env_logger::init();
    let args = Args::parse();
    let http_client = Arc::new(ReqwestClient::new());
    let app = Application::headless().with_http_client(http_client.clone());

    // Path to the zed-ace-framework repo
    let framework_path = PathBuf::from("../zed-ace-framework")
        .canonicalize()
        .unwrap();

    // Fix the 'languages' lifetime issue by creating owned Strings instead of slices
    let languages: Vec<String> = args.languages.split(',').map(|s| s.to_string()).collect();

    println!("Using zed-ace-framework at: {:?}", framework_path);
    println!("Evaluating languages: {:?}", languages);

    app.run(move |cx| {
        let app_state = headless_assistant::init(cx);

        let model = find_model(&args.model_name, cx).unwrap();
        let editor_model = if let Some(model_name) = &args.editor_model_name {
            find_model(model_name, cx).unwrap()
        } else {
            model.clone()
        };

        LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
            registry.set_default_model(Some(model.clone()), cx);
        });

        let model_provider_id = model.provider_id();
        let editor_model_provider_id = editor_model.provider_id();

        let framework_path_clone = framework_path.clone();
        let languages_clone = languages.clone();
        let exercise_names = args.exercise_names.clone();
        let all_flag = args.all;

        cx.spawn(async move |cx| {
            // Authenticate all model providers first
            cx.update(|cx| authenticate_model_provider(model_provider_id.clone(), cx))
                .unwrap()
                .await
                .unwrap();
            cx.update(|cx| authenticate_model_provider(editor_model_provider_id.clone(), cx))
                .unwrap()
                .await
                .unwrap();

            println!("framework path: {}", framework_path_clone.display());

            let base_sha = read_base_sha(&framework_path_clone).await.unwrap();

            println!("base sha: {}", base_sha);

            let all_exercises = find_exercises(
                &framework_path_clone,
                &languages_clone
                    .iter()
                    .map(|s| s.as_str())
                    .collect::<Vec<_>>(),
                args.max_exercises_per_language,
            )
            .unwrap();
            println!("Found {} exercises total", all_exercises.len());

            // Filter exercises if specific ones were requested
            let exercises_to_run = if !exercise_names.is_empty() {
                // If exercise names are specified, filter by them regardless of --all flag
                all_exercises
                    .into_iter()
                    .filter(|path| {
                        let name = get_exercise_name(path);
                        exercise_names.iter().any(|filter| name.contains(filter))
                    })
                    .collect()
            } else if all_flag {
                // Only use all_flag if no exercise names are specified
                all_exercises
            } else {
                // Default behavior (no filters)
                all_exercises
            };

            println!("Will run {} exercises", exercises_to_run.len());

            // Create exercise eval tasks - each exercise is a single task that will run templates sequentially
            let exercise_tasks: Vec<_> = exercises_to_run
                .into_iter()
                .map(|exercise_path| {
                    let exercise_name = get_exercise_name(&exercise_path);
                    let model_clone = model.clone();
                    let app_state_clone = app_state.clone();
                    let base_sha_clone = base_sha.clone();
                    let framework_path_clone = framework_path_clone.clone();
                    let cx_clone = cx.clone();

                    async move {
                        println!("Processing exercise: {}", exercise_name);
                        let mut exercise_results = Vec::new();

                        match run_exercise_eval(
                            exercise_path.clone(),
                            model_clone.clone(),
                            app_state_clone.clone(),
                            base_sha_clone.clone(),
                            framework_path_clone.clone(),
                            cx_clone.clone(),
                        )
                        .await
                        {
                            Ok(result) => {
                                println!("Completed {}", exercise_name);
                                exercise_results.push(result);
                            }
                            Err(err) => {
                                println!("Error running {}: {}", exercise_name, err);
                            }
                        }

                        // Save results for this exercise
                        if !exercise_results.is_empty() {
                            if let Err(err) =
                                save_eval_results(&exercise_path, exercise_results.clone()).await
                            {
                                println!("Error saving results for {}: {}", exercise_name, err);
                            } else {
                                println!("Saved results for {}", exercise_name);
                            }
                        }

                        exercise_results
                    }
                })
                .collect();

            println!(
                "Running {} exercises with concurrency: {}",
                exercise_tasks.len(),
                args.concurrency
            );

            // Run exercises concurrently, with each exercise running its templates sequentially
            let all_results = stream::iter(exercise_tasks)
                .buffer_unordered(args.concurrency)
                .flat_map(stream::iter)
                .collect::<Vec<_>>()
                .await;

            println!("Completed {} evaluation runs", all_results.len());
            cx.update(|cx| cx.quit()).unwrap();
        })
        .detach();
    });

    println!("Done running evals");
}
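Assuming a sibling checkout of zed-ace-framework (the hard-coded ../zed-ace-framework path above), an invocation might look like `cargo run -p agent_eval -- --languages python,rust --exercise-names anagram --concurrency 3`, using clap's default kebab-case flag names derived from the snake_case fields.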