Move assistant_evals
to agent_evals
and remove Judge logic (#28233)
Release Notes: - N/A
This commit is contained in:
parent
500d8f2943
commit
f3274851d9
13 changed files with 73 additions and 638 deletions
74
Cargo.lock
generated
74
Cargo.lock
generated
|
@ -125,6 +125,43 @@ dependencies = [
|
|||
"zed_actions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "agent_eval"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"agent",
|
||||
"anyhow",
|
||||
"assistant_tool",
|
||||
"assistant_tools",
|
||||
"clap",
|
||||
"client",
|
||||
"collections",
|
||||
"context_server",
|
||||
"dap",
|
||||
"env_logger 0.11.8",
|
||||
"fs",
|
||||
"futures 0.3.31",
|
||||
"gpui",
|
||||
"gpui_tokio",
|
||||
"language",
|
||||
"language_model",
|
||||
"language_models",
|
||||
"node_runtime",
|
||||
"project",
|
||||
"prompt_store",
|
||||
"release_channel",
|
||||
"reqwest_client",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_json_lenient",
|
||||
"settings",
|
||||
"smol",
|
||||
"tempfile",
|
||||
"util",
|
||||
"walkdir",
|
||||
"workspace-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.7.8"
|
||||
|
@ -580,43 +617,6 @@ dependencies = [
|
|||
"zed_actions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "assistant_eval"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"agent",
|
||||
"anyhow",
|
||||
"assistant_tool",
|
||||
"assistant_tools",
|
||||
"clap",
|
||||
"client",
|
||||
"collections",
|
||||
"context_server",
|
||||
"dap",
|
||||
"env_logger 0.11.8",
|
||||
"fs",
|
||||
"futures 0.3.31",
|
||||
"gpui",
|
||||
"gpui_tokio",
|
||||
"language",
|
||||
"language_model",
|
||||
"language_models",
|
||||
"node_runtime",
|
||||
"project",
|
||||
"prompt_store",
|
||||
"release_channel",
|
||||
"reqwest_client",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_json_lenient",
|
||||
"settings",
|
||||
"smol",
|
||||
"tempfile",
|
||||
"util",
|
||||
"walkdir",
|
||||
"workspace-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "assistant_settings"
|
||||
version = "0.1.0"
|
||||
|
|
|
@ -8,7 +8,7 @@ members = [
|
|||
"crates/assets",
|
||||
"crates/assistant",
|
||||
"crates/assistant_context_editor",
|
||||
"crates/assistant_eval",
|
||||
"crates/agent_eval",
|
||||
"crates/assistant_settings",
|
||||
"crates/assistant_slash_command",
|
||||
"crates/assistant_slash_commands",
|
||||
|
@ -215,7 +215,7 @@ askpass = { path = "crates/askpass" }
|
|||
assets = { path = "crates/assets" }
|
||||
assistant = { path = "crates/assistant" }
|
||||
assistant_context_editor = { path = "crates/assistant_context_editor" }
|
||||
assistant_eval = { path = "crates/assistant_eval" }
|
||||
assistant_eval = { path = "crates/agent_eval" }
|
||||
assistant_settings = { path = "crates/assistant_settings" }
|
||||
assistant_slash_command = { path = "crates/assistant_slash_command" }
|
||||
assistant_slash_commands = { path = "crates/assistant_slash_commands" }
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
[package]
|
||||
name = "assistant_eval"
|
||||
name = "agent_eval"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
publish.workspace = true
|
||||
|
@ -9,7 +9,7 @@ license = "GPL-3.0-or-later"
|
|||
workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "assistant_eval"
|
||||
name = "agent_eval"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
|
@ -1,6 +1,6 @@
|
|||
use crate::git_commands::{run_git, setup_temp_repo};
|
||||
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
|
||||
use crate::{get_exercise_language, get_exercise_name, templates_eval::Template};
|
||||
use crate::{get_exercise_language, get_exercise_name};
|
||||
use agent::RequestKind;
|
||||
use anyhow::{Result, anyhow};
|
||||
use collections::HashMap;
|
||||
|
@ -18,8 +18,6 @@ use std::{
|
|||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct EvalResult {
|
||||
pub exercise_name: String,
|
||||
pub template_name: String,
|
||||
pub score: String,
|
||||
pub diff: String,
|
||||
pub assistant_response: String,
|
||||
pub elapsed_time_ms: u128,
|
||||
|
@ -29,7 +27,6 @@ pub struct EvalResult {
|
|||
pub output_tokens: usize,
|
||||
pub total_tokens: usize,
|
||||
pub tool_use_counts: usize,
|
||||
pub judge_model_name: String, // Added field for judge model name
|
||||
}
|
||||
|
||||
pub struct EvalOutput {
|
||||
|
@ -251,29 +248,6 @@ pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
|
|||
Ok(instructions)
|
||||
}
|
||||
|
||||
pub async fn read_example_solution(exercise_path: &Path, language: &str) -> Result<String> {
|
||||
// Map the language to the file extension
|
||||
let language_extension = match language {
|
||||
"python" => "py",
|
||||
"go" => "go",
|
||||
"rust" => "rs",
|
||||
"typescript" => "ts",
|
||||
"javascript" => "js",
|
||||
"ruby" => "rb",
|
||||
"php" => "php",
|
||||
"bash" => "sh",
|
||||
"multi" => "diff",
|
||||
"internal" => "diff",
|
||||
_ => return Err(anyhow!("Unsupported language: {}", language)),
|
||||
};
|
||||
let example_path = exercise_path
|
||||
.join(".meta")
|
||||
.join(format!("example.{}", language_extension));
|
||||
println!("Reading example solution from: {}", example_path.display());
|
||||
let example = smol::unblock(move || std::fs::read_to_string(&example_path)).await?;
|
||||
Ok(example)
|
||||
}
|
||||
|
||||
pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
|
||||
let eval_dir = exercise_path.join("evaluation");
|
||||
fs::create_dir_all(&eval_dir)?;
|
||||
|
@ -311,12 +285,8 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
|
|||
// Group the new results by test name (exercise name)
|
||||
for result in results {
|
||||
let exercise_name = &result.exercise_name;
|
||||
let template_name = &result.template_name;
|
||||
|
||||
println!(
|
||||
"Adding result: exercise={}, template={}",
|
||||
exercise_name, template_name
|
||||
);
|
||||
println!("Adding result: exercise={}", exercise_name);
|
||||
|
||||
// Ensure the exercise entry exists
|
||||
if eval_data.get(exercise_name).is_none() {
|
||||
|
@ -329,7 +299,7 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
|
|||
}
|
||||
|
||||
// Add this result under the timestamp with template name as key
|
||||
eval_data[exercise_name][×tamp][template_name] = serde_json::to_value(&result)?;
|
||||
eval_data[exercise_name][×tamp] = serde_json::to_value(&result)?;
|
||||
}
|
||||
|
||||
// Write back to file with pretty formatting
|
||||
|
@ -344,9 +314,7 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
|
|||
|
||||
pub async fn run_exercise_eval(
|
||||
exercise_path: PathBuf,
|
||||
template: Template,
|
||||
model: Arc<dyn LanguageModel>,
|
||||
judge_model: Arc<dyn LanguageModel>,
|
||||
app_state: Arc<HeadlessAppState>,
|
||||
base_sha: String,
|
||||
_framework_path: PathBuf,
|
||||
|
@ -359,68 +327,15 @@ pub async fn run_exercise_eval(
|
|||
"\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
|
||||
language
|
||||
));
|
||||
let example_solution = read_example_solution(&exercise_path, &language).await?;
|
||||
|
||||
println!(
|
||||
"Running evaluation for exercise: {} with template: {}",
|
||||
exercise_name, template.name
|
||||
);
|
||||
println!("Running evaluation for exercise: {}", exercise_name);
|
||||
|
||||
// Create temporary directory with exercise files
|
||||
let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
|
||||
let temp_path = temp_dir.path().to_path_buf();
|
||||
|
||||
if template.name == "ProjectCreation" {
|
||||
for entry in fs::read_dir(&temp_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
// Skip directories that start with dot (like .docs, .meta, .git)
|
||||
if path.is_dir()
|
||||
&& path
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.map(|name| name.starts_with("."))
|
||||
.unwrap_or(false)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Delete regular files
|
||||
if path.is_file() {
|
||||
println!(" Deleting file: {}", path.display());
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the deletion so it shows up in the diff
|
||||
run_git(&temp_path, &["add", "."]).await?;
|
||||
run_git(
|
||||
&temp_path,
|
||||
&["commit", "-m", "Remove root files for clean slate"],
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;
|
||||
|
||||
// Prepare prompt based on template
|
||||
let prompt = match template.name {
|
||||
"ProjectCreation" => format!(
|
||||
"I need to create a new implementation for this exercise. Please create all the necessary files in the best location.\n\n{}",
|
||||
instructions
|
||||
),
|
||||
"CodeModification" => format!(
|
||||
"I need help updating my code to meet these requirements. Please modify the appropriate files:\n\n{}",
|
||||
instructions
|
||||
),
|
||||
"ConversationalGuidance" => format!(
|
||||
"I'm trying to solve this coding exercise but I'm not sure where to start. Can you help me understand the requirements and guide me through the solution process without writing code for me?\n\n{}",
|
||||
instructions
|
||||
),
|
||||
_ => instructions.clone(),
|
||||
};
|
||||
|
||||
let start_time = SystemTime::now();
|
||||
|
||||
// Create a basic eval struct to work with the existing system
|
||||
|
@ -430,7 +345,7 @@ pub async fn run_exercise_eval(
|
|||
url: format!("file://{}", temp_path.display()),
|
||||
base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
|
||||
},
|
||||
user_prompt: prompt,
|
||||
user_prompt: instructions.clone(),
|
||||
};
|
||||
|
||||
// Run the evaluation
|
||||
|
@ -441,79 +356,6 @@ pub async fn run_exercise_eval(
|
|||
// Get diff from git
|
||||
let diff = eval_output.diff.clone();
|
||||
|
||||
// For project creation template, we need to compare with reference implementation
|
||||
let judge_output = if template.name == "ProjectCreation" {
|
||||
let project_judge_prompt = template
|
||||
.content
|
||||
.replace(
|
||||
"<!-- ```requirements go here``` -->",
|
||||
&format!("```\n{}\n```", instructions),
|
||||
)
|
||||
.replace(
|
||||
"<!-- ```reference code goes here``` -->",
|
||||
&format!("```{}\n{}\n```", language, example_solution),
|
||||
)
|
||||
.replace(
|
||||
"<!-- ```git diff goes here``` -->",
|
||||
&format!("```\n{}\n```", diff),
|
||||
);
|
||||
|
||||
// Use the run_with_prompt method which we'll add to judge.rs
|
||||
let judge = crate::judge::Judge {
|
||||
original_diff: None,
|
||||
original_message: Some(project_judge_prompt),
|
||||
model: judge_model.clone(),
|
||||
};
|
||||
|
||||
cx.update(|cx| judge.run_with_prompt(cx))?.await?
|
||||
} else if template.name == "CodeModification" {
|
||||
// For CodeModification, we'll compare the example solution with the LLM-generated solution
|
||||
let code_judge_prompt = template
|
||||
.content
|
||||
.replace(
|
||||
"<!-- ```reference code goes here``` -->",
|
||||
&format!("```{}\n{}\n```", language, example_solution),
|
||||
)
|
||||
.replace(
|
||||
"<!-- ```git diff goes here``` -->",
|
||||
&format!("```\n{}\n```", diff),
|
||||
);
|
||||
|
||||
// Use the run_with_prompt method
|
||||
let judge = crate::judge::Judge {
|
||||
original_diff: None,
|
||||
original_message: Some(code_judge_prompt),
|
||||
model: judge_model.clone(),
|
||||
};
|
||||
|
||||
cx.update(|cx| judge.run_with_prompt(cx))?.await?
|
||||
} else {
|
||||
// Conversational template
|
||||
let conv_judge_prompt = template
|
||||
.content
|
||||
.replace(
|
||||
"<!-- ```query goes here``` -->",
|
||||
&format!("```\n{}\n```", instructions),
|
||||
)
|
||||
.replace(
|
||||
"<!-- ```transcript goes here``` -->",
|
||||
&format!("```\n{}\n```", eval_output.last_message),
|
||||
)
|
||||
.replace(
|
||||
"<!-- ```git diff goes here``` -->",
|
||||
&format!("```\n{}\n```", diff),
|
||||
);
|
||||
|
||||
// Use the run_with_prompt method for consistency
|
||||
let judge = crate::judge::Judge {
|
||||
original_diff: None,
|
||||
original_message: Some(conv_judge_prompt),
|
||||
model: judge_model.clone(),
|
||||
};
|
||||
|
||||
cx.update(|cx| judge.run_with_prompt(cx))?.await?
|
||||
};
|
||||
|
||||
let elapsed_time = start_time.elapsed()?;
|
||||
|
||||
// Calculate total tokens as the sum of input and output tokens
|
||||
|
@ -522,14 +364,9 @@ pub async fn run_exercise_eval(
|
|||
let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
|
||||
let total_tokens = input_tokens + output_tokens;
|
||||
|
||||
// Get judge model name
|
||||
let judge_model_name = judge_model.id().0.to_string();
|
||||
|
||||
// Save results to evaluation directory
|
||||
let result = EvalResult {
|
||||
exercise_name: exercise_name.clone(),
|
||||
template_name: template.name.to_string(),
|
||||
score: judge_output.trim().to_string(),
|
||||
diff,
|
||||
assistant_response: eval_output.last_message.clone(),
|
||||
elapsed_time_ms: elapsed_time.as_millis(),
|
||||
|
@ -541,7 +378,6 @@ pub async fn run_exercise_eval(
|
|||
output_tokens: output_tokens.try_into().unwrap(),
|
||||
total_tokens: total_tokens.try_into().unwrap(),
|
||||
tool_use_counts: tool_use_counts.try_into().unwrap(),
|
||||
judge_model_name, // Add judge model name to result
|
||||
};
|
||||
|
||||
Ok(result)
|
|
@ -4,12 +4,10 @@ use assistant_tool::ToolWorkingSet;
|
|||
use client::{Client, UserStore};
|
||||
use collections::HashMap;
|
||||
use dap::DapRegistry;
|
||||
use futures::StreamExt;
|
||||
use gpui::{App, AsyncApp, Entity, SemanticVersion, Subscription, Task, prelude::*};
|
||||
use gpui::{App, Entity, SemanticVersion, Subscription, Task, prelude::*};
|
||||
use language::LanguageRegistry;
|
||||
use language_model::{
|
||||
AuthenticateError, LanguageModel, LanguageModelProviderId, LanguageModelRegistry,
|
||||
LanguageModelRequest,
|
||||
};
|
||||
use node_runtime::NodeRuntime;
|
||||
use project::{Project, RealFs};
|
||||
|
@ -246,34 +244,3 @@ pub fn authenticate_model_provider(
|
|||
let model_provider = model_registry.provider(&provider_id).unwrap();
|
||||
model_provider.authenticate(cx)
|
||||
}
|
||||
|
||||
pub async fn send_language_model_request(
|
||||
model: Arc<dyn LanguageModel>,
|
||||
request: LanguageModelRequest,
|
||||
cx: &mut AsyncApp,
|
||||
) -> anyhow::Result<String> {
|
||||
match model.stream_completion_text(request, &cx).await {
|
||||
Ok(mut stream) => {
|
||||
let mut full_response = String::new();
|
||||
|
||||
// Process the response stream
|
||||
while let Some(chunk_result) = stream.stream.next().await {
|
||||
match chunk_result {
|
||||
Ok(chunk_str) => {
|
||||
full_response.push_str(&chunk_str);
|
||||
}
|
||||
Err(err) => {
|
||||
return Err(anyhow!(
|
||||
"Error receiving response from language model: {err}"
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(full_response)
|
||||
}
|
||||
Err(err) => Err(anyhow!(
|
||||
"Failed to get response from language model. Error was: {err}"
|
||||
)),
|
||||
}
|
||||
}
|
|
@ -2,8 +2,6 @@ mod eval;
|
|||
mod get_exercise;
|
||||
mod git_commands;
|
||||
mod headless_assistant;
|
||||
mod judge;
|
||||
mod templates_eval;
|
||||
|
||||
use clap::Parser;
|
||||
use eval::{run_exercise_eval, save_eval_results};
|
||||
|
@ -15,11 +13,10 @@ use headless_assistant::{authenticate_model_provider, find_model};
|
|||
use language_model::LanguageModelRegistry;
|
||||
use reqwest_client::ReqwestClient;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
use templates_eval::all_templates;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(
|
||||
name = "assistant_eval",
|
||||
name = "agent_eval",
|
||||
disable_version_flag = true,
|
||||
before_help = "Tool eval runner"
|
||||
)]
|
||||
|
@ -37,24 +34,17 @@ struct Args {
|
|||
/// Name of the model (default: "claude-3-7-sonnet-latest")
|
||||
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
|
||||
model_name: String,
|
||||
/// Name of the judge model (default: value of `--model_name`).
|
||||
/// Name of the editor model (default: value of `--model_name`).
|
||||
#[arg(long)]
|
||||
judge_model_name: Option<String>,
|
||||
editor_model_name: Option<String>,
|
||||
/// Number of evaluations to run concurrently (default: 3)
|
||||
#[arg(short, long, default_value = "3")]
|
||||
#[arg(short, long, default_value = "5")]
|
||||
concurrency: usize,
|
||||
/// Maximum number of exercises to evaluate per language
|
||||
#[arg(long)]
|
||||
max_exercises_per_language: Option<usize>,
|
||||
}
|
||||
|
||||
// First, let's define the order in which templates should be executed
|
||||
const TEMPLATE_EXECUTION_ORDER: [&str; 3] = [
|
||||
"ProjectCreation",
|
||||
"CodeModification",
|
||||
"ConversationalGuidance",
|
||||
];
|
||||
|
||||
fn main() {
|
||||
env_logger::init();
|
||||
let args = Args::parse();
|
||||
|
@ -76,7 +66,7 @@ fn main() {
|
|||
let app_state = headless_assistant::init(cx);
|
||||
|
||||
let model = find_model(&args.model_name, cx).unwrap();
|
||||
let judge_model = if let Some(model_name) = &args.judge_model_name {
|
||||
let editor_model = if let Some(model_name) = &args.editor_model_name {
|
||||
find_model(model_name, cx).unwrap()
|
||||
} else {
|
||||
model.clone()
|
||||
|
@ -87,7 +77,7 @@ fn main() {
|
|||
});
|
||||
|
||||
let model_provider_id = model.provider_id();
|
||||
let judge_model_provider_id = judge_model.provider_id();
|
||||
let editor_model_provider_id = editor_model.provider_id();
|
||||
|
||||
let framework_path_clone = framework_path.clone();
|
||||
let languages_clone = languages.clone();
|
||||
|
@ -100,15 +90,17 @@ fn main() {
|
|||
.unwrap()
|
||||
.await
|
||||
.unwrap();
|
||||
cx.update(|cx| authenticate_model_provider(judge_model_provider_id.clone(), cx))
|
||||
cx.update(|cx| authenticate_model_provider(editor_model_provider_id.clone(), cx))
|
||||
.unwrap()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Read base SHA from setup.json
|
||||
println!("framework path: {}", framework_path_clone.display());
|
||||
|
||||
let base_sha = read_base_sha(&framework_path_clone).await.unwrap();
|
||||
|
||||
// Find all exercises for the specified languages
|
||||
println!("base sha: {}", base_sha);
|
||||
|
||||
let all_exercises = find_exercises(
|
||||
&framework_path_clone,
|
||||
&languages_clone
|
||||
|
@ -140,23 +132,12 @@ fn main() {
|
|||
|
||||
println!("Will run {} exercises", exercises_to_run.len());
|
||||
|
||||
// Get all templates and sort them according to the execution order
|
||||
let mut templates = all_templates();
|
||||
templates.sort_by_key(|template| {
|
||||
TEMPLATE_EXECUTION_ORDER
|
||||
.iter()
|
||||
.position(|&name| name == template.name)
|
||||
.unwrap_or(usize::MAX)
|
||||
});
|
||||
|
||||
// Create exercise eval tasks - each exercise is a single task that will run templates sequentially
|
||||
let exercise_tasks: Vec<_> = exercises_to_run
|
||||
.into_iter()
|
||||
.map(|exercise_path| {
|
||||
let exercise_name = get_exercise_name(&exercise_path);
|
||||
let templates_clone = templates.clone();
|
||||
let model_clone = model.clone();
|
||||
let judge_model_clone = judge_model.clone();
|
||||
let app_state_clone = app_state.clone();
|
||||
let base_sha_clone = base_sha.clone();
|
||||
let framework_path_clone = framework_path_clone.clone();
|
||||
|
@ -166,56 +147,22 @@ fn main() {
|
|||
println!("Processing exercise: {}", exercise_name);
|
||||
let mut exercise_results = Vec::new();
|
||||
|
||||
// Determine the language for this exercise
|
||||
let language = match get_exercise_language(&exercise_path) {
|
||||
Ok(lang) => lang,
|
||||
match run_exercise_eval(
|
||||
exercise_path.clone(),
|
||||
model_clone.clone(),
|
||||
app_state_clone.clone(),
|
||||
base_sha_clone.clone(),
|
||||
framework_path_clone.clone(),
|
||||
cx_clone.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
println!("Completed {}", exercise_name);
|
||||
exercise_results.push(result);
|
||||
}
|
||||
Err(err) => {
|
||||
println!(
|
||||
"Error determining language for {}: {}",
|
||||
exercise_name, err
|
||||
);
|
||||
return exercise_results;
|
||||
}
|
||||
};
|
||||
|
||||
// Run each template sequentially for this exercise
|
||||
for template in templates_clone {
|
||||
// For "multi" or "internal" language, only run the CodeModification template
|
||||
if (language == "multi" || language == "internal")
|
||||
&& template.name != "CodeModification"
|
||||
{
|
||||
println!(
|
||||
"Skipping {} template for {} language",
|
||||
template.name, language
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
match run_exercise_eval(
|
||||
exercise_path.clone(),
|
||||
template.clone(),
|
||||
model_clone.clone(),
|
||||
judge_model_clone.clone(),
|
||||
app_state_clone.clone(),
|
||||
base_sha_clone.clone(),
|
||||
framework_path_clone.clone(),
|
||||
cx_clone.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
println!(
|
||||
"Completed {} with template {} - score: {}",
|
||||
exercise_name, template.name, result.score
|
||||
);
|
||||
exercise_results.push(result);
|
||||
}
|
||||
Err(err) => {
|
||||
println!(
|
||||
"Error running {} with template {}: {}",
|
||||
exercise_name, template.name, err
|
||||
);
|
||||
}
|
||||
println!("Error running {}: {}", exercise_name, err);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
# Tool Evals
|
||||
|
||||
A framework for evaluating and benchmarking the agent panel generations.
|
||||
|
||||
## Overview
|
||||
|
||||
Tool Evals provides a headless environment for running assistants evaluations on code repositories. It automates the process of:
|
||||
|
||||
1. Setting up test code and repositories
|
||||
2. Sending prompts to language models
|
||||
3. Allowing the assistant to use tools to modify code
|
||||
4. Collecting metrics on performance and tool usage
|
||||
5. Evaluating results against known good solutions
|
||||
|
||||
## How It Works
|
||||
|
||||
The system consists of several key components:
|
||||
|
||||
- **Eval**: Loads exercises from the zed-ace-framework repository, creates temporary repos, and executes evaluations
|
||||
- **HeadlessAssistant**: Provides a headless environment for running the AI assistant
|
||||
- **Judge**: Evaluates AI-generated solutions against reference implementations and assigns scores
|
||||
- **Templates**: Defines evaluation frameworks for different tasks (Project Creation, Code Modification, Conversational Guidance)
|
||||
|
||||
## Setup Requirements
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Rust and Cargo
|
||||
- Git
|
||||
- Python (for report generation)
|
||||
- Network access to clone repositories
|
||||
- Appropriate API keys for language models and git services (Anthropic, GitHub, etc.)
|
||||
|
||||
### Environment Variables
|
||||
|
||||
Ensure you have the required API keys set, either from a dev run of Zed or via these environment variables:
|
||||
- `ZED_ANTHROPIC_API_KEY` for Claude models
|
||||
- `ZED_GITHUB_API_KEY` for GitHub API (or similar)
|
||||
|
||||
## Usage
|
||||
|
||||
### Running Evaluations
|
||||
|
||||
```bash
|
||||
# Run all tests
|
||||
cargo run -p assistant_eval -- --all
|
||||
|
||||
# Run only specific languages
|
||||
cargo run -p assistant_eval -- --all --languages python,rust
|
||||
|
||||
# Limit concurrent evaluations
|
||||
cargo run -p assistant_eval -- --all --concurrency 5
|
||||
|
||||
# Limit number of exercises per language
|
||||
cargo run -p assistant_eval -- --all --max-exercises-per-language 3
|
||||
```
|
||||
|
||||
### Evaluation Template Types
|
||||
|
||||
The system supports three types of evaluation templates:
|
||||
|
||||
1. **ProjectCreation**: Tests the model's ability to create new implementations from scratch
|
||||
2. **CodeModification**: Tests the model's ability to modify existing code to meet new requirements
|
||||
3. **ConversationalGuidance**: Tests the model's ability to provide guidance without writing code
|
||||
|
||||
### Support Repo
|
||||
|
||||
The [zed-industries/zed-ace-framework](https://github.com/zed-industries/zed-ace-framework) contains the analytics and reporting scripts.
|
|
@ -1,37 +0,0 @@
|
|||
use crate::headless_assistant::send_language_model_request;
|
||||
use anyhow::anyhow;
|
||||
use gpui::{App, Task};
|
||||
use language_model::{
|
||||
LanguageModel, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct Judge {
|
||||
#[allow(dead_code)]
|
||||
pub original_diff: Option<String>,
|
||||
pub original_message: Option<String>,
|
||||
pub model: Arc<dyn LanguageModel>,
|
||||
}
|
||||
|
||||
impl Judge {
|
||||
pub fn run_with_prompt(&self, cx: &mut App) -> Task<anyhow::Result<String>> {
|
||||
let Some(prompt) = self.original_message.as_ref() else {
|
||||
return Task::ready(Err(anyhow!("No prompt provided in original_message")));
|
||||
};
|
||||
|
||||
let request = LanguageModelRequest {
|
||||
messages: vec![LanguageModelRequestMessage {
|
||||
role: Role::User,
|
||||
content: vec![MessageContent::Text(prompt.clone())],
|
||||
cache: false,
|
||||
}],
|
||||
temperature: Some(0.0),
|
||||
tools: Vec::new(),
|
||||
stop: Vec::new(),
|
||||
};
|
||||
|
||||
let model = self.model.clone();
|
||||
let request = request.clone();
|
||||
cx.spawn(async move |cx| send_language_model_request(model, request, cx).await)
|
||||
}
|
||||
}
|
|
@ -1,210 +0,0 @@
|
|||
#[derive(Clone, Debug)]
|
||||
pub struct Template {
|
||||
pub name: &'static str,
|
||||
pub content: &'static str,
|
||||
}
|
||||
|
||||
pub fn all_templates() -> Vec<Template> {
|
||||
vec![
|
||||
Template {
|
||||
name: "ProjectCreation",
|
||||
content: r#"
|
||||
# Project Creation Evaluation Template
|
||||
|
||||
## Instructions
|
||||
|
||||
Evaluate how well the AI assistant created a new implementation from scratch. Score it between 0.0 and 1.0 based on quality and fulfillment of requirements.
|
||||
- 1.0 = Perfect implementation that creates all necessary files with correct functionality.
|
||||
- 0.0 = Completely fails to create working files or meet requirements.
|
||||
|
||||
Note: A git diff output is required. If no code changes are provided (i.e., no git diff output), the score must be 0.0.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
Please consider the following aspects in order of importance:
|
||||
|
||||
1. **File Creation (25%)**
|
||||
- Did the assistant create all necessary files?
|
||||
- Are the files appropriately named and organized?
|
||||
- Did the assistant create a complete solution without missing components?
|
||||
|
||||
2. **Functional Correctness (40%)**
|
||||
- Does the implementation fulfill all specified requirements?
|
||||
- Does it handle edge cases properly?
|
||||
- Is it free of logical errors and bugs?
|
||||
- Do all components work together as expected?
|
||||
|
||||
3. **Code Quality (20%)**
|
||||
- Is the code well-structured, readable and well-documented?
|
||||
- Does it follow language-specific best practices?
|
||||
- Is there proper error handling?
|
||||
- Are naming conventions clear and consistent?
|
||||
|
||||
4. **Architecture Design (15%)**
|
||||
- Is the code modular and extensible?
|
||||
- Is there proper separation of concerns?
|
||||
- Are appropriate design patterns used?
|
||||
- Is the overall architecture appropriate for the requirements?
|
||||
|
||||
## Input
|
||||
|
||||
Requirements:
|
||||
<!-- ```requirements go here``` -->
|
||||
|
||||
Reference Implementation:
|
||||
<!-- ```reference code goes here``` -->
|
||||
|
||||
AI-Generated Implementation (git diff output):
|
||||
<!-- ```git diff goes here``` -->
|
||||
|
||||
## Output Format
|
||||
|
||||
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
|
||||
|
||||
EXAMPLE ONE:
|
||||
|
||||
0.92
|
||||
|
||||
EXAMPLE TWO:
|
||||
|
||||
0.85
|
||||
|
||||
EXAMPLE THREE:
|
||||
|
||||
0.78
|
||||
"#,
|
||||
},
|
||||
Template {
|
||||
name: "CodeModification",
|
||||
content: r#"
|
||||
# Code Modification Evaluation Template
|
||||
|
||||
## Instructions
|
||||
|
||||
Evaluate how well the AI assistant modified existing code to meet requirements. Score between 0.0 and 1.0 based on quality and appropriateness of changes.
|
||||
- 1.0 = Perfect modifications that correctly implement all requirements.
|
||||
- 0.0 = Failed to make appropriate changes or introduced serious errors.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
Please consider the following aspects in order of importance:
|
||||
|
||||
1. **Functional Correctness (50%)**
|
||||
- Do the modifications correctly implement the requirements?
|
||||
- Did the assistant modify the right files and code sections?
|
||||
- Are the changes free of bugs and logical errors?
|
||||
- Do the modifications maintain compatibility with existing code?
|
||||
|
||||
2. **Modification Approach (25%)**
|
||||
- Are the changes minimal and focused on what needs to be changed?
|
||||
- Did the assistant avoid unnecessary modifications?
|
||||
- Are the changes integrated seamlessly with the existing codebase?
|
||||
- Did the assistant preserve the original code style and patterns?
|
||||
|
||||
3. **Code Quality (15%)**
|
||||
- Are the modifications well-structured and documented?
|
||||
- Do they follow the same conventions as the original code?
|
||||
- Is there proper error handling in the modified code?
|
||||
- Are the changes readable and maintainable?
|
||||
|
||||
4. **Solution Completeness (10%)**
|
||||
- Do the modifications completely address all requirements?
|
||||
- Are there any missing changes or overlooked requirements?
|
||||
- Did the assistant consider all necessary edge cases?
|
||||
|
||||
## Input
|
||||
|
||||
Original:
|
||||
<!-- ```reference code goes here``` -->
|
||||
|
||||
New (git diff output):
|
||||
<!-- ```git diff goes here``` -->
|
||||
|
||||
## Output Format
|
||||
|
||||
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
|
||||
|
||||
EXAMPLE ONE:
|
||||
|
||||
0.92
|
||||
|
||||
EXAMPLE TWO:
|
||||
|
||||
0.85
|
||||
|
||||
EXAMPLE THREE:
|
||||
|
||||
0.78
|
||||
"#,
|
||||
},
|
||||
Template {
|
||||
name: "ConversationalGuidance",
|
||||
content: r#"
|
||||
# Conversational Guidance Evaluation Template
|
||||
|
||||
## Instructions
|
||||
|
||||
Evaluate the quality of the AI assistant's conversational guidance and score it between 0.0 and 1.0.
|
||||
- 1.0 = Perfect guidance with ideal information gathering, clarification, and advice without writing code.
|
||||
- 0.0 = Completely unhelpful, inappropriate guidance, or wrote code when it should not have.
|
||||
|
||||
## Evaluation Criteria
|
||||
|
||||
ABSOLUTE REQUIREMENT:
|
||||
- The assistant should NOT generate complete code solutions in conversation mode.
|
||||
- If the git diff shows the assistant wrote complete code, the score should be significantly reduced.
|
||||
|
||||
1. **Information Gathering Effectiveness (30%)**
|
||||
- Did the assistant ask relevant and precise questions?
|
||||
- Did it efficiently narrow down the problem scope?
|
||||
- Did it avoid unnecessary or redundant questions?
|
||||
- Was questioning appropriately paced and contextual?
|
||||
|
||||
2. **Conceptual Guidance (30%)**
|
||||
- Did the assistant provide high-level approaches and strategies?
|
||||
- Did it explain relevant concepts and algorithms?
|
||||
- Did it offer planning advice without implementing the solution?
|
||||
- Did it suggest a structured approach to solving the problem?
|
||||
|
||||
3. **Educational Value (20%)**
|
||||
- Did the assistant help the user understand the problem better?
|
||||
- Did it provide explanations that would help the user learn?
|
||||
- Did it guide without simply giving away answers?
|
||||
- Did it encourage the user to think through parts of the problem?
|
||||
|
||||
4. **Conversation Quality (20%)**
|
||||
- Was the conversation logically structured and easy to follow?
|
||||
- Did the assistant maintain appropriate context throughout?
|
||||
- Was the interaction helpful without being condescending?
|
||||
- Did the conversation reach a satisfactory conclusion with clear next steps?
|
||||
|
||||
## Input
|
||||
|
||||
Initial Query:
|
||||
<!-- ```query goes here``` -->
|
||||
|
||||
Conversation Transcript:
|
||||
<!-- ```transcript goes here``` -->
|
||||
|
||||
Git Diff:
|
||||
<!-- ```git diff goes here``` -->
|
||||
|
||||
## Output Format
|
||||
|
||||
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
|
||||
|
||||
EXAMPLE ONE:
|
||||
|
||||
0.92
|
||||
|
||||
EXAMPLE TWO:
|
||||
|
||||
0.85
|
||||
|
||||
EXAMPLE THREE:
|
||||
|
||||
0.78
|
||||
"#,
|
||||
},
|
||||
]
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue