Move assistant_evals to agent_evals and remove Judge logic (#28233)

Release Notes:

- N/A
This commit is contained in:
Thomas Mickley-Doyle 2025-04-07 13:28:06 -05:00 committed by GitHub
parent 500d8f2943
commit f3274851d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 73 additions and 638 deletions

74
Cargo.lock generated
View file

@ -125,6 +125,43 @@ dependencies = [
"zed_actions",
]
[[package]]
name = "agent_eval"
version = "0.1.0"
dependencies = [
"agent",
"anyhow",
"assistant_tool",
"assistant_tools",
"clap",
"client",
"collections",
"context_server",
"dap",
"env_logger 0.11.8",
"fs",
"futures 0.3.31",
"gpui",
"gpui_tokio",
"language",
"language_model",
"language_models",
"node_runtime",
"project",
"prompt_store",
"release_channel",
"reqwest_client",
"serde",
"serde_json",
"serde_json_lenient",
"settings",
"smol",
"tempfile",
"util",
"walkdir",
"workspace-hack",
]
[[package]]
name = "ahash"
version = "0.7.8"
@ -580,43 +617,6 @@ dependencies = [
"zed_actions",
]
[[package]]
name = "assistant_eval"
version = "0.1.0"
dependencies = [
"agent",
"anyhow",
"assistant_tool",
"assistant_tools",
"clap",
"client",
"collections",
"context_server",
"dap",
"env_logger 0.11.8",
"fs",
"futures 0.3.31",
"gpui",
"gpui_tokio",
"language",
"language_model",
"language_models",
"node_runtime",
"project",
"prompt_store",
"release_channel",
"reqwest_client",
"serde",
"serde_json",
"serde_json_lenient",
"settings",
"smol",
"tempfile",
"util",
"walkdir",
"workspace-hack",
]
[[package]]
name = "assistant_settings"
version = "0.1.0"

View file

@ -8,7 +8,7 @@ members = [
"crates/assets",
"crates/assistant",
"crates/assistant_context_editor",
"crates/assistant_eval",
"crates/agent_eval",
"crates/assistant_settings",
"crates/assistant_slash_command",
"crates/assistant_slash_commands",
@ -215,7 +215,7 @@ askpass = { path = "crates/askpass" }
assets = { path = "crates/assets" }
assistant = { path = "crates/assistant" }
assistant_context_editor = { path = "crates/assistant_context_editor" }
assistant_eval = { path = "crates/assistant_eval" }
# The key must match the `name` in crates/agent_eval/Cargo.toml so `agent_eval.workspace = true` resolves.
agent_eval = { path = "crates/agent_eval" }
assistant_settings = { path = "crates/assistant_settings" }
assistant_slash_command = { path = "crates/assistant_slash_command" }
assistant_slash_commands = { path = "crates/assistant_slash_commands" }

View file

@ -1,5 +1,5 @@
[package]
name = "assistant_eval"
name = "agent_eval"
version = "0.1.0"
edition.workspace = true
publish.workspace = true
@ -9,7 +9,7 @@ license = "GPL-3.0-or-later"
workspace = true
[[bin]]
name = "assistant_eval"
name = "agent_eval"
path = "src/main.rs"
[dependencies]

View file

@ -1,6 +1,6 @@
use crate::git_commands::{run_git, setup_temp_repo};
use crate::headless_assistant::{HeadlessAppState, HeadlessAssistant};
use crate::{get_exercise_language, get_exercise_name, templates_eval::Template};
use crate::{get_exercise_language, get_exercise_name};
use agent::RequestKind;
use anyhow::{Result, anyhow};
use collections::HashMap;
@ -18,8 +18,6 @@ use std::{
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct EvalResult {
pub exercise_name: String,
pub template_name: String,
pub score: String,
pub diff: String,
pub assistant_response: String,
pub elapsed_time_ms: u128,
@ -29,7 +27,6 @@ pub struct EvalResult {
pub output_tokens: usize,
pub total_tokens: usize,
pub tool_use_counts: usize,
pub judge_model_name: String, // Added field for judge model name
}
pub struct EvalOutput {
@ -251,29 +248,6 @@ pub async fn read_instructions(exercise_path: &Path) -> Result<String> {
Ok(instructions)
}
/// Loads the reference solution stored at `<exercise_path>/.meta/example.<ext>`.
///
/// The file extension is chosen from `language`; both `"multi"` and `"internal"`
/// resolve to a `.diff` file.
///
/// # Errors
///
/// Returns an error when `language` is not one of the recognized names, or when
/// the example file cannot be read.
pub async fn read_example_solution(exercise_path: &Path, language: &str) -> Result<String> {
    // Translate the language name into the extension used by the example file.
    let extension = match language {
        "python" => "py",
        "go" => "go",
        "rust" => "rs",
        "typescript" => "ts",
        "javascript" => "js",
        "ruby" => "rb",
        "php" => "php",
        "bash" => "sh",
        "multi" | "internal" => "diff",
        unsupported => return Err(anyhow!("Unsupported language: {}", unsupported)),
    };

    let file_name = format!("example.{}", extension);
    let solution_path = exercise_path.join(".meta").join(file_name);
    println!("Reading example solution from: {}", solution_path.display());

    // The blocking filesystem read is handed to `smol::unblock` and awaited.
    let contents = smol::unblock(move || std::fs::read_to_string(&solution_path)).await?;
    Ok(contents)
}
pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -> Result<()> {
let eval_dir = exercise_path.join("evaluation");
fs::create_dir_all(&eval_dir)?;
@ -311,12 +285,8 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
// Group the new results by test name (exercise name)
for result in results {
let exercise_name = &result.exercise_name;
let template_name = &result.template_name;
println!(
"Adding result: exercise={}, template={}",
exercise_name, template_name
);
println!("Adding result: exercise={}", exercise_name);
// Ensure the exercise entry exists
if eval_data.get(exercise_name).is_none() {
@ -329,7 +299,7 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
}
// Add this result under the timestamp with template name as key
eval_data[exercise_name][&timestamp][template_name] = serde_json::to_value(&result)?;
eval_data[exercise_name][&timestamp] = serde_json::to_value(&result)?;
}
// Write back to file with pretty formatting
@ -344,9 +314,7 @@ pub async fn save_eval_results(exercise_path: &Path, results: Vec<EvalResult>) -
pub async fn run_exercise_eval(
exercise_path: PathBuf,
template: Template,
model: Arc<dyn LanguageModel>,
judge_model: Arc<dyn LanguageModel>,
app_state: Arc<HeadlessAppState>,
base_sha: String,
_framework_path: PathBuf,
@ -359,68 +327,15 @@ pub async fn run_exercise_eval(
"\n\nWhen writing the code for this prompt, use {} to achieve the goal.",
language
));
let example_solution = read_example_solution(&exercise_path, &language).await?;
println!(
"Running evaluation for exercise: {} with template: {}",
exercise_name, template.name
);
println!("Running evaluation for exercise: {}", exercise_name);
// Create temporary directory with exercise files
let temp_dir = setup_temp_repo(&exercise_path, &base_sha).await?;
let temp_path = temp_dir.path().to_path_buf();
if template.name == "ProjectCreation" {
for entry in fs::read_dir(&temp_path)? {
let entry = entry?;
let path = entry.path();
// Skip directories that start with dot (like .docs, .meta, .git)
if path.is_dir()
&& path
.file_name()
.and_then(|name| name.to_str())
.map(|name| name.starts_with("."))
.unwrap_or(false)
{
continue;
}
// Delete regular files
if path.is_file() {
println!(" Deleting file: {}", path.display());
fs::remove_file(path)?;
}
}
// Commit the deletion so it shows up in the diff
run_git(&temp_path, &["add", "."]).await?;
run_git(
&temp_path,
&["commit", "-m", "Remove root files for clean slate"],
)
.await?;
}
let local_commit_sha = run_git(&temp_path, &["rev-parse", "HEAD"]).await?;
// Prepare prompt based on template
let prompt = match template.name {
"ProjectCreation" => format!(
"I need to create a new implementation for this exercise. Please create all the necessary files in the best location.\n\n{}",
instructions
),
"CodeModification" => format!(
"I need help updating my code to meet these requirements. Please modify the appropriate files:\n\n{}",
instructions
),
"ConversationalGuidance" => format!(
"I'm trying to solve this coding exercise but I'm not sure where to start. Can you help me understand the requirements and guide me through the solution process without writing code for me?\n\n{}",
instructions
),
_ => instructions.clone(),
};
let start_time = SystemTime::now();
// Create a basic eval struct to work with the existing system
@ -430,7 +345,7 @@ pub async fn run_exercise_eval(
url: format!("file://{}", temp_path.display()),
base_sha: local_commit_sha, // Use the local commit SHA instead of the framework base SHA
},
user_prompt: prompt,
user_prompt: instructions.clone(),
};
// Run the evaluation
@ -441,79 +356,6 @@ pub async fn run_exercise_eval(
// Get diff from git
let diff = eval_output.diff.clone();
// For project creation template, we need to compare with reference implementation
let judge_output = if template.name == "ProjectCreation" {
let project_judge_prompt = template
.content
.replace(
"<!-- ```requirements go here``` -->",
&format!("```\n{}\n```", instructions),
)
.replace(
"<!-- ```reference code goes here``` -->",
&format!("```{}\n{}\n```", language, example_solution),
)
.replace(
"<!-- ```git diff goes here``` -->",
&format!("```\n{}\n```", diff),
);
// Use the run_with_prompt method which we'll add to judge.rs
let judge = crate::judge::Judge {
original_diff: None,
original_message: Some(project_judge_prompt),
model: judge_model.clone(),
};
cx.update(|cx| judge.run_with_prompt(cx))?.await?
} else if template.name == "CodeModification" {
// For CodeModification, we'll compare the example solution with the LLM-generated solution
let code_judge_prompt = template
.content
.replace(
"<!-- ```reference code goes here``` -->",
&format!("```{}\n{}\n```", language, example_solution),
)
.replace(
"<!-- ```git diff goes here``` -->",
&format!("```\n{}\n```", diff),
);
// Use the run_with_prompt method
let judge = crate::judge::Judge {
original_diff: None,
original_message: Some(code_judge_prompt),
model: judge_model.clone(),
};
cx.update(|cx| judge.run_with_prompt(cx))?.await?
} else {
// Conversational template
let conv_judge_prompt = template
.content
.replace(
"<!-- ```query goes here``` -->",
&format!("```\n{}\n```", instructions),
)
.replace(
"<!-- ```transcript goes here``` -->",
&format!("```\n{}\n```", eval_output.last_message),
)
.replace(
"<!-- ```git diff goes here``` -->",
&format!("```\n{}\n```", diff),
);
// Use the run_with_prompt method for consistency
let judge = crate::judge::Judge {
original_diff: None,
original_message: Some(conv_judge_prompt),
model: judge_model.clone(),
};
cx.update(|cx| judge.run_with_prompt(cx))?.await?
};
let elapsed_time = start_time.elapsed()?;
// Calculate total tokens as the sum of input and output tokens
@ -522,14 +364,9 @@ pub async fn run_exercise_eval(
let tool_use_counts = eval_output.tool_use_counts.values().sum::<u32>();
let total_tokens = input_tokens + output_tokens;
// Get judge model name
let judge_model_name = judge_model.id().0.to_string();
// Save results to evaluation directory
let result = EvalResult {
exercise_name: exercise_name.clone(),
template_name: template.name.to_string(),
score: judge_output.trim().to_string(),
diff,
assistant_response: eval_output.last_message.clone(),
elapsed_time_ms: elapsed_time.as_millis(),
@ -541,7 +378,6 @@ pub async fn run_exercise_eval(
output_tokens: output_tokens.try_into().unwrap(),
total_tokens: total_tokens.try_into().unwrap(),
tool_use_counts: tool_use_counts.try_into().unwrap(),
judge_model_name, // Add judge model name to result
};
Ok(result)

View file

@ -4,12 +4,10 @@ use assistant_tool::ToolWorkingSet;
use client::{Client, UserStore};
use collections::HashMap;
use dap::DapRegistry;
use futures::StreamExt;
use gpui::{App, AsyncApp, Entity, SemanticVersion, Subscription, Task, prelude::*};
use gpui::{App, Entity, SemanticVersion, Subscription, Task, prelude::*};
use language::LanguageRegistry;
use language_model::{
AuthenticateError, LanguageModel, LanguageModelProviderId, LanguageModelRegistry,
LanguageModelRequest,
};
use node_runtime::NodeRuntime;
use project::{Project, RealFs};
@ -246,34 +244,3 @@ pub fn authenticate_model_provider(
let model_provider = model_registry.provider(&provider_id).unwrap();
model_provider.authenticate(cx)
}
pub async fn send_language_model_request(
model: Arc<dyn LanguageModel>,
request: LanguageModelRequest,
cx: &mut AsyncApp,
) -> anyhow::Result<String> {
match model.stream_completion_text(request, &cx).await {
Ok(mut stream) => {
let mut full_response = String::new();
// Process the response stream
while let Some(chunk_result) = stream.stream.next().await {
match chunk_result {
Ok(chunk_str) => {
full_response.push_str(&chunk_str);
}
Err(err) => {
return Err(anyhow!(
"Error receiving response from language model: {err}"
));
}
}
}
Ok(full_response)
}
Err(err) => Err(anyhow!(
"Failed to get response from language model. Error was: {err}"
)),
}
}

View file

@ -2,8 +2,6 @@ mod eval;
mod get_exercise;
mod git_commands;
mod headless_assistant;
mod judge;
mod templates_eval;
use clap::Parser;
use eval::{run_exercise_eval, save_eval_results};
@ -15,11 +13,10 @@ use headless_assistant::{authenticate_model_provider, find_model};
use language_model::LanguageModelRegistry;
use reqwest_client::ReqwestClient;
use std::{path::PathBuf, sync::Arc};
use templates_eval::all_templates;
#[derive(Parser, Debug)]
#[command(
name = "assistant_eval",
name = "agent_eval",
disable_version_flag = true,
before_help = "Tool eval runner"
)]
@ -37,24 +34,17 @@ struct Args {
/// Name of the model (default: "claude-3-7-sonnet-latest")
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
model_name: String,
/// Name of the judge model (default: value of `--model_name`).
/// Name of the editor model (default: value of `--model_name`).
#[arg(long)]
judge_model_name: Option<String>,
editor_model_name: Option<String>,
/// Number of evaluations to run concurrently (default: 5)
#[arg(short, long, default_value = "3")]
#[arg(short, long, default_value = "5")]
concurrency: usize,
/// Maximum number of exercises to evaluate per language
#[arg(long)]
max_exercises_per_language: Option<usize>,
}
// First, let's define the order in which templates should be executed
const TEMPLATE_EXECUTION_ORDER: [&str; 3] = [
"ProjectCreation",
"CodeModification",
"ConversationalGuidance",
];
fn main() {
env_logger::init();
let args = Args::parse();
@ -76,7 +66,7 @@ fn main() {
let app_state = headless_assistant::init(cx);
let model = find_model(&args.model_name, cx).unwrap();
let judge_model = if let Some(model_name) = &args.judge_model_name {
let editor_model = if let Some(model_name) = &args.editor_model_name {
find_model(model_name, cx).unwrap()
} else {
model.clone()
@ -87,7 +77,7 @@ fn main() {
});
let model_provider_id = model.provider_id();
let judge_model_provider_id = judge_model.provider_id();
let editor_model_provider_id = editor_model.provider_id();
let framework_path_clone = framework_path.clone();
let languages_clone = languages.clone();
@ -100,15 +90,17 @@ fn main() {
.unwrap()
.await
.unwrap();
cx.update(|cx| authenticate_model_provider(judge_model_provider_id.clone(), cx))
cx.update(|cx| authenticate_model_provider(editor_model_provider_id.clone(), cx))
.unwrap()
.await
.unwrap();
// Read base SHA from setup.json
println!("framework path: {}", framework_path_clone.display());
let base_sha = read_base_sha(&framework_path_clone).await.unwrap();
// Find all exercises for the specified languages
println!("base sha: {}", base_sha);
let all_exercises = find_exercises(
&framework_path_clone,
&languages_clone
@ -140,23 +132,12 @@ fn main() {
println!("Will run {} exercises", exercises_to_run.len());
// Get all templates and sort them according to the execution order
let mut templates = all_templates();
templates.sort_by_key(|template| {
TEMPLATE_EXECUTION_ORDER
.iter()
.position(|&name| name == template.name)
.unwrap_or(usize::MAX)
});
// Create exercise eval tasks - each exercise is a single task that will run templates sequentially
let exercise_tasks: Vec<_> = exercises_to_run
.into_iter()
.map(|exercise_path| {
let exercise_name = get_exercise_name(&exercise_path);
let templates_clone = templates.clone();
let model_clone = model.clone();
let judge_model_clone = judge_model.clone();
let app_state_clone = app_state.clone();
let base_sha_clone = base_sha.clone();
let framework_path_clone = framework_path_clone.clone();
@ -166,56 +147,22 @@ fn main() {
println!("Processing exercise: {}", exercise_name);
let mut exercise_results = Vec::new();
// Determine the language for this exercise
let language = match get_exercise_language(&exercise_path) {
Ok(lang) => lang,
match run_exercise_eval(
exercise_path.clone(),
model_clone.clone(),
app_state_clone.clone(),
base_sha_clone.clone(),
framework_path_clone.clone(),
cx_clone.clone(),
)
.await
{
Ok(result) => {
println!("Completed {}", exercise_name);
exercise_results.push(result);
}
Err(err) => {
println!(
"Error determining language for {}: {}",
exercise_name, err
);
return exercise_results;
}
};
// Run each template sequentially for this exercise
for template in templates_clone {
// For "multi" or "internal" language, only run the CodeModification template
if (language == "multi" || language == "internal")
&& template.name != "CodeModification"
{
println!(
"Skipping {} template for {} language",
template.name, language
);
continue;
}
match run_exercise_eval(
exercise_path.clone(),
template.clone(),
model_clone.clone(),
judge_model_clone.clone(),
app_state_clone.clone(),
base_sha_clone.clone(),
framework_path_clone.clone(),
cx_clone.clone(),
)
.await
{
Ok(result) => {
println!(
"Completed {} with template {} - score: {}",
exercise_name, template.name, result.score
);
exercise_results.push(result);
}
Err(err) => {
println!(
"Error running {} with template {}: {}",
exercise_name, template.name, err
);
}
println!("Error running {}: {}", exercise_name, err);
}
}

View file

@ -1,68 +0,0 @@
# Tool Evals
A framework for evaluating and benchmarking agent panel generations.
## Overview
Tool Evals provides a headless environment for running assistant evaluations on code repositories. It automates the process of:
1. Setting up test code and repositories
2. Sending prompts to language models
3. Allowing the assistant to use tools to modify code
4. Collecting metrics on performance and tool usage
5. Evaluating results against known good solutions
## How It Works
The system consists of several key components:
- **Eval**: Loads exercises from the zed-ace-framework repository, creates temporary repos, and executes evaluations
- **HeadlessAssistant**: Provides a headless environment for running the AI assistant
- **Judge**: Evaluates AI-generated solutions against reference implementations and assigns scores
- **Templates**: Defines evaluation frameworks for different tasks (Project Creation, Code Modification, Conversational Guidance)
## Setup Requirements
### Prerequisites
- Rust and Cargo
- Git
- Python (for report generation)
- Network access to clone repositories
- Appropriate API keys for language models and git services (Anthropic, GitHub, etc.)
### Environment Variables
Ensure you have the required API keys set, either from a dev run of Zed or via these environment variables:
- `ZED_ANTHROPIC_API_KEY` for Claude models
- `ZED_GITHUB_API_KEY` for the GitHub API (or the equivalent key for another git service)
## Usage
### Running Evaluations
```bash
# Run all tests
cargo run -p assistant_eval -- --all
# Run only specific languages
cargo run -p assistant_eval -- --all --languages python,rust
# Limit concurrent evaluations
cargo run -p assistant_eval -- --all --concurrency 5
# Limit number of exercises per language
cargo run -p assistant_eval -- --all --max-exercises-per-language 3
```
### Evaluation Template Types
The system supports three types of evaluation templates:
1. **ProjectCreation**: Tests the model's ability to create new implementations from scratch
2. **CodeModification**: Tests the model's ability to modify existing code to meet new requirements
3. **ConversationalGuidance**: Tests the model's ability to provide guidance without writing code
### Support Repo
The [zed-industries/zed-ace-framework](https://github.com/zed-industries/zed-ace-framework) contains the analytics and reporting scripts.

View file

@ -1,37 +0,0 @@
use crate::headless_assistant::send_language_model_request;
use anyhow::anyhow;
use gpui::{App, Task};
use language_model::{
LanguageModel, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role,
};
use std::sync::Arc;
/// Sends one pre-rendered prompt to a language model and returns the model's text reply.
pub struct Judge {
    /// Not read by `run_with_prompt`; retained so existing struct literals keep compiling.
    #[allow(dead_code)]
    pub original_diff: Option<String>,
    /// Full prompt text to send. `run_with_prompt` reports an error when this is `None`.
    pub original_message: Option<String>,
    /// Model that receives the prompt.
    pub model: Arc<dyn LanguageModel>,
}

impl Judge {
    /// Sends `original_message` to `model` as a single user message and resolves to
    /// the model's complete text response.
    ///
    /// The request carries no tools and no stop sequences, and the message is not
    /// marked for caching.
    ///
    /// # Errors
    ///
    /// The returned task resolves to an error immediately when `original_message`
    /// is `None`, and otherwise propagates any error from
    /// `send_language_model_request`.
    pub fn run_with_prompt(&self, cx: &mut App) -> Task<anyhow::Result<String>> {
        let Some(prompt) = self.original_message.as_ref() else {
            return Task::ready(Err(anyhow!("No prompt provided in original_message")));
        };

        let request = LanguageModelRequest {
            messages: vec![LanguageModelRequestMessage {
                role: Role::User,
                content: vec![MessageContent::Text(prompt.clone())],
                cache: false,
            }],
            // NOTE(review): temperature 0.0 is presumably meant to keep scores repeatable — confirm.
            temperature: Some(0.0),
            tools: Vec::new(),
            stop: Vec::new(),
        };

        // `request` is moved straight into the spawned future, so no extra clone is needed.
        let model = Arc::clone(&self.model);
        cx.spawn(async move |cx| send_language_model_request(model, request, cx).await)
    }
}

View file

@ -1,210 +0,0 @@
/// A named judge-prompt template.
#[derive(Clone, Debug)]
pub struct Template {
/// Identifier compared against string literals such as "ProjectCreation" by callers.
pub name: &'static str,
/// Prompt text containing `<!-- ... -->` placeholder markers that callers fill in
/// with `str::replace` before sending the prompt to a model.
pub content: &'static str,
}
/// Returns the three built-in judge templates, in this order: "ProjectCreation",
/// "CodeModification", "ConversationalGuidance".
///
/// Every template's `content` ends by instructing the model to output only a score
/// between 0.0 and 1.0.
pub fn all_templates() -> Vec<Template> {
vec![
// Placeholders: requirements, reference code, git diff.
Template {
name: "ProjectCreation",
content: r#"
# Project Creation Evaluation Template
## Instructions
Evaluate how well the AI assistant created a new implementation from scratch. Score it between 0.0 and 1.0 based on quality and fulfillment of requirements.
- 1.0 = Perfect implementation that creates all necessary files with correct functionality.
- 0.0 = Completely fails to create working files or meet requirements.
Note: A git diff output is required. If no code changes are provided (i.e., no git diff output), the score must be 0.0.
## Evaluation Criteria
Please consider the following aspects in order of importance:
1. **File Creation (25%)**
- Did the assistant create all necessary files?
- Are the files appropriately named and organized?
- Did the assistant create a complete solution without missing components?
2. **Functional Correctness (40%)**
- Does the implementation fulfill all specified requirements?
- Does it handle edge cases properly?
- Is it free of logical errors and bugs?
- Do all components work together as expected?
3. **Code Quality (20%)**
- Is the code well-structured, readable and well-documented?
- Does it follow language-specific best practices?
- Is there proper error handling?
- Are naming conventions clear and consistent?
4. **Architecture Design (15%)**
- Is the code modular and extensible?
- Is there proper separation of concerns?
- Are appropriate design patterns used?
- Is the overall architecture appropriate for the requirements?
## Input
Requirements:
<!-- ```requirements go here``` -->
Reference Implementation:
<!-- ```reference code goes here``` -->
AI-Generated Implementation (git diff output):
<!-- ```git diff goes here``` -->
## Output Format
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
EXAMPLE ONE:
0.92
EXAMPLE TWO:
0.85
EXAMPLE THREE:
0.78
"#,
},
// Placeholders: reference code, git diff.
Template {
name: "CodeModification",
content: r#"
# Code Modification Evaluation Template
## Instructions
Evaluate how well the AI assistant modified existing code to meet requirements. Score between 0.0 and 1.0 based on quality and appropriateness of changes.
- 1.0 = Perfect modifications that correctly implement all requirements.
- 0.0 = Failed to make appropriate changes or introduced serious errors.
## Evaluation Criteria
Please consider the following aspects in order of importance:
1. **Functional Correctness (50%)**
- Do the modifications correctly implement the requirements?
- Did the assistant modify the right files and code sections?
- Are the changes free of bugs and logical errors?
- Do the modifications maintain compatibility with existing code?
2. **Modification Approach (25%)**
- Are the changes minimal and focused on what needs to be changed?
- Did the assistant avoid unnecessary modifications?
- Are the changes integrated seamlessly with the existing codebase?
- Did the assistant preserve the original code style and patterns?
3. **Code Quality (15%)**
- Are the modifications well-structured and documented?
- Do they follow the same conventions as the original code?
- Is there proper error handling in the modified code?
- Are the changes readable and maintainable?
4. **Solution Completeness (10%)**
- Do the modifications completely address all requirements?
- Are there any missing changes or overlooked requirements?
- Did the assistant consider all necessary edge cases?
## Input
Original:
<!-- ```reference code goes here``` -->
New (git diff output):
<!-- ```git diff goes here``` -->
## Output Format
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
EXAMPLE ONE:
0.92
EXAMPLE TWO:
0.85
EXAMPLE THREE:
0.78
"#,
},
// Placeholders: query, transcript, git diff.
Template {
name: "ConversationalGuidance",
content: r#"
# Conversational Guidance Evaluation Template
## Instructions
Evaluate the quality of the AI assistant's conversational guidance and score it between 0.0 and 1.0.
- 1.0 = Perfect guidance with ideal information gathering, clarification, and advice without writing code.
- 0.0 = Completely unhelpful, inappropriate guidance, or wrote code when it should not have.
## Evaluation Criteria
ABSOLUTE REQUIREMENT:
- The assistant should NOT generate complete code solutions in conversation mode.
- If the git diff shows the assistant wrote complete code, the score should be significantly reduced.
1. **Information Gathering Effectiveness (30%)**
- Did the assistant ask relevant and precise questions?
- Did it efficiently narrow down the problem scope?
- Did it avoid unnecessary or redundant questions?
- Was questioning appropriately paced and contextual?
2. **Conceptual Guidance (30%)**
- Did the assistant provide high-level approaches and strategies?
- Did it explain relevant concepts and algorithms?
- Did it offer planning advice without implementing the solution?
- Did it suggest a structured approach to solving the problem?
3. **Educational Value (20%)**
- Did the assistant help the user understand the problem better?
- Did it provide explanations that would help the user learn?
- Did it guide without simply giving away answers?
- Did it encourage the user to think through parts of the problem?
4. **Conversation Quality (20%)**
- Was the conversation logically structured and easy to follow?
- Did the assistant maintain appropriate context throughout?
- Was the interaction helpful without being condescending?
- Did the conversation reach a satisfactory conclusion with clear next steps?
## Input
Initial Query:
<!-- ```query goes here``` -->
Conversation Transcript:
<!-- ```transcript goes here``` -->
Git Diff:
<!-- ```git diff goes here``` -->
## Output Format
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
EXAMPLE ONE:
0.92
EXAMPLE TWO:
0.85
EXAMPLE THREE:
0.78
"#,
},
]
}