Add initial implementation of evaluating changes generated by the assistant (#26799)

Release Notes:

- N/A

---------

Co-authored-by: Richard Feldman <oss@rtfeldman.com>
Co-authored-by: Thomas <thomas@zed.dev>
This commit is contained in:
Michael Sloan 2025-03-14 17:10:25 -06:00 committed by GitHub
parent e9b4fa1465
commit 7a888de9f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1113 additions and 24 deletions

View file

@ -0,0 +1,121 @@
use crate::eval::EvalOutput;
use crate::headless_assistant::send_language_model_request;
use anyhow::anyhow;
use gpui::{App, Task};
use language_model::{
LanguageModel, LanguageModelRequest, LanguageModelRequestMessage, MessageContent, Role,
};
use std::{path::Path, sync::Arc};
pub struct Judge {
pub original_diff: Option<String>,
#[allow(dead_code)]
pub original_message: Option<String>,
pub model: Arc<dyn LanguageModel>,
}
impl Judge {
pub async fn load(eval_path: &Path, model: Arc<dyn LanguageModel>) -> anyhow::Result<Judge> {
let original_diff_path = eval_path.join("original.diff");
let original_diff = smol::unblock(move || {
if std::fs::exists(&original_diff_path)? {
anyhow::Ok(Some(std::fs::read_to_string(&original_diff_path)?))
} else {
anyhow::Ok(None)
}
});
let original_message_path = eval_path.join("original_message.txt");
let original_message = smol::unblock(move || {
if std::fs::exists(&original_message_path)? {
anyhow::Ok(Some(std::fs::read_to_string(&original_message_path)?))
} else {
anyhow::Ok(None)
}
});
Ok(Self {
original_diff: original_diff.await?,
original_message: original_message.await?,
model,
})
}
pub fn run(&self, eval_output: &EvalOutput, cx: &mut App) -> Task<anyhow::Result<String>> {
let Some(original_diff) = self.original_diff.as_ref() else {
return Task::ready(Err(anyhow!("No original.diff found")));
};
// TODO: check for empty diff?
let prompt = diff_comparison_prompt(&original_diff, &eval_output.diff);
let request = LanguageModelRequest {
messages: vec![LanguageModelRequestMessage {
role: Role::User,
content: vec![MessageContent::Text(prompt)],
cache: false,
}],
temperature: Some(0.0),
tools: Vec::new(),
stop: Vec::new(),
};
let model = self.model.clone();
cx.spawn(move |cx| send_language_model_request(model, request, cx))
}
}
pub fn diff_comparison_prompt(original_diff: &str, new_diff: &str) -> String {
format!(
r#"# Git Diff Similarity Evaluation Template
## Instructions
Compare the two diffs and score them between 0.0 and 1.0 based on their functional similarity.
- 1.0 = Perfect functional match (achieves identical results)
- 0.0 = No functional similarity whatsoever
## Evaluation Criteria
Please consider the following aspects in order of importance:
1. **Functional Equivalence (60%)**
- Do both diffs achieve the same end result?
- Are the changes functionally equivalent despite possibly using different approaches?
- Do the modifications address the same issues or implement the same features?
2. **Logical Structure (20%)**
- Are the logical flows similar?
- Do the modifications affect the same code paths?
- Are control structures (if/else, loops, etc.) modified in similar ways?
3. **Code Content (15%)**
- Are similar lines added/removed?
- Are the same variables, functions, or methods being modified?
- Are the same APIs or libraries being used?
4. **File Layout (5%)**
- Are the same files being modified?
- Are changes occurring in similar locations within files?
## Input
Original Diff:
```git
{}
```
New Diff:
```git
{}
```
## Output Format
THE ONLY OUTPUT SHOULD BE A SCORE BETWEEN 0.0 AND 1.0.
Example output:
0.85"#,
original_diff, new_diff
)
}