Introduce a new StreamingEditFileTool (#29733)
This pull request introduces a new tool for streaming edits. The short-term goal is for this tool to replace the existing `EditFileTool`, but we want to get it out the door as soon as possible so that we can start testing it. `StreamingEditFileTool` is mutually exclusive with `EditFileTool`. It will be enabled by default for anyone who has the `agent-stream-edits` feature flag, as well as for people who set `assistant.stream_edits` to `true` in their settings.

### Implementation

Streaming is achieved by requesting a new completion when the `edit_file` tool is called. We invoke the model by taking the existing conversation with the agent and appending a prompt specifically tailored for editing. In that prompt, we ask the model to produce a stream of `<old_text>`/`<new_text>` tags. As the model streams text in, we incrementally parse it and start editing as soon as we can.

### Evals

Note that, as part of this pull request, I also defined some new evals that I used to drive the behavior of the recursive LLM call. To run them, use this command:

```bash
cargo test --package=assistant_tools --features eval -- eval_extract_handle_command_output
```

Or comment out the `#[cfg_attr(not(feature = "eval"), ignore)]` attribute. I recommend running them one at a time, because right now we don't really have a way of orchestrating all of these evals. I think we should invest in that effort once the new agent panel goes live.

Release Notes:

- N/A

---------

Co-authored-by: Nathan Sobo <nathan@zed.dev>
Co-authored-by: Bennet Bo Fenner <bennetbo@gmx.de>
Co-authored-by: Oleksiy Syvokon <oleksiy.syvokon@gmail.com>
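As a reference for the settings path mentioned above, enabling the tool manually would look something like this in `settings.json` (a sketch assuming the standard nested layout implied by `assistant.stream_edits`):

```json
{
  "assistant": {
    "stream_edits": true
  }
}
```

And for readers curious about the incremental parsing, here is a minimal, hypothetical sketch of extracting streamed `<old_text>`/`<new_text>` pairs as soon as they complete. All names here are invented for illustration; the real parser in this commit is considerably more robust (it tracks partial and mismatched tags, as the `EditParserMetrics` accounting in the eval harness below suggests):

```rust
/// Hypothetical incremental parser for streamed `<old_text>`/`<new_text>` pairs.
struct TagStreamParser {
    buffer: String,
}

struct Extracted {
    text: String,
    end: usize, // byte offset just past the closing tag, relative to the haystack
}

fn extract(haystack: &str, tag: &str) -> Option<Extracted> {
    let open = format!("<{tag}>");
    let close = format!("</{tag}>");
    let start = haystack.find(&open)? + open.len();
    let end = start + haystack[start..].find(&close)?;
    Some(Extracted {
        text: haystack[start..end].to_string(),
        end: end + close.len(),
    })
}

impl TagStreamParser {
    fn new() -> Self {
        Self { buffer: String::new() }
    }

    /// Feed one streamed chunk; return every (old_text, new_text) pair that
    /// became complete, so editing can begin before the completion finishes.
    fn push(&mut self, chunk: &str) -> Vec<(String, String)> {
        self.buffer.push_str(chunk);
        let mut edits = Vec::new();
        while let Some(edit) = self.try_take_edit() {
            edits.push(edit);
        }
        edits
    }

    fn try_take_edit(&mut self) -> Option<(String, String)> {
        let old = extract(&self.buffer, "old_text")?;
        let new = extract(&self.buffer[old.end..], "new_text")?;
        let consumed = old.end + new.end;
        let pair = (old.text, new.text);
        self.buffer.drain(..consumed); // discard the consumed prefix
        Some(pair)
    }
}

fn main() {
    let mut parser = TagStreamParser::new();
    // Chunk boundaries can fall anywhere, even in the middle of a tag.
    for chunk in ["<old_text>foo</old", "_text><new_text>bar</new_text>"] {
        for (old, new) in parser.push(chunk) {
            println!("replace {old:?} with {new:?}");
        }
    }
}
```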
Commit f891dfb358 · parent e3a2d52472
32 changed files with 49077 additions and 20 deletions
crates/assistant_tools/src/edit_agent/evals.rs (new file, 889 lines)
@@ -0,0 +1,889 @@
use super::*;
use crate::{
    ReadFileToolInput, grep_tool::GrepToolInput,
    streaming_edit_file_tool::StreamingEditFileToolInput,
};
use Role::*;
use anyhow::{Context, anyhow};
use client::{Client, UserStore};
use collections::HashMap;
use fs::FakeFs;
use gpui::{AppContext, TestAppContext};
use indoc::indoc;
use language_model::{
    LanguageModelRegistry, LanguageModelToolResult, LanguageModelToolUse, LanguageModelToolUseId,
};
use project::Project;
use rand::prelude::*;
use reqwest_client::ReqwestClient;
use serde_json::json;
use std::{
    cmp::Reverse,
    fmt::{self, Display},
    io::Write as _,
    sync::mpsc,
};
use util::path;

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_extract_handle_command_output() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/extract_handle_command_output/before.rs");
    let output_file_content = include_str!("evals/fixtures/extract_handle_command_output/after.rs");
    let edit_description = "Extract `handle_command_output` method from `run_git_blame`.";
    eval(
        100,
        0.95,
        EvalInput {
            conversation: vec![
                message(
                    User,
                    [text(indoc! {"
                        Read the `{input_file_path}` file and extract a method in
                        the final stanza of `run_git_blame` to deal with command failures,
                        call it `handle_command_output` and take the std::process::Output as the only parameter.

                        Add it right next to `run_git_blame` and copy it verbatim from `run_git_blame`.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        StreamingEditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                        },
                    )],
                ),
            ],
            input_path: input_file_path.into(),
            input_content: input_file_content.into(),
            edit_description: edit_description.into(),
            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
        },
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_delete_run_git_blame() {
    let input_file_path = "root/blame.rs";
    let input_file_content = include_str!("evals/fixtures/delete_run_git_blame/before.rs");
    let output_file_content = include_str!("evals/fixtures/delete_run_git_blame/after.rs");
    let edit_description = "Delete the `run_git_blame` function.";
    eval(
        100,
        0.95,
        EvalInput {
            conversation: vec![
                message(
                    User,
                    [text(indoc! {"
                        Read the `{input_file_path}` file and delete `run_git_blame`. Just that
                        one function, not its usages.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "edit_file",
                        StreamingEditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                        },
                    )],
                ),
            ],
            input_path: input_file_path.into(),
            input_content: input_file_content.into(),
            edit_description: edit_description.into(),
            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
        },
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
    let input_file_path = "root/lib.rs";
    let input_file_content =
        include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
    let edit_description = "Update compile_parser_to_wasm to use wasi-sdk instead of emscripten";
    eval(
        100,
        0.95,
        EvalInput {
            conversation: vec![
                message(
                    User,
                    [text(indoc! {"
                        Read the `{input_file_path}` file and change `compile_parser_to_wasm` to use `wasi-sdk` instead of emscripten.
                        Use `ureq` to download the SDK for the current platform and architecture.
                        Extract the archive into a sibling of `lib` inside the `tree-sitter` directory in the cache_dir.
                        Compile the parser to wasm using the `bin/clang` executable (or `bin/clang.exe` on windows)
                        that's inside of the archive.
                        Don't re-download the SDK if that executable already exists.

                        Use these clang flags: -fPIC -shared -Os -Wl,--export=tree_sitter_{language_name}

                        Here are the available wasi-sdk assets:
                        - wasi-sdk-25.0-x86_64-macos.tar.gz
                        - wasi-sdk-25.0-arm64-macos.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-linux.tar.gz
                        - wasi-sdk-25.0-arm64-linux.tar.gz
                        - wasi-sdk-25.0-x86_64-windows.tar.gz
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(971),
                            end_line: Some(1050),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "read_file",
                        lines(input_file_content, 971..1050),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1050),
                            end_line: Some(1100),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_2",
                        "read_file",
                        lines(input_file_content, 1050..1100),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: Some(1100),
                            end_line: Some(1150),
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_3",
                        "read_file",
                        lines(input_file_content, 1100..1150),
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        StreamingEditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                        },
                    )],
                ),
            ],
            input_path: input_file_path.into(),
            input_content: input_file_content.into(),
            edit_description: edit_description.into(),
            assertion: EvalAssertion::JudgeDiff(indoc! {"
                - The compile_parser_to_wasm method has been changed to use wasi-sdk
                - ureq is used to download the SDK for current platform and architecture
            "}),
        },
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_disable_cursor_blinking() {
    let input_file_path = "root/editor.rs";
    let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
    let output_file_content = include_str!("evals/fixtures/disable_cursor_blinking/after.rs");
    let edit_description = "Comment out the call to `BlinkManager::enable`";
    eval(
        100,
        0.6, // TODO: make this eval better
        EvalInput {
            conversation: vec![
                message(User, [text("Let's research how cursor blinking works.")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "grep",
                        GrepToolInput {
                            regex: "blink".into(),
                            include_pattern: None,
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_1",
                        "grep",
                        [
                            lines(input_file_content, 100..400),
                            lines(input_file_content, 800..1300),
                            lines(input_file_content, 1600..2000),
                            lines(input_file_content, 5000..5500),
                            lines(input_file_content, 8000..9000),
                            lines(input_file_content, 18455..18470),
                            lines(input_file_content, 20000..20500),
                            lines(input_file_content, 21000..21300),
                        ]
                        .join("Match found:\n\n"),
                    )],
                ),
                message(
                    User,
                    [text(indoc! {"
                        Comment out the lines that interact with the BlinkManager.
                        Keep the outer `update` blocks, but comment out everything that's inside (including if statements).
                        Don't add additional comments.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "edit_file",
                        StreamingEditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                        },
                    )],
                ),
            ],
            input_path: input_file_path.into(),
            input_content: input_file_content.into(),
            edit_description: edit_description.into(),
            assertion: EvalAssertion::AssertEqual(output_file_content.into()),
        },
    );
}

#[test]
#[cfg_attr(not(feature = "eval"), ignore)]
fn eval_from_pixels_constructor() {
    let input_file_path = "root/canvas.rs";
    let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
    let edit_description = "Implement from_pixels constructor and add tests.";
    eval(
        100,
        0.95,
        EvalInput {
            conversation: vec![
                message(
                    User,
                    [text(indoc! {"
                        Introduce a new `from_pixels` constructor in Canvas and
                        also add tests for it in the same file.
                    "})],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_1",
                        "read_file",
                        ReadFileToolInput {
                            path: input_file_path.into(),
                            start_line: None,
                            end_line: None,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result("tool_1", "read_file", input_file_content)],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_2",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/canvas.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_2", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_3",
                        "grep",
                        GrepToolInput {
                            regex: "mod\\s+tests".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(User, [tool_result("tool_3", "grep", "No matches found")]),
                message(
                    Assistant,
                    [tool_use(
                        "tool_4",
                        "grep",
                        GrepToolInput {
                            regex: "#\\[test\\]".into(),
                            include_pattern: Some("font-kit/src/**/*.rs".into()),
                            offset: 0,
                            case_sensitive: false,
                        },
                    )],
                ),
                message(
                    User,
                    [tool_result(
                        "tool_4",
                        "grep",
                        indoc! {"
                            Found 6 matches:

                            ## Matches in font-kit/src/loaders/core_text.rs

                            ### mod test › L926-936
                            ```
                            mod test {
                                use super::Font;
                                use crate::properties::{Stretch, Weight};

                                #[cfg(feature = \"source\")]
                                use crate::source::SystemSource;

                                static TEST_FONT_POSTSCRIPT_NAME: &'static str = \"ArialMT\";

                                #[cfg(feature = \"source\")]
                                #[test]
                            ```

                            55 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L947-951
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_weight() {
                                    // Exact matches
                            ```

                            ### mod test › L959-963
                            ```
                                }

                                #[test]
                                fn test_core_text_to_css_font_stretch() {
                                    // Exact matches
                            ```

                            ## Matches in font-kit/src/loaders/freetype.rs

                            ### mod test › L1238-1248
                            ```
                            mod test {
                                use crate::loaders::freetype::Font;

                                static PCF_FONT_PATH: &str = \"resources/tests/times-roman-pcf/timR12.pcf\";
                                static PCF_FONT_POSTSCRIPT_NAME: &str = \"Times-Roman\";

                                #[test]
                                fn get_pcf_postscript_name() {
                                    let font = Font::from_path(PCF_FONT_PATH, 0).unwrap();
                                    assert_eq!(font.postscript_name().unwrap(), PCF_FONT_POSTSCRIPT_NAME);
                                }
                            ```

                            1 lines remaining in ancestor node. Read the file to see all.

                            ## Matches in font-kit/src/sources/core_text.rs

                            ### mod test › L265-275
                            ```
                            mod test {
                                use crate::properties::{Stretch, Weight};

                                #[test]
                                fn test_css_to_core_text_font_weight() {
                                    // Exact matches
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(100.0)), -0.7);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(400.0)), 0.0);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(700.0)), 0.4);
                                    assert_eq!(super::css_to_core_text_font_weight(Weight(900.0)), 0.8);

                            ```

                            27 lines remaining in ancestor node. Read the file to see all.

                            ### mod test › L278-282
                            ```
                                }

                                #[test]
                                fn test_css_to_core_text_font_stretch() {
                                    // Exact matches
                            ```
                        "},
                    )],
                ),
                message(
                    Assistant,
                    [tool_use(
                        "tool_5",
                        "edit_file",
                        StreamingEditFileToolInput {
                            display_description: edit_description.into(),
                            path: input_file_path.into(),
                        },
                    )],
                ),
            ],
            input_path: input_file_path.into(),
            input_content: input_file_content.into(),
            edit_description: edit_description.into(),
            assertion: EvalAssertion::JudgeDiff(indoc! {"
                - The diff contains a new `from_pixels` constructor
                - The diff contains new tests for the `from_pixels` constructor
            "}),
        },
    );
}

fn message(
    role: Role,
    contents: impl IntoIterator<Item = MessageContent>,
) -> LanguageModelRequestMessage {
    LanguageModelRequestMessage {
        role,
        content: contents.into_iter().collect(),
        cache: false,
    }
}

fn text(text: impl Into<String>) -> MessageContent {
    MessageContent::Text(text.into())
}

fn lines(input: &str, range: Range<usize>) -> String {
    input
        .lines()
        .skip(range.start)
        .take(range.len())
        .collect::<Vec<_>>()
        .join("\n")
}

fn tool_use(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    input: impl Serialize,
) -> MessageContent {
    MessageContent::ToolUse(LanguageModelToolUse {
        id: LanguageModelToolUseId::from(id.into()),
        name: name.into(),
        raw_input: serde_json::to_string_pretty(&input).unwrap(),
        input: serde_json::to_value(input).unwrap(),
        is_input_complete: true,
    })
}

fn tool_result(
    id: impl Into<Arc<str>>,
    name: impl Into<Arc<str>>,
    result: impl Into<Arc<str>>,
) -> MessageContent {
    MessageContent::ToolResult(LanguageModelToolResult {
        tool_use_id: LanguageModelToolUseId::from(id.into()),
        tool_name: name.into(),
        is_error: false,
        content: result.into(),
    })
}

#[derive(Clone)]
struct EvalInput {
    conversation: Vec<LanguageModelRequestMessage>,
    input_path: PathBuf,
    input_content: String,
    edit_description: String,
    assertion: EvalAssertion,
}

fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
    let mut evaluated_count = 0;
    report_progress(evaluated_count, iterations);

    let (tx, rx) = mpsc::channel();

    // Cache the last message in the conversation, and run one instance of the eval so that
    // all the next ones are cached.
    eval.conversation.last_mut().unwrap().cache = true;
    run_eval(eval.clone(), tx.clone());

    let executor = gpui::background_executor();
    for _ in 1..iterations {
        let eval = eval.clone();
        let tx = tx.clone();
        executor.spawn(async move { run_eval(eval, tx) }).detach();
    }
    drop(tx);

    let mut failed_count = 0;
    let mut failed_evals = HashMap::default();
    let mut errored_evals = HashMap::default();
    let mut eval_outputs = Vec::new();
    let mut cumulative_parser_metrics = EditParserMetrics::default();
    while let Ok(output) = rx.recv() {
        match output {
            Ok(output) => {
                cumulative_parser_metrics += output.edit_output._parser_metrics.clone();
                eval_outputs.push(output.clone());
                if output.assertion.score < 80 {
                    failed_count += 1;
                    failed_evals
                        .entry(output.buffer_text.clone())
                        .or_insert(Vec::new())
                        .push(output);
                }
            }
            Err(error) => {
                failed_count += 1;
                *errored_evals.entry(format!("{:?}", error)).or_insert(0) += 1;
            }
        }

        evaluated_count += 1;
        report_progress(evaluated_count, iterations);
    }

    let actual_pass_ratio = (iterations - failed_count) as f32 / iterations as f32;
    println!("Actual pass ratio: {}\n", actual_pass_ratio);
    if actual_pass_ratio < expected_pass_ratio {
        let mut errored_evals = errored_evals.into_iter().collect::<Vec<_>>();
        errored_evals.sort_by_key(|(_, count)| Reverse(*count));
        for (error, count) in errored_evals {
            println!("Eval errored {} times. Error: {}", count, error);
        }

        let mut failed_evals = failed_evals.into_iter().collect::<Vec<_>>();
        failed_evals.sort_by_key(|(_, evals)| Reverse(evals.len()));
        for (_buffer_output, failed_evals) in failed_evals {
            let eval_output = failed_evals.first().unwrap();
            println!("Eval failed {} times", failed_evals.len());
            println!("{}", eval_output);
        }

        panic!(
            "Actual pass ratio: {}\nExpected pass ratio: {}",
            actual_pass_ratio, expected_pass_ratio
        );
    }

    let mismatched_tag_ratio =
        cumulative_parser_metrics.mismatched_tags as f32 / cumulative_parser_metrics.tags as f32;
    if mismatched_tag_ratio > 0.02 {
        for eval_output in eval_outputs {
            println!("{}", eval_output);
        }
        panic!("Too many mismatched tags: {:?}", cumulative_parser_metrics);
    }
}

fn run_eval(eval: EvalInput, tx: mpsc::Sender<Result<EvalOutput>>) {
    let dispatcher = gpui::TestDispatcher::new(StdRng::from_entropy());
    let mut cx = TestAppContext::build(dispatcher, None);
    let output = cx.executor().block_test(async {
        let test = EditAgentTest::new(&mut cx).await;
        test.eval(eval, &mut cx).await
    });
    tx.send(output).unwrap();
}

#[derive(Clone)]
struct EvalOutput {
    assertion: EvalAssertionResult,
    buffer_text: String,
    edit_output: EditAgentOutput,
    diff: String,
}

impl Display for EvalOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Score: {:?}", self.assertion.score)?;
        if let Some(message) = self.assertion.message.as_ref() {
            writeln!(f, "Message: {}", message)?;
        }

        writeln!(f, "Diff:\n{}", self.diff)?;

        writeln!(
            f,
            "Parser Metrics:\n{:#?}",
            self.edit_output._parser_metrics
        )?;
        writeln!(f, "Raw Edits:\n{}", self.edit_output._raw_edits)?;
        Ok(())
    }
}

fn report_progress(evaluated_count: usize, iterations: usize) {
    print!("\r\x1b[KEvaluated {}/{}", evaluated_count, iterations);
    std::io::stdout().flush().unwrap();
}

struct EditAgentTest {
    agent: EditAgent,
    project: Entity<Project>,
    judge_model: Arc<dyn LanguageModel>,
}

impl EditAgentTest {
    async fn new(cx: &mut TestAppContext) -> Self {
        cx.executor().allow_parking();
        cx.update(settings::init);
        cx.update(Project::init_settings);
        cx.update(language::init);
        cx.update(gpui_tokio::init);
        cx.update(client::init_settings);

        let fs = FakeFs::new(cx.executor().clone());
        fs.insert_tree("/root", json!({})).await;
        let project = Project::test(fs.clone(), [path!("/root").as_ref()], cx).await;
        let (agent_model, judge_model) = cx
            .update(|cx| {
                let http_client = ReqwestClient::user_agent("agent tests").unwrap();
                cx.set_http_client(Arc::new(http_client));

                let client = Client::production(cx);
                let user_store = cx.new(|cx| UserStore::new(client.clone(), cx));
                language_model::init(client.clone(), cx);
                language_models::init(user_store.clone(), client.clone(), fs.clone(), cx);

                cx.spawn(async move |cx| {
                    let agent_model =
                        Self::load_model("anthropic", "claude-3-7-sonnet-latest", cx).await;
                    let judge_model =
                        Self::load_model("anthropic", "claude-3-7-sonnet-latest", cx).await;
                    (agent_model.unwrap(), judge_model.unwrap())
                })
            })
            .await;
        let action_log = cx.new(|_| ActionLog::new(project.clone()));

        Self {
            agent: EditAgent::new(agent_model, action_log, Templates::new()),
            project,
            judge_model,
        }
    }

    async fn load_model(
        provider: &str,
        id: &str,
        cx: &mut AsyncApp,
    ) -> Result<Arc<dyn LanguageModel>> {
        let (provider, model) = cx.update(|cx| {
            let models = LanguageModelRegistry::read_global(cx);
            let model = models
                .available_models(cx)
                .find(|model| model.provider_id().0 == provider && model.id().0 == id)
                .unwrap();
            let provider = models.provider(&model.provider_id()).unwrap();
            (provider, model)
        })?;
        cx.update(|cx| provider.authenticate(cx))?.await?;
        Ok(model)
    }

    async fn eval(&self, eval: EvalInput, cx: &mut TestAppContext) -> Result<EvalOutput> {
        let path = self
            .project
            .read_with(cx, |project, cx| {
                project.find_project_path(eval.input_path, cx)
            })
            .unwrap();
        let buffer = self
            .project
            .update(cx, |project, cx| project.open_buffer(path, cx))
            .await
            .unwrap();
        buffer.update(cx, |buffer, cx| {
            buffer.set_text(eval.input_content.clone(), cx)
        });
        let (edit_output, _events) = self.agent.edit(
            buffer.clone(),
            eval.edit_description,
            eval.conversation,
            &mut cx.to_async(),
        );
        let edit_output = edit_output.await?;
        let buffer_text = buffer.read_with(cx, |buffer, _| buffer.text());
        let actual_diff = language::unified_diff(&eval.input_content, &buffer_text);
        let assertion = match eval.assertion {
            EvalAssertion::AssertEqual(expected_output) => EvalAssertionResult {
                score: if strip_empty_lines(&buffer_text) == strip_empty_lines(&expected_output) {
                    100
                } else {
                    0
                },
                message: None,
            },
            EvalAssertion::JudgeDiff(assertions) => self
                .judge_diff(&actual_diff, assertions, &cx.to_async())
                .await
                .context("failed comparing diffs")?,
        };

        Ok(EvalOutput {
            assertion,
            diff: actual_diff,
            buffer_text,
            edit_output,
        })
    }

    async fn judge_diff(
        &self,
        diff: &str,
        assertions: &'static str,
        cx: &AsyncApp,
    ) -> Result<EvalAssertionResult> {
        let prompt = DiffJudgeTemplate {
            diff: diff.to_string(),
            assertions,
        }
        .render(&self.agent.templates)
        .unwrap();

        let request = LanguageModelRequest {
            messages: vec![LanguageModelRequestMessage {
                role: Role::User,
                content: vec![prompt.into()],
                cache: false,
            }],
            ..Default::default()
        };
        let mut response = self.judge_model.stream_completion_text(request, cx).await?;
        let mut output = String::new();
        while let Some(chunk) = response.stream.next().await {
            let chunk = chunk?;
            output.push_str(&chunk);
        }

        // Parse the score from the response
        let re = regex::Regex::new(r"<score>(\d+)</score>").unwrap();
        if let Some(captures) = re.captures(&output) {
            if let Some(score_match) = captures.get(1) {
                let score = score_match.as_str().parse().unwrap_or(0);
                return Ok(EvalAssertionResult {
                    score,
                    message: Some(output),
                });
            }
        }

        Err(anyhow!(
            "No score found in response. Raw output: {}",
            output
        ))
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
enum EvalAssertion {
    AssertEqual(String),
    JudgeDiff(&'static str),
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
struct EvalAssertionResult {
    score: usize,
    message: Option<String>,
}

#[derive(Serialize)]
pub struct DiffJudgeTemplate {
    diff: String,
    assertions: &'static str,
}

impl Template for DiffJudgeTemplate {
    const TEMPLATE_NAME: &'static str = "diff_judge.hbs";
}

fn strip_empty_lines(text: &str) -> String {
    text.lines()
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}