agent: Fix creating files with Gemini (#31439)
This change instructs models to wrap new file content in Markdown fences and introduces a parser for this format. The reasons are: 1. This is the format we put a lot of effort into explaining in the system prompt. 2. Gemini strongly prefers to respond in this format. 3. It gives the model an option to think before writing the content. The `eval_zode` pass rate for Gemini models goes from 0% to 100%. Other models were already at 100%; this hasn't changed. Release Notes: - N/A
This commit is contained in:
parent
bffde7c6b4
commit
6253b95f82
7 changed files with 356 additions and 45 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -683,6 +683,7 @@ dependencies = [
|
||||||
"language_models",
|
"language_models",
|
||||||
"log",
|
"log",
|
||||||
"markdown",
|
"markdown",
|
||||||
|
"once_cell",
|
||||||
"open",
|
"open",
|
||||||
"paths",
|
"paths",
|
||||||
"portable-pty",
|
"portable-pty",
|
||||||
|
|
|
@ -62,6 +62,7 @@ which.workspace = true
|
||||||
workspace-hack.workspace = true
|
workspace-hack.workspace = true
|
||||||
workspace.workspace = true
|
workspace.workspace = true
|
||||||
zed_llm_client.workspace = true
|
zed_llm_client.workspace = true
|
||||||
|
once_cell = "1.21.3"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
client = { workspace = true, features = ["test-support"] }
|
client = { workspace = true, features = ["test-support"] }
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
mod create_file_parser;
|
||||||
mod edit_parser;
|
mod edit_parser;
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod evals;
|
mod evals;
|
||||||
|
@ -6,6 +7,7 @@ use crate::{Template, Templates};
|
||||||
use aho_corasick::AhoCorasick;
|
use aho_corasick::AhoCorasick;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use assistant_tool::ActionLog;
|
use assistant_tool::ActionLog;
|
||||||
|
use create_file_parser::{CreateFileParser, CreateFileParserEvent};
|
||||||
use edit_parser::{EditParser, EditParserEvent, EditParserMetrics};
|
use edit_parser::{EditParser, EditParserEvent, EditParserMetrics};
|
||||||
use futures::{
|
use futures::{
|
||||||
Stream, StreamExt,
|
Stream, StreamExt,
|
||||||
|
@ -123,16 +125,16 @@ impl EditAgent {
|
||||||
mpsc::UnboundedReceiver<EditAgentOutputEvent>,
|
mpsc::UnboundedReceiver<EditAgentOutputEvent>,
|
||||||
) {
|
) {
|
||||||
let (output_events_tx, output_events_rx) = mpsc::unbounded();
|
let (output_events_tx, output_events_rx) = mpsc::unbounded();
|
||||||
|
let (parse_task, parse_rx) = Self::parse_create_file_chunks(edit_chunks, cx);
|
||||||
let this = self.clone();
|
let this = self.clone();
|
||||||
let task = cx.spawn(async move |cx| {
|
let task = cx.spawn(async move |cx| {
|
||||||
this.action_log
|
this.action_log
|
||||||
.update(cx, |log, cx| log.buffer_created(buffer.clone(), cx))?;
|
.update(cx, |log, cx| log.buffer_created(buffer.clone(), cx))?;
|
||||||
let output = this
|
this.overwrite_with_chunks_internal(buffer, parse_rx, output_events_tx, cx)
|
||||||
.overwrite_with_chunks_internal(buffer, edit_chunks, output_events_tx, cx)
|
.await?;
|
||||||
.await;
|
|
||||||
this.project
|
this.project
|
||||||
.update(cx, |project, cx| project.set_agent_location(None, cx))?;
|
.update(cx, |project, cx| project.set_agent_location(None, cx))?;
|
||||||
output
|
parse_task.await
|
||||||
});
|
});
|
||||||
(task, output_events_rx)
|
(task, output_events_rx)
|
||||||
}
|
}
|
||||||
|
@ -140,10 +142,10 @@ impl EditAgent {
|
||||||
async fn overwrite_with_chunks_internal(
|
async fn overwrite_with_chunks_internal(
|
||||||
&self,
|
&self,
|
||||||
buffer: Entity<Buffer>,
|
buffer: Entity<Buffer>,
|
||||||
edit_chunks: impl 'static + Send + Stream<Item = Result<String, LanguageModelCompletionError>>,
|
mut parse_rx: UnboundedReceiver<Result<CreateFileParserEvent>>,
|
||||||
output_events_tx: mpsc::UnboundedSender<EditAgentOutputEvent>,
|
output_events_tx: mpsc::UnboundedSender<EditAgentOutputEvent>,
|
||||||
cx: &mut AsyncApp,
|
cx: &mut AsyncApp,
|
||||||
) -> Result<EditAgentOutput> {
|
) -> Result<()> {
|
||||||
cx.update(|cx| {
|
cx.update(|cx| {
|
||||||
buffer.update(cx, |buffer, cx| buffer.set_text("", cx));
|
buffer.update(cx, |buffer, cx| buffer.set_text("", cx));
|
||||||
self.action_log.update(cx, |log, cx| {
|
self.action_log.update(cx, |log, cx| {
|
||||||
|
@ -163,34 +165,31 @@ impl EditAgent {
|
||||||
.ok();
|
.ok();
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let mut raw_edits = String::new();
|
while let Some(event) = parse_rx.next().await {
|
||||||
pin_mut!(edit_chunks);
|
match event? {
|
||||||
while let Some(chunk) = edit_chunks.next().await {
|
CreateFileParserEvent::NewTextChunk { chunk } => {
|
||||||
let chunk = chunk?;
|
cx.update(|cx| {
|
||||||
raw_edits.push_str(&chunk);
|
buffer.update(cx, |buffer, cx| buffer.append(chunk, cx));
|
||||||
cx.update(|cx| {
|
self.action_log
|
||||||
buffer.update(cx, |buffer, cx| buffer.append(chunk, cx));
|
.update(cx, |log, cx| log.buffer_edited(buffer.clone(), cx));
|
||||||
self.action_log
|
self.project.update(cx, |project, cx| {
|
||||||
.update(cx, |log, cx| log.buffer_edited(buffer.clone(), cx));
|
project.set_agent_location(
|
||||||
self.project.update(cx, |project, cx| {
|
Some(AgentLocation {
|
||||||
project.set_agent_location(
|
buffer: buffer.downgrade(),
|
||||||
Some(AgentLocation {
|
position: language::Anchor::MAX,
|
||||||
buffer: buffer.downgrade(),
|
}),
|
||||||
position: language::Anchor::MAX,
|
cx,
|
||||||
}),
|
)
|
||||||
cx,
|
});
|
||||||
)
|
})?;
|
||||||
});
|
output_events_tx
|
||||||
})?;
|
.unbounded_send(EditAgentOutputEvent::Edited)
|
||||||
output_events_tx
|
.ok();
|
||||||
.unbounded_send(EditAgentOutputEvent::Edited)
|
}
|
||||||
.ok();
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(EditAgentOutput {
|
Ok(())
|
||||||
raw_edits,
|
|
||||||
parser_metrics: EditParserMetrics::default(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn edit(
|
pub fn edit(
|
||||||
|
@ -435,6 +434,44 @@ impl EditAgent {
|
||||||
(output, rx)
|
(output, rx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_create_file_chunks(
|
||||||
|
chunks: impl 'static + Send + Stream<Item = Result<String, LanguageModelCompletionError>>,
|
||||||
|
cx: &mut AsyncApp,
|
||||||
|
) -> (
|
||||||
|
Task<Result<EditAgentOutput>>,
|
||||||
|
UnboundedReceiver<Result<CreateFileParserEvent>>,
|
||||||
|
) {
|
||||||
|
let (tx, rx) = mpsc::unbounded();
|
||||||
|
let output = cx.background_spawn(async move {
|
||||||
|
pin_mut!(chunks);
|
||||||
|
|
||||||
|
let mut parser = CreateFileParser::new();
|
||||||
|
let mut raw_edits = String::new();
|
||||||
|
while let Some(chunk) = chunks.next().await {
|
||||||
|
match chunk {
|
||||||
|
Ok(chunk) => {
|
||||||
|
raw_edits.push_str(&chunk);
|
||||||
|
for event in parser.push(Some(&chunk)) {
|
||||||
|
tx.unbounded_send(Ok(event))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
tx.unbounded_send(Err(error.into()))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Send final events with None to indicate completion
|
||||||
|
for event in parser.push(None) {
|
||||||
|
tx.unbounded_send(Ok(event))?;
|
||||||
|
}
|
||||||
|
Ok(EditAgentOutput {
|
||||||
|
raw_edits,
|
||||||
|
parser_metrics: EditParserMetrics::default(),
|
||||||
|
})
|
||||||
|
});
|
||||||
|
(output, rx)
|
||||||
|
}
|
||||||
|
|
||||||
fn reindent_new_text_chunks(
|
fn reindent_new_text_chunks(
|
||||||
delta: IndentDelta,
|
delta: IndentDelta,
|
||||||
mut stream: impl Unpin + Stream<Item = Result<EditParserEvent>>,
|
mut stream: impl Unpin + Stream<Item = Result<EditParserEvent>>,
|
||||||
|
@ -1138,7 +1175,7 @@ mod tests {
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
chunks_tx.unbounded_send("jkl\n").unwrap();
|
chunks_tx.unbounded_send("```\njkl\n").unwrap();
|
||||||
cx.run_until_parked();
|
cx.run_until_parked();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
drain_events(&mut events),
|
drain_events(&mut events),
|
||||||
|
@ -1146,7 +1183,7 @@ mod tests {
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
||||||
"jkl\n"
|
"jkl"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
project.read_with(cx, |project, _| project.agent_location()),
|
project.read_with(cx, |project, _| project.agent_location()),
|
||||||
|
@ -1164,7 +1201,7 @@ mod tests {
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
buffer.read_with(cx, |buffer, _| buffer.snapshot().text()),
|
||||||
"jkl\nmno\n"
|
"jkl\nmno"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
project.read_with(cx, |project, _| project.agent_location()),
|
project.read_with(cx, |project, _| project.agent_location()),
|
||||||
|
@ -1174,7 +1211,7 @@ mod tests {
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
chunks_tx.unbounded_send("pqr").unwrap();
|
chunks_tx.unbounded_send("pqr\n```").unwrap();
|
||||||
cx.run_until_parked();
|
cx.run_until_parked();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
drain_events(&mut events),
|
drain_events(&mut events),
|
||||||
|
|
218
crates/assistant_tools/src/edit_agent/create_file_parser.rs
Normal file
218
crates/assistant_tools/src/edit_agent/create_file_parser.rs
Normal file
|
@ -0,0 +1,218 @@
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use regex::Regex;
|
||||||
|
use smallvec::SmallVec;
|
||||||
|
use util::debug_panic;
|
||||||
|
|
||||||
|
const START_MARKER: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n?```\S*\n").unwrap());
|
||||||
|
const END_MARKER: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n```\s*$").unwrap());
|
||||||
|
|
||||||
|
/// Events produced by `CreateFileParser::push` while it consumes model output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CreateFileParserEvent {
    /// A piece of file content that the parser has released for appending.
    NewTextChunk { chunk: String },
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct CreateFileParser {
|
||||||
|
state: ParserState,
|
||||||
|
buffer: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, PartialEq)]
|
||||||
|
enum ParserState {
|
||||||
|
Pending,
|
||||||
|
WithinText,
|
||||||
|
Finishing,
|
||||||
|
Finished,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CreateFileParser {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
CreateFileParser {
|
||||||
|
state: ParserState::Pending,
|
||||||
|
buffer: String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn push(&mut self, chunk: Option<&str>) -> SmallVec<[CreateFileParserEvent; 1]> {
|
||||||
|
if chunk.is_none() {
|
||||||
|
self.state = ParserState::Finishing;
|
||||||
|
}
|
||||||
|
|
||||||
|
let chunk = chunk.unwrap_or_default();
|
||||||
|
|
||||||
|
self.buffer.push_str(chunk);
|
||||||
|
|
||||||
|
let mut edit_events = SmallVec::new();
|
||||||
|
loop {
|
||||||
|
match &mut self.state {
|
||||||
|
ParserState::Pending => {
|
||||||
|
if let Some(m) = START_MARKER.find(&self.buffer) {
|
||||||
|
self.buffer.drain(..m.end());
|
||||||
|
self.state = ParserState::WithinText;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ParserState::WithinText => {
|
||||||
|
let text = self.buffer.trim_end_matches(&['`', '\n', ' ']);
|
||||||
|
let text_len = text.len();
|
||||||
|
|
||||||
|
if text_len > 0 {
|
||||||
|
edit_events.push(CreateFileParserEvent::NewTextChunk {
|
||||||
|
chunk: self.buffer.drain(..text_len).collect(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
ParserState::Finishing => {
|
||||||
|
if let Some(m) = END_MARKER.find(&self.buffer) {
|
||||||
|
self.buffer.drain(m.start()..);
|
||||||
|
}
|
||||||
|
if !self.buffer.is_empty() {
|
||||||
|
if !self.buffer.ends_with('\n') {
|
||||||
|
self.buffer.push('\n');
|
||||||
|
}
|
||||||
|
edit_events.push(CreateFileParserEvent::NewTextChunk {
|
||||||
|
chunk: self.buffer.drain(..).collect(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
self.state = ParserState::Finished;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
ParserState::Finished => debug_panic!("Can't call parser after finishing"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
edit_events
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use indoc::indoc;
    use rand::prelude::*;
    use std::cmp;

    #[gpui::test(iterations = 100)]
    fn test_happy_path(mut rng: StdRng) {
        assert_parses_to("```\nHello world\n```", "Hello world", &mut rng);
    }

    #[gpui::test(iterations = 100)]
    fn test_cut_prefix(mut rng: StdRng) {
        let input = indoc! {"
            Let me write this file for you:

            ```
            Hello world
            ```

        "};
        assert_parses_to(input, "Hello world", &mut rng);
    }

    #[gpui::test(iterations = 100)]
    fn test_language_name_on_fences(mut rng: StdRng) {
        let input = indoc! {"
            ```rust
            Hello world
            ```

        "};
        assert_parses_to(input, "Hello world", &mut rng);
    }

    #[gpui::test(iterations = 100)]
    fn test_leave_suffix(mut rng: StdRng) {
        let input = indoc! {"
            Let me write this file for you:

            ```
            Hello world
            ```

            The end
        "};
        // This output is malformed, so we're doing our best effort
        assert_parses_to(input, "Hello world\n```\n\nThe end\n", &mut rng);
    }

    #[gpui::test(iterations = 100)]
    fn test_inner_fences(mut rng: StdRng) {
        let input = indoc! {"
            Let me write this file for you:

            ```
            ```
            Hello world
            ```
            ```
        "};
        // This output is malformed, so we're doing our best effort
        assert_parses_to(input, "```\nHello world\n```\n", &mut rng);
    }

    /// Parses `input` with a fresh parser, split at random chunk boundaries,
    /// and checks that the concatenated output equals `expected`.
    fn assert_parses_to(input: &str, expected: &str, rng: &mut StdRng) {
        let mut parser = CreateFileParser::new();
        assert_eq!(
            parse_random_chunks(input, &mut parser, rng),
            expected.to_string()
        );
    }

    /// Splits `input` at randomly chosen byte offsets, feeds the pieces to
    /// `parser` in order followed by the end-of-stream `None`, and returns
    /// all emitted text concatenated.
    fn parse_random_chunks(input: &str, parser: &mut CreateFileParser, rng: &mut StdRng) -> String {
        let chunk_count = rng.gen_range(1..=cmp::min(input.len(), 50));
        let mut boundaries = (0..input.len()).choose_multiple(rng, chunk_count);
        boundaries.sort();
        boundaries.push(input.len());

        let mut output = String::default();
        let mut feed = |piece: Option<&str>| {
            for CreateFileParserEvent::NewTextChunk { chunk } in parser.push(piece) {
                output.push_str(&chunk);
            }
        };

        let mut start = 0;
        for end in boundaries {
            feed(Some(&input[start..end]));
            start = end;
        }
        feed(None);

        output
    }
}
|
|
@ -163,6 +163,15 @@ fn eval_delete_run_git_blame() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_translate_doc_comments() {
|
fn eval_translate_doc_comments() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet |
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 1.0
|
||||||
|
// gemini-2.5-flash-preview-04-17 |
|
||||||
|
// gpt-4.1 |
|
||||||
let input_file_path = "root/canvas.rs";
|
let input_file_path = "root/canvas.rs";
|
||||||
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
|
let input_file_content = include_str!("evals/fixtures/translate_doc_comments/before.rs");
|
||||||
let edit_description = "Translate all doc comments to Italian";
|
let edit_description = "Translate all doc comments to Italian";
|
||||||
|
@ -216,6 +225,15 @@ fn eval_translate_doc_comments() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
|
fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet | 0.98
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 0.99
|
||||||
|
// gemini-2.5-flash-preview-04-17 |
|
||||||
|
// gpt-4.1 |
|
||||||
let input_file_path = "root/lib.rs";
|
let input_file_path = "root/lib.rs";
|
||||||
let input_file_content =
|
let input_file_content =
|
||||||
include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
|
include_str!("evals/fixtures/use_wasi_sdk_in_compile_parser_to_wasm/before.rs");
|
||||||
|
@ -332,6 +350,15 @@ fn eval_use_wasi_sdk_in_compile_parser_to_wasm() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_disable_cursor_blinking() {
|
fn eval_disable_cursor_blinking() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet |
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 1.0
|
||||||
|
// gemini-2.5-flash-preview-04-17 |
|
||||||
|
// gpt-4.1 |
|
||||||
let input_file_path = "root/editor.rs";
|
let input_file_path = "root/editor.rs";
|
||||||
let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
|
let input_file_content = include_str!("evals/fixtures/disable_cursor_blinking/before.rs");
|
||||||
let edit_description = "Comment out the call to `BlinkManager::enable`";
|
let edit_description = "Comment out the call to `BlinkManager::enable`";
|
||||||
|
@ -406,6 +433,15 @@ fn eval_disable_cursor_blinking() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_from_pixels_constructor() {
|
fn eval_from_pixels_constructor() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet |
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 0.94
|
||||||
|
// gemini-2.5-flash-preview-04-17 |
|
||||||
|
// gpt-4.1 |
|
||||||
let input_file_path = "root/canvas.rs";
|
let input_file_path = "root/canvas.rs";
|
||||||
let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
|
let input_file_content = include_str!("evals/fixtures/from_pixels_constructor/before.rs");
|
||||||
let edit_description = "Implement from_pixels constructor and add tests.";
|
let edit_description = "Implement from_pixels constructor and add tests.";
|
||||||
|
@ -597,11 +633,20 @@ fn eval_from_pixels_constructor() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_zode() {
|
fn eval_zode() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet | 1.0
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 1.0
|
||||||
|
// gemini-2.5-flash-preview-04-17 | 1.0
|
||||||
|
// gpt-4.1 | 1.0
|
||||||
let input_file_path = "root/zode.py";
|
let input_file_path = "root/zode.py";
|
||||||
let input_content = None;
|
let input_content = None;
|
||||||
let edit_description = "Create the main Zode CLI script";
|
let edit_description = "Create the main Zode CLI script";
|
||||||
eval(
|
eval(
|
||||||
200,
|
50,
|
||||||
1.,
|
1.,
|
||||||
EvalInput::from_conversation(
|
EvalInput::from_conversation(
|
||||||
vec![
|
vec![
|
||||||
|
@ -694,6 +739,15 @@ fn eval_zode() {
|
||||||
#[test]
|
#[test]
|
||||||
#[cfg_attr(not(feature = "eval"), ignore)]
|
#[cfg_attr(not(feature = "eval"), ignore)]
|
||||||
fn eval_add_overwrite_test() {
|
fn eval_add_overwrite_test() {
|
||||||
|
// Results for 2025-05-22
|
||||||
|
//
|
||||||
|
// Model | Pass rate
|
||||||
|
// ============================================
|
||||||
|
//
|
||||||
|
// claude-3.7-sonnet | 0.16
|
||||||
|
// gemini-2.5-pro-preview-03-25 | 0.35
|
||||||
|
// gemini-2.5-flash-preview-04-17 |
|
||||||
|
// gpt-4.1 |
|
||||||
let input_file_path = "root/action_log.rs";
|
let input_file_path = "root/action_log.rs";
|
||||||
let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
|
let input_file_content = include_str!("evals/fixtures/add_overwrite_test/before.rs");
|
||||||
let edit_description = "Add a new test for overwriting a file in action_log.rs";
|
let edit_description = "Add a new test for overwriting a file in action_log.rs";
|
||||||
|
@ -920,14 +974,11 @@ fn eval_create_empty_file() {
|
||||||
// thoughts into it. This issue is not specific to empty files, but
|
// thoughts into it. This issue is not specific to empty files, but
|
||||||
// it's easier to reproduce with them.
|
// it's easier to reproduce with them.
|
||||||
//
|
//
|
||||||
|
// Results for 2025-05-21:
|
||||||
//
|
//
|
||||||
// Model | Pass rate
|
// Model | Pass rate
|
||||||
// ============================================
|
// ============================================
|
||||||
//
|
//
|
||||||
// --------------------------------------------
|
|
||||||
// Prompt version: 2025-05-21
|
|
||||||
// --------------------------------------------
|
|
||||||
//
|
|
||||||
// claude-3.7-sonnet | 1.00
|
// claude-3.7-sonnet | 1.00
|
||||||
// gemini-2.5-pro-preview-03-25 | 1.00
|
// gemini-2.5-pro-preview-03-25 | 1.00
|
||||||
// gemini-2.5-flash-preview-04-17 | 1.00
|
// gemini-2.5-flash-preview-04-17 | 1.00
|
||||||
|
@ -1430,7 +1481,7 @@ impl EditAgentTest {
|
||||||
model.provider_id() == selected_model.provider
|
model.provider_id() == selected_model.provider
|
||||||
&& model.id() == selected_model.model
|
&& model.id() == selected_model.model
|
||||||
})
|
})
|
||||||
.unwrap();
|
.expect("Model not found");
|
||||||
let provider = models.provider(&model.provider_id()).unwrap();
|
let provider = models.provider(&model.provider_id()).unwrap();
|
||||||
(provider, model)
|
(provider, model)
|
||||||
})?;
|
})?;
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
You are an expert engineer and your task is to write a new file from scratch.
|
You are an expert engineer and your task is to write a new file from scratch.
|
||||||
|
|
||||||
You MUST respond directly with the file's content, without explanations, additional text or triple backticks.
|
You MUST respond with the file's content wrapped in triple backticks (```).
|
||||||
|
The backticks should be on their own line.
|
||||||
The text you output will be saved verbatim as the content of the file.
|
The text you output will be saved verbatim as the content of the file.
|
||||||
Tool calls have been disabled. You MUST start your response directly with the file's new content.
|
Tool calls have been disabled.
|
||||||
|
Start your response with ```.
|
||||||
|
|
||||||
<file_path>
|
<file_path>
|
||||||
{{path}}
|
{{path}}
|
||||||
|
|
|
@ -43,7 +43,8 @@ NEW TEXT 3 HERE
|
||||||
- Always close all tags properly
|
- Always close all tags properly
|
||||||
|
|
||||||
|
|
||||||
{{!-- This example is important for Gemini 2.5 --}}
|
{{!-- The following example adds almost 10% pass rate for Gemini 2.5.
|
||||||
|
Claude and gpt-4.1 don't really need it. --}}
|
||||||
<example>
|
<example>
|
||||||
<edits>
|
<edits>
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue