Code block evals (#29619)
Add a targeted eval for code block formatting, and revise the system prompt accordingly. ### Eval before, n=8 <img width="728" alt="eval before" src="https://github.com/user-attachments/assets/552b6146-3d26-4eaa-86f9-9fc36c0cadf2" /> ### Eval after prompt change, n=8 (excluding the new evals, so just testing the prompt change) <img width="717" alt="eval after" src="https://github.com/user-attachments/assets/c78c7a54-4c65-470c-b135-8691584cd73e" /> Release Notes: - N/A
This commit is contained in:
parent
2508e491d5
commit
d7004030b3
10 changed files with 536 additions and 134 deletions
|
@ -44,6 +44,7 @@ language_extension.workspace = true
|
|||
language_model.workspace = true
|
||||
language_models.workspace = true
|
||||
languages = { workspace = true, features = ["load-grammars"] }
|
||||
markdown.workspace = true
|
||||
node_runtime.workspace = true
|
||||
pathdiff.workspace = true
|
||||
paths.workspace = true
|
||||
|
|
|
@ -10,13 +10,13 @@ use crate::{
|
|||
ToolMetrics,
|
||||
assertions::{AssertionsReport, RanAssertion, RanAssertionResult},
|
||||
};
|
||||
use agent::{ContextLoadResult, ThreadEvent};
|
||||
use agent::{ContextLoadResult, Thread, ThreadEvent};
|
||||
use anyhow::{Result, anyhow};
|
||||
use async_trait::async_trait;
|
||||
use buffer_diff::DiffHunkStatus;
|
||||
use collections::HashMap;
|
||||
use futures::{FutureExt as _, StreamExt, channel::mpsc, select_biased};
|
||||
use gpui::{AppContext, AsyncApp, Entity};
|
||||
use gpui::{App, AppContext, AsyncApp, Entity};
|
||||
use language_model::{LanguageModel, Role, StopReason};
|
||||
|
||||
pub const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
|
||||
|
@ -314,7 +314,7 @@ impl ExampleContext {
|
|||
for message in thread.messages().skip(message_count_before) {
|
||||
messages.push(Message {
|
||||
_role: message.role,
|
||||
_text: message.to_string(),
|
||||
text: message.to_string(),
|
||||
tool_use: thread
|
||||
.tool_uses_for_message(message.id, cx)
|
||||
.into_iter()
|
||||
|
@ -362,6 +362,90 @@ impl ExampleContext {
|
|||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn agent_thread(&self) -> Entity<Thread> {
|
||||
self.agent_thread.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl AppContext for ExampleContext {
|
||||
type Result<T> = anyhow::Result<T>;
|
||||
|
||||
fn new<T: 'static>(
|
||||
&mut self,
|
||||
build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
|
||||
) -> Self::Result<Entity<T>> {
|
||||
self.app.new(build_entity)
|
||||
}
|
||||
|
||||
fn reserve_entity<T: 'static>(&mut self) -> Self::Result<gpui::Reservation<T>> {
|
||||
self.app.reserve_entity()
|
||||
}
|
||||
|
||||
fn insert_entity<T: 'static>(
|
||||
&mut self,
|
||||
reservation: gpui::Reservation<T>,
|
||||
build_entity: impl FnOnce(&mut gpui::Context<T>) -> T,
|
||||
) -> Self::Result<Entity<T>> {
|
||||
self.app.insert_entity(reservation, build_entity)
|
||||
}
|
||||
|
||||
fn update_entity<T, R>(
|
||||
&mut self,
|
||||
handle: &Entity<T>,
|
||||
update: impl FnOnce(&mut T, &mut gpui::Context<T>) -> R,
|
||||
) -> Self::Result<R>
|
||||
where
|
||||
T: 'static,
|
||||
{
|
||||
self.app.update_entity(handle, update)
|
||||
}
|
||||
|
||||
fn read_entity<T, R>(
|
||||
&self,
|
||||
handle: &Entity<T>,
|
||||
read: impl FnOnce(&T, &App) -> R,
|
||||
) -> Self::Result<R>
|
||||
where
|
||||
T: 'static,
|
||||
{
|
||||
self.app.read_entity(handle, read)
|
||||
}
|
||||
|
||||
fn update_window<T, F>(&mut self, window: gpui::AnyWindowHandle, f: F) -> Result<T>
|
||||
where
|
||||
F: FnOnce(gpui::AnyView, &mut gpui::Window, &mut App) -> T,
|
||||
{
|
||||
self.app.update_window(window, f)
|
||||
}
|
||||
|
||||
fn read_window<T, R>(
|
||||
&self,
|
||||
window: &gpui::WindowHandle<T>,
|
||||
read: impl FnOnce(Entity<T>, &App) -> R,
|
||||
) -> Result<R>
|
||||
where
|
||||
T: 'static,
|
||||
{
|
||||
self.app.read_window(window, read)
|
||||
}
|
||||
|
||||
fn background_spawn<R>(
|
||||
&self,
|
||||
future: impl std::future::Future<Output = R> + Send + 'static,
|
||||
) -> gpui::Task<R>
|
||||
where
|
||||
R: Send + 'static,
|
||||
{
|
||||
self.app.background_spawn(future)
|
||||
}
|
||||
|
||||
fn read_global<G, R>(&self, callback: impl FnOnce(&G, &App) -> R) -> Self::Result<R>
|
||||
where
|
||||
G: gpui::Global,
|
||||
{
|
||||
self.app.read_global(callback)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
@ -391,12 +475,16 @@ impl Response {
|
|||
/// Iterates over every tool use recorded on the response's messages, in
/// message order.
pub fn tool_uses(&self) -> impl Iterator<Item = &ToolUse> {
    self.messages
        .iter()
        .flat_map(|message| message.tool_use.iter())
}
|
||||
|
||||
pub fn texts(&self) -> impl Iterator<Item = String> {
|
||||
self.messages.iter().map(|message| message.text.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Message {
|
||||
_role: Role,
|
||||
_text: String,
|
||||
text: String,
|
||||
tool_use: Vec<ToolUse>,
|
||||
}
|
||||
|
||||
|
|
191
crates/eval/src/examples/code_block_citations.rs
Normal file
191
crates/eval/src/examples/code_block_citations.rs
Normal file
|
@ -0,0 +1,191 @@
|
|||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use markdown::PathWithRange;
|
||||
|
||||
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
|
||||
|
||||
/// Eval example that asks the agent for code snippets and then checks how the
/// fenced code blocks in its replies are cited and formatted.
pub struct CodeBlockCitations;
|
||||
|
||||
/// The three-backtick delimiter that opens and closes a markdown code block.
const FENCE: &str = "```";
|
||||
|
||||
#[async_trait(?Send)]
|
||||
impl Example for CodeBlockCitations {
|
||||
fn meta(&self) -> ExampleMetadata {
|
||||
ExampleMetadata {
|
||||
name: "code_block_citations".to_string(),
|
||||
url: "https://github.com/zed-industries/zed.git".to_string(),
|
||||
revision: "f69aeb6311dde3c0b8979c293d019d66498d54f2".to_string(),
|
||||
language_server: Some(LanguageServer {
|
||||
file_extension: "rs".to_string(),
|
||||
allow_preexisting_diagnostics: false,
|
||||
}),
|
||||
max_assertions: None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
|
||||
const FILENAME: &str = "assistant_tool.rs";
|
||||
cx.push_user_message(format!(
|
||||
r#"
|
||||
Show me the method bodies of all the methods of the `Tool` trait in {FILENAME}.
|
||||
|
||||
Please show each method in a separate code snippet.
|
||||
"#
|
||||
));
|
||||
|
||||
// Verify that the messages all have the correct formatting.
|
||||
let texts: Vec<String> = cx.run_to_end().await?.texts().collect();
|
||||
let closing_fence = format!("\n{FENCE}");
|
||||
|
||||
for text in texts.iter() {
|
||||
let mut text = text.as_str();
|
||||
|
||||
while let Some(index) = text.find(FENCE) {
|
||||
// Advance text past the opening backticks.
|
||||
text = &text[index + FENCE.len()..];
|
||||
|
||||
// Find the closing backticks.
|
||||
let content_len = text.find(&closing_fence);
|
||||
|
||||
// Verify the citation format - e.g. ```path/to/foo.txt#L123-456
|
||||
if let Some(citation_len) = text.find('\n') {
|
||||
let citation = &text[..citation_len];
|
||||
|
||||
if let Ok(()) =
|
||||
cx.assert(citation.contains("/"), format!("Slash in {citation:?}",))
|
||||
{
|
||||
let path_range = PathWithRange::new(citation);
|
||||
let path = cx
|
||||
.agent_thread()
|
||||
.update(cx, |thread, cx| {
|
||||
thread
|
||||
.project()
|
||||
.read(cx)
|
||||
.find_project_path(path_range.path, cx)
|
||||
})
|
||||
.ok()
|
||||
.flatten();
|
||||
|
||||
if let Ok(path) = cx.assert_some(path, format!("Valid path: {citation:?}"))
|
||||
{
|
||||
let buffer_text = {
|
||||
let buffer = match cx.agent_thread().update(cx, |thread, cx| {
|
||||
thread
|
||||
.project()
|
||||
.update(cx, |project, cx| project.open_buffer(path, cx))
|
||||
}) {
|
||||
Ok(buffer_task) => buffer_task.await.ok(),
|
||||
Err(err) => {
|
||||
cx.assert(
|
||||
false,
|
||||
format!("Expected Ok(buffer), not {err:?}"),
|
||||
)
|
||||
.ok();
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
let Ok(buffer_text) = cx.assert_some(
|
||||
buffer.and_then(|buffer| {
|
||||
buffer.read_with(cx, |buffer, _| buffer.text()).ok()
|
||||
}),
|
||||
"Reading buffer text succeeded",
|
||||
) else {
|
||||
continue;
|
||||
};
|
||||
buffer_text
|
||||
};
|
||||
|
||||
if let Some(content_len) = content_len {
|
||||
// + 1 because there's a newline character after the citation.
|
||||
let content =
|
||||
&text[(citation.len() + 1)..content_len - (citation.len() + 1)];
|
||||
|
||||
cx.assert(
|
||||
buffer_text.contains(&content),
|
||||
"Code block content was found in file",
|
||||
)
|
||||
.ok();
|
||||
|
||||
if let Some(range) = path_range.range {
|
||||
let start_line_index = range.start.line.saturating_sub(1);
|
||||
let line_count =
|
||||
range.end.line.saturating_sub(start_line_index);
|
||||
let mut snippet = buffer_text
|
||||
.lines()
|
||||
.skip(start_line_index as usize)
|
||||
.take(line_count as usize)
|
||||
.collect::<Vec<&str>>()
|
||||
.join("\n");
|
||||
|
||||
if let Some(start_col) = range.start.col {
|
||||
snippet = snippet[start_col as usize..].to_string();
|
||||
}
|
||||
|
||||
if let Some(end_col) = range.end.col {
|
||||
let last_line = snippet.lines().last().unwrap();
|
||||
snippet = snippet
|
||||
[..snippet.len() - last_line.len() + end_col as usize]
|
||||
.to_string();
|
||||
}
|
||||
|
||||
cx.assert_eq(
|
||||
snippet.as_str(),
|
||||
content,
|
||||
"Code block snippet was at specified line/col",
|
||||
)
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cx.assert(
|
||||
false,
|
||||
format!("Opening {FENCE} did not have a newline anywhere after it."),
|
||||
)
|
||||
.ok();
|
||||
}
|
||||
|
||||
if let Some(content_len) = content_len {
|
||||
// Advance past the closing backticks
|
||||
text = &text[content_len + FENCE.len()..];
|
||||
} else {
|
||||
// There were no closing backticks associated with these opening backticks.
|
||||
cx.assert(
|
||||
false,
|
||||
"Code block opening had matching closing backticks.".to_string(),
|
||||
)
|
||||
.ok();
|
||||
|
||||
// There are no more code blocks to parse, so we're done.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
|
||||
vec![
|
||||
JudgeAssertion {
|
||||
id: "trait method bodies are shown".to_string(),
|
||||
description:
|
||||
"All method bodies of the Tool trait are shown."
|
||||
.to_string(),
|
||||
},
|
||||
JudgeAssertion {
|
||||
id: "code blocks used".to_string(),
|
||||
description:
|
||||
"All code snippets are rendered inside markdown code blocks (as opposed to any other formatting besides code blocks)."
|
||||
.to_string(),
|
||||
},
|
||||
JudgeAssertion {
|
||||
id: "code blocks use backticks".to_string(),
|
||||
description:
|
||||
format!("All markdown code blocks use backtick fences ({FENCE}) rather than indentation.")
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
|
@ -12,12 +12,14 @@ use util::serde::default_true;
|
|||
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
|
||||
|
||||
mod add_arg_to_trait_method;
|
||||
mod code_block_citations;
|
||||
mod file_search;
|
||||
|
||||
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
|
||||
let mut threads: Vec<Rc<dyn Example>> = vec![
|
||||
Rc::new(file_search::FileSearchExample),
|
||||
Rc::new(add_arg_to_trait_method::AddArgToTraitMethod),
|
||||
Rc::new(code_block_citations::CodeBlockCitations),
|
||||
];
|
||||
|
||||
for example_path in list_declarative_examples(examples_dir).unwrap() {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue