Code block evals (#29619)

Add a targeted eval for code block formatting, and revise the system
prompt accordingly.

### Eval before, n=8

<img width="728" alt="eval before"
src="https://github.com/user-attachments/assets/552b6146-3d26-4eaa-86f9-9fc36c0cadf2"
/>

### Eval after prompt change, n=8 (excluding the new evals, so just
testing the prompt change)

<img width="717" alt="eval after"
src="https://github.com/user-attachments/assets/c78c7a54-4c65-470c-b135-8691584cd73e"
/>

Release Notes:

- N/A
This commit is contained in:
Richard Feldman 2025-04-29 18:52:09 -04:00 committed by GitHub
parent 2508e491d5
commit d7004030b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 536 additions and 134 deletions

View file

@ -0,0 +1,191 @@
use anyhow::Result;
use async_trait::async_trait;
use markdown::PathWithRange;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion, LanguageServer};
/// Eval example that asks the agent to show the methods of the `Tool` trait and then checks
/// how the fenced code blocks in its responses are formatted and cited.
pub struct CodeBlockCitations;
/// The three-backtick markdown fence used to locate code blocks in the agent's responses.
const FENCE: &str = "```";
#[async_trait(?Send)]
impl Example for CodeBlockCitations {
/// Describes this example: its name, the repository URL and revision it runs against, and
/// the language server settings (Rust files, with pre-existing diagnostics disallowed).
fn meta(&self) -> ExampleMetadata {
    let language_server = LanguageServer {
        file_extension: String::from("rs"),
        allow_preexisting_diagnostics: false,
    };

    ExampleMetadata {
        name: String::from("code_block_citations"),
        url: String::from("https://github.com/zed-industries/zed.git"),
        revision: String::from("f69aeb6311dde3c0b8979c293d019d66498d54f2"),
        language_server: Some(language_server),
        max_assertions: None,
    }
}
/// Asks the agent to show every method of the `Tool` trait, then checks each fenced code
/// block in the responses.
///
/// For every block this asserts that:
/// - the text after the opening fence is a citation containing a `/` that resolves to a
///   path in the project,
/// - the block's content appears in that file, and
/// - when the citation carries a line/column range, the content equals the text at that
///   range.
///
/// Assertion failures are recorded through `cx` and do not abort the run; only a failure to
/// run the conversation itself is returned as an error.
async fn conversation(&self, cx: &mut ExampleContext) -> Result<()> {
    const FILENAME: &str = "assistant_tool.rs";

    /// Returns the text of `buffer_text` covered by a 1-based, inclusive line range,
    /// optionally narrowed by a start column on the first line and an end column on the
    /// last line.
    ///
    /// Returns `None` when a column does not fall on a valid position of its line, instead
    /// of panicking.
    ///
    /// NOTE(review): columns are treated as 0-based byte offsets with an exclusive end,
    /// which is how the previous inline code treated them — confirm against `PathWithRange`.
    fn snippet_for_range(
        buffer_text: &str,
        start_line: usize,
        start_col: Option<usize>,
        end_line: usize,
        end_col: Option<usize>,
    ) -> Option<String> {
        let start_line_index = start_line.saturating_sub(1);
        let line_count = end_line.saturating_sub(start_line_index);
        let mut snippet = buffer_text
            .lines()
            .skip(start_line_index)
            .take(line_count)
            .collect::<Vec<&str>>()
            .join("\n");

        // Trim the end before the start: the end column is an offset into the *untrimmed*
        // last line, which for a single-line range is also the first line.
        if let Some(end_col) = end_col {
            // `rsplit` (unlike `lines`) still yields the final line when it is empty.
            let last_line_len = snippet.rsplit('\n').next().map_or(0, str::len);
            let end = (snippet.len() - last_line_len).checked_add(end_col)?;
            // `is_char_boundary` is also false for offsets past the end of the string.
            if !snippet.is_char_boundary(end) {
                return None;
            }
            snippet.truncate(end);
        }
        if let Some(start_col) = start_col {
            snippet = snippet.get(start_col..)?.to_string();
        }
        Some(snippet)
    }

    cx.push_user_message(format!(
        r#"
Show me the method bodies of all the methods of the `Tool` trait in {FILENAME}.
Please show each method in a separate code snippet.
"#
    ));

    // Verify that the messages all have the correct formatting.
    let texts: Vec<String> = cx.run_to_end().await?.texts().collect();
    let closing_fence = format!("\n{FENCE}");

    for text in &texts {
        let mut text = text.as_str();

        'blocks: while let Some(index) = text.find(FENCE) {
            // Advance text past the opening backticks.
            text = &text[index + FENCE.len()..];

            // Find the closing backticks (offset of the newline that precedes them).
            let content_len = text.find(&closing_fence);

            // All per-block checks live in this labeled block so that bailing out of them
            // early still reaches the code below that skips past the closing fence.
            // Otherwise the closing fence would be mistaken for the next opening fence.
            'verify: {
                // Verify the citation format - e.g. ```path/to/foo.txt#L123-456
                let Some(citation_len) = text.find('\n') else {
                    cx.assert(
                        false,
                        format!("Opening {FENCE} did not have a newline anywhere after it."),
                    )
                    .ok();
                    break 'verify;
                };
                let citation = &text[..citation_len];

                if cx
                    .assert(citation.contains("/"), format!("Slash in {citation:?}",))
                    .is_err()
                {
                    break 'verify;
                }

                let path_range = PathWithRange::new(citation);
                let path = cx
                    .agent_thread()
                    .update(cx, |thread, cx| {
                        thread
                            .project()
                            .read(cx)
                            .find_project_path(path_range.path, cx)
                    })
                    .ok()
                    .flatten();
                let Ok(path) = cx.assert_some(path, format!("Valid path: {citation:?}")) else {
                    break 'verify;
                };

                let buffer = match cx.agent_thread().update(cx, |thread, cx| {
                    thread
                        .project()
                        .update(cx, |project, cx| project.open_buffer(path, cx))
                }) {
                    Ok(buffer_task) => buffer_task.await.ok(),
                    Err(err) => {
                        cx.assert(false, format!("Expected Ok(buffer), not {err:?}"))
                            .ok();
                        // Updating the thread failed; stop parsing this message entirely.
                        break 'blocks;
                    }
                };
                let Ok(buffer_text) = cx.assert_some(
                    buffer.and_then(|buffer| {
                        buffer.read_with(cx, |buffer, _| buffer.text()).ok()
                    }),
                    "Reading buffer text succeeded",
                ) else {
                    break 'verify;
                };

                // An unterminated block is reported after the labeled block.
                let Some(content_len) = content_len else {
                    break 'verify;
                };

                // The content starts right after the newline that ends the citation and
                // stops where the closing fence's newline begins. For an empty block those
                // are the same newline, so the start lies past the end and `get` yields
                // `None`; treat that as empty content.
                let content = text.get(citation.len() + 1..content_len).unwrap_or("");
                cx.assert(
                    buffer_text.contains(content),
                    "Code block content was found in file",
                )
                .ok();

                if let Some(range) = path_range.range {
                    let snippet = snippet_for_range(
                        &buffer_text,
                        range.start.line as usize,
                        range.start.col.map(|col| col as usize),
                        range.end.line as usize,
                        range.end.col.map(|col| col as usize),
                    );
                    if let Some(snippet) = snippet {
                        cx.assert_eq(
                            snippet.as_str(),
                            content,
                            "Code block snippet was at specified line/col",
                        )
                        .ok();
                    } else {
                        cx.assert(
                            false,
                            format!("Line/column range in {citation:?} is within the file"),
                        )
                        .ok();
                    }
                }
            }

            let Some(content_len) = content_len else {
                // There were no closing backticks associated with these opening backticks.
                cx.assert(
                    false,
                    "Code block opening had matching closing backticks.".to_string(),
                )
                .ok();
                // There are no more code blocks to parse, so we're done.
                break;
            };

            // Advance past the whole closing fence, including its leading newline.
            text = &text[content_len + closing_fence.len()..];
        }
    }

    Ok(())
}
/// Lists the assertions the judge evaluates against the finished thread: that the trait's
/// method bodies are shown, that snippets are in markdown code blocks, and that those
/// blocks use backtick fences.
fn thread_assertions(&self) -> Vec<JudgeAssertion> {
    let backtick_fences = format!(
        "All markdown code blocks use backtick fences ({FENCE}) rather than indentation."
    );

    [
        (
            "trait method bodies are shown",
            String::from("All method bodies of the Tool trait are shown."),
        ),
        (
            "code blocks used",
            String::from(
                "All code snippets are rendered inside markdown code blocks (as opposed to any other formatting besides code blocks).",
            ),
        ),
        ("code blocks use backticks", backtick_fences),
    ]
    .into_iter()
    .map(|(id, description)| JudgeAssertion {
        id: String::from(id),
        description,
    })
    .collect()
}
}

View file

@ -12,12 +12,14 @@ use util::serde::default_true;
use crate::example::{Example, ExampleContext, ExampleMetadata, JudgeAssertion};
mod add_arg_to_trait_method;
mod code_block_citations;
mod file_search;
pub fn all(examples_dir: &Path) -> Vec<Rc<dyn Example>> {
let mut threads: Vec<Rc<dyn Example>> = vec![
Rc::new(file_search::FileSearchExample),
Rc::new(add_arg_to_trait_method::AddArgToTraitMethod),
Rc::new(code_block_citations::CodeBlockCitations),
];
for example_path in list_declarative_examples(examples_dir).unwrap() {