Code block evals (#29619)

Add a targeted eval for code block formatting, and revise the system
prompt accordingly.

### Eval before, n=8

<img width="728" alt="eval before"
src="https://github.com/user-attachments/assets/552b6146-3d26-4eaa-86f9-9fc36c0cadf2"
/>

### Eval after prompt change, n=8 (excluding the new evals, so just testing the prompt change)

<img width="717" alt="eval after"
src="https://github.com/user-attachments/assets/c78c7a54-4c65-470c-b135-8691584cd73e"
/>

Release Notes:

- N/A
This commit is contained in:
Richard Feldman 2025-04-29 18:52:09 -04:00 committed by GitHub
parent 2508e491d5
commit d7004030b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 536 additions and 134 deletions

View file

@ -10,13 +10,13 @@ use crate::{
ToolMetrics,
assertions::{AssertionsReport, RanAssertion, RanAssertionResult},
};
use agent::{ContextLoadResult, ThreadEvent};
use agent::{ContextLoadResult, Thread, ThreadEvent};
use anyhow::{Result, anyhow};
use async_trait::async_trait;
use buffer_diff::DiffHunkStatus;
use collections::HashMap;
use futures::{FutureExt as _, StreamExt, channel::mpsc, select_biased};
use gpui::{AppContext, AsyncApp, Entity};
use gpui::{App, AppContext, AsyncApp, Entity};
use language_model::{LanguageModel, Role, StopReason};
/// A fixed duration of two minutes (`60 * 2` seconds).
// NOTE(review): presumably the longest wait allowed for a thread event — confirm at the use site.
pub const THREAD_EVENT_TIMEOUT: Duration = Duration::from_secs(60 * 2);
@ -314,7 +314,7 @@ impl ExampleContext {
for message in thread.messages().skip(message_count_before) {
messages.push(Message {
_role: message.role,
_text: message.to_string(),
text: message.to_string(),
tool_use: thread
.tool_uses_for_message(message.id, cx)
.into_iter()
@ -362,6 +362,90 @@ impl ExampleContext {
})
.unwrap()
}
pub fn agent_thread(&self) -> Entity<Thread> {
self.agent_thread.clone()
}
}
/// [`AppContext`] implementation for [`ExampleContext`].
///
/// Every method is a thin pass-through: it hands its arguments, unchanged, to
/// the method of the same name on the `app` field and returns that result.
impl AppContext for ExampleContext {
    /// Operations on this context report their outcome as an `anyhow::Result`.
    type Result<T> = anyhow::Result<T>;

    /// Forwards to `self.app.new`.
    fn new<T: 'static>(
        &mut self,
        build: impl FnOnce(&mut gpui::Context<T>) -> T,
    ) -> Self::Result<Entity<T>> {
        self.app.new(build)
    }

    /// Forwards to `self.app.reserve_entity`.
    fn reserve_entity<T: 'static>(&mut self) -> Self::Result<gpui::Reservation<T>> {
        self.app.reserve_entity()
    }

    /// Forwards to `self.app.insert_entity`.
    fn insert_entity<T: 'static>(
        &mut self,
        slot: gpui::Reservation<T>,
        build: impl FnOnce(&mut gpui::Context<T>) -> T,
    ) -> Self::Result<Entity<T>> {
        self.app.insert_entity(slot, build)
    }

    /// Forwards to `self.app.update_entity`.
    fn update_entity<T: 'static, R>(
        &mut self,
        entity: &Entity<T>,
        apply: impl FnOnce(&mut T, &mut gpui::Context<T>) -> R,
    ) -> Self::Result<R> {
        self.app.update_entity(entity, apply)
    }

    /// Forwards to `self.app.read_entity`.
    fn read_entity<T: 'static, R>(
        &self,
        entity: &Entity<T>,
        inspect: impl FnOnce(&T, &App) -> R,
    ) -> Self::Result<R> {
        self.app.read_entity(entity, inspect)
    }

    /// Forwards to `self.app.update_window`.
    fn update_window<T, F: FnOnce(gpui::AnyView, &mut gpui::Window, &mut App) -> T>(
        &mut self,
        handle: gpui::AnyWindowHandle,
        apply: F,
    ) -> Result<T> {
        self.app.update_window(handle, apply)
    }

    /// Forwards to `self.app.read_window`.
    fn read_window<T: 'static, R>(
        &self,
        handle: &gpui::WindowHandle<T>,
        inspect: impl FnOnce(Entity<T>, &App) -> R,
    ) -> Result<R> {
        self.app.read_window(handle, inspect)
    }

    /// Forwards to `self.app.background_spawn`.
    fn background_spawn<R: Send + 'static>(
        &self,
        work: impl std::future::Future<Output = R> + Send + 'static,
    ) -> gpui::Task<R> {
        self.app.background_spawn(work)
    }

    /// Forwards to `self.app.read_global`.
    fn read_global<G: gpui::Global, R>(
        &self,
        inspect: impl FnOnce(&G, &App) -> R,
    ) -> Self::Result<R> {
        self.app.read_global(inspect)
    }
}
#[derive(Debug)]
@ -391,12 +475,16 @@ impl Response {
/// Iterates over the tool uses of every message, following the iteration
/// order of `self.messages` and, within a message, of its `tool_use` list.
pub fn tool_uses(&self) -> impl Iterator<Item = &ToolUse> {
    self.messages
        .iter()
        .flat_map(|message| message.tool_use.iter())
}
/// Yields an owned copy of each message's `text`, one `String` per message.
pub fn texts(&self) -> impl Iterator<Item = String> {
    self.messages.iter().map(|message| &message.text).cloned()
}
}
/// A single message, built in `ExampleContext` from a thread message and read
/// back through `Response::tool_uses` and `Response::texts`.
#[derive(Debug)]
pub struct Message {
/// Role copied from the thread message's `role`.
_role: Role,
// NOTE(review): `_text` and `text` sit side by side and are both filled from
// `message.to_string()`; this looks like an old/new pair left over from a
// rename — confirm which one the real file keeps.
_text: String,
/// Text of the message, produced by `message.to_string()`; returned by `Response::texts`.
text: String,
/// Tool uses collected for this message.
tool_use: Vec<ToolUse>,
}