Include EditAgent's raw output when inspecting thread (#30337)

This allows us to debug the raw edits that were generated when people
report feedback, when running evals and when opening the thread as
Markdown.

Release Notes:

- Improved debug output for agent threads.
This commit is contained in:
Antonio Scandurra 2025-05-09 08:58:45 +02:00 committed by GitHub
parent ea7756b362
commit 1b593f616f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 37 additions and 14 deletions

View file

@ -1,4 +1,6 @@
use derive_more::{Add, AddAssign};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::{cmp, mem, ops::Range};
@ -13,7 +15,9 @@ pub enum EditParserEvent {
NewTextChunk { chunk: String, done: bool },
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Add, AddAssign)]
#[derive(
Clone, Debug, Default, PartialEq, Eq, Add, AddAssign, Serialize, Deserialize, JsonSchema,
)]
pub struct EditParserMetrics {
pub tags: usize,
pub mismatched_tags: usize,

View file

@ -1116,7 +1116,7 @@ fn eval(iterations: usize, expected_pass_ratio: f32, mut eval: EvalInput) {
while let Ok(output) = rx.recv() {
match output {
Ok(output) => {
cumulative_parser_metrics += output.sample.edit_output._parser_metrics.clone();
cumulative_parser_metrics += output.sample.edit_output.parser_metrics.clone();
eval_outputs.push(output.clone());
if output.assertion.score < 80 {
failed_count += 1;
@ -1197,9 +1197,9 @@ impl Display for EvalOutput {
writeln!(
f,
"Parser Metrics:\n{:#?}",
self.sample.edit_output._parser_metrics
self.sample.edit_output.parser_metrics
)?;
writeln!(f, "Raw Edits:\n{}", self.sample.edit_output._raw_edits)?;
writeln!(f, "Raw Edits:\n{}", self.sample.edit_output.raw_edits)?;
Ok(())
}
}