agent: Overwrite files more cautiously (#30649)
1. The `edit_file` tool tended to use `create_or_overwrite` a bit too often, leading to corruption of long files. This change replaces the boolean flag with an `EditFileMode` enum, which helps Agent make a more deliberate choice when overwriting files. With this change, the pass rate of the new eval increased from 10% to 100%. 2. eval: Added ability to run eval on top of an existing thread. Threads can now be loaded from JSON files in the `SerializedThread` format, which makes it easy to use real threads as starting points for tests/evals. 3. Don't try to restore tool cards when running in headless or eval mode -- we don't have a window to properly do this. Release Notes: - N/A
This commit is contained in:
parent
22f76ac1a7
commit
255d8f7cf8
18 changed files with 425 additions and 37 deletions
|
@ -711,9 +711,9 @@ fn print_report(
|
|||
.values()
|
||||
.flat_map(|results| {
|
||||
results.iter().map(|(example, _)| {
|
||||
let absolute_path = example.run_directory.join("last.messages.json");
|
||||
pathdiff::diff_paths(&absolute_path, run_dir)
|
||||
.unwrap_or_else(|| absolute_path.clone())
|
||||
let absolute_path = run_dir.join(example.run_directory.join("last.messages.json"));
|
||||
let cwd = std::env::current_dir().expect("Can't get current dir");
|
||||
pathdiff::diff_paths(&absolute_path, cwd).unwrap_or_else(|| absolute_path.clone())
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue