Add new action to run agent eval (#29158)

The old eval action wasn't linking, and
https://github.com/zed-industries/zed/pull/29081 has a bunch of merge
conflicts, so I wanted to start with something simple and small.

## Todo

* [x] Remove low-signal examples
* [x] Make the eval run on a cron, on main, and on any PR with the
`run-eval` label
* [x] Noise in logs about failure to write settings
    ```
    [2025-04-21T20:45:04Z ERROR settings] Failed to write settings to file "/home/runner/.config/zed/settings.json"

    Caused by:
        No such file or directory (os error 2) at path "/home/runner/.config/zed/.tmpLewFEs"
    ```
* [x] `Agentic loop stalled`
(https://github.com/zed-industries/zed/actions/runs/14581044243/job/40897622894)
* [x] Make sure that events are recorded in snowflake
* [ ] Change judge criteria to be more explicit about meanings of scores

Release Notes:

- N/A

---------

Co-authored-by: Antonio Scandurra <me@as-cii.com>
Co-authored-by: Agus Zubiaga <hi@aguz.me>
Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
Co-authored-by: Thomas Mickley-Doyle <tmickleydoyle@gmail.com>
This commit is contained in:
Nathan Sobo 2025-04-21 22:30:21 -06:00 committed by GitHub
parent b14356d1d3
commit 458ffaa134
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
58 changed files with 291 additions and 385 deletions

View file

@ -1010,6 +1010,7 @@ impl ActiveThread {
}
}
ThreadEvent::CheckpointChanged => cx.notify(),
ThreadEvent::ReceivedTextChunk => {}
}
}

View file

@ -1231,6 +1231,7 @@ impl Thread {
current_token_usage = token_usage;
}
LanguageModelCompletionEvent::Text(chunk) => {
cx.emit(ThreadEvent::ReceivedTextChunk);
if let Some(last_message) = thread.messages.last_mut() {
if last_message.role == Role::Assistant {
last_message.push_text(&chunk);
@ -1780,7 +1781,7 @@ impl Thread {
thread_data,
final_project_snapshot
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
Ok(())
})
@ -1825,7 +1826,7 @@ impl Thread {
thread_data,
final_project_snapshot
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
Ok(())
})
@ -2081,7 +2082,7 @@ impl Thread {
github_login = github_login
);
client.telemetry().flush_events();
client.telemetry().flush_events().await;
}
}
})
@ -2199,6 +2200,7 @@ pub enum ThreadEvent {
ShowError(ThreadError),
UsageUpdated(RequestUsage),
StreamedCompletion,
ReceivedTextChunk,
StreamedAssistantText(MessageId, String),
StreamedAssistantThinking(MessageId, String),
StreamedToolUse {