agent: Add telemetry for eval runs (#28816)
Release Notes: - N/A --------- Co-authored-by: Joseph <joseph@zed.dev>
This commit is contained in:
parent
1eb948654a
commit
222d4a2546
4 changed files with 89 additions and 6 deletions
3
Cargo.lock
generated
3
Cargo.lock
generated
|
@ -4886,6 +4886,7 @@ dependencies = [
|
||||||
"collections",
|
"collections",
|
||||||
"context_server",
|
"context_server",
|
||||||
"dap",
|
"dap",
|
||||||
|
"dirs 5.0.1",
|
||||||
"env_logger 0.11.8",
|
"env_logger 0.11.8",
|
||||||
"extension",
|
"extension",
|
||||||
"fs",
|
"fs",
|
||||||
|
@ -4907,9 +4908,11 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
"settings",
|
"settings",
|
||||||
"shellexpand 2.1.2",
|
"shellexpand 2.1.2",
|
||||||
|
"telemetry",
|
||||||
"toml 0.8.20",
|
"toml 0.8.20",
|
||||||
"unindent",
|
"unindent",
|
||||||
"util",
|
"util",
|
||||||
|
"uuid",
|
||||||
"workspace-hack",
|
"workspace-hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@ client.workspace = true
|
||||||
collections.workspace = true
|
collections.workspace = true
|
||||||
context_server.workspace = true
|
context_server.workspace = true
|
||||||
dap.workspace = true
|
dap.workspace = true
|
||||||
|
dirs = "5.0"
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
extension.workspace = true
|
extension.workspace = true
|
||||||
fs.workspace = true
|
fs.workspace = true
|
||||||
|
@ -37,9 +38,11 @@ reqwest_client.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
settings.workspace = true
|
settings.workspace = true
|
||||||
shellexpand.workspace = true
|
shellexpand.workspace = true
|
||||||
|
telemetry.workspace = true
|
||||||
toml.workspace = true
|
toml.workspace = true
|
||||||
unindent.workspace = true
|
unindent.workspace = true
|
||||||
util.workspace = true
|
util.workspace = true
|
||||||
|
uuid = { version = "1.6", features = ["v4"] }
|
||||||
workspace-hack.workspace = true
|
workspace-hack.workspace = true
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
mod example;
|
mod example;
|
||||||
|
mod ids;
|
||||||
|
|
||||||
use client::{Client, ProxySettings, UserStore};
|
use client::{Client, ProxySettings, UserStore};
|
||||||
pub(crate) use example::*;
|
pub(crate) use example::*;
|
||||||
|
use telemetry;
|
||||||
|
|
||||||
use ::fs::RealFs;
|
use ::fs::RealFs;
|
||||||
use anyhow::{Result, anyhow};
|
use anyhow::{Result, anyhow};
|
||||||
|
@ -39,7 +41,6 @@ struct Args {
|
||||||
/// Model to use (default: "claude-3-7-sonnet-latest")
|
/// Model to use (default: "claude-3-7-sonnet-latest")
|
||||||
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
|
#[arg(long, default_value = "claude-3-7-sonnet-latest")]
|
||||||
model: String,
|
model: String,
|
||||||
/// Languages to run (comma-separated, e.g. "js,ts,py"). If unspecified, only Rust examples are run.
|
|
||||||
#[arg(long, value_delimiter = ',')]
|
#[arg(long, value_delimiter = ',')]
|
||||||
languages: Option<Vec<String>>,
|
languages: Option<Vec<String>>,
|
||||||
/// How many times to run the judge on each example run.
|
/// How many times to run the judge on each example run.
|
||||||
|
@ -77,6 +78,15 @@ fn main() {
|
||||||
app.run(move |cx| {
|
app.run(move |cx| {
|
||||||
let app_state = init(cx);
|
let app_state = init(cx);
|
||||||
|
|
||||||
|
let system_id = ids::get_or_create_id(&ids::eval_system_id_path()).ok();
|
||||||
|
let installation_id = ids::get_or_create_id(&ids::eval_installation_id_path()).ok();
|
||||||
|
let session_id = uuid::Uuid::new_v4().to_string();
|
||||||
|
|
||||||
|
app_state
|
||||||
|
.client
|
||||||
|
.telemetry()
|
||||||
|
.start(system_id, installation_id, session_id, cx);
|
||||||
|
|
||||||
let model = find_model("claude-3-7-sonnet-latest", cx).unwrap();
|
let model = find_model("claude-3-7-sonnet-latest", cx).unwrap();
|
||||||
|
|
||||||
LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
|
LanguageModelRegistry::global(cx).update(cx, |registry, cx| {
|
||||||
|
@ -273,6 +283,11 @@ fn main() {
|
||||||
/ (score_count as f32);
|
/ (score_count as f32);
|
||||||
println!("\nAverage score: {average_score}");
|
println!("\nAverage score: {average_score}");
|
||||||
|
|
||||||
|
std::thread::sleep(std::time::Duration::from_secs(2));
|
||||||
|
|
||||||
|
// Flush telemetry events before exiting
|
||||||
|
app_state.client.telemetry().flush_events();
|
||||||
|
|
||||||
cx.update(|cx| cx.quit())
|
cx.update(|cx| cx.quit())
|
||||||
})
|
})
|
||||||
.detach_and_log_err(cx);
|
.detach_and_log_err(cx);
|
||||||
|
@ -286,15 +301,49 @@ async fn run_example(
|
||||||
judge_repetitions: u32,
|
judge_repetitions: u32,
|
||||||
cx: &mut AsyncApp,
|
cx: &mut AsyncApp,
|
||||||
) -> Result<Vec<Result<JudgeOutput>>> {
|
) -> Result<Vec<Result<JudgeOutput>>> {
|
||||||
cx.update(|cx| example.run(model.clone(), app_state, cx))?
|
let run_output = cx
|
||||||
|
.update(|cx| example.run(model.clone(), app_state.clone(), cx))?
|
||||||
.await?;
|
.await?;
|
||||||
let diff = example.repository_diff().await?;
|
let diff = example.repository_diff().await?;
|
||||||
|
|
||||||
let judge_tasks = (0..judge_repetitions)
|
// Run judge for each repetition
|
||||||
.map(|round| example.judge(model.clone(), diff.clone(), round, cx))
|
let mut results = Vec::new();
|
||||||
.collect::<Vec<_>>();
|
for round in 0..judge_repetitions {
|
||||||
|
let judge_result = example.judge(model.clone(), diff.clone(), round, cx).await;
|
||||||
|
|
||||||
Ok(future::join_all(judge_tasks).await)
|
// Log telemetry for this judge result
|
||||||
|
if let Ok(judge_output) = &judge_result {
|
||||||
|
let cohort_id = example
|
||||||
|
.output_file_path
|
||||||
|
.parent()
|
||||||
|
.and_then(|p| p.file_name())
|
||||||
|
.map(|name| name.to_string_lossy().to_string())
|
||||||
|
.unwrap_or(chrono::Local::now().format("%Y-%m-%d_%H-%M-%S").to_string());
|
||||||
|
|
||||||
|
telemetry::event!(
|
||||||
|
"Agent Eval Completed",
|
||||||
|
cohort_id = cohort_id,
|
||||||
|
example_name = example.name.clone(),
|
||||||
|
round = round,
|
||||||
|
score = judge_output.score,
|
||||||
|
analysis = judge_output.analysis,
|
||||||
|
tool_use_counts = run_output.tool_use_counts,
|
||||||
|
response_count = run_output.response_count,
|
||||||
|
token_usage = run_output.token_usage,
|
||||||
|
model = model.telemetry_id(),
|
||||||
|
model_provider = model.provider_id().to_string(),
|
||||||
|
repository_url = example.base.url.clone(),
|
||||||
|
repository_revision = example.base.revision.clone(),
|
||||||
|
diagnostics_summary = run_output.diagnostics
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push(judge_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
app_state.client.telemetry().flush_events();
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn list_all_examples() -> Result<Vec<PathBuf>> {
|
fn list_all_examples() -> Result<Vec<PathBuf>> {
|
||||||
|
|
28
crates/eval/src/ids.rs
Normal file
28
crates/eval/src/ids.rs
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
use anyhow::Result;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
pub fn get_or_create_id(path: &Path) -> Result<String> {
|
||||||
|
if let Ok(id) = fs::read_to_string(path) {
|
||||||
|
let trimmed = id.trim();
|
||||||
|
if !trimmed.is_empty() {
|
||||||
|
return Ok(trimmed.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let new_id = Uuid::new_v4().to_string();
|
||||||
|
fs::write(path, &new_id)?;
|
||||||
|
Ok(new_id)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn eval_system_id_path() -> PathBuf {
|
||||||
|
dirs::data_local_dir()
|
||||||
|
.unwrap_or_else(|| PathBuf::from("."))
|
||||||
|
.join("zed-eval-system-id")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn eval_installation_id_path() -> PathBuf {
|
||||||
|
dirs::data_local_dir()
|
||||||
|
.unwrap_or_else(|| PathBuf::from("."))
|
||||||
|
.join("zed-eval-installation-id")
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue