Hang diagnostics (#11190)

Release Notes:

- Added diagnostics for main-thread hangs on macOS. These are only
enabled if you've opted into diagnostics.

---------

Co-authored-by: Mikayla <mikayla@zed.dev>
This commit is contained in:
Conrad Irwin 2024-04-29 18:13:28 -06:00 committed by GitHub
parent 04cd8dd0f2
commit 0697b417a0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 721 additions and 356 deletions

View file

@ -18,11 +18,15 @@ use telemetry_events::{
ActionEvent, AppEvent, AssistantEvent, CallEvent, CopilotEvent, CpuEvent, EditEvent,
EditorEvent, Event, EventRequestBody, EventWrapper, ExtensionEvent, MemoryEvent, SettingEvent,
};
use uuid::Uuid;
static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";
pub fn router() -> Router {
Router::new()
.route("/telemetry/events", post(post_events))
.route("/telemetry/crashes", post(post_crash))
.route("/telemetry/hangs", post(post_hang))
}
pub struct ZedChecksumHeader(Vec<u8>);
@ -85,8 +89,6 @@ pub async fn post_crash(
headers: HeaderMap,
body: Bytes,
) -> Result<()> {
static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";
let report = IpsFile::parse(&body)?;
let version_threshold = SemanticVersion::new(0, 123, 0);
@ -222,6 +224,107 @@ pub async fn post_crash(
Ok(())
}
pub async fn post_hang(
Extension(app): Extension<Arc<AppState>>,
TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
body: Bytes,
) -> Result<()> {
let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
return Err(Error::Http(
StatusCode::INTERNAL_SERVER_ERROR,
"events not enabled".into(),
))?;
};
if checksum != expected {
return Err(Error::Http(
StatusCode::BAD_REQUEST,
"invalid checksum".into(),
))?;
}
let incident_id = Uuid::new_v4().to_string();
// dump JSON into S3 so we can get frame offsets if we need to.
if let Some(blob_store_client) = app.blob_store_client.as_ref() {
blob_store_client
.put_object()
.bucket(CRASH_REPORTS_BUCKET)
.key(incident_id.clone() + ".hang.json")
.acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
.body(ByteStream::from(body.to_vec()))
.send()
.await
.map_err(|e| log::error!("Failed to upload crash: {}", e))
.ok();
}
let report: telemetry_events::HangReport = serde_json::from_slice(&body).map_err(|err| {
log::error!("can't parse report json: {err}");
Error::Internal(anyhow!(err))
})?;
let mut backtrace = "Possible hang detected on main threadL".to_string();
let unknown = "<unknown>".to_string();
for frame in report.backtrace.iter() {
backtrace.push_str(&format!("\n{}", frame.symbols.first().unwrap_or(&unknown)));
}
tracing::error!(
service = "client",
version = %report.app_version.unwrap_or_default().to_string(),
os_name = %report.os_name,
os_version = report.os_version.unwrap_or_default().to_string(),
incident_id = %incident_id,
installation_id = %report.installation_id.unwrap_or_default(),
backtrace = %backtrace,
"hang report");
if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
let payload = slack::WebhookBody::new(|w| {
w.add_section(|s| s.text(slack::Text::markdown("Possible Hang".to_string())))
.add_section(|s| {
s.add_field(slack::Text::markdown(format!(
"*Version:*\n {} ",
report.app_version.unwrap_or_default()
)))
.add_field({
let hostname = app.config.blob_store_url.clone().unwrap_or_default();
let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
hostname.strip_prefix("http://").unwrap_or_default()
});
slack::Text::markdown(format!(
"*Incident:*\n<https://{}.{}/{}.hang.json|{}…>",
CRASH_REPORTS_BUCKET,
hostname,
incident_id,
incident_id.chars().take(8).collect::<String>(),
))
})
})
.add_rich_text(|r| r.add_preformatted(|p| p.add_text(backtrace)))
});
let payload_json = serde_json::to_string(&payload).map_err(|err| {
log::error!("Failed to serialize payload to JSON: {err}");
Error::Internal(anyhow!(err))
})?;
reqwest::Client::new()
.post(slack_panics_webhook)
.header("Content-Type", "application/json")
.body(payload_json)
.send()
.await
.map_err(|err| {
log::error!("Failed to send payload to Slack: {err}");
Error::Internal(anyhow!(err))
})?;
}
Ok(())
}
pub async fn post_events(
Extension(app): Extension<Arc<AppState>>,
TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
@ -235,19 +338,14 @@ pub async fn post_events(
))?
};
let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else {
let Some(expected) = calculate_json_checksum(app.clone(), &body) else {
return Err(Error::Http(
StatusCode::INTERNAL_SERVER_ERROR,
"events not enabled".into(),
))?;
};
let mut summer = Sha256::new();
summer.update(checksum_seed);
summer.update(&body);
summer.update(checksum_seed);
if &checksum != &summer.finalize()[..] {
if checksum != expected {
return Err(Error::Http(
StatusCode::BAD_REQUEST,
"invalid checksum".into(),
@ -1061,3 +1159,15 @@ impl ActionEventRow {
}
}
}
pub fn calculate_json_checksum(app: Arc<AppState>, json: &impl AsRef<[u8]>) -> Option<Vec<u8>> {
let Some(checksum_seed) = app.config.zed_client_checksum_seed.as_ref() else {
return None;
};
let mut summer = Sha256::new();
summer.update(checksum_seed);
summer.update(&json);
summer.update(checksum_seed);
Some(summer.finalize().into_iter().collect())
}