Separate minidump crashes from panics (#36267)

The minidump-based crash reporting is now entirely separate from our
legacy panic_hook-based reporting. This should improve the association
of minidumps with their metadata and give us more consistent crash
reports.

Release Notes:

- N/A

---------

Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
This commit is contained in:
Julia Ryan 2025-08-16 01:33:32 -05:00 committed by GitHub
parent f5f14111ef
commit 7784fac288
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 315 additions and 248 deletions

View file

@ -12,6 +12,8 @@ minidumper.workspace = true
paths.workspace = true
release_channel.workspace = true
smol.workspace = true
serde.workspace = true
serde_json.workspace = true
workspace-hack.workspace = true
[lints]

View file

@ -2,15 +2,17 @@ use crash_handler::CrashHandler;
use log::info;
use minidumper::{Client, LoopAction, MinidumpBinary};
use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
use serde::{Deserialize, Serialize};
use std::{
env,
fs::File,
fs::{self, File},
io,
panic::Location,
path::{Path, PathBuf},
process::{self, Command},
sync::{
LazyLock, OnceLock,
Arc, OnceLock,
atomic::{AtomicBool, Ordering},
},
thread,
@ -18,19 +20,17 @@ use std::{
};
// set once the crash handler has initialized and the client has connected to it
pub static CRASH_HANDLER: AtomicBool = AtomicBool::new(false);
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
// set when the first minidump request is made to avoid generating duplicate crash reports
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
const CRASH_HANDLER_TIMEOUT: Duration = Duration::from_secs(60);
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
pub static GENERATE_MINIDUMPS: LazyLock<bool> = LazyLock::new(|| {
*RELEASE_CHANNEL != ReleaseChannel::Dev || env::var("ZED_GENERATE_MINIDUMPS").is_ok()
});
pub async fn init(id: String) {
if !*GENERATE_MINIDUMPS {
pub async fn init(crash_init: InitCrashHandler) {
if *RELEASE_CHANNEL == ReleaseChannel::Dev && env::var("ZED_GENERATE_MINIDUMPS").is_err() {
return;
}
let exe = env::current_exe().expect("unable to find ourselves");
let zed_pid = process::id();
// TODO: we should be able to get away with using 1 crash-handler process per machine,
@ -61,9 +61,11 @@ pub async fn init(id: String) {
smol::Timer::after(retry_frequency).await;
}
let client = maybe_client.unwrap();
client.send_message(1, id).unwrap(); // set session id on the server
client
.send_message(1, serde_json::to_vec(&crash_init).unwrap())
.unwrap();
let client = std::sync::Arc::new(client);
let client = Arc::new(client);
let handler = crash_handler::CrashHandler::attach(unsafe {
let client = client.clone();
crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
@ -72,7 +74,6 @@ pub async fn init(id: String) {
.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
.is_ok()
{
client.send_message(2, "mistakes were made").unwrap();
client.ping().unwrap();
client.request_dump(crash_context).is_ok()
} else {
@ -87,7 +88,7 @@ pub async fn init(id: String) {
{
handler.set_ptracer(Some(server_pid));
}
CRASH_HANDLER.store(true, Ordering::Release);
CRASH_HANDLER.set(client.clone()).ok();
std::mem::forget(handler);
info!("crash handler registered");
@ -98,14 +99,43 @@ pub async fn init(id: String) {
}
pub struct CrashServer {
session_id: OnceLock<String>,
initialization_params: OnceLock<InitCrashHandler>,
panic_info: OnceLock<CrashPanic>,
has_connection: Arc<AtomicBool>,
}
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct CrashInfo {
pub init: InitCrashHandler,
pub panic: Option<CrashPanic>,
}
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct InitCrashHandler {
pub session_id: String,
pub zed_version: String,
pub release_channel: String,
pub commit_sha: String,
// pub gpu: String,
}
#[derive(Deserialize, Serialize, Debug, Clone)]
pub struct CrashPanic {
pub message: String,
pub span: String,
}
impl minidumper::ServerHandler for CrashServer {
fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
let err_message = "Need to send a message with the ID upon starting the crash handler";
let err_message = "Missing initialization data";
let dump_path = paths::logs_dir()
.join(self.session_id.get().expect(err_message))
.join(
&self
.initialization_params
.get()
.expect(err_message)
.session_id,
)
.with_extension("dmp");
let file = File::create(&dump_path)?;
Ok((file, dump_path))
@ -122,38 +152,71 @@ impl minidumper::ServerHandler for CrashServer {
info!("failed to write minidump: {:#}", e);
}
}
let crash_info = CrashInfo {
init: self
.initialization_params
.get()
.expect("not initialized")
.clone(),
panic: self.panic_info.get().cloned(),
};
let crash_data_path = paths::logs_dir()
.join(&crash_info.init.session_id)
.with_extension("json");
fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
LoopAction::Exit
}
fn on_message(&self, kind: u32, buffer: Vec<u8>) {
let message = String::from_utf8(buffer).expect("invalid utf-8");
info!("kind: {kind}, message: {message}",);
if kind == 1 {
self.session_id
.set(message)
.expect("session id already initialized");
match kind {
1 => {
let init_data =
serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
self.initialization_params
.set(init_data)
.expect("already initialized");
}
2 => {
let panic_data =
serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
self.panic_info.set(panic_data).expect("already panicked");
}
_ => {
panic!("invalid message kind");
}
}
}
fn on_client_disconnected(&self, clients: usize) -> LoopAction {
info!("client disconnected, {clients} remaining");
if clients == 0 {
LoopAction::Exit
} else {
LoopAction::Continue
}
fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
LoopAction::Exit
}
fn on_client_connected(&self, _clients: usize) -> LoopAction {
self.has_connection.store(true, Ordering::SeqCst);
LoopAction::Continue
}
}
pub fn handle_panic() {
if !*GENERATE_MINIDUMPS {
return;
}
pub fn handle_panic(message: String, span: Option<&Location>) {
let span = span
.map(|loc| format!("{}:{}", loc.file(), loc.line()))
.unwrap_or_default();
// wait 500ms for the crash handler process to start up
// if it's still not there just write panic info and no minidump
let retry_frequency = Duration::from_millis(100);
for _ in 0..5 {
if CRASH_HANDLER.load(Ordering::Acquire) {
if let Some(client) = CRASH_HANDLER.get() {
client
.send_message(
2,
serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
)
.ok();
log::error!("triggering a crash to generate a minidump...");
#[cfg(target_os = "linux")]
CrashHandler.simulate_signal(crash_handler::Signal::Trap as u32);
@ -170,14 +233,30 @@ pub fn crash_server(socket: &Path) {
log::info!("Couldn't create socket, there may already be a running crash server");
return;
};
let ab = AtomicBool::new(false);
let shutdown = Arc::new(AtomicBool::new(false));
let has_connection = Arc::new(AtomicBool::new(false));
std::thread::spawn({
let shutdown = shutdown.clone();
let has_connection = has_connection.clone();
move || {
std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
if !has_connection.load(Ordering::SeqCst) {
shutdown.store(true, Ordering::SeqCst);
}
}
});
server
.run(
Box::new(CrashServer {
session_id: OnceLock::new(),
initialization_params: OnceLock::new(),
panic_info: OnceLock::new(),
has_connection,
}),
&ab,
Some(CRASH_HANDLER_TIMEOUT),
&shutdown,
Some(CRASH_HANDLER_PING_TIMEOUT),
)
.expect("failed to run server");
}