Separate minidump crashes from panics (#36267)
The minidump-based crash reporting is now entirely separate from our legacy panic_hook-based reporting. This should improve the association of minidumps with their metadata and give us more consistent crash reports. Release Notes: - N/A --------- Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
This commit is contained in:
parent
f5f14111ef
commit
7784fac288
8 changed files with 315 additions and 248 deletions
|
@ -12,6 +12,8 @@ minidumper.workspace = true
|
|||
paths.workspace = true
|
||||
release_channel.workspace = true
|
||||
smol.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
workspace-hack.workspace = true
|
||||
|
||||
[lints]
|
||||
|
|
|
@ -2,15 +2,17 @@ use crash_handler::CrashHandler;
|
|||
use log::info;
|
||||
use minidumper::{Client, LoopAction, MinidumpBinary};
|
||||
use release_channel::{RELEASE_CHANNEL, ReleaseChannel};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::{
|
||||
env,
|
||||
fs::File,
|
||||
fs::{self, File},
|
||||
io,
|
||||
panic::Location,
|
||||
path::{Path, PathBuf},
|
||||
process::{self, Command},
|
||||
sync::{
|
||||
LazyLock, OnceLock,
|
||||
Arc, OnceLock,
|
||||
atomic::{AtomicBool, Ordering},
|
||||
},
|
||||
thread,
|
||||
|
@ -18,19 +20,17 @@ use std::{
|
|||
};
|
||||
|
||||
// set once the crash handler has initialized and the client has connected to it
|
||||
pub static CRASH_HANDLER: AtomicBool = AtomicBool::new(false);
|
||||
pub static CRASH_HANDLER: OnceLock<Arc<Client>> = OnceLock::new();
|
||||
// set when the first minidump request is made to avoid generating duplicate crash reports
|
||||
pub static REQUESTED_MINIDUMP: AtomicBool = AtomicBool::new(false);
|
||||
const CRASH_HANDLER_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
const CRASH_HANDLER_PING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
const CRASH_HANDLER_CONNECT_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
pub static GENERATE_MINIDUMPS: LazyLock<bool> = LazyLock::new(|| {
|
||||
*RELEASE_CHANNEL != ReleaseChannel::Dev || env::var("ZED_GENERATE_MINIDUMPS").is_ok()
|
||||
});
|
||||
|
||||
pub async fn init(id: String) {
|
||||
if !*GENERATE_MINIDUMPS {
|
||||
pub async fn init(crash_init: InitCrashHandler) {
|
||||
if *RELEASE_CHANNEL == ReleaseChannel::Dev && env::var("ZED_GENERATE_MINIDUMPS").is_err() {
|
||||
return;
|
||||
}
|
||||
|
||||
let exe = env::current_exe().expect("unable to find ourselves");
|
||||
let zed_pid = process::id();
|
||||
// TODO: we should be able to get away with using 1 crash-handler process per machine,
|
||||
|
@ -61,9 +61,11 @@ pub async fn init(id: String) {
|
|||
smol::Timer::after(retry_frequency).await;
|
||||
}
|
||||
let client = maybe_client.unwrap();
|
||||
client.send_message(1, id).unwrap(); // set session id on the server
|
||||
client
|
||||
.send_message(1, serde_json::to_vec(&crash_init).unwrap())
|
||||
.unwrap();
|
||||
|
||||
let client = std::sync::Arc::new(client);
|
||||
let client = Arc::new(client);
|
||||
let handler = crash_handler::CrashHandler::attach(unsafe {
|
||||
let client = client.clone();
|
||||
crash_handler::make_crash_event(move |crash_context: &crash_handler::CrashContext| {
|
||||
|
@ -72,7 +74,6 @@ pub async fn init(id: String) {
|
|||
.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
|
||||
.is_ok()
|
||||
{
|
||||
client.send_message(2, "mistakes were made").unwrap();
|
||||
client.ping().unwrap();
|
||||
client.request_dump(crash_context).is_ok()
|
||||
} else {
|
||||
|
@ -87,7 +88,7 @@ pub async fn init(id: String) {
|
|||
{
|
||||
handler.set_ptracer(Some(server_pid));
|
||||
}
|
||||
CRASH_HANDLER.store(true, Ordering::Release);
|
||||
CRASH_HANDLER.set(client.clone()).ok();
|
||||
std::mem::forget(handler);
|
||||
info!("crash handler registered");
|
||||
|
||||
|
@ -98,14 +99,43 @@ pub async fn init(id: String) {
|
|||
}
|
||||
|
||||
pub struct CrashServer {
|
||||
session_id: OnceLock<String>,
|
||||
initialization_params: OnceLock<InitCrashHandler>,
|
||||
panic_info: OnceLock<CrashPanic>,
|
||||
has_connection: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize, Clone)]
|
||||
pub struct CrashInfo {
|
||||
pub init: InitCrashHandler,
|
||||
pub panic: Option<CrashPanic>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize, Clone)]
|
||||
pub struct InitCrashHandler {
|
||||
pub session_id: String,
|
||||
pub zed_version: String,
|
||||
pub release_channel: String,
|
||||
pub commit_sha: String,
|
||||
// pub gpu: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||
pub struct CrashPanic {
|
||||
pub message: String,
|
||||
pub span: String,
|
||||
}
|
||||
|
||||
impl minidumper::ServerHandler for CrashServer {
|
||||
fn create_minidump_file(&self) -> Result<(File, PathBuf), io::Error> {
|
||||
let err_message = "Need to send a message with the ID upon starting the crash handler";
|
||||
let err_message = "Missing initialization data";
|
||||
let dump_path = paths::logs_dir()
|
||||
.join(self.session_id.get().expect(err_message))
|
||||
.join(
|
||||
&self
|
||||
.initialization_params
|
||||
.get()
|
||||
.expect(err_message)
|
||||
.session_id,
|
||||
)
|
||||
.with_extension("dmp");
|
||||
let file = File::create(&dump_path)?;
|
||||
Ok((file, dump_path))
|
||||
|
@ -122,38 +152,71 @@ impl minidumper::ServerHandler for CrashServer {
|
|||
info!("failed to write minidump: {:#}", e);
|
||||
}
|
||||
}
|
||||
|
||||
let crash_info = CrashInfo {
|
||||
init: self
|
||||
.initialization_params
|
||||
.get()
|
||||
.expect("not initialized")
|
||||
.clone(),
|
||||
panic: self.panic_info.get().cloned(),
|
||||
};
|
||||
|
||||
let crash_data_path = paths::logs_dir()
|
||||
.join(&crash_info.init.session_id)
|
||||
.with_extension("json");
|
||||
|
||||
fs::write(crash_data_path, serde_json::to_vec(&crash_info).unwrap()).ok();
|
||||
|
||||
LoopAction::Exit
|
||||
}
|
||||
|
||||
fn on_message(&self, kind: u32, buffer: Vec<u8>) {
|
||||
let message = String::from_utf8(buffer).expect("invalid utf-8");
|
||||
info!("kind: {kind}, message: {message}",);
|
||||
if kind == 1 {
|
||||
self.session_id
|
||||
.set(message)
|
||||
.expect("session id already initialized");
|
||||
match kind {
|
||||
1 => {
|
||||
let init_data =
|
||||
serde_json::from_slice::<InitCrashHandler>(&buffer).expect("invalid init data");
|
||||
self.initialization_params
|
||||
.set(init_data)
|
||||
.expect("already initialized");
|
||||
}
|
||||
2 => {
|
||||
let panic_data =
|
||||
serde_json::from_slice::<CrashPanic>(&buffer).expect("invalid panic data");
|
||||
self.panic_info.set(panic_data).expect("already panicked");
|
||||
}
|
||||
_ => {
|
||||
panic!("invalid message kind");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn on_client_disconnected(&self, clients: usize) -> LoopAction {
|
||||
info!("client disconnected, {clients} remaining");
|
||||
if clients == 0 {
|
||||
LoopAction::Exit
|
||||
} else {
|
||||
LoopAction::Continue
|
||||
}
|
||||
fn on_client_disconnected(&self, _clients: usize) -> LoopAction {
|
||||
LoopAction::Exit
|
||||
}
|
||||
|
||||
fn on_client_connected(&self, _clients: usize) -> LoopAction {
|
||||
self.has_connection.store(true, Ordering::SeqCst);
|
||||
LoopAction::Continue
|
||||
}
|
||||
}
|
||||
|
||||
pub fn handle_panic() {
|
||||
if !*GENERATE_MINIDUMPS {
|
||||
return;
|
||||
}
|
||||
pub fn handle_panic(message: String, span: Option<&Location>) {
|
||||
let span = span
|
||||
.map(|loc| format!("{}:{}", loc.file(), loc.line()))
|
||||
.unwrap_or_default();
|
||||
|
||||
// wait 500ms for the crash handler process to start up
|
||||
// if it's still not there just write panic info and no minidump
|
||||
let retry_frequency = Duration::from_millis(100);
|
||||
for _ in 0..5 {
|
||||
if CRASH_HANDLER.load(Ordering::Acquire) {
|
||||
if let Some(client) = CRASH_HANDLER.get() {
|
||||
client
|
||||
.send_message(
|
||||
2,
|
||||
serde_json::to_vec(&CrashPanic { message, span }).unwrap(),
|
||||
)
|
||||
.ok();
|
||||
log::error!("triggering a crash to generate a minidump...");
|
||||
#[cfg(target_os = "linux")]
|
||||
CrashHandler.simulate_signal(crash_handler::Signal::Trap as u32);
|
||||
|
@ -170,14 +233,30 @@ pub fn crash_server(socket: &Path) {
|
|||
log::info!("Couldn't create socket, there may already be a running crash server");
|
||||
return;
|
||||
};
|
||||
let ab = AtomicBool::new(false);
|
||||
|
||||
let shutdown = Arc::new(AtomicBool::new(false));
|
||||
let has_connection = Arc::new(AtomicBool::new(false));
|
||||
|
||||
std::thread::spawn({
|
||||
let shutdown = shutdown.clone();
|
||||
let has_connection = has_connection.clone();
|
||||
move || {
|
||||
std::thread::sleep(CRASH_HANDLER_CONNECT_TIMEOUT);
|
||||
if !has_connection.load(Ordering::SeqCst) {
|
||||
shutdown.store(true, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
server
|
||||
.run(
|
||||
Box::new(CrashServer {
|
||||
session_id: OnceLock::new(),
|
||||
initialization_params: OnceLock::new(),
|
||||
panic_info: OnceLock::new(),
|
||||
has_connection,
|
||||
}),
|
||||
&ab,
|
||||
Some(CRASH_HANDLER_TIMEOUT),
|
||||
&shutdown,
|
||||
Some(CRASH_HANDLER_PING_TIMEOUT),
|
||||
)
|
||||
.expect("failed to run server");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue