remote server: Do not spawn server when proxy reconnects (#18864)

This ensures that we only ever reconnect to a server that is already
running and never spawn a new server with no state.

This avoids the problem of the server process crashing, `proxy`
reconnecting, starting a new server, and the user getting errors like
"unknown buffer id: ...".

Release Notes:

- N/A

---------

Co-authored-by: Bennet Bo Fenner <bennet@zed.dev>
This commit is contained in:
Thorsten Ball 2024-10-09 16:51:12 +02:00 committed by GitHub
parent dbf986d37a
commit c674d73734
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 248 additions and 91 deletions

View file

@ -0,0 +1,25 @@
use thiserror::Error;
/// Proxy launch failures that are identified by a dedicated process exit
/// code (see `to_exit_code` / `from_exit_code`).
#[derive(Error, Debug)]
pub enum ProxyLaunchError {
/// A reconnect was attempted, but the server was not running.
#[error("Attempted reconnect, but server not running.")]
ServerNotRunning,
}
impl ProxyLaunchError {
    /// Exit code that stands for [`Self::ServerNotRunning`].
    ///
    /// We're using 90 because 0-78 are often taken by shells and other
    /// conventions, and >128 also has certain meanings in certain contexts.
    /// Keeping the value in one place guarantees that `to_exit_code` and
    /// `from_exit_code` stay inverses of each other.
    const SERVER_NOT_RUNNING_EXIT_CODE: i32 = 90;

    /// Returns the process exit code that represents this error.
    pub fn to_exit_code(&self) -> i32 {
        match self {
            Self::ServerNotRunning => Self::SERVER_NOT_RUNNING_EXIT_CODE,
        }
    }

    /// Maps a process exit code back to the error it represents.
    ///
    /// Returns `None` for every exit code that does not correspond to a
    /// variant of this enum.
    pub fn from_exit_code(exit_code: i32) -> Option<Self> {
        (exit_code == Self::SERVER_NOT_RUNNING_EXIT_CODE).then_some(Self::ServerNotRunning)
    }
}

View file

@ -1,5 +1,6 @@
pub mod json_log;
pub mod protocol;
pub mod proxy;
pub mod ssh_session;
pub use ssh_session::{

View file

@ -3,6 +3,7 @@ use crate::{
protocol::{
message_len_from_buffer, read_message_with_len, write_message, MessageId, MESSAGE_LEN_SIZE,
},
proxy::ProxyLaunchError,
};
use anyhow::{anyhow, Context as _, Result};
use collections::HashMap;
@ -271,6 +272,7 @@ enum State {
attempts: usize,
},
ReconnectExhausted,
ServerNotRunning,
}
impl fmt::Display for State {
@ -282,6 +284,7 @@ impl fmt::Display for State {
Self::ReconnectFailed { .. } => write!(f, "reconnect failed"),
Self::ReconnectExhausted => write!(f, "reconnect exhausted"),
Self::HeartbeatMissed { .. } => write!(f, "heartbeat missed"),
Self::ServerNotRunning { .. } => write!(f, "server not running"),
}
}
}
@ -297,10 +300,23 @@ impl State {
}
/// Returns `true` when a reconnect may be started from this state.
///
/// Only `Connected`, `HeartbeatMissed` and `ReconnectFailed` allow it; every
/// other state answers `false`.
///
/// This is an exhaustive `match` (no wildcard arm) so that adding a variant
/// to `State` forces an explicit decision here at compile time.
fn can_reconnect(&self) -> bool {
    match self {
        Self::Connected { .. }
        | Self::HeartbeatMissed { .. }
        | Self::ReconnectFailed { .. } => true,
        Self::Connecting
        | Self::Reconnecting
        | Self::ReconnectExhausted
        | Self::ServerNotRunning => false,
    }
}
/// Returns `true` if this state is `ReconnectFailed`, regardless of its fields.
fn is_reconnect_failed(&self) -> bool {
matches!(self, Self::ReconnectFailed { .. })
}
/// Returns `true` if this state is `Reconnecting`.
fn is_reconnecting(&self) -> bool {
matches!(self, Self::Reconnecting { .. })
}
fn heartbeat_recovered(self) -> Self {
@ -377,6 +393,7 @@ impl From<&State> for ConnectionState {
State::Reconnecting | State::ReconnectFailed { .. } => Self::Reconnecting,
State::HeartbeatMissed { .. } => Self::HeartbeatMissed,
State::ReconnectExhausted => Self::Disconnected,
State::ServerNotRunning => Self::Disconnected,
}
}
}
@ -426,6 +443,7 @@ impl SshRemoteClient {
let (ssh_connection, ssh_proxy_process) = Self::establish_connection(
unique_identifier,
false,
connection_options,
delegate.clone(),
&mut cx,
@ -496,6 +514,7 @@ impl SshRemoteClient {
} else {
"no state set".to_string()
};
log::info!("aborting reconnect, because not in state that allows reconnecting");
return Err(anyhow!(error));
}
@ -527,7 +546,10 @@ impl SshRemoteClient {
forwarder,
..
} => (attempts, ssh_connection, delegate, forwarder),
State::Connecting | State::Reconnecting | State::ReconnectExhausted => unreachable!(),
State::Connecting
| State::Reconnecting
| State::ReconnectExhausted
| State::ServerNotRunning => unreachable!(),
};
let attempts = attempts + 1;
@ -536,11 +558,12 @@ impl SshRemoteClient {
"Failed to reconnect to after {} attempts, giving up",
MAX_RECONNECT_ATTEMPTS
);
*lock = Some(State::ReconnectExhausted);
drop(lock);
self.set_state(State::ReconnectExhausted, cx);
return Ok(());
}
*lock = Some(State::Reconnecting);
drop(lock);
self.set_state(State::Reconnecting, cx);
log::info!("Trying to reconnect to ssh server... Attempt {}", attempts);
@ -580,6 +603,7 @@ impl SshRemoteClient {
let (ssh_connection, ssh_process) = match Self::establish_connection(
identifier,
true,
connection_options,
delegate.clone(),
&mut cx,
@ -616,33 +640,39 @@ impl SshRemoteClient {
cx.spawn(|this, mut cx| async move {
let new_state = reconnect_task.await;
this.update(&mut cx, |this, cx| {
match &new_state {
State::Connecting
| State::Reconnecting { .. }
| State::HeartbeatMissed { .. } => {}
State::Connected { .. } => {
log::info!("Successfully reconnected");
this.try_set_state(cx, |old_state| {
if old_state.is_reconnecting() {
match &new_state {
State::Connecting
| State::Reconnecting { .. }
| State::HeartbeatMissed { .. }
| State::ServerNotRunning => {}
State::Connected { .. } => {
log::info!("Successfully reconnected");
}
State::ReconnectFailed {
error, attempts, ..
} => {
log::error!(
"Reconnect attempt {} failed: {:?}. Starting new attempt...",
attempts,
error
);
}
State::ReconnectExhausted => {
log::error!("Reconnect attempt failed and all attempts exhausted");
}
}
Some(new_state)
} else {
None
}
State::ReconnectFailed {
error, attempts, ..
} => {
log::error!(
"Reconnect attempt {} failed: {:?}. Starting new attempt...",
attempts,
error
);
}
State::ReconnectExhausted => {
log::error!("Reconnect attempt failed and all attempts exhausted");
}
}
});
let reconnect_failed = matches!(new_state, State::ReconnectFailed { .. });
*this.state.lock() = Some(new_state);
cx.notify();
if reconnect_failed {
if this.state_is(State::is_reconnect_failed) {
this.reconnect(cx)
} else {
log::debug!("State has transition from Reconnecting into new state while attempting reconnect. Ignoring new state.");
Ok(())
}
})
@ -676,8 +706,10 @@ impl SshRemoteClient {
missed_heartbeats,
MAX_MISSED_HEARTBEATS
);
} else {
} else if missed_heartbeats != 0 {
missed_heartbeats = 0;
} else {
continue;
}
let result = this.update(&mut cx, |this, mut cx| {
@ -697,12 +729,12 @@ impl SshRemoteClient {
cx: &mut ModelContext<Self>,
) -> ControlFlow<()> {
let state = self.state.lock().take().unwrap();
self.state.lock().replace(if missed_heartbeats > 0 {
let next_state = if missed_heartbeats > 0 {
state.heartbeat_missed()
} else {
state.heartbeat_recovered()
});
cx.notify();
};
self.set_state(next_state, cx);
if missed_heartbeats >= MAX_MISSED_HEARTBEATS {
log::error!(
@ -743,7 +775,7 @@ impl SshRemoteClient {
select_biased! {
outgoing = outgoing_rx.next().fuse() => {
let Some(outgoing) = outgoing else {
return anyhow::Ok(());
return anyhow::Ok(None);
};
write_message(&mut child_stdin, &mut stdin_buffer, outgoing).await?;
@ -755,11 +787,7 @@ impl SshRemoteClient {
child_stdin.close().await?;
outgoing_rx.close();
let status = ssh_proxy_process.status().await?;
if !status.success() {
log::error!("ssh process exited with status: {status:?}");
return Err(anyhow!("ssh process exited with non-zero status code: {:?}", status.code()));
}
return Ok(());
return Ok(status.code());
}
Ok(len) => {
if len < stdout_buffer.len() {
@ -813,19 +841,56 @@ impl SshRemoteClient {
cx.spawn(|mut cx| async move {
let result = io_task.await;
if let Err(error) = result {
log::warn!("ssh io task died with error: {:?}. reconnecting...", error);
this.update(&mut cx, |this, cx| {
this.reconnect(cx).ok();
})?;
match result {
Ok(Some(exit_code)) => {
if let Some(error) = ProxyLaunchError::from_exit_code(exit_code) {
match error {
ProxyLaunchError::ServerNotRunning => {
log::error!("failed to reconnect because server is not running");
this.update(&mut cx, |this, cx| {
this.set_state(State::ServerNotRunning, cx);
})?;
}
}
} else if exit_code > 0 {
log::error!("proxy process terminated unexpectedly");
}
}
Ok(None) => {}
Err(error) => {
log::warn!("ssh io task died with error: {:?}. reconnecting...", error);
this.update(&mut cx, |this, cx| {
this.reconnect(cx).ok();
})?;
}
}
Ok(())
})
}
/// Runs `check` against the current state and returns its answer.
///
/// Returns `false` without calling `check` when no state is set. The state
/// lock is held while `check` runs.
fn state_is(&self, check: impl FnOnce(&State) -> bool) -> bool {
    let guard = self.state.lock();
    match guard.as_ref() {
        Some(current) => check(current),
        None => false,
    }
}
fn try_set_state(
&self,
cx: &mut ModelContext<Self>,
map: impl FnOnce(&State) -> Option<State>,
) {
if let Some(new_state) = self.state.lock().as_ref().and_then(map) {
self.set_state(new_state, cx);
}
}
/// Unconditionally stores `state` as the current state.
///
/// Logs the new state at info level, overwrites whatever was stored before
/// and then calls `cx.notify()`.
fn set_state(&self, state: State, cx: &mut ModelContext<Self>) {
    log::info!("setting state to '{}'", &state);
    // The guard is a temporary, so the lock is released at the end of this statement.
    *self.state.lock() = Some(state);
    cx.notify();
}
async fn establish_connection(
unique_identifier: String,
reconnect: bool,
connection_options: SshConnectionOptions,
delegate: Arc<dyn SshClientDelegate>,
cx: &mut AsyncAppContext,
@ -851,14 +916,19 @@ impl SshRemoteClient {
delegate.set_status(Some("Starting proxy"), cx);
let mut start_proxy_command = format!(
"RUST_LOG={} RUST_BACKTRACE={} {:?} proxy --identifier {}",
std::env::var("RUST_LOG").unwrap_or_default(),
std::env::var("RUST_BACKTRACE").unwrap_or_default(),
remote_binary_path,
unique_identifier,
);
if reconnect {
start_proxy_command.push_str(" --reconnect");
}
let ssh_proxy_process = socket
.ssh_command(format!(
"RUST_LOG={} RUST_BACKTRACE={} {:?} proxy --identifier {}",
std::env::var("RUST_LOG").unwrap_or_default(),
std::env::var("RUST_BACKTRACE").unwrap_or_default(),
remote_binary_path,
unique_identifier,
))
.ssh_command(start_proxy_command)
// IMPORTANT: we kill this process when we drop the task that uses it.
.kill_on_drop(true)
.spawn()