remote server: Do not spawn server when proxy reconnects (#18864)
This ensures that we only ever reconnect to a running server and never spawn a new server with no state. It avoids the scenario where the server process crashes, `proxy` reconnects and starts a new server, and the user gets errors like "unknown buffer id: ...".

Release Notes:

- N/A

---------

Co-authored-by: Bennet Bo Fenner <bennet@zed.dev>
This commit is contained in:
parent
dbf986d37a
commit
c674d73734
7 changed files with 248 additions and 91 deletions
|
@ -31,6 +31,7 @@ serde.workspace = true
|
|||
serde_json.workspace = true
|
||||
smol.workspace = true
|
||||
tempfile.workspace = true
|
||||
thiserror.workspace = true
|
||||
util.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
25
crates/remote/src/proxy.rs
Normal file
25
crates/remote/src/proxy.rs
Normal file
|
@ -0,0 +1,25 @@
|
|||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ProxyLaunchError {
|
||||
#[error("Attempted reconnect, but server not running.")]
|
||||
ServerNotRunning,
|
||||
}
|
||||
|
||||
impl ProxyLaunchError {
|
||||
pub fn to_exit_code(&self) -> i32 {
|
||||
match self {
|
||||
// We're using 90 as the exit code, because 0-78 are often taken
|
||||
// by shells and other conventions and >128 also has certain meanings
|
||||
// in certain contexts.
|
||||
Self::ServerNotRunning => 90,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_exit_code(exit_code: i32) -> Option<Self> {
|
||||
match exit_code {
|
||||
90 => Some(Self::ServerNotRunning),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
pub mod json_log;
|
||||
pub mod protocol;
|
||||
pub mod proxy;
|
||||
pub mod ssh_session;
|
||||
|
||||
pub use ssh_session::{
|
||||
|
|
|
@ -3,6 +3,7 @@ use crate::{
|
|||
protocol::{
|
||||
message_len_from_buffer, read_message_with_len, write_message, MessageId, MESSAGE_LEN_SIZE,
|
||||
},
|
||||
proxy::ProxyLaunchError,
|
||||
};
|
||||
use anyhow::{anyhow, Context as _, Result};
|
||||
use collections::HashMap;
|
||||
|
@ -271,6 +272,7 @@ enum State {
|
|||
attempts: usize,
|
||||
},
|
||||
ReconnectExhausted,
|
||||
ServerNotRunning,
|
||||
}
|
||||
|
||||
impl fmt::Display for State {
|
||||
|
@ -282,6 +284,7 @@ impl fmt::Display for State {
|
|||
Self::ReconnectFailed { .. } => write!(f, "reconnect failed"),
|
||||
Self::ReconnectExhausted => write!(f, "reconnect exhausted"),
|
||||
Self::HeartbeatMissed { .. } => write!(f, "heartbeat missed"),
|
||||
Self::ServerNotRunning { .. } => write!(f, "server not running"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -297,10 +300,23 @@ impl State {
|
|||
}
|
||||
|
||||
fn can_reconnect(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::Connected { .. } | Self::HeartbeatMissed { .. } | Self::ReconnectFailed { .. }
|
||||
)
|
||||
match self {
|
||||
Self::Connected { .. }
|
||||
| Self::HeartbeatMissed { .. }
|
||||
| Self::ReconnectFailed { .. } => true,
|
||||
State::Connecting
|
||||
| State::Reconnecting
|
||||
| State::ReconnectExhausted
|
||||
| State::ServerNotRunning => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_reconnect_failed(&self) -> bool {
|
||||
matches!(self, Self::ReconnectFailed { .. })
|
||||
}
|
||||
|
||||
fn is_reconnecting(&self) -> bool {
|
||||
matches!(self, Self::Reconnecting { .. })
|
||||
}
|
||||
|
||||
fn heartbeat_recovered(self) -> Self {
|
||||
|
@ -377,6 +393,7 @@ impl From<&State> for ConnectionState {
|
|||
State::Reconnecting | State::ReconnectFailed { .. } => Self::Reconnecting,
|
||||
State::HeartbeatMissed { .. } => Self::HeartbeatMissed,
|
||||
State::ReconnectExhausted => Self::Disconnected,
|
||||
State::ServerNotRunning => Self::Disconnected,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -426,6 +443,7 @@ impl SshRemoteClient {
|
|||
|
||||
let (ssh_connection, ssh_proxy_process) = Self::establish_connection(
|
||||
unique_identifier,
|
||||
false,
|
||||
connection_options,
|
||||
delegate.clone(),
|
||||
&mut cx,
|
||||
|
@ -496,6 +514,7 @@ impl SshRemoteClient {
|
|||
} else {
|
||||
"no state set".to_string()
|
||||
};
|
||||
log::info!("aborting reconnect, because not in state that allows reconnecting");
|
||||
return Err(anyhow!(error));
|
||||
}
|
||||
|
||||
|
@ -527,7 +546,10 @@ impl SshRemoteClient {
|
|||
forwarder,
|
||||
..
|
||||
} => (attempts, ssh_connection, delegate, forwarder),
|
||||
State::Connecting | State::Reconnecting | State::ReconnectExhausted => unreachable!(),
|
||||
State::Connecting
|
||||
| State::Reconnecting
|
||||
| State::ReconnectExhausted
|
||||
| State::ServerNotRunning => unreachable!(),
|
||||
};
|
||||
|
||||
let attempts = attempts + 1;
|
||||
|
@ -536,11 +558,12 @@ impl SshRemoteClient {
|
|||
"Failed to reconnect to after {} attempts, giving up",
|
||||
MAX_RECONNECT_ATTEMPTS
|
||||
);
|
||||
*lock = Some(State::ReconnectExhausted);
|
||||
drop(lock);
|
||||
self.set_state(State::ReconnectExhausted, cx);
|
||||
return Ok(());
|
||||
}
|
||||
*lock = Some(State::Reconnecting);
|
||||
drop(lock);
|
||||
self.set_state(State::Reconnecting, cx);
|
||||
|
||||
log::info!("Trying to reconnect to ssh server... Attempt {}", attempts);
|
||||
|
||||
|
@ -580,6 +603,7 @@ impl SshRemoteClient {
|
|||
|
||||
let (ssh_connection, ssh_process) = match Self::establish_connection(
|
||||
identifier,
|
||||
true,
|
||||
connection_options,
|
||||
delegate.clone(),
|
||||
&mut cx,
|
||||
|
@ -616,33 +640,39 @@ impl SshRemoteClient {
|
|||
cx.spawn(|this, mut cx| async move {
|
||||
let new_state = reconnect_task.await;
|
||||
this.update(&mut cx, |this, cx| {
|
||||
match &new_state {
|
||||
State::Connecting
|
||||
| State::Reconnecting { .. }
|
||||
| State::HeartbeatMissed { .. } => {}
|
||||
State::Connected { .. } => {
|
||||
log::info!("Successfully reconnected");
|
||||
this.try_set_state(cx, |old_state| {
|
||||
if old_state.is_reconnecting() {
|
||||
match &new_state {
|
||||
State::Connecting
|
||||
| State::Reconnecting { .. }
|
||||
| State::HeartbeatMissed { .. }
|
||||
| State::ServerNotRunning => {}
|
||||
State::Connected { .. } => {
|
||||
log::info!("Successfully reconnected");
|
||||
}
|
||||
State::ReconnectFailed {
|
||||
error, attempts, ..
|
||||
} => {
|
||||
log::error!(
|
||||
"Reconnect attempt {} failed: {:?}. Starting new attempt...",
|
||||
attempts,
|
||||
error
|
||||
);
|
||||
}
|
||||
State::ReconnectExhausted => {
|
||||
log::error!("Reconnect attempt failed and all attempts exhausted");
|
||||
}
|
||||
}
|
||||
Some(new_state)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
State::ReconnectFailed {
|
||||
error, attempts, ..
|
||||
} => {
|
||||
log::error!(
|
||||
"Reconnect attempt {} failed: {:?}. Starting new attempt...",
|
||||
attempts,
|
||||
error
|
||||
);
|
||||
}
|
||||
State::ReconnectExhausted => {
|
||||
log::error!("Reconnect attempt failed and all attempts exhausted");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let reconnect_failed = matches!(new_state, State::ReconnectFailed { .. });
|
||||
*this.state.lock() = Some(new_state);
|
||||
cx.notify();
|
||||
if reconnect_failed {
|
||||
if this.state_is(State::is_reconnect_failed) {
|
||||
this.reconnect(cx)
|
||||
} else {
|
||||
log::debug!("State has transition from Reconnecting into new state while attempting reconnect. Ignoring new state.");
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
|
@ -676,8 +706,10 @@ impl SshRemoteClient {
|
|||
missed_heartbeats,
|
||||
MAX_MISSED_HEARTBEATS
|
||||
);
|
||||
} else {
|
||||
} else if missed_heartbeats != 0 {
|
||||
missed_heartbeats = 0;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
let result = this.update(&mut cx, |this, mut cx| {
|
||||
|
@ -697,12 +729,12 @@ impl SshRemoteClient {
|
|||
cx: &mut ModelContext<Self>,
|
||||
) -> ControlFlow<()> {
|
||||
let state = self.state.lock().take().unwrap();
|
||||
self.state.lock().replace(if missed_heartbeats > 0 {
|
||||
let next_state = if missed_heartbeats > 0 {
|
||||
state.heartbeat_missed()
|
||||
} else {
|
||||
state.heartbeat_recovered()
|
||||
});
|
||||
cx.notify();
|
||||
};
|
||||
self.set_state(next_state, cx);
|
||||
|
||||
if missed_heartbeats >= MAX_MISSED_HEARTBEATS {
|
||||
log::error!(
|
||||
|
@ -743,7 +775,7 @@ impl SshRemoteClient {
|
|||
select_biased! {
|
||||
outgoing = outgoing_rx.next().fuse() => {
|
||||
let Some(outgoing) = outgoing else {
|
||||
return anyhow::Ok(());
|
||||
return anyhow::Ok(None);
|
||||
};
|
||||
|
||||
write_message(&mut child_stdin, &mut stdin_buffer, outgoing).await?;
|
||||
|
@ -755,11 +787,7 @@ impl SshRemoteClient {
|
|||
child_stdin.close().await?;
|
||||
outgoing_rx.close();
|
||||
let status = ssh_proxy_process.status().await?;
|
||||
if !status.success() {
|
||||
log::error!("ssh process exited with status: {status:?}");
|
||||
return Err(anyhow!("ssh process exited with non-zero status code: {:?}", status.code()));
|
||||
}
|
||||
return Ok(());
|
||||
return Ok(status.code());
|
||||
}
|
||||
Ok(len) => {
|
||||
if len < stdout_buffer.len() {
|
||||
|
@ -813,19 +841,56 @@ impl SshRemoteClient {
|
|||
cx.spawn(|mut cx| async move {
|
||||
let result = io_task.await;
|
||||
|
||||
if let Err(error) = result {
|
||||
log::warn!("ssh io task died with error: {:?}. reconnecting...", error);
|
||||
this.update(&mut cx, |this, cx| {
|
||||
this.reconnect(cx).ok();
|
||||
})?;
|
||||
match result {
|
||||
Ok(Some(exit_code)) => {
|
||||
if let Some(error) = ProxyLaunchError::from_exit_code(exit_code) {
|
||||
match error {
|
||||
ProxyLaunchError::ServerNotRunning => {
|
||||
log::error!("failed to reconnect because server is not running");
|
||||
this.update(&mut cx, |this, cx| {
|
||||
this.set_state(State::ServerNotRunning, cx);
|
||||
})?;
|
||||
}
|
||||
}
|
||||
} else if exit_code > 0 {
|
||||
log::error!("proxy process terminated unexpectedly");
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(error) => {
|
||||
log::warn!("ssh io task died with error: {:?}. reconnecting...", error);
|
||||
this.update(&mut cx, |this, cx| {
|
||||
this.reconnect(cx).ok();
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
/// Runs `check` against the current state and returns its answer.
///
/// Returns `false` when no state is currently stored. The state lock is
/// held while `check` runs, so `check` must not lock the state itself.
fn state_is(&self, check: impl FnOnce(&State) -> bool) -> bool {
    let guard = self.state.lock();
    match &*guard {
        Some(current) => check(current),
        None => false,
    }
}
|
||||
|
||||
fn try_set_state(
|
||||
&self,
|
||||
cx: &mut ModelContext<Self>,
|
||||
map: impl FnOnce(&State) -> Option<State>,
|
||||
) {
|
||||
if let Some(new_state) = self.state.lock().as_ref().and_then(map) {
|
||||
self.set_state(new_state, cx);
|
||||
}
|
||||
}
|
||||
|
||||
fn set_state(&self, state: State, cx: &mut ModelContext<Self>) {
|
||||
log::info!("setting state to '{}'", &state);
|
||||
self.state.lock().replace(state);
|
||||
cx.notify();
|
||||
}
|
||||
|
||||
async fn establish_connection(
|
||||
unique_identifier: String,
|
||||
reconnect: bool,
|
||||
connection_options: SshConnectionOptions,
|
||||
delegate: Arc<dyn SshClientDelegate>,
|
||||
cx: &mut AsyncAppContext,
|
||||
|
@ -851,14 +916,19 @@ impl SshRemoteClient {
|
|||
|
||||
delegate.set_status(Some("Starting proxy"), cx);
|
||||
|
||||
let mut start_proxy_command = format!(
|
||||
"RUST_LOG={} RUST_BACKTRACE={} {:?} proxy --identifier {}",
|
||||
std::env::var("RUST_LOG").unwrap_or_default(),
|
||||
std::env::var("RUST_BACKTRACE").unwrap_or_default(),
|
||||
remote_binary_path,
|
||||
unique_identifier,
|
||||
);
|
||||
if reconnect {
|
||||
start_proxy_command.push_str(" --reconnect");
|
||||
}
|
||||
|
||||
let ssh_proxy_process = socket
|
||||
.ssh_command(format!(
|
||||
"RUST_LOG={} RUST_BACKTRACE={} {:?} proxy --identifier {}",
|
||||
std::env::var("RUST_LOG").unwrap_or_default(),
|
||||
std::env::var("RUST_BACKTRACE").unwrap_or_default(),
|
||||
remote_binary_path,
|
||||
unique_identifier,
|
||||
))
|
||||
.ssh_command(start_proxy_command)
|
||||
// IMPORTANT: we kill this process when we drop the task that uses it.
|
||||
.kill_on_drop(true)
|
||||
.spawn()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue