Slow down reconnects on collab (#32418)

We believe that collab deploys currently cause outages because:

* All clients try to reconnect simultaneously
* This causes very high CPU usage on collab (and to some extent, the
database)
* This means that collab is slow to respond to clients
* So clients timeout and retry, over and over again.

We hope by letting clients in in buckets of 512, we can accept some
minor slowness to avoid
complete downtime, while we rewrite the system.

Release Notes:

- N/A
This commit is contained in:
Conrad Irwin 2025-06-09 19:59:04 -06:00 committed by GitHub
parent 6d64058fc6
commit ee2a329981
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 45 additions and 1 deletions

View file

@ -68,7 +68,7 @@ use std::{
rc::Rc,
sync::{
Arc, OnceLock,
atomic::{AtomicBool, Ordering::SeqCst},
atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst},
},
time::{Duration, Instant},
};
@ -89,10 +89,36 @@ pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
const MESSAGE_COUNT_PER_PAGE: usize = 100;
const MAX_MESSAGE_LEN: usize = 1024;
const NOTIFICATION_COUNT_PER_PAGE: usize = 50;
const MAX_CONCURRENT_CONNECTIONS: usize = 512;
static CONCURRENT_CONNECTIONS: AtomicUsize = AtomicUsize::new(0);
type MessageHandler =
Box<dyn Send + Sync + Fn(Box<dyn AnyTypedEnvelope>, Session) -> BoxFuture<'static, ()>>;
pub struct ConnectionGuard;
impl ConnectionGuard {
pub fn try_acquire() -> Result<Self, ()> {
let current_connections = CONCURRENT_CONNECTIONS.fetch_add(1, SeqCst);
if current_connections >= MAX_CONCURRENT_CONNECTIONS {
CONCURRENT_CONNECTIONS.fetch_sub(1, SeqCst);
tracing::error!(
"too many concurrent connections: {}",
current_connections + 1
);
return Err(());
}
Ok(ConnectionGuard)
}
}
impl Drop for ConnectionGuard {
fn drop(&mut self) {
CONCURRENT_CONNECTIONS.fetch_sub(1, SeqCst);
}
}
struct Response<R> {
peer: Arc<Peer>,
receipt: Receipt<R>,
@ -725,6 +751,7 @@ impl Server {
system_id: Option<String>,
send_connection_id: Option<oneshot::Sender<ConnectionId>>,
executor: Executor,
connection_guard: Option<ConnectionGuard>,
) -> impl Future<Output = ()> + use<> {
let this = self.clone();
let span = info_span!("handle connection", %address,
@ -745,6 +772,7 @@ impl Server {
tracing::error!("server is tearing down");
return
}
let (connection_id, handle_io, mut incoming_rx) = this
.peer
.add_connection(connection, {
@ -786,6 +814,7 @@ impl Server {
tracing::error!(?error, "failed to send initial client update");
return;
}
drop(connection_guard);
let handle_io = handle_io.fuse();
futures::pin_mut!(handle_io);
@ -1157,6 +1186,19 @@ pub async fn handle_websocket_request(
}
let socket_address = socket_address.to_string();
// Acquire connection guard before WebSocket upgrade
let connection_guard = match ConnectionGuard::try_acquire() {
Ok(guard) => guard,
Err(()) => {
return (
StatusCode::SERVICE_UNAVAILABLE,
"Too many concurrent connections",
)
.into_response();
}
};
ws.on_upgrade(move |socket| {
let socket = socket
.map_ok(to_tungstenite_message)
@ -1174,6 +1216,7 @@ pub async fn handle_websocket_request(
system_id_header.map(|header| header.to_string()),
None,
Executor::Production,
Some(connection_guard),
)
.await;
}

View file

@ -258,6 +258,7 @@ impl TestServer {
None,
Some(connection_id_tx),
Executor::Deterministic(cx.background_executor().clone()),
None,
))
.detach();
let connection_id = connection_id_rx.await.map_err(|e| {