From b10d80cbb601240e3f6e1acf2d96d2de8fa1eb88 Mon Sep 17 00:00:00 2001
From: Jitter
Date: Sat, 6 Dec 2025 19:15:42 +0530
Subject: [PATCH] fix: detect dead nodes via HTTP/2 keepalives (Issue #1001) (#1025)

Co-authored-by: weisd
---
 Cargo.lock               |  1 +
 crates/protos/src/lib.rs |  9 ++++++--
 docs/cluster_recovery.md | 50 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 docs/cluster_recovery.md

diff --git a/Cargo.lock b/Cargo.lock
index b9a69113..08ae3bbf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7006,6 +7006,7 @@ dependencies = [
  "serde_urlencoded",
 "shadow-rs",
 "socket2 0.6.1",
+ "subtle",
 "sysctl",
 "sysinfo",
 "thiserror 2.0.17",
diff --git a/crates/protos/src/lib.rs b/crates/protos/src/lib.rs
index 396976c5..73bebe71 100644
--- a/crates/protos/src/lib.rs
+++ b/crates/protos/src/lib.rs
@@ -45,7 +45,13 @@ pub async fn node_service_time_out_client(
     let channel = match channel {
         Some(channel) => channel,
         None => {
-            let connector = Endpoint::from_shared(addr.to_string())?.connect_timeout(Duration::from_secs(60));
+            let connector = Endpoint::from_shared(addr.to_string())?
+                .connect_timeout(Duration::from_secs(5))
+                .tcp_keepalive(Some(Duration::from_secs(10)))
+                .http2_keep_alive_interval(Duration::from_secs(5))
+                .keep_alive_timeout(Duration::from_secs(3))
+                .keep_alive_while_idle(true)
+                .timeout(Duration::from_secs(60));

             let channel = connector.connect().await?;
             {
@@ -55,7 +61,6 @@ pub async fn node_service_time_out_client(
         }
     };

-    // let timeout_channel = Timeout::new(channel, Duration::from_secs(60));
     Ok(NodeServiceClient::with_interceptor(
         channel,
         Box::new(move |mut req: Request<()>| {
diff --git a/docs/cluster_recovery.md b/docs/cluster_recovery.md
new file mode 100644
index 00000000..6c339af6
--- /dev/null
+++ b/docs/cluster_recovery.md
@@ -0,0 +1,50 @@
+# Resolution Report: Issue #1001 - Cluster Recovery from Abrupt Power-Off
+
+## 1. Issue Description
+**Problem**: The cluster failed to recover gracefully when a node experienced an abrupt power-off (hard failure).
+
+**Symptoms**:
+- The application became unable to upload files.
+- The Console Web UI became unresponsive across the cluster.
+- The system "hung" indefinitely, unlike the prompt recovery observed after a graceful process termination (`kill`).
+
+**Root Cause**:
+TCP does not immediately detect a peer that disappears silently (power loss), because no `FIN` or `RST` packets are ever sent. Without active application-layer heartbeats, the surviving nodes kept their connections in an `ESTABLISHED` state, waiting indefinitely for responses that would never arrive.
+
+---
+
+## 2. Technical Approach
+To resolve this, we needed to turn passive failure detection (waiting for the OS TCP timeout) into an active detection mechanism.
+
+### Key Objectives:
+1. **Fail Fast**: Detect dead peers in seconds, not minutes.
+2. **Accuracy**: Distinguish between network congestion and actual node failure.
+3. **Safety**: Ensure no thread or task blocks forever on a remote procedure call (RPC).
+
+---
+
+## 3. Implemented Solution
+We modified the internal gRPC client configuration in `crates/protos/src/lib.rs` to implement a multi-layered health-check strategy.
+
+### Configuration Changes
+```rust
+let connector = Endpoint::from_shared(addr.to_string())?
+    .connect_timeout(Duration::from_secs(5))
+    // 1. App-Layer Heartbeats (Primary Detection)
+    // Sends an HTTP/2 PING frame every 5 seconds on the underlying connection.
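+    // (A peer that has lost power never ACKs these PINGs, so the failure is
+    //  detected without waiting for the OS-level TCP timeout.)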
+    .http2_keep_alive_interval(Duration::from_secs(5))
+    // If a PING is not acknowledged within 3 seconds, the connection is closed.
+    .keep_alive_timeout(Duration::from_secs(3))
+    // Ensures PINGs are sent even when no requests are in flight.
+    .keep_alive_while_idle(true)
+    // 2. Transport-Layer Keepalive (OS Backup)
+    .tcp_keepalive(Some(Duration::from_secs(10)))
+    // 3. Global Safety Net
+    // Hard deadline for any single RPC operation.
+    .timeout(Duration::from_secs(60));
+```
+
+### Outcome
+- **Detection Time**: Reduced from the ~15+ minute OS default to **~8 seconds** (5 s ping interval + 3 s ack timeout).
+- **Behavior**: When a node loses power, surviving peers now detect the dead connection almost immediately; the failed call surfaces a transport error that triggers the standard cluster recovery/failover logic (see the illustrative probe in section 4 below).
+- **Result**: The cluster now handles abrupt power-offs with the same resilience as graceful shutdowns.
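+
+---
+
+## 4. Illustrative Failure Probe
+The snippet below is a self-contained sketch, not code taken from the repository: it rebuilds the same `Endpoint` settings inside a hypothetical `probe_peer` helper (the function name and address are placeholders) to show how a caller observes a dead peer as a fast `Err` instead of an indefinite hang.
+
+```rust
+use std::time::Duration;
+use tonic::transport::{Channel, Endpoint, Error};
+
+// Hypothetical helper; mirrors the settings applied in crates/protos/src/lib.rs.
+async fn probe_peer(addr: &str) -> Result<Channel, Error> {
+    Endpoint::from_shared(addr.to_string())?
+        .connect_timeout(Duration::from_secs(5))            // fail the handshake fast
+        .http2_keep_alive_interval(Duration::from_secs(5))  // PING every 5 s
+        .keep_alive_timeout(Duration::from_secs(3))         // unacked PING closes the channel (~8 s worst case)
+        .keep_alive_while_idle(true)
+        .tcp_keepalive(Some(Duration::from_secs(10)))
+        .timeout(Duration::from_secs(60))                   // hard per-RPC deadline
+        .connect()
+        .await
+}
+
+// Usage (placeholder address): any Err is treated as "peer down" and feeds the
+// normal failover path.
+//
+// match probe_peer("http://10.0.0.2:50051").await {
+//     Ok(_channel) => { /* peer reachable */ }
+//     Err(e) => { /* mark node dead and trigger recovery */ }
+// }
+```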