From b10d80cbb601240e3f6e1acf2d96d2de8fa1eb88 Mon Sep 17 00:00:00 2001
From: Jitter
Date: Sat, 6 Dec 2025 19:15:42 +0530
Subject: [PATCH] fix: detect dead nodes via HTTP/2 keepalives (Issue #1001) (#1025)

Co-authored-by: weisd
---
 Cargo.lock               |  1 +
 crates/protos/src/lib.rs |  9 ++++++--
 docs/cluster_recovery.md | 50 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 docs/cluster_recovery.md

diff --git a/Cargo.lock b/Cargo.lock
index b9a69113..08ae3bbf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7006,6 +7006,7 @@ dependencies = [
  "serde_urlencoded",
 "shadow-rs",
 "socket2 0.6.1",
+ "subtle",
 "sysctl",
 "sysinfo",
 "thiserror 2.0.17",
diff --git a/crates/protos/src/lib.rs b/crates/protos/src/lib.rs
index 396976c5..73bebe71 100644
--- a/crates/protos/src/lib.rs
+++ b/crates/protos/src/lib.rs
@@ -45,7 +45,13 @@ pub async fn node_service_time_out_client(
     let channel = match channel {
         Some(channel) => channel,
         None => {
-            let connector = Endpoint::from_shared(addr.to_string())?.connect_timeout(Duration::from_secs(60));
+            let connector = Endpoint::from_shared(addr.to_string())?
+                .connect_timeout(Duration::from_secs(5))
+                .tcp_keepalive(Some(Duration::from_secs(10)))
+                .http2_keep_alive_interval(Duration::from_secs(5))
+                .keep_alive_timeout(Duration::from_secs(3))
+                .keep_alive_while_idle(true)
+                .timeout(Duration::from_secs(60));

             let channel = connector.connect().await?;
             {
@@ -55,7 +61,6 @@ pub async fn node_service_time_out_client(
         }
     };

-    // let timeout_channel = Timeout::new(channel, Duration::from_secs(60));
     Ok(NodeServiceClient::with_interceptor(
         channel,
         Box::new(move |mut req: Request<()>| {
diff --git a/docs/cluster_recovery.md b/docs/cluster_recovery.md
new file mode 100644
index 00000000..6c339af6
--- /dev/null
+++ b/docs/cluster_recovery.md
@@ -0,0 +1,50 @@
+# Resolution Report: Issue #1001 - Cluster Recovery from Abrupt Power-Off
+
+## 1. Issue Description
+**Problem**: The cluster failed to recover gracefully when a node experienced an abrupt power-off (hard failure).
+
+**Symptoms**:
+- The application became unable to upload files.
+- The Console Web UI became unresponsive across the cluster.
+- The system "hung" indefinitely, unlike the prompt recovery observed after a graceful process termination (`kill`).
+
+**Root Cause**:
+TCP does not immediately detect a peer that disappears silently (power loss), because no `FIN` or `RST` packets are ever sent. Without active application-layer heartbeats, the surviving nodes kept their connections in an `ESTABLISHED` state, waiting indefinitely for responses that would never arrive.
+
+---
+
+## 2. Technical Approach
+To resolve this, we needed to turn passive failure detection (waiting for the OS TCP timeout) into an active detection mechanism.
+
+### Key Objectives:
+1. **Fail Fast**: Detect dead peers in seconds, not minutes.
+2. **Accuracy**: Distinguish between network congestion and actual node failure.
+3. **Safety**: Ensure no thread or task blocks forever on a remote procedure call (RPC).
+
+---
+
+## 3. Implemented Solution
+We modified the internal gRPC client configuration in `crates/protos/src/lib.rs` to implement a multi-layered health-check strategy.
+
+### Configuration Changes
+```rust
+let connector = Endpoint::from_shared(addr.to_string())?
+    .connect_timeout(Duration::from_secs(5))
+    // 1. App-Layer Heartbeats (Primary Detection)
+    // Sends an HTTP/2 PING frame every 5 seconds on the underlying connection.
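+    // (A peer that has lost power never ACKs these PINGs, so the failure is
+    //  detected without waiting for the OS-level TCP timeout.)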
+    .http2_keep_alive_interval(Duration::from_secs(5))
+    // If a PING is not acknowledged within 3 seconds, the connection is closed.
+    .keep_alive_timeout(Duration::from_secs(3))
+    // Ensures PINGs are sent even when no requests are in flight.
+    .keep_alive_while_idle(true)
+    // 2. Transport-Layer Keepalive (OS Backup)
+    .tcp_keepalive(Some(Duration::from_secs(10)))
+    // 3. Global Safety Net
+    // Hard deadline for any single RPC operation.
+    .timeout(Duration::from_secs(60));
+```
+
+### Outcome
+- **Detection Time**: Reduced from the ~15+ minute OS default to **~8 seconds** (5 s ping interval + 3 s ack timeout).
+- **Behavior**: When a node loses power, surviving peers now detect the dead connection almost immediately; the failed call surfaces a transport error that triggers the standard cluster recovery/failover logic (see the illustrative probe in section 4 below).
+- **Result**: The cluster now handles abrupt power-offs with the same resilience as graceful shutdowns.
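+
+---
+
+## 4. Illustrative Failure Probe
+The snippet below is a self-contained sketch, not code taken from the repository: it rebuilds the same `Endpoint` settings inside a hypothetical `probe_peer` helper (the function name and address are placeholders) to show how a caller observes a dead peer as a fast `Err` instead of an indefinite hang.
+
+```rust
+use std::time::Duration;
+use tonic::transport::{Channel, Endpoint, Error};
+
+// Hypothetical helper; mirrors the settings applied in crates/protos/src/lib.rs.
+async fn probe_peer(addr: &str) -> Result<Channel, Error> {
+    Endpoint::from_shared(addr.to_string())?
+        .connect_timeout(Duration::from_secs(5))            // fail the handshake fast
+        .http2_keep_alive_interval(Duration::from_secs(5))  // PING every 5 s
+        .keep_alive_timeout(Duration::from_secs(3))         // unacked PING closes the channel (~8 s worst case)
+        .keep_alive_while_idle(true)
+        .tcp_keepalive(Some(Duration::from_secs(10)))
+        .timeout(Duration::from_secs(60))                   // hard per-RPC deadline
+        .connect()
+        .await
+}
+
+// Usage (placeholder address): any Err is treated as "peer down" and feeds the
+// normal failover path.
+//
+// match probe_peer("http://10.0.0.2:50051").await {
+//     Ok(_channel) => { /* peer reachable */ }
+//     Err(e) => { /* mark node dead and trigger recovery */ }
+// }
+```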