Fix/issue #1001 dead node detection (#1054)

Co-authored-by: weisd <im@weisd.in>
Co-authored-by: Jitterx69 <mohit@example.com>
This commit is contained in:
Jitter
2025-12-08 09:59:46 +05:30
committed by GitHub
parent 834025d9e3
commit 76d25d9a20
9 changed files with 369 additions and 79 deletions

View File

@@ -190,16 +190,32 @@ impl NotificationSys {
pub async fn storage_info<S: StorageAPI>(&self, api: &S) -> rustfs_madmin::StorageInfo {
let mut futures = Vec::with_capacity(self.peer_clients.len());
let endpoints = get_global_endpoints();
let peer_timeout = Duration::from_secs(2); // Same timeout as server_info
for client in self.peer_clients.iter() {
let endpoints = endpoints.clone();
futures.push(async move {
if let Some(client) = client {
match client.local_storage_info().await {
Ok(info) => Some(info),
Err(_) => Some(rustfs_madmin::StorageInfo {
disks: get_offline_disks(&client.host.to_string(), &get_global_endpoints()),
..Default::default()
}),
let host = client.host.to_string();
// Wrap in timeout to ensure we don't hang on dead peers
match timeout(peer_timeout, client.local_storage_info()).await {
Ok(Ok(info)) => Some(info),
Ok(Err(err)) => {
warn!("peer {} storage_info failed: {}", host, err);
Some(rustfs_madmin::StorageInfo {
disks: get_offline_disks(&host, &endpoints),
..Default::default()
})
}
Err(_) => {
warn!("peer {} storage_info timed out after {:?}", host, peer_timeout);
client.evict_connection().await;
Some(rustfs_madmin::StorageInfo {
disks: get_offline_disks(&host, &endpoints),
..Default::default()
})
}
}
} else {
None
@@ -230,13 +246,19 @@ impl NotificationSys {
futures.push(async move {
if let Some(client) = client {
let host = client.host.to_string();
call_peer_with_timeout(
peer_timeout,
&host,
|| client.server_info(),
|| offline_server_properties(&host, &endpoints),
)
.await
match timeout(peer_timeout, client.server_info()).await {
Ok(Ok(info)) => info,
Ok(Err(err)) => {
warn!("peer {} server_info failed: {}", host, err);
// client.server_info handles eviction internally on error, but fallback needed
offline_server_properties(&host, &endpoints)
}
Err(_) => {
warn!("peer {} server_info timed out after {:?}", host, peer_timeout);
client.evict_connection().await;
offline_server_properties(&host, &endpoints)
}
}
} else {
ServerProperties::default()
}

View File

@@ -26,7 +26,7 @@ use rustfs_madmin::{
net::NetInfo,
};
use rustfs_protos::{
node_service_time_out_client,
evict_failed_connection, node_service_time_out_client,
proto_gen::node_service::{
DeleteBucketMetadataRequest, DeletePolicyRequest, DeleteServiceAccountRequest, DeleteUserRequest, GetCpusRequest,
GetMemInfoRequest, GetMetricsRequest, GetNetInfoRequest, GetOsInfoRequest, GetPartitionsRequest, GetProcInfoRequest,
@@ -82,10 +82,25 @@ impl PeerRestClient {
(remote, all)
}
/// Evict the connection to this peer from the global cache.
/// This should be called when communication with this peer fails.
pub async fn evict_connection(&self) {
evict_failed_connection(&self.grid_host).await;
}
}
impl PeerRestClient {
pub async fn local_storage_info(&self) -> Result<rustfs_madmin::StorageInfo> {
let result = self.local_storage_info_inner().await;
if result.is_err() {
// Evict stale connection on any error for cluster recovery
self.evict_connection().await;
}
result
}
async fn local_storage_info_inner(&self) -> Result<rustfs_madmin::StorageInfo> {
let mut client = node_service_time_out_client(&self.grid_host)
.await
.map_err(|err| Error::other(err.to_string()))?;
@@ -107,6 +122,15 @@ impl PeerRestClient {
}
pub async fn server_info(&self) -> Result<ServerProperties> {
let result = self.server_info_inner().await;
if result.is_err() {
// Evict stale connection on any error for cluster recovery
self.evict_connection().await;
}
result
}
async fn server_info_inner(&self) -> Result<ServerProperties> {
let mut client = node_service_time_out_client(&self.grid_host)
.await
.map_err(|err| Error::other(err.to_string()))?;
@@ -478,7 +502,11 @@ impl PeerRestClient {
access_key: access_key.to_string(),
});
let response = client.delete_user(request).await?.into_inner();
let result = client.delete_user(request).await;
if result.is_err() {
self.evict_connection().await;
}
let response = result?.into_inner();
if !response.success {
if let Some(msg) = response.error_info {
return Err(Error::other(msg));
@@ -496,7 +524,11 @@ impl PeerRestClient {
access_key: access_key.to_string(),
});
let response = client.delete_service_account(request).await?.into_inner();
let result = client.delete_service_account(request).await;
if result.is_err() {
self.evict_connection().await;
}
let response = result?.into_inner();
if !response.success {
if let Some(msg) = response.error_info {
return Err(Error::other(msg));
@@ -515,7 +547,11 @@ impl PeerRestClient {
temp,
});
let response = client.load_user(request).await?.into_inner();
let result = client.load_user(request).await;
if result.is_err() {
self.evict_connection().await;
}
let response = result?.into_inner();
if !response.success {
if let Some(msg) = response.error_info {
return Err(Error::other(msg));
@@ -533,7 +569,11 @@ impl PeerRestClient {
access_key: access_key.to_string(),
});
let response = client.load_service_account(request).await?.into_inner();
let result = client.load_service_account(request).await;
if result.is_err() {
self.evict_connection().await;
}
let response = result?.into_inner();
if !response.success {
if let Some(msg) = response.error_info {
return Err(Error::other(msg));
@@ -551,7 +591,11 @@ impl PeerRestClient {
group: group.to_string(),
});
let response = client.load_group(request).await?.into_inner();
let result = client.load_group(request).await;
if result.is_err() {
self.evict_connection().await;
}
let response = result?.into_inner();
if !response.success {
if let Some(msg) = response.error_info {
return Err(Error::other(msg));