mirror of
https://github.com/rustfs/rustfs.git
synced 2026-01-17 01:30:33 +00:00
Fix: Implement priority-based heal queue with comprehensive diagnostic logging (#884)
* Initial plan

* Implement priority-based heal queue with deduplication

Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>

* Apply cargo fmt formatting fixes

Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>

* Add queue monitoring, better error handling, and adaptive processing

- Add priority-based queue statistics tracking
- Implement queue capacity warnings (>80% full)
- Process multiple tasks per cycle when capacity allows
- Add proper error logging for failed heal request submissions
- Add Hash trait to HealPriority for HashMap support
- Improve observability with detailed queue status logs

Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>

* Add comprehensive diagnostic logging to heal flow

- Add detailed logging for shard availability checks during reads
- Log successful heal request submissions
- Add detailed logging in heal_object for disk checking
- Log each disk's heal decision with reason and endpoint
- Add logging for heal writing and renaming phases
- Improve variable naming (nil_count -> available_shards)
- Add success confirmation messages for renamed healed data

These logs will help diagnose where the heal flow is failing.

Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>

* fix

* feat(heal): enhance robustness, safety, and observability of heal process

- **Logging & Observability**:
  - Upgrade critical heal logs from `info` to `warn` for better visibility.
  - Implement structured logging with `tracing` fields for machine readability.
  - Add `#[tracing::instrument]` to `HealTask` and `SetDisks` methods for automatic context propagation.
- **Robustness**:
  - Add exponential backoff retry (3 attempts) for acquiring write locks in `heal_object` to handle contention.
  - Handle `rename_data` failures gracefully by preserving temporary files instead of forcing deletion, preventing potential data loss.
- **Data Safety**:
  - Fix `object_exists` to propagate IO errors instead of treating them as "object not found".
  - Update `ErasureSetHealer` to mark objects as failed rather than skipped when existence checks error, ensuring they are tracked for retry.

* fix

* fmt

* improve code for heal_object

* fix

* fix

* fix

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: houseme <4829346+houseme@users.noreply.github.com>
Co-authored-by: houseme <housemecn@gmail.com>
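For readers skimming the diff below, the ordering mechanism the message describes (a max-heap keyed by priority, a monotonically increasing sequence counter so that equal-priority entries pop FIFO, and a key set for deduplication) can be illustrated with a minimal, self-contained sketch. The names here (`Priority`, `Entry`, `PriorityQueue`) are illustrative only and are not the actual RustFS types; the real implementation is in the `PriorityHealQueue` changes further down.

```rust
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashSet};

// Illustrative priority levels (not the RustFS HealPriority enum).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum Priority {
    Low,
    Normal,
    High,
    Urgent,
}

#[derive(Debug)]
struct Entry {
    priority: Priority,
    sequence: u64,
    key: String,
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Higher priority first; for equal priorities the *older* entry
        // (lower sequence) must win, so the sequence comparison is reversed
        // relative to the max-heap order.
        self.priority
            .cmp(&other.priority)
            .then_with(|| other.sequence.cmp(&self.sequence))
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for Entry {}

#[derive(Default)]
struct PriorityQueue {
    heap: BinaryHeap<Entry>,
    sequence: u64,
    dedup: HashSet<String>,
}

impl PriorityQueue {
    // Returns false (and drops the item) if an equal key is already queued.
    fn push(&mut self, key: &str, priority: Priority) -> bool {
        if !self.dedup.insert(key.to_string()) {
            return false;
        }
        self.sequence += 1;
        self.heap.push(Entry {
            priority,
            sequence: self.sequence,
            key: key.to_string(),
        });
        true
    }

    fn pop(&mut self) -> Option<(Priority, String)> {
        self.heap.pop().map(|e| {
            self.dedup.remove(&e.key);
            (e.priority, e.key)
        })
    }
}

fn main() {
    let mut q = PriorityQueue::default();
    assert!(q.push("object:b1/o1", Priority::Low));
    assert!(q.push("object:b1/o2", Priority::Urgent));
    assert!(q.push("object:b1/o3", Priority::Urgent));
    assert!(!q.push("object:b1/o1", Priority::High)); // duplicate key rejected

    // Pops in order: o2 (urgent, older), o3 (urgent, newer), o1 (low).
    while let Some((p, k)) = q.pop() {
        println!("{p:?} {k}");
    }
}
```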
@@ -49,8 +49,9 @@ impl ErasureSetHealer {
}

/// execute erasure set heal with resume
#[tracing::instrument(skip(self, buckets), fields(set_disk_id = %set_disk_id, bucket_count = buckets.len()))]
pub async fn heal_erasure_set(&self, buckets: &[String], set_disk_id: &str) -> Result<()> {
info!("Starting erasure set heal for {} buckets on set disk {}", buckets.len(), set_disk_id);
info!("Starting erasure set heal");

// 1. generate or get task id
let task_id = self.get_or_create_task_id(set_disk_id).await?;

@@ -231,6 +232,7 @@ impl ErasureSetHealer {

/// heal single bucket with resume
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(skip(self, current_object_index, processed_objects, successful_objects, failed_objects, _skipped_objects, resume_manager, checkpoint_manager), fields(bucket = %bucket, bucket_index = bucket_index))]
async fn heal_bucket_with_resume(
&self,
bucket: &str,

@@ -243,7 +245,7 @@ impl ErasureSetHealer {
resume_manager: &ResumeManager,
checkpoint_manager: &CheckpointManager,
) -> Result<()> {
info!(target: "rustfs:ahm:heal_bucket_with_resume" ,"Starting heal for bucket: {} from object index {}", bucket, current_object_index);
info!(target: "rustfs:ahm:heal_bucket_with_resume" ,"Starting heal for bucket from object index {}", current_object_index);

// 1. get bucket info
let _bucket_info = match self.storage.get_bucket_info(bucket).await? {

@@ -273,7 +275,9 @@ impl ErasureSetHealer {
let object_exists = match self.storage.object_exists(bucket, object).await {
Ok(exists) => exists,
Err(e) => {
warn!("Failed to check existence of {}/{}: {}, skipping", bucket, object, e);
warn!("Failed to check existence of {}/{}: {}, marking as failed", bucket, object, e);
*failed_objects += 1;
checkpoint_manager.add_failed_object(object.clone()).await?;
*current_object_index = obj_idx + 1;
continue;
}
@@ -22,7 +22,7 @@ use rustfs_ecstore::disk::DiskAPI;
use rustfs_ecstore::disk::error::DiskError;
use rustfs_ecstore::global::GLOBAL_LOCAL_DISK_MAP;
use std::{
collections::{HashMap, VecDeque},
collections::{BinaryHeap, HashMap, HashSet},
sync::Arc,
time::{Duration, SystemTime},
};

@@ -33,6 +33,151 @@ use tokio::{
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};

/// Priority queue wrapper for heal requests
/// Uses BinaryHeap for priority-based ordering while maintaining FIFO for same-priority items
#[derive(Debug)]
struct PriorityHealQueue {
/// Heap of (priority, sequence, request) tuples
heap: BinaryHeap<PriorityQueueItem>,
/// Sequence counter for FIFO ordering within same priority
sequence: u64,
/// Set of request keys to prevent duplicates
dedup_keys: HashSet<String>,
}

/// Wrapper for heap items to implement proper ordering
#[derive(Debug)]
struct PriorityQueueItem {
priority: HealPriority,
sequence: u64,
request: HealRequest,
}

impl Eq for PriorityQueueItem {}

impl PartialEq for PriorityQueueItem {
fn eq(&self, other: &Self) -> bool {
self.priority == other.priority && self.sequence == other.sequence
}
}

impl Ord for PriorityQueueItem {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// First compare by priority (higher priority first)
match self.priority.cmp(&other.priority) {
std::cmp::Ordering::Equal => {
// If priorities are equal, use sequence for FIFO (lower sequence first)
other.sequence.cmp(&self.sequence)
}
ordering => ordering,
}
}
}

impl PartialOrd for PriorityQueueItem {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}

impl PriorityHealQueue {
fn new() -> Self {
Self {
heap: BinaryHeap::new(),
sequence: 0,
dedup_keys: HashSet::new(),
}
}

fn len(&self) -> usize {
self.heap.len()
}

fn is_empty(&self) -> bool {
self.heap.is_empty()
}

fn push(&mut self, request: HealRequest) -> bool {
let key = Self::make_dedup_key(&request);

// Check for duplicates
if self.dedup_keys.contains(&key) {
return false; // Duplicate request, don't add
}

self.dedup_keys.insert(key);
self.sequence += 1;
self.heap.push(PriorityQueueItem {
priority: request.priority,
sequence: self.sequence,
request,
});
true
}

/// Get statistics about queue contents by priority
fn get_priority_stats(&self) -> HashMap<HealPriority, usize> {
let mut stats = HashMap::new();
for item in &self.heap {
*stats.entry(item.priority).or_insert(0) += 1;
}
stats
}

fn pop(&mut self) -> Option<HealRequest> {
self.heap.pop().map(|item| {
let key = Self::make_dedup_key(&item.request);
self.dedup_keys.remove(&key);
item.request
})
}

/// Create a deduplication key from a heal request
fn make_dedup_key(request: &HealRequest) -> String {
match &request.heal_type {
HealType::Object {
bucket,
object,
version_id,
} => {
format!("object:{}:{}:{}", bucket, object, version_id.as_deref().unwrap_or(""))
}
HealType::Bucket { bucket } => {
format!("bucket:{}", bucket)
}
HealType::ErasureSet { set_disk_id, .. } => {
format!("erasure_set:{}", set_disk_id)
}
HealType::Metadata { bucket, object } => {
format!("metadata:{}:{}", bucket, object)
}
HealType::MRF { meta_path } => {
format!("mrf:{}", meta_path)
}
HealType::ECDecode {
bucket,
object,
version_id,
} => {
format!("ecdecode:{}:{}:{}", bucket, object, version_id.as_deref().unwrap_or(""))
}
}
}

/// Check if a request with the same key already exists in the queue
#[allow(dead_code)]
fn contains_key(&self, request: &HealRequest) -> bool {
let key = Self::make_dedup_key(request);
self.dedup_keys.contains(&key)
}

/// Check if an erasure set heal request for a specific set_disk_id exists
fn contains_erasure_set(&self, set_disk_id: &str) -> bool {
let key = format!("erasure_set:{}", set_disk_id);
self.dedup_keys.contains(&key)
}
}

/// Heal config
#[derive(Debug, Clone)]
pub struct HealConfig {
@@ -85,8 +230,8 @@ pub struct HealManager {
state: Arc<RwLock<HealState>>,
/// Active heal tasks
active_heals: Arc<Mutex<HashMap<String, Arc<HealTask>>>>,
/// Heal queue
heal_queue: Arc<Mutex<VecDeque<HealRequest>>>,
/// Heal queue (priority-based)
heal_queue: Arc<Mutex<PriorityHealQueue>>,
/// Storage layer interface
storage: Arc<dyn HealStorageAPI>,
/// Cancel token

@@ -103,7 +248,7 @@ impl HealManager {
config: Arc::new(RwLock::new(config)),
state: Arc::new(RwLock::new(HealState::default())),
active_heals: Arc::new(Mutex::new(HashMap::new())),
heal_queue: Arc::new(Mutex::new(VecDeque::new())),
heal_queue: Arc::new(Mutex::new(PriorityHealQueue::new())),
storage,
cancel_token: CancellationToken::new(),
statistics: Arc::new(RwLock::new(HealStatistics::new())),

@@ -161,17 +306,54 @@ impl HealManager {
let config = self.config.read().await;
let mut queue = self.heal_queue.lock().await;

if queue.len() >= config.queue_size {
let queue_len = queue.len();
let queue_capacity = config.queue_size;

if queue_len >= queue_capacity {
return Err(Error::ConfigurationError {
message: "Heal queue is full".to_string(),
message: format!("Heal queue is full ({}/{})", queue_len, queue_capacity),
});
}

// Warn when queue is getting full (>80% capacity)
let capacity_threshold = (queue_capacity as f64 * 0.8) as usize;
if queue_len >= capacity_threshold {
warn!(
"Heal queue is {}% full ({}/{}). Consider increasing queue size or processing capacity.",
(queue_len * 100) / queue_capacity,
queue_len,
queue_capacity
);
}

let request_id = request.id.clone();
queue.push_back(request);
let priority = request.priority;

// Try to push the request; if it's a duplicate, still return the request_id
let is_new = queue.push(request);

// Log queue statistics periodically (when adding high/urgent priority items)
if matches!(priority, HealPriority::High | HealPriority::Urgent) {
let stats = queue.get_priority_stats();
info!(
"Heal queue stats after adding {:?} priority request: total={}, urgent={}, high={}, normal={}, low={}",
priority,
queue_len + 1,
stats.get(&HealPriority::Urgent).unwrap_or(&0),
stats.get(&HealPriority::High).unwrap_or(&0),
stats.get(&HealPriority::Normal).unwrap_or(&0),
stats.get(&HealPriority::Low).unwrap_or(&0)
);
}

drop(queue);

info!("Submitted heal request: {}", request_id);
if is_new {
info!("Submitted heal request: {} with priority: {:?}", request_id, priority);
} else {
info!("Heal request already queued (duplicate): {}", request_id);
}

Ok(request_id)
}
@@ -321,13 +503,7 @@ impl HealManager {
let mut skip = false;
{
let queue = heal_queue.lock().await;
if queue.iter().any(|req| {
matches!(
&req.heal_type,
crate::heal::task::HealType::ErasureSet { set_disk_id: queued_id, .. }
if queued_id == &set_disk_id
)
}) {
if queue.contains_erasure_set(&set_disk_id) {
skip = true;
}
}

@@ -358,7 +534,7 @@ impl HealManager {
HealPriority::Normal,
);
let mut queue = heal_queue.lock().await;
queue.push_back(req);
queue.push(req);
info!("Enqueued auto erasure set heal for endpoint: {} (set_disk_id: {})", ep, set_disk_id);
}
}

@@ -369,8 +545,9 @@ impl HealManager {
}

/// Process heal queue
/// Processes multiple tasks per cycle when capacity allows and queue has high-priority items
async fn process_heal_queue(
heal_queue: &Arc<Mutex<VecDeque<HealRequest>>>,
heal_queue: &Arc<Mutex<PriorityHealQueue>>,
active_heals: &Arc<Mutex<HashMap<String, Arc<HealTask>>>>,
config: &Arc<RwLock<HealConfig>>,
statistics: &Arc<RwLock<HealStatistics>>,
@@ -379,51 +556,83 @@ impl HealManager {
let config = config.read().await;
let mut active_heals_guard = active_heals.lock().await;

// check if new heal tasks can be started
if active_heals_guard.len() >= config.max_concurrent_heals {
// Check if new heal tasks can be started
let active_count = active_heals_guard.len();
if active_count >= config.max_concurrent_heals {
return;
}

// Calculate how many tasks we can start this cycle
let available_slots = config.max_concurrent_heals - active_count;

let mut queue = heal_queue.lock().await;
if let Some(request) = queue.pop_front() {
let task = Arc::new(HealTask::from_request(request, storage.clone()));
let task_id = task.id.clone();
active_heals_guard.insert(task_id.clone(), task.clone());
drop(active_heals_guard);
let active_heals_clone = active_heals.clone();
let statistics_clone = statistics.clone();
let queue_len = queue.len();

// start heal task
tokio::spawn(async move {
info!("Starting heal task: {}", task_id);
let result = task.execute().await;
match result {
Ok(_) => {
info!("Heal task completed successfully: {}", task_id);
}
Err(e) => {
error!("Heal task failed: {} - {}", task_id, e);
}
}
let mut active_heals_guard = active_heals_clone.lock().await;
if let Some(completed_task) = active_heals_guard.remove(&task_id) {
// update statistics
let mut stats = statistics_clone.write().await;
match completed_task.get_status().await {
HealTaskStatus::Completed => {
stats.update_task_completion(true);
if queue_len == 0 {
return;
}

// Process multiple tasks if:
// 1. We have available slots
// 2. Queue is not empty
// Prioritize urgent/high priority tasks by processing up to 2 tasks per cycle if available
let tasks_to_process = if queue_len > 0 {
std::cmp::min(available_slots, std::cmp::min(2, queue_len))
} else {
0
};

for _ in 0..tasks_to_process {
if let Some(request) = queue.pop() {
let task_priority = request.priority;
let task = Arc::new(HealTask::from_request(request, storage.clone()));
let task_id = task.id.clone();
active_heals_guard.insert(task_id.clone(), task.clone());
let active_heals_clone = active_heals.clone();
let statistics_clone = statistics.clone();

// start heal task
tokio::spawn(async move {
info!("Starting heal task: {} with priority: {:?}", task_id, task_priority);
let result = task.execute().await;
match result {
Ok(_) => {
info!("Heal task completed successfully: {}", task_id);
}
_ => {
stats.update_task_completion(false);
Err(e) => {
error!("Heal task failed: {} - {}", task_id, e);
}
}
stats.update_running_tasks(active_heals_guard.len() as u64);
}
});
let mut active_heals_guard = active_heals_clone.lock().await;
if let Some(completed_task) = active_heals_guard.remove(&task_id) {
// update statistics
let mut stats = statistics_clone.write().await;
match completed_task.get_status().await {
HealTaskStatus::Completed => {
stats.update_task_completion(true);
}
_ => {
stats.update_task_completion(false);
}
}
stats.update_running_tasks(active_heals_guard.len() as u64);
}
});
} else {
break;
}
}

// update statistics
let mut stats = statistics.write().await;
stats.total_tasks += 1;
// Update statistics for all started tasks
let mut stats = statistics.write().await;
stats.total_tasks += tasks_to_process as u64;

// Log queue status if items remain
if !queue.is_empty() {
let remaining = queue.len();
if remaining > 10 {
info!("Heal queue has {} pending requests, {} tasks active", remaining, active_heals_guard.len());
}
}
}
}
@@ -438,3 +647,333 @@ impl std::fmt::Debug for HealManager {
.finish()
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::heal::task::{HealOptions, HealPriority, HealRequest, HealType};

#[test]
fn test_priority_queue_ordering() {
let mut queue = PriorityHealQueue::new();

// Add requests with different priorities
let low_req = HealRequest::new(
HealType::Bucket {
bucket: "bucket1".to_string(),
},
HealOptions::default(),
HealPriority::Low,
);

let normal_req = HealRequest::new(
HealType::Bucket {
bucket: "bucket2".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let high_req = HealRequest::new(
HealType::Bucket {
bucket: "bucket3".to_string(),
},
HealOptions::default(),
HealPriority::High,
);

let urgent_req = HealRequest::new(
HealType::Bucket {
bucket: "bucket4".to_string(),
},
HealOptions::default(),
HealPriority::Urgent,
);

// Add in random order: low, high, normal, urgent
assert!(queue.push(low_req));
assert!(queue.push(high_req));
assert!(queue.push(normal_req));
assert!(queue.push(urgent_req));

assert_eq!(queue.len(), 4);

// Should pop in priority order: urgent, high, normal, low
let popped1 = queue.pop().unwrap();
assert_eq!(popped1.priority, HealPriority::Urgent);

let popped2 = queue.pop().unwrap();
assert_eq!(popped2.priority, HealPriority::High);

let popped3 = queue.pop().unwrap();
assert_eq!(popped3.priority, HealPriority::Normal);

let popped4 = queue.pop().unwrap();
assert_eq!(popped4.priority, HealPriority::Low);

assert_eq!(queue.len(), 0);
}

#[test]
fn test_priority_queue_fifo_same_priority() {
let mut queue = PriorityHealQueue::new();

// Add multiple requests with same priority
let req1 = HealRequest::new(
HealType::Bucket {
bucket: "bucket1".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let req2 = HealRequest::new(
HealType::Bucket {
bucket: "bucket2".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let req3 = HealRequest::new(
HealType::Bucket {
bucket: "bucket3".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let id1 = req1.id.clone();
let id2 = req2.id.clone();
let id3 = req3.id.clone();

assert!(queue.push(req1));
assert!(queue.push(req2));
assert!(queue.push(req3));

// Should maintain FIFO order for same priority
let popped1 = queue.pop().unwrap();
assert_eq!(popped1.id, id1);

let popped2 = queue.pop().unwrap();
assert_eq!(popped2.id, id2);

let popped3 = queue.pop().unwrap();
assert_eq!(popped3.id, id3);
}

#[test]
fn test_priority_queue_deduplication() {
let mut queue = PriorityHealQueue::new();

let req1 = HealRequest::new(
HealType::Object {
bucket: "bucket1".to_string(),
object: "object1".to_string(),
version_id: None,
},
HealOptions::default(),
HealPriority::Normal,
);

let req2 = HealRequest::new(
HealType::Object {
bucket: "bucket1".to_string(),
object: "object1".to_string(),
version_id: None,
},
HealOptions::default(),
HealPriority::High,
);

// First request should be added
assert!(queue.push(req1));
assert_eq!(queue.len(), 1);

// Second request with same object should be rejected (duplicate)
assert!(!queue.push(req2));
assert_eq!(queue.len(), 1);
}

#[test]
fn test_priority_queue_contains_erasure_set() {
let mut queue = PriorityHealQueue::new();

let req = HealRequest::new(
HealType::ErasureSet {
buckets: vec!["bucket1".to_string()],
set_disk_id: "pool_0_set_1".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

assert!(queue.push(req));
assert!(queue.contains_erasure_set("pool_0_set_1"));
assert!(!queue.contains_erasure_set("pool_0_set_2"));
}

#[test]
fn test_priority_queue_dedup_key_generation() {
// Test different heal types generate different keys
let obj_req = HealRequest::new(
HealType::Object {
bucket: "bucket1".to_string(),
object: "object1".to_string(),
version_id: None,
},
HealOptions::default(),
HealPriority::Normal,
);

let bucket_req = HealRequest::new(
HealType::Bucket {
bucket: "bucket1".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let erasure_req = HealRequest::new(
HealType::ErasureSet {
buckets: vec!["bucket1".to_string()],
set_disk_id: "pool_0_set_1".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
);

let obj_key = PriorityHealQueue::make_dedup_key(&obj_req);
let bucket_key = PriorityHealQueue::make_dedup_key(&bucket_req);
let erasure_key = PriorityHealQueue::make_dedup_key(&erasure_req);

// All keys should be different
assert_ne!(obj_key, bucket_key);
assert_ne!(obj_key, erasure_key);
assert_ne!(bucket_key, erasure_key);

assert!(obj_key.starts_with("object:"));
assert!(bucket_key.starts_with("bucket:"));
assert!(erasure_key.starts_with("erasure_set:"));
}

#[test]
fn test_priority_queue_mixed_priorities_and_types() {
let mut queue = PriorityHealQueue::new();

// Add various requests
let requests = vec![
(
HealType::Object {
bucket: "b1".to_string(),
object: "o1".to_string(),
version_id: None,
},
HealPriority::Low,
),
(
HealType::Bucket {
bucket: "b2".to_string(),
},
HealPriority::Urgent,
),
(
HealType::ErasureSet {
buckets: vec!["b3".to_string()],
set_disk_id: "pool_0_set_1".to_string(),
},
HealPriority::Normal,
),
(
HealType::Object {
bucket: "b4".to_string(),
object: "o4".to_string(),
version_id: None,
},
HealPriority::High,
),
];

for (heal_type, priority) in requests {
let req = HealRequest::new(heal_type, HealOptions::default(), priority);
queue.push(req);
}

assert_eq!(queue.len(), 4);

// Check they come out in priority order
let priorities: Vec<HealPriority> = (0..4).filter_map(|_| queue.pop().map(|r| r.priority)).collect();

assert_eq!(
priorities,
vec![
HealPriority::Urgent,
HealPriority::High,
HealPriority::Normal,
HealPriority::Low,
]
);
}

#[test]
fn test_priority_queue_stats() {
let mut queue = PriorityHealQueue::new();

// Add requests with different priorities
for _ in 0..3 {
queue.push(HealRequest::new(
HealType::Bucket {
bucket: format!("bucket-low-{}", queue.len()),
},
HealOptions::default(),
HealPriority::Low,
));
}

for _ in 0..2 {
queue.push(HealRequest::new(
HealType::Bucket {
bucket: format!("bucket-normal-{}", queue.len()),
},
HealOptions::default(),
HealPriority::Normal,
));
}

queue.push(HealRequest::new(
HealType::Bucket {
bucket: "bucket-high".to_string(),
},
HealOptions::default(),
HealPriority::High,
));

let stats = queue.get_priority_stats();

assert_eq!(*stats.get(&HealPriority::Low).unwrap_or(&0), 3);
assert_eq!(*stats.get(&HealPriority::Normal).unwrap_or(&0), 2);
assert_eq!(*stats.get(&HealPriority::High).unwrap_or(&0), 1);
assert_eq!(*stats.get(&HealPriority::Urgent).unwrap_or(&0), 0);
}

#[test]
fn test_priority_queue_is_empty() {
let mut queue = PriorityHealQueue::new();

assert!(queue.is_empty());

queue.push(HealRequest::new(
HealType::Bucket {
bucket: "test".to_string(),
},
HealOptions::default(),
HealPriority::Normal,
));

assert!(!queue.is_empty());

queue.pop();

assert!(queue.is_empty());
}
}
@@ -400,13 +400,13 @@ impl HealStorageAPI for ECStoreHealStorage {
match self.ecstore.get_object_info(bucket, object, &Default::default()).await {
Ok(_) => Ok(true), // Object exists
Err(e) => {
// Map ObjectNotFound to false, other errors to false as well for safety
// Map ObjectNotFound to false, other errors must be propagated!
if matches!(e, rustfs_ecstore::error::StorageError::ObjectNotFound(_, _)) {
debug!("Object not found: {}/{}", bucket, object);
Ok(false)
} else {
debug!("Error checking object existence {}/{}: {}", bucket, object, e);
Ok(false) // Treat errors as non-existence to be safe
error!("Error checking object existence {}/{}: {}", bucket, object, e);
Err(Error::other(e))
}
}
}
@@ -51,7 +51,7 @@ pub enum HealType {
}

/// Heal priority
#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum HealPriority {
/// Low priority
Low = 0,

@@ -272,6 +272,7 @@ impl HealTask {
}
}

#[tracing::instrument(skip(self), fields(task_id = %self.id, heal_type = ?self.heal_type))]
pub async fn execute(&self) -> Result<()> {
// update status and timestamps atomically to avoid race conditions
let now = SystemTime::now();

@@ -285,7 +286,7 @@ impl HealTask {
*task_start_instant = Some(start_instant);
}

info!("Starting heal task: {} with type: {:?}", self.id, self.heal_type);
info!("Task started");

let result = match &self.heal_type {
HealType::Object {

@@ -315,7 +316,7 @@ impl HealTask {
Ok(_) => {
let mut status = self.status.write().await;
*status = HealTaskStatus::Completed;
info!("Heal task completed successfully: {}", self.id);
info!("Task completed successfully");
}
Err(Error::TaskCancelled) => {
let mut status = self.status.write().await;

@@ -354,8 +355,9 @@ impl HealTask {
}

// specific heal implementation method
#[tracing::instrument(skip(self), fields(bucket = %bucket, object = %object, version_id = ?version_id))]
async fn heal_object(&self, bucket: &str, object: &str, version_id: Option<&str>) -> Result<()> {
info!("Healing object: {}/{}", bucket, object);
info!("Starting object heal workflow");

// update progress
{

@@ -365,7 +367,7 @@ impl HealTask {
}

// Step 1: Check if object exists and get metadata
info!("Step 1: Checking object existence and metadata");
warn!("Step 1: Checking object existence and metadata");
self.check_control_flags().await?;
let object_exists = self.await_with_control(self.storage.object_exists(bucket, object)).await?;
if !object_exists {

@@ -424,7 +426,7 @@ impl HealTask {

// If heal failed and remove_corrupted is enabled, delete the corrupted object
if self.options.remove_corrupted {
warn!("Removing corrupted object: {}/{}", bucket, object);
info!("Removing corrupted object: {}/{}", bucket, object);
if !self.options.dry_run {
self.await_with_control(self.storage.delete_object(bucket, object)).await?;
info!("Successfully deleted corrupted object: {}/{}", bucket, object);

@@ -447,11 +449,9 @@ impl HealTask {
info!("Step 3: Verifying heal result");
let object_size = result.object_size as u64;
info!(
"Heal completed successfully: {}/{} ({} bytes, {} drives healed)",
bucket,
object,
object_size,
result.after.drives.len()
object_size = object_size,
drives_healed = result.after.drives.len(),
"Heal completed successfully"
);

{

@@ -481,7 +481,7 @@ impl HealTask {

// If heal failed and remove_corrupted is enabled, delete the corrupted object
if self.options.remove_corrupted {
warn!("Removing corrupted object: {}/{}", bucket, object);
info!("Removing corrupted object: {}/{}", bucket, object);
if !self.options.dry_run {
self.await_with_control(self.storage.delete_object(bucket, object)).await?;
info!("Successfully deleted corrupted object: {}/{}", bucket, object);
@@ -34,7 +34,7 @@ use rustfs_filemeta::{
};
use rustfs_utils::http::{
AMZ_BUCKET_REPLICATION_STATUS, AMZ_OBJECT_TAGGING, AMZ_TAGGING_DIRECTIVE, CONTENT_ENCODING, HeaderExt as _,
RESERVED_METADATA_PREFIX, RESERVED_METADATA_PREFIX_LOWER, RUSTFS_REPLICATION_AUTUAL_OBJECT_SIZE,
RESERVED_METADATA_PREFIX, RESERVED_METADATA_PREFIX_LOWER, RUSTFS_REPLICATION_ACTUAL_OBJECT_SIZE,
RUSTFS_REPLICATION_RESET_STATUS, SSEC_ALGORITHM_HEADER, SSEC_KEY_HEADER, SSEC_KEY_MD5_HEADER, headers,
};
use rustfs_utils::path::path_join_buf;

@@ -2324,7 +2324,7 @@ async fn replicate_object_with_multipart(
let mut user_metadata = HashMap::new();

user_metadata.insert(
RUSTFS_REPLICATION_AUTUAL_OBJECT_SIZE.to_string(),
RUSTFS_REPLICATION_ACTUAL_OBJECT_SIZE.to_string(),
object_info
.user_defined
.get(&format!("{RESERVED_METADATA_PREFIX}actual-size"))
@@ -140,6 +140,12 @@ pub enum DiskError {

#[error("io error {0}")]
Io(io::Error),

#[error("source stalled")]
SourceStalled,

#[error("timeout")]
Timeout,
}

impl DiskError {

@@ -366,6 +372,8 @@ impl Clone for DiskError {
DiskError::ErasureWriteQuorum => DiskError::ErasureWriteQuorum,
DiskError::ErasureReadQuorum => DiskError::ErasureReadQuorum,
DiskError::ShortWrite => DiskError::ShortWrite,
DiskError::SourceStalled => DiskError::SourceStalled,
DiskError::Timeout => DiskError::Timeout,
}
}
}

@@ -412,6 +420,8 @@ impl DiskError {
DiskError::ErasureWriteQuorum => 0x25,
DiskError::ErasureReadQuorum => 0x26,
DiskError::ShortWrite => 0x27,
DiskError::SourceStalled => 0x28,
DiskError::Timeout => 0x29,
}
}

@@ -456,6 +466,8 @@ impl DiskError {
0x25 => Some(DiskError::ErasureWriteQuorum),
0x26 => Some(DiskError::ErasureReadQuorum),
0x27 => Some(DiskError::ShortWrite),
0x28 => Some(DiskError::SourceStalled),
0x29 => Some(DiskError::Timeout),
_ => None,
}
}
@@ -1316,9 +1316,26 @@ impl SetDisks {

for (i, meta) in metas.iter().enumerate() {
if !meta.is_valid() {
debug!(
index = i,
valid = false,
version_id = ?meta.version_id,
mod_time = ?meta.mod_time,
"find_file_info_in_quorum: skipping invalid meta"
);
continue;
}

debug!(
index = i,
valid = true,
version_id = ?meta.version_id,
mod_time = ?meta.mod_time,
deleted = meta.deleted,
size = meta.size,
"find_file_info_in_quorum: inspecting meta"
);

let etag_only = mod_time.is_none() && etag.is_some() && meta.get_etag().is_some_and(|v| &v == etag.as_ref().unwrap());
let mod_valid = mod_time == &meta.mod_time;

@@ -1344,6 +1361,13 @@ impl SetDisks {
meta_hashes[i] = Some(hex(hasher.clone().finalize().as_slice()));

hasher.reset();
} else {
debug!(
index = i,
etag_only_match = etag_only,
mod_valid_match = mod_valid,
"find_file_info_in_quorum: meta does not match common etag or mod_time, skipping hash calculation"
);
}
}

@@ -1492,7 +1516,7 @@ impl SetDisks {
object: &str,
version_id: &str,
opts: &ReadOptions,
) -> Result<Vec<rustfs_filemeta::FileInfo>> {
) -> Result<Vec<FileInfo>> {
// Use existing disk selection logic
let disks = self.disks.read().await;
let required_reads = self.format.erasure.sets.len();

@@ -2181,11 +2205,11 @@ impl SetDisks {
// TODO: replicatio

if fi.deleted {
if opts.version_id.is_none() || opts.delete_marker {
return Err(to_object_err(StorageError::FileNotFound, vec![bucket, object]));
return if opts.version_id.is_none() || opts.delete_marker {
Err(to_object_err(StorageError::FileNotFound, vec![bucket, object]))
} else {
return Err(to_object_err(StorageError::MethodNotAllowed, vec![bucket, object]));
}
Err(to_object_err(StorageError::MethodNotAllowed, vec![bucket, object]))
};
}

Ok((oi, write_quorum))
@@ -2331,28 +2355,54 @@ impl SetDisks {
// Check if we have missing shards even though we can read successfully
// This happens when a node was offline during write and comes back online
let total_shards = erasure.data_shards + erasure.parity_shards;
let missing_shards = total_shards - nil_count;
if missing_shards > 0 && nil_count >= erasure.data_shards {
let available_shards = nil_count;
let missing_shards = total_shards - available_shards;

info!(
bucket,
object,
part_number,
total_shards,
available_shards,
missing_shards,
data_shards = erasure.data_shards,
parity_shards = erasure.parity_shards,
"Shard availability check"
);

if missing_shards > 0 && available_shards >= erasure.data_shards {
// We have missing shards but enough to read - trigger background heal
info!(
bucket,
object,
part_number,
missing_shards,
available_shards = nil_count,
available_shards,
pool_index,
set_index,
"Detected missing shards during read, triggering background heal"
);
let _ = rustfs_common::heal_channel::send_heal_request(
rustfs_common::heal_channel::create_heal_request_with_options(
if let Err(e) =
rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(HealChannelPriority::Normal), // Use low priority for proactive healing
Some(HealChannelPriority::Normal),
Some(pool_index),
Some(set_index),
),
)
.await;
))
.await
{
warn!(
bucket,
object,
part_number,
error = %e,
"Failed to enqueue heal request for missing shards"
);
} else {
warn!(bucket, object, part_number, "Successfully enqueued heal request for missing shards");
}
}

// debug!(

@@ -2376,7 +2426,7 @@ impl SetDisks {
match de_err {
DiskError::FileNotFound | DiskError::FileCorrupt => {
error!("erasure.decode err 111 {:?}", &de_err);
let _ = rustfs_common::heal_channel::send_heal_request(
if let Err(e) = rustfs_common::heal_channel::send_heal_request(
rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),

@@ -2386,7 +2436,16 @@ impl SetDisks {
Some(set_index),
),
)
.await;
.await
{
warn!(
bucket,
object,
part_number,
error = %e,
"Failed to enqueue heal request after decode error"
);
}
has_err = false;
}
_ => {}
@@ -2526,6 +2585,7 @@ impl SetDisks {
Ok((new_disks, new_infos, healing))
}

#[tracing::instrument(skip(self, opts), fields(bucket = %bucket, object = %object, version_id = %version_id))]
async fn heal_object(
&self,
bucket: &str,

@@ -2533,10 +2593,7 @@ impl SetDisks {
version_id: &str,
opts: &HealOpts,
) -> disk::error::Result<(HealResultItem, Option<DiskError>)> {
info!(
"SetDisks heal_object: bucket={}, object={}, version_id={}, opts={:?}",
bucket, object, version_id, opts
);
info!(?opts, "Starting heal_object");
let mut result = HealResultItem {
heal_item_type: HealItemType::Object.to_string(),
bucket: bucket.to_string(),

@@ -2568,20 +2625,34 @@ impl SetDisks {
if reuse_existing_lock {
None
} else {
let start_time = std::time::Instant::now();
let lock_result = self
.fast_lock_manager
.acquire_write_lock(bucket, object, self.locker_owner.as_str())
.await
.map_err(|e| {
let elapsed = start_time.elapsed();
let message = self.format_lock_error(bucket, object, "write", &e);
error!("Failed to acquire write lock for heal operation after {:?}: {}", elapsed, message);
DiskError::other(message)
})?;
let elapsed = start_time.elapsed();
info!("Successfully acquired write lock for object: {} in {:?}", object, elapsed);
Some(lock_result)
let mut lock_result = None;
for i in 0..3 {
let start_time = Instant::now();
match self
.fast_lock_manager
.acquire_write_lock(bucket, object, self.locker_owner.as_str())
.await
{
Ok(res) => {
let elapsed = start_time.elapsed();
info!(duration = ?elapsed, attempt = i + 1, "Write lock acquired");
lock_result = Some(res);
break;
}
Err(e) => {
let elapsed = start_time.elapsed();
info!(error = ?e, attempt = i + 1, duration = ?elapsed, "Lock acquisition failed, retrying");
if i < 2 {
tokio::time::sleep(Duration::from_millis(50 * (i as u64 + 1))).await;
} else {
let message = self.format_lock_error(bucket, object, "write", &e);
error!("Failed to acquire write lock after retries: {}", message);
return Err(DiskError::other(message));
}
}
}
}
lock_result
}
} else {
info!("Skipping lock acquisition (no_lock=true)");
@@ -2598,8 +2669,37 @@ impl SetDisks {

let disks = { self.disks.read().await.clone() };

let (mut parts_metadata, errs) = Self::read_all_fileinfo(&disks, "", bucket, object, version_id, true, true).await?;
info!("Read file info: parts_metadata.len()={}, errs={:?}", parts_metadata.len(), errs);
let (mut parts_metadata, errs) = {
let mut retry_count = 0;
loop {
let (parts, errs) = Self::read_all_fileinfo(&disks, "", bucket, object, version_id, true, true).await?;

// Check if we have enough valid metadata to proceed
// If we have too many errors, and we haven't exhausted retries, try again
let valid_count = errs.iter().filter(|e| e.is_none()).count();
// Simple heuristic: if valid_count is less than expected quorum (e.g. half disks), retry
// But we don't know the exact quorum yet. Let's just retry on high error rate if possible.
// Actually, read_all_fileinfo shouldn't fail easily.
// Let's just retry if we see ANY non-NotFound errors that might be transient (like timeouts)

let has_transient_error = errs
.iter()
.any(|e| matches!(e, Some(DiskError::SourceStalled) | Some(DiskError::Timeout)));

if !has_transient_error || retry_count >= 3 {
break (parts, errs);
}

info!(
"read_all_fileinfo encountered transient errors, retrying (attempt {}/3). Errs: {:?}",
retry_count + 1,
errs
);
tokio::time::sleep(Duration::from_millis(50 * (retry_count as u64 + 1))).await;
retry_count += 1;
}
};
info!(parts_count = parts_metadata.len(), ?errs, "File info read complete");
if DiskError::is_all_not_found(&errs) {
warn!(
"heal_object failed, all obj part not found, bucket: {}, obj: {}, version_id: {}",

@@ -2618,7 +2718,7 @@ impl SetDisks {
));
}

info!("About to call object_quorum_from_meta with parts_metadata.len()={}", parts_metadata.len());
info!(parts_count = parts_metadata.len(), "Initiating quorum check");
match Self::object_quorum_from_meta(&parts_metadata, &errs, self.default_parity_count) {
Ok((read_quorum, _)) => {
result.parity_blocks = result.disk_count - read_quorum as usize;
@@ -2643,10 +2743,12 @@ impl SetDisks {
)
.await?;

// info!(
//     "disks_with_all_parts: got available_disks: {:?}, data_errs_by_disk: {:?}, data_errs_by_part: {:?}, latest_meta: {:?}",
//     available_disks, data_errs_by_disk, data_errs_by_part, latest_meta
// );
info!(
"disks_with_all_parts results: available_disks count={}, total_disks={}",
available_disks.iter().filter(|d| d.is_some()).count(),
available_disks.len()
);

let erasure = if !latest_meta.deleted && !latest_meta.is_remote() {
// Initialize erasure coding
erasure_coding::Erasure::new(

@@ -2666,10 +2768,7 @@ impl SetDisks {
let mut outdate_disks = vec![None; disk_len];
let mut disks_to_heal_count = 0;

// info!(
//     "errs: {:?}, data_errs_by_disk: {:?}, latest_meta: {:?}",
//     errs, data_errs_by_disk, latest_meta
// );
info!("Checking {} disks for healing needs (bucket={}, object={})", disk_len, bucket, object);
for index in 0..available_disks.len() {
let (yes, reason) = should_heal_object_on_disk(
&errs[index],

@@ -2677,9 +2776,16 @@ impl SetDisks {
&parts_metadata[index],
&latest_meta,
);

info!(
"Disk {} heal check: should_heal={}, reason={:?}, err={:?}, endpoint={}",
index, yes, reason, errs[index], self.set_endpoints[index]
);

if yes {
outdate_disks[index] = disks[index].clone();
disks_to_heal_count += 1;
info!("Disk {} marked for healing (endpoint={})", index, self.set_endpoints[index]);
}

let drive_state = match reason {

@@ -2707,6 +2813,11 @@ impl SetDisks {
});
}

info!(
"Heal check complete: {} disks need healing out of {} total (bucket={}, object={})",
disks_to_heal_count, disk_len, bucket, object
);

if DiskError::is_all_not_found(&errs) {
warn!(
"heal_object failed, all obj part not found, bucket: {}, obj: {}, version_id: {}",

@@ -2741,9 +2852,18 @@ impl SetDisks {
);

if !latest_meta.deleted && disks_to_heal_count > latest_meta.erasure.parity_blocks {
let total_disks = parts_metadata.len();
let healthy_count = total_disks.saturating_sub(disks_to_heal_count);
let required_data = total_disks.saturating_sub(latest_meta.erasure.parity_blocks);

error!(
"file({} : {}) part corrupt too much, can not to fix, disks_to_heal_count: {}, parity_blocks: {}",
bucket, object, disks_to_heal_count, latest_meta.erasure.parity_blocks
"Data corruption detected for {}/{}: Insufficient healthy shards. Need at least {} data shards, but found only {} healthy disks. (Missing/Corrupt: {}, Parity: {})",
bucket,
object,
required_data,
healthy_count,
disks_to_heal_count,
latest_meta.erasure.parity_blocks
);

// Allow for dangling deletes, on versions that have DataDir missing etc.

@@ -2775,7 +2895,7 @@ impl SetDisks {
Ok((self.default_heal_result(m, &t_errs, bucket, object, version_id).await, Some(derr)))
}
Err(err) => {
// t_errs = vec![Some(err.clone()); errs.len()];
// t_errs = vec![Some(err.clone()]; errs.len());
let mut t_errs = Vec::with_capacity(errs.len());
for _ in 0..errs.len() {
t_errs.push(Some(err.clone()));
@@ -2930,7 +3050,7 @@ impl SetDisks {
);
for (index, disk) in latest_disks.iter().enumerate() {
if let Some(outdated_disk) = &out_dated_disks[index] {
info!("Creating writer for index {} (outdated disk)", index);
info!(disk_index = index, "Creating writer for outdated disk");
let writer = create_bitrot_writer(
is_inline_buffer,
Some(outdated_disk),

@@ -2943,7 +3063,7 @@ impl SetDisks {
.await?;
writers.push(Some(writer));
} else {
info!("Skipping writer for index {} (not outdated)", index);
info!(disk_index = index, "Skipping writer (disk not outdated)");
writers.push(None);
}

@@ -2953,7 +3073,7 @@ impl SetDisks {
// // Box::new(Cursor::new(Vec::new()))
// // } else {
// // let disk = disk.clone();
// // let part_path = format!("{}/{}/part.{}", tmp_id, dst_data_dir, part.number);
// // let part_path = format!("{}/{}/part.{}", object, src_data_dir, part.number);
// // disk.create_file("", RUSTFS_META_TMP_BUCKET, &part_path, 0).await?
// // }
// // };

@@ -3049,6 +3169,12 @@ impl SetDisks {
}
}
// Rename from tmp location to the actual location.
info!(
"Starting rename phase: {} disks to process (bucket={}, object={})",
out_dated_disks.iter().filter(|d| d.is_some()).count(),
bucket,
object
);
for (index, outdated_disk) in out_dated_disks.iter().enumerate() {
if let Some(disk) = outdated_disk {
// record the index of the updated disks

@@ -3057,8 +3183,8 @@ impl SetDisks {
parts_metadata[index].set_healing();

info!(
"rename temp data, src_volume: {}, src_path: {}, dst_volume: {}, dst_path: {}",
RUSTFS_META_TMP_BUCKET, tmp_id, bucket, object
"Renaming healed data for disk {} (endpoint={}): src_volume={}, src_path={}, dst_volume={}, dst_path={}",
index, self.set_endpoints[index], RUSTFS_META_TMP_BUCKET, tmp_id, bucket, object
);
let rename_result = disk
.rename_data(RUSTFS_META_TMP_BUCKET, &tmp_id, parts_metadata[index].clone(), bucket, object)

@@ -3066,10 +3192,15 @@ impl SetDisks {

if let Err(err) = &rename_result {
info!(
"rename temp data err: {}. Try fallback to direct xl.meta overwrite...",
err.to_string()
error = %err,
disk_index = index,
endpoint = %self.set_endpoints[index],
"Rename failed, attempting fallback"
);

// Preserve temp files for safety
info!(temp_uuid = %tmp_id, "Rename failed, preserving temporary files for safety");

let healthy_index = latest_disks.iter().position(|d| d.is_some()).unwrap_or(0);

if let Some(healthy_disk) = &latest_disks[healthy_index] {

@@ -3107,7 +3238,10 @@ impl SetDisks {
));
}
} else {
info!("remove temp object, volume: {}, path: {}", RUSTFS_META_TMP_BUCKET, tmp_id);
info!(
"Successfully renamed healed data for disk {} (endpoint={}), removing temp files from volume={}, path={}",
index, self.set_endpoints[index], RUSTFS_META_TMP_BUCKET, tmp_id
);

self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_id)
.await

@@ -3441,7 +3575,10 @@ impl SetDisks {
}
Ok(m)
} else {
error!("delete_if_dang_ling: is_object_dang_ling errs={:?}", errs);
error!(
"Object {}/{} is corrupted but not dangling (some parts exist). Preserving data for potential manual recovery. Errors: {:?}",
bucket, object, errs
);
Err(DiskError::ErasureReadQuorum)
}
}
@@ -4144,7 +4281,7 @@ impl StorageAPI for SetDisks {

// Acquire locks in batch mode (best effort, matching previous behavior)
let mut batch = rustfs_lock::BatchLockRequest::new(self.locker_owner.as_str()).with_all_or_nothing(false);
let mut unique_objects: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut unique_objects: HashSet<String> = HashSet::new();
for dobj in &objects {
if unique_objects.insert(dobj.object_name.clone()) {
batch = batch.add_write_lock(bucket, dobj.object_name.clone());

@@ -4159,7 +4296,7 @@ impl StorageAPI for SetDisks {
.collect();
let _lock_guards = batch_result.guards;

let failed_map: HashMap<(String, String), rustfs_lock::fast_lock::LockResult> = batch_result
let failed_map: HashMap<(String, String), LockResult> = batch_result
.failed_locks
.into_iter()
.map(|(key, err)| ((key.bucket.as_ref().to_string(), key.object.as_ref().to_string()), err))

@@ -4506,7 +4643,7 @@ impl StorageAPI for SetDisks {
_rx: CancellationToken,
_bucket: &str,
_prefix: &str,
_result: tokio::sync::mpsc::Sender<ObjectInfoOrErr>,
_result: Sender<ObjectInfoOrErr>,
_opts: WalkOptions,
) -> Result<()> {
unimplemented!()

@@ -4538,15 +4675,25 @@ impl StorageAPI for SetDisks {

#[tracing::instrument(skip(self))]
async fn add_partial(&self, bucket: &str, object: &str, version_id: &str) -> Result<()> {
let _ = rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(HealChannelPriority::Normal),
Some(self.pool_index),
Some(self.set_index),
))
.await;
if let Err(e) =
rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(HealChannelPriority::Normal),
Some(self.pool_index),
Some(self.set_index),
))
.await
{
warn!(
bucket,
object,
version_id,
error = %e,
"Failed to enqueue heal request for partial object"
);
}
Ok(())
}
@@ -4832,11 +4979,11 @@ impl StorageAPI for SetDisks {
false,
)?;
let mut p_reader = PutObjReader::new(hash_reader);
if let Err(err) = self_.clone().put_object(bucket, object, &mut p_reader, &ropts).await {
return set_restore_header_fn(&mut oi, Some(to_object_err(err, vec![bucket, object]))).await;
return if let Err(err) = self_.clone().put_object(bucket, object, &mut p_reader, &ropts).await {
set_restore_header_fn(&mut oi, Some(to_object_err(err, vec![bucket, object]))).await
} else {
return Ok(());
}
Ok(())
};
}

let res = self_.clone().new_multipart_upload(bucket, object, &ropts).await?;

@@ -5940,7 +6087,7 @@ impl StorageAPI for SetDisks {
bucket.to_string(),
Some(object.to_string()),
false,
Some(rustfs_common::heal_channel::HealChannelPriority::Normal),
Some(HealChannelPriority::Normal),
Some(self.pool_index),
Some(self.set_index),
))

@@ -6000,6 +6147,55 @@ impl StorageAPI for SetDisks {
version_id: &str,
opts: &HealOpts,
) -> Result<(HealResultItem, Option<Error>)> {
let mut effective_object = object.to_string();

// Optimization: Only attempt correction if the name looks suspicious (quotes or URL encoded)
// and the original object does NOT exist.
let has_quotes = (effective_object.starts_with('\'') && effective_object.ends_with('\''))
|| (effective_object.starts_with('"') && effective_object.ends_with('"'));
let has_percent = effective_object.contains('%');

if has_quotes || has_percent {
let disks = self.disks.read().await;
// 1. Check if the original object exists (lightweight check)
let (_, errs) = Self::read_all_fileinfo(&disks, "", bucket, &effective_object, version_id, false, false).await?;

if DiskError::is_all_not_found(&errs) {
// Original not found. Try candidates.
let mut candidates = Vec::new();

// Candidate 1: URL Decoded (Priority for web access issues)
if has_percent {
if let Ok(decoded) = urlencoding::decode(&effective_object) {
if decoded != effective_object {
candidates.push(decoded.to_string());
}
}
}

// Candidate 2: Quote Stripped (For shell copy-paste issues)
if has_quotes && effective_object.len() >= 2 {
candidates.push(effective_object[1..effective_object.len() - 1].to_string());
}

// Check candidates
for candidate in candidates {
let (_, errs_cand) =
Self::read_all_fileinfo(&disks, "", bucket, &candidate, version_id, false, false).await?;

if !DiskError::is_all_not_found(&errs_cand) {
info!(
"Heal request for object '{}' failed (not found). Auto-corrected to '{}'.",
effective_object, candidate
);
effective_object = candidate;
break; // Found a match, stop searching
}
}
}
}
let object = effective_object.as_str();

let _write_lock_guard = if !opts.no_lock {
let key = rustfs_lock::fast_lock::types::ObjectKey::new(bucket, object);
let mut skip_lock = false;

@@ -6014,10 +6210,10 @@ impl StorageAPI for SetDisks {
skip_lock = true;
}
}

if skip_lock {
None
} else {
info!(?opts, "Starting heal_object");
Some(
self.fast_lock_manager
.acquire_write_lock(bucket, object, self.locker_owner.as_str())

@@ -166,7 +166,7 @@ pub const RUSTFS_FORCE_DELETE: &str = "X-Rustfs-Force-Delete";
pub const RUSTFS_INCLUDE_DELETED: &str = "X-Rustfs-Include-Deleted";

pub const RUSTFS_REPLICATION_RESET_STATUS: &str = "X-Rustfs-Replication-Reset-Status";
pub const RUSTFS_REPLICATION_AUTUAL_OBJECT_SIZE: &str = "X-Rustfs-Replication-Actual-Object-Size";
pub const RUSTFS_REPLICATION_ACTUAL_OBJECT_SIZE: &str = "X-Rustfs-Replication-Actual-Object-Size";

pub const RUSTFS_BUCKET_SOURCE_VERSION_ID: &str = "X-Rustfs-Source-Version-Id";
pub const RUSTFS_BUCKET_SOURCE_MTIME: &str = "X-Rustfs-Source-Mtime";