rustfs/crates/ecstore/src/set_disk.rs
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#![allow(unused_imports)]
#![allow(unused_variables)]
use crate::batch_processor::{AsyncBatchProcessor, get_global_processors};
use crate::bitrot::{create_bitrot_reader, create_bitrot_writer};
use crate::bucket::lifecycle::lifecycle::TRANSITION_COMPLETE;
use crate::bucket::versioning::VersioningApi;
use crate::bucket::versioning_sys::BucketVersioningSys;
use crate::client::{object_api_utils::extract_etag, transition_api::ReaderImpl};
use crate::disk::STORAGE_FORMAT_FILE;
use crate::disk::error_reduce::{OBJECT_OP_IGNORED_ERRS, reduce_read_quorum_errs, reduce_write_quorum_errs};
use crate::disk::{
self, CHECK_PART_DISK_NOT_FOUND, CHECK_PART_FILE_CORRUPT, CHECK_PART_FILE_NOT_FOUND, CHECK_PART_SUCCESS,
conv_part_err_to_int, has_part_err,
};
use crate::erasure_coding;
use crate::erasure_coding::bitrot_verify;
use crate::error::{Error, Result};
use crate::error::{ObjectApiError, is_err_object_not_found};
use crate::global::{GLOBAL_LocalNodeName, GLOBAL_TierConfigMgr};
use crate::store_api::ListObjectVersionsInfo;
use crate::store_api::{ListPartsInfo, ObjectToDelete};
use crate::{
bucket::lifecycle::bucket_lifecycle_ops::{gen_transition_objname, get_transitioned_object_reader, put_restore_opts},
cache_value::metacache_set::{ListPathRawOptions, list_path_raw},
config::{GLOBAL_STORAGE_CLASS, storageclass},
disk::{
CheckPartsResp, DeleteOptions, DiskAPI, DiskInfo, DiskInfoOptions, DiskOption, DiskStore, FileInfoVersions,
RUSTFS_META_BUCKET, RUSTFS_META_MULTIPART_BUCKET, RUSTFS_META_TMP_BUCKET, ReadMultipleReq, ReadMultipleResp, ReadOptions,
UpdateMetadataOpts, endpoint::Endpoint, error::DiskError, format::FormatV3, new_disk,
},
error::{StorageError, to_object_err},
event::name::EventName,
event_notification::{EventArgs, send_event},
global::{GLOBAL_LOCAL_DISK_MAP, GLOBAL_LOCAL_DISK_SET_DRIVES, get_global_deployment_id, is_dist_erasure},
store_api::{
BucketInfo, BucketOptions, CompletePart, DeleteBucketOptions, DeletedObject, GetObjectReader, HTTPRangeSpec,
ListMultipartsInfo, ListObjectsV2Info, MakeBucketOptions, MultipartInfo, MultipartUploadResult, ObjectIO, ObjectInfo,
ObjectOptions, PartInfo, PutObjReader, StorageAPI,
},
store_init::load_format_erasure,
};
use bytes::Bytes;
use bytesize::ByteSize;
use chrono::Utc;
use futures::future::join_all;
use glob::Pattern;
use http::HeaderMap;
use md5::{Digest as Md5Digest, Md5};
use rand::{Rng, seq::SliceRandom};
use regex::Regex;
use rustfs_common::heal_channel::{DriveState, HealChannelPriority, HealItemType, HealOpts, HealScanMode, send_heal_disk};
use rustfs_filemeta::headers::RESERVED_METADATA_PREFIX_LOWER;
use rustfs_filemeta::{
FileInfo, FileMeta, FileMetaShallowVersion, MetaCacheEntries, MetaCacheEntry, MetadataResolutionParams, ObjectPartInfo,
RawFileInfo, file_info_from_raw,
headers::{AMZ_OBJECT_TAGGING, AMZ_STORAGE_CLASS},
merge_file_meta_versions,
};
use rustfs_lock::NamespaceLockManager;
use rustfs_madmin::heal_commands::{HealDriveInfo, HealResultItem};
use rustfs_rio::{EtagResolvable, HashReader, TryGetIndex as _, WarpReader};
use rustfs_utils::{
HashAlgorithm,
crypto::{base64_decode, base64_encode, hex},
path::{SLASH_SEPARATOR, encode_dir_object, has_suffix, path_join_buf},
};
use rustfs_workers::workers::Workers;
use s3s::header::X_AMZ_RESTORE;
use sha2::{Digest, Sha256};
use std::hash::Hash;
use std::mem::{self};
use std::time::SystemTime;
use std::{
collections::{HashMap, HashSet},
io::{Cursor, Write},
path::Path,
sync::Arc,
time::Duration,
};
use time::OffsetDateTime;
use tokio::{
io::AsyncWrite,
sync::{RwLock, broadcast},
};
use tokio::{
select,
sync::mpsc::{self, Sender},
time::interval,
};
use tracing::error;
use tracing::{debug, info, warn};
use uuid::Uuid;
pub const DEFAULT_READ_BUFFER_SIZE: usize = 1024 * 1024;
pub const MAX_PARTS_COUNT: usize = 10000;
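// One erasure set: a fixed group of drives that together hold erasure-coded
// object data. Operations fan out to every drive in the set and succeed once
// a read or write quorum of drives responds.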
#[derive(Clone, Debug)]
pub struct SetDisks {
pub fast_lock_manager: Arc<rustfs_lock::FastObjectLockManager>,
pub locker_owner: String,
pub disks: Arc<RwLock<Vec<Option<DiskStore>>>>,
pub set_endpoints: Vec<Endpoint>,
pub set_drive_count: usize,
pub default_parity_count: usize,
pub set_index: usize,
pub pool_index: usize,
pub format: FormatV3,
}
impl SetDisks {
#[allow(clippy::too_many_arguments)]
pub async fn new(
fast_lock_manager: Arc<rustfs_lock::FastObjectLockManager>,
locker_owner: String,
disks: Arc<RwLock<Vec<Option<DiskStore>>>>,
set_drive_count: usize,
default_parity_count: usize,
set_index: usize,
pool_index: usize,
set_endpoints: Vec<Endpoint>,
format: FormatV3,
) -> Arc<Self> {
Arc::new(SetDisks {
fast_lock_manager,
locker_owner,
disks,
set_drive_count,
default_parity_count,
set_index,
pool_index,
format,
set_endpoints,
})
}
async fn get_disks_internal(&self) -> Vec<Option<DiskStore>> {
let rl = self.disks.read().await;
rl.clone()
}
pub async fn get_local_disks(&self) -> Vec<Option<DiskStore>> {
let rl = self.disks.read().await;
let mut disks: Vec<Option<DiskStore>> = rl
.clone()
.into_iter()
.filter(|v| v.as_ref().is_some_and(|d| d.is_local()))
.collect();
let mut rng = rand::rng();
disks.shuffle(&mut rng);
disks
}
async fn get_online_disks(&self) -> Vec<Option<DiskStore>> {
let disks = self.get_disks_internal().await;
// TODO: diskinfo filter online
let mut new_disk = Vec::with_capacity(disks.len());
for disk in disks.iter() {
if let Some(d) = disk {
if d.is_online().await {
new_disk.push(disk.clone());
}
}
}
// Shuffle the filtered list, not the original one, so callers get the online disks in random order.
let mut rng = rand::rng();
new_disk.shuffle(&mut rng);
new_disk
}
async fn get_online_local_disks(&self) -> Vec<Option<DiskStore>> {
let mut disks = self.get_online_disks().await;
let mut rng = rand::rng();
disks.shuffle(&mut rng);
disks
.into_iter()
.filter(|v| v.as_ref().is_some_and(|d| d.is_local()))
.collect()
}
pub async fn get_online_disks_with_healing(&self, incl_healing: bool) -> (Vec<DiskStore>, bool) {
let (disks, _, healing) = self.get_online_disks_with_healing_and_info(incl_healing).await;
(disks, healing > 0)
}
pub async fn get_online_disks_with_healing_and_info(&self, incl_healing: bool) -> (Vec<DiskStore>, Vec<DiskInfo>, usize) {
let mut disks = self.get_disks_internal().await;
let mut infos = Vec::with_capacity(disks.len());
let mut futures = Vec::with_capacity(disks.len());
let mut numbers: Vec<usize> = (0..disks.len()).collect();
{
let mut rng = rand::rng();
disks.shuffle(&mut rng);
numbers.shuffle(&mut rng);
}
for &i in numbers.iter() {
let disk = disks[i].clone();
futures.push(async move {
if let Some(disk) = disk {
disk.disk_info(&DiskInfoOptions::default()).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
// Use optimized batch processor for disk info retrieval
let processor = get_global_processors().metadata_processor();
let results = processor.execute_batch(futures).await;
for result in results {
match result {
Ok(res) => {
infos.push(res);
}
Err(err) => {
infos.push(DiskInfo {
error: err.to_string(),
..Default::default()
});
}
}
}
let mut healing: usize = 0;
let mut scanning_disks = Vec::new();
let mut healing_disks = Vec::new();
let mut scanning_infos = Vec::new();
let mut healing_infos = Vec::new();
let mut new_disks = Vec::new();
let mut new_infos = Vec::new();
for &i in numbers.iter() {
let (info, disk) = (infos[i].clone(), disks[i].clone());
if !info.error.is_empty() || disk.is_none() {
continue;
}
if info.healing {
healing += 1;
if incl_healing {
healing_disks.push(disk.unwrap());
healing_infos.push(info);
}
continue;
}
if !info.healing {
new_disks.push(disk.unwrap());
new_infos.push(info);
} else {
scanning_disks.push(disk.unwrap());
scanning_infos.push(info);
}
}
new_disks.extend(scanning_disks);
new_infos.extend(scanning_infos);
new_disks.extend(healing_disks);
new_infos.extend(healing_infos);
(new_disks, new_infos, healing)
}
async fn _get_local_disks(&self) -> Vec<Option<DiskStore>> {
let mut disks = self.get_disks_internal().await;
let mut rng = rand::rng();
disks.shuffle(&mut rng);
disks
.into_iter()
.filter(|v| v.as_ref().is_some_and(|d| d.is_local()))
.collect()
}
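// Default quorums derived from the set layout: with N drives and P parity
// drives, the read quorum is N - P (the data drives); the write quorum is the
// same, bumped by one when data and parity counts are equal so that two
// conflicting writes cannot both reach quorum. Example: 12 drives with parity 4
// gives read/write quorum 8; 8 drives with parity 4 gives read quorum 4 and
// write quorum 5.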
fn default_read_quorum(&self) -> usize {
self.set_drive_count - self.default_parity_count
}
fn default_write_quorum(&self) -> usize {
let mut data_count = self.set_drive_count - self.default_parity_count;
if data_count == self.default_parity_count {
data_count += 1
}
data_count
}
#[tracing::instrument(level = "debug", skip(disks, file_infos))]
#[allow(clippy::type_complexity)]
async fn rename_data(
disks: &[Option<DiskStore>],
src_bucket: &str,
src_object: &str,
file_infos: &[FileInfo],
dst_bucket: &str,
dst_object: &str,
write_quorum: usize,
) -> disk::error::Result<(Vec<Option<DiskStore>>, Option<Vec<u8>>, Option<Uuid>)> {
let mut futures = Vec::with_capacity(disks.len());
// let mut ress = Vec::with_capacity(disks.len());
let mut errs = Vec::with_capacity(disks.len());
let src_bucket = Arc::new(src_bucket.to_string());
let src_object = Arc::new(src_object.to_string());
let dst_bucket = Arc::new(dst_bucket.to_string());
let dst_object = Arc::new(dst_object.to_string());
for (i, (disk, file_info)) in disks.iter().zip(file_infos.iter()).enumerate() {
let mut file_info = file_info.clone();
let disk = disk.clone();
let src_bucket = src_bucket.clone();
let src_object = src_object.clone();
let dst_object = dst_object.clone();
let dst_bucket = dst_bucket.clone();
futures.push(tokio::spawn(async move {
if file_info.erasure.index == 0 {
file_info.erasure.index = i + 1;
}
if !file_info.is_valid() {
return Err(DiskError::FileCorrupt);
}
if let Some(disk) = disk {
disk.rename_data(&src_bucket, &src_object, file_info, &dst_bucket, &dst_object)
.await
} else {
Err(DiskError::DiskNotFound)
}
}));
}
let mut disk_versions = vec![None; disks.len()];
let mut data_dirs = vec![None; disks.len()];
let results = join_all(futures).await;
for (idx, result) in results.iter().enumerate() {
match result.as_ref().map_err(|_| DiskError::Unexpected)? {
Ok(res) => {
data_dirs[idx] = res.old_data_dir;
disk_versions[idx].clone_from(&res.sign);
errs.push(None);
}
Err(e) => {
errs.push(Some(e.clone()));
}
}
}
let mut futures = Vec::with_capacity(disks.len());
if let Some(ret_err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
// TODO: run these rollback deletes concurrently
for (i, err) in errs.iter().enumerate() {
if err.is_some() {
continue;
}
if let Some(disk) = disks[i].as_ref() {
let fi = file_infos[i].clone();
let old_data_dir = data_dirs[i];
let disk = disk.clone();
let src_bucket = src_bucket.clone();
let src_object = src_object.clone();
futures.push(tokio::spawn(async move {
let _ = disk
.delete_version(
&src_bucket,
&src_object,
fi,
false,
DeleteOptions {
undo_write: true,
old_data_dir,
..Default::default()
},
)
.await
.map_err(|e| {
debug!("rename_data delete_version err {:?}", e);
e
});
}));
}
}
let _ = join_all(futures).await;
return Err(ret_err);
}
let versions = None;
// TODO: reduceCommonVersions
let data_dir = Self::reduce_common_data_dir(&data_dirs, write_quorum);
// // TODO: reduce_common_data_dir
// if let Some(old_dir) = rename_ress
// .iter()
// .filter_map(|v| if v.is_some() { v.as_ref().unwrap().old_data_dir } else { None })
// .map(|v| v.to_string())
// .next()
// {
// let cm_errs = self.commit_rename_data_dir(&shuffle_disks, &bucket, &object, &old_dir).await;
// warn!("put_object commit_rename_data_dir:{:?}", &cm_errs);
// }
// self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_dir).await?;
Ok((Self::eval_disks(disks, &errs), versions, data_dir))
}
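// Majority vote over the old_data_dir values returned by rename_data: returns
// the most frequent data dir only when at least write_quorum disks reported it,
// otherwise None.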
fn reduce_common_data_dir(data_dirs: &Vec<Option<Uuid>>, write_quorum: usize) -> Option<Uuid> {
let mut data_dirs_count = HashMap::new();
for ddir in data_dirs {
*data_dirs_count.entry(ddir).or_insert(0) += 1;
}
let mut max = 0;
let mut data_dir = None;
for (ddir, count) in data_dirs_count {
if count > max {
max = count;
data_dir = *ddir;
}
}
if max >= write_quorum { data_dir } else { None }
}
#[allow(dead_code)]
#[tracing::instrument(level = "debug", skip(self, disks))]
async fn commit_rename_data_dir(
&self,
disks: &[Option<DiskStore>],
bucket: &str,
object: &str,
data_dir: &str,
write_quorum: usize,
) -> disk::error::Result<()> {
let file_path = Arc::new(format!("{object}/{data_dir}"));
let bucket = Arc::new(bucket.to_string());
let futures = disks.iter().map(|disk| {
let file_path = file_path.clone();
let bucket = bucket.clone();
let disk = disk.clone();
tokio::spawn(async move {
if let Some(disk) = disk {
(disk
.delete(
&bucket,
&file_path,
DeleteOptions {
recursive: true,
..Default::default()
},
)
.await)
.err()
} else {
Some(DiskError::DiskNotFound)
}
})
});
let errs: Vec<Option<DiskError>> = join_all(futures)
.await
.into_iter()
.map(|e| e.unwrap_or(Some(DiskError::Unexpected)))
.collect();
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(err);
}
Ok(())
}
#[tracing::instrument(skip(disks))]
async fn cleanup_multipart_path(disks: &[Option<DiskStore>], paths: &[String]) {
let mut errs = Vec::with_capacity(disks.len());
// Use improved simple batch processor instead of join_all for better performance
let processor = get_global_processors().write_processor();
let tasks: Vec<_> = disks
.iter()
.map(|disk| {
let disk = disk.clone();
let paths = paths.to_vec();
async move {
if let Some(disk) = disk {
disk.delete_paths(RUSTFS_META_MULTIPART_BUCKET, &paths).await
} else {
Err(DiskError::DiskNotFound)
}
}
})
.collect();
let results = processor.execute_batch(tasks).await;
for result in results {
match result {
Ok(_) => {
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
if errs.iter().any(|e| e.is_some()) {
warn!("cleanup_multipart_path errs {:?}", &errs);
}
}
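// Reads part metadata (part.N.meta) for the given paths from all disks and
// reconciles the results: for each part, the ETag (or the meta path, when no
// ETag is present) seen most often must reach read_quorum, otherwise the part
// is reported as missing.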
async fn read_parts(
disks: &[Option<DiskStore>],
bucket: &str,
part_meta_paths: &[String],
part_numbers: &[usize],
read_quorum: usize,
) -> disk::error::Result<Vec<ObjectPartInfo>> {
let mut errs = Vec::with_capacity(disks.len());
let mut object_parts = Vec::with_capacity(disks.len());
// Use batch processor for better performance
let processor = get_global_processors().read_processor();
let bucket = bucket.to_string();
let part_meta_paths = part_meta_paths.to_vec();
let tasks: Vec<_> = disks
.iter()
.map(|disk| {
let disk = disk.clone();
let bucket = bucket.clone();
let part_meta_paths = part_meta_paths.clone();
async move {
if let Some(disk) = disk {
disk.read_parts(&bucket, &part_meta_paths).await
} else {
Err(DiskError::DiskNotFound)
}
}
})
.collect();
let results = processor.execute_batch(tasks).await;
for result in results {
match result {
Ok(res) => {
errs.push(None);
object_parts.push(res);
}
Err(e) => {
errs.push(Some(e));
object_parts.push(vec![]);
}
}
}
if let Some(err) = reduce_read_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, read_quorum) {
return Err(err);
}
let mut ret = vec![ObjectPartInfo::default(); part_meta_paths.len()];
for (part_idx, part_info) in part_meta_paths.iter().enumerate() {
let mut part_meta_quorum = HashMap::new();
let mut part_infos = Vec::new();
for parts in object_parts.iter() {
if parts.len() != part_meta_paths.len() {
*part_meta_quorum.entry(part_info.clone()).or_insert(0) += 1;
continue;
}
if !parts[part_idx].etag.is_empty() {
*part_meta_quorum.entry(parts[part_idx].etag.clone()).or_insert(0) += 1;
part_infos.push(parts[part_idx].clone());
continue;
}
*part_meta_quorum.entry(part_info.clone()).or_insert(0) += 1;
}
let mut max_quorum = 0;
let mut max_etag = None;
let mut max_part_meta = None;
for (etag, quorum) in part_meta_quorum.iter() {
if quorum > &max_quorum {
max_quorum = *quorum;
max_etag = Some(etag);
max_part_meta = Some(etag);
}
}
let mut found = None;
for info in part_infos.iter() {
if let Some(etag) = max_etag
&& info.etag == *etag
{
found = Some(info.clone());
break;
}
if let Some(part_meta) = max_part_meta
&& info.etag.is_empty()
&& part_meta.ends_with(format!("part.{0}.meta", info.number).as_str())
{
found = Some(info.clone());
break;
}
}
if let (Some(found), Some(max_etag)) = (found, max_etag)
&& !found.etag.is_empty()
&& part_meta_quorum.get(max_etag).unwrap_or(&0) >= &read_quorum
{
ret[part_idx] = found.clone();
} else {
ret[part_idx] = ObjectPartInfo {
number: part_numbers[part_idx],
error: Some(format!("part.{} not found", part_numbers[part_idx])),
..Default::default()
};
}
}
Ok(ret)
}
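// Lists the part numbers present under a multipart upload directory. A part
// counts on a drive only when both part.N and part.N.meta exist there, and is
// returned only when at least read_quorum drives agree.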
async fn list_parts(disks: &[Option<DiskStore>], part_path: &str, read_quorum: usize) -> disk::error::Result<Vec<usize>> {
let mut futures = Vec::with_capacity(disks.len());
for disk in disks.iter() {
futures.push(async move {
if let Some(disk) = disk {
disk.list_dir(RUSTFS_META_MULTIPART_BUCKET, RUSTFS_META_MULTIPART_BUCKET, part_path, -1)
.await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let mut errs = Vec::with_capacity(disks.len());
let mut object_parts = Vec::with_capacity(disks.len());
let results = join_all(futures).await;
for result in results {
match result {
Ok(res) => {
errs.push(None);
object_parts.push(res);
}
Err(e) => {
errs.push(Some(e));
object_parts.push(vec![]);
}
}
}
if let Some(err) = reduce_read_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, read_quorum) {
return Err(err);
}
let mut part_quorum_map: HashMap<usize, usize> = HashMap::new();
for drive_parts in object_parts {
let mut parts_with_meta_count: HashMap<usize, usize> = HashMap::new();
// part files can be either part.N or part.N.meta
for part_path in drive_parts {
if let Some(num_str) = part_path.strip_prefix("part.") {
if let Some(meta_idx) = num_str.find(".meta") {
if let Ok(part_num) = num_str[..meta_idx].parse::<usize>() {
*parts_with_meta_count.entry(part_num).or_insert(0) += 1;
}
} else if let Ok(part_num) = num_str.parse::<usize>() {
*parts_with_meta_count.entry(part_num).or_insert(0) += 1;
}
}
}
// Include only part.N.meta files with corresponding part.N
for (&part_num, &cnt) in &parts_with_meta_count {
if cnt >= 2 {
*part_quorum_map.entry(part_num).or_insert(0) += 1;
}
}
}
let mut part_numbers = Vec::with_capacity(part_quorum_map.len());
for (part_num, count) in part_quorum_map {
if count >= read_quorum {
part_numbers.push(part_num);
}
}
part_numbers.sort();
Ok(part_numbers)
}
#[tracing::instrument(skip(disks, meta))]
async fn rename_part(
disks: &[Option<DiskStore>],
src_bucket: &str,
src_object: &str,
dst_bucket: &str,
dst_object: &str,
meta: Bytes,
write_quorum: usize,
) -> disk::error::Result<Vec<Option<DiskStore>>> {
let src_bucket = Arc::new(src_bucket.to_string());
let src_object = Arc::new(src_object.to_string());
let dst_bucket = Arc::new(dst_bucket.to_string());
let dst_object = Arc::new(dst_object.to_string());
let mut errs = Vec::with_capacity(disks.len());
let futures = disks.iter().map(|disk| {
let disk = disk.clone();
let meta = meta.clone();
let src_bucket = src_bucket.clone();
let src_object = src_object.clone();
let dst_bucket = dst_bucket.clone();
let dst_object = dst_object.clone();
tokio::spawn(async move {
if let Some(disk) = disk {
disk.rename_part(&src_bucket, &src_object, &dst_bucket, &dst_object, meta)
.await
} else {
Err(DiskError::DiskNotFound)
}
})
});
let results = join_all(futures).await;
for result in results {
match result? {
Ok(_) => {
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
warn!("rename_part errs {:?}", &errs);
Self::cleanup_multipart_path(disks, &[dst_object.to_string(), format!("{dst_object}.meta")]).await;
return Err(err);
}
let disks = Self::eval_disks(disks, &errs);
Ok(disks)
}
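// Keeps only the disks whose corresponding operation succeeded; failed slots
// become None so later steps can tell online from offline disks by index.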
fn eval_disks(disks: &[Option<DiskStore>], errs: &[Option<DiskError>]) -> Vec<Option<DiskStore>> {
if disks.len() != errs.len() {
return Vec::new();
}
let mut online_disks = vec![None; disks.len()];
for (i, err_op) in errs.iter().enumerate() {
if err_op.is_none() {
online_disks[i].clone_from(&disks[i]);
}
}
online_disks
}
// async fn write_all(disks: &[Option<DiskStore>], bucket: &str, object: &str, buff: Vec<u8>) -> Vec<Option<Error>> {
// let mut futures = Vec::with_capacity(disks.len());
// let mut errors = Vec::with_capacity(disks.len());
// for disk in disks.iter() {
// if disk.is_none() {
// errors.push(Some(Error::new(DiskError::DiskNotFound)));
// continue;
// }
// let disk = disk.as_ref().unwrap();
// futures.push(disk.write_all(bucket, object, buff.clone()));
// }
// let results = join_all(futures).await;
// for result in results {
// match result {
// Ok(_) => {
// errors.push(None);
// }
// Err(e) => {
// errors.push(Some(e));
// }
// }
// }
// errors
// }
#[tracing::instrument(skip(disks, files))]
async fn write_unique_file_info(
disks: &[Option<DiskStore>],
org_bucket: &str,
bucket: &str,
prefix: &str,
files: &[FileInfo],
write_quorum: usize,
) -> disk::error::Result<()> {
let mut futures = Vec::with_capacity(disks.len());
let mut errs = Vec::with_capacity(disks.len());
for (i, disk) in disks.iter().enumerate() {
let mut file_info = files[i].clone();
file_info.erasure.index = i + 1;
futures.push(async move {
if let Some(disk) = disk {
disk.write_metadata(org_bucket, bucket, prefix, file_info).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(_) => {
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
// TODO: run these revert deletes concurrently
for (i, err) in errs.iter().enumerate() {
if err.is_some() {
continue;
}
if let Some(disk) = disks[i].as_ref() {
let _ = disk
.delete(
bucket,
&path_join_buf(&[prefix, STORAGE_FORMAT_FILE]),
DeleteOptions {
recursive: true,
..Default::default()
},
)
.await
.map_err(|e| {
warn!("write meta revert err {:?}", e);
e
});
}
}
return Err(err);
}
Ok(())
}
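// Resolves the on-disk directory for a multipart upload. Upload IDs are
// expected to be base64 of "<deployment-id>.<uuid>"; the part after the first
// '.' becomes the directory name, and IDs that do not decode or contain no '.'
// are used verbatim. A sketch of the mapping (values are illustrative only):
//   base64("dep.3f1e...") -> "<sha256(bucket/object)>/3f1e..."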
fn get_upload_id_dir(bucket: &str, object: &str, upload_id: &str) -> String {
// warn!("get_upload_id_dir upload_id {:?}", upload_id);
let upload_uuid = base64_decode(upload_id.as_bytes())
.and_then(|v| {
String::from_utf8(v).map_or(Ok(upload_id.to_owned()), |v| {
let parts: Vec<_> = v.splitn(2, '.').collect();
if parts.len() == 2 {
Ok(parts[1].to_string())
} else {
Ok(upload_id.to_string())
}
})
})
.unwrap_or_default();
format!("{}/{}", Self::get_multipart_sha_dir(bucket, object), upload_uuid)
}
fn get_multipart_sha_dir(bucket: &str, object: &str) -> String {
let path = format!("{bucket}/{object}");
let mut hasher = Sha256::new();
hasher.update(path);
hex(hasher.finalize())
}
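// Picks the parity count most drives agree on. A parity of -1 means
// "undefined" and is skipped; parity 0 marks a delete marker, which only needs
// a simple majority (n/2 + 1). Candidates seen on fewer drives than their own
// read quorum are ignored; returns -1 when nothing qualifies.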
fn common_parity(parities: &[i32], default_parity_count: i32) -> i32 {
let n = parities.len() as i32;
let mut occ_map: HashMap<i32, i32> = HashMap::new();
for &p in parities {
*occ_map.entry(p).or_insert(0) += 1;
}
let mut max_occ = 0;
let mut cparity = 0;
for (&parity, &occ) in &occ_map {
if parity == -1 {
// Ignore undefined parity
continue;
}
let mut read_quorum = n - parity;
if default_parity_count > 0 && parity == 0 {
// In this case, parity == 0 implies that this object version is a
// delete marker
read_quorum = n / 2 + 1;
}
if occ < read_quorum {
// Ignore this parity since we don't have enough shards for read quorum
continue;
}
if occ > max_occ {
max_occ = occ;
cparity = parity;
}
}
if max_occ == 0 {
// Did not find anything useful
return -1;
}
cparity
}
fn list_object_modtimes(parts_metadata: &[FileInfo], errs: &[Option<DiskError>]) -> Vec<Option<OffsetDateTime>> {
let mut times = vec![None; parts_metadata.len()];
for (i, metadata) in parts_metadata.iter().enumerate() {
if errs[i].is_some() {
continue;
}
times[i] = metadata.mod_time
}
times
}
fn common_time(times: &[Option<OffsetDateTime>], quorum: usize) -> Option<OffsetDateTime> {
let (time, count) = Self::common_time_and_occurrence(times);
if count >= quorum { time } else { None }
}
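// Returns the modification time shared by the most disks, preferring the
// latest timestamp on a tie, together with how many disks reported it.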
fn common_time_and_occurrence(times: &[Option<OffsetDateTime>]) -> (Option<OffsetDateTime>, usize) {
let mut time_occurrence_map = HashMap::new();
// Ignore the uuid sentinel and count the rest.
for time in times.iter().flatten() {
*time_occurrence_map.entry(time.unix_timestamp_nanos()).or_insert(0) += 1;
}
let mut maxima = 0; // Counter for remembering max occurrence of elements.
let mut latest = 0;
// Find the common cardinality from previously collected
// occurrences of elements.
for (&nano, &count) in &time_occurrence_map {
if count < maxima {
continue;
}
// We are at or above maxima
if count > maxima || nano > latest {
maxima = count;
latest = nano;
}
}
if latest == 0 {
return (None, maxima);
}
if let Ok(time) = OffsetDateTime::from_unix_timestamp_nanos(latest) {
(Some(time), maxima)
} else {
(None, maxima)
}
}
fn common_etag(etags: &[Option<String>], quorum: usize) -> Option<String> {
let (etag, count) = Self::common_etags(etags);
if count >= quorum { etag } else { None }
}
fn common_etags(etags: &[Option<String>]) -> (Option<String>, usize) {
let mut etags_map = HashMap::new();
for etag in etags.iter().flatten() {
*etags_map.entry(etag).or_insert(0) += 1;
}
let mut maxima = 0; // Counter for remembering max occurrence of elements.
let mut latest = None;
for (&etag, &count) in &etags_map {
if count < maxima {
continue;
}
// We are at or above maxima
if count > maxima {
maxima = count;
latest = Some(etag.clone());
}
}
(latest, maxima)
}
fn list_object_etags(parts_metadata: &[FileInfo], errs: &[Option<DiskError>]) -> Vec<Option<String>> {
let mut etags = vec![None; parts_metadata.len()];
for (i, metadata) in parts_metadata.iter().enumerate() {
if errs[i].is_some() {
continue;
}
if let Some(etag) = metadata.metadata.get("etag") {
etags[i] = Some(etag.clone())
}
}
etags
}
fn list_object_parities(parts_metadata: &[FileInfo], errs: &[Option<DiskError>]) -> Vec<i32> {
let total_shards = parts_metadata.len();
let half = total_shards as i32 / 2;
let mut parities: Vec<i32> = vec![-1; total_shards];
for (index, metadata) in parts_metadata.iter().enumerate() {
if errs[index].is_some() {
parities[index] = -1;
continue;
}
if !metadata.is_valid() {
parities[index] = -1;
continue;
}
if metadata.deleted || metadata.size == 0 {
parities[index] = half;
// } else if metadata.transition_status == "TransitionComplete" {
// TODO: metadata.transition_status
// parities[index] = total_shards - (total_shards / 2 + 1);
} else {
parities[index] = metadata.erasure.parity_blocks as i32;
}
}
parities
}
// Returns per object readQuorum and writeQuorum
// readQuorum is the min required disks to read data.
// writeQuorum is the min required disks to write data.
#[tracing::instrument(level = "debug", skip(parts_metadata))]
fn object_quorum_from_meta(
parts_metadata: &[FileInfo],
errs: &[Option<DiskError>],
default_parity_count: usize,
) -> disk::error::Result<(i32, i32)> {
let expected_rquorum = if default_parity_count == 0 {
parts_metadata.len()
} else {
parts_metadata.len() / 2
};
if let Some(err) = reduce_read_quorum_errs(errs, OBJECT_OP_IGNORED_ERRS, expected_rquorum) {
// let object = parts_metadata.first().map(|v| v.name.clone()).unwrap_or_default();
// error!("object_quorum_from_meta: {:?}, errs={:?}, object={:?}", err, errs, object);
return Err(err);
}
if default_parity_count == 0 {
return Ok((parts_metadata.len() as i32, parts_metadata.len() as i32));
}
let parities = Self::list_object_parities(parts_metadata, errs);
let parity_blocks = Self::common_parity(&parities, default_parity_count as i32);
if parity_blocks < 0 {
error!("object_quorum_from_meta: parity_blocks < 0, errs={:?}", errs);
return Err(DiskError::ErasureReadQuorum);
}
let data_blocks = parts_metadata.len() as i32 - parity_blocks;
let write_quorum = if data_blocks == parity_blocks {
data_blocks + 1
} else {
data_blocks
};
Ok((data_blocks, write_quorum))
}
#[tracing::instrument(level = "debug", skip(disks, parts_metadata))]
fn list_online_disks(
disks: &[Option<DiskStore>],
parts_metadata: &[FileInfo],
errs: &[Option<DiskError>],
quorum: usize,
) -> (Vec<Option<DiskStore>>, Option<OffsetDateTime>, Option<String>) {
let mod_times = Self::list_object_modtimes(parts_metadata, errs);
let etags = Self::list_object_etags(parts_metadata, errs);
let mod_time = Self::common_time(&mod_times, quorum);
let etag = Self::common_etag(&etags, quorum);
let mut new_disk = vec![None; disks.len()];
for (i, &t) in mod_times.iter().enumerate() {
if parts_metadata[i].is_valid() && mod_time == t {
new_disk[i].clone_from(&disks[i]);
}
}
(new_disk, mod_time, etag)
}
#[tracing::instrument(level = "debug", skip(self))]
async fn check_upload_id_exists(
&self,
bucket: &str,
object: &str,
upload_id: &str,
write: bool,
) -> Result<(FileInfo, Vec<FileInfo>)> {
let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
let disks = self.disks.read().await;
let disks = disks.clone();
let (parts_metadata, errs) =
Self::read_all_fileinfo(&disks, bucket, RUSTFS_META_MULTIPART_BUCKET, &upload_id_path, "", false, false).await?;
let map_err_notfound = |err: DiskError| {
if err == DiskError::FileNotFound {
return StorageError::InvalidUploadID(bucket.to_owned(), object.to_owned(), upload_id.to_owned());
}
err.into()
};
let (read_quorum, write_quorum) =
Self::object_quorum_from_meta(&parts_metadata, &errs, self.default_parity_count).map_err(map_err_notfound)?;
if read_quorum < 0 {
error!("check_upload_id_exists: read_quorum < 0, errs={:?}", errs);
return Err(Error::ErasureReadQuorum);
}
if write_quorum < 0 {
return Err(Error::ErasureWriteQuorum);
}
let mut quorum = read_quorum as usize;
if write {
quorum = write_quorum as usize;
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, quorum) {
return Err(map_err_notfound(err));
}
} else if let Some(err) = reduce_read_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, quorum) {
return Err(map_err_notfound(err));
}
let (_, mod_time, etag) = Self::list_online_disks(&disks, &parts_metadata, &errs, quorum);
let fi = Self::pick_valid_fileinfo(&parts_metadata, mod_time, etag, quorum)?;
Ok((fi, parts_metadata))
}
fn pick_valid_fileinfo(
metas: &[FileInfo],
mod_time: Option<OffsetDateTime>,
etag: Option<String>,
quorum: usize,
) -> disk::error::Result<FileInfo> {
Self::find_file_info_in_quorum(metas, &mod_time, &etag, quorum)
}
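// Selects the canonical FileInfo among the per-disk copies: each valid copy
// whose mod time (or ETag, when no mod time is given) matches is hashed over
// its part numbers/sizes and erasure layout, and the hash produced by at least
// `quorum` disks wins. Fails with ErasureReadQuorum otherwise.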
fn find_file_info_in_quorum(
metas: &[FileInfo],
mod_time: &Option<OffsetDateTime>,
etag: &Option<String>,
quorum: usize,
) -> disk::error::Result<FileInfo> {
if quorum < 1 {
error!("find_file_info_in_quorum: quorum < 1");
return Err(DiskError::ErasureReadQuorum);
}
let mut meta_hashes = vec![None; metas.len()];
let mut hasher = Sha256::new();
for (i, meta) in metas.iter().enumerate() {
if !meta.is_valid() {
continue;
}
let etag_only = mod_time.is_none() && etag.is_some() && meta.get_etag().is_some_and(|v| &v == etag.as_ref().unwrap());
let mod_valid = mod_time == &meta.mod_time;
if etag_only || mod_valid {
for part in meta.parts.iter() {
let _ = hasher.write(format!("part.{}", part.number).as_bytes())?;
let _ = hasher.write(format!("part.{}", part.size).as_bytes())?;
}
if !meta.deleted && meta.size != 0 {
let _ = hasher.write(format!("{}+{}", meta.erasure.data_blocks, meta.erasure.parity_blocks).as_bytes())?;
let _ = hasher.write(format!("{:?}", meta.erasure.distribution).as_bytes())?;
}
if meta.is_remote() {
// TODO:
}
// TODO: IsEncrypted
// TODO: IsCompressed
hasher.flush()?;
meta_hashes[i] = Some(hex(hasher.clone().finalize().as_slice()));
hasher.reset();
}
}
let mut count_map = HashMap::new();
for hash in meta_hashes.iter().flatten() {
*count_map.entry(hash).or_insert(0) += 1;
}
let mut max_val = None;
let mut max_count = 0;
for (&val, &count) in &count_map {
if count > max_count {
max_val = Some(val);
max_count = count;
}
}
if max_count < quorum {
error!("find_file_info_in_quorum: max_count < quorum, max_val={:?}", max_val);
return Err(DiskError::ErasureReadQuorum);
}
let mut found_fi = None;
let mut found = false;
let mut valid_obj_map = HashMap::new();
for (i, op_hash) in meta_hashes.iter().enumerate() {
if let Some(hash) = op_hash {
if let Some(max_hash) = max_val {
if hash == max_hash {
if metas[i].is_valid() && !found {
found_fi = Some(metas[i].clone());
found = true;
}
let props = ObjProps {
mod_time: metas[i].mod_time,
num_versions: metas[i].num_versions,
};
*valid_obj_map.entry(props).or_insert(0) += 1;
}
}
}
}
if found {
let mut fi = found_fi.unwrap();
for (val, &count) in &valid_obj_map {
if count > quorum {
fi.mod_time = val.mod_time;
fi.num_versions = val.num_versions;
fi.is_latest = val.mod_time.is_none();
break;
}
}
return Ok(fi);
}
error!("find_file_info_in_quorum: fileinfo not found");
Err(DiskError::ErasureReadQuorum)
}
#[tracing::instrument(level = "debug", skip(disks))]
async fn read_all_fileinfo(
disks: &[Option<DiskStore>],
org_bucket: &str,
bucket: &str,
object: &str,
version_id: &str,
read_data: bool,
healing: bool,
) -> disk::error::Result<(Vec<FileInfo>, Vec<Option<DiskError>>)> {
let mut ress = Vec::with_capacity(disks.len());
let mut errors = Vec::with_capacity(disks.len());
let opts = Arc::new(ReadOptions {
read_data,
healing,
..Default::default()
});
let org_bucket = Arc::new(org_bucket.to_string());
let bucket = Arc::new(bucket.to_string());
let object = Arc::new(object.to_string());
let version_id = Arc::new(version_id.to_string());
let futures = disks.iter().map(|disk| {
let disk = disk.clone();
let opts = opts.clone();
let org_bucket = org_bucket.clone();
let bucket = bucket.clone();
let object = object.clone();
let version_id = version_id.clone();
tokio::spawn(async move {
if let Some(disk) = disk {
if version_id.is_empty() {
match disk.read_xl(&bucket, &object, read_data).await {
Ok(info) => {
let fi = file_info_from_raw(info, &bucket, &object, read_data).await?;
Ok(fi)
}
Err(err) => Err(err),
}
} else {
disk.read_version(&org_bucket, &bucket, &object, &version_id, &opts).await
}
} else {
Err(DiskError::DiskNotFound)
}
})
});
// Wait for all tasks to complete
let results = join_all(futures).await;
for result in results {
match result {
Ok(res) => match res {
Ok(file_info) => {
ress.push(file_info);
errors.push(None);
}
Err(e) => {
ress.push(FileInfo::default());
errors.push(Some(e));
}
},
Err(_) => {
ress.push(FileInfo::default());
errors.push(Some(DiskError::Unexpected));
}
}
}
Ok((ress, errors))
}
// Optimized version using batch processor with quorum support
pub async fn read_version_optimized(
&self,
bucket: &str,
object: &str,
version_id: &str,
opts: &ReadOptions,
) -> Result<Vec<rustfs_filemeta::FileInfo>> {
// Use existing disk selection logic
let disks = self.disks.read().await;
let required_reads = self.format.erasure.sets.len();
// Clone parameters outside the closure to avoid lifetime issues
let bucket = bucket.to_string();
let object = object.to_string();
let version_id = version_id.to_string();
let opts = opts.clone();
let processor = get_global_processors().read_processor();
let tasks: Vec<_> = disks
.iter()
.take(required_reads + 2) // Read a few extra for reliability
.filter_map(|disk| {
disk.as_ref().map(|d| {
let disk = d.clone();
let bucket = bucket.clone();
let object = object.clone();
let version_id = version_id.clone();
let opts = opts.clone();
async move { disk.read_version(&bucket, &bucket, &object, &version_id, &opts).await }
})
})
.collect();
match processor.execute_batch_with_quorum(tasks, required_reads).await {
Ok(results) => Ok(results),
Err(_) => Err(DiskError::FileNotFound.into()), // Use existing error type
}
}
async fn read_all_xl(
disks: &[Option<DiskStore>],
bucket: &str,
object: &str,
read_data: bool,
incl_free_vers: bool,
) -> (Vec<FileInfo>, Vec<Option<DiskError>>) {
let (fileinfos, errs) = Self::read_all_raw_file_info(disks, bucket, object, read_data).await;
Self::pick_latest_quorum_files_info(fileinfos, errs, bucket, object, read_data, incl_free_vers).await
}
async fn read_all_raw_file_info(
disks: &[Option<DiskStore>],
bucket: &str,
object: &str,
read_data: bool,
) -> (Vec<Option<RawFileInfo>>, Vec<Option<DiskError>>) {
let mut ress = Vec::with_capacity(disks.len());
let mut errors = Vec::with_capacity(disks.len());
let mut futures = Vec::with_capacity(disks.len());
for disk in disks.iter() {
futures.push(async move {
if let Some(disk) = disk {
disk.read_xl(bucket, object, read_data).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(res) => {
ress.push(Some(res));
errors.push(None);
}
Err(e) => {
ress.push(None);
errors.push(Some(e));
}
}
}
(ress, errors)
}
async fn pick_latest_quorum_files_info(
fileinfos: Vec<Option<RawFileInfo>>,
errs: Vec<Option<DiskError>>,
bucket: &str,
object: &str,
read_data: bool,
_incl_free_vers: bool,
) -> (Vec<FileInfo>, Vec<Option<DiskError>>) {
let mut metadata_array = vec![None; fileinfos.len()];
let mut meta_file_infos = vec![FileInfo::default(); fileinfos.len()];
let mut metadata_shallow_versions = vec![None; fileinfos.len()];
let mut v2_bufs = {
if !read_data {
vec![Vec::new(); fileinfos.len()]
} else {
Vec::new()
}
};
let mut errs = errs;
for (idx, info_op) in fileinfos.iter().enumerate() {
if let Some(info) = info_op {
if !read_data {
v2_bufs[idx] = info.buf.clone();
}
let xlmeta = match FileMeta::load(&info.buf) {
Ok(res) => res,
Err(err) => {
errs[idx] = Some(err.into());
continue;
}
};
metadata_array[idx] = Some(xlmeta);
meta_file_infos[idx] = FileInfo::default();
}
}
for (idx, info_op) in metadata_array.iter().enumerate() {
if let Some(info) = info_op {
metadata_shallow_versions[idx] = Some(info.versions.clone());
}
}
let shallow_versions: Vec<Vec<FileMetaShallowVersion>> = metadata_shallow_versions.iter().flatten().cloned().collect();
let read_quorum = fileinfos.len().div_ceil(2);
let versions = merge_file_meta_versions(read_quorum, false, 1, &shallow_versions);
let meta = FileMeta {
versions,
..Default::default()
};
let finfo = match meta.into_fileinfo(bucket, object, "", true, true) {
Ok(res) => res,
Err(err) => {
for item in errs.iter_mut() {
if item.is_none() {
*item = Some(err.clone().into());
}
}
return (meta_file_infos, errs);
}
};
if !finfo.is_valid() {
for item in errs.iter_mut() {
if item.is_none() {
*item = Some(DiskError::FileCorrupt);
}
}
return (meta_file_infos, errs);
}
let vid = finfo.version_id.unwrap_or(Uuid::nil());
for (idx, meta_op) in metadata_array.iter().enumerate() {
if let Some(meta) = meta_op {
match meta.into_fileinfo(bucket, object, vid.to_string().as_str(), read_data, true) {
Ok(res) => meta_file_infos[idx] = res,
Err(err) => errs[idx] = Some(err.into()),
}
}
}
(meta_file_infos, errs)
}
async fn read_multiple_files(disks: &[Option<DiskStore>], req: ReadMultipleReq, read_quorum: usize) -> Vec<ReadMultipleResp> {
let mut futures = Vec::with_capacity(disks.len());
let mut ress = Vec::with_capacity(disks.len());
let mut errors = Vec::with_capacity(disks.len());
for disk in disks.iter() {
let req = req.clone();
futures.push(async move {
if let Some(disk) = disk {
disk.read_multiple(req).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(res) => {
ress.push(Some(res));
errors.push(None);
}
Err(e) => {
ress.push(None);
errors.push(Some(e));
}
}
}
// debug!("ReadMultipleResp ress {:?}", ress);
// debug!("ReadMultipleResp errors {:?}", errors);
let mut ret = Vec::with_capacity(req.files.len());
for want in req.files.iter() {
let mut quorum = 0;
let mut get_res = ReadMultipleResp::default();
for res in ress.iter() {
if res.is_none() {
continue;
}
let disk_res = res.as_ref().unwrap();
for resp in disk_res.iter() {
if !resp.error.is_empty() || !resp.exists {
continue;
}
if &resp.file != want || resp.bucket != req.bucket || resp.prefix != req.prefix {
continue;
}
quorum += 1;
if get_res.mod_time > resp.mod_time || get_res.data.len() > resp.data.len() {
continue;
}
get_res = resp.clone();
}
}
if quorum < read_quorum {
// debug!("quorum < read_quorum: {} < {}", quorum, read_quorum);
get_res.exists = false;
get_res.error = Error::ErasureReadQuorum.to_string();
get_res.data = Vec::new();
}
ret.push(get_res);
}
// log err
ret
}
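// Walks every slot in the set and re-establishes any disk that is offline or
// has lost its set location: the stale handle is closed and the endpoint is
// re-dialed via renew_disk.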
pub async fn connect_disks(&self) {
let rl = self.disks.read().await;
let disks = rl.clone();
// release the read lock early
drop(rl);
for (i, opdisk) in disks.iter().enumerate() {
if let Some(disk) = opdisk {
if disk.is_online().await && disk.get_disk_location().set_idx.is_some() {
info!("Disk {:?} is online", disk.to_string());
continue;
}
let _ = disk.close().await;
}
if let Some(endpoint) = self.set_endpoints.get(i) {
info!("will renew disk, opdisk: {:?}", opdisk);
self.renew_disk(endpoint).await;
}
}
}
pub async fn renew_disk(&self, ep: &Endpoint) {
debug!("renew_disk start {:?}", ep);
let (new_disk, fm) = match Self::connect_endpoint(ep).await {
Ok(res) => res,
Err(e) => {
warn!("connect_endpoint err {:?}", &e);
if ep.is_local && e == DiskError::UnformattedDisk {
info!("unformatteddisk will trigger heal_disk, {:?}", ep);
let set_disk_id = format!("pool_{}_set_{}", ep.pool_idx, ep.set_idx);
let _ = send_heal_disk(set_disk_id, Some(HealChannelPriority::Normal)).await;
}
return;
}
};
let (set_idx, disk_idx) = match self.find_disk_index(&fm) {
Ok(res) => res,
Err(e) => {
warn!("find_disk_index err {:?}", e);
return;
}
};
// check that the endpoint is consistent with the loaded format
let _ = new_disk.set_disk_id(Some(fm.erasure.this)).await;
if new_disk.is_local() {
let mut global_local_disk_map = GLOBAL_LOCAL_DISK_MAP.write().await;
let path = new_disk.endpoint().to_string();
global_local_disk_map.insert(path, Some(new_disk.clone()));
if is_dist_erasure().await {
let mut local_set_drives = GLOBAL_LOCAL_DISK_SET_DRIVES.write().await;
local_set_drives[self.pool_index][set_idx][disk_idx] = Some(new_disk.clone());
}
}
debug!("renew_disk update {:?}", fm.erasure.this);
let mut disk_lock = self.disks.write().await;
disk_lock[disk_idx] = Some(new_disk);
}
fn find_disk_index(&self, fm: &FormatV3) -> Result<(usize, usize)> {
self.format.check_other(fm)?;
if fm.erasure.this.is_nil() {
return Err(Error::other("DriveID: offline"));
}
for i in 0..self.format.erasure.sets.len() {
for j in 0..self.format.erasure.sets[0].len() {
if fm.erasure.this == self.format.erasure.sets[i][j] {
return Ok((i, j));
}
}
}
Err(Error::other("DriveID: not found"))
}
async fn connect_endpoint(ep: &Endpoint) -> disk::error::Result<(DiskStore, FormatV3)> {
let disk = new_disk(ep, &DiskOption::default()).await?;
let fm = load_format_erasure(&disk, false).await?;
Ok((disk, fm))
}
// pub async fn walk_dir(&self, opts: &WalkDirOptions) -> (Vec<Option<Vec<MetaCacheEntry>>>, Vec<Option<Error>>) {
// let disks = self.disks.read().await;
// let disks = disks.clone();
// let mut futures = Vec::new();
// let mut errs = Vec::new();
// let mut ress = Vec::new();
// for disk in disks.iter() {
// let opts = opts.clone();
// futures.push(async move {
// if let Some(disk) = disk {
// disk.walk_dir(opts, &mut Writer::NotUse).await
// } else {
// Err(DiskError::DiskNotFound)
// }
// });
// }
// let results = join_all(futures).await;
// for res in results {
// match res {
// Ok(entries) => {
// ress.push(Some(entries));
// errs.push(None);
// }
// Err(e) => {
// ress.push(None);
// errs.push(Some(e));
// }
// }
// }
// (ress, errs)
// }
// async fn remove_object_part(
// &self,
// bucket: &str,
// object: &str,
// upload_id: &str,
// data_dir: &str,
// part_num: usize,
// ) -> Result<()> {
// let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
// let disks = self.disks.read().await;
// let disks = disks.clone();
// let file_path = format!("{}/{}/part.{}", upload_id_path, data_dir, part_num);
// let mut futures = Vec::with_capacity(disks.len());
// let mut errors = Vec::with_capacity(disks.len());
// for disk in disks.iter() {
// let file_path = file_path.clone();
// let meta_file_path = format!("{}.meta", file_path);
// futures.push(async move {
// if let Some(disk) = disk {
// disk.delete(RUSTFS_META_MULTIPART_BUCKET, &file_path, DeleteOptions::default())
// .await?;
// disk.delete(RUSTFS_META_MULTIPART_BUCKET, &meta_file_path, DeleteOptions::default())
// .await
// } else {
// Err(DiskError::DiskNotFound)
// }
// });
// }
// let results = join_all(futures).await;
// for result in results {
// match result {
// Ok(_) => {
// errors.push(None);
// }
// Err(e) => {
// errors.push(Some(e));
// }
// }
// }
// Ok(())
// }
// async fn remove_part_meta(&self, bucket: &str, object: &str, upload_id: &str, data_dir: &str, part_num: usize) -> Result<()> {
// let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
// let disks = self.disks.read().await;
// let disks = disks.clone();
// // let disks = Self::shuffle_disks(&disks, &fi.erasure.distribution);
// let file_path = format!("{}/{}/part.{}.meta", upload_id_path, data_dir, part_num);
// let mut futures = Vec::with_capacity(disks.len());
// let mut errors = Vec::with_capacity(disks.len());
// for disk in disks.iter() {
// let file_path = file_path.clone();
// futures.push(async move {
// if let Some(disk) = disk {
// disk.delete(RUSTFS_META_MULTIPART_BUCKET, &file_path, DeleteOptions::default())
// .await
// } else {
// Err(DiskError::DiskNotFound)
// }
// });
// }
// let results = join_all(futures).await;
// for result in results {
// match result {
// Ok(_) => {
// errors.push(None);
// }
// Err(e) => {
// errors.push(Some(e));
// }
// }
// }
// Ok(())
// }
#[tracing::instrument(skip(self))]
pub async fn delete_all(&self, bucket: &str, prefix: &str) -> Result<()> {
let disks = self.disks.read().await;
let disks = disks.clone();
let mut futures = Vec::with_capacity(disks.len());
let mut errors = Vec::with_capacity(disks.len());
for disk in disks.iter() {
futures.push(async move {
if let Some(disk) = disk {
disk.delete(
bucket,
prefix,
DeleteOptions {
recursive: true,
..Default::default()
},
)
.await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(_) => {
errors.push(None);
}
Err(e) => {
errors.push(Some(e));
}
}
}
// debug!("delete_all errs {:?}", &errors);
Ok(())
}
// Shuffle disks and parts metadata into erasure-distribution order, trusting
// the index recorded in each part's metadata when it is consistent.
fn shuffle_disks_and_parts_metadata_by_index(
disks: &[Option<DiskStore>],
parts_metadata: &[FileInfo],
fi: &FileInfo,
) -> (Vec<Option<DiskStore>>, Vec<FileInfo>) {
let mut shuffled_disks = vec![None; disks.len()];
let mut shuffled_parts_metadata = vec![FileInfo::default(); parts_metadata.len()];
let distribution = &fi.erasure.distribution;
let mut inconsistent = 0;
for (k, v) in parts_metadata.iter().enumerate() {
if disks[k].is_none() {
inconsistent += 1;
continue;
}
if !v.is_valid() {
inconsistent += 1;
continue;
}
if distribution[k] != v.erasure.index {
inconsistent += 1;
continue;
}
let block_idx = distribution[k];
shuffled_parts_metadata[block_idx - 1] = parts_metadata[k].clone();
shuffled_disks[block_idx - 1].clone_from(&disks[k]);
}
if inconsistent < fi.erasure.parity_blocks {
return (shuffled_disks, shuffled_parts_metadata);
}
Self::shuffle_disks_and_parts_metadata(disks, parts_metadata, fi)
}
// Shuffle disks and parts metadata into erasure-distribution order.
fn shuffle_disks_and_parts_metadata(
disks: &[Option<DiskStore>],
parts_metadata: &[FileInfo],
fi: &FileInfo,
) -> (Vec<Option<DiskStore>>, Vec<FileInfo>) {
let init = fi.mod_time.is_none();
let mut shuffled_disks = vec![None; disks.len()];
let mut shuffled_parts_metadata = vec![FileInfo::default(); parts_metadata.len()];
let distribution = &fi.erasure.distribution;
for (k, v) in disks.iter().enumerate() {
if v.is_none() {
continue;
}
if !init && !parts_metadata[k].is_valid() {
continue;
}
// if !init && fi.xlv1 != parts_metadata[k].xlv1 {
// continue;
// }
let block_idx = distribution[k];
shuffled_parts_metadata[block_idx - 1] = parts_metadata[k].clone();
shuffled_disks[block_idx - 1].clone_from(&disks[k]);
}
(shuffled_disks, shuffled_parts_metadata)
}
// Return shuffled partsMetadata depending on distribution.
fn shuffle_parts_metadata(parts_metadata: &[FileInfo], distribution: &[usize]) -> Vec<FileInfo> {
if distribution.is_empty() {
return parts_metadata.to_vec();
}
let mut shuffled_parts_metadata = vec![FileInfo::default(); parts_metadata.len()];
// Shuffle slice xl metadata for expected distribution.
for index in 0..parts_metadata.len() {
let block_index = distribution[index];
shuffled_parts_metadata[block_index - 1] = parts_metadata[index].clone();
}
shuffled_parts_metadata
}
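// distribution[i] is the 1-based erasure index served by slot i, so element i
// lands at position distribution[i] - 1. Example: with distribution [2, 1, 3],
// input [a, b, c] is shuffled to [b, a, c].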
// shuffle_disks TODO: use the original values
fn shuffle_disks(disks: &[Option<DiskStore>], distribution: &[usize]) -> Vec<Option<DiskStore>> {
if distribution.is_empty() {
return disks.to_vec();
}
let mut shuffled_disks = vec![None; disks.len()];
for (i, v) in disks.iter().enumerate() {
let idx = distribution[i];
shuffled_disks[idx - 1].clone_from(v);
}
shuffled_disks
}
#[tracing::instrument(level = "debug", skip(self))]
async fn get_object_fileinfo(
&self,
bucket: &str,
object: &str,
opts: &ObjectOptions,
read_data: bool,
) -> Result<(FileInfo, Vec<FileInfo>, Vec<Option<DiskStore>>)> {
let disks = self.disks.read().await;
let disks = disks.clone();
let vid = opts.version_id.clone().unwrap_or_default();
// TODO: optimize concurrency; abort early once enough disks have responded
let (parts_metadata, errs) = Self::read_all_fileinfo(&disks, "", bucket, object, vid.as_str(), read_data, false).await?;
// warn!("get_object_fileinfo parts_metadata {:?}", &parts_metadata);
// warn!("get_object_fileinfo {}/{} errs {:?}", bucket, object, &errs);
let _min_disks = self.set_drive_count - self.default_parity_count;
let (read_quorum, _) = match Self::object_quorum_from_meta(&parts_metadata, &errs, self.default_parity_count)
.map_err(|err| to_object_err(err.into(), vec![bucket, object]))
{
Ok(v) => v,
Err(e) => {
// error!("Self::object_quorum_from_meta: {:?}, bucket: {}, object: {}", &e, bucket, object);
return Err(e);
}
};
if let Some(err) = reduce_read_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, read_quorum as usize) {
error!("reduce_read_quorum_errs: {:?}, bucket: {}, object: {}", &err, bucket, object);
return Err(to_object_err(err.into(), vec![bucket, object]));
}
let (op_online_disks, mod_time, etag) = Self::list_online_disks(&disks, &parts_metadata, &errs, read_quorum as usize);
let fi = Self::pick_valid_fileinfo(&parts_metadata, mod_time, etag, read_quorum as usize)?;
if errs.iter().any(|err| err.is_some()) {
let _ =
rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
fi.volume.to_string(), // bucket
Some(fi.name.to_string()), // object_prefix
false, // force_start
Some(HealChannelPriority::Normal), // priority
Some(self.pool_index), // pool_index
Some(self.set_index), // set_index
))
.await;
}
// debug!("get_object_fileinfo pick fi {:?}", &fi);
// let online_disks: Vec<Option<DiskStore>> = op_online_disks.iter().filter(|v| v.is_some()).cloned().collect();
Ok((fi, parts_metadata, op_online_disks))
}
async fn get_object_info_and_quorum(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<(ObjectInfo, usize)> {
let (fi, _, _) = self.get_object_fileinfo(bucket, object, opts, false).await?;
let write_quorum = fi.write_quorum(self.default_write_quorum());
let oi = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
// TODO: replication
if fi.deleted {
if opts.version_id.is_none() || opts.delete_marker {
return Err(to_object_err(StorageError::FileNotFound, vec![bucket, object]));
} else {
return Err(to_object_err(StorageError::MethodNotAllowed, vec![bucket, object]));
}
}
Ok((oi, write_quorum))
}
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(
level = "debug",
skip( writer,disks,fi,files),
fields(start_time=?time::OffsetDateTime::now_utc())
)]
async fn get_object_with_fileinfo<W>(
// &self,
bucket: &str,
object: &str,
offset: usize,
length: i64,
writer: &mut W,
fi: FileInfo,
files: Vec<FileInfo>,
disks: &[Option<DiskStore>],
set_index: usize,
pool_index: usize,
) -> Result<()>
where
W: AsyncWrite + Send + Sync + Unpin + 'static,
{
let (disks, files) = Self::shuffle_disks_and_parts_metadata_by_index(disks, &files, &fi);
let total_size = fi.size as usize;
let length = if length < 0 {
fi.size as usize - offset
} else {
length as usize
};
if offset > total_size || offset + length > total_size {
error!("get_object_with_fileinfo offset out of range: {}, total_size: {}", offset, total_size);
return Err(Error::other("offset out of range"));
}
let (part_index, mut part_offset) = fi.to_part_offset(offset)?;
// debug!(
// "get_object_with_fileinfo start offset:{}, part_index:{},part_offset:{}",
// offset, part_index, part_offset
// );
let mut end_offset = offset;
if length > 0 {
end_offset += length - 1
}
let (last_part_index, _) = fi.to_part_offset(end_offset)?;
let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size);
let mut total_read = 0;
for i in part_index..=last_part_index {
if total_read == length {
break;
}
let part_number = fi.parts[i].number;
let part_size = fi.parts[i].size;
let mut part_length = part_size - part_offset;
if part_length > (length - total_read) {
part_length = length - total_read
}
let till_offset = erasure.shard_file_offset(part_offset, part_length, part_size);
let read_offset = (part_offset / erasure.block_size) * erasure.shard_size();
let mut readers = Vec::with_capacity(disks.len());
let mut errors = Vec::with_capacity(disks.len());
for (idx, disk_op) in disks.iter().enumerate() {
match create_bitrot_reader(
files[idx].data.as_deref(),
disk_op.as_ref(),
bucket,
&format!("{}/{}/part.{}", object, files[idx].data_dir.unwrap_or_default(), part_number),
read_offset,
till_offset,
erasure.shard_size(),
HashAlgorithm::HighwayHash256,
)
.await
{
Ok(Some(reader)) => {
readers.push(Some(reader));
errors.push(None);
}
Ok(None) => {
readers.push(None);
errors.push(Some(DiskError::DiskNotFound));
}
Err(e) => {
readers.push(None);
errors.push(Some(e));
}
}
}
let nil_count = errors.iter().filter(|&e| e.is_none()).count();
if nil_count < erasure.data_shards {
if let Some(read_err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, erasure.data_shards) {
error!("create_bitrot_reader reduce_read_quorum_errs {:?}", &errors);
return Err(to_object_err(read_err.into(), vec![bucket, object]));
}
error!("create_bitrot_reader not enough disks to read: {:?}", &errors);
return Err(Error::other(format!("not enough disks to read: {errors:?}")));
}
// debug!(
// "read part {} part_offset {},part_length {},part_size {} ",
// part_number, part_offset, part_length, part_size
// );
let (written, err) = erasure.decode(writer, readers, part_offset, part_length, part_size).await;
if let Some(e) = err {
let de_err: DiskError = e.into();
let mut has_err = true;
if written == part_length {
match de_err {
DiskError::FileNotFound | DiskError::FileCorrupt => {
error!("erasure.decode err 111 {:?}", &de_err);
let _ = rustfs_common::heal_channel::send_heal_request(
rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(HealChannelPriority::Normal),
Some(pool_index),
Some(set_index),
),
)
.await;
has_err = false;
}
_ => {}
}
}
if has_err {
error!("erasure.decode err {} {:?}", written, &de_err);
return Err(de_err.into());
}
}
// debug!("ec decode {} written size {}", part_number, n);
total_read += part_length;
part_offset = 0;
}
// debug!("read end");
Ok(())
}
async fn update_object_meta(
&self,
bucket: &str,
object: &str,
fi: FileInfo,
disks: &[Option<DiskStore>],
) -> disk::error::Result<()> {
self.update_object_meta_with_opts(bucket, object, fi, disks, &UpdateMetadataOpts::default())
.await
}
async fn update_object_meta_with_opts(
&self,
bucket: &str,
object: &str,
fi: FileInfo,
disks: &[Option<DiskStore>],
opts: &UpdateMetadataOpts,
) -> disk::error::Result<()> {
if fi.metadata.is_empty() {
return Ok(());
}
let mut futures = Vec::with_capacity(disks.len());
let mut errs = Vec::with_capacity(disks.len());
for disk in disks.iter() {
let fi = fi.clone();
futures.push(async move {
if let Some(disk) = disk {
disk.update_metadata(bucket, object, fi, opts).await
} else {
Err(DiskError::DiskNotFound)
}
})
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(_) => {
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, fi.write_quorum(self.default_write_quorum())) {
return Err(err);
}
Ok(())
}
async fn get_online_disk_with_healing(&self, incl_healing: bool) -> Result<(Vec<Option<DiskStore>>, bool)> {
let (new_disks, _, healing) = self.get_online_disk_with_healing_and_info(incl_healing).await?;
Ok((new_disks, healing > 0))
}
async fn get_online_disk_with_healing_and_info(
&self,
incl_healing: bool,
) -> Result<(Vec<Option<DiskStore>>, Vec<DiskInfo>, usize)> {
let mut infos = vec![DiskInfo::default(); self.disks.read().await.len()];
for (idx, disk) in self.disks.read().await.iter().enumerate() {
if let Some(disk) = disk {
match disk.disk_info(&DiskInfoOptions::default()).await {
Ok(disk_info) => infos[idx] = disk_info,
Err(err) => infos[idx].error = err.to_string(),
}
} else {
infos[idx].error = "disk not found".to_string();
}
}
let mut new_disks = Vec::new();
let mut healing_disks = Vec::new();
let mut scanning_disks = Vec::new();
let mut new_infos = Vec::new();
let mut healing_infos = Vec::new();
let mut scanning_infos = Vec::new();
let mut healing = 0;
infos.iter().zip(self.disks.read().await.iter()).for_each(|(info, disk)| {
if info.error.is_empty() {
if info.healing {
healing += 1;
if incl_healing {
healing_disks.push(disk.clone());
healing_infos.push(info.clone());
}
} else if !info.scanning {
new_disks.push(disk.clone());
new_infos.push(info.clone());
} else {
scanning_disks.push(disk.clone());
scanning_infos.push(info.clone());
}
}
});
// Prefer non-scanning disks over disks which are currently being scanned.
new_disks.extend(scanning_disks);
new_infos.extend(scanning_infos);
// Then add healing disks.
new_disks.extend(healing_disks);
new_infos.extend(healing_infos);
Ok((new_disks, new_infos, healing))
}
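// Heals a single object version: reads every xl.meta copy, derives the read
// quorum and the latest valid FileInfo, identifies outdated disks, then
// reconstructs missing shards into a temporary directory and renames them into
// place. Dangling objects that can no longer satisfy quorum are deleted
// instead of healed.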
async fn heal_object(
&self,
bucket: &str,
object: &str,
version_id: &str,
opts: &HealOpts,
) -> disk::error::Result<(HealResultItem, Option<DiskError>)> {
info!(
"SetDisks heal_object: bucket={}, object={}, version_id={}, opts={:?}",
bucket, object, version_id, opts
);
let mut result = HealResultItem {
heal_item_type: HealItemType::Object.to_string(),
bucket: bucket.to_string(),
object: object.to_string(),
version_id: version_id.to_string(),
disk_count: self.disks.read().await.len(),
..Default::default()
};
let _write_lock_guard = if !opts.no_lock {
info!("Acquiring write lock for object: {}, owner: {}", object, self.locker_owner);
// Check if lock is already held
let key = rustfs_lock::fast_lock::types::ObjectKey::new(bucket, object);
if let Some(lock_info) = self.fast_lock_manager.get_lock_info(&key) {
warn!("Lock already exists for object {}: {:?}", object, lock_info);
} else {
info!("No existing lock found for object {}", object);
}
let start_time = std::time::Instant::now();
let lock_result = self
.fast_lock_manager
.acquire_write_lock(bucket, object, self.locker_owner.as_str())
.await
.map_err(|e| {
let elapsed = start_time.elapsed();
error!("Failed to acquire write lock for heal operation after {:?}: {:?}", elapsed, e);
DiskError::other(format!("Failed to acquire write lock for heal operation: {:?}", e))
})?;
let elapsed = start_time.elapsed();
info!("Successfully acquired write lock for object: {} in {:?}", object, elapsed);
Some(lock_result)
} else {
info!("Skipping lock acquisition (no_lock=true)");
None
};
let version_id_op = {
if version_id.is_empty() {
None
} else {
Some(version_id.to_string())
}
};
let disks = { self.disks.read().await.clone() };
let (mut parts_metadata, errs) = Self::read_all_fileinfo(&disks, "", bucket, object, version_id, true, true).await?;
info!("Read file info: parts_metadata.len()={}, errs={:?}", parts_metadata.len(), errs);
if DiskError::is_all_not_found(&errs) {
warn!(
"heal_object failed, all object parts missing, bucket: {}, obj: {}, version_id: {}",
bucket, object, version_id
);
let err = if !version_id.is_empty() {
DiskError::FileVersionNotFound
} else {
DiskError::FileNotFound
};
// Nothing to do, file is already gone.
return Ok((
self.default_heal_result(FileInfo::default(), &errs, bucket, object, version_id)
.await,
Some(err),
));
}
info!("About to call object_quorum_from_meta with parts_metadata.len()={}", parts_metadata.len());
match Self::object_quorum_from_meta(&parts_metadata, &errs, self.default_parity_count) {
Ok((read_quorum, _)) => {
result.parity_blocks = result.disk_count - read_quorum as usize;
result.data_blocks = read_quorum as usize;
let ((online_disks, mod_time, etag), disk_len) = {
let disks = self.disks.read().await;
let disk_len = disks.len();
(Self::list_online_disks(&disks, &parts_metadata, &errs, read_quorum as usize), disk_len)
};
match Self::pick_valid_fileinfo(&parts_metadata, mod_time, etag, read_quorum as usize) {
Ok(latest_meta) => {
let (available_disks, data_errs_by_disk, data_errs_by_part) = disks_with_all_parts(
&online_disks,
&mut parts_metadata,
&errs,
&latest_meta,
bucket,
object,
opts.scan_mode,
)
.await?;
// info!(
// "disks_with_all_parts: got available_disks: {:?}, data_errs_by_disk: {:?}, data_errs_by_part: {:?}, latest_meta: {:?}",
// available_disks, data_errs_by_disk, data_errs_by_part, latest_meta
// );
let erasure = if !latest_meta.deleted && !latest_meta.is_remote() {
// Initialize erasure coding
erasure_coding::Erasure::new(
latest_meta.erasure.data_blocks,
latest_meta.erasure.parity_blocks,
latest_meta.erasure.block_size,
)
} else {
erasure_coding::Erasure::default()
};
result.object_size =
ObjectInfo::from_file_info(&latest_meta, bucket, object, true).get_actual_size()? as usize;
// Loop to find number of disks with valid data, per-drive
// data state and a list of outdated disks on which data needs
// to be healed.
let mut outdate_disks = vec![None; disk_len];
let mut disks_to_heal_count = 0;
// info!(
// "errs: {:?}, data_errs_by_disk: {:?}, latest_meta: {:?}",
// errs, data_errs_by_disk, latest_meta
// );
for index in 0..available_disks.len() {
let (yes, reason) = should_heal_object_on_disk(
&errs[index],
&data_errs_by_disk[&index],
&parts_metadata[index],
&latest_meta,
);
if yes {
outdate_disks[index] = disks[index].clone();
disks_to_heal_count += 1;
}
let drive_state = match reason {
Some(err) => match err {
DiskError::DiskNotFound => DriveState::Offline.to_string(),
DiskError::FileNotFound
| DiskError::FileVersionNotFound
| DiskError::VolumeNotFound
| DiskError::PartMissingOrCorrupt
| DiskError::OutdatedXLMeta => DriveState::Missing.to_string(),
_ => DriveState::Corrupt.to_string(),
},
None => DriveState::Ok.to_string(),
};
result.before.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: drive_state.to_string(),
});
result.after.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: drive_state.to_string(),
});
}
if DiskError::is_all_not_found(&errs) {
warn!(
"heal_object failed, all object parts missing, bucket: {}, obj: {}, version_id: {}",
bucket, object, version_id
);
let err = if !version_id.is_empty() {
DiskError::FileVersionNotFound
} else {
DiskError::FileNotFound
};
return Ok((
self.default_heal_result(FileInfo::default(), &errs, bucket, object, version_id)
.await,
Some(err),
));
}
if disks_to_heal_count == 0 {
info!("No disks to heal, returning early");
return Ok((result, None));
}
if opts.dry_run {
info!("Dry run mode, returning early");
return Ok((result, None));
}
info!(
"Proceeding with heal: disks_to_heal_count={}, dry_run={}",
disks_to_heal_count, opts.dry_run
);
if !latest_meta.deleted && disks_to_heal_count > latest_meta.erasure.parity_blocks {
error!(
"file({} : {}) has too many corrupt parts to heal, disks_to_heal_count: {}, parity_blocks: {}",
bucket, object, disks_to_heal_count, latest_meta.erasure.parity_blocks
);
// Allow for dangling deletes on versions that have DataDir missing, etc.;
// this ends up restoring the correct readable versions.
return match self
.delete_if_dang_ling(
bucket,
object,
&parts_metadata,
&errs,
&data_errs_by_part,
ObjectOptions {
version_id: version_id_op.clone(),
..Default::default()
},
)
.await
{
Ok(m) => {
let derr = if !version_id.is_empty() {
DiskError::FileVersionNotFound
} else {
DiskError::FileNotFound
};
let mut t_errs = Vec::with_capacity(errs.len());
for _ in 0..errs.len() {
t_errs.push(None);
}
Ok((self.default_heal_result(m, &t_errs, bucket, object, version_id).await, Some(derr)))
}
Err(err) => {
// t_errs = vec![Some(err.clone()); errs.len()];
let mut t_errs = Vec::with_capacity(errs.len());
for _ in 0..errs.len() {
t_errs.push(Some(err.clone()));
}
Ok((
self.default_heal_result(FileInfo::default(), &t_errs, bucket, object, version_id)
.await,
Some(err),
))
}
};
}
if !latest_meta.deleted && latest_meta.erasure.distribution.len() != available_disks.len() {
let err_str = format!(
"unexpected file distribution ({:?}) from available disks ({:?}), looks like backend disks have been manually modified, refusing to heal {}/{}({})",
latest_meta.erasure.distribution, available_disks, bucket, object, version_id
);
warn!(err_str);
let err = DiskError::other(err_str);
return Ok((
self.default_heal_result(latest_meta, &errs, bucket, object, version_id).await,
Some(err),
));
}
let latest_disks = Self::shuffle_disks(&available_disks, &latest_meta.erasure.distribution);
if !latest_meta.deleted && latest_meta.erasure.distribution.len() != outdate_disks.len() {
let err_str = format!(
"unexpected file distribution ({:?}) from outdated disks ({:?}), looks like backend disks have been manually modified, refusing to heal {}/{}({})",
latest_meta.erasure.distribution, outdate_disks, bucket, object, version_id
);
warn!(err_str);
let err = DiskError::other(err_str);
return Ok((
self.default_heal_result(latest_meta, &errs, bucket, object, version_id).await,
Some(err),
));
}
if !latest_meta.deleted && latest_meta.erasure.distribution.len() != parts_metadata.len() {
let err_str = format!(
"unexpected file distribution ({:?}) from metadata entries ({:?}), looks like backend disks have been manually modified, refusing to heal {}/{}({})",
latest_meta.erasure.distribution,
parts_metadata.len(),
bucket,
object,
version_id
);
warn!(err_str);
let err = DiskError::other(err_str);
return Ok((
self.default_heal_result(latest_meta, &errs, bucket, object, version_id).await,
Some(err),
));
}
let out_dated_disks = Self::shuffle_disks(&outdate_disks, &latest_meta.erasure.distribution);
let mut parts_metadata = Self::shuffle_parts_metadata(&parts_metadata, &latest_meta.erasure.distribution);
let mut copy_parts_metadata = vec![None; parts_metadata.len()];
for (index, disk) in latest_disks.iter().enumerate() {
if disk.is_some() {
copy_parts_metadata[index] = Some(parts_metadata[index].clone());
}
}
let clean_file_info = |fi: &FileInfo| -> FileInfo {
let mut nfi = fi.clone();
if !nfi.is_remote() {
nfi.data = None;
nfi.erasure.index = 0;
nfi.erasure.checksums = Vec::new();
}
nfi
};
for (index, disk) in out_dated_disks.iter().enumerate() {
if disk.is_some() {
// Make sure to write the FileInfo information
// that is expected to be in quorum.
parts_metadata[index] = clean_file_info(&latest_meta);
}
}
// We write at a temporary location and then rename to the final location.
let tmp_id = Uuid::new_v4().to_string();
let src_data_dir = latest_meta.data_dir.unwrap().to_string();
let dst_data_dir = latest_meta.data_dir.unwrap();
info!(
"Checking heal conditions: deleted={}, is_remote={}",
latest_meta.deleted,
latest_meta.is_remote()
);
if !latest_meta.deleted && !latest_meta.is_remote() {
let erasure_info = latest_meta.erasure;
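// For each part: open bitrot readers on the disks that hold a valid copy,
// create bitrot writers only at the positions of outdated disks, and let
// erasure.heal() regenerate the missing shards into the tmp location.
// `prefer` marks local disks (empty host name) as preferred read sources.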
for part in latest_meta.parts.iter() {
let till_offset = erasure.shard_file_offset(0, part.size, part.size);
let checksum_algo = erasure_info.get_checksum_info(part.number).algorithm;
let mut readers = Vec::with_capacity(latest_disks.len());
let mut writers = Vec::with_capacity(out_dated_disks.len());
// let mut errors = Vec::with_capacity(out_dated_disks.len());
let mut prefer = vec![false; latest_disks.len()];
for (index, disk) in latest_disks.iter().enumerate() {
if let (Some(disk), Some(metadata)) = (disk, &copy_parts_metadata[index]) {
match create_bitrot_reader(
metadata.data.as_deref(),
Some(disk),
bucket,
&format!("{}/{}/part.{}", object, src_data_dir, part.number),
0,
till_offset,
erasure.shard_size(),
checksum_algo.clone(),
)
.await
{
Ok(Some(reader)) => {
readers.push(Some(reader));
}
Ok(None) => {
error!("heal_object disk not available");
readers.push(None);
continue;
}
Err(e) => {
error!("heal_object read_file err: {:?}", e);
readers.push(None);
continue;
}
}
prefer[index] = disk.host_name().is_empty();
} else {
readers.push(None);
// errors.push(Some(DiskError::DiskNotFound));
}
}
let is_inline_buffer = {
if let Some(sc) = GLOBAL_STORAGE_CLASS.get() {
sc.should_inline(erasure.shard_file_size(latest_meta.size), false)
} else {
false
}
};
// create writers for all disk positions, but only for outdated disks
info!(
"Creating writers: latest_disks len={}, out_dated_disks len={}",
latest_disks.len(),
out_dated_disks.len()
);
for (index, disk) in latest_disks.iter().enumerate() {
if let Some(outdated_disk) = &out_dated_disks[index] {
info!("Creating writer for index {} (outdated disk)", index);
let writer = create_bitrot_writer(
is_inline_buffer,
Some(outdated_disk),
RUSTFS_META_TMP_BUCKET,
&format!("{}/{}/part.{}", tmp_id, dst_data_dir, part.number),
erasure.shard_file_size(part.size as i64),
erasure.shard_size(),
HashAlgorithm::HighwayHash256,
)
.await?;
writers.push(Some(writer));
} else {
info!("Skipping writer for index {} (not outdated)", index);
writers.push(None);
}
// if let Some(disk) = disk {
// // let filewriter = {
// // if is_inline_buffer {
// // Box::new(Cursor::new(Vec::new()))
// // } else {
// // let disk = disk.clone();
// // let part_path = format!("{}/{}/part.{}", tmp_id, dst_data_dir, part.number);
// // disk.create_file("", RUSTFS_META_TMP_BUCKET, &part_path, 0).await?
// // }
// // };
// if is_inline_buffer {
// let writer = BitrotWriter::new(
// Writer::from_cursor(Cursor::new(Vec::new())),
// erasure.shard_size(),
// HashAlgorithm::HighwayHash256,
// );
// writers.push(Some(writer));
// } else {
// let f = disk
// .create_file(
// "",
// RUSTFS_META_TMP_BUCKET,
// &format!("{}/{}/part.{}", tmp_id, dst_data_dir, part.number),
// 0,
// )
// .await?;
// let writer = BitrotWriter::new(
// Writer::from_tokio_writer(f),
// erasure.shard_size(),
// HashAlgorithm::HighwayHash256,
// );
// writers.push(Some(writer));
// }
// // let writer = new_bitrot_filewriter(
// // disk.clone(),
// // RUSTFS_META_TMP_BUCKET,
// // format!("{}/{}/part.{}", tmp_id, dst_data_dir, part.number).as_str(),
// // is_inline_buffer,
// // DEFAULT_BITROT_ALGO,
// // erasure.shard_size(erasure.block_size),
// // )
// // .await?;
// // writers.push(Some(writer));
// } else {
// writers.push(None);
// }
}
// Heal each part. erasure.heal() writes the healed
// part to .rustfs/tmp/uuid/, which is renamed
// later to the final location.
erasure.heal(&mut writers, readers, part.size, &prefer).await?;
// close_bitrot_writers(&mut writers).await?;
for (index, disk) in out_dated_disks.iter().enumerate() {
if disk.is_none() {
continue;
}
if writers[index].is_none() {
outdate_disks[index] = None;
disks_to_heal_count -= 1;
continue;
}
parts_metadata[index].data_dir = Some(dst_data_dir);
parts_metadata[index].add_object_part(
part.number,
part.etag.clone(),
part.size,
part.mod_time,
part.actual_size,
part.index.clone(),
);
if is_inline_buffer {
if let Some(writer) = writers[index].take() {
// if let Some(w) = writer.as_any().downcast_ref::<BitrotFileWriter>() {
// parts_metadata[index].data = Some(w.inline_data().to_vec());
// }
parts_metadata[index].data =
Some(writer.into_inline_data().map(bytes::Bytes::from).unwrap_or_default());
}
parts_metadata[index].set_inline_data();
} else {
parts_metadata[index].data = None;
}
}
if disks_to_heal_count == 0 {
return Ok((
result,
Some(DiskError::other(format!(
"all drives had write errors, unable to heal {bucket}/{object}"
))),
));
}
}
}
// Rename from tmp location to the actual location.
for (index, outdated_disk) in out_dated_disks.iter().enumerate() {
if let Some(disk) = outdated_disk {
// record the index of the updated disks
parts_metadata[index].erasure.index = index + 1;
// Attempt a rename now from healed data to final location.
parts_metadata[index].set_healing();
info!(
"rename temp data, src_volume: {}, src_path: {}, dst_volume: {}, dst_path: {}",
RUSTFS_META_TMP_BUCKET, tmp_id, bucket, object
);
let rename_result = disk
.rename_data(RUSTFS_META_TMP_BUCKET, &tmp_id, parts_metadata[index].clone(), bucket, object)
.await;
if let Err(err) = &rename_result {
info!(
"rename temp data err: {}. Try fallback to direct xl.meta overwrite...",
err.to_string()
);
let healthy_index = latest_disks.iter().position(|d| d.is_some()).unwrap_or(0);
if let Some(healthy_disk) = &latest_disks[healthy_index] {
let xlmeta_path = format!("{object}/xl.meta");
match healthy_disk.read_all(bucket, &xlmeta_path).await {
Ok(xlmeta_bytes) => {
if let Err(e) = disk.write_all(bucket, &xlmeta_path, xlmeta_bytes).await {
info!("fallback xl.meta overwrite failed: {}", e.to_string());
return Ok((
result,
Some(DiskError::other(format!("fallback xl.meta overwrite failed: {e}"))),
));
} else {
info!("fallback xl.meta overwrite succeeded for disk {}", disk.to_string());
}
}
Err(e) => {
info!("read healthy xl.meta failed: {}", e.to_string());
return Ok((
result,
Some(DiskError::other(format!("read healthy xl.meta failed: {e}"))),
));
}
}
} else {
info!("no healthy disk found for xl.meta fallback overwrite");
return Ok((
result,
Some(DiskError::other("no healthy disk found for xl.meta fallback overwrite")),
));
}
} else {
info!("remove temp object, volume: {}, path: {}", RUSTFS_META_TMP_BUCKET, tmp_id);
self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_id)
.await
.map_err(DiskError::other)?;
if parts_metadata[index].is_remote() {
let rm_data_dir = parts_metadata[index].data_dir.unwrap().to_string();
let d_path = Path::new(&encode_dir_object(object)).join(rm_data_dir);
disk.delete(
bucket,
d_path.to_str().unwrap(),
DeleteOptions {
immediate: true,
recursive: true,
..Default::default()
},
)
.await?;
}
for (i, v) in result.before.drives.iter().enumerate() {
if v.endpoint == disk.endpoint().to_string() {
result.after.drives[i].state = DriveState::Ok.to_string();
}
}
}
}
}
Ok((result, None))
}
Err(err) => {
warn!("heal_object can not pick valid file info");
Ok((result, Some(err)))
}
}
}
Err(err) => {
let data_errs_by_part = HashMap::new();
match self
.delete_if_dang_ling(
bucket,
object,
&parts_metadata,
&errs,
&data_errs_by_part,
ObjectOptions {
version_id: version_id_op.clone(),
..Default::default()
},
)
.await
{
Ok(m) => {
let err = if !version_id.is_empty() {
DiskError::FileVersionNotFound
} else {
DiskError::FileNotFound
};
Ok((self.default_heal_result(m, &errs, bucket, object, version_id).await, Some(err)))
}
Err(_) => Ok((
self.default_heal_result(FileInfo::default(), &errs, bucket, object, version_id)
.await,
Some(err),
)),
}
}
}
}
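// Heals an object directory (a prefix backed by a volume rather than parts):
// stats the directory on every disk, removes it when it is dangling and
// `remove` is set (unless dry_run), and otherwise re-creates the volume on
// disks where it is missing.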
async fn heal_object_dir(
&self,
bucket: &str,
object: &str,
dry_run: bool,
remove: bool,
) -> Result<(HealResultItem, Option<DiskError>)> {
let _write_lock_guard = self
.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|e| DiskError::other(format!("Failed to acquire write lock for heal directory operation: {:?}", e)))?;
let disks = {
let disks = self.disks.read().await;
disks.clone()
};
let mut result = HealResultItem {
heal_item_type: HealItemType::Object.to_string(),
bucket: bucket.to_string(),
object: object.to_string(),
disk_count: self.disks.read().await.len(),
parity_blocks: self.default_parity_count,
data_blocks: disks.len() - self.default_parity_count,
object_size: 0,
..Default::default()
};
// before/after drive entries are appended once per endpoint in the loop
// below; they must not be pre-filled, otherwise the index-based
// `after.drives[index]` updates would target placeholder entries.
let errs = stat_all_dirs(&disks, bucket, object).await;
let dang_ling_object = is_object_dir_dang_ling(&errs);
if dang_ling_object && !dry_run && remove {
let mut futures = Vec::with_capacity(disks.len());
for disk in disks.iter().flatten() {
let disk = disk.clone();
let bucket = bucket.to_string();
let object = object.to_string();
futures.push(tokio::spawn(async move {
let _ = disk
.delete(
&bucket,
&object,
DeleteOptions {
recursive: false,
immediate: false,
..Default::default()
},
)
.await;
}));
}
// ignore errors
let _ = join_all(futures).await;
}
for (err, drive) in errs.iter().zip(self.set_endpoints.iter()) {
let endpoint = drive.to_string();
let drive_state = match err {
Some(err) => match err {
DiskError::DiskNotFound => DriveState::Offline.to_string(),
DiskError::FileNotFound | DiskError::VolumeNotFound => DriveState::Missing.to_string(),
_ => DriveState::Corrupt.to_string(),
},
None => DriveState::Ok.to_string(),
};
result.before.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: endpoint.clone(),
state: drive_state.to_string(),
});
result.after.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint,
state: drive_state.to_string(),
});
}
if dang_ling_object || DiskError::is_all_not_found(&errs) {
return Ok((result, Some(DiskError::FileNotFound)));
}
if dry_run {
// Quit without trying to heal the object dir
return Ok((result, None));
}
for (index, (err, disk)) in errs.iter().zip(disks.iter()).enumerate() {
if let (Some(DiskError::VolumeNotFound | DiskError::FileNotFound), Some(disk)) = (err, disk) {
let vol_path = Path::new(bucket).join(object);
let drive_state = match disk.make_volume(vol_path.to_str().unwrap()).await {
Ok(_) => DriveState::Ok.to_string(),
Err(merr) => match merr {
DiskError::VolumeExists => DriveState::Ok.to_string(),
DiskError::DiskNotFound => DriveState::Offline.to_string(),
_ => DriveState::Corrupt.to_string(),
},
};
result.after.drives[index].state = drive_state.to_string();
}
}
Ok((result, None))
}
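// Builds a HealResultItem skeleton from the given FileInfo and per-disk
// errors, marking each drive Ok, Missing, Offline, or Corrupt without
// performing any healing.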
async fn default_heal_result(
&self,
lfi: FileInfo,
errs: &[Option<DiskError>],
bucket: &str,
object: &str,
version_id: &str,
) -> HealResultItem {
let disk_len = { self.disks.read().await.len() };
let mut result = HealResultItem {
heal_item_type: HealItemType::Object.to_string(),
bucket: bucket.to_string(),
object: object.to_string(),
object_size: lfi.size as usize,
version_id: version_id.to_string(),
disk_count: disk_len,
..Default::default()
};
if lfi.is_valid() {
result.parity_blocks = lfi.erasure.parity_blocks;
} else {
result.parity_blocks = self.default_parity_count;
}
result.data_blocks = disk_len - result.parity_blocks;
for (index, disk) in self.disks.read().await.iter().enumerate() {
if disk.is_none() {
result.before.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: DriveState::Offline.to_string(),
});
result.after.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: DriveState::Offline.to_string(),
});
// The offline state is already recorded; skip the error-based state below.
continue;
}
let mut drive_state = DriveState::Corrupt;
if let Some(err) = &errs[index] {
if err == &DiskError::FileNotFound || err == &DiskError::VolumeNotFound {
drive_state = DriveState::Missing;
}
} else {
drive_state = DriveState::Ok;
}
result.before.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: drive_state.to_string(),
});
result.after.drives.push(HealDriveInfo {
uuid: "".to_string(),
endpoint: self.set_endpoints[index].to_string(),
state: drive_state.to_string(),
});
}
result
}
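// If the object is dangling (per `is_object_dang_ling`), deletes the version
// from all disks and returns the reconstructed FileInfo; otherwise returns
// DiskError::ErasureReadQuorum.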
async fn delete_if_dang_ling(
&self,
bucket: &str,
object: &str,
meta_arr: &[FileInfo],
errs: &[Option<DiskError>],
data_errs_by_part: &HashMap<usize, Vec<usize>>,
opts: ObjectOptions,
) -> disk::error::Result<FileInfo> {
if let Ok(m) = is_object_dang_ling(meta_arr, errs, data_errs_by_part) {
let mut tags = HashMap::new();
tags.insert("set", self.set_index.to_string());
tags.insert("pool", self.pool_index.to_string());
tags.insert("merrs", join_errs(errs));
tags.insert("derrs", format!("{data_errs_by_part:?}"));
if m.is_valid() {
tags.insert("sz", m.size.to_string());
tags.insert(
"mt",
m.mod_time
.as_ref()
.map_or(String::new(), |mod_time| mod_time.unix_timestamp().to_string()),
);
tags.insert("d:p", format!("{}:{}", m.erasure.data_blocks, m.erasure.parity_blocks));
} else {
tags.insert("invalid", "1".to_string());
tags.insert(
"d:p",
format!("{}:{}", self.set_drive_count - self.default_parity_count, self.default_parity_count),
);
}
let mut offline = 0;
for (i, err) in errs.iter().enumerate() {
let mut found = false;
if let Some(err) = err {
if err == &DiskError::DiskNotFound {
found = true;
}
}
for p in data_errs_by_part {
if let Some(v) = p.1.get(i) {
if *v == CHECK_PART_DISK_NOT_FOUND {
found = true;
break;
}
}
}
if found {
offline += 1;
}
}
if offline > 0 {
tags.insert("offline", offline.to_string());
}
// TODO: audit
let mut fi = FileInfo::default();
if let Some(ref version_id) = opts.version_id {
fi.version_id = Uuid::parse_str(version_id).ok();
}
// TODO: tier
for disk in self.disks.read().await.iter().flatten() {
let _ = disk
.delete_version(bucket, object, fi.clone(), false, DeleteOptions::default())
.await;
}
Ok(m)
} else {
error!("delete_if_dang_ling: is_object_dang_ling errs={:?}", errs);
Err(DiskError::ErasureReadQuorum)
}
}
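// Recursively and immediately deletes `prefix` on all disks, requiring a
// simple majority (disks.len() / 2 + 1) of the deletes to succeed.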
async fn delete_prefix(&self, bucket: &str, prefix: &str) -> disk::error::Result<()> {
let disks = self.get_disks_internal().await;
let write_quorum = disks.len() / 2 + 1;
let mut futures = Vec::with_capacity(disks.len());
for disk_op in disks.iter() {
let bucket = bucket.to_string();
let prefix = prefix.to_string();
futures.push(async move {
if let Some(disk) = disk_op {
disk.delete(
&bucket,
&prefix,
DeleteOptions {
recursive: true,
immediate: true,
..Default::default()
},
)
.await
} else {
Ok(())
}
});
}
let errs = join_all(futures).await.into_iter().map(|v| v.err()).collect::<Vec<_>>();
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(err);
}
Ok(())
}
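// Clears the x-amz-restore entry from the object's user metadata via a
// metadata-only self-copy.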
pub async fn update_restore_metadata(
&self,
bucket: &str,
object: &str,
obj_info: &ObjectInfo,
opts: &ObjectOptions,
) -> Result<()> {
let mut oi = obj_info.clone();
oi.metadata_only = true;
oi.user_defined.remove(X_AMZ_RESTORE.as_str());
let version_id = oi.version_id.map(|v| v.to_string());
let obj = self
.copy_object(
bucket,
object,
bucket,
object,
&mut oi,
&ObjectOptions {
version_id: version_id.clone(),
..Default::default()
},
&ObjectOptions {
version_id,
..Default::default()
},
)
.await;
obj?;
Ok(())
}
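// Evaluates HTTP write preconditions (If-Match / If-None-Match) against the
// current object state. Returns Some(PreconditionFailed) when the write must
// be rejected, Some(ObjectNotFound) when If-Match is set but the object is
// missing, and None when the write may proceed. Never takes a lock itself;
// see the deadlock note below.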
async fn check_write_precondition(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Option<StorageError> {
let mut opts = opts.clone();
let http_preconditions = opts.http_preconditions?;
opts.http_preconditions = None;
// Never claim a lock here, to avoid deadlock:
// - If no_lock is false, the caller must have obtained the lock outside of this function
// - If no_lock is true, we should not obtain locks
opts.no_lock = true;
let oi = self.get_object_info(bucket, object, &opts).await;
match oi {
Ok(oi) => {
if should_prevent_write(&oi, http_preconditions.if_none_match, http_preconditions.if_match) {
return Some(StorageError::PreconditionFailed);
}
}
Err(StorageError::VersionNotFound(_, _, _))
| Err(StorageError::ObjectNotFound(_, _))
| Err(StorageError::ErasureReadQuorum) => {
// When the object is not found,
// - if If-Match is set, we should return 404 NotFound
// - if If-None-Match is set, we should be able to proceed with the request
if http_preconditions.if_match.is_some() {
return Some(StorageError::ObjectNotFound(bucket.to_string(), object.to_string()));
}
}
Err(e) => {
return Some(e);
}
}
None
}
}
#[async_trait::async_trait]
impl ObjectIO for SetDisks {
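// Streams an object back to the caller: resolves the FileInfo under a shared
// read lock, then erasure-decodes the parts into one half of a duplex pipe
// from a background task while returning the other half as the reader.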
#[tracing::instrument(level = "debug", skip(self))]
async fn get_object_reader(
&self,
bucket: &str,
object: &str,
range: Option<HTTPRangeSpec>,
h: HeaderMap,
opts: &ObjectOptions,
) -> Result<GetObjectReader> {
// Acquire a shared read-lock early to protect read consistency
let _read_lock_guard = if !opts.no_lock {
Some(
self.fast_lock_manager
.acquire_read_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?,
)
} else {
None
};
let (fi, files, disks) = self
.get_object_fileinfo(bucket, object, opts, true)
.await
.map_err(|err| to_object_err(err, vec![bucket, object]))?;
let object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
if object_info.delete_marker {
if opts.version_id.is_none() {
return Err(to_object_err(Error::FileNotFound, vec![bucket, object]));
}
return Err(to_object_err(Error::MethodNotAllowed, vec![bucket, object]));
}
// if object_info.size == 0 {
// let empty_rd: Box<dyn AsyncRead> = Box::new(Bytes::new());
// return Ok(GetObjectReader {
// stream: empty_rd,
// object_info,
// });
// }
if object_info.size == 0 {
if let Some(rs) = range {
let _ = rs.get_offset_length(object_info.size)?;
}
let reader = GetObjectReader {
stream: Box::new(Cursor::new(Vec::new())),
object_info,
};
return Ok(reader);
}
// TODO: remote
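// Producer/consumer split: the spawned task erasure-decodes into `wd` while
// the returned GetObjectReader consumes from `rd`, so decoding overlaps with
// the caller draining the stream.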
let (rd, wd) = tokio::io::duplex(DEFAULT_READ_BUFFER_SIZE);
let (reader, offset, length) = GetObjectReader::new(Box::new(rd), range, &object_info, opts, &h)?;
// let disks = disks.clone();
let bucket = bucket.to_owned();
let object = object.to_owned();
let set_index = self.set_index;
let pool_index = self.pool_index;
// NOTE: to hold the read-lock for the whole stream, the guard would need to be
// moved into the task below; it currently drops when this function returns.
// let _guard_to_hold = _read_lock_guard; // moved into closure below
tokio::spawn(async move {
// let _guard = _guard_to_hold; // keep guard alive until task ends
if let Err(e) = Self::get_object_with_fileinfo(
&bucket,
&object,
offset,
length,
&mut Box::new(wd),
fi,
files,
&disks,
set_index,
pool_index,
)
.await
{
error!("get_object_with_fileinfo err {:?}", e);
};
// error!("get_object_with_fileinfo end {}/{}", bucket, object);
});
Ok(reader)
}
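// Writes an object: shuffles disks by the erasure distribution, streams the
// payload through the erasure encoder into per-disk bitrot writers (inlining
// small objects into xl.meta), then renames the temporary data directory into
// its final location once the write quorum is satisfied.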
#[tracing::instrument(level = "debug", skip(self, data,))]
async fn put_object(&self, bucket: &str, object: &str, data: &mut PutObjReader, opts: &ObjectOptions) -> Result<ObjectInfo> {
let disks = self.disks.read().await;
// Acquire per-object exclusive lock via RAII guard. It auto-releases asynchronously on drop.
let _object_lock_guard = if !opts.no_lock {
Some(
self.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?,
)
} else {
None
};
if opts.http_preconditions.is_some() {
if let Some(err) = self.check_write_precondition(bucket, object, opts).await {
return Err(err);
}
}
let mut user_defined = opts.user_defined.clone();
let sc_parity_drives = {
if let Some(sc) = GLOBAL_STORAGE_CLASS.get() {
sc.get_parity_for_sc(user_defined.get(AMZ_STORAGE_CLASS).cloned().unwrap_or_default().as_str())
} else {
None
}
};
let mut parity_drives = sc_parity_drives.unwrap_or(self.default_parity_count);
if opts.max_parity {
parity_drives = disks.len() / 2;
}
let data_drives = disks.len() - parity_drives;
let mut write_quorum = data_drives;
if data_drives == parity_drives {
write_quorum += 1
}
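// Example: with 8 disks and parity 4, data_drives == parity_drives == 4, so
// write_quorum becomes 5; requiring more than half the disks prevents two
// disjoint halves from both committing a write.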
let mut fi = FileInfo::new([bucket, object].join("/").as_str(), data_drives, parity_drives);
fi.version_id = {
if let Some(ref vid) = opts.version_id {
Some(Uuid::parse_str(vid.as_str()).map_err(Error::other)?)
} else {
None
}
};
if opts.versioned && fi.version_id.is_none() {
fi.version_id = Some(Uuid::new_v4());
}
fi.data_dir = Some(Uuid::new_v4());
let parts_metadata = vec![fi.clone(); disks.len()];
let (shuffle_disks, mut parts_metadatas) = Self::shuffle_disks_and_parts_metadata(&disks, &parts_metadata, &fi);
let tmp_dir = Uuid::new_v4().to_string();
let tmp_object = format!("{}/{}/part.1", tmp_dir, fi.data_dir.unwrap());
let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size);
let is_inline_buffer = {
if let Some(sc) = GLOBAL_STORAGE_CLASS.get() {
sc.should_inline(erasure.shard_file_size(data.size()), opts.versioned)
} else {
false
}
};
let mut writers = Vec::with_capacity(shuffle_disks.len());
let mut errors = Vec::with_capacity(shuffle_disks.len());
for disk_op in shuffle_disks.iter() {
if let Some(disk) = disk_op {
let writer = create_bitrot_writer(
is_inline_buffer,
Some(disk),
RUSTFS_META_TMP_BUCKET,
&tmp_object,
erasure.shard_file_size(data.size()),
erasure.shard_size(),
HashAlgorithm::HighwayHash256,
)
.await?;
// let writer = if is_inline_buffer {
// BitrotWriter::new(
// Writer::from_cursor(Cursor::new(Vec::new())),
// erasure.shard_size(),
// HashAlgorithm::HighwayHash256,
// )
// } else {
// let f = match disk
// .create_file("", RUSTFS_META_TMP_BUCKET, &tmp_object, erasure.shard_file_size(data.content_length))
// .await
// {
// Ok(f) => f,
// Err(e) => {
// errors.push(Some(e));
// writers.push(None);
// continue;
// }
// };
// BitrotWriter::new(Writer::from_tokio_writer(f), erasure.shard_size(), HashAlgorithm::HighwayHash256)
// };
writers.push(Some(writer));
errors.push(None);
} else {
errors.push(Some(DiskError::DiskNotFound));
writers.push(None);
}
}
let nil_count = errors.iter().filter(|&e| e.is_none()).count();
if nil_count < write_quorum {
error!("not enough disks to write: {:?}", errors);
if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(to_object_err(write_err.into(), vec![bucket, object]));
}
return Err(Error::other(format!("not enough disks to write: {errors:?}")));
}
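// Temporarily take ownership of the request body stream (replacing it with an
// empty HashReader), erasure-encode it across the bitrot writers, then put
// the original reader back so etag/index state can be resolved from it below.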
let stream = mem::replace(
&mut data.stream,
HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false)?,
);
let (reader, w_size) = match Arc::new(erasure).encode(stream, &mut writers, write_quorum).await {
Ok((r, w)) => (r, w),
Err(e) => {
error!("encode err {:?}", e);
return Err(e.into());
}
}; // TODO: on error, delete the temporary directory
let _ = mem::replace(&mut data.stream, reader);
// if let Err(err) = close_bitrot_writers(&mut writers).await {
// error!("close_bitrot_writers err {:?}", err);
// }
if (w_size as i64) < data.size() {
return Err(Error::other("put_object write size < data.size()"));
}
if user_defined.contains_key(&format!("{RESERVED_METADATA_PREFIX_LOWER}compression")) {
user_defined.insert(format!("{RESERVED_METADATA_PREFIX_LOWER}compression-size"), w_size.to_string());
}
let index_op = data.stream.try_get_index().map(|v| v.clone().into_vec());
//TODO: userDefined
let etag = data.stream.try_resolve_etag().unwrap_or_default();
user_defined.insert("etag".to_owned(), etag.clone());
if !user_defined.contains_key("content-type") {
// get content-type
}
let mut actual_size = data.actual_size();
if actual_size < 0 {
let is_compressed = fi.is_compressed();
if !is_compressed {
actual_size = w_size as i64;
}
}
if let Some(sc) = user_defined.get(AMZ_STORAGE_CLASS) {
if sc == storageclass::STANDARD {
let _ = user_defined.remove(AMZ_STORAGE_CLASS);
}
}
let now = OffsetDateTime::now_utc();
for (i, fi) in parts_metadatas.iter_mut().enumerate() {
fi.metadata = user_defined.clone();
if is_inline_buffer {
if let Some(writer) = writers[i].take() {
fi.data = Some(writer.into_inline_data().map(bytes::Bytes::from).unwrap_or_default());
}
fi.set_inline_data();
}
fi.mod_time = Some(now);
fi.size = w_size as i64;
fi.versioned = opts.versioned || opts.version_suspended;
fi.add_object_part(1, etag.clone(), w_size, fi.mod_time, actual_size, index_op.clone());
if opts.data_movement {
fi.set_data_moved();
}
}
drop(writers); // drop writers to close all files, this is to prevent FileAccessDenied errors when renaming data
let (online_disks, _, op_old_dir) = Self::rename_data(
&shuffle_disks,
RUSTFS_META_TMP_BUCKET,
tmp_dir.as_str(),
&parts_metadatas,
bucket,
object,
write_quorum,
)
.await?;
if let Some(old_dir) = op_old_dir {
self.commit_rename_data_dir(&shuffle_disks, bucket, object, &old_dir.to_string(), write_quorum)
.await?;
}
self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_dir).await?;
for (i, op_disk) in online_disks.iter().enumerate() {
if let Some(disk) = op_disk {
if disk.is_online().await {
fi = parts_metadatas[i].clone();
break;
}
}
}
fi.is_latest = true;
// TODO: version support
Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended))
}
}
#[async_trait::async_trait]
impl StorageAPI for SetDisks {
#[tracing::instrument(skip(self))]
async fn backend_info(&self) -> rustfs_madmin::BackendInfo {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn storage_info(&self) -> rustfs_madmin::StorageInfo {
let disks = self.get_disks_internal().await;
get_storage_info(&disks, &self.set_endpoints).await
}
#[tracing::instrument(skip(self))]
async fn local_storage_info(&self) -> rustfs_madmin::StorageInfo {
let disks = self.get_disks_internal().await;
let mut local_disks: Vec<Option<Arc<disk::Disk>>> = Vec::new();
let mut local_endpoints = Vec::new();
for (i, ep) in self.set_endpoints.iter().enumerate() {
if ep.is_local {
local_disks.push(disks[i].clone());
local_endpoints.push(ep.clone());
}
}
get_storage_info(&local_disks, &local_endpoints).await
}
#[tracing::instrument(skip(self))]
async fn list_bucket(&self, _opts: &BucketOptions) -> Result<Vec<BucketInfo>> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn make_bucket(&self, _bucket: &str, _opts: &MakeBucketOptions) -> Result<()> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn get_bucket_info(&self, _bucket: &str, _opts: &BucketOptions) -> Result<BucketInfo> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn copy_object(
&self,
src_bucket: &str,
src_object: &str,
_dst_bucket: &str,
_dst_object: &str,
src_info: &mut ObjectInfo,
src_opts: &ObjectOptions,
dst_opts: &ObjectOptions,
) -> Result<ObjectInfo> {
// FIXME: TODO:
if !src_info.metadata_only {
return Err(StorageError::NotImplemented);
}
// Guard lock for source object metadata update
let _lock_guard = self
.fast_lock_manager
.acquire_write_lock("", src_object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?;
let disks = self.get_disks_internal().await;
let (mut metas, errs) = {
if let Some(vid) = &src_opts.version_id {
Self::read_all_fileinfo(&disks, "", src_bucket, src_object, vid, true, false).await?
} else {
Self::read_all_xl(&disks, src_bucket, src_object, true, false).await
}
};
let (read_quorum, write_quorum) = match Self::object_quorum_from_meta(&metas, &errs, self.default_parity_count) {
Ok((r, w)) => (r as usize, w as usize),
Err(mut err) => {
if err == DiskError::ErasureReadQuorum
&& !src_bucket.starts_with(RUSTFS_META_BUCKET)
&& self
.delete_if_dang_ling(src_bucket, src_object, &metas, &errs, &HashMap::new(), src_opts.clone())
.await
.is_ok()
{
if src_opts.version_id.is_some() {
err = DiskError::FileVersionNotFound
} else {
err = DiskError::FileNotFound
}
}
return Err(to_object_err(err.into(), vec![src_bucket, src_object]));
}
};
let (online_disks, mod_time, etag) = Self::list_online_disks(&disks, &metas, &errs, read_quorum);
let mut fi = Self::pick_valid_fileinfo(&metas, mod_time, etag, read_quorum)
.map_err(|e| to_object_err(e.into(), vec![src_bucket, src_object]))?;
if fi.deleted {
if src_opts.version_id.is_none() {
return Err(to_object_err(Error::FileNotFound, vec![src_bucket, src_object]));
}
return Err(to_object_err(Error::MethodNotAllowed, vec![src_bucket, src_object]));
}
let version_id = {
if src_info.version_only {
if let Some(vid) = &dst_opts.version_id {
Some(Uuid::parse_str(vid)?)
} else {
Some(Uuid::new_v4())
}
} else {
src_info.version_id
}
};
let inline_data = fi.inline_data();
fi.metadata = src_info.user_defined.clone();
if let Some(etag) = &src_info.etag {
fi.metadata.insert("etag".to_owned(), etag.clone());
}
let mod_time = OffsetDateTime::now_utc();
for fi in metas.iter_mut() {
if fi.is_valid() {
fi.metadata = src_info.user_defined.clone();
fi.mod_time = Some(mod_time);
fi.version_id = version_id;
fi.versioned = src_opts.versioned || src_opts.version_suspended;
if !fi.inline_data() {
fi.data = None;
}
if inline_data {
fi.set_inline_data();
}
}
}
Self::write_unique_file_info(&online_disks, "", src_bucket, src_object, &metas, write_quorum)
.await
.map_err(|e| to_object_err(e.into(), vec![src_bucket, src_object]))?;
Ok(ObjectInfo::from_file_info(
&fi,
src_bucket,
src_object,
src_opts.versioned || src_opts.version_suspended,
))
}
#[tracing::instrument(skip(self))]
async fn delete_object_version(&self, bucket: &str, object: &str, fi: &FileInfo, force_del_marker: bool) -> Result<()> {
// Guard lock for single object delete-version
let _lock_guard = self
.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?;
let disks = self.get_disks(0, 0).await?;
let write_quorum = disks.len() / 2 + 1;
let mut futures = Vec::with_capacity(disks.len());
let mut errs = Vec::with_capacity(disks.len());
for disk in disks.iter() {
futures.push(async move {
if let Some(disk) = disk {
match disk
.delete_version(bucket, object, fi.clone(), force_del_marker, DeleteOptions::default())
.await
{
Ok(r) => Ok(r),
Err(e) => Err(e),
}
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(_) => {
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(err.into());
}
Ok(())
}
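// Batch-deletes multiple objects: acquires per-object write locks up front,
// synthesizes delete markers where versioning requires them, groups the
// requested versions per object, and fans a single delete_versions call out
// to every disk.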
#[tracing::instrument(skip(self))]
async fn delete_objects(
&self,
bucket: &str,
objects: Vec<ObjectToDelete>,
opts: ObjectOptions,
) -> Result<(Vec<DeletedObject>, Vec<Option<Error>>)> {
// Default return values
let mut del_objects = vec![DeletedObject::default(); objects.len()];
let mut del_errs = Vec::with_capacity(objects.len());
for _ in 0..objects.len() {
del_errs.push(None)
}
// Use fast batch locking to acquire all locks atomically
let mut _guards: HashMap<String, rustfs_lock::FastLockGuard> = HashMap::new();
let mut unique_objects: std::collections::HashSet<String> = std::collections::HashSet::new();
// Collect unique object names
for dobj in &objects {
unique_objects.insert(dobj.object_name.clone());
}
// Acquire all locks in batch to prevent deadlocks
for object_name in unique_objects {
match self
.fast_lock_manager
.acquire_write_lock("", object_name.as_str(), self.locker_owner.as_str())
.await
{
Ok(guard) => {
_guards.insert(object_name, guard);
}
Err(_) => {
// Mark all operations on this object as failed
for (i, dobj) in objects.iter().enumerate() {
if dobj.object_name == object_name {
del_errs[i] = Some(Error::other("can not get lock. please retry"));
}
}
}
}
}
// let mut del_fvers = Vec::with_capacity(objects.len());
let ver_cfg = BucketVersioningSys::get(bucket).await.unwrap_or_default();
let mut vers_map: HashMap<&String, FileInfoVersions> = HashMap::new();
for (i, dobj) in objects.iter().enumerate() {
let mut vr = FileInfo {
name: dobj.object_name.clone(),
version_id: dobj.version_id,
idx: i,
..Default::default()
};
vr.set_tier_free_version_id(&Uuid::new_v4().to_string());
// Deletion bookkeeping; results are populated below.
// del_objects[i].object_name.clone_from(&vr.name);
// del_objects[i].version_id = vr.version_id.map(|v| v.to_string());
if dobj.version_id.is_none() {
let (suspended, versioned) = (ver_cfg.suspended(), ver_cfg.prefix_enabled(dobj.object_name.as_str()));
if suspended || versioned {
vr.mod_time = Some(OffsetDateTime::now_utc());
vr.deleted = true;
if versioned {
vr.version_id = Some(Uuid::new_v4());
}
}
}
let v = {
if vers_map.contains_key(&dobj.object_name) {
let val = vers_map.get_mut(&dobj.object_name).unwrap();
val.versions.push(vr.clone());
val.clone()
} else {
FileInfoVersions {
name: vr.name.clone(),
versions: vec![vr.clone()],
..Default::default()
}
}
};
if vr.deleted {
del_objects[i] = DeletedObject {
delete_marker: vr.deleted,
delete_marker_version_id: vr.version_id.map(|v| v.to_string()),
delete_marker_mtime: vr.mod_time,
object_name: vr.name.clone(),
..Default::default()
}
} else {
del_objects[i] = DeletedObject {
object_name: vr.name.clone(),
version_id: vr.version_id.map(|v| v.to_string()),
..Default::default()
}
}
// Only add to vers_map if we hold the lock
if _guards.contains_key(&dobj.object_name) {
vers_map.insert(&dobj.object_name, v);
}
}
let mut vers = Vec::with_capacity(vers_map.len());
for (_, mut fi_vers) in vers_map {
fi_vers.versions.sort_by(|a, b| a.deleted.cmp(&b.deleted));
fi_vers.versions.reverse();
if let Some(index) = fi_vers.versions.iter().position(|fi| fi.deleted) {
fi_vers.versions.truncate(index + 1);
}
vers.push(fi_vers);
}
let disks = self.disks.read().await;
let disks = disks.clone();
let mut futures = Vec::with_capacity(disks.len());
// let mut errors = Vec::with_capacity(disks.len());
for disk in disks.iter() {
let vers = vers.clone();
futures.push(async move {
if let Some(disk) = disk {
disk.delete_versions(bucket, vers, DeleteOptions::default()).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for errs in results.into_iter().flatten() {
// TODO: handle err reduceWriteQuorumErrs
for err in errs.iter().flatten() {
warn!("result err {:?}", err);
}
}
Ok((del_objects, del_errs))
}
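// Deletes a single object or version. Prefix deletes bypass locking and
// remove the whole prefix; versioned deletes may write a delete marker
// instead of removing data, mirroring S3 semantics.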
#[tracing::instrument(skip(self))]
async fn delete_object(&self, bucket: &str, object: &str, opts: ObjectOptions) -> Result<ObjectInfo> {
// Guard lock for single object delete
let _lock_guard = if !opts.delete_prefix {
Some(
self.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?,
)
} else {
None
};
if opts.delete_prefix {
self.delete_prefix(bucket, object)
.await
.map_err(|e| to_object_err(e.into(), vec![bucket, object]))?;
return Ok(ObjectInfo::default());
}
let (oi, write_quorum) = match self.get_object_info_and_quorum(bucket, object, &opts).await {
Ok((oi, wq)) => (oi, wq),
Err(e) => {
return Err(to_object_err(e, vec![bucket, object]));
}
};
let mark_delete = oi.version_id.is_some();
let mut delete_marker = opts.versioned;
let mod_time = if let Some(mt) = opts.mod_time {
mt
} else {
OffsetDateTime::now_utc()
};
let find_vid = Uuid::new_v4();
if mark_delete && (opts.versioned || opts.version_suspended) {
if !delete_marker {
delete_marker = opts.version_suspended && opts.version_id.is_none();
}
let mut fi = FileInfo {
name: object.to_string(),
deleted: delete_marker,
mark_deleted: mark_delete,
mod_time: Some(mod_time),
..Default::default() // TODO: replication
};
fi.set_tier_free_version_id(&find_vid.to_string());
if opts.skip_free_version {
fi.set_skip_tier_free_version();
}
fi.version_id = if let Some(vid) = opts.version_id {
Some(Uuid::parse_str(vid.as_str())?)
} else if opts.versioned {
Some(Uuid::new_v4())
} else {
None
};
self.delete_object_version(bucket, object, &fi, opts.delete_marker)
.await
.map_err(|e| to_object_err(e, vec![bucket, object]))?;
return Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended));
}
let version_id = opts.version_id.as_ref().and_then(|v| Uuid::parse_str(v).ok());
// Create a single object deletion request
let mut vr = FileInfo {
name: object.to_string(),
version_id: opts.version_id.as_ref().and_then(|v| Uuid::parse_str(v).ok()),
..Default::default()
};
// Handle versioning
let (suspended, versioned) = (opts.version_suspended, opts.versioned);
if opts.version_id.is_none() && (suspended || versioned) {
vr.mod_time = Some(OffsetDateTime::now_utc());
vr.deleted = true;
if versioned {
vr.version_id = Some(Uuid::new_v4());
}
}
let vers = vec![FileInfoVersions {
name: vr.name.clone(),
versions: vec![vr.clone()],
..Default::default()
}];
let disks = self.disks.read().await;
let disks = disks.clone();
let write_quorum = disks.len() / 2 + 1;
let mut futures = Vec::with_capacity(disks.len());
let mut errs = Vec::with_capacity(disks.len());
for disk in disks.iter() {
let vers = vers.clone();
futures.push(async move {
if let Some(disk) = disk {
disk.delete_versions(bucket, vers, DeleteOptions::default()).await
} else {
Err(DiskError::DiskNotFound)
}
});
}
let results = join_all(futures).await;
for result in results {
match result {
Ok(disk_errs) => {
// Handle errors from disk operations
for err in disk_errs.iter().flatten() {
warn!("delete_object disk error: {:?}", err);
}
errs.push(None);
}
Err(e) => {
errs.push(Some(e));
}
}
}
// Check write quorum
if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(to_object_err(err.into(), vec![bucket, object]));
}
// Create result ObjectInfo
let result_info = if vr.deleted {
ObjectInfo {
bucket: bucket.to_string(),
name: object.to_string(),
delete_marker: true,
mod_time: vr.mod_time,
version_id: vr.version_id,
..Default::default()
}
} else {
ObjectInfo {
bucket: bucket.to_string(),
name: object.to_string(),
version_id: vr.version_id,
..Default::default()
}
};
Ok(result_info)
}
#[tracing::instrument(skip(self))]
async fn list_objects_v2(
self: Arc<Self>,
_bucket: &str,
_prefix: &str,
_continuation_token: Option<String>,
_delimiter: Option<String>,
_max_keys: i32,
_fetch_owner: bool,
_start_after: Option<String>,
) -> Result<ListObjectsV2Info> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn list_object_versions(
self: Arc<Self>,
_bucket: &str,
_prefix: &str,
_marker: Option<String>,
_version_marker: Option<String>,
_delimiter: Option<String>,
_max_keys: i32,
) -> Result<ListObjectVersionsInfo> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn get_object_info(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<ObjectInfo> {
// Acquire a shared read-lock to protect consistency during info fetch
let _read_lock_guard = if !opts.no_lock {
Some(
self.fast_lock_manager
.acquire_read_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?,
)
} else {
None
};
let (fi, _, _) = self
.get_object_fileinfo(bucket, object, opts, false)
.await
.map_err(|e| to_object_err(e, vec![bucket, object]))?;
// warn!("get object_info fi {:?}", &fi);
let oi = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
Ok(oi)
}
#[tracing::instrument(skip(self))]
async fn add_partial(&self, bucket: &str, object: &str, version_id: &str) -> Result<()> {
let _ = rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(HealChannelPriority::Normal),
Some(self.pool_index),
Some(self.set_index),
))
.await;
Ok(())
}
#[tracing::instrument(skip(self))]
async fn put_object_metadata(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<ObjectInfo> {
// TODO: nslock
// Guard lock for metadata update
let _lock_guard = if !opts.no_lock {
Some(
self.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|_| Error::other("can not get lock. please retry".to_string()))?,
)
} else {
None
};
let disks = self.get_disks_internal().await;
let (metas, errs) = {
if opts.version_id.is_some() {
Self::read_all_fileinfo(
&disks,
"",
bucket,
object,
opts.version_id.as_ref().unwrap().to_string().as_str(),
false,
false,
)
.await?
} else {
Self::read_all_xl(&disks, bucket, object, false, false).await
}
};
let read_quorum = match Self::object_quorum_from_meta(&metas, &errs, self.default_parity_count) {
Ok((res, _)) => res,
Err(mut err) => {
if err == DiskError::ErasureReadQuorum
&& !bucket.starts_with(RUSTFS_META_BUCKET)
&& self
.delete_if_dang_ling(bucket, object, &metas, &errs, &HashMap::new(), opts.clone())
.await
.is_ok()
{
if opts.version_id.is_some() {
err = DiskError::FileVersionNotFound
} else {
err = DiskError::FileNotFound
}
}
return Err(to_object_err(err.into(), vec![bucket, object]));
}
};
let read_quorum = read_quorum as usize;
let (online_disks, mod_time, etag) = Self::list_online_disks(&disks, &metas, &errs, read_quorum);
let mut fi = Self::pick_valid_fileinfo(&metas, mod_time, etag, read_quorum)
.map_err(|e| to_object_err(e.into(), vec![bucket, object]))?;
if fi.deleted {
return Err(to_object_err(Error::MethodNotAllowed, vec![bucket, object]));
}
let obj_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
for (k, v) in obj_info.user_defined {
fi.metadata.insert(k, v);
}
if let Some(mt) = &opts.eval_metadata {
for (k, v) in mt {
fi.metadata.insert(k.clone(), v.clone());
}
}
fi.mod_time = opts.mod_time;
if let Some(ref version_id) = opts.version_id {
fi.version_id = Uuid::parse_str(version_id).ok();
}
self.update_object_meta(bucket, object, fi.clone(), &online_disks)
.await
.map_err(|e| to_object_err(e.into(), vec![bucket, object]))?;
Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended))
}
#[tracing::instrument(skip(self))]
async fn get_object_tags(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<String> {
let oi = self.get_object_info(bucket, object, opts).await?;
Ok(oi.user_tags)
}
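// Transitions an object's data to a remote tier: streams the decoded object
// to the tier target, updates the FileInfo with the transition status, tier,
// and remote object name, deletes the local object version accordingly, and
// emits an ObjectTransitionComplete/Failed event.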
#[tracing::instrument(level = "debug", skip(self))]
async fn transition_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<()> {
let mut tier_config_mgr = GLOBAL_TierConfigMgr.write().await;
let tgt_client = match tier_config_mgr.get_driver(&opts.transition.tier).await {
Ok(client) => client,
Err(err) => {
return Err(Error::other(err.to_string()));
}
};
// Acquire write-lock early; hold for the whole transition operation scope
// let mut _lock_guard: Option<rustfs_lock::LockGuard> = None;
// if !opts.no_lock {
// let guard_opt = self
// .namespace_lock
// .lock_guard(object, &self.locker_owner, Duration::from_secs(5), Duration::from_secs(10))
// .await?;
// if guard_opt.is_none() {
// return Err(Error::other("can not get lock. please retry".to_string()));
// }
// _lock_guard = guard_opt;
// }
let (mut fi, meta_arr, online_disks) = self.get_object_fileinfo(bucket, object, opts, true).await?;
/*if err != nil {
return Err(to_object_err(err, vec![bucket, object]));
}*/
/*if fi.deleted {
if opts.version_id.is_none() {
return Err(to_object_err(DiskError::FileNotFound, vec![bucket, object]));
}
return Err(to_object_err(ERR_METHOD_NOT_ALLOWED, vec![bucket, object]));
}*/
if !opts.mod_time.expect("err").unix_timestamp() == fi.mod_time.as_ref().expect("err").unix_timestamp()
|| opts.transition.etag != extract_etag(&fi.metadata)
{
return Err(to_object_err(Error::from(DiskError::FileNotFound), vec![bucket, object]));
}
if fi.transition_status == TRANSITION_COMPLETE {
return Ok(());
}
/*if fi.xlv1 {
if let Err(err) = self.heal_object(bucket, object, "", &HealOpts {no_lock: true, ..Default::default()}) {
return err.expect("err");
}
(fi, meta_arr, online_disks) = self.get_object_fileinfo(&bucket, &object, &opts, true);
if err != nil {
return to_object_err(err, vec![bucket, object]);
}
}*/
let dest_obj = gen_transition_objname(bucket);
if let Err(err) = dest_obj {
return Err(to_object_err(err, vec![]));
}
let dest_obj = dest_obj.unwrap();
let oi = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
let (pr, mut pw) = tokio::io::duplex(fi.erasure.block_size);
let reader = ReaderImpl::ObjectBody(GetObjectReader {
stream: Box::new(pr),
object_info: oi,
});
let cloned_bucket = bucket.to_string();
let cloned_object = object.to_string();
let cloned_fi = fi.clone();
let set_index = self.set_index;
let pool_index = self.pool_index;
tokio::spawn(async move {
if let Err(e) = Self::get_object_with_fileinfo(
&cloned_bucket,
&cloned_object,
0,
cloned_fi.size,
&mut pw,
cloned_fi,
meta_arr,
&online_disks,
set_index,
pool_index,
)
.await
{
error!("get_object_with_fileinfo err {:?}", e);
};
});
let rv = tgt_client
.put_with_meta(&dest_obj, reader, fi.size, {
let mut m = HashMap::<String, String>::new();
m.insert("name".to_string(), object.to_string());
m
})
.await;
if let Err(err) = rv {
return Err(StorageError::Io(err));
}
let rv = rv.unwrap();
fi.transition_status = TRANSITION_COMPLETE.to_string();
fi.transitioned_objname = dest_obj;
fi.transition_tier = opts.transition.tier.clone();
fi.transition_version_id = if rv.is_empty() { None } else { Some(Uuid::parse_str(&rv)?) };
let mut event_name = EventName::ObjectTransitionComplete.as_ref();
let disks = self.get_disks(0, 0).await?;
if let Err(err) = self.delete_object_version(bucket, object, &fi, false).await {
event_name = EventName::ObjectTransitionFailed.as_ref();
}
for disk in disks.iter() {
if let Some(disk) = disk {
if disk.is_online().await {
continue;
}
}
let _ = self.add_partial(bucket, object, opts.version_id.as_ref().expect("err")).await;
break;
}
let obj_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended);
send_event(EventArgs {
event_name: event_name.to_string(),
bucket_name: bucket.to_string(),
object: obj_info,
user_agent: "Internal: [ILM-Transition]".to_string(),
host: GLOBAL_LocalNodeName.to_string(),
..Default::default()
});
//let tags = opts.lifecycle_audit_event.tags();
//auditLogLifecycle(ctx, objInfo, ILMTransition, tags, traceFn)
Ok(())
}
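// Restores a previously transitioned object from its remote tier. The
// multipart restore path is still stubbed out below, so this currently only
// resolves the FileInfo and restore options; on failure it clears the restore
// header via update_restore_metadata.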
#[tracing::instrument(level = "debug", skip(self))]
async fn restore_transitioned_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<()> {
// Acquire write-lock early for the restore operation
// let mut _lock_guard: Option<rustfs_lock::LockGuard> = None;
// if !opts.no_lock {
// let guard_opt = self
// .namespace_lock
// .lock_guard(object, &self.locker_owner, Duration::from_secs(5), Duration::from_secs(10))
// .await?;
// if guard_opt.is_none() {
// return Err(Error::other("can not get lock. please retry".to_string()));
// }
// _lock_guard = guard_opt;
// }
let set_restore_header_fn = async move |oi: &mut ObjectInfo, rerr: Option<Error>| -> Result<()> {
if rerr.is_none() {
return Ok(());
}
self.update_restore_metadata(bucket, object, oi, opts).await?;
Err(rerr.unwrap())
};
let mut oi = ObjectInfo::default();
let fi = self.get_object_fileinfo(bucket, object, opts, true).await;
if let Err(err) = fi {
return set_restore_header_fn(&mut oi, Some(to_object_err(err, vec![bucket, object]))).await;
}
let (actual_fi, _, _) = fi.unwrap();
oi = ObjectInfo::from_file_info(&actual_fi, bucket, object, opts.versioned || opts.version_suspended);
let ropts = put_restore_opts(bucket, object, &opts.transition.restore_request, &oi);
/*if oi.parts.len() == 1 {
let mut rs: HTTPRangeSpec;
let gr = get_transitioned_object_reader(bucket, object, rs, HeaderMap::new(), oi, opts);
//if err != nil {
// return set_restore_header_fn(&mut oi, Some(toObjectErr(err, bucket, object)));
//}
let hash_reader = HashReader::new(gr, gr.obj_info.size, "", "", gr.obj_info.size);
let p_reader = PutObjReader::new(StreamingBlob::from(Box::pin(hash_reader)), hash_reader.size());
if let Err(err) = self.put_object(bucket, object, &mut p_reader, &ropts).await {
return set_restore_header_fn(&mut oi, Some(to_object_err(err, vec![bucket, object])));
} else {
return Ok(());
}
}
let res = self.new_multipart_upload(bucket, object, &ropts).await?;
//if err != nil {
// return set_restore_header_fn(&mut oi, err);
//}
let mut uploaded_parts: Vec<CompletePart> = vec![];
let mut rs: HTTPRangeSpec;
let gr = get_transitioned_object_reader(bucket, object, rs, HeaderMap::new(), oi, opts).await?;
//if err != nil {
// return set_restore_header_fn(&mut oi, err);
//}
for part_info in oi.parts {
//let hr = HashReader::new(LimitReader(gr, part_info.size), part_info.size, "", "", part_info.size);
let hr = HashReader::new(gr, part_info.size as i64, part_info.size as i64, None, false);
//if err != nil {
// return set_restore_header_fn(&mut oi, err);
//}
let mut p_reader = PutObjReader::new(hr, hr.size());
let p_info = self.put_object_part(bucket, object, &res.upload_id, part_info.number, &mut p_reader, &ObjectOptions::default()).await?;
//if let Err(err) = p_info {
// return set_restore_header_fn(&mut oi, err);
//}
if p_info.size != part_info.size {
return set_restore_header_fn(&mut oi, Some(Error::from(ObjectApiError::InvalidObjectState(GenericError{bucket: bucket.to_string(), object: object.to_string(), ..Default::default()}))));
}
uploaded_parts.push(CompletePart {
part_num: p_info.part_num,
etag: p_info.etag,
});
}
if let Err(err) = self.complete_multipart_upload(bucket, object, &res.upload_id, uploaded_parts, &ObjectOptions {
mod_time: oi.mod_time,
..Default::default()
}).await {
set_restore_header_fn(&mut oi, Some(err));
}*/
Ok(())
}
#[tracing::instrument(level = "debug", skip(self))]
async fn put_object_tags(&self, bucket: &str, object: &str, tags: &str, opts: &ObjectOptions) -> Result<ObjectInfo> {
// Acquire write-lock for tag update (metadata write)
// let mut _lock_guard: Option<rustfs_lock::LockGuard> = None;
// if !opts.no_lock {
// let guard_opt = self
// .namespace_lock
// .lock_guard(object, &self.locker_owner, Duration::from_secs(5), Duration::from_secs(10))
// .await?;
// if guard_opt.is_none() {
// return Err(Error::other("can not get lock. please retry".to_string()));
// }
// _lock_guard = guard_opt;
// }
let (mut fi, _, disks) = self.get_object_fileinfo(bucket, object, opts, false).await?;
fi.metadata.insert(AMZ_OBJECT_TAGGING.to_owned(), tags.to_owned());
// TODO: user-defined
self.update_object_meta(bucket, object, fi.clone(), disks.as_slice()).await?;
// TODO: versioned
Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended))
}
#[tracing::instrument(skip(self))]
async fn delete_object_tags(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<ObjectInfo> {
self.put_object_tags(bucket, object, "", opts).await
}
#[tracing::instrument(skip(self))]
async fn copy_object_part(
&self,
_src_bucket: &str,
_src_object: &str,
_dst_bucket: &str,
_dst_object: &str,
_upload_id: &str,
_part_id: usize,
_start_offset: i64,
_length: i64,
_src_info: &ObjectInfo,
_src_opts: &ObjectOptions,
_dst_opts: &ObjectOptions,
) -> Result<()> {
unimplemented!()
}
#[tracing::instrument(level = "debug", skip(self, data, opts))]
async fn put_object_part(
&self,
bucket: &str,
object: &str,
upload_id: &str,
part_id: usize,
data: &mut PutObjReader,
opts: &ObjectOptions,
) -> Result<PartInfo> {
let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
let (fi, _) = self.check_upload_id_exists(bucket, object, upload_id, true).await?;
let write_quorum = fi.write_quorum(self.default_write_quorum());
let disks = self.disks.read().await;
let disks = disks.clone();
let shuffle_disks = Self::shuffle_disks(&disks, &fi.erasure.distribution);
let part_suffix = format!("part.{part_id}");
let tmp_part = format!("{}x{}", Uuid::new_v4(), OffsetDateTime::now_utc().unix_timestamp());
let tmp_part_path = Arc::new(format!("{tmp_part}/{part_suffix}"));
// let mut writers = Vec::with_capacity(disks.len());
// let erasure = Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size);
// let shared_size = erasure.shard_size(erasure.block_size);
// let futures = disks.iter().map(|disk| {
// let disk = disk.clone();
// let tmp_part_path = tmp_part_path.clone();
// tokio::spawn(async move {
// if let Some(disk) = disk {
// // let writer = disk.append_file(RUSTFS_META_TMP_BUCKET, &tmp_part_path).await?;
// // let filewriter = disk
// // .create_file("", RUSTFS_META_TMP_BUCKET, &tmp_part_path, data.content_length)
// // .await?;
// match new_bitrot_filewriter(
// disk.clone(),
// RUSTFS_META_TMP_BUCKET,
// &tmp_part_path,
// false,
// DEFAULT_BITROT_ALGO,
// shared_size,
// )
// .await
// {
// Ok(writer) => Ok(Some(writer)),
// Err(e) => Err(e),
// }
// } else {
// Ok(None)
// }
// })
// });
// for x in join_all(futures).await {
// let x = x??;
// writers.push(x);
// }
// let erasure = Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size);
// let stream = replace(&mut data.stream, Box::new(empty()));
// let etag_stream = EtagReader::new(stream);
// let (w_size, mut etag) = Arc::new(erasure)
// .encode(etag_stream, &mut writers, data.content_length, write_quorum)
// .await?;
// if let Err(err) = close_bitrot_writers(&mut writers).await {
// error!("close_bitrot_writers err {:?}", err);
// }
let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size);
let mut writers = Vec::with_capacity(shuffle_disks.len());
let mut errors = Vec::with_capacity(shuffle_disks.len());
for disk_op in shuffle_disks.iter() {
if let Some(disk) = disk_op {
let writer = create_bitrot_writer(
false,
Some(disk),
RUSTFS_META_TMP_BUCKET,
&tmp_part_path,
erasure.shard_file_size(data.size()),
erasure.shard_size(),
HashAlgorithm::HighwayHash256,
)
.await?;
// let writer = {
// let f = match disk
// .create_file("", RUSTFS_META_TMP_BUCKET, &tmp_part_path, erasure.shard_file_size(data.content_length))
// .await
// {
// Ok(f) => f,
// Err(e) => {
// errors.push(Some(e));
// writers.push(None);
// continue;
// }
// };
// BitrotWriter::new(Writer::from_tokio_writer(f), erasure.shard_size(), HashAlgorithm::HighwayHash256)
// };
writers.push(Some(writer));
errors.push(None);
} else {
errors.push(Some(DiskError::DiskNotFound));
writers.push(None);
}
}
let nil_count = errors.iter().filter(|&e| e.is_none()).count();
if nil_count < write_quorum {
if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) {
return Err(to_object_err(write_err.into(), vec![bucket, object]));
}
return Err(Error::other(format!("not enough disks to write: {errors:?}")));
}
let stream = mem::replace(
&mut data.stream,
HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false)?,
);
let (reader, w_size) = Arc::new(erasure).encode(stream, &mut writers, write_quorum).await?; // TODO: on error, remove the temporary directory
let _ = mem::replace(&mut data.stream, reader);
if (w_size as i64) < data.size() {
return Err(Error::other("put_object_part write size < data.size()"));
}
let index_op = data.stream.try_get_index().map(|v| v.clone().into_vec());
let mut etag = data.stream.try_resolve_etag().unwrap_or_default();
if let Some(ref tag) = opts.preserve_etag {
etag = tag.clone();
}
let mut actual_size = data.actual_size();
if actual_size < 0 {
let is_compressed = fi.is_compressed();
if !is_compressed {
actual_size = w_size as i64;
}
}
let part_info = ObjectPartInfo {
etag: etag.clone(),
number: part_id,
size: w_size,
mod_time: Some(OffsetDateTime::now_utc()),
actual_size,
index: index_op,
..Default::default()
};
// debug!("put_object_part part_info {:?}", part_info);
// fi.parts = vec![part_info.clone()];
let part_info_buff = part_info.marshal_msg()?;
drop(writers); // drop writers to close all files
let part_path = format!("{}/{}/{}", upload_id_path, fi.data_dir.unwrap_or_default(), part_suffix);
let _ = Self::rename_part(
&disks,
RUSTFS_META_TMP_BUCKET,
&tmp_part_path,
RUSTFS_META_MULTIPART_BUCKET,
&part_path,
part_info_buff.into(),
write_quorum,
)
.await?;
let ret: PartInfo = PartInfo {
etag: Some(etag.clone()),
part_num: part_id,
last_mod: Some(OffsetDateTime::now_utc()),
size: w_size,
actual_size,
};
// error!("put_object_part ret {:?}", &ret);
Ok(ret)
}
#[tracing::instrument(skip(self))]
async fn list_object_parts(
&self,
bucket: &str,
object: &str,
upload_id: &str,
part_number_marker: Option<usize>,
mut max_parts: usize,
opts: &ObjectOptions,
) -> Result<ListPartsInfo> {
let (fi, _) = self.check_upload_id_exists(bucket, object, upload_id, false).await?;
let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
if max_parts > MAX_PARTS_COUNT {
max_parts = MAX_PARTS_COUNT;
}
let part_number_marker = part_number_marker.unwrap_or_default();
// Extract storage class from metadata, default to STANDARD if not found
let storage_class = fi
.metadata
.get(rustfs_filemeta::headers::AMZ_STORAGE_CLASS)
.cloned()
.unwrap_or_else(|| storageclass::STANDARD.to_string());
let mut ret = ListPartsInfo {
bucket: bucket.to_owned(),
object: object.to_owned(),
upload_id: upload_id.to_owned(),
storage_class,
max_parts,
part_number_marker,
user_defined: fi.metadata.clone(),
..Default::default()
};
if max_parts == 0 {
return Ok(ret);
}
let online_disks = self.get_disks_internal().await;
let read_quorum = fi.read_quorum(self.default_read_quorum());
let part_path = format!(
"{}{}",
path_join_buf(&[
&upload_id_path,
fi.data_dir.map(|v| v.to_string()).unwrap_or_default().as_str(),
]),
SLASH_SEPARATOR
);
let mut part_numbers = match Self::list_parts(&online_disks, &part_path, read_quorum).await {
Ok(parts) => parts,
Err(err) => {
if err == DiskError::FileNotFound {
return Ok(ret);
}
return Err(to_object_err(err.into(), vec![bucket, object]));
}
};
if part_numbers.is_empty() {
return Ok(ret);
}
let start_op = part_numbers.iter().position(|&v| v != 0 && v == part_number_marker);
if part_number_marker > 0 && start_op.is_none() {
return Ok(ret);
}
if let Some(start) = start_op {
if start + 1 > part_numbers.len() {
return Ok(ret);
}
part_numbers = part_numbers[start + 1..].to_vec();
}
let mut parts = Vec::with_capacity(part_numbers.len());
let part_meta_paths = part_numbers
.iter()
.map(|v| format!("{part_path}part.{v}.meta"))
.collect::<Vec<String>>();
let object_parts =
Self::read_parts(&online_disks, RUSTFS_META_MULTIPART_BUCKET, &part_meta_paths, &part_numbers, read_quorum)
.await
.map_err(|e| to_object_err(e.into(), vec![bucket, object, upload_id]))?;
let mut count = max_parts;
for (i, part) in object_parts.iter().enumerate() {
if let Some(err) = &part.error {
warn!("list_object_parts part error: {:?}", &err);
}
parts.push(PartInfo {
etag: Some(part.etag.clone()),
part_num: part.number,
last_mod: part.mod_time,
size: part.size,
actual_size: part.actual_size,
});
count -= 1;
if count == 0 {
break;
}
}
ret.parts = parts;
if object_parts.len() > ret.parts.len() {
ret.is_truncated = true;
ret.next_part_number_marker = ret.parts.last().map(|v| v.part_num).unwrap_or_default();
}
Ok(ret)
}
#[tracing::instrument(skip(self))]
async fn list_multipart_uploads(
&self,
bucket: &str,
object: &str,
key_marker: Option<String>,
upload_id_marker: Option<String>,
delimiter: Option<String>,
max_uploads: usize,
) -> Result<ListMultipartsInfo> {
let disks = {
let disks = self.get_online_local_disks().await;
if disks.is_empty() {
// TODO: getOnlineDisksWithHealing
self.get_online_disks().await
} else {
disks
}
};
let mut upload_ids: Vec<String> = Vec::new();
for disk in disks.iter().flatten() {
if !disk.is_online().await {
continue;
}
let has_upload_ids = match disk
.list_dir(
bucket,
RUSTFS_META_MULTIPART_BUCKET,
Self::get_multipart_sha_dir(bucket, object).as_str(),
-1,
)
.await
{
Ok(res) => Some(res),
Err(err) => {
if err == DiskError::DiskNotFound {
None
} else if err == DiskError::FileNotFound {
return Ok(ListMultipartsInfo {
key_marker: key_marker.to_owned(),
max_uploads,
prefix: object.to_owned(),
delimiter: delimiter.to_owned(),
..Default::default()
});
} else {
return Err(to_object_err(err.into(), vec![bucket, object]));
}
}
};
if let Some(ids) = has_upload_ids {
upload_ids = ids;
break;
}
}
let mut uploads = Vec::new();
let mut populated_upload_ids = HashSet::new();
for upload_id in upload_ids.iter() {
let upload_id = upload_id.trim_end_matches(SLASH_SEPARATOR).to_string();
if populated_upload_ids.contains(&upload_id) {
continue;
}
let start_time = {
let now = OffsetDateTime::now_utc();
let splits: Vec<&str> = upload_id.split("x").collect();
if splits.len() == 2 {
if let Ok(unix) = splits[1].parse::<i128>() {
OffsetDateTime::from_unix_timestamp_nanos(unix)?
} else {
now
}
} else {
now
}
};
uploads.push(MultipartInfo {
bucket: bucket.to_owned(),
object: object.to_owned(),
upload_id: base64_encode(format!("{}.{}", get_global_deployment_id().unwrap_or_default(), upload_id).as_bytes()),
initiated: Some(start_time),
..Default::default()
});
populated_upload_ids.insert(upload_id);
}
uploads.sort_by(|a, b| a.initiated.cmp(&b.initiated));
let mut upload_idx = 0;
if let Some(upload_id_marker) = &upload_id_marker {
// Skip entries up to and including the marker.
while upload_idx < uploads.len() {
if &uploads[upload_idx].upload_id == upload_id_marker {
upload_idx += 1;
break;
}
upload_idx += 1;
}
}
let mut ret_uploads = Vec::new();
let mut next_upload_id_marker = None;
while upload_idx < uploads.len() {
ret_uploads.push(uploads[upload_idx].clone());
next_upload_id_marker = Some(uploads[upload_idx].upload_id.clone());
upload_idx += 1;
if ret_uploads.len() >= max_uploads {
break;
}
}
let is_truncated = ret_uploads.len() < uploads.len();
if !is_truncated {
next_upload_id_marker = None;
}
Ok(ListMultipartsInfo {
key_marker: key_marker.to_owned(),
next_upload_id_marker,
max_uploads,
is_truncated,
uploads: ret_uploads,
prefix: object.to_owned(),
delimiter: delimiter.to_owned(),
..Default::default()
})
}
#[tracing::instrument(skip(self))]
async fn new_multipart_upload(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<MultipartUploadResult> {
let disks = self.disks.read().await;
let disks = disks.clone();
let mut user_defined = opts.user_defined.clone();
if let Some(ref etag) = opts.preserve_etag {
user_defined.insert("etag".to_owned(), etag.clone());
}
if let Some(sc) = user_defined.get(AMZ_STORAGE_CLASS) {
if sc == storageclass::STANDARD {
let _ = user_defined.remove(AMZ_STORAGE_CLASS);
}
}
let sc_parity_drives = {
if let Some(sc) = GLOBAL_STORAGE_CLASS.get() {
sc.get_parity_for_sc(user_defined.get(AMZ_STORAGE_CLASS).cloned().unwrap_or_default().as_str())
} else {
None
}
};
let mut parity_drives = sc_parity_drives.unwrap_or(self.default_parity_count);
if opts.max_parity {
parity_drives = disks.len() / 2;
}
let data_drives = disks.len() - parity_drives;
let mut write_quorum = data_drives;
if data_drives == parity_drives {
write_quorum += 1
}
let mut fi = FileInfo::new([bucket, object].join("/").as_str(), data_drives, parity_drives);
fi.version_id = if let Some(vid) = &opts.version_id {
Some(Uuid::parse_str(vid)?)
} else {
None
};
if opts.versioned && opts.version_id.is_none() {
fi.version_id = Some(Uuid::new_v4());
}
fi.data_dir = Some(Uuid::new_v4());
fi.fresh = true;
let parts_metadata = vec![fi.clone(); disks.len()];
if !user_defined.contains_key("content-type") {
// TODO: get content-type
}
let (shuffle_disks, mut parts_metadatas) = Self::shuffle_disks_and_parts_metadata(&disks, &parts_metadata, &fi);
let mod_time = opts.mod_time.unwrap_or(OffsetDateTime::now_utc());
for fi in parts_metadatas.iter_mut() {
fi.metadata = user_defined.clone();
fi.mod_time = Some(mod_time);
fi.fresh = true;
}
// fi.mod_time = Some(now);
let upload_uuid = format!("{}x{}", Uuid::new_v4(), mod_time.unix_timestamp_nanos());
let upload_id = base64_encode(format!("{}.{}", get_global_deployment_id().unwrap_or_default(), upload_uuid).as_bytes());
let upload_path = Self::get_upload_id_dir(bucket, object, upload_uuid.as_str());
Self::write_unique_file_info(
&shuffle_disks,
bucket,
RUSTFS_META_MULTIPART_BUCKET,
upload_path.as_str(),
&parts_metadatas,
write_quorum,
)
.await
.map_err(|e| to_object_err(e.into(), vec![bucket, object]))?;
// evalDisks
Ok(MultipartUploadResult { upload_id })
}
#[tracing::instrument(skip(self))]
async fn get_multipart_info(
&self,
bucket: &str,
object: &str,
upload_id: &str,
_opts: &ObjectOptions,
) -> Result<MultipartInfo> {
// TODO: nslock
let (fi, _) = self
.check_upload_id_exists(bucket, object, upload_id, false)
.await
.map_err(|e| to_object_err(e, vec![bucket, object, upload_id]))?;
Ok(MultipartInfo {
bucket: bucket.to_owned(),
object: object.to_owned(),
upload_id: upload_id.to_owned(),
user_defined: fi.metadata.clone(),
..Default::default()
})
}
#[tracing::instrument(skip(self))]
async fn abort_multipart_upload(&self, bucket: &str, object: &str, upload_id: &str, _opts: &ObjectOptions) -> Result<()> {
self.check_upload_id_exists(bucket, object, upload_id, false).await?;
let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
self.delete_all(RUSTFS_META_MULTIPART_BUCKET, &upload_id_path).await
}
// complete_multipart_upload assembles the uploaded parts into the final object
#[tracing::instrument(skip(self))]
async fn complete_multipart_upload(
self: Arc<Self>,
bucket: &str,
object: &str,
upload_id: &str,
uploaded_parts: Vec<CompletePart>,
opts: &ObjectOptions,
) -> Result<ObjectInfo> {
let (mut fi, files_metas) = self.check_upload_id_exists(bucket, object, upload_id, true).await?;
let upload_id_path = Self::get_upload_id_dir(bucket, object, upload_id);
let write_quorum = fi.write_quorum(self.default_write_quorum());
let disks = self.disks.read().await;
let disks = disks.clone();
// let disks = Self::shuffle_disks(&disks, &fi.erasure.distribution);
// Acquire per-object exclusive lock via RAII guard. It auto-releases asynchronously on drop.
// let mut _object_lock_guard: Option<rustfs_lock::LockGuard> = None;
if let Some(http_preconditions) = opts.http_preconditions.clone() {
// if !opts.no_lock {
// let guard_opt = self
// .namespace_lock
// .lock_guard(object, &self.locker_owner, Duration::from_secs(5), Duration::from_secs(10))
// .await?;
// if guard_opt.is_none() {
// return Err(Error::other("can not get lock. please retry".to_string()));
// }
// _object_lock_guard = guard_opt;
// }
if let Some(err) = self.check_write_precondition(bucket, object, opts).await {
return Err(err);
}
}
let part_path = format!("{}/{}/", upload_id_path, fi.data_dir.unwrap_or(Uuid::nil()));
let part_meta_paths = uploaded_parts
.iter()
.map(|v| format!("{part_path}part.{0}.meta", v.part_num))
.collect::<Vec<String>>();
let part_numbers = uploaded_parts.iter().map(|v| v.part_num).collect::<Vec<usize>>();
let object_parts =
Self::read_parts(&disks, RUSTFS_META_MULTIPART_BUCKET, &part_meta_paths, &part_numbers, write_quorum).await?;
if object_parts.len() != uploaded_parts.len() {
return Err(Error::other("part result number err"));
}
for (i, part) in object_parts.iter().enumerate() {
if let Some(err) = &part.error {
error!("complete_multipart_upload part error: {:?}", &err);
}
if uploaded_parts[i].part_num != part.number {
error!(
"complete_multipart_upload part_id err part_id != part_num {} != {}",
uploaded_parts[i].part_num, part.number
);
return Err(Error::InvalidPart(uploaded_parts[i].part_num, bucket.to_owned(), object.to_owned()));
}
fi.add_object_part(
part.number,
part.etag.clone(),
part.size,
part.mod_time,
part.actual_size,
part.index.clone(),
);
}
let (shuffle_disks, mut parts_metadatas) = Self::shuffle_disks_and_parts_metadata_by_index(&disks, &files_metas, &fi);
let curr_fi = fi.clone();
fi.parts = Vec::with_capacity(uploaded_parts.len());
let mut object_size: usize = 0;
let mut object_actual_size: i64 = 0;
for (i, p) in uploaded_parts.iter().enumerate() {
// Look up the stored part by its number rather than by loop index, so a
// sparse or reordered part list cannot be matched against the wrong entry.
let Some(ext_part) = curr_fi.parts.iter().find(|v| v.number == p.part_num) else {
error!(
"complete_multipart_upload part not found, part_id={}, bucket={}, object={}",
p.part_num, bucket, object
);
return Err(Error::InvalidPart(p.part_num, "".to_owned(), p.etag.clone().unwrap_or_default()));
};
if p.etag != Some(ext_part.etag.clone()) {
error!(
"complete_multipart_upload etag err {:?}, part_id={}, bucket={}, object={}",
p.etag, p.part_num, bucket, object
);
return Err(Error::InvalidPart(p.part_num, ext_part.etag.clone(), p.etag.clone().unwrap_or_default()));
}
// TODO: crypto
if (i < uploaded_parts.len() - 1) && !is_min_allowed_part_size(ext_part.actual_size) {
error!(
"complete_multipart_upload part size too small: part {} size {} is less than minimum {}",
p.part_num,
ext_part.actual_size,
GLOBAL_MIN_PART_SIZE.as_u64()
);
return Err(Error::EntityTooSmall(
p.part_num,
ext_part.actual_size,
GLOBAL_MIN_PART_SIZE.as_u64() as i64,
));
}
object_size += ext_part.size;
object_actual_size += ext_part.actual_size;
fi.parts.push(ObjectPartInfo {
etag: ext_part.etag.clone(),
number: p.part_num,
size: ext_part.size,
mod_time: ext_part.mod_time,
actual_size: ext_part.actual_size,
index: ext_part.index.clone(),
..Default::default()
});
}
fi.size = object_size as i64;
fi.mod_time = opts.mod_time;
if fi.mod_time.is_none() {
fi.mod_time = Some(OffsetDateTime::now_utc());
}
// etag
let etag = {
if let Some(etag) = opts.user_defined.get("etag") {
etag.clone()
} else {
get_complete_multipart_md5(&uploaded_parts)
}
};
fi.metadata.insert("etag".to_owned(), etag);
fi.metadata
.insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), object_actual_size.to_string());
if fi.is_compressed() {
fi.metadata
.insert(format!("{RESERVED_METADATA_PREFIX_LOWER}compression-size"), object_size.to_string());
}
if opts.data_movement {
fi.set_data_moved();
}
// TODO: object_actual_size
let _ = object_actual_size;
for meta in parts_metadatas.iter_mut() {
if meta.is_valid() {
meta.size = fi.size;
meta.mod_time = fi.mod_time;
meta.parts.clone_from(&fi.parts);
meta.metadata = fi.metadata.clone();
meta.versioned = opts.versioned || opts.version_suspended;
// TODO: Checksum
}
}
let mut parts = Vec::with_capacity(curr_fi.parts.len());
// TODO: optimize cleanup_multipart_path
for p in curr_fi.parts.iter() {
parts.push(path_join_buf(&[
&upload_id_path,
curr_fi.data_dir.unwrap_or(Uuid::nil()).to_string().as_str(),
format!("part.{}.meta", p.number).as_str(),
]));
if !fi.parts.iter().any(|v| v.number == p.number) {
parts.push(path_join_buf(&[
&upload_id_path,
curr_fi.data_dir.unwrap_or(Uuid::nil()).to_string().as_str(),
format!("part.{}", p.number).as_str(),
]));
}
// let _ = self
// .remove_part_meta(
// bucket,
// object,
// upload_id,
// curr_fi.data_dir.unwrap_or(Uuid::nil()).to_string().as_str(),
// p.number,
// )
// .await;
// if !fi.parts.iter().any(|v| v.number == p.number) {
// let _ = self
// .remove_object_part(
// bucket,
// object,
// upload_id,
// curr_fi.data_dir.unwrap_or(Uuid::nil()).to_string().as_str(),
// p.number,
// )
// .await;
// }
}
{
let disks = self.get_disks_internal().await;
Self::cleanup_multipart_path(&disks, &parts).await;
}
let (online_disks, versions, op_old_dir) = Self::rename_data(
&shuffle_disks,
RUSTFS_META_MULTIPART_BUCKET,
&upload_id_path,
&parts_metadatas,
bucket,
object,
write_quorum,
)
.await?;
// debug!("complete fileinfo {:?}", &fi);
// TODO: reduce_common_data_dir
if let Some(old_dir) = op_old_dir {
self.commit_rename_data_dir(&shuffle_disks, bucket, object, &old_dir.to_string(), write_quorum)
.await?;
}
if let Some(versions) = versions {
let _ =
rustfs_common::heal_channel::send_heal_request(rustfs_common::heal_channel::create_heal_request_with_options(
bucket.to_string(),
Some(object.to_string()),
false,
Some(rustfs_common::heal_channel::HealChannelPriority::Normal),
Some(self.pool_index),
Some(self.set_index),
))
.await;
}
let upload_id_path = upload_id_path.clone();
let store = self.clone();
let _cleanup_handle = tokio::spawn(async move {
let _ = store.delete_all(RUSTFS_META_MULTIPART_BUCKET, &upload_id_path).await;
});
for (i, op_disk) in online_disks.iter().enumerate() {
if let Some(disk) = op_disk {
if disk.is_online().await {
fi = parts_metadatas[i].clone();
break;
}
}
}
fi.is_latest = true;
Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended))
}
#[tracing::instrument(skip(self))]
async fn get_disks(&self, _pool_idx: usize, _set_idx: usize) -> Result<Vec<Option<DiskStore>>> {
Ok(self.get_disks_internal().await)
}
#[tracing::instrument(skip(self))]
fn set_drive_counts(&self) -> Vec<usize> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn delete_bucket(&self, _bucket: &str, _opts: &DeleteBucketOptions) -> Result<()> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn heal_format(&self, _dry_run: bool) -> Result<(HealResultItem, Option<Error>)> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn heal_bucket(&self, _bucket: &str, _opts: &HealOpts) -> Result<HealResultItem> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn heal_object(
&self,
bucket: &str,
object: &str,
version_id: &str,
opts: &HealOpts,
) -> Result<(HealResultItem, Option<Error>)> {
let _write_lock_guard = if !opts.no_lock {
Some(
self.fast_lock_manager
.acquire_write_lock("", object, self.locker_owner.as_str())
.await
.map_err(|e| Error::other(format!("Failed to acquire write lock for heal operation: {:?}", e)))?,
)
} else {
None
};
if has_suffix(object, SLASH_SEPARATOR) {
let (result, err) = self.heal_object_dir(bucket, object, opts.dry_run, opts.remove).await?;
return Ok((result, err.map(|e| e.into())));
}
let disks = self.disks.read().await;
let disks = disks.clone();
let (_, errs) = Self::read_all_fileinfo(&disks, "", bucket, object, version_id, false, false).await?;
if DiskError::is_all_not_found(&errs) {
warn!(
"heal_object failed, all obj part not found, bucket: {}, obj: {}, version_id: {}",
bucket, object, version_id
);
let err = if !version_id.is_empty() {
Error::FileVersionNotFound
} else {
Error::FileNotFound
};
return Ok((
self.default_heal_result(FileInfo::default(), &errs, bucket, object, version_id)
.await,
Some(err),
));
}
// Heal the object.
let (result, err) = self.heal_object(bucket, object, version_id, opts).await?;
if let Some(err) = err.as_ref() {
match err {
&DiskError::FileCorrupt if opts.scan_mode != HealScanMode::Deep => {
// Instead of returning an error when a bitrot error is detected
// during a normal heal scan, heal again with bitrot flag enabled.
let mut opts = *opts;
opts.scan_mode = HealScanMode::Deep;
let (result, err) = self.heal_object(bucket, object, version_id, &opts).await?;
return Ok((result, err.map(|e| e.into())));
}
_ => {}
}
}
Ok((result, err.map(|e| e.into())))
}
#[tracing::instrument(skip(self))]
async fn get_pool_and_set(&self, _id: &str) -> Result<(Option<usize>, Option<usize>, Option<usize>)> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn check_abandoned_parts(&self, _bucket: &str, _object: &str, _opts: &HealOpts) -> Result<()> {
unimplemented!()
}
#[tracing::instrument(skip(self))]
async fn verify_object_integrity(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result<()> {
let get_object_reader = <Self as ObjectIO>::get_object_reader(self, bucket, object, None, HeaderMap::new(), opts).await?;
// Stream to sink to avoid loading entire object into memory during verification
let mut reader = get_object_reader.stream;
tokio::io::copy(&mut reader, &mut tokio::io::sink()).await?;
Ok(())
}
}
#[derive(Debug, PartialEq, Eq)]
struct ObjProps {
mod_time: Option<OffsetDateTime>,
num_versions: usize,
}
impl Hash for ObjProps {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.mod_time.hash(state);
self.num_versions.hash(state);
}
}
#[derive(Default, Clone, Debug)]
pub struct HealEntryResult {
pub bytes: usize,
pub success: bool,
pub skipped: bool,
pub entry_done: bool,
pub name: String,
}
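/// Decides whether an object is dangling: its metadata or parts are missing on
/// more disks than the erasure coding can tolerate. Returns the last valid
/// `FileInfo` found when the object is judged dangling, or an error otherwise.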
fn is_object_dang_ling(
meta_arr: &[FileInfo],
errs: &[Option<DiskError>],
data_errs_by_part: &HashMap<usize, Vec<usize>>,
) -> disk::error::Result<FileInfo> {
let mut valid_meta = FileInfo::default();
let (not_found_meta_errs, non_actionable_meta_errs) = dang_ling_meta_errs_count(errs);
let (mut not_found_parts_errs, mut non_actionable_parts_errs) = (0, 0);
data_errs_by_part.iter().for_each(|(_, v)| {
let (nf, na) = dang_ling_part_errs_count(v);
if nf > not_found_parts_errs {
(not_found_parts_errs, non_actionable_parts_errs) = (nf, na);
}
});
meta_arr.iter().for_each(|fi| {
if fi.is_valid() {
valid_meta = fi.clone();
}
});
if !valid_meta.is_valid() {
let data_blocks = meta_arr.len().div_ceil(2);
if not_found_parts_errs > data_blocks {
return Ok(valid_meta);
}
return Err(DiskError::other("not ok"));
}
if non_actionable_meta_errs > 0 || non_actionable_parts_errs > 0 {
return Err(DiskError::other("not ok"));
}
if valid_meta.deleted {
let data_blocks = errs.len().div_ceil(2);
if not_found_meta_errs > data_blocks {
return Ok(valid_meta);
}
return Err(DiskError::other("not ok"));
}
if not_found_meta_errs > 0 && not_found_meta_errs > valid_meta.erasure.parity_blocks {
return Ok(valid_meta);
}
if !valid_meta.is_remote() && not_found_parts_errs > 0 && not_found_parts_errs > valid_meta.erasure.parity_blocks {
return Ok(valid_meta);
}
Err(DiskError::other("not ok"))
}
fn dang_ling_meta_errs_count(cerrs: &[Option<DiskError>]) -> (usize, usize) {
let (mut not_found_count, mut non_actionable_count) = (0, 0);
cerrs.iter().for_each(|err| {
if let Some(err) = err {
if err == &DiskError::FileNotFound || err == &DiskError::FileVersionNotFound {
not_found_count += 1;
} else {
non_actionable_count += 1;
}
}
});
(not_found_count, non_actionable_count)
}
fn dang_ling_part_errs_count(results: &[usize]) -> (usize, usize) {
let (mut not_found_count, mut non_actionable_count) = (0, 0);
results.iter().for_each(|result| {
if *result == CHECK_PART_SUCCESS {
// skip
} else if *result == CHECK_PART_FILE_NOT_FOUND {
not_found_count += 1;
} else {
non_actionable_count += 1;
}
});
(not_found_count, non_actionable_count)
}
fn is_object_dir_dang_ling(errs: &[Option<DiskError>]) -> bool {
let mut found = 0;
let mut not_found = 0;
let mut found_not_empty = 0;
let mut other_found = 0;
errs.iter().for_each(|err| {
if err.is_none() {
found += 1;
} else if let Some(err) = err {
if err == &DiskError::FileNotFound || err == &DiskError::VolumeNotFound {
not_found += 1;
} else if err == &DiskError::VolumeNotEmpty {
found_not_empty += 1;
} else {
other_found += 1;
}
}
});
found = found + found_not_empty + other_found;
found < not_found && found > 0
}
fn join_errs(errs: &[Option<DiskError>]) -> String {
let errs = errs
.iter()
.map(|err| {
if let Some(err) = err {
return err.to_string();
}
"<nil>".to_string()
})
.collect::<Vec<_>>();
errs.join(", ")
}
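/// Checks every online disk against the latest metadata and returns the disks
/// whose copy is fully intact, together with two maps of CHECK_PART_* codes:
/// one keyed by disk index and one keyed by part index. Entries in
/// `parts_metadata` that are outdated or corrupt are reset to
/// `FileInfo::default()` so callers treat those disks as needing heal.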
async fn disks_with_all_parts(
online_disks: &[Option<DiskStore>],
parts_metadata: &mut [FileInfo],
errs: &[Option<DiskError>],
latest_meta: &FileInfo,
bucket: &str,
object: &str,
scan_mode: HealScanMode,
) -> disk::error::Result<(Vec<Option<DiskStore>>, HashMap<usize, Vec<usize>>, HashMap<usize, Vec<usize>>)> {
info!(
"disks_with_all_parts: starting with online_disks.len()={}, scan_mode={:?}",
online_disks.len(),
scan_mode
);
let mut available_disks = vec![None; online_disks.len()];
let mut data_errs_by_disk: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..online_disks.len() {
data_errs_by_disk.insert(i, vec![CHECK_PART_SUCCESS; latest_meta.parts.len()]);
}
let mut data_errs_by_part: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..latest_meta.parts.len() {
data_errs_by_part.insert(i, vec![CHECK_PART_SUCCESS; online_disks.len()]);
}
let mut inconsistent = 0;
parts_metadata.iter().enumerate().for_each(|(index, meta)| {
if meta.is_valid()
&& !meta.deleted
&& (meta.erasure.distribution.len() != online_disks.len()
|| (!meta.erasure.distribution.is_empty() && meta.erasure.distribution[index] != meta.erasure.index))
{
warn!("file info inconsistent, meta: {:?}", meta);
inconsistent += 1;
}
});
let erasure_distribution_reliable = inconsistent <= parts_metadata.len() / 2;
let mut meta_errs: Vec<Option<DiskError>> = vec![None; errs.len()];
for (index, disk) in online_disks.iter().enumerate() {
let disk = if let Some(disk) = disk {
disk
} else {
meta_errs[index] = Some(DiskError::DiskNotFound);
continue;
};
if let Some(err) = &errs[index] {
meta_errs[index] = Some(err.clone());
continue;
}
if !disk.is_online().await {
meta_errs[index] = Some(DiskError::DiskNotFound);
continue;
}
let meta = &parts_metadata[index];
if meta.mod_time != latest_meta.mod_time || meta.data_dir != latest_meta.data_dir {
warn!("mod_time is not Eq, file corrupt, index: {index}");
meta_errs[index] = Some(DiskError::FileCorrupt);
parts_metadata[index] = FileInfo::default();
continue;
}
if erasure_distribution_reliable {
if !meta.is_valid() {
warn!("file info is not valid, file corrupt, index: {index}");
parts_metadata[index] = FileInfo::default();
meta_errs[index] = Some(DiskError::FileCorrupt);
continue;
}
if !meta.deleted && meta.erasure.distribution.len() != online_disks.len() {
warn!("file info distribution len not Eq online_disks len, file corrupt, index: {index}");
parts_metadata[index] = FileInfo::default();
meta_errs[index] = Some(DiskError::FileCorrupt);
continue;
}
}
}
// info!("meta_errs: {:?}, errs: {:?}", meta_errs, errs);
meta_errs.iter().enumerate().for_each(|(index, err)| {
if err.is_some() {
let part_err = conv_part_err_to_int(err);
for p in 0..latest_meta.parts.len() {
data_errs_by_part.entry(p).or_insert(vec![0; meta_errs.len()])[index] = part_err;
}
}
});
// info!("data_errs_by_part: {:?}, data_errs_by_disk: {:?}", data_errs_by_part, data_errs_by_disk);
for (index, disk) in online_disks.iter().enumerate() {
if meta_errs[index].is_some() {
continue;
}
let disk = if let Some(disk) = disk {
disk
} else {
meta_errs[index] = Some(DiskError::DiskNotFound);
continue;
};
let meta = &mut parts_metadata[index];
if meta.deleted || meta.is_remote() {
continue;
}
// Always check data, if we got it.
if (meta.data.is_some() || meta.size == 0) && !meta.parts.is_empty() {
if let Some(data) = &meta.data {
let checksum_info = meta.erasure.get_checksum_info(meta.parts[0].number);
let data_len = data.len();
let verify_err = bitrot_verify(
Box::new(Cursor::new(data.clone())),
data_len,
meta.erasure.shard_file_size(meta.size) as usize,
checksum_info.algorithm,
checksum_info.hash,
meta.erasure.shard_size(),
)
.await
.err();
if let Some(vec) = data_errs_by_part.get_mut(&0) {
if index < vec.len() {
vec[index] = conv_part_err_to_int(&verify_err.map(|e| e.into()));
info!("bitrot check result: {}", vec[index]);
}
}
}
continue;
}
let mut verify_resp = CheckPartsResp::default();
let mut verify_err = None;
meta.data_dir = latest_meta.data_dir;
if scan_mode == HealScanMode::Deep {
// disk has a valid xl.meta but may not have all the
// parts. This is considered an outdated disk, since
// it needs healing too.
match disk.verify_file(bucket, object, meta).await {
Ok(v) => {
verify_resp = v;
}
Err(err) => {
verify_err = Some(err);
}
}
} else {
match disk.check_parts(bucket, object, meta).await {
Ok(v) => {
verify_resp = v;
}
Err(err) => {
verify_err = Some(err);
}
}
}
for p in 0..latest_meta.parts.len() {
if let Some(vec) = data_errs_by_part.get_mut(&p) {
if index < vec.len() {
if verify_err.is_some() {
info!("verify_err");
vec[index] = conv_part_err_to_int(&verify_err.clone());
} else {
info!("verify_resp, verify_resp.results {}", verify_resp.results[p]);
vec[index] = verify_resp.results[p];
}
}
}
}
}
// info!("data_errs_by_part: {:?}, data_errs_by_disk: {:?}", data_errs_by_part, data_errs_by_disk);
for (part, disks) in data_errs_by_part.iter() {
for (idx, disk) in disks.iter().enumerate() {
if let Some(vec) = data_errs_by_disk.get_mut(&idx) {
vec[*part] = *disk;
}
}
}
// info!("data_errs_by_part: {:?}, data_errs_by_disk: {:?}", data_errs_by_part, data_errs_by_disk);
for (i, disk) in online_disks.iter().enumerate() {
if meta_errs[i].is_none() && disk.is_some() && !has_part_err(&data_errs_by_disk[&i]) {
available_disks[i] = Some(disk.clone().unwrap());
} else {
parts_metadata[i] = FileInfo::default();
}
}
Ok((available_disks, data_errs_by_disk, data_errs_by_part))
}
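/// Decides whether the copy of an object on a single disk needs healing: a
/// missing, version-missing, or corrupt xl.meta, metadata that disagrees with
/// the latest quorum copy (`OutdatedXLMeta`), or any part reported missing or
/// corrupt (`PartMissingOrCorrupt`) all mark the disk for heal.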
pub fn should_heal_object_on_disk(
err: &Option<DiskError>,
parts_errs: &[usize],
meta: &FileInfo,
latest_meta: &FileInfo,
) -> (bool, Option<DiskError>) {
if let Some(err) = err {
if err == &DiskError::FileNotFound || err == &DiskError::FileVersionNotFound || err == &DiskError::FileCorrupt {
return (true, Some(err.clone()));
}
}
if latest_meta.volume != meta.volume
|| latest_meta.name != meta.name
|| latest_meta.version_id != meta.version_id
|| latest_meta.deleted != meta.deleted
{
info!("latest_meta not Eq meta, latest_meta: {:?}, meta: {:?}", latest_meta, meta);
return (true, Some(DiskError::OutdatedXLMeta));
}
if !meta.deleted && !meta.is_remote() {
let err_vec = [CHECK_PART_FILE_NOT_FOUND, CHECK_PART_FILE_CORRUPT];
for part_err in parts_errs.iter() {
if err_vec.contains(part_err) {
return (true, Some(DiskError::PartMissingOrCorrupt));
}
}
}
(false, err.clone())
}
async fn get_disks_info(disks: &[Option<DiskStore>], eps: &[Endpoint]) -> Vec<rustfs_madmin::Disk> {
let mut ret = Vec::new();
for (i, pool) in disks.iter().enumerate() {
if let Some(disk) = pool {
match disk.disk_info(&DiskInfoOptions::default()).await {
Ok(res) => ret.push(rustfs_madmin::Disk {
endpoint: eps[i].to_string(),
local: eps[i].is_local,
pool_index: eps[i].pool_idx,
set_index: eps[i].set_idx,
disk_index: eps[i].disk_idx,
state: "ok".to_owned(),
root_disk: res.root_disk,
drive_path: res.mount_path.clone(),
healing: res.healing,
scanning: res.scanning,
uuid: res.id.clone(),
major: res.major as u32,
minor: res.minor as u32,
model: None,
total_space: res.total,
used_space: res.used,
available_space: res.free,
utilization: {
if res.total > 0 {
res.used as f64 / res.total as f64 * 100_f64
} else {
0_f64
}
},
used_inodes: res.used_inodes,
free_inodes: res.free_inodes,
..Default::default()
}),
Err(err) => ret.push(rustfs_madmin::Disk {
state: err.to_string(),
endpoint: eps[i].to_string(),
local: eps[i].is_local,
pool_index: eps[i].pool_idx,
set_index: eps[i].set_idx,
disk_index: eps[i].disk_idx,
..Default::default()
}),
}
} else {
ret.push(rustfs_madmin::Disk {
endpoint: eps[i].to_string(),
local: eps[i].is_local,
pool_index: eps[i].pool_idx,
set_index: eps[i].set_idx,
disk_index: eps[i].disk_idx,
state: DiskError::DiskNotFound.to_string(),
..Default::default()
})
}
}
ret
}
async fn get_storage_info(disks: &[Option<DiskStore>], eps: &[Endpoint]) -> rustfs_madmin::StorageInfo {
let mut disks = get_disks_info(disks, eps).await;
disks.sort_by(|a, b| a.total_space.cmp(&b.total_space));
rustfs_madmin::StorageInfo {
disks,
backend: rustfs_madmin::BackendInfo {
backend_type: rustfs_madmin::BackendByte::Erasure,
..Default::default()
},
}
}
pub async fn stat_all_dirs(disks: &[Option<DiskStore>], bucket: &str, prefix: &str) -> Vec<Option<DiskError>> {
let mut errs = Vec::with_capacity(disks.len());
let mut futures = Vec::with_capacity(disks.len());
for disk in disks.iter().flatten() {
let disk = disk.clone();
let bucket = bucket.to_string();
let prefix = prefix.to_string();
futures.push(tokio::spawn(async move {
match disk.list_dir("", &bucket, &prefix, 1).await {
Ok(entries) => {
if !entries.is_empty() {
return Some(DiskError::VolumeNotEmpty);
}
None
}
Err(err) => Some(err),
}
}));
}
let results = join_all(futures).await;
for err in results.into_iter().flatten() {
errs.push(err);
}
errs
}
const GLOBAL_MIN_PART_SIZE: ByteSize = ByteSize::mib(5);
fn is_min_allowed_part_size(size: i64) -> bool {
size >= GLOBAL_MIN_PART_SIZE.as_u64() as i64
}
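/// Computes the S3-style multipart ETag: the MD5 of the concatenated part
/// ETags (hex-decoded when possible), suffixed with `-<part count>`.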
fn get_complete_multipart_md5(parts: &[CompletePart]) -> String {
let mut buf = Vec::new();
for part in parts.iter() {
if let Some(etag) = &part.etag {
if let Ok(etag_bytes) = hex_simd::decode_to_vec(etag.as_bytes()) {
buf.extend(etag_bytes);
} else {
buf.extend(etag.bytes());
}
}
}
let mut hasher = Md5::new();
hasher.update(&buf);
format!("{:x}-{}", hasher.finalize(), parts.len())
}
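/// Strips surrounding double quotes from an ETag so quoted and unquoted forms
/// compare equal.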
pub fn canonicalize_etag(etag: &str) -> String {
let re = Regex::new("\"*?([^\"]*?)\"*?$").unwrap();
re.replace_all(etag, "$1").to_string()
}
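/// Returns true when `etag` satisfies `condition`; a condition of `*` (after
/// trimming whitespace) matches any ETag.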
pub fn e_tag_matches(etag: &str, condition: &str) -> bool {
if condition.trim() == "*" {
return true;
}
canonicalize_etag(etag) == canonicalize_etag(condition)
}
pub fn should_prevent_write(oi: &ObjectInfo, if_none_match: Option<String>, if_match: Option<String>) -> bool {
match &oi.etag {
Some(etag) => {
if let Some(if_none_match) = if_none_match {
if e_tag_matches(etag, &if_none_match) {
return true;
}
}
if let Some(if_match) = if_match {
if !e_tag_matches(etag, &if_match) {
return true;
}
}
false
}
// If we can't obtain the etag of the object, prevent the write only when we have at least one condition
None => if_none_match.is_some() || if_match.is_some(),
}
}
/// Validates if the given storage class is supported
pub fn is_valid_storage_class(storage_class: &str) -> bool {
matches!(
storage_class,
storageclass::STANDARD
| storageclass::RRS
| storageclass::DEEP_ARCHIVE
| storageclass::EXPRESS_ONEZONE
| storageclass::GLACIER
| storageclass::GLACIER_IR
| storageclass::INTELLIGENT_TIERING
| storageclass::ONEZONE_IA
| storageclass::OUTPOSTS
| storageclass::SNOW
| storageclass::STANDARD_IA
)
}
/// Returns true if the storage class is a cold storage tier that requires special handling
pub fn is_cold_storage_class(storage_class: &str) -> bool {
matches!(
storage_class,
storageclass::DEEP_ARCHIVE | storageclass::GLACIER | storageclass::GLACIER_IR
)
}
/// Returns true if the storage class is an infrequent access tier
pub fn is_infrequent_access_class(storage_class: &str) -> bool {
matches!(
storage_class,
storageclass::ONEZONE_IA | storageclass::STANDARD_IA | storageclass::INTELLIGENT_TIERING
)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::disk::CHECK_PART_UNKNOWN;
use crate::disk::CHECK_PART_VOLUME_NOT_FOUND;
use crate::disk::error::DiskError;
use crate::store_api::{CompletePart, ObjectInfo};
use rustfs_filemeta::ErasureInfo;
use std::collections::HashMap;
use time::OffsetDateTime;
#[test]
fn test_check_part_constants() {
// Test that all CHECK_PART constants have expected values
assert_eq!(CHECK_PART_UNKNOWN, 0);
assert_eq!(CHECK_PART_SUCCESS, 1);
assert_eq!(CHECK_PART_FILE_NOT_FOUND, 4); // the actual value is 4, not 2
assert_eq!(CHECK_PART_VOLUME_NOT_FOUND, 3);
assert_eq!(CHECK_PART_FILE_CORRUPT, 5);
}
#[test]
fn test_is_min_allowed_part_size() {
// Test minimum part size validation
assert!(!is_min_allowed_part_size(0));
assert!(!is_min_allowed_part_size(1024)); // 1KB - too small
assert!(!is_min_allowed_part_size(1024 * 1024)); // 1MB - too small
assert!(is_min_allowed_part_size(5 * 1024 * 1024)); // 5MB - minimum allowed
assert!(is_min_allowed_part_size(10 * 1024 * 1024)); // 10MB - allowed
assert!(is_min_allowed_part_size(100 * 1024 * 1024)); // 100MB - allowed
}
#[test]
fn test_get_complete_multipart_md5() {
// Test MD5 calculation for multipart upload
let parts = vec![
CompletePart {
part_num: 1,
etag: Some("d41d8cd98f00b204e9800998ecf8427e".to_string()),
},
CompletePart {
part_num: 2,
etag: Some("098f6bcd4621d373cade4e832627b4f6".to_string()),
},
];
let md5 = get_complete_multipart_md5(&parts);
assert!(md5.ends_with("-2")); // Should end with part count
assert!(md5.len() > 10); // Should have reasonable length
// Test with empty parts
let empty_parts = vec![];
let empty_result = get_complete_multipart_md5(&empty_parts);
assert!(empty_result.ends_with("-0"));
// Test with single part
let single_part = vec![CompletePart {
part_num: 1,
etag: Some("d41d8cd98f00b204e9800998ecf8427e".to_string()),
}];
let single_result = get_complete_multipart_md5(&single_part);
assert!(single_result.ends_with("-1"));
}
#[test]
fn test_get_upload_id_dir() {
// Test upload ID directory path generation
let dir = SetDisks::get_upload_id_dir("bucket", "object", "upload-id");
// The function returns SHA256 hash of bucket/object + upload_id processing
assert!(dir.len() > 64); // Should be longer than just SHA256 hash
assert!(dir.contains("/")); // Should contain path separator
// Test with base64 encoded upload ID
let result2 = SetDisks::get_upload_id_dir("bucket", "object", "dXBsb2FkLWlk"); // base64 for "upload-id"
assert!(!result2.is_empty());
assert!(result2.len() > 10);
}
#[test]
fn test_get_multipart_sha_dir() {
// Test multipart SHA directory path generation
let dir = SetDisks::get_multipart_sha_dir("bucket", "object");
// The function returns SHA256 hash of "bucket/object"
assert_eq!(dir.len(), 64); // SHA256 hash length
assert!(!dir.contains("bucket")); // Should be hash, not original text
assert!(!dir.contains("object")); // Should be hash, not original text
// Test with empty strings
let result2 = SetDisks::get_multipart_sha_dir("", "");
assert!(!result2.is_empty());
assert_eq!(result2.len(), 64); // SHA256 hex string length
// Test that different inputs produce different hashes
let result3 = SetDisks::get_multipart_sha_dir("bucket1", "object1");
let result4 = SetDisks::get_multipart_sha_dir("bucket2", "object2");
assert_ne!(result3, result4);
}
#[test]
fn test_common_parity() {
// Test common parity calculation
// For parities [2, 2, 2, 3] with n=4, default_parity_count=1:
// - parity=2: read_quorum = 4-2 = 2, occ=3 >= 2, so valid
// - parity=3: read_quorum = 4-3 = 1, occ=1 >= 1, so valid
// - max_occ=3 for parity=2, so returns 2
let parities = vec![2, 2, 2, 3];
assert_eq!(SetDisks::common_parity(&parities, 1), 2);
// For parities [1, 2, 3] with n=3, default_parity_count=2:
// - parity=1: read_quorum = 3-1 = 2, occ=1 < 2, so invalid
// - parity=2: read_quorum = 3-2 = 1, occ=1 >= 1, so valid
// - parity=3: read_quorum = 3-3 = 0, occ=1 >= 0, so valid
// - max_occ=1, both parity=2 and parity=3 have same occurrence
// - HashMap iteration order is not guaranteed, so result could be either 2 or 3
let parities = vec![1, 2, 3];
let result = SetDisks::common_parity(&parities, 2);
assert!(result == 2 || result == 3); // Either 2 or 3 is valid
let empty_parities = vec![];
assert_eq!(SetDisks::common_parity(&empty_parities, 3), -1); // Empty returns -1
let invalid_parities = vec![-1, -1, -1];
assert_eq!(SetDisks::common_parity(&invalid_parities, 2), -1); // all invalid
let single_parity = vec![4];
assert_eq!(SetDisks::common_parity(&single_parity, 1), 4);
// Test with -1 values (ignored)
let parities_with_invalid = vec![-1, 2, 2, -1];
assert_eq!(SetDisks::common_parity(&parities_with_invalid, 1), 2);
}
#[test]
fn test_common_time() {
// Test common time calculation
let now = OffsetDateTime::now_utc();
let later = now + Duration::from_secs(60);
let times = vec![Some(now), Some(now), Some(later)];
assert_eq!(SetDisks::common_time(&times, 2), Some(now));
let times2 = vec![Some(now), Some(later), Some(later)];
assert_eq!(SetDisks::common_time(&times2, 2), Some(later));
let times_with_none = vec![Some(now), None, Some(now)];
assert_eq!(SetDisks::common_time(&times_with_none, 2), Some(now));
let times = vec![None, None, None];
assert_eq!(SetDisks::common_time(&times, 2), None);
let empty_times = vec![];
assert_eq!(SetDisks::common_time(&empty_times, 1), None);
}
#[test]
fn test_common_time_and_occurrence() {
// Test common time with occurrence count
let now = OffsetDateTime::now_utc();
let times = vec![Some(now), Some(now), None];
let (time, count) = SetDisks::common_time_and_occurrence(&times);
assert_eq!(time, Some(now));
assert_eq!(count, 2);
let times = vec![None, None, None];
let (time, count) = SetDisks::common_time_and_occurrence(&times);
assert_eq!(time, None);
assert_eq!(count, 0); // No valid times, so count is 0
}
#[test]
fn test_common_etag() {
// Test common etag calculation
let etags = vec![Some("etag1".to_string()), Some("etag1".to_string()), None];
assert_eq!(SetDisks::common_etag(&etags, 2), Some("etag1".to_string()));
let etags = vec![None, None, None];
assert_eq!(SetDisks::common_etag(&etags, 2), None);
}
#[test]
fn test_common_etags() {
// Test common etags with occurrence count
let etags = vec![Some("etag1".to_string()), Some("etag1".to_string()), None];
let (etag, count) = SetDisks::common_etags(&etags);
assert_eq!(etag, Some("etag1".to_string()));
assert_eq!(count, 2);
}
#[test]
fn test_list_object_modtimes() {
// Test extracting modification times from file info
let now = OffsetDateTime::now_utc();
let file_info = FileInfo {
mod_time: Some(now),
..Default::default()
};
let parts_metadata = vec![file_info];
let errs = vec![None];
let modtimes = SetDisks::list_object_modtimes(&parts_metadata, &errs);
assert_eq!(modtimes.len(), 1);
assert_eq!(modtimes[0], Some(now));
}
#[test]
fn test_list_object_etags() {
// Test extracting etags from file info metadata
let mut metadata = HashMap::new();
metadata.insert("etag".to_string(), "test-etag".to_string());
let file_info = FileInfo {
metadata,
..Default::default()
};
let parts_metadata = vec![file_info];
let errs = vec![None];
let etags = SetDisks::list_object_etags(&parts_metadata, &errs);
assert_eq!(etags.len(), 1);
assert_eq!(etags[0], Some("test-etag".to_string()));
}
#[test]
fn test_list_object_parities() {
// Test extracting parity counts from file info
let file_info1 = FileInfo {
erasure: ErasureInfo {
data_blocks: 4,
parity_blocks: 2,
index: 1, // Must be > 0 for is_valid() to return true
distribution: vec![1, 2, 3, 4, 5, 6], // Must match data_blocks + parity_blocks
..Default::default()
},
size: 100, // Non-zero size
deleted: false,
..Default::default()
};
let file_info2 = FileInfo {
erasure: ErasureInfo {
data_blocks: 6,
parity_blocks: 3,
index: 1, // Must be > 0 for is_valid() to return true
distribution: vec![1, 2, 3, 4, 5, 6, 7, 8, 9], // Must match data_blocks + parity_blocks
..Default::default()
},
size: 200, // Non-zero size
deleted: false,
..Default::default()
};
let file_info3 = FileInfo {
erasure: ErasureInfo {
data_blocks: 2,
parity_blocks: 1,
index: 1, // Must be > 0 for is_valid() to return true
distribution: vec![1, 2, 3], // Must match data_blocks + parity_blocks
..Default::default()
},
size: 0, // Zero size - function returns half of total shards
deleted: false,
..Default::default()
};
let parts_metadata = vec![file_info1, file_info2, file_info3];
let errs = vec![None, None, None];
let parities = SetDisks::list_object_parities(&parts_metadata, &errs);
assert_eq!(parities.len(), 3);
assert_eq!(parities[0], 2); // parity_blocks from first file
assert_eq!(parities[1], 3); // parity_blocks from second file
assert_eq!(parities[2], 1); // half of total shards (3/2 = 1) for zero size file
}
#[test]
fn test_conv_part_err_to_int() {
// Test error conversion to integer codes
assert_eq!(conv_part_err_to_int(&None), CHECK_PART_SUCCESS);
let disk_err = DiskError::FileNotFound;
assert_eq!(conv_part_err_to_int(&Some(disk_err)), CHECK_PART_FILE_NOT_FOUND);
let other_err = DiskError::other("other error");
assert_eq!(conv_part_err_to_int(&Some(other_err)), CHECK_PART_UNKNOWN); // "other" errors map to UNKNOWN, not SUCCESS
}
#[test]
fn test_has_part_err() {
// Test checking for part errors
let no_errors = vec![CHECK_PART_SUCCESS, CHECK_PART_SUCCESS];
assert!(!has_part_err(&no_errors));
let with_errors = vec![CHECK_PART_SUCCESS, CHECK_PART_FILE_NOT_FOUND];
assert!(has_part_err(&with_errors));
let unknown_errors = vec![CHECK_PART_UNKNOWN, CHECK_PART_SUCCESS];
assert!(has_part_err(&unknown_errors));
}
#[test]
fn test_should_heal_object_on_disk() {
// Test healing decision logic
let meta = FileInfo::default();
let latest_meta = FileInfo::default();
// Test with file not found error
let err = Some(DiskError::FileNotFound);
let (should_heal, _) = should_heal_object_on_disk(&err, &[], &meta, &latest_meta);
assert!(should_heal);
// Test with no error and no part errors
let (should_heal, _) = should_heal_object_on_disk(&None, &[CHECK_PART_SUCCESS], &meta, &latest_meta);
assert!(!should_heal);
// Test with part corruption
let (should_heal, _) = should_heal_object_on_disk(&None, &[CHECK_PART_FILE_CORRUPT], &meta, &latest_meta);
assert!(should_heal);
}
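// A minimal additional sketch: metadata that disagrees with the latest quorum
// copy (here a differing version_id) takes the OutdatedXLMeta branch.
#[test]
fn test_should_heal_object_on_disk_outdated_meta() {
use uuid::Uuid;
let meta = FileInfo {
version_id: Some(Uuid::new_v4()),
..Default::default()
};
let latest_meta = FileInfo {
version_id: Some(Uuid::new_v4()),
..Default::default()
};
let (should_heal, err) = should_heal_object_on_disk(&None, &[], &meta, &latest_meta);
assert!(should_heal);
assert!(matches!(err, Some(DiskError::OutdatedXLMeta)));
}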
#[test]
fn test_dang_ling_meta_errs_count() {
// Test counting dangling metadata errors
let errs = vec![None, Some(DiskError::FileNotFound), None];
let (not_found_count, non_actionable_count) = dang_ling_meta_errs_count(&errs);
assert_eq!(not_found_count, 1); // One FileNotFound error
assert_eq!(non_actionable_count, 0); // No other errors
}
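// Companion case: FileVersionNotFound counts as "not found" just like
// FileNotFound, while any other error is tallied as non-actionable.
#[test]
fn test_dang_ling_meta_errs_count_mixed() {
let errs = vec![Some(DiskError::FileVersionNotFound), Some(DiskError::FileCorrupt), None];
let (not_found_count, non_actionable_count) = dang_ling_meta_errs_count(&errs);
assert_eq!(not_found_count, 1);
assert_eq!(non_actionable_count, 1);
}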
#[test]
fn test_dang_ling_part_errs_count() {
// Test counting dangling part errors
let results = vec![CHECK_PART_SUCCESS, CHECK_PART_FILE_NOT_FOUND, CHECK_PART_SUCCESS];
let (not_found_count, non_actionable_count) = dang_ling_part_errs_count(&results);
assert_eq!(not_found_count, 1); // One FILE_NOT_FOUND error
assert_eq!(non_actionable_count, 0); // No other errors
}
#[test]
fn test_is_object_dir_dang_ling() {
// Test object directory dangling detection
let errs = vec![Some(DiskError::FileNotFound), Some(DiskError::FileNotFound), None];
assert!(is_object_dir_dang_ling(&errs));
let errs2 = vec![None, None, None];
assert!(!is_object_dir_dang_ling(&errs2));
let errs3 = vec![Some(DiskError::FileCorrupt), Some(DiskError::FileNotFound)];
assert!(!is_object_dir_dang_ling(&errs3)); // Mixed errors, not all not found
}
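// Sketch of the dangling-object path (not just directories): with no valid
// xl.meta anywhere and every part missing on more than half of the disks,
// is_object_dang_ling reports the object as dangling by returning Ok.
#[test]
fn test_is_object_dang_ling_all_parts_missing() {
let meta_arr = vec![FileInfo::default(); 4];
let errs = vec![Some(DiskError::FileNotFound); 4];
let mut data_errs_by_part: HashMap<usize, Vec<usize>> = HashMap::new();
data_errs_by_part.insert(0, vec![CHECK_PART_FILE_NOT_FOUND; 4]);
assert!(is_object_dang_ling(&meta_arr, &errs, &data_errs_by_part).is_ok());
}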
#[test]
fn test_join_errs() {
// Test joining error messages
let errs = vec![None, Some(DiskError::other("error1")), Some(DiskError::other("error2"))];
let joined = join_errs(&errs);
assert!(joined.contains("<nil>"));
assert!(joined.contains("io error")); // DiskError::other 显示为 "io error"
// Test with different error types
let errs2 = vec![None, Some(DiskError::FileNotFound), Some(DiskError::FileCorrupt)];
let joined2 = join_errs(&errs2);
assert!(joined2.contains("<nil>"));
assert!(joined2.contains("file not found"));
assert!(joined2.contains("file is corrupted"));
}
#[test]
fn test_reduce_common_data_dir() {
// Test reducing common data directory
use uuid::Uuid;
let uuid1 = Uuid::new_v4();
let uuid2 = Uuid::new_v4();
let data_dirs = vec![Some(uuid1), Some(uuid1), Some(uuid2)];
let result = SetDisks::reduce_common_data_dir(&data_dirs, 2);
assert_eq!(result, Some(uuid1)); // uuid1 appears twice, meets quorum
let data_dirs = vec![Some(uuid1), Some(uuid2), None];
let result = SetDisks::reduce_common_data_dir(&data_dirs, 2);
assert_eq!(result, None); // No UUID meets quorum of 2
}
#[test]
fn test_shuffle_parts_metadata() {
// Test metadata shuffling
let metadata = vec![
FileInfo {
name: "file1".to_string(),
..Default::default()
},
FileInfo {
name: "file2".to_string(),
..Default::default()
},
FileInfo {
name: "file3".to_string(),
..Default::default()
},
];
// Distribution uses 1-based indexing
let distribution = vec![3, 1, 2]; // 1-based shuffle order
let result = SetDisks::shuffle_parts_metadata(&metadata, &distribution);
assert_eq!(result.len(), 3);
assert_eq!(result[0].name, "file2"); // distribution[1] = 1, so metadata[1] goes to index 0
assert_eq!(result[1].name, "file3"); // distribution[2] = 2, so metadata[2] goes to index 1
assert_eq!(result[2].name, "file1"); // distribution[0] = 3, so metadata[0] goes to index 2
// Test with empty distribution
let empty_distribution = vec![];
let result2 = SetDisks::shuffle_parts_metadata(&metadata, &empty_distribution);
assert_eq!(result2.len(), 3);
assert_eq!(result2[0].name, "file1"); // Should return original order
}
#[test]
fn test_shuffle_disks() {
// Test disk shuffling
let disks = vec![None, None, None]; // Mock disks
let distribution = vec![3, 1, 2]; // 1-based indexing
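// shuffle_disks presumably applies the same 1-based mapping as
// shuffle_parts_metadata: result[distribution[i] - 1] = disks[i]; with all
// slots None, only the length and contents can be checked here.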
let result = SetDisks::shuffle_disks(&disks, &distribution);
assert_eq!(result.len(), 3);
// All disks are None, so result should be all None
assert!(result.iter().all(|d| d.is_none()));
// Test with empty distribution
let empty_distribution = vec![];
let result2 = SetDisks::shuffle_disks(&disks, &empty_distribution);
assert_eq!(result2.len(), 3);
assert!(result2.iter().all(|d| d.is_none()));
}
#[test]
fn test_etag_matches() {
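// e_tag_matches strips surrounding double quotes before comparing and treats
// "*" as a wildcard that matches any ETag.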
assert!(e_tag_matches("abc", "abc"));
assert!(e_tag_matches("\"abc\"", "abc"));
assert!(e_tag_matches("\"abc\"", "*"));
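// Hypothetical extra case, assuming the function is a quote-stripping
// equality check: a plain mismatch should not match.
assert!(!e_tag_matches("abc", "def"));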
}
#[test]
fn test_should_prevent_write() {
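// should_prevent_write enforces HTTP conditional-write semantics, as the
// cases below exercise: If-None-Match blocks the write when "*" is given or
// the supplied ETag matches the object's ETag, while If-Match blocks it
// unless the supplied ETag (or "*") matches.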
let oi = ObjectInfo {
etag: Some("abc".to_string()),
..Default::default()
};
let if_none_match = Some("abc".to_string());
let if_match = None;
assert!(should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = Some("*".to_string());
let if_match = None;
assert!(should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = None;
let if_match = Some("def".to_string());
assert!(should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = None;
let if_match = Some("*".to_string());
assert!(!should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = Some("def".to_string());
let if_match = None;
assert!(!should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = Some("def".to_string());
let if_match = Some("*".to_string());
assert!(!should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = Some("def".to_string());
let if_match = Some("\"abc\"".to_string());
assert!(!should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = Some("*".to_string());
let if_match = Some("\"abc\"".to_string());
assert!(should_prevent_write(&oi, if_none_match, if_match));
let oi = ObjectInfo {
etag: None,
..Default::default()
};
let if_none_match = Some("*".to_string());
let if_match = Some("\"abc\"".to_string());
assert!(should_prevent_write(&oi, if_none_match, if_match));
let if_none_match = None;
let if_match = None;
assert!(!should_prevent_write(&oi, if_none_match, if_match));
}
#[test]
fn test_is_valid_storage_class() {
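// Valid names are the canonical AWS S3 storage-class identifiers; matching
// is case-sensitive, so lowercase variants are rejected.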
// Test valid storage classes
assert!(is_valid_storage_class(storageclass::STANDARD));
assert!(is_valid_storage_class(storageclass::RRS));
assert!(is_valid_storage_class(storageclass::DEEP_ARCHIVE));
assert!(is_valid_storage_class(storageclass::EXPRESS_ONEZONE));
assert!(is_valid_storage_class(storageclass::GLACIER));
assert!(is_valid_storage_class(storageclass::GLACIER_IR));
assert!(is_valid_storage_class(storageclass::INTELLIGENT_TIERING));
assert!(is_valid_storage_class(storageclass::ONEZONE_IA));
assert!(is_valid_storage_class(storageclass::OUTPOSTS));
assert!(is_valid_storage_class(storageclass::SNOW));
assert!(is_valid_storage_class(storageclass::STANDARD_IA));
// Test invalid storage classes
assert!(!is_valid_storage_class("INVALID"));
assert!(!is_valid_storage_class(""));
assert!(!is_valid_storage_class("standard")); // lowercase
}
#[test]
fn test_is_cold_storage_class() {
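// The cold set corresponds to the S3 Glacier-family archival classes.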
// Test cold storage classes
assert!(is_cold_storage_class(storageclass::DEEP_ARCHIVE));
assert!(is_cold_storage_class(storageclass::GLACIER));
assert!(is_cold_storage_class(storageclass::GLACIER_IR));
// Test non-cold storage classes
assert!(!is_cold_storage_class(storageclass::STANDARD));
assert!(!is_cold_storage_class(storageclass::RRS));
assert!(!is_cold_storage_class(storageclass::STANDARD_IA));
assert!(!is_cold_storage_class(storageclass::EXPRESS_ONEZONE));
}
#[test]
fn test_is_infrequent_access_class() {
// Test infrequent access classes
assert!(is_infrequent_access_class(storageclass::ONEZONE_IA));
assert!(is_infrequent_access_class(storageclass::STANDARD_IA));
assert!(is_infrequent_access_class(storageclass::INTELLIGENT_TIERING));
// Test frequent access classes
assert!(!is_infrequent_access_class(storageclass::STANDARD));
assert!(!is_infrequent_access_class(storageclass::RRS));
assert!(!is_infrequent_access_class(storageclass::DEEP_ARCHIVE));
assert!(!is_infrequent_access_class(storageclass::EXPRESS_ONEZONE));
}
}