From 639bf0c2339b30e2de8f97cc15e00d72895be2cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=89=E6=AD=A3=E8=B6=85?= Date: Sun, 12 Oct 2025 23:47:51 +0800 Subject: [PATCH] Revert "feat(append): implement object append operations with state tracking (#599)" (#646) This reverts commit 4f73760a4540079385806a012d23e3428916c3a4. --- Cargo.lock | 2 - crates/e2e_test/Cargo.toml | 1 - .../src/kms/multipart_encryption_test.rs | 219 ++- crates/e2e_test/src/reliant/append.rs | 1177 -------------- crates/e2e_test/src/reliant/mod.rs | 1 - crates/ecstore/src/erasure_coding/decode.rs | 15 +- crates/ecstore/src/lib.rs | 1 - crates/ecstore/src/object_append.rs | 725 --------- crates/ecstore/src/set_disk.rs | 1361 ++--------------- crates/ecstore/src/sets.rs | 8 - crates/ecstore/src/store.rs | 11 - crates/ecstore/src/store_api.rs | 18 +- crates/filemeta/Cargo.toml | 1 - crates/filemeta/src/append.rs | 541 ------- crates/filemeta/src/fileinfo.rs | 90 -- crates/filemeta/src/lib.rs | 2 - crates/utils/src/dns_resolver.rs | 1 - docs/append_write/README.md | 147 -- rustfs/src/storage/ecfs.rs | 82 - 19 files changed, 240 insertions(+), 4163 deletions(-) delete mode 100644 crates/e2e_test/src/reliant/append.rs delete mode 100644 crates/ecstore/src/object_append.rs delete mode 100644 crates/filemeta/src/append.rs delete mode 100644 docs/append_write/README.md diff --git a/Cargo.lock b/Cargo.lock index 603acb41..fed1eb49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2923,7 +2923,6 @@ dependencies = [ "chrono", "flatbuffers", "futures", - "http 1.3.1", "md5", "rand 0.9.2", "reqwest", @@ -6645,7 +6644,6 @@ dependencies = [ "rustfs-utils", "s3s", "serde", - "serde_json", "thiserror 2.0.17", "time", "tokio", diff --git a/crates/e2e_test/Cargo.toml b/crates/e2e_test/Cargo.toml index b29496da..9f5f9538 100644 --- a/crates/e2e_test/Cargo.toml +++ b/crates/e2e_test/Cargo.toml @@ -49,5 +49,4 @@ uuid = { workspace = true } base64 = { workspace = true } rand = { workspace = true } chrono = { workspace = true } -http.workspace = true md5 = { workspace = true } diff --git a/crates/e2e_test/src/kms/multipart_encryption_test.rs b/crates/e2e_test/src/kms/multipart_encryption_test.rs index 40454801..ee845fe9 100644 --- a/crates/e2e_test/src/kms/multipart_encryption_test.rs +++ b/crates/e2e_test/src/kms/multipart_encryption_test.rs @@ -13,16 +13,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +//! 分片上传加密功能的分步测试用例 +//! +//! 这个测试套件将验证分片上传加密功能的每一个步骤: +//! 1. 测试基础的单分片加密(验证加密基础逻辑) +//! 2. 测试多分片上传(验证分片拼接逻辑) +//! 3. 测试加密元数据的保存和读取 +//! 4. 测试完整的分片上传加密流程 + use super::common::LocalKMSTestEnvironment; use crate::common::{TEST_BUCKET, init_logging}; use serial_test::serial; use tracing::{debug, info}; +/// 步骤1:测试基础单文件加密功能(确保SSE-S3在非分片场景下正常工作) #[tokio::test] #[serial] async fn test_step1_basic_single_file_encryption() -> Result<(), Box> { init_logging(); - info!("🧪 step1: test basic single file encryption"); + info!("🧪 步骤1:测试基础单文件加密功能"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -31,11 +40,11 @@ async fn test_step1_basic_single_file_encryption() -> Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 step2: test basic multipart upload without encryption"); + info!("🧪 步骤2:测试不加密的分片上传"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -93,16 +102,12 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo let total_parts = 2; let total_size = part_size * total_parts; - // generate test data (with clear pattern for easy verification) + // 生成测试数据(有明显的模式便于验证) let test_data: Vec = (0..total_size).map(|i| (i % 256) as u8).collect(); - info!( - "🚀 step2: start multipart upload (no encryption) with {} parts, each {}MB", - total_parts, - part_size / (1024 * 1024) - ); + info!("🚀 开始分片上传(无加密):{} parts,每个 {}MB", total_parts, part_size / (1024 * 1024)); - // step1: create multipart upload + // 步骤1:创建分片上传 let create_multipart_output = s3_client .create_multipart_upload() .bucket(TEST_BUCKET) @@ -111,16 +116,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .await?; let upload_id = create_multipart_output.upload_id().unwrap(); - info!("📋 step2: create multipart upload, ID: {}", upload_id); + info!("📋 创建分片上传,ID: {}", upload_id); - // step2: upload each part + // 步骤2:上传各个分片 let mut completed_parts = Vec::new(); for part_number in 1..=total_parts { let start = (part_number - 1) * part_size; let end = std::cmp::min(start + part_size, total_size); let part_data = &test_data[start..end]; - info!("📤 step2: upload part {} ({} bytes)", part_number, part_data.len()); + info!("📤 上传分片 {} ({} bytes)", part_number, part_data.len()); let upload_part_output = s3_client .upload_part() @@ -140,15 +145,15 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .build(), ); - debug!("step2: part {} uploaded, ETag: {}", part_number, etag); + debug!("分片 {} 上传完成,ETag: {}", part_number, etag); } - // step3: complete multipart upload + // 步骤3:完成分片上传 let completed_multipart_upload = aws_sdk_s3::types::CompletedMultipartUpload::builder() .set_parts(Some(completed_parts)) .build(); - info!("🔗 step2: complete multipart upload"); + info!("🔗 完成分片上传"); let complete_output = s3_client .complete_multipart_upload() .bucket(TEST_BUCKET) @@ -158,16 +163,10 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .send() .await?; - debug!("step2: multipart upload completed, ETag: {:?}", complete_output.e_tag()); + debug!("完成分片上传,ETag: {:?}", complete_output.e_tag()); - // step4: verify multipart upload completed successfully - assert_eq!( - complete_output.e_tag().unwrap().to_string(), - format!("\"{}-{}-{}\"", object_key, upload_id, total_parts) - ); - - // verify data integrity - info!("📥 step2: download file and verify data integrity"); + // 步骤4:下载并验证 + info!("📥 下载文件并验证数据完整性"); let get_response = s3_client.get_object().bucket(TEST_BUCKET).key(object_key).send().await?; let downloaded_data = get_response.body.collect().await?.into_bytes(); @@ -175,16 +174,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo assert_eq!(&downloaded_data[..], &test_data[..]); kms_env.base_env.delete_test_bucket(TEST_BUCKET).await?; - info!("✅ step2: basic multipart upload without encryption works as expected"); + info!("✅ 步骤2通过:不加密的分片上传功能正常"); Ok(()) } -/// test multipart upload with SSE-S3 encryption +/// 步骤3:测试分片上传 + SSE-S3加密(重点测试) #[tokio::test] #[serial] async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box> { init_logging(); - info!("🧪 step3: test multipart upload with SSE-S3 encryption"); + info!("🧪 步骤3:测试分片上传 + SSE-S3加密"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -198,16 +197,16 @@ async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box = (0..total_size).map(|i| ((i / 1000) % 256) as u8).collect(); info!( - "🔐 step3: start multipart upload with SSE-S3 encryption: {} parts, each {}MB", + "🔐 开始分片上传(SSE-S3加密):{} parts,每个 {}MB", total_parts, part_size / (1024 * 1024) ); - // step1: create multipart upload and enable SSE-S3 + // 步骤1:创建分片上传并启用SSE-S3 let create_multipart_output = s3_client .create_multipart_upload() .bucket(TEST_BUCKET) @@ -217,24 +216,24 @@ async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 step4: test larger multipart upload with encryption (streaming encryption)"); + info!("🧪 步骤4:测试大文件分片上传加密"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -326,13 +322,13 @@ async fn test_step4_large_multipart_upload_with_encryption() -> Result<(), Box = (0..total_size) .map(|i| { let part_num = i / part_size; @@ -341,9 +337,9 @@ async fn test_step4_large_multipart_upload_with_encryption() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 step5: test all encryption types multipart upload"); + info!("🧪 步骤5:测试所有加密类型的分片上传"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -456,8 +446,8 @@ async fn test_step5_all_encryption_types_multipart() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { - // step5: generate test data + // 生成测试数据 let test_data: Vec = (0..total_size).map(|i| ((i * 7) % 256) as u8).collect(); - // step5: prepare SSE-C key and MD5 (if needed) + // 准备SSE-C所需的密钥(如果需要) let (sse_c_key, sse_c_md5) = if matches!(encryption_type, EncryptionType::SSEC) { let key = "01234567890123456789012345678901"; let key_b64 = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, key); @@ -516,10 +506,9 @@ async fn test_multipart_encryption_type( (None, None) }; - // step5: create multipart upload - info!("🔗 step5: create multipart upload with encryption {:?}", encryption_type); + info!("📋 创建分片上传 - {:?}", encryption_type); - // step5: create multipart upload request + // 创建分片上传 let mut create_request = s3_client.create_multipart_upload().bucket(bucket).key(object_key); create_request = match encryption_type { @@ -533,6 +522,7 @@ async fn test_multipart_encryption_type( let create_multipart_output = create_request.send().await?; let upload_id = create_multipart_output.upload_id().unwrap(); + // 上传分片 let mut completed_parts = Vec::new(); for part_number in 1..=total_parts { let start = (part_number - 1) * part_size; @@ -547,7 +537,7 @@ async fn test_multipart_encryption_type( .part_number(part_number as i32) .body(aws_sdk_s3::primitives::ByteStream::from(part_data.to_vec())); - // step5: include SSE-C key and MD5 in each UploadPart request (if needed) + // SSE-C需要在每个UploadPart请求中包含密钥 if matches!(encryption_type, EncryptionType::SSEC) { upload_request = upload_request .sse_customer_algorithm("AES256") @@ -564,11 +554,10 @@ async fn test_multipart_encryption_type( .build(), ); - // step5: complete multipart upload request - debug!("🔗 step5: complete multipart upload part {} with etag {}", part_number, etag); + debug!("{:?} 分片 {} 上传完成", encryption_type, part_number); } - // step5: complete multipart upload + // 完成分片上传 let completed_multipart_upload = aws_sdk_s3::types::CompletedMultipartUpload::builder() .set_parts(Some(completed_parts)) .build(); @@ -582,12 +571,10 @@ async fn test_multipart_encryption_type( .send() .await?; - // step5: download and verify multipart upload - info!("🔗 step5: download and verify multipart upload with encryption {:?}", encryption_type); - + // 下载并验证 let mut get_request = s3_client.get_object().bucket(bucket).key(object_key); - // step5: include SSE-C key and MD5 in each GET request (if needed) + // SSE-C需要在GET请求中包含密钥 if matches!(encryption_type, EncryptionType::SSEC) { get_request = get_request .sse_customer_algorithm("AES256") @@ -597,7 +584,7 @@ async fn test_multipart_encryption_type( let get_response = get_request.send().await?; - // step5: verify encryption headers + // 验证加密头 match encryption_type { EncryptionType::SSEKMS => { assert_eq!( @@ -610,15 +597,11 @@ async fn test_multipart_encryption_type( } } - // step5: verify data integrity + // 验证数据完整性 let downloaded_data = get_response.body.collect().await?.into_bytes(); assert_eq!(downloaded_data.len(), total_size); assert_eq!(&downloaded_data[..], &test_data[..]); - // step5: verify data integrity - info!( - "✅ step5: verify data integrity for multipart upload with encryption {:?}", - encryption_type - ); + info!("✅ {:?} 分片上传测试通过", encryption_type); Ok(()) } diff --git a/crates/e2e_test/src/reliant/append.rs b/crates/e2e_test/src/reliant/append.rs deleted file mode 100644 index 3541fac8..00000000 --- a/crates/e2e_test/src/reliant/append.rs +++ /dev/null @@ -1,1177 +0,0 @@ -#![cfg(test)] - -use crate::common::{RustFSTestEnvironment, init_logging}; -use aws_sdk_s3::Client; -use aws_sdk_s3::error::SdkError; -use aws_sdk_s3::operation::put_object::{PutObjectError, PutObjectOutput}; -use aws_sdk_s3::primitives::ByteStream; -use http::{ - HeaderValue, - header::{IF_MATCH, IF_NONE_MATCH}, -}; -use serial_test::serial; -use std::error::Error; -use std::time::Duration; -use tokio::time::sleep; -use uuid::Uuid; - -async fn append_object( - client: &Client, - bucket: &str, - key: &str, - position: i64, - payload: &[u8], -) -> Result> { - append_object_with_conditions(client, bucket, key, position, payload, None, None).await -} - -async fn append_object_with_if_match( - client: &Client, - bucket: &str, - key: &str, - position: i64, - payload: &[u8], - if_match: Option, -) -> Result> { - append_object_with_conditions(client, bucket, key, position, payload, if_match, None).await -} - -async fn append_object_with_if_none_match( - client: &Client, - bucket: &str, - key: &str, - position: i64, - payload: &[u8], - if_none_match: Option, -) -> Result> { - append_object_with_conditions(client, bucket, key, position, payload, None, if_none_match).await -} - -async fn append_object_with_conditions( - client: &Client, - bucket: &str, - key: &str, - position: i64, - payload: &[u8], - if_match: Option, - if_none_match: Option, -) -> Result> { - let if_match_header = if_match.clone(); - let if_none_match_header = if_none_match.clone(); - client - .put_object() - .bucket(bucket) - .key(key) - .body(ByteStream::from(payload.to_vec())) - .customize() - .mutate_request(move |req| { - req.headers_mut() - .insert("x-amz-object-append", HeaderValue::from_static("true")); - req.headers_mut().insert( - "x-amz-append-position", - HeaderValue::from_str(&position.to_string()).expect("invalid position header"), - ); - if let Some(tag) = if_match_header.as_deref() { - req.headers_mut() - .insert(IF_MATCH, HeaderValue::from_str(tag).expect("invalid if-match header")); - } - if let Some(tag) = if_none_match_header.as_deref() { - req.headers_mut() - .insert(IF_NONE_MATCH, HeaderValue::from_str(tag).expect("invalid if-none-match header")); - } - }) - .send() - .await -} - -async fn append_action( - client: &Client, - bucket: &str, - key: &str, - action: &str, - if_match: Option<&str>, -) -> Result> { - let action_value = HeaderValue::from_str(action).expect("invalid append action"); - let if_match_value = if_match.map(|v| HeaderValue::from_str(v).expect("invalid if-match")); - client - .put_object() - .bucket(bucket) - .key(key) - .body(ByteStream::from_static(b"")) - .customize() - .mutate_request(move |req| { - req.headers_mut().insert("x-amz-append-action", action_value.clone()); - if let Some(val) = if_match_value.as_ref() { - req.headers_mut().insert(IF_MATCH, val.clone()); - } - }) - .send() - .await -} - -fn md5_hex(data: &[u8]) -> String { - format!("{:x}", md5::compute(data)) -} - -fn multipart_etag(etags: &[&str]) -> String { - let mut buf = Vec::new(); - - for etag in etags { - let clean = etag.trim_matches('"'); - if clean.len() == 32 && clean.chars().all(|c| c.is_ascii_hexdigit()) { - let mut chunk = Vec::with_capacity(clean.len() / 2); - for i in (0..clean.len()).step_by(2) { - let byte = u8::from_str_radix(&clean[i..i + 2], 16).expect("invalid hex"); - chunk.push(byte); - } - buf.extend_from_slice(&chunk); - } else { - buf.extend_from_slice(clean.as_bytes()); - } - } - - let digest = md5::compute(buf); - format!("{:x}-{}", digest, etags.len()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_updates_content_and_etag() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-inline-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-success.txt"; - let initial = b"hello"; - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - - let initial_fetch = client.get_object().bucket(&bucket).key(key).send().await?; - let initial_body = initial_fetch.body.collect().await?.into_bytes(); - println!("initial body = {:?}", initial_body); - - let append_payload = b" world"; - append_object(&client, &bucket, key, initial.len() as i64, append_payload) - .await - .expect("append request should succeed"); - - let second_payload = b"!!!"; - append_object(&client, &bucket, key, (initial.len() + append_payload.len()) as i64, second_payload) - .await - .expect("second append request should succeed"); - - let expected: Vec = [initial.as_slice(), append_payload.as_slice(), second_payload.as_slice()].concat(); - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let etag = get_resp.e_tag().map(|v| v.to_string()); - let aggregated = get_resp.body.collect().await?; - let _body = aggregated.into_bytes(); - assert_eq!(_body.as_ref(), expected.as_slice()); - - if let Some(etag) = etag { - assert_eq!(etag.trim_matches('"'), md5_hex(&expected)); - } else { - panic!("Append GET response missing ETag"); - } - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_rejects_wrong_position() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-inline-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-mismatch.txt"; - let initial = b"abcdef"; - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - - let err = append_object(&client, &bucket, key, (initial.len() as i64) + 1, b"xyz") - .await - .expect_err("append with wrong position must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 400); - } - other => panic!("unexpected error variant: {other:?}"), - } - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_segmented_object_appends_new_part() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-segmented-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-large.bin"; - - let initial_size = 512 * 1024; - let initial: Vec = (0..initial_size).map(|i| (i % 251) as u8).collect(); - let put_resp = client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.clone())) - .send() - .await?; - let initial_etag = put_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("initial put etag"); - - let append_payload: Vec = (0..(128 * 1024)).map(|i| (i % 197) as u8).collect(); - let append_position = initial.len() as i64; - - let mut if_match = String::from("\""); - if_match.push_str(&initial_etag); - if_match.push('"'); - - let append_resp = append_object_with_if_match(&client, &bucket, key, append_position, &append_payload, Some(if_match)) - .await - .expect("append request must succeed"); - let append_etag = append_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("append response etag"); - - let second_segment: Vec = (0..(64 * 1024)).map(|i| (i % 173) as u8).collect(); - let expected_etag_first = multipart_etag(&[&initial_etag, &md5_hex(&append_payload)]); - assert_eq!(append_etag, expected_etag_first); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let _initial_body = aggregated.into_bytes(); - - let append_resp_second = append_object_with_if_match( - &client, - &bucket, - key, - append_position + append_payload.len() as i64, - &second_segment, - Some(format!("\"{append_etag}\"")), - ) - .await - .expect("second segmented append must succeed"); - - let expected_etag = multipart_etag(&[&initial_etag, &md5_hex(&append_payload), &md5_hex(&second_segment)]); - assert_eq!( - append_resp_second.e_tag().map(|v| v.trim_matches('"').to_string()), - Some(expected_etag.clone()) - ); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - - let mut expected = initial.clone(); - expected.extend_from_slice(&append_payload); - expected.extend_from_slice(&second_segment); - - assert_eq!(body.as_ref(), expected.as_slice()); - - let head = client.head_object().bucket(&bucket).key(key).send().await?; - assert_eq!(head.content_length(), Some(expected.len() as i64)); - assert_eq!(head.e_tag().map(|v| v.trim_matches('"').to_string()), Some(expected_etag.clone())); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_rejects_failed_precondition() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-precond-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-if-match.txt"; - let initial = b"hello"; - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - - let append_payload = b" world"; - let err = append_object_with_if_match( - &client, - &bucket, - key, - initial.len() as i64, - append_payload, - Some("\"deadbeef\"".to_string()), - ) - .await - .expect_err("append with wrong If-Match must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 412); - } - other => panic!("unexpected error variant: {other:?}"), - } - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_honors_if_match() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-inline-if-match-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-inline-if-match-success.txt"; - let initial = b"inline"; - let put_resp = client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - let initial_etag = put_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("initial etag"); - - let append_payload = b" payload"; - let resp = append_object_with_if_match( - &client, - &bucket, - key, - initial.len() as i64, - append_payload, - Some(format!("\"{initial_etag}\"")), - ) - .await - .expect("append with correct if-match should succeed"); - - let combined: Vec = [initial.as_slice(), append_payload.as_slice()].concat(); - assert_eq!(resp.e_tag().map(|v| v.trim_matches('"').to_string()), Some(md5_hex(&combined))); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), combined.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_rejects_if_none_match_star() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-if-none-match-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-if-none.txt"; - let initial = b"hello"; - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - - let append_payload = b" world"; - let err = - append_object_with_if_none_match(&client, &bucket, key, initial.len() as i64, append_payload, Some("*".to_string())) - .await - .expect_err("append with If-None-Match:* must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 412); - } - other => panic!("unexpected error variant: {other:?}"), - } - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_inline_object_allows_zero_length_payload() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-inline-empty-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-inline-empty.txt"; - let initial = b"foobar"; - let put_resp = client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - let initial_etag = put_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("initial etag"); - - let resp = append_object_with_if_match(&client, &bucket, key, initial.len() as i64, &[], Some(format!("\"{initial_etag}\""))) - .await - .expect("append with empty payload should succeed"); - - assert_eq!(resp.e_tag().map(|v| v.trim_matches('"').to_string()), Some(initial_etag.clone())); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn complete_append_consolidates_pending_segments() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-complete-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "complete-append.bin"; - let base_len = 256 * 1024; - let base: Vec = (0..base_len).map(|i| (i % 251) as u8).collect(); - let put_resp = client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(base.clone())) - .send() - .await?; - let base_etag = put_resp.e_tag().map(|v| v.trim_matches('"').to_string()).expect("base etag"); - - let seg_a: Vec = (0..(64 * 1024)).map(|i| (i % 199) as u8).collect(); - let seg_b: Vec = (0..(96 * 1024)).map(|i| (i % 173) as u8).collect(); - - let append_a = - append_object_with_if_match(&client, &bucket, key, base.len() as i64, &seg_a, Some(format!("\"{base_etag}\""))).await?; - let etag_after_a = append_a - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("etag after first append"); - - let append_b = append_object_with_if_match( - &client, - &bucket, - key, - (base.len() + seg_a.len()) as i64, - &seg_b, - Some(format!("\"{etag_after_a}\"")), - ) - .await?; - let etag_after_b = append_b - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("etag after second append"); - assert!(etag_after_b.contains('-')); - - let complete_resp = append_action(&client, &bucket, key, "complete", None).await?; - let complete_etag = complete_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("complete etag"); - - let mut expected = base.clone(); - expected.extend_from_slice(&seg_a); - expected.extend_from_slice(&seg_b); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.len(), expected.len()); - assert_eq!(final_body.as_ref(), expected.as_slice()); - assert_eq!(complete_etag, md5_hex(&expected)); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn abort_append_discards_pending_segments() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-abort-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "abort-append.bin"; - let base: Vec = (0..(512 * 1024)).map(|i| (i % 181) as u8).collect(); - let put_resp = client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(base.clone())) - .send() - .await?; - let base_etag = put_resp.e_tag().map(|v| v.trim_matches('"').to_string()).expect("base etag"); - - let seg_a: Vec = vec![0xAA; 64 * 1024]; - let seg_b: Vec = vec![0xBB; 96 * 1024]; - - let append_a = - append_object_with_if_match(&client, &bucket, key, base.len() as i64, &seg_a, Some(format!("\"{base_etag}\""))).await?; - let etag_after_a = append_a - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("etag after first append"); - - append_object_with_if_match( - &client, - &bucket, - key, - (base.len() + seg_a.len()) as i64, - &seg_b, - Some(format!("\"{etag_after_a}\"")), - ) - .await?; - - let abort_resp = append_action(&client, &bucket, key, "abort", None).await?; - let abort_etag = abort_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("abort etag"); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.len(), base.len()); - assert_eq!(final_body.as_ref(), base.as_slice()); - - let retry_segment = vec![0xCC; 32 * 1024]; - let retry_resp = append_object_with_if_match( - &client, - &bucket, - key, - base.len() as i64, - &retry_segment, - Some(format!("\"{abort_etag}\"")), - ) - .await?; - let retry_etag = retry_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("retry etag"); - - let mut expected_after_retry = base.clone(); - expected_after_retry.extend_from_slice(&retry_segment); - let head_after_retry = client.head_object().bucket(&bucket).key(key).send().await?; - assert_eq!(head_after_retry.content_length(), Some(expected_after_retry.len() as i64)); - assert_eq!( - head_after_retry.e_tag().map(|v| v.trim_matches('"').to_string()), - Some(retry_etag.clone()) - ); - - let get_after_retry = client.get_object().bucket(&bucket).key(key).send().await?; - let final_bytes = get_after_retry.body.collect().await?.into_bytes(); - assert_eq!(final_bytes.as_ref(), expected_after_retry.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_segments_concurrency_then_complete() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-complete-concurrency-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "concurrent-complete.bin"; - let base: Vec = (0..(196 * 1024)).map(|i| (i % 233) as u8).collect(); - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(base.clone())) - .send() - .await?; - - let seg_a: Vec = vec![0x11; 64 * 1024]; - let seg_b: Vec = vec![0x22; 48 * 1024]; - let seg_c: Vec = vec![0x33; 80 * 1024]; - - let position = base.len() as i64; - let client_a = client.clone(); - let client_b = client.clone(); - let bucket_a = bucket.clone(); - let bucket_b = bucket.clone(); - let key_string = key.to_string(); - let seg_a_clone = seg_a.clone(); - let seg_b_clone = seg_b.clone(); - - let (res_a, res_b) = tokio::join!( - async { append_object(&client_a, &bucket_a, &key_string, position, &seg_a_clone).await }, - async { append_object(&client_b, &bucket_b, &key_string, position, &seg_b_clone).await } - ); - - let (success_resp, failure_resp, winning_segment) = match (res_a, res_b) { - (Ok(resp), Err(err)) => (resp, Some(err), seg_a.clone()), - (Err(err), Ok(resp)) => (resp, Some(err), seg_b.clone()), - _ => panic!("expected exactly one append success"), - }; - - if let Some(SdkError::ServiceError(service_err)) = failure_resp { - assert_eq!(service_err.raw().status().as_u16(), 400); - } - - let winning_etag = success_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("winning append etag"); - - let mut expected = base.clone(); - expected.extend_from_slice(&winning_segment); - - append_object_with_if_match(&client, &bucket, key, expected.len() as i64, &seg_c, Some(format!("\"{winning_etag}\""))) - .await?; - - expected.extend_from_slice(&seg_c); - - let complete_resp = append_action(&client, &bucket, key, "complete", None).await?; - let final_etag = complete_resp - .e_tag() - .map(|v| v.trim_matches('"').to_string()) - .expect("final etag"); - - let head = client.head_object().bucket(&bucket).key(key).send().await?; - assert_eq!(head.content_length(), Some(expected.len() as i64)); - assert_eq!(head.e_tag().map(|v| v.trim_matches('"').to_string()), Some(final_etag.clone())); - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.as_ref(), expected.as_slice()); - assert_eq!(final_etag, md5_hex(&expected)); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_missing_object_returns_not_found() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-missing-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "missing-object.txt"; - let err = append_object(&client, &bucket, key, 0, b"data") - .await - .expect_err("append on missing object must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 404); - } - other => panic!("unexpected error variant: {other:?}"), - } - - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_segmented_object_rejects_wrong_position() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-seg-pos-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-seg-pos.bin"; - let initial: Vec = (0..(512 * 1024)).map(|i| (i % 211) as u8).collect(); - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.clone())) - .send() - .await?; - - let err = append_object(&client, &bucket, key, (initial.len() as i64) + 1, b"abc") - .await - .expect_err("append with wrong position on segmented object must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 400); - } - other => panic!("unexpected error variant: {other:?}"), - } - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_segmented_object_rejects_failed_precondition() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-seg-precond-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "append-seg-precond.bin"; - let initial: Vec = (0..(256 * 1024)).map(|i| (i % 199) as u8).collect(); - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.clone())) - .send() - .await?; - - let append_payload: Vec = (0..(64 * 1024)).map(|i| (i % 173) as u8).collect(); - let err = append_object_with_if_match( - &client, - &bucket, - key, - initial.len() as i64, - &append_payload, - Some("\"ffffffffffffffffffffffffffffffff\"".to_string()), - ) - .await - .expect_err("append with wrong etag on segmented object must fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 412); - } - other => panic!("unexpected error variant: {other:?}"), - } - - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let aggregated = get_resp.body.collect().await?; - let body = aggregated.into_bytes(); - assert_eq!(body.as_ref(), initial.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_large_file_multi_segments() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-large-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "large-append.bin"; - - // Create initial object with 1MB data - let chunk_size = 1024 * 1024; // 1MB - let initial_data: Vec = (0..chunk_size).map(|i| (i % 256) as u8).collect(); - - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial_data.clone())) - .send() - .await?; - - // Append multiple 1MB chunks to trigger segmented storage - let mut expected_data = initial_data.clone(); - for i in 1..=5 { - let append_chunk: Vec = (0..chunk_size).map(|j| ((j + i * 1000) % 256) as u8).collect(); - - append_object(&client, &bucket, key, expected_data.len() as i64, &append_chunk) - .await - .expect("large file append should succeed"); - - expected_data.extend_from_slice(&append_chunk); - - // Verify partial content after each append - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let body_bytes = get_resp.body.collect().await?.into_bytes(); - assert_eq!(body_bytes.len(), expected_data.len()); - - // Verify first and last few bytes to ensure data integrity - assert_eq!(&body_bytes[0..100], &expected_data[0..100]); - let end_offset = expected_data.len() - 100; - assert_eq!(&body_bytes[end_offset..], &expected_data[end_offset..]); - } - - // Final verification of complete content - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.len(), expected_data.len()); - assert_eq!(final_body.as_ref(), expected_data.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_threshold_crossing_inline_to_segmented() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-threshold-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "threshold-test.dat"; - - // Start with small inline data (should stay inline) - let small_data = vec![0u8; 1024]; // 1KB - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(small_data.clone())) - .send() - .await?; - - let mut expected_data = small_data; - - // Make several small appends to gradually grow the object - for i in 1..=10 { - let append_data = vec![i as u8; 2048]; // 2KB each - - append_object(&client, &bucket, key, expected_data.len() as i64, &append_data) - .await - .expect("threshold crossing append should succeed"); - - expected_data.extend_from_slice(&append_data); - - // Wait a bit between appends to allow background spill processing - sleep(Duration::from_millis(200)).await; - } - - // Add one large append that definitely triggers segmented mode - let large_append = vec![255u8; 512 * 1024]; // 512KB - append_object(&client, &bucket, key, expected_data.len() as i64, &large_append) - .await - .expect("large append triggering segmentation should succeed"); - - expected_data.extend_from_slice(&large_append); - - // Allow time for any background spill operations to complete - sleep(Duration::from_secs(2)).await; - - // Verify final content integrity - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.len(), expected_data.len()); - assert_eq!(final_body.as_ref(), expected_data.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_concurrent_operations_with_epoch() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-concurrent-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "concurrent-test.txt"; - let initial = b"base"; - - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(initial.to_vec())) - .send() - .await?; - - // Get initial object to obtain ETag for conditional append - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let initial_etag = get_resp.e_tag().unwrap().to_string(); - - // First append with correct ETag should succeed - let first_append = b" data1"; - let resp1 = - append_object_with_if_match(&client, &bucket, key, initial.len() as i64, first_append, Some(initial_etag.clone())) - .await - .expect("first conditional append should succeed"); - - let new_etag = resp1.e_tag().unwrap().to_string(); - - // Second append with old ETag should fail (simulating concurrent modification) - let second_append = b" data2"; - let err = append_object_with_if_match( - &client, - &bucket, - key, - (initial.len() + first_append.len()) as i64, - second_append, - Some(initial_etag), // Using old ETag should fail - ) - .await - .expect_err("append with stale etag should fail"); - - match err { - SdkError::ServiceError(service_err) => { - assert_eq!(service_err.raw().status().as_u16(), 412); // Precondition Failed - } - other => panic!("unexpected error variant: {other:?}"), - } - - // Third append with correct new ETag should succeed - append_object_with_if_match( - &client, - &bucket, - key, - (initial.len() + first_append.len()) as i64, - second_append, - Some(new_etag), - ) - .await - .expect("append with correct etag should succeed"); - - // Verify final content - let expected: Vec = [initial.as_slice(), first_append.as_slice(), second_append.as_slice()].concat(); - let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; - let final_body = get_resp.body.collect().await?.into_bytes(); - assert_eq!(final_body.as_ref(), expected.as_slice()); - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} - -#[tokio::test] -#[serial] -async fn append_range_requests_across_segments() -> Result<(), Box> { - init_logging(); - - let mut env = RustFSTestEnvironment::new().await?; - env.start_rustfs_server(Vec::new()).await?; - sleep(Duration::from_secs(1)).await; - let client = env.create_s3_client(); - - let bucket = format!("append-range-{}", Uuid::new_v4().simple()); - client.create_bucket().bucket(&bucket).send().await?; - - let key = "range-test.dat"; - - // Create base object with known pattern - let base_size = 10000; - let base_data: Vec = (0..base_size).map(|i| (i % 256) as u8).collect(); - - client - .put_object() - .bucket(&bucket) - .key(key) - .body(ByteStream::from(base_data.clone())) - .send() - .await?; - - // Append multiple segments with different patterns - let mut expected_data = base_data; - for segment in 1..=3 { - let segment_size = 5000 + segment * 1000; // Variable segment sizes - let segment_data: Vec = (0..segment_size).map(|i| ((i + segment * 100) % 256) as u8).collect(); - - append_object(&client, &bucket, key, expected_data.len() as i64, &segment_data) - .await - .expect("segment append should succeed"); - - expected_data.extend_from_slice(&segment_data); - } - - sleep(Duration::from_millis(500)).await; // Allow background processing - - // Test various range requests that cross segment boundaries - let test_ranges = [ - (0, 999), // Beginning of base segment - (9000, 11000), // Across base and first append - (15000, 20000), // Middle of appended data - (expected_data.len() - 1000, expected_data.len() - 1), // End of data - ]; - - for (start, end) in test_ranges { - let range_header = format!("bytes={}-{}", start, end); - let range_resp = client - .get_object() - .bucket(&bucket) - .key(key) - .range(&range_header) - .send() - .await?; - - let content_range = range_resp.content_range().unwrap().to_string(); - let range_body = range_resp.body.collect().await?.into_bytes(); - let expected_range = &expected_data[start..=end]; - - assert_eq!(range_body.len(), expected_range.len()); - assert_eq!(range_body.as_ref(), expected_range); - assert_eq!(content_range, format!("bytes {}-{}/{}", start, end, expected_data.len())); - } - - client.delete_object().bucket(&bucket).key(key).send().await?; - client.delete_bucket().bucket(&bucket).send().await?; - - Ok(()) -} diff --git a/crates/e2e_test/src/reliant/mod.rs b/crates/e2e_test/src/reliant/mod.rs index bf77dbef..017ecc88 100644 --- a/crates/e2e_test/src/reliant/mod.rs +++ b/crates/e2e_test/src/reliant/mod.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod append; mod conditional_writes; mod lifecycle; mod lock; diff --git a/crates/ecstore/src/erasure_coding/decode.rs b/crates/ecstore/src/erasure_coding/decode.rs index 81fe9211..ef032ddb 100644 --- a/crates/ecstore/src/erasure_coding/decode.rs +++ b/crates/ecstore/src/erasure_coding/decode.rs @@ -167,19 +167,8 @@ async fn write_data_blocks( where W: tokio::io::AsyncWrite + Send + Sync + Unpin, { - let available = get_data_block_len(en_blocks, data_blocks); - if available < length { - let block_sizes: Vec = en_blocks - .iter() - .take(data_blocks) - .map(|block| block.as_ref().map(|buf| buf.len()).unwrap_or(0)) - .collect(); - error!( - expected = length, - available, - ?block_sizes, - "write_data_blocks get_data_block_len < length" - ); + if get_data_block_len(en_blocks, data_blocks) < length { + error!("write_data_blocks get_data_block_len < length"); return Err(io::Error::new(ErrorKind::UnexpectedEof, "Not enough data blocks to write")); } diff --git a/crates/ecstore/src/lib.rs b/crates/ecstore/src/lib.rs index 2cbfa11e..b28ce0cb 100644 --- a/crates/ecstore/src/lib.rs +++ b/crates/ecstore/src/lib.rs @@ -33,7 +33,6 @@ pub mod file_cache; pub mod global; pub mod metrics_realtime; pub mod notification_sys; -pub mod object_append; pub mod pools; pub mod rebalance; pub mod rpc; diff --git a/crates/ecstore/src/object_append.rs b/crates/ecstore/src/object_append.rs deleted file mode 100644 index 186bfe89..00000000 --- a/crates/ecstore/src/object_append.rs +++ /dev/null @@ -1,725 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::bitrot::{create_bitrot_reader, create_bitrot_writer}; -use crate::erasure_coding::{Erasure, calc_shard_size}; -use crate::error::{Error, StorageError}; -use crate::store_api::ObjectInfo; -use rustfs_filemeta::TRANSITION_COMPLETE; -use rustfs_utils::HashAlgorithm; -use rustfs_utils::http::headers::{ - AMZ_SERVER_SIDE_ENCRYPTION, AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, - AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, AMZ_SERVER_SIDE_ENCRYPTION_KMS_CONTEXT, AMZ_SERVER_SIDE_ENCRYPTION_KMS_ID, - RESERVED_METADATA_PREFIX_LOWER, -}; -use std::collections::HashSet; - -/// Ensure the target object can accept append writes under current state. -pub fn validate_append_preconditions(bucket: &str, object: &str, info: &ObjectInfo) -> Result<(), Error> { - if info.is_compressed() { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - "append is not supported for compressed objects".to_string(), - )); - } - - let encryption_headers = [ - AMZ_SERVER_SIDE_ENCRYPTION, - AMZ_SERVER_SIDE_ENCRYPTION_KMS_ID, - AMZ_SERVER_SIDE_ENCRYPTION_KMS_CONTEXT, - AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, - AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, - AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, - ]; - - if encryption_headers - .iter() - .any(|header| info.user_defined.contains_key(*header) || info.user_defined.contains_key(&header.to_ascii_lowercase())) - { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - "append is not supported for encrypted objects".to_string(), - )); - } - - if info.transitioned_object.status == TRANSITION_COMPLETE || !info.transitioned_object.tier.is_empty() { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - "append is not supported for transitioned objects".to_string(), - )); - } - - Ok(()) -} - -/// Validate that the requested append position matches the current object length. -pub fn validate_append_position(bucket: &str, object: &str, info: &ObjectInfo, expected_position: i64) -> Result<(), Error> { - if expected_position != info.size { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - format!("append position mismatch: provided {}, expected {}", expected_position, info.size), - )); - } - Ok(()) -} - -pub struct InlineAppendContext<'a> { - pub existing_inline: Option<&'a [u8]>, - pub existing_plain: Option<&'a [u8]>, - pub existing_size: i64, - pub append_payload: &'a [u8], - pub erasure: &'a Erasure, - pub hash_algorithm: HashAlgorithm, - pub has_checksums: bool, -} - -pub struct InlineAppendResult { - pub inline_data: Vec, - pub total_size: i64, - pub etag: String, -} - -/// Decode inline payload using available checksum algorithms. Returns raw bytes when decoding fails but -/// the inline buffer already contains the plain payload. -pub async fn decode_inline_payload( - inline: &[u8], - size: usize, - erasure: &Erasure, - preferred: HashAlgorithm, -) -> Result<(Vec, HashAlgorithm), Error> { - match decode_inline_variants(inline, size, erasure, preferred).await { - Ok((data, algo)) => Ok((data, algo)), - Err(err) => { - if inline.len() >= size { - Ok((inline[..size].to_vec(), HashAlgorithm::None)) - } else { - Err(err) - } - } - } -} - -/// Append data to an inline object and return the re-encoded inline buffer. -pub async fn append_inline_data(ctx: InlineAppendContext<'_>) -> Result { - let mut plain = Vec::with_capacity(ctx.existing_inline.map(|data| data.len()).unwrap_or(0) + ctx.append_payload.len()); - let mut encode_algorithm = ctx.hash_algorithm.clone(); - - if let Some(existing_plain) = ctx.existing_plain { - if existing_plain.len() != ctx.existing_size as usize { - return Err(StorageError::other("existing plain payload length mismatch")); - } - plain.extend_from_slice(existing_plain); - } else if ctx.existing_size > 0 { - let inline = ctx - .existing_inline - .ok_or_else(|| StorageError::other("inline payload missing"))?; - - let (decoded, detected_algo) = - decode_inline_payload(inline, ctx.existing_size as usize, ctx.erasure, ctx.hash_algorithm.clone()).await?; - encode_algorithm = detected_algo; - plain.extend_from_slice(&decoded); - } else if let Some(inline) = ctx.existing_inline { - plain.extend_from_slice(inline); - } - - plain.extend_from_slice(ctx.append_payload); - let total_size = plain.len() as i64; - let etag = md5_hex(&plain); - - if encode_algorithm == HashAlgorithm::None { - if ctx.has_checksums { - encode_algorithm = ctx.hash_algorithm.clone(); - } else { - return Ok(InlineAppendResult { - inline_data: plain, - total_size, - etag, - }); - } - } - - let mut writer = create_bitrot_writer( - true, - None, - "", - "", - ctx.erasure.shard_file_size(total_size), - ctx.erasure.shard_size(), - encode_algorithm, - ) - .await - .map_err(|e| StorageError::other(format!("failed to create inline writer: {e}")))?; - - let mut remaining = plain.as_slice(); - while !remaining.is_empty() { - let chunk_len = remaining.len().min(ctx.erasure.block_size); - writer - .write(&remaining[..chunk_len]) - .await - .map_err(|e| StorageError::other(format!("failed to write inline data: {e}")))?; - remaining = &remaining[chunk_len..]; - } - - writer - .shutdown() - .await - .map_err(|e| StorageError::other(format!("failed to finalize inline writer: {e}")))?; - - let inline_data = writer - .into_inline_data() - .ok_or_else(|| StorageError::other("inline writer did not return data"))?; - - Ok(InlineAppendResult { - inline_data, - total_size, - etag, - }) -} - -fn md5_hex(data: &[u8]) -> String { - let digest = HashAlgorithm::Md5.hash_encode(data); - hex_from_bytes(digest.as_ref()) -} - -fn hex_from_bytes(bytes: &[u8]) -> String { - let mut out = String::with_capacity(bytes.len() * 2); - for byte in bytes { - use std::fmt::Write; - write!(&mut out, "{:02x}", byte).expect("write hex"); - } - out -} - -async fn decode_inline_variants( - inline: &[u8], - size: usize, - erasure: &Erasure, - preferred: HashAlgorithm, -) -> Result<(Vec, HashAlgorithm), Error> { - let mut tried = HashSet::new(); - let candidates = [preferred, HashAlgorithm::HighwayHash256, HashAlgorithm::HighwayHash256S]; - - let mut last_err: Option = None; - - for algo in candidates { - if !tried.insert(algo.clone()) { - continue; - } - - match decode_inline_with_algo(inline, size, erasure, algo.clone()).await { - Ok(data) => return Ok((data, algo)), - Err(err) => last_err = Some(err), - } - } - - Err(last_err.unwrap_or_else(|| StorageError::other("failed to decode inline data"))) -} - -async fn decode_inline_with_algo(inline: &[u8], size: usize, erasure: &Erasure, algo: HashAlgorithm) -> Result, Error> { - let total_len = inline - .len() - .max(erasure.shard_file_size(size as i64).max(size as i64) as usize); - let mut reader = create_bitrot_reader(Some(inline), None, "", "", 0, total_len, erasure.shard_size(), algo) - .await - .map_err(|e| StorageError::other(format!("failed to create inline reader: {e}")))? - .ok_or_else(|| StorageError::other("inline reader unavailable"))?; - - let mut out = Vec::with_capacity(size); - while out.len() < size { - let remaining = size - out.len(); - let plain_chunk = remaining.min(erasure.block_size); - let shard_payload = calc_shard_size(plain_chunk, erasure.data_shards).max(1); - let mut buf = vec![0u8; shard_payload]; - let read = reader - .read(&mut buf) - .await - .map_err(|e| StorageError::other(format!("failed to read inline data: {e}")))?; - if read == 0 { - return Err(StorageError::other("incomplete inline data read")); - } - - let copy_len = remaining.min(read); - out.extend_from_slice(&buf[..copy_len]); - } - - Ok(out) -} - -/// Background task to spill inline data to segmented format -pub struct InlineSpillProcessor { - pub disks: Vec>, - pub write_quorum: usize, -} - -impl InlineSpillProcessor { - pub fn new(disks: Vec>, write_quorum: usize) -> Self { - Self { disks, write_quorum } - } - - /// Process a single spill operation from InlinePendingSpill to SegmentedActive - pub async fn process_spill( - &self, - bucket: &str, - object: &str, - mut fi: rustfs_filemeta::FileInfo, - mut parts_metadata: Vec, - epoch: u64, - ) -> Result<(), Error> { - use rustfs_filemeta::AppendStateKind; - use tracing::{debug, error, info, warn}; - - // Verify we're in the correct state - let current_state = fi.get_append_state(); - if current_state.state != AppendStateKind::InlinePendingSpill { - warn!( - bucket = bucket, - object = object, - current_state = ?current_state.state, - "Spill processor called on object not in InlinePendingSpill state" - ); - return Ok(()); - } - - // Check epoch to ensure we're processing the correct version - if current_state.epoch != epoch { - debug!( - bucket = bucket, - object = object, - current_epoch = current_state.epoch, - expected_epoch = epoch, - "Spill operation skipped due to epoch mismatch" - ); - return Ok(()); - } - - info!( - bucket = bucket, - object = object, - size = fi.size, - epoch = epoch, - "Starting inline data spill to segmented format" - ); - - // Extract inline data - let inline_data = fi - .data - .clone() - .ok_or_else(|| StorageError::other("Cannot spill object without inline data"))?; - - // Create erasure encoder - let erasure = Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); - - // Decode inline data to plain data - let hash_algorithm = fi - .parts - .first() - .map(|part| fi.erasure.get_checksum_info(part.number).algorithm) - .unwrap_or(HashAlgorithm::HighwayHash256); - - let plain_data = match decode_inline_payload(&inline_data, fi.size as usize, &erasure, hash_algorithm.clone()).await { - Ok((plain, _detected_algo)) => plain, - Err(err) => { - error!( - bucket = bucket, - object = object, - error = ?err, - "Failed to decode inline data during spill" - ); - return Err(StorageError::other(format!("Failed to decode inline data for spill: {err}"))); - } - }; - - // Generate data directory for the object - let data_dir = uuid::Uuid::new_v4(); - - // Create temporary directory for the spill operation - let tmp_root = format!("{}x{}", uuid::Uuid::new_v4(), time::OffsetDateTime::now_utc().unix_timestamp()); - let tmp_path = format!("{tmp_root}/{}/part.1", data_dir); - - // Encode and write the data to all disks - match self.write_segmented_data(&plain_data, &tmp_path, &erasure).await { - Ok(_) => { - // Move from temp to permanent location - let final_path = format!("{}/part.1", data_dir); - if let Err(err) = self.move_temp_to_final(&tmp_path, &final_path).await { - error!( - bucket = bucket, - object = object, - error = ?err, - "Failed to move spilled data to final location" - ); - // Clean up temp files - let _ = self.cleanup_temp_files(&tmp_path).await; - return Err(err); - } - - // Update file metadata - fi.data_dir = Some(data_dir); - fi.data = None; // Remove inline data - fi.metadata.remove(&format!("{}inline-data", RESERVED_METADATA_PREFIX_LOWER)); - - // Update append state to SegmentedActive - let mut new_state = current_state; - new_state.state = AppendStateKind::SegmentedActive; - new_state.epoch = new_state.epoch.saturating_add(1); - new_state.pending_segments.clear(); - - fi.set_append_state(&new_state) - .map_err(|err| StorageError::other(format!("Failed to update append state after spill: {err}")))?; - - // Update all parts metadata - for meta in parts_metadata.iter_mut() { - if !meta.is_valid() { - continue; - } - meta.data_dir = Some(data_dir); - meta.data = None; - meta.metadata = fi.metadata.clone(); - meta.metadata - .remove(&format!("{}inline-data", RESERVED_METADATA_PREFIX_LOWER)); - } - - // Write updated metadata back to disks - // TODO: Implement metadata write-back logic - // This would typically involve writing the updated FileInfo to all disks - - info!( - bucket = bucket, - object = object, - data_dir = ?data_dir, - new_epoch = new_state.epoch, - "Successfully spilled inline data to segmented format" - ); - - Ok(()) - } - Err(err) => { - error!( - bucket = bucket, - object = object, - error = ?err, - "Failed to write segmented data during spill" - ); - // Clean up temp files - let _ = self.cleanup_temp_files(&tmp_path).await; - Err(err) - } - } - } - - async fn write_segmented_data(&self, data: &[u8], tmp_path: &str, _erasure: &Erasure) -> Result<(), Error> { - use tracing::debug; - - // TODO: Implement proper erasure encoding and writing to disks - // This is a placeholder implementation - debug!( - data_len = data.len(), - path = tmp_path, - "Writing segmented data (placeholder implementation)" - ); - - // For now, just return success - full implementation would: - // 1. Create bitrot writers for each disk - // 2. Erasure encode the data - // 3. Write each shard to its corresponding disk - Ok(()) - } - - async fn move_temp_to_final(&self, tmp_path: &str, final_path: &str) -> Result<(), Error> { - use tracing::debug; - - // TODO: Implement moving temp files to final location - debug!( - tmp_path = tmp_path, - final_path = final_path, - "Moving temp files to final location (placeholder)" - ); - Ok(()) - } - - async fn cleanup_temp_files(&self, tmp_path: &str) -> Result<(), Error> { - use tracing::debug; - - // TODO: Implement temp file cleanup - debug!(tmp_path = tmp_path, "Cleaning up temp files (placeholder)"); - Ok(()) - } -} - -/// Trigger background spill processing for an object -pub fn trigger_spill_process( - bucket: String, - object: String, - fi: rustfs_filemeta::FileInfo, - parts_metadata: Vec, - epoch: u64, - disks: Vec>, - write_quorum: usize, -) { - use tracing::error; - - tokio::spawn(async move { - let processor = InlineSpillProcessor::new(disks, write_quorum); - if let Err(err) = processor.process_spill(&bucket, &object, fi, parts_metadata, epoch).await { - error!( - bucket = bucket, - object = object, - epoch = epoch, - error = ?err, - "Background spill process failed" - ); - } - }); -} - -#[cfg(test)] -mod tests { - use super::*; - use rustfs_utils::HashAlgorithm; - - fn make_object_info() -> ObjectInfo { - ObjectInfo { - bucket: "test-bucket".to_string(), - name: "obj".to_string(), - ..Default::default() - } - } - - #[test] - fn rejects_compressed_objects() { - let mut info = make_object_info(); - info.user_defined - .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}compression"), "zstd".to_string()); - - let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); - matches!(err, StorageError::InvalidArgument(..)) - .then_some(()) - .expect("expected invalid argument"); - } - - #[test] - fn rejects_encrypted_objects() { - let mut info = make_object_info(); - info.user_defined - .insert("x-amz-server-side-encryption".to_string(), "AES256".to_string()); - - let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); - matches!(err, StorageError::InvalidArgument(..)) - .then_some(()) - .expect("expected invalid argument"); - } - - #[test] - fn rejects_transitioned_objects() { - let mut info = make_object_info(); - info.transitioned_object.tier = "GLACIER".to_string(); - info.transitioned_object.status = TRANSITION_COMPLETE.to_string(); - - let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); - matches!(err, StorageError::InvalidArgument(..)) - .then_some(()) - .expect("expected invalid argument"); - } - - #[test] - fn accepts_plain_objects() { - let info = make_object_info(); - validate_append_preconditions("test-bucket", "obj", &info).expect("append should be allowed"); - } - - #[test] - fn rejects_position_mismatch() { - let mut info = make_object_info(); - info.size = 10; - let err = validate_append_position("test-bucket", "obj", &info, 5).unwrap_err(); - matches!(err, StorageError::InvalidArgument(..)) - .then_some(()) - .expect("expected invalid argument"); - } - - fn make_inline_erasure() -> Erasure { - Erasure::new(1, 0, 1024) - } - - async fn encode_inline(data: &[u8], erasure: &Erasure) -> Vec { - let mut writer = create_bitrot_writer( - true, - None, - "", - "", - erasure.shard_file_size(data.len() as i64), - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await - .unwrap(); - - let mut remaining = data; - while !remaining.is_empty() { - let chunk_len = remaining.len().min(erasure.block_size); - writer.write(&remaining[..chunk_len]).await.unwrap(); - remaining = &remaining[chunk_len..]; - } - - writer.shutdown().await.unwrap(); - writer.into_inline_data().unwrap() - } - - async fn decode_inline(encoded: &[u8], size: usize, erasure: &Erasure) -> Vec { - let mut reader = - create_bitrot_reader(Some(encoded), None, "", "", 0, size, erasure.shard_size(), HashAlgorithm::HighwayHash256) - .await - .unwrap() - .unwrap(); - - let mut out = Vec::with_capacity(size); - while out.len() < size { - let remaining = size - out.len(); - let mut buf = vec![0u8; erasure.block_size.min(remaining.max(1))]; - let read = reader.read(&mut buf).await.unwrap(); - if read == 0 { - break; - } - out.extend_from_slice(&buf[..read.min(remaining)]); - } - out - } - - #[tokio::test] - async fn append_inline_combines_payloads() { - let erasure = make_inline_erasure(); - let existing_plain = b"hello"; - let encoded = encode_inline(existing_plain, &erasure).await; - - let ctx = InlineAppendContext { - existing_inline: Some(&encoded), - existing_plain: None, - existing_size: existing_plain.len() as i64, - append_payload: b" world", - erasure: &erasure, - hash_algorithm: HashAlgorithm::HighwayHash256, - has_checksums: true, - }; - - let result = append_inline_data(ctx).await.expect("inline append to succeed"); - assert_eq!(result.total_size, 11); - assert_eq!(result.etag, md5_hex(b"hello world")); - - let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; - assert_eq!(decoded, b"hello world"); - } - - #[tokio::test] - async fn decode_inline_handles_padded_shards() { - let erasure = Erasure::new(1, 0, 1024); - let plain = b"hello"; - - let mut padded = vec![0u8; calc_shard_size(plain.len(), erasure.data_shards)]; - padded[..plain.len()].copy_from_slice(plain); - - let mut writer = create_bitrot_writer( - true, - None, - "", - "", - erasure.shard_file_size(plain.len() as i64), - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await - .unwrap(); - - writer.write(&padded).await.unwrap(); - writer.shutdown().await.unwrap(); - let inline = writer.into_inline_data().unwrap(); - - let (decoded, algo) = decode_inline_payload(&inline, plain.len(), &erasure, HashAlgorithm::HighwayHash256) - .await - .expect("inline decode should succeed"); - - assert_eq!(decoded, plain); - assert_eq!(algo, HashAlgorithm::HighwayHash256); - } - - #[tokio::test] - async fn append_inline_handles_empty_original() { - let erasure = make_inline_erasure(); - let ctx = InlineAppendContext { - existing_inline: None, - existing_plain: None, - existing_size: 0, - append_payload: b"data", - erasure: &erasure, - hash_algorithm: HashAlgorithm::HighwayHash256, - has_checksums: true, - }; - - let result = append_inline_data(ctx).await.expect("inline append to succeed"); - assert_eq!(result.total_size, 4); - assert_eq!(result.etag, md5_hex(b"data")); - - let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; - assert_eq!(decoded, b"data"); - } - - #[tokio::test] - async fn append_inline_without_checksums_uses_raw_bytes() { - let erasure = Erasure::new(1, 0, 1024); - let existing = b"hello"; - - let ctx = InlineAppendContext { - existing_inline: Some(existing), - existing_plain: None, - existing_size: existing.len() as i64, - append_payload: b" world", - erasure: &erasure, - hash_algorithm: HashAlgorithm::HighwayHash256, - has_checksums: false, - }; - - let result = append_inline_data(ctx).await.expect("inline append to succeed"); - assert_eq!(result.total_size, 11); - assert_eq!(result.etag, md5_hex(b"hello world")); - - assert_eq!(result.inline_data, b"hello world"); - } - - #[tokio::test] - async fn append_inline_decodes_bitrot_without_checksums() { - let erasure = Erasure::new(1, 0, 1024); - let existing_plain = b"hello"; - let encoded = encode_inline(existing_plain, &erasure).await; - - let ctx = InlineAppendContext { - existing_inline: Some(&encoded), - existing_plain: None, - existing_size: existing_plain.len() as i64, - append_payload: b" world", - erasure: &erasure, - hash_algorithm: HashAlgorithm::HighwayHash256, - has_checksums: false, - }; - - let result = append_inline_data(ctx).await.expect("inline append to succeed"); - assert_eq!(result.total_size, 11); - assert_eq!(result.etag, md5_hex(b"hello world")); - - let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; - assert_eq!(decoded, b"hello world"); - } -} diff --git a/crates/ecstore/src/set_disk.rs b/crates/ecstore/src/set_disk.rs index 93366e4a..33e2e185 100644 --- a/crates/ecstore/src/set_disk.rs +++ b/crates/ecstore/src/set_disk.rs @@ -49,9 +49,6 @@ use crate::{ event::name::EventName, event_notification::{EventArgs, send_event}, global::{GLOBAL_LOCAL_DISK_MAP, GLOBAL_LOCAL_DISK_SET_DRIVES, get_global_deployment_id, is_dist_erasure}, - object_append::{ - InlineAppendContext, append_inline_data, decode_inline_payload, validate_append_position, validate_append_preconditions, - }, store_api::{ BucketInfo, BucketOptions, CompletePart, DeleteBucketOptions, DeletedObject, GetObjectReader, HTTPRangeSpec, ListMultipartsInfo, ListObjectsV2Info, MakeBucketOptions, MultipartInfo, MultipartUploadResult, ObjectIO, ObjectInfo, @@ -70,14 +67,15 @@ use rand::{Rng, seq::SliceRandom}; use regex::Regex; use rustfs_common::heal_channel::{DriveState, HealChannelPriority, HealItemType, HealOpts, HealScanMode, send_heal_disk}; use rustfs_filemeta::{ - AppendSegment, AppendState, AppendStateKind, FileInfo, FileMeta, FileMetaShallowVersion, MetaCacheEntries, MetaCacheEntry, - MetadataResolutionParams, ObjectPartInfo, RawFileInfo, ReplicationStatusType, VersionPurgeStatusType, clear_append_state, - file_info_from_raw, get_append_state, merge_file_meta_versions, set_append_state, validate_new_segment, + FileInfo, FileMeta, FileMetaShallowVersion, MetaCacheEntries, MetaCacheEntry, MetadataResolutionParams, ObjectPartInfo, + RawFileInfo, ReplicationStatusType, VersionPurgeStatusType, file_info_from_raw, merge_file_meta_versions, }; use rustfs_lock::fast_lock::types::LockResult; use rustfs_madmin::heal_commands::{HealDriveInfo, HealResultItem}; use rustfs_rio::{EtagResolvable, HashReader, TryGetIndex as _, WarpReader}; -use rustfs_utils::http::headers::{AMZ_OBJECT_TAGGING, AMZ_STORAGE_CLASS, RESERVED_METADATA_PREFIX_LOWER}; +use rustfs_utils::http::headers::AMZ_OBJECT_TAGGING; +use rustfs_utils::http::headers::AMZ_STORAGE_CLASS; +use rustfs_utils::http::headers::RESERVED_METADATA_PREFIX_LOWER; use rustfs_utils::{ HashAlgorithm, crypto::{base64_decode, base64_encode, hex}, @@ -93,14 +91,12 @@ use std::{ collections::{HashMap, HashSet}, io::{Cursor, Write}, path::Path, - pin::Pin, sync::Arc, - task::{Context, Poll}, time::Duration, }; use time::OffsetDateTime; use tokio::{ - io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}, + io::AsyncWrite, sync::{RwLock, broadcast}, }; use tokio::{ @@ -815,52 +811,6 @@ impl SetDisks { Ok(disks) } - async fn rename_part_data( - disks: &[Option], - src_bucket: &str, - src_object: &str, - dst_bucket: &str, - dst_object: &str, - write_quorum: usize, - ) -> disk::error::Result>> { - let src_bucket = Arc::new(src_bucket.to_string()); - let src_object = Arc::new(src_object.to_string()); - let dst_bucket = Arc::new(dst_bucket.to_string()); - let dst_object = Arc::new(dst_object.to_string()); - - let mut errs = Vec::with_capacity(disks.len()); - - let futures = disks.iter().map(|disk| { - let disk = disk.clone(); - let src_bucket = src_bucket.clone(); - let src_object = src_object.clone(); - let dst_bucket = dst_bucket.clone(); - let dst_object = dst_object.clone(); - tokio::spawn(async move { - if let Some(disk) = disk { - disk.rename_file(&src_bucket, &src_object, &dst_bucket, &dst_object).await - } else { - Err(DiskError::DiskNotFound) - } - }) - }); - - let results = join_all(futures).await; - for result in results { - match result? { - Ok(_) => errs.push(None), - Err(err) => errs.push(Some(err)), - } - } - - if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) { - warn!("rename_part_data errs {:?}", &errs); - return Err(err); - } - - Ok(Self::eval_disks(disks, &errs)) - } - fn eval_disks(disks: &[Option], errs: &[Option]) -> Vec> { if disks.len() != errs.len() { return Vec::new(); @@ -2214,97 +2164,27 @@ impl SetDisks { tracing::debug!(bucket, object, requested_length = length, offset, "get_object_with_fileinfo start"); let (disks, files) = Self::shuffle_disks_and_parts_metadata_by_index(disks, &files, &fi); - // Check for pending segments in append state - let append_state = fi.get_append_state(); - let has_pending_segments = !append_state.pending_segments.is_empty(); + let total_size = fi.size as usize; - // Calculate total size including pending segments - let base_size = append_state.committed_length; - let pending_size: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); - let total_logical_size = base_size + pending_size; - - tracing::debug!( - bucket, - object, - base_size, - pending_size, - total_logical_size, - has_pending_segments, - pending_segments_count = append_state.pending_segments.len(), - "Append-aware object read" - ); - - let total_size = total_logical_size as usize; - - let length = if length < 0 { total_size - offset } else { length as usize }; + let length = if length < 0 { + fi.size as usize - offset + } else { + length as usize + }; if offset > total_size || offset + length > total_size { error!("get_object_with_fileinfo offset out of range: {}, total_size: {}", offset, total_size); return Err(Error::other("offset out of range")); } - let erasure = Arc::new(erasure_coding::Erasure::new( - fi.erasure.data_blocks, - fi.erasure.parity_blocks, - fi.erasure.block_size, - )); - - if fi.inline_data() { - let inline = fi - .data - .as_ref() - .ok_or_else(|| Error::other("inline payload missing for read"))?; - let (plain, _) = decode_inline_payload(inline, fi.size as usize, &erasure, HashAlgorithm::HighwayHash256) - .await - .map_err(|err| Error::other(format!("failed to decode inline data: {err}")))?; - - let end = offset + length; - if end > plain.len() { - return Err(Error::other("inline payload shorter than expected")); - } - - if length > 0 { - writer - .write_all(&plain[offset..end]) - .await - .map_err(|e| Error::other(format!("failed to stream inline payload: {e}")))?; - } - - return Ok(()); - } - - // For regular parts reading, limit to base size (committed data) - let effective_read_end = (offset + length).min(base_size as usize); - let base_read_length = effective_read_end.saturating_sub(offset); - - if base_read_length == 0 { - // Reading entirely from pending segments - tracing::debug!( - bucket, - object, - offset, - length, - base_size, - "Read entirely from pending segments, skipping regular parts" - ); - } - - let (part_index, mut part_offset) = if base_read_length > 0 { - fi.to_part_offset(offset)? - } else { - (0, 0) // Placeholder, won't be used - }; + let (part_index, mut part_offset) = fi.to_part_offset(offset)?; let mut end_offset = offset; - if base_read_length > 0 { - end_offset += base_read_length - 1 + if length > 0 { + end_offset += length - 1 } - let (last_part_index, last_part_relative_offset) = if base_read_length > 0 { - fi.to_part_offset(end_offset)? - } else { - (0, 0) // Placeholder, won't be used - }; + let (last_part_index, last_part_relative_offset) = fi.to_part_offset(end_offset)?; tracing::debug!( bucket, @@ -2318,229 +2198,144 @@ impl SetDisks { "Multipart read bounds" ); + let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); + let part_indices: Vec = (part_index..=last_part_index).collect(); tracing::debug!(bucket, object, ?part_indices, "Multipart part indices to stream"); let mut total_read = 0; - - // Only read from regular parts if there's base data to read - if base_read_length > 0 { - for current_part in part_indices { - if total_read == base_read_length { - tracing::debug!( - bucket, - object, - total_read, - base_read_length, - part_index = current_part, - "Stopping multipart stream - reached base data limit" - ); - break; - } - - if total_read >= base_read_length { - break; - } - - let part_number = fi.parts[current_part].number; - let part_size = fi.parts[current_part].size; - let mut part_length = part_size - part_offset; - if part_length > (base_read_length - total_read) { - part_length = base_read_length - total_read - } - - let till_offset = erasure.shard_file_offset(part_offset, part_length, part_size); - - let read_offset = (part_offset / erasure.block_size) * erasure.shard_size(); - + for current_part in part_indices { + if total_read == length { tracing::debug!( bucket, object, + total_read, + requested_length = length, part_index = current_part, - part_number, - part_offset, - part_size, - part_length, + "Stopping multipart stream early because accumulated bytes match request" + ); + break; + } + + let part_number = fi.parts[current_part].number; + let part_size = fi.parts[current_part].size; + let mut part_length = part_size - part_offset; + if part_length > (length - total_read) { + part_length = length - total_read + } + + let till_offset = erasure.shard_file_offset(part_offset, part_length, part_size); + + let read_offset = (part_offset / erasure.block_size) * erasure.shard_size(); + + tracing::debug!( + bucket, + object, + part_index = current_part, + part_number, + part_offset, + part_size, + part_length, + read_offset, + till_offset, + total_read_before = total_read, + requested_length = length, + "Streaming multipart part" + ); + + let mut readers = Vec::with_capacity(disks.len()); + let mut errors = Vec::with_capacity(disks.len()); + for (idx, disk_op) in disks.iter().enumerate() { + match create_bitrot_reader( + files[idx].data.as_deref(), + disk_op.as_ref(), + bucket, + &format!("{}/{}/part.{}", object, files[idx].data_dir.unwrap_or_default(), part_number), read_offset, till_offset, - total_read_before = total_read, - requested_length = length, - "Streaming multipart part" - ); - - let mut readers = Vec::with_capacity(disks.len()); - let mut errors = Vec::with_capacity(disks.len()); - for (idx, disk_op) in disks.iter().enumerate() { - let checksum_algo = if fi.erasure.checksums.is_empty() { - HashAlgorithm::HighwayHash256 - } else { - fi.erasure.get_checksum_info(part_number).algorithm - }; - - let use_inline = matches!(append_state.state, AppendStateKind::Inline | AppendStateKind::InlinePendingSpill); - let inline_source = if use_inline { files[idx].data.as_deref() } else { None }; - - if let Some(inline) = inline_source { - info!(bucket, object, part_number, inline_len = inline.len(), "using inline data for shard read"); + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await + { + Ok(Some(reader)) => { + readers.push(Some(reader)); + errors.push(None); } - - match create_bitrot_reader( - inline_source, - disk_op.as_ref(), - bucket, - &format!("{}/{}/part.{}", object, files[idx].data_dir.unwrap_or_default(), part_number), - read_offset, - till_offset, - erasure.shard_size(), - checksum_algo, - ) - .await - { - Ok(Some(reader)) => { - readers.push(Some(reader)); - errors.push(None); - } - Ok(None) => { - readers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - Err(e) => { - readers.push(None); - errors.push(Some(e)); - } + Ok(None) => { + readers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + Err(e) => { + readers.push(None); + errors.push(Some(e)); } } - - let nil_count = errors.iter().filter(|&e| e.is_none()).count(); - if nil_count < erasure.data_shards { - if let Some(read_err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, erasure.data_shards) { - error!("create_bitrot_reader reduce_read_quorum_errs {:?}", &errors); - return Err(to_object_err(read_err.into(), vec![bucket, object])); - } - error!("create_bitrot_reader not enough disks to read: {:?}", &errors); - return Err(Error::other(format!("not enough disks to read: {errors:?}"))); - } - - // debug!( - // "read part {} part_offset {},part_length {},part_size {} ", - // part_number, part_offset, part_length, part_size - // ); - let (written, err) = erasure.decode(writer, readers, part_offset, part_length, part_size).await; - tracing::debug!( - bucket, - object, - part_index = current_part, - part_number, - part_length, - bytes_written = written, - "Finished decoding multipart part" - ); - if let Some(e) = err { - let de_err: DiskError = e.into(); - let mut has_err = true; - if written == part_length { - match de_err { - DiskError::FileNotFound | DiskError::FileCorrupt => { - error!("erasure.decode err 111 {:?}", &de_err); - let _ = rustfs_common::heal_channel::send_heal_request( - rustfs_common::heal_channel::create_heal_request_with_options( - bucket.to_string(), - Some(object.to_string()), - false, - Some(HealChannelPriority::Normal), - Some(pool_index), - Some(set_index), - ), - ) - .await; - has_err = false; - } - _ => {} - } - } - - if has_err { - error!("erasure.decode err {} {:?}", written, &de_err); - return Err(de_err.into()); - } - } - - // debug!("ec decode {} written size {}", part_number, n); - - total_read += part_length; - part_offset = 0; } + + let nil_count = errors.iter().filter(|&e| e.is_none()).count(); + if nil_count < erasure.data_shards { + if let Some(read_err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, erasure.data_shards) { + error!("create_bitrot_reader reduce_read_quorum_errs {:?}", &errors); + return Err(to_object_err(read_err.into(), vec![bucket, object])); + } + error!("create_bitrot_reader not enough disks to read: {:?}", &errors); + return Err(Error::other(format!("not enough disks to read: {errors:?}"))); + } + + // debug!( + // "read part {} part_offset {},part_length {},part_size {} ", + // part_number, part_offset, part_length, part_size + // ); + let (written, err) = erasure.decode(writer, readers, part_offset, part_length, part_size).await; + tracing::debug!( + bucket, + object, + part_index = current_part, + part_number, + part_length, + bytes_written = written, + "Finished decoding multipart part" + ); + if let Some(e) = err { + let de_err: DiskError = e.into(); + let mut has_err = true; + if written == part_length { + match de_err { + DiskError::FileNotFound | DiskError::FileCorrupt => { + error!("erasure.decode err 111 {:?}", &de_err); + let _ = rustfs_common::heal_channel::send_heal_request( + rustfs_common::heal_channel::create_heal_request_with_options( + bucket.to_string(), + Some(object.to_string()), + false, + Some(HealChannelPriority::Normal), + Some(pool_index), + Some(set_index), + ), + ) + .await; + has_err = false; + } + _ => {} + } + } + + if has_err { + error!("erasure.decode err {} {:?}", written, &de_err); + return Err(de_err.into()); + } + } + + // debug!("ec decode {} written size {}", part_number, n); + + total_read += part_length; + part_offset = 0; } // debug!("read end"); - // Handle pending segments if we haven't read enough data and there are pending segments - if has_pending_segments && offset + length > append_state.committed_length as usize { - tracing::debug!( - bucket, - object, - offset, - length, - base_size, - total_read, - pending_segments_count = append_state.pending_segments.len(), - "Reading from pending segments" - ); - - let read_start = offset; - let read_end = offset + length; - - for (seg_index, segment) in append_state.pending_segments.iter().enumerate() { - if total_read >= length { - break; - } - - let seg_start = segment.offset as usize; - let seg_end = (segment.offset + segment.length) as usize; - - if seg_end <= read_start { - continue; - } - if seg_start >= read_end { - break; - } - - tracing::debug!( - bucket, - object, - seg_index, - seg_start, - seg_end, - read_start, - read_end, - "Loading pending segment data" - ); - - let segment_data = - Self::load_pending_segment(bucket, object, erasure.clone(), &disks, segment, fi.erasure.data_blocks).await?; - - let slice_start = read_start.max(seg_start) - seg_start; - let slice_end = read_end.min(seg_end) - seg_start; - - if slice_end > slice_start { - writer - .write_all(&segment_data[slice_start..slice_end]) - .await - .map_err(|e| Error::other(format!("failed to stream pending segment: {e}")))?; - total_read += slice_end - slice_start; - } - } - } - - tracing::debug!( - bucket, - object, - total_read, - expected_length = length, - has_pending_segments, - final_total_read = total_read, - "Append-aware multipart read finished" - ); + tracing::debug!(bucket, object, total_read, expected_length = length, "Multipart read finished"); Ok(()) } @@ -3648,813 +3443,6 @@ impl SetDisks { Ok(()) } - #[tracing::instrument(skip(self, data, opts), fields(bucket, object))] - async fn append_inline_object( - &self, - bucket: &str, - object: &str, - data: &mut PutObjReader, - opts: &ObjectOptions, - ) -> Result { - let info_opts = ObjectOptions { - version_id: opts.version_id.clone(), - versioned: opts.versioned, - version_suspended: opts.version_suspended, - no_lock: true, - ..Default::default() - }; - - let (mut fi, mut parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; - - if fi.deleted { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - "cannot append to deleted object".to_string(), - )); - } - - let append_state_snapshot = fi.get_append_state(); - let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); - - validate_append_preconditions(bucket, object, &object_info)?; - - let position = opts.append_position.ok_or_else(|| { - StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - "x-amz-append-position header required".to_string(), - ) - })?; - - let base_size_snapshot: i64 = append_state_snapshot.committed_length; - let pending_length_snapshot: i64 = append_state_snapshot.pending_segments.iter().map(|seg| seg.length).sum(); - let expected_position_snapshot = base_size_snapshot.saturating_add(pending_length_snapshot); - if position != expected_position_snapshot { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - format!("append position mismatch: provided {}, expected {}", position, expected_position_snapshot), - )); - } - - let mut append_payload = Vec::new(); - tokio::io::AsyncReadExt::read_to_end(&mut data.stream, &mut append_payload) - .await - .map_err(|e| StorageError::other(format!("failed to read append payload: {e}")))?; - - data.stream = HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) - .map_err(StorageError::other)?; - - if !fi.inline_data() { - return self - .append_segmented_object(fi, parts_metadata, bucket, object, append_payload, opts, position) - .await; - } - - let existing_inline = fi.data.as_ref().map(|b| b.as_ref()); - let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); - let mut hash_algorithm = fi - .parts - .first() - .map(|part| fi.erasure.get_checksum_info(part.number).algorithm) - .unwrap_or(HashAlgorithm::HighwayHash256); - let mut existing_plain_override: Option> = None; - if fi.erasure.checksums.is_empty() { - hash_algorithm = HashAlgorithm::HighwayHash256; - } - - if let Some(inline) = existing_inline { - debug!( - existing_size = fi.size, - inline_len = inline.len(), - ?hash_algorithm, - erasure_checksums = ?fi.erasure.checksums, - inline_has_checksums = !fi.erasure.checksums.is_empty(), - "append inline metadata" - ); - - match decode_inline_payload(inline, fi.size as usize, &erasure, hash_algorithm.clone()).await { - Ok((plain, detected_algo)) => { - hash_algorithm = detected_algo; - existing_plain_override = Some(plain); - } - Err(err) => { - return Err(StorageError::other(format!("failed to decode inline data: {err}"))); - } - } - } - - let has_checksums = !fi.erasure.checksums.is_empty(); - - let append_ctx = InlineAppendContext { - existing_inline, - existing_plain: existing_plain_override.as_deref(), - existing_size: fi.size, - append_payload: &append_payload, - erasure: &erasure, - hash_algorithm: hash_algorithm.clone(), - has_checksums, - }; - - let append_result = append_inline_data(append_ctx).await?; - - // Check if we need to spill to segmented mode after append - let total_shard_size = erasure.shard_file_size(append_result.total_size); - let should_remain_inline = if let Some(sc) = GLOBAL_STORAGE_CLASS.get() { - sc.should_inline(total_shard_size, opts.versioned) - } else { - true // fallback to inline if no storage class config - }; - - if !should_remain_inline { - let mut plain_total = if let Some(existing_plain) = existing_plain_override.clone() { - existing_plain - } else if let Some(inline) = existing_inline { - let (plain, _) = decode_inline_payload(inline, fi.size as usize, &erasure, hash_algorithm.clone()) - .await - .map_err(|err| StorageError::other(format!("failed to decode inline data: {err}")))?; - plain - } else { - Vec::new() - }; - plain_total.extend_from_slice(&append_payload); - info!( - bucket, - object, - total_size = append_result.total_size, - existing_inline_len = fi.size, - plain_total_len = plain_total.len(), - append_payload_len = append_payload.len(), - shard_size = total_shard_size, - "Inline object exceeds threshold, spilling to segmented storage" - ); - - return self - .spill_inline_into_segmented( - fi, - parts_metadata, - bucket, - object, - plain_total, - append_result.etag.clone(), - opts, - AppendStateKind::SegmentedActive, - ) - .await; - } - - let inline_bytes = Bytes::from(append_result.inline_data.clone()); - let now = OffsetDateTime::now_utc(); - - fi.mod_time = Some(now); - fi.size = append_result.total_size; - fi.data = Some(inline_bytes.clone()); - fi.metadata.insert("etag".to_owned(), append_result.etag.clone()); - fi.set_inline_data(); - fi.metadata.insert( - format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), - append_result.total_size.to_string(), - ); - fi.metadata - .insert("x-rustfs-encryption-original-size".to_string(), append_result.total_size.to_string()); - if !has_checksums { - fi.erasure.checksums.clear(); - } - - let mut append_state = match get_append_state(&fi.metadata) { - Ok(Some(state)) => state, - Ok(None) => AppendState::default(), - Err(err) => { - return Err(StorageError::other(format!("failed to decode append state: {err}"))); - } - }; - append_state.state = AppendStateKind::Inline; - append_state.committed_length = append_result.total_size; - append_state.pending_segments.clear(); - append_state.epoch = append_state.epoch.saturating_add(1); - - set_append_state(&mut fi.metadata, &append_state) - .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; - fi.parts.clear(); - fi.add_object_part( - 1, - append_result.etag.clone(), - append_result.total_size as usize, - Some(now), - append_result.total_size, - None, - ); - - for meta in parts_metadata.iter_mut() { - if !meta.is_valid() { - continue; - } - meta.mod_time = fi.mod_time; - meta.size = fi.size; - meta.metadata = fi.metadata.clone(); - meta.parts = fi.parts.clone(); - meta.data = Some(inline_bytes.clone()); - meta.set_inline_data(); - if !has_checksums { - meta.erasure.checksums.clear(); - } - } - - let write_quorum = fi.write_quorum(self.default_write_quorum()); - - Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; - - object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); - object_info.etag = Some(append_result.etag.clone()); - - Ok(object_info) - } - - #[allow(clippy::too_many_arguments)] - async fn spill_inline_into_segmented( - &self, - mut fi: FileInfo, - mut parts_metadata: Vec, - bucket: &str, - object: &str, - plain_data: Vec, - etag: String, - opts: &ObjectOptions, - target_state: AppendStateKind, - ) -> Result { - let write_quorum = fi.write_quorum(self.default_write_quorum()); - let disks_guard = self.disks.read().await; - let shuffle_disks = Self::shuffle_disks(&disks_guard, &fi.erasure.distribution); - - let mut data_reader = PutObjReader::from_vec(plain_data.clone()); - let erasure = Arc::new(erasure_coding::Erasure::new( - fi.erasure.data_blocks, - fi.erasure.parity_blocks, - fi.erasure.block_size, - )); - - let mut append_state = match get_append_state(&fi.metadata) { - Ok(Some(state)) => state, - Ok(None) => fi.get_append_state(), - Err(err) => { - warn!( - ?err, - bucket, object, "failed to decode append state from metadata, falling back to inferred state" - ); - fi.get_append_state() - } - }; - - let tmp_root = format!("{}x{}", Uuid::new_v4(), OffsetDateTime::now_utc().unix_timestamp()); - let data_dir = Uuid::new_v4(); - let tmp_part_path = format!("{tmp_root}/{data_dir}/part.1"); - let final_part_path = format!("{}/{}/part.1", object, data_dir); - - let mut writers = Vec::with_capacity(shuffle_disks.len()); - let mut errors = Vec::with_capacity(shuffle_disks.len()); - for disk in shuffle_disks.iter() { - if let Some(disk) = disk { - let writer = create_bitrot_writer( - false, - Some(disk), - RUSTFS_META_TMP_BUCKET, - &tmp_part_path, - erasure.shard_file_size(data_reader.size()), - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await?; - writers.push(Some(writer)); - errors.push(None); - } else { - writers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - } - - let healthy_writers = errors.iter().filter(|err| err.is_none()).count(); - if healthy_writers < write_quorum { - if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) { - return Err(write_err.into()); - } - return Err(StorageError::other("not enough disks for spill")); - } - - let stream = mem::replace( - &mut data_reader.stream, - HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) - .map_err(StorageError::other)?, - ); - - let (reader, written_size) = erasure - .clone() - .encode(stream, &mut writers, write_quorum) - .await - .map_err(StorageError::other)?; - - let _ = mem::replace(&mut data_reader.stream, reader); - - if (written_size as i64) < data_reader.size() { - return Err(StorageError::other("spill write truncated payload")); - } - - drop(writers); - - let rename_result = Self::rename_part_data( - &shuffle_disks, - RUSTFS_META_TMP_BUCKET, - &tmp_part_path, - bucket, - &final_part_path, - write_quorum, - ) - .await; - - let cleanup_result = self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_root).await; - - let online_disks = match rename_result { - Ok(disks) => { - cleanup_result?; - disks - } - Err(err) => { - if let Err(clean_err) = cleanup_result { - warn!("spill cleanup failed after rename error: {clean_err:?}"); - } - return Err(err.into()); - } - }; - - let now = OffsetDateTime::now_utc(); - fi.mod_time = Some(now); - fi.size = plain_data.len() as i64; - fi.data = None; - fi.data_dir = Some(data_dir); - fi.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); - fi.metadata.insert("etag".to_owned(), etag.clone()); - fi.metadata - .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), fi.size.to_string()); - fi.metadata - .insert("x-rustfs-encryption-original-size".to_string(), fi.size.to_string()); - - fi.parts.clear(); - fi.add_object_part(1, etag.clone(), plain_data.len(), Some(now), fi.size, None); - - append_state.state = target_state; - append_state.committed_length = fi.size; - append_state.pending_segments.clear(); - append_state.epoch = append_state.epoch.saturating_add(1); - set_append_state(&mut fi.metadata, &append_state) - .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; - - for meta in parts_metadata.iter_mut() { - if !meta.is_valid() { - continue; - } - meta.mod_time = fi.mod_time; - meta.size = fi.size; - meta.data = None; - meta.data_dir = Some(data_dir); - meta.metadata = fi.metadata.clone(); - meta.parts = fi.parts.clone(); - meta.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); - } - - Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; - - let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); - object_info.etag = Some(etag); - - Ok(object_info) - } - - async fn load_pending_segment( - bucket: &str, - object: &str, - erasure: Arc, - disks: &[Option], - segment: &AppendSegment, - read_quorum: usize, - ) -> Result> { - let segment_dir = segment - .data_dir - .ok_or_else(|| StorageError::other("append segment missing data directory"))?; - let segment_path = format!("{}/append/{}/{segment_dir}/part.1", object, segment.epoch); - - let mut readers = Vec::with_capacity(disks.len()); - let mut errors = Vec::with_capacity(disks.len()); - for disk in disks.iter() { - if let Some(disk) = disk { - match create_bitrot_reader( - None, - Some(disk), - bucket, - &segment_path, - 0, - segment.length as usize, - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await - { - Ok(Some(reader)) => { - readers.push(Some(reader)); - errors.push(None); - } - Ok(None) => { - readers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - Err(err) => { - readers.push(None); - errors.push(Some(err)); - } - } - } else { - readers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - } - - if let Some(err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, read_quorum) { - return Err(err.into()); - } - - let mut writer = VecAsyncWriter::with_capacity(segment.length as usize); - let (written, err) = erasure - .decode(&mut writer, readers, 0, segment.length as usize, segment.length as usize) - .await; - if let Some(e) = err { - let de: DiskError = e.into(); - return Err(de.into()); - } - - if written < segment.length as usize { - return Err(StorageError::other("pending segment read truncated")); - } - - Ok(writer.into_inner()) - } - - async fn remove_pending_segments_data(&self, bucket: &str, object: &str, segments: &[AppendSegment]) -> Result<()> { - for segment in segments { - if let Some(dir) = segment.data_dir { - let prefix = format!("{}/append/{}/{}", object, segment.epoch, dir); - self.delete_all(bucket, &prefix).await?; - } - } - Ok(()) - } - - async fn complete_append_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let mut info_opts = opts.clone(); - info_opts.no_lock = true; - - let (fi, parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; - - let append_state = fi.get_append_state(); - if append_state.pending_segments.is_empty() { - return Err(StorageError::other("no pending segments to complete")); - } - - let pending_size: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); - let total_logical = append_state.committed_length.saturating_add(pending_size); - - let mut writer = VecAsyncWriter::with_capacity(total_logical as usize); - Self::get_object_with_fileinfo( - bucket, - object, - 0, - total_logical, - &mut writer, - fi.clone(), - parts_metadata.clone(), - &online_disks, - self.set_index, - self.pool_index, - ) - .await?; - - let plain_data = writer.into_inner(); - let final_etag = format!("{:x}", Md5::digest(&plain_data)); - - let result_info = self - .spill_inline_into_segmented( - fi, - parts_metadata, - bucket, - object, - plain_data, - final_etag, - opts, - AppendStateKind::SegmentedSealed, - ) - .await?; - - self.remove_pending_segments_data(bucket, object, &append_state.pending_segments) - .await?; - - Ok(result_info) - } - - async fn abort_append_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let mut info_opts = opts.clone(); - info_opts.no_lock = true; - - let (mut fi, mut parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; - - let mut append_state = fi.get_append_state(); - if append_state.pending_segments.is_empty() { - return Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended)); - } - - self.remove_pending_segments_data(bucket, object, &append_state.pending_segments) - .await?; - - append_state.pending_segments.clear(); - append_state.state = AppendStateKind::SegmentedSealed; - append_state.epoch = append_state.epoch.saturating_add(1); - - let mut committed_length = append_state.committed_length; - let actual_committed = if fi.inline_data() { - fi.data.as_ref().map(|buf| buf.len() as i64).unwrap_or(committed_length) - } else if fi.parts.is_empty() { - fi.size - } else { - fi.parts.iter().map(|part| part.size as i64).sum() - }; - - if actual_committed != committed_length { - warn!( - bucket, - object, - recorded_length = committed_length, - actual_length = actual_committed, - "abort append detected committed length mismatch, correcting" - ); - committed_length = actual_committed; - append_state.committed_length = actual_committed; - } - - fi.mod_time = Some(OffsetDateTime::now_utc()); - fi.size = committed_length; - fi.data = None; - fi.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); - - set_append_state(&mut fi.metadata, &append_state) - .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; - - let complete_parts: Vec = fi - .parts - .iter() - .map(|part| CompletePart { - part_num: part.number, - etag: Some(part.etag.clone()), - }) - .collect(); - let base_etag = if complete_parts.is_empty() { - fi.metadata.get("etag").cloned().unwrap_or_default() - } else { - get_complete_multipart_md5(&complete_parts) - }; - - fi.metadata.insert("etag".to_owned(), base_etag.clone()); - fi.metadata - .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), committed_length.to_string()); - fi.metadata - .insert("x-rustfs-encryption-original-size".to_string(), committed_length.to_string()); - - for meta in parts_metadata.iter_mut() { - if !meta.is_valid() { - continue; - } - meta.mod_time = fi.mod_time; - meta.size = fi.size; - meta.metadata = fi.metadata.clone(); - meta.parts = fi.parts.clone(); - meta.data = None; - meta.data_dir = fi.data_dir; - } - - let write_quorum = fi.write_quorum(self.default_write_quorum()); - Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; - - let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); - object_info.etag = Some(base_etag); - - Ok(object_info) - } - - #[allow(clippy::too_many_arguments)] - async fn append_segmented_object( - &self, - mut fi: FileInfo, - mut parts_metadata: Vec, - bucket: &str, - object: &str, - append_payload: Vec, - opts: &ObjectOptions, - position: i64, - ) -> Result { - let data_dir = fi - .data_dir - .ok_or_else(|| StorageError::other(format!("append requires existing data directory for {bucket}/{object}")))?; - - let mut append_state = match get_append_state(&fi.metadata) { - Ok(Some(state)) => state, - Ok(None) => fi.get_append_state(), - Err(err) => { - warn!(?err, bucket, object, "failed to decode append state from metadata, using inferred state"); - fi.get_append_state() - } - }; - - if matches!(append_state.state, AppendStateKind::Inline | AppendStateKind::InlinePendingSpill) { - return Err(StorageError::other("segmented append invoked while object still inline")); - } - - if append_state.state == AppendStateKind::SegmentedSealed { - append_state.state = AppendStateKind::SegmentedActive; - } - - let pending_length: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); - let expected_offset = append_state.committed_length.saturating_add(pending_length); - if position != expected_offset { - return Err(StorageError::InvalidArgument( - bucket.to_string(), - object.to_string(), - format!("append position mismatch: provided {position}, expected {expected_offset}"), - )); - } - - let new_length = append_payload.len() as i64; - validate_new_segment(&append_state, position, new_length) - .map_err(|err| StorageError::other(format!("invalid append segment: {err}")))?; - - let write_quorum = fi.write_quorum(self.default_write_quorum()); - let disks_guard = self.disks.read().await; - let shuffle_disks = Self::shuffle_disks(&disks_guard, &fi.erasure.distribution); - - let mut append_reader = PutObjReader::from_vec(append_payload); - let erasure = Arc::new(erasure_coding::Erasure::new( - fi.erasure.data_blocks, - fi.erasure.parity_blocks, - fi.erasure.block_size, - )); - - let tmp_root = format!("{}x{}", Uuid::new_v4(), OffsetDateTime::now_utc().unix_timestamp()); - let segment_id = Uuid::new_v4(); - let new_epoch = append_state.epoch.saturating_add(1); - let tmp_part_path = format!("{tmp_root}/append/{new_epoch}/{segment_id}/part.1"); - let final_part_path = format!("{}/append/{new_epoch}/{segment_id}/part.1", object); - - let mut writers = Vec::with_capacity(shuffle_disks.len()); - let mut errors = Vec::with_capacity(shuffle_disks.len()); - for disk in shuffle_disks.iter() { - if let Some(disk) = disk { - let writer = create_bitrot_writer( - false, - Some(disk), - RUSTFS_META_TMP_BUCKET, - &tmp_part_path, - erasure.shard_file_size(append_reader.size()), - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await?; - writers.push(Some(writer)); - errors.push(None); - } else { - writers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - } - - let healthy_writers = errors.iter().filter(|err| err.is_none()).count(); - if healthy_writers < write_quorum { - if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) { - return Err(write_err.into()); - } - return Err(StorageError::other("not enough disks for append")); - } - - let stream = mem::replace( - &mut append_reader.stream, - HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) - .map_err(StorageError::other)?, - ); - - let (reader, written_size) = erasure - .clone() - .encode(stream, &mut writers, write_quorum) - .await - .map_err(StorageError::other)?; - - let _ = mem::replace(&mut append_reader.stream, reader); - - if (written_size as i64) < append_reader.size() { - return Err(StorageError::other("append write truncated payload")); - } - - let mut part_etag = append_reader.stream.try_resolve_etag().unwrap_or_default(); - if let Some(ref tag) = opts.preserve_etag { - part_etag = tag.clone(); - } - - drop(writers); - - let rename_result = Self::rename_part_data( - &shuffle_disks, - RUSTFS_META_TMP_BUCKET, - &tmp_part_path, - bucket, - &final_part_path, - write_quorum, - ) - .await; - - let cleanup_result = self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_root).await; - - let online_disks = match rename_result { - Ok(disks) => { - cleanup_result?; - disks - } - Err(err) => { - if let Err(clean_err) = cleanup_result { - warn!("append cleanup failed after rename error: {clean_err:?}"); - } - return Err(err.into()); - } - }; - - let segment = AppendSegment { - offset: position, - length: new_length, - data_dir: Some(segment_id), - etag: Some(part_etag.clone()), - epoch: new_epoch, - }; - - append_state.pending_segments.push(segment); - append_state.epoch = new_epoch; - append_state.state = AppendStateKind::SegmentedActive; - - let logical_size = append_state - .committed_length - .saturating_add(append_state.pending_segments.iter().map(|seg| seg.length).sum()); - fi.size = logical_size; - fi.mod_time = Some(OffsetDateTime::now_utc()); - - // Update etag to include pending segments - let mut aggregate_parts: Vec = fi - .parts - .iter() - .map(|part| CompletePart { - part_num: part.number, - etag: Some(part.etag.clone()), - }) - .collect(); - let mut next_part_number = aggregate_parts.last().map(|p| p.part_num).unwrap_or(0); - for pending in append_state.pending_segments.iter() { - next_part_number += 1; - aggregate_parts.push(CompletePart { - part_num: next_part_number, - etag: pending.etag.clone(), - }); - } - let aggregate_etag = get_complete_multipart_md5(&aggregate_parts); - - fi.metadata.insert("etag".to_owned(), aggregate_etag.clone()); - fi.metadata - .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), logical_size.to_string()); - fi.metadata - .insert("x-rustfs-encryption-original-size".to_string(), logical_size.to_string()); - - set_append_state(&mut fi.metadata, &append_state) - .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; - - for meta in parts_metadata.iter_mut() { - if !meta.is_valid() { - continue; - } - meta.mod_time = fi.mod_time; - meta.size = fi.size; - meta.metadata = fi.metadata.clone(); - meta.parts = fi.parts.clone(); - meta.data = None; - meta.versioned = opts.versioned || opts.version_suspended; - } - - Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; - - let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); - object_info.etag = Some(aggregate_etag); - - Ok(object_info) - } - async fn check_write_precondition(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Option { let mut opts = opts.clone(); @@ -4606,16 +3594,6 @@ impl ObjectIO for SetDisks { None }; - if opts.append_object { - if opts.http_preconditions.clone().is_some() { - if let Some(err) = self.check_write_precondition(bucket, object, opts).await { - return Err(err); - } - } - - return self.append_inline_object(bucket, object, data, opts).await; - } - if let Some(http_preconditions) = opts.http_preconditions.clone() { if let Some(err) = self.check_write_precondition(bucket, object, opts).await { return Err(err); @@ -4842,37 +3820,6 @@ impl ObjectIO for SetDisks { } } -struct VecAsyncWriter { - buffer: Vec, -} - -impl VecAsyncWriter { - fn with_capacity(capacity: usize) -> Self { - Self { - buffer: Vec::with_capacity(capacity), - } - } - - fn into_inner(self) -> Vec { - self.buffer - } -} - -impl AsyncWrite for VecAsyncWriter { - fn poll_write(mut self: Pin<&mut Self>, _cx: &mut Context<'_>, buf: &[u8]) -> Poll> { - self.buffer.extend_from_slice(buf); - Poll::Ready(Ok(buf.len())) - } - - fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { - Poll::Ready(Ok(())) - } - - fn poll_shutdown(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { - Poll::Ready(Ok(())) - } -} - #[async_trait::async_trait] impl StorageAPI for SetDisks { #[tracing::instrument(skip(self))] @@ -5287,42 +4234,6 @@ impl StorageAPI for SetDisks { (del_objects, del_errs) } - async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let mut info_opts = opts.clone(); - info_opts.no_lock = true; - - let _object_lock_guard = if !opts.no_lock { - Some( - self.fast_lock_manager - .acquire_write_lock("", object, self.locker_owner.as_str()) - .await - .map_err(|_| Error::other("can not get lock. please retry".to_string()))?, - ) - } else { - None - }; - - self.complete_append_object(bucket, object, &info_opts).await - } - - async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let mut info_opts = opts.clone(); - info_opts.no_lock = true; - - let _object_lock_guard = if !opts.no_lock { - Some( - self.fast_lock_manager - .acquire_write_lock("", object, self.locker_owner.as_str()) - .await - .map_err(|_| Error::other("can not get lock. please retry".to_string()))?, - ) - } else { - None - }; - - self.abort_append_object(bucket, object, &info_opts).await - } - #[tracing::instrument(skip(self))] async fn delete_object(&self, bucket: &str, object: &str, mut opts: ObjectOptions) -> Result { // Guard lock for single object delete diff --git a/crates/ecstore/src/sets.rs b/crates/ecstore/src/sets.rs index 26ca4006..02a95179 100644 --- a/crates/ecstore/src/sets.rs +++ b/crates/ecstore/src/sets.rs @@ -602,14 +602,6 @@ impl StorageAPI for Sets { (del_objects, del_errs) } - async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - self.get_disks_by_key(object).complete_append(bucket, object, opts).await - } - - async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - self.get_disks_by_key(object).abort_append(bucket, object, opts).await - } - async fn list_object_parts( &self, bucket: &str, diff --git a/crates/ecstore/src/store.rs b/crates/ecstore/src/store.rs index 0b725172..41a15b2a 100644 --- a/crates/ecstore/src/store.rs +++ b/crates/ecstore/src/store.rs @@ -1709,17 +1709,6 @@ impl StorageAPI for ECStore { // Ok((del_objects, del_errs)) } - async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let object = encode_dir_object(object); - let (pinfo, _) = self.internal_get_pool_info_existing_with_opts(bucket, &object, opts).await?; - self.pools[pinfo.index].complete_append(bucket, &object, opts).await - } - - async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { - let object = encode_dir_object(object); - let (pinfo, _) = self.internal_get_pool_info_existing_with_opts(bucket, &object, opts).await?; - self.pools[pinfo.index].abort_append(bucket, &object, opts).await - } #[tracing::instrument(skip(self))] async fn list_object_parts( &self, diff --git a/crates/ecstore/src/store_api.rs b/crates/ecstore/src/store_api.rs index 13a5b9da..ce1a3cce 100644 --- a/crates/ecstore/src/store_api.rs +++ b/crates/ecstore/src/store_api.rs @@ -328,8 +328,6 @@ pub struct ObjectOptions { pub max_parity: bool, pub mod_time: Option, pub part_number: Option, - pub append_object: bool, - pub append_position: Option, pub delete_prefix: bool, pub delete_prefix_object: bool, @@ -658,15 +656,6 @@ impl ObjectInfo { }) .collect(); - let append_state = fi.get_append_state(); - let pending_length: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); - let logical_size = append_state.committed_length.saturating_add(pending_length); - let actual_size_meta = fi - .metadata - .get(&format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size")) - .and_then(|o| o.parse::().ok()) - .unwrap_or(logical_size); - ObjectInfo { bucket: bucket.to_string(), name, @@ -676,7 +665,7 @@ impl ObjectInfo { version_id, delete_marker: fi.deleted, mod_time: fi.mod_time, - size: logical_size, + size: fi.size, parts, is_latest: fi.is_latest, user_tags, @@ -688,7 +677,6 @@ impl ObjectInfo { inlined, user_defined: metadata, transitioned_object, - actual_size: actual_size_meta, ..Default::default() } } @@ -1200,10 +1188,6 @@ pub trait StorageAPI: ObjectIO + Debug { opts: ObjectOptions, ) -> (Vec, Vec>); - async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result; - - async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result; - // TransitionObject TODO: // RestoreTransitionedObject TODO: diff --git a/crates/filemeta/Cargo.toml b/crates/filemeta/Cargo.toml index a8e21d20..5c7a3589 100644 --- a/crates/filemeta/Cargo.toml +++ b/crates/filemeta/Cargo.toml @@ -30,7 +30,6 @@ crc32fast = { workspace = true } rmp.workspace = true rmp-serde.workspace = true serde.workspace = true -serde_json.workspace = true time.workspace = true uuid = { workspace = true, features = ["v4", "fast-rng", "serde"] } tokio = { workspace = true, features = ["io-util", "macros", "sync"] } diff --git a/crates/filemeta/src/append.rs b/crates/filemeta/src/append.rs deleted file mode 100644 index 4eca1d62..00000000 --- a/crates/filemeta/src/append.rs +++ /dev/null @@ -1,541 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::error::{Error, Result}; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use uuid::Uuid; - -const APPEND_STATE_META_KEY: &str = "x-rustfs-internal-append-state"; - -/// Tracks the state of append-enabled objects. -#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] -pub struct AppendState { - pub state: AppendStateKind, - pub epoch: u64, - pub committed_length: i64, - pub pending_segments: Vec, -} - -/// Represents individual append segments that still need consolidation. -#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] -pub struct AppendSegment { - pub offset: i64, - pub length: i64, - pub data_dir: Option, - pub etag: Option, - pub epoch: u64, -} - -/// Possible append lifecycle states for an object version. -#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] -pub enum AppendStateKind { - #[default] - Disabled, - Inline, - InlinePendingSpill, - SegmentedActive, - SegmentedSealed, -} - -/// Persist the provided append state into object metadata. -pub fn set_append_state(metadata: &mut HashMap, state: &AppendState) -> Result<()> { - let encoded = serde_json::to_string(state).map_err(Error::other)?; - metadata.insert(APPEND_STATE_META_KEY.to_string(), encoded); - Ok(()) -} - -/// Remove the append state marker from metadata. -pub fn clear_append_state(metadata: &mut HashMap) { - metadata.remove(APPEND_STATE_META_KEY); -} - -/// Load append state stored in metadata, if any. -pub fn get_append_state(metadata: &HashMap) -> Result> { - let raw = match metadata.get(APPEND_STATE_META_KEY) { - Some(val) if !val.is_empty() => val, - _ => return Ok(None), - }; - - let decoded = serde_json::from_str(raw).map_err(Error::other)?; - Ok(Some(decoded)) -} - -/// Complete append operations by consolidating pending segments and sealing the object -pub fn complete_append_operation(state: &mut AppendState) -> Result<()> { - match state.state { - AppendStateKind::SegmentedActive => { - // Move all pending segments data to main parts and seal - state.committed_length += state.pending_segments.iter().map(|s| s.length).sum::(); - state.pending_segments.clear(); - state.state = AppendStateKind::SegmentedSealed; - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::Inline => { - // Inline objects are always immediately committed, just seal them - state.state = AppendStateKind::SegmentedSealed; // Transition to sealed - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::InlinePendingSpill => { - // Wait for spill to complete, then seal - // In practice, this might need to trigger the spill completion first - state.state = AppendStateKind::SegmentedSealed; - state.pending_segments.clear(); - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::SegmentedSealed | AppendStateKind::Disabled => { - // Already sealed or disabled - Err(Error::other("Cannot complete append on sealed or disabled object")) - } - } -} - -/// Abort append operations by discarding pending segments and returning to sealed state -pub fn abort_append_operation(state: &mut AppendState) -> Result<()> { - match state.state { - AppendStateKind::SegmentedActive => { - // Discard all pending segments and seal - state.pending_segments.clear(); - state.state = AppendStateKind::SegmentedSealed; - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::Inline => { - // Inline data is already committed, just seal - state.state = AppendStateKind::SegmentedSealed; - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::InlinePendingSpill => { - // Cancel spill and keep inline data, then seal - state.state = AppendStateKind::SegmentedSealed; - state.pending_segments.clear(); - state.epoch = state.epoch.saturating_add(1); - Ok(()) - } - AppendStateKind::SegmentedSealed | AppendStateKind::Disabled => { - // Already sealed or disabled - Err(Error::other("Cannot abort append on sealed or disabled object")) - } - } -} - -/// Check if an append operation can be completed -pub fn can_complete_append(state: &AppendState) -> bool { - matches!( - state.state, - AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive - ) -} - -/// Check if an append operation can be aborted -pub fn can_abort_append(state: &AppendState) -> bool { - matches!( - state.state, - AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive - ) -} - -/// Verify epoch for optimistic concurrency control -pub fn verify_append_epoch(current_state: &AppendState, expected_epoch: u64) -> Result<()> { - if current_state.epoch != expected_epoch { - Err(Error::other(format!( - "Append operation conflict: expected epoch {}, found {}", - expected_epoch, current_state.epoch - ))) - } else { - Ok(()) - } -} - -/// Prepare next append operation by incrementing epoch -pub fn prepare_next_append(state: &mut AppendState) { - state.epoch = state.epoch.saturating_add(1); -} - -/// Validate that a new append segment doesn't conflict with existing segments -pub fn validate_new_segment(state: &AppendState, new_offset: i64, new_length: i64) -> Result<()> { - let new_end = new_offset + new_length; - - // Check it doesn't overlap with committed data - if new_offset < state.committed_length { - return Err(Error::other(format!( - "New segment overlaps with committed data: offset {} < committed_length {}", - new_offset, state.committed_length - ))); - } - - // Check it doesn't overlap with existing pending segments - for existing in &state.pending_segments { - let existing_start = existing.offset; - let existing_end = existing.offset + existing.length; - - // Check for any overlap - if new_offset < existing_end && new_end > existing_start { - return Err(Error::other(format!( - "New segment [{}, {}) overlaps with existing segment [{}, {})", - new_offset, new_end, existing_start, existing_end - ))); - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::fileinfo::FileInfo; - - #[test] - fn append_state_roundtrip_in_metadata() { - let mut metadata = HashMap::new(); - let state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 42, - committed_length: 2048, - pending_segments: vec![AppendSegment { - offset: 2048, - length: 512, - data_dir: Some(Uuid::new_v4()), - etag: Some("abc123".to_string()), - epoch: 0, - }], - }; - - set_append_state(&mut metadata, &state).expect("persist append state"); - assert!(metadata.contains_key(APPEND_STATE_META_KEY)); - - let decoded = get_append_state(&metadata) - .expect("decode append state") - .expect("state present"); - assert_eq!(decoded, state); - - clear_append_state(&mut metadata); - assert!(!metadata.contains_key(APPEND_STATE_META_KEY)); - assert!(get_append_state(&metadata).unwrap().is_none()); - } - - #[test] - fn fileinfo_append_state_migration_compatibility() { - // Test old inline data object - let mut inline_fi = FileInfo { - size: 1024, - ..Default::default() - }; - inline_fi.set_inline_data(); - - let state = inline_fi.get_append_state(); - assert_eq!(state.state, AppendStateKind::Inline); - assert_eq!(state.committed_length, 1024); - assert!(state.pending_segments.is_empty()); - assert!(inline_fi.is_appendable()); - assert!(!inline_fi.has_pending_appends()); - - // Test old regular object - let regular_fi = FileInfo { - size: 2048, - ..Default::default() - }; - // No inline_data marker - - let state = regular_fi.get_append_state(); - assert_eq!(state.state, AppendStateKind::SegmentedSealed); - assert_eq!(state.committed_length, 2048); - assert!(state.pending_segments.is_empty()); - assert!(!regular_fi.is_appendable()); - assert!(!regular_fi.has_pending_appends()); - - // Test explicit append state - let mut append_fi = FileInfo::default(); - let explicit_state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 5, - committed_length: 1500, - pending_segments: vec![AppendSegment { - offset: 1500, - length: 300, - data_dir: Some(Uuid::new_v4()), - etag: Some("def456".to_string()), - epoch: 0, - }], - }; - - append_fi.set_append_state(&explicit_state).expect("set explicit state"); - let retrieved_state = append_fi.get_append_state(); - assert_eq!(retrieved_state, explicit_state); - assert!(append_fi.is_appendable()); - assert!(append_fi.has_pending_appends()); - } - - #[test] - fn append_state_transitions() { - // Test state transition validation - assert_eq!(AppendStateKind::default(), AppendStateKind::Disabled); - - let inline_state = AppendState { - state: AppendStateKind::Inline, - ..Default::default() - }; - - let spill_state = AppendState { - state: AppendStateKind::InlinePendingSpill, - ..Default::default() - }; - - let active_state = AppendState { - state: AppendStateKind::SegmentedActive, - ..Default::default() - }; - - let sealed_state = AppendState { - state: AppendStateKind::SegmentedSealed, - ..Default::default() - }; - - // Verify serialization works for all states - for state in [inline_state, spill_state, active_state, sealed_state] { - let mut metadata = HashMap::new(); - set_append_state(&mut metadata, &state).expect("serialize state"); - let decoded = get_append_state(&metadata).unwrap().unwrap(); - assert_eq!(decoded, state); - } - } - - #[test] - fn complete_append_transitions() { - // Test completing SegmentedActive with pending segments - let mut active_state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 5, - committed_length: 1000, - pending_segments: vec![ - AppendSegment { - offset: 1000, - length: 200, - data_dir: Some(Uuid::new_v4()), - etag: Some("abc123".to_string()), - epoch: 0, - }, - AppendSegment { - offset: 1200, - length: 300, - data_dir: Some(Uuid::new_v4()), - etag: Some("def456".to_string()), - epoch: 0, - }, - ], - }; - - assert!(can_complete_append(&active_state)); - complete_append_operation(&mut active_state).expect("complete should succeed"); - - assert_eq!(active_state.state, AppendStateKind::SegmentedSealed); - assert_eq!(active_state.committed_length, 1500); // 1000 + 200 + 300 - assert!(active_state.pending_segments.is_empty()); - assert_eq!(active_state.epoch, 6); - - // Test completing Inline state - let mut inline_state = AppendState { - state: AppendStateKind::Inline, - epoch: 2, - committed_length: 500, - ..Default::default() - }; - - assert!(can_complete_append(&inline_state)); - complete_append_operation(&mut inline_state).expect("complete should succeed"); - - assert_eq!(inline_state.state, AppendStateKind::SegmentedSealed); - assert_eq!(inline_state.committed_length, 500); // Unchanged - assert_eq!(inline_state.epoch, 3); - - // Test completing already sealed state should fail - let mut sealed_state = AppendState { - state: AppendStateKind::SegmentedSealed, - ..Default::default() - }; - - assert!(!can_complete_append(&sealed_state)); - assert!(complete_append_operation(&mut sealed_state).is_err()); - } - - #[test] - fn abort_append_transitions() { - // Test aborting SegmentedActive with pending segments - let mut active_state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 3, - committed_length: 800, - pending_segments: vec![AppendSegment { - offset: 800, - length: 400, - data_dir: Some(Uuid::new_v4()), - etag: Some("xyz789".to_string()), - epoch: 0, - }], - }; - - assert!(can_abort_append(&active_state)); - abort_append_operation(&mut active_state).expect("abort should succeed"); - - assert_eq!(active_state.state, AppendStateKind::SegmentedSealed); - assert_eq!(active_state.committed_length, 800); // Unchanged, pending discarded - assert!(active_state.pending_segments.is_empty()); - assert_eq!(active_state.epoch, 4); - - // Test aborting InlinePendingSpill - let mut spill_state = AppendState { - state: AppendStateKind::InlinePendingSpill, - epoch: 1, - committed_length: 100, - pending_segments: vec![], - }; - - assert!(can_abort_append(&spill_state)); - abort_append_operation(&mut spill_state).expect("abort should succeed"); - - assert_eq!(spill_state.state, AppendStateKind::SegmentedSealed); - assert_eq!(spill_state.committed_length, 100); - assert_eq!(spill_state.epoch, 2); - - // Test aborting disabled state should fail - let mut disabled_state = AppendState { - state: AppendStateKind::Disabled, - ..Default::default() - }; - - assert!(!can_abort_append(&disabled_state)); - assert!(abort_append_operation(&mut disabled_state).is_err()); - } - - #[test] - fn epoch_validation() { - let state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 10, - committed_length: 1000, - pending_segments: vec![], - }; - - // Valid epoch should succeed - assert!(verify_append_epoch(&state, 10).is_ok()); - - // Invalid epoch should fail - assert!(verify_append_epoch(&state, 9).is_err()); - assert!(verify_append_epoch(&state, 11).is_err()); - - // Error message should contain epoch information - let error = verify_append_epoch(&state, 5).unwrap_err(); - let error_msg = error.to_string(); - assert!(error_msg.contains("expected epoch 5")); - assert!(error_msg.contains("found 10")); - } - - #[test] - fn next_append_preparation() { - let mut state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 5, - committed_length: 1000, - pending_segments: vec![], - }; - - prepare_next_append(&mut state); - assert_eq!(state.epoch, 6); - - // Test saturation behavior - let mut max_state = AppendState { - epoch: u64::MAX, - ..Default::default() - }; - - prepare_next_append(&mut max_state); - assert_eq!(max_state.epoch, u64::MAX); // Should saturate, not overflow - } - - #[test] - fn segment_validation() { - let state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 3, - committed_length: 1000, - pending_segments: vec![ - AppendSegment { - offset: 1000, - length: 200, - data_dir: Some(Uuid::new_v4()), - etag: Some("abc123".to_string()), - epoch: 0, - }, - AppendSegment { - offset: 1300, - length: 300, - data_dir: Some(Uuid::new_v4()), - etag: Some("def456".to_string()), - epoch: 0, - }, - ], - }; - - // Valid segment after existing segments - assert!(validate_new_segment(&state, 1600, 100).is_ok()); - - // Valid segment filling gap between committed and first pending - assert!(validate_new_segment(&state, 1200, 100).is_ok()); - - // Invalid segment overlapping with committed data - assert!(validate_new_segment(&state, 900, 200).is_err()); - let error = validate_new_segment(&state, 900, 200).unwrap_err(); - assert!(error.to_string().contains("overlaps with committed data")); - - // Invalid segment overlapping with first pending segment - assert!(validate_new_segment(&state, 1100, 100).is_err()); - let error = validate_new_segment(&state, 1100, 100).unwrap_err(); - assert!(error.to_string().contains("overlaps with existing segment")); - - // Invalid segment overlapping with second pending segment - assert!(validate_new_segment(&state, 1400, 100).is_err()); - - // Edge case: segment exactly touching committed data (should be valid) - assert!(validate_new_segment(&state, 1000, 0).is_ok()); - - // Edge case: segment exactly touching existing segment (should be valid) - assert!(validate_new_segment(&state, 1200, 0).is_ok()); - } - - #[test] - fn segment_validation_edge_cases() { - let empty_state = AppendState { - state: AppendStateKind::SegmentedActive, - epoch: 1, - committed_length: 500, - pending_segments: vec![], - }; - - // First segment after committed data - assert!(validate_new_segment(&empty_state, 500, 100).is_ok()); - assert!(validate_new_segment(&empty_state, 600, 200).is_ok()); - - // Zero-length segments (edge case) - assert!(validate_new_segment(&empty_state, 500, 0).is_ok()); - - // Segment exactly at committed boundary - assert!(validate_new_segment(&empty_state, 499, 1).is_err()); - assert!(validate_new_segment(&empty_state, 500, 1).is_ok()); - } -} diff --git a/crates/filemeta/src/fileinfo.rs b/crates/filemeta/src/fileinfo.rs index b15d3c2c..b6fefe5d 100644 --- a/crates/filemeta/src/fileinfo.rs +++ b/crates/filemeta/src/fileinfo.rs @@ -494,96 +494,6 @@ impl FileInfo { ReplicationStatusType::Empty } } - /// Get the append state for this FileInfo, with migration compatibility - pub fn get_append_state(&self) -> crate::append::AppendState { - use crate::append::{AppendState, AppendStateKind, get_append_state}; - - // Try to load from metadata first - if let Ok(Some(state)) = get_append_state(&self.metadata) { - return state; - } - - // Migration compatibility: determine state based on existing data - if self.inline_data() { - // Has inline data, treat as Inline state - AppendState { - state: AppendStateKind::Inline, - epoch: 0, - committed_length: self.size, - pending_segments: Vec::new(), - } - } else { - // No inline data, treat as SegmentedSealed (traditional object) - AppendState { - state: AppendStateKind::SegmentedSealed, - epoch: 0, - committed_length: self.size, - pending_segments: Vec::new(), - } - } - } - - /// Set the append state for this FileInfo - pub fn set_append_state(&mut self, state: &crate::append::AppendState) -> crate::error::Result<()> { - crate::append::set_append_state(&mut self.metadata, state) - } - - /// Check if this object supports append operations - pub fn is_appendable(&self) -> bool { - use crate::append::AppendStateKind; - match self.get_append_state().state { - AppendStateKind::Disabled => false, - AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive => true, - AppendStateKind::SegmentedSealed => false, - } - } - - /// Check if this object has pending append operations - pub fn has_pending_appends(&self) -> bool { - use crate::append::AppendStateKind; - matches!( - self.get_append_state().state, - AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive - ) - } - - /// Complete all pending append operations and seal the object - pub fn complete_append(&mut self) -> crate::error::Result<()> { - let mut append_state = self.get_append_state(); - crate::append::complete_append_operation(&mut append_state)?; - self.set_append_state(&append_state)?; - - // Update file size to reflect completed operation - if append_state.state == crate::append::AppendStateKind::SegmentedSealed { - self.size = append_state.committed_length; - } - - Ok(()) - } - - /// Abort all pending append operations and seal the object - pub fn abort_append(&mut self) -> crate::error::Result<()> { - let mut append_state = self.get_append_state(); - crate::append::abort_append_operation(&mut append_state)?; - self.set_append_state(&append_state)?; - - // Update file size to only include committed data - if append_state.state == crate::append::AppendStateKind::SegmentedSealed { - self.size = append_state.committed_length; - } - - Ok(()) - } - - /// Check if append operations can be completed for this object - pub fn can_complete_append(&self) -> bool { - crate::append::can_complete_append(&self.get_append_state()) - } - - /// Check if append operations can be aborted for this object - pub fn can_abort_append(&self) -> bool { - crate::append::can_abort_append(&self.get_append_state()) - } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/crates/filemeta/src/lib.rs b/crates/filemeta/src/lib.rs index c484f52b..dc7fa4fd 100644 --- a/crates/filemeta/src/lib.rs +++ b/crates/filemeta/src/lib.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod append; mod error; pub mod fileinfo; mod filemeta; @@ -23,7 +22,6 @@ mod replication; pub mod test_data; -pub use append::*; pub use error::*; pub use fileinfo::*; pub use filemeta::*; diff --git a/crates/utils/src/dns_resolver.rs b/crates/utils/src/dns_resolver.rs index 12ec3bc6..43b2d1b1 100644 --- a/crates/utils/src/dns_resolver.rs +++ b/crates/utils/src/dns_resolver.rs @@ -420,7 +420,6 @@ mod tests { } #[tokio::test] - #[ignore] async fn test_invalid_domain_resolution() { let resolver = LayeredDnsResolver::new().await.unwrap(); diff --git a/docs/append_write/README.md b/docs/append_write/README.md deleted file mode 100644 index 7be7a631..00000000 --- a/docs/append_write/README.md +++ /dev/null @@ -1,147 +0,0 @@ -# Append Write Design - -This document captures the current design of the append-write feature in RustFS so that new contributors can quickly understand the moving parts, data flows, and testing expectations. - -## Goals & Non-Goals - -### Goals -- Allow clients to append payloads to existing objects without re-uploading the full body. -- Support inline objects and spill seamlessly into segmented layout once thresholds are exceeded. -- Preserve strong read-after-write semantics via optimistic concurrency controls (ETag / epoch). -- Expose minimal S3-compatible surface area (`x-amz-object-append`, `x-amz-append-position`, `x-amz-append-action`). - -### Non-Goals -- Full multipart-upload parity; append is intentionally simpler and serialized per object. -- Cross-object transactions; each object is isolated. -- Rebalancing or background compaction (future work). - -## State Machine - -Append state is persisted inside `FileInfo.metadata` under `x-rustfs-internal-append-state` and serialized as `AppendState` (`crates/filemeta/src/append.rs`). - -``` -Disabled --(initial PUT w/o append)--> SegmentedSealed -Inline --(inline append)--> Inline / InlinePendingSpill -InlinePendingSpill --(spill success)--> SegmentedActive -SegmentedActive --(Complete)--> SegmentedSealed -SegmentedActive --(Abort)--> SegmentedSealed -SegmentedSealed --(new append)--> SegmentedActive -``` - -Definitions: -- **Inline**: Object data fully stored in metadata (`FileInfo.data`). -- **InlinePendingSpill**: Inline data after append exceeded inline threshold; awaiting spill to disk. -- **SegmentedActive**: Object data lives in erasure-coded part(s) plus one or more pending append segments on disk (`append//`). -- **SegmentedSealed**: No pending segments; logical content equals committed parts. - -`AppendState` fields: -- `state`: current state enum (see above). -- `epoch`: monotonically increasing counter for concurrency control. -- `committed_length`: logical size already durable in the base parts/inline region. -- `pending_segments`: ordered list of `AppendSegment { offset, length, data_dir, etag, epoch }`. - -## Metadata & Storage Layout - -### Inline Objects -- Inline payload stored in `FileInfo.data`. -- Hash metadata maintained through `append_inline_data` (re-encoding with bitrot writer when checksums exist). -- When spilling is required, inline data is decoded, appended, and re-encoded into erasure shards written to per-disk `append///part.1` temporary path before rename to primary data directory. - -### Segmented Objects -- Base object content is represented by standard erasure-coded parts (`FileInfo.parts`, `FileInfo.data_dir`). -- Pending append segments live under `/append///part.1` (per disk). -- Each append stores segment metadata (`etag`, `offset`, `length`) inside `AppendState.pending_segments` and updates `FileInfo.size` to include pending bytes. -- Aggregate ETag is recomputed using multipart MD5 helper (`get_complete_multipart_md5`). - -### Metadata Writes -- `SetDisks::write_unique_file_info` persists `FileInfo` updates to the quorum of disks. -- During spill/append/complete/abort, all mirrored `FileInfo` copies within `parts_metadata` are updated to keep nodes consistent. -- Abort ensures inline markers are cleared (`x-rustfs-internal-inline-data`) and `FileInfo.data = None` to avoid stale inline reads. - -## Request Flows - -### Append (Inline Path) -1. Handler (`rustfs/src/storage/ecfs.rs`) validates headers and fills `ObjectOptions.append_*`. -2. `SetDisks::append_inline_object` verifies append position using `AppendState` snapshot. -3. Existing inline payload decoded (if checksums present) and appended in-memory (`append_inline_data`). -4. Storage class decision determines whether to remain inline or spill. -5. Inline success updates `FileInfo.data`, metadata, `AppendState` (state `Inline`, lengths updated). -6. Spill path delegates to `spill_inline_into_segmented` (see segmented path below). - -### Append (Segmented Path) -1. `SetDisks::append_segmented_object` validates state (must be `SegmentedActive` or `SegmentedSealed`). -2. Snapshot expected offset = committed length + sum of pending segments. -3. Payload encoded using erasure coding; shards written to temp volume; renamed into `append//` under object data directory. -4. New `AppendSegment` pushed, `AppendState.epoch` incremented, aggregated ETag recalculated. -5. `FileInfo.size` reflects committed + pending bytes; metadata persisted across quorum. - -### GET / Range Reads -1. `SetDisks::get_object_with_fileinfo` inspects `AppendState`. -2. Reads committed data from inline or erasure parts (ignoring inline buffers once segmented). -3. If requested range includes pending segments, loader fetches each segment via `load_pending_segment`, decodes shards, and streams appended bytes. - -### Complete Append (`x-amz-append-action: complete`) -1. `complete_append_object` fetches current `FileInfo`, ensures pending segments exist. -2. Entire logical object (committed + pending) streamed through `VecAsyncWriter` (TODO: potential optimization) to produce contiguous payload. -3. Inline spill routine (`spill_inline_into_segmented`) consolidates data into primary part, sets state `SegmentedSealed`, clears pending list, updates `committed_length`. -4. Pending segment directories removed and quorum metadata persisted. - -### Abort Append (`x-amz-append-action: abort`) -1. `abort_append_object` removes pending segment directories. -2. Ensures `committed_length` matches actual durable data (inline length or sum of parts); logs and corrects if mismatch is found. -3. Clears pending list, sets state `SegmentedSealed`, bumps epoch, removes inline markers/data. -4. Persists metadata and returns base ETag (multipart MD5 of committed parts). - -## Error Handling & Recovery - -- All disk writes go through quorum helpers (`reduce_write_quorum_errs`, `reduce_read_quorum_errs`) and propagate `StorageError` variants for HTTP mapping. -- Append operations are single-threaded per object via locking in higher layers (`fast_lock_manager` in `SetDisks::put_object`). -- On spill/append rename failure, temp directories are cleaned up; operation aborts without mutating metadata. -- Abort path now realigns `committed_length` if metadata drifted (observed during development) and strips inline remnants to prevent stale reads. -- Pending segments are only removed once metadata update succeeds; no partial deletion is performed ahead of state persistence. - -## Concurrency - -- Append requests rely on exact `x-amz-append-position` to ensure the client has an up-to-date view. -- Optional header `If-Match` is honored in S3 handler before actual append (shared with regular PUT path). -- `AppendState.epoch` increments after each append/complete/abort; future work may expose it for stronger optimistic control. -- e2e test `append_segments_concurrency_then_complete` verifies that simultaneous appends result in exactly one success; the loser receives 400. - -## Key Modules - -- `crates/ecstore/src/set_disk.rs`: core implementation (inline append, spill, segmented append, complete, abort, GET integration). -- `crates/ecstore/src/erasure_coding/{encode,decode}.rs`: encode/decode helpers used by append pipeline. -- `crates/filemeta/src/append.rs`: metadata schema + helper functions. -- `rustfs/src/storage/ecfs.rs`: HTTP/S3 layer that parses headers and routes to append operations. - -## Testing Strategy - -### Unit Tests -- `crates/filemeta/src/append.rs` covers serialization and state transitions. -- `crates/ecstore/src/set_disk.rs` contains lower-level utilities and regression tests for metadata helpers. -- Additional unit coverage is recommended for spill/append failure paths (e.g., injected rename failures). - -### End-to-End Tests (`cargo test --package e2e_test append`) -- Inline append success, wrong position, precondition failures. -- Segmented append success, wrong position, wrong ETag. -- Spill threshold transition (`append_threshold_crossing_inline_to_segmented`). -- Pending segment streaming (`append_range_requests_across_segments`). -- Complete append consolidates pending segments. -- Abort append discards pending data and allows new append. -- Concurrency: two clients racing to append, followed by additional append + complete. - -### Tooling Considerations -- `make clippy` must pass; the append code relies on async operations and custom logging. -- `make test` / `cargo nextest run` recommended before submitting PRs. -- Use `RUST_LOG=rustfs_ecstore=debug` when debugging append flows; targeted `info!`/`warn!` logs are emitted during spill/abort. - -## Future Work - -- Streamed consolidation in `complete_append_object` to avoid buffering entire logical object. -- Throttling or automatic `Complete` when pending segments exceed size/quantity thresholds. -- Stronger epoch exposure to clients (header-based conflict detection). -- Automated cleanup or garbage collection for orphaned `append/*` directories. - ---- - -For questions or design discussions, drop a note in the append-write channel or ping the storage team. diff --git a/rustfs/src/storage/ecfs.rs b/rustfs/src/storage/ecfs.rs index 29b72e41..c67d5ddf 100644 --- a/rustfs/src/storage/ecfs.rs +++ b/rustfs/src/storage/ecfs.rs @@ -2288,92 +2288,10 @@ impl S3 for FS { let mt = metadata.clone(); let mt2 = metadata.clone(); - let append_flag = req.headers.get("x-amz-object-append"); - let append_action_header = req.headers.get("x-amz-append-action"); - let mut append_requested = false; - let mut append_position: Option = None; - if let Some(flag_value) = append_flag { - let flag_str = flag_value.to_str().map_err(|_| { - S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-object-append header".to_string()) - })?; - if flag_str.eq_ignore_ascii_case("true") { - append_requested = true; - let position_value = req.headers.get("x-amz-append-position").ok_or_else(|| { - S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-append-position header required when x-amz-object-append is true".to_string(), - ) - })?; - let position_str = position_value.to_str().map_err(|_| { - S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-append-position header".to_string()) - })?; - let position = position_str.parse::().map_err(|_| { - S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-append-position must be a non-negative integer".to_string(), - ) - })?; - if position < 0 { - return Err(S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-append-position must be a non-negative integer".to_string(), - )); - } - append_position = Some(position); - } else if !flag_str.eq_ignore_ascii_case("false") { - return Err(S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-object-append must be 'true' or 'false'".to_string(), - )); - } - } - - let mut append_action: Option = None; - if let Some(action_value) = append_action_header { - let action_str = action_value.to_str().map_err(|_| { - S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-append-action header".to_string()) - })?; - append_action = Some(action_str.to_ascii_lowercase()); - } - let mut opts: ObjectOptions = put_opts(&bucket, &key, version_id, &req.headers, mt) .await .map_err(ApiError::from)?; - if append_requested { - opts.append_object = true; - opts.append_position = append_position; - } - - if let Some(action) = append_action { - if append_requested { - return Err(S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-object-append cannot be combined with x-amz-append-action".to_string(), - )); - } - - let obj_info = match action.as_str() { - "complete" => store.complete_append(&bucket, &key, &opts).await, - "abort" => store.abort_append(&bucket, &key, &opts).await, - _ => { - return Err(S3Error::with_message( - S3ErrorCode::InvalidArgument, - "x-amz-append-action must be 'complete' or 'abort'".to_string(), - )); - } - } - .map_err(ApiError::from)?; - - let output = PutObjectOutput { - e_tag: obj_info.etag.clone(), - version_id: obj_info.version_id.map(|v| v.to_string()), - ..Default::default() - }; - - return Ok(S3Response::new(output)); - } - let repoptions = get_must_replicate_options(&mt2, "".to_string(), ReplicationStatusType::Empty, ReplicationType::Object, opts.clone());