diff --git a/Cargo.lock b/Cargo.lock index 3c2e0756..eccbbdee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2895,6 +2895,7 @@ dependencies = [ "chrono", "flatbuffers", "futures", + "http 1.3.1", "md5 0.8.0", "rand 0.9.2", "reqwest", @@ -6481,6 +6482,7 @@ dependencies = [ "rustfs-utils", "s3s", "serde", + "serde_json", "thiserror 2.0.16", "time", "tokio", diff --git a/crates/e2e_test/Cargo.toml b/crates/e2e_test/Cargo.toml index 9f5f9538..b29496da 100644 --- a/crates/e2e_test/Cargo.toml +++ b/crates/e2e_test/Cargo.toml @@ -49,4 +49,5 @@ uuid = { workspace = true } base64 = { workspace = true } rand = { workspace = true } chrono = { workspace = true } +http.workspace = true md5 = { workspace = true } diff --git a/crates/e2e_test/src/kms/multipart_encryption_test.rs b/crates/e2e_test/src/kms/multipart_encryption_test.rs index ee845fe9..40454801 100644 --- a/crates/e2e_test/src/kms/multipart_encryption_test.rs +++ b/crates/e2e_test/src/kms/multipart_encryption_test.rs @@ -13,25 +13,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! 分片上传加密功能的分步测试用例 -//! -//! 这个测试套件将验证分片上传加密功能的每一个步骤: -//! 1. 测试基础的单分片加密(验证加密基础逻辑) -//! 2. 测试多分片上传(验证分片拼接逻辑) -//! 3. 测试加密元数据的保存和读取 -//! 4. 测试完整的分片上传加密流程 - use super::common::LocalKMSTestEnvironment; use crate::common::{TEST_BUCKET, init_logging}; use serial_test::serial; use tracing::{debug, info}; -/// 步骤1:测试基础单文件加密功能(确保SSE-S3在非分片场景下正常工作) #[tokio::test] #[serial] async fn test_step1_basic_single_file_encryption() -> Result<(), Box> { init_logging(); - info!("🧪 步骤1:测试基础单文件加密功能"); + info!("🧪 step1: test basic single file encryption"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -40,11 +31,11 @@ async fn test_step1_basic_single_file_encryption() -> Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 步骤2:测试不加密的分片上传"); + info!("🧪 step2: test basic multipart upload without encryption"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -102,12 +93,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo let total_parts = 2; let total_size = part_size * total_parts; - // 生成测试数据(有明显的模式便于验证) + // generate test data (with clear pattern for easy verification) let test_data: Vec = (0..total_size).map(|i| (i % 256) as u8).collect(); - info!("🚀 开始分片上传(无加密):{} parts,每个 {}MB", total_parts, part_size / (1024 * 1024)); + info!( + "🚀 step2: start multipart upload (no encryption) with {} parts, each {}MB", + total_parts, + part_size / (1024 * 1024) + ); - // 步骤1:创建分片上传 + // step1: create multipart upload let create_multipart_output = s3_client .create_multipart_upload() .bucket(TEST_BUCKET) @@ -116,16 +111,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .await?; let upload_id = create_multipart_output.upload_id().unwrap(); - info!("📋 创建分片上传,ID: {}", upload_id); + info!("📋 step2: create multipart upload, ID: {}", upload_id); - // 步骤2:上传各个分片 + // step2: upload each part let mut completed_parts = Vec::new(); for part_number in 1..=total_parts { let start = (part_number - 1) * part_size; let end = std::cmp::min(start + part_size, total_size); let part_data = &test_data[start..end]; - info!("📤 上传分片 {} ({} bytes)", part_number, part_data.len()); + info!("📤 step2: upload part {} ({} bytes)", part_number, part_data.len()); let upload_part_output = s3_client .upload_part() @@ -145,15 +140,15 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .build(), ); - debug!("分片 {} 上传完成,ETag: {}", part_number, etag); + debug!("step2: part {} uploaded, ETag: {}", part_number, etag); } - // 步骤3:完成分片上传 + // step3: complete multipart upload let completed_multipart_upload = aws_sdk_s3::types::CompletedMultipartUpload::builder() .set_parts(Some(completed_parts)) .build(); - info!("🔗 完成分片上传"); + info!("🔗 step2: complete multipart upload"); let complete_output = s3_client .complete_multipart_upload() .bucket(TEST_BUCKET) @@ -163,10 +158,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo .send() .await?; - debug!("完成分片上传,ETag: {:?}", complete_output.e_tag()); + debug!("step2: multipart upload completed, ETag: {:?}", complete_output.e_tag()); - // 步骤4:下载并验证 - info!("📥 下载文件并验证数据完整性"); + // step4: verify multipart upload completed successfully + assert_eq!( + complete_output.e_tag().unwrap().to_string(), + format!("\"{}-{}-{}\"", object_key, upload_id, total_parts) + ); + + // verify data integrity + info!("📥 step2: download file and verify data integrity"); let get_response = s3_client.get_object().bucket(TEST_BUCKET).key(object_key).send().await?; let downloaded_data = get_response.body.collect().await?.into_bytes(); @@ -174,16 +175,16 @@ async fn test_step2_basic_multipart_upload_without_encryption() -> Result<(), Bo assert_eq!(&downloaded_data[..], &test_data[..]); kms_env.base_env.delete_test_bucket(TEST_BUCKET).await?; - info!("✅ 步骤2通过:不加密的分片上传功能正常"); + info!("✅ step2: basic multipart upload without encryption works as expected"); Ok(()) } -/// 步骤3:测试分片上传 + SSE-S3加密(重点测试) +/// test multipart upload with SSE-S3 encryption #[tokio::test] #[serial] async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box> { init_logging(); - info!("🧪 步骤3:测试分片上传 + SSE-S3加密"); + info!("🧪 step3: test multipart upload with SSE-S3 encryption"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -197,16 +198,16 @@ async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box = (0..total_size).map(|i| ((i / 1000) % 256) as u8).collect(); info!( - "🔐 开始分片上传(SSE-S3加密):{} parts,每个 {}MB", + "🔐 step3: start multipart upload with SSE-S3 encryption: {} parts, each {}MB", total_parts, part_size / (1024 * 1024) ); - // 步骤1:创建分片上传并启用SSE-S3 + // step1: create multipart upload and enable SSE-S3 let create_multipart_output = s3_client .create_multipart_upload() .bucket(TEST_BUCKET) @@ -216,24 +217,24 @@ async fn test_step3_multipart_upload_with_sse_s3() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 步骤4:测试大文件分片上传加密"); + info!("🧪 step4: test larger multipart upload with encryption (streaming encryption)"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -322,13 +326,13 @@ async fn test_step4_large_multipart_upload_with_encryption() -> Result<(), Box = (0..total_size) .map(|i| { let part_num = i / part_size; @@ -337,9 +341,9 @@ async fn test_step4_large_multipart_upload_with_encryption() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { init_logging(); - info!("🧪 步骤5:测试所有加密类型的分片上传"); + info!("🧪 step5: test all encryption types multipart upload"); let mut kms_env = LocalKMSTestEnvironment::new().await?; let _default_key_id = kms_env.start_rustfs_for_local_kms().await?; @@ -446,8 +456,8 @@ async fn test_step5_all_encryption_types_multipart() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { - // 生成测试数据 + // step5: generate test data let test_data: Vec = (0..total_size).map(|i| ((i * 7) % 256) as u8).collect(); - // 准备SSE-C所需的密钥(如果需要) + // step5: prepare SSE-C key and MD5 (if needed) let (sse_c_key, sse_c_md5) = if matches!(encryption_type, EncryptionType::SSEC) { let key = "01234567890123456789012345678901"; let key_b64 = base64::Engine::encode(&base64::engine::general_purpose::STANDARD, key); @@ -506,9 +516,10 @@ async fn test_multipart_encryption_type( (None, None) }; - info!("📋 创建分片上传 - {:?}", encryption_type); + // step5: create multipart upload + info!("🔗 step5: create multipart upload with encryption {:?}", encryption_type); - // 创建分片上传 + // step5: create multipart upload request let mut create_request = s3_client.create_multipart_upload().bucket(bucket).key(object_key); create_request = match encryption_type { @@ -522,7 +533,6 @@ async fn test_multipart_encryption_type( let create_multipart_output = create_request.send().await?; let upload_id = create_multipart_output.upload_id().unwrap(); - // 上传分片 let mut completed_parts = Vec::new(); for part_number in 1..=total_parts { let start = (part_number - 1) * part_size; @@ -537,7 +547,7 @@ async fn test_multipart_encryption_type( .part_number(part_number as i32) .body(aws_sdk_s3::primitives::ByteStream::from(part_data.to_vec())); - // SSE-C需要在每个UploadPart请求中包含密钥 + // step5: include SSE-C key and MD5 in each UploadPart request (if needed) if matches!(encryption_type, EncryptionType::SSEC) { upload_request = upload_request .sse_customer_algorithm("AES256") @@ -554,10 +564,11 @@ async fn test_multipart_encryption_type( .build(), ); - debug!("{:?} 分片 {} 上传完成", encryption_type, part_number); + // step5: complete multipart upload request + debug!("🔗 step5: complete multipart upload part {} with etag {}", part_number, etag); } - // 完成分片上传 + // step5: complete multipart upload let completed_multipart_upload = aws_sdk_s3::types::CompletedMultipartUpload::builder() .set_parts(Some(completed_parts)) .build(); @@ -571,10 +582,12 @@ async fn test_multipart_encryption_type( .send() .await?; - // 下载并验证 + // step5: download and verify multipart upload + info!("🔗 step5: download and verify multipart upload with encryption {:?}", encryption_type); + let mut get_request = s3_client.get_object().bucket(bucket).key(object_key); - // SSE-C需要在GET请求中包含密钥 + // step5: include SSE-C key and MD5 in each GET request (if needed) if matches!(encryption_type, EncryptionType::SSEC) { get_request = get_request .sse_customer_algorithm("AES256") @@ -584,7 +597,7 @@ async fn test_multipart_encryption_type( let get_response = get_request.send().await?; - // 验证加密头 + // step5: verify encryption headers match encryption_type { EncryptionType::SSEKMS => { assert_eq!( @@ -597,11 +610,15 @@ async fn test_multipart_encryption_type( } } - // 验证数据完整性 + // step5: verify data integrity let downloaded_data = get_response.body.collect().await?.into_bytes(); assert_eq!(downloaded_data.len(), total_size); assert_eq!(&downloaded_data[..], &test_data[..]); - info!("✅ {:?} 分片上传测试通过", encryption_type); + // step5: verify data integrity + info!( + "✅ step5: verify data integrity for multipart upload with encryption {:?}", + encryption_type + ); Ok(()) } diff --git a/crates/e2e_test/src/reliant/append.rs b/crates/e2e_test/src/reliant/append.rs new file mode 100644 index 00000000..3541fac8 --- /dev/null +++ b/crates/e2e_test/src/reliant/append.rs @@ -0,0 +1,1177 @@ +#![cfg(test)] + +use crate::common::{RustFSTestEnvironment, init_logging}; +use aws_sdk_s3::Client; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::put_object::{PutObjectError, PutObjectOutput}; +use aws_sdk_s3::primitives::ByteStream; +use http::{ + HeaderValue, + header::{IF_MATCH, IF_NONE_MATCH}, +}; +use serial_test::serial; +use std::error::Error; +use std::time::Duration; +use tokio::time::sleep; +use uuid::Uuid; + +async fn append_object( + client: &Client, + bucket: &str, + key: &str, + position: i64, + payload: &[u8], +) -> Result> { + append_object_with_conditions(client, bucket, key, position, payload, None, None).await +} + +async fn append_object_with_if_match( + client: &Client, + bucket: &str, + key: &str, + position: i64, + payload: &[u8], + if_match: Option, +) -> Result> { + append_object_with_conditions(client, bucket, key, position, payload, if_match, None).await +} + +async fn append_object_with_if_none_match( + client: &Client, + bucket: &str, + key: &str, + position: i64, + payload: &[u8], + if_none_match: Option, +) -> Result> { + append_object_with_conditions(client, bucket, key, position, payload, None, if_none_match).await +} + +async fn append_object_with_conditions( + client: &Client, + bucket: &str, + key: &str, + position: i64, + payload: &[u8], + if_match: Option, + if_none_match: Option, +) -> Result> { + let if_match_header = if_match.clone(); + let if_none_match_header = if_none_match.clone(); + client + .put_object() + .bucket(bucket) + .key(key) + .body(ByteStream::from(payload.to_vec())) + .customize() + .mutate_request(move |req| { + req.headers_mut() + .insert("x-amz-object-append", HeaderValue::from_static("true")); + req.headers_mut().insert( + "x-amz-append-position", + HeaderValue::from_str(&position.to_string()).expect("invalid position header"), + ); + if let Some(tag) = if_match_header.as_deref() { + req.headers_mut() + .insert(IF_MATCH, HeaderValue::from_str(tag).expect("invalid if-match header")); + } + if let Some(tag) = if_none_match_header.as_deref() { + req.headers_mut() + .insert(IF_NONE_MATCH, HeaderValue::from_str(tag).expect("invalid if-none-match header")); + } + }) + .send() + .await +} + +async fn append_action( + client: &Client, + bucket: &str, + key: &str, + action: &str, + if_match: Option<&str>, +) -> Result> { + let action_value = HeaderValue::from_str(action).expect("invalid append action"); + let if_match_value = if_match.map(|v| HeaderValue::from_str(v).expect("invalid if-match")); + client + .put_object() + .bucket(bucket) + .key(key) + .body(ByteStream::from_static(b"")) + .customize() + .mutate_request(move |req| { + req.headers_mut().insert("x-amz-append-action", action_value.clone()); + if let Some(val) = if_match_value.as_ref() { + req.headers_mut().insert(IF_MATCH, val.clone()); + } + }) + .send() + .await +} + +fn md5_hex(data: &[u8]) -> String { + format!("{:x}", md5::compute(data)) +} + +fn multipart_etag(etags: &[&str]) -> String { + let mut buf = Vec::new(); + + for etag in etags { + let clean = etag.trim_matches('"'); + if clean.len() == 32 && clean.chars().all(|c| c.is_ascii_hexdigit()) { + let mut chunk = Vec::with_capacity(clean.len() / 2); + for i in (0..clean.len()).step_by(2) { + let byte = u8::from_str_radix(&clean[i..i + 2], 16).expect("invalid hex"); + chunk.push(byte); + } + buf.extend_from_slice(&chunk); + } else { + buf.extend_from_slice(clean.as_bytes()); + } + } + + let digest = md5::compute(buf); + format!("{:x}-{}", digest, etags.len()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_updates_content_and_etag() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-inline-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-success.txt"; + let initial = b"hello"; + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + + let initial_fetch = client.get_object().bucket(&bucket).key(key).send().await?; + let initial_body = initial_fetch.body.collect().await?.into_bytes(); + println!("initial body = {:?}", initial_body); + + let append_payload = b" world"; + append_object(&client, &bucket, key, initial.len() as i64, append_payload) + .await + .expect("append request should succeed"); + + let second_payload = b"!!!"; + append_object(&client, &bucket, key, (initial.len() + append_payload.len()) as i64, second_payload) + .await + .expect("second append request should succeed"); + + let expected: Vec = [initial.as_slice(), append_payload.as_slice(), second_payload.as_slice()].concat(); + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let etag = get_resp.e_tag().map(|v| v.to_string()); + let aggregated = get_resp.body.collect().await?; + let _body = aggregated.into_bytes(); + assert_eq!(_body.as_ref(), expected.as_slice()); + + if let Some(etag) = etag { + assert_eq!(etag.trim_matches('"'), md5_hex(&expected)); + } else { + panic!("Append GET response missing ETag"); + } + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_rejects_wrong_position() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-inline-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-mismatch.txt"; + let initial = b"abcdef"; + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + + let err = append_object(&client, &bucket, key, (initial.len() as i64) + 1, b"xyz") + .await + .expect_err("append with wrong position must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 400); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_segmented_object_appends_new_part() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-segmented-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-large.bin"; + + let initial_size = 512 * 1024; + let initial: Vec = (0..initial_size).map(|i| (i % 251) as u8).collect(); + let put_resp = client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.clone())) + .send() + .await?; + let initial_etag = put_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("initial put etag"); + + let append_payload: Vec = (0..(128 * 1024)).map(|i| (i % 197) as u8).collect(); + let append_position = initial.len() as i64; + + let mut if_match = String::from("\""); + if_match.push_str(&initial_etag); + if_match.push('"'); + + let append_resp = append_object_with_if_match(&client, &bucket, key, append_position, &append_payload, Some(if_match)) + .await + .expect("append request must succeed"); + let append_etag = append_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("append response etag"); + + let second_segment: Vec = (0..(64 * 1024)).map(|i| (i % 173) as u8).collect(); + let expected_etag_first = multipart_etag(&[&initial_etag, &md5_hex(&append_payload)]); + assert_eq!(append_etag, expected_etag_first); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let _initial_body = aggregated.into_bytes(); + + let append_resp_second = append_object_with_if_match( + &client, + &bucket, + key, + append_position + append_payload.len() as i64, + &second_segment, + Some(format!("\"{append_etag}\"")), + ) + .await + .expect("second segmented append must succeed"); + + let expected_etag = multipart_etag(&[&initial_etag, &md5_hex(&append_payload), &md5_hex(&second_segment)]); + assert_eq!( + append_resp_second.e_tag().map(|v| v.trim_matches('"').to_string()), + Some(expected_etag.clone()) + ); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + + let mut expected = initial.clone(); + expected.extend_from_slice(&append_payload); + expected.extend_from_slice(&second_segment); + + assert_eq!(body.as_ref(), expected.as_slice()); + + let head = client.head_object().bucket(&bucket).key(key).send().await?; + assert_eq!(head.content_length(), Some(expected.len() as i64)); + assert_eq!(head.e_tag().map(|v| v.trim_matches('"').to_string()), Some(expected_etag.clone())); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_rejects_failed_precondition() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-precond-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-if-match.txt"; + let initial = b"hello"; + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + + let append_payload = b" world"; + let err = append_object_with_if_match( + &client, + &bucket, + key, + initial.len() as i64, + append_payload, + Some("\"deadbeef\"".to_string()), + ) + .await + .expect_err("append with wrong If-Match must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 412); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_honors_if_match() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-inline-if-match-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-inline-if-match-success.txt"; + let initial = b"inline"; + let put_resp = client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + let initial_etag = put_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("initial etag"); + + let append_payload = b" payload"; + let resp = append_object_with_if_match( + &client, + &bucket, + key, + initial.len() as i64, + append_payload, + Some(format!("\"{initial_etag}\"")), + ) + .await + .expect("append with correct if-match should succeed"); + + let combined: Vec = [initial.as_slice(), append_payload.as_slice()].concat(); + assert_eq!(resp.e_tag().map(|v| v.trim_matches('"').to_string()), Some(md5_hex(&combined))); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), combined.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_rejects_if_none_match_star() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-if-none-match-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-if-none.txt"; + let initial = b"hello"; + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + + let append_payload = b" world"; + let err = + append_object_with_if_none_match(&client, &bucket, key, initial.len() as i64, append_payload, Some("*".to_string())) + .await + .expect_err("append with If-None-Match:* must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 412); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_inline_object_allows_zero_length_payload() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-inline-empty-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-inline-empty.txt"; + let initial = b"foobar"; + let put_resp = client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + let initial_etag = put_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("initial etag"); + + let resp = append_object_with_if_match(&client, &bucket, key, initial.len() as i64, &[], Some(format!("\"{initial_etag}\""))) + .await + .expect("append with empty payload should succeed"); + + assert_eq!(resp.e_tag().map(|v| v.trim_matches('"').to_string()), Some(initial_etag.clone())); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn complete_append_consolidates_pending_segments() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-complete-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "complete-append.bin"; + let base_len = 256 * 1024; + let base: Vec = (0..base_len).map(|i| (i % 251) as u8).collect(); + let put_resp = client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(base.clone())) + .send() + .await?; + let base_etag = put_resp.e_tag().map(|v| v.trim_matches('"').to_string()).expect("base etag"); + + let seg_a: Vec = (0..(64 * 1024)).map(|i| (i % 199) as u8).collect(); + let seg_b: Vec = (0..(96 * 1024)).map(|i| (i % 173) as u8).collect(); + + let append_a = + append_object_with_if_match(&client, &bucket, key, base.len() as i64, &seg_a, Some(format!("\"{base_etag}\""))).await?; + let etag_after_a = append_a + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("etag after first append"); + + let append_b = append_object_with_if_match( + &client, + &bucket, + key, + (base.len() + seg_a.len()) as i64, + &seg_b, + Some(format!("\"{etag_after_a}\"")), + ) + .await?; + let etag_after_b = append_b + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("etag after second append"); + assert!(etag_after_b.contains('-')); + + let complete_resp = append_action(&client, &bucket, key, "complete", None).await?; + let complete_etag = complete_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("complete etag"); + + let mut expected = base.clone(); + expected.extend_from_slice(&seg_a); + expected.extend_from_slice(&seg_b); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.len(), expected.len()); + assert_eq!(final_body.as_ref(), expected.as_slice()); + assert_eq!(complete_etag, md5_hex(&expected)); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn abort_append_discards_pending_segments() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-abort-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "abort-append.bin"; + let base: Vec = (0..(512 * 1024)).map(|i| (i % 181) as u8).collect(); + let put_resp = client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(base.clone())) + .send() + .await?; + let base_etag = put_resp.e_tag().map(|v| v.trim_matches('"').to_string()).expect("base etag"); + + let seg_a: Vec = vec![0xAA; 64 * 1024]; + let seg_b: Vec = vec![0xBB; 96 * 1024]; + + let append_a = + append_object_with_if_match(&client, &bucket, key, base.len() as i64, &seg_a, Some(format!("\"{base_etag}\""))).await?; + let etag_after_a = append_a + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("etag after first append"); + + append_object_with_if_match( + &client, + &bucket, + key, + (base.len() + seg_a.len()) as i64, + &seg_b, + Some(format!("\"{etag_after_a}\"")), + ) + .await?; + + let abort_resp = append_action(&client, &bucket, key, "abort", None).await?; + let abort_etag = abort_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("abort etag"); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.len(), base.len()); + assert_eq!(final_body.as_ref(), base.as_slice()); + + let retry_segment = vec![0xCC; 32 * 1024]; + let retry_resp = append_object_with_if_match( + &client, + &bucket, + key, + base.len() as i64, + &retry_segment, + Some(format!("\"{abort_etag}\"")), + ) + .await?; + let retry_etag = retry_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("retry etag"); + + let mut expected_after_retry = base.clone(); + expected_after_retry.extend_from_slice(&retry_segment); + let head_after_retry = client.head_object().bucket(&bucket).key(key).send().await?; + assert_eq!(head_after_retry.content_length(), Some(expected_after_retry.len() as i64)); + assert_eq!( + head_after_retry.e_tag().map(|v| v.trim_matches('"').to_string()), + Some(retry_etag.clone()) + ); + + let get_after_retry = client.get_object().bucket(&bucket).key(key).send().await?; + let final_bytes = get_after_retry.body.collect().await?.into_bytes(); + assert_eq!(final_bytes.as_ref(), expected_after_retry.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_segments_concurrency_then_complete() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-complete-concurrency-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "concurrent-complete.bin"; + let base: Vec = (0..(196 * 1024)).map(|i| (i % 233) as u8).collect(); + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(base.clone())) + .send() + .await?; + + let seg_a: Vec = vec![0x11; 64 * 1024]; + let seg_b: Vec = vec![0x22; 48 * 1024]; + let seg_c: Vec = vec![0x33; 80 * 1024]; + + let position = base.len() as i64; + let client_a = client.clone(); + let client_b = client.clone(); + let bucket_a = bucket.clone(); + let bucket_b = bucket.clone(); + let key_string = key.to_string(); + let seg_a_clone = seg_a.clone(); + let seg_b_clone = seg_b.clone(); + + let (res_a, res_b) = tokio::join!( + async { append_object(&client_a, &bucket_a, &key_string, position, &seg_a_clone).await }, + async { append_object(&client_b, &bucket_b, &key_string, position, &seg_b_clone).await } + ); + + let (success_resp, failure_resp, winning_segment) = match (res_a, res_b) { + (Ok(resp), Err(err)) => (resp, Some(err), seg_a.clone()), + (Err(err), Ok(resp)) => (resp, Some(err), seg_b.clone()), + _ => panic!("expected exactly one append success"), + }; + + if let Some(SdkError::ServiceError(service_err)) = failure_resp { + assert_eq!(service_err.raw().status().as_u16(), 400); + } + + let winning_etag = success_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("winning append etag"); + + let mut expected = base.clone(); + expected.extend_from_slice(&winning_segment); + + append_object_with_if_match(&client, &bucket, key, expected.len() as i64, &seg_c, Some(format!("\"{winning_etag}\""))) + .await?; + + expected.extend_from_slice(&seg_c); + + let complete_resp = append_action(&client, &bucket, key, "complete", None).await?; + let final_etag = complete_resp + .e_tag() + .map(|v| v.trim_matches('"').to_string()) + .expect("final etag"); + + let head = client.head_object().bucket(&bucket).key(key).send().await?; + assert_eq!(head.content_length(), Some(expected.len() as i64)); + assert_eq!(head.e_tag().map(|v| v.trim_matches('"').to_string()), Some(final_etag.clone())); + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.as_ref(), expected.as_slice()); + assert_eq!(final_etag, md5_hex(&expected)); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_missing_object_returns_not_found() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-missing-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "missing-object.txt"; + let err = append_object(&client, &bucket, key, 0, b"data") + .await + .expect_err("append on missing object must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 404); + } + other => panic!("unexpected error variant: {other:?}"), + } + + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_segmented_object_rejects_wrong_position() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-seg-pos-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-seg-pos.bin"; + let initial: Vec = (0..(512 * 1024)).map(|i| (i % 211) as u8).collect(); + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.clone())) + .send() + .await?; + + let err = append_object(&client, &bucket, key, (initial.len() as i64) + 1, b"abc") + .await + .expect_err("append with wrong position on segmented object must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 400); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_segmented_object_rejects_failed_precondition() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-seg-precond-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "append-seg-precond.bin"; + let initial: Vec = (0..(256 * 1024)).map(|i| (i % 199) as u8).collect(); + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.clone())) + .send() + .await?; + + let append_payload: Vec = (0..(64 * 1024)).map(|i| (i % 173) as u8).collect(); + let err = append_object_with_if_match( + &client, + &bucket, + key, + initial.len() as i64, + &append_payload, + Some("\"ffffffffffffffffffffffffffffffff\"".to_string()), + ) + .await + .expect_err("append with wrong etag on segmented object must fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 412); + } + other => panic!("unexpected error variant: {other:?}"), + } + + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let aggregated = get_resp.body.collect().await?; + let body = aggregated.into_bytes(); + assert_eq!(body.as_ref(), initial.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_large_file_multi_segments() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-large-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "large-append.bin"; + + // Create initial object with 1MB data + let chunk_size = 1024 * 1024; // 1MB + let initial_data: Vec = (0..chunk_size).map(|i| (i % 256) as u8).collect(); + + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial_data.clone())) + .send() + .await?; + + // Append multiple 1MB chunks to trigger segmented storage + let mut expected_data = initial_data.clone(); + for i in 1..=5 { + let append_chunk: Vec = (0..chunk_size).map(|j| ((j + i * 1000) % 256) as u8).collect(); + + append_object(&client, &bucket, key, expected_data.len() as i64, &append_chunk) + .await + .expect("large file append should succeed"); + + expected_data.extend_from_slice(&append_chunk); + + // Verify partial content after each append + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let body_bytes = get_resp.body.collect().await?.into_bytes(); + assert_eq!(body_bytes.len(), expected_data.len()); + + // Verify first and last few bytes to ensure data integrity + assert_eq!(&body_bytes[0..100], &expected_data[0..100]); + let end_offset = expected_data.len() - 100; + assert_eq!(&body_bytes[end_offset..], &expected_data[end_offset..]); + } + + // Final verification of complete content + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.len(), expected_data.len()); + assert_eq!(final_body.as_ref(), expected_data.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_threshold_crossing_inline_to_segmented() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-threshold-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "threshold-test.dat"; + + // Start with small inline data (should stay inline) + let small_data = vec![0u8; 1024]; // 1KB + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(small_data.clone())) + .send() + .await?; + + let mut expected_data = small_data; + + // Make several small appends to gradually grow the object + for i in 1..=10 { + let append_data = vec![i as u8; 2048]; // 2KB each + + append_object(&client, &bucket, key, expected_data.len() as i64, &append_data) + .await + .expect("threshold crossing append should succeed"); + + expected_data.extend_from_slice(&append_data); + + // Wait a bit between appends to allow background spill processing + sleep(Duration::from_millis(200)).await; + } + + // Add one large append that definitely triggers segmented mode + let large_append = vec![255u8; 512 * 1024]; // 512KB + append_object(&client, &bucket, key, expected_data.len() as i64, &large_append) + .await + .expect("large append triggering segmentation should succeed"); + + expected_data.extend_from_slice(&large_append); + + // Allow time for any background spill operations to complete + sleep(Duration::from_secs(2)).await; + + // Verify final content integrity + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.len(), expected_data.len()); + assert_eq!(final_body.as_ref(), expected_data.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_concurrent_operations_with_epoch() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-concurrent-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "concurrent-test.txt"; + let initial = b"base"; + + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(initial.to_vec())) + .send() + .await?; + + // Get initial object to obtain ETag for conditional append + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let initial_etag = get_resp.e_tag().unwrap().to_string(); + + // First append with correct ETag should succeed + let first_append = b" data1"; + let resp1 = + append_object_with_if_match(&client, &bucket, key, initial.len() as i64, first_append, Some(initial_etag.clone())) + .await + .expect("first conditional append should succeed"); + + let new_etag = resp1.e_tag().unwrap().to_string(); + + // Second append with old ETag should fail (simulating concurrent modification) + let second_append = b" data2"; + let err = append_object_with_if_match( + &client, + &bucket, + key, + (initial.len() + first_append.len()) as i64, + second_append, + Some(initial_etag), // Using old ETag should fail + ) + .await + .expect_err("append with stale etag should fail"); + + match err { + SdkError::ServiceError(service_err) => { + assert_eq!(service_err.raw().status().as_u16(), 412); // Precondition Failed + } + other => panic!("unexpected error variant: {other:?}"), + } + + // Third append with correct new ETag should succeed + append_object_with_if_match( + &client, + &bucket, + key, + (initial.len() + first_append.len()) as i64, + second_append, + Some(new_etag), + ) + .await + .expect("append with correct etag should succeed"); + + // Verify final content + let expected: Vec = [initial.as_slice(), first_append.as_slice(), second_append.as_slice()].concat(); + let get_resp = client.get_object().bucket(&bucket).key(key).send().await?; + let final_body = get_resp.body.collect().await?.into_bytes(); + assert_eq!(final_body.as_ref(), expected.as_slice()); + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} + +#[tokio::test] +#[serial] +async fn append_range_requests_across_segments() -> Result<(), Box> { + init_logging(); + + let mut env = RustFSTestEnvironment::new().await?; + env.start_rustfs_server(Vec::new()).await?; + sleep(Duration::from_secs(1)).await; + let client = env.create_s3_client(); + + let bucket = format!("append-range-{}", Uuid::new_v4().simple()); + client.create_bucket().bucket(&bucket).send().await?; + + let key = "range-test.dat"; + + // Create base object with known pattern + let base_size = 10000; + let base_data: Vec = (0..base_size).map(|i| (i % 256) as u8).collect(); + + client + .put_object() + .bucket(&bucket) + .key(key) + .body(ByteStream::from(base_data.clone())) + .send() + .await?; + + // Append multiple segments with different patterns + let mut expected_data = base_data; + for segment in 1..=3 { + let segment_size = 5000 + segment * 1000; // Variable segment sizes + let segment_data: Vec = (0..segment_size).map(|i| ((i + segment * 100) % 256) as u8).collect(); + + append_object(&client, &bucket, key, expected_data.len() as i64, &segment_data) + .await + .expect("segment append should succeed"); + + expected_data.extend_from_slice(&segment_data); + } + + sleep(Duration::from_millis(500)).await; // Allow background processing + + // Test various range requests that cross segment boundaries + let test_ranges = [ + (0, 999), // Beginning of base segment + (9000, 11000), // Across base and first append + (15000, 20000), // Middle of appended data + (expected_data.len() - 1000, expected_data.len() - 1), // End of data + ]; + + for (start, end) in test_ranges { + let range_header = format!("bytes={}-{}", start, end); + let range_resp = client + .get_object() + .bucket(&bucket) + .key(key) + .range(&range_header) + .send() + .await?; + + let content_range = range_resp.content_range().unwrap().to_string(); + let range_body = range_resp.body.collect().await?.into_bytes(); + let expected_range = &expected_data[start..=end]; + + assert_eq!(range_body.len(), expected_range.len()); + assert_eq!(range_body.as_ref(), expected_range); + assert_eq!(content_range, format!("bytes {}-{}/{}", start, end, expected_data.len())); + } + + client.delete_object().bucket(&bucket).key(key).send().await?; + client.delete_bucket().bucket(&bucket).send().await?; + + Ok(()) +} diff --git a/crates/e2e_test/src/reliant/mod.rs b/crates/e2e_test/src/reliant/mod.rs index 017ecc88..bf77dbef 100644 --- a/crates/e2e_test/src/reliant/mod.rs +++ b/crates/e2e_test/src/reliant/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod append; mod conditional_writes; mod lifecycle; mod lock; diff --git a/crates/ecstore/src/erasure_coding/decode.rs b/crates/ecstore/src/erasure_coding/decode.rs index ef032ddb..81fe9211 100644 --- a/crates/ecstore/src/erasure_coding/decode.rs +++ b/crates/ecstore/src/erasure_coding/decode.rs @@ -167,8 +167,19 @@ async fn write_data_blocks( where W: tokio::io::AsyncWrite + Send + Sync + Unpin, { - if get_data_block_len(en_blocks, data_blocks) < length { - error!("write_data_blocks get_data_block_len < length"); + let available = get_data_block_len(en_blocks, data_blocks); + if available < length { + let block_sizes: Vec = en_blocks + .iter() + .take(data_blocks) + .map(|block| block.as_ref().map(|buf| buf.len()).unwrap_or(0)) + .collect(); + error!( + expected = length, + available, + ?block_sizes, + "write_data_blocks get_data_block_len < length" + ); return Err(io::Error::new(ErrorKind::UnexpectedEof, "Not enough data blocks to write")); } diff --git a/crates/ecstore/src/lib.rs b/crates/ecstore/src/lib.rs index b28ce0cb..2cbfa11e 100644 --- a/crates/ecstore/src/lib.rs +++ b/crates/ecstore/src/lib.rs @@ -33,6 +33,7 @@ pub mod file_cache; pub mod global; pub mod metrics_realtime; pub mod notification_sys; +pub mod object_append; pub mod pools; pub mod rebalance; pub mod rpc; diff --git a/crates/ecstore/src/object_append.rs b/crates/ecstore/src/object_append.rs new file mode 100644 index 00000000..186bfe89 --- /dev/null +++ b/crates/ecstore/src/object_append.rs @@ -0,0 +1,725 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::bitrot::{create_bitrot_reader, create_bitrot_writer}; +use crate::erasure_coding::{Erasure, calc_shard_size}; +use crate::error::{Error, StorageError}; +use crate::store_api::ObjectInfo; +use rustfs_filemeta::TRANSITION_COMPLETE; +use rustfs_utils::HashAlgorithm; +use rustfs_utils::http::headers::{ + AMZ_SERVER_SIDE_ENCRYPTION, AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, AMZ_SERVER_SIDE_ENCRYPTION_KMS_CONTEXT, AMZ_SERVER_SIDE_ENCRYPTION_KMS_ID, + RESERVED_METADATA_PREFIX_LOWER, +}; +use std::collections::HashSet; + +/// Ensure the target object can accept append writes under current state. +pub fn validate_append_preconditions(bucket: &str, object: &str, info: &ObjectInfo) -> Result<(), Error> { + if info.is_compressed() { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + "append is not supported for compressed objects".to_string(), + )); + } + + let encryption_headers = [ + AMZ_SERVER_SIDE_ENCRYPTION, + AMZ_SERVER_SIDE_ENCRYPTION_KMS_ID, + AMZ_SERVER_SIDE_ENCRYPTION_KMS_CONTEXT, + AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + ]; + + if encryption_headers + .iter() + .any(|header| info.user_defined.contains_key(*header) || info.user_defined.contains_key(&header.to_ascii_lowercase())) + { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + "append is not supported for encrypted objects".to_string(), + )); + } + + if info.transitioned_object.status == TRANSITION_COMPLETE || !info.transitioned_object.tier.is_empty() { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + "append is not supported for transitioned objects".to_string(), + )); + } + + Ok(()) +} + +/// Validate that the requested append position matches the current object length. +pub fn validate_append_position(bucket: &str, object: &str, info: &ObjectInfo, expected_position: i64) -> Result<(), Error> { + if expected_position != info.size { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + format!("append position mismatch: provided {}, expected {}", expected_position, info.size), + )); + } + Ok(()) +} + +pub struct InlineAppendContext<'a> { + pub existing_inline: Option<&'a [u8]>, + pub existing_plain: Option<&'a [u8]>, + pub existing_size: i64, + pub append_payload: &'a [u8], + pub erasure: &'a Erasure, + pub hash_algorithm: HashAlgorithm, + pub has_checksums: bool, +} + +pub struct InlineAppendResult { + pub inline_data: Vec, + pub total_size: i64, + pub etag: String, +} + +/// Decode inline payload using available checksum algorithms. Returns raw bytes when decoding fails but +/// the inline buffer already contains the plain payload. +pub async fn decode_inline_payload( + inline: &[u8], + size: usize, + erasure: &Erasure, + preferred: HashAlgorithm, +) -> Result<(Vec, HashAlgorithm), Error> { + match decode_inline_variants(inline, size, erasure, preferred).await { + Ok((data, algo)) => Ok((data, algo)), + Err(err) => { + if inline.len() >= size { + Ok((inline[..size].to_vec(), HashAlgorithm::None)) + } else { + Err(err) + } + } + } +} + +/// Append data to an inline object and return the re-encoded inline buffer. +pub async fn append_inline_data(ctx: InlineAppendContext<'_>) -> Result { + let mut plain = Vec::with_capacity(ctx.existing_inline.map(|data| data.len()).unwrap_or(0) + ctx.append_payload.len()); + let mut encode_algorithm = ctx.hash_algorithm.clone(); + + if let Some(existing_plain) = ctx.existing_plain { + if existing_plain.len() != ctx.existing_size as usize { + return Err(StorageError::other("existing plain payload length mismatch")); + } + plain.extend_from_slice(existing_plain); + } else if ctx.existing_size > 0 { + let inline = ctx + .existing_inline + .ok_or_else(|| StorageError::other("inline payload missing"))?; + + let (decoded, detected_algo) = + decode_inline_payload(inline, ctx.existing_size as usize, ctx.erasure, ctx.hash_algorithm.clone()).await?; + encode_algorithm = detected_algo; + plain.extend_from_slice(&decoded); + } else if let Some(inline) = ctx.existing_inline { + plain.extend_from_slice(inline); + } + + plain.extend_from_slice(ctx.append_payload); + let total_size = plain.len() as i64; + let etag = md5_hex(&plain); + + if encode_algorithm == HashAlgorithm::None { + if ctx.has_checksums { + encode_algorithm = ctx.hash_algorithm.clone(); + } else { + return Ok(InlineAppendResult { + inline_data: plain, + total_size, + etag, + }); + } + } + + let mut writer = create_bitrot_writer( + true, + None, + "", + "", + ctx.erasure.shard_file_size(total_size), + ctx.erasure.shard_size(), + encode_algorithm, + ) + .await + .map_err(|e| StorageError::other(format!("failed to create inline writer: {e}")))?; + + let mut remaining = plain.as_slice(); + while !remaining.is_empty() { + let chunk_len = remaining.len().min(ctx.erasure.block_size); + writer + .write(&remaining[..chunk_len]) + .await + .map_err(|e| StorageError::other(format!("failed to write inline data: {e}")))?; + remaining = &remaining[chunk_len..]; + } + + writer + .shutdown() + .await + .map_err(|e| StorageError::other(format!("failed to finalize inline writer: {e}")))?; + + let inline_data = writer + .into_inline_data() + .ok_or_else(|| StorageError::other("inline writer did not return data"))?; + + Ok(InlineAppendResult { + inline_data, + total_size, + etag, + }) +} + +fn md5_hex(data: &[u8]) -> String { + let digest = HashAlgorithm::Md5.hash_encode(data); + hex_from_bytes(digest.as_ref()) +} + +fn hex_from_bytes(bytes: &[u8]) -> String { + let mut out = String::with_capacity(bytes.len() * 2); + for byte in bytes { + use std::fmt::Write; + write!(&mut out, "{:02x}", byte).expect("write hex"); + } + out +} + +async fn decode_inline_variants( + inline: &[u8], + size: usize, + erasure: &Erasure, + preferred: HashAlgorithm, +) -> Result<(Vec, HashAlgorithm), Error> { + let mut tried = HashSet::new(); + let candidates = [preferred, HashAlgorithm::HighwayHash256, HashAlgorithm::HighwayHash256S]; + + let mut last_err: Option = None; + + for algo in candidates { + if !tried.insert(algo.clone()) { + continue; + } + + match decode_inline_with_algo(inline, size, erasure, algo.clone()).await { + Ok(data) => return Ok((data, algo)), + Err(err) => last_err = Some(err), + } + } + + Err(last_err.unwrap_or_else(|| StorageError::other("failed to decode inline data"))) +} + +async fn decode_inline_with_algo(inline: &[u8], size: usize, erasure: &Erasure, algo: HashAlgorithm) -> Result, Error> { + let total_len = inline + .len() + .max(erasure.shard_file_size(size as i64).max(size as i64) as usize); + let mut reader = create_bitrot_reader(Some(inline), None, "", "", 0, total_len, erasure.shard_size(), algo) + .await + .map_err(|e| StorageError::other(format!("failed to create inline reader: {e}")))? + .ok_or_else(|| StorageError::other("inline reader unavailable"))?; + + let mut out = Vec::with_capacity(size); + while out.len() < size { + let remaining = size - out.len(); + let plain_chunk = remaining.min(erasure.block_size); + let shard_payload = calc_shard_size(plain_chunk, erasure.data_shards).max(1); + let mut buf = vec![0u8; shard_payload]; + let read = reader + .read(&mut buf) + .await + .map_err(|e| StorageError::other(format!("failed to read inline data: {e}")))?; + if read == 0 { + return Err(StorageError::other("incomplete inline data read")); + } + + let copy_len = remaining.min(read); + out.extend_from_slice(&buf[..copy_len]); + } + + Ok(out) +} + +/// Background task to spill inline data to segmented format +pub struct InlineSpillProcessor { + pub disks: Vec>, + pub write_quorum: usize, +} + +impl InlineSpillProcessor { + pub fn new(disks: Vec>, write_quorum: usize) -> Self { + Self { disks, write_quorum } + } + + /// Process a single spill operation from InlinePendingSpill to SegmentedActive + pub async fn process_spill( + &self, + bucket: &str, + object: &str, + mut fi: rustfs_filemeta::FileInfo, + mut parts_metadata: Vec, + epoch: u64, + ) -> Result<(), Error> { + use rustfs_filemeta::AppendStateKind; + use tracing::{debug, error, info, warn}; + + // Verify we're in the correct state + let current_state = fi.get_append_state(); + if current_state.state != AppendStateKind::InlinePendingSpill { + warn!( + bucket = bucket, + object = object, + current_state = ?current_state.state, + "Spill processor called on object not in InlinePendingSpill state" + ); + return Ok(()); + } + + // Check epoch to ensure we're processing the correct version + if current_state.epoch != epoch { + debug!( + bucket = bucket, + object = object, + current_epoch = current_state.epoch, + expected_epoch = epoch, + "Spill operation skipped due to epoch mismatch" + ); + return Ok(()); + } + + info!( + bucket = bucket, + object = object, + size = fi.size, + epoch = epoch, + "Starting inline data spill to segmented format" + ); + + // Extract inline data + let inline_data = fi + .data + .clone() + .ok_or_else(|| StorageError::other("Cannot spill object without inline data"))?; + + // Create erasure encoder + let erasure = Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); + + // Decode inline data to plain data + let hash_algorithm = fi + .parts + .first() + .map(|part| fi.erasure.get_checksum_info(part.number).algorithm) + .unwrap_or(HashAlgorithm::HighwayHash256); + + let plain_data = match decode_inline_payload(&inline_data, fi.size as usize, &erasure, hash_algorithm.clone()).await { + Ok((plain, _detected_algo)) => plain, + Err(err) => { + error!( + bucket = bucket, + object = object, + error = ?err, + "Failed to decode inline data during spill" + ); + return Err(StorageError::other(format!("Failed to decode inline data for spill: {err}"))); + } + }; + + // Generate data directory for the object + let data_dir = uuid::Uuid::new_v4(); + + // Create temporary directory for the spill operation + let tmp_root = format!("{}x{}", uuid::Uuid::new_v4(), time::OffsetDateTime::now_utc().unix_timestamp()); + let tmp_path = format!("{tmp_root}/{}/part.1", data_dir); + + // Encode and write the data to all disks + match self.write_segmented_data(&plain_data, &tmp_path, &erasure).await { + Ok(_) => { + // Move from temp to permanent location + let final_path = format!("{}/part.1", data_dir); + if let Err(err) = self.move_temp_to_final(&tmp_path, &final_path).await { + error!( + bucket = bucket, + object = object, + error = ?err, + "Failed to move spilled data to final location" + ); + // Clean up temp files + let _ = self.cleanup_temp_files(&tmp_path).await; + return Err(err); + } + + // Update file metadata + fi.data_dir = Some(data_dir); + fi.data = None; // Remove inline data + fi.metadata.remove(&format!("{}inline-data", RESERVED_METADATA_PREFIX_LOWER)); + + // Update append state to SegmentedActive + let mut new_state = current_state; + new_state.state = AppendStateKind::SegmentedActive; + new_state.epoch = new_state.epoch.saturating_add(1); + new_state.pending_segments.clear(); + + fi.set_append_state(&new_state) + .map_err(|err| StorageError::other(format!("Failed to update append state after spill: {err}")))?; + + // Update all parts metadata + for meta in parts_metadata.iter_mut() { + if !meta.is_valid() { + continue; + } + meta.data_dir = Some(data_dir); + meta.data = None; + meta.metadata = fi.metadata.clone(); + meta.metadata + .remove(&format!("{}inline-data", RESERVED_METADATA_PREFIX_LOWER)); + } + + // Write updated metadata back to disks + // TODO: Implement metadata write-back logic + // This would typically involve writing the updated FileInfo to all disks + + info!( + bucket = bucket, + object = object, + data_dir = ?data_dir, + new_epoch = new_state.epoch, + "Successfully spilled inline data to segmented format" + ); + + Ok(()) + } + Err(err) => { + error!( + bucket = bucket, + object = object, + error = ?err, + "Failed to write segmented data during spill" + ); + // Clean up temp files + let _ = self.cleanup_temp_files(&tmp_path).await; + Err(err) + } + } + } + + async fn write_segmented_data(&self, data: &[u8], tmp_path: &str, _erasure: &Erasure) -> Result<(), Error> { + use tracing::debug; + + // TODO: Implement proper erasure encoding and writing to disks + // This is a placeholder implementation + debug!( + data_len = data.len(), + path = tmp_path, + "Writing segmented data (placeholder implementation)" + ); + + // For now, just return success - full implementation would: + // 1. Create bitrot writers for each disk + // 2. Erasure encode the data + // 3. Write each shard to its corresponding disk + Ok(()) + } + + async fn move_temp_to_final(&self, tmp_path: &str, final_path: &str) -> Result<(), Error> { + use tracing::debug; + + // TODO: Implement moving temp files to final location + debug!( + tmp_path = tmp_path, + final_path = final_path, + "Moving temp files to final location (placeholder)" + ); + Ok(()) + } + + async fn cleanup_temp_files(&self, tmp_path: &str) -> Result<(), Error> { + use tracing::debug; + + // TODO: Implement temp file cleanup + debug!(tmp_path = tmp_path, "Cleaning up temp files (placeholder)"); + Ok(()) + } +} + +/// Trigger background spill processing for an object +pub fn trigger_spill_process( + bucket: String, + object: String, + fi: rustfs_filemeta::FileInfo, + parts_metadata: Vec, + epoch: u64, + disks: Vec>, + write_quorum: usize, +) { + use tracing::error; + + tokio::spawn(async move { + let processor = InlineSpillProcessor::new(disks, write_quorum); + if let Err(err) = processor.process_spill(&bucket, &object, fi, parts_metadata, epoch).await { + error!( + bucket = bucket, + object = object, + epoch = epoch, + error = ?err, + "Background spill process failed" + ); + } + }); +} + +#[cfg(test)] +mod tests { + use super::*; + use rustfs_utils::HashAlgorithm; + + fn make_object_info() -> ObjectInfo { + ObjectInfo { + bucket: "test-bucket".to_string(), + name: "obj".to_string(), + ..Default::default() + } + } + + #[test] + fn rejects_compressed_objects() { + let mut info = make_object_info(); + info.user_defined + .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}compression"), "zstd".to_string()); + + let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); + matches!(err, StorageError::InvalidArgument(..)) + .then_some(()) + .expect("expected invalid argument"); + } + + #[test] + fn rejects_encrypted_objects() { + let mut info = make_object_info(); + info.user_defined + .insert("x-amz-server-side-encryption".to_string(), "AES256".to_string()); + + let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); + matches!(err, StorageError::InvalidArgument(..)) + .then_some(()) + .expect("expected invalid argument"); + } + + #[test] + fn rejects_transitioned_objects() { + let mut info = make_object_info(); + info.transitioned_object.tier = "GLACIER".to_string(); + info.transitioned_object.status = TRANSITION_COMPLETE.to_string(); + + let err = validate_append_preconditions("test-bucket", "obj", &info).unwrap_err(); + matches!(err, StorageError::InvalidArgument(..)) + .then_some(()) + .expect("expected invalid argument"); + } + + #[test] + fn accepts_plain_objects() { + let info = make_object_info(); + validate_append_preconditions("test-bucket", "obj", &info).expect("append should be allowed"); + } + + #[test] + fn rejects_position_mismatch() { + let mut info = make_object_info(); + info.size = 10; + let err = validate_append_position("test-bucket", "obj", &info, 5).unwrap_err(); + matches!(err, StorageError::InvalidArgument(..)) + .then_some(()) + .expect("expected invalid argument"); + } + + fn make_inline_erasure() -> Erasure { + Erasure::new(1, 0, 1024) + } + + async fn encode_inline(data: &[u8], erasure: &Erasure) -> Vec { + let mut writer = create_bitrot_writer( + true, + None, + "", + "", + erasure.shard_file_size(data.len() as i64), + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await + .unwrap(); + + let mut remaining = data; + while !remaining.is_empty() { + let chunk_len = remaining.len().min(erasure.block_size); + writer.write(&remaining[..chunk_len]).await.unwrap(); + remaining = &remaining[chunk_len..]; + } + + writer.shutdown().await.unwrap(); + writer.into_inline_data().unwrap() + } + + async fn decode_inline(encoded: &[u8], size: usize, erasure: &Erasure) -> Vec { + let mut reader = + create_bitrot_reader(Some(encoded), None, "", "", 0, size, erasure.shard_size(), HashAlgorithm::HighwayHash256) + .await + .unwrap() + .unwrap(); + + let mut out = Vec::with_capacity(size); + while out.len() < size { + let remaining = size - out.len(); + let mut buf = vec![0u8; erasure.block_size.min(remaining.max(1))]; + let read = reader.read(&mut buf).await.unwrap(); + if read == 0 { + break; + } + out.extend_from_slice(&buf[..read.min(remaining)]); + } + out + } + + #[tokio::test] + async fn append_inline_combines_payloads() { + let erasure = make_inline_erasure(); + let existing_plain = b"hello"; + let encoded = encode_inline(existing_plain, &erasure).await; + + let ctx = InlineAppendContext { + existing_inline: Some(&encoded), + existing_plain: None, + existing_size: existing_plain.len() as i64, + append_payload: b" world", + erasure: &erasure, + hash_algorithm: HashAlgorithm::HighwayHash256, + has_checksums: true, + }; + + let result = append_inline_data(ctx).await.expect("inline append to succeed"); + assert_eq!(result.total_size, 11); + assert_eq!(result.etag, md5_hex(b"hello world")); + + let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; + assert_eq!(decoded, b"hello world"); + } + + #[tokio::test] + async fn decode_inline_handles_padded_shards() { + let erasure = Erasure::new(1, 0, 1024); + let plain = b"hello"; + + let mut padded = vec![0u8; calc_shard_size(plain.len(), erasure.data_shards)]; + padded[..plain.len()].copy_from_slice(plain); + + let mut writer = create_bitrot_writer( + true, + None, + "", + "", + erasure.shard_file_size(plain.len() as i64), + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await + .unwrap(); + + writer.write(&padded).await.unwrap(); + writer.shutdown().await.unwrap(); + let inline = writer.into_inline_data().unwrap(); + + let (decoded, algo) = decode_inline_payload(&inline, plain.len(), &erasure, HashAlgorithm::HighwayHash256) + .await + .expect("inline decode should succeed"); + + assert_eq!(decoded, plain); + assert_eq!(algo, HashAlgorithm::HighwayHash256); + } + + #[tokio::test] + async fn append_inline_handles_empty_original() { + let erasure = make_inline_erasure(); + let ctx = InlineAppendContext { + existing_inline: None, + existing_plain: None, + existing_size: 0, + append_payload: b"data", + erasure: &erasure, + hash_algorithm: HashAlgorithm::HighwayHash256, + has_checksums: true, + }; + + let result = append_inline_data(ctx).await.expect("inline append to succeed"); + assert_eq!(result.total_size, 4); + assert_eq!(result.etag, md5_hex(b"data")); + + let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; + assert_eq!(decoded, b"data"); + } + + #[tokio::test] + async fn append_inline_without_checksums_uses_raw_bytes() { + let erasure = Erasure::new(1, 0, 1024); + let existing = b"hello"; + + let ctx = InlineAppendContext { + existing_inline: Some(existing), + existing_plain: None, + existing_size: existing.len() as i64, + append_payload: b" world", + erasure: &erasure, + hash_algorithm: HashAlgorithm::HighwayHash256, + has_checksums: false, + }; + + let result = append_inline_data(ctx).await.expect("inline append to succeed"); + assert_eq!(result.total_size, 11); + assert_eq!(result.etag, md5_hex(b"hello world")); + + assert_eq!(result.inline_data, b"hello world"); + } + + #[tokio::test] + async fn append_inline_decodes_bitrot_without_checksums() { + let erasure = Erasure::new(1, 0, 1024); + let existing_plain = b"hello"; + let encoded = encode_inline(existing_plain, &erasure).await; + + let ctx = InlineAppendContext { + existing_inline: Some(&encoded), + existing_plain: None, + existing_size: existing_plain.len() as i64, + append_payload: b" world", + erasure: &erasure, + hash_algorithm: HashAlgorithm::HighwayHash256, + has_checksums: false, + }; + + let result = append_inline_data(ctx).await.expect("inline append to succeed"); + assert_eq!(result.total_size, 11); + assert_eq!(result.etag, md5_hex(b"hello world")); + + let decoded = decode_inline(&result.inline_data, result.total_size as usize, &erasure).await; + assert_eq!(decoded, b"hello world"); + } +} diff --git a/crates/ecstore/src/set_disk.rs b/crates/ecstore/src/set_disk.rs index 3e238141..572a8183 100644 --- a/crates/ecstore/src/set_disk.rs +++ b/crates/ecstore/src/set_disk.rs @@ -49,6 +49,9 @@ use crate::{ event::name::EventName, event_notification::{EventArgs, send_event}, global::{GLOBAL_LOCAL_DISK_MAP, GLOBAL_LOCAL_DISK_SET_DRIVES, get_global_deployment_id, is_dist_erasure}, + object_append::{ + InlineAppendContext, append_inline_data, decode_inline_payload, validate_append_position, validate_append_preconditions, + }, store_api::{ BucketInfo, BucketOptions, CompletePart, DeleteBucketOptions, DeletedObject, GetObjectReader, HTTPRangeSpec, ListMultipartsInfo, ListObjectsV2Info, MakeBucketOptions, MultipartInfo, MultipartUploadResult, ObjectIO, ObjectInfo, @@ -67,15 +70,14 @@ use rand::{Rng, seq::SliceRandom}; use regex::Regex; use rustfs_common::heal_channel::{DriveState, HealChannelPriority, HealItemType, HealOpts, HealScanMode, send_heal_disk}; use rustfs_filemeta::{ - FileInfo, FileMeta, FileMetaShallowVersion, MetaCacheEntries, MetaCacheEntry, MetadataResolutionParams, ObjectPartInfo, - RawFileInfo, ReplicationStatusType, VersionPurgeStatusType, file_info_from_raw, merge_file_meta_versions, + AppendSegment, AppendState, AppendStateKind, FileInfo, FileMeta, FileMetaShallowVersion, MetaCacheEntries, MetaCacheEntry, + MetadataResolutionParams, ObjectPartInfo, RawFileInfo, ReplicationStatusType, VersionPurgeStatusType, clear_append_state, + file_info_from_raw, get_append_state, merge_file_meta_versions, set_append_state, validate_new_segment, }; use rustfs_lock::fast_lock::types::LockResult; use rustfs_madmin::heal_commands::{HealDriveInfo, HealResultItem}; use rustfs_rio::{EtagResolvable, HashReader, TryGetIndex as _, WarpReader}; -use rustfs_utils::http::headers::AMZ_OBJECT_TAGGING; -use rustfs_utils::http::headers::AMZ_STORAGE_CLASS; -use rustfs_utils::http::headers::RESERVED_METADATA_PREFIX_LOWER; +use rustfs_utils::http::headers::{AMZ_OBJECT_TAGGING, AMZ_STORAGE_CLASS, RESERVED_METADATA_PREFIX_LOWER}; use rustfs_utils::{ HashAlgorithm, crypto::{base64_decode, base64_encode, hex}, @@ -91,12 +93,14 @@ use std::{ collections::{HashMap, HashSet}, io::{Cursor, Write}, path::Path, + pin::Pin, sync::Arc, + task::{Context, Poll}, time::Duration, }; use time::OffsetDateTime; use tokio::{ - io::AsyncWrite, + io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}, sync::{RwLock, broadcast}, }; use tokio::{ @@ -811,6 +815,52 @@ impl SetDisks { Ok(disks) } + async fn rename_part_data( + disks: &[Option], + src_bucket: &str, + src_object: &str, + dst_bucket: &str, + dst_object: &str, + write_quorum: usize, + ) -> disk::error::Result>> { + let src_bucket = Arc::new(src_bucket.to_string()); + let src_object = Arc::new(src_object.to_string()); + let dst_bucket = Arc::new(dst_bucket.to_string()); + let dst_object = Arc::new(dst_object.to_string()); + + let mut errs = Vec::with_capacity(disks.len()); + + let futures = disks.iter().map(|disk| { + let disk = disk.clone(); + let src_bucket = src_bucket.clone(); + let src_object = src_object.clone(); + let dst_bucket = dst_bucket.clone(); + let dst_object = dst_object.clone(); + tokio::spawn(async move { + if let Some(disk) = disk { + disk.rename_file(&src_bucket, &src_object, &dst_bucket, &dst_object).await + } else { + Err(DiskError::DiskNotFound) + } + }) + }); + + let results = join_all(futures).await; + for result in results { + match result? { + Ok(_) => errs.push(None), + Err(err) => errs.push(Some(err)), + } + } + + if let Some(err) = reduce_write_quorum_errs(&errs, OBJECT_OP_IGNORED_ERRS, write_quorum) { + warn!("rename_part_data errs {:?}", &errs); + return Err(err); + } + + Ok(Self::eval_disks(disks, &errs)) + } + fn eval_disks(disks: &[Option], errs: &[Option]) -> Vec> { if disks.len() != errs.len() { return Vec::new(); @@ -2164,27 +2214,97 @@ impl SetDisks { tracing::debug!(bucket, object, requested_length = length, offset, "get_object_with_fileinfo start"); let (disks, files) = Self::shuffle_disks_and_parts_metadata_by_index(disks, &files, &fi); - let total_size = fi.size as usize; + // Check for pending segments in append state + let append_state = fi.get_append_state(); + let has_pending_segments = !append_state.pending_segments.is_empty(); - let length = if length < 0 { - fi.size as usize - offset - } else { - length as usize - }; + // Calculate total size including pending segments + let base_size = append_state.committed_length; + let pending_size: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); + let total_logical_size = base_size + pending_size; + + tracing::debug!( + bucket, + object, + base_size, + pending_size, + total_logical_size, + has_pending_segments, + pending_segments_count = append_state.pending_segments.len(), + "Append-aware object read" + ); + + let total_size = total_logical_size as usize; + + let length = if length < 0 { total_size - offset } else { length as usize }; if offset > total_size || offset + length > total_size { error!("get_object_with_fileinfo offset out of range: {}, total_size: {}", offset, total_size); return Err(Error::other("offset out of range")); } - let (part_index, mut part_offset) = fi.to_part_offset(offset)?; + let erasure = Arc::new(erasure_coding::Erasure::new( + fi.erasure.data_blocks, + fi.erasure.parity_blocks, + fi.erasure.block_size, + )); - let mut end_offset = offset; - if length > 0 { - end_offset += length - 1 + if fi.inline_data() { + let inline = fi + .data + .as_ref() + .ok_or_else(|| Error::other("inline payload missing for read"))?; + let (plain, _) = decode_inline_payload(inline, fi.size as usize, &erasure, HashAlgorithm::HighwayHash256) + .await + .map_err(|err| Error::other(format!("failed to decode inline data: {err}")))?; + + let end = offset + length; + if end > plain.len() { + return Err(Error::other("inline payload shorter than expected")); + } + + if length > 0 { + writer + .write_all(&plain[offset..end]) + .await + .map_err(|e| Error::other(format!("failed to stream inline payload: {e}")))?; + } + + return Ok(()); } - let (last_part_index, last_part_relative_offset) = fi.to_part_offset(end_offset)?; + // For regular parts reading, limit to base size (committed data) + let effective_read_end = (offset + length).min(base_size as usize); + let base_read_length = effective_read_end.saturating_sub(offset); + + if base_read_length == 0 { + // Reading entirely from pending segments + tracing::debug!( + bucket, + object, + offset, + length, + base_size, + "Read entirely from pending segments, skipping regular parts" + ); + } + + let (part_index, mut part_offset) = if base_read_length > 0 { + fi.to_part_offset(offset)? + } else { + (0, 0) // Placeholder, won't be used + }; + + let mut end_offset = offset; + if base_read_length > 0 { + end_offset += base_read_length - 1 + } + + let (last_part_index, last_part_relative_offset) = if base_read_length > 0 { + fi.to_part_offset(end_offset)? + } else { + (0, 0) // Placeholder, won't be used + }; tracing::debug!( bucket, @@ -2198,144 +2318,229 @@ impl SetDisks { "Multipart read bounds" ); - let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); - let part_indices: Vec = (part_index..=last_part_index).collect(); tracing::debug!(bucket, object, ?part_indices, "Multipart part indices to stream"); let mut total_read = 0; - for current_part in part_indices { - if total_read == length { + + // Only read from regular parts if there's base data to read + if base_read_length > 0 { + for current_part in part_indices { + if total_read == base_read_length { + tracing::debug!( + bucket, + object, + total_read, + base_read_length, + part_index = current_part, + "Stopping multipart stream - reached base data limit" + ); + break; + } + + if total_read >= base_read_length { + break; + } + + let part_number = fi.parts[current_part].number; + let part_size = fi.parts[current_part].size; + let mut part_length = part_size - part_offset; + if part_length > (base_read_length - total_read) { + part_length = base_read_length - total_read + } + + let till_offset = erasure.shard_file_offset(part_offset, part_length, part_size); + + let read_offset = (part_offset / erasure.block_size) * erasure.shard_size(); + tracing::debug!( bucket, object, - total_read, - requested_length = length, part_index = current_part, - "Stopping multipart stream early because accumulated bytes match request" - ); - break; - } - - let part_number = fi.parts[current_part].number; - let part_size = fi.parts[current_part].size; - let mut part_length = part_size - part_offset; - if part_length > (length - total_read) { - part_length = length - total_read - } - - let till_offset = erasure.shard_file_offset(part_offset, part_length, part_size); - - let read_offset = (part_offset / erasure.block_size) * erasure.shard_size(); - - tracing::debug!( - bucket, - object, - part_index = current_part, - part_number, - part_offset, - part_size, - part_length, - read_offset, - till_offset, - total_read_before = total_read, - requested_length = length, - "Streaming multipart part" - ); - - let mut readers = Vec::with_capacity(disks.len()); - let mut errors = Vec::with_capacity(disks.len()); - for (idx, disk_op) in disks.iter().enumerate() { - match create_bitrot_reader( - files[idx].data.as_deref(), - disk_op.as_ref(), - bucket, - &format!("{}/{}/part.{}", object, files[idx].data_dir.unwrap_or_default(), part_number), + part_number, + part_offset, + part_size, + part_length, read_offset, till_offset, - erasure.shard_size(), - HashAlgorithm::HighwayHash256, - ) - .await - { - Ok(Some(reader)) => { - readers.push(Some(reader)); - errors.push(None); - } - Ok(None) => { - readers.push(None); - errors.push(Some(DiskError::DiskNotFound)); - } - Err(e) => { - readers.push(None); - errors.push(Some(e)); - } - } - } + total_read_before = total_read, + requested_length = length, + "Streaming multipart part" + ); - let nil_count = errors.iter().filter(|&e| e.is_none()).count(); - if nil_count < erasure.data_shards { - if let Some(read_err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, erasure.data_shards) { - error!("create_bitrot_reader reduce_read_quorum_errs {:?}", &errors); - return Err(to_object_err(read_err.into(), vec![bucket, object])); - } - error!("create_bitrot_reader not enough disks to read: {:?}", &errors); - return Err(Error::other(format!("not enough disks to read: {errors:?}"))); - } + let mut readers = Vec::with_capacity(disks.len()); + let mut errors = Vec::with_capacity(disks.len()); + for (idx, disk_op) in disks.iter().enumerate() { + let checksum_algo = if fi.erasure.checksums.is_empty() { + HashAlgorithm::HighwayHash256 + } else { + fi.erasure.get_checksum_info(part_number).algorithm + }; - // debug!( - // "read part {} part_offset {},part_length {},part_size {} ", - // part_number, part_offset, part_length, part_size - // ); - let (written, err) = erasure.decode(writer, readers, part_offset, part_length, part_size).await; - tracing::debug!( - bucket, - object, - part_index = current_part, - part_number, - part_length, - bytes_written = written, - "Finished decoding multipart part" - ); - if let Some(e) = err { - let de_err: DiskError = e.into(); - let mut has_err = true; - if written == part_length { - match de_err { - DiskError::FileNotFound | DiskError::FileCorrupt => { - error!("erasure.decode err 111 {:?}", &de_err); - let _ = rustfs_common::heal_channel::send_heal_request( - rustfs_common::heal_channel::create_heal_request_with_options( - bucket.to_string(), - Some(object.to_string()), - false, - Some(HealChannelPriority::Normal), - Some(pool_index), - Some(set_index), - ), - ) - .await; - has_err = false; + let use_inline = matches!(append_state.state, AppendStateKind::Inline | AppendStateKind::InlinePendingSpill); + let inline_source = if use_inline { files[idx].data.as_deref() } else { None }; + + if let Some(inline) = inline_source { + info!(bucket, object, part_number, inline_len = inline.len(), "using inline data for shard read"); + } + + match create_bitrot_reader( + inline_source, + disk_op.as_ref(), + bucket, + &format!("{}/{}/part.{}", object, files[idx].data_dir.unwrap_or_default(), part_number), + read_offset, + till_offset, + erasure.shard_size(), + checksum_algo, + ) + .await + { + Ok(Some(reader)) => { + readers.push(Some(reader)); + errors.push(None); + } + Ok(None) => { + readers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + Err(e) => { + readers.push(None); + errors.push(Some(e)); } - _ => {} } } - if has_err { - error!("erasure.decode err {} {:?}", written, &de_err); - return Err(de_err.into()); + let nil_count = errors.iter().filter(|&e| e.is_none()).count(); + if nil_count < erasure.data_shards { + if let Some(read_err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, erasure.data_shards) { + error!("create_bitrot_reader reduce_read_quorum_errs {:?}", &errors); + return Err(to_object_err(read_err.into(), vec![bucket, object])); + } + error!("create_bitrot_reader not enough disks to read: {:?}", &errors); + return Err(Error::other(format!("not enough disks to read: {errors:?}"))); } + + // debug!( + // "read part {} part_offset {},part_length {},part_size {} ", + // part_number, part_offset, part_length, part_size + // ); + let (written, err) = erasure.decode(writer, readers, part_offset, part_length, part_size).await; + tracing::debug!( + bucket, + object, + part_index = current_part, + part_number, + part_length, + bytes_written = written, + "Finished decoding multipart part" + ); + if let Some(e) = err { + let de_err: DiskError = e.into(); + let mut has_err = true; + if written == part_length { + match de_err { + DiskError::FileNotFound | DiskError::FileCorrupt => { + error!("erasure.decode err 111 {:?}", &de_err); + let _ = rustfs_common::heal_channel::send_heal_request( + rustfs_common::heal_channel::create_heal_request_with_options( + bucket.to_string(), + Some(object.to_string()), + false, + Some(HealChannelPriority::Normal), + Some(pool_index), + Some(set_index), + ), + ) + .await; + has_err = false; + } + _ => {} + } + } + + if has_err { + error!("erasure.decode err {} {:?}", written, &de_err); + return Err(de_err.into()); + } + } + + // debug!("ec decode {} written size {}", part_number, n); + + total_read += part_length; + part_offset = 0; } - - // debug!("ec decode {} written size {}", part_number, n); - - total_read += part_length; - part_offset = 0; } // debug!("read end"); - tracing::debug!(bucket, object, total_read, expected_length = length, "Multipart read finished"); + // Handle pending segments if we haven't read enough data and there are pending segments + if has_pending_segments && offset + length > append_state.committed_length as usize { + tracing::debug!( + bucket, + object, + offset, + length, + base_size, + total_read, + pending_segments_count = append_state.pending_segments.len(), + "Reading from pending segments" + ); + + let read_start = offset; + let read_end = offset + length; + + for (seg_index, segment) in append_state.pending_segments.iter().enumerate() { + if total_read >= length { + break; + } + + let seg_start = segment.offset as usize; + let seg_end = (segment.offset + segment.length) as usize; + + if seg_end <= read_start { + continue; + } + if seg_start >= read_end { + break; + } + + tracing::debug!( + bucket, + object, + seg_index, + seg_start, + seg_end, + read_start, + read_end, + "Loading pending segment data" + ); + + let segment_data = + Self::load_pending_segment(bucket, object, erasure.clone(), &disks, segment, fi.erasure.data_blocks).await?; + + let slice_start = read_start.max(seg_start) - seg_start; + let slice_end = read_end.min(seg_end) - seg_start; + + if slice_end > slice_start { + writer + .write_all(&segment_data[slice_start..slice_end]) + .await + .map_err(|e| Error::other(format!("failed to stream pending segment: {e}")))?; + total_read += slice_end - slice_start; + } + } + } + + tracing::debug!( + bucket, + object, + total_read, + expected_length = length, + has_pending_segments, + final_total_read = total_read, + "Append-aware multipart read finished" + ); Ok(()) } @@ -3443,6 +3648,813 @@ impl SetDisks { Ok(()) } + #[tracing::instrument(skip(self, data, opts), fields(bucket, object))] + async fn append_inline_object( + &self, + bucket: &str, + object: &str, + data: &mut PutObjReader, + opts: &ObjectOptions, + ) -> Result { + let info_opts = ObjectOptions { + version_id: opts.version_id.clone(), + versioned: opts.versioned, + version_suspended: opts.version_suspended, + no_lock: true, + ..Default::default() + }; + + let (mut fi, mut parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; + + if fi.deleted { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + "cannot append to deleted object".to_string(), + )); + } + + let append_state_snapshot = fi.get_append_state(); + let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); + + validate_append_preconditions(bucket, object, &object_info)?; + + let position = opts.append_position.ok_or_else(|| { + StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + "x-amz-append-position header required".to_string(), + ) + })?; + + let base_size_snapshot: i64 = append_state_snapshot.committed_length; + let pending_length_snapshot: i64 = append_state_snapshot.pending_segments.iter().map(|seg| seg.length).sum(); + let expected_position_snapshot = base_size_snapshot.saturating_add(pending_length_snapshot); + if position != expected_position_snapshot { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + format!("append position mismatch: provided {}, expected {}", position, expected_position_snapshot), + )); + } + + let mut append_payload = Vec::new(); + tokio::io::AsyncReadExt::read_to_end(&mut data.stream, &mut append_payload) + .await + .map_err(|e| StorageError::other(format!("failed to read append payload: {e}")))?; + + data.stream = HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) + .map_err(StorageError::other)?; + + if !fi.inline_data() { + return self + .append_segmented_object(fi, parts_metadata, bucket, object, append_payload, opts, position) + .await; + } + + let existing_inline = fi.data.as_ref().map(|b| b.as_ref()); + let erasure = erasure_coding::Erasure::new(fi.erasure.data_blocks, fi.erasure.parity_blocks, fi.erasure.block_size); + let mut hash_algorithm = fi + .parts + .first() + .map(|part| fi.erasure.get_checksum_info(part.number).algorithm) + .unwrap_or(HashAlgorithm::HighwayHash256); + let mut existing_plain_override: Option> = None; + if fi.erasure.checksums.is_empty() { + hash_algorithm = HashAlgorithm::HighwayHash256; + } + + if let Some(inline) = existing_inline { + debug!( + existing_size = fi.size, + inline_len = inline.len(), + ?hash_algorithm, + erasure_checksums = ?fi.erasure.checksums, + inline_has_checksums = !fi.erasure.checksums.is_empty(), + "append inline metadata" + ); + + match decode_inline_payload(inline, fi.size as usize, &erasure, hash_algorithm.clone()).await { + Ok((plain, detected_algo)) => { + hash_algorithm = detected_algo; + existing_plain_override = Some(plain); + } + Err(err) => { + return Err(StorageError::other(format!("failed to decode inline data: {err}"))); + } + } + } + + let has_checksums = !fi.erasure.checksums.is_empty(); + + let append_ctx = InlineAppendContext { + existing_inline, + existing_plain: existing_plain_override.as_deref(), + existing_size: fi.size, + append_payload: &append_payload, + erasure: &erasure, + hash_algorithm: hash_algorithm.clone(), + has_checksums, + }; + + let append_result = append_inline_data(append_ctx).await?; + + // Check if we need to spill to segmented mode after append + let total_shard_size = erasure.shard_file_size(append_result.total_size); + let should_remain_inline = if let Some(sc) = GLOBAL_STORAGE_CLASS.get() { + sc.should_inline(total_shard_size, opts.versioned) + } else { + true // fallback to inline if no storage class config + }; + + if !should_remain_inline { + let mut plain_total = if let Some(existing_plain) = existing_plain_override.clone() { + existing_plain + } else if let Some(inline) = existing_inline { + let (plain, _) = decode_inline_payload(inline, fi.size as usize, &erasure, hash_algorithm.clone()) + .await + .map_err(|err| StorageError::other(format!("failed to decode inline data: {err}")))?; + plain + } else { + Vec::new() + }; + plain_total.extend_from_slice(&append_payload); + info!( + bucket, + object, + total_size = append_result.total_size, + existing_inline_len = fi.size, + plain_total_len = plain_total.len(), + append_payload_len = append_payload.len(), + shard_size = total_shard_size, + "Inline object exceeds threshold, spilling to segmented storage" + ); + + return self + .spill_inline_into_segmented( + fi, + parts_metadata, + bucket, + object, + plain_total, + append_result.etag.clone(), + opts, + AppendStateKind::SegmentedActive, + ) + .await; + } + + let inline_bytes = Bytes::from(append_result.inline_data.clone()); + let now = OffsetDateTime::now_utc(); + + fi.mod_time = Some(now); + fi.size = append_result.total_size; + fi.data = Some(inline_bytes.clone()); + fi.metadata.insert("etag".to_owned(), append_result.etag.clone()); + fi.set_inline_data(); + fi.metadata.insert( + format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), + append_result.total_size.to_string(), + ); + fi.metadata + .insert("x-rustfs-encryption-original-size".to_string(), append_result.total_size.to_string()); + if !has_checksums { + fi.erasure.checksums.clear(); + } + + let mut append_state = match get_append_state(&fi.metadata) { + Ok(Some(state)) => state, + Ok(None) => AppendState::default(), + Err(err) => { + return Err(StorageError::other(format!("failed to decode append state: {err}"))); + } + }; + append_state.state = AppendStateKind::Inline; + append_state.committed_length = append_result.total_size; + append_state.pending_segments.clear(); + append_state.epoch = append_state.epoch.saturating_add(1); + + set_append_state(&mut fi.metadata, &append_state) + .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; + fi.parts.clear(); + fi.add_object_part( + 1, + append_result.etag.clone(), + append_result.total_size as usize, + Some(now), + append_result.total_size, + None, + ); + + for meta in parts_metadata.iter_mut() { + if !meta.is_valid() { + continue; + } + meta.mod_time = fi.mod_time; + meta.size = fi.size; + meta.metadata = fi.metadata.clone(); + meta.parts = fi.parts.clone(); + meta.data = Some(inline_bytes.clone()); + meta.set_inline_data(); + if !has_checksums { + meta.erasure.checksums.clear(); + } + } + + let write_quorum = fi.write_quorum(self.default_write_quorum()); + + Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; + + object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); + object_info.etag = Some(append_result.etag.clone()); + + Ok(object_info) + } + + #[allow(clippy::too_many_arguments)] + async fn spill_inline_into_segmented( + &self, + mut fi: FileInfo, + mut parts_metadata: Vec, + bucket: &str, + object: &str, + plain_data: Vec, + etag: String, + opts: &ObjectOptions, + target_state: AppendStateKind, + ) -> Result { + let write_quorum = fi.write_quorum(self.default_write_quorum()); + let disks_guard = self.disks.read().await; + let shuffle_disks = Self::shuffle_disks(&disks_guard, &fi.erasure.distribution); + + let mut data_reader = PutObjReader::from_vec(plain_data.clone()); + let erasure = Arc::new(erasure_coding::Erasure::new( + fi.erasure.data_blocks, + fi.erasure.parity_blocks, + fi.erasure.block_size, + )); + + let mut append_state = match get_append_state(&fi.metadata) { + Ok(Some(state)) => state, + Ok(None) => fi.get_append_state(), + Err(err) => { + warn!( + ?err, + bucket, object, "failed to decode append state from metadata, falling back to inferred state" + ); + fi.get_append_state() + } + }; + + let tmp_root = format!("{}x{}", Uuid::new_v4(), OffsetDateTime::now_utc().unix_timestamp()); + let data_dir = Uuid::new_v4(); + let tmp_part_path = format!("{tmp_root}/{data_dir}/part.1"); + let final_part_path = format!("{}/{}/part.1", object, data_dir); + + let mut writers = Vec::with_capacity(shuffle_disks.len()); + let mut errors = Vec::with_capacity(shuffle_disks.len()); + for disk in shuffle_disks.iter() { + if let Some(disk) = disk { + let writer = create_bitrot_writer( + false, + Some(disk), + RUSTFS_META_TMP_BUCKET, + &tmp_part_path, + erasure.shard_file_size(data_reader.size()), + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await?; + writers.push(Some(writer)); + errors.push(None); + } else { + writers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + } + + let healthy_writers = errors.iter().filter(|err| err.is_none()).count(); + if healthy_writers < write_quorum { + if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) { + return Err(write_err.into()); + } + return Err(StorageError::other("not enough disks for spill")); + } + + let stream = mem::replace( + &mut data_reader.stream, + HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) + .map_err(StorageError::other)?, + ); + + let (reader, written_size) = erasure + .clone() + .encode(stream, &mut writers, write_quorum) + .await + .map_err(StorageError::other)?; + + let _ = mem::replace(&mut data_reader.stream, reader); + + if (written_size as i64) < data_reader.size() { + return Err(StorageError::other("spill write truncated payload")); + } + + drop(writers); + + let rename_result = Self::rename_part_data( + &shuffle_disks, + RUSTFS_META_TMP_BUCKET, + &tmp_part_path, + bucket, + &final_part_path, + write_quorum, + ) + .await; + + let cleanup_result = self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_root).await; + + let online_disks = match rename_result { + Ok(disks) => { + cleanup_result?; + disks + } + Err(err) => { + if let Err(clean_err) = cleanup_result { + warn!("spill cleanup failed after rename error: {clean_err:?}"); + } + return Err(err.into()); + } + }; + + let now = OffsetDateTime::now_utc(); + fi.mod_time = Some(now); + fi.size = plain_data.len() as i64; + fi.data = None; + fi.data_dir = Some(data_dir); + fi.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); + fi.metadata.insert("etag".to_owned(), etag.clone()); + fi.metadata + .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), fi.size.to_string()); + fi.metadata + .insert("x-rustfs-encryption-original-size".to_string(), fi.size.to_string()); + + fi.parts.clear(); + fi.add_object_part(1, etag.clone(), plain_data.len(), Some(now), fi.size, None); + + append_state.state = target_state; + append_state.committed_length = fi.size; + append_state.pending_segments.clear(); + append_state.epoch = append_state.epoch.saturating_add(1); + set_append_state(&mut fi.metadata, &append_state) + .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; + + for meta in parts_metadata.iter_mut() { + if !meta.is_valid() { + continue; + } + meta.mod_time = fi.mod_time; + meta.size = fi.size; + meta.data = None; + meta.data_dir = Some(data_dir); + meta.metadata = fi.metadata.clone(); + meta.parts = fi.parts.clone(); + meta.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); + } + + Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; + + let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); + object_info.etag = Some(etag); + + Ok(object_info) + } + + async fn load_pending_segment( + bucket: &str, + object: &str, + erasure: Arc, + disks: &[Option], + segment: &AppendSegment, + read_quorum: usize, + ) -> Result> { + let segment_dir = segment + .data_dir + .ok_or_else(|| StorageError::other("append segment missing data directory"))?; + let segment_path = format!("{}/append/{}/{segment_dir}/part.1", object, segment.epoch); + + let mut readers = Vec::with_capacity(disks.len()); + let mut errors = Vec::with_capacity(disks.len()); + for disk in disks.iter() { + if let Some(disk) = disk { + match create_bitrot_reader( + None, + Some(disk), + bucket, + &segment_path, + 0, + segment.length as usize, + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await + { + Ok(Some(reader)) => { + readers.push(Some(reader)); + errors.push(None); + } + Ok(None) => { + readers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + Err(err) => { + readers.push(None); + errors.push(Some(err)); + } + } + } else { + readers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + } + + if let Some(err) = reduce_read_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, read_quorum) { + return Err(err.into()); + } + + let mut writer = VecAsyncWriter::with_capacity(segment.length as usize); + let (written, err) = erasure + .decode(&mut writer, readers, 0, segment.length as usize, segment.length as usize) + .await; + if let Some(e) = err { + let de: DiskError = e.into(); + return Err(de.into()); + } + + if written < segment.length as usize { + return Err(StorageError::other("pending segment read truncated")); + } + + Ok(writer.into_inner()) + } + + async fn remove_pending_segments_data(&self, bucket: &str, object: &str, segments: &[AppendSegment]) -> Result<()> { + for segment in segments { + if let Some(dir) = segment.data_dir { + let prefix = format!("{}/append/{}/{}", object, segment.epoch, dir); + self.delete_all(bucket, &prefix).await?; + } + } + Ok(()) + } + + async fn complete_append_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let mut info_opts = opts.clone(); + info_opts.no_lock = true; + + let (fi, parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; + + let append_state = fi.get_append_state(); + if append_state.pending_segments.is_empty() { + return Err(StorageError::other("no pending segments to complete")); + } + + let pending_size: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); + let total_logical = append_state.committed_length.saturating_add(pending_size); + + let mut writer = VecAsyncWriter::with_capacity(total_logical as usize); + Self::get_object_with_fileinfo( + bucket, + object, + 0, + total_logical, + &mut writer, + fi.clone(), + parts_metadata.clone(), + &online_disks, + self.set_index, + self.pool_index, + ) + .await?; + + let plain_data = writer.into_inner(); + let final_etag = format!("{:x}", Md5::digest(&plain_data)); + + let result_info = self + .spill_inline_into_segmented( + fi, + parts_metadata, + bucket, + object, + plain_data, + final_etag, + opts, + AppendStateKind::SegmentedSealed, + ) + .await?; + + self.remove_pending_segments_data(bucket, object, &append_state.pending_segments) + .await?; + + Ok(result_info) + } + + async fn abort_append_object(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let mut info_opts = opts.clone(); + info_opts.no_lock = true; + + let (mut fi, mut parts_metadata, online_disks) = self.get_object_fileinfo(bucket, object, &info_opts, true).await?; + + let mut append_state = fi.get_append_state(); + if append_state.pending_segments.is_empty() { + return Ok(ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended)); + } + + self.remove_pending_segments_data(bucket, object, &append_state.pending_segments) + .await?; + + append_state.pending_segments.clear(); + append_state.state = AppendStateKind::SegmentedSealed; + append_state.epoch = append_state.epoch.saturating_add(1); + + let mut committed_length = append_state.committed_length; + let actual_committed = if fi.inline_data() { + fi.data.as_ref().map(|buf| buf.len() as i64).unwrap_or(committed_length) + } else if fi.parts.is_empty() { + fi.size + } else { + fi.parts.iter().map(|part| part.size as i64).sum() + }; + + if actual_committed != committed_length { + warn!( + bucket, + object, + recorded_length = committed_length, + actual_length = actual_committed, + "abort append detected committed length mismatch, correcting" + ); + committed_length = actual_committed; + append_state.committed_length = actual_committed; + } + + fi.mod_time = Some(OffsetDateTime::now_utc()); + fi.size = committed_length; + fi.data = None; + fi.metadata.remove(&format!("{RESERVED_METADATA_PREFIX_LOWER}inline-data")); + + set_append_state(&mut fi.metadata, &append_state) + .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; + + let complete_parts: Vec = fi + .parts + .iter() + .map(|part| CompletePart { + part_num: part.number, + etag: Some(part.etag.clone()), + }) + .collect(); + let base_etag = if complete_parts.is_empty() { + fi.metadata.get("etag").cloned().unwrap_or_default() + } else { + get_complete_multipart_md5(&complete_parts) + }; + + fi.metadata.insert("etag".to_owned(), base_etag.clone()); + fi.metadata + .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), committed_length.to_string()); + fi.metadata + .insert("x-rustfs-encryption-original-size".to_string(), committed_length.to_string()); + + for meta in parts_metadata.iter_mut() { + if !meta.is_valid() { + continue; + } + meta.mod_time = fi.mod_time; + meta.size = fi.size; + meta.metadata = fi.metadata.clone(); + meta.parts = fi.parts.clone(); + meta.data = None; + meta.data_dir = fi.data_dir; + } + + let write_quorum = fi.write_quorum(self.default_write_quorum()); + Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; + + let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); + object_info.etag = Some(base_etag); + + Ok(object_info) + } + + #[allow(clippy::too_many_arguments)] + async fn append_segmented_object( + &self, + mut fi: FileInfo, + mut parts_metadata: Vec, + bucket: &str, + object: &str, + append_payload: Vec, + opts: &ObjectOptions, + position: i64, + ) -> Result { + let data_dir = fi + .data_dir + .ok_or_else(|| StorageError::other(format!("append requires existing data directory for {bucket}/{object}")))?; + + let mut append_state = match get_append_state(&fi.metadata) { + Ok(Some(state)) => state, + Ok(None) => fi.get_append_state(), + Err(err) => { + warn!(?err, bucket, object, "failed to decode append state from metadata, using inferred state"); + fi.get_append_state() + } + }; + + if matches!(append_state.state, AppendStateKind::Inline | AppendStateKind::InlinePendingSpill) { + return Err(StorageError::other("segmented append invoked while object still inline")); + } + + if append_state.state == AppendStateKind::SegmentedSealed { + append_state.state = AppendStateKind::SegmentedActive; + } + + let pending_length: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); + let expected_offset = append_state.committed_length.saturating_add(pending_length); + if position != expected_offset { + return Err(StorageError::InvalidArgument( + bucket.to_string(), + object.to_string(), + format!("append position mismatch: provided {position}, expected {expected_offset}"), + )); + } + + let new_length = append_payload.len() as i64; + validate_new_segment(&append_state, position, new_length) + .map_err(|err| StorageError::other(format!("invalid append segment: {err}")))?; + + let write_quorum = fi.write_quorum(self.default_write_quorum()); + let disks_guard = self.disks.read().await; + let shuffle_disks = Self::shuffle_disks(&disks_guard, &fi.erasure.distribution); + + let mut append_reader = PutObjReader::from_vec(append_payload); + let erasure = Arc::new(erasure_coding::Erasure::new( + fi.erasure.data_blocks, + fi.erasure.parity_blocks, + fi.erasure.block_size, + )); + + let tmp_root = format!("{}x{}", Uuid::new_v4(), OffsetDateTime::now_utc().unix_timestamp()); + let segment_id = Uuid::new_v4(); + let new_epoch = append_state.epoch.saturating_add(1); + let tmp_part_path = format!("{tmp_root}/append/{new_epoch}/{segment_id}/part.1"); + let final_part_path = format!("{}/append/{new_epoch}/{segment_id}/part.1", object); + + let mut writers = Vec::with_capacity(shuffle_disks.len()); + let mut errors = Vec::with_capacity(shuffle_disks.len()); + for disk in shuffle_disks.iter() { + if let Some(disk) = disk { + let writer = create_bitrot_writer( + false, + Some(disk), + RUSTFS_META_TMP_BUCKET, + &tmp_part_path, + erasure.shard_file_size(append_reader.size()), + erasure.shard_size(), + HashAlgorithm::HighwayHash256, + ) + .await?; + writers.push(Some(writer)); + errors.push(None); + } else { + writers.push(None); + errors.push(Some(DiskError::DiskNotFound)); + } + } + + let healthy_writers = errors.iter().filter(|err| err.is_none()).count(); + if healthy_writers < write_quorum { + if let Some(write_err) = reduce_write_quorum_errs(&errors, OBJECT_OP_IGNORED_ERRS, write_quorum) { + return Err(write_err.into()); + } + return Err(StorageError::other("not enough disks for append")); + } + + let stream = mem::replace( + &mut append_reader.stream, + HashReader::new(Box::new(WarpReader::new(Cursor::new(Vec::new()))), 0, 0, None, false) + .map_err(StorageError::other)?, + ); + + let (reader, written_size) = erasure + .clone() + .encode(stream, &mut writers, write_quorum) + .await + .map_err(StorageError::other)?; + + let _ = mem::replace(&mut append_reader.stream, reader); + + if (written_size as i64) < append_reader.size() { + return Err(StorageError::other("append write truncated payload")); + } + + let mut part_etag = append_reader.stream.try_resolve_etag().unwrap_or_default(); + if let Some(ref tag) = opts.preserve_etag { + part_etag = tag.clone(); + } + + drop(writers); + + let rename_result = Self::rename_part_data( + &shuffle_disks, + RUSTFS_META_TMP_BUCKET, + &tmp_part_path, + bucket, + &final_part_path, + write_quorum, + ) + .await; + + let cleanup_result = self.delete_all(RUSTFS_META_TMP_BUCKET, &tmp_root).await; + + let online_disks = match rename_result { + Ok(disks) => { + cleanup_result?; + disks + } + Err(err) => { + if let Err(clean_err) = cleanup_result { + warn!("append cleanup failed after rename error: {clean_err:?}"); + } + return Err(err.into()); + } + }; + + let segment = AppendSegment { + offset: position, + length: new_length, + data_dir: Some(segment_id), + etag: Some(part_etag.clone()), + epoch: new_epoch, + }; + + append_state.pending_segments.push(segment); + append_state.epoch = new_epoch; + append_state.state = AppendStateKind::SegmentedActive; + + let logical_size = append_state + .committed_length + .saturating_add(append_state.pending_segments.iter().map(|seg| seg.length).sum()); + fi.size = logical_size; + fi.mod_time = Some(OffsetDateTime::now_utc()); + + // Update etag to include pending segments + let mut aggregate_parts: Vec = fi + .parts + .iter() + .map(|part| CompletePart { + part_num: part.number, + etag: Some(part.etag.clone()), + }) + .collect(); + let mut next_part_number = aggregate_parts.last().map(|p| p.part_num).unwrap_or(0); + for pending in append_state.pending_segments.iter() { + next_part_number += 1; + aggregate_parts.push(CompletePart { + part_num: next_part_number, + etag: pending.etag.clone(), + }); + } + let aggregate_etag = get_complete_multipart_md5(&aggregate_parts); + + fi.metadata.insert("etag".to_owned(), aggregate_etag.clone()); + fi.metadata + .insert(format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size"), logical_size.to_string()); + fi.metadata + .insert("x-rustfs-encryption-original-size".to_string(), logical_size.to_string()); + + set_append_state(&mut fi.metadata, &append_state) + .map_err(|err| StorageError::other(format!("failed to persist append state: {err}")))?; + + for meta in parts_metadata.iter_mut() { + if !meta.is_valid() { + continue; + } + meta.mod_time = fi.mod_time; + meta.size = fi.size; + meta.metadata = fi.metadata.clone(); + meta.parts = fi.parts.clone(); + meta.data = None; + meta.versioned = opts.versioned || opts.version_suspended; + } + + Self::write_unique_file_info(&online_disks, "", bucket, object, &parts_metadata, write_quorum).await?; + + let mut object_info = ObjectInfo::from_file_info(&fi, bucket, object, opts.versioned || opts.version_suspended); + object_info.etag = Some(aggregate_etag); + + Ok(object_info) + } + async fn check_write_precondition(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Option { let mut opts = opts.clone(); @@ -3594,6 +4606,16 @@ impl ObjectIO for SetDisks { None }; + if opts.append_object { + if opts.http_preconditions.clone().is_some() { + if let Some(err) = self.check_write_precondition(bucket, object, opts).await { + return Err(err); + } + } + + return self.append_inline_object(bucket, object, data, opts).await; + } + if let Some(http_preconditions) = opts.http_preconditions.clone() { if let Some(err) = self.check_write_precondition(bucket, object, opts).await { return Err(err); @@ -3820,6 +4842,37 @@ impl ObjectIO for SetDisks { } } +struct VecAsyncWriter { + buffer: Vec, +} + +impl VecAsyncWriter { + fn with_capacity(capacity: usize) -> Self { + Self { + buffer: Vec::with_capacity(capacity), + } + } + + fn into_inner(self) -> Vec { + self.buffer + } +} + +impl AsyncWrite for VecAsyncWriter { + fn poll_write(mut self: Pin<&mut Self>, _cx: &mut Context<'_>, buf: &[u8]) -> Poll> { + self.buffer.extend_from_slice(buf); + Poll::Ready(Ok(buf.len())) + } + + fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } +} + #[async_trait::async_trait] impl StorageAPI for SetDisks { #[tracing::instrument(skip(self))] @@ -4234,6 +5287,42 @@ impl StorageAPI for SetDisks { (del_objects, del_errs) } + async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let mut info_opts = opts.clone(); + info_opts.no_lock = true; + + let _object_lock_guard = if !opts.no_lock { + Some( + self.fast_lock_manager + .acquire_write_lock("", object, self.locker_owner.as_str()) + .await + .map_err(|_| Error::other("can not get lock. please retry".to_string()))?, + ) + } else { + None + }; + + self.complete_append_object(bucket, object, &info_opts).await + } + + async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let mut info_opts = opts.clone(); + info_opts.no_lock = true; + + let _object_lock_guard = if !opts.no_lock { + Some( + self.fast_lock_manager + .acquire_write_lock("", object, self.locker_owner.as_str()) + .await + .map_err(|_| Error::other("can not get lock. please retry".to_string()))?, + ) + } else { + None + }; + + self.abort_append_object(bucket, object, &info_opts).await + } + #[tracing::instrument(skip(self))] async fn delete_object(&self, bucket: &str, object: &str, mut opts: ObjectOptions) -> Result { // Guard lock for single object delete diff --git a/crates/ecstore/src/sets.rs b/crates/ecstore/src/sets.rs index 02a95179..26ca4006 100644 --- a/crates/ecstore/src/sets.rs +++ b/crates/ecstore/src/sets.rs @@ -602,6 +602,14 @@ impl StorageAPI for Sets { (del_objects, del_errs) } + async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + self.get_disks_by_key(object).complete_append(bucket, object, opts).await + } + + async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + self.get_disks_by_key(object).abort_append(bucket, object, opts).await + } + async fn list_object_parts( &self, bucket: &str, diff --git a/crates/ecstore/src/store.rs b/crates/ecstore/src/store.rs index 41a15b2a..0b725172 100644 --- a/crates/ecstore/src/store.rs +++ b/crates/ecstore/src/store.rs @@ -1709,6 +1709,17 @@ impl StorageAPI for ECStore { // Ok((del_objects, del_errs)) } + async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let object = encode_dir_object(object); + let (pinfo, _) = self.internal_get_pool_info_existing_with_opts(bucket, &object, opts).await?; + self.pools[pinfo.index].complete_append(bucket, &object, opts).await + } + + async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result { + let object = encode_dir_object(object); + let (pinfo, _) = self.internal_get_pool_info_existing_with_opts(bucket, &object, opts).await?; + self.pools[pinfo.index].abort_append(bucket, &object, opts).await + } #[tracing::instrument(skip(self))] async fn list_object_parts( &self, diff --git a/crates/ecstore/src/store_api.rs b/crates/ecstore/src/store_api.rs index ce1a3cce..13a5b9da 100644 --- a/crates/ecstore/src/store_api.rs +++ b/crates/ecstore/src/store_api.rs @@ -328,6 +328,8 @@ pub struct ObjectOptions { pub max_parity: bool, pub mod_time: Option, pub part_number: Option, + pub append_object: bool, + pub append_position: Option, pub delete_prefix: bool, pub delete_prefix_object: bool, @@ -656,6 +658,15 @@ impl ObjectInfo { }) .collect(); + let append_state = fi.get_append_state(); + let pending_length: i64 = append_state.pending_segments.iter().map(|seg| seg.length).sum(); + let logical_size = append_state.committed_length.saturating_add(pending_length); + let actual_size_meta = fi + .metadata + .get(&format!("{RESERVED_METADATA_PREFIX_LOWER}actual-size")) + .and_then(|o| o.parse::().ok()) + .unwrap_or(logical_size); + ObjectInfo { bucket: bucket.to_string(), name, @@ -665,7 +676,7 @@ impl ObjectInfo { version_id, delete_marker: fi.deleted, mod_time: fi.mod_time, - size: fi.size, + size: logical_size, parts, is_latest: fi.is_latest, user_tags, @@ -677,6 +688,7 @@ impl ObjectInfo { inlined, user_defined: metadata, transitioned_object, + actual_size: actual_size_meta, ..Default::default() } } @@ -1188,6 +1200,10 @@ pub trait StorageAPI: ObjectIO + Debug { opts: ObjectOptions, ) -> (Vec, Vec>); + async fn complete_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result; + + async fn abort_append(&self, bucket: &str, object: &str, opts: &ObjectOptions) -> Result; + // TransitionObject TODO: // RestoreTransitionedObject TODO: diff --git a/crates/filemeta/Cargo.toml b/crates/filemeta/Cargo.toml index 5c7a3589..a8e21d20 100644 --- a/crates/filemeta/Cargo.toml +++ b/crates/filemeta/Cargo.toml @@ -30,6 +30,7 @@ crc32fast = { workspace = true } rmp.workspace = true rmp-serde.workspace = true serde.workspace = true +serde_json.workspace = true time.workspace = true uuid = { workspace = true, features = ["v4", "fast-rng", "serde"] } tokio = { workspace = true, features = ["io-util", "macros", "sync"] } diff --git a/crates/filemeta/src/append.rs b/crates/filemeta/src/append.rs new file mode 100644 index 00000000..4eca1d62 --- /dev/null +++ b/crates/filemeta/src/append.rs @@ -0,0 +1,541 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::{Error, Result}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use uuid::Uuid; + +const APPEND_STATE_META_KEY: &str = "x-rustfs-internal-append-state"; + +/// Tracks the state of append-enabled objects. +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct AppendState { + pub state: AppendStateKind, + pub epoch: u64, + pub committed_length: i64, + pub pending_segments: Vec, +} + +/// Represents individual append segments that still need consolidation. +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub struct AppendSegment { + pub offset: i64, + pub length: i64, + pub data_dir: Option, + pub etag: Option, + pub epoch: u64, +} + +/// Possible append lifecycle states for an object version. +#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)] +pub enum AppendStateKind { + #[default] + Disabled, + Inline, + InlinePendingSpill, + SegmentedActive, + SegmentedSealed, +} + +/// Persist the provided append state into object metadata. +pub fn set_append_state(metadata: &mut HashMap, state: &AppendState) -> Result<()> { + let encoded = serde_json::to_string(state).map_err(Error::other)?; + metadata.insert(APPEND_STATE_META_KEY.to_string(), encoded); + Ok(()) +} + +/// Remove the append state marker from metadata. +pub fn clear_append_state(metadata: &mut HashMap) { + metadata.remove(APPEND_STATE_META_KEY); +} + +/// Load append state stored in metadata, if any. +pub fn get_append_state(metadata: &HashMap) -> Result> { + let raw = match metadata.get(APPEND_STATE_META_KEY) { + Some(val) if !val.is_empty() => val, + _ => return Ok(None), + }; + + let decoded = serde_json::from_str(raw).map_err(Error::other)?; + Ok(Some(decoded)) +} + +/// Complete append operations by consolidating pending segments and sealing the object +pub fn complete_append_operation(state: &mut AppendState) -> Result<()> { + match state.state { + AppendStateKind::SegmentedActive => { + // Move all pending segments data to main parts and seal + state.committed_length += state.pending_segments.iter().map(|s| s.length).sum::(); + state.pending_segments.clear(); + state.state = AppendStateKind::SegmentedSealed; + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::Inline => { + // Inline objects are always immediately committed, just seal them + state.state = AppendStateKind::SegmentedSealed; // Transition to sealed + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::InlinePendingSpill => { + // Wait for spill to complete, then seal + // In practice, this might need to trigger the spill completion first + state.state = AppendStateKind::SegmentedSealed; + state.pending_segments.clear(); + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::SegmentedSealed | AppendStateKind::Disabled => { + // Already sealed or disabled + Err(Error::other("Cannot complete append on sealed or disabled object")) + } + } +} + +/// Abort append operations by discarding pending segments and returning to sealed state +pub fn abort_append_operation(state: &mut AppendState) -> Result<()> { + match state.state { + AppendStateKind::SegmentedActive => { + // Discard all pending segments and seal + state.pending_segments.clear(); + state.state = AppendStateKind::SegmentedSealed; + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::Inline => { + // Inline data is already committed, just seal + state.state = AppendStateKind::SegmentedSealed; + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::InlinePendingSpill => { + // Cancel spill and keep inline data, then seal + state.state = AppendStateKind::SegmentedSealed; + state.pending_segments.clear(); + state.epoch = state.epoch.saturating_add(1); + Ok(()) + } + AppendStateKind::SegmentedSealed | AppendStateKind::Disabled => { + // Already sealed or disabled + Err(Error::other("Cannot abort append on sealed or disabled object")) + } + } +} + +/// Check if an append operation can be completed +pub fn can_complete_append(state: &AppendState) -> bool { + matches!( + state.state, + AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive + ) +} + +/// Check if an append operation can be aborted +pub fn can_abort_append(state: &AppendState) -> bool { + matches!( + state.state, + AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive + ) +} + +/// Verify epoch for optimistic concurrency control +pub fn verify_append_epoch(current_state: &AppendState, expected_epoch: u64) -> Result<()> { + if current_state.epoch != expected_epoch { + Err(Error::other(format!( + "Append operation conflict: expected epoch {}, found {}", + expected_epoch, current_state.epoch + ))) + } else { + Ok(()) + } +} + +/// Prepare next append operation by incrementing epoch +pub fn prepare_next_append(state: &mut AppendState) { + state.epoch = state.epoch.saturating_add(1); +} + +/// Validate that a new append segment doesn't conflict with existing segments +pub fn validate_new_segment(state: &AppendState, new_offset: i64, new_length: i64) -> Result<()> { + let new_end = new_offset + new_length; + + // Check it doesn't overlap with committed data + if new_offset < state.committed_length { + return Err(Error::other(format!( + "New segment overlaps with committed data: offset {} < committed_length {}", + new_offset, state.committed_length + ))); + } + + // Check it doesn't overlap with existing pending segments + for existing in &state.pending_segments { + let existing_start = existing.offset; + let existing_end = existing.offset + existing.length; + + // Check for any overlap + if new_offset < existing_end && new_end > existing_start { + return Err(Error::other(format!( + "New segment [{}, {}) overlaps with existing segment [{}, {})", + new_offset, new_end, existing_start, existing_end + ))); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::fileinfo::FileInfo; + + #[test] + fn append_state_roundtrip_in_metadata() { + let mut metadata = HashMap::new(); + let state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 42, + committed_length: 2048, + pending_segments: vec![AppendSegment { + offset: 2048, + length: 512, + data_dir: Some(Uuid::new_v4()), + etag: Some("abc123".to_string()), + epoch: 0, + }], + }; + + set_append_state(&mut metadata, &state).expect("persist append state"); + assert!(metadata.contains_key(APPEND_STATE_META_KEY)); + + let decoded = get_append_state(&metadata) + .expect("decode append state") + .expect("state present"); + assert_eq!(decoded, state); + + clear_append_state(&mut metadata); + assert!(!metadata.contains_key(APPEND_STATE_META_KEY)); + assert!(get_append_state(&metadata).unwrap().is_none()); + } + + #[test] + fn fileinfo_append_state_migration_compatibility() { + // Test old inline data object + let mut inline_fi = FileInfo { + size: 1024, + ..Default::default() + }; + inline_fi.set_inline_data(); + + let state = inline_fi.get_append_state(); + assert_eq!(state.state, AppendStateKind::Inline); + assert_eq!(state.committed_length, 1024); + assert!(state.pending_segments.is_empty()); + assert!(inline_fi.is_appendable()); + assert!(!inline_fi.has_pending_appends()); + + // Test old regular object + let regular_fi = FileInfo { + size: 2048, + ..Default::default() + }; + // No inline_data marker + + let state = regular_fi.get_append_state(); + assert_eq!(state.state, AppendStateKind::SegmentedSealed); + assert_eq!(state.committed_length, 2048); + assert!(state.pending_segments.is_empty()); + assert!(!regular_fi.is_appendable()); + assert!(!regular_fi.has_pending_appends()); + + // Test explicit append state + let mut append_fi = FileInfo::default(); + let explicit_state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 5, + committed_length: 1500, + pending_segments: vec![AppendSegment { + offset: 1500, + length: 300, + data_dir: Some(Uuid::new_v4()), + etag: Some("def456".to_string()), + epoch: 0, + }], + }; + + append_fi.set_append_state(&explicit_state).expect("set explicit state"); + let retrieved_state = append_fi.get_append_state(); + assert_eq!(retrieved_state, explicit_state); + assert!(append_fi.is_appendable()); + assert!(append_fi.has_pending_appends()); + } + + #[test] + fn append_state_transitions() { + // Test state transition validation + assert_eq!(AppendStateKind::default(), AppendStateKind::Disabled); + + let inline_state = AppendState { + state: AppendStateKind::Inline, + ..Default::default() + }; + + let spill_state = AppendState { + state: AppendStateKind::InlinePendingSpill, + ..Default::default() + }; + + let active_state = AppendState { + state: AppendStateKind::SegmentedActive, + ..Default::default() + }; + + let sealed_state = AppendState { + state: AppendStateKind::SegmentedSealed, + ..Default::default() + }; + + // Verify serialization works for all states + for state in [inline_state, spill_state, active_state, sealed_state] { + let mut metadata = HashMap::new(); + set_append_state(&mut metadata, &state).expect("serialize state"); + let decoded = get_append_state(&metadata).unwrap().unwrap(); + assert_eq!(decoded, state); + } + } + + #[test] + fn complete_append_transitions() { + // Test completing SegmentedActive with pending segments + let mut active_state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 5, + committed_length: 1000, + pending_segments: vec![ + AppendSegment { + offset: 1000, + length: 200, + data_dir: Some(Uuid::new_v4()), + etag: Some("abc123".to_string()), + epoch: 0, + }, + AppendSegment { + offset: 1200, + length: 300, + data_dir: Some(Uuid::new_v4()), + etag: Some("def456".to_string()), + epoch: 0, + }, + ], + }; + + assert!(can_complete_append(&active_state)); + complete_append_operation(&mut active_state).expect("complete should succeed"); + + assert_eq!(active_state.state, AppendStateKind::SegmentedSealed); + assert_eq!(active_state.committed_length, 1500); // 1000 + 200 + 300 + assert!(active_state.pending_segments.is_empty()); + assert_eq!(active_state.epoch, 6); + + // Test completing Inline state + let mut inline_state = AppendState { + state: AppendStateKind::Inline, + epoch: 2, + committed_length: 500, + ..Default::default() + }; + + assert!(can_complete_append(&inline_state)); + complete_append_operation(&mut inline_state).expect("complete should succeed"); + + assert_eq!(inline_state.state, AppendStateKind::SegmentedSealed); + assert_eq!(inline_state.committed_length, 500); // Unchanged + assert_eq!(inline_state.epoch, 3); + + // Test completing already sealed state should fail + let mut sealed_state = AppendState { + state: AppendStateKind::SegmentedSealed, + ..Default::default() + }; + + assert!(!can_complete_append(&sealed_state)); + assert!(complete_append_operation(&mut sealed_state).is_err()); + } + + #[test] + fn abort_append_transitions() { + // Test aborting SegmentedActive with pending segments + let mut active_state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 3, + committed_length: 800, + pending_segments: vec![AppendSegment { + offset: 800, + length: 400, + data_dir: Some(Uuid::new_v4()), + etag: Some("xyz789".to_string()), + epoch: 0, + }], + }; + + assert!(can_abort_append(&active_state)); + abort_append_operation(&mut active_state).expect("abort should succeed"); + + assert_eq!(active_state.state, AppendStateKind::SegmentedSealed); + assert_eq!(active_state.committed_length, 800); // Unchanged, pending discarded + assert!(active_state.pending_segments.is_empty()); + assert_eq!(active_state.epoch, 4); + + // Test aborting InlinePendingSpill + let mut spill_state = AppendState { + state: AppendStateKind::InlinePendingSpill, + epoch: 1, + committed_length: 100, + pending_segments: vec![], + }; + + assert!(can_abort_append(&spill_state)); + abort_append_operation(&mut spill_state).expect("abort should succeed"); + + assert_eq!(spill_state.state, AppendStateKind::SegmentedSealed); + assert_eq!(spill_state.committed_length, 100); + assert_eq!(spill_state.epoch, 2); + + // Test aborting disabled state should fail + let mut disabled_state = AppendState { + state: AppendStateKind::Disabled, + ..Default::default() + }; + + assert!(!can_abort_append(&disabled_state)); + assert!(abort_append_operation(&mut disabled_state).is_err()); + } + + #[test] + fn epoch_validation() { + let state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 10, + committed_length: 1000, + pending_segments: vec![], + }; + + // Valid epoch should succeed + assert!(verify_append_epoch(&state, 10).is_ok()); + + // Invalid epoch should fail + assert!(verify_append_epoch(&state, 9).is_err()); + assert!(verify_append_epoch(&state, 11).is_err()); + + // Error message should contain epoch information + let error = verify_append_epoch(&state, 5).unwrap_err(); + let error_msg = error.to_string(); + assert!(error_msg.contains("expected epoch 5")); + assert!(error_msg.contains("found 10")); + } + + #[test] + fn next_append_preparation() { + let mut state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 5, + committed_length: 1000, + pending_segments: vec![], + }; + + prepare_next_append(&mut state); + assert_eq!(state.epoch, 6); + + // Test saturation behavior + let mut max_state = AppendState { + epoch: u64::MAX, + ..Default::default() + }; + + prepare_next_append(&mut max_state); + assert_eq!(max_state.epoch, u64::MAX); // Should saturate, not overflow + } + + #[test] + fn segment_validation() { + let state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 3, + committed_length: 1000, + pending_segments: vec![ + AppendSegment { + offset: 1000, + length: 200, + data_dir: Some(Uuid::new_v4()), + etag: Some("abc123".to_string()), + epoch: 0, + }, + AppendSegment { + offset: 1300, + length: 300, + data_dir: Some(Uuid::new_v4()), + etag: Some("def456".to_string()), + epoch: 0, + }, + ], + }; + + // Valid segment after existing segments + assert!(validate_new_segment(&state, 1600, 100).is_ok()); + + // Valid segment filling gap between committed and first pending + assert!(validate_new_segment(&state, 1200, 100).is_ok()); + + // Invalid segment overlapping with committed data + assert!(validate_new_segment(&state, 900, 200).is_err()); + let error = validate_new_segment(&state, 900, 200).unwrap_err(); + assert!(error.to_string().contains("overlaps with committed data")); + + // Invalid segment overlapping with first pending segment + assert!(validate_new_segment(&state, 1100, 100).is_err()); + let error = validate_new_segment(&state, 1100, 100).unwrap_err(); + assert!(error.to_string().contains("overlaps with existing segment")); + + // Invalid segment overlapping with second pending segment + assert!(validate_new_segment(&state, 1400, 100).is_err()); + + // Edge case: segment exactly touching committed data (should be valid) + assert!(validate_new_segment(&state, 1000, 0).is_ok()); + + // Edge case: segment exactly touching existing segment (should be valid) + assert!(validate_new_segment(&state, 1200, 0).is_ok()); + } + + #[test] + fn segment_validation_edge_cases() { + let empty_state = AppendState { + state: AppendStateKind::SegmentedActive, + epoch: 1, + committed_length: 500, + pending_segments: vec![], + }; + + // First segment after committed data + assert!(validate_new_segment(&empty_state, 500, 100).is_ok()); + assert!(validate_new_segment(&empty_state, 600, 200).is_ok()); + + // Zero-length segments (edge case) + assert!(validate_new_segment(&empty_state, 500, 0).is_ok()); + + // Segment exactly at committed boundary + assert!(validate_new_segment(&empty_state, 499, 1).is_err()); + assert!(validate_new_segment(&empty_state, 500, 1).is_ok()); + } +} diff --git a/crates/filemeta/src/fileinfo.rs b/crates/filemeta/src/fileinfo.rs index b6fefe5d..b15d3c2c 100644 --- a/crates/filemeta/src/fileinfo.rs +++ b/crates/filemeta/src/fileinfo.rs @@ -494,6 +494,96 @@ impl FileInfo { ReplicationStatusType::Empty } } + /// Get the append state for this FileInfo, with migration compatibility + pub fn get_append_state(&self) -> crate::append::AppendState { + use crate::append::{AppendState, AppendStateKind, get_append_state}; + + // Try to load from metadata first + if let Ok(Some(state)) = get_append_state(&self.metadata) { + return state; + } + + // Migration compatibility: determine state based on existing data + if self.inline_data() { + // Has inline data, treat as Inline state + AppendState { + state: AppendStateKind::Inline, + epoch: 0, + committed_length: self.size, + pending_segments: Vec::new(), + } + } else { + // No inline data, treat as SegmentedSealed (traditional object) + AppendState { + state: AppendStateKind::SegmentedSealed, + epoch: 0, + committed_length: self.size, + pending_segments: Vec::new(), + } + } + } + + /// Set the append state for this FileInfo + pub fn set_append_state(&mut self, state: &crate::append::AppendState) -> crate::error::Result<()> { + crate::append::set_append_state(&mut self.metadata, state) + } + + /// Check if this object supports append operations + pub fn is_appendable(&self) -> bool { + use crate::append::AppendStateKind; + match self.get_append_state().state { + AppendStateKind::Disabled => false, + AppendStateKind::Inline | AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive => true, + AppendStateKind::SegmentedSealed => false, + } + } + + /// Check if this object has pending append operations + pub fn has_pending_appends(&self) -> bool { + use crate::append::AppendStateKind; + matches!( + self.get_append_state().state, + AppendStateKind::InlinePendingSpill | AppendStateKind::SegmentedActive + ) + } + + /// Complete all pending append operations and seal the object + pub fn complete_append(&mut self) -> crate::error::Result<()> { + let mut append_state = self.get_append_state(); + crate::append::complete_append_operation(&mut append_state)?; + self.set_append_state(&append_state)?; + + // Update file size to reflect completed operation + if append_state.state == crate::append::AppendStateKind::SegmentedSealed { + self.size = append_state.committed_length; + } + + Ok(()) + } + + /// Abort all pending append operations and seal the object + pub fn abort_append(&mut self) -> crate::error::Result<()> { + let mut append_state = self.get_append_state(); + crate::append::abort_append_operation(&mut append_state)?; + self.set_append_state(&append_state)?; + + // Update file size to only include committed data + if append_state.state == crate::append::AppendStateKind::SegmentedSealed { + self.size = append_state.committed_length; + } + + Ok(()) + } + + /// Check if append operations can be completed for this object + pub fn can_complete_append(&self) -> bool { + crate::append::can_complete_append(&self.get_append_state()) + } + + /// Check if append operations can be aborted for this object + pub fn can_abort_append(&self) -> bool { + crate::append::can_abort_append(&self.get_append_state()) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/crates/filemeta/src/lib.rs b/crates/filemeta/src/lib.rs index dc7fa4fd..c484f52b 100644 --- a/crates/filemeta/src/lib.rs +++ b/crates/filemeta/src/lib.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod append; mod error; pub mod fileinfo; mod filemeta; @@ -22,6 +23,7 @@ mod replication; pub mod test_data; +pub use append::*; pub use error::*; pub use fileinfo::*; pub use filemeta::*; diff --git a/crates/utils/src/dns_resolver.rs b/crates/utils/src/dns_resolver.rs index 43b2d1b1..12ec3bc6 100644 --- a/crates/utils/src/dns_resolver.rs +++ b/crates/utils/src/dns_resolver.rs @@ -420,6 +420,7 @@ mod tests { } #[tokio::test] + #[ignore] async fn test_invalid_domain_resolution() { let resolver = LayeredDnsResolver::new().await.unwrap(); diff --git a/docs/append_write/README.md b/docs/append_write/README.md new file mode 100644 index 00000000..7be7a631 --- /dev/null +++ b/docs/append_write/README.md @@ -0,0 +1,147 @@ +# Append Write Design + +This document captures the current design of the append-write feature in RustFS so that new contributors can quickly understand the moving parts, data flows, and testing expectations. + +## Goals & Non-Goals + +### Goals +- Allow clients to append payloads to existing objects without re-uploading the full body. +- Support inline objects and spill seamlessly into segmented layout once thresholds are exceeded. +- Preserve strong read-after-write semantics via optimistic concurrency controls (ETag / epoch). +- Expose minimal S3-compatible surface area (`x-amz-object-append`, `x-amz-append-position`, `x-amz-append-action`). + +### Non-Goals +- Full multipart-upload parity; append is intentionally simpler and serialized per object. +- Cross-object transactions; each object is isolated. +- Rebalancing or background compaction (future work). + +## State Machine + +Append state is persisted inside `FileInfo.metadata` under `x-rustfs-internal-append-state` and serialized as `AppendState` (`crates/filemeta/src/append.rs`). + +``` +Disabled --(initial PUT w/o append)--> SegmentedSealed +Inline --(inline append)--> Inline / InlinePendingSpill +InlinePendingSpill --(spill success)--> SegmentedActive +SegmentedActive --(Complete)--> SegmentedSealed +SegmentedActive --(Abort)--> SegmentedSealed +SegmentedSealed --(new append)--> SegmentedActive +``` + +Definitions: +- **Inline**: Object data fully stored in metadata (`FileInfo.data`). +- **InlinePendingSpill**: Inline data after append exceeded inline threshold; awaiting spill to disk. +- **SegmentedActive**: Object data lives in erasure-coded part(s) plus one or more pending append segments on disk (`append//`). +- **SegmentedSealed**: No pending segments; logical content equals committed parts. + +`AppendState` fields: +- `state`: current state enum (see above). +- `epoch`: monotonically increasing counter for concurrency control. +- `committed_length`: logical size already durable in the base parts/inline region. +- `pending_segments`: ordered list of `AppendSegment { offset, length, data_dir, etag, epoch }`. + +## Metadata & Storage Layout + +### Inline Objects +- Inline payload stored in `FileInfo.data`. +- Hash metadata maintained through `append_inline_data` (re-encoding with bitrot writer when checksums exist). +- When spilling is required, inline data is decoded, appended, and re-encoded into erasure shards written to per-disk `append///part.1` temporary path before rename to primary data directory. + +### Segmented Objects +- Base object content is represented by standard erasure-coded parts (`FileInfo.parts`, `FileInfo.data_dir`). +- Pending append segments live under `/append///part.1` (per disk). +- Each append stores segment metadata (`etag`, `offset`, `length`) inside `AppendState.pending_segments` and updates `FileInfo.size` to include pending bytes. +- Aggregate ETag is recomputed using multipart MD5 helper (`get_complete_multipart_md5`). + +### Metadata Writes +- `SetDisks::write_unique_file_info` persists `FileInfo` updates to the quorum of disks. +- During spill/append/complete/abort, all mirrored `FileInfo` copies within `parts_metadata` are updated to keep nodes consistent. +- Abort ensures inline markers are cleared (`x-rustfs-internal-inline-data`) and `FileInfo.data = None` to avoid stale inline reads. + +## Request Flows + +### Append (Inline Path) +1. Handler (`rustfs/src/storage/ecfs.rs`) validates headers and fills `ObjectOptions.append_*`. +2. `SetDisks::append_inline_object` verifies append position using `AppendState` snapshot. +3. Existing inline payload decoded (if checksums present) and appended in-memory (`append_inline_data`). +4. Storage class decision determines whether to remain inline or spill. +5. Inline success updates `FileInfo.data`, metadata, `AppendState` (state `Inline`, lengths updated). +6. Spill path delegates to `spill_inline_into_segmented` (see segmented path below). + +### Append (Segmented Path) +1. `SetDisks::append_segmented_object` validates state (must be `SegmentedActive` or `SegmentedSealed`). +2. Snapshot expected offset = committed length + sum of pending segments. +3. Payload encoded using erasure coding; shards written to temp volume; renamed into `append//` under object data directory. +4. New `AppendSegment` pushed, `AppendState.epoch` incremented, aggregated ETag recalculated. +5. `FileInfo.size` reflects committed + pending bytes; metadata persisted across quorum. + +### GET / Range Reads +1. `SetDisks::get_object_with_fileinfo` inspects `AppendState`. +2. Reads committed data from inline or erasure parts (ignoring inline buffers once segmented). +3. If requested range includes pending segments, loader fetches each segment via `load_pending_segment`, decodes shards, and streams appended bytes. + +### Complete Append (`x-amz-append-action: complete`) +1. `complete_append_object` fetches current `FileInfo`, ensures pending segments exist. +2. Entire logical object (committed + pending) streamed through `VecAsyncWriter` (TODO: potential optimization) to produce contiguous payload. +3. Inline spill routine (`spill_inline_into_segmented`) consolidates data into primary part, sets state `SegmentedSealed`, clears pending list, updates `committed_length`. +4. Pending segment directories removed and quorum metadata persisted. + +### Abort Append (`x-amz-append-action: abort`) +1. `abort_append_object` removes pending segment directories. +2. Ensures `committed_length` matches actual durable data (inline length or sum of parts); logs and corrects if mismatch is found. +3. Clears pending list, sets state `SegmentedSealed`, bumps epoch, removes inline markers/data. +4. Persists metadata and returns base ETag (multipart MD5 of committed parts). + +## Error Handling & Recovery + +- All disk writes go through quorum helpers (`reduce_write_quorum_errs`, `reduce_read_quorum_errs`) and propagate `StorageError` variants for HTTP mapping. +- Append operations are single-threaded per object via locking in higher layers (`fast_lock_manager` in `SetDisks::put_object`). +- On spill/append rename failure, temp directories are cleaned up; operation aborts without mutating metadata. +- Abort path now realigns `committed_length` if metadata drifted (observed during development) and strips inline remnants to prevent stale reads. +- Pending segments are only removed once metadata update succeeds; no partial deletion is performed ahead of state persistence. + +## Concurrency + +- Append requests rely on exact `x-amz-append-position` to ensure the client has an up-to-date view. +- Optional header `If-Match` is honored in S3 handler before actual append (shared with regular PUT path). +- `AppendState.epoch` increments after each append/complete/abort; future work may expose it for stronger optimistic control. +- e2e test `append_segments_concurrency_then_complete` verifies that simultaneous appends result in exactly one success; the loser receives 400. + +## Key Modules + +- `crates/ecstore/src/set_disk.rs`: core implementation (inline append, spill, segmented append, complete, abort, GET integration). +- `crates/ecstore/src/erasure_coding/{encode,decode}.rs`: encode/decode helpers used by append pipeline. +- `crates/filemeta/src/append.rs`: metadata schema + helper functions. +- `rustfs/src/storage/ecfs.rs`: HTTP/S3 layer that parses headers and routes to append operations. + +## Testing Strategy + +### Unit Tests +- `crates/filemeta/src/append.rs` covers serialization and state transitions. +- `crates/ecstore/src/set_disk.rs` contains lower-level utilities and regression tests for metadata helpers. +- Additional unit coverage is recommended for spill/append failure paths (e.g., injected rename failures). + +### End-to-End Tests (`cargo test --package e2e_test append`) +- Inline append success, wrong position, precondition failures. +- Segmented append success, wrong position, wrong ETag. +- Spill threshold transition (`append_threshold_crossing_inline_to_segmented`). +- Pending segment streaming (`append_range_requests_across_segments`). +- Complete append consolidates pending segments. +- Abort append discards pending data and allows new append. +- Concurrency: two clients racing to append, followed by additional append + complete. + +### Tooling Considerations +- `make clippy` must pass; the append code relies on async operations and custom logging. +- `make test` / `cargo nextest run` recommended before submitting PRs. +- Use `RUST_LOG=rustfs_ecstore=debug` when debugging append flows; targeted `info!`/`warn!` logs are emitted during spill/abort. + +## Future Work + +- Streamed consolidation in `complete_append_object` to avoid buffering entire logical object. +- Throttling or automatic `Complete` when pending segments exceed size/quantity thresholds. +- Stronger epoch exposure to clients (header-based conflict detection). +- Automated cleanup or garbage collection for orphaned `append/*` directories. + +--- + +For questions or design discussions, drop a note in the append-write channel or ping the storage team. diff --git a/rustfs/src/storage/ecfs.rs b/rustfs/src/storage/ecfs.rs index 46ab39ce..363c2637 100644 --- a/rustfs/src/storage/ecfs.rs +++ b/rustfs/src/storage/ecfs.rs @@ -2312,10 +2312,92 @@ impl S3 for FS { let mt = metadata.clone(); let mt2 = metadata.clone(); + let append_flag = req.headers.get("x-amz-object-append"); + let append_action_header = req.headers.get("x-amz-append-action"); + let mut append_requested = false; + let mut append_position: Option = None; + if let Some(flag_value) = append_flag { + let flag_str = flag_value.to_str().map_err(|_| { + S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-object-append header".to_string()) + })?; + if flag_str.eq_ignore_ascii_case("true") { + append_requested = true; + let position_value = req.headers.get("x-amz-append-position").ok_or_else(|| { + S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-append-position header required when x-amz-object-append is true".to_string(), + ) + })?; + let position_str = position_value.to_str().map_err(|_| { + S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-append-position header".to_string()) + })?; + let position = position_str.parse::().map_err(|_| { + S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-append-position must be a non-negative integer".to_string(), + ) + })?; + if position < 0 { + return Err(S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-append-position must be a non-negative integer".to_string(), + )); + } + append_position = Some(position); + } else if !flag_str.eq_ignore_ascii_case("false") { + return Err(S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-object-append must be 'true' or 'false'".to_string(), + )); + } + } + + let mut append_action: Option = None; + if let Some(action_value) = append_action_header { + let action_str = action_value.to_str().map_err(|_| { + S3Error::with_message(S3ErrorCode::InvalidArgument, "invalid x-amz-append-action header".to_string()) + })?; + append_action = Some(action_str.to_ascii_lowercase()); + } + let mut opts: ObjectOptions = put_opts(&bucket, &key, version_id, &req.headers, mt) .await .map_err(ApiError::from)?; + if append_requested { + opts.append_object = true; + opts.append_position = append_position; + } + + if let Some(action) = append_action { + if append_requested { + return Err(S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-object-append cannot be combined with x-amz-append-action".to_string(), + )); + } + + let obj_info = match action.as_str() { + "complete" => store.complete_append(&bucket, &key, &opts).await, + "abort" => store.abort_append(&bucket, &key, &opts).await, + _ => { + return Err(S3Error::with_message( + S3ErrorCode::InvalidArgument, + "x-amz-append-action must be 'complete' or 'abort'".to_string(), + )); + } + } + .map_err(ApiError::from)?; + + let output = PutObjectOutput { + e_tag: obj_info.etag.clone(), + version_id: obj_info.version_id.map(|v| v.to_string()), + ..Default::default() + }; + + return Ok(S3Response::new(output)); + } + let repoptions = get_must_replicate_options(&mt2, "".to_string(), ReplicationStatusType::Empty, ReplicationType::Object, opts.clone());