diff --git a/ecstore/benches/comparison_benchmark.rs b/ecstore/benches/comparison_benchmark.rs
index 201c8a1b..5140e306 100644
--- a/ecstore/benches/comparison_benchmark.rs
+++ b/ecstore/benches/comparison_benchmark.rs
@@ -1,29 +1,28 @@
-//! 专门比较 Pure Erasure 和 Hybrid (SIMD) 模式性能的基准测试
+//! Reed-Solomon SIMD performance analysis benchmarks
 //!
-//! 这个基准测试使用不同的feature编译配置来直接对比两种实现的性能。
+//! This benchmark analyzes the performance characteristics of the SIMD Reed-Solomon implementation
+//! across different data sizes, shard configurations, and usage patterns.
 //!
-//! ## 运行比较测试
+//! ## Running Performance Analysis
 //!
 //! ```bash
-//! # 测试 Pure Erasure 实现 (默认)
+//! # Run all SIMD performance tests
 //! cargo bench --bench comparison_benchmark
 //!
-//! # 测试 Hybrid (SIMD) 实现
-//! cargo bench --bench comparison_benchmark --features reed-solomon-simd
+//! # Generate detailed performance report
+//! cargo bench --bench comparison_benchmark -- --save-baseline simd_analysis
 //!
-//! # 测试强制 erasure-only 模式
-//! cargo bench --bench comparison_benchmark
-//!
-//! # 生成对比报告
-//! cargo bench --bench comparison_benchmark -- --save-baseline erasure
-//! cargo bench --bench comparison_benchmark --features reed-solomon-simd -- --save-baseline hybrid
+//! # Run specific test categories
+//! cargo bench --bench comparison_benchmark encode_analysis
+//! cargo bench --bench comparison_benchmark decode_analysis
+//! cargo bench --bench comparison_benchmark shard_analysis
 //! ```
 
 use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
 use ecstore::erasure_coding::Erasure;
 use std::time::Duration;
 
-/// 基准测试数据配置
+/// Performance test data configuration
 struct TestData {
     data: Vec<u8>,
     size_name: &'static str,
@@ -36,41 +35,41 @@ impl TestData {
     }
 }
 
-/// 生成不同大小的测试数据集
+/// Generate different sized test datasets for performance analysis
 fn generate_test_datasets() -> Vec<TestData> {
     vec![
-        TestData::new(1024, "1KB"),            // 小数据
-        TestData::new(8 * 1024, "8KB"),        // 中小数据
-        TestData::new(64 * 1024, "64KB"),      // 中等数据
-        TestData::new(256 * 1024, "256KB"),    // 中大数据
-        TestData::new(1024 * 1024, "1MB"),     // 大数据
-        TestData::new(4 * 1024 * 1024, "4MB"), // 超大数据
+        TestData::new(1024, "1KB"),            // Small data
+        TestData::new(8 * 1024, "8KB"),        // Medium-small data
+        TestData::new(64 * 1024, "64KB"),      // Medium data
+        TestData::new(256 * 1024, "256KB"),    // Medium-large data
+        TestData::new(1024 * 1024, "1MB"),     // Large data
+        TestData::new(4 * 1024 * 1024, "4MB"), // Extra large data
     ]
 }
 
-/// 编码性能比较基准测试
-fn bench_encode_comparison(c: &mut Criterion) {
+/// SIMD encoding performance analysis
+fn bench_encode_analysis(c: &mut Criterion) {
     let datasets = generate_test_datasets();
     let configs = vec![
-        (4, 2, "4+2"), // 常用配置
-        (6, 3, "6+3"), // 50%冗余
-        (8, 4, "8+4"), // 50%冗余,更多分片
+        (4, 2, "4+2"), // Common configuration
+        (6, 3, "6+3"), // 50% redundancy
+        (8, 4, "8+4"), // 50% redundancy, more shards
    ];
 
     for dataset in &datasets {
         for (data_shards, parity_shards, config_name) in &configs {
-            let test_name = format!("{}_{}_{}", dataset.size_name, config_name, get_implementation_name());
+            let test_name = format!("{}_{}_{}", dataset.size_name, config_name, "simd");
 
-            let mut group = c.benchmark_group("encode_comparison");
+            let mut group = c.benchmark_group("encode_analysis");
             group.throughput(Throughput::Bytes(dataset.data.len() as u64));
             group.sample_size(20);
             group.measurement_time(Duration::from_secs(10));
 
-            // 检查是否能够创建erasure实例(某些配置在纯SIMD模式下可能失败)
+            // Test SIMD encoding performance
             match Erasure::new(*data_shards, *parity_shards, dataset.data.len()).encode_data(&dataset.data) {
                 Ok(_) => {
                     group.bench_with_input(
-                        BenchmarkId::new("implementation", &test_name),
+                        BenchmarkId::new("simd_encode", &test_name),
                         &(&dataset.data, *data_shards, *parity_shards),
                         |b, (data, data_shards, parity_shards)| {
                             let erasure = Erasure::new(*data_shards, *parity_shards, data.len());
@@ -82,7 +81,7 @@ fn bench_encode_comparison(c: &mut Criterion) {
                     );
                 }
                 Err(e) => {
-                    println!("⚠️ 跳过测试 {} - 配置不支持: {}", test_name, e);
+                    println!("⚠️ Skipping test {} - configuration not supported: {}", test_name, e);
                 }
             }
             group.finish();
@@ -90,35 +89,35 @@
     }
 }
 
-/// 解码性能比较基准测试
-fn bench_decode_comparison(c: &mut Criterion) {
+/// SIMD decoding performance analysis
+fn bench_decode_analysis(c: &mut Criterion) {
     let datasets = generate_test_datasets();
     let configs = vec![(4, 2, "4+2"), (6, 3, "6+3"), (8, 4, "8+4")];
 
     for dataset in &datasets {
         for (data_shards, parity_shards, config_name) in &configs {
-            let test_name = format!("{}_{}_{}", dataset.size_name, config_name, get_implementation_name());
+            let test_name = format!("{}_{}_{}", dataset.size_name, config_name, "simd");
 
             let erasure = Erasure::new(*data_shards, *parity_shards, dataset.data.len());
 
-            // 预先编码数据 - 检查是否支持此配置
+            // Pre-encode data - check if this configuration is supported
             match erasure.encode_data(&dataset.data) {
                 Ok(encoded_shards) => {
-                    let mut group = c.benchmark_group("decode_comparison");
+                    let mut group = c.benchmark_group("decode_analysis");
                     group.throughput(Throughput::Bytes(dataset.data.len() as u64));
                     group.sample_size(20);
                     group.measurement_time(Duration::from_secs(10));
 
                     group.bench_with_input(
-                        BenchmarkId::new("implementation", &test_name),
+                        BenchmarkId::new("simd_decode", &test_name),
                         &(&encoded_shards, *data_shards, *parity_shards),
                         |b, (shards, data_shards, parity_shards)| {
                             let erasure = Erasure::new(*data_shards, *parity_shards, dataset.data.len());
                             b.iter(|| {
-                                // 模拟最大可恢复的数据丢失
+                                // Simulate maximum recoverable data loss
                                 let mut shards_opt: Vec<Option<Vec<u8>>> = shards.iter().map(|shard| Some(shard.to_vec())).collect();
 
-                                // 丢失等于奇偶校验分片数量的分片
+                                // Lose up to parity_shards number of shards
                                 for item in shards_opt.iter_mut().take(*parity_shards) {
                                     *item = None;
                                 }
@@ -131,33 +130,33 @@ fn bench_decode_comparison(c: &mut Criterion) {
                     group.finish();
                 }
                 Err(e) => {
-                    println!("⚠️ 跳过解码测试 {} - 配置不支持: {}", test_name, e);
+                    println!("⚠️ Skipping decode test {} - configuration not supported: {}", test_name, e);
                 }
             }
         }
     }
 }
 
-/// 分片大小敏感性测试
-fn bench_shard_size_sensitivity(c: &mut Criterion) {
+/// Shard size sensitivity analysis for SIMD optimization
+fn bench_shard_size_analysis(c: &mut Criterion) {
     let data_shards = 4;
     let parity_shards = 2;
 
-    // 测试不同的分片大小,特别关注SIMD的临界点
+    // Test different shard sizes, focusing on SIMD optimization thresholds
     let shard_sizes = vec![32, 64, 128, 256, 512, 1024, 2048, 4096, 8192];
 
-    let mut group = c.benchmark_group("shard_size_sensitivity");
+    let mut group = c.benchmark_group("shard_size_analysis");
     group.sample_size(15);
     group.measurement_time(Duration::from_secs(8));
 
     for shard_size in shard_sizes {
         let total_size = shard_size * data_shards;
         let data = (0..total_size).map(|i| (i % 256) as u8).collect::<Vec<u8>>();
-        let test_name = format!("{}B_shard_{}", shard_size, get_implementation_name());
+        let test_name = format!("{}B_shard_simd", shard_size);
 
         group.throughput(Throughput::Bytes(total_size as u64));
 
-        // 检查此分片大小是否支持
+        // Check if this shard size is supported
         let erasure = Erasure::new(data_shards, parity_shards, data.len());
         match erasure.encode_data(&data) {
             Ok(_) => {
@@ -170,15 +169,15 @@ fn bench_shard_size_sensitivity(c: &mut Criterion) {
                 });
             }
             Err(e) => {
-                println!("⚠️ 跳过分片大小测试 {} - 不支持: {}", test_name, e);
+                println!("⚠️ Skipping shard size test {} - not supported: {}", test_name, e);
             }
         }
     }
 
     group.finish();
 }
 
-/// 高负载并发测试
-fn bench_concurrent_load(c: &mut Criterion) {
+/// High-load concurrent performance analysis
+fn bench_concurrent_analysis(c: &mut Criterion) {
     use std::sync::Arc;
     use std::thread;
 
@@ -186,14 +185,14 @@
     let data = Arc::new((0..data_size).map(|i| (i % 256) as u8).collect::<Vec<u8>>());
     let erasure = Arc::new(Erasure::new(4, 2, data_size));
 
-    let mut group = c.benchmark_group("concurrent_load");
+    let mut group = c.benchmark_group("concurrent_analysis");
     group.throughput(Throughput::Bytes(data_size as u64));
     group.sample_size(10);
     group.measurement_time(Duration::from_secs(15));
 
-    let test_name = format!("1MB_concurrent_{}", get_implementation_name());
+    let test_name = "1MB_concurrent_simd";
 
-    group.bench_function(&test_name, |b| {
+    group.bench_function(test_name, |b| {
         b.iter(|| {
             let handles: Vec<_> = (0..4)
                 .map(|_| {
@@ -214,42 +213,44 @@
     group.finish();
 }
 
-/// 错误恢复能力测试
-fn bench_error_recovery_performance(c: &mut Criterion) {
-    let data_size = 256 * 1024; // 256KB
+/// Error recovery performance analysis
+fn bench_error_recovery_analysis(c: &mut Criterion) {
+    let data_size = 512 * 1024; // 512KB
     let data = (0..data_size).map(|i| (i % 256) as u8).collect::<Vec<u8>>();
 
-    let configs = vec![
-        (4, 2, 1), // 丢失1个分片
-        (4, 2, 2), // 丢失2个分片(最大可恢复)
-        (6, 3, 2), // 丢失2个分片
-        (6, 3, 3), // 丢失3个分片(最大可恢复)
-        (8, 4, 3), // 丢失3个分片
-        (8, 4, 4), // 丢失4个分片(最大可恢复)
+    // Test different error recovery scenarios
+    let scenarios = vec![
+        (4, 2, 1, "single_loss"),     // Lose 1 shard
+        (4, 2, 2, "double_loss"),     // Lose 2 shards (maximum)
+        (6, 3, 1, "single_loss_6_3"), // Lose 1 shard with 6+3
+        (6, 3, 3, "triple_loss_6_3"), // Lose 3 shards (maximum)
+        (8, 4, 2, "double_loss_8_4"), // Lose 2 shards with 8+4
+        (8, 4, 4, "quad_loss_8_4"),   // Lose 4 shards (maximum)
     ];
 
-    let mut group = c.benchmark_group("error_recovery");
+    let mut group = c.benchmark_group("error_recovery_analysis");
     group.throughput(Throughput::Bytes(data_size as u64));
     group.sample_size(15);
-    group.measurement_time(Duration::from_secs(8));
+    group.measurement_time(Duration::from_secs(10));
 
-    for (data_shards, parity_shards, lost_shards) in configs {
+    for (data_shards, parity_shards, loss_count, scenario_name) in scenarios {
         let erasure = Erasure::new(data_shards, parity_shards, data_size);
-        let test_name = format!("{}+{}_lost{}_{}", data_shards, parity_shards, lost_shards, get_implementation_name());
 
-        // 检查此配置是否支持
         match erasure.encode_data(&data) {
             Ok(encoded_shards) => {
+                let test_name = format!("{}+{}_{}", data_shards, parity_shards, scenario_name);
+
                 group.bench_with_input(
                     BenchmarkId::new("recovery", &test_name),
-                    &(&encoded_shards, data_shards, parity_shards, lost_shards),
-                    |b, (shards, data_shards, parity_shards, lost_shards)| {
+                    &(&encoded_shards, data_shards, parity_shards, loss_count),
+                    |b, (shards, data_shards, parity_shards, loss_count)| {
                         let erasure = Erasure::new(*data_shards, *parity_shards, data_size);
                         b.iter(|| {
+                            // Simulate specific number of shard losses
                            let mut shards_opt: Vec<Option<Vec<u8>>> = shards.iter().map(|shard| Some(shard.to_vec())).collect();
 
-                            // 丢失指定数量的分片
-                            for item in shards_opt.iter_mut().take(*lost_shards) {
+                            // Lose the specified number of shards
+                            for item in shards_opt.iter_mut().take(*loss_count) {
                                 *item = None;
                             }
@@ -260,71 +261,57 @@ fn bench_error_recovery_performance(c: &mut Criterion) {
                 );
             }
             Err(e) => {
-                println!("⚠️ 跳过错误恢复测试 {} - 配置不支持: {}", test_name, e);
+                println!("⚠️ Skipping recovery test {}: {}", scenario_name, e);
            }
         }
     }
 
     group.finish();
 }
 
-/// 内存效率测试
-fn bench_memory_efficiency(c: &mut Criterion) {
-    let data_shards = 4;
-    let parity_shards = 2;
-    let data_size = 1024 * 1024; // 1MB
+/// Memory efficiency analysis
+fn bench_memory_analysis(c: &mut Criterion) {
+    let data_sizes = vec![64 * 1024, 256 * 1024, 1024 * 1024]; // 64KB, 256KB, 1MB
+    let config = (4, 2); // 4+2 configuration
 
-    let mut group = c.benchmark_group("memory_efficiency");
-    group.throughput(Throughput::Bytes(data_size as u64));
-    group.sample_size(10);
+    let mut group = c.benchmark_group("memory_analysis");
+    group.sample_size(15);
     group.measurement_time(Duration::from_secs(8));
 
-    let test_name = format!("memory_pattern_{}", get_implementation_name());
+    for data_size in data_sizes {
+        let data = (0..data_size).map(|i| (i % 256) as u8).collect::<Vec<u8>>();
+        let size_name = format!("{}KB", data_size / 1024);
 
-    // 测试连续多次编码对内存的影响
-    group.bench_function(format!("{}_continuous", test_name), |b| {
-        let erasure = Erasure::new(data_shards, parity_shards, data_size);
-        b.iter(|| {
-            for i in 0..10 {
-                let data = vec![(i % 256) as u8; data_size];
-                let shards = erasure.encode_data(black_box(&data)).unwrap();
+        group.throughput(Throughput::Bytes(data_size as u64));
+
+        // Test instance reuse vs new instance creation
+        group.bench_with_input(BenchmarkId::new("reuse_instance", &size_name), &data, |b, data| {
+            let erasure = Erasure::new(config.0, config.1, data.len());
+            b.iter(|| {
+                let shards = erasure.encode_data(black_box(data)).unwrap();
                 black_box(shards);
-            }
+            });
         });
-    });
 
-    // 测试大量小编码任务
-    group.bench_function(format!("{}_small_chunks", test_name), |b| {
-        let chunk_size = 1024; // 1KB chunks
-        let erasure = Erasure::new(data_shards, parity_shards, chunk_size);
-        b.iter(|| {
-            for i in 0..1024 {
-                let data = vec![(i % 256) as u8; chunk_size];
-                let shards = erasure.encode_data(black_box(&data)).unwrap();
+        group.bench_with_input(BenchmarkId::new("new_instance", &size_name), &data, |b, data| {
+            b.iter(|| {
+                let erasure = Erasure::new(config.0, config.1, data.len());
+                let shards = erasure.encode_data(black_box(data)).unwrap();
                 black_box(shards);
-            }
+            });
         });
-    });
-
+    }
     group.finish();
 }
 
-/// 获取当前实现的名称
-fn get_implementation_name() -> &'static str {
-    #[cfg(feature = "reed-solomon-simd")]
-    return "hybrid";
-
-    #[cfg(not(feature = "reed-solomon-simd"))]
-    return "erasure";
-}
-
+// Benchmark group configuration
 criterion_group!(
     benches,
-    bench_encode_comparison,
-    bench_decode_comparison,
-    bench_shard_size_sensitivity,
-    bench_concurrent_load,
-    bench_error_recovery_performance,
-    bench_memory_efficiency
+    bench_encode_analysis,
+    bench_decode_analysis,
+    bench_shard_size_analysis,
+    bench_concurrent_analysis,
+    bench_error_recovery_analysis,
+    bench_memory_analysis
);
 criterion_main!(benches);
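Reviewer note (not part of the patch): the erasure_benchmark.rs hunks below drive reed-solomon-simd directly through its shard-oriented API (add_original_shard / encode on the encoder, add_original_shard / add_recovery_shard / decode on the decoder), and only when calc_shard_size(...) >= 512 bytes, since small shards gain little from SIMD. For context, here is a minimal encode/decode round-trip sketch based on that crate's documented usage; the 4+2 layout and 512-byte shard size are illustrative only, and the ecstore Erasure wrapper exercised above is not shown.

```rust
use reed_solomon_simd::{ReedSolomonDecoder, ReedSolomonEncoder};

fn main() -> Result<(), reed_solomon_simd::Error> {
    // 4 data shards + 2 recovery shards, 512 bytes per shard.
    let original: Vec<Vec<u8>> = (0..4u8).map(|i| vec![i; 512]).collect();

    // Encode: feed every original shard, then collect the recovery shards.
    let mut encoder = ReedSolomonEncoder::new(4, 2, 512)?;
    for shard in &original {
        encoder.add_original_shard(shard)?;
    }
    let recovery: Vec<Vec<u8>> = encoder.encode()?.recovery_iter().map(|s| s.to_vec()).collect();

    // Decode: pretend original shards 2 and 3 were lost and restore them
    // from the two surviving originals plus both recovery shards.
    let mut decoder = ReedSolomonDecoder::new(4, 2, 512)?;
    decoder.add_original_shard(0, &original[0])?;
    decoder.add_original_shard(1, &original[1])?;
    decoder.add_recovery_shard(0, &recovery[0])?;
    decoder.add_recovery_shard(1, &recovery[1])?;
    let restored = decoder.decode()?;

    // restored_original_iter yields (index, shard) pairs for the recovered shards only.
    for (idx, shard) in restored.restored_original_iter() {
        assert_eq!(shard, &original[idx][..]);
    }
    Ok(())
}
```

The benchmarks wrap these same calls in criterion closures and pass inputs through black_box so the optimizer cannot elide the work being measured.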
diff --git a/ecstore/benches/erasure_benchmark.rs b/ecstore/benches/erasure_benchmark.rs
index 6ffaa6f6..eec595db 100644
--- a/ecstore/benches/erasure_benchmark.rs
+++ b/ecstore/benches/erasure_benchmark.rs
@@ -1,25 +1,23 @@
-//! Reed-Solomon erasure coding performance benchmarks.
+//! Reed-Solomon SIMD erasure coding performance benchmarks.
 //!
-//! This benchmark compares the performance of different Reed-Solomon implementations:
-//! - SIMD mode: High-performance reed-solomon-simd implementation
-//! - `reed-solomon-simd` feature: SIMD mode with optimized performance
+//! This benchmark tests the performance of the high-performance SIMD Reed-Solomon implementation.
 //!
 //! ## Running Benchmarks
 //!
 //! ```bash
-//! # 运行所有基准测试
+//! # Run all benchmarks
 //! cargo bench
 //!
-//! # 运行特定的基准测试
+//! # Run specific benchmark
 //! cargo bench --bench erasure_benchmark
 //!
-//! # 生成HTML报告
+//! # Generate HTML report
 //! cargo bench --bench erasure_benchmark -- --output-format html
 //!
-//! # 只测试编码性能
+//! # Test encoding performance only
 //! cargo bench encode
 //!
-//! # 只测试解码性能
+//! # Test decoding performance only
 //! cargo bench decode
 //! ```
 //!
@@ -29,24 +27,24 @@
 //! - Different data sizes: 1KB, 64KB, 1MB, 16MB
 //! - Different erasure coding configurations: (4,2), (6,3), (8,4)
 //! - Both encoding and decoding operations
-//! - Small vs large shard scenarios for SIMD optimization
+//! - SIMD optimization for different shard sizes
 
 use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
 use ecstore::erasure_coding::{Erasure, calc_shard_size};
 use std::time::Duration;
 
-/// 基准测试配置结构体
+/// Benchmark configuration structure
 #[derive(Clone, Debug)]
 struct BenchConfig {
-    /// 数据分片数量
+    /// Number of data shards
     data_shards: usize,
-    /// 奇偶校验分片数量
+    /// Number of parity shards
     parity_shards: usize,
-    /// 测试数据大小(字节)
+    /// Test data size (bytes)
     data_size: usize,
-    /// 块大小(字节)
+    /// Block size (bytes)
     block_size: usize,
-    /// 配置名称
+    /// Configuration name
     name: String,
 }
 
@@ -62,27 +60,27 @@ impl BenchConfig {
     }
 }
 
-/// 生成测试数据
+/// Generate test data
 fn generate_test_data(size: usize) -> Vec<u8> {
     (0..size).map(|i| (i % 256) as u8).collect()
 }
 
-/// 基准测试: 编码性能对比
+/// Benchmark: Encoding performance
 fn bench_encode_performance(c: &mut Criterion) {
     let configs = vec![
-        // 小数据量测试 - 1KB
+        // Small data tests - 1KB
         BenchConfig::new(4, 2, 1024, 1024),
         BenchConfig::new(6, 3, 1024, 1024),
         BenchConfig::new(8, 4, 1024, 1024),
-        // 中等数据量测试 - 64KB
+        // Medium data tests - 64KB
         BenchConfig::new(4, 2, 64 * 1024, 64 * 1024),
         BenchConfig::new(6, 3, 64 * 1024, 64 * 1024),
         BenchConfig::new(8, 4, 64 * 1024, 64 * 1024),
-        // 大数据量测试 - 1MB
+        // Large data tests - 1MB
         BenchConfig::new(4, 2, 1024 * 1024, 1024 * 1024),
         BenchConfig::new(6, 3, 1024 * 1024, 1024 * 1024),
         BenchConfig::new(8, 4, 1024 * 1024, 1024 * 1024),
-        // 超大数据量测试 - 16MB
+        // Extra large data tests - 16MB
         BenchConfig::new(4, 2, 16 * 1024 * 1024, 16 * 1024 * 1024),
         BenchConfig::new(6, 3, 16 * 1024 * 1024, 16 * 1024 * 1024),
     ];
 
@@ -90,13 +88,13 @@
     for config in configs {
         let data = generate_test_data(config.data_size);
 
-        // 测试当前默认实现(通常是SIMD)
-        let mut group = c.benchmark_group("encode_current");
+        // Test SIMD encoding performance
+        let mut group = c.benchmark_group("encode_simd");
         group.throughput(Throughput::Bytes(config.data_size as u64));
         group.sample_size(10);
         group.measurement_time(Duration::from_secs(5));
 
-        group.bench_with_input(BenchmarkId::new("current_impl", &config.name), &(&data, &config), |b, (data, config)| {
+        group.bench_with_input(BenchmarkId::new("simd_impl", &config.name), &(&data, &config), |b, (data, config)| {
             let erasure = Erasure::new(config.data_shards, config.parity_shards, config.block_size);
             b.iter(|| {
                 let shards = erasure.encode_data(black_box(data)).unwrap();
@@ -105,99 +103,55 @@ fn bench_encode_performance(c: &mut Criterion) {
             });
         });
         group.finish();
 
-        // 如果SIMD feature启用,测试专用的erasure实现对比
-        #[cfg(feature = "reed-solomon-simd")]
-        {
-            use ecstore::erasure_coding::ReedSolomonEncoder;
+        // Test direct SIMD implementation for large shards (>= 512 bytes)
+        let shard_size = calc_shard_size(config.data_size, config.data_shards);
+        if shard_size >= 512 {
+            let mut simd_group = c.benchmark_group("encode_simd_direct");
+            simd_group.throughput(Throughput::Bytes(config.data_size as u64));
+            simd_group.sample_size(10);
+            simd_group.measurement_time(Duration::from_secs(5));
 
-            let mut erasure_group = c.benchmark_group("encode_erasure_only");
-            erasure_group.throughput(Throughput::Bytes(config.data_size as u64));
-            erasure_group.sample_size(10);
-            erasure_group.measurement_time(Duration::from_secs(5));
+            simd_group.bench_with_input(BenchmarkId::new("simd_direct", &config.name), &(&data, &config), |b, (data, config)| {
+                b.iter(|| {
+                    // Direct SIMD implementation
+                    let per_shard_size = calc_shard_size(data.len(), config.data_shards);
+                    match reed_solomon_simd::ReedSolomonEncoder::new(config.data_shards, config.parity_shards, per_shard_size) {
+                        Ok(mut encoder) => {
+                            // Create properly sized buffer and fill with data
+                            let mut buffer = vec![0u8; per_shard_size * config.data_shards];
+                            let copy_len = data.len().min(buffer.len());
+                            buffer[..copy_len].copy_from_slice(&data[..copy_len]);
 
-            erasure_group.bench_with_input(
-                BenchmarkId::new("erasure_impl", &config.name),
-                &(&data, &config),
-                |b, (data, config)| {
-                    let encoder = ReedSolomonEncoder::new(config.data_shards, config.parity_shards).unwrap();
-                    b.iter(|| {
-                        // 创建编码所需的数据结构
-                        let per_shard_size = calc_shard_size(data.len(), config.data_shards);
-                        let total_size = per_shard_size * (config.data_shards + config.parity_shards);
-                        let mut buffer = vec![0u8; total_size];
-                        buffer[..data.len()].copy_from_slice(data);
-
-                        let slices: smallvec::SmallVec<[&mut [u8]; 16]> = buffer.chunks_exact_mut(per_shard_size).collect();
-
-                        encoder.encode(black_box(slices)).unwrap();
-                        black_box(&buffer);
-                    });
-                },
-            );
-            erasure_group.finish();
-        }
-
-        // 如果使用SIMD feature,测试直接SIMD实现对比
-        #[cfg(feature = "reed-solomon-simd")]
-        {
-            // 只对大shard测试SIMD(小于512字节的shard SIMD性能不佳)
-            let shard_size = calc_shard_size(config.data_size, config.data_shards);
-            if shard_size >= 512 {
-                let mut simd_group = c.benchmark_group("encode_simd_direct");
-                simd_group.throughput(Throughput::Bytes(config.data_size as u64));
-                simd_group.sample_size(10);
-                simd_group.measurement_time(Duration::from_secs(5));
-
-                simd_group.bench_with_input(
-                    BenchmarkId::new("simd_impl", &config.name),
-                    &(&data, &config),
-                    |b, (data, config)| {
-                        b.iter(|| {
-                            // 直接使用SIMD实现
-                            let per_shard_size = calc_shard_size(data.len(), config.data_shards);
-                            match reed_solomon_simd::ReedSolomonEncoder::new(
-                                config.data_shards,
-                                config.parity_shards,
-                                per_shard_size,
-                            ) {
-                                Ok(mut encoder) => {
-                                    // 创建正确大小的缓冲区,并填充数据
-                                    let mut buffer = vec![0u8; per_shard_size * config.data_shards];
-                                    let copy_len = data.len().min(buffer.len());
-                                    buffer[..copy_len].copy_from_slice(&data[..copy_len]);
-
-                                    // 按正确的分片大小添加数据分片
-                                    for chunk in buffer.chunks_exact(per_shard_size) {
-                                        encoder.add_original_shard(black_box(chunk)).unwrap();
-                                    }
-
-                                    let result = encoder.encode().unwrap();
-                                    black_box(result);
-                                }
-                                Err(_) => {
-                                    // SIMD不支持此配置,跳过
-                                    black_box(());
-                                }
+                            // Add data shards with correct shard size
+                            for chunk in buffer.chunks_exact(per_shard_size) {
+                                encoder.add_original_shard(black_box(chunk)).unwrap();
                             }
-                        });
-                    },
-                );
-                simd_group.finish();
-            }
+
+                            let result = encoder.encode().unwrap();
+                            black_box(result);
+                        }
+                        Err(_) => {
+                            // SIMD doesn't support this configuration, skip
+                            black_box(());
+                        }
+                    }
+                });
+            });
+            simd_group.finish();
         }
     }
 }
 
-/// 基准测试: 解码性能对比
+/// Benchmark: Decoding performance
 fn bench_decode_performance(c: &mut Criterion) {
     let configs = vec![
-        // 中等数据量测试 - 64KB
+        // Medium data tests - 64KB
         BenchConfig::new(4, 2, 64 * 1024, 64 * 1024),
         BenchConfig::new(6, 3, 64 * 1024, 64 * 1024),
-        // 大数据量测试 - 1MB
+        // Large data tests - 1MB
         BenchConfig::new(4, 2, 1024 * 1024, 1024 * 1024),
         BenchConfig::new(6, 3, 1024 * 1024, 1024 * 1024),
-        // 超大数据量测试 - 16MB
+        // Extra large data tests - 16MB
         BenchConfig::new(4, 2, 16 * 1024 * 1024, 16 * 1024 * 1024),
     ];
 
@@ -205,25 +159,25 @@
         let data = generate_test_data(config.data_size);
         let erasure = Erasure::new(config.data_shards, config.parity_shards, config.block_size);
 
-        // 预先编码数据
+        // Pre-encode data
         let encoded_shards = erasure.encode_data(&data).unwrap();
 
-        // 测试当前默认实现的解码性能
-        let mut group = c.benchmark_group("decode_current");
+        // Test SIMD decoding performance
+        let mut group = c.benchmark_group("decode_simd");
         group.throughput(Throughput::Bytes(config.data_size as u64));
         group.sample_size(10);
         group.measurement_time(Duration::from_secs(5));
 
         group.bench_with_input(
-            BenchmarkId::new("current_impl", &config.name),
+            BenchmarkId::new("simd_impl", &config.name),
             &(&encoded_shards, &config),
             |b, (shards, config)| {
                 let erasure = Erasure::new(config.data_shards, config.parity_shards, config.block_size);
                 b.iter(|| {
-                    // 模拟数据丢失 - 丢失一个数据分片和一个奇偶分片
+                    // Simulate data loss - lose one data shard and one parity shard
                     let mut shards_opt: Vec<Option<Vec<u8>>> = shards.iter().map(|shard| Some(shard.to_vec())).collect();
 
-                    // 丢失最后一个数据分片和第一个奇偶分片
+                    // Lose last data shard and first parity shard
                     shards_opt[config.data_shards - 1] = None;
                     shards_opt[config.data_shards] = None;
 
@@ -234,58 +188,52 @@
         );
         group.finish();
 
-        // 如果使用混合模式(默认),测试SIMD解码性能
+        // Test direct SIMD decoding for large shards
+        let shard_size = calc_shard_size(config.data_size, config.data_shards);
+        if shard_size >= 512 {
+            let mut simd_group = c.benchmark_group("decode_simd_direct");
+            simd_group.throughput(Throughput::Bytes(config.data_size as u64));
+            simd_group.sample_size(10);
+            simd_group.measurement_time(Duration::from_secs(5));
 
-        {
-            let shard_size = calc_shard_size(config.data_size, config.data_shards);
-            if shard_size >= 512 {
-                let mut simd_group = c.benchmark_group("decode_simd_direct");
-                simd_group.throughput(Throughput::Bytes(config.data_size as u64));
-                simd_group.sample_size(10);
-                simd_group.measurement_time(Duration::from_secs(5));
-
-                simd_group.bench_with_input(
-                    BenchmarkId::new("simd_impl", &config.name),
-                    &(&encoded_shards, &config),
-                    |b, (shards, config)| {
-                        b.iter(|| {
-                            let per_shard_size = calc_shard_size(config.data_size, config.data_shards);
-                            match reed_solomon_simd::ReedSolomonDecoder::new(
-                                config.data_shards,
-                                config.parity_shards,
-                                per_shard_size,
-                            ) {
-                                Ok(mut decoder) => {
-                                    // 添加可用的分片(除了丢失的)
-                                    for (i, shard) in shards.iter().enumerate() {
-                                        if i != config.data_shards - 1 && i != config.data_shards {
-                                            if i < config.data_shards {
-                                                decoder.add_original_shard(i, black_box(shard)).unwrap();
-                                            } else {
-                                                let recovery_idx = i - config.data_shards;
-                                                decoder.add_recovery_shard(recovery_idx, black_box(shard)).unwrap();
-                                            }
                                         }
                                     }
+            simd_group.bench_with_input(
BenchmarkId::new("simd_direct", &config.name), + &(&encoded_shards, &config), + |b, (shards, config)| { + b.iter(|| { + let per_shard_size = calc_shard_size(config.data_size, config.data_shards); + match reed_solomon_simd::ReedSolomonDecoder::new(config.data_shards, config.parity_shards, per_shard_size) + { + Ok(mut decoder) => { + // Add available shards (except lost ones) + for (i, shard) in shards.iter().enumerate() { + if i != config.data_shards - 1 && i != config.data_shards { + if i < config.data_shards { + decoder.add_original_shard(i, black_box(shard)).unwrap(); + } else { + let recovery_idx = i - config.data_shards; + decoder.add_recovery_shard(recovery_idx, black_box(shard)).unwrap(); } } + } - let result = decoder.decode().unwrap(); - black_box(result); - } - Err(_) => { - // SIMD不支持此配置,跳过 - black_box(()); - } + let result = decoder.decode().unwrap(); + black_box(result); } - }); - }, - ); - simd_group.finish(); - } + Err(_) => { + // SIMD doesn't support this configuration, skip + black_box(()); + } + } + }); + }, + ); + simd_group.finish(); } } } -/// 基准测试: 不同分片大小对性能的影响 +/// Benchmark: Impact of different shard sizes on performance fn bench_shard_size_impact(c: &mut Criterion) { let shard_sizes = vec![64, 128, 256, 512, 1024, 2048, 4096, 8192]; let data_shards = 4; @@ -301,8 +249,8 @@ fn bench_shard_size_impact(c: &mut Criterion) { group.throughput(Throughput::Bytes(total_data_size as u64)); - // 测试当前实现 - group.bench_with_input(BenchmarkId::new("current", format!("shard_{}B", shard_size)), &data, |b, data| { + // Test SIMD implementation + group.bench_with_input(BenchmarkId::new("simd", format!("shard_{}B", shard_size)), &data, |b, data| { let erasure = Erasure::new(data_shards, parity_shards, total_data_size); b.iter(|| { let shards = erasure.encode_data(black_box(data)).unwrap(); @@ -313,19 +261,19 @@ fn bench_shard_size_impact(c: &mut Criterion) { group.finish(); } -/// 基准测试: 编码配置对性能的影响 +/// Benchmark: Impact of coding configurations on performance fn bench_coding_configurations(c: &mut Criterion) { let configs = vec![ - (2, 1), // 最小冗余 - (3, 2), // 中等冗余 - (4, 2), // 常用配置 - (6, 3), // 50%冗余 - (8, 4), // 50%冗余,更多分片 - (10, 5), // 50%冗余,大量分片 - (12, 6), // 50%冗余,更大量分片 + (2, 1), // Minimal redundancy + (3, 2), // Medium redundancy + (4, 2), // Common configuration + (6, 3), // 50% redundancy + (8, 4), // 50% redundancy, more shards + (10, 5), // 50% redundancy, many shards + (12, 6), // 50% redundancy, very many shards ]; - let data_size = 1024 * 1024; // 1MB测试数据 + let data_size = 1024 * 1024; // 1MB test data let data = generate_test_data(data_size); let mut group = c.benchmark_group("coding_configurations"); @@ -347,17 +295,17 @@ fn bench_coding_configurations(c: &mut Criterion) { group.finish(); } -/// 基准测试: 内存使用模式 +/// Benchmark: Memory usage patterns fn bench_memory_patterns(c: &mut Criterion) { let data_shards = 4; let parity_shards = 2; - let block_size = 1024 * 1024; // 1MB块 + let block_size = 1024 * 1024; // 1MB block let mut group = c.benchmark_group("memory_patterns"); group.sample_size(10); group.measurement_time(Duration::from_secs(5)); - // 测试重复使用同一个Erasure实例 + // Test reusing the same Erasure instance group.bench_function("reuse_erasure_instance", |b| { let erasure = Erasure::new(data_shards, parity_shards, block_size); let data = generate_test_data(block_size); @@ -368,7 +316,7 @@ fn bench_memory_patterns(c: &mut Criterion) { }); }); - // 测试每次创建新的Erasure实例 + // Test creating new Erasure instance each time group.bench_function("new_erasure_instance", |b| { let 
         let data = generate_test_data(block_size);
@@ -382,7 +330,7 @@
     group.finish();
 }
 
-// 基准测试组配置
+// Benchmark group configuration
 criterion_group!(
     benches,
     bench_encode_performance,