diff --git a/Cargo.toml b/Cargo.toml index 84e2e9c7..113938c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,7 +91,7 @@ argon2 = { version = "0.5.3", features = ["std"] } atoi = "2.0.0" async-channel = "2.5.0" async-recursion = "1.1.1" -async-trait = "0.1.88" +async-trait = "0.1" async-compression = { version = "0.4.0" } atomic_enum = "0.3.0" aws-sdk-s3 = "1.96.0" diff --git a/crates/ahm/architecture.md b/crates/ahm/architecture.md deleted file mode 100644 index fab56c49..00000000 --- a/crates/ahm/architecture.md +++ /dev/null @@ -1,557 +0,0 @@ -# RustFS Advanced Health & Metrics (AHM) System Architecture - -## Overview - -The RustFS AHM system is a newly designed distributed storage health monitoring and repair system that provides intelligent scanning, automatic repair, rich metrics, and policy-driven management capabilities. - -## System Architecture - -### Overall Architecture Diagram - -``` -┌─────────────────────────────────────┐ -│ API Layer (REST/gRPC) │ -├─────────────────────────────────────┤ -│ Policy & Configuration │ -├─────────────────────────────────────┤ -│ Core Coordination Engine │ -├─────────────────────────────────────┤ -│ Scanner Engine │ Heal Engine │ -├─────────────────────────────────────┤ -│ Metrics & Observability │ -├─────────────────────────────────────┤ -│ Storage Abstraction │ -└─────────────────────────────────────┘ -``` - -### Module Structure - -``` -rustfs/crates/ecstore/src/ahm/ -├── mod.rs # Module entry point and public interfaces -├── core/ # Core engines -│ ├── coordinator.rs # Distributed coordinator - event routing and state management -│ ├── scheduler.rs # Task scheduler - priority queue and work assignment -│ └── lifecycle.rs # Lifecycle manager - system startup/shutdown control -├── scanner/ # Scanning system -│ ├── engine.rs # Scan engine - scan process control -│ ├── object_scanner.rs # Object scanner - object-level integrity checks -│ ├── disk_scanner.rs # Disk scanner - disk-level health checks -│ ├── 
metrics_collector.rs # Metrics collector - scan process data collection -│ └── bandwidth_limiter.rs # Bandwidth limiter - I/O resource control -├── heal/ # Repair system -│ ├── engine.rs # Heal engine - repair process control -│ ├── priority_queue.rs # Priority queue - repair task ordering -│ ├── repair_worker.rs # Repair worker - actual repair execution -│ └── validation.rs # Repair validator - repair result verification -├── metrics/ # Metrics system -│ ├── collector.rs # Metrics collector - real-time data collection -│ ├── aggregator.rs # Metrics aggregator - data aggregation and computation -│ ├── storage.rs # Metrics storage - time-series data storage -│ └── reporter.rs # Metrics reporter - external system export -├── policy/ # Policy system -│ ├── scan_policy.rs # Scan policy - scan behavior configuration -│ ├── heal_policy.rs # Heal policy - repair priority and strategy -│ └── retention_policy.rs # Retention policy - data lifecycle management -└── api/ # API interfaces - ├── admin_api.rs # Admin API - system management operations - ├── metrics_api.rs # Metrics API - metrics query and export - └── status_api.rs # Status API - system status monitoring -``` - -## Core Design Principles - -### 1. Event-Driven Architecture - -```rust -pub enum SystemEvent { - ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata }, - HealthIssueDetected { issue_type: HealthIssueType, severity: Severity }, - HealCompleted { result: HealResult }, - ScanCycleCompleted { statistics: ScanStatistics }, - ResourceUsageUpdated { usage: ResourceUsage }, -} -``` - -- **Scanner** generates discovery events -- **Heal** responds to repair events -- **Metrics** collects all event statistics -- **Policy** controls event processing strategies - -### 2. 
Layered Modular Design - -#### **API Layer**: REST/gRPC interfaces -- Unified response format -- Comprehensive error handling -- Authentication and authorization support - -#### **Policy Layer**: Configurable business rules -- Scan frequency and depth control -- Repair priority policies -- Data retention rules - -#### **Coordination Layer**: System coordination and scheduling -- Event routing and distribution -- Resource management and allocation -- Task scheduling and execution - -#### **Engine Layer**: Core business logic -- Intelligent scanning algorithms -- Adaptive repair strategies -- Performance optimization control - -#### **Metrics Layer**: Observability support -- Real-time metrics collection -- Historical trend analysis -- Multi-format export - -### 3. Multi-Mode Scanning Strategies - -```rust -pub enum ScanStrategy { - Full { mode: ScanMode, scope: ScanScope }, // Full scan - Incremental { since: Instant, mode: ScanMode }, // Incremental scan - Smart { sample_rate: f64, favor_unscanned: bool }, // Smart sampling - Targeted { targets: Vec, mode: ScanMode }, // Targeted scan -} - -pub enum ScanMode { - Quick, // Quick scan - metadata only - Normal, // Normal scan - basic integrity verification - Deep, // Deep scan - includes bit-rot detection -} -``` - -### 4. Priority-Based Repair System - -```rust -pub enum HealPriority { - Low = 0, - Normal = 1, - High = 2, - Critical = 3, - Emergency = 4, -} - -pub enum HealMode { - RealTime, // Real-time repair - triggered on GET/PUT - Background, // Background repair - scheduled tasks - OnDemand, // On-demand repair - admin triggered - Emergency, // Emergency repair - critical issues -} -``` - -## API Usage Guide - -### 1. 
System Management API - -#### Start AHM System - -```http -POST /admin/system/start -Content-Type: application/json - -{ - "coordinator": { - "event_buffer_size": 10000, - "max_concurrent_operations": 1000 - }, - "scanner": { - "default_scan_mode": "Normal", - "scan_interval": "24h" - }, - "heal": { - "max_workers": 16, - "queue_capacity": 50000 - } -} -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "system_id": "ahm-001", - "status": "Running", - "started_at": "2024-01-15T10:30:00Z" - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -#### Get System Status - -```http -GET /status/health -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "status": "Running", - "version": "1.0.0", - "uptime_seconds": 3600, - "subsystems": { - "scanner": { - "status": "Scanning", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - }, - "heal": { - "status": "Idle", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - }, - "metrics": { - "status": "Running", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - } - } - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -### 2. 
Scan Management API - -#### Start Scan Task - -```http -POST /admin/scan/start -Content-Type: application/json - -{ - "strategy": { - "type": "Full", - "mode": "Normal", - "scope": { - "buckets": ["important-data", "user-uploads"], - "include_system_objects": false, - "max_objects": 1000000 - } - }, - "priority": "High" -} -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "scan_id": "scan-12345", - "status": "Started", - "estimated_duration": "2h30m", - "estimated_objects": 850000 - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -#### Query Scan Status - -```http -GET /admin/scan/{scan_id}/status -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "scan_id": "scan-12345", - "status": "Scanning", - "progress": { - "objects_scanned": 425000, - "bytes_scanned": 1073741824000, - "issues_detected": 23, - "completion_percentage": 50.0, - "scan_rate_ops": 117.5, - "scan_rate_bps": 268435456, - "elapsed_time": "1h15m", - "estimated_remaining": "1h15m" - }, - "issues": [ - { - "issue_type": "MissingShards", - "severity": "High", - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "description": "Missing 1 data shard", - "detected_at": "2024-01-15T11:15:00Z" - } - ] - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -### 3. 
Heal Management API - -#### Submit Heal Request - -```http -POST /admin/heal/request -Content-Type: application/json - -{ - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "version_id": null, - "priority": "High", - "mode": "OnDemand", - "max_retries": 3 -} -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "heal_request_id": "heal-67890", - "status": "Queued", - "priority": "High", - "estimated_start": "2024-01-15T11:50:00Z", - "queue_position": 5 - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -#### Query Heal Status - -```http -GET /admin/heal/{heal_request_id}/status -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "heal_request_id": "heal-67890", - "status": "Completed", - "result": { - "success": true, - "shards_repaired": 1, - "total_shards": 8, - "duration": "45s", - "strategy_used": "ParityShardRepair", - "validation_results": [ - { - "validation_type": "Checksum", - "passed": true, - "details": "Object checksum verified", - "duration": "2s" - }, - { - "validation_type": "ShardCount", - "passed": true, - "details": "All 8 shards present", - "duration": "1s" - } - ] - } - }, - "timestamp": "2024-01-15T11:46:00Z" -} -``` - -### 4. 
Metrics Query API - -#### Get System Metrics - -```http -GET /metrics/system?period=1h&metrics=objects_total,scan_rate,heal_success_rate -``` - -**Response Example:** -```json -{ - "success": true, - "data": { - "period": "1h", - "timestamp_range": { - "start": "2024-01-15T10:45:00Z", - "end": "2024-01-15T11:45:00Z" - }, - "metrics": { - "objects_total": { - "value": 2500000, - "unit": "count", - "labels": {} - }, - "scan_rate_objects_per_second": { - "value": 117.5, - "unit": "ops", - "labels": {} - }, - "heal_success_rate": { - "value": 0.98, - "unit": "ratio", - "labels": {} - } - } - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -#### Export Prometheus Format Metrics - -```http -GET /metrics/prometheus -``` - -**Response Example:** -``` -# HELP rustfs_objects_total Total number of objects in the system -# TYPE rustfs_objects_total gauge -rustfs_objects_total 2500000 - -# HELP rustfs_scan_rate_objects_per_second Object scanning rate -# TYPE rustfs_scan_rate_objects_per_second gauge -rustfs_scan_rate_objects_per_second 117.5 - -# HELP rustfs_heal_success_rate Healing operation success rate -# TYPE rustfs_heal_success_rate gauge -rustfs_heal_success_rate 0.98 - -# HELP rustfs_health_issues_total Total health issues detected -# TYPE rustfs_health_issues_total counter -rustfs_health_issues_total{severity="critical"} 0 -rustfs_health_issues_total{severity="high"} 3 -rustfs_health_issues_total{severity="medium"} 15 -rustfs_health_issues_total{severity="low"} 45 -``` - -### 5. 
Policy Configuration API - -#### Update Scan Policy - -```http -PUT /admin/policy/scan -Content-Type: application/json - -{ - "default_scan_interval": "12h", - "deep_scan_probability": 0.1, - "bandwidth_limit_mbps": 100, - "concurrent_scanners": 4, - "skip_system_objects": true, - "priority_buckets": ["critical-data", "user-data"] -} -``` - -#### Update Heal Policy - -```http -PUT /admin/policy/heal -Content-Type: application/json - -{ - "max_concurrent_heals": 8, - "emergency_heal_timeout": "5m", - "auto_heal_enabled": true, - "heal_verification_required": true, - "priority_mapping": { - "critical_buckets": "Emergency", - "important_buckets": "High", - "standard_buckets": "Normal" - } -} -``` - -## Usage Examples - -### Complete Monitoring and Repair Workflow - -```bash -# 1. Start AHM system -curl -X POST http://localhost:9000/admin/system/start \ - -H "Content-Type: application/json" \ - -d '{"scanner": {"default_scan_mode": "Normal"}}' - -# 2. Start full scan -SCAN_ID=$(curl -X POST http://localhost:9000/admin/scan/start \ - -H "Content-Type: application/json" \ - -d '{"strategy": {"type": "Full", "mode": "Normal"}}' | \ - jq -r '.data.scan_id') - -# 3. Monitor scan progress -watch "curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | jq '.data.progress'" - -# 4. View discovered issues -curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | \ - jq '.data.issues[]' - -# 5. Start repair for discovered issues -HEAL_ID=$(curl -X POST http://localhost:9000/admin/heal/request \ - -H "Content-Type: application/json" \ - -d '{ - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "priority": "High" - }' | jq -r '.data.heal_request_id') - -# 6. Monitor repair progress -watch "curl -s http://localhost:9000/admin/heal/$HEAL_ID/status | jq '.data'" - -# 7. View system metrics -curl -s http://localhost:9000/metrics/system?period=1h | jq '.data.metrics' - -# 8. 
Export Prometheus metrics -curl -s http://localhost:9000/metrics/prometheus -``` - -## Key Features - -### 1. Intelligent Scanning -- **Multi-level scan modes**: Quick/Normal/Deep three depths -- **Adaptive sampling**: Intelligent object selection based on historical data -- **Bandwidth control**: Configurable I/O resource limits -- **Incremental scanning**: Timestamp-based change detection - -### 2. Intelligent Repair -- **Priority queue**: Repair ordering based on business importance -- **Multiple repair strategies**: Data shard, parity shard, hybrid repair -- **Real-time validation**: Post-repair integrity verification -- **Retry mechanism**: Configurable failure retry policies - -### 3. Rich Metrics -- **Real-time statistics**: Object counts, storage usage, performance metrics -- **Historical trends**: Time-series data storage and analysis -- **Multi-format export**: Prometheus, JSON, CSV formats -- **Custom metrics**: Extensible metrics definition framework - -### 4. Policy-Driven -- **Configurable policies**: Independent configuration for scan, heal, retention policies -- **Dynamic adjustment**: Runtime policy updates without restart -- **Business alignment**: Differentiated handling based on business importance - -## Deployment Recommendations - -### 1. Resource Configuration -- **CPU**: Recommended 16+ cores for parallel scanning and repair -- **Memory**: Recommended 32GB+ for metrics cache and task queues -- **Network**: Recommended gigabit+ bandwidth for cross-node data sync -- **Storage**: Recommended SSD for metrics data storage - -### 2. Monitoring Integration -- **Prometheus**: Metrics collection and alerting -- **Grafana**: Visualization dashboards -- **ELK Stack**: Log aggregation and analysis -- **Jaeger**: Distributed tracing - -### 3. 
High Availability Deployment -- **Multi-instance deployment**: Avoid single points of failure -- **Load balancing**: API request distribution -- **Data backup**: Metrics and configuration data backup -- **Failover**: Automatic failure detection and switching - -This architecture design provides RustFS with modern, scalable, and highly observable health monitoring and repair capabilities that meet the operational requirements of enterprise-grade distributed storage systems. \ No newline at end of file diff --git a/crates/ahm/architecture_ch.md b/crates/ahm/architecture_ch.md deleted file mode 100644 index e349cf51..00000000 --- a/crates/ahm/architecture_ch.md +++ /dev/null @@ -1,557 +0,0 @@ -# RustFS Advanced Health & Metrics (AHM) 系统架构设计 - -## 概述 - -RustFS AHM 系统是一个全新设计的分布式存储健康监控和修复系统,提供智能扫描、自动修复、丰富指标和策略驱动的管理能力。 - -## 系统架构 - -### 整体架构图 - -``` -┌─────────────────────────────────────┐ -│ API Layer (REST/gRPC) │ -├─────────────────────────────────────┤ -│ Policy & Configuration │ -├─────────────────────────────────────┤ -│ Core Coordination Engine │ -├─────────────────────────────────────┤ -│ Scanner Engine │ Heal Engine │ -├─────────────────────────────────────┤ -│ Metrics & Observability │ -├─────────────────────────────────────┤ -│ Storage Abstraction │ -└─────────────────────────────────────┘ -``` - -### 模块结构 - -``` -rustfs/crates/ecstore/src/ahm/ -├── mod.rs # 模块入口和公共接口 -├── core/ # 核心引擎 -│ ├── coordinator.rs # 分布式协调器 - 事件路由和状态管理 -│ ├── scheduler.rs # 任务调度器 - 优先级队列和工作分配 -│ └── lifecycle.rs # 生命周期管理器 - 系统启停控制 -├── scanner/ # 扫描系统 -│ ├── engine.rs # 扫描引擎 - 扫描流程控制 -│ ├── object_scanner.rs # 对象扫描器 - 对象级完整性检查 -│ ├── disk_scanner.rs # 磁盘扫描器 - 磁盘级健康检查 -│ ├── metrics_collector.rs # 指标收集器 - 扫描过程数据收集 -│ └── bandwidth_limiter.rs # 带宽限制器 - I/O 资源控制 -├── heal/ # 修复系统 -│ ├── engine.rs # 修复引擎 - 修复流程控制 -│ ├── priority_queue.rs # 优先级队列 - 修复任务排序 -│ ├── repair_worker.rs # 修复工作器 - 实际修复执行 -│ └── validation.rs # 修复验证器 - 修复结果验证 -├── metrics/ # 指标系统 -│ ├── collector.rs # 指标收集器 - 实时数据收集 
-│ ├── aggregator.rs # 指标聚合器 - 数据聚合计算 -│ ├── storage.rs # 指标存储器 - 时序数据存储 -│ └── reporter.rs # 指标报告器 - 外部系统导出 -├── policy/ # 策略系统 -│ ├── scan_policy.rs # 扫描策略 - 扫描行为配置 -│ ├── heal_policy.rs # 修复策略 - 修复优先级和策略 -│ └── retention_policy.rs # 保留策略 - 数据生命周期管理 -└── api/ # API接口 - ├── admin_api.rs # 管理API - 系统管理操作 - ├── metrics_api.rs # 指标API - 指标查询和导出 - └── status_api.rs # 状态API - 系统状态监控 -``` - -## 核心设计理念 - -### 1. 事件驱动架构 - -```rust -pub enum SystemEvent { - ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata }, - HealthIssueDetected { issue_type: HealthIssueType, severity: Severity }, - HealCompleted { result: HealResult }, - ScanCycleCompleted { statistics: ScanStatistics }, - ResourceUsageUpdated { usage: ResourceUsage }, -} -``` - -- **Scanner** 产生发现事件 -- **Heal** 响应修复事件 -- **Metrics** 收集所有事件统计 -- **Policy** 控制事件处理策略 - -### 2. 分层模块化设计 - -#### **API层**: REST/gRPC接口 -- 统一的响应格式 -- 完整的错误处理 -- 认证和授权支持 - -#### **策略层**: 可配置的业务规则 -- 扫描频率和深度控制 -- 修复优先级策略 -- 数据保留规则 - -#### **协调层**: 系统协调和调度 -- 事件路由分发 -- 资源管理分配 -- 任务调度执行 - -#### **引擎层**: 核心业务逻辑 -- 智能扫描算法 -- 自适应修复策略 -- 性能优化控制 - -#### **指标层**: 可观测性支持 -- 实时指标收集 -- 历史趋势分析 -- 多格式导出 - -### 3. 多模式扫描策略 - -```rust -pub enum ScanStrategy { - Full { mode: ScanMode, scope: ScanScope }, // 全量扫描 - Incremental { since: Instant, mode: ScanMode }, // 增量扫描 - Smart { sample_rate: f64, favor_unscanned: bool }, // 智能采样 - Targeted { targets: Vec, mode: ScanMode }, // 定向扫描 -} - -pub enum ScanMode { - Quick, // 快速扫描 - 仅元数据检查 - Normal, // 标准扫描 - 基础完整性验证 - Deep, // 深度扫描 - 包含位腐蚀检测 -} -``` - -### 4. 优先级修复系统 - -```rust -pub enum HealPriority { - Low = 0, - Normal = 1, - High = 2, - Critical = 3, - Emergency = 4, -} - -pub enum HealMode { - RealTime, // 实时修复 - GET/PUT时触发 - Background, // 后台修复 - 计划任务 - OnDemand, // 按需修复 - 管理员触发 - Emergency, // 紧急修复 - 关键问题 -} -``` - -## API 使用指南 - -### 1. 
系统管理 API - -#### 启动 AHM 系统 - -```http -POST /admin/system/start -Content-Type: application/json - -{ - "coordinator": { - "event_buffer_size": 10000, - "max_concurrent_operations": 1000 - }, - "scanner": { - "default_scan_mode": "Normal", - "scan_interval": "24h" - }, - "heal": { - "max_workers": 16, - "queue_capacity": 50000 - } -} -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "system_id": "ahm-001", - "status": "Running", - "started_at": "2024-01-15T10:30:00Z" - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -#### 获取系统状态 - -```http -GET /status/health -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "status": "Running", - "version": "1.0.0", - "uptime_seconds": 3600, - "subsystems": { - "scanner": { - "status": "Scanning", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - }, - "heal": { - "status": "Idle", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - }, - "metrics": { - "status": "Running", - "last_check": "2024-01-15T10:29:00Z", - "error_message": null - } - } - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -### 2. 
扫描管理 API - -#### 启动扫描任务 - -```http -POST /admin/scan/start -Content-Type: application/json - -{ - "strategy": { - "type": "Full", - "mode": "Normal", - "scope": { - "buckets": ["important-data", "user-uploads"], - "include_system_objects": false, - "max_objects": 1000000 - } - }, - "priority": "High" -} -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "scan_id": "scan-12345", - "status": "Started", - "estimated_duration": "2h30m", - "estimated_objects": 850000 - }, - "timestamp": "2024-01-15T10:30:00Z" -} -``` - -#### 查询扫描状态 - -```http -GET /admin/scan/{scan_id}/status -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "scan_id": "scan-12345", - "status": "Scanning", - "progress": { - "objects_scanned": 425000, - "bytes_scanned": 1073741824000, - "issues_detected": 23, - "completion_percentage": 50.0, - "scan_rate_ops": 117.5, - "scan_rate_bps": 268435456, - "elapsed_time": "1h15m", - "estimated_remaining": "1h15m" - }, - "issues": [ - { - "issue_type": "MissingShards", - "severity": "High", - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "description": "Missing 1 data shard", - "detected_at": "2024-01-15T11:15:00Z" - } - ] - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -### 3. 
修复管理 API - -#### 提交修复请求 - -```http -POST /admin/heal/request -Content-Type: application/json - -{ - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "version_id": null, - "priority": "High", - "mode": "OnDemand", - "max_retries": 3 -} -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "heal_request_id": "heal-67890", - "status": "Queued", - "priority": "High", - "estimated_start": "2024-01-15T11:50:00Z", - "queue_position": 5 - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -#### 查询修复状态 - -```http -GET /admin/heal/{heal_request_id}/status -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "heal_request_id": "heal-67890", - "status": "Completed", - "result": { - "success": true, - "shards_repaired": 1, - "total_shards": 8, - "duration": "45s", - "strategy_used": "ParityShardRepair", - "validation_results": [ - { - "validation_type": "Checksum", - "passed": true, - "details": "Object checksum verified", - "duration": "2s" - }, - { - "validation_type": "ShardCount", - "passed": true, - "details": "All 8 shards present", - "duration": "1s" - } - ] - } - }, - "timestamp": "2024-01-15T11:46:00Z" -} -``` - -### 4. 
指标查询 API - -#### 获取系统指标 - -```http -GET /metrics/system?period=1h&metrics=objects_total,scan_rate,heal_success_rate -``` - -**响应示例:** -```json -{ - "success": true, - "data": { - "period": "1h", - "timestamp_range": { - "start": "2024-01-15T10:45:00Z", - "end": "2024-01-15T11:45:00Z" - }, - "metrics": { - "objects_total": { - "value": 2500000, - "unit": "count", - "labels": {} - }, - "scan_rate_objects_per_second": { - "value": 117.5, - "unit": "ops", - "labels": {} - }, - "heal_success_rate": { - "value": 0.98, - "unit": "ratio", - "labels": {} - } - } - }, - "timestamp": "2024-01-15T11:45:00Z" -} -``` - -#### 导出 Prometheus 格式指标 - -```http -GET /metrics/prometheus -``` - -**响应示例:** -``` -# HELP rustfs_objects_total Total number of objects in the system -# TYPE rustfs_objects_total gauge -rustfs_objects_total 2500000 - -# HELP rustfs_scan_rate_objects_per_second Object scanning rate -# TYPE rustfs_scan_rate_objects_per_second gauge -rustfs_scan_rate_objects_per_second 117.5 - -# HELP rustfs_heal_success_rate Healing operation success rate -# TYPE rustfs_heal_success_rate gauge -rustfs_heal_success_rate 0.98 - -# HELP rustfs_health_issues_total Total health issues detected -# TYPE rustfs_health_issues_total counter -rustfs_health_issues_total{severity="critical"} 0 -rustfs_health_issues_total{severity="high"} 3 -rustfs_health_issues_total{severity="medium"} 15 -rustfs_health_issues_total{severity="low"} 45 -``` - -### 5. 
策略配置 API - -#### 更新扫描策略 - -```http -PUT /admin/policy/scan -Content-Type: application/json - -{ - "default_scan_interval": "12h", - "deep_scan_probability": 0.1, - "bandwidth_limit_mbps": 100, - "concurrent_scanners": 4, - "skip_system_objects": true, - "priority_buckets": ["critical-data", "user-data"] -} -``` - -#### 更新修复策略 - -```http -PUT /admin/policy/heal -Content-Type: application/json - -{ - "max_concurrent_heals": 8, - "emergency_heal_timeout": "5m", - "auto_heal_enabled": true, - "heal_verification_required": true, - "priority_mapping": { - "critical_buckets": "Emergency", - "important_buckets": "High", - "standard_buckets": "Normal" - } -} -``` - -## 使用示例 - -### 完整的监控和修复流程 - -```bash -# 1. 启动 AHM 系统 -curl -X POST http://localhost:9000/admin/system/start \ - -H "Content-Type: application/json" \ - -d '{"scanner": {"default_scan_mode": "Normal"}}' - -# 2. 启动全量扫描 -SCAN_ID=$(curl -X POST http://localhost:9000/admin/scan/start \ - -H "Content-Type: application/json" \ - -d '{"strategy": {"type": "Full", "mode": "Normal"}}' | \ - jq -r '.data.scan_id') - -# 3. 监控扫描进度 -watch "curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | jq '.data.progress'" - -# 4. 查看发现的问题 -curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | \ - jq '.data.issues[]' - -# 5. 针对发现的问题启动修复 -HEAL_ID=$(curl -X POST http://localhost:9000/admin/heal/request \ - -H "Content-Type: application/json" \ - -d '{ - "bucket": "user-uploads", - "object": "photos/IMG_001.jpg", - "priority": "High" - }' | jq -r '.data.heal_request_id') - -# 6. 监控修复进度 -watch "curl -s http://localhost:9000/admin/heal/$HEAL_ID/status | jq '.data'" - -# 7. 查看系统指标 -curl -s http://localhost:9000/metrics/system?period=1h | jq '.data.metrics' - -# 8. 导出 Prometheus 指标 -curl -s http://localhost:9000/metrics/prometheus -``` - -## 关键特性 - -### 1. 智能扫描 -- **多级扫描模式**: Quick/Normal/Deep 三种深度 -- **自适应采样**: 基于历史数据智能选择扫描对象 -- **带宽控制**: 可配置的 I/O 资源限制 -- **增量扫描**: 基于时间戳的变化检测 - -### 2. 
智能修复 -- **优先级队列**: 基于业务重要性的修复排序 -- **多种修复策略**: 数据分片、奇偶校验、混合修复 -- **实时验证**: 修复后的完整性验证 -- **重试机制**: 可配置的失败重试策略 - -### 3. 丰富指标 -- **实时统计**: 对象数量、存储使用、性能指标 -- **历史趋势**: 时序数据存储和分析 -- **多格式导出**: Prometheus、JSON、CSV 等格式 -- **自定义指标**: 可扩展的指标定义框架 - -### 4. 策略驱动 -- **可配置策略**: 扫描、修复、保留策略独立配置 -- **动态调整**: 运行时策略更新,无需重启 -- **业务对齐**: 基于业务重要性的差异化处理 - -## 部署建议 - -### 1. 资源配置 -- **CPU**: 推荐 16+ 核心用于并行扫描和修复 -- **内存**: 推荐 32GB+ 用于指标缓存和任务队列 -- **网络**: 推荐千兆以上带宽用于跨节点数据同步 -- **存储**: 推荐 SSD 用于指标数据存储 - -### 2. 监控集成 -- **Prometheus**: 指标收集和告警 -- **Grafana**: 可视化仪表板 -- **ELK Stack**: 日志聚合和分析 -- **Jaeger**: 分布式链路追踪 - -### 3. 高可用部署 -- **多实例部署**: 避免单点故障 -- **负载均衡**: API 请求分发 -- **数据备份**: 指标和配置数据备份 -- **故障转移**: 自动故障检测和切换 - -这个架构设计为 RustFS 提供了现代化、可扩展、高可观测的健康监控和修复能力,能够满足企业级分布式存储系统的运维需求。 \ No newline at end of file diff --git a/crates/ahm/src/api/admin_api.rs b/crates/ahm/src/api/admin_api.rs deleted file mode 100644 index 19c1a494..00000000 --- a/crates/ahm/src/api/admin_api.rs +++ /dev/null @@ -1,843 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::sync::Arc; - -use tracing::{debug, error, info, warn}; - -use crate::{ - error::Result, - heal::HealEngine, - policy::{ScanPolicyEngine as PolicyEngine}, - scanner::{Engine as ScanEngine}, -}; - -use super::{HttpRequest, HttpResponse}; - -/// Configuration for the admin API -#[derive(Debug, Clone)] -pub struct AdminApiConfig { - /// Whether to enable admin API - pub enabled: bool, - /// Admin API prefix - pub prefix: String, - /// Authentication required - pub require_auth: bool, - /// Admin token - pub admin_token: Option, - /// Rate limiting for admin endpoints - pub rate_limit_requests_per_minute: u32, - /// Maximum request body size - pub max_request_size: usize, - /// Enable audit logging - pub enable_audit_logging: bool, - /// Audit log path - pub audit_log_path: Option, -} - -impl Default for AdminApiConfig { - fn default() -> Self { - Self { - enabled: true, - prefix: "/admin".to_string(), - require_auth: true, - admin_token: 
Some("admin-secret-token".to_string()), - rate_limit_requests_per_minute: 100, - max_request_size: 1024 * 1024, // 1 MB - enable_audit_logging: true, - audit_log_path: Some("/tmp/rustfs/admin-audit.log".to_string()), - } - } -} - -/// Admin API that provides administrative operations -pub struct AdminApi { - config: AdminApiConfig, - scan_engine: Arc, - heal_engine: Arc, - policy_engine: Arc, -} - -impl AdminApi { - /// Create a new admin API - pub async fn new( - config: AdminApiConfig, - scan_engine: Arc, - heal_engine: Arc, - policy_engine: Arc, - ) -> Result { - Ok(Self { - config, - scan_engine, - heal_engine, - policy_engine, - }) - } - - /// Get the configuration - pub fn config(&self) -> &AdminApiConfig { - &self.config - } - - /// Handle HTTP request - pub async fn handle_request(&self, request: HttpRequest) -> Result { - // Check authentication if required - if self.config.require_auth { - if !self.authenticate_request(&request).await? { - return Ok(HttpResponse { - status_code: 401, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Unauthorized", - "message": "Authentication required" - }).to_string(), - }); - } - } - - // Log audit if enabled - if self.config.enable_audit_logging { - self.log_audit(&request).await?; - } - - match request.path.as_str() { - // Scan operations - "/admin/scan/start" => self.start_scan(request).await, - "/admin/scan/stop" => self.stop_scan(request).await, - "/admin/scan/status" => self.get_scan_status(request).await, - "/admin/scan/config" => self.get_scan_config(request).await, - "/admin/scan/config" if request.method == "PUT" => self.update_scan_config(request).await, - - // Heal operations - "/admin/heal/start" => self.start_heal(request).await, - "/admin/heal/stop" => self.stop_heal(request).await, - "/admin/heal/status" => self.get_heal_status(request).await, - "/admin/heal/config" => self.get_heal_config(request).await, - "/admin/heal/config" if 
request.method == "PUT" => self.update_heal_config(request).await, - - // Policy operations - "/admin/policy/list" => self.list_policies(request).await, - "/admin/policy/get" => self.get_policy(request).await, - "/admin/policy/create" => self.create_policy(request).await, - "/admin/policy/update" => self.update_policy(request).await, - "/admin/policy/delete" => self.delete_policy(request).await, - - // System operations - "/admin/system/status" => self.get_system_status(request).await, - "/admin/system/config" => self.get_system_config(request).await, - "/admin/system/restart" => self.restart_system(request).await, - "/admin/system/shutdown" => self.shutdown_system(request).await, - - // Default 404 - _ => Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Not Found", - "message": "Admin endpoint not found" - }).to_string(), - }), - } - } - - /// Authenticate request - async fn authenticate_request(&self, request: &HttpRequest) -> Result { - if let Some(token) = &self.config.admin_token { - // Check for Authorization header - if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { - if auth_header.1 == format!("Bearer {}", token) { - return Ok(true); - } - } - - // Check for token in query parameters - if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { - if token_param.1 == *token { - return Ok(true); - } - } - } - - Ok(false) - } - - /// Log audit entry - async fn log_audit(&self, request: &HttpRequest) -> Result<()> { - let audit_entry = serde_json::json!({ - "timestamp": chrono::Utc::now().to_rfc3339(), - "method": request.method, - "path": request.path, - "ip": "127.0.0.1", // In real implementation, get from request - "user_agent": "admin-api", // In real implementation, get from headers - }); - - if let Some(log_path) = &self.config.audit_log_path { - // In a real 
implementation, this would write to the audit log file - debug!("Audit log entry: {}", audit_entry); - } - - Ok(()) - } - - /// Start scan operation - async fn start_scan(&self, _request: HttpRequest) -> Result { - match self.scan_engine.start_scan().await { - Ok(_) => { - info!("Scan started via admin API"); - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Scan started successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to start scan: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to start scan: {}", e) - }).to_string(), - }) - } - } - } - - /// Stop scan operation - async fn stop_scan(&self, _request: HttpRequest) -> Result { - match self.scan_engine.stop_scan().await { - Ok(_) => { - info!("Scan stopped via admin API"); - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Scan stopped successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to stop scan: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to stop scan: {}", e) - }).to_string(), - }) - } - } - } - - /// Get scan status - async fn get_scan_status(&self, _request: HttpRequest) -> Result { - let status = self.scan_engine.get_status().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: 
serde_json::json!({ - "status": "success", - "scan_status": status, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get scan configuration - async fn get_scan_config(&self, _request: HttpRequest) -> Result { - let config = self.scan_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "scan_config": config, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Update scan configuration - async fn update_scan_config(&self, request: HttpRequest) -> Result { - if let Some(body) = request.body { - match serde_json::from_str::(&body) { - Ok(config_json) => { - // In a real implementation, this would update the scan configuration - info!("Scan config updated via admin API: {:?}", config_json); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Scan configuration updated successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": format!("Invalid JSON: {}", e) - }).to_string(), - }) - } - } - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Request body required" - }).to_string(), - }) - } - } - - /// Start heal operation - async fn start_heal(&self, _request: HttpRequest) -> Result { - match self.heal_engine.start_healing().await { - Ok(_) => { - info!("Healing started via admin API"); - Ok(HttpResponse { - status_code: 200, - headers: 
vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Healing started successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to start healing: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to start healing: {}", e) - }).to_string(), - }) - } - } - } - - /// Stop heal operation - async fn stop_heal(&self, _request: HttpRequest) -> Result { - match self.heal_engine.stop_healing().await { - Ok(_) => { - info!("Healing stopped via admin API"); - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Healing stopped successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to stop healing: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to stop healing: {}", e) - }).to_string(), - }) - } - } - } - - /// Get heal status - async fn get_heal_status(&self, _request: HttpRequest) -> Result { - let status = self.heal_engine.get_status().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "heal_status": status, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get heal configuration - async fn get_heal_config(&self, _request: HttpRequest) -> Result { - let config = self.heal_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - 
headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "heal_config": config, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Update heal configuration - async fn update_heal_config(&self, request: HttpRequest) -> Result { - if let Some(body) = request.body { - match serde_json::from_str::(&body) { - Ok(config_json) => { - // In a real implementation, this would update the heal configuration - info!("Heal config updated via admin API: {:?}", config_json); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Heal configuration updated successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": format!("Invalid JSON: {}", e) - }).to_string(), - }) - } - } - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Request body required" - }).to_string(), - }) - } - } - - /// List policies - async fn list_policies(&self, _request: HttpRequest) -> Result { - let policies = self.policy_engine.list_policies().await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "policies": policies, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get policy - async fn get_policy(&self, request: HttpRequest) -> Result { - if let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") { - match 
self.policy_engine.get_policy(&policy_name.1).await { - Ok(policy) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "policy": policy, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Not Found", - "message": format!("Policy not found: {}", e) - }).to_string(), - }) - } - } - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Policy name parameter required" - }).to_string(), - }) - } - } - - /// Create policy - async fn create_policy(&self, request: HttpRequest) -> Result { - if let Some(body) = request.body { - match serde_json::from_str::(&body) { - Ok(policy_json) => { - // In a real implementation, this would create the policy - info!("Policy created via admin API: {:?}", policy_json); - - Ok(HttpResponse { - status_code: 201, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Policy created successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": format!("Invalid JSON: {}", e) - }).to_string(), - }) - } - } - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Request body required" - }).to_string(), - }) - } - } - - /// Update policy - 
async fn update_policy(&self, request: HttpRequest) -> Result { - if let Some(body) = request.body { - match serde_json::from_str::(&body) { - Ok(policy_json) => { - // In a real implementation, this would update the policy - info!("Policy updated via admin API: {:?}", policy_json); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Policy updated successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": format!("Invalid JSON: {}", e) - }).to_string(), - }) - } - } - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Request body required" - }).to_string(), - }) - } - } - - /// Delete policy - async fn delete_policy(&self, request: HttpRequest) -> Result { - if let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") { - // In a real implementation, this would delete the policy - info!("Policy deleted via admin API: {}", policy_name.1); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "Policy deleted successfully", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } else { - Ok(HttpResponse { - status_code: 400, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Bad Request", - "message": "Policy name parameter required" - }).to_string(), - }) - } - } - - /// Get system status - async fn get_system_status(&self, 
_request: HttpRequest) -> Result { - let scan_status = self.scan_engine.get_status().await; - let heal_status = self.heal_engine.get_status().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "system_status": { - "scan": scan_status, - "heal": heal_status, - "overall": "healthy" - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get system configuration - async fn get_system_config(&self, _request: HttpRequest) -> Result { - let scan_config = self.scan_engine.get_config().await; - let heal_config = self.heal_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "system_config": { - "scan": scan_config, - "heal": heal_config - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Restart system - async fn restart_system(&self, _request: HttpRequest) -> Result { - // In a real implementation, this would restart the system - info!("System restart requested via admin API"); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "System restart initiated", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Shutdown system - async fn shutdown_system(&self, _request: HttpRequest) -> Result { - // In a real implementation, this would shutdown the system - info!("System shutdown requested via admin API"); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "message": "System shutdown initiated", - "timestamp": chrono::Utc::now().to_rfc3339() - 
}).to_string(), - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - heal::HealEngineConfig, - policy::PolicyEngineConfig, - scanner::ScanEngineConfig, - }; - - #[tokio::test] - async fn test_admin_api_creation() { - let config = AdminApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - assert!(admin_api.config().enabled); - assert_eq!(admin_api.config().prefix, "/admin"); - } - - #[tokio::test] - async fn test_authentication() { - let config = AdminApiConfig { - admin_token: Some("test-token".to_string()), - ..Default::default() - }; - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - // Test with valid token in header - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/scan/status".to_string(), - headers: vec![("Authorization".to_string(), "Bearer test-token".to_string())], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - - // Test with valid token in query - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/scan/status".to_string(), - headers: vec![], - body: None, - query_params: vec![("token".to_string(), "test-token".to_string())], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - 
assert_eq!(response.status_code, 200); - - // Test with invalid token - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/scan/status".to_string(), - headers: vec![("Authorization".to_string(), "Bearer invalid-token".to_string())], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 401); - } - - #[tokio::test] - async fn test_scan_operations() { - let config = AdminApiConfig { - require_auth: false, // Disable auth for testing - ..Default::default() - }; - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - // Test start scan - let request = HttpRequest { - method: "POST".to_string(), - path: "/admin/scan/start".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - - // Test get scan status - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/scan/status".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - } - - #[tokio::test] - async fn test_heal_operations() { - let config = AdminApiConfig { - require_auth: false, // Disable auth for testing - ..Default::default() - }; - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = 
Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - // Test start heal - let request = HttpRequest { - method: "POST".to_string(), - path: "/admin/heal/start".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - - // Test get heal status - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/heal/status".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - } - - #[tokio::test] - async fn test_system_operations() { - let config = AdminApiConfig { - require_auth: false, // Disable auth for testing - ..Default::default() - }; - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - // Test get system status - let request = HttpRequest { - method: "GET".to_string(), - path: "/admin/system/status".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = admin_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("system_status")); - } -} \ No newline at end of file diff --git a/crates/ahm/src/api/metrics_api.rs b/crates/ahm/src/api/metrics_api.rs deleted file mode 100644 index 06f1efa5..00000000 --- a/crates/ahm/src/api/metrics_api.rs +++ /dev/null @@ -1,1180 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::sync::Arc; -use 
std::time::{SystemTime, Duration}; - -use tracing::{debug, error, info, warn}; - -use crate::{ - error::Result, - metrics::{Collector, Reporter, Storage, MetricsQuery, MetricType}, -}; - -use super::{HttpRequest, HttpResponse}; - -/// Configuration for the metrics API -#[derive(Debug, Clone)] -pub struct MetricsApiConfig { - /// Whether to enable metrics API - pub enabled: bool, - /// Metrics API prefix - pub prefix: String, - /// Authentication required - pub require_auth: bool, - /// Metrics token - pub metrics_token: Option, - /// Rate limiting for metrics endpoints - pub rate_limit_requests_per_minute: u32, - /// Maximum request body size - pub max_request_size: usize, - /// Enable metrics caching - pub enable_caching: bool, - /// Cache TTL in seconds - pub cache_ttl_seconds: u64, - /// Enable metrics compression - pub enable_compression: bool, - /// Default metrics format - pub default_format: MetricsFormat, -} - -impl Default for MetricsApiConfig { - fn default() -> Self { - Self { - enabled: true, - prefix: "/metrics".to_string(), - require_auth: false, - metrics_token: None, - rate_limit_requests_per_minute: 1000, - max_request_size: 1024 * 1024, // 1 MB - enable_caching: true, - cache_ttl_seconds: 300, // 5 minutes - enable_compression: true, - default_format: MetricsFormat::Json, - } - } -} - -/// Metrics format -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum MetricsFormat { - Json, - Prometheus, - Csv, - Xml, -} - -/// Backup report -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BackupReport { - pub timestamp: SystemTime, - pub backup_id: String, - pub status: BackupStatus, - pub objects_backed_up: u64, - pub total_size: u64, - pub duration: Duration, - pub errors: Vec, -} - -/// Restore report -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RestoreReport { - pub timestamp: SystemTime, - pub restore_id: String, - pub status: RestoreStatus, - pub objects_restored: u64, - pub total_size: u64, - pub duration: Duration, - 
pub errors: Vec, -} - -/// Data integrity report -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DataIntegrityReport { - pub timestamp: SystemTime, - pub validation_id: String, - pub status: ValidationStatus, - pub objects_validated: u64, - pub corrupted_objects: u64, - pub duration: Duration, - pub details: Vec, -} - -/// Backup status -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum BackupStatus { - Pending, - InProgress, - Completed, - Failed, -} - -/// Restore status -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum RestoreStatus { - Pending, - InProgress, - Completed, - Failed, -} - -/// Validation status -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum ValidationStatus { - Pending, - InProgress, - Completed, - Failed, -} - -/// Validation detail -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ValidationDetail { - pub object_path: String, - pub status: ValidationResult, - pub error_message: Option, -} - -/// Validation result -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum ValidationResult { - Valid, - Corrupted, - Missing, - AccessDenied, -} - -/// Metrics API that provides metrics data and operations -pub struct MetricsApi { - config: MetricsApiConfig, - collector: Arc, - reporter: Arc, - storage: Arc, -} - -impl MetricsApi { - /// Create a new metrics API - pub async fn new( - config: MetricsApiConfig, - collector: Arc, - reporter: Arc, - storage: Arc, - ) -> Result { - Ok(Self { - config, - collector, - reporter, - storage, - }) - } - - /// Get the configuration - pub fn config(&self) -> &MetricsApiConfig { - &self.config - } - - /// Handle HTTP request - pub async fn handle_request(&self, request: HttpRequest) -> Result { - // Check authentication if required - if self.config.require_auth { - if !self.authenticate_request(&request).await? 
{ - return Ok(HttpResponse { - status_code: 401, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Unauthorized", - "message": "Authentication required" - }).to_string(), - }) - } - } - - match request.path.as_str() { - // Current metrics - "/metrics/current" => self.get_current_metrics(request).await, - "/metrics/latest" => self.get_latest_metrics(request).await, - - // Historical metrics - "/metrics/history" => self.get_metrics_history(request).await, - "/metrics/range" => self.get_metrics_range(request).await, - - // Aggregated metrics - "/metrics/aggregated" => self.get_aggregated_metrics(request).await, - "/metrics/summary" => self.get_metrics_summary(request).await, - - // Specific metric types - "/metrics/system" => self.get_system_metrics(request).await, - "/metrics/scan" => self.get_scan_metrics(request).await, - "/metrics/heal" => self.get_heal_metrics(request).await, - "/metrics/policy" => self.get_policy_metrics(request).await, - "/metrics/network" => self.get_network_metrics(request).await, - "/metrics/disk" => self.get_disk_metrics(request).await, - - // Health issues - "/metrics/health-issues" => self.get_health_issues(request).await, - "/metrics/alerts" => self.get_alerts(request).await, - - // Reports - "/metrics/reports" => self.get_reports(request).await, - "/metrics/reports/comprehensive" => self.get_comprehensive_report(request).await, - - // Prometheus format - "/metrics/prometheus" => self.get_prometheus_metrics(request).await, - - // Storage operations - "/metrics/storage/backup" => self.backup_metrics(request).await, - "/metrics/storage/restore" => self.restore_metrics(request).await, - "/metrics/storage/validate" => self.validate_metrics(request).await, - - // Default 404 - _ => Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Not Found", - "message": "Metrics 
endpoint not found" - }).to_string(), - }), - } - } - - /// Authenticate request - async fn authenticate_request(&self, request: &HttpRequest) -> Result { - if let Some(token) = &self.config.metrics_token { - // Check for Authorization header - if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { - if auth_header.1 == format!("Bearer {}", token) { - return Ok(true); - } - } - - // Check for token in query parameters - if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { - if token_param.1 == *token { - return Ok(true); - } - } - } - - Ok(false) - } - - /// Get current metrics - async fn get_current_metrics(&self, _request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let format = self.get_request_format(&_request); - let body = self.format_metrics(&metrics, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to collect current metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to collect metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get latest metrics - async fn get_latest_metrics(&self, _request: HttpRequest) -> Result { - match self.collector.get_latest_metrics().await { - Ok(Some(metrics)) => { - let format = self.get_request_format(&_request); - let body = self.format_metrics(&metrics, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Ok(None) => { - Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - 
"error": "Not Found", - "message": "No metrics available" - }).to_string(), - }) - } - Err(e) => { - error!("Failed to get latest metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get latest metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get metrics history - async fn get_metrics_history(&self, request: HttpRequest) -> Result { - let hours = request.query_params - .iter() - .find(|(k, _)| k == "hours") - .and_then(|(_, v)| v.parse::().ok()) - .unwrap_or(24); - - let end_time = std::time::SystemTime::now(); - let start_time = end_time - std::time::Duration::from_secs(hours * 3600); - - let query = MetricsQuery { - start_time, - end_time, - interval: std::time::Duration::from_secs(300), // 5 minutes - metrics: vec![], - severity_filter: None, - limit: None, - }; - - match self.collector.query_metrics(query).await { - Ok(aggregated) => { - let format = self.get_request_format(&request); - let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get metrics history: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get metrics history: {}", e) - }).to_string(), - }) - } - } - } - - /// Get metrics range - async fn get_metrics_range(&self, request: HttpRequest) -> Result { - let start_time = request.query_params - .iter() - .find(|(k, _)| k == "start") - .and_then(|(_, v)| v.parse::().ok()) - .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) - .unwrap_or_else(|| 
std::time::SystemTime::now() - std::time::Duration::from_secs(3600)); - - let end_time = request.query_params - .iter() - .find(|(k, _)| k == "end") - .and_then(|(_, v)| v.parse::().ok()) - .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) - .unwrap_or_else(std::time::SystemTime::now); - - let query = MetricsQuery { - start_time, - end_time, - interval: std::time::Duration::from_secs(300), // 5 minutes - metrics: vec![], - severity_filter: None, - limit: None, - }; - - match self.collector.query_metrics(query).await { - Ok(aggregated) => { - let format = self.get_request_format(&request); - let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get metrics range: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get metrics range: {}", e) - }).to_string(), - }) - } - } - } - - /// Get aggregated metrics - async fn get_aggregated_metrics(&self, request: HttpRequest) -> Result { - let query = self.parse_metrics_query(&request)?; - - match self.collector.query_metrics(query).await { - Ok(aggregated) => { - let format = self.get_request_format(&request); - let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get aggregated metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get aggregated metrics: {}", e) 
- }).to_string(), - }) - } - } - } - - /// Get metrics summary - async fn get_metrics_summary(&self, request: HttpRequest) -> Result { - let hours = request.query_params - .iter() - .find(|(k, _)| k == "hours") - .and_then(|(_, v)| v.parse::().ok()) - .unwrap_or(24); - - let end_time = std::time::SystemTime::now(); - let start_time = end_time - std::time::Duration::from_secs(hours * 3600); - - let query = MetricsQuery { - start_time, - end_time, - interval: std::time::Duration::from_secs(3600), // 1 hour - metrics: vec![], - severity_filter: None, - limit: None, - }; - - match self.collector.query_metrics(query).await { - Ok(aggregated) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "summary": aggregated.summary, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to get metrics summary: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get metrics summary: {}", e) - }).to_string(), - }) - } - } - } - - /// Get system metrics - async fn get_system_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let system_data = serde_json::json!({ - "cpu_usage": metrics.cpu_usage, - "memory_usage": metrics.memory_usage, - "disk_usage": metrics.disk_usage, - "system_load": metrics.system_load, - "active_operations": metrics.active_operations, - "network_io": metrics.network_io, - "disk_io": metrics.disk_io, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&system_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, 
- }) - } - Err(e) => { - error!("Failed to get system metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get system metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get scan metrics - async fn get_scan_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let scan_data = serde_json::json!({ - "scan_metrics": metrics.scan_metrics, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&scan_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get scan metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get scan metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get heal metrics - async fn get_heal_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let heal_data = serde_json::json!({ - "heal_metrics": metrics.heal_metrics, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&heal_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get heal metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get heal metrics: {}", 
e) - }).to_string(), - }) - } - } - } - - /// Get policy metrics - async fn get_policy_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let policy_data = serde_json::json!({ - "policy_metrics": metrics.policy_metrics, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&policy_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get policy metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get policy metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get network metrics - async fn get_network_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let network_data = serde_json::json!({ - "network_io": metrics.network_io, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&network_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get network metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get network metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get disk metrics - async fn get_disk_metrics(&self, request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let disk_data = serde_json::json!({ - "disk_io": metrics.disk_io, - "disk_usage": 
metrics.disk_usage, - }); - - let format = self.get_request_format(&request); - let body = self.format_json_data(&disk_data, format.clone()).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), self.get_content_type(format))], - body, - }) - } - Err(e) => { - error!("Failed to get disk metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get disk metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Get health issues - async fn get_health_issues(&self, _request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "health_issues": metrics.health_issues, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to get health issues: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get health issues: {}", e) - }).to_string(), - }) - } - } - } - - /// Get alerts - async fn get_alerts(&self, request: HttpRequest) -> Result { - let hours = request.query_params - .iter() - .find(|(k, _)| k == "hours") - .and_then(|(_, v)| v.parse::().ok()) - .unwrap_or(24); - - match self.reporter.get_recent_alerts(hours).await { - Ok(alerts) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "alerts": alerts, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed 
to get alerts: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get alerts: {}", e) - }).to_string(), - }) - } - } - } - - /// Get reports - async fn get_reports(&self, _request: HttpRequest) -> Result { - let stats = self.reporter.get_statistics().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "reports": stats, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get comprehensive report - async fn get_comprehensive_report(&self, request: HttpRequest) -> Result { - let query = self.parse_metrics_query(&request)?; - - match self.collector.query_metrics(query).await { - Ok(aggregated) => { - match self.reporter.generate_comprehensive_report(&aggregated).await { - Ok(report) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "report": report, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to generate comprehensive report: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to generate report: {}", e) - }).to_string(), - }) - } - } - } - Err(e) => { - error!("Failed to get metrics for comprehensive report: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to get metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// 
Get Prometheus metrics - async fn get_prometheus_metrics(&self, _request: HttpRequest) -> Result { - match self.collector.collect_metrics().await { - Ok(metrics) => { - let prometheus_data = self.format_prometheus_metrics(&metrics).await?; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "text/plain; version=0.0.4; charset=utf-8".to_string())], - body: prometheus_data, - }) - } - Err(e) => { - error!("Failed to get Prometheus metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "text/plain".to_string())], - body: format!("# ERROR: Failed to get metrics: {}", e), - }) - } - } - } - - /// Backup metrics - async fn backup_metrics(&self, request: HttpRequest) -> Result { - let backup_path = request.query_params - .iter() - .find(|(k, _)| k == "path") - .map(|(_, v)| std::path::PathBuf::from(v)) - .unwrap_or_else(|| std::path::PathBuf::from("/tmp/rustfs/metrics-backup")); - - match self.storage.backup_data(&backup_path).await { - Ok(report) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "backup_report": report, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to backup metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to backup metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Restore metrics - async fn restore_metrics(&self, request: HttpRequest) -> Result { - let backup_path = request.query_params - .iter() - .find(|(k, _)| k == "path") - .map(|(_, v)| std::path::PathBuf::from(v)) - .unwrap_or_else(|| std::path::PathBuf::from("/tmp/rustfs/metrics-backup")); - - match self.storage.restore_data(&backup_path).await 
{ - Ok(report) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "restore_report": report, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to restore metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to restore metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Validate metrics - async fn validate_metrics(&self, _request: HttpRequest) -> Result { - match self.storage.validate_data_integrity().await { - Ok(report) => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "validation_report": report, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - Err(e) => { - error!("Failed to validate metrics: {}", e); - Ok(HttpResponse { - status_code: 500, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Internal Server Error", - "message": format!("Failed to validate metrics: {}", e) - }).to_string(), - }) - } - } - } - - /// Helper methods - fn get_request_format(&self, request: &HttpRequest) -> MetricsFormat { - request.query_params - .iter() - .find(|(k, _)| k == "format") - .and_then(|(_, v)| match v.as_str() { - "prometheus" => Some(MetricsFormat::Prometheus), - "csv" => Some(MetricsFormat::Csv), - "xml" => Some(MetricsFormat::Xml), - _ => Some(MetricsFormat::Json), - }) - .unwrap_or(self.config.default_format.clone()) - } - - fn get_content_type(&self, format: MetricsFormat) -> String { - match format { - MetricsFormat::Json => "application/json".to_string(), - MetricsFormat::Prometheus => 
"text/plain; version=0.0.4; charset=utf-8".to_string(), - MetricsFormat::Csv => "text/csv".to_string(), - MetricsFormat::Xml => "application/xml".to_string(), - } - } - - async fn format_metrics(&self, metrics: &crate::metrics::SystemMetrics, format: MetricsFormat) -> Result { - match format { - MetricsFormat::Json => Ok(serde_json::json!({ - "status": "success", - "metrics": metrics, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string()), - MetricsFormat::Prometheus => self.format_prometheus_metrics(metrics).await, - MetricsFormat::Csv => self.format_csv_metrics(metrics).await, - MetricsFormat::Xml => self.format_xml_metrics(metrics).await, - } - } - - async fn format_aggregated_metrics(&self, aggregated: &crate::metrics::AggregatedMetrics, format: MetricsFormat) -> Result { - match format { - MetricsFormat::Json => Ok(serde_json::json!({ - "status": "success", - "aggregated_metrics": aggregated, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string()), - MetricsFormat::Prometheus => self.format_prometheus_aggregated(aggregated).await, - MetricsFormat::Csv => self.format_csv_aggregated(aggregated).await, - MetricsFormat::Xml => self.format_xml_aggregated(aggregated).await, - } - } - - async fn format_json_data(&self, data: &serde_json::Value, format: MetricsFormat) -> Result { - match format { - MetricsFormat::Json => Ok(serde_json::json!({ - "status": "success", - "data": data, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string()), - _ => Ok(serde_json::json!(data).to_string()), - } - } - - async fn format_prometheus_metrics(&self, metrics: &crate::metrics::SystemMetrics) -> Result { - let mut prometheus_lines = Vec::new(); - - // System metrics - prometheus_lines.push(format!("rustfs_cpu_usage_percent {}", metrics.cpu_usage)); - prometheus_lines.push(format!("rustfs_memory_usage_percent {}", metrics.memory_usage)); - prometheus_lines.push(format!("rustfs_disk_usage_percent {}", metrics.disk_usage)); - 
prometheus_lines.push(format!("rustfs_system_load {}", metrics.system_load)); - prometheus_lines.push(format!("rustfs_active_operations {}", metrics.active_operations)); - - // Network metrics - prometheus_lines.push(format!("rustfs_network_bytes_received_per_sec {}", metrics.network_io.bytes_received_per_sec)); - prometheus_lines.push(format!("rustfs_network_bytes_sent_per_sec {}", metrics.network_io.bytes_sent_per_sec)); - - // Disk metrics - prometheus_lines.push(format!("rustfs_disk_bytes_read_per_sec {}", metrics.disk_io.bytes_read_per_sec)); - prometheus_lines.push(format!("rustfs_disk_bytes_written_per_sec {}", metrics.disk_io.bytes_written_per_sec)); - - // Scan metrics - prometheus_lines.push(format!("rustfs_scan_objects_scanned_total {}", metrics.scan_metrics.objects_scanned)); - prometheus_lines.push(format!("rustfs_scan_bytes_scanned_total {}", metrics.scan_metrics.bytes_scanned)); - - // Heal metrics - prometheus_lines.push(format!("rustfs_heal_total_repairs {}", metrics.heal_metrics.total_repairs)); - prometheus_lines.push(format!("rustfs_heal_successful_repairs {}", metrics.heal_metrics.successful_repairs)); - prometheus_lines.push(format!("rustfs_heal_failed_repairs {}", metrics.heal_metrics.failed_repairs)); - - Ok(prometheus_lines.join("\n")) - } - - async fn format_prometheus_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { - // In a real implementation, this would format aggregated metrics for Prometheus - Ok("# Aggregated metrics not yet implemented for Prometheus format".to_string()) - } - - async fn format_csv_metrics(&self, _metrics: &crate::metrics::SystemMetrics) -> Result { - // In a real implementation, this would format metrics as CSV - Ok("timestamp,cpu_usage,memory_usage,disk_usage\n".to_string()) - } - - async fn format_csv_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { - // In a real implementation, this would format aggregated metrics as CSV - 
Ok("timestamp,avg_cpu_usage,avg_memory_usage,avg_disk_usage\n".to_string()) - } - - async fn format_xml_metrics(&self, _metrics: &crate::metrics::SystemMetrics) -> Result { - // In a real implementation, this would format metrics as XML - Ok("success".to_string()) - } - - async fn format_xml_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { - // In a real implementation, this would format aggregated metrics as XML - Ok("success".to_string()) - } - - fn parse_metrics_query(&self, request: &HttpRequest) -> Result { - let start_time = request.query_params - .iter() - .find(|(k, _)| k == "start") - .and_then(|(_, v)| v.parse::().ok()) - .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) - .unwrap_or_else(|| std::time::SystemTime::now() - std::time::Duration::from_secs(3600)); - - let end_time = request.query_params - .iter() - .find(|(k, _)| k == "end") - .and_then(|(_, v)| v.parse::().ok()) - .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) - .unwrap_or_else(std::time::SystemTime::now); - - let interval = request.query_params - .iter() - .find(|(k, _)| k == "interval") - .and_then(|(_, v)| v.parse::().ok()) - .map(|secs| std::time::Duration::from_secs(secs)) - .unwrap_or(std::time::Duration::from_secs(300)); - - let metrics = request.query_params - .iter() - .filter(|(k, _)| k == "metric") - .map(|(_, v)| match v.as_str() { - "system" => MetricType::System, - "network" => MetricType::Network, - "disk" => MetricType::DiskIo, - "scan" => MetricType::Scan, - "heal" => MetricType::Heal, - "policy" => MetricType::Policy, - "health" => MetricType::HealthIssues, - _ => MetricType::System, - }) - .collect(); - - let limit = request.query_params - .iter() - .find(|(k, _)| k == "limit") - .and_then(|(_, v)| v.parse::().ok()); - - Ok(MetricsQuery { - start_time, - end_time, - interval, - metrics, - severity_filter: None, - limit, - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - 
use crate::metrics::{CollectorConfig, ReporterConfig, StorageConfig}; - - #[tokio::test] - async fn test_metrics_api_creation() { - let config = MetricsApiConfig::default(); - let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); - - assert!(metrics_api.config().enabled); - assert_eq!(metrics_api.config().prefix, "/metrics"); - } - - #[tokio::test] - async fn test_current_metrics() { - let config = MetricsApiConfig::default(); - let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/metrics/current".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = metrics_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("status")); - } - - #[tokio::test] - async fn test_prometheus_metrics() { - let config = MetricsApiConfig::default(); - let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/metrics/prometheus".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; 
- - let response = metrics_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("rustfs_cpu_usage_percent")); - } - - #[tokio::test] - async fn test_system_metrics() { - let config = MetricsApiConfig::default(); - let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/metrics/system".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = metrics_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("cpu_usage")); - } -} \ No newline at end of file diff --git a/crates/ahm/src/api/mod.rs b/crates/ahm/src/api/mod.rs deleted file mode 100644 index 913fa455..00000000 --- a/crates/ahm/src/api/mod.rs +++ /dev/null @@ -1,504 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! API interfaces for the AHM system -//! -//! Provides REST and gRPC endpoints for: -//! - Administrative operations -//! - Metrics and monitoring -//! 
- System status and control - -pub mod admin_api; -pub mod metrics_api; -pub mod status_api; - -pub use admin_api::{AdminApi, AdminApiConfig}; -pub use metrics_api::{MetricsApi, MetricsApiConfig}; -pub use status_api::{StatusApi, StatusApiConfig}; - -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; - -use crate::{ - error::Result, - heal::HealEngine, - metrics::{Collector, Reporter, Storage}, - policy::{ScanPolicyEngine as PolicyEngine}, - scanner::{Engine as ScanEngine}, -}; - -/// Configuration for the API server -#[derive(Debug, Clone)] -pub struct ApiConfig { - /// Admin API configuration - pub admin: AdminApiConfig, - /// Metrics API configuration - pub metrics: MetricsApiConfig, - /// Status API configuration - pub status: StatusApiConfig, - /// Server address - pub address: String, - /// Server port - pub port: u16, - /// Enable HTTPS - pub enable_https: bool, - /// SSL certificate path - pub ssl_cert_path: Option, - /// SSL key path - pub ssl_key_path: Option, - /// Request timeout - pub request_timeout: std::time::Duration, - /// Maximum request size - pub max_request_size: usize, - /// Enable CORS - pub enable_cors: bool, - /// CORS origins - pub cors_origins: Vec, - /// Enable rate limiting - pub enable_rate_limiting: bool, - /// Rate limit requests per minute - pub rate_limit_requests_per_minute: u32, -} - -impl Default for ApiConfig { - fn default() -> Self { - Self { - admin: AdminApiConfig::default(), - metrics: MetricsApiConfig::default(), - status: StatusApiConfig::default(), - address: "127.0.0.1".to_string(), - port: 8080, - enable_https: false, - ssl_cert_path: None, - ssl_key_path: None, - request_timeout: std::time::Duration::from_secs(30), - max_request_size: 1024 * 1024, // 1 MB - enable_cors: true, - cors_origins: vec!["*".to_string()], - enable_rate_limiting: true, - rate_limit_requests_per_minute: 1000, - } - } -} - -/// API server that provides HTTP endpoints for AHM functionality -pub struct 
ApiServer { - config: ApiConfig, - admin_api: Arc, - metrics_api: Arc, - status_api: Arc, - scan_engine: Arc, - heal_engine: Arc, - policy_engine: Arc, - metrics_collector: Arc, - metrics_reporter: Arc, - metrics_storage: Arc, -} - -impl ApiServer { - /// Create a new API server - pub async fn new( - config: ApiConfig, - scan_engine: Arc, - heal_engine: Arc, - policy_engine: Arc, - metrics_collector: Arc, - metrics_reporter: Arc, - metrics_storage: Arc, - ) -> Result { - let admin_api = Arc::new(AdminApi::new(config.admin.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?); - let metrics_api = Arc::new(MetricsApi::new(config.metrics.clone(), metrics_collector.clone(), metrics_reporter.clone(), metrics_storage.clone()).await?); - let status_api = Arc::new(StatusApi::new(config.status.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?); - - Ok(Self { - config, - admin_api, - metrics_api, - status_api, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - }) - } - - /// Get the configuration - pub fn config(&self) -> &ApiConfig { - &self.config - } - - /// Start the API server - pub async fn start(&self) -> Result<()> { - // In a real implementation, this would start an HTTP server - // For now, we'll just simulate the server startup - tracing::info!("API server starting on {}:{}", self.config.address, self.config.port); - - if self.config.enable_https { - tracing::info!("HTTPS enabled"); - } - - if self.config.enable_cors { - tracing::info!("CORS enabled with origins: {:?}", self.config.cors_origins); - } - - if self.config.enable_rate_limiting { - tracing::info!("Rate limiting enabled: {} requests/minute", self.config.rate_limit_requests_per_minute); - } - - tracing::info!("API server started successfully"); - Ok(()) - } - - /// Stop the API server - pub async fn stop(&self) -> Result<()> { - tracing::info!("API server stopping"); - tracing::info!("API 
server stopped successfully"); - Ok(()) - } - - /// Get server status - pub async fn status(&self) -> ServerStatus { - ServerStatus { - address: self.config.address.clone(), - port: self.config.port, - https_enabled: self.config.enable_https, - cors_enabled: self.config.enable_cors, - rate_limiting_enabled: self.config.enable_rate_limiting, - admin_api_enabled: true, - metrics_api_enabled: true, - status_api_enabled: true, - } - } - - /// Get admin API - pub fn admin_api(&self) -> &Arc { - &self.admin_api - } - - /// Get metrics API - pub fn metrics_api(&self) -> &Arc { - &self.metrics_api - } - - /// Get status API - pub fn status_api(&self) -> &Arc { - &self.status_api - } - - /// Handle HTTP request - pub async fn handle_request(&self, request: HttpRequest) -> Result { - match request.path.as_str() { - // Admin API routes - path if path.starts_with("/admin") => { - self.admin_api.handle_request(request).await - } - // Metrics API routes - path if path.starts_with("/metrics") => { - self.metrics_api.handle_request(request).await - } - // Status API routes - path if path.starts_with("/status") => { - self.status_api.handle_request(request).await - } - // Health check - "/health" => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "healthy", - "timestamp": chrono::Utc::now().to_rfc3339(), - "version": env!("CARGO_PKG_VERSION") - }).to_string(), - }) - } - // Root endpoint - "/" => { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "service": "RustFS AHM API", - "version": env!("CARGO_PKG_VERSION"), - "endpoints": { - "admin": "/admin", - "metrics": "/metrics", - "status": "/status", - "health": "/health" - } - }).to_string(), - }) - } - // 404 for unknown routes - _ => { - Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), 
"application/json".to_string())], - body: serde_json::json!({ - "error": "Not Found", - "message": "The requested endpoint does not exist" - }).to_string(), - }) - } - } - } -} - -/// HTTP request -#[derive(Debug, Clone)] -pub struct HttpRequest { - pub method: String, - pub path: String, - pub headers: Vec<(String, String)>, - pub body: Option, - pub query_params: Vec<(String, String)>, -} - -/// HTTP response -#[derive(Debug, Clone)] -pub struct HttpResponse { - pub status_code: u16, - pub headers: Vec<(String, String)>, - pub body: String, -} - -/// Server status -#[derive(Debug, Clone)] -pub struct ServerStatus { - pub address: String, - pub port: u16, - pub https_enabled: bool, - pub cors_enabled: bool, - pub rate_limiting_enabled: bool, - pub admin_api_enabled: bool, - pub metrics_api_enabled: bool, - pub status_api_enabled: bool, -} - -/// API endpoint information -#[derive(Debug, Clone)] -pub struct EndpointInfo { - pub path: String, - pub method: String, - pub description: String, - pub parameters: Vec, - pub response_type: String, -} - -/// Parameter information -#[derive(Debug, Clone)] -pub struct ParameterInfo { - pub name: String, - pub parameter_type: String, - pub required: bool, - pub description: String, -} - -/// API documentation -#[derive(Debug, Clone)] -pub struct ApiDocumentation { - pub title: String, - pub version: String, - pub description: String, - pub endpoints: Vec, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - heal::HealEngineConfig, - metrics::{CollectorConfig, ReporterConfig, StorageConfig}, - policy::PolicyEngineConfig, - scanner::ScanEngineConfig, - }; - - #[tokio::test] - async fn test_api_server_creation() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = 
Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - assert_eq!(server.config().port, 8080); - assert_eq!(server.config().address, "127.0.0.1"); - } - - #[tokio::test] - async fn test_api_server_start_stop() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - server.start().await.unwrap(); - server.stop().await.unwrap(); - } - - #[tokio::test] - async fn test_api_server_status() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let 
metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - let status = server.status().await; - assert_eq!(status.port, 8080); - assert_eq!(status.address, "127.0.0.1"); - assert!(status.admin_api_enabled); - assert!(status.metrics_api_enabled); - assert!(status.status_api_enabled); - } - - #[tokio::test] - async fn test_health_endpoint() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/health".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = server.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("healthy")); - } - - #[tokio::test] - async fn test_root_endpoint() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine 
= Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = server.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("RustFS AHM API")); - } - - #[tokio::test] - async fn test_404_endpoint() { - let config = ApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); - let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); - let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); - - let server = ApiServer::new( - config, - scan_engine, - heal_engine, - policy_engine, - metrics_collector, - metrics_reporter, - metrics_storage, - ).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/unknown".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = server.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 404); - assert!(response.body.contains("Not Found")); - } -} \ No newline at end of file diff 
--git a/crates/ahm/src/api/status_api.rs b/crates/ahm/src/api/status_api.rs deleted file mode 100644 index ca47b121..00000000 --- a/crates/ahm/src/api/status_api.rs +++ /dev/null @@ -1,761 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::sync::Arc; - -use tracing::{debug, error, info, warn}; - -use crate::{ - error::Result, - heal::HealEngine, - policy::{ScanPolicyEngine as PolicyEngine}, - scanner::{Engine as ScanEngine}, -}; - -use super::{HttpRequest, HttpResponse}; - -use serde::{Deserialize, Serialize}; - -/// Configuration for the status API -#[derive(Debug, Clone)] -pub struct StatusApiConfig { - /// Whether to enable status API - pub enabled: bool, - /// Status API prefix - pub prefix: String, - /// Authentication required - pub require_auth: bool, - /// Status token - pub status_token: Option, - /// Rate limiting for status endpoints - pub rate_limit_requests_per_minute: u32, - /// Maximum request body size - pub max_request_size: usize, - /// Enable detailed status information - pub enable_detailed_status: bool, - /// Status cache TTL in seconds - pub status_cache_ttl_seconds: u64, - /// Enable health checks - pub enable_health_checks: bool, - /// Health check timeout - pub health_check_timeout: std::time::Duration, -} - -impl Default for StatusApiConfig { - fn default() -> Self { - Self { - enabled: true, - prefix: "/status".to_string(), - require_auth: false, - status_token: None, - rate_limit_requests_per_minute: 1000, - max_request_size: 1024 * 1024, // 1 MB - enable_detailed_status: true, - status_cache_ttl_seconds: 30, // 30 seconds - enable_health_checks: true, - health_check_timeout: std::time::Duration::from_secs(5), - } - } -} - -/// Status API that provides system status and health information -pub struct StatusApi { - config: StatusApiConfig, - scan_engine: Arc, - heal_engine: Arc, - policy_engine: Arc, -} - -impl StatusApi { - /// Create a new status API - pub async fn new( - config: StatusApiConfig, - scan_engine: Arc, - heal_engine: Arc, 
- policy_engine: Arc, - ) -> Result { - Ok(Self { - config, - scan_engine, - heal_engine, - policy_engine, - }) - } - - /// Get the configuration - pub fn config(&self) -> &StatusApiConfig { - &self.config - } - - /// Handle HTTP request - pub async fn handle_request(&self, request: HttpRequest) -> Result { - // Check authentication if required - if self.config.require_auth { - if !self.authenticate_request(&request).await? { - return Ok(HttpResponse { - status_code: 401, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Unauthorized", - "message": "Authentication required" - }).to_string(), - }) - } - } - - match request.path.as_str() { - // Basic status - "/status" => self.get_status(request).await, - "/status/health" => self.get_health_status(request).await, - "/status/overview" => self.get_overview_status(request).await, - - // Component status - "/status/scan" => self.get_scan_status(request).await, - "/status/heal" => self.get_heal_status(request).await, - "/status/policy" => self.get_policy_status(request).await, - - // Detailed status - "/status/detailed" => self.get_detailed_status(request).await, - "/status/components" => self.get_components_status(request).await, - "/status/resources" => self.get_resources_status(request).await, - - // Health checks - "/status/health/check" => self.perform_health_check(request).await, - "/status/health/readiness" => self.get_readiness_status(request).await, - "/status/health/liveness" => self.get_liveness_status(request).await, - - // System information - "/status/info" => self.get_system_info(request).await, - "/status/version" => self.get_version_info(request).await, - "/status/uptime" => self.get_uptime_info(request).await, - - // Default 404 - _ => Ok(HttpResponse { - status_code: 404, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Not Found", - "message": "Status endpoint not 
found" - }).to_string(), - }), - } - } - - /// Authenticate request - async fn authenticate_request(&self, request: &HttpRequest) -> Result { - if let Some(token) = &self.config.status_token { - // Check for Authorization header - if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { - if auth_header.1 == format!("Bearer {}", token) { - return Ok(true); - } - } - - // Check for token in query parameters - if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { - if token_param.1 == *token { - return Ok(true); - } - } - } - - Ok(false) - } - - /// Get basic status - async fn get_status(&self, _request: HttpRequest) -> Result { - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - - let overall_status = if scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running { - "healthy" - } else if scan_status == crate::scanner::Status::Stopped && heal_status == crate::heal::Status::Stopped { - "stopped" - } else { - "degraded" - }; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "overall_status": overall_status, - "components": { - "scan": scan_status, - "heal": heal_status - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get health status - async fn get_health_status(&self, _request: HttpRequest) -> Result { - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - - let is_healthy = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running; - let status_code = if is_healthy { 200 } else { 503 }; - - Ok(HttpResponse { - status_code, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": if 
is_healthy { "healthy" } else { "unhealthy" }, - "components": { - "scan": { - "status": scan_status, - "healthy": scan_status == crate::scanner::Status::Running - }, - "heal": { - "status": heal_status, - "healthy": heal_status == crate::heal::Status::Running - } - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get overview status - async fn get_overview_status(&self, _request: HttpRequest) -> Result { - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - - let scan_config = self.scan_engine.get_config().await; - let heal_config = self.heal_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "overview": { - "scan": { - "status": scan_status, - "enabled": scan_config.enabled, - "scan_interval": scan_config.scan_interval.as_secs() - }, - "heal": { - "status": heal_status, - "enabled": heal_config.auto_heal_enabled, - "max_workers": heal_config.max_workers - } - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get scan status - async fn get_scan_status(&self, _request: HttpRequest) -> Result { - let status = self.scan_engine.status().await; - let config = self.scan_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "scan": { - "status": status, - "enabled": config.enabled, - "scan_interval": config.scan_interval.as_secs(), - "max_concurrent_scans": config.max_concurrent_scans, - "scan_paths": config.scan_paths, - "bandwidth_limit": config.bandwidth_limit - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get heal status - async fn get_heal_status(&self, _request: HttpRequest) -> Result { - let status = 
self.heal_engine.get_status().await; - let config = self.heal_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "heal": { - "status": status, - "enabled": config.auto_heal_enabled, - "max_workers": config.max_workers, - "repair_timeout": config.repair_timeout.as_secs(), - "retry_attempts": config.max_retry_attempts, - "priority_queue_size": config.max_queue_size - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get policy status - async fn get_policy_status(&self, _request: HttpRequest) -> Result { - let policies = self.policy_engine.list_policies().await?; - let config = self.policy_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "policy": { - "enabled": config.enabled, - "total_policies": policies.len(), - "policies": policies, - "evaluation_timeout": config.evaluation_timeout.as_secs(), - "cache_enabled": config.cache_enabled - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get detailed status - async fn get_detailed_status(&self, _request: HttpRequest) -> Result { - if !self.config.enable_detailed_status { - return Ok(HttpResponse { - status_code: 403, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "error": "Forbidden", - "message": "Detailed status is disabled" - }).to_string(), - }); - } - - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - let scan_config = self.scan_engine.get_config().await; - let heal_config = self.heal_engine.get_config().await; - let policy_config = self.policy_engine.get_config().await; - - Ok(HttpResponse { - status_code: 200, - headers: 
vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "detailed_status": { - "scan": { - "status": scan_status, - "config": scan_config - }, - "heal": { - "status": heal_status, - "config": heal_config - }, - "policy": { - "config": policy_config - } - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get components status - async fn get_components_status(&self, _request: HttpRequest) -> Result { - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - - let components = vec![ - serde_json::json!({ - "name": "scan_engine", - "status": scan_status, - "healthy": scan_status == crate::scanner::Status::Running, - "type": "scanner" - }), - serde_json::json!({ - "name": "heal_engine", - "status": heal_status, - "healthy": heal_status == crate::heal::Status::Running, - "type": "healer" - }), - serde_json::json!({ - "name": "policy_engine", - "status": "running", - "healthy": true, - "type": "policy" - }) - ]; - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "components": components, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get resources status - async fn get_resources_status(&self, _request: HttpRequest) -> Result { - // In a real implementation, this would collect actual resource usage - // For now, we'll return simulated data - let resources = serde_json::json!({ - "cpu": { - "usage_percent": 25.5, - "cores": 8, - "load_average": 0.75 - }, - "memory": { - "usage_percent": 60.2, - "total_bytes": 8589934592, // 8 GB - "available_bytes": 3422552064 // ~3.2 GB - }, - "disk": { - "usage_percent": 45.8, - "total_bytes": 107374182400, // 100 GB - "available_bytes": 58133032960 // ~54 GB - }, - "network": { - "bytes_received_per_sec": 1048576, // 1 MB/s - 
"bytes_sent_per_sec": 524288 // 512 KB/s - } - }); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "resources": resources, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Perform health checks - async fn perform_health_checks(&self) -> Result> { - let mut checks = Vec::new(); - let start_time = std::time::Instant::now(); - - // Check scan engine - let scan_start = std::time::Instant::now(); - let scan_status = self.scan_engine.status().await; - let scan_duration = scan_start.elapsed(); - checks.push(HealthCheckResult { - name: "scan_engine".to_string(), - healthy: scan_status == crate::scanner::Status::Running, - message: format!("Scan engine status: {:?}", scan_status), - duration_ms: scan_duration.as_millis() as u64, - }); - - // Check heal engine - let heal_start = std::time::Instant::now(); - let heal_status = self.heal_engine.get_status().await; - let heal_duration = heal_start.elapsed(); - checks.push(HealthCheckResult { - name: "heal_engine".to_string(), - healthy: heal_status == crate::heal::Status::Running, - message: format!("Heal engine status: {:?}", heal_status), - duration_ms: heal_duration.as_millis() as u64, - }); - - // Check policy engine - let policy_start = std::time::Instant::now(); - let policy_result = self.policy_engine.list_policies().await; - let policy_duration = policy_start.elapsed(); - checks.push(HealthCheckResult { - name: "policy_engine".to_string(), - healthy: policy_result.is_ok(), - message: if policy_result.is_ok() { - "Policy engine is responding".to_string() - } else { - format!("Policy engine error: {:?}", policy_result.unwrap_err()) - }, - duration_ms: policy_duration.as_millis() as u64, - }); - - let total_duration = start_time.elapsed(); - info!("Health checks completed in {:?}", total_duration); - - Ok(checks) - } - - /// Perform health check (alias for 
perform_health_checks) - async fn perform_health_check(&self, _request: HttpRequest) -> Result { - let checks = self.perform_health_checks().await?; - let all_healthy = checks.iter().all(|check| check.healthy); - let check_time = std::time::Instant::now().elapsed(); - - Ok(HttpResponse { - status_code: if all_healthy { 200 } else { 503 }, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": if all_healthy { "healthy" } else { "unhealthy" }, - "checks": checks, - "check_time_ms": check_time.as_millis(), - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get readiness status - async fn get_readiness_status(&self, _request: HttpRequest) -> Result { - let scan_status = self.scan_engine.status().await; - let heal_status = self.heal_engine.get_status().await; - - let is_ready = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running; - let status_code = if is_ready { 200 } else { 503 }; - - Ok(HttpResponse { - status_code, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": if is_ready { "ready" } else { "not_ready" }, - "components": { - "scan_engine": scan_status == crate::scanner::Status::Running, - "heal_engine": heal_status == crate::heal::Status::Running - }, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get liveness status - async fn get_liveness_status(&self, _request: HttpRequest) -> Result { - // Liveness check is simple - if we can respond, we're alive - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "alive", - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get system information - async fn get_system_info(&self, _request: HttpRequest) -> Result { - let system_info = serde_json::json!({ 
- "service": "RustFS AHM", - "version": env!("CARGO_PKG_VERSION"), - "system_info": { - "rust_version": option_env!("RUST_VERSION").unwrap_or("unknown"), - "target_arch": option_env!("TARGET_ARCH").unwrap_or("unknown"), - "target_os": option_env!("TARGET_OS").unwrap_or("unknown"), - "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"), - "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"), - "git_branch": option_env!("VERGEN_GIT_BRANCH").unwrap_or("unknown"), - }, - }); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "system_info": system_info, - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get version information - async fn get_version_info(&self, _request: HttpRequest) -> Result { - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "version": env!("CARGO_PKG_VERSION"), - "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"), - "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"), - "timestamp": chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } - - /// Get uptime information - async fn get_uptime_info(&self, _request: HttpRequest) -> Result { - // In a real implementation, this would track actual uptime - // For now, we'll return simulated data - let uptime_seconds = 3600; // 1 hour - let uptime_duration = std::time::Duration::from_secs(uptime_seconds); - - Ok(HttpResponse { - status_code: 200, - headers: vec![("Content-Type".to_string(), "application/json".to_string())], - body: serde_json::json!({ - "status": "success", - "uptime": { - "seconds": uptime_seconds, - "duration": format!("{:?}", uptime_duration), - "start_time": chrono::Utc::now() - chrono::Duration::seconds(uptime_seconds as i64) - }, - "timestamp": 
chrono::Utc::now().to_rfc3339() - }).to_string(), - }) - } -} - -/// Health check result -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HealthCheckResult { - pub name: String, - pub healthy: bool, - pub message: String, - pub duration_ms: u64, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - heal::HealEngineConfig, - policy::PolicyEngineConfig, - scanner::ScanEngineConfig, - }; - - #[tokio::test] - async fn test_status_api_creation() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - assert!(status_api.config().enabled); - assert_eq!(status_api.config().prefix, "/status"); - } - - #[tokio::test] - async fn test_basic_status() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("overall_status")); - } - - #[tokio::test] - async fn test_health_status() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = 
Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status/health".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("status")); - } - - #[tokio::test] - async fn test_scan_status() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status/scan".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("scan")); - } - - #[tokio::test] - async fn test_heal_status() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status/heal".to_string(), - headers: vec![], - body: None, - 
query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("heal")); - } - - #[tokio::test] - async fn test_version_info() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status/version".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("version")); - } - - #[tokio::test] - async fn test_liveness_status() { - let config = StatusApiConfig::default(); - let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); - let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); - let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); - - let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); - - let request = HttpRequest { - method: "GET".to_string(), - path: "/status/health/liveness".to_string(), - headers: vec![], - body: None, - query_params: vec![], - }; - - let response = status_api.handle_request(request).await.unwrap(); - assert_eq!(response.status_code, 200); - assert!(response.body.contains("alive")); - } -} \ No newline at end of file diff --git a/crates/ahm/src/core/coordinator.rs b/crates/ahm/src/core/coordinator.rs deleted file mode 100644 index 0c82a11b..00000000 --- a/crates/ahm/src/core/coordinator.rs 
+++ /dev/null @@ -1,448 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Core coordinator for the AHM system -//! -//! The coordinator is responsible for: -//! - Event routing and distribution between subsystems -//! - Resource management and allocation -//! - Global state coordination -//! - Cross-system communication - -use std::{ - sync::{Arc, atomic::{AtomicU64, Ordering}}, - time::{Duration, Instant}, -}; - -use tokio::{ - sync::{broadcast, RwLock}, - task::JoinHandle, - time::interval, -}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, info, warn}; - -use crate::{SystemEvent, metrics}; -use super::{Status, Scheduler, SchedulerConfig}; -use crate::scanner; -use crate::error::Result; -use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - -/// Configuration for the coordinator -#[derive(Debug, Clone)] -pub struct CoordinatorConfig { - /// Event channel buffer size - pub event_buffer_size: usize, - /// Resource monitoring interval - pub resource_monitor_interval: Duration, - /// Maximum number of concurrent operations - pub max_concurrent_operations: usize, - /// Scheduler configuration - pub scheduler: SchedulerConfig, - /// Event channel capacity - pub event_channel_capacity: usize, - /// Health check interval - pub health_check_interval: Duration, - /// Metrics update interval - pub metrics_update_interval: Duration, -} - -impl Default for CoordinatorConfig { - fn default() -> Self 
{ - Self { - event_buffer_size: 10000, - resource_monitor_interval: Duration::from_secs(30), - max_concurrent_operations: 100, - scheduler: SchedulerConfig::default(), - event_channel_capacity: 1024, - health_check_interval: Duration::from_secs(300), - metrics_update_interval: Duration::from_secs(60), - } - } -} - -/// Core coordinator for the AHM system -#[derive(Debug)] -pub struct Coordinator { - /// Configuration - config: CoordinatorConfig, - /// Current status - status: Arc>, - /// Event broadcaster - event_tx: broadcast::Sender, - /// Resource monitor handle - resource_monitor_handle: Arc>>>, - /// Event processor handle - event_processor_handle: Arc>>>, - /// Task scheduler - scheduler: Arc, - /// Metrics collector reference - metrics: Arc, - /// Active operations counter - active_operations: AtomicU64, - /// Cancellation token - cancel_token: CancellationToken, - /// Operation statistics - operation_stats: Arc>, -} - -impl Coordinator { - /// Create a new coordinator - pub async fn new( - config: CoordinatorConfig, - metrics: Arc, - cancel_token: CancellationToken, - ) -> Result { - let (event_tx, _) = broadcast::channel(config.event_buffer_size); - let scheduler = Arc::new(Scheduler::new(config.scheduler.clone()).await?); - - Ok(Self { - config, - status: Arc::new(RwLock::new(Status::Initializing)), - event_tx, - resource_monitor_handle: Arc::new(RwLock::new(None)), - event_processor_handle: Arc::new(RwLock::new(None)), - scheduler, - metrics, - active_operations: AtomicU64::new(0), - cancel_token, - operation_stats: Arc::new(RwLock::new(OperationStatistics::default())), - }) - } - - /// Start the coordinator - pub async fn start(&self) -> Result<()> { - info!("Starting AHM coordinator"); - - // Update status - *self.status.write().await = Status::Running; - - // Start resource monitor - self.start_resource_monitor().await?; - - // Start event processor - self.start_event_processor().await?; - - // Start scheduler - self.scheduler.start().await?; - - 
info!("AHM coordinator started successfully"); - Ok(()) - } - - /// Stop the coordinator - pub async fn stop(&self) -> Result<()> { - info!("Stopping AHM coordinator"); - - // Update status - *self.status.write().await = Status::Stopping; - - // Stop scheduler - self.scheduler.stop().await?; - - // Stop resource monitor - if let Some(handle) = self.resource_monitor_handle.write().await.take() { - handle.abort(); - } - - // Stop event processor - if let Some(handle) = self.event_processor_handle.write().await.take() { - handle.abort(); - } - - *self.status.write().await = Status::Stopped; - info!("AHM coordinator stopped"); - Ok(()) - } - - /// Get current status - pub async fn status(&self) -> Status { - self.status.read().await.clone() - } - - /// Subscribe to system events - pub fn subscribe_events(&self) -> broadcast::Receiver { - self.event_tx.subscribe() - } - - /// Publish a system event - pub async fn publish_event(&self, event: SystemEvent) -> Result<()> { - debug!("Publishing system event: {:?}", event); - - // Update operation statistics - self.update_operation_stats(&event).await; - - // Send to all subscribers - if let Err(e) = self.event_tx.send(event.clone()) { - warn!("Failed to publish event: {:?}", e); - } - - // Record the event in metrics - self.metrics.record_health_issue(&HealthIssue { - issue_type: HealthIssueType::Unknown, - severity: Severity::Low, - bucket: "system".to_string(), - object: "coordinator".to_string(), - description: format!("System event: {:?}", event), - metadata: None, - }).await?; - - Ok(()) - } - - /// Get system resource usage - pub async fn get_resource_usage(&self) -> metrics::ResourceUsage { - metrics::ResourceUsage { - disk_usage: metrics::DiskUsage { - total_bytes: 1_000_000_000, - used_bytes: 500_000_000, - available_bytes: 500_000_000, - usage_percentage: 50.0, - }, - memory_usage: metrics::MemoryUsage { - total_bytes: 16_000_000_000, - used_bytes: 4_000_000_000, - available_bytes: 12_000_000_000, - 
usage_percentage: 25.0, - }, - network_usage: metrics::NetworkUsage { - bytes_received: 1_000_000, - bytes_sent: 500_000, - packets_received: 1000, - packets_sent: 500, - }, - cpu_usage: metrics::CpuUsage { - usage_percentage: 0.25, - cores: 8, - load_average: 1.5, - }, - } - } - - /// Get operation statistics - pub async fn get_operation_statistics(&self) -> OperationStatistics { - self.operation_stats.read().await.clone() - } - - /// Get active operations count - pub fn get_active_operations_count(&self) -> u64 { - self.active_operations.load(Ordering::Relaxed) - } - - /// Register an active operation - pub fn register_operation(&self) -> OperationGuard { - let count = self.active_operations.fetch_add(1, Ordering::Relaxed); - debug!("Registered operation, active count: {}", count + 1); - OperationGuard::new(&self.active_operations) - } - - /// Start the resource monitor - async fn start_resource_monitor(&self) -> Result<()> { - let cancel_token = self.cancel_token.clone(); - let _event_tx = self.event_tx.clone(); - let interval_duration = self.config.resource_monitor_interval; - - let handle = tokio::spawn(async move { - let mut interval = interval(interval_duration); - - loop { - tokio::select! { - _ = cancel_token.cancelled() => { - debug!("Resource monitor cancelled"); - break; - } - _ = interval.tick() => { - // This would collect real resource metrics - // For now, we'll skip the actual collection - debug!("Resource monitor tick"); - } - } - } - }); - - *self.resource_monitor_handle.write().await = Some(handle); - Ok(()) - } - - /// Start the event processor - async fn start_event_processor(&self) -> Result<()> { - let mut event_rx = self.event_tx.subscribe(); - let cancel_token = self.cancel_token.clone(); - - let handle = tokio::spawn(async move { - loop { - tokio::select! 
{ - _ = cancel_token.cancelled() => { - debug!("Event processor cancelled"); - break; - } - event = event_rx.recv() => { - match event { - Ok(event) => { - debug!("Processing system event: {:?}", event); - // Process the event (e.g., route to specific handlers) - } - Err(e) => { - warn!("Event processor error: {:?}", e); - } - } - } - } - } - }); - - *self.event_processor_handle.write().await = Some(handle); - Ok(()) - } - - /// Update operation statistics based on events - async fn update_operation_stats(&self, event: &SystemEvent) { - let mut stats = self.operation_stats.write().await; - - match event { - SystemEvent::ObjectDiscovered { .. } => { - stats.objects_discovered += 1; - } - SystemEvent::HealthIssueDetected(issue) => { - stats.health_issues_detected += 1; - match issue.severity { - scanner::Severity::Critical => stats.critical_issues += 1, - scanner::Severity::High => stats.high_priority_issues += 1, - scanner::Severity::Medium => stats.medium_priority_issues += 1, - scanner::Severity::Low => stats.low_priority_issues += 1, - } - } - SystemEvent::HealCompleted(result) => { - if result.success { - stats.heal_operations_succeeded += 1; - } else { - stats.heal_operations_failed += 1; - } - } - SystemEvent::ScanCompleted(_) => { - stats.scan_cycles_completed += 1; - } - SystemEvent::ResourceUsageUpdated { .. 
} => { - stats.resource_updates += 1; - } - } - - stats.last_updated = Instant::now(); - } -} - -/// RAII guard for tracking active operations -pub struct OperationGuard<'a> { - active_operations: &'a AtomicU64, -} - -impl<'a> OperationGuard<'a> { - pub fn new(active_operations: &'a AtomicU64) -> Self { - active_operations.fetch_add(1, Ordering::Relaxed); - Self { active_operations } - } -} - -impl Drop for OperationGuard<'_> { - fn drop(&mut self) { - self.active_operations.fetch_sub(1, Ordering::Relaxed); - } -} - -/// Operation statistics tracked by the coordinator -#[derive(Debug, Clone)] -pub struct OperationStatistics { - pub objects_discovered: u64, - pub health_issues_detected: u64, - pub heal_operations_succeeded: u64, - pub heal_operations_failed: u64, - pub scan_cycles_completed: u64, - pub resource_updates: u64, - pub critical_issues: u64, - pub high_priority_issues: u64, - pub medium_priority_issues: u64, - pub low_priority_issues: u64, - pub last_updated: Instant, -} - -impl Default for OperationStatistics { - fn default() -> Self { - Self { - objects_discovered: 0, - health_issues_detected: 0, - heal_operations_succeeded: 0, - heal_operations_failed: 0, - scan_cycles_completed: 0, - resource_updates: 0, - critical_issues: 0, - high_priority_issues: 0, - medium_priority_issues: 0, - low_priority_issues: 0, - last_updated: Instant::now(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::metrics::CollectorConfig; - - #[tokio::test] - async fn test_coordinator_lifecycle() { - let config = CoordinatorConfig::default(); - let metrics_config = CollectorConfig::default(); - let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap()); - let cancel_token = CancellationToken::new(); - - let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap(); - - // Test initial status - assert_eq!(coordinator.status().await, Status::Initializing); - - // Start coordinator - coordinator.start().await.unwrap(); - 
assert_eq!(coordinator.status().await, Status::Running); - - // Stop coordinator - coordinator.stop().await.unwrap(); - assert_eq!(coordinator.status().await, Status::Stopped); - } - - #[tokio::test] - async fn test_operation_guard() { - let config = CoordinatorConfig::default(); - let metrics_config = CollectorConfig::default(); - let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap()); - let cancel_token = CancellationToken::new(); - - let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap(); - - assert_eq!(coordinator.get_active_operations_count(), 0); - - { - let _guard1 = coordinator.register_operation(); - assert_eq!(coordinator.get_active_operations_count(), 1); - - { - let _guard2 = coordinator.register_operation(); - assert_eq!(coordinator.get_active_operations_count(), 2); - } - - assert_eq!(coordinator.get_active_operations_count(), 1); - } - - assert_eq!(coordinator.get_active_operations_count(), 0); - } -} \ No newline at end of file diff --git a/crates/ahm/src/core/lifecycle.rs b/crates/ahm/src/core/lifecycle.rs deleted file mode 100644 index ddb5d17a..00000000 --- a/crates/ahm/src/core/lifecycle.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2024 RustFS Team - -use crate::error::Result; - -#[derive(Debug, Clone, Default)] -pub struct LifecycleConfig {} - -pub struct LifecycleManager {} - -impl LifecycleManager { - pub async fn new(_config: LifecycleConfig) -> Result { - Ok(Self {}) - } - - pub async fn start(&self) -> Result<()> { - Ok(()) - } - - pub async fn stop(&self) -> Result<()> { - Ok(()) - } -} \ No newline at end of file diff --git a/crates/ahm/src/core/mod.rs b/crates/ahm/src/core/mod.rs deleted file mode 100644 index 582162af..00000000 --- a/crates/ahm/src/core/mod.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Core coordination and lifecycle management for the AHM system - -pub mod coordinator; -pub mod scheduler; -pub mod lifecycle; - -pub use coordinator::{Coordinator, CoordinatorConfig}; -pub use scheduler::{Scheduler, SchedulerConfig, Task, TaskPriority}; -pub use lifecycle::{LifecycleManager, LifecycleConfig}; - -/// Status of the core coordination system -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Status { - /// System is initializing - Initializing, - /// System is running normally - Running, - /// System is degraded but operational - Degraded, - /// System is shutting down - Stopping, - /// System has stopped - Stopped, - /// System encountered an error - Error(String), -} \ No newline at end of file diff --git a/crates/ahm/src/core/scheduler.rs b/crates/ahm/src/core/scheduler.rs deleted file mode 100644 index fa25fd27..00000000 --- a/crates/ahm/src/core/scheduler.rs +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! 
Task scheduler for the AHM system - -use std::{ - collections::{BinaryHeap, HashMap}, - sync::{Arc, atomic::{AtomicU64, Ordering}}, - time::{Duration, Instant}, -}; - -use tokio::{ - sync::RwLock, - task::JoinHandle, -}; -use uuid::Uuid; - -use crate::error::Result; - -/// Task scheduler configuration -#[derive(Debug, Clone)] -pub struct SchedulerConfig { - /// Maximum number of concurrent tasks - pub max_concurrent_tasks: usize, - /// Default task timeout - pub default_timeout: Duration, - /// Queue capacity - pub queue_capacity: usize, - pub default_task_priority: TaskPriority, -} - -impl Default for SchedulerConfig { - fn default() -> Self { - Self { - max_concurrent_tasks: 10, - default_timeout: Duration::from_secs(300), // 5 minutes - queue_capacity: 1000, - default_task_priority: TaskPriority::Normal, - } - } -} - -/// Task priority levels -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub enum TaskPriority { - Low = 0, - Normal = 1, - High = 2, - Critical = 3, -} - -/// A scheduled task -#[derive(Debug, Clone)] -pub struct Task { - pub id: Uuid, - pub priority: TaskPriority, - pub scheduled_time: Instant, - pub timeout: Duration, - pub task_type: TaskType, - pub payload: TaskPayload, -} - -impl Task { - pub fn new(task_type: TaskType, payload: TaskPayload) -> Self { - Self { - id: Uuid::new_v4(), - priority: TaskPriority::Normal, - scheduled_time: Instant::now(), - timeout: Duration::from_secs(300), - task_type, - payload, - } - } - - pub fn with_priority(mut self, priority: TaskPriority) -> Self { - self.priority = priority; - self - } - - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.timeout = timeout; - self - } - - pub fn with_delay(mut self, delay: Duration) -> Self { - self.scheduled_time = Instant::now() + delay; - self - } -} - -/// Types of tasks that can be scheduled -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TaskType { - Scan, - Heal, - Cleanup, - Maintenance, - Report, -} - -/// Task payload data 
-#[derive(Debug, Clone)] -pub enum TaskPayload { - Scan { - bucket: Option, - object_prefix: Option, - deep_scan: bool, - }, - Heal { - bucket: String, - object: String, - version_id: Option, - }, - Cleanup { - older_than: Duration, - }, - Maintenance { - operation: String, - }, - Report { - report_type: String, - }, -} - -/// Task scheduler -#[allow(dead_code)] -#[derive(Debug)] -pub struct Scheduler { - config: SchedulerConfig, - task_queue: Arc>>, - active_tasks: Arc>>>, - task_counter: AtomicU64, - worker_handles: Arc>>>, -} - -impl Scheduler { - pub async fn new(config: SchedulerConfig) -> Result { - Ok(Self { - config, - task_queue: Arc::new(RwLock::new(BinaryHeap::new())), - active_tasks: Arc::new(RwLock::new(HashMap::new())), - task_counter: AtomicU64::new(0), - worker_handles: Arc::new(RwLock::new(Vec::new())), - }) - } - - pub async fn start(&self) -> Result<()> { - // Start worker tasks - // Implementation would go here - Ok(()) - } - - pub async fn stop(&self) -> Result<()> { - // Stop all workers and drain queues - // Implementation would go here - Ok(()) - } - - pub async fn schedule_task(&self, task: Task) -> Result { - let task_id = task.id; - let prioritized_task = PrioritizedTask { - task, - sequence: self.task_counter.fetch_add(1, Ordering::Relaxed), - }; - - self.task_queue.write().await.push(prioritized_task); - Ok(task_id) - } - - pub async fn cancel_task(&self, task_id: Uuid) -> Result { - if let Some(handle) = self.active_tasks.write().await.remove(&task_id) { - handle.abort(); - Ok(true) - } else { - Ok(false) - } - } -} - -/// Task wrapper for priority queue ordering -#[derive(Debug)] -struct PrioritizedTask { - task: Task, - sequence: u64, -} - -impl PartialEq for PrioritizedTask { - fn eq(&self, other: &Self) -> bool { - self.task.priority == other.task.priority && self.sequence == other.sequence - } -} - -impl Eq for PrioritizedTask {} - -impl PartialOrd for PrioritizedTask { - fn partial_cmp(&self, other: &Self) -> Option { - 
Some(self.cmp(other)) - } -} - -impl Ord for PrioritizedTask { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - // Higher priority first, then by sequence number for fairness - other.task.priority.cmp(&self.task.priority) - .then_with(|| self.sequence.cmp(&other.sequence)) - } -} - -#[derive(Debug, Clone)] -pub struct ScheduledTask { - pub id: Uuid, - pub task_type: TaskType, - pub priority: TaskPriority, - pub created_at: Instant, -} \ No newline at end of file diff --git a/crates/ahm/src/heal/engine.rs b/crates/ahm/src/heal/engine.rs deleted file mode 100644 index ba90108f..00000000 --- a/crates/ahm/src/heal/engine.rs +++ /dev/null @@ -1,438 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::{ - sync::{mpsc, RwLock}, - time::sleep, -}; -use tracing::{error, info, warn}; -use uuid::Uuid; - -use crate::error::Result; -use super::{HealConfig, HealPriority, HealResult, HealStatistics, HealTask, Status}; - -/// Main healing engine that coordinates repair operations -pub struct HealEngine { - config: HealConfig, - status: Arc>, - statistics: Arc>, - task_queue: Arc>>, - active_tasks: Arc>>, - completed_tasks: Arc>>, - shutdown_tx: Option>, -} - -impl HealEngine { - /// Create a new healing engine - pub fn new(config: HealConfig) -> Self { - Self { - config, - status: Arc::new(RwLock::new(Status::Initializing)), - statistics: Arc::new(RwLock::new(HealStatistics::default())), - task_queue: Arc::new(RwLock::new(Vec::new())), - active_tasks: Arc::new(RwLock::new(HashMap::new())), - completed_tasks: Arc::new(RwLock::new(Vec::new())), - shutdown_tx: None, - } - } - - /// Start the healing engine - pub async fn start(&mut self) -> Result<()> { - info!("Starting heal engine"); - - let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1); - self.shutdown_tx = Some(shutdown_tx); - - // Update status - { - let mut status = self.status.write().await; - *status = Status::Idle; - } - - let config = self.config.clone(); - let status = Arc::clone(&self.status); - let statistics = Arc::clone(&self.statistics); - let task_queue = Arc::clone(&self.task_queue); - let active_tasks = Arc::clone(&self.active_tasks); - let completed_tasks = Arc::clone(&self.completed_tasks); - - // Start the main healing loop - tokio::spawn(async move { - let mut interval = tokio::time::interval(config.heal_interval); - - loop { - tokio::select! 
{ - _ = interval.tick() => { - if let Err(e) = Self::process_healing_cycle( - &config, - &status, - &statistics, - &task_queue, - &active_tasks, - &completed_tasks, - ).await { - error!("Healing cycle failed: {}", e); - } - } - _ = shutdown_rx.recv() => { - info!("Shutdown signal received, stopping heal engine"); - break; - } - } - } - - // Update status to stopped - let mut status = status.write().await; - *status = Status::Stopped; - }); - - info!("Heal engine started successfully"); - Ok(()) - } - - /// Stop the healing engine - pub async fn stop(&mut self) -> Result<()> { - info!("Stopping heal engine"); - - // Update status - { - let mut status = self.status.write().await; - *status = Status::Stopping; - } - - // Send shutdown signal - if let Some(shutdown_tx) = &self.shutdown_tx { - let _ = shutdown_tx.send(()).await; - } - - // Wait for engine to stop - let mut attempts = 0; - while attempts < 10 { - let status = self.status.read().await; - if *status == Status::Stopped { - break; - } - drop(status); - sleep(Duration::from_millis(100)).await; - attempts += 1; - } - - info!("Heal engine stopped"); - Ok(()) - } - - /// Add a healing task to the queue - pub async fn add_task(&self, task: HealTask) -> Result<()> { - let task_id = task.id.clone(); - let queue = Arc::clone(&self.task_queue); - - // Add task to priority queue - queue.write().await.push(task); - - info!("Added healing task to queue: {}", task_id); - Ok(()) - } - - /// Get current engine status - pub async fn status(&self) -> Status { - self.status.read().await.clone() - } - - /// Get current engine status (alias for status) - pub async fn get_status(&self) -> Status { - self.status.read().await.clone() - } - - /// Get engine configuration - pub async fn get_config(&self) -> HealConfig { - self.config.clone() - } - - /// Get healing statistics - pub async fn statistics(&self) -> HealStatistics { - self.statistics.read().await.clone() - } - - /// Get completed healing results - pub async fn 
completed_results(&self) -> Vec { - self.completed_tasks.read().await.clone() - } - - /// Process a single healing cycle - async fn process_healing_cycle( - config: &HealConfig, - status: &Arc>, - statistics: &Arc>, - task_queue: &Arc>>, - active_tasks: &Arc>>, - completed_tasks: &Arc>>, - ) -> Result<()> { - // Update status to healing - { - let mut status = status.write().await; - *status = Status::Healing; - } - - // Get ready tasks from queue - let mut queue = task_queue.write().await; - let mut ready_tasks = Vec::new(); - let mut remaining_tasks = Vec::new(); - - for task in queue.drain(..) { - if task.is_ready() { - ready_tasks.push(task); - } else { - remaining_tasks.push(task); - } - } - - // Sort ready tasks by priority - ready_tasks.sort_by(|a, b| a.priority.cmp(&b.priority)); - - // Process ready tasks - let active_count = active_tasks.read().await.len(); - let max_concurrent = config.max_workers.saturating_sub(active_count); - - for task in ready_tasks.into_iter().take(max_concurrent) { - if let Err(e) = Self::process_task( - config, - statistics, - active_tasks, - completed_tasks, - task, - ).await { - error!("Failed to process healing task: {}", e); - } - } - - // Put remaining tasks back in queue - queue.extend(remaining_tasks); - - // Update statistics - { - let mut stats = statistics.write().await; - stats.queued_tasks = queue.len() as u64; - stats.active_workers = active_tasks.read().await.len() as u64; - } - - // Update status back to idle - { - let mut status = status.write().await; - *status = Status::Idle; - } - - Ok(()) - } - - /// Process a single healing task - async fn process_task( - config: &HealConfig, - statistics: &Arc>, - active_tasks: &Arc>>, - completed_tasks: &Arc>>, - task: HealTask, - ) -> Result<()> { - let task_id = task.id.clone(); - - // Add task to active tasks - { - let mut active = active_tasks.write().await; - active.insert(task_id.clone(), task.clone()); - } - - // Update statistics - { - let mut stats = 
statistics.write().await; - stats.total_repairs += 1; - stats.active_workers = active_tasks.read().await.len() as u64; - } - - info!("Processing healing task: {}", task_id); - - // Simulate healing operation - let start_time = Instant::now(); - let result = Self::perform_healing_operation(&task, config).await; - let duration = start_time.elapsed(); - - // Create heal result - let heal_result = HealResult { - success: result.is_ok(), - original_issue: task.issue.clone(), - repair_duration: duration, - retry_attempts: task.retry_count, - error_message: result.err().map(|e| e.to_string()), - metadata: None, - completed_at: SystemTime::now(), - }; - - // Update statistics - { - let mut stats = statistics.write().await; - if heal_result.success { - stats.successful_repairs += 1; - } else { - stats.failed_repairs += 1; - } - stats.total_repair_time += duration; - stats.average_repair_time = if stats.total_repairs > 0 { - Duration::from_secs_f64( - stats.total_repair_time.as_secs_f64() / stats.total_repairs as f64 - ) - } else { - Duration::ZERO - }; - stats.last_repair_time = Some(SystemTime::now()); - stats.total_retry_attempts += task.retry_count as u64; - } - - // Add result to completed tasks - { - let mut completed = completed_tasks.write().await; - completed.push(heal_result.clone()); - } - - // Remove task from active tasks - { - let mut active = active_tasks.write().await; - active.remove(&task_id); - } - - // Update statistics - { - let mut stats = statistics.write().await; - stats.active_workers = active_tasks.read().await.len() as u64; - } - - if heal_result.success { - info!("Healing task completed successfully: {}", task_id); - } else { - warn!("Healing task failed: {}", task_id); - } - - Ok(()) - } - - /// Perform the actual healing operation - async fn perform_healing_operation(task: &HealTask, _config: &HealConfig) -> Result<()> { - // Simulate healing operation based on issue type - match task.issue.issue_type { - 
crate::scanner::HealthIssueType::MissingReplica => { - // Simulate replica repair - sleep(Duration::from_millis(100)).await; - info!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object); - } - crate::scanner::HealthIssueType::ChecksumMismatch => { - // Simulate checksum repair - sleep(Duration::from_millis(200)).await; - info!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object); - } - crate::scanner::HealthIssueType::DiskReadError => { - // Simulate disk error recovery - sleep(Duration::from_millis(300)).await; - info!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object); - } - _ => { - // Generic repair for other issue types - sleep(Duration::from_millis(150)).await; - info!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object); - } - } - - // Simulate occasional failures for testing - if task.retry_count > 0 && task.retry_count % 3 == 0 { - return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated healing failure"))); - } - - Ok(()) - } - - /// Start healing operations - pub async fn start_healing(&self) -> Result<()> { - let mut status = self.status.write().await; - *status = Status::Running; - info!("Healing operations started"); - Ok(()) - } - - /// Stop healing operations - pub async fn stop_healing(&self) -> Result<()> { - let mut status = self.status.write().await; - *status = Status::Stopped; - info!("Healing operations stopped"); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - - #[tokio::test] - async fn test_heal_engine_creation() { - let config = HealConfig::default(); - let engine = HealEngine::new(config); - - assert_eq!(engine.status().await, Status::Initializing); - } - - #[tokio::test] - async fn test_heal_engine_start_stop() { - let config = HealConfig::default(); - let mut engine = HealEngine::new(config); - - // Start engine - engine.start().await.unwrap(); - 
sleep(Duration::from_millis(100)).await; - - // Check status - let status = engine.status().await; - assert!(matches!(status, Status::Idle | Status::Healing)); - - // Stop engine - engine.stop().await.unwrap(); - sleep(Duration::from_millis(100)).await; - - // Check status - let status = engine.status().await; - assert_eq!(status, Status::Stopped); - } - - #[tokio::test] - async fn test_add_healing_task() { - let config = HealConfig::default(); - let engine = HealEngine::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let task = HealTask::new(issue); - engine.add_task(task).await.unwrap(); - - let stats = engine.statistics().await; - assert_eq!(stats.queued_tasks, 1); - } -} \ No newline at end of file diff --git a/crates/ahm/src/heal/mod.rs b/crates/ahm/src/heal/mod.rs deleted file mode 100644 index 9e7847c9..00000000 --- a/crates/ahm/src/heal/mod.rs +++ /dev/null @@ -1,360 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Healing subsystem for the AHM system -//! -//! The heal subsystem provides intelligent repair capabilities: -//! - Priority-based healing queue -//! - Real-time and background healing modes -//! - Comprehensive repair validation -//! 
- Adaptive healing strategies - -pub mod engine; -pub mod priority_queue; -pub mod repair_worker; -pub mod validation; - -pub use engine::HealEngine; -pub use priority_queue::PriorityQueue; -pub use repair_worker::RepairWorker; -pub use validation::HealValidator; - -use std::time::{Duration, SystemTime}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; -use derive_builder::Builder; - -use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - -/// Configuration for the healing system -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HealConfig { - /// Maximum number of concurrent repair workers - pub max_workers: usize, - /// Maximum number of tasks in the priority queue - pub max_queue_size: usize, - /// Timeout for individual repair operations - pub repair_timeout: Duration, - /// Interval between healing cycles - pub heal_interval: Duration, - /// Whether to enable automatic healing - pub auto_heal_enabled: bool, - /// Maximum number of retry attempts for failed repairs - pub max_retry_attempts: u32, - /// Backoff delay between retry attempts - pub retry_backoff_delay: Duration, - /// Whether to validate repairs after completion - pub validate_after_repair: bool, -} - -impl Default for HealConfig { - fn default() -> Self { - Self { - max_workers: 4, - max_queue_size: 1000, - repair_timeout: Duration::from_secs(300), // 5 minutes - heal_interval: Duration::from_secs(60), // 1 minute - auto_heal_enabled: true, - max_retry_attempts: 3, - retry_backoff_delay: Duration::from_secs(30), - validate_after_repair: true, - } - } -} - -/// Result of a healing operation -#[derive(Debug, Clone)] -pub struct HealResult { - /// Whether the healing operation was successful - pub success: bool, - /// The original health issue that was addressed - pub original_issue: HealthIssue, - /// Time taken to complete the repair - pub repair_duration: Duration, - /// Number of retry attempts made - pub retry_attempts: u32, - /// Error message if repair failed - pub 
error_message: Option, - /// Additional metadata about the repair - pub metadata: Option, - /// Timestamp when the repair was completed - pub completed_at: SystemTime, -} - -/// Statistics for the healing system -#[derive(Debug, Clone, Default)] -pub struct HealStatistics { - /// Total number of repair tasks processed - pub total_repairs: u64, - /// Number of successful repairs - pub successful_repairs: u64, - /// Number of failed repairs - pub failed_repairs: u64, - /// Number of tasks currently in queue - pub queued_tasks: u64, - /// Number of active workers - pub active_workers: u64, - /// Total time spent on repairs - pub total_repair_time: Duration, - /// Average repair time - pub average_repair_time: Duration, - /// Last repair completion time - pub last_repair_time: Option, - /// Number of retry attempts made - pub total_retry_attempts: u64, -} - -/// Priority levels for healing tasks -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub enum HealPriority { - /// Critical issues that need immediate attention - Critical = 0, - /// High priority issues - High = 1, - /// Medium priority issues - Medium = 2, - /// Low priority issues - Low = 3, -} - -impl From for HealPriority { - fn from(severity: Severity) -> Self { - match severity { - Severity::Critical => HealPriority::Critical, - Severity::High => HealPriority::High, - Severity::Medium => HealPriority::Medium, - Severity::Low => HealPriority::Low, - } - } -} - -/// A healing task to be processed -#[derive(Debug, Clone)] -pub struct HealTask { - /// Unique identifier for the task - pub id: String, - /// The health issue to be repaired - pub issue: HealthIssue, - /// Priority level for this task - pub priority: HealPriority, - /// When the task was created - pub created_at: SystemTime, - /// When the task should be processed (for delayed tasks) - pub scheduled_at: Option, - /// Number of retry attempts made - pub retry_count: u32, - /// Maximum number of retry attempts allowed - pub max_retries: u32, 
- /// Additional context for the repair operation - pub context: Option, -} - -impl HealTask { - /// Create a new healing task - pub fn new(issue: HealthIssue) -> Self { - let priority = HealPriority::from(issue.severity); - Self { - id: uuid::Uuid::new_v4().to_string(), - issue, - priority, - created_at: SystemTime::now(), - scheduled_at: None, - retry_count: 0, - max_retries: 3, - context: None, - } - } - - /// Create a delayed healing task - pub fn delayed(issue: HealthIssue, delay: Duration) -> Self { - let mut task = Self::new(issue); - task.scheduled_at = Some(SystemTime::now() + delay); - task - } - - /// Check if the task is ready to be processed - pub fn is_ready(&self) -> bool { - if let Some(scheduled_at) = self.scheduled_at { - SystemTime::now() >= scheduled_at - } else { - true - } - } - - /// Check if the task can be retried - pub fn can_retry(&self) -> bool { - self.retry_count < self.max_retries - } - - /// Increment the retry count - pub fn increment_retry(&mut self) { - self.retry_count += 1; - } -} - -/// Heal engine status -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum Status { - /// Heal engine is initializing - Initializing, - /// Heal engine is idle - Idle, - /// Heal engine is running normally - Running, - /// Heal engine is actively healing - Healing, - /// Heal engine is paused - Paused, - /// Heal engine is stopping - Stopping, - /// Heal engine has stopped - Stopped, - /// Heal engine encountered an error - Error(String), -} - -/// Healing operation modes -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum HealMode { - /// Real-time healing during GET/PUT operations - RealTime, - /// Background healing during scheduled scans - Background, - /// On-demand healing triggered by admin - OnDemand, - /// Emergency healing for critical issues - Emergency, -} - -/// Validation result for a repaired object -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ValidationResult { - /// Type of 
validation performed - pub validation_type: ValidationType, - /// Whether validation passed - pub passed: bool, - /// Details about the validation - pub details: String, - /// Time taken for validation - pub duration: Duration, -} - -/// Types of validation that can be performed -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum ValidationType { - /// Checksum verification - Checksum, - /// Shard count verification - ShardCount, - /// Data integrity check - DataIntegrity, - /// Metadata consistency check - MetadataConsistency, - /// Cross-shard redundancy check - RedundancyCheck, -} - -/// Healing strategies for different scenarios -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum HealStrategy { - /// Repair using available data shards - DataShardRepair, - /// Repair using parity shards - ParityShardRepair, - /// Hybrid repair using both data and parity - HybridRepair, - /// Metadata-only repair - MetadataRepair, - /// Full object reconstruction - FullReconstruction, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_heal_priority_from_severity() { - assert_eq!(HealPriority::from(Severity::Critical), HealPriority::Critical); - assert_eq!(HealPriority::from(Severity::High), HealPriority::High); - assert_eq!(HealPriority::from(Severity::Medium), HealPriority::Medium); - assert_eq!(HealPriority::from(Severity::Low), HealPriority::Low); - } - - #[test] - fn test_heal_task_creation() { - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let task = HealTask::new(issue.clone()); - assert_eq!(task.priority, HealPriority::Critical); - assert_eq!(task.issue.bucket, issue.bucket); - assert_eq!(task.issue.object, issue.object); - assert_eq!(task.retry_count, 0); - assert_eq!(task.max_retries, 3); - 
assert!(task.is_ready()); - } - - #[test] - fn test_delayed_heal_task() { - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Medium, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let delay = Duration::from_secs(1); - let task = HealTask::delayed(issue, delay); - - assert!(task.scheduled_at.is_some()); - assert!(!task.is_ready()); // Should not be ready immediately - - // Wait for the delay to pass - std::thread::sleep(delay + Duration::from_millis(100)); - assert!(task.is_ready()); - } - - #[test] - fn test_heal_task_retry_logic() { - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Low, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let mut task = HealTask::new(issue); - assert!(task.can_retry()); - - task.increment_retry(); - assert_eq!(task.retry_count, 1); - assert!(task.can_retry()); - - task.increment_retry(); - task.increment_retry(); - assert_eq!(task.retry_count, 3); - assert!(!task.can_retry()); - } -} \ No newline at end of file diff --git a/crates/ahm/src/heal/priority_queue.rs b/crates/ahm/src/heal/priority_queue.rs deleted file mode 100644 index 07a1b600..00000000 --- a/crates/ahm/src/heal/priority_queue.rs +++ /dev/null @@ -1,413 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::{ - collections::BinaryHeap, - sync::Arc, - time::{Duration, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, info, warn}; - -use crate::error::Result; -use super::{HealPriority, HealTask}; - -/// Priority queue for healing tasks -pub struct PriorityQueue { - tasks: Arc>>, - max_size: usize, - statistics: Arc>, -} - -/// Statistics for the priority queue -#[derive(Debug, Clone, Default)] -pub struct QueueStatistics { - /// Total number of tasks added to the queue - pub total_tasks_added: u64, - /// Total number of tasks removed from the queue - pub total_tasks_removed: u64, - /// Current number of tasks in the queue - pub current_queue_size: u64, - /// Maximum queue size reached - pub max_queue_size_reached: u64, - /// Number of tasks rejected due to queue being full - pub tasks_rejected: u64, - /// Average time tasks spend in queue - pub average_queue_time: Duration, - /// Total time all tasks have spent in queue - pub total_queue_time: Duration, -} - -impl PriorityQueue { - /// Create a new priority queue - pub fn new(max_size: usize) -> Self { - Self { - tasks: Arc::new(RwLock::new(BinaryHeap::new())), - max_size, - statistics: Arc::new(RwLock::new(QueueStatistics::default())), - } - } - - /// Add a task to the queue - pub async fn push(&self, task: HealTask) -> Result<()> { - let mut tasks = self.tasks.write().await; - let mut stats = self.statistics.write().await; - - if tasks.len() >= self.max_size { - stats.tasks_rejected += 1; - warn!("Priority queue is full, rejecting task: {}", task.id); - return Err(crate::error::Error::Other(anyhow::anyhow!("Queue is full"))); - } - - let task_id = task.id.clone(); - let priority = task.priority.clone(); - tasks.push(task); - stats.total_tasks_added += 1; - stats.current_queue_size = tasks.len() as u64; - stats.max_queue_size_reached = stats.max_queue_size_reached.max(tasks.len() as 
u64); - - debug!("Added task to priority queue: {} (priority: {:?})", task_id, priority); - Ok(()) - } - - /// Remove and return the highest priority task - pub async fn pop(&self) -> Option { - let mut tasks = self.tasks.write().await; - let mut stats = self.statistics.write().await; - - if let Some(task) = tasks.pop() { - stats.total_tasks_removed += 1; - stats.current_queue_size = tasks.len() as u64; - - // Update queue time statistics - let queue_time = SystemTime::now().duration_since(task.created_at).unwrap_or(Duration::ZERO); - stats.total_queue_time += queue_time; - stats.average_queue_time = if stats.total_tasks_removed > 0 { - Duration::from_secs_f64( - stats.total_queue_time.as_secs_f64() / stats.total_tasks_removed as f64 - ) - } else { - Duration::ZERO - }; - - debug!("Removed task from priority queue: {} (priority: {:?})", task.id, task.priority); - Some(task) - } else { - None - } - } - - /// Peek at the highest priority task without removing it - pub async fn peek(&self) -> Option { - let tasks = self.tasks.read().await; - tasks.peek().cloned() - } - - /// Get the current size of the queue - pub async fn len(&self) -> usize { - self.tasks.read().await.len() - } - - /// Check if the queue is empty - pub async fn is_empty(&self) -> bool { - self.tasks.read().await.is_empty() - } - - /// Get queue statistics - pub async fn statistics(&self) -> QueueStatistics { - self.statistics.read().await.clone() - } - - /// Clear all tasks from the queue - pub async fn clear(&self) { - let mut tasks = self.tasks.write().await; - let mut stats = self.statistics.write().await; - - let cleared_count = tasks.len(); - tasks.clear(); - stats.current_queue_size = 0; - - info!("Cleared {} tasks from priority queue", cleared_count); - } - - /// Get all tasks that are ready to be processed - pub async fn get_ready_tasks(&self, max_count: usize) -> Vec { - let mut tasks = self.tasks.write().await; - let mut ready_tasks = Vec::new(); - let mut remaining_tasks = Vec::new(); - - 
while let Some(task) = tasks.pop() { - if task.is_ready() && ready_tasks.len() < max_count { - ready_tasks.push(task); - } else { - remaining_tasks.push(task); - } - } - - // Put remaining tasks back - for task in remaining_tasks { - tasks.push(task); - } - - ready_tasks - } - - /// Remove a specific task by ID - pub async fn remove_task(&self, task_id: &str) -> bool { - let mut tasks = self.tasks.write().await; - let mut stats = self.statistics.write().await; - - let mut temp_tasks = Vec::new(); - let mut found = false; - - while let Some(task) = tasks.pop() { - if task.id == task_id { - found = true; - stats.total_tasks_removed += 1; - debug!("Removed specific task from queue: {}", task_id); - } else { - temp_tasks.push(task); - } - } - - // Put remaining tasks back - for task in temp_tasks { - tasks.push(task); - } - - stats.current_queue_size = tasks.len() as u64; - found - } - - /// Get tasks by priority level - pub async fn get_tasks_by_priority(&self, priority: HealPriority) -> Vec { - let mut tasks = self.tasks.write().await; - let mut matching_tasks = Vec::new(); - let mut other_tasks = Vec::new(); - - while let Some(task) = tasks.pop() { - if task.priority == priority { - matching_tasks.push(task); - } else { - other_tasks.push(task); - } - } - - // Put other tasks back - for task in other_tasks { - tasks.push(task); - } - - matching_tasks - } - - /// Update task priority - pub async fn update_priority(&self, task_id: &str, new_priority: HealPriority) -> bool { - let mut tasks = self.tasks.write().await; - - let mut temp_tasks = Vec::new(); - let mut found = false; - - while let Some(mut task) = tasks.pop() { - if task.id == task_id { - task.priority = new_priority.clone(); - found = true; - debug!("Updated task priority: {} -> {:?}", task_id, new_priority); - } - temp_tasks.push(task); - } - - // Put all tasks back - for task in temp_tasks { - tasks.push(task); - } - - found - } -} - -// Implement Ord for HealTask to enable priority queue functionality 
-impl std::cmp::Ord for HealTask { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - // Higher priority (lower enum value) comes first - self.priority.cmp(&other.priority) - .then_with(|| self.created_at.cmp(&other.created_at)) - } -} - -impl std::cmp::PartialOrd for HealTask { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl std::cmp::PartialEq for HealTask { - fn eq(&self, other: &Self) -> bool { - self.id == other.id - } -} - -impl std::cmp::Eq for HealTask {} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - - #[tokio::test] - async fn test_priority_queue_creation() { - let queue = PriorityQueue::new(100); - assert_eq!(queue.len().await, 0); - assert!(queue.is_empty().await); - } - - #[tokio::test] - async fn test_priority_queue_push_pop() { - let queue = PriorityQueue::new(10); - - let issue1 = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Low, - bucket: "bucket1".to_string(), - object: "object1".to_string(), - description: "Test issue 1".to_string(), - metadata: None, - }; - - let issue2 = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "bucket2".to_string(), - object: "object2".to_string(), - description: "Test issue 2".to_string(), - metadata: None, - }; - - let task1 = HealTask::new(issue1); - let task2 = HealTask::new(issue2); - - // Add tasks - queue.push(task1.clone()).await.unwrap(); - queue.push(task2.clone()).await.unwrap(); - - assert_eq!(queue.len().await, 2); - - // Critical task should come first - let first_task = queue.pop().await.unwrap(); - assert_eq!(first_task.priority, HealPriority::Critical); - assert_eq!(first_task.id, task2.id); - - let second_task = queue.pop().await.unwrap(); - assert_eq!(second_task.priority, HealPriority::Low); - assert_eq!(second_task.id, task1.id); - - assert!(queue.is_empty().await); - } - - #[tokio::test] - async fn 
test_priority_queue_full() { - let queue = PriorityQueue::new(1); - - let issue1 = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Low, - bucket: "bucket1".to_string(), - object: "object1".to_string(), - description: "Test issue 1".to_string(), - metadata: None, - }; - - let issue2 = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "bucket2".to_string(), - object: "object2".to_string(), - description: "Test issue 2".to_string(), - metadata: None, - }; - - let task1 = HealTask::new(issue1); - let task2 = HealTask::new(issue2); - - // First task should succeed - queue.push(task1).await.unwrap(); - assert_eq!(queue.len().await, 1); - - // Second task should fail - let result = queue.push(task2).await; - assert!(result.is_err()); - assert_eq!(queue.len().await, 1); - - let stats = queue.statistics().await; - assert_eq!(stats.tasks_rejected, 1); - } - - #[tokio::test] - async fn test_priority_queue_remove_task() { - let queue = PriorityQueue::new(10); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Medium, - bucket: "bucket1".to_string(), - object: "object1".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let task = HealTask::new(issue); - let task_id = task.id.clone(); - - queue.push(task).await.unwrap(); - assert_eq!(queue.len().await, 1); - - // Remove the task - let removed = queue.remove_task(&task_id).await; - assert!(removed); - assert_eq!(queue.len().await, 0); - - // Try to remove non-existent task - let removed = queue.remove_task("non-existent").await; - assert!(!removed); - } - - #[tokio::test] - async fn test_priority_queue_update_priority() { - let queue = PriorityQueue::new(10); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Low, - bucket: "bucket1".to_string(), - object: "object1".to_string(), - description: "Test issue".to_string(), 
- metadata: None, - }; - - let task = HealTask::new(issue); - let task_id = task.id.clone(); - - queue.push(task).await.unwrap(); - - // Update priority - let updated = queue.update_priority(&task_id, HealPriority::Critical).await; - assert!(updated); - - // Check that the task now has higher priority - let popped_task = queue.pop().await.unwrap(); - assert_eq!(popped_task.priority, HealPriority::Critical); - assert_eq!(popped_task.id, task_id); - } -} \ No newline at end of file diff --git a/crates/ahm/src/heal/repair_worker.rs b/crates/ahm/src/heal/repair_worker.rs deleted file mode 100644 index 018e62a5..00000000 --- a/crates/ahm/src/heal/repair_worker.rs +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::{ - sync::{mpsc, RwLock}, - time::{sleep, timeout}, -}; -use tracing::{debug, error, info, warn}; - -use crate::error::Result; -use super::{HealConfig, HealResult, HealTask, Status}; - -/// Configuration for repair workers -#[derive(Debug, Clone)] -pub struct RepairWorkerConfig { - /// Worker ID - pub worker_id: String, - /// Maximum time to spend on a single repair operation - pub operation_timeout: Duration, - /// Whether to enable detailed logging - pub enable_detailed_logging: bool, - /// Maximum number of concurrent operations - pub max_concurrent_operations: usize, - /// Retry configuration - pub retry_config: RetryConfig, -} - -/// Retry configuration for repair operations -#[derive(Debug, Clone)] -pub struct RetryConfig { - /// Maximum number of retry attempts - pub max_attempts: u32, - /// Initial backoff delay - pub initial_backoff: Duration, - /// Maximum backoff delay - pub max_backoff: Duration, - /// Backoff multiplier - pub backoff_multiplier: f64, - /// Whether to use exponential backoff - pub exponential_backoff: bool, -} - -impl Default for RepairWorkerConfig { - fn default() -> Self { - Self { - worker_id: "worker-1".to_string(), - operation_timeout: Duration::from_secs(300), // 5 minutes - enable_detailed_logging: true, - max_concurrent_operations: 1, - retry_config: RetryConfig::default(), - } - } -} - -impl Default for RetryConfig { - fn default() -> Self { - Self { - max_attempts: 3, - initial_backoff: Duration::from_secs(1), - max_backoff: Duration::from_secs(60), - backoff_multiplier: 2.0, - exponential_backoff: true, - } - } -} - -/// Statistics for a repair worker -#[derive(Debug, Clone, Default)] -pub struct WorkerStatistics { - /// Total number of tasks processed - pub total_tasks_processed: u64, - /// Number of successful repairs - pub successful_repairs: u64, - /// Number of failed repairs - pub failed_repairs: u64, - /// Total time spent on repairs 
- pub total_repair_time: Duration, - /// Average repair time - pub average_repair_time: Duration, - /// Number of retry attempts made - pub total_retry_attempts: u64, - /// Current worker status - pub status: WorkerStatus, - /// Last task completion time - pub last_task_time: Option, -} - -/// Worker status -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum WorkerStatus { - /// Worker is idle - Idle, - /// Worker is processing a task - Processing, - /// Worker is retrying a failed task - Retrying, - /// Worker is stopping - Stopping, - /// Worker has stopped - Stopped, - /// Worker encountered an error - Error(String), -} - -impl Default for WorkerStatus { - fn default() -> Self { - WorkerStatus::Idle - } -} - -/// Repair worker that executes healing tasks -pub struct RepairWorker { - config: RepairWorkerConfig, - statistics: Arc>, - status: Arc>, - result_tx: mpsc::Sender, - shutdown_tx: Option>, -} - -impl RepairWorker { - /// Create a new repair worker - pub fn new( - config: RepairWorkerConfig, - result_tx: mpsc::Sender, - ) -> Self { - Self { - config, - statistics: Arc::new(RwLock::new(WorkerStatistics::default())), - status: Arc::new(RwLock::new(WorkerStatus::Idle)), - result_tx, - shutdown_tx: None, - } - } - - /// Start the repair worker - pub async fn start(&mut self) -> Result<()> { - info!("Starting repair worker: {}", self.config.worker_id); - - let (_task_tx, task_rx) = mpsc::channel(100); - let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1); - - self.shutdown_tx = Some(shutdown_tx); - - // Update status - { - let mut status = self.status.write().await; - *status = WorkerStatus::Idle; - } - - let config = self.config.clone(); - let statistics = Arc::clone(&self.statistics); - let status = Arc::clone(&self.status); - let result_tx = self.result_tx.clone(); - - // Start the worker loop - tokio::spawn(async move { - let mut task_rx = task_rx; - - loop { - tokio::select! 
{ - Some(task) = task_rx.recv() => { - if let Err(e) = Self::process_task( - &config, - &statistics, - &status, - &result_tx, - task, - ).await { - error!("Failed to process task: {}", e); - } - } - _ = shutdown_rx.recv() => { - info!("Shutdown signal received, stopping worker: {}", config.worker_id); - break; - } - } - } - - // Update status to stopped - let mut status = status.write().await; - *status = WorkerStatus::Stopped; - }); - - info!("Repair worker started: {}", self.config.worker_id); - Ok(()) - } - - /// Stop the repair worker - pub async fn stop(&mut self) -> Result<()> { - info!("Stopping repair worker: {}", self.config.worker_id); - - // Update status - { - let mut status = self.status.write().await; - *status = WorkerStatus::Stopping; - } - - // Send shutdown signal - if let Some(shutdown_tx) = &self.shutdown_tx { - let _ = shutdown_tx.send(()).await; - } - - // Wait for worker to stop - let mut attempts = 0; - while attempts < 10 { - let status = self.status.read().await; - if *status == WorkerStatus::Stopped { - break; - } - drop(status); - sleep(Duration::from_millis(100)).await; - attempts += 1; - } - - info!("Repair worker stopped: {}", self.config.worker_id); - Ok(()) - } - - /// Submit a task to the worker - pub async fn submit_task(&self, _task: HealTask) -> Result<()> { - // TODO: Implement task submission - Err(crate::error::Error::Other(anyhow::anyhow!("Task submission not implemented"))) - } - - /// Get worker statistics - pub async fn statistics(&self) -> WorkerStatistics { - self.statistics.read().await.clone() - } - - /// Get worker status - pub async fn status(&self) -> WorkerStatus { - self.status.read().await.clone() - } - - /// Process a single task - async fn process_task( - config: &RepairWorkerConfig, - statistics: &Arc>, - status: &Arc>, - result_tx: &mpsc::Sender, - task: HealTask, - ) -> Result<()> { - let task_id = task.id.clone(); - - // Update status to processing - { - let mut status = status.write().await; - *status = 
WorkerStatus::Processing; - } - - // Update statistics - { - let mut stats = statistics.write().await; - stats.total_tasks_processed += 1; - stats.status = WorkerStatus::Processing; - } - - info!("Processing repair task: {} (worker: {})", task_id, config.worker_id); - - let start_time = Instant::now(); - let mut attempt = 0; - let mut last_error = None; - - // Retry loop - while attempt < config.retry_config.max_attempts { - attempt += 1; - - if attempt > 1 { - // Update status to retrying - { - let mut status = status.write().await; - *status = WorkerStatus::Retrying; - } - - // Calculate backoff delay - let backoff_delay = if config.retry_config.exponential_backoff { - let delay = config.retry_config.initial_backoff * - (config.retry_config.backoff_multiplier.powi((attempt - 1) as i32)) as u32; - delay.min(config.retry_config.max_backoff) - } else { - config.retry_config.initial_backoff - }; - - warn!("Retrying task {} (attempt {}/{}), waiting {:?}", - task_id, attempt, config.retry_config.max_attempts, backoff_delay); - sleep(backoff_delay).await; - } - - // Attempt the repair operation - let result = timeout( - config.operation_timeout, - Self::perform_repair_operation(&task, config) - ).await; - - match result { - Ok(Ok(())) => { - // Success - let duration = start_time.elapsed(); - let heal_result = HealResult { - success: true, - original_issue: task.issue.clone(), - repair_duration: duration, - retry_attempts: attempt - 1, - error_message: None, - metadata: None, - completed_at: SystemTime::now(), - }; - - // Send result - if let Err(e) = result_tx.send(heal_result).await { - error!("Failed to send heal result: {}", e); - } - - // Update statistics - { - let mut stats = statistics.write().await; - stats.successful_repairs += 1; - stats.total_repair_time += duration; - stats.average_repair_time = if stats.total_tasks_processed > 0 { - Duration::from_secs_f64( - stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64 - ) - } else { - 
Duration::ZERO - }; - stats.total_retry_attempts += (attempt - 1) as u64; - stats.last_task_time = Some(SystemTime::now()); - stats.status = WorkerStatus::Idle; - } - - info!("Successfully completed repair task: {} (worker: {})", task_id, config.worker_id); - return Ok(()); - } - Ok(Err(e)) => { - // Operation failed - let error_msg = e.to_string(); - last_error = Some(e); - warn!("Repair operation failed for task {} (attempt {}/{}): {}", - task_id, attempt, config.retry_config.max_attempts, error_msg); - } - Err(_) => { - // Operation timed out - last_error = Some(crate::error::Error::Other(anyhow::anyhow!("Operation timed out"))); - warn!("Repair operation timed out for task {} (attempt {}/{})", - task_id, attempt, config.retry_config.max_attempts); - } - } - } - - // All attempts failed - let duration = start_time.elapsed(); - let heal_result = HealResult { - success: false, - original_issue: task.issue.clone(), - repair_duration: duration, - retry_attempts: attempt - 1, - error_message: last_error.map(|e| e.to_string()), - metadata: None, - completed_at: SystemTime::now(), - }; - - // Send result - if let Err(e) = result_tx.send(heal_result).await { - error!("Failed to send heal result: {}", e); - } - - // Update statistics - { - let mut stats = statistics.write().await; - stats.failed_repairs += 1; - stats.total_repair_time += duration; - stats.average_repair_time = if stats.total_tasks_processed > 0 { - Duration::from_secs_f64( - stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64 - ) - } else { - Duration::ZERO - }; - stats.total_retry_attempts += (attempt - 1) as u64; - stats.last_task_time = Some(SystemTime::now()); - stats.status = WorkerStatus::Idle; - } - - error!("Failed to complete repair task after {} attempts: {} (worker: {})", - attempt, task_id, config.worker_id); - Ok(()) - } - - /// Perform the actual repair operation - async fn perform_repair_operation(task: &HealTask, config: &RepairWorkerConfig) -> Result<()> { - if 
config.enable_detailed_logging { - debug!("Starting repair operation for task: {} (worker: {})", task.id, config.worker_id); - } - - // Simulate repair operation based on issue type - match task.issue.issue_type { - crate::scanner::HealthIssueType::MissingReplica => { - // Simulate replica repair - sleep(Duration::from_millis(100)).await; - if config.enable_detailed_logging { - debug!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object); - } - } - crate::scanner::HealthIssueType::ChecksumMismatch => { - // Simulate checksum repair - sleep(Duration::from_millis(200)).await; - if config.enable_detailed_logging { - debug!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object); - } - } - crate::scanner::HealthIssueType::DiskReadError => { - // Simulate disk error recovery - sleep(Duration::from_millis(300)).await; - if config.enable_detailed_logging { - debug!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object); - } - } - _ => { - // Generic repair for other issue types - sleep(Duration::from_millis(150)).await; - if config.enable_detailed_logging { - debug!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object); - } - } - } - - // Simulate occasional failures for testing - if task.retry_count > 0 && task.retry_count % 3 == 0 { - return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated repair failure"))); - } - - if config.enable_detailed_logging { - debug!("Completed repair operation for task: {} (worker: {})", task.id, config.worker_id); - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - - #[tokio::test] - async fn test_repair_worker_creation() { - let config = RepairWorkerConfig::default(); - let (result_tx, _result_rx) = mpsc::channel(100); - let worker = RepairWorker::new(config, result_tx); - - assert_eq!(worker.status().await, WorkerStatus::Idle); - } - - #[tokio::test] - async 
fn test_repair_worker_start_stop() { - let config = RepairWorkerConfig::default(); - let (result_tx, _result_rx) = mpsc::channel(100); - let mut worker = RepairWorker::new(config, result_tx); - - // Start worker - worker.start().await.unwrap(); - sleep(Duration::from_millis(100)).await; - - // Check status - let status = worker.status().await; - assert_eq!(status, WorkerStatus::Idle); - - // Stop worker - worker.stop().await.unwrap(); - sleep(Duration::from_millis(100)).await; - - // Check status - let status = worker.status().await; - assert_eq!(status, WorkerStatus::Stopped); - } - - #[tokio::test] - async fn test_repair_worker_statistics() { - let config = RepairWorkerConfig::default(); - let (result_tx, _result_rx) = mpsc::channel(100); - let worker = RepairWorker::new(config, result_tx); - - let stats = worker.statistics().await; - assert_eq!(stats.total_tasks_processed, 0); - assert_eq!(stats.successful_repairs, 0); - assert_eq!(stats.failed_repairs, 0); - assert_eq!(stats.status, WorkerStatus::Idle); - } -} \ No newline at end of file diff --git a/crates/ahm/src/heal/validation.rs b/crates/ahm/src/heal/validation.rs deleted file mode 100644 index 2bb481f0..00000000 --- a/crates/ahm/src/heal/validation.rs +++ /dev/null @@ -1,453 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, error, info, warn}; - -use crate::error::Result; -use super::{HealResult, HealTask}; - -/// Configuration for validation operations -#[derive(Debug, Clone)] -pub struct ValidationConfig { - /// Whether to enable validation after repair - pub enable_post_repair_validation: bool, - /// Timeout for validation operations - pub validation_timeout: Duration, - /// Whether to enable detailed validation logging - pub enable_detailed_logging: bool, - /// Maximum number of validation retries - pub max_validation_retries: u32, - /// Validation retry delay - pub validation_retry_delay: Duration, -} - -impl Default for ValidationConfig { - fn default() -> Self { - Self { - enable_post_repair_validation: true, - validation_timeout: Duration::from_secs(60), // 1 minute - max_validation_retries: 3, - validation_retry_delay: Duration::from_secs(5), - enable_detailed_logging: true, - } - } -} - -/// Validation result for a repair operation -#[derive(Debug, Clone)] -pub struct ValidationResult { - /// Whether validation passed - pub passed: bool, - /// Validation type - pub validation_type: ValidationType, - /// Detailed validation message - pub message: String, - /// Time taken for validation - pub duration: Duration, - /// Validation timestamp - pub timestamp: SystemTime, - /// Additional validation metadata - pub metadata: Option, -} - -/// Types of validation that can be performed -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ValidationType { - /// Checksum validation - Checksum, - /// File existence validation - FileExistence, - /// File size validation - FileSize, - /// File permissions validation - FilePermissions, - /// Metadata consistency validation - MetadataConsistency, - /// Replication status validation - ReplicationStatus, - /// Data integrity validation - DataIntegrity, - /// Custom validation - Custom(String), -} - -/// 
Statistics for validation operations -#[derive(Debug, Clone, Default)] -pub struct ValidationStatistics { - /// Total number of validations performed - pub total_validations: u64, - /// Number of successful validations - pub successful_validations: u64, - /// Number of failed validations - pub failed_validations: u64, - /// Total time spent on validation - pub total_validation_time: Duration, - /// Average validation time - pub average_validation_time: Duration, - /// Number of validation retries - pub total_validation_retries: u64, - /// Last validation time - pub last_validation_time: Option, -} - -/// Validator for repair operations -pub struct HealValidator { - config: ValidationConfig, - statistics: Arc>, -} - -impl HealValidator { - /// Create a new validator - pub fn new(config: ValidationConfig) -> Self { - Self { - config, - statistics: Arc::new(RwLock::new(ValidationStatistics::default())), - } - } - - /// Validate a repair operation - pub async fn validate_repair(&self, task: &HealTask, result: &HealResult) -> Result> { - if !self.config.enable_post_repair_validation { - return Ok(Vec::new()); - } - - let start_time = Instant::now(); - let mut validation_results = Vec::new(); - - info!("Starting validation for repair task: {}", task.id); - - // Perform different types of validation based on the issue type - match task.issue.issue_type { - crate::scanner::HealthIssueType::MissingReplica => { - validation_results.extend(self.validate_replica_repair(task, result).await?); - } - crate::scanner::HealthIssueType::ChecksumMismatch => { - validation_results.extend(self.validate_checksum_repair(task, result).await?); - } - crate::scanner::HealthIssueType::DiskReadError => { - validation_results.extend(self.validate_disk_repair(task, result).await?); - } - _ => { - validation_results.extend(self.validate_generic_repair(task, result).await?); - } - } - - let duration = start_time.elapsed(); - - // Update statistics - { - let mut stats = 
self.statistics.write().await; - stats.total_validations += validation_results.len() as u64; - stats.total_validation_time += duration; - stats.average_validation_time = if stats.total_validations > 0 { - Duration::from_secs_f64( - stats.total_validation_time.as_secs_f64() / stats.total_validations as f64 - ) - } else { - Duration::ZERO - }; - stats.last_validation_time = Some(SystemTime::now()); - - let successful_count = validation_results.iter().filter(|r| r.passed).count(); - let failed_count = validation_results.len() - successful_count; - stats.successful_validations += successful_count as u64; - stats.failed_validations += failed_count as u64; - } - - if self.config.enable_detailed_logging { - debug!("Validation completed for task {}: {} passed, {} failed", - task.id, - validation_results.iter().filter(|r| r.passed).count(), - validation_results.iter().filter(|r| !r.passed).count() - ); - } - - Ok(validation_results) - } - - /// Validate replica repair - async fn validate_replica_repair(&self, task: &HealTask, _result: &HealResult) -> Result> { - let mut results = Vec::new(); - - // Validate file existence - let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await; - results.push(existence_result); - - // Validate replication status - let replication_result = self.validate_replication_status(&task.issue.bucket, &task.issue.object).await; - results.push(replication_result); - - Ok(results) - } - - /// Validate checksum repair - async fn validate_checksum_repair(&self, task: &HealTask, _result: &HealResult) -> Result> { - let mut results = Vec::new(); - - // Validate checksum - let checksum_result = self.validate_checksum(&task.issue.bucket, &task.issue.object).await; - results.push(checksum_result); - - // Validate data integrity - let integrity_result = self.validate_data_integrity(&task.issue.bucket, &task.issue.object).await; - results.push(integrity_result); - - Ok(results) - } - - /// Validate disk repair - async 
fn validate_disk_repair(&self, task: &HealTask, _result: &HealResult) -> Result> { - let mut results = Vec::new(); - - // Validate file existence - let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await; - results.push(existence_result); - - // Validate file permissions - let permissions_result = self.validate_file_permissions(&task.issue.bucket, &task.issue.object).await; - results.push(permissions_result); - - Ok(results) - } - - /// Validate generic repair - async fn validate_generic_repair(&self, task: &HealTask, _result: &HealResult) -> Result> { - let mut results = Vec::new(); - - // Validate file existence - let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await; - results.push(existence_result); - - // Validate metadata consistency - let metadata_result = self.validate_metadata_consistency(&task.issue.bucket, &task.issue.object).await; - results.push(metadata_result); - - Ok(results) - } - - /// Validate file existence - async fn validate_file_existence(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate file existence check - tokio::time::sleep(Duration::from_millis(10)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::FileExistence, - message: format!("File existence validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Validate checksum - async fn validate_checksum(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate checksum validation - tokio::time::sleep(Duration::from_millis(20)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::Checksum, - message: 
format!("Checksum validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Validate replication status - async fn validate_replication_status(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate replication status validation - tokio::time::sleep(Duration::from_millis(15)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::ReplicationStatus, - message: format!("Replication status validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Validate file permissions - async fn validate_file_permissions(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate file permissions validation - tokio::time::sleep(Duration::from_millis(5)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::FilePermissions, - message: format!("File permissions validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Validate metadata consistency - async fn validate_metadata_consistency(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate metadata consistency validation - tokio::time::sleep(Duration::from_millis(25)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::MetadataConsistency, - message: format!("Metadata consistency validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Validate data integrity - async fn 
validate_data_integrity(&self, bucket: &str, object: &str) -> ValidationResult { - let start_time = Instant::now(); - - // Simulate data integrity validation - tokio::time::sleep(Duration::from_millis(30)).await; - - let duration = start_time.elapsed(); - let passed = true; // Simulate successful validation - - ValidationResult { - passed, - validation_type: ValidationType::DataIntegrity, - message: format!("Data integrity validation for {}/{}", bucket, object), - duration, - timestamp: SystemTime::now(), - metadata: None, - } - } - - /// Get validation statistics - pub async fn statistics(&self) -> ValidationStatistics { - self.statistics.read().await.clone() - } - - /// Reset validation statistics - pub async fn reset_statistics(&self) { - let mut stats = self.statistics.write().await; - *stats = ValidationStatistics::default(); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - - #[tokio::test] - async fn test_validator_creation() { - let config = ValidationConfig::default(); - let validator = HealValidator::new(config); - - let stats = validator.statistics().await; - assert_eq!(stats.total_validations, 0); - } - - #[tokio::test] - async fn test_validate_repair() { - let config = ValidationConfig::default(); - let validator = HealValidator::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let task = super::HealTask::new(issue); - let result = super::HealResult { - success: true, - original_issue: task.issue.clone(), - repair_duration: Duration::from_secs(1), - retry_attempts: 0, - error_message: None, - metadata: None, - completed_at: SystemTime::now(), - }; - - let validation_results = validator.validate_repair(&task, &result).await.unwrap(); - assert!(!validation_results.is_empty()); 
- - let stats = validator.statistics().await; - assert_eq!(stats.total_validations, validation_results.len() as u64); - } - - #[tokio::test] - async fn test_validation_disabled() { - let mut config = ValidationConfig::default(); - config.enable_post_repair_validation = false; - let validator = HealValidator::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let task = super::HealTask::new(issue); - let result = super::HealResult { - success: true, - original_issue: task.issue.clone(), - repair_duration: Duration::from_secs(1), - retry_attempts: 0, - error_message: None, - metadata: None, - completed_at: SystemTime::now(), - }; - - let validation_results = validator.validate_repair(&task, &result).await.unwrap(); - assert!(validation_results.is_empty()); - } -} \ No newline at end of file diff --git a/crates/ahm/src/lib.rs b/crates/ahm/src/lib.rs index d3d65619..d3f6e151 100644 --- a/crates/ahm/src/lib.rs +++ b/crates/ahm/src/lib.rs @@ -17,6 +17,7 @@ use tokio_util::sync::CancellationToken; pub mod error; pub mod scanner; +pub mod metrics; pub use error::{Error, Result}; pub use scanner::{ diff --git a/crates/ahm/src/metrics.rs b/crates/ahm/src/metrics.rs new file mode 100644 index 00000000..b541b2fc --- /dev/null +++ b/crates/ahm/src/metrics.rs @@ -0,0 +1,284 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + collections::HashMap, + sync::atomic::{AtomicU64, Ordering}, + time::{Duration, SystemTime}, +}; + +use serde::{Deserialize, Serialize}; +use tracing::info; + +/// Scanner metrics similar to MinIO's scanner metrics +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ScannerMetrics { + /// Total objects scanned since server start + pub objects_scanned: u64, + /// Total object versions scanned since server start + pub versions_scanned: u64, + /// Total directories scanned since server start + pub directories_scanned: u64, + /// Total bucket scans started since server start + pub bucket_scans_started: u64, + /// Total bucket scans finished since server start + pub bucket_scans_finished: u64, + /// Total objects with health issues found + pub objects_with_issues: u64, + /// Total heal tasks queued + pub heal_tasks_queued: u64, + /// Total heal tasks completed + pub heal_tasks_completed: u64, + /// Total heal tasks failed + pub heal_tasks_failed: u64, + /// Last scan activity time + pub last_activity: Option, + /// Current scan cycle + pub current_cycle: u64, + /// Total scan cycles completed + pub total_cycles: u64, + /// Current scan duration + pub current_scan_duration: Option, + /// Average scan duration + pub avg_scan_duration: Duration, + /// Objects scanned per second + pub objects_per_second: f64, + /// Buckets scanned per second + pub buckets_per_second: f64, + /// Storage metrics by bucket + pub bucket_metrics: HashMap, + /// Disk metrics + pub disk_metrics: HashMap, +} + +/// Bucket-specific metrics +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct BucketMetrics { + /// Bucket name + pub bucket: String, + /// Total objects in bucket + pub total_objects: u64, + /// Total size of objects in bucket (bytes) + pub total_size: u64, + /// Objects with health issues + pub objects_with_issues: u64, + /// Last 
scan time + pub last_scan_time: Option, + /// Scan duration + pub scan_duration: Option, + /// Heal tasks queued for this bucket + pub heal_tasks_queued: u64, + /// Heal tasks completed for this bucket + pub heal_tasks_completed: u64, + /// Heal tasks failed for this bucket + pub heal_tasks_failed: u64, +} + +/// Disk-specific metrics +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct DiskMetrics { + /// Disk path + pub disk_path: String, + /// Total disk space (bytes) + pub total_space: u64, + /// Used disk space (bytes) + pub used_space: u64, + /// Free disk space (bytes) + pub free_space: u64, + /// Objects scanned on this disk + pub objects_scanned: u64, + /// Objects with issues on this disk + pub objects_with_issues: u64, + /// Last scan time + pub last_scan_time: Option, + /// Whether disk is online + pub is_online: bool, + /// Whether disk is being scanned + pub is_scanning: bool, +} + +/// Thread-safe metrics collector +pub struct MetricsCollector { + /// Atomic counters for real-time metrics + objects_scanned: AtomicU64, + versions_scanned: AtomicU64, + directories_scanned: AtomicU64, + bucket_scans_started: AtomicU64, + bucket_scans_finished: AtomicU64, + objects_with_issues: AtomicU64, + heal_tasks_queued: AtomicU64, + heal_tasks_completed: AtomicU64, + heal_tasks_failed: AtomicU64, + current_cycle: AtomicU64, + total_cycles: AtomicU64, +} + +impl MetricsCollector { + /// Create a new metrics collector + pub fn new() -> Self { + Self { + objects_scanned: AtomicU64::new(0), + versions_scanned: AtomicU64::new(0), + directories_scanned: AtomicU64::new(0), + bucket_scans_started: AtomicU64::new(0), + bucket_scans_finished: AtomicU64::new(0), + objects_with_issues: AtomicU64::new(0), + heal_tasks_queued: AtomicU64::new(0), + heal_tasks_completed: AtomicU64::new(0), + heal_tasks_failed: AtomicU64::new(0), + current_cycle: AtomicU64::new(0), + total_cycles: AtomicU64::new(0), + } + } + + /// Increment objects scanned count + pub fn 
increment_objects_scanned(&self, count: u64) { + self.objects_scanned.fetch_add(count, Ordering::Relaxed); + } + + /// Increment versions scanned count + pub fn increment_versions_scanned(&self, count: u64) { + self.versions_scanned.fetch_add(count, Ordering::Relaxed); + } + + /// Increment directories scanned count + pub fn increment_directories_scanned(&self, count: u64) { + self.directories_scanned.fetch_add(count, Ordering::Relaxed); + } + + /// Increment bucket scans started count + pub fn increment_bucket_scans_started(&self, count: u64) { + self.bucket_scans_started.fetch_add(count, Ordering::Relaxed); + } + + /// Increment bucket scans finished count + pub fn increment_bucket_scans_finished(&self, count: u64) { + self.bucket_scans_finished.fetch_add(count, Ordering::Relaxed); + } + + /// Increment objects with issues count + pub fn increment_objects_with_issues(&self, count: u64) { + self.objects_with_issues.fetch_add(count, Ordering::Relaxed); + } + + /// Increment heal tasks queued count + pub fn increment_heal_tasks_queued(&self, count: u64) { + self.heal_tasks_queued.fetch_add(count, Ordering::Relaxed); + } + + /// Increment heal tasks completed count + pub fn increment_heal_tasks_completed(&self, count: u64) { + self.heal_tasks_completed.fetch_add(count, Ordering::Relaxed); + } + + /// Increment heal tasks failed count + pub fn increment_heal_tasks_failed(&self, count: u64) { + self.heal_tasks_failed.fetch_add(count, Ordering::Relaxed); + } + + /// Set current cycle + pub fn set_current_cycle(&self, cycle: u64) { + self.current_cycle.store(cycle, Ordering::Relaxed); + } + + /// Increment total cycles + pub fn increment_total_cycles(&self) { + self.total_cycles.fetch_add(1, Ordering::Relaxed); + } + + /// Get current metrics snapshot + pub fn get_metrics(&self) -> ScannerMetrics { + ScannerMetrics { + objects_scanned: self.objects_scanned.load(Ordering::Relaxed), + versions_scanned: self.versions_scanned.load(Ordering::Relaxed), + directories_scanned: 
self.directories_scanned.load(Ordering::Relaxed), + bucket_scans_started: self.bucket_scans_started.load(Ordering::Relaxed), + bucket_scans_finished: self.bucket_scans_finished.load(Ordering::Relaxed), + objects_with_issues: self.objects_with_issues.load(Ordering::Relaxed), + heal_tasks_queued: self.heal_tasks_queued.load(Ordering::Relaxed), + heal_tasks_completed: self.heal_tasks_completed.load(Ordering::Relaxed), + heal_tasks_failed: self.heal_tasks_failed.load(Ordering::Relaxed), + last_activity: Some(SystemTime::now()), + current_cycle: self.current_cycle.load(Ordering::Relaxed), + total_cycles: self.total_cycles.load(Ordering::Relaxed), + current_scan_duration: None, // Will be set by scanner + avg_scan_duration: Duration::ZERO, // Will be calculated + objects_per_second: 0.0, // Will be calculated + buckets_per_second: 0.0, // Will be calculated + bucket_metrics: HashMap::new(), // Will be populated by scanner + disk_metrics: HashMap::new(), // Will be populated by scanner + } + } + + /// Reset all metrics + pub fn reset(&self) { + self.objects_scanned.store(0, Ordering::Relaxed); + self.versions_scanned.store(0, Ordering::Relaxed); + self.directories_scanned.store(0, Ordering::Relaxed); + self.bucket_scans_started.store(0, Ordering::Relaxed); + self.bucket_scans_finished.store(0, Ordering::Relaxed); + self.objects_with_issues.store(0, Ordering::Relaxed); + self.heal_tasks_queued.store(0, Ordering::Relaxed); + self.heal_tasks_completed.store(0, Ordering::Relaxed); + self.heal_tasks_failed.store(0, Ordering::Relaxed); + self.current_cycle.store(0, Ordering::Relaxed); + self.total_cycles.store(0, Ordering::Relaxed); + + info!("Scanner metrics reset"); + } +} + +impl Default for MetricsCollector { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_metrics_collector_creation() { + let collector = MetricsCollector::new(); + let metrics = collector.get_metrics(); + assert_eq!(metrics.objects_scanned, 0); 
+ assert_eq!(metrics.versions_scanned, 0); + } + + #[test] + fn test_metrics_increment() { + let collector = MetricsCollector::new(); + + collector.increment_objects_scanned(10); + collector.increment_versions_scanned(5); + collector.increment_objects_with_issues(2); + + let metrics = collector.get_metrics(); + assert_eq!(metrics.objects_scanned, 10); + assert_eq!(metrics.versions_scanned, 5); + assert_eq!(metrics.objects_with_issues, 2); + } + + #[test] + fn test_metrics_reset() { + let collector = MetricsCollector::new(); + + collector.increment_objects_scanned(10); + collector.reset(); + + let metrics = collector.get_metrics(); + assert_eq!(metrics.objects_scanned, 0); + } +} \ No newline at end of file diff --git a/crates/ahm/src/metrics/aggregator.rs b/crates/ahm/src/metrics/aggregator.rs deleted file mode 100644 index 961a1344..00000000 --- a/crates/ahm/src/metrics/aggregator.rs +++ /dev/null @@ -1,739 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::{ - collections::HashMap, - time::{Duration, SystemTime}, -}; - -use tracing::{debug, error, info, warn}; - -use crate::error::Result; - -use super::{ - AggregatedMetrics, DiskMetrics, HealMetrics, MetricsDataPoint, MetricsQuery, MetricsSummary, - NetworkMetrics, PolicyMetrics, ScanMetrics, SystemMetrics, -}; - -/// Configuration for the metrics aggregator -#[derive(Debug, Clone)] -pub struct AggregatorConfig { - /// Default aggregation interval - pub default_interval: Duration, - /// Maximum number of data points to keep in memory - pub max_data_points: usize, - /// Whether to enable automatic aggregation - pub enable_auto_aggregation: bool, - /// Aggregation window size - pub aggregation_window: Duration, - /// Whether to enable data compression - pub enable_compression: bool, - /// Compression threshold (number of points before compression) - pub compression_threshold: usize, - /// Whether to enable outlier detection - pub enable_outlier_detection: bool, - /// Outlier detection threshold (standard 
deviations) - pub outlier_threshold: f64, -} - -impl Default for AggregatorConfig { - fn default() -> Self { - Self { - default_interval: Duration::from_secs(300), // 5 minutes - max_data_points: 10000, - enable_auto_aggregation: true, - aggregation_window: Duration::from_secs(3600), // 1 hour - enable_compression: true, - compression_threshold: 1000, - enable_outlier_detection: true, - outlier_threshold: 2.0, // 2 standard deviations - } - } -} - -/// Metrics aggregator that processes and aggregates metrics data -#[derive(Debug, Clone)] -pub struct Aggregator { - config: AggregatorConfig, - data_points: Vec, - aggregation_cache: HashMap, - last_aggregation_time: SystemTime, - aggregation_count: u64, -} - -impl Aggregator { - /// Create a new metrics aggregator - pub async fn new(interval: Duration) -> Result { - let config = AggregatorConfig { - default_interval: interval, - ..Default::default() - }; - - Ok(Self { - config, - data_points: Vec::new(), - aggregation_cache: HashMap::new(), - last_aggregation_time: SystemTime::now(), - aggregation_count: 0, - }) - } - - /// Get the configuration - pub fn config(&self) -> &AggregatorConfig { - &self.config - } - - /// Add metrics data point - pub async fn add_data_point(&mut self, data_point: MetricsDataPoint) -> Result<()> { - self.data_points.push(data_point); - - // Trim old data points if we exceed the limit - if self.data_points.len() > self.config.max_data_points { - let excess = self.data_points.len() - self.config.max_data_points; - self.data_points.drain(0..excess); - } - - // Auto-aggregate if enabled - if self.config.enable_auto_aggregation { - self.auto_aggregate().await?; - } - - Ok(()) - } - - /// Aggregate metrics based on query - pub async fn aggregate_metrics(&mut self, query: MetricsQuery) -> Result { - let start_time = SystemTime::now(); - - // Check cache first - let cache_key = self.generate_cache_key(&query); - if let Some(cached) = self.aggregation_cache.get(&cache_key) { - debug!("Returning 
cached aggregation result"); - return Ok(cached.clone()); - } - - // Filter data points by time range - let filtered_points: Vec<&MetricsDataPoint> = self - .data_points - .iter() - .filter(|point| { - point.timestamp >= query.start_time && point.timestamp <= query.end_time - }) - .collect(); - - if filtered_points.is_empty() { - warn!("No data points found for the specified time range"); - return Ok(AggregatedMetrics { - query, - data_points: Vec::new(), - summary: MetricsSummary::default(), - }); - } - - // Aggregate data points - let aggregated_points = self.aggregate_data_points(&filtered_points, &query).await?; - - // Generate summary - let summary = self.generate_summary(&aggregated_points, &query).await?; - - let result = AggregatedMetrics { - query, - data_points: aggregated_points, - summary, - }; - - // Cache the result - self.aggregation_cache.insert(cache_key, result.clone()); - - let aggregation_time = start_time.elapsed(); - debug!("Metrics aggregation completed in {:?}", aggregation_time); - - Ok(result) - } - - /// Auto-aggregate data points - async fn auto_aggregate(&mut self) -> Result<()> { - let now = SystemTime::now(); - - // Check if it's time to aggregate - if now.duration_since(self.last_aggregation_time).unwrap() < self.config.aggregation_window { - return Ok(()); - } - - // Perform aggregation - let window_start = now - self.config.aggregation_window; - let query = MetricsQuery { - start_time: window_start, - end_time: now, - interval: self.config.default_interval, - metrics: vec![], // All metrics - severity_filter: None, - limit: None, - }; - - let _aggregated = self.aggregate_metrics(query).await?; - - self.last_aggregation_time = now; - self.aggregation_count += 1; - - info!("Auto-aggregation completed, count: {}", self.aggregation_count); - - Ok(()) - } - - /// Aggregate data points based on interval - async fn aggregate_data_points( - &self, - points: &[&MetricsDataPoint], - query: &MetricsQuery, - ) -> Result> { - if 
points.is_empty() { - return Ok(Vec::new()); - } - - let mut aggregated_points = Vec::new(); - let mut current_bucket_start = query.start_time; - let mut current_bucket_points = Vec::new(); - - for point in points { - if point.timestamp >= current_bucket_start + query.interval { - // Process current bucket - if !current_bucket_points.is_empty() { - let aggregated = self.aggregate_bucket(¤t_bucket_points, current_bucket_start).await?; - aggregated_points.push(aggregated); - } - - // Start new bucket - current_bucket_start = current_bucket_start + query.interval; - current_bucket_points.clear(); - } - - current_bucket_points.push(*point); - } - - // Process last bucket - if !current_bucket_points.is_empty() { - let aggregated = self.aggregate_bucket(¤t_bucket_points, current_bucket_start).await?; - aggregated_points.push(aggregated); - } - - Ok(aggregated_points) - } - - /// Aggregate a bucket of data points - async fn aggregate_bucket( - &self, - points: &[&MetricsDataPoint], - bucket_start: SystemTime, - ) -> Result { - let mut aggregated = MetricsDataPoint { - timestamp: bucket_start, - system: None, - network: None, - disk_io: None, - scan: None, - heal: None, - policy: None, - }; - - // Aggregate system metrics - let system_metrics: Vec<&SystemMetrics> = points - .iter() - .filter_map(|p| p.system.as_ref()) - .collect(); - - if !system_metrics.is_empty() { - aggregated.system = Some(self.aggregate_system_metrics(&system_metrics).await?); - } - - // Aggregate network metrics - let network_metrics: Vec<&NetworkMetrics> = points - .iter() - .filter_map(|p| p.network.as_ref()) - .collect(); - - if !network_metrics.is_empty() { - aggregated.network = Some(self.aggregate_network_metrics(&network_metrics).await?); - } - - // Aggregate disk I/O metrics - let disk_metrics: Vec<&DiskMetrics> = points - .iter() - .filter_map(|p| p.disk_io.as_ref()) - .collect(); - - if !disk_metrics.is_empty() { - aggregated.disk_io = 
Some(self.aggregate_disk_metrics(&disk_metrics).await?); - } - - // Aggregate scan metrics - let scan_metrics: Vec<&ScanMetrics> = points - .iter() - .filter_map(|p| p.scan.as_ref()) - .collect(); - - if !scan_metrics.is_empty() { - aggregated.scan = Some(self.aggregate_scan_metrics(&scan_metrics).await?); - } - - // Aggregate heal metrics - let heal_metrics: Vec<&HealMetrics> = points - .iter() - .filter_map(|p| p.heal.as_ref()) - .collect(); - - if !heal_metrics.is_empty() { - aggregated.heal = Some(self.aggregate_heal_metrics(&heal_metrics).await?); - } - - // Aggregate policy metrics - let policy_metrics: Vec<&PolicyMetrics> = points - .iter() - .filter_map(|p| p.policy.as_ref()) - .collect(); - - if !policy_metrics.is_empty() { - aggregated.policy = Some(self.aggregate_policy_metrics(&policy_metrics).await?); - } - - Ok(aggregated) - } - - /// Aggregate system metrics - async fn aggregate_system_metrics(&self, metrics: &[&SystemMetrics]) -> Result { - if metrics.is_empty() { - return Ok(SystemMetrics::default()); - } - - let cpu_usage: f64 = metrics.iter().map(|m| m.cpu_usage).sum::() / metrics.len() as f64; - let memory_usage: f64 = metrics.iter().map(|m| m.memory_usage).sum::() / metrics.len() as f64; - let disk_usage: f64 = metrics.iter().map(|m| m.disk_usage).sum::() / metrics.len() as f64; - let system_load: f64 = metrics.iter().map(|m| m.system_load).sum::() / metrics.len() as f64; - let active_operations: u64 = metrics.iter().map(|m| m.active_operations).sum::() / metrics.len() as u64; - - // Aggregate health issues - let mut health_issues = HashMap::new(); - for metric in metrics { - for (severity, count) in &metric.health_issues { - *health_issues.entry(*severity).or_insert(0) += count; - } - } - - Ok(SystemMetrics { - timestamp: SystemTime::now(), - cpu_usage, - memory_usage, - disk_usage, - network_io: NetworkMetrics::default(), // Will be aggregated separately - disk_io: DiskMetrics::default(), // Will be aggregated separately - active_operations, 
- system_load, - health_issues, - scan_metrics: ScanMetrics::default(), // Will be aggregated separately - heal_metrics: HealMetrics::default(), // Will be aggregated separately - policy_metrics: PolicyMetrics::default(), // Will be aggregated separately - }) - } - - /// Aggregate network metrics - async fn aggregate_network_metrics(&self, metrics: &[&NetworkMetrics]) -> Result { - if metrics.is_empty() { - return Ok(NetworkMetrics::default()); - } - - let bytes_received_per_sec: u64 = metrics.iter().map(|m| m.bytes_received_per_sec).sum::() / metrics.len() as u64; - let bytes_sent_per_sec: u64 = metrics.iter().map(|m| m.bytes_sent_per_sec).sum::() / metrics.len() as u64; - let packets_received_per_sec: u64 = metrics.iter().map(|m| m.packets_received_per_sec).sum::() / metrics.len() as u64; - let packets_sent_per_sec: u64 = metrics.iter().map(|m| m.packets_sent_per_sec).sum::() / metrics.len() as u64; - - Ok(NetworkMetrics { - bytes_received_per_sec, - bytes_sent_per_sec, - packets_received_per_sec, - packets_sent_per_sec, - }) - } - - /// Aggregate disk metrics - async fn aggregate_disk_metrics(&self, metrics: &[&DiskMetrics]) -> Result { - if metrics.is_empty() { - return Ok(DiskMetrics::default()); - } - - let bytes_read_per_sec: u64 = metrics.iter().map(|m| m.bytes_read_per_sec).sum::() / metrics.len() as u64; - let bytes_written_per_sec: u64 = metrics.iter().map(|m| m.bytes_written_per_sec).sum::() / metrics.len() as u64; - let read_ops_per_sec: u64 = metrics.iter().map(|m| m.read_ops_per_sec).sum::() / metrics.len() as u64; - let write_ops_per_sec: u64 = metrics.iter().map(|m| m.write_ops_per_sec).sum::() / metrics.len() as u64; - let avg_read_latency_ms: f64 = metrics.iter().map(|m| m.avg_read_latency_ms).sum::() / metrics.len() as f64; - let avg_write_latency_ms: f64 = metrics.iter().map(|m| m.avg_write_latency_ms).sum::() / metrics.len() as f64; - - Ok(DiskMetrics { - bytes_read_per_sec, - bytes_written_per_sec, - read_ops_per_sec, - write_ops_per_sec, - 
avg_read_latency_ms, - avg_write_latency_ms, - }) - } - - /// Aggregate scan metrics - async fn aggregate_scan_metrics(&self, metrics: &[&ScanMetrics]) -> Result { - if metrics.is_empty() { - return Ok(ScanMetrics::default()); - } - - let objects_scanned: u64 = metrics.iter().map(|m| m.objects_scanned).sum(); - let bytes_scanned: u64 = metrics.iter().map(|m| m.bytes_scanned).sum(); - let scan_duration: Duration = metrics.iter().map(|m| m.scan_duration).sum(); - let health_issues_found: u64 = metrics.iter().map(|m| m.health_issues_found).sum(); - let scan_cycles_completed: u64 = metrics.iter().map(|m| m.scan_cycles_completed).sum(); - - // Calculate rates - let total_duration_secs = scan_duration.as_secs_f64(); - let scan_rate_objects_per_sec = if total_duration_secs > 0.0 { - objects_scanned as f64 / total_duration_secs - } else { - 0.0 - }; - - let scan_rate_bytes_per_sec = if total_duration_secs > 0.0 { - bytes_scanned as f64 / total_duration_secs - } else { - 0.0 - }; - - Ok(ScanMetrics { - objects_scanned, - bytes_scanned, - scan_duration, - scan_rate_objects_per_sec, - scan_rate_bytes_per_sec, - health_issues_found, - scan_cycles_completed, - last_scan_time: metrics.last().and_then(|m| m.last_scan_time), - }) - } - - /// Aggregate heal metrics - async fn aggregate_heal_metrics(&self, metrics: &[&HealMetrics]) -> Result { - if metrics.is_empty() { - return Ok(HealMetrics::default()); - } - - let total_repairs: u64 = metrics.iter().map(|m| m.total_repairs).sum(); - let successful_repairs: u64 = metrics.iter().map(|m| m.successful_repairs).sum(); - let failed_repairs: u64 = metrics.iter().map(|m| m.failed_repairs).sum(); - let total_repair_time: Duration = metrics.iter().map(|m| m.total_repair_time).sum(); - let total_retry_attempts: u64 = metrics.iter().map(|m| m.total_retry_attempts).sum(); - - // Calculate average repair time - let average_repair_time = if total_repairs > 0 { - let total_ms = total_repair_time.as_millis() as u64; - 
Duration::from_millis(total_ms / total_repairs) - } else { - Duration::ZERO - }; - - // Get latest values for current state - let active_repair_workers = metrics.last().map(|m| m.active_repair_workers).unwrap_or(0); - let queued_repair_tasks = metrics.last().map(|m| m.queued_repair_tasks).unwrap_or(0); - let last_repair_time = metrics.last().and_then(|m| m.last_repair_time); - - Ok(HealMetrics { - total_repairs, - successful_repairs, - failed_repairs, - total_repair_time, - average_repair_time, - active_repair_workers, - queued_repair_tasks, - last_repair_time, - total_retry_attempts, - }) - } - - /// Aggregate policy metrics - async fn aggregate_policy_metrics(&self, metrics: &[&PolicyMetrics]) -> Result { - if metrics.is_empty() { - return Ok(PolicyMetrics::default()); - } - - let total_evaluations: u64 = metrics.iter().map(|m| m.total_evaluations).sum(); - let allowed_operations: u64 = metrics.iter().map(|m| m.allowed_operations).sum(); - let denied_operations: u64 = metrics.iter().map(|m| m.denied_operations).sum(); - let scan_policy_evaluations: u64 = metrics.iter().map(|m| m.scan_policy_evaluations).sum(); - let heal_policy_evaluations: u64 = metrics.iter().map(|m| m.heal_policy_evaluations).sum(); - let retention_policy_evaluations: u64 = metrics.iter().map(|m| m.retention_policy_evaluations).sum(); - let total_evaluation_time: Duration = metrics.iter().map(|m| m.average_evaluation_time).sum(); - - // Calculate average evaluation time - let average_evaluation_time = if total_evaluations > 0 { - let total_ms = total_evaluation_time.as_millis() as u64; - Duration::from_millis(total_ms / total_evaluations) - } else { - Duration::ZERO - }; - - Ok(PolicyMetrics { - total_evaluations, - allowed_operations, - denied_operations, - scan_policy_evaluations, - heal_policy_evaluations, - retention_policy_evaluations, - average_evaluation_time, - }) - } - - /// Generate summary statistics - async fn generate_summary( - &self, - data_points: &[MetricsDataPoint], - query: 
&MetricsQuery, - ) -> Result { - let total_points = data_points.len() as u64; - let time_range = query.end_time.duration_since(query.start_time).unwrap_or(Duration::ZERO); - - // Calculate averages from system metrics - let system_metrics: Vec<&SystemMetrics> = data_points - .iter() - .filter_map(|p| p.system.as_ref()) - .collect(); - - let avg_cpu_usage = if !system_metrics.is_empty() { - system_metrics.iter().map(|m| m.cpu_usage).sum::() / system_metrics.len() as f64 - } else { - 0.0 - }; - - let avg_memory_usage = if !system_metrics.is_empty() { - system_metrics.iter().map(|m| m.memory_usage).sum::() / system_metrics.len() as f64 - } else { - 0.0 - }; - - let avg_disk_usage = if !system_metrics.is_empty() { - system_metrics.iter().map(|m| m.disk_usage).sum::() / system_metrics.len() as f64 - } else { - 0.0 - }; - - // Calculate totals from scan and heal metrics - let scan_metrics: Vec<&ScanMetrics> = data_points - .iter() - .filter_map(|p| p.scan.as_ref()) - .collect(); - - let total_objects_scanned = scan_metrics.iter().map(|m| m.objects_scanned).sum(); - let total_health_issues = scan_metrics.iter().map(|m| m.health_issues_found).sum(); - - let heal_metrics: Vec<&HealMetrics> = data_points - .iter() - .filter_map(|p| p.heal.as_ref()) - .collect(); - - let total_repairs = heal_metrics.iter().map(|m| m.total_repairs).sum(); - let successful_repairs: u64 = heal_metrics.iter().map(|m| m.successful_repairs).sum(); - let repair_success_rate = if total_repairs > 0 { - successful_repairs as f64 / total_repairs as f64 - } else { - 0.0 - }; - - Ok(MetricsSummary { - total_points, - time_range, - avg_cpu_usage, - avg_memory_usage, - avg_disk_usage, - total_objects_scanned, - total_repairs, - repair_success_rate, - total_health_issues, - }) - } - - /// Generate cache key for query - fn generate_cache_key(&self, query: &MetricsQuery) -> String { - format!( - "{:?}_{:?}_{:?}_{:?}", - query.start_time, query.end_time, query.interval, query.metrics - ) - } - - /// Clear old 
cache entries - pub async fn clear_old_cache(&mut self) -> Result<()> { - let now = SystemTime::now(); - let retention_period = Duration::from_secs(3600); // 1 hour - - self.aggregation_cache.retain(|_key, value| { - if let Some(latest_point) = value.data_points.last() { - now.duration_since(latest_point.timestamp).unwrap_or(Duration::ZERO) < retention_period - } else { - false - } - }); - - info!("Cleared old cache entries, remaining: {}", self.aggregation_cache.len()); - Ok(()) - } - - /// Get aggregation statistics - pub fn get_statistics(&self) -> AggregatorStatistics { - AggregatorStatistics { - total_data_points: self.data_points.len(), - total_aggregations: self.aggregation_count, - cache_size: self.aggregation_cache.len(), - last_aggregation_time: self.last_aggregation_time, - config: self.config.clone(), - } - } -} - -/// Aggregator statistics -#[derive(Debug, Clone)] -pub struct AggregatorStatistics { - pub total_data_points: usize, - pub total_aggregations: u64, - pub cache_size: usize, - pub last_aggregation_time: SystemTime, - pub config: AggregatorConfig, -} - -impl Default for MetricsSummary { - fn default() -> Self { - Self { - total_points: 0, - time_range: Duration::ZERO, - avg_cpu_usage: 0.0, - avg_memory_usage: 0.0, - avg_disk_usage: 0.0, - total_objects_scanned: 0, - total_repairs: 0, - repair_success_rate: 0.0, - total_health_issues: 0, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::Severity; - - #[tokio::test] - async fn test_aggregator_creation() { - let aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); - - assert_eq!(aggregator.config().default_interval, Duration::from_secs(300)); - assert!(aggregator.config().enable_auto_aggregation); - } - - #[tokio::test] - async fn test_data_point_addition() { - let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); - - let data_point = MetricsDataPoint { - timestamp: SystemTime::now(), - system: Some(SystemMetrics::default()), 
- network: None, - disk_io: None, - scan: None, - heal: None, - policy: None, - }; - - aggregator.add_data_point(data_point).await.unwrap(); - - let stats = aggregator.get_statistics(); - assert_eq!(stats.total_data_points, 1); - } - - #[tokio::test] - async fn test_metrics_aggregation() { - let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); - - // Add some test data points - for i in 0..5 { - let mut system_metrics = SystemMetrics::default(); - system_metrics.cpu_usage = i as f64 * 10.0; - system_metrics.memory_usage = i as f64 * 20.0; - - let data_point = MetricsDataPoint { - timestamp: SystemTime::now() + Duration::from_secs(i * 60), - system: Some(system_metrics), - network: None, - disk_io: None, - scan: None, - heal: None, - policy: None, - }; - - aggregator.add_data_point(data_point).await.unwrap(); - } - - let query = MetricsQuery { - start_time: SystemTime::now(), - end_time: SystemTime::now() + Duration::from_secs(300), - interval: Duration::from_secs(60), - metrics: vec![MetricType::System], - severity_filter: None, - limit: None, - }; - - let result = aggregator.aggregate_metrics(query).await.unwrap(); - assert_eq!(result.data_points.len(), 5); - assert_eq!(result.summary.total_points, 5); - } - - #[tokio::test] - async fn test_system_metrics_aggregation() { - let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); - - let metrics = vec![ - SystemMetrics { - cpu_usage: 10.0, - memory_usage: 20.0, - disk_usage: 30.0, - ..Default::default() - }, - SystemMetrics { - cpu_usage: 20.0, - memory_usage: 40.0, - disk_usage: 60.0, - ..Default::default() - }, - ]; - - let aggregated = aggregator.aggregate_system_metrics(&metrics.iter().collect::>()).await.unwrap(); - - assert_eq!(aggregated.cpu_usage, 15.0); - assert_eq!(aggregated.memory_usage, 30.0); - assert_eq!(aggregated.disk_usage, 45.0); - } - - #[tokio::test] - async fn test_cache_clearing() { - let mut aggregator = 
Aggregator::new(Duration::from_secs(300)).await.unwrap(); - - // Add some cached data - let query = MetricsQuery { - start_time: SystemTime::now() - Duration::from_secs(3600), - end_time: SystemTime::now() - Duration::from_secs(3000), - interval: Duration::from_secs(60), - metrics: vec![], - severity_filter: None, - limit: None, - }; - - let _result = aggregator.aggregate_metrics(query).await.unwrap(); - - let stats_before = aggregator.get_statistics(); - assert_eq!(stats_before.cache_size, 1); - - aggregator.clear_old_cache().await.unwrap(); - - let stats_after = aggregator.get_statistics(); - assert_eq!(stats_after.cache_size, 0); - } -} \ No newline at end of file diff --git a/crates/ahm/src/metrics/collector.rs b/crates/ahm/src/metrics/collector.rs deleted file mode 100644 index 3932b94e..00000000 --- a/crates/ahm/src/metrics/collector.rs +++ /dev/null @@ -1,426 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::{ - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, error, info, warn}; - -use crate::{ - error::Result, - scanner::{HealthIssue, Severity}, -}; - -use super::{ - AggregatedMetrics, Aggregator, DiskMetrics, HealMetrics, MetricsQuery, MetricType, - NetworkMetrics, PolicyMetrics, ScanMetrics, SystemMetrics, -}; - -/// Configuration for the metrics collector -#[derive(Debug, Clone)] -pub struct CollectorConfig { - /// Collection interval - pub collection_interval: Duration, - /// Whether to enable detailed metrics collection - pub enable_detailed_metrics: bool, - /// Maximum number of metrics to keep in memory - pub max_metrics_in_memory: usize, - /// Whether to enable automatic aggregation - pub enable_auto_aggregation: bool, - /// Aggregation interval - pub aggregation_interval: Duration, - /// Whether to enable resource monitoring - pub enable_resource_monitoring: bool, - /// Resource monitoring interval - pub resource_monitoring_interval: Duration, - /// Whether to enable health issue tracking - 
pub enable_health_issue_tracking: bool, - /// Metrics retention period - pub metrics_retention_period: Duration, -} - -impl Default for CollectorConfig { - fn default() -> Self { - Self { - collection_interval: Duration::from_secs(30), // 30 seconds - enable_detailed_metrics: true, - max_metrics_in_memory: 10000, - enable_auto_aggregation: true, - aggregation_interval: Duration::from_secs(300), // 5 minutes - enable_resource_monitoring: true, - resource_monitoring_interval: Duration::from_secs(10), // 10 seconds - enable_health_issue_tracking: true, - metrics_retention_period: Duration::from_secs(86400 * 7), // 7 days - } - } -} - -/// Metrics collector that gathers system metrics -#[derive(Debug)] -pub struct Collector { - config: CollectorConfig, - metrics: Arc>>, - aggregator: Arc, - last_collection_time: Arc>, - collection_count: Arc>, - health_issues: Arc>>, -} - -impl Collector { - /// Create a new metrics collector - pub async fn new(config: CollectorConfig) -> Result { - let aggregator = Arc::new(Aggregator::new(config.aggregation_interval).await?); - - Ok(Self { - config, - metrics: Arc::new(RwLock::new(Vec::new())), - aggregator, - last_collection_time: Arc::new(RwLock::new(SystemTime::now())), - collection_count: Arc::new(RwLock::new(0)), - health_issues: Arc::new(RwLock::new(std::collections::HashMap::new())), - }) - } - - /// Get the configuration - pub fn config(&self) -> &CollectorConfig { - &self.config - } - - /// Collect current system metrics - pub async fn collect_metrics(&self) -> Result { - let start_time = Instant::now(); - - let mut metrics = SystemMetrics::default(); - metrics.timestamp = SystemTime::now(); - - // Collect system resource metrics - if self.config.enable_resource_monitoring { - self.collect_system_resources(&mut metrics).await?; - } - - // Collect scan metrics - self.collect_scan_metrics(&mut metrics).await?; - - // Collect heal metrics - self.collect_heal_metrics(&mut metrics).await?; - - // Collect policy metrics - 
self.collect_policy_metrics(&mut metrics).await?; - - // Collect health issues - if self.config.enable_health_issue_tracking { - self.collect_health_issues(&mut metrics).await?; - } - - // Store metrics - { - let mut metrics_store = self.metrics.write().await; - metrics_store.push(metrics.clone()); - - // Trim old metrics if we exceed the limit - if metrics_store.len() > self.config.max_metrics_in_memory { - let excess = metrics_store.len() - self.config.max_metrics_in_memory; - metrics_store.drain(0..excess); - } - } - - // Update collection statistics - { - let mut last_time = self.last_collection_time.write().await; - *last_time = metrics.timestamp; - - let mut count = self.collection_count.write().await; - *count += 1; - } - - let collection_time = start_time.elapsed(); - debug!("Metrics collection completed in {:?}", collection_time); - - Ok(metrics) - } - - /// Collect system resource metrics - async fn collect_system_resources(&self, metrics: &mut SystemMetrics) -> Result<()> { - // Simulate system resource collection - // In a real implementation, this would use system APIs - - metrics.cpu_usage = self.get_cpu_usage().await?; - metrics.memory_usage = self.get_memory_usage().await?; - metrics.disk_usage = self.get_disk_usage().await?; - metrics.system_load = self.get_system_load().await?; - metrics.active_operations = self.get_active_operations().await?; - - // Collect network metrics - metrics.network_io = self.get_network_metrics().await?; - - // Collect disk I/O metrics - metrics.disk_io = self.get_disk_io_metrics().await?; - - Ok(()) - } - - /// Collect scan metrics - async fn collect_scan_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { - // In a real implementation, this would get data from the scanner - metrics.scan_metrics = ScanMetrics::default(); - - // Simulate some scan metrics - metrics.scan_metrics.objects_scanned = 1000; - metrics.scan_metrics.bytes_scanned = 1024 * 1024 * 100; // 100 MB - metrics.scan_metrics.scan_duration = 
Duration::from_secs(60); - metrics.scan_metrics.scan_rate_objects_per_sec = 16.67; // 1000 / 60 - metrics.scan_metrics.scan_rate_bytes_per_sec = 1_747_200.0; // 100MB / 60s - metrics.scan_metrics.health_issues_found = 5; - metrics.scan_metrics.scan_cycles_completed = 1; - metrics.scan_metrics.last_scan_time = Some(SystemTime::now()); - - Ok(()) - } - - /// Collect heal metrics - async fn collect_heal_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { - // In a real implementation, this would get data from the heal system - metrics.heal_metrics = HealMetrics::default(); - - // Simulate some heal metrics - metrics.heal_metrics.total_repairs = 10; - metrics.heal_metrics.successful_repairs = 8; - metrics.heal_metrics.failed_repairs = 2; - metrics.heal_metrics.total_repair_time = Duration::from_secs(300); - metrics.heal_metrics.average_repair_time = Duration::from_secs(30); - metrics.heal_metrics.active_repair_workers = 2; - metrics.heal_metrics.queued_repair_tasks = 5; - metrics.heal_metrics.last_repair_time = Some(SystemTime::now()); - metrics.heal_metrics.total_retry_attempts = 3; - - Ok(()) - } - - /// Collect policy metrics - async fn collect_policy_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { - // In a real implementation, this would get data from the policy system - metrics.policy_metrics = PolicyMetrics::default(); - - // Simulate some policy metrics - metrics.policy_metrics.total_evaluations = 50; - metrics.policy_metrics.allowed_operations = 45; - metrics.policy_metrics.denied_operations = 5; - metrics.policy_metrics.scan_policy_evaluations = 20; - metrics.policy_metrics.heal_policy_evaluations = 20; - metrics.policy_metrics.retention_policy_evaluations = 10; - metrics.policy_metrics.average_evaluation_time = Duration::from_millis(10); - - Ok(()) - } - - /// Collect health issues - async fn collect_health_issues(&self, metrics: &mut SystemMetrics) -> Result<()> { - let health_issues = self.health_issues.read().await; - 
metrics.health_issues = health_issues.clone(); - Ok(()) - } - - /// Record a health issue - pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> { - let mut issues = self.health_issues.write().await; - let count = issues.entry(issue.severity).or_insert(0); - *count += 1; - - info!("Recorded health issue: {:?} - {}", issue.severity, issue.description); - Ok(()) - } - - /// Record an event (alias for record_health_issue) - pub async fn record_event(&self, issue: &HealthIssue) -> Result<()> { - self.record_health_issue(issue).await - } - - /// Clear health issues - pub async fn clear_health_issues(&self) -> Result<()> { - let mut health_issues = self.health_issues.write().await; - health_issues.clear(); - - info!("Cleared all health issues"); - Ok(()) - } - - /// Query metrics with aggregation - pub async fn query_metrics(&self, query: MetricsQuery) -> Result { - // In a real implementation, this would query the aggregator - // For now, we'll return a simple aggregated result - let aggregator = self.aggregator.as_ref(); - let mut aggregator_guard = aggregator.write().await; - aggregator_guard.aggregate_metrics(query).await - } - - /// Get metrics for a specific time range - pub async fn get_metrics_range(&self, start_time: SystemTime, end_time: SystemTime) -> Result> { - let metrics = self.metrics.read().await; - let filtered_metrics: Vec = metrics - .iter() - .filter(|m| m.timestamp >= start_time && m.timestamp <= end_time) - .cloned() - .collect(); - - Ok(filtered_metrics) - } - - /// Get latest metrics - pub async fn get_latest_metrics(&self) -> Result> { - let metrics = self.metrics.read().await; - Ok(metrics.last().cloned()) - } - - /// Get collection statistics - pub async fn get_collection_statistics(&self) -> CollectionStatistics { - let collection_count = *self.collection_count.read().await; - let last_collection_time = *self.last_collection_time.read().await; - let metrics_count = self.metrics.read().await.len(); - - 
CollectionStatistics { - total_collections: collection_count, - last_collection_time, - metrics_in_memory: metrics_count, - config: self.config.clone(), - } - } - - /// Simulated system resource collection methods - async fn get_cpu_usage(&self) -> Result { - // Simulate CPU usage collection - Ok(25.5) // 25.5% - } - - async fn get_memory_usage(&self) -> Result { - // Simulate memory usage collection - Ok(60.2) // 60.2% - } - - async fn get_disk_usage(&self) -> Result { - // Simulate disk usage collection - Ok(45.8) // 45.8% - } - - async fn get_system_load(&self) -> Result { - // Simulate system load collection - Ok(0.75) // 0.75 - } - - async fn get_active_operations(&self) -> Result { - // Simulate active operations count - Ok(15) - } - - async fn get_network_metrics(&self) -> Result { - // Simulate network metrics collection - Ok(NetworkMetrics { - bytes_received_per_sec: 1024 * 1024, // 1 MB/s - bytes_sent_per_sec: 512 * 1024, // 512 KB/s - packets_received_per_sec: 1000, - packets_sent_per_sec: 500, - }) - } - - async fn get_disk_io_metrics(&self) -> Result { - // Simulate disk I/O metrics collection - Ok(DiskMetrics { - bytes_read_per_sec: 2 * 1024 * 1024, // 2 MB/s - bytes_written_per_sec: 1 * 1024 * 1024, // 1 MB/s - read_ops_per_sec: 200, - write_ops_per_sec: 100, - avg_read_latency_ms: 5.0, - avg_write_latency_ms: 8.0, - }) - } -} - -/// Collection statistics -#[derive(Debug, Clone)] -pub struct CollectionStatistics { - pub total_collections: u64, - pub last_collection_time: SystemTime, - pub metrics_in_memory: usize, - pub config: CollectorConfig, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType}; - - #[tokio::test] - async fn test_collector_creation() { - let config = CollectorConfig::default(); - let collector = Collector::new(config).await.unwrap(); - - assert_eq!(collector.config().collection_interval, Duration::from_secs(30)); - assert!(collector.config().enable_detailed_metrics); - } - - 
#[tokio::test] - async fn test_metrics_collection() { - let config = CollectorConfig::default(); - let collector = Collector::new(config).await.unwrap(); - - let metrics = collector.collect_metrics().await.unwrap(); - assert_eq!(metrics.cpu_usage, 25.5); - assert_eq!(metrics.memory_usage, 60.2); - assert_eq!(metrics.disk_usage, 45.8); - } - - #[tokio::test] - async fn test_health_issue_recording() { - let config = CollectorConfig::default(); - let collector = Collector::new(config).await.unwrap(); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - collector.record_health_issue(&issue).await.unwrap(); - - let stats = collector.get_collection_statistics().await; - assert_eq!(stats.total_collections, 0); // No collection yet - } - - #[tokio::test] - async fn test_latest_metrics() { - let config = CollectorConfig::default(); - let collector = Collector::new(config).await.unwrap(); - - // Initially no metrics - let latest = collector.get_latest_metrics().await.unwrap(); - assert!(latest.is_none()); - - // Collect metrics - collector.collect_metrics().await.unwrap(); - - // Now should have metrics - let latest = collector.get_latest_metrics().await.unwrap(); - assert!(latest.is_some()); - } - - #[tokio::test] - async fn test_collection_statistics() { - let config = CollectorConfig::default(); - let collector = Collector::new(config).await.unwrap(); - - let stats = collector.get_collection_statistics().await; - assert_eq!(stats.total_collections, 0); - assert_eq!(stats.metrics_in_memory, 0); - - // Collect metrics - collector.collect_metrics().await.unwrap(); - - let stats = collector.get_collection_statistics().await; - assert_eq!(stats.total_collections, 1); - assert_eq!(stats.metrics_in_memory, 1); - } -} \ No newline at end of file diff --git 
a/crates/ahm/src/metrics/mod.rs b/crates/ahm/src/metrics/mod.rs deleted file mode 100644 index f5f6ae85..00000000 --- a/crates/ahm/src/metrics/mod.rs +++ /dev/null @@ -1,617 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Metrics collection and aggregation system -//! -//! The metrics subsystem provides comprehensive data collection and analysis: -//! - Real-time metrics collection from all subsystems -//! - Time-series data storage and aggregation -//! - Export capabilities for external monitoring systems -//! 
- Performance analytics and trend analysis - -pub mod collector; -pub mod aggregator; -pub mod storage; -pub mod reporter; - -pub use collector::{Collector, CollectorConfig}; -pub use aggregator::{Aggregator, AggregatorConfig}; -pub use storage::{Storage, StorageConfig}; -pub use reporter::{Reporter, ReporterConfig}; - -use std::time::{Duration, SystemTime}; -use std::collections::HashMap; -use serde::{Deserialize, Serialize}; - -use crate::scanner::{HealthIssue, Severity}; - -/// Metrics subsystem status -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Status { - /// Metrics system is initializing - Initializing, - /// Metrics system is running normally - Running, - /// Metrics system is degraded (some exporters failing) - Degraded, - /// Metrics system is stopping - Stopping, - /// Metrics system has stopped - Stopped, - /// Metrics system encountered an error - Error(String), -} - -/// Metric data point with timestamp and value -#[derive(Debug, Clone)] -pub struct MetricPoint { - /// Metric name - pub name: String, - /// Metric value - pub value: MetricValue, - /// Timestamp when metric was collected - pub timestamp: SystemTime, - /// Additional labels/tags - pub labels: HashMap, -} - -/// Different types of metric values -#[derive(Debug, Clone)] -pub enum MetricValue { - /// Counter that only increases - Counter(u64), - /// Gauge that can go up or down - Gauge(f64), - /// Histogram with buckets - Histogram { - count: u64, - sum: f64, - buckets: Vec, - }, - /// Summary with quantiles - Summary { - count: u64, - sum: f64, - quantiles: Vec, - }, -} - -/// Histogram bucket -#[derive(Debug, Clone)] -pub struct HistogramBucket { - /// Upper bound of the bucket - pub le: f64, - /// Count of observations in this bucket - pub count: u64, -} - -/// Summary quantile -#[derive(Debug, Clone)] -pub struct Quantile { - /// Quantile value (e.g., 0.5 for median) - pub quantile: f64, - /// Value at this quantile - pub value: f64, -} - -/// Aggregation functions for metrics 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub enum AggregationFunction { - Sum, - Average, - Min, - Max, - Count, - Rate, - Percentile(u8), -} - -/// Time window for aggregation -#[derive(Debug, Clone)] -pub struct TimeWindow { - /// Duration of the window - pub duration: Duration, - /// How often to create new windows - pub step: Duration, -} - -/// Metric export configuration -#[derive(Debug, Clone)] -pub struct ExportConfig { - /// Export format - pub format: ExportFormat, - /// Export destination - pub destination: ExportDestination, - /// Export interval - pub interval: Duration, - /// Metric filters - pub filters: Vec, -} - -/// Supported export formats -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ExportFormat { - /// Prometheus format - Prometheus, - /// JSON format - Json, - /// CSV format - Csv, - /// Custom format - Custom(String), -} - -/// Export destinations -#[derive(Debug, Clone)] -pub enum ExportDestination { - /// HTTP endpoint - Http { url: String, headers: HashMap }, - /// File system - File { path: String }, - /// Standard output - Stdout, - /// Custom destination - Custom(String), -} - -/// Metric filtering rules -#[derive(Debug, Clone)] -pub struct MetricFilter { - /// Metric name pattern (regex) - pub name_pattern: String, - /// Label filters - pub label_filters: HashMap, - /// Include or exclude matching metrics - pub include: bool, -} - -/// System-wide metrics that are automatically collected -pub mod system_metrics { - /// Object-related metrics - pub const OBJECTS_TOTAL: &str = "rustfs_objects_total"; - pub const OBJECTS_SIZE_BYTES: &str = "rustfs_objects_size_bytes"; - pub const OBJECTS_SCANNED_TOTAL: &str = "rustfs_objects_scanned_total"; - pub const OBJECTS_HEAL_OPERATIONS_TOTAL: &str = "rustfs_objects_heal_operations_total"; - - /// Scanner metrics - pub const SCAN_CYCLES_TOTAL: &str = "rustfs_scan_cycles_total"; - pub const SCAN_DURATION_SECONDS: &str = "rustfs_scan_duration_seconds"; - pub const SCAN_RATE_OBJECTS_PER_SECOND: 
&str = "rustfs_scan_rate_objects_per_second"; - pub const SCAN_RATE_BYTES_PER_SECOND: &str = "rustfs_scan_rate_bytes_per_second"; - - /// Health metrics - pub const HEALTH_ISSUES_TOTAL: &str = "rustfs_health_issues_total"; - pub const HEALTH_ISSUES_BY_SEVERITY: &str = "rustfs_health_issues_by_severity"; - pub const HEAL_SUCCESS_RATE: &str = "rustfs_heal_success_rate"; - - /// System resource metrics - pub const DISK_USAGE_BYTES: &str = "rustfs_disk_usage_bytes"; - pub const DISK_IOPS: &str = "rustfs_disk_iops"; - pub const MEMORY_USAGE_BYTES: &str = "rustfs_memory_usage_bytes"; - pub const CPU_USAGE_PERCENT: &str = "rustfs_cpu_usage_percent"; - - /// Performance metrics - pub const OPERATION_DURATION_SECONDS: &str = "rustfs_operation_duration_seconds"; - pub const ACTIVE_OPERATIONS: &str = "rustfs_active_operations"; - pub const THROUGHPUT_BYTES_PER_SECOND: &str = "rustfs_throughput_bytes_per_second"; -} - -/// System metrics collected by AHM -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SystemMetrics { - /// Timestamp when metrics were collected - pub timestamp: SystemTime, - /// CPU usage percentage - pub cpu_usage: f64, - /// Memory usage percentage - pub memory_usage: f64, - /// Disk usage percentage - pub disk_usage: f64, - /// Network I/O bytes per second - pub network_io: NetworkMetrics, - /// Disk I/O bytes per second - pub disk_io: DiskMetrics, - /// Active operations count - pub active_operations: u64, - /// System load average - pub system_load: f64, - /// Health issues count by severity - pub health_issues: std::collections::HashMap, - /// Scan metrics - pub scan_metrics: ScanMetrics, - /// Heal metrics - pub heal_metrics: HealMetrics, - /// Policy metrics - pub policy_metrics: PolicyMetrics, -} - -/// Network I/O metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct NetworkMetrics { - /// Bytes received per second - pub bytes_received_per_sec: u64, - /// Bytes sent per second - pub bytes_sent_per_sec: u64, - /// Packets 
received per second - pub packets_received_per_sec: u64, - /// Packets sent per second - pub packets_sent_per_sec: u64, -} - -/// Disk I/O metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DiskMetrics { - /// Bytes read per second - pub bytes_read_per_sec: u64, - /// Bytes written per second - pub bytes_written_per_sec: u64, - /// Read operations per second - pub read_ops_per_sec: u64, - /// Write operations per second - pub write_ops_per_sec: u64, - /// Average read latency in milliseconds - pub avg_read_latency_ms: f64, - /// Average write latency in milliseconds - pub avg_write_latency_ms: f64, -} - -/// Scan operation metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ScanMetrics { - /// Total objects scanned - pub objects_scanned: u64, - /// Total bytes scanned - pub bytes_scanned: u64, - /// Scan duration - pub scan_duration: Duration, - /// Scan rate (objects per second) - pub scan_rate_objects_per_sec: f64, - /// Scan rate (bytes per second) - pub scan_rate_bytes_per_sec: f64, - /// Health issues found - pub health_issues_found: u64, - /// Scan cycles completed - pub scan_cycles_completed: u64, - /// Last scan time - pub last_scan_time: Option, -} - -/// Heal operation metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HealMetrics { - /// Total repair operations - pub total_repairs: u64, - /// Successful repairs - pub successful_repairs: u64, - /// Failed repairs - pub failed_repairs: u64, - /// Total repair time - pub total_repair_time: Duration, - /// Average repair time - pub average_repair_time: Duration, - /// Active repair workers - pub active_repair_workers: u64, - /// Queued repair tasks - pub queued_repair_tasks: u64, - /// Last repair time - pub last_repair_time: Option, - /// Retry attempts - pub total_retry_attempts: u64, -} - -/// Policy evaluation metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PolicyMetrics { - /// Total policy evaluations - pub total_evaluations: 
u64, - /// Allowed operations - pub allowed_operations: u64, - /// Denied operations - pub denied_operations: u64, - /// Scan policy evaluations - pub scan_policy_evaluations: u64, - /// Heal policy evaluations - pub heal_policy_evaluations: u64, - /// Retention policy evaluations - pub retention_policy_evaluations: u64, - /// Average evaluation time - pub average_evaluation_time: Duration, -} - -impl Default for SystemMetrics { - fn default() -> Self { - Self { - timestamp: SystemTime::now(), - cpu_usage: 0.0, - memory_usage: 0.0, - disk_usage: 0.0, - network_io: NetworkMetrics::default(), - disk_io: DiskMetrics::default(), - active_operations: 0, - system_load: 0.0, - health_issues: std::collections::HashMap::new(), - scan_metrics: ScanMetrics::default(), - heal_metrics: HealMetrics::default(), - policy_metrics: PolicyMetrics::default(), - } - } -} - -impl Default for NetworkMetrics { - fn default() -> Self { - Self { - bytes_received_per_sec: 0, - bytes_sent_per_sec: 0, - packets_received_per_sec: 0, - packets_sent_per_sec: 0, - } - } -} - -impl Default for DiskMetrics { - fn default() -> Self { - Self { - bytes_read_per_sec: 0, - bytes_written_per_sec: 0, - read_ops_per_sec: 0, - write_ops_per_sec: 0, - avg_read_latency_ms: 0.0, - avg_write_latency_ms: 0.0, - } - } -} - -impl Default for ScanMetrics { - fn default() -> Self { - Self { - objects_scanned: 0, - bytes_scanned: 0, - scan_duration: Duration::ZERO, - scan_rate_objects_per_sec: 0.0, - scan_rate_bytes_per_sec: 0.0, - health_issues_found: 0, - scan_cycles_completed: 0, - last_scan_time: None, - } - } -} - -impl Default for HealMetrics { - fn default() -> Self { - Self { - total_repairs: 0, - successful_repairs: 0, - failed_repairs: 0, - total_repair_time: Duration::ZERO, - average_repair_time: Duration::ZERO, - active_repair_workers: 0, - queued_repair_tasks: 0, - last_repair_time: None, - total_retry_attempts: 0, - } - } -} - -impl Default for PolicyMetrics { - fn default() -> Self { - Self { - 
total_evaluations: 0, - allowed_operations: 0, - denied_operations: 0, - scan_policy_evaluations: 0, - heal_policy_evaluations: 0, - retention_policy_evaluations: 0, - average_evaluation_time: Duration::ZERO, - } - } -} - -/// Metrics query parameters -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsQuery { - /// Start time for the query - pub start_time: SystemTime, - /// End time for the query - pub end_time: SystemTime, - /// Metrics aggregation interval - pub interval: Duration, - /// Metrics to include in the query - pub metrics: Vec, - /// Filter by severity - pub severity_filter: Option, - /// Limit number of results - pub limit: Option, -} - -/// Types of metrics that can be queried -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub enum MetricType { - /// System metrics (CPU, memory, disk) - System, - /// Network metrics - Network, - /// Disk I/O metrics - DiskIo, - /// Scan metrics - Scan, - /// Heal metrics - Heal, - /// Policy metrics - Policy, - /// Health issues - HealthIssues, -} - -/// Aggregated metrics data -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AggregatedMetrics { - /// Query parameters used - pub query: MetricsQuery, - /// Aggregated data points - pub data_points: Vec, - /// Summary statistics - pub summary: MetricsSummary, -} - -/// Individual metrics data point -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsDataPoint { - /// Timestamp for this data point - pub timestamp: SystemTime, - /// System metrics - pub system: Option, - /// Network metrics - pub network: Option, - /// Disk I/O metrics - pub disk_io: Option, - /// Scan metrics - pub scan: Option, - /// Heal metrics - pub heal: Option, - /// Policy metrics - pub policy: Option, -} - -/// Summary statistics for aggregated metrics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsSummary { - /// Total data points - pub total_points: u64, - /// Time range covered - pub time_range: Duration, - 
/// Average CPU usage - pub avg_cpu_usage: f64, - /// Average memory usage - pub avg_memory_usage: f64, - /// Average disk usage - pub avg_disk_usage: f64, - /// Total objects scanned - pub total_objects_scanned: u64, - /// Total repairs performed - pub total_repairs: u64, - /// Success rate for repairs - pub repair_success_rate: f64, - /// Total health issues - pub total_health_issues: u64, -} - -/// Resource usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ResourceUsage { - /// Disk usage information - pub disk_usage: DiskUsage, - /// Memory usage information - pub memory_usage: MemoryUsage, - /// Network usage information - pub network_usage: NetworkUsage, - /// CPU usage information - pub cpu_usage: CpuUsage, -} - -/// Disk usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DiskUsage { - /// Total disk space in bytes - pub total_bytes: u64, - /// Used disk space in bytes - pub used_bytes: u64, - /// Available disk space in bytes - pub available_bytes: u64, - /// Usage percentage - pub usage_percentage: f64, -} - -/// Memory usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MemoryUsage { - /// Total memory in bytes - pub total_bytes: u64, - /// Used memory in bytes - pub used_bytes: u64, - /// Available memory in bytes - pub available_bytes: u64, - /// Usage percentage - pub usage_percentage: f64, -} - -/// Network usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct NetworkUsage { - /// Bytes received - pub bytes_received: u64, - /// Bytes sent - pub bytes_sent: u64, - /// Packets received - pub packets_received: u64, - /// Packets sent - pub packets_sent: u64, -} - -/// CPU usage information -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CpuUsage { - /// CPU usage percentage - pub usage_percentage: f64, - /// Number of CPU cores - pub cores: u32, - /// Load average - pub load_average: f64, -} - -#[cfg(test)] -mod tests { - use 
super::*; - - #[test] - fn test_system_metrics_creation() { - let metrics = SystemMetrics::default(); - assert_eq!(metrics.cpu_usage, 0.0); - assert_eq!(metrics.memory_usage, 0.0); - assert_eq!(metrics.active_operations, 0); - } - - #[test] - fn test_scan_metrics_creation() { - let metrics = ScanMetrics::default(); - assert_eq!(metrics.objects_scanned, 0); - assert_eq!(metrics.bytes_scanned, 0); - assert_eq!(metrics.scan_cycles_completed, 0); - } - - #[test] - fn test_heal_metrics_creation() { - let metrics = HealMetrics::default(); - assert_eq!(metrics.total_repairs, 0); - assert_eq!(metrics.successful_repairs, 0); - assert_eq!(metrics.failed_repairs, 0); - } - - #[test] - fn test_metrics_query_creation() { - let start_time = SystemTime::now(); - let end_time = start_time + Duration::from_secs(3600); - let query = MetricsQuery { - start_time, - end_time, - interval: Duration::from_secs(60), - metrics: vec![MetricType::System, MetricType::Scan], - severity_filter: Some(Severity::Critical), - limit: Some(100), - }; - - assert_eq!(query.metrics.len(), 2); - assert_eq!(query.interval, Duration::from_secs(60)); - assert_eq!(query.limit, Some(100)); - } -} \ No newline at end of file diff --git a/crates/ahm/src/metrics/reporter.rs b/crates/ahm/src/metrics/reporter.rs deleted file mode 100644 index 45aacc03..00000000 --- a/crates/ahm/src/metrics/reporter.rs +++ /dev/null @@ -1,861 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::{ - collections::HashMap, - fmt, - sync::Arc, - time::{Duration, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, error, info, warn}; -use serde::{Serialize, Deserialize}; - -use crate::error::Result; - -use super::{ - AggregatedMetrics, MetricsQuery, MetricsSummary, SystemMetrics, -}; - -/// Configuration for the metrics reporter -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ReporterConfig { - /// Whether to enable reporting - pub enabled: bool, - /// Report generation interval - pub report_interval: Duration, 
- /// Maximum number of reports to keep in memory - pub max_reports_in_memory: usize, - /// Alert thresholds - pub alert_thresholds: AlertThresholds, - /// Report output format - pub default_format: ReportFormat, - /// Whether to enable alerting - pub enable_alerts: bool, - /// Maximum number of alerts to keep in memory - pub max_alerts_in_memory: usize, - /// Report output directory - pub output_directory: Option, - /// Whether to enable HTTP reporting - pub enable_http_reporting: bool, - /// HTTP reporting endpoint - pub http_endpoint: Option, -} - -impl Default for ReporterConfig { - fn default() -> Self { - Self { - enabled: true, - report_interval: Duration::from_secs(60), // 1 minute - max_reports_in_memory: 1000, - alert_thresholds: AlertThresholds::default(), - default_format: ReportFormat::Json, - enable_alerts: true, - max_alerts_in_memory: 1000, - output_directory: None, - enable_http_reporting: false, - http_endpoint: None, - } - } -} - -/// Alert thresholds for metrics reporting -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AlertThresholds { - /// CPU usage threshold (percentage) - pub cpu_usage_threshold: f64, - /// Memory usage threshold (percentage) - pub memory_usage_threshold: f64, - /// Disk usage threshold (percentage) - pub disk_usage_threshold: f64, - /// System load threshold - pub system_load_threshold: f64, - /// Repair failure rate threshold (percentage) - pub repair_failure_rate_threshold: f64, - /// Health issues threshold (count) - pub health_issues_threshold: u64, -} - -impl Default for AlertThresholds { - fn default() -> Self { - Self { - cpu_usage_threshold: 80.0, - memory_usage_threshold: 85.0, - disk_usage_threshold: 90.0, - system_load_threshold: 5.0, - repair_failure_rate_threshold: 20.0, - health_issues_threshold: 10, - } - } -} - -/// Metrics reporter that generates and outputs metrics reports -pub struct Reporter { - config: ReporterConfig, - reports: Arc>>, - alerts: Arc>>, - last_report_time: Arc>, - 
report_count: Arc>, - alert_count: Arc>, -} - -impl Reporter { - /// Create a new metrics reporter - pub async fn new(config: ReporterConfig) -> Result { - Ok(Self { - config, - reports: Arc::new(RwLock::new(Vec::new())), - alerts: Arc::new(RwLock::new(Vec::new())), - last_report_time: Arc::new(RwLock::new(SystemTime::now())), - report_count: Arc::new(RwLock::new(0)), - alert_count: Arc::new(RwLock::new(0)), - }) - } - - /// Get the configuration - pub fn config(&self) -> &ReporterConfig { - &self.config - } - - /// Generate a metrics report - pub async fn generate_report(&self, metrics: &SystemMetrics) -> Result { - let start_time = SystemTime::now(); - - let report = MetricsReport { - timestamp: start_time, - metrics: metrics.clone(), - alerts: self.check_alerts(metrics).await?, - summary: self.generate_summary(metrics).await?, - format: self.config.default_format, - }; - - // Store report - { - let mut reports = self.reports.write().await; - reports.push(report.clone()); - - // Trim old reports if we exceed the limit - if reports.len() > self.config.max_reports_in_memory { - let excess = reports.len() - self.config.max_reports_in_memory; - reports.drain(0..excess); - } - } - - // Update statistics - { - let mut last_time = self.last_report_time.write().await; - *last_time = start_time; - - let mut count = self.report_count.write().await; - *count += 1; - } - - info!("Generated metrics report #{}", *self.report_count.read().await); - Ok(report) - } - - /// Generate a comprehensive report from aggregated metrics - pub async fn generate_comprehensive_report(&self, aggregated: &AggregatedMetrics) -> Result { - let start_time = SystemTime::now(); - - let report = ComprehensiveReport { - timestamp: start_time, - query: aggregated.query.clone(), - data_points: aggregated.data_points.len(), - summary: aggregated.summary.clone(), - alerts: self.check_aggregated_alerts(aggregated).await?, - trends: self.analyze_trends(aggregated).await?, - recommendations: 
self.generate_recommendations(aggregated).await?, - }; - - info!("Generated comprehensive report with {} data points", report.data_points); - Ok(report) - } - - /// Output a report in the specified format - pub async fn output_report(&self, report: &MetricsReport, format: ReportFormat) -> Result<()> { - match format { - ReportFormat::Console => self.output_to_console(report).await?, - ReportFormat::File => self.output_to_file(report).await?, - ReportFormat::Http => self.output_to_http(report).await?, - ReportFormat::Prometheus => self.output_prometheus(report).await?, - ReportFormat::Json => self.output_json(report).await?, - ReportFormat::Csv => self.output_csv(report).await?, - } - - Ok(()) - } - - /// Check for alerts based on metrics - async fn check_alerts(&self, metrics: &SystemMetrics) -> Result> { - let mut alerts = Vec::new(); - - // Check CPU usage - if metrics.cpu_usage > self.config.alert_thresholds.cpu_usage_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Warning, - category: AlertCategory::System, - message: format!("High CPU usage: {:.1}%", metrics.cpu_usage), - metric_value: metrics.cpu_usage, - threshold: self.config.alert_thresholds.cpu_usage_threshold, - }); - } - - // Check memory usage - if metrics.memory_usage > self.config.alert_thresholds.memory_usage_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Warning, - category: AlertCategory::System, - message: format!("High memory usage: {:.1}%", metrics.memory_usage), - metric_value: metrics.memory_usage, - threshold: self.config.alert_thresholds.memory_usage_threshold, - }); - } - - // Check disk usage - if metrics.disk_usage > self.config.alert_thresholds.disk_usage_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Critical, - category: AlertCategory::System, - message: format!("High disk usage: {:.1}%", metrics.disk_usage), - metric_value: metrics.disk_usage, - 
threshold: self.config.alert_thresholds.disk_usage_threshold, - }); - } - - // Check system load - if metrics.system_load > self.config.alert_thresholds.system_load_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Warning, - category: AlertCategory::System, - message: format!("High system load: {:.2}", metrics.system_load), - metric_value: metrics.system_load, - threshold: self.config.alert_thresholds.system_load_threshold, - }); - } - - // Check repair failure rate - if metrics.heal_metrics.total_repairs > 0 { - let failure_rate = (metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64) * 100.0; - if failure_rate > self.config.alert_thresholds.repair_failure_rate_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Critical, - category: AlertCategory::Heal, - message: format!("High repair failure rate: {:.1}%", failure_rate), - metric_value: failure_rate, - threshold: self.config.alert_thresholds.repair_failure_rate_threshold, - }); - } - } - - // Check health issues - let total_health_issues: u64 = metrics.health_issues.values().sum(); - if total_health_issues > self.config.alert_thresholds.health_issues_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Warning, - category: AlertCategory::Health, - message: format!("High number of health issues: {}", total_health_issues), - metric_value: total_health_issues as f64, - threshold: self.config.alert_thresholds.health_issues_threshold as f64, - }); - } - - // Store alerts - if !alerts.is_empty() { - let mut alert_store = self.alerts.write().await; - alert_store.extend(alerts.clone()); - - let mut count = self.alert_count.write().await; - *count += alerts.len() as u64; - } - - Ok(alerts) - } - - /// Check for alerts based on aggregated metrics - async fn check_aggregated_alerts(&self, aggregated: &AggregatedMetrics) -> Result> { - let mut alerts = Vec::new(); - - 
// Check summary statistics - if aggregated.summary.avg_cpu_usage > self.config.alert_thresholds.cpu_usage_threshold { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Warning, - category: AlertCategory::System, - message: format!("High average CPU usage: {:.1}%", aggregated.summary.avg_cpu_usage), - metric_value: aggregated.summary.avg_cpu_usage, - threshold: self.config.alert_thresholds.cpu_usage_threshold, - }); - } - - if aggregated.summary.repair_success_rate < (100.0 - self.config.alert_thresholds.repair_failure_rate_threshold) { - alerts.push(Alert { - timestamp: SystemTime::now(), - severity: AlertSeverity::Critical, - category: AlertCategory::Heal, - message: format!("Low repair success rate: {:.1}%", aggregated.summary.repair_success_rate * 100.0), - metric_value: aggregated.summary.repair_success_rate * 100.0, - threshold: 100.0 - self.config.alert_thresholds.repair_failure_rate_threshold, - }); - } - - Ok(alerts) - } - - /// Generate summary for metrics - async fn generate_summary(&self, metrics: &SystemMetrics) -> Result { - Ok(ReportSummary { - system_health: self.calculate_system_health(metrics), - performance_score: self.calculate_performance_score(metrics), - resource_utilization: self.calculate_resource_utilization(metrics), - operational_status: self.determine_operational_status(metrics), - key_metrics: self.extract_key_metrics(metrics), - }) - } - - /// Analyze trends in aggregated data - async fn analyze_trends(&self, aggregated: &AggregatedMetrics) -> Result> { - let mut trends = Vec::new(); - - if aggregated.data_points.len() < 2 { - return Ok(trends); - } - - // Analyze CPU usage trend - let cpu_values: Vec = aggregated - .data_points - .iter() - .filter_map(|p| p.system.as_ref().map(|s| s.cpu_usage)) - .collect(); - - if cpu_values.len() >= 2 { - let trend = self.calculate_trend(&cpu_values, "CPU Usage"); - trends.push(trend); - } - - // Analyze memory usage trend - let memory_values: Vec = aggregated - 
.data_points - .iter() - .filter_map(|p| p.system.as_ref().map(|s| s.memory_usage)) - .collect(); - - if memory_values.len() >= 2 { - let trend = self.calculate_trend(&memory_values, "Memory Usage"); - trends.push(trend); - } - - Ok(trends) - } - - /// Generate recommendations based on metrics - async fn generate_recommendations(&self, aggregated: &AggregatedMetrics) -> Result> { - let mut recommendations = Vec::new(); - - // Check for high resource usage - if aggregated.summary.avg_cpu_usage > 70.0 { - recommendations.push(Recommendation { - priority: RecommendationPriority::High, - category: RecommendationCategory::Performance, - title: "High CPU Usage".to_string(), - description: "Consider scaling up CPU resources or optimizing workload distribution".to_string(), - action: "Monitor CPU usage patterns and consider resource allocation adjustments".to_string(), - }); - } - - if aggregated.summary.avg_memory_usage > 80.0 { - recommendations.push(Recommendation { - priority: RecommendationPriority::High, - category: RecommendationCategory::Performance, - title: "High Memory Usage".to_string(), - description: "Memory usage is approaching critical levels".to_string(), - action: "Consider increasing memory allocation or optimizing memory usage".to_string(), - }); - } - - // Check for repair issues - if aggregated.summary.repair_success_rate < 0.8 { - recommendations.push(Recommendation { - priority: RecommendationPriority::Critical, - category: RecommendationCategory::Reliability, - title: "Low Repair Success Rate".to_string(), - description: "Data repair operations are failing frequently".to_string(), - action: "Investigate repair failures and check system health".to_string(), - }); - } - - Ok(recommendations) - } - - /// Calculate trend for a series of values - fn calculate_trend(&self, values: &[f64], metric_name: &str) -> TrendAnalysis { - if values.len() < 2 { - return TrendAnalysis { - metric_name: metric_name.to_string(), - trend_direction: 
TrendDirection::Stable, - change_rate: 0.0, - confidence: 0.0, - }; - } - - let first = values[0]; - let last = values[values.len() - 1]; - let change_rate = ((last - first) / first) * 100.0; - - let trend_direction = if change_rate > 5.0 { - TrendDirection::Increasing - } else if change_rate < -5.0 { - TrendDirection::Decreasing - } else { - TrendDirection::Stable - }; - - // Simple confidence calculation based on data points - let confidence = (values.len() as f64 / 10.0).min(1.0); - - TrendAnalysis { - metric_name: metric_name.to_string(), - trend_direction, - change_rate, - confidence, - } - } - - /// Calculate system health score - fn calculate_system_health(&self, metrics: &SystemMetrics) -> f64 { - let mut score = 100.0; - - // Deduct points for high resource usage - if metrics.cpu_usage > 80.0 { - score -= (metrics.cpu_usage - 80.0) * 0.5; - } - if metrics.memory_usage > 85.0 { - score -= (metrics.memory_usage - 85.0) * 0.5; - } - if metrics.disk_usage > 90.0 { - score -= (metrics.disk_usage - 90.0) * 1.0; - } - - // Deduct points for health issues - let total_health_issues: u64 = metrics.health_issues.values().sum(); - score -= total_health_issues as f64 * 5.0; - - // Deduct points for repair failures - if metrics.heal_metrics.total_repairs > 0 { - let failure_rate = metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64; - score -= failure_rate * 20.0; - } - - score.max(0.0) - } - - /// Calculate performance score - fn calculate_performance_score(&self, metrics: &SystemMetrics) -> f64 { - let mut score = 100.0; - - // Base score on resource efficiency - score -= metrics.cpu_usage * 0.3; - score -= metrics.memory_usage * 0.3; - score -= metrics.disk_usage * 0.2; - score -= metrics.system_load * 10.0; - - score.max(0.0) - } - - /// Calculate resource utilization - fn calculate_resource_utilization(&self, metrics: &SystemMetrics) -> f64 { - (metrics.cpu_usage + metrics.memory_usage + metrics.disk_usage) / 3.0 - } - - /// 
Determine operational status - fn determine_operational_status(&self, metrics: &SystemMetrics) -> OperationalStatus { - let health_score = self.calculate_system_health(metrics); - - if health_score >= 90.0 { - OperationalStatus::Excellent - } else if health_score >= 75.0 { - OperationalStatus::Good - } else if health_score >= 50.0 { - OperationalStatus::Fair - } else { - OperationalStatus::Poor - } - } - - /// Extract key metrics - fn extract_key_metrics(&self, metrics: &SystemMetrics) -> HashMap { - let mut key_metrics = HashMap::new(); - key_metrics.insert("cpu_usage".to_string(), metrics.cpu_usage); - key_metrics.insert("memory_usage".to_string(), metrics.memory_usage); - key_metrics.insert("disk_usage".to_string(), metrics.disk_usage); - key_metrics.insert("system_load".to_string(), metrics.system_load); - key_metrics.insert("active_operations".to_string(), metrics.active_operations as f64); - key_metrics.insert("objects_scanned".to_string(), metrics.scan_metrics.objects_scanned as f64); - key_metrics.insert("total_repairs".to_string(), metrics.heal_metrics.total_repairs as f64); - key_metrics.insert("successful_repairs".to_string(), metrics.heal_metrics.successful_repairs as f64); - - key_metrics - } - - /// Output methods (simulated) - async fn output_to_console(&self, report: &MetricsReport) -> Result<()> { - if self.config.enabled { - info!("=== Metrics Report ==="); - info!("Timestamp: {:?}", report.timestamp); - info!("System Health: {:.1}%", report.summary.system_health); - info!("Performance Score: {:.1}%", report.summary.performance_score); - info!("Operational Status: {:?}", report.summary.operational_status); - - if !report.alerts.is_empty() { - info!("=== Alerts ==="); - for alert in &report.alerts { - info!("[{}] {}: {}", alert.severity, alert.category, alert.message); - } - } - } - Ok(()) - } - - async fn output_to_file(&self, _report: &MetricsReport) -> Result<()> { - if self.config.enabled { - // In a real implementation, this would write to a 
file - debug!("Would write report to file: {}", self.config.output_directory.as_ref().unwrap_or(&String::new())); - } - Ok(()) - } - - async fn output_to_http(&self, _report: &MetricsReport) -> Result<()> { - if self.config.enable_http_reporting { - // In a real implementation, this would serve via HTTP - debug!("Would serve report via HTTP on endpoint: {}", self.config.http_endpoint.as_ref().unwrap_or(&String::new())); - } - Ok(()) - } - - async fn output_prometheus(&self, _report: &MetricsReport) -> Result<()> { - if self.config.enabled { - // In a real implementation, this would output Prometheus format - debug!("Would output Prometheus format"); - } - Ok(()) - } - - async fn output_json(&self, _report: &MetricsReport) -> Result<()> { - if self.config.enabled { - // In a real implementation, this would output JSON format - debug!("Would output JSON format"); - } - Ok(()) - } - - async fn output_csv(&self, _report: &MetricsReport) -> Result<()> { - if self.config.enabled { - // In a real implementation, this would output CSV format - debug!("Would output CSV format"); - } - Ok(()) - } - - /// Get reporting statistics - pub async fn get_statistics(&self) -> ReporterStatistics { - let report_count = *self.report_count.read().await; - let alert_count = *self.alert_count.read().await; - let last_report_time = *self.last_report_time.read().await; - let reports_count = self.reports.read().await.len(); - let alerts_count = self.alerts.read().await.len(); - - ReporterStatistics { - total_reports: report_count, - total_alerts: alert_count, - reports_in_memory: reports_count, - alerts_in_memory: alerts_count, - last_report_time, - config: self.config.clone(), - } - } - - /// Get recent alerts - pub async fn get_recent_alerts(&self, hours: u64) -> Result> { - let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600); - let alerts = self.alerts.read().await; - - let recent_alerts: Vec = alerts - .iter() - .filter(|alert| alert.timestamp >= cutoff_time) - 
.cloned() - .collect(); - - Ok(recent_alerts) - } - - /// Clear old alerts - pub async fn clear_old_alerts(&self, hours: u64) -> Result<()> { - let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600); - let mut alerts = self.alerts.write().await; - alerts.retain(|alert| alert.timestamp >= cutoff_time); - - info!("Cleared alerts older than {} hours", hours); - Ok(()) - } -} - -/// Metrics report -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsReport { - pub timestamp: SystemTime, - pub metrics: SystemMetrics, - pub alerts: Vec, - pub summary: ReportSummary, - pub format: ReportFormat, -} - -/// Comprehensive report -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ComprehensiveReport { - pub timestamp: SystemTime, - pub query: MetricsQuery, - pub data_points: usize, - pub summary: MetricsSummary, - pub alerts: Vec, - pub trends: Vec, - pub recommendations: Vec, -} - -/// Report summary -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ReportSummary { - pub system_health: f64, - pub performance_score: f64, - pub resource_utilization: f64, - pub operational_status: OperationalStatus, - pub key_metrics: HashMap, -} - -/// Alert -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Alert { - pub timestamp: SystemTime, - pub severity: AlertSeverity, - pub category: AlertCategory, - pub message: String, - pub metric_value: f64, - pub threshold: f64, -} - -/// Alert severity -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum AlertSeverity { - Info, - Warning, - Critical, -} - -impl fmt::Display for AlertSeverity { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - AlertSeverity::Info => write!(f, "INFO"), - AlertSeverity::Warning => write!(f, "WARNING"), - AlertSeverity::Critical => write!(f, "CRITICAL"), - } - } -} - -/// Alert category -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum AlertCategory { - System, - 
Performance, - Health, - Heal, - Security, -} - -impl fmt::Display for AlertCategory { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - AlertCategory::System => write!(f, "SYSTEM"), - AlertCategory::Performance => write!(f, "PERFORMANCE"), - AlertCategory::Health => write!(f, "HEALTH"), - AlertCategory::Heal => write!(f, "HEAL"), - AlertCategory::Security => write!(f, "SECURITY"), - } - } -} - -/// Report format -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum ReportFormat { - Console, - File, - Http, - Prometheus, - Json, - Csv, -} - -/// Operational status -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum OperationalStatus { - Excellent, - Good, - Fair, - Poor, -} - -/// Trend analysis -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct TrendAnalysis { - pub metric_name: String, - pub trend_direction: TrendDirection, - pub change_rate: f64, - pub confidence: f64, -} - -/// Trend direction -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum TrendDirection { - Increasing, - Decreasing, - Stable, -} - -/// Recommendation -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Recommendation { - pub priority: RecommendationPriority, - pub category: RecommendationCategory, - pub title: String, - pub description: String, - pub action: String, -} - -/// Recommendation priority -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum RecommendationPriority { - Low, - Medium, - High, - Critical, -} - -/// Recommendation category -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum RecommendationCategory { - Performance, - Reliability, - Security, - Maintenance, -} - -/// Reporter statistics -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ReporterStatistics { - pub total_reports: u64, - pub total_alerts: u64, - pub reports_in_memory: usize, - pub alerts_in_memory: usize, - pub 
last_report_time: SystemTime, - pub config: ReporterConfig, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_reporter_creation() { - let config = ReporterConfig::default(); - let reporter = Reporter::new(config).await.unwrap(); - - assert_eq!(reporter.config().report_interval, Duration::from_secs(60)); - assert!(reporter.config().enabled); - } - - #[tokio::test] - async fn test_report_generation() { - let config = ReporterConfig::default(); - let reporter = Reporter::new(config).await.unwrap(); - - let metrics = SystemMetrics::default(); - let report = reporter.generate_report(&metrics).await.unwrap(); - - assert_eq!(report.metrics.cpu_usage, 0.0); - assert_eq!(report.alerts.len(), 0); - } - - #[tokio::test] - async fn test_alert_generation() { - let config = ReporterConfig { - alert_thresholds: AlertThresholds { - cpu_usage_threshold: 50.0, - ..Default::default() - }, - ..Default::default() - }; - let reporter = Reporter::new(config).await.unwrap(); - - let mut metrics = SystemMetrics::default(); - metrics.cpu_usage = 75.0; // Above threshold - - let report = reporter.generate_report(&metrics).await.unwrap(); - assert!(!report.alerts.is_empty()); - assert_eq!(report.alerts[0].severity, AlertSeverity::Warning); - } - - #[tokio::test] - async fn test_comprehensive_report() { - let config = ReporterConfig::default(); - let reporter = Reporter::new(config).await.unwrap(); - - let aggregated = AggregatedMetrics { - query: MetricsQuery { - start_time: SystemTime::now(), - end_time: SystemTime::now() + Duration::from_secs(3600), - interval: Duration::from_secs(60), - metrics: vec![], - severity_filter: None, - limit: None, - }, - data_points: vec![], - summary: MetricsSummary::default(), - }; - - let report = reporter.generate_comprehensive_report(&aggregated).await.unwrap(); - assert_eq!(report.data_points, 0); - assert!(report.recommendations.is_empty()); - } - - #[tokio::test] - async fn test_reporter_statistics() { - let config = 
ReporterConfig::default(); - let reporter = Reporter::new(config).await.unwrap(); - - let stats = reporter.get_statistics().await; - assert_eq!(stats.total_reports, 0); - assert_eq!(stats.total_alerts, 0); - } - - #[tokio::test] - async fn test_alert_clearing() { - let config = ReporterConfig::default(); - let reporter = Reporter::new(config).await.unwrap(); - - // Generate some alerts - let mut metrics = SystemMetrics::default(); - metrics.cpu_usage = 90.0; // Above threshold - - let _report = reporter.generate_report(&metrics).await.unwrap(); - - // Clear old alerts - reporter.clear_old_alerts(1).await.unwrap(); - - let stats = reporter.get_statistics().await; - assert_eq!(stats.alerts_in_memory, 0); - } -} \ No newline at end of file diff --git a/crates/ahm/src/metrics/storage.rs b/crates/ahm/src/metrics/storage.rs deleted file mode 100644 index 66520ff7..00000000 --- a/crates/ahm/src/metrics/storage.rs +++ /dev/null @@ -1,573 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::{ - collections::HashMap, - sync::Arc, - path::PathBuf, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, error, info, warn}; - -use crate::error::Result; - -use super::{ - AggregatedMetrics, MetricsDataPoint, MetricsQuery, MetricsSummary, SystemMetrics, -}; - -/// Configuration for metrics storage -#[derive(Debug, Clone)] -pub struct StorageConfig { - /// Storage directory path - pub storage_path: PathBuf, - /// Maximum file size for metrics files - pub max_file_size: u64, - /// Compression enabled - pub compression_enabled: bool, - /// Retention period for metrics data - pub retention_period: Duration, - /// Batch size for writes - pub batch_size: usize, - /// Flush interval - pub flush_interval: Duration, - /// Whether to enable data validation - pub enable_validation: bool, - /// Whether to enable data encryption - pub enable_encryption: bool, - /// Encryption key (if enabled) - pub encryption_key: Option, -} - -impl Default for 
StorageConfig { - fn default() -> Self { - Self { - storage_path: PathBuf::from("/tmp/rustfs/metrics"), - max_file_size: 100 * 1024 * 1024, // 100 MB - compression_enabled: true, - retention_period: Duration::from_secs(86400 * 30), // 30 days - batch_size: 1000, - flush_interval: Duration::from_secs(60), // 1 minute - enable_validation: true, - enable_encryption: false, - encryption_key: None, - } - } -} - -/// Metrics storage that persists metrics data to disk -pub struct Storage { - config: StorageConfig, - metrics_buffer: Arc>>, - aggregated_buffer: Arc>>, - file_handles: Arc>>, - last_flush_time: Arc>, - total_writes: Arc>, - total_reads: Arc>, -} - -impl Storage { - /// Create a new metrics storage - pub async fn new(config: StorageConfig) -> Result { - // Create storage directory if it doesn't exist - tokio::fs::create_dir_all(&config.storage_path).await?; - - Ok(Self { - config, - metrics_buffer: Arc::new(RwLock::new(Vec::new())), - aggregated_buffer: Arc::new(RwLock::new(Vec::new())), - file_handles: Arc::new(RwLock::new(HashMap::new())), - last_flush_time: Arc::new(RwLock::new(SystemTime::now())), - total_writes: Arc::new(RwLock::new(0)), - total_reads: Arc::new(RwLock::new(0)), - }) - } - - /// Get the configuration - pub fn config(&self) -> &StorageConfig { - &self.config - } - - /// Store system metrics - pub async fn store_metrics(&self, metrics: SystemMetrics) -> Result<()> { - let mut buffer = self.metrics_buffer.write().await; - buffer.push(metrics); - - // Flush if buffer is full - if buffer.len() >= self.config.batch_size { - self.flush_metrics_buffer().await?; - } - - // Update write count - { - let mut writes = self.total_writes.write().await; - *writes += 1; - } - - Ok(()) - } - - /// Store aggregated metrics - pub async fn store_aggregated_metrics(&self, aggregated: AggregatedMetrics) -> Result<()> { - let mut buffer = self.aggregated_buffer.write().await; - buffer.push(aggregated); - - // Flush if buffer is full - if buffer.len() >= 
self.config.batch_size { - self.flush_aggregated_buffer().await?; - } - - Ok(()) - } - - /// Retrieve metrics for a time range - pub async fn retrieve_metrics(&self, query: &MetricsQuery) -> Result> { - let start_time = Instant::now(); - - // Update read count - { - let mut reads = self.total_reads.write().await; - *reads += 1; - } - - // In a real implementation, this would read from disk files - // For now, we'll return data from the buffer - let buffer = self.metrics_buffer.read().await; - let filtered_metrics: Vec = buffer - .iter() - .filter(|m| m.timestamp >= query.start_time && m.timestamp <= query.end_time) - .cloned() - .collect(); - - let retrieval_time = start_time.elapsed(); - debug!("Metrics retrieval completed in {:?}", retrieval_time); - - Ok(filtered_metrics) - } - - /// Retrieve aggregated metrics - pub async fn retrieve_aggregated_metrics(&self, query: &MetricsQuery) -> Result> { - let buffer = self.aggregated_buffer.read().await; - let filtered_metrics: Vec = buffer - .iter() - .filter(|m| { - if let Some(first_point) = m.data_points.first() { - first_point.timestamp >= query.start_time - } else { - false - } - }) - .filter(|m| { - if let Some(last_point) = m.data_points.last() { - last_point.timestamp <= query.end_time - } else { - false - } - }) - .cloned() - .collect(); - - Ok(filtered_metrics) - } - - /// Flush metrics buffer to disk - async fn flush_metrics_buffer(&self) -> Result<()> { - let mut buffer = self.metrics_buffer.write().await; - if buffer.is_empty() { - return Ok(()); - } - - let metrics_to_write = buffer.drain(..).collect::>(); - drop(buffer); // Release lock - - // Write to file - self.write_metrics_to_file(&metrics_to_write).await?; - - // Update flush time - { - let mut last_flush = self.last_flush_time.write().await; - *last_flush = SystemTime::now(); - } - - info!("Flushed {} metrics to disk", metrics_to_write.len()); - Ok(()) - } - - /// Flush aggregated buffer to disk - async fn flush_aggregated_buffer(&self) -> 
Result<()> { - let mut buffer = self.aggregated_buffer.write().await; - if buffer.is_empty() { - return Ok(()); - } - - let aggregated_to_write = buffer.drain(..).collect::>(); - drop(buffer); // Release lock - - // Write to file - self.write_aggregated_to_file(&aggregated_to_write).await?; - - info!("Flushed {} aggregated metrics to disk", aggregated_to_write.len()); - Ok(()) - } - - /// Write metrics to file - async fn write_metrics_to_file(&self, metrics: &[SystemMetrics]) -> Result<()> { - let filename = format!("metrics_{}.json", SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()); - let filepath = self.config.storage_path.join(filename); - - // In a real implementation, this would write to a file - // For now, we'll just simulate the write - debug!("Would write {} metrics to {}", metrics.len(), filepath.display()); - - Ok(()) - } - - /// Write aggregated metrics to file - async fn write_aggregated_to_file(&self, aggregated: &[AggregatedMetrics]) -> Result<()> { - let filename = format!("aggregated_{}.json", SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()); - let filepath = self.config.storage_path.join(filename); - - // In a real implementation, this would write to a file - // For now, we'll just simulate the write - debug!("Would write {} aggregated metrics to {}", aggregated.len(), filepath.display()); - - Ok(()) - } - - /// Force flush all buffers - pub async fn force_flush(&self) -> Result<()> { - self.flush_metrics_buffer().await?; - self.flush_aggregated_buffer().await?; - - info!("Force flush completed"); - Ok(()) - } - - /// Clean up old data based on retention policy - pub async fn cleanup_old_data(&self) -> Result<()> { - let cutoff_time = SystemTime::now() - self.config.retention_period; - - // Clean up metrics buffer - { - let mut buffer = self.metrics_buffer.write().await; - buffer.retain(|m| m.timestamp >= cutoff_time); - } - - // Clean up aggregated buffer - { - let mut buffer = 
self.aggregated_buffer.write().await; - buffer.retain(|m| { - if let Some(first_point) = m.data_points.first() { - first_point.timestamp >= cutoff_time - } else { - false - } - }); - } - - // In a real implementation, this would also clean up old files - info!("Cleanup completed, removed data older than {:?}", cutoff_time); - Ok(()) - } - - /// Get storage statistics - pub async fn get_statistics(&self) -> StorageStatistics { - let metrics_count = self.metrics_buffer.read().await.len(); - let aggregated_count = self.aggregated_buffer.read().await.len(); - let total_writes = *self.total_writes.read().await; - let total_reads = *self.total_reads.read().await; - let last_flush_time = *self.last_flush_time.read().await; - - StorageStatistics { - metrics_in_buffer: metrics_count, - aggregated_in_buffer: aggregated_count, - total_writes, - total_reads, - last_flush_time, - config: self.config.clone(), - } - } - - /// Validate stored data integrity - pub async fn validate_data_integrity(&self) -> Result { - if !self.config.enable_validation { - return Ok(DataIntegrityReport { - is_valid: true, - total_records: 0, - corrupted_records: 0, - validation_time: Duration::ZERO, - errors: Vec::new(), - }); - } - - let start_time = Instant::now(); - let mut errors = Vec::new(); - let mut corrupted_records = 0; - - // Validate metrics buffer - { - let buffer = self.metrics_buffer.read().await; - for (i, metric) in buffer.iter().enumerate() { - if !self.validate_metric(metric) { - errors.push(format!("Invalid metric at index {}: {:?}", i, metric)); - corrupted_records += 1; - } - } - } - - // Validate aggregated buffer - { - let buffer = self.aggregated_buffer.read().await; - for (i, aggregated) in buffer.iter().enumerate() { - if !self.validate_aggregated(aggregated) { - errors.push(format!("Invalid aggregated metrics at index {}: {:?}", i, aggregated)); - corrupted_records += 1; - } - } - } - - let validation_time = start_time.elapsed(); - let total_records = { - let metrics_count 
= self.metrics_buffer.read().await.len(); - let aggregated_count = self.aggregated_buffer.read().await.len(); - metrics_count + aggregated_count - }; - - let is_valid = corrupted_records == 0; - - Ok(DataIntegrityReport { - is_valid, - total_records, - corrupted_records, - validation_time, - errors, - }) - } - - /// Validate a single metric - fn validate_metric(&self, metric: &SystemMetrics) -> bool { - // Basic validation checks - metric.cpu_usage >= 0.0 && metric.cpu_usage <= 100.0 - && metric.memory_usage >= 0.0 && metric.memory_usage <= 100.0 - && metric.disk_usage >= 0.0 && metric.disk_usage <= 100.0 - && metric.system_load >= 0.0 - } - - /// Validate aggregated metrics - fn validate_aggregated(&self, aggregated: &AggregatedMetrics) -> bool { - // Basic validation checks - !aggregated.data_points.is_empty() - && aggregated.query.start_time <= aggregated.query.end_time - && aggregated.summary.total_points > 0 - } - - /// Backup metrics data - pub async fn backup_data(&self, backup_path: &PathBuf) -> Result { - let start_time = Instant::now(); - - // Create backup directory - tokio::fs::create_dir_all(backup_path).await?; - - // In a real implementation, this would copy files to backup location - // For now, we'll just simulate the backup - let metrics_count = self.metrics_buffer.read().await.len(); - let aggregated_count = self.aggregated_buffer.read().await.len(); - - let backup_time = start_time.elapsed(); - - Ok(BackupReport { - backup_path: backup_path.clone(), - metrics_backed_up: metrics_count, - aggregated_backed_up: aggregated_count, - backup_time, - success: true, - }) - } - - /// Restore metrics data from backup - pub async fn restore_data(&self, backup_path: &PathBuf) -> Result { - let start_time = Instant::now(); - - // In a real implementation, this would restore from backup files - // For now, we'll just simulate the restore - debug!("Would restore data from {}", backup_path.display()); - - let restore_time = start_time.elapsed(); - - 
Ok(RestoreReport { - backup_path: backup_path.clone(), - metrics_restored: 0, - aggregated_restored: 0, - restore_time, - success: true, - }) - } -} - -/// Storage statistics -#[derive(Debug, Clone)] -pub struct StorageStatistics { - pub metrics_in_buffer: usize, - pub aggregated_in_buffer: usize, - pub total_writes: u64, - pub total_reads: u64, - pub last_flush_time: SystemTime, - pub config: StorageConfig, -} - -/// Data integrity validation report -#[derive(Debug, Clone)] -pub struct DataIntegrityReport { - pub is_valid: bool, - pub total_records: usize, - pub corrupted_records: usize, - pub validation_time: Duration, - pub errors: Vec, -} - -/// Backup report -#[derive(Debug, Clone)] -pub struct BackupReport { - pub backup_path: PathBuf, - pub metrics_backed_up: usize, - pub aggregated_backed_up: usize, - pub backup_time: Duration, - pub success: bool, -} - -/// Restore report -#[derive(Debug, Clone)] -pub struct RestoreReport { - pub backup_path: PathBuf, - pub metrics_restored: usize, - pub aggregated_restored: usize, - pub restore_time: Duration, - pub success: bool, -} - -#[cfg(test)] -mod tests { - use super::*; - use std::time::Instant; - - #[tokio::test] - async fn test_storage_creation() { - let config = StorageConfig::default(); - let storage = Storage::new(config).await.unwrap(); - - assert_eq!(storage.config().batch_size, 1000); - assert!(storage.config().compression_enabled); - } - - #[tokio::test] - async fn test_metrics_storage() { - let config = StorageConfig::default(); - let storage = Storage::new(config).await.unwrap(); - - let metrics = SystemMetrics::default(); - storage.store_metrics(metrics).await.unwrap(); - - let stats = storage.get_statistics().await; - assert_eq!(stats.metrics_in_buffer, 1); - assert_eq!(stats.total_writes, 1); - } - - #[tokio::test] - async fn test_aggregated_storage() { - let config = StorageConfig::default(); - let storage = Storage::new(config).await.unwrap(); - - let aggregated = AggregatedMetrics { - query: 
MetricsQuery { - start_time: SystemTime::now(), - end_time: SystemTime::now() + Duration::from_secs(3600), - interval: Duration::from_secs(60), - metrics: vec![], - severity_filter: None, - limit: None, - }, - data_points: vec![], - summary: MetricsSummary::default(), - }; - - storage.store_aggregated_metrics(aggregated).await.unwrap(); - - let stats = storage.get_statistics().await; - assert_eq!(stats.aggregated_in_buffer, 1); - } - - #[tokio::test] - async fn test_metrics_retrieval() { - let config = StorageConfig::default(); - let storage = Storage::new(config).await.unwrap(); - - // Store some metrics - for i in 0..5 { - let mut metrics = SystemMetrics::default(); - metrics.timestamp = SystemTime::now() + Duration::from_secs(i * 60); - storage.store_metrics(metrics).await.unwrap(); - } - - let query = MetricsQuery { - start_time: SystemTime::now(), - end_time: SystemTime::now() + Duration::from_secs(300), - interval: Duration::from_secs(60), - metrics: vec![], - severity_filter: None, - limit: None, - }; - - let retrieved = storage.retrieve_metrics(&query).await.unwrap(); - assert_eq!(retrieved.len(), 5); - } - - #[tokio::test] - async fn test_data_integrity_validation() { - let config = StorageConfig { - enable_validation: true, - ..Default::default() - }; - let storage = Storage::new(config).await.unwrap(); - - let report = storage.validate_data_integrity().await.unwrap(); - assert!(report.is_valid); - assert_eq!(report.corrupted_records, 0); - } - - #[tokio::test] - async fn test_force_flush() { - let config = StorageConfig::default(); - let storage = Storage::new(config).await.unwrap(); - - // Add some data - storage.store_metrics(SystemMetrics::default()).await.unwrap(); - - // Force flush - storage.force_flush().await.unwrap(); - - let stats = storage.get_statistics().await; - assert_eq!(stats.metrics_in_buffer, 0); - } - - #[tokio::test] - async fn test_cleanup_old_data() { - let config = StorageConfig::default(); - let storage = 
Storage::new(config).await.unwrap(); - - // Add some old data - let mut old_metrics = SystemMetrics::default(); - old_metrics.timestamp = SystemTime::now() - Duration::from_secs(86400 * 31); // 31 days old - storage.store_metrics(old_metrics).await.unwrap(); - - // Add some recent data - let mut recent_metrics = SystemMetrics::default(); - recent_metrics.timestamp = SystemTime::now(); - storage.store_metrics(recent_metrics).await.unwrap(); - - // Cleanup - storage.cleanup_old_data().await.unwrap(); - - let stats = storage.get_statistics().await; - assert_eq!(stats.metrics_in_buffer, 1); // Only recent data should remain - } -} \ No newline at end of file diff --git a/crates/ahm/src/policy/heal_policy.rs b/crates/ahm/src/policy/heal_policy.rs deleted file mode 100644 index 8342f089..00000000 --- a/crates/ahm/src/policy/heal_policy.rs +++ /dev/null @@ -1,508 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::time::{Duration, SystemTime}; - -use crate::scanner::{HealthIssue, Severity}; - -use super::{PolicyContext, PolicyResult, ResourceUsage}; - -/// Configuration for heal policies -#[derive(Debug, Clone)] -pub struct HealPolicyConfig { - /// Maximum number of concurrent repairs - pub max_concurrent_repairs: usize, - /// Maximum repair duration per operation - pub max_repair_duration: Duration, - /// Minimum interval between repairs - pub min_repair_interval: Duration, - /// Maximum system load threshold for healing - pub max_system_load: f64, - /// Minimum available disk space percentage for healing - pub min_disk_space: f64, - /// Maximum number of active operations for healing - pub max_active_operations: u64, - /// Whether to enable automatic healing - pub auto_heal_enabled: bool, - /// Priority-based healing configuration - pub priority_config: HealPriorityConfig, - /// Resource-based healing configuration - pub resource_config: HealResourceConfig, - /// Retry configuration - pub retry_config: HealRetryConfig, -} - -/// Priority-based healing configuration 
-#[derive(Debug, Clone)] -pub struct HealPriorityConfig { - /// Whether to enable priority-based healing - pub enabled: bool, - /// Critical issues heal immediately - pub critical_immediate: bool, - /// High priority issues heal within - pub high_timeout: Duration, - /// Medium priority issues heal within - pub medium_timeout: Duration, - /// Low priority issues heal within - pub low_timeout: Duration, -} - -/// Resource-based healing configuration -#[derive(Debug, Clone)] -pub struct HealResourceConfig { - /// Maximum CPU usage for healing - pub max_cpu_usage: f64, - /// Maximum memory usage for healing - pub max_memory_usage: f64, - /// Maximum disk I/O usage for healing - pub max_disk_io_usage: f64, - /// Maximum network I/O usage for healing - pub max_network_io_usage: f64, - /// Whether to enable resource-based throttling - pub enable_throttling: bool, -} - -/// Retry configuration for healing -#[derive(Debug, Clone)] -pub struct HealRetryConfig { - /// Maximum number of retry attempts - pub max_retry_attempts: u32, - /// Initial backoff delay - pub initial_backoff: Duration, - /// Maximum backoff delay - pub max_backoff: Duration, - /// Backoff multiplier - pub backoff_multiplier: f64, - /// Whether to use exponential backoff - pub exponential_backoff: bool, -} - -impl Default for HealPolicyConfig { - fn default() -> Self { - Self { - max_concurrent_repairs: 4, - max_repair_duration: Duration::from_secs(1800), // 30 minutes - min_repair_interval: Duration::from_secs(60), // 1 minute - max_system_load: 0.7, - min_disk_space: 15.0, // 15% minimum disk space - max_active_operations: 50, - auto_heal_enabled: true, - priority_config: HealPriorityConfig::default(), - resource_config: HealResourceConfig::default(), - retry_config: HealRetryConfig::default(), - } - } -} - -impl Default for HealPriorityConfig { - fn default() -> Self { - Self { - enabled: true, - critical_immediate: true, - high_timeout: Duration::from_secs(300), // 5 minutes - medium_timeout: 
Duration::from_secs(1800), // 30 minutes - low_timeout: Duration::from_secs(3600), // 1 hour - } - } -} - -impl Default for HealResourceConfig { - fn default() -> Self { - Self { - max_cpu_usage: 80.0, - max_memory_usage: 80.0, - max_disk_io_usage: 70.0, - max_network_io_usage: 70.0, - enable_throttling: true, - } - } -} - -impl Default for HealRetryConfig { - fn default() -> Self { - Self { - max_retry_attempts: 3, - initial_backoff: Duration::from_secs(30), - max_backoff: Duration::from_secs(300), - backoff_multiplier: 2.0, - exponential_backoff: true, - } - } -} - -/// Heal policy engine -pub struct HealPolicyEngine { - config: HealPolicyConfig, - last_repair_time: SystemTime, - repair_count: u64, - active_repairs: u64, -} - -impl HealPolicyEngine { - /// Create a new heal policy engine - pub fn new(config: HealPolicyConfig) -> Self { - Self { - config, - last_repair_time: SystemTime::now(), - repair_count: 0, - active_repairs: 0, - } - } - - /// Get the configuration - pub fn config(&self) -> &HealPolicyConfig { - &self.config - } - - /// Evaluate heal policy - pub async fn evaluate(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult { - let mut reasons = Vec::new(); - let mut allowed = true; - - // Check if auto-heal is enabled - if !self.config.auto_heal_enabled { - allowed = false; - reasons.push("Auto-heal is disabled".to_string()); - } - - // Check system load - if context.system_load > self.config.max_system_load { - allowed = false; - reasons.push(format!( - "System load too high: {:.2} > {:.2}", - context.system_load, self.config.max_system_load - )); - } - - // Check disk space - if context.disk_space_available < self.config.min_disk_space { - allowed = false; - reasons.push(format!( - "Disk space too low: {:.1}% < {:.1}%", - context.disk_space_available, self.config.min_disk_space - )); - } - - // Check active operations - if context.active_operations > self.config.max_active_operations { - allowed = false; - reasons.push(format!( - 
"Too many active operations: {} > {}", - context.active_operations, self.config.max_active_operations - )); - } - - // Check repair interval - let time_since_last_repair = context.current_time - .duration_since(self.last_repair_time) - .unwrap_or(Duration::ZERO); - - if time_since_last_repair < self.config.min_repair_interval { - allowed = false; - reasons.push(format!( - "Repair interval too short: {:?} < {:?}", - time_since_last_repair, self.config.min_repair_interval - )); - } - - // Check resource usage - if self.config.resource_config.enable_throttling { - if context.resource_usage.cpu_usage > self.config.resource_config.max_cpu_usage { - allowed = false; - reasons.push(format!( - "CPU usage too high: {:.1}% > {:.1}%", - context.resource_usage.cpu_usage, self.config.resource_config.max_cpu_usage - )); - } - - if context.resource_usage.memory_usage > self.config.resource_config.max_memory_usage { - allowed = false; - reasons.push(format!( - "Memory usage too high: {:.1}% > {:.1}%", - context.resource_usage.memory_usage, self.config.resource_config.max_memory_usage - )); - } - - if context.resource_usage.disk_io_usage > self.config.resource_config.max_disk_io_usage { - allowed = false; - reasons.push(format!( - "Disk I/O usage too high: {:.1}% > {:.1}%", - context.resource_usage.disk_io_usage, self.config.resource_config.max_disk_io_usage - )); - } - - if context.resource_usage.network_io_usage > self.config.resource_config.max_network_io_usage { - allowed = false; - reasons.push(format!( - "Network I/O usage too high: {:.1}% > {:.1}%", - context.resource_usage.network_io_usage, self.config.resource_config.max_network_io_usage - )); - } - } - - // Check priority-based policies - if self.config.priority_config.enabled { - match issue.severity { - Severity::Critical => { - if self.config.priority_config.critical_immediate { - // Critical issues should always be allowed unless resource constraints prevent it - if allowed { - reasons.clear(); - 
reasons.push("Critical issue - immediate repair allowed".to_string()); - } - } - } - Severity::High => { - // Check if we're within the high priority timeout - if time_since_last_repair > self.config.priority_config.high_timeout { - allowed = false; - reasons.push(format!( - "High priority issue timeout exceeded: {:?} > {:?}", - time_since_last_repair, self.config.priority_config.high_timeout - )); - } - } - Severity::Medium => { - // Check if we're within the medium priority timeout - if time_since_last_repair > self.config.priority_config.medium_timeout { - allowed = false; - reasons.push(format!( - "Medium priority issue timeout exceeded: {:?} > {:?}", - time_since_last_repair, self.config.priority_config.medium_timeout - )); - } - } - Severity::Low => { - // Check if we're within the low priority timeout - if time_since_last_repair > self.config.priority_config.low_timeout { - allowed = false; - reasons.push(format!( - "Low priority issue timeout exceeded: {:?} > {:?}", - time_since_last_repair, self.config.priority_config.low_timeout - )); - } - } - } - } - - let reason = if reasons.is_empty() { - "Heal allowed".to_string() - } else { - reasons.join("; ") - }; - - PolicyResult { - allowed, - reason, - metadata: Some(serde_json::json!({ - "repair_count": self.repair_count, - "active_repairs": self.active_repairs, - "time_since_last_repair": time_since_last_repair.as_secs(), - "issue_severity": format!("{:?}", issue.severity), - "issue_type": format!("{:?}", issue.issue_type), - "system_load": context.system_load, - "disk_space_available": context.disk_space_available, - "active_operations": context.active_operations, - })), - evaluated_at: context.current_time, - } - } - - /// Get repair timeout based on priority - pub fn get_repair_timeout(&self, severity: Severity) -> Duration { - if !self.config.priority_config.enabled { - return self.config.max_repair_duration; - } - - match severity { - Severity::Critical => Duration::from_secs(300), // 5 minutes for 
critical - Severity::High => self.config.priority_config.high_timeout, - Severity::Medium => self.config.priority_config.medium_timeout, - Severity::Low => self.config.priority_config.low_timeout, - } - } - - /// Get retry configuration - pub fn get_retry_config(&self) -> &HealRetryConfig { - &self.config.retry_config - } - - /// Update repair statistics - pub fn record_repair(&mut self) { - self.last_repair_time = SystemTime::now(); - self.repair_count += 1; - } - - /// Increment active repairs - pub fn increment_active_repairs(&mut self) { - self.active_repairs += 1; - } - - /// Decrement active repairs - pub fn decrement_active_repairs(&mut self) { - if self.active_repairs > 0 { - self.active_repairs -= 1; - } - } - - /// Get heal statistics - pub fn get_statistics(&self) -> HealPolicyStatistics { - HealPolicyStatistics { - total_repairs: self.repair_count, - active_repairs: self.active_repairs, - last_repair_time: self.last_repair_time, - config: self.config.clone(), - } - } -} - -/// Heal policy statistics -#[derive(Debug, Clone)] -pub struct HealPolicyStatistics { - pub total_repairs: u64, - pub active_repairs: u64, - pub last_repair_time: SystemTime, - pub config: HealPolicyConfig, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType, Severity}; - - #[tokio::test] - async fn test_heal_policy_creation() { - let config = HealPolicyConfig::default(); - let engine = HealPolicyEngine::new(config); - - assert_eq!(engine.config().max_concurrent_repairs, 4); - assert_eq!(engine.config().max_system_load, 0.7); - assert_eq!(engine.config().min_disk_space, 15.0); - } - - #[tokio::test] - async fn test_heal_policy_evaluation() { - let config = HealPolicyConfig::default(); - let engine = HealPolicyEngine::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Medium, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test 
issue".to_string(), - metadata: None, - }; - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&issue, &context).await; - assert!(result.allowed); - assert!(result.reason.contains("Heal allowed")); - } - - #[tokio::test] - async fn test_heal_policy_critical_immediate() { - let config = HealPolicyConfig::default(); - let engine = HealPolicyEngine::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&issue, &context).await; - assert!(result.allowed); - assert!(result.reason.contains("Critical issue - immediate repair allowed")); - } - - #[tokio::test] - async fn test_heal_policy_system_load_limit() { - let config = HealPolicyConfig::default(); - let engine = HealPolicyEngine::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Medium, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let context = PolicyContext { - system_load: 0.8, // Above threshold - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&issue, &context).await; - assert!(!result.allowed); 
- assert!(result.reason.contains("System load too high")); - } - - #[tokio::test] - async fn test_repair_timeouts() { - let config = HealPolicyConfig::default(); - let engine = HealPolicyEngine::new(config); - - assert_eq!( - engine.get_repair_timeout(Severity::Critical), - Duration::from_secs(300) - ); - assert_eq!( - engine.get_repair_timeout(Severity::High), - Duration::from_secs(300) - ); - assert_eq!( - engine.get_repair_timeout(Severity::Medium), - Duration::from_secs(1800) - ); - assert_eq!( - engine.get_repair_timeout(Severity::Low), - Duration::from_secs(3600) - ); - } - - #[tokio::test] - async fn test_heal_statistics() { - let config = HealPolicyConfig::default(); - let mut engine = HealPolicyEngine::new(config); - - assert_eq!(engine.get_statistics().total_repairs, 0); - assert_eq!(engine.get_statistics().active_repairs, 0); - - engine.record_repair(); - engine.increment_active_repairs(); - engine.increment_active_repairs(); - - let stats = engine.get_statistics(); - assert_eq!(stats.total_repairs, 1); - assert_eq!(stats.active_repairs, 2); - - engine.decrement_active_repairs(); - assert_eq!(engine.get_statistics().active_repairs, 1); - } -} \ No newline at end of file diff --git a/crates/ahm/src/policy/mod.rs b/crates/ahm/src/policy/mod.rs deleted file mode 100644 index 507113cc..00000000 --- a/crates/ahm/src/policy/mod.rs +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! 
Policy system for AHM operations -//! -//! Defines configurable policies for: -//! - Scanning behavior and frequency -//! - Healing priorities and strategies -//! - Data retention and lifecycle management - -pub mod scan_policy; -pub mod heal_policy; -pub mod retention_policy; - -pub use scan_policy::{ScanPolicyConfig, ScanPolicyEngine}; -pub use heal_policy::{HealPolicyConfig, HealPolicyEngine}; -pub use retention_policy::{RetentionPolicyConfig, RetentionPolicyEngine}; - -use std::time::{Duration, SystemTime}; -use serde::{Deserialize, Serialize}; - -use crate::scanner::{HealthIssue, Severity}; - -/// Policy evaluation result -#[derive(Debug, Clone)] -pub struct PolicyResult { - /// Whether the policy allows the action - pub allowed: bool, - /// Reason for the decision - pub reason: String, - /// Additional metadata - pub metadata: Option, - /// When the policy was evaluated - pub evaluated_at: SystemTime, -} - -/// Policy evaluation context -#[derive(Debug, Clone)] -pub struct PolicyContext { - /// Current system load - pub system_load: f64, - /// Available disk space percentage - pub disk_space_available: f64, - /// Number of active operations - pub active_operations: u64, - /// Current time - pub current_time: SystemTime, - /// Health issues count by severity - pub health_issues: std::collections::HashMap, - /// Resource usage metrics - pub resource_usage: ResourceUsage, -} - -/// Resource usage information -#[derive(Debug, Clone)] -pub struct ResourceUsage { - /// CPU usage percentage - pub cpu_usage: f64, - /// Memory usage percentage - pub memory_usage: f64, - /// Disk I/O usage percentage - pub disk_io_usage: f64, - /// Network I/O usage percentage - pub network_io_usage: f64, -} - -impl Default for ResourceUsage { - fn default() -> Self { - Self { - cpu_usage: 0.0, - memory_usage: 0.0, - disk_io_usage: 0.0, - network_io_usage: 0.0, - } - } -} - -/// Policy manager that coordinates all policies -pub struct PolicyManager { - scan_policy: ScanPolicyEngine, - 
heal_policy: HealPolicyEngine, - retention_policy: RetentionPolicyEngine, -} - -impl PolicyManager { - /// Create a new policy manager - pub fn new( - scan_config: ScanPolicyConfig, - heal_config: HealPolicyConfig, - retention_config: RetentionPolicyConfig, - ) -> Self { - Self { - scan_policy: ScanPolicyEngine::new(scan_config), - heal_policy: HealPolicyEngine::new(heal_config), - retention_policy: RetentionPolicyEngine::new(retention_config), - } - } - - /// Evaluate scan policy - pub async fn evaluate_scan_policy(&self, context: &PolicyContext) -> PolicyResult { - self.scan_policy.evaluate(context).await - } - - /// Evaluate heal policy - pub async fn evaluate_heal_policy(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult { - self.heal_policy.evaluate(issue, context).await - } - - /// Evaluate retention policy - pub async fn evaluate_retention_policy(&self, object_age: Duration, context: &PolicyContext) -> PolicyResult { - self.retention_policy.evaluate(object_age, context).await - } - - /// Get scan policy engine - pub fn scan_policy(&self) -> &ScanPolicyEngine { - &self.scan_policy - } - - /// Get heal policy engine - pub fn heal_policy(&self) -> &HealPolicyEngine { - &self.heal_policy - } - - /// Get retention policy engine - pub fn retention_policy(&self) -> &RetentionPolicyEngine { - &self.retention_policy - } - - /// Update scan policy configuration - pub async fn update_scan_policy(&mut self, config: ScanPolicyConfig) { - self.scan_policy = ScanPolicyEngine::new(config); - } - - /// Update heal policy configuration - pub async fn update_heal_policy(&mut self, config: HealPolicyConfig) { - self.heal_policy = HealPolicyEngine::new(config); - } - - /// Update retention policy configuration - pub async fn update_retention_policy(&mut self, config: RetentionPolicyConfig) { - self.retention_policy = RetentionPolicyEngine::new(config); - } - - /// List all policies - pub async fn list_policies(&self) -> crate::error::Result> { - // In a real 
implementation, this would return actual policy names - Ok(vec![ - "scan_policy".to_string(), - "heal_policy".to_string(), - "retention_policy".to_string(), - ]) - } - - /// Get a specific policy - pub async fn get_policy(&self, name: &str) -> crate::error::Result { - // In a real implementation, this would return the actual policy - Ok(format!("Policy configuration for: {}", name)) - } - - /// Get engine configuration - pub async fn get_config(&self) -> PolicyConfig { - PolicyConfig::default() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::{HealthIssue, HealthIssueType}; - - #[tokio::test] - async fn test_policy_manager_creation() { - let scan_config = ScanPolicyConfig::default(); - let heal_config = HealPolicyConfig::default(); - let retention_config = RetentionPolicyConfig::default(); - - let manager = PolicyManager::new(scan_config, heal_config, retention_config); - - // Test that all policy engines are available - assert!(manager.scan_policy().config().max_concurrent_scans > 0); - assert!(manager.heal_policy().config().max_concurrent_repairs > 0); - assert!(manager.retention_policy().config().default_retention_days > 0); - } - - #[tokio::test] - async fn test_policy_evaluation() { - let scan_config = ScanPolicyConfig::default(); - let heal_config = HealPolicyConfig::default(); - let retention_config = RetentionPolicyConfig::default(); - - let manager = PolicyManager::new(scan_config, heal_config, retention_config); - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - // Test scan policy evaluation - let scan_result = manager.evaluate_scan_policy(&context).await; - assert!(scan_result.allowed); - - // Test heal policy evaluation - let issue = HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: 
"test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - let heal_result = manager.evaluate_heal_policy(&issue, &context).await; - assert!(heal_result.allowed); - - // Test retention policy evaluation - let retention_result = manager.evaluate_retention_policy(Duration::from_secs(86400), &context).await; - assert!(retention_result.allowed); - } -} - -/// Master policy configuration -#[derive(Debug, Clone)] -pub struct PolicyConfig { - pub scan: ScanPolicyConfig, - pub heal: HealPolicyConfig, - pub retention: RetentionPolicyConfig, -} - -impl Default for PolicyConfig { - fn default() -> Self { - Self { - scan: ScanPolicyConfig::default(), - heal: HealPolicyConfig::default(), - retention: RetentionPolicyConfig::default(), - } - } -} - -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -pub struct PolicyManagerConfig { - #[serde(default)] - pub default_scan_interval: Duration, -} \ No newline at end of file diff --git a/crates/ahm/src/policy/retention_policy.rs b/crates/ahm/src/policy/retention_policy.rs deleted file mode 100644 index f54591fa..00000000 --- a/crates/ahm/src/policy/retention_policy.rs +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::time::{Duration, SystemTime}; - -use super::{PolicyContext, PolicyResult, ResourceUsage}; - -/// Configuration for retention policies -#[derive(Debug, Clone)] -pub struct RetentionPolicyConfig { - /// Default retention period in days - pub default_retention_days: u32, - /// Whether to enable retention policies - pub enabled: bool, - /// Maximum system load threshold for retention operations - pub max_system_load: f64, - /// Minimum available disk space percentage for retention operations - pub min_disk_space: f64, - /// Maximum number of active operations for retention - pub max_active_operations: u64, - /// Retention rules by object type - pub retention_rules: Vec, - /// Whether to enable automatic cleanup - pub 
auto_cleanup_enabled: bool, - /// Cleanup interval - pub cleanup_interval: Duration, - /// Maximum objects to delete per cleanup cycle - pub max_objects_per_cleanup: u64, -} - -/// Retention rule for specific object types -#[derive(Debug, Clone)] -pub struct RetentionRule { - /// Object type pattern (e.g., "*.log", "temp/*") - pub pattern: String, - /// Retention period in days - pub retention_days: u32, - /// Whether this rule is enabled - pub enabled: bool, - /// Priority of this rule (higher = more important) - pub priority: u32, - /// Whether to apply this rule recursively - pub recursive: bool, -} - -impl Default for RetentionPolicyConfig { - fn default() -> Self { - Self { - default_retention_days: 30, - enabled: true, - max_system_load: 0.6, - min_disk_space: 20.0, // 20% minimum disk space - max_active_operations: 20, - retention_rules: vec![ - RetentionRule { - pattern: "*.log".to_string(), - retention_days: 7, - enabled: true, - priority: 1, - recursive: false, - }, - RetentionRule { - pattern: "temp/*".to_string(), - retention_days: 1, - enabled: true, - priority: 2, - recursive: true, - }, - RetentionRule { - pattern: "cache/*".to_string(), - retention_days: 3, - enabled: true, - priority: 3, - recursive: true, - }, - ], - auto_cleanup_enabled: true, - cleanup_interval: Duration::from_secs(3600), // 1 hour - max_objects_per_cleanup: 1000, - } - } -} - -/// Retention policy engine -pub struct RetentionPolicyEngine { - config: RetentionPolicyConfig, - last_cleanup_time: SystemTime, - cleanup_count: u64, - objects_deleted: u64, -} - -impl RetentionPolicyEngine { - /// Create a new retention policy engine - pub fn new(config: RetentionPolicyConfig) -> Self { - Self { - config, - last_cleanup_time: SystemTime::now(), - cleanup_count: 0, - objects_deleted: 0, - } - } - - /// Get the configuration - pub fn config(&self) -> &RetentionPolicyConfig { - &self.config - } - - /// Evaluate retention policy - pub async fn evaluate(&self, object_age: Duration, context: 
&PolicyContext) -> PolicyResult { - let mut reasons = Vec::new(); - let mut allowed = false; - - // Check if retention policies are enabled - if !self.config.enabled { - allowed = false; - reasons.push("Retention policies are disabled".to_string()); - } else { - // Check if object should be retained based on age - let retention_days = self.get_retention_days_for_object("default"); - let retention_duration = Duration::from_secs(retention_days as u64 * 24 * 3600); - - if object_age > retention_duration { - allowed = true; - reasons.push(format!( - "Object age exceeds retention period: {:?} > {:?}", - object_age, retention_duration - )); - } else { - allowed = false; - reasons.push(format!( - "Object within retention period: {:?} <= {:?}", - object_age, retention_duration - )); - } - } - - // Check system constraints - if context.system_load > self.config.max_system_load { - allowed = false; - reasons.push(format!( - "System load too high: {:.2} > {:.2}", - context.system_load, self.config.max_system_load - )); - } - - if context.disk_space_available < self.config.min_disk_space { - allowed = false; - reasons.push(format!( - "Disk space too low: {:.1}% < {:.1}%", - context.disk_space_available, self.config.min_disk_space - )); - } - - if context.active_operations > self.config.max_active_operations { - allowed = false; - reasons.push(format!( - "Too many active operations: {} > {}", - context.active_operations, self.config.max_active_operations - )); - } - - let reason = if reasons.is_empty() { - "Retention evaluation completed".to_string() - } else { - reasons.join("; ") - }; - - PolicyResult { - allowed, - reason, - metadata: Some(serde_json::json!({ - "object_age_seconds": object_age.as_secs(), - "cleanup_count": self.cleanup_count, - "objects_deleted": self.objects_deleted, - "system_load": context.system_load, - "disk_space_available": context.disk_space_available, - "active_operations": context.active_operations, - })), - evaluated_at: context.current_time, - } 
- } - - /// Evaluate cleanup policy - pub async fn evaluate_cleanup(&self, context: &PolicyContext) -> PolicyResult { - let mut reasons = Vec::new(); - let mut allowed = false; - - // Check if auto-cleanup is enabled - if !self.config.auto_cleanup_enabled { - allowed = false; - reasons.push("Auto-cleanup is disabled".to_string()); - } else { - // Check cleanup interval - let time_since_last_cleanup = context.current_time - .duration_since(self.last_cleanup_time) - .unwrap_or(Duration::ZERO); - - if time_since_last_cleanup >= self.config.cleanup_interval { - allowed = true; - reasons.push("Cleanup interval reached".to_string()); - } else { - allowed = false; - reasons.push(format!( - "Cleanup interval not reached: {:?} < {:?}", - time_since_last_cleanup, self.config.cleanup_interval - )); - } - } - - // Check system constraints - if context.system_load > self.config.max_system_load { - allowed = false; - reasons.push(format!( - "System load too high: {:.2} > {:.2}", - context.system_load, self.config.max_system_load - )); - } - - if context.disk_space_available < self.config.min_disk_space { - allowed = false; - reasons.push(format!( - "Disk space too low: {:.1}% < {:.1}%", - context.disk_space_available, self.config.min_disk_space - )); - } - - let reason = if reasons.is_empty() { - "Cleanup evaluation completed".to_string() - } else { - reasons.join("; ") - }; - - PolicyResult { - allowed, - reason, - metadata: Some(serde_json::json!({ - "cleanup_count": self.cleanup_count, - "objects_deleted": self.objects_deleted, - "max_objects_per_cleanup": self.config.max_objects_per_cleanup, - "system_load": context.system_load, - "disk_space_available": context.disk_space_available, - })), - evaluated_at: context.current_time, - } - } - - /// Get retention days for a specific object - pub fn get_retention_days_for_object(&self, object_path: &str) -> u32 { - // Find the highest priority matching rule - let mut best_rule: Option<&RetentionRule> = None; - let mut best_priority 
= 0; - - for rule in &self.config.retention_rules { - if !rule.enabled { - continue; - } - - if self.matches_pattern(object_path, &rule.pattern) { - if rule.priority > best_priority { - best_rule = Some(rule); - best_priority = rule.priority; - } - } - } - - best_rule - .map(|rule| rule.retention_days) - .unwrap_or(self.config.default_retention_days) - } - - /// Check if an object path matches a pattern - fn matches_pattern(&self, object_path: &str, pattern: &str) -> bool { - // Simple pattern matching - can be enhanced with regex - if pattern.contains('*') { - // Wildcard matching - let pattern_parts: Vec<&str> = pattern.split('*').collect(); - if pattern_parts.len() == 2 { - let prefix = pattern_parts[0]; - let suffix = pattern_parts[1]; - object_path.starts_with(prefix) && object_path.ends_with(suffix) - } else { - false - } - } else { - // Exact match - object_path == pattern - } - } - - /// Get all retention rules - pub fn get_retention_rules(&self) -> &[RetentionRule] { - &self.config.retention_rules - } - - /// Add a new retention rule - pub fn add_retention_rule(&mut self, rule: RetentionRule) { - self.config.retention_rules.push(rule); - } - - /// Remove a retention rule by pattern - pub fn remove_retention_rule(&mut self, pattern: &str) -> bool { - let initial_len = self.config.retention_rules.len(); - self.config.retention_rules.retain(|rule| rule.pattern != pattern); - self.config.retention_rules.len() < initial_len - } - - /// Update cleanup statistics - pub fn record_cleanup(&mut self, objects_deleted: u64) { - self.last_cleanup_time = SystemTime::now(); - self.cleanup_count += 1; - self.objects_deleted += objects_deleted; - } - - /// Get retention statistics - pub fn get_statistics(&self) -> RetentionPolicyStatistics { - RetentionPolicyStatistics { - total_cleanups: self.cleanup_count, - total_objects_deleted: self.objects_deleted, - last_cleanup_time: self.last_cleanup_time, - config: self.config.clone(), - } - } -} - -/// Retention policy 
statistics -#[derive(Debug, Clone)] -pub struct RetentionPolicyStatistics { - pub total_cleanups: u64, - pub total_objects_deleted: u64, - pub last_cleanup_time: SystemTime, - pub config: RetentionPolicyConfig, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_retention_policy_creation() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - assert_eq!(engine.config().default_retention_days, 30); - assert_eq!(engine.config().max_system_load, 0.6); - assert_eq!(engine.config().min_disk_space, 20.0); - } - - #[tokio::test] - async fn test_retention_policy_evaluation() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - // Test object within retention period - let object_age = Duration::from_secs(7 * 24 * 3600); // 7 days - let result = engine.evaluate(object_age, &context).await; - assert!(!result.allowed); - assert!(result.reason.contains("Object within retention period")); - - // Test object exceeding retention period - let object_age = Duration::from_secs(40 * 24 * 3600); // 40 days - let result = engine.evaluate(object_age, &context).await; - assert!(result.allowed); - assert!(result.reason.contains("Object age exceeds retention period")); - } - - #[tokio::test] - async fn test_retention_policy_system_constraints() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.7, // Above threshold - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let 
object_age = Duration::from_secs(40 * 24 * 3600); // 40 days - let result = engine.evaluate(object_age, &context).await; - assert!(!result.allowed); - assert!(result.reason.contains("System load too high")); - } - - #[tokio::test] - async fn test_retention_rules() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - // Test default retention - assert_eq!(engine.get_retention_days_for_object("unknown.txt"), 30); - - // Test log file retention - assert_eq!(engine.get_retention_days_for_object("app.log"), 7); - - // Test temp file retention - assert_eq!(engine.get_retention_days_for_object("temp/file.txt"), 1); - - // Test cache file retention - assert_eq!(engine.get_retention_days_for_object("cache/data.bin"), 3); - } - - #[tokio::test] - async fn test_pattern_matching() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - // Test wildcard matching - assert!(engine.matches_pattern("app.log", "*.log")); - assert!(engine.matches_pattern("error.log", "*.log")); - assert!(!engine.matches_pattern("app.txt", "*.log")); - - // Test exact matching - assert!(engine.matches_pattern("temp/file.txt", "temp/file.txt")); - assert!(!engine.matches_pattern("temp/file.txt", "temp/other.txt")); - } - - #[tokio::test] - async fn test_cleanup_evaluation() { - let config = RetentionPolicyConfig::default(); - let engine = RetentionPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate_cleanup(&context).await; - // Should be allowed if enough time has passed since last cleanup - assert!(result.allowed || result.reason.contains("Cleanup interval not reached")); - } - - #[tokio::test] - async fn test_retention_statistics() { - let config = 
RetentionPolicyConfig::default(); - let mut engine = RetentionPolicyEngine::new(config); - - assert_eq!(engine.get_statistics().total_cleanups, 0); - assert_eq!(engine.get_statistics().total_objects_deleted, 0); - - engine.record_cleanup(50); - assert_eq!(engine.get_statistics().total_cleanups, 1); - assert_eq!(engine.get_statistics().total_objects_deleted, 50); - - engine.record_cleanup(30); - assert_eq!(engine.get_statistics().total_cleanups, 2); - assert_eq!(engine.get_statistics().total_objects_deleted, 80); - } - - #[tokio::test] - async fn test_retention_rule_management() { - let config = RetentionPolicyConfig::default(); - let mut engine = RetentionPolicyEngine::new(config); - - let initial_rules = engine.get_retention_rules().len(); - - // Add a new rule - let new_rule = RetentionRule { - pattern: "backup/*".to_string(), - retention_days: 90, - enabled: true, - priority: 4, - recursive: true, - }; - engine.add_retention_rule(new_rule); - - assert_eq!(engine.get_retention_rules().len(), initial_rules + 1); - - // Remove a rule - let removed = engine.remove_retention_rule("*.log"); - assert!(removed); - assert_eq!(engine.get_retention_rules().len(), initial_rules); - } -} \ No newline at end of file diff --git a/crates/ahm/src/policy/scan_policy.rs b/crates/ahm/src/policy/scan_policy.rs deleted file mode 100644 index 44e3fc14..00000000 --- a/crates/ahm/src/policy/scan_policy.rs +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright 2024 RustFS Team - -use std::time::{Duration, SystemTime}; - -use crate::scanner::Severity; - -use super::{PolicyContext, PolicyResult, ResourceUsage}; - -/// Configuration for scan policies -#[derive(Debug, Clone)] -pub struct ScanPolicyConfig { - /// Maximum number of concurrent scans - pub max_concurrent_scans: usize, - /// Maximum scan duration per cycle - pub max_scan_duration: Duration, - /// Minimum interval between scans - pub min_scan_interval: Duration, - /// Maximum system load threshold for scanning - pub max_system_load: f64, - 
/// Minimum available disk space percentage for scanning - pub min_disk_space: f64, - /// Maximum number of active operations for scanning - pub max_active_operations: u64, - /// Whether to enable deep scanning - pub enable_deep_scan: bool, - /// Deep scan interval (how often to perform deep scans) - pub deep_scan_interval: Duration, - /// Bandwidth limit for scanning (bytes per second) - pub bandwidth_limit: Option, - /// Priority-based scanning configuration - pub priority_config: ScanPriorityConfig, -} - -/// Priority-based scanning configuration -#[derive(Debug, Clone)] -pub struct ScanPriorityConfig { - /// Whether to enable priority-based scanning - pub enabled: bool, - /// Critical issues scan interval - pub critical_interval: Duration, - /// High priority issues scan interval - pub high_interval: Duration, - /// Medium priority issues scan interval - pub medium_interval: Duration, - /// Low priority issues scan interval - pub low_interval: Duration, -} - -impl Default for ScanPolicyConfig { - fn default() -> Self { - Self { - max_concurrent_scans: 4, - max_scan_duration: Duration::from_secs(3600), // 1 hour - min_scan_interval: Duration::from_secs(300), // 5 minutes - max_system_load: 0.8, - min_disk_space: 10.0, // 10% minimum disk space - max_active_operations: 100, - enable_deep_scan: true, - deep_scan_interval: Duration::from_secs(86400), // 24 hours - bandwidth_limit: Some(100 * 1024 * 1024), // 100 MB/s - priority_config: ScanPriorityConfig::default(), - } - } -} - -impl Default for ScanPriorityConfig { - fn default() -> Self { - Self { - enabled: true, - critical_interval: Duration::from_secs(60), // 1 minute - high_interval: Duration::from_secs(300), // 5 minutes - medium_interval: Duration::from_secs(1800), // 30 minutes - low_interval: Duration::from_secs(3600), // 1 hour - } - } -} - -/// Scan policy engine -pub struct ScanPolicyEngine { - config: ScanPolicyConfig, - last_scan_time: SystemTime, - last_deep_scan_time: SystemTime, - scan_count: 
u64, -} - -impl ScanPolicyEngine { - /// Create a new scan policy engine - pub fn new(config: ScanPolicyConfig) -> Self { - Self { - config, - last_scan_time: SystemTime::now(), - last_deep_scan_time: SystemTime::now(), - scan_count: 0, - } - } - - /// Get the configuration - pub fn config(&self) -> &ScanPolicyConfig { - &self.config - } - - /// Evaluate scan policy - pub async fn evaluate(&self, context: &PolicyContext) -> PolicyResult { - let mut reasons = Vec::new(); - let mut allowed = true; - - // Check system load - if context.system_load > self.config.max_system_load { - allowed = false; - reasons.push(format!( - "System load too high: {:.2} > {:.2}", - context.system_load, self.config.max_system_load - )); - } - - // Check disk space - if context.disk_space_available < self.config.min_disk_space { - allowed = false; - reasons.push(format!( - "Disk space too low: {:.1}% < {:.1}%", - context.disk_space_available, self.config.min_disk_space - )); - } - - // Check active operations - if context.active_operations > self.config.max_active_operations { - allowed = false; - reasons.push(format!( - "Too many active operations: {} > {}", - context.active_operations, self.config.max_active_operations - )); - } - - // Check scan interval - let time_since_last_scan = context.current_time - .duration_since(self.last_scan_time) - .unwrap_or(Duration::ZERO); - - if time_since_last_scan < self.config.min_scan_interval { - allowed = false; - reasons.push(format!( - "Scan interval too short: {:?} < {:?}", - time_since_last_scan, self.config.min_scan_interval - )); - } - - // Check resource usage - if context.resource_usage.cpu_usage > 90.0 { - allowed = false; - reasons.push("CPU usage too high".to_string()); - } - - if context.resource_usage.memory_usage > 90.0 { - allowed = false; - reasons.push("Memory usage too high".to_string()); - } - - let reason = if reasons.is_empty() { - "Scan allowed".to_string() - } else { - reasons.join("; ") - }; - - PolicyResult { - allowed, - 
reason, - metadata: Some(serde_json::json!({ - "scan_count": self.scan_count, - "time_since_last_scan": time_since_last_scan.as_secs(), - "system_load": context.system_load, - "disk_space_available": context.disk_space_available, - "active_operations": context.active_operations, - })), - evaluated_at: context.current_time, - } - } - - /// Evaluate deep scan policy - pub async fn evaluate_deep_scan(&self, context: &PolicyContext) -> PolicyResult { - let mut base_result = self.evaluate(context).await; - - if !base_result.allowed { - return base_result; - } - - // Check deep scan interval - let time_since_last_deep_scan = context.current_time - .duration_since(self.last_deep_scan_time) - .unwrap_or(Duration::ZERO); - - if time_since_last_deep_scan < self.config.deep_scan_interval { - base_result.allowed = false; - base_result.reason = format!( - "Deep scan interval too short: {:?} < {:?}", - time_since_last_deep_scan, self.config.deep_scan_interval - ); - } else { - base_result.reason = "Deep scan allowed".to_string(); - } - - // Add deep scan metadata - if let Some(ref mut metadata) = base_result.metadata { - if let Some(obj) = metadata.as_object_mut() { - obj.insert( - "time_since_last_deep_scan".to_string(), - serde_json::Value::Number(serde_json::Number::from(time_since_last_deep_scan.as_secs())), - ); - obj.insert( - "deep_scan_enabled".to_string(), - serde_json::Value::Bool(self.config.enable_deep_scan), - ); - } - } - - base_result - } - - /// Get scan interval based on priority - pub fn get_priority_interval(&self, severity: Severity) -> Duration { - if !self.config.priority_config.enabled { - return self.config.min_scan_interval; - } - - match severity { - Severity::Critical => self.config.priority_config.critical_interval, - Severity::High => self.config.priority_config.high_interval, - Severity::Medium => self.config.priority_config.medium_interval, - Severity::Low => self.config.priority_config.low_interval, - } - } - - /// Update scan statistics - pub fn 
record_scan(&mut self) { - self.last_scan_time = SystemTime::now(); - self.scan_count += 1; - } - - /// Update deep scan statistics - pub fn record_deep_scan(&mut self) { - self.last_deep_scan_time = SystemTime::now(); - } - - /// Get scan statistics - pub fn get_statistics(&self) -> ScanPolicyStatistics { - ScanPolicyStatistics { - total_scans: self.scan_count, - last_scan_time: self.last_scan_time, - last_deep_scan_time: self.last_deep_scan_time, - config: self.config.clone(), - } - } -} - -/// Scan policy statistics -#[derive(Debug, Clone)] -pub struct ScanPolicyStatistics { - pub total_scans: u64, - pub last_scan_time: SystemTime, - pub last_deep_scan_time: SystemTime, - pub config: ScanPolicyConfig, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::scanner::Severity; - - #[tokio::test] - async fn test_scan_policy_creation() { - let config = ScanPolicyConfig::default(); - let engine = ScanPolicyEngine::new(config); - - assert_eq!(engine.config().max_concurrent_scans, 4); - assert_eq!(engine.config().max_system_load, 0.8); - assert_eq!(engine.config().min_disk_space, 10.0); - } - - #[tokio::test] - async fn test_scan_policy_evaluation() { - let config = ScanPolicyConfig::default(); - let engine = ScanPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&context).await; - assert!(result.allowed); - assert!(result.reason.contains("Scan allowed")); - } - - #[tokio::test] - async fn test_scan_policy_system_load_limit() { - let config = ScanPolicyConfig::default(); - let engine = ScanPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.9, // Above threshold - disk_space_available: 80.0, - active_operations: 10, - current_time: SystemTime::now(), - health_issues: 
std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&context).await; - assert!(!result.allowed); - assert!(result.reason.contains("System load too high")); - } - - #[tokio::test] - async fn test_scan_policy_disk_space_limit() { - let config = ScanPolicyConfig::default(); - let engine = ScanPolicyEngine::new(config); - - let context = PolicyContext { - system_load: 0.5, - disk_space_available: 5.0, // Below threshold - active_operations: 10, - current_time: SystemTime::now(), - health_issues: std::collections::HashMap::new(), - resource_usage: ResourceUsage::default(), - }; - - let result = engine.evaluate(&context).await; - assert!(!result.allowed); - assert!(result.reason.contains("Disk space too low")); - } - - #[tokio::test] - async fn test_priority_intervals() { - let config = ScanPolicyConfig::default(); - let engine = ScanPolicyEngine::new(config); - - assert_eq!( - engine.get_priority_interval(Severity::Critical), - Duration::from_secs(60) - ); - assert_eq!( - engine.get_priority_interval(Severity::High), - Duration::from_secs(300) - ); - assert_eq!( - engine.get_priority_interval(Severity::Medium), - Duration::from_secs(1800) - ); - assert_eq!( - engine.get_priority_interval(Severity::Low), - Duration::from_secs(3600) - ); - } - - #[tokio::test] - async fn test_scan_statistics() { - let config = ScanPolicyConfig::default(); - let mut engine = ScanPolicyEngine::new(config); - - assert_eq!(engine.get_statistics().total_scans, 0); - - engine.record_scan(); - assert_eq!(engine.get_statistics().total_scans, 1); - - engine.record_deep_scan(); - let stats = engine.get_statistics(); - assert_eq!(stats.total_scans, 1); - assert!(stats.last_deep_scan_time > stats.last_scan_time); - } -} \ No newline at end of file diff --git a/crates/ahm/src/scanner.rs b/crates/ahm/src/scanner.rs new file mode 100644 index 00000000..52f92bba --- /dev/null +++ b/crates/ahm/src/scanner.rs @@ -0,0 +1,902 @@ +// Copyright 
2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, SystemTime},
};

use rustfs_ecstore as ecstore;
use ecstore::{
    disk::{DiskAPI, DiskStore, WalkDirOptions},
    set_disk::SetDisks,
};
use rustfs_filemeta::MetacacheReader;
use tokio::sync::RwLock;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};

use crate::{
    error::{Error, Result},
    metrics::{BucketMetrics, DiskMetrics, MetricsCollector, ScannerMetrics},
};

/// Custom scan mode enum for the AHM scanner.
///
/// `Default` is derived with the `#[default]` variant attribute instead of a
/// hand-written `impl Default` — same behavior, less code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ScanMode {
    /// Normal scan - basic object discovery and metadata collection.
    #[default]
    Normal,
    /// Deep scan - includes EC verification and integrity checks.
    Deep,
}

/// Scanner configuration.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
    /// Scan interval between cycles.
    pub scan_interval: Duration,
    /// Deep scan interval (how often to perform a deep scan).
    pub deep_scan_interval: Duration,
    /// Maximum number of concurrent scans.
    pub max_concurrent_scans: usize,
    /// Whether to enable healing.
    pub enable_healing: bool,
    /// Whether to enable metrics collection.
    pub enable_metrics: bool,
    /// Current scan mode (normal or deep).
    pub scan_mode: ScanMode,
}

impl Default for ScannerConfig {
    fn default() -> Self {
        Self {
            scan_interval: Duration::from_secs(3600),      // 1 hour
            deep_scan_interval: Duration::from_secs(3600), // 1 hour
            max_concurrent_scans: 20,
            enable_healing: true,
            enable_metrics: true,
            scan_mode: ScanMode::Normal,
        }
    }
}

/// Scanner state.
///
/// Every field's initial value is its type's zero value (`false`, `0`, `None`,
/// empty `Vec`), so `Default` is derived rather than implemented by hand.
#[derive(Debug, Default)]
pub struct ScannerState {
    /// Whether scanner is running.
    pub is_running: bool,
    /// Current scan cycle.
    pub current_cycle: u64,
    /// Last scan start time.
    pub last_scan_start: Option<SystemTime>,
    /// Last scan end time.
    pub last_scan_end: Option<SystemTime>,
    /// Duration of the current (or most recent) scan.
    pub current_scan_duration: Option<Duration>,
    /// Last deep scan time.
    pub last_deep_scan_time: Option<SystemTime>,
    /// Buckets currently being scanned.
    pub scanning_buckets: Vec<String>,
    /// Disks currently being scanned.
    pub scanning_disks: Vec<String>,
}

/// AHM Scanner - Automatic Health Management Scanner
///
/// This scanner monitors the health of objects in the RustFS storage system.
/// It integrates with ECStore's SetDisks to perform real data scanning and
/// collects metrics similar to MinIO's scanner.
///
/// The scanner operates on EC (Erasure Coding) sets, where each set contains
/// multiple disks that store the same objects with different shards.
+pub struct Scanner { + /// Scanner configuration + config: Arc>, + /// Scanner state + state: Arc>, + /// Metrics collector + metrics: Arc, + /// Bucket metrics cache + bucket_metrics: Arc>>, + /// Disk metrics cache + disk_metrics: Arc>>, + /// EC Set disks - represents a complete erasure coding set + set_disks: Arc, + /// Cancellation token for graceful shutdown + cancel_token: CancellationToken, +} + +impl Scanner { + /// Create a new scanner + pub fn new(set_disks: Arc, config: Option) -> Self { + let config = config.unwrap_or_default(); + let cancel_token = CancellationToken::new(); + + info!("Creating AHM scanner for EC set with {} disks", set_disks.set_drive_count); + + Self { + config: Arc::new(RwLock::new(config)), + state: Arc::new(RwLock::new(ScannerState::default())), + metrics: Arc::new(MetricsCollector::new()), + bucket_metrics: Arc::new(RwLock::new(HashMap::new())), + disk_metrics: Arc::new(RwLock::new(HashMap::new())), + set_disks, + cancel_token, + } + } + + /// Start the scanner + pub async fn start(&self) -> Result<()> { + let mut state = self.state.write().await; + + if state.is_running { + warn!("Scanner is already running"); + return Ok(()); + } + + state.is_running = true; + state.last_scan_start = Some(SystemTime::now()); + + info!("Starting AHM scanner"); + + // Start background scan loop + let scanner = self.clone_for_background(); + tokio::spawn(async move { + if let Err(e) = scanner.scan_loop().await { + error!("Scanner loop failed: {}", e); + } + }); + + Ok(()) + } + + /// Stop the scanner gracefully + pub async fn stop(&self) -> Result<()> { + let mut state = self.state.write().await; + + if !state.is_running { + warn!("Scanner is not running"); + return Ok(()); + } + + info!("Stopping AHM scanner gracefully..."); + + // Trigger cancellation + self.cancel_token.cancel(); + + state.is_running = false; + state.last_scan_end = Some(SystemTime::now()); + + if let Some(start_time) = state.last_scan_start { + state.current_scan_duration = 
Some( + SystemTime::now() + .duration_since(start_time) + .unwrap_or(Duration::ZERO) + ); + } + + info!("AHM scanner stopped"); + Ok(()) + } + + /// Get a clone of the cancellation token (for external graceful shutdown) + pub fn cancellation_token(&self) -> CancellationToken { + self.cancel_token.clone() + } + + /// Get current scanner metrics + pub async fn get_metrics(&self) -> ScannerMetrics { + let mut metrics = self.metrics.get_metrics(); + + // Add bucket metrics + let bucket_metrics = self.bucket_metrics.read().await; + metrics.bucket_metrics = bucket_metrics.clone(); + + // Add disk metrics + let disk_metrics = self.disk_metrics.read().await; + metrics.disk_metrics = disk_metrics.clone(); + + // Add current scan duration + let state = self.state.read().await; + metrics.current_scan_duration = state.current_scan_duration; + + metrics + } + + /// Perform a single scan cycle + pub async fn scan_cycle(&self) -> Result<()> { + let start_time = SystemTime::now(); + + info!("Starting scan cycle {}", self.metrics.get_metrics().current_cycle + 1); + + // Update state + { + let mut state = self.state.write().await; + state.current_cycle += 1; + state.last_scan_start = Some(start_time); + state.scanning_buckets.clear(); + state.scanning_disks.clear(); + } + + self.metrics.set_current_cycle(self.state.read().await.current_cycle); + self.metrics.increment_total_cycles(); + + // Get online disks from the EC set + let (disks, _) = self.set_disks.get_online_disks_with_healing(false).await; + + if disks.is_empty() { + warn!("No online disks available for scanning"); + return Ok(()); + } + + info!("Scanning {} online disks", disks.len()); + + // Phase 1: Scan all disks concurrently to collect object metadata + let config = self.config.read().await; + let semaphore = Arc::new(tokio::sync::Semaphore::new(config.max_concurrent_scans)); + drop(config); + let mut scan_futures = Vec::new(); + + for disk in disks.clone() { + let semaphore = semaphore.clone(); + let scanner = 
self.clone_for_background(); + + let future = async move { + let _permit = semaphore.acquire().await.unwrap(); + scanner.scan_disk(&disk).await + }; + + scan_futures.push(future); + } + + // Wait for all scans to complete + let mut results = Vec::new(); + for future in scan_futures { + results.push(future.await); + } + + // Check results and collect object metadata + let mut successful_scans = 0; + let mut failed_scans = 0; + let mut all_disk_objects = Vec::new(); + + for result in results { + match result { + Ok(disk_objects) => { + successful_scans += 1; + all_disk_objects.push(disk_objects); + } + Err(e) => { + failed_scans += 1; + error!("Disk scan failed: {}", e); + // Add empty map for failed disk + all_disk_objects.push(HashMap::new()); + } + } + + } + + // Phase 2: Analyze object distribution and perform EC verification + if successful_scans > 0 { + if let Err(e) = self.analyze_object_distribution(&all_disk_objects, &disks).await { + error!("Object distribution analysis failed: {}", e); + } + } + + // Update scan duration + let scan_duration = SystemTime::now() + .duration_since(start_time) + .unwrap_or(Duration::ZERO); + + { + let mut state = self.state.write().await; + state.last_scan_end = Some(SystemTime::now()); + state.current_scan_duration = Some(scan_duration); + } + + info!("Completed scan cycle in {:?} ({} successful, {} failed)", + scan_duration, successful_scans, failed_scans); + Ok(()) + } + + /// Scan a single disk + async fn scan_disk(&self, disk: &DiskStore) -> Result>> { + let disk_path = disk.path().to_string_lossy().to_string(); + + info!("Scanning disk: {}", disk_path); + + // Update disk metrics + { + let mut disk_metrics = self.disk_metrics.write().await; + let metrics = disk_metrics.entry(disk_path.clone()).or_insert_with(|| DiskMetrics { + disk_path: disk_path.clone(), + ..Default::default() + }); + + metrics.is_scanning = true; + metrics.last_scan_time = Some(SystemTime::now()); + + // Get disk info + if let Ok(disk_info) = 
disk.disk_info(&ecstore::disk::DiskInfoOptions { + disk_id: disk_path.clone(), + metrics: true, + noop: false, + }).await { + metrics.total_space = disk_info.total; + metrics.used_space = disk_info.used; + metrics.free_space = disk_info.free; + metrics.is_online = disk.is_online().await; + } + } + + // Update state + { + let mut state = self.state.write().await; + state.scanning_disks.push(disk_path.clone()); + } + + // List volumes (buckets) on this disk + let volumes = match disk.list_volumes().await { + Ok(volumes) => volumes, + Err(e) => { + error!("Failed to list volumes on disk {}: {}", disk_path, e); + return Err(Error::Storage(e.into())); + } + }; + + // Scan each volume and collect object metadata + let mut disk_objects = HashMap::new(); + for volume in volumes { + // 检查取消信号 + if self.cancel_token.is_cancelled() { + info!("Cancellation requested, stopping disk scan"); + break; + } + + match self.scan_volume(disk, &volume.name).await { + Ok(object_metadata) => { + disk_objects.insert(volume.name, object_metadata); + } + Err(e) => { + error!("Failed to scan volume {} on disk {}: {}", volume.name, disk_path, e); + continue; + } + } + } + + // Update disk metrics after scan + { + let mut disk_metrics = self.disk_metrics.write().await; + if let Some(metrics) = disk_metrics.get_mut(&disk_path) { + metrics.is_scanning = false; + } + } + + // Update state + { + let mut state = self.state.write().await; + state.scanning_disks.retain(|d| d != &disk_path); + } + + Ok(disk_objects) + } + + /// Scan a single volume (bucket) and collect object information + /// + /// This method collects all objects from a disk for a specific bucket. + /// It returns a map of object names to their metadata for later analysis. 
+ async fn scan_volume(&self, disk: &DiskStore, bucket: &str) -> Result> { + info!("Scanning bucket: {} on disk: {}", bucket, disk.to_string()); + + // Update bucket metrics + { + let mut bucket_metrics = self.bucket_metrics.write().await; + let metrics = bucket_metrics.entry(bucket.to_string()).or_insert_with(|| BucketMetrics { + bucket: bucket.to_string(), + ..Default::default() + }); + + metrics.last_scan_time = Some(SystemTime::now()); + } + + // Update state + { + let mut state = self.state.write().await; + state.scanning_buckets.push(bucket.to_string()); + } + + self.metrics.increment_bucket_scans_started(1); + + let scan_start = SystemTime::now(); + + // Walk through all objects in the bucket + let walk_opts = WalkDirOptions { + bucket: bucket.to_string(), + base_dir: String::new(), + recursive: true, + report_notfound: false, + filter_prefix: None, + forward_to: None, + limit: 0, + disk_id: String::new(), + }; + + // Use a buffer to collect scan results for processing + let mut scan_buffer = Vec::new(); + + if let Err(e) = disk.walk_dir(walk_opts, &mut scan_buffer).await { + error!("Failed to walk directory for bucket {}: {}", bucket, e); + return Err(Error::Storage(e.into())); + } + + // Process the scan results using MetacacheReader + let mut reader = MetacacheReader::new(std::io::Cursor::new(scan_buffer)); + let mut objects_scanned = 0u64; + let mut objects_with_issues = 0u64; + let mut object_metadata = HashMap::new(); + + // Process each object entry + while let Ok(Some(mut entry)) = reader.peek().await { + objects_scanned += 1; + // Check if this is an actual object (not just a directory) + if entry.is_object() { + debug!("Scanned object: {}", entry.name); + + // Parse object metadata + if let Ok(file_meta) = entry.xl_meta() { + if file_meta.versions.is_empty() { + objects_with_issues += 1; + warn!("Object {} has no versions", entry.name); + } else { + // Store object metadata for later analysis + object_metadata.insert(entry.name.clone(), file_meta); 
+ } + } else { + objects_with_issues += 1; + warn!("Failed to parse metadata for object {}", entry.name); + } + } + } + + // Update metrics + self.metrics.increment_objects_scanned(objects_scanned); + self.metrics.increment_objects_with_issues(objects_with_issues); + self.metrics.increment_bucket_scans_finished(1); + + // Update bucket metrics + { + let mut bucket_metrics = self.bucket_metrics.write().await; + if let Some(metrics) = bucket_metrics.get_mut(bucket) { + metrics.total_objects = objects_scanned; + metrics.objects_with_issues = objects_with_issues; + metrics.scan_duration = Some( + SystemTime::now() + .duration_since(scan_start) + .unwrap_or(Duration::ZERO) + ); + } + } + + // Update state + { + let mut state = self.state.write().await; + state.scanning_buckets.retain(|b| b != bucket); + } + + debug!("Completed scanning bucket: {} on disk {} ({} objects, {} issues)", + bucket, disk.to_string(), objects_scanned, objects_with_issues); + + Ok(object_metadata) + } + + /// Analyze object distribution across all disks and perform EC verification + /// + /// This method takes the collected object metadata from all disks and: + /// 1. Creates a union of all objects across all disks + /// 2. Identifies missing objects on each disk (for healing) + /// 3. 
Performs EC decode verification for deep scan mode + async fn analyze_object_distribution( + &self, + all_disk_objects: &[HashMap>], + disks: &[DiskStore], + ) -> Result<()> { + info!("Analyzing object distribution across {} disks", disks.len()); + + // Step 1: Create union of all objects across all disks + let mut all_objects = HashMap::new(); // bucket -> Set + let mut object_locations = HashMap::new(); // (bucket, object) -> Vec + + for (disk_idx, disk_objects) in all_disk_objects.iter().enumerate() { + for (bucket, objects) in disk_objects { + // Add bucket to all_objects + let bucket_objects = all_objects.entry(bucket.clone()).or_insert_with(|| std::collections::HashSet::new()); + + for (object_name, _file_meta) in objects { + bucket_objects.insert(object_name.clone()); + + // Record which disk has this object + let key = (bucket.clone(), object_name.clone()); + let locations = object_locations.entry(key).or_insert_with(|| Vec::new()); + locations.push(disk_idx); + } + } + } + + info!("Found {} buckets with {} total objects", all_objects.len(), + all_objects.values().map(|s| s.len()).sum::()); + + // Step 2: Identify missing objects and perform EC verification + let mut objects_needing_heal = 0u64; + let mut objects_with_ec_issues = 0u64; + + for (bucket, objects) in &all_objects { + for object_name in objects { + let key = (bucket.clone(), object_name.clone()); + let empty_vec = Vec::new(); + let locations = object_locations.get(&key).unwrap_or(&empty_vec); + + // Check if object is missing from some disks + if locations.len() < disks.len() { + objects_needing_heal += 1; + let missing_disks: Vec = (0..disks.len()) + .filter(|&i| !locations.contains(&i)) + .collect(); + warn!("Object {}/{} missing from disks: {:?}", bucket, object_name, missing_disks); + println!("Object {}/{} missing from disks: {:?}", bucket, object_name, missing_disks); + // TODO: Trigger heal for this object + } + + // Step 3: Deep scan EC verification + let config = 
self.config.read().await; + if config.scan_mode == ScanMode::Deep { + // Find the first disk that has this object to get metadata + if let Some(&first_disk_idx) = locations.first() { + if let Some(file_meta) = all_disk_objects[first_disk_idx] + .get(bucket) + .and_then(|objects| objects.get(object_name)) + { + if let Err(e) = self.verify_ec_decode_with_locations( + bucket, object_name, file_meta, locations, disks + ).await { + objects_with_ec_issues += 1; + warn!("EC decode verification failed for object {}/{}: {}", bucket, object_name, e); + } + } + } + } + } + } + + info!("Analysis complete: {} objects need healing, {} objects have EC issues", + objects_needing_heal, objects_with_ec_issues); + + Ok(()) + } + + /// Verify EC decode capability for an object using known disk locations + /// + /// This method is optimized to use the known locations of object copies + /// instead of scanning all disks. + async fn verify_ec_decode_with_locations( + &self, + bucket: &str, + object: &str, + file_meta: &rustfs_filemeta::FileMeta, + locations: &[usize], + all_disks: &[DiskStore], + ) -> Result<()> { + // Get EC parameters from the latest version + let (data_blocks, _parity_blocks) = if let Some(latest_version) = file_meta.versions.last() { + if let Ok(version) = rustfs_filemeta::FileMetaVersion::try_from(latest_version.clone()) { + if let Some(obj) = version.object { + (obj.erasure_m, obj.erasure_n) + } else { + // Not an object version, skip EC verification + return Ok(()); + } + } else { + // Cannot parse version, skip EC verification + return Ok(()); + } + } else { + // No versions, skip EC verification + return Ok(()); + }; + + let read_quorum = data_blocks; // Need at least data_blocks to decode + + if locations.len() < read_quorum { + return Err(Error::Scanner(format!( + "Insufficient object copies for EC decode: need {}, have {}", + read_quorum, locations.len() + ))); + } + + // Try to read object metadata from the known locations + let mut successful_reads = 0; + 
let mut errors = Vec::new(); + + for &disk_idx in locations { + if successful_reads >= read_quorum { + break; // We have enough copies for EC decode + } + + let disk = &all_disks[disk_idx]; + match disk.read_xl(bucket, object, false).await { + Ok(_) => { + successful_reads += 1; + debug!("Successfully read object {}/{} from disk {} (index: {})", + bucket, object, disk.to_string(), disk_idx); + } + Err(e) => { + let error_msg = format!("{}", e); + errors.push(error_msg); + debug!("Failed to read object {}/{} from disk {} (index: {}): {}", + bucket, object, disk.to_string(), disk_idx, e); + } + } + } + + if successful_reads >= read_quorum { + debug!("EC decode verification passed for object {}/{} ({} successful reads from {} locations)", + bucket, object, successful_reads, locations.len()); + Ok(()) + } else { + Err(Error::Scanner(format!( + "EC decode verification failed for object {}/{}: need {} reads, got {} (errors: {:?})", + bucket, object, read_quorum, successful_reads, errors + ))) + } + } + + /// Background scan loop with graceful shutdown + async fn scan_loop(self) -> Result<()> { + let config = self.config.read().await; + let mut interval = tokio::time::interval(config.scan_interval); + let deep_scan_interval = config.deep_scan_interval; + drop(config); + let cancel_token = self.cancel_token.clone(); + + loop { + tokio::select! 
{ + _ = interval.tick() => { + // Check if scanner should still be running + if !self.state.read().await.is_running { + break; + } + + // 检查取消信号 + if cancel_token.is_cancelled() { + info!("Cancellation requested, exiting scanner loop"); + break; + } + + // Determine if it's time for a deep scan + let current_time = SystemTime::now(); + let last_deep_scan_time = self.state.read().await.last_deep_scan_time.unwrap_or(SystemTime::UNIX_EPOCH); + + if current_time.duration_since(last_deep_scan_time).unwrap_or(Duration::ZERO) >= deep_scan_interval { + info!("Deep scan interval reached, switching to deep scan mode"); + self.config.write().await.scan_mode = ScanMode::Deep; + self.state.write().await.last_deep_scan_time = Some(current_time); + } + + // Perform scan cycle + if let Err(e) = self.scan_cycle().await { + error!("Scan cycle failed: {}", e); + } + } + _ = cancel_token.cancelled() => { + info!("Received cancellation, stopping scanner loop"); + break; + } + } + } + + info!("Scanner loop stopped"); + Ok(()) + } + + /// Clone scanner for background tasks + fn clone_for_background(&self) -> Self { + Self { + config: self.config.clone(), + state: Arc::clone(&self.state), + metrics: Arc::clone(&self.metrics), + bucket_metrics: Arc::clone(&self.bucket_metrics), + disk_metrics: Arc::clone(&self.disk_metrics), + set_disks: Arc::clone(&self.set_disks), + cancel_token: self.cancel_token.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rustfs_ecstore::store::ECStore; + use rustfs_ecstore::disk::endpoint::Endpoint; + use rustfs_ecstore::endpoints::{EndpointServerPools, PoolEndpoints, Endpoints}; + use rustfs_ecstore::{StorageAPI, store_api::{ObjectIO, MakeBucketOptions, PutObjReader}}; + use std::fs; + use std::net::SocketAddr; + + #[tokio::test(flavor = "multi_thread")] + async fn test_scanner_basic_functionality() { + // create temp dir as 4 disks + let temp_dir = std::path::PathBuf::from("/tmp/rustfs_ahm_test"); + if temp_dir.exists() { + 
fs::remove_dir_all(&temp_dir).unwrap(); + } + fs::create_dir_all(&temp_dir).unwrap(); + + // create 4 disk dirs + let disk_paths = vec![ + temp_dir.join("disk1"), + temp_dir.join("disk2"), + temp_dir.join("disk3"), + temp_dir.join("disk4"), + ]; + + for disk_path in &disk_paths { + fs::create_dir_all(disk_path).unwrap(); + } + + // create EndpointServerPools + let mut endpoints = Vec::new(); + for (i, disk_path) in disk_paths.iter().enumerate() { + let mut endpoint = Endpoint::try_from(disk_path.to_str().unwrap()).unwrap(); + // set correct index + endpoint.set_pool_index(0); + endpoint.set_set_index(0); + endpoint.set_disk_index(i); + endpoints.push(endpoint); + } + + let pool_endpoints = PoolEndpoints { + legacy: false, + set_count: 1, + drives_per_set: 4, + endpoints: Endpoints::from(endpoints), + cmd_line: "test".to_string(), + platform: format!("OS: {} | Arch: {}", std::env::consts::OS, std::env::consts::ARCH), + }; + + let endpoint_pools = EndpointServerPools(vec![pool_endpoints]); + + // format disks + rustfs_ecstore::store::init_local_disks(endpoint_pools.clone()).await.unwrap(); + + // create ECStore + let server_addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); + let ecstore = ECStore::new(server_addr, endpoint_pools).await.unwrap(); + + // init bucket metadata system + let buckets_list = ecstore + .list_bucket(&rustfs_ecstore::store_api::BucketOptions { + no_metadata: true, + ..Default::default() + }) + .await + .unwrap(); + let buckets = buckets_list.into_iter().map(|v| v.name).collect(); + rustfs_ecstore::bucket::metadata_sys::init_bucket_metadata_sys(ecstore.clone(), buckets).await; + + // get first SetDisks + let set_disks = ecstore.pools[0].get_disks(0); + + // create some test data + let bucket_name = "test-bucket"; + let object_name = "test-object"; + let test_data = b"Hello, RustFS!"; + + // create bucket and verify + let bucket_opts = MakeBucketOptions::default(); + ecstore.make_bucket(bucket_name, &bucket_opts).await.expect("make_bucket 
failed"); + + // check bucket really exists + let buckets = ecstore.list_bucket(&rustfs_ecstore::store_api::BucketOptions::default()).await.unwrap(); + assert!(buckets.iter().any(|b| b.name == bucket_name), "bucket not found after creation"); + + // write object + let mut put_reader = PutObjReader::from_vec(test_data.to_vec()); + let object_opts = rustfs_ecstore::store_api::ObjectOptions::default(); + ecstore.put_object(bucket_name, object_name, &mut put_reader, &object_opts).await.expect("put_object failed"); + + // create Scanner and test basic functionality + let scanner = Scanner::new(set_disks, None); + + // Test 1: Normal scan - verify object is found + println!("=== Test 1: Normal scan ==="); + let scan_result = scanner.scan_cycle().await; + assert!(scan_result.is_ok(), "Normal scan should succeed"); + let metrics = scanner.get_metrics().await; + assert!(metrics.objects_scanned > 0, "Objects scanned should be positive"); + println!("Normal scan completed successfully"); + + // Test 2: Simulate disk corruption - delete object data from disk1 + println!("=== Test 2: Simulate disk corruption ==="); + let disk1_bucket_path = disk_paths[0].join(bucket_name); + let disk1_object_path = disk1_bucket_path.join(object_name); + + // Try to delete the object file from disk1 (simulate corruption) + // Note: This might fail if ECStore is actively using the file + match fs::remove_dir_all(&disk1_object_path) { + Ok(_) => { + println!("Successfully deleted object from disk1: {:?}", disk1_object_path); + + // Verify deletion by checking if the directory still exists + if disk1_object_path.exists() { + println!("WARNING: Directory still exists after deletion: {:?}", disk1_object_path); + } else { + println!("Confirmed: Directory was successfully deleted"); + } + } + Err(e) => { + println!("Could not delete object from disk1 (file may be in use): {:?} - {}", disk1_object_path, e); + // This is expected behavior - ECStore might be holding file handles + } + } + + // Scan again 
- should still complete (even with missing data) + let scan_result_after_corruption = scanner.scan_cycle().await; + println!("Scan after corruption result: {:?}", scan_result_after_corruption); + + // Scanner should handle missing data gracefully + assert!(scan_result_after_corruption.is_ok(), "Scanner should handle missing data gracefully"); + + // Test 3: Verify EC decode capability + println!("=== Test 3: Verify EC decode ==="); + // Note: EC decode verification is done internally during scan_cycle + // We can verify that the scanner handles missing data gracefully + println!("EC decode verification is handled internally during scan cycles"); + + // Test 4: Test metrics collection + println!("=== Test 4: Metrics collection ==="); + let final_metrics = scanner.get_metrics().await; + println!("Final metrics: {:?}", final_metrics); + + // Verify metrics are reasonable + assert!(final_metrics.total_cycles > 0, "Should have completed scan cycles"); + assert!(final_metrics.last_activity.is_some(), "Should have scan activity"); + + // clean up temp dir + // if let Err(e) = fs::remove_dir_all(&temp_dir) { + // eprintln!("Warning: Failed to clean up temp directory {:?}: {}", temp_dir, e); + // } + } +} + diff --git a/crates/ahm/src/scanner/bandwidth_limiter.rs b/crates/ahm/src/scanner/bandwidth_limiter.rs deleted file mode 100644 index cf0e6a8d..00000000 --- a/crates/ahm/src/scanner/bandwidth_limiter.rs +++ /dev/null @@ -1,353 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::{Duration, Instant}, -}; - -use tokio::{ - sync::RwLock, - time::{sleep, sleep_until, Instant as TokioInstant}, -}; -use tracing::{debug, info, warn}; - -use crate::error::Result; - -/// Configuration for bandwidth limiting -#[derive(Debug, Clone)] -pub struct BandwidthConfig { - /// Maximum bytes per second - pub bytes_per_second: u64, - /// Maximum operations per second - pub operations_per_second: u64, - /// Burst allowance multiplier - pub burst_multiplier: f64, - /// Whether to enable adaptive throttling - pub adaptive_throttling: bool, - /// Minimum sleep duration between operations - pub min_sleep_duration: Duration, - /// Maximum sleep duration between operations - pub max_sleep_duration: Duration, -} - -impl Default for BandwidthConfig { - fn default() -> Self { - Self { - bytes_per_second: 100 * 1024 * 1024, // 100 MB/s - operations_per_second: 1000, // 1000 ops/s - burst_multiplier: 2.0, - adaptive_throttling: true, - min_sleep_duration: Duration::from_micros(100), - max_sleep_duration: Duration::from_millis(100), - } - } -} - -/// Bandwidth limiter for controlling scan I/O rates -pub struct BandwidthLimiter { - config: BandwidthConfig, - bytes_this_second: Arc, - operations_this_second: Arc, - last_reset: Arc>, - adaptive_sleep_duration: Arc>, - total_bytes_processed: Arc, - total_operations_processed: Arc, - start_time: Instant, -} - -impl BandwidthLimiter { - /// Create a new bandwidth limiter - pub fn new(config: BandwidthConfig) -> Self { - let adaptive_sleep = if config.adaptive_throttling { - config.min_sleep_duration - } else { - Duration::from_micros(1000) // 1ms default - }; - - Self { - config, - bytes_this_second: Arc::new(AtomicU64::new(0)), - operations_this_second: Arc::new(AtomicU64::new(0)), - last_reset: Arc::new(RwLock::new(Instant::now())), - 
adaptive_sleep_duration: Arc::new(RwLock::new(adaptive_sleep)), - total_bytes_processed: Arc::new(AtomicU64::new(0)), - total_operations_processed: Arc::new(AtomicU64::new(0)), - start_time: Instant::now(), - } - } - - /// Wait for bandwidth allowance before processing bytes - pub async fn wait_for_bytes(&self, bytes: u64) -> Result<()> { - if self.config.bytes_per_second == 0 { - return Ok(()); - } - - let mut total_wait_time = Duration::ZERO; - let mut remaining_bytes = bytes; - - while remaining_bytes > 0 { - // Reset counters if a second has passed - self.reset_counters_if_needed().await; - - let current_bytes = self.bytes_this_second.load(Ordering::Relaxed); - let burst_limit = (self.config.bytes_per_second as f64 * self.config.burst_multiplier) as u64; - - if current_bytes >= burst_limit { - // We're over the burst limit, wait - let wait_time = self.calculate_wait_time(current_bytes, self.config.bytes_per_second).await; - sleep(wait_time).await; - total_wait_time += wait_time; - continue; - } - - let bytes_to_process = std::cmp::min(remaining_bytes, burst_limit - current_bytes); - self.bytes_this_second.fetch_add(bytes_to_process, Ordering::Relaxed); - self.total_bytes_processed.fetch_add(bytes_to_process, Ordering::Relaxed); - remaining_bytes -= bytes_to_process; - - // Adaptive throttling - if self.config.adaptive_throttling { - self.update_adaptive_sleep(bytes_to_process).await; - } - } - - if total_wait_time > Duration::ZERO { - debug!("Bandwidth limiter waited {:?} for {} bytes", total_wait_time, bytes); - } - - Ok(()) - } - - /// Wait for bandwidth allowance before processing an operation - pub async fn wait_for_operation(&self) -> Result<()> { - if self.config.operations_per_second == 0 { - return Ok(()); - } - - // Reset counters if a second has passed - self.reset_counters_if_needed().await; - - let current_ops = self.operations_this_second.load(Ordering::Relaxed); - let burst_limit = (self.config.operations_per_second as f64 * 
self.config.burst_multiplier) as u64; - - if current_ops >= burst_limit { - // We're over the burst limit, wait - let wait_time = self.calculate_wait_time(current_ops, self.config.operations_per_second).await; - sleep(wait_time).await; - debug!("Bandwidth limiter waited {:?} for operation", wait_time); - } - - self.operations_this_second.fetch_add(1, Ordering::Relaxed); - self.total_operations_processed.fetch_add(1, Ordering::Relaxed); - - Ok(()) - } - - /// Wait for bandwidth allowance before processing both bytes and operations - pub async fn wait_for_bytes_and_operation(&self, bytes: u64) -> Result<()> { - self.wait_for_bytes(bytes).await?; - self.wait_for_operation().await?; - Ok(()) - } - - /// Reset counters if a second has passed - async fn reset_counters_if_needed(&self) { - let mut last_reset = self.last_reset.write().await; - let now = Instant::now(); - - if now.duration_since(*last_reset) >= Duration::from_secs(1) { - self.bytes_this_second.store(0, Ordering::Relaxed); - self.operations_this_second.store(0, Ordering::Relaxed); - *last_reset = now; - } - } - - /// Calculate wait time based on current usage and limit - async fn calculate_wait_time(&self, current: u64, limit: u64) -> Duration { - if current == 0 || limit == 0 { - return self.config.min_sleep_duration; - } - - let utilization = current as f64 / limit as f64; - let base_sleep = self.config.min_sleep_duration.as_micros() as f64; - let max_sleep = self.config.max_sleep_duration.as_micros() as f64; - - // Exponential backoff based on utilization - let sleep_micros = base_sleep * (utilization * utilization); - let sleep_micros = sleep_micros.min(max_sleep).max(base_sleep); - - Duration::from_micros(sleep_micros as u64) - } - - /// Update adaptive sleep duration based on recent activity - async fn update_adaptive_sleep(&self, bytes_processed: u64) { - let mut sleep_duration = self.adaptive_sleep_duration.write().await; - - // Simple adaptive algorithm: increase sleep if we're processing too much - 
let current_rate = bytes_processed as f64 / sleep_duration.as_secs_f64(); - let target_rate = self.config.bytes_per_second as f64; - - if current_rate > target_rate * 1.1 { - // We're going too fast, increase sleep - *sleep_duration = Duration::from_micros( - (sleep_duration.as_micros() as f64 * 1.1) as u64 - ).min(self.config.max_sleep_duration); - } else if current_rate < target_rate * 0.9 { - // We're going too slow, decrease sleep - *sleep_duration = Duration::from_micros( - (sleep_duration.as_micros() as f64 * 0.9) as u64 - ).max(self.config.min_sleep_duration); - } - } - - /// Get current bandwidth statistics - pub async fn statistics(&self) -> BandwidthStatistics { - let elapsed = self.start_time.elapsed(); - let total_bytes = self.total_bytes_processed.load(Ordering::Relaxed); - let total_ops = self.total_operations_processed.load(Ordering::Relaxed); - let current_bytes = self.bytes_this_second.load(Ordering::Relaxed); - let current_ops = self.operations_this_second.load(Ordering::Relaxed); - let adaptive_sleep = *self.adaptive_sleep_duration.read().await; - - BandwidthStatistics { - total_bytes_processed: total_bytes, - total_operations_processed: total_ops, - current_bytes_per_second: current_bytes, - current_operations_per_second: current_ops, - average_bytes_per_second: if elapsed.as_secs() > 0 { - total_bytes / elapsed.as_secs() - } else { - 0 - }, - average_operations_per_second: if elapsed.as_secs() > 0 { - total_ops / elapsed.as_secs() - } else { - 0 - }, - adaptive_sleep_duration: adaptive_sleep, - uptime: elapsed, - } - } - - /// Reset all statistics - pub async fn reset_statistics(&self) { - self.total_bytes_processed.store(0, Ordering::Relaxed); - self.total_operations_processed.store(0, Ordering::Relaxed); - self.bytes_this_second.store(0, Ordering::Relaxed); - self.operations_this_second.store(0, Ordering::Relaxed); - *self.last_reset.write().await = Instant::now(); - *self.adaptive_sleep_duration.write().await = 
self.config.min_sleep_duration; - } - - /// Update configuration - pub async fn update_config(&self, new_config: BandwidthConfig) { - info!("Updating bandwidth limiter config: {:?}", new_config); - - // Reset adaptive sleep if adaptive throttling is disabled - if !new_config.adaptive_throttling { - *self.adaptive_sleep_duration.write().await = new_config.min_sleep_duration; - } - - // Note: We can't update the config struct itself since it's not wrapped in Arc - // In a real implementation, you might want to wrap the config in Arc as well - warn!("Config update not fully implemented - config struct is not mutable"); - } -} - -/// Statistics for bandwidth limiting -#[derive(Debug, Clone)] -pub struct BandwidthStatistics { - pub total_bytes_processed: u64, - pub total_operations_processed: u64, - pub current_bytes_per_second: u64, - pub current_operations_per_second: u64, - pub average_bytes_per_second: u64, - pub average_operations_per_second: u64, - pub adaptive_sleep_duration: Duration, - pub uptime: Duration, -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::time::Instant as TokioInstant; - - #[tokio::test] - async fn test_bandwidth_limiter_creation() { - let config = BandwidthConfig::default(); - let limiter = BandwidthLimiter::new(config); - let stats = limiter.statistics().await; - assert_eq!(stats.total_bytes_processed, 0); - assert_eq!(stats.total_operations_processed, 0); - } - - #[tokio::test] - async fn test_bytes_limiting() { - let config = BandwidthConfig { - bytes_per_second: 1000, // 1KB/s - operations_per_second: 1000, - ..Default::default() - }; - let limiter = BandwidthLimiter::new(config); - - let start = TokioInstant::now(); - - // Process 500 bytes (should not be limited) - limiter.wait_for_bytes(500).await.unwrap(); - - // Process another 600 bytes (should be limited) - limiter.wait_for_bytes(600).await.unwrap(); - - let elapsed = start.elapsed(); - assert!(elapsed >= Duration::from_millis(100)); // Should take some time due to 
limiting - } - - #[tokio::test] - async fn test_operation_limiting() { - let config = BandwidthConfig { - bytes_per_second: 1000000, // 1MB/s - operations_per_second: 10, // 10 ops/s - ..Default::default() - }; - let limiter = BandwidthLimiter::new(config); - - let start = TokioInstant::now(); - - // Process 15 operations (should be limited) - for _ in 0..15 { - limiter.wait_for_operation().await.unwrap(); - } - - let elapsed = start.elapsed(); - assert!(elapsed >= Duration::from_millis(500)); // Should take some time due to limiting - } - - #[tokio::test] - async fn test_statistics() { - let config = BandwidthConfig::default(); - let limiter = BandwidthLimiter::new(config); - - limiter.wait_for_bytes(1000).await.unwrap(); - limiter.wait_for_operation().await.unwrap(); - - let stats = limiter.statistics().await; - assert_eq!(stats.total_bytes_processed, 1000); - assert_eq!(stats.total_operations_processed, 1); - assert!(stats.uptime > Duration::ZERO); - } -} \ No newline at end of file diff --git a/crates/ahm/src/scanner/disk_scanner.rs b/crates/ahm/src/scanner/disk_scanner.rs deleted file mode 100644 index 8d636107..00000000 --- a/crates/ahm/src/scanner/disk_scanner.rs +++ /dev/null @@ -1,591 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - path::Path, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{error, info}; -use anyhow; - -use crate::error::Result; -use super::{HealthIssue, HealthIssueType, Severity}; - -/// Configuration for disk scanning -#[derive(Debug, Clone)] -pub struct DiskScannerConfig { - /// Scan interval for disk health checks - pub scan_interval: Duration, - /// Minimum free space threshold (percentage) - pub min_free_space_percent: f64, - /// Maximum disk usage threshold (percentage) - pub max_disk_usage_percent: f64, - /// Minimum inode usage threshold (percentage) - pub min_inode_usage_percent: f64, - /// Maximum inode usage threshold (percentage) - pub max_inode_usage_percent: f64, - /// Whether to check disk I/O performance - pub check_io_performance: bool, - /// Whether to check disk temperature (if available) - pub check_temperature: bool, - /// Whether to check disk SMART status (if available) - pub check_smart_status: bool, - /// Timeout for individual disk operations - pub operation_timeout: Duration, - /// Maximum number of concurrent disk scans - pub max_concurrent_scans: usize, -} - -impl Default for DiskScannerConfig { - fn default() -> Self { - Self { - scan_interval: Duration::from_secs(300), // 5 minutes - min_free_space_percent: 10.0, // 10% minimum free space - max_disk_usage_percent: 90.0, // 90% maximum usage - min_inode_usage_percent: 5.0, // 5% minimum inode usage - max_inode_usage_percent: 95.0, // 95% maximum inode usage - check_io_performance: true, - check_temperature: false, // Disabled by default - check_smart_status: false, // Disabled by default - operation_timeout: Duration::from_secs(30), - max_concurrent_scans: 4, - } - } -} - -/// Disk information and health status -#[derive(Debug, Clone)] -pub struct DiskInfo { - pub device_path: String, - pub mount_point: String, - pub filesystem_type: String, - pub total_space: u64, - pub used_space: u64, - pub 
free_space: u64, - pub available_space: u64, - pub usage_percent: f64, - pub inode_total: Option, - pub inode_used: Option, - pub inode_free: Option, - pub inode_usage_percent: Option, - pub last_scan_time: SystemTime, - pub health_status: DiskHealthStatus, - pub performance_metrics: Option, - pub temperature: Option, - pub smart_status: Option, -} - -/// Disk health status -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum DiskHealthStatus { - Healthy, - Warning, - Critical, - Unknown, -} - -/// Disk performance metrics -#[derive(Debug, Clone)] -pub struct DiskPerformanceMetrics { - pub read_bytes_per_sec: f64, - pub write_bytes_per_sec: f64, - pub read_operations_per_sec: f64, - pub write_operations_per_sec: f64, - pub average_response_time_ms: f64, - pub queue_depth: f64, - pub utilization_percent: f64, - pub last_updated: SystemTime, -} - -/// SMART status information -#[derive(Debug, Clone)] -pub struct SmartStatus { - pub overall_health: SmartHealthStatus, - pub temperature: Option, - pub power_on_hours: Option, - pub reallocated_sectors: Option, - pub pending_sectors: Option, - pub uncorrectable_sectors: Option, - pub attributes: HashMap, -} - -/// SMART health status -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SmartHealthStatus { - Passed, - Failed, - Unknown, -} - -/// SMART attribute -#[derive(Debug, Clone)] -pub struct SmartAttribute { - pub name: String, - pub value: u64, - pub worst: u64, - pub threshold: u64, - pub status: SmartAttributeStatus, -} - -/// SMART attribute status -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SmartAttributeStatus { - Good, - Warning, - Critical, - Unknown, -} - -/// Result of scanning a single disk -#[derive(Debug, Clone)] -pub struct DiskScanResult { - pub disk_info: DiskInfo, - pub health_issues: Vec, - pub scan_duration: Duration, - pub success: bool, - pub error_message: Option, -} - -/// Disk scanner for monitoring disk health and performance -pub struct DiskScanner { - config: DiskScannerConfig, - 
statistics: Arc>, - last_scan_results: Arc>>, -} - -/// Statistics for disk scanning -#[derive(Debug, Clone, Default)] -pub struct DiskScannerStatistics { - pub disks_scanned: u64, - pub disks_with_issues: u64, - pub total_issues_found: u64, - pub total_scan_time: Duration, - pub average_scan_time: Duration, - pub last_scan_time: Option, - pub scan_cycles_completed: u64, - pub scan_cycles_failed: u64, -} - -impl DiskScanner { - /// Create a new disk scanner - pub fn new(config: DiskScannerConfig) -> Self { - Self { - config, - statistics: Arc::new(RwLock::new(DiskScannerStatistics::default())), - last_scan_results: Arc::new(RwLock::new(HashMap::new())), - } - } - - /// Scan all mounted disks - pub async fn scan_all_disks(&self) -> Result> { - let scan_start = Instant::now(); - let mut results = Vec::new(); - - // Get list of mounted filesystems - let mount_points = self.get_mount_points().await?; - - info!("Starting disk scan for {} mount points", mount_points.len()); - - // Scan each mount point - for mount_point in mount_points { - match self.scan_disk(&mount_point).await { - Ok(result) => { - results.push(result.clone()); - - // Store result for later reference - let mut last_results = self.last_scan_results.write().await; - last_results.insert(mount_point.clone(), result); - } - Err(e) => { - error!("Failed to scan disk at {}: {}", mount_point, e); - - // Create error result - let error_result = DiskScanResult { - disk_info: DiskInfo { - device_path: "unknown".to_string(), - mount_point: mount_point.clone(), - filesystem_type: "unknown".to_string(), - total_space: 0, - used_space: 0, - free_space: 0, - available_space: 0, - usage_percent: 0.0, - inode_total: None, - inode_used: None, - inode_free: None, - inode_usage_percent: None, - last_scan_time: SystemTime::now(), - health_status: DiskHealthStatus::Unknown, - performance_metrics: None, - temperature: None, - smart_status: None, - }, - health_issues: vec![HealthIssue { - issue_type: 
HealthIssueType::DiskReadError, - severity: Severity::High, - bucket: "system".to_string(), - object: mount_point.clone(), - description: format!("Failed to scan disk: {}", e), - metadata: None, - }], - scan_duration: scan_start.elapsed(), - success: false, - error_message: Some(e.to_string()), - }; - - results.push(error_result); - } - } - } - - // Update statistics - self.update_statistics(|stats| { - stats.disks_scanned += results.len() as u64; - stats.disks_with_issues += results.iter().filter(|r| !r.health_issues.is_empty()).count() as u64; - stats.total_issues_found += results.iter().map(|r| r.health_issues.len() as u64).sum::(); - stats.total_scan_time += scan_start.elapsed(); - stats.average_scan_time = Duration::from_millis( - stats.total_scan_time.as_millis() as u64 / stats.disks_scanned.max(1) - ); - stats.last_scan_time = Some(SystemTime::now()); - stats.scan_cycles_completed += 1; - }).await; - - info!( - "Disk scan completed: {} disks, {} issues found in {:?}", - results.len(), - results.iter().map(|r| r.health_issues.len()).sum::(), - scan_start.elapsed() - ); - - Ok(results) - } - - /// Scan a single disk - pub async fn scan_disk(&self, mount_point: &str) -> Result { - let scan_start = Instant::now(); - let mut health_issues = Vec::new(); - - // Get disk space information - let disk_info = self.get_disk_info(mount_point).await?; - - // Check disk space usage - if disk_info.usage_percent > self.config.max_disk_usage_percent { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskFull, - severity: if disk_info.usage_percent > 95.0 { Severity::Critical } else { Severity::High }, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: format!("Disk usage is {}%, exceeds threshold of {}%", - disk_info.usage_percent, self.config.max_disk_usage_percent), - metadata: None, - }); - } - - if disk_info.usage_percent < self.config.min_free_space_percent { - health_issues.push(HealthIssue { - issue_type: 
HealthIssueType::DiskFull, - severity: Severity::Medium, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: format!("Free space is only {}%, below threshold of {}%", - 100.0 - disk_info.usage_percent, self.config.min_free_space_percent), - metadata: None, - }); - } - - // Check inode usage if available - if let Some(inode_usage) = disk_info.inode_usage_percent { - if inode_usage > self.config.max_inode_usage_percent { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskFull, - severity: if inode_usage > 95.0 { Severity::Critical } else { Severity::High }, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: format!("Inode usage is {}%, exceeds threshold of {}%", - inode_usage, self.config.max_inode_usage_percent), - metadata: None, - }); - } - } - - // Check I/O performance if enabled - if self.config.check_io_performance { - if let Some(metrics) = &disk_info.performance_metrics { - if metrics.utilization_percent > 90.0 { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskReadError, - severity: Severity::Medium, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: format!("High disk utilization: {}%", metrics.utilization_percent), - metadata: None, - }); - } - - if metrics.average_response_time_ms > 100.0 { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskReadError, - severity: Severity::Medium, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: format!("High disk response time: {}ms", metrics.average_response_time_ms), - metadata: None, - }); - } - } - } - - // Check temperature if enabled - if self.config.check_temperature { - if let Some(temp) = disk_info.temperature { - if temp > 60.0 { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskReadError, - severity: if temp > 70.0 { Severity::Critical } else { Severity::High }, - bucket: "system".to_string(), - object: 
mount_point.to_string(), - description: format!("High disk temperature: {}°C", temp), - metadata: None, - }); - } - } - } - - // Check SMART status if enabled - if self.config.check_smart_status { - if let Some(smart) = &disk_info.smart_status { - if smart.overall_health == SmartHealthStatus::Failed { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskReadError, - severity: Severity::Critical, - bucket: "system".to_string(), - object: mount_point.to_string(), - description: "SMART health check failed".to_string(), - metadata: None, - }); - } - } - } - - let scan_duration = scan_start.elapsed(); - let success = health_issues.is_empty(); - - Ok(DiskScanResult { - disk_info, - health_issues, - scan_duration, - success, - error_message: None, - }) - } - - /// Get list of mounted filesystems - async fn get_mount_points(&self) -> Result> { - // TODO: Implement actual mount point detection - // For now, return common mount points - Ok(vec![ - "/".to_string(), - "/data".to_string(), - "/var".to_string(), - ]) - } - - /// Get disk information for a mount point - async fn get_disk_info(&self, mount_point: &str) -> Result { - let path = Path::new(mount_point); - - // Get filesystem statistics using std::fs instead of nix for now - let _metadata = match std::fs::metadata(path) { - Ok(metadata) => metadata, - Err(e) => { - return Err(crate::error::Error::Other(anyhow::anyhow!("Failed to get filesystem stats: {}", e))); - } - }; - - // For now, use placeholder values since we can't easily get filesystem stats - let total_space = 1000000000; // 1GB placeholder - let free_space = 500000000; // 500MB placeholder - let available_space = 450000000; // 450MB placeholder - let used_space = total_space - free_space; - let usage_percent = (used_space as f64 / total_space as f64) * 100.0; - - // Get inode information (placeholder) - let inode_total = Some(1000000); - let inode_free = Some(500000); - let inode_used = Some(500000); - let inode_usage_percent = Some(50.0); 
- - // Get filesystem type - let filesystem_type = self.get_filesystem_type(mount_point).await.unwrap_or_else(|_| "unknown".to_string()); - - // Get device path - let device_path = self.get_device_path(mount_point).await.unwrap_or_else(|_| "unknown".to_string()); - - // Get performance metrics if enabled - let performance_metrics = if self.config.check_io_performance { - self.get_performance_metrics(&device_path).await.ok() - } else { - None - }; - - // Get temperature if enabled - let temperature = if self.config.check_temperature { - self.get_disk_temperature(&device_path).await.ok().flatten() - } else { - None - }; - - // Get SMART status if enabled - let smart_status = if self.config.check_smart_status { - self.get_smart_status(&device_path).await.ok().flatten() - } else { - None - }; - - // Determine health status (placeholder - will be set by scan_disk method) - let health_status = DiskHealthStatus::Healthy; - - Ok(DiskInfo { - device_path, - mount_point: mount_point.to_string(), - filesystem_type, - total_space, - used_space, - free_space, - available_space, - usage_percent, - inode_total, - inode_used, - inode_free, - inode_usage_percent, - last_scan_time: SystemTime::now(), - health_status, - performance_metrics, - temperature, - smart_status, - }) - } - - /// Get filesystem type for a mount point - async fn get_filesystem_type(&self, _mount_point: &str) -> Result { - // TODO: Implement filesystem type detection - // For now, return a placeholder - Ok("ext4".to_string()) - } - - /// Get device path for a mount point - async fn get_device_path(&self, _mount_point: &str) -> Result { - // TODO: Implement device path detection - // For now, return a placeholder - Ok("/dev/sda1".to_string()) - } - - /// Get disk performance metrics - async fn get_performance_metrics(&self, _device_path: &str) -> Result { - // TODO: Implement performance metrics collection - // For now, return placeholder metrics - Ok(DiskPerformanceMetrics { - read_bytes_per_sec: 1000000.0, // 
1MB/s - write_bytes_per_sec: 500000.0, // 500KB/s - read_operations_per_sec: 100.0, - write_operations_per_sec: 50.0, - average_response_time_ms: 5.0, - queue_depth: 1.0, - utilization_percent: 10.0, - last_updated: SystemTime::now(), - }) - } - - /// Get disk temperature - async fn get_disk_temperature(&self, _device_path: &str) -> Result> { - // TODO: Implement temperature monitoring - // For now, return None (temperature not available) - Ok(None) - } - - /// Get SMART status - async fn get_smart_status(&self, _device_path: &str) -> Result> { - // TODO: Implement SMART status checking - // For now, return None (SMART not available) - Ok(None) - } - - /// Update scanner statistics - async fn update_statistics(&self, update_fn: F) - where - F: FnOnce(&mut DiskScannerStatistics), - { - let mut stats = self.statistics.write().await; - update_fn(&mut stats); - } - - /// Get current statistics - pub async fn statistics(&self) -> DiskScannerStatistics { - self.statistics.read().await.clone() - } - - /// Get last scan results - pub async fn last_scan_results(&self) -> HashMap { - self.last_scan_results.read().await.clone() - } - - /// Reset statistics - pub async fn reset_statistics(&self) { - let mut stats = self.statistics.write().await; - *stats = DiskScannerStatistics::default(); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_disk_scanner_creation() { - let config = DiskScannerConfig::default(); - let scanner = DiskScanner::new(config); - assert_eq!(scanner.statistics().await.disks_scanned, 0); - } - - #[tokio::test] - async fn test_disk_info_creation() { - let disk_info = DiskInfo { - device_path: "/dev/sda1".to_string(), - mount_point: "/".to_string(), - filesystem_type: "ext4".to_string(), - total_space: 1000000000, - used_space: 500000000, - free_space: 500000000, - available_space: 450000000, - usage_percent: 50.0, - inode_total: Some(1000000), - inode_used: Some(500000), - inode_free: Some(500000), - inode_usage_percent: 
Some(50.0), - last_scan_time: SystemTime::now(), - health_status: DiskHealthStatus::Healthy, - performance_metrics: None, - temperature: None, - smart_status: None, - }; - - assert_eq!(disk_info.usage_percent, 50.0); - assert_eq!(disk_info.health_status, DiskHealthStatus::Healthy); - } -} \ No newline at end of file diff --git a/crates/ahm/src/scanner/engine.rs b/crates/ahm/src/scanner/engine.rs deleted file mode 100644 index 129b21a7..00000000 --- a/crates/ahm/src/scanner/engine.rs +++ /dev/null @@ -1,536 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - path::{Path, PathBuf}, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::{ - sync::{broadcast, RwLock}, - time::sleep, -}; -use tracing::{error, info, warn}; -use tokio_util::sync::CancellationToken; - -use crate::{core, error::Result, metrics, SystemEvent}; -use crate::core::Status; -use super::{HealthIssue, HealthIssueType, Severity}; - -/// Represents a discovered object during scanning -#[derive(Debug, Clone)] -pub struct ScannedObject { - pub bucket: String, - pub object: String, - pub version_id: Option, - pub path: PathBuf, - pub size: u64, - pub modified_time: SystemTime, - pub metadata: HashMap, - pub health_issues: Vec, -} - -/// Configuration for the scanner engine -#[derive(Debug, Clone)] -pub struct EngineConfig { - /// Root directory to scan - pub root_path: String, - /// Maximum number of concurrent scan workers - pub max_workers: usize, - /// Scan interval between cycles - pub scan_interval: Duration, - /// Bandwidth limit for scanning (bytes per second) - pub bandwidth_limit: Option, - /// Whether to enable deep scanning (bitrot detection) - pub enable_deep_scan: bool, - /// Probability of healing objects during scan (1 in N) - pub heal_probability: u32, - /// Maximum folders to scan before compacting - pub max_folders_before_compact: u64, - /// Sleep duration between folder scans - pub folder_sleep_duration: Duration, -} - -impl Default for EngineConfig { - fn default() -> Self { - Self { - root_path: "/data".to_string(), - max_workers: 4, - scan_interval: Duration::from_secs(300), // 5 minutes - bandwidth_limit: None, - enable_deep_scan: false, - heal_probability: 1024, // 1 in 1024 objects - max_folders_before_compact: 10000, - folder_sleep_duration: Duration::from_millis(1), - } - } -} - -/// Scanner statistics -#[derive(Debug, Clone, Default)] -pub struct ScannerStatistics { - pub objects_scanned: u64, - pub bytes_scanned: u64, - pub issues_found: u64, - pub scan_duration: Duration, 
- pub scan_rate_objects_per_sec: f64, - pub scan_rate_bytes_per_sec: f64, - pub folders_scanned: u64, - pub objects_with_issues: u64, -} - -/// Main scanner engine -pub struct Engine { - config: EngineConfig, - coordinator: Arc, - metrics: Arc, - cancel_token: CancellationToken, - status: Arc>, - statistics: Arc>, - scan_cycle: Arc>, -} - -impl Engine { - /// Create a new scanner engine - pub async fn new( - config: EngineConfig, - coordinator: Arc, - metrics: Arc, - cancel_token: CancellationToken, - ) -> Result { - let engine = Self { - config, - coordinator, - metrics, - cancel_token, - status: Arc::new(RwLock::new(Status::Initializing)), - statistics: Arc::new(RwLock::new(ScannerStatistics::default())), - scan_cycle: Arc::new(RwLock::new(0)), - }; - - info!("Scanner engine created with config: {:?}", engine.config); - Ok(engine) - } - - /// Start the scanner engine - pub async fn start(&self) -> Result<()> { - info!("Starting scanner engine"); - *self.status.write().await = Status::Running; - - let engine = self.clone_for_background(); - tokio::spawn(async move { - if let Err(e) = engine.run_scan_loop().await { - error!("Scanner engine error: {}", e); - } - }); - - Ok(()) - } - - /// Stop the scanner engine - pub async fn stop(&self) -> Result<()> { - info!("Stopping scanner engine"); - *self.status.write().await = Status::Stopping; - self.cancel_token.cancel(); - *self.status.write().await = Status::Stopped; - Ok(()) - } - - /// Get current status - pub async fn status(&self) -> Status { - self.status.read().await.clone() - } - - /// Get current statistics - pub async fn statistics(&self) -> ScannerStatistics { - self.statistics.read().await.clone() - } - - /// Clone the engine for background tasks - fn clone_for_background(&self) -> Arc { - Arc::new(Self { - config: self.config.clone(), - coordinator: self.coordinator.clone(), - metrics: self.metrics.clone(), - cancel_token: self.cancel_token.clone(), - status: self.status.clone(), - statistics: 
self.statistics.clone(), - scan_cycle: self.scan_cycle.clone(), - }) - } - - /// Main scan loop - async fn run_scan_loop(&self) -> Result<()> { - info!("Scanner engine loop started"); - - loop { - tokio::select! { - _ = self.cancel_token.cancelled() => { - info!("Scanner engine received cancellation signal"); - break; - } - _ = sleep(self.config.scan_interval) => { - if let Err(e) = self.run_scan_cycle().await { - error!("Scan cycle failed: {}", e); - } - } - } - } - - Ok(()) - } - - /// Run a single scan cycle - async fn run_scan_cycle(&self) -> Result<()> { - let cycle_start = Instant::now(); - let cycle = { - let mut cycle_guard = self.scan_cycle.write().await; - *cycle_guard += 1; - *cycle_guard - }; - - info!("Starting scan cycle {}", cycle); - - // Reset statistics for new cycle - { - let mut stats = self.statistics.write().await; - *stats = ScannerStatistics::default(); - } - - // Scan the root directory - let scan_result = self.scan_directory(&self.config.root_path).await?; - - // Update statistics - { - let mut stats = self.statistics.write().await; - stats.scan_duration = cycle_start.elapsed(); - stats.objects_scanned = scan_result.objects.len() as u64; - stats.bytes_scanned = scan_result.total_size; - stats.issues_found = scan_result.total_issues; - stats.folders_scanned = scan_result.folders_scanned; - stats.objects_with_issues = scan_result.objects_with_issues; - - if stats.scan_duration.as_secs() > 0 { - stats.scan_rate_objects_per_sec = stats.objects_scanned as f64 / stats.scan_duration.as_secs() as f64; - stats.scan_rate_bytes_per_sec = stats.bytes_scanned as f64 / stats.scan_duration.as_secs() as f64; - } - } - - // Publish scan completion event - let scan_report = crate::scanner::ScanReport { - scan_id: cycle.to_string(), - status: "completed".to_string(), - summary: format!("Scanned {} objects, found {} issues", scan_result.objects.len(), scan_result.total_issues), - issues_found: scan_result.total_issues, - }; - - 
self.coordinator.publish_event(SystemEvent::ScanCompleted(scan_report)).await?; - - info!( - "Scan cycle {} completed: {} objects, {} bytes, {} issues in {:?}", - cycle, - scan_result.objects.len(), - scan_result.total_size, - scan_result.total_issues, - cycle_start.elapsed() - ); - - Ok(()) - } - - /// Scan a directory recursively - async fn scan_directory(&self, path: &str) -> Result { - let mut result = ScanResult::default(); - let path_buf = PathBuf::from(path); - - if !path_buf.exists() { - warn!("Scan path does not exist: {}", path); - return Ok(result); - } - - if !path_buf.is_dir() { - warn!("Scan path is not a directory: {}", path); - return Ok(result); - } - - self.scan_directory_recursive(&path_buf, &mut result).await?; - Ok(result) - } - - /// Recursively scan a directory - async fn scan_directory_recursive(&self, dir_path: &Path, result: &mut ScanResult) -> Result<()> { - result.folders_scanned += 1; - - // Check for cancellation - if self.cancel_token.is_cancelled() { - return Ok(()); - } - - let entries = match std::fs::read_dir(dir_path) { - Ok(entries) => entries, - Err(e) => { - warn!("Failed to read directory {}: {}", dir_path.display(), e); - return Ok(()); - } - }; - - for entry in entries { - if self.cancel_token.is_cancelled() { - break; - } - - let entry = match entry { - Ok(entry) => entry, - Err(e) => { - warn!("Failed to read directory entry: {}", e); - continue; - } - }; - - let file_path = entry.path(); - let _path_str = file_path.to_string_lossy(); - let entry_name = file_path.file_name() - .and_then(|n| n.to_str()) - .unwrap_or("unknown"); - - // Skip hidden files and system files - if entry_name.starts_with('.') || entry_name == ".." || entry_name == "." { - continue; - } - - if file_path.is_dir() { - // Recursively scan subdirectories - Box::pin(self.scan_directory_recursive(&file_path, result)).await?; - } else if file_path.is_file() { - // Scan individual file - if let Some(scanned_object) = self.scan_object(&file_path).await? 
{ - result.objects.push(scanned_object.clone()); - result.total_size += scanned_object.size; - - if !scanned_object.health_issues.is_empty() { - result.objects_with_issues += 1; - result.total_issues += scanned_object.health_issues.len() as u64; - - // Publish health issues - for issue in &scanned_object.health_issues { - let health_issue = crate::scanner::HealthIssue { - issue_type: issue.issue_type.clone(), - severity: issue.severity, - bucket: scanned_object.bucket.clone(), - object: scanned_object.object.clone(), - description: issue.description.clone(), - metadata: None, // TODO: Convert HashMap to ObjectMetadata - }; - - self.coordinator.publish_event(SystemEvent::HealthIssueDetected(health_issue)).await?; - } - } - - // Publish object discovered event - let metadata = crate::ObjectMetadata { - size: scanned_object.size, - mod_time: scanned_object.modified_time.duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_secs() as i64, - content_type: "application/octet-stream".to_string(), - etag: "".to_string(), // TODO: Calculate actual ETag - }; - - self.coordinator.publish_event(SystemEvent::ObjectDiscovered { - bucket: scanned_object.bucket.clone(), - object: scanned_object.object.clone(), - version_id: scanned_object.version_id.clone(), - metadata, - }).await?; - } - } - - // Sleep between items to avoid overwhelming the system - sleep(self.config.folder_sleep_duration).await; - } - - Ok(()) - } - - /// Scan a single object file - async fn scan_object(&self, file_path: &Path) -> Result> { - let metadata = match std::fs::metadata(file_path) { - Ok(metadata) => metadata, - Err(e) => { - warn!("Failed to read file metadata {}: {}", file_path.display(), e); - return Ok(None); - } - }; - - // Extract bucket and object from path - let (bucket, object) = self.extract_bucket_object_from_path(file_path)?; - if bucket.is_empty() || object.is_empty() { - return Ok(None); - } - - // Check for health issues - let health_issues = 
self.check_object_health(file_path, &metadata).await?; - - let scanned_object = ScannedObject { - bucket, - object, - version_id: None, // TODO: Extract version ID from path - path: file_path.to_path_buf(), - size: metadata.len(), - modified_time: metadata.modified().unwrap_or(SystemTime::now()), - metadata: HashMap::new(), // TODO: Extract metadata - health_issues, - }; - - Ok(Some(scanned_object)) - } - - /// Extract bucket and object name from file path - fn extract_bucket_object_from_path(&self, file_path: &Path) -> Result<(String, String)> { - let _path_str = file_path.to_string_lossy(); - let root_path = Path::new(&self.config.root_path); - - if let Ok(relative_path) = file_path.strip_prefix(root_path) { - let components: Vec<&str> = relative_path.components() - .filter_map(|c| c.as_os_str().to_str()) - .collect(); - - if components.len() >= 2 { - let bucket = components[0].to_string(); - let object = components[1..].join("/"); - return Ok((bucket, object)); - } - } - - Ok((String::new(), String::new())) - } - - /// Check object health and detect issues - async fn check_object_health(&self, file_path: &Path, metadata: &std::fs::Metadata) -> Result> { - let mut issues = Vec::new(); - - // Extract bucket and object from path for health issues - let (bucket, object) = self.extract_bucket_object_from_path(file_path)?; - - // Check file size - if metadata.len() == 0 { - issues.push(HealthIssue { - issue_type: HealthIssueType::ObjectTooSmall, - severity: Severity::Low, - bucket: bucket.clone(), - object: object.clone(), - description: "Object has zero size".to_string(), - metadata: None, - }); - } - - // Check file permissions - if !metadata.permissions().readonly() { - issues.push(HealthIssue { - issue_type: HealthIssueType::PolicyViolation, - severity: Severity::Medium, - bucket: bucket.clone(), - object: object.clone(), - description: "Object is not read-only".to_string(), - metadata: None, - }); - } - - // TODO: Add more health checks: - // - Checksum 
verification - // - Replication status - // - Encryption status - // - Metadata consistency - // - Disk health - - Ok(issues) - } - - /// Start scanning operations - pub async fn start_scan(&self) -> Result<()> { - let mut status = self.status.write().await; - *status = Status::Running; - info!("Scanning operations started"); - Ok(()) - } - - /// Stop scanning operations - pub async fn stop_scan(&self) -> Result<()> { - let mut status = self.status.write().await; - *status = Status::Stopped; - info!("Scanning operations stopped"); - Ok(()) - } - - /// Get engine configuration - pub async fn get_config(&self) -> ScanConfig { - self.config.clone() - } -} - -/// Result of a scan operation -#[derive(Debug, Clone, Default)] -pub struct ScanResult { - pub objects: Vec, - pub total_size: u64, - pub total_issues: u64, - pub folders_scanned: u64, - pub objects_with_issues: u64, -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::time::Duration; - - #[tokio::test] - async fn test_engine_creation() { - let config = EngineConfig::default(); - let coordinator = Arc::new(core::Coordinator::new( - core::CoordinatorConfig::default(), - Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()), - CancellationToken::new(), - ).await.unwrap()); - let metrics = Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()); - let cancel_token = CancellationToken::new(); - - let engine = Engine::new(config, coordinator, metrics, cancel_token).await; - assert!(engine.is_ok()); - } - - #[tokio::test] - async fn test_path_extraction() { - let config = EngineConfig { - root_path: "/data".to_string(), - ..Default::default() - }; - let coordinator = Arc::new(core::Coordinator::new( - core::CoordinatorConfig::default(), - Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()), - CancellationToken::new(), - ).await.unwrap()); - let metrics = 
Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()); - let cancel_token = CancellationToken::new(); - - let engine = Engine::new(config, coordinator, metrics, cancel_token).await.unwrap(); - - let test_path = Path::new("/data/bucket1/object1.txt"); - let (bucket, object) = engine.extract_bucket_object_from_path(test_path).unwrap(); - - assert_eq!(bucket, "bucket1"); - assert_eq!(object, "object1.txt"); - } -} \ No newline at end of file diff --git a/crates/ahm/src/scanner/metrics_collector.rs b/crates/ahm/src/scanner/metrics_collector.rs deleted file mode 100644 index 45c0d4b1..00000000 --- a/crates/ahm/src/scanner/metrics_collector.rs +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::{debug, error, info, warn}; - -use crate::error::Result; -use super::{HealthIssue, HealthIssueType, Severity}; - -/// Configuration for metrics collection -#[derive(Debug, Clone)] -pub struct MetricsConfig { - /// Collection interval for metrics - pub collection_interval: Duration, - /// Retention period for historical metrics - pub retention_period: Duration, - /// Maximum number of data points to keep in memory - pub max_data_points: usize, - /// Whether to enable detailed metrics collection - pub enable_detailed_metrics: bool, - /// Whether to enable performance profiling - pub enable_profiling: bool, - /// Whether to enable resource usage tracking - pub enable_resource_tracking: bool, -} - -impl Default for MetricsConfig { - fn default() -> Self { - Self { - collection_interval: Duration::from_secs(60), // 1 minute - retention_period: Duration::from_secs(3600 * 24), // 24 hours - max_data_points: 1440, // 24 hours worth of minute-level data - enable_detailed_metrics: true, - enable_profiling: false, - enable_resource_tracking: true, - } - } -} - -/// Scanner performance metrics -#[derive(Debug, Clone)] -pub struct ScannerMetrics { - /// Objects scanned per second - pub objects_per_second: f64, - /// Bytes scanned per second - pub bytes_per_second: f64, - /// Average scan time per object - pub avg_scan_time_per_object: Duration, - /// Total objects scanned in current cycle - pub total_objects_scanned: u64, - /// Total bytes scanned in current cycle - pub total_bytes_scanned: u64, - /// Number of health issues detected - pub health_issues_detected: u64, - /// Scan success rate (percentage) - pub success_rate: f64, - /// Current scan cycle duration - pub current_cycle_duration: Duration, - /// Average scan cycle duration - pub avg_cycle_duration: Duration, - /// Last scan completion time - pub last_scan_completion: Option, -} - 
-/// Resource usage metrics -#[derive(Debug, Clone)] -pub struct ResourceMetrics { - /// CPU usage percentage - pub cpu_usage_percent: f64, - /// Memory usage in bytes - pub memory_usage_bytes: u64, - /// Memory usage percentage - pub memory_usage_percent: f64, - /// Disk I/O operations per second - pub disk_io_ops_per_sec: f64, - /// Disk I/O bytes per second - pub disk_io_bytes_per_sec: f64, - /// Network I/O bytes per second - pub network_io_bytes_per_sec: f64, - /// Number of active threads - pub active_threads: u32, - /// Number of open file descriptors - pub open_file_descriptors: u32, -} - -/// Health metrics summary -#[derive(Debug, Clone)] -pub struct HealthMetrics { - /// Total health issues by severity - pub issues_by_severity: HashMap, - /// Total health issues by type - pub issues_by_type: HashMap, - /// Objects with health issues - pub objects_with_issues: u64, - /// Percentage of objects with issues - pub objects_with_issues_percent: f64, - /// Last health check time - pub last_health_check: SystemTime, - /// Health score (0-100, higher is better) - pub health_score: f64, -} - -/// Historical metrics data point -#[derive(Debug, Clone)] -pub struct MetricsDataPoint { - pub timestamp: SystemTime, - pub scanner_metrics: ScannerMetrics, - pub resource_metrics: ResourceMetrics, - pub health_metrics: HealthMetrics, -} - -/// Metrics collector for scanner system -pub struct MetricsCollector { - config: MetricsConfig, - current_metrics: Arc>, - historical_data: Arc>>, - collection_start_time: Instant, -} - -/// Current metrics state -#[derive(Debug, Clone)] -pub struct CurrentMetrics { - pub scanner_metrics: ScannerMetrics, - pub resource_metrics: ResourceMetrics, - pub health_metrics: HealthMetrics, - pub last_update: SystemTime, -} - -impl MetricsCollector { - /// Create a new metrics collector - pub fn new(config: MetricsConfig) -> Self { - let collector = Self { - config, - current_metrics: Arc::new(RwLock::new(CurrentMetrics { - scanner_metrics: 
ScannerMetrics { - objects_per_second: 0.0, - bytes_per_second: 0.0, - avg_scan_time_per_object: Duration::ZERO, - total_objects_scanned: 0, - total_bytes_scanned: 0, - health_issues_detected: 0, - success_rate: 100.0, - current_cycle_duration: Duration::ZERO, - avg_cycle_duration: Duration::ZERO, - last_scan_completion: None, - }, - resource_metrics: ResourceMetrics { - cpu_usage_percent: 0.0, - memory_usage_bytes: 0, - memory_usage_percent: 0.0, - disk_io_ops_per_sec: 0.0, - disk_io_bytes_per_sec: 0.0, - network_io_bytes_per_sec: 0.0, - active_threads: 0, - open_file_descriptors: 0, - }, - health_metrics: HealthMetrics { - issues_by_severity: HashMap::new(), - issues_by_type: HashMap::new(), - objects_with_issues: 0, - objects_with_issues_percent: 0.0, - last_health_check: SystemTime::now(), - health_score: 100.0, - }, - last_update: SystemTime::now(), - })), - historical_data: Arc::new(RwLock::new(Vec::new())), - collection_start_time: Instant::now(), - }; - - info!("Metrics collector created with config: {:?}", collector.config); - collector - } - - /// Start metrics collection - pub async fn start_collection(&self) -> Result<()> { - info!("Starting metrics collection"); - - let collector = self.clone_for_background(); - tokio::spawn(async move { - if let Err(e) = collector.run_collection_loop().await { - error!("Metrics collection error: {}", e); - } - }); - - Ok(()) - } - - /// Stop metrics collection - pub async fn stop_collection(&self) -> Result<()> { - info!("Stopping metrics collection"); - Ok(()) - } - - /// Update scanner metrics - pub async fn update_scanner_metrics(&self, metrics: ScannerMetrics) -> Result<()> { - let mut current = self.current_metrics.write().await; - current.scanner_metrics = metrics; - current.last_update = SystemTime::now(); - Ok(()) - } - - /// Update resource metrics - pub async fn update_resource_metrics(&self, metrics: ResourceMetrics) -> Result<()> { - let mut current = self.current_metrics.write().await; - 
current.resource_metrics = metrics; - current.last_update = SystemTime::now(); - Ok(()) - } - - /// Update health metrics - pub async fn update_health_metrics(&self, metrics: HealthMetrics) -> Result<()> { - let mut current = self.current_metrics.write().await; - current.health_metrics = metrics; - current.last_update = SystemTime::now(); - Ok(()) - } - - /// Record a health issue - pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> { - let mut current = self.current_metrics.write().await; - - // Update severity count - *current.health_metrics.issues_by_severity.entry(issue.severity).or_insert(0) += 1; - - // Update type count - *current.health_metrics.issues_by_type.entry(issue.issue_type.clone()).or_insert(0) += 1; - - // Update scanner metrics - current.scanner_metrics.health_issues_detected += 1; - - current.last_update = SystemTime::now(); - Ok(()) - } - - /// Get current metrics - pub async fn current_metrics(&self) -> CurrentMetrics { - self.current_metrics.read().await.clone() - } - - /// Get historical metrics - pub async fn historical_metrics(&self, duration: Duration) -> Vec { - let historical = self.historical_data.read().await; - let cutoff_time = SystemTime::now() - duration; - - historical.iter() - .filter(|point| point.timestamp >= cutoff_time) - .cloned() - .collect() - } - - /// Get metrics summary - pub async fn metrics_summary(&self) -> MetricsSummary { - let current = self.current_metrics.read().await; - let historical = self.historical_data.read().await; - - let uptime = self.collection_start_time.elapsed(); - let total_data_points = historical.len(); - - // Calculate averages from historical data - let avg_objects_per_sec = if !historical.is_empty() { - historical.iter() - .map(|point| point.scanner_metrics.objects_per_second) - .sum::() / historical.len() as f64 - } else { - 0.0 - }; - - let avg_bytes_per_sec = if !historical.is_empty() { - historical.iter() - .map(|point| point.scanner_metrics.bytes_per_second) - 
.sum::() / historical.len() as f64 - } else { - 0.0 - }; - - let avg_cpu_usage = if !historical.is_empty() { - historical.iter() - .map(|point| point.resource_metrics.cpu_usage_percent) - .sum::() / historical.len() as f64 - } else { - 0.0 - }; - - let avg_memory_usage = if !historical.is_empty() { - historical.iter() - .map(|point| point.resource_metrics.memory_usage_percent) - .sum::() / historical.len() as f64 - } else { - 0.0 - }; - - MetricsSummary { - uptime, - total_data_points, - current_scanner_metrics: current.scanner_metrics.clone(), - current_resource_metrics: current.resource_metrics.clone(), - current_health_metrics: current.health_metrics.clone(), - avg_objects_per_sec, - avg_bytes_per_sec, - avg_cpu_usage, - avg_memory_usage, - last_update: current.last_update, - } - } - - /// Clone the collector for background tasks - fn clone_for_background(&self) -> Arc { - Arc::new(Self { - config: self.config.clone(), - current_metrics: self.current_metrics.clone(), - historical_data: self.historical_data.clone(), - collection_start_time: self.collection_start_time, - }) - } - - /// Main collection loop - async fn run_collection_loop(&self) -> Result<()> { - info!("Metrics collection loop started"); - - loop { - // Collect current metrics - self.collect_current_metrics().await?; - - // Store historical data point - self.store_historical_data_point().await?; - - // Clean up old data - self.cleanup_old_data().await?; - - // Wait for next collection interval - tokio::time::sleep(self.config.collection_interval).await; - } - } - - /// Collect current system metrics - async fn collect_current_metrics(&self) -> Result<()> { - if self.config.enable_resource_tracking { - let resource_metrics = self.collect_resource_metrics().await?; - self.update_resource_metrics(resource_metrics).await?; - } - - Ok(()) - } - - /// Collect resource usage metrics - async fn collect_resource_metrics(&self) -> Result { - // TODO: Implement actual resource metrics collection - // For now, 
return placeholder metrics - Ok(ResourceMetrics { - cpu_usage_percent: 0.0, - memory_usage_bytes: 0, - memory_usage_percent: 0.0, - disk_io_ops_per_sec: 0.0, - disk_io_bytes_per_sec: 0.0, - network_io_bytes_per_sec: 0.0, - active_threads: 0, - open_file_descriptors: 0, - }) - } - - /// Store current metrics as historical data point - async fn store_historical_data_point(&self) -> Result<()> { - let current = self.current_metrics.read().await; - let data_point = MetricsDataPoint { - timestamp: SystemTime::now(), - scanner_metrics: current.scanner_metrics.clone(), - resource_metrics: current.resource_metrics.clone(), - health_metrics: current.health_metrics.clone(), - }; - - let mut historical = self.historical_data.write().await; - historical.push(data_point); - - // Limit the number of data points - if historical.len() > self.config.max_data_points { - historical.remove(0); - } - - Ok(()) - } - - /// Clean up old historical data - async fn cleanup_old_data(&self) -> Result<()> { - let cutoff_time = SystemTime::now() - self.config.retention_period; - let mut historical = self.historical_data.write().await; - - historical.retain(|point| point.timestamp >= cutoff_time); - - Ok(()) - } - - /// Reset all metrics - pub async fn reset_metrics(&self) -> Result<()> { - let mut current = self.current_metrics.write().await; - *current = CurrentMetrics { - scanner_metrics: ScannerMetrics { - objects_per_second: 0.0, - bytes_per_second: 0.0, - avg_scan_time_per_object: Duration::ZERO, - total_objects_scanned: 0, - total_bytes_scanned: 0, - health_issues_detected: 0, - success_rate: 100.0, - current_cycle_duration: Duration::ZERO, - avg_cycle_duration: Duration::ZERO, - last_scan_completion: None, - }, - resource_metrics: ResourceMetrics { - cpu_usage_percent: 0.0, - memory_usage_bytes: 0, - memory_usage_percent: 0.0, - disk_io_ops_per_sec: 0.0, - disk_io_bytes_per_sec: 0.0, - network_io_bytes_per_sec: 0.0, - active_threads: 0, - open_file_descriptors: 0, - }, - health_metrics: 
HealthMetrics { - issues_by_severity: HashMap::new(), - issues_by_type: HashMap::new(), - objects_with_issues: 0, - objects_with_issues_percent: 0.0, - last_health_check: SystemTime::now(), - health_score: 100.0, - }, - last_update: SystemTime::now(), - }; - - let mut historical = self.historical_data.write().await; - historical.clear(); - - Ok(()) - } -} - -/// Summary of all metrics -#[derive(Debug, Clone)] -pub struct MetricsSummary { - pub uptime: Duration, - pub total_data_points: usize, - pub current_scanner_metrics: ScannerMetrics, - pub current_resource_metrics: ResourceMetrics, - pub current_health_metrics: HealthMetrics, - pub avg_objects_per_sec: f64, - pub avg_bytes_per_sec: f64, - pub avg_cpu_usage: f64, - pub avg_memory_usage: f64, - pub last_update: SystemTime, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_metrics_collector_creation() { - let config = MetricsConfig::default(); - let collector = MetricsCollector::new(config); - let metrics = collector.current_metrics().await; - assert_eq!(metrics.scanner_metrics.total_objects_scanned, 0); - } - - #[tokio::test] - async fn test_metrics_update() { - let config = MetricsConfig::default(); - let collector = MetricsCollector::new(config); - - let scanner_metrics = ScannerMetrics { - objects_per_second: 100.0, - bytes_per_second: 1024.0, - avg_scan_time_per_object: Duration::from_millis(10), - total_objects_scanned: 1000, - total_bytes_scanned: 1024000, - health_issues_detected: 5, - success_rate: 99.5, - current_cycle_duration: Duration::from_secs(60), - avg_cycle_duration: Duration::from_secs(65), - last_scan_completion: Some(SystemTime::now()), - }; - - collector.update_scanner_metrics(scanner_metrics).await.unwrap(); - - let current = collector.current_metrics().await; - assert_eq!(current.scanner_metrics.total_objects_scanned, 1000); - assert_eq!(current.scanner_metrics.health_issues_detected, 5); - } - - #[tokio::test] - async fn test_health_issue_recording() { - 
let config = MetricsConfig::default(); - let collector = MetricsCollector::new(config); - - let issue = HealthIssue { - issue_type: HealthIssueType::DiskFull, - severity: Severity::High, - bucket: "test-bucket".to_string(), - object: "test-object".to_string(), - description: "Test issue".to_string(), - metadata: None, - }; - - collector.record_health_issue(&issue).await.unwrap(); - - let current = collector.current_metrics().await; - assert_eq!(current.scanner_metrics.health_issues_detected, 1); - assert_eq!(current.health_metrics.issues_by_severity.get(&Severity::High), Some(&1)); - } -} \ No newline at end of file diff --git a/crates/ahm/src/scanner/object_scanner.rs b/crates/ahm/src/scanner/object_scanner.rs deleted file mode 100644 index fd438d8f..00000000 --- a/crates/ahm/src/scanner/object_scanner.rs +++ /dev/null @@ -1,419 +0,0 @@ -// Copyright 2024 RustFS Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::{ - collections::HashMap, - path::Path, - sync::Arc, - time::{Duration, SystemTime}, -}; - -use tokio::sync::RwLock; -use tracing::info; - -use crate::error::Result; -use super::{HealthIssue, HealthIssueType, Severity}; - -/// Configuration for object scanning -#[derive(Debug, Clone)] -pub struct ObjectScannerConfig { - /// Whether to perform checksum verification - pub verify_checksum: bool, - /// Whether to check replication status - pub check_replication: bool, - /// Whether to validate metadata consistency - pub validate_metadata: bool, - /// Maximum object size to scan (bytes) - pub max_object_size: u64, - /// Minimum object size (bytes) - pub min_object_size: u64, - /// Timeout for individual object scans - pub scan_timeout: Duration, - /// Whether to enable deep scanning (bitrot detection) - pub enable_deep_scan: bool, -} - -impl Default for ObjectScannerConfig { - fn default() -> Self { - Self { - verify_checksum: true, - check_replication: true, - validate_metadata: true, - max_object_size: 1024 * 1024 * 1024 * 1024, // 1TB - min_object_size: 0, - scan_timeout: Duration::from_secs(30), - enable_deep_scan: false, - } - } -} - -/// Result of scanning a single object -#[derive(Debug, Clone)] -pub struct ObjectScanResult { - /// Object identifier - pub bucket: String, - pub object: String, - pub version_id: Option, - /// Scan success status - pub success: bool, - /// Object metadata discovered - pub metadata: Option, - /// Health issues detected - pub health_issues: Vec, - /// Time taken to scan this object - pub scan_duration: Duration, - /// Error message if scan failed - pub error_message: Option, -} - -/// Object metadata -#[derive(Debug, Clone)] -pub struct ObjectMetadata { - pub size: u64, - pub modified_time: SystemTime, - pub content_type: String, - pub etag: String, - pub checksum: Option, - pub replication_status: Option, - pub encryption_status: Option, - pub custom_metadata: HashMap, -} - -/// Object scanner for individual object health 
checking -pub struct ObjectScanner { - config: ObjectScannerConfig, - statistics: Arc>, -} - -/// Statistics for object scanning -#[derive(Debug, Clone, Default)] -pub struct ObjectScannerStatistics { - pub objects_scanned: u64, - pub objects_with_issues: u64, - pub total_issues_found: u64, - pub total_scan_time: Duration, - pub average_scan_time: Duration, - pub checksum_verifications: u64, - pub checksum_failures: u64, - pub replication_checks: u64, - pub replication_failures: u64, -} - -impl ObjectScanner { - /// Create a new object scanner - pub fn new(config: ObjectScannerConfig) -> Self { - Self { - config, - statistics: Arc::new(RwLock::new(ObjectScannerStatistics::default())), - } - } - - /// Scan a single object for health issues - pub async fn scan_object(&self, bucket: &str, object: &str, version_id: Option<&str>, path: &Path) -> Result { - let scan_start = std::time::Instant::now(); - let mut health_issues = Vec::new(); - let mut error_message = None; - - // Check if file exists - if !path.exists() { - return Ok(ObjectScanResult { - bucket: bucket.to_string(), - object: object.to_string(), - version_id: version_id.map(|v| v.to_string()), - success: false, - metadata: None, - health_issues: vec![HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::Critical, - bucket: bucket.to_string(), - object: object.to_string(), - description: "Object file does not exist".to_string(), - metadata: None, - }], - scan_duration: scan_start.elapsed(), - error_message: Some("Object file not found".to_string()), - }); - } - - // Get file metadata - let metadata = match std::fs::metadata(path) { - Ok(metadata) => metadata, - Err(e) => { - error_message = Some(format!("Failed to read file metadata: {}", e)); - health_issues.push(HealthIssue { - issue_type: HealthIssueType::DiskReadError, - severity: Severity::High, - bucket: bucket.to_string(), - object: object.to_string(), - description: "Failed to read file metadata".to_string(), - metadata: 
None, - }); - return Ok(ObjectScanResult { - bucket: bucket.to_string(), - object: object.to_string(), - version_id: version_id.map(|v| v.to_string()), - success: false, - metadata: None, - health_issues, - scan_duration: scan_start.elapsed(), - error_message, - }); - } - }; - - // Check file size - let file_size = metadata.len(); - if file_size < self.config.min_object_size { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::ObjectTooSmall, - severity: Severity::Low, - bucket: bucket.to_string(), - object: object.to_string(), - description: format!("Object size {} is below minimum {}", file_size, self.config.min_object_size), - metadata: None, - }); - } - - if file_size > self.config.max_object_size { - health_issues.push(HealthIssue { - issue_type: HealthIssueType::ObjectTooLarge, - severity: Severity::Medium, - bucket: bucket.to_string(), - object: object.to_string(), - description: format!("Object size {} exceeds maximum {}", file_size, self.config.max_object_size), - metadata: None, - }); - } - - // Verify checksum if enabled - let checksum = if self.config.verify_checksum { - match self.verify_checksum(path).await { - Ok(cs) => { - self.update_statistics(|stats| stats.checksum_verifications += 1).await; - Some(cs) - } - Err(_e) => { - self.update_statistics(|stats| stats.checksum_failures += 1).await; - health_issues.push(HealthIssue { - issue_type: HealthIssueType::ChecksumMismatch, - severity: Severity::High, - bucket: bucket.to_string(), - object: object.to_string(), - description: "Checksum verification failed".to_string(), - metadata: None, - }); - None - } - } - } else { - None - }; - - // Check replication status if enabled - let replication_status = if self.config.check_replication { - match self.check_replication_status(bucket, object).await { - Ok(status) => { - self.update_statistics(|stats| stats.replication_checks += 1).await; - Some(status) - } - Err(_e) => { - self.update_statistics(|stats| stats.replication_failures += 
1).await; - health_issues.push(HealthIssue { - issue_type: HealthIssueType::MissingReplica, - severity: Severity::High, - bucket: bucket.to_string(), - object: object.to_string(), - description: "Replication status check failed".to_string(), - metadata: None, - }); - None - } - } - } else { - None - }; - - // Validate metadata if enabled - if self.config.validate_metadata { - if let Some(issue) = self.validate_metadata(bucket, object, &metadata).await? { - health_issues.push(issue); - } - } - - // Create object metadata - let object_metadata = ObjectMetadata { - size: file_size, - modified_time: metadata.modified().unwrap_or(SystemTime::now()), - content_type: self.detect_content_type(path), - etag: self.calculate_etag(path).await?, - checksum, - replication_status, - encryption_status: None, // TODO: Implement encryption status check - custom_metadata: HashMap::new(), // TODO: Extract custom metadata - }; - - let scan_duration = scan_start.elapsed(); - let success = health_issues.is_empty(); - - // Update statistics - self.update_statistics(|stats| { - stats.objects_scanned += 1; - if !health_issues.is_empty() { - stats.objects_with_issues += 1; - stats.total_issues_found += health_issues.len() as u64; - } - stats.total_scan_time += scan_duration; - stats.average_scan_time = Duration::from_millis( - stats.total_scan_time.as_millis() as u64 / stats.objects_scanned.max(1) - ); - }).await; - - Ok(ObjectScanResult { - bucket: bucket.to_string(), - object: object.to_string(), - version_id: version_id.map(|v| v.to_string()), - success, - metadata: Some(object_metadata), - health_issues, - scan_duration, - error_message, - }) - } - - /// Verify object checksum - async fn verify_checksum(&self, _path: &Path) -> Result { - // TODO: Implement actual checksum verification - // For now, return a placeholder checksum - Ok("placeholder_checksum".to_string()) - } - - /// Check object replication status - async fn check_replication_status(&self, _bucket: &str, _object: &str) -> 
Result { - // TODO: Implement actual replication status checking - // For now, return a placeholder status - Ok("replicated".to_string()) - } - - /// Validate object metadata - async fn validate_metadata(&self, _bucket: &str, _object: &str, _metadata: &std::fs::Metadata) -> Result> { - // TODO: Implement actual metadata validation - // For now, return None (no issues) - Ok(None) - } - - /// Detect content type from file extension - fn detect_content_type(&self, path: &Path) -> String { - if let Some(extension) = path.extension() { - match extension.to_str().unwrap_or("").to_lowercase().as_str() { - "txt" => "text/plain", - "json" => "application/json", - "xml" => "application/xml", - "html" | "htm" => "text/html", - "css" => "text/css", - "js" => "application/javascript", - "png" => "image/png", - "jpg" | "jpeg" => "image/jpeg", - "gif" => "image/gif", - "pdf" => "application/pdf", - "zip" => "application/zip", - "tar" => "application/x-tar", - "gz" => "application/gzip", - _ => "application/octet-stream", - }.to_string() - } else { - "application/octet-stream".to_string() - } - } - - /// Calculate object ETag - async fn calculate_etag(&self, _path: &Path) -> Result { - // TODO: Implement actual ETag calculation - // For now, return a placeholder ETag - Ok("placeholder_etag".to_string()) - } - - /// Update scanner statistics - async fn update_statistics(&self, update_fn: F) - where - F: FnOnce(&mut ObjectScannerStatistics), - { - let mut stats = self.statistics.write().await; - update_fn(&mut stats); - } - - /// Get current statistics - pub async fn statistics(&self) -> ObjectScannerStatistics { - self.statistics.read().await.clone() - } - - /// Reset statistics - pub async fn reset_statistics(&self) { - let mut stats = self.statistics.write().await; - *stats = ObjectScannerStatistics::default(); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tempfile::TempDir; - use std::fs::File; - use std::io::Write; - - #[tokio::test] - async fn 
test_object_scanner_creation() { - let config = ObjectScannerConfig::default(); - let scanner = ObjectScanner::new(config); - assert_eq!(scanner.statistics().await.objects_scanned, 0); - } - - #[tokio::test] - async fn test_content_type_detection() { - let config = ObjectScannerConfig::default(); - let scanner = ObjectScanner::new(config); - - let path = Path::new("test.txt"); - assert_eq!(scanner.detect_content_type(path), "text/plain"); - - let path = Path::new("test.json"); - assert_eq!(scanner.detect_content_type(path), "application/json"); - - let path = Path::new("test.unknown"); - assert_eq!(scanner.detect_content_type(path), "application/octet-stream"); - } - - #[tokio::test] - async fn test_object_scanning() { - let temp_dir = TempDir::new().unwrap(); - let test_file = temp_dir.path().join("test.txt"); - - // Create a test file - let mut file = File::create(&test_file).unwrap(); - writeln!(file, "test content").unwrap(); - - let config = ObjectScannerConfig::default(); - let scanner = ObjectScanner::new(config); - - let result = scanner.scan_object("test-bucket", "test.txt", None, &test_file).await.unwrap(); - - assert!(result.success); - assert_eq!(result.bucket, "test-bucket"); - assert_eq!(result.object, "test.txt"); - assert!(result.metadata.is_some()); - - let metadata = result.metadata.unwrap(); - assert!(metadata.size > 0); - assert_eq!(metadata.content_type, "text/plain"); - } -} \ No newline at end of file