From 1f11a3167ba2d76e96cc0bdc80e61afcd6e65ada Mon Sep 17 00:00:00 2001 From: junxiang Mu <1948535941@qq.com> Date: Wed, 9 Jul 2025 07:03:56 +0800 Subject: [PATCH] fix: Refact heal and scanner design Signed-off-by: junxiang Mu <1948535941@qq.com> --- crates/ahm/architecture.md | 557 +++++++++ crates/ahm/architecture_ch.md | 557 +++++++++ crates/ahm/src/api/admin_api.rs | 843 +++++++++++++ crates/ahm/src/api/metrics_api.rs | 1180 +++++++++++++++++++ crates/ahm/src/api/mod.rs | 504 ++++++++ crates/ahm/src/api/status_api.rs | 761 ++++++++++++ crates/ahm/src/core/coordinator.rs | 448 +++++++ crates/ahm/src/core/lifecycle.rs | 22 + crates/ahm/src/core/mod.rs | 40 + crates/ahm/src/core/scheduler.rs | 226 ++++ crates/ahm/src/heal/engine.rs | 438 +++++++ crates/ahm/src/heal/mod.rs | 360 ++++++ crates/ahm/src/heal/priority_queue.rs | 413 +++++++ crates/ahm/src/heal/repair_worker.rs | 505 ++++++++ crates/ahm/src/heal/validation.rs | 453 +++++++ crates/ahm/src/metrics/aggregator.rs | 739 ++++++++++++ crates/ahm/src/metrics/collector.rs | 426 +++++++ crates/ahm/src/metrics/mod.rs | 617 ++++++++++ crates/ahm/src/metrics/reporter.rs | 861 ++++++++++++++ crates/ahm/src/metrics/storage.rs | 573 +++++++++ crates/ahm/src/policy/heal_policy.rs | 508 ++++++++ crates/ahm/src/policy/mod.rs | 258 ++++ crates/ahm/src/policy/retention_policy.rs | 487 ++++++++ crates/ahm/src/policy/scan_policy.rs | 373 ++++++ crates/ahm/src/scanner/bandwidth_limiter.rs | 353 ++++++ crates/ahm/src/scanner/disk_scanner.rs | 591 ++++++++++ crates/ahm/src/scanner/engine.rs | 536 +++++++++ crates/ahm/src/scanner/metrics_collector.rs | 526 +++++++++ crates/ahm/src/scanner/object_scanner.rs | 419 +++++++ 29 files changed, 14574 insertions(+) create mode 100644 crates/ahm/architecture.md create mode 100644 crates/ahm/architecture_ch.md create mode 100644 crates/ahm/src/api/admin_api.rs create mode 100644 crates/ahm/src/api/metrics_api.rs create mode 100644 crates/ahm/src/api/mod.rs create mode 100644 
crates/ahm/src/api/status_api.rs create mode 100644 crates/ahm/src/core/coordinator.rs create mode 100644 crates/ahm/src/core/lifecycle.rs create mode 100644 crates/ahm/src/core/mod.rs create mode 100644 crates/ahm/src/core/scheduler.rs create mode 100644 crates/ahm/src/heal/engine.rs create mode 100644 crates/ahm/src/heal/mod.rs create mode 100644 crates/ahm/src/heal/priority_queue.rs create mode 100644 crates/ahm/src/heal/repair_worker.rs create mode 100644 crates/ahm/src/heal/validation.rs create mode 100644 crates/ahm/src/metrics/aggregator.rs create mode 100644 crates/ahm/src/metrics/collector.rs create mode 100644 crates/ahm/src/metrics/mod.rs create mode 100644 crates/ahm/src/metrics/reporter.rs create mode 100644 crates/ahm/src/metrics/storage.rs create mode 100644 crates/ahm/src/policy/heal_policy.rs create mode 100644 crates/ahm/src/policy/mod.rs create mode 100644 crates/ahm/src/policy/retention_policy.rs create mode 100644 crates/ahm/src/policy/scan_policy.rs create mode 100644 crates/ahm/src/scanner/bandwidth_limiter.rs create mode 100644 crates/ahm/src/scanner/disk_scanner.rs create mode 100644 crates/ahm/src/scanner/engine.rs create mode 100644 crates/ahm/src/scanner/metrics_collector.rs create mode 100644 crates/ahm/src/scanner/object_scanner.rs diff --git a/crates/ahm/architecture.md b/crates/ahm/architecture.md new file mode 100644 index 00000000..fab56c49 --- /dev/null +++ b/crates/ahm/architecture.md @@ -0,0 +1,557 @@ +# RustFS Advanced Health & Metrics (AHM) System Architecture + +## Overview + +The RustFS AHM system is a newly designed distributed storage health monitoring and repair system that provides intelligent scanning, automatic repair, rich metrics, and policy-driven management capabilities. 
+ +## System Architecture + +### Overall Architecture Diagram + +``` +┌─────────────────────────────────────┐ +│ API Layer (REST/gRPC) │ +├─────────────────────────────────────┤ +│ Policy & Configuration │ +├─────────────────────────────────────┤ +│ Core Coordination Engine │ +├─────────────────────────────────────┤ +│ Scanner Engine │ Heal Engine │ +├─────────────────────────────────────┤ +│ Metrics & Observability │ +├─────────────────────────────────────┤ +│ Storage Abstraction │ +└─────────────────────────────────────┘ +``` + +### Module Structure + +``` +rustfs/crates/ecstore/src/ahm/ +├── mod.rs # Module entry point and public interfaces +├── core/ # Core engines +│ ├── coordinator.rs # Distributed coordinator - event routing and state management +│ ├── scheduler.rs # Task scheduler - priority queue and work assignment +│ └── lifecycle.rs # Lifecycle manager - system startup/shutdown control +├── scanner/ # Scanning system +│ ├── engine.rs # Scan engine - scan process control +│ ├── object_scanner.rs # Object scanner - object-level integrity checks +│ ├── disk_scanner.rs # Disk scanner - disk-level health checks +│ ├── metrics_collector.rs # Metrics collector - scan process data collection +│ └── bandwidth_limiter.rs # Bandwidth limiter - I/O resource control +├── heal/ # Repair system +│ ├── engine.rs # Heal engine - repair process control +│ ├── priority_queue.rs # Priority queue - repair task ordering +│ ├── repair_worker.rs # Repair worker - actual repair execution +│ └── validation.rs # Repair validator - repair result verification +├── metrics/ # Metrics system +│ ├── collector.rs # Metrics collector - real-time data collection +│ ├── aggregator.rs # Metrics aggregator - data aggregation and computation +│ ├── storage.rs # Metrics storage - time-series data storage +│ └── reporter.rs # Metrics reporter - external system export +├── policy/ # Policy system +│ ├── scan_policy.rs # Scan policy - scan behavior configuration +│ ├── heal_policy.rs # Heal 
policy - repair priority and strategy +│ └── retention_policy.rs # Retention policy - data lifecycle management +└── api/ # API interfaces + ├── admin_api.rs # Admin API - system management operations + ├── metrics_api.rs # Metrics API - metrics query and export + └── status_api.rs # Status API - system status monitoring +``` + +## Core Design Principles + +### 1. Event-Driven Architecture + +```rust +pub enum SystemEvent { + ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata }, + HealthIssueDetected { issue_type: HealthIssueType, severity: Severity }, + HealCompleted { result: HealResult }, + ScanCycleCompleted { statistics: ScanStatistics }, + ResourceUsageUpdated { usage: ResourceUsage }, +} +``` + +- **Scanner** generates discovery events +- **Heal** responds to repair events +- **Metrics** collects all event statistics +- **Policy** controls event processing strategies + +### 2. Layered Modular Design + +#### **API Layer**: REST/gRPC interfaces +- Unified response format +- Comprehensive error handling +- Authentication and authorization support + +#### **Policy Layer**: Configurable business rules +- Scan frequency and depth control +- Repair priority policies +- Data retention rules + +#### **Coordination Layer**: System coordination and scheduling +- Event routing and distribution +- Resource management and allocation +- Task scheduling and execution + +#### **Engine Layer**: Core business logic +- Intelligent scanning algorithms +- Adaptive repair strategies +- Performance optimization control + +#### **Metrics Layer**: Observability support +- Real-time metrics collection +- Historical trend analysis +- Multi-format export + +### 3. 
Multi-Mode Scanning Strategies + +```rust +pub enum ScanStrategy { + Full { mode: ScanMode, scope: ScanScope }, // Full scan + Incremental { since: Instant, mode: ScanMode }, // Incremental scan + Smart { sample_rate: f64, favor_unscanned: bool }, // Smart sampling + Targeted { targets: Vec, mode: ScanMode }, // Targeted scan +} + +pub enum ScanMode { + Quick, // Quick scan - metadata only + Normal, // Normal scan - basic integrity verification + Deep, // Deep scan - includes bit-rot detection +} +``` + +### 4. Priority-Based Repair System + +```rust +pub enum HealPriority { + Low = 0, + Normal = 1, + High = 2, + Critical = 3, + Emergency = 4, +} + +pub enum HealMode { + RealTime, // Real-time repair - triggered on GET/PUT + Background, // Background repair - scheduled tasks + OnDemand, // On-demand repair - admin triggered + Emergency, // Emergency repair - critical issues +} +``` + +## API Usage Guide + +### 1. System Management API + +#### Start AHM System + +```http +POST /admin/system/start +Content-Type: application/json + +{ + "coordinator": { + "event_buffer_size": 10000, + "max_concurrent_operations": 1000 + }, + "scanner": { + "default_scan_mode": "Normal", + "scan_interval": "24h" + }, + "heal": { + "max_workers": 16, + "queue_capacity": 50000 + } +} +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "system_id": "ahm-001", + "status": "Running", + "started_at": "2024-01-15T10:30:00Z" + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +#### Get System Status + +```http +GET /status/health +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "status": "Running", + "version": "1.0.0", + "uptime_seconds": 3600, + "subsystems": { + "scanner": { + "status": "Scanning", + "last_check": "2024-01-15T10:29:00Z", + "error_message": null + }, + "heal": { + "status": "Idle", + "last_check": "2024-01-15T10:29:00Z", + "error_message": null + }, + "metrics": { + "status": "Running", + "last_check": "2024-01-15T10:29:00Z", 
+ "error_message": null + } + } + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +### 2. Scan Management API + +#### Start Scan Task + +```http +POST /admin/scan/start +Content-Type: application/json + +{ + "strategy": { + "type": "Full", + "mode": "Normal", + "scope": { + "buckets": ["important-data", "user-uploads"], + "include_system_objects": false, + "max_objects": 1000000 + } + }, + "priority": "High" +} +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "scan_id": "scan-12345", + "status": "Started", + "estimated_duration": "2h30m", + "estimated_objects": 850000 + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +#### Query Scan Status + +```http +GET /admin/scan/{scan_id}/status +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "scan_id": "scan-12345", + "status": "Scanning", + "progress": { + "objects_scanned": 425000, + "bytes_scanned": 1073741824000, + "issues_detected": 23, + "completion_percentage": 50.0, + "scan_rate_ops": 117.5, + "scan_rate_bps": 268435456, + "elapsed_time": "1h15m", + "estimated_remaining": "1h15m" + }, + "issues": [ + { + "issue_type": "MissingShards", + "severity": "High", + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "description": "Missing 1 data shard", + "detected_at": "2024-01-15T11:15:00Z" + } + ] + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +### 3. 
Heal Management API + +#### Submit Heal Request + +```http +POST /admin/heal/request +Content-Type: application/json + +{ + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "version_id": null, + "priority": "High", + "mode": "OnDemand", + "max_retries": 3 +} +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "heal_request_id": "heal-67890", + "status": "Queued", + "priority": "High", + "estimated_start": "2024-01-15T11:50:00Z", + "queue_position": 5 + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +#### Query Heal Status + +```http +GET /admin/heal/{heal_request_id}/status +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "heal_request_id": "heal-67890", + "status": "Completed", + "result": { + "success": true, + "shards_repaired": 1, + "total_shards": 8, + "duration": "45s", + "strategy_used": "ParityShardRepair", + "validation_results": [ + { + "validation_type": "Checksum", + "passed": true, + "details": "Object checksum verified", + "duration": "2s" + }, + { + "validation_type": "ShardCount", + "passed": true, + "details": "All 8 shards present", + "duration": "1s" + } + ] + } + }, + "timestamp": "2024-01-15T11:46:00Z" +} +``` + +### 4. 
Metrics Query API + +#### Get System Metrics + +```http +GET /metrics/system?period=1h&metrics=objects_total,scan_rate,heal_success_rate +``` + +**Response Example:** +```json +{ + "success": true, + "data": { + "period": "1h", + "timestamp_range": { + "start": "2024-01-15T10:45:00Z", + "end": "2024-01-15T11:45:00Z" + }, + "metrics": { + "objects_total": { + "value": 2500000, + "unit": "count", + "labels": {} + }, + "scan_rate_objects_per_second": { + "value": 117.5, + "unit": "ops", + "labels": {} + }, + "heal_success_rate": { + "value": 0.98, + "unit": "ratio", + "labels": {} + } + } + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +#### Export Prometheus Format Metrics + +```http +GET /metrics/prometheus +``` + +**Response Example:** +``` +# HELP rustfs_objects_total Total number of objects in the system +# TYPE rustfs_objects_total gauge +rustfs_objects_total 2500000 + +# HELP rustfs_scan_rate_objects_per_second Object scanning rate +# TYPE rustfs_scan_rate_objects_per_second gauge +rustfs_scan_rate_objects_per_second 117.5 + +# HELP rustfs_heal_success_rate Healing operation success rate +# TYPE rustfs_heal_success_rate gauge +rustfs_heal_success_rate 0.98 + +# HELP rustfs_health_issues_total Total health issues detected +# TYPE rustfs_health_issues_total counter +rustfs_health_issues_total{severity="critical"} 0 +rustfs_health_issues_total{severity="high"} 3 +rustfs_health_issues_total{severity="medium"} 15 +rustfs_health_issues_total{severity="low"} 45 +``` + +### 5. 
Policy Configuration API + +#### Update Scan Policy + +```http +PUT /admin/policy/scan +Content-Type: application/json + +{ + "default_scan_interval": "12h", + "deep_scan_probability": 0.1, + "bandwidth_limit_mbps": 100, + "concurrent_scanners": 4, + "skip_system_objects": true, + "priority_buckets": ["critical-data", "user-data"] +} +``` + +#### Update Heal Policy + +```http +PUT /admin/policy/heal +Content-Type: application/json + +{ + "max_concurrent_heals": 8, + "emergency_heal_timeout": "5m", + "auto_heal_enabled": true, + "heal_verification_required": true, + "priority_mapping": { + "critical_buckets": "Emergency", + "important_buckets": "High", + "standard_buckets": "Normal" + } +} +``` + +## Usage Examples + +### Complete Monitoring and Repair Workflow + +```bash +# 1. Start AHM system +curl -X POST http://localhost:9000/admin/system/start \ + -H "Content-Type: application/json" \ + -d '{"scanner": {"default_scan_mode": "Normal"}}' + +# 2. Start full scan +SCAN_ID=$(curl -X POST http://localhost:9000/admin/scan/start \ + -H "Content-Type: application/json" \ + -d '{"strategy": {"type": "Full", "mode": "Normal"}}' | \ + jq -r '.data.scan_id') + +# 3. Monitor scan progress +watch "curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | jq '.data.progress'" + +# 4. View discovered issues +curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | \ + jq '.data.issues[]' + +# 5. Start repair for discovered issues +HEAL_ID=$(curl -X POST http://localhost:9000/admin/heal/request \ + -H "Content-Type: application/json" \ + -d '{ + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "priority": "High" + }' | jq -r '.data.heal_request_id') + +# 6. Monitor repair progress +watch "curl -s http://localhost:9000/admin/heal/$HEAL_ID/status | jq '.data'" + +# 7. View system metrics +curl -s http://localhost:9000/metrics/system?period=1h | jq '.data.metrics' + +# 8. 
Export Prometheus metrics +curl -s http://localhost:9000/metrics/prometheus +``` + +## Key Features + +### 1. Intelligent Scanning +- **Multi-level scan modes**: Quick/Normal/Deep three depths +- **Adaptive sampling**: Intelligent object selection based on historical data +- **Bandwidth control**: Configurable I/O resource limits +- **Incremental scanning**: Timestamp-based change detection + +### 2. Intelligent Repair +- **Priority queue**: Repair ordering based on business importance +- **Multiple repair strategies**: Data shard, parity shard, hybrid repair +- **Real-time validation**: Post-repair integrity verification +- **Retry mechanism**: Configurable failure retry policies + +### 3. Rich Metrics +- **Real-time statistics**: Object counts, storage usage, performance metrics +- **Historical trends**: Time-series data storage and analysis +- **Multi-format export**: Prometheus, JSON, CSV formats +- **Custom metrics**: Extensible metrics definition framework + +### 4. Policy-Driven +- **Configurable policies**: Independent configuration for scan, heal, retention policies +- **Dynamic adjustment**: Runtime policy updates without restart +- **Business alignment**: Differentiated handling based on business importance + +## Deployment Recommendations + +### 1. Resource Configuration +- **CPU**: Recommended 16+ cores for parallel scanning and repair +- **Memory**: Recommended 32GB+ for metrics cache and task queues +- **Network**: Recommended gigabit+ bandwidth for cross-node data sync +- **Storage**: Recommended SSD for metrics data storage + +### 2. Monitoring Integration +- **Prometheus**: Metrics collection and alerting +- **Grafana**: Visualization dashboards +- **ELK Stack**: Log aggregation and analysis +- **Jaeger**: Distributed tracing + +### 3. 
High Availability Deployment +- **Multi-instance deployment**: Avoid single points of failure +- **Load balancing**: API request distribution +- **Data backup**: Metrics and configuration data backup +- **Failover**: Automatic failure detection and switching + +This architecture design provides RustFS with modern, scalable, and highly observable health monitoring and repair capabilities that meet the operational requirements of enterprise-grade distributed storage systems. \ No newline at end of file diff --git a/crates/ahm/architecture_ch.md b/crates/ahm/architecture_ch.md new file mode 100644 index 00000000..e349cf51 --- /dev/null +++ b/crates/ahm/architecture_ch.md @@ -0,0 +1,557 @@ +# RustFS Advanced Health & Metrics (AHM) 系统架构设计 + +## 概述 + +RustFS AHM 系统是一个全新设计的分布式存储健康监控和修复系统,提供智能扫描、自动修复、丰富指标和策略驱动的管理能力。 + +## 系统架构 + +### 整体架构图 + +``` +┌─────────────────────────────────────┐ +│ API Layer (REST/gRPC) │ +├─────────────────────────────────────┤ +│ Policy & Configuration │ +├─────────────────────────────────────┤ +│ Core Coordination Engine │ +├─────────────────────────────────────┤ +│ Scanner Engine │ Heal Engine │ +├─────────────────────────────────────┤ +│ Metrics & Observability │ +├─────────────────────────────────────┤ +│ Storage Abstraction │ +└─────────────────────────────────────┘ +``` + +### 模块结构 + +``` +rustfs/crates/ecstore/src/ahm/ +├── mod.rs # 模块入口和公共接口 +├── core/ # 核心引擎 +│ ├── coordinator.rs # 分布式协调器 - 事件路由和状态管理 +│ ├── scheduler.rs # 任务调度器 - 优先级队列和工作分配 +│ └── lifecycle.rs # 生命周期管理器 - 系统启停控制 +├── scanner/ # 扫描系统 +│ ├── engine.rs # 扫描引擎 - 扫描流程控制 +│ ├── object_scanner.rs # 对象扫描器 - 对象级完整性检查 +│ ├── disk_scanner.rs # 磁盘扫描器 - 磁盘级健康检查 +│ ├── metrics_collector.rs # 指标收集器 - 扫描过程数据收集 +│ └── bandwidth_limiter.rs # 带宽限制器 - I/O 资源控制 +├── heal/ # 修复系统 +│ ├── engine.rs # 修复引擎 - 修复流程控制 +│ ├── priority_queue.rs # 优先级队列 - 修复任务排序 +│ ├── repair_worker.rs # 修复工作器 - 实际修复执行 +│ └── validation.rs # 修复验证器 - 修复结果验证 +├── metrics/ # 指标系统 +│ ├── collector.rs # 指标收集器 - 实时数据收集 +│ 
├── aggregator.rs # 指标聚合器 - 数据聚合计算 +│ ├── storage.rs # 指标存储器 - 时序数据存储 +│ └── reporter.rs # 指标报告器 - 外部系统导出 +├── policy/ # 策略系统 +│ ├── scan_policy.rs # 扫描策略 - 扫描行为配置 +│ ├── heal_policy.rs # 修复策略 - 修复优先级和策略 +│ └── retention_policy.rs # 保留策略 - 数据生命周期管理 +└── api/ # API接口 + ├── admin_api.rs # 管理API - 系统管理操作 + ├── metrics_api.rs # 指标API - 指标查询和导出 + └── status_api.rs # 状态API - 系统状态监控 +``` + +## 核心设计理念 + +### 1. 事件驱动架构 + +```rust +pub enum SystemEvent { + ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata }, + HealthIssueDetected { issue_type: HealthIssueType, severity: Severity }, + HealCompleted { result: HealResult }, + ScanCycleCompleted { statistics: ScanStatistics }, + ResourceUsageUpdated { usage: ResourceUsage }, +} +``` + +- **Scanner** 产生发现事件 +- **Heal** 响应修复事件 +- **Metrics** 收集所有事件统计 +- **Policy** 控制事件处理策略 + +### 2. 分层模块化设计 + +#### **API层**: REST/gRPC接口 +- 统一的响应格式 +- 完整的错误处理 +- 认证和授权支持 + +#### **策略层**: 可配置的业务规则 +- 扫描频率和深度控制 +- 修复优先级策略 +- 数据保留规则 + +#### **协调层**: 系统协调和调度 +- 事件路由分发 +- 资源管理分配 +- 任务调度执行 + +#### **引擎层**: 核心业务逻辑 +- 智能扫描算法 +- 自适应修复策略 +- 性能优化控制 + +#### **指标层**: 可观测性支持 +- 实时指标收集 +- 历史趋势分析 +- 多格式导出 + +### 3. 多模式扫描策略 + +```rust +pub enum ScanStrategy { + Full { mode: ScanMode, scope: ScanScope }, // 全量扫描 + Incremental { since: Instant, mode: ScanMode }, // 增量扫描 + Smart { sample_rate: f64, favor_unscanned: bool }, // 智能采样 + Targeted { targets: Vec, mode: ScanMode }, // 定向扫描 +} + +pub enum ScanMode { + Quick, // 快速扫描 - 仅元数据检查 + Normal, // 标准扫描 - 基础完整性验证 + Deep, // 深度扫描 - 包含位腐蚀检测 +} +``` + +### 4. 优先级修复系统 + +```rust +pub enum HealPriority { + Low = 0, + Normal = 1, + High = 2, + Critical = 3, + Emergency = 4, +} + +pub enum HealMode { + RealTime, // 实时修复 - GET/PUT时触发 + Background, // 后台修复 - 计划任务 + OnDemand, // 按需修复 - 管理员触发 + Emergency, // 紧急修复 - 关键问题 +} +``` + +## API 使用指南 + +### 1. 
系统管理 API + +#### 启动 AHM 系统 + +```http +POST /admin/system/start +Content-Type: application/json + +{ + "coordinator": { + "event_buffer_size": 10000, + "max_concurrent_operations": 1000 + }, + "scanner": { + "default_scan_mode": "Normal", + "scan_interval": "24h" + }, + "heal": { + "max_workers": 16, + "queue_capacity": 50000 + } +} +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "system_id": "ahm-001", + "status": "Running", + "started_at": "2024-01-15T10:30:00Z" + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +#### 获取系统状态 + +```http +GET /status/health +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "status": "Running", + "version": "1.0.0", + "uptime_seconds": 3600, + "subsystems": { + "scanner": { + "status": "Scanning", + "last_check": "2024-01-15T10:29:00Z", + "error_message": null + }, + "heal": { + "status": "Idle", + "last_check": "2024-01-15T10:29:00Z", + "error_message": null + }, + "metrics": { + "status": "Running", + "last_check": "2024-01-15T10:29:00Z", + "error_message": null + } + } + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +### 2. 
扫描管理 API + +#### 启动扫描任务 + +```http +POST /admin/scan/start +Content-Type: application/json + +{ + "strategy": { + "type": "Full", + "mode": "Normal", + "scope": { + "buckets": ["important-data", "user-uploads"], + "include_system_objects": false, + "max_objects": 1000000 + } + }, + "priority": "High" +} +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "scan_id": "scan-12345", + "status": "Started", + "estimated_duration": "2h30m", + "estimated_objects": 850000 + }, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +#### 查询扫描状态 + +```http +GET /admin/scan/{scan_id}/status +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "scan_id": "scan-12345", + "status": "Scanning", + "progress": { + "objects_scanned": 425000, + "bytes_scanned": 1073741824000, + "issues_detected": 23, + "completion_percentage": 50.0, + "scan_rate_ops": 117.5, + "scan_rate_bps": 268435456, + "elapsed_time": "1h15m", + "estimated_remaining": "1h15m" + }, + "issues": [ + { + "issue_type": "MissingShards", + "severity": "High", + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "description": "Missing 1 data shard", + "detected_at": "2024-01-15T11:15:00Z" + } + ] + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +### 3. 
修复管理 API + +#### 提交修复请求 + +```http +POST /admin/heal/request +Content-Type: application/json + +{ + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "version_id": null, + "priority": "High", + "mode": "OnDemand", + "max_retries": 3 +} +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "heal_request_id": "heal-67890", + "status": "Queued", + "priority": "High", + "estimated_start": "2024-01-15T11:50:00Z", + "queue_position": 5 + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +#### 查询修复状态 + +```http +GET /admin/heal/{heal_request_id}/status +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "heal_request_id": "heal-67890", + "status": "Completed", + "result": { + "success": true, + "shards_repaired": 1, + "total_shards": 8, + "duration": "45s", + "strategy_used": "ParityShardRepair", + "validation_results": [ + { + "validation_type": "Checksum", + "passed": true, + "details": "Object checksum verified", + "duration": "2s" + }, + { + "validation_type": "ShardCount", + "passed": true, + "details": "All 8 shards present", + "duration": "1s" + } + ] + } + }, + "timestamp": "2024-01-15T11:46:00Z" +} +``` + +### 4. 
指标查询 API + +#### 获取系统指标 + +```http +GET /metrics/system?period=1h&metrics=objects_total,scan_rate,heal_success_rate +``` + +**响应示例:** +```json +{ + "success": true, + "data": { + "period": "1h", + "timestamp_range": { + "start": "2024-01-15T10:45:00Z", + "end": "2024-01-15T11:45:00Z" + }, + "metrics": { + "objects_total": { + "value": 2500000, + "unit": "count", + "labels": {} + }, + "scan_rate_objects_per_second": { + "value": 117.5, + "unit": "ops", + "labels": {} + }, + "heal_success_rate": { + "value": 0.98, + "unit": "ratio", + "labels": {} + } + } + }, + "timestamp": "2024-01-15T11:45:00Z" +} +``` + +#### 导出 Prometheus 格式指标 + +```http +GET /metrics/prometheus +``` + +**响应示例:** +``` +# HELP rustfs_objects_total Total number of objects in the system +# TYPE rustfs_objects_total gauge +rustfs_objects_total 2500000 + +# HELP rustfs_scan_rate_objects_per_second Object scanning rate +# TYPE rustfs_scan_rate_objects_per_second gauge +rustfs_scan_rate_objects_per_second 117.5 + +# HELP rustfs_heal_success_rate Healing operation success rate +# TYPE rustfs_heal_success_rate gauge +rustfs_heal_success_rate 0.98 + +# HELP rustfs_health_issues_total Total health issues detected +# TYPE rustfs_health_issues_total counter +rustfs_health_issues_total{severity="critical"} 0 +rustfs_health_issues_total{severity="high"} 3 +rustfs_health_issues_total{severity="medium"} 15 +rustfs_health_issues_total{severity="low"} 45 +``` + +### 5. 
策略配置 API + +#### 更新扫描策略 + +```http +PUT /admin/policy/scan +Content-Type: application/json + +{ + "default_scan_interval": "12h", + "deep_scan_probability": 0.1, + "bandwidth_limit_mbps": 100, + "concurrent_scanners": 4, + "skip_system_objects": true, + "priority_buckets": ["critical-data", "user-data"] +} +``` + +#### 更新修复策略 + +```http +PUT /admin/policy/heal +Content-Type: application/json + +{ + "max_concurrent_heals": 8, + "emergency_heal_timeout": "5m", + "auto_heal_enabled": true, + "heal_verification_required": true, + "priority_mapping": { + "critical_buckets": "Emergency", + "important_buckets": "High", + "standard_buckets": "Normal" + } +} +``` + +## 使用示例 + +### 完整的监控和修复流程 + +```bash +# 1. 启动 AHM 系统 +curl -X POST http://localhost:9000/admin/system/start \ + -H "Content-Type: application/json" \ + -d '{"scanner": {"default_scan_mode": "Normal"}}' + +# 2. 启动全量扫描 +SCAN_ID=$(curl -X POST http://localhost:9000/admin/scan/start \ + -H "Content-Type: application/json" \ + -d '{"strategy": {"type": "Full", "mode": "Normal"}}' | \ + jq -r '.data.scan_id') + +# 3. 监控扫描进度 +watch "curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | jq '.data.progress'" + +# 4. 查看发现的问题 +curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | \ + jq '.data.issues[]' + +# 5. 针对发现的问题启动修复 +HEAL_ID=$(curl -X POST http://localhost:9000/admin/heal/request \ + -H "Content-Type: application/json" \ + -d '{ + "bucket": "user-uploads", + "object": "photos/IMG_001.jpg", + "priority": "High" + }' | jq -r '.data.heal_request_id') + +# 6. 监控修复进度 +watch "curl -s http://localhost:9000/admin/heal/$HEAL_ID/status | jq '.data'" + +# 7. 查看系统指标 +curl -s http://localhost:9000/metrics/system?period=1h | jq '.data.metrics' + +# 8. 导出 Prometheus 指标 +curl -s http://localhost:9000/metrics/prometheus +``` + +## 关键特性 + +### 1. 智能扫描 +- **多级扫描模式**: Quick/Normal/Deep 三种深度 +- **自适应采样**: 基于历史数据智能选择扫描对象 +- **带宽控制**: 可配置的 I/O 资源限制 +- **增量扫描**: 基于时间戳的变化检测 + +### 2. 
智能修复 +- **优先级队列**: 基于业务重要性的修复排序 +- **多种修复策略**: 数据分片、奇偶校验、混合修复 +- **实时验证**: 修复后的完整性验证 +- **重试机制**: 可配置的失败重试策略 + +### 3. 丰富指标 +- **实时统计**: 对象数量、存储使用、性能指标 +- **历史趋势**: 时序数据存储和分析 +- **多格式导出**: Prometheus、JSON、CSV 等格式 +- **自定义指标**: 可扩展的指标定义框架 + +### 4. 策略驱动 +- **可配置策略**: 扫描、修复、保留策略独立配置 +- **动态调整**: 运行时策略更新,无需重启 +- **业务对齐**: 基于业务重要性的差异化处理 + +## 部署建议 + +### 1. 资源配置 +- **CPU**: 推荐 16+ 核心用于并行扫描和修复 +- **内存**: 推荐 32GB+ 用于指标缓存和任务队列 +- **网络**: 推荐千兆以上带宽用于跨节点数据同步 +- **存储**: 推荐 SSD 用于指标数据存储 + +### 2. 监控集成 +- **Prometheus**: 指标收集和告警 +- **Grafana**: 可视化仪表板 +- **ELK Stack**: 日志聚合和分析 +- **Jaeger**: 分布式链路追踪 + +### 3. 高可用部署 +- **多实例部署**: 避免单点故障 +- **负载均衡**: API 请求分发 +- **数据备份**: 指标和配置数据备份 +- **故障转移**: 自动故障检测和切换 + +这个架构设计为 RustFS 提供了现代化、可扩展、高可观测的健康监控和修复能力,能够满足企业级分布式存储系统的运维需求。 \ No newline at end of file diff --git a/crates/ahm/src/api/admin_api.rs b/crates/ahm/src/api/admin_api.rs new file mode 100644 index 00000000..19c1a494 --- /dev/null +++ b/crates/ahm/src/api/admin_api.rs @@ -0,0 +1,843 @@ +// Copyright 2024 RustFS Team + +use std::sync::Arc; + +use tracing::{debug, error, info, warn}; + +use crate::{ + error::Result, + heal::HealEngine, + policy::{ScanPolicyEngine as PolicyEngine}, + scanner::{Engine as ScanEngine}, +}; + +use super::{HttpRequest, HttpResponse}; + +/// Configuration for the admin API +#[derive(Debug, Clone)] +pub struct AdminApiConfig { + /// Whether to enable admin API + pub enabled: bool, + /// Admin API prefix + pub prefix: String, + /// Authentication required + pub require_auth: bool, + /// Admin token + pub admin_token: Option, + /// Rate limiting for admin endpoints + pub rate_limit_requests_per_minute: u32, + /// Maximum request body size + pub max_request_size: usize, + /// Enable audit logging + pub enable_audit_logging: bool, + /// Audit log path + pub audit_log_path: Option, +} + +impl Default for AdminApiConfig { + fn default() -> Self { + Self { + enabled: true, + prefix: "/admin".to_string(), + require_auth: true, + admin_token: 
Some("admin-secret-token".to_string()), + rate_limit_requests_per_minute: 100, + max_request_size: 1024 * 1024, // 1 MB + enable_audit_logging: true, + audit_log_path: Some("/tmp/rustfs/admin-audit.log".to_string()), + } + } +} + +/// Admin API that provides administrative operations +pub struct AdminApi { + config: AdminApiConfig, + scan_engine: Arc, + heal_engine: Arc, + policy_engine: Arc, +} + +impl AdminApi { + /// Create a new admin API + pub async fn new( + config: AdminApiConfig, + scan_engine: Arc, + heal_engine: Arc, + policy_engine: Arc, + ) -> Result { + Ok(Self { + config, + scan_engine, + heal_engine, + policy_engine, + }) + } + + /// Get the configuration + pub fn config(&self) -> &AdminApiConfig { + &self.config + } + + /// Handle HTTP request + pub async fn handle_request(&self, request: HttpRequest) -> Result { + // Check authentication if required + if self.config.require_auth { + if !self.authenticate_request(&request).await? { + return Ok(HttpResponse { + status_code: 401, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Unauthorized", + "message": "Authentication required" + }).to_string(), + }); + } + } + + // Log audit if enabled + if self.config.enable_audit_logging { + self.log_audit(&request).await?; + } + + match request.path.as_str() { + // Scan operations + "/admin/scan/start" => self.start_scan(request).await, + "/admin/scan/stop" => self.stop_scan(request).await, + "/admin/scan/status" => self.get_scan_status(request).await, + "/admin/scan/config" => self.get_scan_config(request).await, + "/admin/scan/config" if request.method == "PUT" => self.update_scan_config(request).await, + + // Heal operations + "/admin/heal/start" => self.start_heal(request).await, + "/admin/heal/stop" => self.stop_heal(request).await, + "/admin/heal/status" => self.get_heal_status(request).await, + "/admin/heal/config" => self.get_heal_config(request).await, + "/admin/heal/config" if 
request.method == "PUT" => self.update_heal_config(request).await, + + // Policy operations + "/admin/policy/list" => self.list_policies(request).await, + "/admin/policy/get" => self.get_policy(request).await, + "/admin/policy/create" => self.create_policy(request).await, + "/admin/policy/update" => self.update_policy(request).await, + "/admin/policy/delete" => self.delete_policy(request).await, + + // System operations + "/admin/system/status" => self.get_system_status(request).await, + "/admin/system/config" => self.get_system_config(request).await, + "/admin/system/restart" => self.restart_system(request).await, + "/admin/system/shutdown" => self.shutdown_system(request).await, + + // Default 404 + _ => Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Not Found", + "message": "Admin endpoint not found" + }).to_string(), + }), + } + } + + /// Authenticate request + async fn authenticate_request(&self, request: &HttpRequest) -> Result { + if let Some(token) = &self.config.admin_token { + // Check for Authorization header + if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { + if auth_header.1 == format!("Bearer {}", token) { + return Ok(true); + } + } + + // Check for token in query parameters + if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { + if token_param.1 == *token { + return Ok(true); + } + } + } + + Ok(false) + } + + /// Log audit entry + async fn log_audit(&self, request: &HttpRequest) -> Result<()> { + let audit_entry = serde_json::json!({ + "timestamp": chrono::Utc::now().to_rfc3339(), + "method": request.method, + "path": request.path, + "ip": "127.0.0.1", // In real implementation, get from request + "user_agent": "admin-api", // In real implementation, get from headers + }); + + if let Some(log_path) = &self.config.audit_log_path { + // In a real 
implementation, this would write to the audit log file + debug!("Audit log entry: {}", audit_entry); + } + + Ok(()) + } + + /// Start scan operation + async fn start_scan(&self, _request: HttpRequest) -> Result { + match self.scan_engine.start_scan().await { + Ok(_) => { + info!("Scan started via admin API"); + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Scan started successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to start scan: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to start scan: {}", e) + }).to_string(), + }) + } + } + } + + /// Stop scan operation + async fn stop_scan(&self, _request: HttpRequest) -> Result { + match self.scan_engine.stop_scan().await { + Ok(_) => { + info!("Scan stopped via admin API"); + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Scan stopped successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to stop scan: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to stop scan: {}", e) + }).to_string(), + }) + } + } + } + + /// Get scan status + async fn get_scan_status(&self, _request: HttpRequest) -> Result { + let status = self.scan_engine.get_status().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: 
serde_json::json!({ + "status": "success", + "scan_status": status, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get scan configuration + async fn get_scan_config(&self, _request: HttpRequest) -> Result { + let config = self.scan_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "scan_config": config, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Update scan configuration + async fn update_scan_config(&self, request: HttpRequest) -> Result { + if let Some(body) = request.body { + match serde_json::from_str::(&body) { + Ok(config_json) => { + // In a real implementation, this would update the scan configuration + info!("Scan config updated via admin API: {:?}", config_json); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Scan configuration updated successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": format!("Invalid JSON: {}", e) + }).to_string(), + }) + } + } + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Request body required" + }).to_string(), + }) + } + } + + /// Start heal operation + async fn start_heal(&self, _request: HttpRequest) -> Result { + match self.heal_engine.start_healing().await { + Ok(_) => { + info!("Healing started via admin API"); + Ok(HttpResponse { + status_code: 200, + headers: 
vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Healing started successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to start healing: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to start healing: {}", e) + }).to_string(), + }) + } + } + } + + /// Stop heal operation + async fn stop_heal(&self, _request: HttpRequest) -> Result { + match self.heal_engine.stop_healing().await { + Ok(_) => { + info!("Healing stopped via admin API"); + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Healing stopped successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to stop healing: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to stop healing: {}", e) + }).to_string(), + }) + } + } + } + + /// Get heal status + async fn get_heal_status(&self, _request: HttpRequest) -> Result { + let status = self.heal_engine.get_status().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "heal_status": status, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get heal configuration + async fn get_heal_config(&self, _request: HttpRequest) -> Result { + let config = self.heal_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + 
headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "heal_config": config, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Update heal configuration + async fn update_heal_config(&self, request: HttpRequest) -> Result { + if let Some(body) = request.body { + match serde_json::from_str::(&body) { + Ok(config_json) => { + // In a real implementation, this would update the heal configuration + info!("Heal config updated via admin API: {:?}", config_json); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Heal configuration updated successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": format!("Invalid JSON: {}", e) + }).to_string(), + }) + } + } + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Request body required" + }).to_string(), + }) + } + } + + /// List policies + async fn list_policies(&self, _request: HttpRequest) -> Result { + let policies = self.policy_engine.list_policies().await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "policies": policies, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get policy + async fn get_policy(&self, request: HttpRequest) -> Result { + if let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") { + match 
self.policy_engine.get_policy(&policy_name.1).await { + Ok(policy) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "policy": policy, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Not Found", + "message": format!("Policy not found: {}", e) + }).to_string(), + }) + } + } + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Policy name parameter required" + }).to_string(), + }) + } + } + + /// Create policy + async fn create_policy(&self, request: HttpRequest) -> Result { + if let Some(body) = request.body { + match serde_json::from_str::(&body) { + Ok(policy_json) => { + // In a real implementation, this would create the policy + info!("Policy created via admin API: {:?}", policy_json); + + Ok(HttpResponse { + status_code: 201, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Policy created successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": format!("Invalid JSON: {}", e) + }).to_string(), + }) + } + } + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Request body required" + }).to_string(), + }) + } + } + + /// Update policy + 
async fn update_policy(&self, request: HttpRequest) -> Result { + if let Some(body) = request.body { + match serde_json::from_str::(&body) { + Ok(policy_json) => { + // In a real implementation, this would update the policy + info!("Policy updated via admin API: {:?}", policy_json); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Policy updated successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": format!("Invalid JSON: {}", e) + }).to_string(), + }) + } + } + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Request body required" + }).to_string(), + }) + } + } + + /// Delete policy + async fn delete_policy(&self, request: HttpRequest) -> Result { + if let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") { + // In a real implementation, this would delete the policy + info!("Policy deleted via admin API: {}", policy_name.1); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "Policy deleted successfully", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } else { + Ok(HttpResponse { + status_code: 400, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Bad Request", + "message": "Policy name parameter required" + }).to_string(), + }) + } + } + + /// Get system status + async fn get_system_status(&self, 
_request: HttpRequest) -> Result { + let scan_status = self.scan_engine.get_status().await; + let heal_status = self.heal_engine.get_status().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "system_status": { + "scan": scan_status, + "heal": heal_status, + "overall": "healthy" + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get system configuration + async fn get_system_config(&self, _request: HttpRequest) -> Result { + let scan_config = self.scan_engine.get_config().await; + let heal_config = self.heal_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "system_config": { + "scan": scan_config, + "heal": heal_config + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Restart system + async fn restart_system(&self, _request: HttpRequest) -> Result { + // In a real implementation, this would restart the system + info!("System restart requested via admin API"); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "System restart initiated", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Shutdown system + async fn shutdown_system(&self, _request: HttpRequest) -> Result { + // In a real implementation, this would shutdown the system + info!("System shutdown requested via admin API"); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "message": "System shutdown initiated", + "timestamp": chrono::Utc::now().to_rfc3339() + 
}).to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + heal::HealEngineConfig, + policy::PolicyEngineConfig, + scanner::ScanEngineConfig, + }; + + #[tokio::test] + async fn test_admin_api_creation() { + let config = AdminApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + assert!(admin_api.config().enabled); + assert_eq!(admin_api.config().prefix, "/admin"); + } + + #[tokio::test] + async fn test_authentication() { + let config = AdminApiConfig { + admin_token: Some("test-token".to_string()), + ..Default::default() + }; + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + // Test with valid token in header + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/scan/status".to_string(), + headers: vec![("Authorization".to_string(), "Bearer test-token".to_string())], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + + // Test with valid token in query + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/scan/status".to_string(), + headers: vec![], + body: None, + query_params: vec![("token".to_string(), "test-token".to_string())], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + 
assert_eq!(response.status_code, 200); + + // Test with invalid token + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/scan/status".to_string(), + headers: vec![("Authorization".to_string(), "Bearer invalid-token".to_string())], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 401); + } + + #[tokio::test] + async fn test_scan_operations() { + let config = AdminApiConfig { + require_auth: false, // Disable auth for testing + ..Default::default() + }; + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + // Test start scan + let request = HttpRequest { + method: "POST".to_string(), + path: "/admin/scan/start".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + + // Test get scan status + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/scan/status".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + } + + #[tokio::test] + async fn test_heal_operations() { + let config = AdminApiConfig { + require_auth: false, // Disable auth for testing + ..Default::default() + }; + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = 
Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + // Test start heal + let request = HttpRequest { + method: "POST".to_string(), + path: "/admin/heal/start".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + + // Test get heal status + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/heal/status".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + } + + #[tokio::test] + async fn test_system_operations() { + let config = AdminApiConfig { + require_auth: false, // Disable auth for testing + ..Default::default() + }; + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + // Test get system status + let request = HttpRequest { + method: "GET".to_string(), + path: "/admin/system/status".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = admin_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("system_status")); + } +} \ No newline at end of file diff --git a/crates/ahm/src/api/metrics_api.rs b/crates/ahm/src/api/metrics_api.rs new file mode 100644 index 00000000..06f1efa5 --- /dev/null +++ b/crates/ahm/src/api/metrics_api.rs @@ -0,0 +1,1180 @@ +// Copyright 2024 RustFS Team + +use std::sync::Arc; +use std::time::{SystemTime, 
Duration}; + +use tracing::{debug, error, info, warn}; + +use crate::{ + error::Result, + metrics::{Collector, Reporter, Storage, MetricsQuery, MetricType}, +}; + +use super::{HttpRequest, HttpResponse}; + +/// Configuration for the metrics API +#[derive(Debug, Clone)] +pub struct MetricsApiConfig { + /// Whether to enable metrics API + pub enabled: bool, + /// Metrics API prefix + pub prefix: String, + /// Authentication required + pub require_auth: bool, + /// Metrics token + pub metrics_token: Option, + /// Rate limiting for metrics endpoints + pub rate_limit_requests_per_minute: u32, + /// Maximum request body size + pub max_request_size: usize, + /// Enable metrics caching + pub enable_caching: bool, + /// Cache TTL in seconds + pub cache_ttl_seconds: u64, + /// Enable metrics compression + pub enable_compression: bool, + /// Default metrics format + pub default_format: MetricsFormat, +} + +impl Default for MetricsApiConfig { + fn default() -> Self { + Self { + enabled: true, + prefix: "/metrics".to_string(), + require_auth: false, + metrics_token: None, + rate_limit_requests_per_minute: 1000, + max_request_size: 1024 * 1024, // 1 MB + enable_caching: true, + cache_ttl_seconds: 300, // 5 minutes + enable_compression: true, + default_format: MetricsFormat::Json, + } + } +} + +/// Metrics format +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsFormat { + Json, + Prometheus, + Csv, + Xml, +} + +/// Backup report +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BackupReport { + pub timestamp: SystemTime, + pub backup_id: String, + pub status: BackupStatus, + pub objects_backed_up: u64, + pub total_size: u64, + pub duration: Duration, + pub errors: Vec, +} + +/// Restore report +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RestoreReport { + pub timestamp: SystemTime, + pub restore_id: String, + pub status: RestoreStatus, + pub objects_restored: u64, + pub total_size: u64, + pub duration: Duration, + pub errors: Vec, +} + 
+/// Data integrity report +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataIntegrityReport { + pub timestamp: SystemTime, + pub validation_id: String, + pub status: ValidationStatus, + pub objects_validated: u64, + pub corrupted_objects: u64, + pub duration: Duration, + pub details: Vec, +} + +/// Backup status +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum BackupStatus { + Pending, + InProgress, + Completed, + Failed, +} + +/// Restore status +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum RestoreStatus { + Pending, + InProgress, + Completed, + Failed, +} + +/// Validation status +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ValidationStatus { + Pending, + InProgress, + Completed, + Failed, +} + +/// Validation detail +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ValidationDetail { + pub object_path: String, + pub status: ValidationResult, + pub error_message: Option, +} + +/// Validation result +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ValidationResult { + Valid, + Corrupted, + Missing, + AccessDenied, +} + +/// Metrics API that provides metrics data and operations +pub struct MetricsApi { + config: MetricsApiConfig, + collector: Arc, + reporter: Arc, + storage: Arc, +} + +impl MetricsApi { + /// Create a new metrics API + pub async fn new( + config: MetricsApiConfig, + collector: Arc, + reporter: Arc, + storage: Arc, + ) -> Result { + Ok(Self { + config, + collector, + reporter, + storage, + }) + } + + /// Get the configuration + pub fn config(&self) -> &MetricsApiConfig { + &self.config + } + + /// Handle HTTP request + pub async fn handle_request(&self, request: HttpRequest) -> Result { + // Check authentication if required + if self.config.require_auth { + if !self.authenticate_request(&request).await? 
{ + return Ok(HttpResponse { + status_code: 401, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Unauthorized", + "message": "Authentication required" + }).to_string(), + }) + } + } + + match request.path.as_str() { + // Current metrics + "/metrics/current" => self.get_current_metrics(request).await, + "/metrics/latest" => self.get_latest_metrics(request).await, + + // Historical metrics + "/metrics/history" => self.get_metrics_history(request).await, + "/metrics/range" => self.get_metrics_range(request).await, + + // Aggregated metrics + "/metrics/aggregated" => self.get_aggregated_metrics(request).await, + "/metrics/summary" => self.get_metrics_summary(request).await, + + // Specific metric types + "/metrics/system" => self.get_system_metrics(request).await, + "/metrics/scan" => self.get_scan_metrics(request).await, + "/metrics/heal" => self.get_heal_metrics(request).await, + "/metrics/policy" => self.get_policy_metrics(request).await, + "/metrics/network" => self.get_network_metrics(request).await, + "/metrics/disk" => self.get_disk_metrics(request).await, + + // Health issues + "/metrics/health-issues" => self.get_health_issues(request).await, + "/metrics/alerts" => self.get_alerts(request).await, + + // Reports + "/metrics/reports" => self.get_reports(request).await, + "/metrics/reports/comprehensive" => self.get_comprehensive_report(request).await, + + // Prometheus format + "/metrics/prometheus" => self.get_prometheus_metrics(request).await, + + // Storage operations + "/metrics/storage/backup" => self.backup_metrics(request).await, + "/metrics/storage/restore" => self.restore_metrics(request).await, + "/metrics/storage/validate" => self.validate_metrics(request).await, + + // Default 404 + _ => Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Not Found", + "message": "Metrics 
endpoint not found" + }).to_string(), + }), + } + } + + /// Authenticate request + async fn authenticate_request(&self, request: &HttpRequest) -> Result { + if let Some(token) = &self.config.metrics_token { + // Check for Authorization header + if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { + if auth_header.1 == format!("Bearer {}", token) { + return Ok(true); + } + } + + // Check for token in query parameters + if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { + if token_param.1 == *token { + return Ok(true); + } + } + } + + Ok(false) + } + + /// Get current metrics + async fn get_current_metrics(&self, _request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let format = self.get_request_format(&_request); + let body = self.format_metrics(&metrics, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to collect current metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to collect metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get latest metrics + async fn get_latest_metrics(&self, _request: HttpRequest) -> Result { + match self.collector.get_latest_metrics().await { + Ok(Some(metrics)) => { + let format = self.get_request_format(&_request); + let body = self.format_metrics(&metrics, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Ok(None) => { + Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + 
"error": "Not Found", + "message": "No metrics available" + }).to_string(), + }) + } + Err(e) => { + error!("Failed to get latest metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get latest metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get metrics history + async fn get_metrics_history(&self, request: HttpRequest) -> Result { + let hours = request.query_params + .iter() + .find(|(k, _)| k == "hours") + .and_then(|(_, v)| v.parse::().ok()) + .unwrap_or(24); + + let end_time = std::time::SystemTime::now(); + let start_time = end_time - std::time::Duration::from_secs(hours * 3600); + + let query = MetricsQuery { + start_time, + end_time, + interval: std::time::Duration::from_secs(300), // 5 minutes + metrics: vec![], + severity_filter: None, + limit: None, + }; + + match self.collector.query_metrics(query).await { + Ok(aggregated) => { + let format = self.get_request_format(&request); + let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get metrics history: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get metrics history: {}", e) + }).to_string(), + }) + } + } + } + + /// Get metrics range + async fn get_metrics_range(&self, request: HttpRequest) -> Result { + let start_time = request.query_params + .iter() + .find(|(k, _)| k == "start") + .and_then(|(_, v)| v.parse::().ok()) + .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) + .unwrap_or_else(|| 
std::time::SystemTime::now() - std::time::Duration::from_secs(3600)); + + let end_time = request.query_params + .iter() + .find(|(k, _)| k == "end") + .and_then(|(_, v)| v.parse::().ok()) + .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) + .unwrap_or_else(std::time::SystemTime::now); + + let query = MetricsQuery { + start_time, + end_time, + interval: std::time::Duration::from_secs(300), // 5 minutes + metrics: vec![], + severity_filter: None, + limit: None, + }; + + match self.collector.query_metrics(query).await { + Ok(aggregated) => { + let format = self.get_request_format(&request); + let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get metrics range: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get metrics range: {}", e) + }).to_string(), + }) + } + } + } + + /// Get aggregated metrics + async fn get_aggregated_metrics(&self, request: HttpRequest) -> Result { + let query = self.parse_metrics_query(&request)?; + + match self.collector.query_metrics(query).await { + Ok(aggregated) => { + let format = self.get_request_format(&request); + let body = self.format_aggregated_metrics(&aggregated, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get aggregated metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get aggregated metrics: {}", e) 
+ }).to_string(), + }) + } + } + } + + /// Get metrics summary + async fn get_metrics_summary(&self, request: HttpRequest) -> Result { + let hours = request.query_params + .iter() + .find(|(k, _)| k == "hours") + .and_then(|(_, v)| v.parse::().ok()) + .unwrap_or(24); + + let end_time = std::time::SystemTime::now(); + let start_time = end_time - std::time::Duration::from_secs(hours * 3600); + + let query = MetricsQuery { + start_time, + end_time, + interval: std::time::Duration::from_secs(3600), // 1 hour + metrics: vec![], + severity_filter: None, + limit: None, + }; + + match self.collector.query_metrics(query).await { + Ok(aggregated) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "summary": aggregated.summary, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to get metrics summary: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get metrics summary: {}", e) + }).to_string(), + }) + } + } + } + + /// Get system metrics + async fn get_system_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let system_data = serde_json::json!({ + "cpu_usage": metrics.cpu_usage, + "memory_usage": metrics.memory_usage, + "disk_usage": metrics.disk_usage, + "system_load": metrics.system_load, + "active_operations": metrics.active_operations, + "network_io": metrics.network_io, + "disk_io": metrics.disk_io, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&system_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, 
+ }) + } + Err(e) => { + error!("Failed to get system metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get system metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get scan metrics + async fn get_scan_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let scan_data = serde_json::json!({ + "scan_metrics": metrics.scan_metrics, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&scan_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get scan metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get scan metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get heal metrics + async fn get_heal_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let heal_data = serde_json::json!({ + "heal_metrics": metrics.heal_metrics, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&heal_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get heal metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get heal metrics: {}", 
e) + }).to_string(), + }) + } + } + } + + /// Get policy metrics + async fn get_policy_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let policy_data = serde_json::json!({ + "policy_metrics": metrics.policy_metrics, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&policy_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get policy metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get policy metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get network metrics + async fn get_network_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let network_data = serde_json::json!({ + "network_io": metrics.network_io, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&network_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get network metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get network metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get disk metrics + async fn get_disk_metrics(&self, request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let disk_data = serde_json::json!({ + "disk_io": metrics.disk_io, + "disk_usage": 
metrics.disk_usage, + }); + + let format = self.get_request_format(&request); + let body = self.format_json_data(&disk_data, format.clone()).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), self.get_content_type(format))], + body, + }) + } + Err(e) => { + error!("Failed to get disk metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get disk metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Get health issues + async fn get_health_issues(&self, _request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "health_issues": metrics.health_issues, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to get health issues: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get health issues: {}", e) + }).to_string(), + }) + } + } + } + + /// Get alerts + async fn get_alerts(&self, request: HttpRequest) -> Result { + let hours = request.query_params + .iter() + .find(|(k, _)| k == "hours") + .and_then(|(_, v)| v.parse::().ok()) + .unwrap_or(24); + + match self.reporter.get_recent_alerts(hours).await { + Ok(alerts) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "alerts": alerts, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed 
to get alerts: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get alerts: {}", e) + }).to_string(), + }) + } + } + } + + /// Get reports + async fn get_reports(&self, _request: HttpRequest) -> Result { + let stats = self.reporter.get_statistics().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "reports": stats, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get comprehensive report + async fn get_comprehensive_report(&self, request: HttpRequest) -> Result { + let query = self.parse_metrics_query(&request)?; + + match self.collector.query_metrics(query).await { + Ok(aggregated) => { + match self.reporter.generate_comprehensive_report(&aggregated).await { + Ok(report) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "report": report, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to generate comprehensive report: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to generate report: {}", e) + }).to_string(), + }) + } + } + } + Err(e) => { + error!("Failed to get metrics for comprehensive report: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to get metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// 
Get Prometheus metrics + async fn get_prometheus_metrics(&self, _request: HttpRequest) -> Result { + match self.collector.collect_metrics().await { + Ok(metrics) => { + let prometheus_data = self.format_prometheus_metrics(&metrics).await?; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "text/plain; version=0.0.4; charset=utf-8".to_string())], + body: prometheus_data, + }) + } + Err(e) => { + error!("Failed to get Prometheus metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "text/plain".to_string())], + body: format!("# ERROR: Failed to get metrics: {}", e), + }) + } + } + } + + /// Backup metrics + async fn backup_metrics(&self, request: HttpRequest) -> Result { + let backup_path = request.query_params + .iter() + .find(|(k, _)| k == "path") + .map(|(_, v)| std::path::PathBuf::from(v)) + .unwrap_or_else(|| std::path::PathBuf::from("/tmp/rustfs/metrics-backup")); + + match self.storage.backup_data(&backup_path).await { + Ok(report) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "backup_report": report, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to backup metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to backup metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Restore metrics + async fn restore_metrics(&self, request: HttpRequest) -> Result { + let backup_path = request.query_params + .iter() + .find(|(k, _)| k == "path") + .map(|(_, v)| std::path::PathBuf::from(v)) + .unwrap_or_else(|| std::path::PathBuf::from("/tmp/rustfs/metrics-backup")); + + match self.storage.restore_data(&backup_path).await 
{ + Ok(report) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "restore_report": report, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to restore metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to restore metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Validate metrics + async fn validate_metrics(&self, _request: HttpRequest) -> Result { + match self.storage.validate_data_integrity().await { + Ok(report) => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "validation_report": report, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + Err(e) => { + error!("Failed to validate metrics: {}", e); + Ok(HttpResponse { + status_code: 500, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Internal Server Error", + "message": format!("Failed to validate metrics: {}", e) + }).to_string(), + }) + } + } + } + + /// Helper methods + fn get_request_format(&self, request: &HttpRequest) -> MetricsFormat { + request.query_params + .iter() + .find(|(k, _)| k == "format") + .and_then(|(_, v)| match v.as_str() { + "prometheus" => Some(MetricsFormat::Prometheus), + "csv" => Some(MetricsFormat::Csv), + "xml" => Some(MetricsFormat::Xml), + _ => Some(MetricsFormat::Json), + }) + .unwrap_or(self.config.default_format.clone()) + } + + fn get_content_type(&self, format: MetricsFormat) -> String { + match format { + MetricsFormat::Json => "application/json".to_string(), + MetricsFormat::Prometheus => 
"text/plain; version=0.0.4; charset=utf-8".to_string(), + MetricsFormat::Csv => "text/csv".to_string(), + MetricsFormat::Xml => "application/xml".to_string(), + } + } + + async fn format_metrics(&self, metrics: &crate::metrics::SystemMetrics, format: MetricsFormat) -> Result { + match format { + MetricsFormat::Json => Ok(serde_json::json!({ + "status": "success", + "metrics": metrics, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string()), + MetricsFormat::Prometheus => self.format_prometheus_metrics(metrics).await, + MetricsFormat::Csv => self.format_csv_metrics(metrics).await, + MetricsFormat::Xml => self.format_xml_metrics(metrics).await, + } + } + + async fn format_aggregated_metrics(&self, aggregated: &crate::metrics::AggregatedMetrics, format: MetricsFormat) -> Result { + match format { + MetricsFormat::Json => Ok(serde_json::json!({ + "status": "success", + "aggregated_metrics": aggregated, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string()), + MetricsFormat::Prometheus => self.format_prometheus_aggregated(aggregated).await, + MetricsFormat::Csv => self.format_csv_aggregated(aggregated).await, + MetricsFormat::Xml => self.format_xml_aggregated(aggregated).await, + } + } + + async fn format_json_data(&self, data: &serde_json::Value, format: MetricsFormat) -> Result { + match format { + MetricsFormat::Json => Ok(serde_json::json!({ + "status": "success", + "data": data, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string()), + _ => Ok(serde_json::json!(data).to_string()), + } + } + + async fn format_prometheus_metrics(&self, metrics: &crate::metrics::SystemMetrics) -> Result { + let mut prometheus_lines = Vec::new(); + + // System metrics + prometheus_lines.push(format!("rustfs_cpu_usage_percent {}", metrics.cpu_usage)); + prometheus_lines.push(format!("rustfs_memory_usage_percent {}", metrics.memory_usage)); + prometheus_lines.push(format!("rustfs_disk_usage_percent {}", metrics.disk_usage)); + 
prometheus_lines.push(format!("rustfs_system_load {}", metrics.system_load)); + prometheus_lines.push(format!("rustfs_active_operations {}", metrics.active_operations)); + + // Network metrics + prometheus_lines.push(format!("rustfs_network_bytes_received_per_sec {}", metrics.network_io.bytes_received_per_sec)); + prometheus_lines.push(format!("rustfs_network_bytes_sent_per_sec {}", metrics.network_io.bytes_sent_per_sec)); + + // Disk metrics + prometheus_lines.push(format!("rustfs_disk_bytes_read_per_sec {}", metrics.disk_io.bytes_read_per_sec)); + prometheus_lines.push(format!("rustfs_disk_bytes_written_per_sec {}", metrics.disk_io.bytes_written_per_sec)); + + // Scan metrics + prometheus_lines.push(format!("rustfs_scan_objects_scanned_total {}", metrics.scan_metrics.objects_scanned)); + prometheus_lines.push(format!("rustfs_scan_bytes_scanned_total {}", metrics.scan_metrics.bytes_scanned)); + + // Heal metrics + prometheus_lines.push(format!("rustfs_heal_total_repairs {}", metrics.heal_metrics.total_repairs)); + prometheus_lines.push(format!("rustfs_heal_successful_repairs {}", metrics.heal_metrics.successful_repairs)); + prometheus_lines.push(format!("rustfs_heal_failed_repairs {}", metrics.heal_metrics.failed_repairs)); + + Ok(prometheus_lines.join("\n")) + } + + async fn format_prometheus_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { + // In a real implementation, this would format aggregated metrics for Prometheus + Ok("# Aggregated metrics not yet implemented for Prometheus format".to_string()) + } + + async fn format_csv_metrics(&self, _metrics: &crate::metrics::SystemMetrics) -> Result { + // In a real implementation, this would format metrics as CSV + Ok("timestamp,cpu_usage,memory_usage,disk_usage\n".to_string()) + } + + async fn format_csv_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { + // In a real implementation, this would format aggregated metrics as CSV + 
Ok("timestamp,avg_cpu_usage,avg_memory_usage,avg_disk_usage\n".to_string()) + } + + async fn format_xml_metrics(&self, _metrics: &crate::metrics::SystemMetrics) -> Result { + // In a real implementation, this would format metrics as XML + Ok("success".to_string()) + } + + async fn format_xml_aggregated(&self, _aggregated: &crate::metrics::AggregatedMetrics) -> Result { + // In a real implementation, this would format aggregated metrics as XML + Ok("success".to_string()) + } + + fn parse_metrics_query(&self, request: &HttpRequest) -> Result { + let start_time = request.query_params + .iter() + .find(|(k, _)| k == "start") + .and_then(|(_, v)| v.parse::().ok()) + .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) + .unwrap_or_else(|| std::time::SystemTime::now() - std::time::Duration::from_secs(3600)); + + let end_time = request.query_params + .iter() + .find(|(k, _)| k == "end") + .and_then(|(_, v)| v.parse::().ok()) + .map(|ts| std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(ts)) + .unwrap_or_else(std::time::SystemTime::now); + + let interval = request.query_params + .iter() + .find(|(k, _)| k == "interval") + .and_then(|(_, v)| v.parse::().ok()) + .map(|secs| std::time::Duration::from_secs(secs)) + .unwrap_or(std::time::Duration::from_secs(300)); + + let metrics = request.query_params + .iter() + .filter(|(k, _)| k == "metric") + .map(|(_, v)| match v.as_str() { + "system" => MetricType::System, + "network" => MetricType::Network, + "disk" => MetricType::DiskIo, + "scan" => MetricType::Scan, + "heal" => MetricType::Heal, + "policy" => MetricType::Policy, + "health" => MetricType::HealthIssues, + _ => MetricType::System, + }) + .collect(); + + let limit = request.query_params + .iter() + .find(|(k, _)| k == "limit") + .and_then(|(_, v)| v.parse::().ok()); + + Ok(MetricsQuery { + start_time, + end_time, + interval, + metrics, + severity_filter: None, + limit, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + 
use crate::metrics::{CollectorConfig, ReporterConfig, StorageConfig}; + + #[tokio::test] + async fn test_metrics_api_creation() { + let config = MetricsApiConfig::default(); + let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); + + assert!(metrics_api.config().enabled); + assert_eq!(metrics_api.config().prefix, "/metrics"); + } + + #[tokio::test] + async fn test_current_metrics() { + let config = MetricsApiConfig::default(); + let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/metrics/current".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = metrics_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("status")); + } + + #[tokio::test] + async fn test_prometheus_metrics() { + let config = MetricsApiConfig::default(); + let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/metrics/prometheus".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; 
+ + let response = metrics_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("rustfs_cpu_usage_percent")); + } + + #[tokio::test] + async fn test_system_metrics() { + let config = MetricsApiConfig::default(); + let collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let metrics_api = MetricsApi::new(config, collector, reporter, storage).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/metrics/system".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = metrics_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("cpu_usage")); + } +} \ No newline at end of file diff --git a/crates/ahm/src/api/mod.rs b/crates/ahm/src/api/mod.rs new file mode 100644 index 00000000..913fa455 --- /dev/null +++ b/crates/ahm/src/api/mod.rs @@ -0,0 +1,504 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! API interfaces for the AHM system +//! +//! Provides REST and gRPC endpoints for: +//! - Administrative operations +//! - Metrics and monitoring +//! 
- System status and control + +pub mod admin_api; +pub mod metrics_api; +pub mod status_api; + +pub use admin_api::{AdminApi, AdminApiConfig}; +pub use metrics_api::{MetricsApi, MetricsApiConfig}; +pub use status_api::{StatusApi, StatusApiConfig}; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; + +use crate::{ + error::Result, + heal::HealEngine, + metrics::{Collector, Reporter, Storage}, + policy::{ScanPolicyEngine as PolicyEngine}, + scanner::{Engine as ScanEngine}, +}; + +/// Configuration for the API server +#[derive(Debug, Clone)] +pub struct ApiConfig { + /// Admin API configuration + pub admin: AdminApiConfig, + /// Metrics API configuration + pub metrics: MetricsApiConfig, + /// Status API configuration + pub status: StatusApiConfig, + /// Server address + pub address: String, + /// Server port + pub port: u16, + /// Enable HTTPS + pub enable_https: bool, + /// SSL certificate path + pub ssl_cert_path: Option, + /// SSL key path + pub ssl_key_path: Option, + /// Request timeout + pub request_timeout: std::time::Duration, + /// Maximum request size + pub max_request_size: usize, + /// Enable CORS + pub enable_cors: bool, + /// CORS origins + pub cors_origins: Vec, + /// Enable rate limiting + pub enable_rate_limiting: bool, + /// Rate limit requests per minute + pub rate_limit_requests_per_minute: u32, +} + +impl Default for ApiConfig { + fn default() -> Self { + Self { + admin: AdminApiConfig::default(), + metrics: MetricsApiConfig::default(), + status: StatusApiConfig::default(), + address: "127.0.0.1".to_string(), + port: 8080, + enable_https: false, + ssl_cert_path: None, + ssl_key_path: None, + request_timeout: std::time::Duration::from_secs(30), + max_request_size: 1024 * 1024, // 1 MB + enable_cors: true, + cors_origins: vec!["*".to_string()], + enable_rate_limiting: true, + rate_limit_requests_per_minute: 1000, + } + } +} + +/// API server that provides HTTP endpoints for AHM functionality +pub struct 
ApiServer { + config: ApiConfig, + admin_api: Arc, + metrics_api: Arc, + status_api: Arc, + scan_engine: Arc, + heal_engine: Arc, + policy_engine: Arc, + metrics_collector: Arc, + metrics_reporter: Arc, + metrics_storage: Arc, +} + +impl ApiServer { + /// Create a new API server + pub async fn new( + config: ApiConfig, + scan_engine: Arc, + heal_engine: Arc, + policy_engine: Arc, + metrics_collector: Arc, + metrics_reporter: Arc, + metrics_storage: Arc, + ) -> Result { + let admin_api = Arc::new(AdminApi::new(config.admin.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?); + let metrics_api = Arc::new(MetricsApi::new(config.metrics.clone(), metrics_collector.clone(), metrics_reporter.clone(), metrics_storage.clone()).await?); + let status_api = Arc::new(StatusApi::new(config.status.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?); + + Ok(Self { + config, + admin_api, + metrics_api, + status_api, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + }) + } + + /// Get the configuration + pub fn config(&self) -> &ApiConfig { + &self.config + } + + /// Start the API server + pub async fn start(&self) -> Result<()> { + // In a real implementation, this would start an HTTP server + // For now, we'll just simulate the server startup + tracing::info!("API server starting on {}:{}", self.config.address, self.config.port); + + if self.config.enable_https { + tracing::info!("HTTPS enabled"); + } + + if self.config.enable_cors { + tracing::info!("CORS enabled with origins: {:?}", self.config.cors_origins); + } + + if self.config.enable_rate_limiting { + tracing::info!("Rate limiting enabled: {} requests/minute", self.config.rate_limit_requests_per_minute); + } + + tracing::info!("API server started successfully"); + Ok(()) + } + + /// Stop the API server + pub async fn stop(&self) -> Result<()> { + tracing::info!("API server stopping"); + tracing::info!("API 
server stopped successfully"); + Ok(()) + } + + /// Get server status + pub async fn status(&self) -> ServerStatus { + ServerStatus { + address: self.config.address.clone(), + port: self.config.port, + https_enabled: self.config.enable_https, + cors_enabled: self.config.enable_cors, + rate_limiting_enabled: self.config.enable_rate_limiting, + admin_api_enabled: true, + metrics_api_enabled: true, + status_api_enabled: true, + } + } + + /// Get admin API + pub fn admin_api(&self) -> &Arc { + &self.admin_api + } + + /// Get metrics API + pub fn metrics_api(&self) -> &Arc { + &self.metrics_api + } + + /// Get status API + pub fn status_api(&self) -> &Arc { + &self.status_api + } + + /// Handle HTTP request + pub async fn handle_request(&self, request: HttpRequest) -> Result { + match request.path.as_str() { + // Admin API routes + path if path.starts_with("/admin") => { + self.admin_api.handle_request(request).await + } + // Metrics API routes + path if path.starts_with("/metrics") => { + self.metrics_api.handle_request(request).await + } + // Status API routes + path if path.starts_with("/status") => { + self.status_api.handle_request(request).await + } + // Health check + "/health" => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "healthy", + "timestamp": chrono::Utc::now().to_rfc3339(), + "version": env!("CARGO_PKG_VERSION") + }).to_string(), + }) + } + // Root endpoint + "/" => { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "service": "RustFS AHM API", + "version": env!("CARGO_PKG_VERSION"), + "endpoints": { + "admin": "/admin", + "metrics": "/metrics", + "status": "/status", + "health": "/health" + } + }).to_string(), + }) + } + // 404 for unknown routes + _ => { + Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), 
"application/json".to_string())], + body: serde_json::json!({ + "error": "Not Found", + "message": "The requested endpoint does not exist" + }).to_string(), + }) + } + } + } +} + +/// HTTP request +#[derive(Debug, Clone)] +pub struct HttpRequest { + pub method: String, + pub path: String, + pub headers: Vec<(String, String)>, + pub body: Option, + pub query_params: Vec<(String, String)>, +} + +/// HTTP response +#[derive(Debug, Clone)] +pub struct HttpResponse { + pub status_code: u16, + pub headers: Vec<(String, String)>, + pub body: String, +} + +/// Server status +#[derive(Debug, Clone)] +pub struct ServerStatus { + pub address: String, + pub port: u16, + pub https_enabled: bool, + pub cors_enabled: bool, + pub rate_limiting_enabled: bool, + pub admin_api_enabled: bool, + pub metrics_api_enabled: bool, + pub status_api_enabled: bool, +} + +/// API endpoint information +#[derive(Debug, Clone)] +pub struct EndpointInfo { + pub path: String, + pub method: String, + pub description: String, + pub parameters: Vec, + pub response_type: String, +} + +/// Parameter information +#[derive(Debug, Clone)] +pub struct ParameterInfo { + pub name: String, + pub parameter_type: String, + pub required: bool, + pub description: String, +} + +/// API documentation +#[derive(Debug, Clone)] +pub struct ApiDocumentation { + pub title: String, + pub version: String, + pub description: String, + pub endpoints: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + heal::HealEngineConfig, + metrics::{CollectorConfig, ReporterConfig, StorageConfig}, + policy::PolicyEngineConfig, + scanner::ScanEngineConfig, + }; + + #[tokio::test] + async fn test_api_server_creation() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = 
Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + assert_eq!(server.config().port, 8080); + assert_eq!(server.config().address, "127.0.0.1"); + } + + #[tokio::test] + async fn test_api_server_start_stop() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + server.start().await.unwrap(); + server.stop().await.unwrap(); + } + + #[tokio::test] + async fn test_api_server_status() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let 
metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + let status = server.status().await; + assert_eq!(status.port, 8080); + assert_eq!(status.address, "127.0.0.1"); + assert!(status.admin_api_enabled); + assert!(status.metrics_api_enabled); + assert!(status.status_api_enabled); + } + + #[tokio::test] + async fn test_health_endpoint() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/health".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = server.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("healthy")); + } + + #[tokio::test] + async fn test_root_endpoint() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine 
= Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = server.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("RustFS AHM API")); + } + + #[tokio::test] + async fn test_404_endpoint() { + let config = ApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap()); + let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap()); + let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap()); + + let server = ApiServer::new( + config, + scan_engine, + heal_engine, + policy_engine, + metrics_collector, + metrics_reporter, + metrics_storage, + ).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/unknown".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = server.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 404); + assert!(response.body.contains("Not Found")); + } +} \ No newline at end of file diff 
--git a/crates/ahm/src/api/status_api.rs b/crates/ahm/src/api/status_api.rs new file mode 100644 index 00000000..ca47b121 --- /dev/null +++ b/crates/ahm/src/api/status_api.rs @@ -0,0 +1,761 @@ +// Copyright 2024 RustFS Team + +use std::sync::Arc; + +use tracing::{debug, error, info, warn}; + +use crate::{ + error::Result, + heal::HealEngine, + policy::{ScanPolicyEngine as PolicyEngine}, + scanner::{Engine as ScanEngine}, +}; + +use super::{HttpRequest, HttpResponse}; + +use serde::{Deserialize, Serialize}; + +/// Configuration for the status API +#[derive(Debug, Clone)] +pub struct StatusApiConfig { + /// Whether to enable status API + pub enabled: bool, + /// Status API prefix + pub prefix: String, + /// Authentication required + pub require_auth: bool, + /// Status token + pub status_token: Option, + /// Rate limiting for status endpoints + pub rate_limit_requests_per_minute: u32, + /// Maximum request body size + pub max_request_size: usize, + /// Enable detailed status information + pub enable_detailed_status: bool, + /// Status cache TTL in seconds + pub status_cache_ttl_seconds: u64, + /// Enable health checks + pub enable_health_checks: bool, + /// Health check timeout + pub health_check_timeout: std::time::Duration, +} + +impl Default for StatusApiConfig { + fn default() -> Self { + Self { + enabled: true, + prefix: "/status".to_string(), + require_auth: false, + status_token: None, + rate_limit_requests_per_minute: 1000, + max_request_size: 1024 * 1024, // 1 MB + enable_detailed_status: true, + status_cache_ttl_seconds: 30, // 30 seconds + enable_health_checks: true, + health_check_timeout: std::time::Duration::from_secs(5), + } + } +} + +/// Status API that provides system status and health information +pub struct StatusApi { + config: StatusApiConfig, + scan_engine: Arc, + heal_engine: Arc, + policy_engine: Arc, +} + +impl StatusApi { + /// Create a new status API + pub async fn new( + config: StatusApiConfig, + scan_engine: Arc, + heal_engine: Arc, + 
policy_engine: Arc, + ) -> Result { + Ok(Self { + config, + scan_engine, + heal_engine, + policy_engine, + }) + } + + /// Get the configuration + pub fn config(&self) -> &StatusApiConfig { + &self.config + } + + /// Handle HTTP request + pub async fn handle_request(&self, request: HttpRequest) -> Result { + // Check authentication if required + if self.config.require_auth { + if !self.authenticate_request(&request).await? { + return Ok(HttpResponse { + status_code: 401, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Unauthorized", + "message": "Authentication required" + }).to_string(), + }) + } + } + + match request.path.as_str() { + // Basic status + "/status" => self.get_status(request).await, + "/status/health" => self.get_health_status(request).await, + "/status/overview" => self.get_overview_status(request).await, + + // Component status + "/status/scan" => self.get_scan_status(request).await, + "/status/heal" => self.get_heal_status(request).await, + "/status/policy" => self.get_policy_status(request).await, + + // Detailed status + "/status/detailed" => self.get_detailed_status(request).await, + "/status/components" => self.get_components_status(request).await, + "/status/resources" => self.get_resources_status(request).await, + + // Health checks + "/status/health/check" => self.perform_health_check(request).await, + "/status/health/readiness" => self.get_readiness_status(request).await, + "/status/health/liveness" => self.get_liveness_status(request).await, + + // System information + "/status/info" => self.get_system_info(request).await, + "/status/version" => self.get_version_info(request).await, + "/status/uptime" => self.get_uptime_info(request).await, + + // Default 404 + _ => Ok(HttpResponse { + status_code: 404, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Not Found", + "message": "Status endpoint not 
found" + }).to_string(), + }), + } + } + + /// Authenticate request + async fn authenticate_request(&self, request: &HttpRequest) -> Result { + if let Some(token) = &self.config.status_token { + // Check for Authorization header + if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") { + if auth_header.1 == format!("Bearer {}", token) { + return Ok(true); + } + } + + // Check for token in query parameters + if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") { + if token_param.1 == *token { + return Ok(true); + } + } + } + + Ok(false) + } + + /// Get basic status + async fn get_status(&self, _request: HttpRequest) -> Result { + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + + let overall_status = if scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running { + "healthy" + } else if scan_status == crate::scanner::Status::Stopped && heal_status == crate::heal::Status::Stopped { + "stopped" + } else { + "degraded" + }; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "overall_status": overall_status, + "components": { + "scan": scan_status, + "heal": heal_status + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get health status + async fn get_health_status(&self, _request: HttpRequest) -> Result { + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + + let is_healthy = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running; + let status_code = if is_healthy { 200 } else { 503 }; + + Ok(HttpResponse { + status_code, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": if 
is_healthy { "healthy" } else { "unhealthy" }, + "components": { + "scan": { + "status": scan_status, + "healthy": scan_status == crate::scanner::Status::Running + }, + "heal": { + "status": heal_status, + "healthy": heal_status == crate::heal::Status::Running + } + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get overview status + async fn get_overview_status(&self, _request: HttpRequest) -> Result { + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + + let scan_config = self.scan_engine.get_config().await; + let heal_config = self.heal_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "overview": { + "scan": { + "status": scan_status, + "enabled": scan_config.enabled, + "scan_interval": scan_config.scan_interval.as_secs() + }, + "heal": { + "status": heal_status, + "enabled": heal_config.auto_heal_enabled, + "max_workers": heal_config.max_workers + } + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get scan status + async fn get_scan_status(&self, _request: HttpRequest) -> Result { + let status = self.scan_engine.status().await; + let config = self.scan_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "scan": { + "status": status, + "enabled": config.enabled, + "scan_interval": config.scan_interval.as_secs(), + "max_concurrent_scans": config.max_concurrent_scans, + "scan_paths": config.scan_paths, + "bandwidth_limit": config.bandwidth_limit + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get heal status + async fn get_heal_status(&self, _request: HttpRequest) -> Result { + let status = 
self.heal_engine.get_status().await; + let config = self.heal_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "heal": { + "status": status, + "enabled": config.auto_heal_enabled, + "max_workers": config.max_workers, + "repair_timeout": config.repair_timeout.as_secs(), + "retry_attempts": config.max_retry_attempts, + "priority_queue_size": config.max_queue_size + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get policy status + async fn get_policy_status(&self, _request: HttpRequest) -> Result { + let policies = self.policy_engine.list_policies().await?; + let config = self.policy_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "policy": { + "enabled": config.enabled, + "total_policies": policies.len(), + "policies": policies, + "evaluation_timeout": config.evaluation_timeout.as_secs(), + "cache_enabled": config.cache_enabled + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get detailed status + async fn get_detailed_status(&self, _request: HttpRequest) -> Result { + if !self.config.enable_detailed_status { + return Ok(HttpResponse { + status_code: 403, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "error": "Forbidden", + "message": "Detailed status is disabled" + }).to_string(), + }); + } + + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + let scan_config = self.scan_engine.get_config().await; + let heal_config = self.heal_engine.get_config().await; + let policy_config = self.policy_engine.get_config().await; + + Ok(HttpResponse { + status_code: 200, + headers: 
vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "detailed_status": { + "scan": { + "status": scan_status, + "config": scan_config + }, + "heal": { + "status": heal_status, + "config": heal_config + }, + "policy": { + "config": policy_config + } + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get components status + async fn get_components_status(&self, _request: HttpRequest) -> Result { + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + + let components = vec![ + serde_json::json!({ + "name": "scan_engine", + "status": scan_status, + "healthy": scan_status == crate::scanner::Status::Running, + "type": "scanner" + }), + serde_json::json!({ + "name": "heal_engine", + "status": heal_status, + "healthy": heal_status == crate::heal::Status::Running, + "type": "healer" + }), + serde_json::json!({ + "name": "policy_engine", + "status": "running", + "healthy": true, + "type": "policy" + }) + ]; + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "components": components, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get resources status + async fn get_resources_status(&self, _request: HttpRequest) -> Result { + // In a real implementation, this would collect actual resource usage + // For now, we'll return simulated data + let resources = serde_json::json!({ + "cpu": { + "usage_percent": 25.5, + "cores": 8, + "load_average": 0.75 + }, + "memory": { + "usage_percent": 60.2, + "total_bytes": 8589934592, // 8 GB + "available_bytes": 3422552064 // ~3.2 GB + }, + "disk": { + "usage_percent": 45.8, + "total_bytes": 107374182400, // 100 GB + "available_bytes": 58133032960 // ~54 GB + }, + "network": { + "bytes_received_per_sec": 1048576, // 1 MB/s + 
"bytes_sent_per_sec": 524288 // 512 KB/s + } + }); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "resources": resources, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Perform health checks + async fn perform_health_checks(&self) -> Result> { + let mut checks = Vec::new(); + let start_time = std::time::Instant::now(); + + // Check scan engine + let scan_start = std::time::Instant::now(); + let scan_status = self.scan_engine.status().await; + let scan_duration = scan_start.elapsed(); + checks.push(HealthCheckResult { + name: "scan_engine".to_string(), + healthy: scan_status == crate::scanner::Status::Running, + message: format!("Scan engine status: {:?}", scan_status), + duration_ms: scan_duration.as_millis() as u64, + }); + + // Check heal engine + let heal_start = std::time::Instant::now(); + let heal_status = self.heal_engine.get_status().await; + let heal_duration = heal_start.elapsed(); + checks.push(HealthCheckResult { + name: "heal_engine".to_string(), + healthy: heal_status == crate::heal::Status::Running, + message: format!("Heal engine status: {:?}", heal_status), + duration_ms: heal_duration.as_millis() as u64, + }); + + // Check policy engine + let policy_start = std::time::Instant::now(); + let policy_result = self.policy_engine.list_policies().await; + let policy_duration = policy_start.elapsed(); + checks.push(HealthCheckResult { + name: "policy_engine".to_string(), + healthy: policy_result.is_ok(), + message: if policy_result.is_ok() { + "Policy engine is responding".to_string() + } else { + format!("Policy engine error: {:?}", policy_result.unwrap_err()) + }, + duration_ms: policy_duration.as_millis() as u64, + }); + + let total_duration = start_time.elapsed(); + info!("Health checks completed in {:?}", total_duration); + + Ok(checks) + } + + /// Perform health check (alias for 
perform_health_checks) + async fn perform_health_check(&self, _request: HttpRequest) -> Result { + let checks = self.perform_health_checks().await?; + let all_healthy = checks.iter().all(|check| check.healthy); + let check_time = std::time::Instant::now().elapsed(); + + Ok(HttpResponse { + status_code: if all_healthy { 200 } else { 503 }, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": if all_healthy { "healthy" } else { "unhealthy" }, + "checks": checks, + "check_time_ms": check_time.as_millis(), + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get readiness status + async fn get_readiness_status(&self, _request: HttpRequest) -> Result { + let scan_status = self.scan_engine.status().await; + let heal_status = self.heal_engine.get_status().await; + + let is_ready = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running; + let status_code = if is_ready { 200 } else { 503 }; + + Ok(HttpResponse { + status_code, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": if is_ready { "ready" } else { "not_ready" }, + "components": { + "scan_engine": scan_status == crate::scanner::Status::Running, + "heal_engine": heal_status == crate::heal::Status::Running + }, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get liveness status + async fn get_liveness_status(&self, _request: HttpRequest) -> Result { + // Liveness check is simple - if we can respond, we're alive + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "alive", + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get system information + async fn get_system_info(&self, _request: HttpRequest) -> Result { + let system_info = serde_json::json!({ 
+ "service": "RustFS AHM", + "version": env!("CARGO_PKG_VERSION"), + "system_info": { + "rust_version": option_env!("RUST_VERSION").unwrap_or("unknown"), + "target_arch": option_env!("TARGET_ARCH").unwrap_or("unknown"), + "target_os": option_env!("TARGET_OS").unwrap_or("unknown"), + "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"), + "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"), + "git_branch": option_env!("VERGEN_GIT_BRANCH").unwrap_or("unknown"), + }, + }); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "system_info": system_info, + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get version information + async fn get_version_info(&self, _request: HttpRequest) -> Result { + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "version": env!("CARGO_PKG_VERSION"), + "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"), + "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"), + "timestamp": chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } + + /// Get uptime information + async fn get_uptime_info(&self, _request: HttpRequest) -> Result { + // In a real implementation, this would track actual uptime + // For now, we'll return simulated data + let uptime_seconds = 3600; // 1 hour + let uptime_duration = std::time::Duration::from_secs(uptime_seconds); + + Ok(HttpResponse { + status_code: 200, + headers: vec![("Content-Type".to_string(), "application/json".to_string())], + body: serde_json::json!({ + "status": "success", + "uptime": { + "seconds": uptime_seconds, + "duration": format!("{:?}", uptime_duration), + "start_time": chrono::Utc::now() - chrono::Duration::seconds(uptime_seconds as i64) + }, + "timestamp": 
chrono::Utc::now().to_rfc3339() + }).to_string(), + }) + } +} + +/// Health check result +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthCheckResult { + pub name: String, + pub healthy: bool, + pub message: String, + pub duration_ms: u64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + heal::HealEngineConfig, + policy::PolicyEngineConfig, + scanner::ScanEngineConfig, + }; + + #[tokio::test] + async fn test_status_api_creation() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + assert!(status_api.config().enabled); + assert_eq!(status_api.config().prefix, "/status"); + } + + #[tokio::test] + async fn test_basic_status() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("overall_status")); + } + + #[tokio::test] + async fn test_health_status() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = 
Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status/health".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("status")); + } + + #[tokio::test] + async fn test_scan_status() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status/scan".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("scan")); + } + + #[tokio::test] + async fn test_heal_status() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status/heal".to_string(), + headers: vec![], + body: None, + 
query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("heal")); + } + + #[tokio::test] + async fn test_version_info() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status/version".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("version")); + } + + #[tokio::test] + async fn test_liveness_status() { + let config = StatusApiConfig::default(); + let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap()); + let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap()); + let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap()); + + let status_api = StatusApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap(); + + let request = HttpRequest { + method: "GET".to_string(), + path: "/status/health/liveness".to_string(), + headers: vec![], + body: None, + query_params: vec![], + }; + + let response = status_api.handle_request(request).await.unwrap(); + assert_eq!(response.status_code, 200); + assert!(response.body.contains("alive")); + } +} \ No newline at end of file diff --git a/crates/ahm/src/core/coordinator.rs b/crates/ahm/src/core/coordinator.rs new file mode 100644 index 00000000..0c82a11b --- /dev/null +++ 
b/crates/ahm/src/core/coordinator.rs @@ -0,0 +1,448 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Core coordinator for the AHM system +//! +//! The coordinator is responsible for: +//! - Event routing and distribution between subsystems +//! - Resource management and allocation +//! - Global state coordination +//! - Cross-system communication + +use std::{ + sync::{Arc, atomic::{AtomicU64, Ordering}}, + time::{Duration, Instant}, +}; + +use tokio::{ + sync::{broadcast, RwLock}, + task::JoinHandle, + time::interval, +}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, warn}; + +use crate::{SystemEvent, metrics}; +use super::{Status, Scheduler, SchedulerConfig}; +use crate::scanner; +use crate::error::Result; +use crate::scanner::{HealthIssue, HealthIssueType, Severity}; + +/// Configuration for the coordinator +#[derive(Debug, Clone)] +pub struct CoordinatorConfig { + /// Event channel buffer size + pub event_buffer_size: usize, + /// Resource monitoring interval + pub resource_monitor_interval: Duration, + /// Maximum number of concurrent operations + pub max_concurrent_operations: usize, + /// Scheduler configuration + pub scheduler: SchedulerConfig, + /// Event channel capacity + pub event_channel_capacity: usize, + /// Health check interval + pub health_check_interval: Duration, + /// Metrics update interval + pub metrics_update_interval: Duration, +} + +impl Default for CoordinatorConfig { 
+ fn default() -> Self { + Self { + event_buffer_size: 10000, + resource_monitor_interval: Duration::from_secs(30), + max_concurrent_operations: 100, + scheduler: SchedulerConfig::default(), + event_channel_capacity: 1024, + health_check_interval: Duration::from_secs(300), + metrics_update_interval: Duration::from_secs(60), + } + } +} + +/// Core coordinator for the AHM system +#[derive(Debug)] +pub struct Coordinator { + /// Configuration + config: CoordinatorConfig, + /// Current status + status: Arc>, + /// Event broadcaster + event_tx: broadcast::Sender, + /// Resource monitor handle + resource_monitor_handle: Arc>>>, + /// Event processor handle + event_processor_handle: Arc>>>, + /// Task scheduler + scheduler: Arc, + /// Metrics collector reference + metrics: Arc, + /// Active operations counter + active_operations: AtomicU64, + /// Cancellation token + cancel_token: CancellationToken, + /// Operation statistics + operation_stats: Arc>, +} + +impl Coordinator { + /// Create a new coordinator + pub async fn new( + config: CoordinatorConfig, + metrics: Arc, + cancel_token: CancellationToken, + ) -> Result { + let (event_tx, _) = broadcast::channel(config.event_buffer_size); + let scheduler = Arc::new(Scheduler::new(config.scheduler.clone()).await?); + + Ok(Self { + config, + status: Arc::new(RwLock::new(Status::Initializing)), + event_tx, + resource_monitor_handle: Arc::new(RwLock::new(None)), + event_processor_handle: Arc::new(RwLock::new(None)), + scheduler, + metrics, + active_operations: AtomicU64::new(0), + cancel_token, + operation_stats: Arc::new(RwLock::new(OperationStatistics::default())), + }) + } + + /// Start the coordinator + pub async fn start(&self) -> Result<()> { + info!("Starting AHM coordinator"); + + // Update status + *self.status.write().await = Status::Running; + + // Start resource monitor + self.start_resource_monitor().await?; + + // Start event processor + self.start_event_processor().await?; + + // Start scheduler + 
self.scheduler.start().await?; + + info!("AHM coordinator started successfully"); + Ok(()) + } + + /// Stop the coordinator + pub async fn stop(&self) -> Result<()> { + info!("Stopping AHM coordinator"); + + // Update status + *self.status.write().await = Status::Stopping; + + // Stop scheduler + self.scheduler.stop().await?; + + // Stop resource monitor + if let Some(handle) = self.resource_monitor_handle.write().await.take() { + handle.abort(); + } + + // Stop event processor + if let Some(handle) = self.event_processor_handle.write().await.take() { + handle.abort(); + } + + *self.status.write().await = Status::Stopped; + info!("AHM coordinator stopped"); + Ok(()) + } + + /// Get current status + pub async fn status(&self) -> Status { + self.status.read().await.clone() + } + + /// Subscribe to system events + pub fn subscribe_events(&self) -> broadcast::Receiver { + self.event_tx.subscribe() + } + + /// Publish a system event + pub async fn publish_event(&self, event: SystemEvent) -> Result<()> { + debug!("Publishing system event: {:?}", event); + + // Update operation statistics + self.update_operation_stats(&event).await; + + // Send to all subscribers + if let Err(e) = self.event_tx.send(event.clone()) { + warn!("Failed to publish event: {:?}", e); + } + + // Record the event in metrics + self.metrics.record_health_issue(&HealthIssue { + issue_type: HealthIssueType::Unknown, + severity: Severity::Low, + bucket: "system".to_string(), + object: "coordinator".to_string(), + description: format!("System event: {:?}", event), + metadata: None, + }).await?; + + Ok(()) + } + + /// Get system resource usage + pub async fn get_resource_usage(&self) -> metrics::ResourceUsage { + metrics::ResourceUsage { + disk_usage: metrics::DiskUsage { + total_bytes: 1_000_000_000, + used_bytes: 500_000_000, + available_bytes: 500_000_000, + usage_percentage: 50.0, + }, + memory_usage: metrics::MemoryUsage { + total_bytes: 16_000_000_000, + used_bytes: 4_000_000_000, + 
available_bytes: 12_000_000_000, + usage_percentage: 25.0, + }, + network_usage: metrics::NetworkUsage { + bytes_received: 1_000_000, + bytes_sent: 500_000, + packets_received: 1000, + packets_sent: 500, + }, + cpu_usage: metrics::CpuUsage { + usage_percentage: 0.25, + cores: 8, + load_average: 1.5, + }, + } + } + + /// Get operation statistics + pub async fn get_operation_statistics(&self) -> OperationStatistics { + self.operation_stats.read().await.clone() + } + + /// Get active operations count + pub fn get_active_operations_count(&self) -> u64 { + self.active_operations.load(Ordering::Relaxed) + } + + /// Register an active operation + pub fn register_operation(&self) -> OperationGuard { + let count = self.active_operations.fetch_add(1, Ordering::Relaxed); + debug!("Registered operation, active count: {}", count + 1); + OperationGuard::new(&self.active_operations) + } + + /// Start the resource monitor + async fn start_resource_monitor(&self) -> Result<()> { + let cancel_token = self.cancel_token.clone(); + let _event_tx = self.event_tx.clone(); + let interval_duration = self.config.resource_monitor_interval; + + let handle = tokio::spawn(async move { + let mut interval = interval(interval_duration); + + loop { + tokio::select! { + _ = cancel_token.cancelled() => { + debug!("Resource monitor cancelled"); + break; + } + _ = interval.tick() => { + // This would collect real resource metrics + // For now, we'll skip the actual collection + debug!("Resource monitor tick"); + } + } + } + }); + + *self.resource_monitor_handle.write().await = Some(handle); + Ok(()) + } + + /// Start the event processor + async fn start_event_processor(&self) -> Result<()> { + let mut event_rx = self.event_tx.subscribe(); + let cancel_token = self.cancel_token.clone(); + + let handle = tokio::spawn(async move { + loop { + tokio::select! 
 {
                    _ = cancel_token.cancelled() => {
                        debug!("Event processor cancelled");
                        break;
                    }
                    event = event_rx.recv() => {
                        match event {
                            Ok(event) => {
                                debug!("Processing system event: {:?}", event);
                                // Process the event (e.g., route to specific handlers)
                            }
                            Err(e) => {
                                // Includes `Lagged` when this receiver falls
                                // behind the broadcast buffer.
                                warn!("Event processor error: {:?}", e);
                            }
                        }
                    }
                }
            }
        });

        *self.event_processor_handle.write().await = Some(handle);
        Ok(())
    }

    /// Update operation statistics based on events.
    /// Each event kind maps to exactly one counter bump; severity of a
    /// detected issue is additionally bucketed into per-severity counters.
    async fn update_operation_stats(&self, event: &SystemEvent) {
        let mut stats = self.operation_stats.write().await;

        match event {
            SystemEvent::ObjectDiscovered { .. } => {
                stats.objects_discovered += 1;
            }
            SystemEvent::HealthIssueDetected(issue) => {
                stats.health_issues_detected += 1;
                match issue.severity {
                    scanner::Severity::Critical => stats.critical_issues += 1,
                    scanner::Severity::High => stats.high_priority_issues += 1,
                    scanner::Severity::Medium => stats.medium_priority_issues += 1,
                    scanner::Severity::Low => stats.low_priority_issues += 1,
                }
            }
            SystemEvent::HealCompleted(result) => {
                if result.success {
                    stats.heal_operations_succeeded += 1;
                } else {
                    stats.heal_operations_failed += 1;
                }
            }
            SystemEvent::ScanCompleted(_) => {
                stats.scan_cycles_completed += 1;
            }
            SystemEvent::ResourceUsageUpdated { .. } => {
                stats.resource_updates += 1;
            }
        }

        stats.last_updated = Instant::now();
    }
}

/// RAII guard for tracking active operations: increments the shared counter
/// on construction and decrements it on drop.
pub struct OperationGuard<'a> {
    active_operations: &'a AtomicU64,
}

impl<'a> OperationGuard<'a> {
    // Increments the counter exactly once; the matching decrement is in Drop.
    pub fn new(active_operations: &'a AtomicU64) -> Self {
        active_operations.fetch_add(1, Ordering::Relaxed);
        Self { active_operations }
    }
}

impl Drop for OperationGuard<'_> {
    fn drop(&mut self) {
        self.active_operations.fetch_sub(1, Ordering::Relaxed);
    }
}

/// Operation statistics tracked by the coordinator (all counters monotonic
/// within a process lifetime; `last_updated` is the wall-clock-free Instant
/// of the most recent event).
#[derive(Debug, Clone)]
pub struct OperationStatistics {
    pub objects_discovered: u64,
    pub health_issues_detected: u64,
    pub heal_operations_succeeded: u64,
    pub heal_operations_failed: u64,
    pub scan_cycles_completed: u64,
    pub resource_updates: u64,
    pub critical_issues: u64,
    pub high_priority_issues: u64,
    pub medium_priority_issues: u64,
    pub low_priority_issues: u64,
    pub last_updated: Instant,
}

impl Default for OperationStatistics {
    // Manual impl required because `Instant` has no `Default`.
    fn default() -> Self {
        Self {
            objects_discovered: 0,
            health_issues_detected: 0,
            heal_operations_succeeded: 0,
            heal_operations_failed: 0,
            scan_cycles_completed: 0,
            resource_updates: 0,
            critical_issues: 0,
            high_priority_issues: 0,
            medium_priority_issues: 0,
            low_priority_issues: 0,
            last_updated: Instant::now(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::CollectorConfig;

    #[tokio::test]
    async fn test_coordinator_lifecycle() {
        let config = CoordinatorConfig::default();
        let metrics_config = CollectorConfig::default();
        let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap());
        let cancel_token = CancellationToken::new();

        let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap();

        // Test initial status
        assert_eq!(coordinator.status().await, Status::Initializing);

        // Start coordinator
        coordinator.start().await.unwrap();
        assert_eq!(coordinator.status().await, Status::Running);

        // Stop coordinator
        coordinator.stop().await.unwrap();
        assert_eq!(coordinator.status().await, Status::Stopped);
    }

    // Specifies the intended counter semantics: exactly one increment per
    // guard, restored on scope exit.
    #[tokio::test]
    async fn test_operation_guard() {
        let config = CoordinatorConfig::default();
        let metrics_config = CollectorConfig::default();
        let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap());
        let cancel_token = CancellationToken::new();

        let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap();

        assert_eq!(coordinator.get_active_operations_count(), 0);

        {
            let _guard1 = coordinator.register_operation();
            assert_eq!(coordinator.get_active_operations_count(), 1);

            {
                let _guard2 = coordinator.register_operation();
                assert_eq!(coordinator.get_active_operations_count(), 2);
            }

            assert_eq!(coordinator.get_active_operations_count(), 1);
        }

        assert_eq!(coordinator.get_active_operations_count(), 0);
    }
}

// ---------------------------------------------------------------------------
// (patch boundary) new file: crates/ahm/src/core/lifecycle.rs
// ---------------------------------------------------------------------------

// Copyright 2024 RustFS Team

use crate::error::Result;

/// Placeholder configuration for lifecycle management (no fields yet).
#[derive(Debug, Clone, Default)]
pub struct LifecycleConfig {}

/// Stub lifecycle manager; `start`/`stop` are currently no-ops.
pub struct LifecycleManager {}

impl LifecycleManager {
    /// NOTE(review): the return type parameter was lost in the patch
    /// mangling; presumably `Result<Self>` — confirm against the repo.
    pub async fn new(_config: LifecycleConfig) -> Result<Self> {
        Ok(Self {})
    }

    pub async fn start(&self) -> Result<()> {
        Ok(())
    }

    pub async fn stop(&self) -> Result<()> {
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// (patch boundary) new file: crates/ahm/src/core/mod.rs
// ---------------------------------------------------------------------------

// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Core coordination and lifecycle management for the AHM system

pub mod coordinator;
pub mod scheduler;
pub mod lifecycle;

pub use coordinator::{Coordinator, CoordinatorConfig};
pub use scheduler::{Scheduler, SchedulerConfig, Task, TaskPriority};
pub use lifecycle::{LifecycleManager, LifecycleConfig};

/// Status of the core coordination system.
/// Lifecycle order used by the coordinator: Initializing -> Running ->
/// Stopping -> Stopped; `Degraded`/`Error` are out-of-band states.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Status {
    /// System is initializing
    Initializing,
    /// System is running normally
    Running,
    /// System is degraded but operational
    Degraded,
    /// System is shutting down
    Stopping,
    /// System has stopped
    Stopped,
    /// System encountered an error (human-readable reason)
    Error(String),
}

// ---------------------------------------------------------------------------
// (patch boundary) new file: crates/ahm/src/core/scheduler.rs
// ---------------------------------------------------------------------------

// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//!
 Task scheduler for the AHM system

use std::{
    collections::{BinaryHeap, HashMap},
    sync::{Arc, atomic::{AtomicU64, Ordering}},
    time::{Duration, Instant},
};

use tokio::{
    sync::RwLock,
    task::JoinHandle,
};
use uuid::Uuid;

use crate::error::Result;

/// Task scheduler configuration
#[derive(Debug, Clone)]
pub struct SchedulerConfig {
    /// Maximum number of concurrent tasks
    pub max_concurrent_tasks: usize,
    /// Default task timeout
    pub default_timeout: Duration,
    /// Queue capacity
    pub queue_capacity: usize,
    // Priority assigned to tasks that do not specify one.
    pub default_task_priority: TaskPriority,
}

impl Default for SchedulerConfig {
    fn default() -> Self {
        Self {
            max_concurrent_tasks: 10,
            default_timeout: Duration::from_secs(300), // 5 minutes
            queue_capacity: 1000,
            default_task_priority: TaskPriority::Normal,
        }
    }
}

/// Task priority levels.
/// NOTE(review): here HIGHER discriminant = more urgent (Critical = 3),
/// which is the OPPOSITE convention from `heal::HealPriority` (Critical = 0).
/// Consider unifying the two to avoid ordering mistakes.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum TaskPriority {
    Low = 0,
    Normal = 1,
    High = 2,
    Critical = 3,
}

/// A scheduled task
#[derive(Debug, Clone)]
pub struct Task {
    pub id: Uuid,
    pub priority: TaskPriority,
    // Earliest time the task may run (now + delay for delayed tasks).
    pub scheduled_time: Instant,
    pub timeout: Duration,
    pub task_type: TaskType,
    pub payload: TaskPayload,
}

impl Task {
    /// Create a task with Normal priority, immediate scheduling, and a
    /// 5-minute timeout; refine with the builder-style `with_*` methods.
    pub fn new(task_type: TaskType, payload: TaskPayload) -> Self {
        Self {
            id: Uuid::new_v4(),
            priority: TaskPriority::Normal,
            scheduled_time: Instant::now(),
            timeout: Duration::from_secs(300),
            task_type,
            payload,
        }
    }

    pub fn with_priority(mut self, priority: TaskPriority) -> Self {
        self.priority = priority;
        self
    }

    pub fn with_timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    pub fn with_delay(mut self, delay: Duration) -> Self {
        self.scheduled_time = Instant::now() + delay;
        self
    }
}

/// Types of tasks that can be scheduled
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TaskType {
    Scan,
    Heal,
    Cleanup,
    Maintenance,
    Report,
}

/// Task payload data
+#[derive(Debug, Clone)] +pub enum TaskPayload { + Scan { + bucket: Option, + object_prefix: Option, + deep_scan: bool, + }, + Heal { + bucket: String, + object: String, + version_id: Option, + }, + Cleanup { + older_than: Duration, + }, + Maintenance { + operation: String, + }, + Report { + report_type: String, + }, +} + +/// Task scheduler +#[allow(dead_code)] +#[derive(Debug)] +pub struct Scheduler { + config: SchedulerConfig, + task_queue: Arc>>, + active_tasks: Arc>>>, + task_counter: AtomicU64, + worker_handles: Arc>>>, +} + +impl Scheduler { + pub async fn new(config: SchedulerConfig) -> Result { + Ok(Self { + config, + task_queue: Arc::new(RwLock::new(BinaryHeap::new())), + active_tasks: Arc::new(RwLock::new(HashMap::new())), + task_counter: AtomicU64::new(0), + worker_handles: Arc::new(RwLock::new(Vec::new())), + }) + } + + pub async fn start(&self) -> Result<()> { + // Start worker tasks + // Implementation would go here + Ok(()) + } + + pub async fn stop(&self) -> Result<()> { + // Stop all workers and drain queues + // Implementation would go here + Ok(()) + } + + pub async fn schedule_task(&self, task: Task) -> Result { + let task_id = task.id; + let prioritized_task = PrioritizedTask { + task, + sequence: self.task_counter.fetch_add(1, Ordering::Relaxed), + }; + + self.task_queue.write().await.push(prioritized_task); + Ok(task_id) + } + + pub async fn cancel_task(&self, task_id: Uuid) -> Result { + if let Some(handle) = self.active_tasks.write().await.remove(&task_id) { + handle.abort(); + Ok(true) + } else { + Ok(false) + } + } +} + +/// Task wrapper for priority queue ordering +#[derive(Debug)] +struct PrioritizedTask { + task: Task, + sequence: u64, +} + +impl PartialEq for PrioritizedTask { + fn eq(&self, other: &Self) -> bool { + self.task.priority == other.task.priority && self.sequence == other.sequence + } +} + +impl Eq for PrioritizedTask {} + +impl PartialOrd for PrioritizedTask { + fn partial_cmp(&self, other: &Self) -> Option { + 
Some(self.cmp(other)) + } +} + +impl Ord for PrioritizedTask { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Higher priority first, then by sequence number for fairness + other.task.priority.cmp(&self.task.priority) + .then_with(|| self.sequence.cmp(&other.sequence)) + } +} + +#[derive(Debug, Clone)] +pub struct ScheduledTask { + pub id: Uuid, + pub task_type: TaskType, + pub priority: TaskPriority, + pub created_at: Instant, +} \ No newline at end of file diff --git a/crates/ahm/src/heal/engine.rs b/crates/ahm/src/heal/engine.rs new file mode 100644 index 00000000..ba90108f --- /dev/null +++ b/crates/ahm/src/heal/engine.rs @@ -0,0 +1,438 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::{
    sync::{mpsc, RwLock},
    time::sleep,
};
use tracing::{error, info, warn};
use uuid::Uuid;

use crate::error::Result;
use super::{HealConfig, HealPriority, HealResult, HealStatistics, HealTask, Status};

/// Main healing engine that coordinates repair operations.
/// All shared state is Arc'd so the spawned healing loop can own clones.
/// NOTE(review): inner type parameters were lost in the patch mangling;
/// reconstructed from usage below — confirm against the repo.
pub struct HealEngine {
    config: HealConfig,
    status: Arc<RwLock<Status>>,
    statistics: Arc<RwLock<HealStatistics>>,
    // Pending tasks; ordered by priority at cycle time, not on insert.
    task_queue: Arc<RwLock<Vec<HealTask>>>,
    // Tasks currently being processed, keyed by task id.
    active_tasks: Arc<RwLock<HashMap<String, HealTask>>>,
    // NOTE(review): grows without bound — consider capping or rotating.
    completed_tasks: Arc<RwLock<Vec<HealResult>>>,
    // Present only between start() and stop().
    shutdown_tx: Option<mpsc::Sender<()>>,
}

impl HealEngine {
    /// Create a new healing engine in the `Initializing` state.
    pub fn new(config: HealConfig) -> Self {
        Self {
            config,
            status: Arc::new(RwLock::new(Status::Initializing)),
            statistics: Arc::new(RwLock::new(HealStatistics::default())),
            task_queue: Arc::new(RwLock::new(Vec::new())),
            active_tasks: Arc::new(RwLock::new(HashMap::new())),
            completed_tasks: Arc::new(RwLock::new(Vec::new())),
            shutdown_tx: None,
        }
    }

    /// Start the healing engine: spawns the periodic healing loop, which
    /// runs one cycle per `config.heal_interval` tick until shutdown.
    pub async fn start(&mut self) -> Result<()> {
        info!("Starting heal engine");

        let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1);
        self.shutdown_tx = Some(shutdown_tx);

        // Update status
        {
            let mut status = self.status.write().await;
            *status = Status::Idle;
        }

        let config = self.config.clone();
        let status = Arc::clone(&self.status);
        let statistics = Arc::clone(&self.statistics);
        let task_queue = Arc::clone(&self.task_queue);
        let active_tasks = Arc::clone(&self.active_tasks);
        let completed_tasks = Arc::clone(&self.completed_tasks);

        // Start the main healing loop
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(config.heal_interval);

            loop {
                tokio::select!
{ + _ = interval.tick() => { + if let Err(e) = Self::process_healing_cycle( + &config, + &status, + &statistics, + &task_queue, + &active_tasks, + &completed_tasks, + ).await { + error!("Healing cycle failed: {}", e); + } + } + _ = shutdown_rx.recv() => { + info!("Shutdown signal received, stopping heal engine"); + break; + } + } + } + + // Update status to stopped + let mut status = status.write().await; + *status = Status::Stopped; + }); + + info!("Heal engine started successfully"); + Ok(()) + } + + /// Stop the healing engine + pub async fn stop(&mut self) -> Result<()> { + info!("Stopping heal engine"); + + // Update status + { + let mut status = self.status.write().await; + *status = Status::Stopping; + } + + // Send shutdown signal + if let Some(shutdown_tx) = &self.shutdown_tx { + let _ = shutdown_tx.send(()).await; + } + + // Wait for engine to stop + let mut attempts = 0; + while attempts < 10 { + let status = self.status.read().await; + if *status == Status::Stopped { + break; + } + drop(status); + sleep(Duration::from_millis(100)).await; + attempts += 1; + } + + info!("Heal engine stopped"); + Ok(()) + } + + /// Add a healing task to the queue + pub async fn add_task(&self, task: HealTask) -> Result<()> { + let task_id = task.id.clone(); + let queue = Arc::clone(&self.task_queue); + + // Add task to priority queue + queue.write().await.push(task); + + info!("Added healing task to queue: {}", task_id); + Ok(()) + } + + /// Get current engine status + pub async fn status(&self) -> Status { + self.status.read().await.clone() + } + + /// Get current engine status (alias for status) + pub async fn get_status(&self) -> Status { + self.status.read().await.clone() + } + + /// Get engine configuration + pub async fn get_config(&self) -> HealConfig { + self.config.clone() + } + + /// Get healing statistics + pub async fn statistics(&self) -> HealStatistics { + self.statistics.read().await.clone() + } + + /// Get completed healing results + pub async fn 
completed_results(&self) -> Vec { + self.completed_tasks.read().await.clone() + } + + /// Process a single healing cycle + async fn process_healing_cycle( + config: &HealConfig, + status: &Arc>, + statistics: &Arc>, + task_queue: &Arc>>, + active_tasks: &Arc>>, + completed_tasks: &Arc>>, + ) -> Result<()> { + // Update status to healing + { + let mut status = status.write().await; + *status = Status::Healing; + } + + // Get ready tasks from queue + let mut queue = task_queue.write().await; + let mut ready_tasks = Vec::new(); + let mut remaining_tasks = Vec::new(); + + for task in queue.drain(..) { + if task.is_ready() { + ready_tasks.push(task); + } else { + remaining_tasks.push(task); + } + } + + // Sort ready tasks by priority + ready_tasks.sort_by(|a, b| a.priority.cmp(&b.priority)); + + // Process ready tasks + let active_count = active_tasks.read().await.len(); + let max_concurrent = config.max_workers.saturating_sub(active_count); + + for task in ready_tasks.into_iter().take(max_concurrent) { + if let Err(e) = Self::process_task( + config, + statistics, + active_tasks, + completed_tasks, + task, + ).await { + error!("Failed to process healing task: {}", e); + } + } + + // Put remaining tasks back in queue + queue.extend(remaining_tasks); + + // Update statistics + { + let mut stats = statistics.write().await; + stats.queued_tasks = queue.len() as u64; + stats.active_workers = active_tasks.read().await.len() as u64; + } + + // Update status back to idle + { + let mut status = status.write().await; + *status = Status::Idle; + } + + Ok(()) + } + + /// Process a single healing task + async fn process_task( + config: &HealConfig, + statistics: &Arc>, + active_tasks: &Arc>>, + completed_tasks: &Arc>>, + task: HealTask, + ) -> Result<()> { + let task_id = task.id.clone(); + + // Add task to active tasks + { + let mut active = active_tasks.write().await; + active.insert(task_id.clone(), task.clone()); + } + + // Update statistics + { + let mut stats = 
 statistics.write().await;
            stats.total_repairs += 1;
            // NOTE(review): `active_tasks.read()` is awaited while the
            // statistics write lock is held — safe while the engine loop is
            // the only caller, but a latent lock-ordering hazard.
            stats.active_workers = active_tasks.read().await.len() as u64;
        }

        info!("Processing healing task: {}", task_id);

        // Simulate healing operation
        let start_time = Instant::now();
        let result = Self::perform_healing_operation(&task, config).await;
        let duration = start_time.elapsed();

        // Create heal result
        let heal_result = HealResult {
            success: result.is_ok(),
            original_issue: task.issue.clone(),
            repair_duration: duration,
            retry_attempts: task.retry_count,
            error_message: result.err().map(|e| e.to_string()),
            metadata: None,
            completed_at: SystemTime::now(),
        };

        // Update statistics (running average recomputed from the totals)
        {
            let mut stats = statistics.write().await;
            if heal_result.success {
                stats.successful_repairs += 1;
            } else {
                stats.failed_repairs += 1;
            }
            stats.total_repair_time += duration;
            stats.average_repair_time = if stats.total_repairs > 0 {
                Duration::from_secs_f64(
                    stats.total_repair_time.as_secs_f64() / stats.total_repairs as f64
                )
            } else {
                Duration::ZERO
            };
            stats.last_repair_time = Some(SystemTime::now());
            stats.total_retry_attempts += task.retry_count as u64;
        }

        // Add result to completed tasks
        // NOTE(review): `completed_tasks` is never pruned — unbounded growth.
        {
            let mut completed = completed_tasks.write().await;
            completed.push(heal_result.clone());
        }

        // Remove task from active tasks
        {
            let mut active = active_tasks.write().await;
            active.remove(&task_id);
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.active_workers = active_tasks.read().await.len() as u64;
        }

        if heal_result.success {
            info!("Healing task completed successfully: {}", task_id);
        } else {
            warn!("Healing task failed: {}", task_id);
        }

        Ok(())
    }

    /// Perform the actual healing operation.
    /// Currently a simulation: each issue type sleeps for a fixed duration
    /// and logs; real repair logic is not implemented yet.
    async fn perform_healing_operation(task: &HealTask, _config: &HealConfig) -> Result<()> {
        // Simulate healing operation based on issue type
        match task.issue.issue_type {
            crate::scanner::HealthIssueType::MissingReplica => {
                // Simulate replica repair
                sleep(Duration::from_millis(100)).await;
                info!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object);
            }
            crate::scanner::HealthIssueType::ChecksumMismatch => {
                // Simulate checksum repair
                sleep(Duration::from_millis(200)).await;
                info!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object);
            }
            crate::scanner::HealthIssueType::DiskReadError => {
                // Simulate disk error recovery
                sleep(Duration::from_millis(300)).await;
                info!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object);
            }
            _ => {
                // Generic repair for other issue types
                sleep(Duration::from_millis(150)).await;
                info!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object);
            }
        }

        // Simulate occasional failures for testing: every third retry fails.
        if task.retry_count > 0 && task.retry_count % 3 == 0 {
            return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated healing failure")));
        }

        Ok(())
    }

    /// Start healing operations (status flag only; the loop is spawned by
    /// `start()`).
    pub async fn start_healing(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Running;
        info!("Healing operations started");
        Ok(())
    }

    /// Stop healing operations (status flag only).
    pub async fn stop_healing(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Stopped;
        info!("Healing operations stopped");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_heal_engine_creation() {
        let config = HealConfig::default();
        let engine = HealEngine::new(config);

        assert_eq!(engine.status().await, Status::Initializing);
    }

    #[tokio::test]
    async fn test_heal_engine_start_stop() {
        let config = HealConfig::default();
        let mut engine = HealEngine::new(config);

        // Start engine
        engine.start().await.unwrap();
sleep(Duration::from_millis(100)).await; + + // Check status + let status = engine.status().await; + assert!(matches!(status, Status::Idle | Status::Healing)); + + // Stop engine + engine.stop().await.unwrap(); + sleep(Duration::from_millis(100)).await; + + // Check status + let status = engine.status().await; + assert_eq!(status, Status::Stopped); + } + + #[tokio::test] + async fn test_add_healing_task() { + let config = HealConfig::default(); + let engine = HealEngine::new(config); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + let task = HealTask::new(issue); + engine.add_task(task).await.unwrap(); + + let stats = engine.statistics().await; + assert_eq!(stats.queued_tasks, 1); + } +} \ No newline at end of file diff --git a/crates/ahm/src/heal/mod.rs b/crates/ahm/src/heal/mod.rs new file mode 100644 index 00000000..9e7847c9 --- /dev/null +++ b/crates/ahm/src/heal/mod.rs @@ -0,0 +1,360 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Healing subsystem for the AHM system +//! +//! The heal subsystem provides intelligent repair capabilities: +//! - Priority-based healing queue +//! - Real-time and background healing modes +//! - Comprehensive repair validation +//! 
 - Adaptive healing strategies

pub mod engine;
pub mod priority_queue;
pub mod repair_worker;
pub mod validation;

pub use engine::HealEngine;
pub use priority_queue::PriorityQueue;
pub use repair_worker::RepairWorker;
pub use validation::HealValidator;

use std::time::{Duration, SystemTime};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
// NOTE(review): `Builder` does not appear to be used in this module —
// confirm before removing the import.
use derive_builder::Builder;

use crate::scanner::{HealthIssue, HealthIssueType, Severity};

/// Configuration for the healing system
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealConfig {
    /// Maximum number of concurrent repair workers
    pub max_workers: usize,
    /// Maximum number of tasks in the priority queue
    pub max_queue_size: usize,
    /// Timeout for individual repair operations
    pub repair_timeout: Duration,
    /// Interval between healing cycles
    pub heal_interval: Duration,
    /// Whether to enable automatic healing
    pub auto_heal_enabled: bool,
    /// Maximum number of retry attempts for failed repairs
    pub max_retry_attempts: u32,
    /// Backoff delay between retry attempts
    pub retry_backoff_delay: Duration,
    /// Whether to validate repairs after completion
    pub validate_after_repair: bool,
}

impl Default for HealConfig {
    fn default() -> Self {
        Self {
            max_workers: 4,
            max_queue_size: 1000,
            repair_timeout: Duration::from_secs(300), // 5 minutes
            heal_interval: Duration::from_secs(60),   // 1 minute
            auto_heal_enabled: true,
            max_retry_attempts: 3,
            retry_backoff_delay: Duration::from_secs(30),
            validate_after_repair: true,
        }
    }
}

/// Result of a healing operation
#[derive(Debug, Clone)]
pub struct HealResult {
    /// Whether the healing operation was successful
    pub success: bool,
    /// The original health issue that was addressed
    pub original_issue: HealthIssue,
    /// Time taken to complete the repair
    pub repair_duration: Duration,
    /// Number of retry attempts made
    pub retry_attempts: u32,
    /// Error message if repair failed
    /// NOTE(review): the type parameter was lost in the patch mangling;
    /// `Option<String>` matches `result.err().map(|e| e.to_string())` in the
    /// engine — confirm.
    pub error_message: Option<String>,
    /// Additional metadata about the repair
    /// NOTE(review): inner type lost in mangling; only `None` is ever
    /// assigned in the visible code — confirm the intended type.
    pub metadata: Option<String>,
    /// Timestamp when the repair was completed
    pub completed_at: SystemTime,
}

/// Statistics for the healing system
#[derive(Debug, Clone, Default)]
pub struct HealStatistics {
    /// Total number of repair tasks processed
    pub total_repairs: u64,
    /// Number of successful repairs
    pub successful_repairs: u64,
    /// Number of failed repairs
    pub failed_repairs: u64,
    /// Number of tasks currently in queue
    pub queued_tasks: u64,
    /// Number of active workers
    pub active_workers: u64,
    /// Total time spent on repairs
    pub total_repair_time: Duration,
    /// Average repair time
    pub average_repair_time: Duration,
    /// Last repair completion time
    pub last_repair_time: Option<SystemTime>,
    /// Number of retry attempts made
    pub total_retry_attempts: u64,
}

/// Priority levels for healing tasks.
/// NOTE(review): here LOWER discriminant = more urgent (Critical = 0), the
/// OPPOSITE of `core::scheduler::TaskPriority` (Critical = 3). The engine's
/// ascending sort depends on this; consider unifying the two conventions.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum HealPriority {
    /// Critical issues that need immediate attention
    Critical = 0,
    /// High priority issues
    High = 1,
    /// Medium priority issues
    Medium = 2,
    /// Low priority issues
    Low = 3,
}

impl From<Severity> for HealPriority {
    // Straight severity-to-priority mapping.
    fn from(severity: Severity) -> Self {
        match severity {
            Severity::Critical => HealPriority::Critical,
            Severity::High => HealPriority::High,
            Severity::Medium => HealPriority::Medium,
            Severity::Low => HealPriority::Low,
        }
    }
}

/// A healing task to be processed
#[derive(Debug, Clone)]
pub struct HealTask {
    /// Unique identifier for the task (UUID v4 string)
    pub id: String,
    /// The health issue to be repaired
    pub issue: HealthIssue,
    /// Priority level for this task
    pub priority: HealPriority,
    /// When the task was created
    pub created_at: SystemTime,
    /// When the task should be processed (for delayed tasks)
    pub scheduled_at: Option<SystemTime>,
    /// Number of retry attempts made
    pub retry_count: u32,
    /// Maximum number of retry attempts allowed
    pub max_retries: u32,
    /// Additional context for the repair operation
    /// NOTE(review): inner type lost in the patch mangling — confirm.
    pub context: Option<String>,
}

impl HealTask {
    /// Create a new healing task; priority is derived from the issue's
    /// severity, max_retries defaults to 3.
    /// NOTE(review): `issue.severity` is read by value before `issue` is
    /// moved into the struct — this compiles only if `Severity: Copy`;
    /// confirm against the scanner module.
    pub fn new(issue: HealthIssue) -> Self {
        let priority = HealPriority::from(issue.severity);
        Self {
            id: uuid::Uuid::new_v4().to_string(),
            issue,
            priority,
            created_at: SystemTime::now(),
            scheduled_at: None,
            retry_count: 0,
            max_retries: 3,
            context: None,
        }
    }

    /// Create a delayed healing task that becomes ready after `delay`.
    pub fn delayed(issue: HealthIssue, delay: Duration) -> Self {
        let mut task = Self::new(issue);
        task.scheduled_at = Some(SystemTime::now() + delay);
        task
    }

    /// Check if the task is ready to be processed (wall-clock based; a
    /// backwards clock step can make delayed tasks ready early).
    pub fn is_ready(&self) -> bool {
        if let Some(scheduled_at) = self.scheduled_at {
            SystemTime::now() >= scheduled_at
        } else {
            true
        }
    }

    /// Check if the task can be retried
    pub fn can_retry(&self) -> bool {
        self.retry_count < self.max_retries
    }

    /// Increment the retry count
    pub fn increment_retry(&mut self) {
        self.retry_count += 1;
    }
}

/// Heal engine status
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Status {
    /// Heal engine is initializing
    Initializing,
    /// Heal engine is idle
    Idle,
    /// Heal engine is running normally
    Running,
    /// Heal engine is actively healing
    Healing,
    /// Heal engine is paused
    Paused,
    /// Heal engine is stopping
    Stopping,
    /// Heal engine has stopped
    Stopped,
    /// Heal engine encountered an error
    Error(String),
}

/// Healing operation modes
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealMode {
    /// Real-time healing during GET/PUT operations
    RealTime,
    /// Background healing during scheduled scans
    Background,
    /// On-demand healing triggered by admin
    OnDemand,
    /// Emergency healing for critical issues
    Emergency,
}

/// Validation result for a repaired object
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationResult {
    /// Type of
 validation performed
    pub validation_type: ValidationType,
    /// Whether validation passed
    pub passed: bool,
    /// Details about the validation
    pub details: String,
    /// Time taken for validation
    pub duration: Duration,
}

/// Types of validation that can be performed
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ValidationType {
    /// Checksum verification
    Checksum,
    /// Shard count verification
    ShardCount,
    /// Data integrity check
    DataIntegrity,
    /// Metadata consistency check
    MetadataConsistency,
    /// Cross-shard redundancy check
    RedundancyCheck,
}

/// Healing strategies for different scenarios
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealStrategy {
    /// Repair using available data shards
    DataShardRepair,
    /// Repair using parity shards
    ParityShardRepair,
    /// Hybrid repair using both data and parity
    HybridRepair,
    /// Metadata-only repair
    MetadataRepair,
    /// Full object reconstruction
    FullReconstruction,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_heal_priority_from_severity() {
        assert_eq!(HealPriority::from(Severity::Critical), HealPriority::Critical);
        assert_eq!(HealPriority::from(Severity::High), HealPriority::High);
        assert_eq!(HealPriority::from(Severity::Medium), HealPriority::Medium);
        assert_eq!(HealPriority::from(Severity::Low), HealPriority::Low);
    }

    #[test]
    fn test_heal_task_creation() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue.clone());
        assert_eq!(task.priority, HealPriority::Critical);
        assert_eq!(task.issue.bucket, issue.bucket);
        assert_eq!(task.issue.object, issue.object);
        assert_eq!(task.retry_count, 0);
        assert_eq!(task.max_retries, 3);
        assert!(task.is_ready());
    }

    // NOTE(review): this test really sleeps ~1.1s — slow for a unit test;
    // consider shrinking the delay.
    #[test]
    fn test_delayed_heal_task() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Medium,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let delay = Duration::from_secs(1);
        let task = HealTask::delayed(issue, delay);

        assert!(task.scheduled_at.is_some());
        assert!(!task.is_ready()); // Should not be ready immediately

        // Wait for the delay to pass
        std::thread::sleep(delay + Duration::from_millis(100));
        assert!(task.is_ready());
    }

    #[test]
    fn test_heal_task_retry_logic() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let mut task = HealTask::new(issue);
        assert!(task.can_retry());

        task.increment_retry();
        assert_eq!(task.retry_count, 1);
        assert!(task.can_retry());

        task.increment_retry();
        task.increment_retry();
        assert_eq!(task.retry_count, 3);
        assert!(!task.can_retry());
    }
}

// ---------------------------------------------------------------------------
// (patch boundary) new file: crates/ahm/src/heal/priority_queue.rs
// ---------------------------------------------------------------------------

// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + collections::BinaryHeap, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, info, warn}; + +use crate::error::Result; +use super::{HealPriority, HealTask}; + +/// Priority queue for healing tasks +pub struct PriorityQueue { + tasks: Arc>>, + max_size: usize, + statistics: Arc>, +} + +/// Statistics for the priority queue +#[derive(Debug, Clone, Default)] +pub struct QueueStatistics { + /// Total number of tasks added to the queue + pub total_tasks_added: u64, + /// Total number of tasks removed from the queue + pub total_tasks_removed: u64, + /// Current number of tasks in the queue + pub current_queue_size: u64, + /// Maximum queue size reached + pub max_queue_size_reached: u64, + /// Number of tasks rejected due to queue being full + pub tasks_rejected: u64, + /// Average time tasks spend in queue + pub average_queue_time: Duration, + /// Total time all tasks have spent in queue + pub total_queue_time: Duration, +} + +impl PriorityQueue { + /// Create a new priority queue + pub fn new(max_size: usize) -> Self { + Self { + tasks: Arc::new(RwLock::new(BinaryHeap::new())), + max_size, + statistics: Arc::new(RwLock::new(QueueStatistics::default())), + } + } + + /// Add a task to the queue + pub async fn push(&self, task: HealTask) -> Result<()> { + let mut tasks = self.tasks.write().await; + let mut stats = self.statistics.write().await; + + if tasks.len() >= self.max_size { + stats.tasks_rejected += 1; + warn!("Priority queue is full, rejecting task: {}", task.id); + return Err(crate::error::Error::Other(anyhow::anyhow!("Queue is full"))); + } + + let task_id = task.id.clone(); + let priority = task.priority.clone(); + tasks.push(task); + stats.total_tasks_added += 1; + stats.current_queue_size = tasks.len() as u64; + stats.max_queue_size_reached = stats.max_queue_size_reached.max(tasks.len() as 
u64); + + debug!("Added task to priority queue: {} (priority: {:?})", task_id, priority); + Ok(()) + } + + /// Remove and return the highest priority task + pub async fn pop(&self) -> Option { + let mut tasks = self.tasks.write().await; + let mut stats = self.statistics.write().await; + + if let Some(task) = tasks.pop() { + stats.total_tasks_removed += 1; + stats.current_queue_size = tasks.len() as u64; + + // Update queue time statistics + let queue_time = SystemTime::now().duration_since(task.created_at).unwrap_or(Duration::ZERO); + stats.total_queue_time += queue_time; + stats.average_queue_time = if stats.total_tasks_removed > 0 { + Duration::from_secs_f64( + stats.total_queue_time.as_secs_f64() / stats.total_tasks_removed as f64 + ) + } else { + Duration::ZERO + }; + + debug!("Removed task from priority queue: {} (priority: {:?})", task.id, task.priority); + Some(task) + } else { + None + } + } + + /// Peek at the highest priority task without removing it + pub async fn peek(&self) -> Option { + let tasks = self.tasks.read().await; + tasks.peek().cloned() + } + + /// Get the current size of the queue + pub async fn len(&self) -> usize { + self.tasks.read().await.len() + } + + /// Check if the queue is empty + pub async fn is_empty(&self) -> bool { + self.tasks.read().await.is_empty() + } + + /// Get queue statistics + pub async fn statistics(&self) -> QueueStatistics { + self.statistics.read().await.clone() + } + + /// Clear all tasks from the queue + pub async fn clear(&self) { + let mut tasks = self.tasks.write().await; + let mut stats = self.statistics.write().await; + + let cleared_count = tasks.len(); + tasks.clear(); + stats.current_queue_size = 0; + + info!("Cleared {} tasks from priority queue", cleared_count); + } + + /// Get all tasks that are ready to be processed + pub async fn get_ready_tasks(&self, max_count: usize) -> Vec { + let mut tasks = self.tasks.write().await; + let mut ready_tasks = Vec::new(); + let mut remaining_tasks = Vec::new(); + + 
while let Some(task) = tasks.pop() { + if task.is_ready() && ready_tasks.len() < max_count { + ready_tasks.push(task); + } else { + remaining_tasks.push(task); + } + } + + // Put remaining tasks back + for task in remaining_tasks { + tasks.push(task); + } + + ready_tasks + } + + /// Remove a specific task by ID + pub async fn remove_task(&self, task_id: &str) -> bool { + let mut tasks = self.tasks.write().await; + let mut stats = self.statistics.write().await; + + let mut temp_tasks = Vec::new(); + let mut found = false; + + while let Some(task) = tasks.pop() { + if task.id == task_id { + found = true; + stats.total_tasks_removed += 1; + debug!("Removed specific task from queue: {}", task_id); + } else { + temp_tasks.push(task); + } + } + + // Put remaining tasks back + for task in temp_tasks { + tasks.push(task); + } + + stats.current_queue_size = tasks.len() as u64; + found + } + + /// Get tasks by priority level + pub async fn get_tasks_by_priority(&self, priority: HealPriority) -> Vec { + let mut tasks = self.tasks.write().await; + let mut matching_tasks = Vec::new(); + let mut other_tasks = Vec::new(); + + while let Some(task) = tasks.pop() { + if task.priority == priority { + matching_tasks.push(task); + } else { + other_tasks.push(task); + } + } + + // Put other tasks back + for task in other_tasks { + tasks.push(task); + } + + matching_tasks + } + + /// Update task priority + pub async fn update_priority(&self, task_id: &str, new_priority: HealPriority) -> bool { + let mut tasks = self.tasks.write().await; + + let mut temp_tasks = Vec::new(); + let mut found = false; + + while let Some(mut task) = tasks.pop() { + if task.id == task_id { + task.priority = new_priority.clone(); + found = true; + debug!("Updated task priority: {} -> {:?}", task_id, new_priority); + } + temp_tasks.push(task); + } + + // Put all tasks back + for task in temp_tasks { + tasks.push(task); + } + + found + } +} + +// Implement Ord for HealTask to enable priority queue functionality 
impl std::cmp::Ord for HealTask {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // BinaryHeap is a max-heap: the task that compares greatest pops
        // first, so higher priority must compare greater (the existing tests
        // pop Critical before Low). Among equal priorities the created_at
        // comparison is REVERSED so that older tasks compare greater and pop
        // first (FIFO) — the original compared created_at directly, which
        // made equal-priority tasks pop newest-first and could starve old
        // tasks. The unique task id is the final tie-breaker so the ordering
        // is total and agrees with the id-based `eq` below.
        self.priority
            .cmp(&other.priority)
            .then_with(|| other.created_at.cmp(&self.created_at))
            .then_with(|| self.id.cmp(&other.id))
    }
}

impl std::cmp::PartialOrd for HealTask {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl std::cmp::PartialEq for HealTask {
    /// Identity comparison: two tasks are equal iff they share the same id.
    /// Consistent with `cmp` above because ids (uuids) are assumed unique
    /// and serve as its final tie-breaker.
    fn eq(&self, other: &Self) -> bool {
        self.id == other.id
    }
}

impl std::cmp::Eq for HealTask {}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_priority_queue_creation() {
        let queue = PriorityQueue::new(100);
        assert_eq!(queue.len().await, 0);
        assert!(queue.is_empty().await);
    }

    #[tokio::test]
    async fn test_priority_queue_push_pop() {
        let queue = PriorityQueue::new(10);

        let issue1 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "bucket1".to_string(),
            object: "object1".to_string(),
            description: "Test issue 1".to_string(),
            metadata: None,
        };

        let issue2 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "bucket2".to_string(),
            object: "object2".to_string(),
            description: "Test issue 2".to_string(),
            metadata: None,
        };

        let task1 = HealTask::new(issue1);
        let task2 = HealTask::new(issue2);

        // Add tasks
        queue.push(task1.clone()).await.unwrap();
        queue.push(task2.clone()).await.unwrap();

        assert_eq!(queue.len().await, 2);

        // Critical task should come first
        let first_task = queue.pop().await.unwrap();
        assert_eq!(first_task.priority, HealPriority::Critical);
        assert_eq!(first_task.id, task2.id);

        let second_task = queue.pop().await.unwrap();
        assert_eq!(second_task.priority, HealPriority::Low);
        assert_eq!(second_task.id, task1.id);

        assert!(queue.is_empty().await);
    }

    #[tokio::test]
    async fn
test_priority_queue_full() { + let queue = PriorityQueue::new(1); + + let issue1 = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Low, + bucket: "bucket1".to_string(), + object: "object1".to_string(), + description: "Test issue 1".to_string(), + metadata: None, + }; + + let issue2 = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: "bucket2".to_string(), + object: "object2".to_string(), + description: "Test issue 2".to_string(), + metadata: None, + }; + + let task1 = HealTask::new(issue1); + let task2 = HealTask::new(issue2); + + // First task should succeed + queue.push(task1).await.unwrap(); + assert_eq!(queue.len().await, 1); + + // Second task should fail + let result = queue.push(task2).await; + assert!(result.is_err()); + assert_eq!(queue.len().await, 1); + + let stats = queue.statistics().await; + assert_eq!(stats.tasks_rejected, 1); + } + + #[tokio::test] + async fn test_priority_queue_remove_task() { + let queue = PriorityQueue::new(10); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Medium, + bucket: "bucket1".to_string(), + object: "object1".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + let task = HealTask::new(issue); + let task_id = task.id.clone(); + + queue.push(task).await.unwrap(); + assert_eq!(queue.len().await, 1); + + // Remove the task + let removed = queue.remove_task(&task_id).await; + assert!(removed); + assert_eq!(queue.len().await, 0); + + // Try to remove non-existent task + let removed = queue.remove_task("non-existent").await; + assert!(!removed); + } + + #[tokio::test] + async fn test_priority_queue_update_priority() { + let queue = PriorityQueue::new(10); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Low, + bucket: "bucket1".to_string(), + object: "object1".to_string(), + description: "Test issue".to_string(), 
+ metadata: None, + }; + + let task = HealTask::new(issue); + let task_id = task.id.clone(); + + queue.push(task).await.unwrap(); + + // Update priority + let updated = queue.update_priority(&task_id, HealPriority::Critical).await; + assert!(updated); + + // Check that the task now has higher priority + let popped_task = queue.pop().await.unwrap(); + assert_eq!(popped_task.priority, HealPriority::Critical); + assert_eq!(popped_task.id, task_id); + } +} \ No newline at end of file diff --git a/crates/ahm/src/heal/repair_worker.rs b/crates/ahm/src/heal/repair_worker.rs new file mode 100644 index 00000000..018e62a5 --- /dev/null +++ b/crates/ahm/src/heal/repair_worker.rs @@ -0,0 +1,505 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::{ + sync::{mpsc, RwLock}, + time::{sleep, timeout}, +}; +use tracing::{debug, error, info, warn}; + +use crate::error::Result; +use super::{HealConfig, HealResult, HealTask, Status}; + +/// Configuration for repair workers +#[derive(Debug, Clone)] +pub struct RepairWorkerConfig { + /// Worker ID + pub worker_id: String, + /// Maximum time to spend on a single repair operation + pub operation_timeout: Duration, + /// Whether to enable detailed logging + pub enable_detailed_logging: bool, + /// Maximum number of concurrent operations + pub max_concurrent_operations: usize, + /// Retry configuration + pub retry_config: RetryConfig, +} + +/// Retry configuration for repair operations +#[derive(Debug, Clone)] +pub struct RetryConfig { + /// Maximum number of retry attempts + pub max_attempts: u32, + /// Initial backoff delay + pub initial_backoff: Duration, + /// Maximum backoff delay + pub max_backoff: Duration, + /// Backoff multiplier + pub backoff_multiplier: f64, + /// Whether to use exponential backoff + pub exponential_backoff: bool, +} + +impl Default for RepairWorkerConfig { + fn default() -> Self { + Self { + worker_id: "worker-1".to_string(), + operation_timeout: Duration::from_secs(300), // 5 minutes + enable_detailed_logging: true, + max_concurrent_operations: 1, + retry_config: RetryConfig::default(), + } + } +} + +impl Default for RetryConfig { + fn default() -> Self { + Self { + max_attempts: 3, + initial_backoff: Duration::from_secs(1), + max_backoff: Duration::from_secs(60), + backoff_multiplier: 2.0, + exponential_backoff: true, + } + } +} + +/// Statistics for a repair worker +#[derive(Debug, Clone, Default)] +pub struct WorkerStatistics { + /// Total number of tasks processed + pub total_tasks_processed: u64, + /// Number of successful repairs + pub successful_repairs: u64, + /// Number of failed repairs + pub failed_repairs: u64, + /// Total time spent on repairs 
+ pub total_repair_time: Duration, + /// Average repair time + pub average_repair_time: Duration, + /// Number of retry attempts made + pub total_retry_attempts: u64, + /// Current worker status + pub status: WorkerStatus, + /// Last task completion time + pub last_task_time: Option, +} + +/// Worker status +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum WorkerStatus { + /// Worker is idle + Idle, + /// Worker is processing a task + Processing, + /// Worker is retrying a failed task + Retrying, + /// Worker is stopping + Stopping, + /// Worker has stopped + Stopped, + /// Worker encountered an error + Error(String), +} + +impl Default for WorkerStatus { + fn default() -> Self { + WorkerStatus::Idle + } +} + +/// Repair worker that executes healing tasks +pub struct RepairWorker { + config: RepairWorkerConfig, + statistics: Arc>, + status: Arc>, + result_tx: mpsc::Sender, + shutdown_tx: Option>, +} + +impl RepairWorker { + /// Create a new repair worker + pub fn new( + config: RepairWorkerConfig, + result_tx: mpsc::Sender, + ) -> Self { + Self { + config, + statistics: Arc::new(RwLock::new(WorkerStatistics::default())), + status: Arc::new(RwLock::new(WorkerStatus::Idle)), + result_tx, + shutdown_tx: None, + } + } + + /// Start the repair worker + pub async fn start(&mut self) -> Result<()> { + info!("Starting repair worker: {}", self.config.worker_id); + + let (_task_tx, task_rx) = mpsc::channel(100); + let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1); + + self.shutdown_tx = Some(shutdown_tx); + + // Update status + { + let mut status = self.status.write().await; + *status = WorkerStatus::Idle; + } + + let config = self.config.clone(); + let statistics = Arc::clone(&self.statistics); + let status = Arc::clone(&self.status); + let result_tx = self.result_tx.clone(); + + // Start the worker loop + tokio::spawn(async move { + let mut task_rx = task_rx; + + loop { + tokio::select! 
{ + Some(task) = task_rx.recv() => { + if let Err(e) = Self::process_task( + &config, + &statistics, + &status, + &result_tx, + task, + ).await { + error!("Failed to process task: {}", e); + } + } + _ = shutdown_rx.recv() => { + info!("Shutdown signal received, stopping worker: {}", config.worker_id); + break; + } + } + } + + // Update status to stopped + let mut status = status.write().await; + *status = WorkerStatus::Stopped; + }); + + info!("Repair worker started: {}", self.config.worker_id); + Ok(()) + } + + /// Stop the repair worker + pub async fn stop(&mut self) -> Result<()> { + info!("Stopping repair worker: {}", self.config.worker_id); + + // Update status + { + let mut status = self.status.write().await; + *status = WorkerStatus::Stopping; + } + + // Send shutdown signal + if let Some(shutdown_tx) = &self.shutdown_tx { + let _ = shutdown_tx.send(()).await; + } + + // Wait for worker to stop + let mut attempts = 0; + while attempts < 10 { + let status = self.status.read().await; + if *status == WorkerStatus::Stopped { + break; + } + drop(status); + sleep(Duration::from_millis(100)).await; + attempts += 1; + } + + info!("Repair worker stopped: {}", self.config.worker_id); + Ok(()) + } + + /// Submit a task to the worker + pub async fn submit_task(&self, _task: HealTask) -> Result<()> { + // TODO: Implement task submission + Err(crate::error::Error::Other(anyhow::anyhow!("Task submission not implemented"))) + } + + /// Get worker statistics + pub async fn statistics(&self) -> WorkerStatistics { + self.statistics.read().await.clone() + } + + /// Get worker status + pub async fn status(&self) -> WorkerStatus { + self.status.read().await.clone() + } + + /// Process a single task + async fn process_task( + config: &RepairWorkerConfig, + statistics: &Arc>, + status: &Arc>, + result_tx: &mpsc::Sender, + task: HealTask, + ) -> Result<()> { + let task_id = task.id.clone(); + + // Update status to processing + { + let mut status = status.write().await; + *status = 
WorkerStatus::Processing;
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.total_tasks_processed += 1;
            stats.status = WorkerStatus::Processing;
        }

        info!("Processing repair task: {} (worker: {})", task_id, config.worker_id);

        let start_time = Instant::now();
        let mut attempt = 0;
        let mut last_error = None;

        // Retry loop: attempt the repair up to max_attempts times, backing
        // off between attempts.
        while attempt < config.retry_config.max_attempts {
            attempt += 1;

            if attempt > 1 {
                // Update status to retrying
                {
                    let mut status = status.write().await;
                    *status = WorkerStatus::Retrying;
                }

                // Calculate backoff delay. Scaling is done with
                // Duration::mul_f64 so fractional multipliers take effect:
                // the previous `(f64) as u32` cast truncated the factor to an
                // integer, flattening e.g. 1.5 to 1 (no growth) and anything
                // below 1.0 to a zero delay. The factor is floored at 1.0 so
                // the delay never drops below initial_backoff (this also
                // keeps mul_f64's non-negative requirement satisfied).
                let backoff_delay = if config.retry_config.exponential_backoff {
                    let factor = config
                        .retry_config
                        .backoff_multiplier
                        .powi((attempt - 1) as i32)
                        .max(1.0);
                    config
                        .retry_config
                        .initial_backoff
                        .mul_f64(factor)
                        .min(config.retry_config.max_backoff)
                } else {
                    config.retry_config.initial_backoff
                };

                warn!("Retrying task {} (attempt {}/{}), waiting {:?}",
                    task_id, attempt, config.retry_config.max_attempts, backoff_delay);
                sleep(backoff_delay).await;
            }

            // Attempt the repair operation, bounded by the configured timeout.
            let result = timeout(
                config.operation_timeout,
                Self::perform_repair_operation(&task, config)
            ).await;

            match result {
                Ok(Ok(())) => {
                    // Success
                    let duration = start_time.elapsed();
                    let heal_result = HealResult {
                        success: true,
                        original_issue: task.issue.clone(),
                        repair_duration: duration,
                        retry_attempts: attempt - 1,
                        error_message: None,
                        metadata: None,
                        completed_at: SystemTime::now(),
                    };

                    // Send result
                    if let Err(e) = result_tx.send(heal_result).await {
                        error!("Failed to send heal result: {}", e);
                    }

                    // Update statistics
                    {
                        let mut stats = statistics.write().await;
                        stats.successful_repairs += 1;
                        stats.total_repair_time += duration;
                        stats.average_repair_time = if stats.total_tasks_processed > 0 {
                            Duration::from_secs_f64(
                                stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64
                            )
                        } else {
Duration::ZERO + }; + stats.total_retry_attempts += (attempt - 1) as u64; + stats.last_task_time = Some(SystemTime::now()); + stats.status = WorkerStatus::Idle; + } + + info!("Successfully completed repair task: {} (worker: {})", task_id, config.worker_id); + return Ok(()); + } + Ok(Err(e)) => { + // Operation failed + let error_msg = e.to_string(); + last_error = Some(e); + warn!("Repair operation failed for task {} (attempt {}/{}): {}", + task_id, attempt, config.retry_config.max_attempts, error_msg); + } + Err(_) => { + // Operation timed out + last_error = Some(crate::error::Error::Other(anyhow::anyhow!("Operation timed out"))); + warn!("Repair operation timed out for task {} (attempt {}/{})", + task_id, attempt, config.retry_config.max_attempts); + } + } + } + + // All attempts failed + let duration = start_time.elapsed(); + let heal_result = HealResult { + success: false, + original_issue: task.issue.clone(), + repair_duration: duration, + retry_attempts: attempt - 1, + error_message: last_error.map(|e| e.to_string()), + metadata: None, + completed_at: SystemTime::now(), + }; + + // Send result + if let Err(e) = result_tx.send(heal_result).await { + error!("Failed to send heal result: {}", e); + } + + // Update statistics + { + let mut stats = statistics.write().await; + stats.failed_repairs += 1; + stats.total_repair_time += duration; + stats.average_repair_time = if stats.total_tasks_processed > 0 { + Duration::from_secs_f64( + stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64 + ) + } else { + Duration::ZERO + }; + stats.total_retry_attempts += (attempt - 1) as u64; + stats.last_task_time = Some(SystemTime::now()); + stats.status = WorkerStatus::Idle; + } + + error!("Failed to complete repair task after {} attempts: {} (worker: {})", + attempt, task_id, config.worker_id); + Ok(()) + } + + /// Perform the actual repair operation + async fn perform_repair_operation(task: &HealTask, config: &RepairWorkerConfig) -> Result<()> { + if 
config.enable_detailed_logging { + debug!("Starting repair operation for task: {} (worker: {})", task.id, config.worker_id); + } + + // Simulate repair operation based on issue type + match task.issue.issue_type { + crate::scanner::HealthIssueType::MissingReplica => { + // Simulate replica repair + sleep(Duration::from_millis(100)).await; + if config.enable_detailed_logging { + debug!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object); + } + } + crate::scanner::HealthIssueType::ChecksumMismatch => { + // Simulate checksum repair + sleep(Duration::from_millis(200)).await; + if config.enable_detailed_logging { + debug!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object); + } + } + crate::scanner::HealthIssueType::DiskReadError => { + // Simulate disk error recovery + sleep(Duration::from_millis(300)).await; + if config.enable_detailed_logging { + debug!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object); + } + } + _ => { + // Generic repair for other issue types + sleep(Duration::from_millis(150)).await; + if config.enable_detailed_logging { + debug!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object); + } + } + } + + // Simulate occasional failures for testing + if task.retry_count > 0 && task.retry_count % 3 == 0 { + return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated repair failure"))); + } + + if config.enable_detailed_logging { + debug!("Completed repair operation for task: {} (worker: {})", task.id, config.worker_id); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::{HealthIssue, HealthIssueType, Severity}; + + #[tokio::test] + async fn test_repair_worker_creation() { + let config = RepairWorkerConfig::default(); + let (result_tx, _result_rx) = mpsc::channel(100); + let worker = RepairWorker::new(config, result_tx); + + assert_eq!(worker.status().await, WorkerStatus::Idle); + } + + #[tokio::test] + async 
fn test_repair_worker_start_stop() { + let config = RepairWorkerConfig::default(); + let (result_tx, _result_rx) = mpsc::channel(100); + let mut worker = RepairWorker::new(config, result_tx); + + // Start worker + worker.start().await.unwrap(); + sleep(Duration::from_millis(100)).await; + + // Check status + let status = worker.status().await; + assert_eq!(status, WorkerStatus::Idle); + + // Stop worker + worker.stop().await.unwrap(); + sleep(Duration::from_millis(100)).await; + + // Check status + let status = worker.status().await; + assert_eq!(status, WorkerStatus::Stopped); + } + + #[tokio::test] + async fn test_repair_worker_statistics() { + let config = RepairWorkerConfig::default(); + let (result_tx, _result_rx) = mpsc::channel(100); + let worker = RepairWorker::new(config, result_tx); + + let stats = worker.statistics().await; + assert_eq!(stats.total_tasks_processed, 0); + assert_eq!(stats.successful_repairs, 0); + assert_eq!(stats.failed_repairs, 0); + assert_eq!(stats.status, WorkerStatus::Idle); + } +} \ No newline at end of file diff --git a/crates/ahm/src/heal/validation.rs b/crates/ahm/src/heal/validation.rs new file mode 100644 index 00000000..2bb481f0 --- /dev/null +++ b/crates/ahm/src/heal/validation.rs @@ -0,0 +1,453 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, error, info, warn}; + +use crate::error::Result; +use super::{HealResult, HealTask}; + +/// Configuration for validation operations +#[derive(Debug, Clone)] +pub struct ValidationConfig { + /// Whether to enable validation after repair + pub enable_post_repair_validation: bool, + /// Timeout for validation operations + pub validation_timeout: Duration, + /// Whether to enable detailed validation logging + pub enable_detailed_logging: bool, + /// Maximum number of validation retries + pub max_validation_retries: u32, + /// Validation retry delay + pub validation_retry_delay: Duration, +} + +impl Default for ValidationConfig { + fn default() -> Self { + Self { + enable_post_repair_validation: true, + validation_timeout: Duration::from_secs(60), // 1 minute + max_validation_retries: 3, + validation_retry_delay: Duration::from_secs(5), + enable_detailed_logging: true, + } + } +} + +/// Validation result for a repair operation +#[derive(Debug, Clone)] +pub struct ValidationResult { + /// Whether validation passed + pub passed: bool, + /// Validation type + pub validation_type: ValidationType, + /// Detailed validation message + pub message: String, + /// Time taken for validation + pub duration: Duration, + /// Validation timestamp + pub timestamp: SystemTime, + /// Additional validation metadata + pub metadata: Option, +} + +/// Types of validation that can be performed +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ValidationType { + /// Checksum validation + Checksum, + /// File existence validation + FileExistence, + /// File size validation + FileSize, + /// File permissions validation + FilePermissions, + /// Metadata consistency validation + MetadataConsistency, + /// Replication status validation + ReplicationStatus, + /// Data integrity validation + DataIntegrity, + /// Custom validation + Custom(String), +} + +/// 
Statistics for validation operations +#[derive(Debug, Clone, Default)] +pub struct ValidationStatistics { + /// Total number of validations performed + pub total_validations: u64, + /// Number of successful validations + pub successful_validations: u64, + /// Number of failed validations + pub failed_validations: u64, + /// Total time spent on validation + pub total_validation_time: Duration, + /// Average validation time + pub average_validation_time: Duration, + /// Number of validation retries + pub total_validation_retries: u64, + /// Last validation time + pub last_validation_time: Option, +} + +/// Validator for repair operations +pub struct HealValidator { + config: ValidationConfig, + statistics: Arc>, +} + +impl HealValidator { + /// Create a new validator + pub fn new(config: ValidationConfig) -> Self { + Self { + config, + statistics: Arc::new(RwLock::new(ValidationStatistics::default())), + } + } + + /// Validate a repair operation + pub async fn validate_repair(&self, task: &HealTask, result: &HealResult) -> Result> { + if !self.config.enable_post_repair_validation { + return Ok(Vec::new()); + } + + let start_time = Instant::now(); + let mut validation_results = Vec::new(); + + info!("Starting validation for repair task: {}", task.id); + + // Perform different types of validation based on the issue type + match task.issue.issue_type { + crate::scanner::HealthIssueType::MissingReplica => { + validation_results.extend(self.validate_replica_repair(task, result).await?); + } + crate::scanner::HealthIssueType::ChecksumMismatch => { + validation_results.extend(self.validate_checksum_repair(task, result).await?); + } + crate::scanner::HealthIssueType::DiskReadError => { + validation_results.extend(self.validate_disk_repair(task, result).await?); + } + _ => { + validation_results.extend(self.validate_generic_repair(task, result).await?); + } + } + + let duration = start_time.elapsed(); + + // Update statistics + { + let mut stats = 
self.statistics.write().await;
            stats.total_validations += validation_results.len() as u64;
            stats.total_validation_time += duration;
            // Recompute the running mean from the accumulated totals.
            stats.average_validation_time = if stats.total_validations > 0 {
                Duration::from_secs_f64(stats.total_validation_time.as_secs_f64() / stats.total_validations as f64)
            } else {
                Duration::ZERO
            };
            stats.last_validation_time = Some(SystemTime::now());

            let successful_count = validation_results.iter().filter(|r| r.passed).count();
            let failed_count = validation_results.len() - successful_count;
            stats.successful_validations += successful_count as u64;
            stats.failed_validations += failed_count as u64;
        }

        if self.config.enable_detailed_logging {
            debug!(
                "Validation completed for task {}: {} passed, {} failed",
                task.id,
                validation_results.iter().filter(|r| r.passed).count(),
                validation_results.iter().filter(|r| !r.passed).count()
            );
        }

        Ok(validation_results)
    }

    /// Validate a replica repair: the object must exist and its replication
    /// status must be healthy.
    async fn validate_replica_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        Ok(vec![
            self.validate_file_existence(&task.issue.bucket, &task.issue.object).await,
            self.validate_replication_status(&task.issue.bucket, &task.issue.object).await,
        ])
    }

    /// Validate a checksum repair: checksum plus overall data integrity.
    async fn validate_checksum_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        Ok(vec![
            self.validate_checksum(&task.issue.bucket, &task.issue.object).await,
            self.validate_data_integrity(&task.issue.bucket, &task.issue.object).await,
        ])
    }

    /// Validate a disk repair: file existence plus file permissions.
    async fn validate_disk_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        Ok(vec![
            self.validate_file_existence(&task.issue.bucket, &task.issue.object).await,
            self.validate_file_permissions(&task.issue.bucket, &task.issue.object).await,
        ])
    }

    /// Validate a generic repair: file existence plus metadata consistency.
    async fn validate_generic_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        Ok(vec![
            self.validate_file_existence(&task.issue.bucket, &task.issue.object).await,
            self.validate_metadata_consistency(&task.issue.bucket, &task.issue.object).await,
        ])
    }

    /// Shared body for all simulated post-repair checks.
    ///
    /// Sleeps for `delay` to model the cost of the real probe and always
    /// reports success. TODO: replace the simulation with real storage-layer
    /// checks before this is relied upon in production.
    async fn simulated_check(
        &self,
        validation_type: ValidationType,
        delay: Duration,
        label: &str,
        bucket: &str,
        object: &str,
    ) -> ValidationResult {
        let start_time = Instant::now();
        tokio::time::sleep(delay).await;
        ValidationResult {
            passed: true, // simulated checks always pass
            validation_type,
            message: format!("{} validation for {}/{}", label, bucket, object),
            duration: start_time.elapsed(),
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate file existence (simulated).
    async fn validate_file_existence(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(ValidationType::FileExistence, Duration::from_millis(10), "File existence", bucket, object)
            .await
    }

    /// Validate checksum (simulated).
    async fn validate_checksum(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(ValidationType::Checksum, Duration::from_millis(20), "Checksum", bucket, object)
            .await
    }

    /// Validate replication status (simulated).
    async fn validate_replication_status(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(
            ValidationType::ReplicationStatus,
            Duration::from_millis(15),
            "Replication status",
            bucket,
            object,
        )
        .await
    }

    /// Validate file permissions (simulated).
    async fn validate_file_permissions(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(
            ValidationType::FilePermissions,
            Duration::from_millis(5),
            "File permissions",
            bucket,
            object,
        )
        .await
    }

    /// Validate metadata consistency (simulated).
    async fn validate_metadata_consistency(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(
            ValidationType::MetadataConsistency,
            Duration::from_millis(25),
            "Metadata consistency",
            bucket,
            object,
        )
        .await
    }

    /// Validate data integrity (simulated).
    async fn validate_data_integrity(&self, bucket: &str, object: &str) -> ValidationResult {
        self.simulated_check(
            ValidationType::DataIntegrity,
            Duration::from_millis(30),
            "Data integrity",
            bucket,
            object,
        )
        .await
    }

    /// Get a snapshot of the validation statistics.
    pub async fn statistics(&self) -> ValidationStatistics {
        self.statistics.read().await.clone()
    }

    /// Reset validation statistics to their defaults.
    pub async fn reset_statistics(&self) {
        let mut stats = self.statistics.write().await;
        *stats = ValidationStatistics::default();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    /// Shared fixture: a critical missing-replica issue on a test object.
    fn test_issue() -> HealthIssue {
        HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        }
    }

    /// Shared fixture: a successful repair result for `task`.
    fn test_result(task: &HealTask) -> HealResult {
        HealResult {
            success: true,
            original_issue: task.issue.clone(),
            repair_duration: Duration::from_secs(1),
            retry_attempts: 0,
            error_message: None,
            metadata: None,
            completed_at: SystemTime::now(),
        }
    }

    #[tokio::test]
    async fn test_validator_creation() {
        let validator = HealValidator::new(ValidationConfig::default());

        let stats = validator.statistics().await;
        assert_eq!(stats.total_validations, 0);
    }

    #[tokio::test]
    async fn test_validate_repair() {
        let validator = HealValidator::new(ValidationConfig::default());
        let task = HealTask::new(test_issue());
        let result = test_result(&task);

        let validation_results = validator.validate_repair(&task, &result).await.unwrap();
        assert!(!validation_results.is_empty());

        let stats = validator.statistics().await;
        assert_eq!(stats.total_validations, validation_results.len() as u64);
    }

    #[tokio::test]
    async fn test_validation_disabled() {
        let config = ValidationConfig {
            enable_post_repair_validation: false,
            ..ValidationConfig::default()
        };
        let validator = HealValidator::new(config);
        let task = HealTask::new(test_issue());
        let result = test_result(&task);

        // With post-repair validation disabled, no checks are run.
        let validation_results = validator.validate_repair(&task, &result).await.unwrap();
        assert!(validation_results.is_empty());
    }
}
enable data compression + pub enable_compression: bool, + /// Compression threshold (number of points before compression) + pub compression_threshold: usize, + /// Whether to enable outlier detection + pub enable_outlier_detection: bool, + /// Outlier detection threshold (standard deviations) + pub outlier_threshold: f64, +} + +impl Default for AggregatorConfig { + fn default() -> Self { + Self { + default_interval: Duration::from_secs(300), // 5 minutes + max_data_points: 10000, + enable_auto_aggregation: true, + aggregation_window: Duration::from_secs(3600), // 1 hour + enable_compression: true, + compression_threshold: 1000, + enable_outlier_detection: true, + outlier_threshold: 2.0, // 2 standard deviations + } + } +} + +/// Metrics aggregator that processes and aggregates metrics data +#[derive(Debug, Clone)] +pub struct Aggregator { + config: AggregatorConfig, + data_points: Vec, + aggregation_cache: HashMap, + last_aggregation_time: SystemTime, + aggregation_count: u64, +} + +impl Aggregator { + /// Create a new metrics aggregator + pub async fn new(interval: Duration) -> Result { + let config = AggregatorConfig { + default_interval: interval, + ..Default::default() + }; + + Ok(Self { + config, + data_points: Vec::new(), + aggregation_cache: HashMap::new(), + last_aggregation_time: SystemTime::now(), + aggregation_count: 0, + }) + } + + /// Get the configuration + pub fn config(&self) -> &AggregatorConfig { + &self.config + } + + /// Add metrics data point + pub async fn add_data_point(&mut self, data_point: MetricsDataPoint) -> Result<()> { + self.data_points.push(data_point); + + // Trim old data points if we exceed the limit + if self.data_points.len() > self.config.max_data_points { + let excess = self.data_points.len() - self.config.max_data_points; + self.data_points.drain(0..excess); + } + + // Auto-aggregate if enabled + if self.config.enable_auto_aggregation { + self.auto_aggregate().await?; + } + + Ok(()) + } + + /// Aggregate metrics based on 
query + pub async fn aggregate_metrics(&mut self, query: MetricsQuery) -> Result { + let start_time = SystemTime::now(); + + // Check cache first + let cache_key = self.generate_cache_key(&query); + if let Some(cached) = self.aggregation_cache.get(&cache_key) { + debug!("Returning cached aggregation result"); + return Ok(cached.clone()); + } + + // Filter data points by time range + let filtered_points: Vec<&MetricsDataPoint> = self + .data_points + .iter() + .filter(|point| { + point.timestamp >= query.start_time && point.timestamp <= query.end_time + }) + .collect(); + + if filtered_points.is_empty() { + warn!("No data points found for the specified time range"); + return Ok(AggregatedMetrics { + query, + data_points: Vec::new(), + summary: MetricsSummary::default(), + }); + } + + // Aggregate data points + let aggregated_points = self.aggregate_data_points(&filtered_points, &query).await?; + + // Generate summary + let summary = self.generate_summary(&aggregated_points, &query).await?; + + let result = AggregatedMetrics { + query, + data_points: aggregated_points, + summary, + }; + + // Cache the result + self.aggregation_cache.insert(cache_key, result.clone()); + + let aggregation_time = start_time.elapsed(); + debug!("Metrics aggregation completed in {:?}", aggregation_time); + + Ok(result) + } + + /// Auto-aggregate data points + async fn auto_aggregate(&mut self) -> Result<()> { + let now = SystemTime::now(); + + // Check if it's time to aggregate + if now.duration_since(self.last_aggregation_time).unwrap() < self.config.aggregation_window { + return Ok(()); + } + + // Perform aggregation + let window_start = now - self.config.aggregation_window; + let query = MetricsQuery { + start_time: window_start, + end_time: now, + interval: self.config.default_interval, + metrics: vec![], // All metrics + severity_filter: None, + limit: None, + }; + + let _aggregated = self.aggregate_metrics(query).await?; + + self.last_aggregation_time = now; + self.aggregation_count 
+= 1; + + info!("Auto-aggregation completed, count: {}", self.aggregation_count); + + Ok(()) + } + + /// Aggregate data points based on interval + async fn aggregate_data_points( + &self, + points: &[&MetricsDataPoint], + query: &MetricsQuery, + ) -> Result> { + if points.is_empty() { + return Ok(Vec::new()); + } + + let mut aggregated_points = Vec::new(); + let mut current_bucket_start = query.start_time; + let mut current_bucket_points = Vec::new(); + + for point in points { + if point.timestamp >= current_bucket_start + query.interval { + // Process current bucket + if !current_bucket_points.is_empty() { + let aggregated = self.aggregate_bucket(¤t_bucket_points, current_bucket_start).await?; + aggregated_points.push(aggregated); + } + + // Start new bucket + current_bucket_start = current_bucket_start + query.interval; + current_bucket_points.clear(); + } + + current_bucket_points.push(*point); + } + + // Process last bucket + if !current_bucket_points.is_empty() { + let aggregated = self.aggregate_bucket(¤t_bucket_points, current_bucket_start).await?; + aggregated_points.push(aggregated); + } + + Ok(aggregated_points) + } + + /// Aggregate a bucket of data points + async fn aggregate_bucket( + &self, + points: &[&MetricsDataPoint], + bucket_start: SystemTime, + ) -> Result { + let mut aggregated = MetricsDataPoint { + timestamp: bucket_start, + system: None, + network: None, + disk_io: None, + scan: None, + heal: None, + policy: None, + }; + + // Aggregate system metrics + let system_metrics: Vec<&SystemMetrics> = points + .iter() + .filter_map(|p| p.system.as_ref()) + .collect(); + + if !system_metrics.is_empty() { + aggregated.system = Some(self.aggregate_system_metrics(&system_metrics).await?); + } + + // Aggregate network metrics + let network_metrics: Vec<&NetworkMetrics> = points + .iter() + .filter_map(|p| p.network.as_ref()) + .collect(); + + if !network_metrics.is_empty() { + aggregated.network = 
Some(self.aggregate_network_metrics(&network_metrics).await?); + } + + // Aggregate disk I/O metrics + let disk_metrics: Vec<&DiskMetrics> = points + .iter() + .filter_map(|p| p.disk_io.as_ref()) + .collect(); + + if !disk_metrics.is_empty() { + aggregated.disk_io = Some(self.aggregate_disk_metrics(&disk_metrics).await?); + } + + // Aggregate scan metrics + let scan_metrics: Vec<&ScanMetrics> = points + .iter() + .filter_map(|p| p.scan.as_ref()) + .collect(); + + if !scan_metrics.is_empty() { + aggregated.scan = Some(self.aggregate_scan_metrics(&scan_metrics).await?); + } + + // Aggregate heal metrics + let heal_metrics: Vec<&HealMetrics> = points + .iter() + .filter_map(|p| p.heal.as_ref()) + .collect(); + + if !heal_metrics.is_empty() { + aggregated.heal = Some(self.aggregate_heal_metrics(&heal_metrics).await?); + } + + // Aggregate policy metrics + let policy_metrics: Vec<&PolicyMetrics> = points + .iter() + .filter_map(|p| p.policy.as_ref()) + .collect(); + + if !policy_metrics.is_empty() { + aggregated.policy = Some(self.aggregate_policy_metrics(&policy_metrics).await?); + } + + Ok(aggregated) + } + + /// Aggregate system metrics + async fn aggregate_system_metrics(&self, metrics: &[&SystemMetrics]) -> Result { + if metrics.is_empty() { + return Ok(SystemMetrics::default()); + } + + let cpu_usage: f64 = metrics.iter().map(|m| m.cpu_usage).sum::() / metrics.len() as f64; + let memory_usage: f64 = metrics.iter().map(|m| m.memory_usage).sum::() / metrics.len() as f64; + let disk_usage: f64 = metrics.iter().map(|m| m.disk_usage).sum::() / metrics.len() as f64; + let system_load: f64 = metrics.iter().map(|m| m.system_load).sum::() / metrics.len() as f64; + let active_operations: u64 = metrics.iter().map(|m| m.active_operations).sum::() / metrics.len() as u64; + + // Aggregate health issues + let mut health_issues = HashMap::new(); + for metric in metrics { + for (severity, count) in &metric.health_issues { + *health_issues.entry(*severity).or_insert(0) += count; + 
} + } + + Ok(SystemMetrics { + timestamp: SystemTime::now(), + cpu_usage, + memory_usage, + disk_usage, + network_io: NetworkMetrics::default(), // Will be aggregated separately + disk_io: DiskMetrics::default(), // Will be aggregated separately + active_operations, + system_load, + health_issues, + scan_metrics: ScanMetrics::default(), // Will be aggregated separately + heal_metrics: HealMetrics::default(), // Will be aggregated separately + policy_metrics: PolicyMetrics::default(), // Will be aggregated separately + }) + } + + /// Aggregate network metrics + async fn aggregate_network_metrics(&self, metrics: &[&NetworkMetrics]) -> Result { + if metrics.is_empty() { + return Ok(NetworkMetrics::default()); + } + + let bytes_received_per_sec: u64 = metrics.iter().map(|m| m.bytes_received_per_sec).sum::() / metrics.len() as u64; + let bytes_sent_per_sec: u64 = metrics.iter().map(|m| m.bytes_sent_per_sec).sum::() / metrics.len() as u64; + let packets_received_per_sec: u64 = metrics.iter().map(|m| m.packets_received_per_sec).sum::() / metrics.len() as u64; + let packets_sent_per_sec: u64 = metrics.iter().map(|m| m.packets_sent_per_sec).sum::() / metrics.len() as u64; + + Ok(NetworkMetrics { + bytes_received_per_sec, + bytes_sent_per_sec, + packets_received_per_sec, + packets_sent_per_sec, + }) + } + + /// Aggregate disk metrics + async fn aggregate_disk_metrics(&self, metrics: &[&DiskMetrics]) -> Result { + if metrics.is_empty() { + return Ok(DiskMetrics::default()); + } + + let bytes_read_per_sec: u64 = metrics.iter().map(|m| m.bytes_read_per_sec).sum::() / metrics.len() as u64; + let bytes_written_per_sec: u64 = metrics.iter().map(|m| m.bytes_written_per_sec).sum::() / metrics.len() as u64; + let read_ops_per_sec: u64 = metrics.iter().map(|m| m.read_ops_per_sec).sum::() / metrics.len() as u64; + let write_ops_per_sec: u64 = metrics.iter().map(|m| m.write_ops_per_sec).sum::() / metrics.len() as u64; + let avg_read_latency_ms: f64 = metrics.iter().map(|m| 
m.avg_read_latency_ms).sum::() / metrics.len() as f64; + let avg_write_latency_ms: f64 = metrics.iter().map(|m| m.avg_write_latency_ms).sum::() / metrics.len() as f64; + + Ok(DiskMetrics { + bytes_read_per_sec, + bytes_written_per_sec, + read_ops_per_sec, + write_ops_per_sec, + avg_read_latency_ms, + avg_write_latency_ms, + }) + } + + /// Aggregate scan metrics + async fn aggregate_scan_metrics(&self, metrics: &[&ScanMetrics]) -> Result { + if metrics.is_empty() { + return Ok(ScanMetrics::default()); + } + + let objects_scanned: u64 = metrics.iter().map(|m| m.objects_scanned).sum(); + let bytes_scanned: u64 = metrics.iter().map(|m| m.bytes_scanned).sum(); + let scan_duration: Duration = metrics.iter().map(|m| m.scan_duration).sum(); + let health_issues_found: u64 = metrics.iter().map(|m| m.health_issues_found).sum(); + let scan_cycles_completed: u64 = metrics.iter().map(|m| m.scan_cycles_completed).sum(); + + // Calculate rates + let total_duration_secs = scan_duration.as_secs_f64(); + let scan_rate_objects_per_sec = if total_duration_secs > 0.0 { + objects_scanned as f64 / total_duration_secs + } else { + 0.0 + }; + + let scan_rate_bytes_per_sec = if total_duration_secs > 0.0 { + bytes_scanned as f64 / total_duration_secs + } else { + 0.0 + }; + + Ok(ScanMetrics { + objects_scanned, + bytes_scanned, + scan_duration, + scan_rate_objects_per_sec, + scan_rate_bytes_per_sec, + health_issues_found, + scan_cycles_completed, + last_scan_time: metrics.last().and_then(|m| m.last_scan_time), + }) + } + + /// Aggregate heal metrics + async fn aggregate_heal_metrics(&self, metrics: &[&HealMetrics]) -> Result { + if metrics.is_empty() { + return Ok(HealMetrics::default()); + } + + let total_repairs: u64 = metrics.iter().map(|m| m.total_repairs).sum(); + let successful_repairs: u64 = metrics.iter().map(|m| m.successful_repairs).sum(); + let failed_repairs: u64 = metrics.iter().map(|m| m.failed_repairs).sum(); + let total_repair_time: Duration = metrics.iter().map(|m| 
m.total_repair_time).sum(); + let total_retry_attempts: u64 = metrics.iter().map(|m| m.total_retry_attempts).sum(); + + // Calculate average repair time + let average_repair_time = if total_repairs > 0 { + let total_ms = total_repair_time.as_millis() as u64; + Duration::from_millis(total_ms / total_repairs) + } else { + Duration::ZERO + }; + + // Get latest values for current state + let active_repair_workers = metrics.last().map(|m| m.active_repair_workers).unwrap_or(0); + let queued_repair_tasks = metrics.last().map(|m| m.queued_repair_tasks).unwrap_or(0); + let last_repair_time = metrics.last().and_then(|m| m.last_repair_time); + + Ok(HealMetrics { + total_repairs, + successful_repairs, + failed_repairs, + total_repair_time, + average_repair_time, + active_repair_workers, + queued_repair_tasks, + last_repair_time, + total_retry_attempts, + }) + } + + /// Aggregate policy metrics + async fn aggregate_policy_metrics(&self, metrics: &[&PolicyMetrics]) -> Result { + if metrics.is_empty() { + return Ok(PolicyMetrics::default()); + } + + let total_evaluations: u64 = metrics.iter().map(|m| m.total_evaluations).sum(); + let allowed_operations: u64 = metrics.iter().map(|m| m.allowed_operations).sum(); + let denied_operations: u64 = metrics.iter().map(|m| m.denied_operations).sum(); + let scan_policy_evaluations: u64 = metrics.iter().map(|m| m.scan_policy_evaluations).sum(); + let heal_policy_evaluations: u64 = metrics.iter().map(|m| m.heal_policy_evaluations).sum(); + let retention_policy_evaluations: u64 = metrics.iter().map(|m| m.retention_policy_evaluations).sum(); + let total_evaluation_time: Duration = metrics.iter().map(|m| m.average_evaluation_time).sum(); + + // Calculate average evaluation time + let average_evaluation_time = if total_evaluations > 0 { + let total_ms = total_evaluation_time.as_millis() as u64; + Duration::from_millis(total_ms / total_evaluations) + } else { + Duration::ZERO + }; + + Ok(PolicyMetrics { + total_evaluations, + allowed_operations, + 
denied_operations, + scan_policy_evaluations, + heal_policy_evaluations, + retention_policy_evaluations, + average_evaluation_time, + }) + } + + /// Generate summary statistics + async fn generate_summary( + &self, + data_points: &[MetricsDataPoint], + query: &MetricsQuery, + ) -> Result { + let total_points = data_points.len() as u64; + let time_range = query.end_time.duration_since(query.start_time).unwrap_or(Duration::ZERO); + + // Calculate averages from system metrics + let system_metrics: Vec<&SystemMetrics> = data_points + .iter() + .filter_map(|p| p.system.as_ref()) + .collect(); + + let avg_cpu_usage = if !system_metrics.is_empty() { + system_metrics.iter().map(|m| m.cpu_usage).sum::() / system_metrics.len() as f64 + } else { + 0.0 + }; + + let avg_memory_usage = if !system_metrics.is_empty() { + system_metrics.iter().map(|m| m.memory_usage).sum::() / system_metrics.len() as f64 + } else { + 0.0 + }; + + let avg_disk_usage = if !system_metrics.is_empty() { + system_metrics.iter().map(|m| m.disk_usage).sum::() / system_metrics.len() as f64 + } else { + 0.0 + }; + + // Calculate totals from scan and heal metrics + let scan_metrics: Vec<&ScanMetrics> = data_points + .iter() + .filter_map(|p| p.scan.as_ref()) + .collect(); + + let total_objects_scanned = scan_metrics.iter().map(|m| m.objects_scanned).sum(); + let total_health_issues = scan_metrics.iter().map(|m| m.health_issues_found).sum(); + + let heal_metrics: Vec<&HealMetrics> = data_points + .iter() + .filter_map(|p| p.heal.as_ref()) + .collect(); + + let total_repairs = heal_metrics.iter().map(|m| m.total_repairs).sum(); + let successful_repairs: u64 = heal_metrics.iter().map(|m| m.successful_repairs).sum(); + let repair_success_rate = if total_repairs > 0 { + successful_repairs as f64 / total_repairs as f64 + } else { + 0.0 + }; + + Ok(MetricsSummary { + total_points, + time_range, + avg_cpu_usage, + avg_memory_usage, + avg_disk_usage, + total_objects_scanned, + total_repairs, + repair_success_rate, + 
total_health_issues, + }) + } + + /// Generate cache key for query + fn generate_cache_key(&self, query: &MetricsQuery) -> String { + format!( + "{:?}_{:?}_{:?}_{:?}", + query.start_time, query.end_time, query.interval, query.metrics + ) + } + + /// Clear old cache entries + pub async fn clear_old_cache(&mut self) -> Result<()> { + let now = SystemTime::now(); + let retention_period = Duration::from_secs(3600); // 1 hour + + self.aggregation_cache.retain(|_key, value| { + if let Some(latest_point) = value.data_points.last() { + now.duration_since(latest_point.timestamp).unwrap_or(Duration::ZERO) < retention_period + } else { + false + } + }); + + info!("Cleared old cache entries, remaining: {}", self.aggregation_cache.len()); + Ok(()) + } + + /// Get aggregation statistics + pub fn get_statistics(&self) -> AggregatorStatistics { + AggregatorStatistics { + total_data_points: self.data_points.len(), + total_aggregations: self.aggregation_count, + cache_size: self.aggregation_cache.len(), + last_aggregation_time: self.last_aggregation_time, + config: self.config.clone(), + } + } +} + +/// Aggregator statistics +#[derive(Debug, Clone)] +pub struct AggregatorStatistics { + pub total_data_points: usize, + pub total_aggregations: u64, + pub cache_size: usize, + pub last_aggregation_time: SystemTime, + pub config: AggregatorConfig, +} + +impl Default for MetricsSummary { + fn default() -> Self { + Self { + total_points: 0, + time_range: Duration::ZERO, + avg_cpu_usage: 0.0, + avg_memory_usage: 0.0, + avg_disk_usage: 0.0, + total_objects_scanned: 0, + total_repairs: 0, + repair_success_rate: 0.0, + total_health_issues: 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::Severity; + + #[tokio::test] + async fn test_aggregator_creation() { + let aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); + + assert_eq!(aggregator.config().default_interval, Duration::from_secs(300)); + 
assert!(aggregator.config().enable_auto_aggregation); + } + + #[tokio::test] + async fn test_data_point_addition() { + let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); + + let data_point = MetricsDataPoint { + timestamp: SystemTime::now(), + system: Some(SystemMetrics::default()), + network: None, + disk_io: None, + scan: None, + heal: None, + policy: None, + }; + + aggregator.add_data_point(data_point).await.unwrap(); + + let stats = aggregator.get_statistics(); + assert_eq!(stats.total_data_points, 1); + } + + #[tokio::test] + async fn test_metrics_aggregation() { + let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); + + // Add some test data points + for i in 0..5 { + let mut system_metrics = SystemMetrics::default(); + system_metrics.cpu_usage = i as f64 * 10.0; + system_metrics.memory_usage = i as f64 * 20.0; + + let data_point = MetricsDataPoint { + timestamp: SystemTime::now() + Duration::from_secs(i * 60), + system: Some(system_metrics), + network: None, + disk_io: None, + scan: None, + heal: None, + policy: None, + }; + + aggregator.add_data_point(data_point).await.unwrap(); + } + + let query = MetricsQuery { + start_time: SystemTime::now(), + end_time: SystemTime::now() + Duration::from_secs(300), + interval: Duration::from_secs(60), + metrics: vec![MetricType::System], + severity_filter: None, + limit: None, + }; + + let result = aggregator.aggregate_metrics(query).await.unwrap(); + assert_eq!(result.data_points.len(), 5); + assert_eq!(result.summary.total_points, 5); + } + + #[tokio::test] + async fn test_system_metrics_aggregation() { + let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); + + let metrics = vec![ + SystemMetrics { + cpu_usage: 10.0, + memory_usage: 20.0, + disk_usage: 30.0, + ..Default::default() + }, + SystemMetrics { + cpu_usage: 20.0, + memory_usage: 40.0, + disk_usage: 60.0, + ..Default::default() + }, + ]; + + let aggregated = 
aggregator.aggregate_system_metrics(&metrics.iter().collect::>()).await.unwrap(); + + assert_eq!(aggregated.cpu_usage, 15.0); + assert_eq!(aggregated.memory_usage, 30.0); + assert_eq!(aggregated.disk_usage, 45.0); + } + + #[tokio::test] + async fn test_cache_clearing() { + let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap(); + + // Add some cached data + let query = MetricsQuery { + start_time: SystemTime::now() - Duration::from_secs(3600), + end_time: SystemTime::now() - Duration::from_secs(3000), + interval: Duration::from_secs(60), + metrics: vec![], + severity_filter: None, + limit: None, + }; + + let _result = aggregator.aggregate_metrics(query).await.unwrap(); + + let stats_before = aggregator.get_statistics(); + assert_eq!(stats_before.cache_size, 1); + + aggregator.clear_old_cache().await.unwrap(); + + let stats_after = aggregator.get_statistics(); + assert_eq!(stats_after.cache_size, 0); + } +} \ No newline at end of file diff --git a/crates/ahm/src/metrics/collector.rs b/crates/ahm/src/metrics/collector.rs new file mode 100644 index 00000000..3932b94e --- /dev/null +++ b/crates/ahm/src/metrics/collector.rs @@ -0,0 +1,426 @@ +// Copyright 2024 RustFS Team + +use std::{ + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, error, info, warn}; + +use crate::{ + error::Result, + scanner::{HealthIssue, Severity}, +}; + +use super::{ + AggregatedMetrics, Aggregator, DiskMetrics, HealMetrics, MetricsQuery, MetricType, + NetworkMetrics, PolicyMetrics, ScanMetrics, SystemMetrics, +}; + +/// Configuration for the metrics collector +#[derive(Debug, Clone)] +pub struct CollectorConfig { + /// Collection interval + pub collection_interval: Duration, + /// Whether to enable detailed metrics collection + pub enable_detailed_metrics: bool, + /// Maximum number of metrics to keep in memory + pub max_metrics_in_memory: usize, + /// Whether to enable automatic aggregation + pub 
enable_auto_aggregation: bool, + /// Aggregation interval + pub aggregation_interval: Duration, + /// Whether to enable resource monitoring + pub enable_resource_monitoring: bool, + /// Resource monitoring interval + pub resource_monitoring_interval: Duration, + /// Whether to enable health issue tracking + pub enable_health_issue_tracking: bool, + /// Metrics retention period + pub metrics_retention_period: Duration, +} + +impl Default for CollectorConfig { + fn default() -> Self { + Self { + collection_interval: Duration::from_secs(30), // 30 seconds + enable_detailed_metrics: true, + max_metrics_in_memory: 10000, + enable_auto_aggregation: true, + aggregation_interval: Duration::from_secs(300), // 5 minutes + enable_resource_monitoring: true, + resource_monitoring_interval: Duration::from_secs(10), // 10 seconds + enable_health_issue_tracking: true, + metrics_retention_period: Duration::from_secs(86400 * 7), // 7 days + } + } +} + +/// Metrics collector that gathers system metrics +#[derive(Debug)] +pub struct Collector { + config: CollectorConfig, + metrics: Arc>>, + aggregator: Arc, + last_collection_time: Arc>, + collection_count: Arc>, + health_issues: Arc>>, +} + +impl Collector { + /// Create a new metrics collector + pub async fn new(config: CollectorConfig) -> Result { + let aggregator = Arc::new(Aggregator::new(config.aggregation_interval).await?); + + Ok(Self { + config, + metrics: Arc::new(RwLock::new(Vec::new())), + aggregator, + last_collection_time: Arc::new(RwLock::new(SystemTime::now())), + collection_count: Arc::new(RwLock::new(0)), + health_issues: Arc::new(RwLock::new(std::collections::HashMap::new())), + }) + } + + /// Get the configuration + pub fn config(&self) -> &CollectorConfig { + &self.config + } + + /// Collect current system metrics + pub async fn collect_metrics(&self) -> Result { + let start_time = Instant::now(); + + let mut metrics = SystemMetrics::default(); + metrics.timestamp = SystemTime::now(); + + // Collect system resource 
metrics + if self.config.enable_resource_monitoring { + self.collect_system_resources(&mut metrics).await?; + } + + // Collect scan metrics + self.collect_scan_metrics(&mut metrics).await?; + + // Collect heal metrics + self.collect_heal_metrics(&mut metrics).await?; + + // Collect policy metrics + self.collect_policy_metrics(&mut metrics).await?; + + // Collect health issues + if self.config.enable_health_issue_tracking { + self.collect_health_issues(&mut metrics).await?; + } + + // Store metrics + { + let mut metrics_store = self.metrics.write().await; + metrics_store.push(metrics.clone()); + + // Trim old metrics if we exceed the limit + if metrics_store.len() > self.config.max_metrics_in_memory { + let excess = metrics_store.len() - self.config.max_metrics_in_memory; + metrics_store.drain(0..excess); + } + } + + // Update collection statistics + { + let mut last_time = self.last_collection_time.write().await; + *last_time = metrics.timestamp; + + let mut count = self.collection_count.write().await; + *count += 1; + } + + let collection_time = start_time.elapsed(); + debug!("Metrics collection completed in {:?}", collection_time); + + Ok(metrics) + } + + /// Collect system resource metrics + async fn collect_system_resources(&self, metrics: &mut SystemMetrics) -> Result<()> { + // Simulate system resource collection + // In a real implementation, this would use system APIs + + metrics.cpu_usage = self.get_cpu_usage().await?; + metrics.memory_usage = self.get_memory_usage().await?; + metrics.disk_usage = self.get_disk_usage().await?; + metrics.system_load = self.get_system_load().await?; + metrics.active_operations = self.get_active_operations().await?; + + // Collect network metrics + metrics.network_io = self.get_network_metrics().await?; + + // Collect disk I/O metrics + metrics.disk_io = self.get_disk_io_metrics().await?; + + Ok(()) + } + + /// Collect scan metrics + async fn collect_scan_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { + // In a 
real implementation, this would get data from the scanner + metrics.scan_metrics = ScanMetrics::default(); + + // Simulate some scan metrics + metrics.scan_metrics.objects_scanned = 1000; + metrics.scan_metrics.bytes_scanned = 1024 * 1024 * 100; // 100 MB + metrics.scan_metrics.scan_duration = Duration::from_secs(60); + metrics.scan_metrics.scan_rate_objects_per_sec = 16.67; // 1000 / 60 + metrics.scan_metrics.scan_rate_bytes_per_sec = 1_747_200.0; // 100MB / 60s + metrics.scan_metrics.health_issues_found = 5; + metrics.scan_metrics.scan_cycles_completed = 1; + metrics.scan_metrics.last_scan_time = Some(SystemTime::now()); + + Ok(()) + } + + /// Collect heal metrics + async fn collect_heal_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { + // In a real implementation, this would get data from the heal system + metrics.heal_metrics = HealMetrics::default(); + + // Simulate some heal metrics + metrics.heal_metrics.total_repairs = 10; + metrics.heal_metrics.successful_repairs = 8; + metrics.heal_metrics.failed_repairs = 2; + metrics.heal_metrics.total_repair_time = Duration::from_secs(300); + metrics.heal_metrics.average_repair_time = Duration::from_secs(30); + metrics.heal_metrics.active_repair_workers = 2; + metrics.heal_metrics.queued_repair_tasks = 5; + metrics.heal_metrics.last_repair_time = Some(SystemTime::now()); + metrics.heal_metrics.total_retry_attempts = 3; + + Ok(()) + } + + /// Collect policy metrics + async fn collect_policy_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> { + // In a real implementation, this would get data from the policy system + metrics.policy_metrics = PolicyMetrics::default(); + + // Simulate some policy metrics + metrics.policy_metrics.total_evaluations = 50; + metrics.policy_metrics.allowed_operations = 45; + metrics.policy_metrics.denied_operations = 5; + metrics.policy_metrics.scan_policy_evaluations = 20; + metrics.policy_metrics.heal_policy_evaluations = 20; + 
metrics.policy_metrics.retention_policy_evaluations = 10; + metrics.policy_metrics.average_evaluation_time = Duration::from_millis(10); + + Ok(()) + } + + /// Collect health issues + async fn collect_health_issues(&self, metrics: &mut SystemMetrics) -> Result<()> { + let health_issues = self.health_issues.read().await; + metrics.health_issues = health_issues.clone(); + Ok(()) + } + + /// Record a health issue + pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> { + let mut issues = self.health_issues.write().await; + let count = issues.entry(issue.severity).or_insert(0); + *count += 1; + + info!("Recorded health issue: {:?} - {}", issue.severity, issue.description); + Ok(()) + } + + /// Record an event (alias for record_health_issue) + pub async fn record_event(&self, issue: &HealthIssue) -> Result<()> { + self.record_health_issue(issue).await + } + + /// Clear health issues + pub async fn clear_health_issues(&self) -> Result<()> { + let mut health_issues = self.health_issues.write().await; + health_issues.clear(); + + info!("Cleared all health issues"); + Ok(()) + } + + /// Query metrics with aggregation + pub async fn query_metrics(&self, query: MetricsQuery) -> Result { + // In a real implementation, this would query the aggregator + // For now, we'll return a simple aggregated result + let aggregator = self.aggregator.as_ref(); + let mut aggregator_guard = aggregator.write().await; + aggregator_guard.aggregate_metrics(query).await + } + + /// Get metrics for a specific time range + pub async fn get_metrics_range(&self, start_time: SystemTime, end_time: SystemTime) -> Result> { + let metrics = self.metrics.read().await; + let filtered_metrics: Vec = metrics + .iter() + .filter(|m| m.timestamp >= start_time && m.timestamp <= end_time) + .cloned() + .collect(); + + Ok(filtered_metrics) + } + + /// Get latest metrics + pub async fn get_latest_metrics(&self) -> Result> { + let metrics = self.metrics.read().await; + Ok(metrics.last().cloned()) 
+ } + + /// Get collection statistics + pub async fn get_collection_statistics(&self) -> CollectionStatistics { + let collection_count = *self.collection_count.read().await; + let last_collection_time = *self.last_collection_time.read().await; + let metrics_count = self.metrics.read().await.len(); + + CollectionStatistics { + total_collections: collection_count, + last_collection_time, + metrics_in_memory: metrics_count, + config: self.config.clone(), + } + } + + /// Simulated system resource collection methods + async fn get_cpu_usage(&self) -> Result { + // Simulate CPU usage collection + Ok(25.5) // 25.5% + } + + async fn get_memory_usage(&self) -> Result { + // Simulate memory usage collection + Ok(60.2) // 60.2% + } + + async fn get_disk_usage(&self) -> Result { + // Simulate disk usage collection + Ok(45.8) // 45.8% + } + + async fn get_system_load(&self) -> Result { + // Simulate system load collection + Ok(0.75) // 0.75 + } + + async fn get_active_operations(&self) -> Result { + // Simulate active operations count + Ok(15) + } + + async fn get_network_metrics(&self) -> Result { + // Simulate network metrics collection + Ok(NetworkMetrics { + bytes_received_per_sec: 1024 * 1024, // 1 MB/s + bytes_sent_per_sec: 512 * 1024, // 512 KB/s + packets_received_per_sec: 1000, + packets_sent_per_sec: 500, + }) + } + + async fn get_disk_io_metrics(&self) -> Result { + // Simulate disk I/O metrics collection + Ok(DiskMetrics { + bytes_read_per_sec: 2 * 1024 * 1024, // 2 MB/s + bytes_written_per_sec: 1 * 1024 * 1024, // 1 MB/s + read_ops_per_sec: 200, + write_ops_per_sec: 100, + avg_read_latency_ms: 5.0, + avg_write_latency_ms: 8.0, + }) + } +} + +/// Collection statistics +#[derive(Debug, Clone)] +pub struct CollectionStatistics { + pub total_collections: u64, + pub last_collection_time: SystemTime, + pub metrics_in_memory: usize, + pub config: CollectorConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::{HealthIssue, HealthIssueType}; + + 
#[tokio::test] + async fn test_collector_creation() { + let config = CollectorConfig::default(); + let collector = Collector::new(config).await.unwrap(); + + assert_eq!(collector.config().collection_interval, Duration::from_secs(30)); + assert!(collector.config().enable_detailed_metrics); + } + + #[tokio::test] + async fn test_metrics_collection() { + let config = CollectorConfig::default(); + let collector = Collector::new(config).await.unwrap(); + + let metrics = collector.collect_metrics().await.unwrap(); + assert_eq!(metrics.cpu_usage, 25.5); + assert_eq!(metrics.memory_usage, 60.2); + assert_eq!(metrics.disk_usage, 45.8); + } + + #[tokio::test] + async fn test_health_issue_recording() { + let config = CollectorConfig::default(); + let collector = Collector::new(config).await.unwrap(); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + collector.record_health_issue(&issue).await.unwrap(); + + let stats = collector.get_collection_statistics().await; + assert_eq!(stats.total_collections, 0); // No collection yet + } + + #[tokio::test] + async fn test_latest_metrics() { + let config = CollectorConfig::default(); + let collector = Collector::new(config).await.unwrap(); + + // Initially no metrics + let latest = collector.get_latest_metrics().await.unwrap(); + assert!(latest.is_none()); + + // Collect metrics + collector.collect_metrics().await.unwrap(); + + // Now should have metrics + let latest = collector.get_latest_metrics().await.unwrap(); + assert!(latest.is_some()); + } + + #[tokio::test] + async fn test_collection_statistics() { + let config = CollectorConfig::default(); + let collector = Collector::new(config).await.unwrap(); + + let stats = collector.get_collection_statistics().await; + assert_eq!(stats.total_collections, 0); + 
assert_eq!(stats.metrics_in_memory, 0); + + // Collect metrics + collector.collect_metrics().await.unwrap(); + + let stats = collector.get_collection_statistics().await; + assert_eq!(stats.total_collections, 1); + assert_eq!(stats.metrics_in_memory, 1); + } +} \ No newline at end of file diff --git a/crates/ahm/src/metrics/mod.rs b/crates/ahm/src/metrics/mod.rs new file mode 100644 index 00000000..f5f6ae85 --- /dev/null +++ b/crates/ahm/src/metrics/mod.rs @@ -0,0 +1,617 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Metrics collection and aggregation system +//! +//! The metrics subsystem provides comprehensive data collection and analysis: +//! - Real-time metrics collection from all subsystems +//! - Time-series data storage and aggregation +//! - Export capabilities for external monitoring systems +//! 
- Performance analytics and trend analysis + +pub mod collector; +pub mod aggregator; +pub mod storage; +pub mod reporter; + +pub use collector::{Collector, CollectorConfig}; +pub use aggregator::{Aggregator, AggregatorConfig}; +pub use storage::{Storage, StorageConfig}; +pub use reporter::{Reporter, ReporterConfig}; + +use std::time::{Duration, SystemTime}; +use std::collections::HashMap; +use serde::{Deserialize, Serialize}; + +use crate::scanner::{HealthIssue, Severity}; + +/// Metrics subsystem status +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Status { + /// Metrics system is initializing + Initializing, + /// Metrics system is running normally + Running, + /// Metrics system is degraded (some exporters failing) + Degraded, + /// Metrics system is stopping + Stopping, + /// Metrics system has stopped + Stopped, + /// Metrics system encountered an error + Error(String), +} + +/// Metric data point with timestamp and value +#[derive(Debug, Clone)] +pub struct MetricPoint { + /// Metric name + pub name: String, + /// Metric value + pub value: MetricValue, + /// Timestamp when metric was collected + pub timestamp: SystemTime, + /// Additional labels/tags + pub labels: HashMap, +} + +/// Different types of metric values +#[derive(Debug, Clone)] +pub enum MetricValue { + /// Counter that only increases + Counter(u64), + /// Gauge that can go up or down + Gauge(f64), + /// Histogram with buckets + Histogram { + count: u64, + sum: f64, + buckets: Vec, + }, + /// Summary with quantiles + Summary { + count: u64, + sum: f64, + quantiles: Vec, + }, +} + +/// Histogram bucket +#[derive(Debug, Clone)] +pub struct HistogramBucket { + /// Upper bound of the bucket + pub le: f64, + /// Count of observations in this bucket + pub count: u64, +} + +/// Summary quantile +#[derive(Debug, Clone)] +pub struct Quantile { + /// Quantile value (e.g., 0.5 for median) + pub quantile: f64, + /// Value at this quantile + pub value: f64, +} + +/// Aggregation functions for metrics 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum AggregationFunction { + Sum, + Average, + Min, + Max, + Count, + Rate, + Percentile(u8), +} + +/// Time window for aggregation +#[derive(Debug, Clone)] +pub struct TimeWindow { + /// Duration of the window + pub duration: Duration, + /// How often to create new windows + pub step: Duration, +} + +/// Metric export configuration +#[derive(Debug, Clone)] +pub struct ExportConfig { + /// Export format + pub format: ExportFormat, + /// Export destination + pub destination: ExportDestination, + /// Export interval + pub interval: Duration, + /// Metric filters + pub filters: Vec, +} + +/// Supported export formats +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ExportFormat { + /// Prometheus format + Prometheus, + /// JSON format + Json, + /// CSV format + Csv, + /// Custom format + Custom(String), +} + +/// Export destinations +#[derive(Debug, Clone)] +pub enum ExportDestination { + /// HTTP endpoint + Http { url: String, headers: HashMap }, + /// File system + File { path: String }, + /// Standard output + Stdout, + /// Custom destination + Custom(String), +} + +/// Metric filtering rules +#[derive(Debug, Clone)] +pub struct MetricFilter { + /// Metric name pattern (regex) + pub name_pattern: String, + /// Label filters + pub label_filters: HashMap, + /// Include or exclude matching metrics + pub include: bool, +} + +/// System-wide metrics that are automatically collected +pub mod system_metrics { + /// Object-related metrics + pub const OBJECTS_TOTAL: &str = "rustfs_objects_total"; + pub const OBJECTS_SIZE_BYTES: &str = "rustfs_objects_size_bytes"; + pub const OBJECTS_SCANNED_TOTAL: &str = "rustfs_objects_scanned_total"; + pub const OBJECTS_HEAL_OPERATIONS_TOTAL: &str = "rustfs_objects_heal_operations_total"; + + /// Scanner metrics + pub const SCAN_CYCLES_TOTAL: &str = "rustfs_scan_cycles_total"; + pub const SCAN_DURATION_SECONDS: &str = "rustfs_scan_duration_seconds"; + pub const SCAN_RATE_OBJECTS_PER_SECOND: 
&str = "rustfs_scan_rate_objects_per_second"; + pub const SCAN_RATE_BYTES_PER_SECOND: &str = "rustfs_scan_rate_bytes_per_second"; + + /// Health metrics + pub const HEALTH_ISSUES_TOTAL: &str = "rustfs_health_issues_total"; + pub const HEALTH_ISSUES_BY_SEVERITY: &str = "rustfs_health_issues_by_severity"; + pub const HEAL_SUCCESS_RATE: &str = "rustfs_heal_success_rate"; + + /// System resource metrics + pub const DISK_USAGE_BYTES: &str = "rustfs_disk_usage_bytes"; + pub const DISK_IOPS: &str = "rustfs_disk_iops"; + pub const MEMORY_USAGE_BYTES: &str = "rustfs_memory_usage_bytes"; + pub const CPU_USAGE_PERCENT: &str = "rustfs_cpu_usage_percent"; + + /// Performance metrics + pub const OPERATION_DURATION_SECONDS: &str = "rustfs_operation_duration_seconds"; + pub const ACTIVE_OPERATIONS: &str = "rustfs_active_operations"; + pub const THROUGHPUT_BYTES_PER_SECOND: &str = "rustfs_throughput_bytes_per_second"; +} + +/// System metrics collected by AHM +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SystemMetrics { + /// Timestamp when metrics were collected + pub timestamp: SystemTime, + /// CPU usage percentage + pub cpu_usage: f64, + /// Memory usage percentage + pub memory_usage: f64, + /// Disk usage percentage + pub disk_usage: f64, + /// Network I/O bytes per second + pub network_io: NetworkMetrics, + /// Disk I/O bytes per second + pub disk_io: DiskMetrics, + /// Active operations count + pub active_operations: u64, + /// System load average + pub system_load: f64, + /// Health issues count by severity + pub health_issues: std::collections::HashMap, + /// Scan metrics + pub scan_metrics: ScanMetrics, + /// Heal metrics + pub heal_metrics: HealMetrics, + /// Policy metrics + pub policy_metrics: PolicyMetrics, +} + +/// Network I/O metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NetworkMetrics { + /// Bytes received per second + pub bytes_received_per_sec: u64, + /// Bytes sent per second + pub bytes_sent_per_sec: u64, + /// Packets 
received per second + pub packets_received_per_sec: u64, + /// Packets sent per second + pub packets_sent_per_sec: u64, +} + +/// Disk I/O metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DiskMetrics { + /// Bytes read per second + pub bytes_read_per_sec: u64, + /// Bytes written per second + pub bytes_written_per_sec: u64, + /// Read operations per second + pub read_ops_per_sec: u64, + /// Write operations per second + pub write_ops_per_sec: u64, + /// Average read latency in milliseconds + pub avg_read_latency_ms: f64, + /// Average write latency in milliseconds + pub avg_write_latency_ms: f64, +} + +/// Scan operation metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ScanMetrics { + /// Total objects scanned + pub objects_scanned: u64, + /// Total bytes scanned + pub bytes_scanned: u64, + /// Scan duration + pub scan_duration: Duration, + /// Scan rate (objects per second) + pub scan_rate_objects_per_sec: f64, + /// Scan rate (bytes per second) + pub scan_rate_bytes_per_sec: f64, + /// Health issues found + pub health_issues_found: u64, + /// Scan cycles completed + pub scan_cycles_completed: u64, + /// Last scan time + pub last_scan_time: Option, +} + +/// Heal operation metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealMetrics { + /// Total repair operations + pub total_repairs: u64, + /// Successful repairs + pub successful_repairs: u64, + /// Failed repairs + pub failed_repairs: u64, + /// Total repair time + pub total_repair_time: Duration, + /// Average repair time + pub average_repair_time: Duration, + /// Active repair workers + pub active_repair_workers: u64, + /// Queued repair tasks + pub queued_repair_tasks: u64, + /// Last repair time + pub last_repair_time: Option, + /// Retry attempts + pub total_retry_attempts: u64, +} + +/// Policy evaluation metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyMetrics { + /// Total policy evaluations + pub total_evaluations: 
u64, + /// Allowed operations + pub allowed_operations: u64, + /// Denied operations + pub denied_operations: u64, + /// Scan policy evaluations + pub scan_policy_evaluations: u64, + /// Heal policy evaluations + pub heal_policy_evaluations: u64, + /// Retention policy evaluations + pub retention_policy_evaluations: u64, + /// Average evaluation time + pub average_evaluation_time: Duration, +} + +impl Default for SystemMetrics { + fn default() -> Self { + Self { + timestamp: SystemTime::now(), + cpu_usage: 0.0, + memory_usage: 0.0, + disk_usage: 0.0, + network_io: NetworkMetrics::default(), + disk_io: DiskMetrics::default(), + active_operations: 0, + system_load: 0.0, + health_issues: std::collections::HashMap::new(), + scan_metrics: ScanMetrics::default(), + heal_metrics: HealMetrics::default(), + policy_metrics: PolicyMetrics::default(), + } + } +} + +impl Default for NetworkMetrics { + fn default() -> Self { + Self { + bytes_received_per_sec: 0, + bytes_sent_per_sec: 0, + packets_received_per_sec: 0, + packets_sent_per_sec: 0, + } + } +} + +impl Default for DiskMetrics { + fn default() -> Self { + Self { + bytes_read_per_sec: 0, + bytes_written_per_sec: 0, + read_ops_per_sec: 0, + write_ops_per_sec: 0, + avg_read_latency_ms: 0.0, + avg_write_latency_ms: 0.0, + } + } +} + +impl Default for ScanMetrics { + fn default() -> Self { + Self { + objects_scanned: 0, + bytes_scanned: 0, + scan_duration: Duration::ZERO, + scan_rate_objects_per_sec: 0.0, + scan_rate_bytes_per_sec: 0.0, + health_issues_found: 0, + scan_cycles_completed: 0, + last_scan_time: None, + } + } +} + +impl Default for HealMetrics { + fn default() -> Self { + Self { + total_repairs: 0, + successful_repairs: 0, + failed_repairs: 0, + total_repair_time: Duration::ZERO, + average_repair_time: Duration::ZERO, + active_repair_workers: 0, + queued_repair_tasks: 0, + last_repair_time: None, + total_retry_attempts: 0, + } + } +} + +impl Default for PolicyMetrics { + fn default() -> Self { + Self { + 
total_evaluations: 0, + allowed_operations: 0, + denied_operations: 0, + scan_policy_evaluations: 0, + heal_policy_evaluations: 0, + retention_policy_evaluations: 0, + average_evaluation_time: Duration::ZERO, + } + } +} + +/// Metrics query parameters +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsQuery { + /// Start time for the query + pub start_time: SystemTime, + /// End time for the query + pub end_time: SystemTime, + /// Metrics aggregation interval + pub interval: Duration, + /// Metrics to include in the query + pub metrics: Vec, + /// Filter by severity + pub severity_filter: Option, + /// Limit number of results + pub limit: Option, +} + +/// Types of metrics that can be queried +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum MetricType { + /// System metrics (CPU, memory, disk) + System, + /// Network metrics + Network, + /// Disk I/O metrics + DiskIo, + /// Scan metrics + Scan, + /// Heal metrics + Heal, + /// Policy metrics + Policy, + /// Health issues + HealthIssues, +} + +/// Aggregated metrics data +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AggregatedMetrics { + /// Query parameters used + pub query: MetricsQuery, + /// Aggregated data points + pub data_points: Vec, + /// Summary statistics + pub summary: MetricsSummary, +} + +/// Individual metrics data point +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsDataPoint { + /// Timestamp for this data point + pub timestamp: SystemTime, + /// System metrics + pub system: Option, + /// Network metrics + pub network: Option, + /// Disk I/O metrics + pub disk_io: Option, + /// Scan metrics + pub scan: Option, + /// Heal metrics + pub heal: Option, + /// Policy metrics + pub policy: Option, +} + +/// Summary statistics for aggregated metrics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsSummary { + /// Total data points + pub total_points: u64, + /// Time range covered + pub time_range: Duration, + 
/// Average CPU usage + pub avg_cpu_usage: f64, + /// Average memory usage + pub avg_memory_usage: f64, + /// Average disk usage + pub avg_disk_usage: f64, + /// Total objects scanned + pub total_objects_scanned: u64, + /// Total repairs performed + pub total_repairs: u64, + /// Success rate for repairs + pub repair_success_rate: f64, + /// Total health issues + pub total_health_issues: u64, +} + +/// Resource usage information +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceUsage { + /// Disk usage information + pub disk_usage: DiskUsage, + /// Memory usage information + pub memory_usage: MemoryUsage, + /// Network usage information + pub network_usage: NetworkUsage, + /// CPU usage information + pub cpu_usage: CpuUsage, +} + +/// Disk usage information +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DiskUsage { + /// Total disk space in bytes + pub total_bytes: u64, + /// Used disk space in bytes + pub used_bytes: u64, + /// Available disk space in bytes + pub available_bytes: u64, + /// Usage percentage + pub usage_percentage: f64, +} + +/// Memory usage information +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MemoryUsage { + /// Total memory in bytes + pub total_bytes: u64, + /// Used memory in bytes + pub used_bytes: u64, + /// Available memory in bytes + pub available_bytes: u64, + /// Usage percentage + pub usage_percentage: f64, +} + +/// Network usage information +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NetworkUsage { + /// Bytes received + pub bytes_received: u64, + /// Bytes sent + pub bytes_sent: u64, + /// Packets received + pub packets_received: u64, + /// Packets sent + pub packets_sent: u64, +} + +/// CPU usage information +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CpuUsage { + /// CPU usage percentage + pub usage_percentage: f64, + /// Number of CPU cores + pub cores: u32, + /// Load average + pub load_average: f64, +} + +#[cfg(test)] +mod tests { + use 
super::*; + + #[test] + fn test_system_metrics_creation() { + let metrics = SystemMetrics::default(); + assert_eq!(metrics.cpu_usage, 0.0); + assert_eq!(metrics.memory_usage, 0.0); + assert_eq!(metrics.active_operations, 0); + } + + #[test] + fn test_scan_metrics_creation() { + let metrics = ScanMetrics::default(); + assert_eq!(metrics.objects_scanned, 0); + assert_eq!(metrics.bytes_scanned, 0); + assert_eq!(metrics.scan_cycles_completed, 0); + } + + #[test] + fn test_heal_metrics_creation() { + let metrics = HealMetrics::default(); + assert_eq!(metrics.total_repairs, 0); + assert_eq!(metrics.successful_repairs, 0); + assert_eq!(metrics.failed_repairs, 0); + } + + #[test] + fn test_metrics_query_creation() { + let start_time = SystemTime::now(); + let end_time = start_time + Duration::from_secs(3600); + let query = MetricsQuery { + start_time, + end_time, + interval: Duration::from_secs(60), + metrics: vec![MetricType::System, MetricType::Scan], + severity_filter: Some(Severity::Critical), + limit: Some(100), + }; + + assert_eq!(query.metrics.len(), 2); + assert_eq!(query.interval, Duration::from_secs(60)); + assert_eq!(query.limit, Some(100)); + } +} \ No newline at end of file diff --git a/crates/ahm/src/metrics/reporter.rs b/crates/ahm/src/metrics/reporter.rs new file mode 100644 index 00000000..45aacc03 --- /dev/null +++ b/crates/ahm/src/metrics/reporter.rs @@ -0,0 +1,861 @@ +// Copyright 2024 RustFS Team + +use std::{ + collections::HashMap, + fmt, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, error, info, warn}; +use serde::{Serialize, Deserialize}; + +use crate::error::Result; + +use super::{ + AggregatedMetrics, MetricsQuery, MetricsSummary, SystemMetrics, +}; + +/// Configuration for the metrics reporter +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReporterConfig { + /// Whether to enable reporting + pub enabled: bool, + /// Report generation interval + pub report_interval: Duration, + 
/// Maximum number of reports to keep in memory + pub max_reports_in_memory: usize, + /// Alert thresholds + pub alert_thresholds: AlertThresholds, + /// Report output format + pub default_format: ReportFormat, + /// Whether to enable alerting + pub enable_alerts: bool, + /// Maximum number of alerts to keep in memory + pub max_alerts_in_memory: usize, + /// Report output directory + pub output_directory: Option, + /// Whether to enable HTTP reporting + pub enable_http_reporting: bool, + /// HTTP reporting endpoint + pub http_endpoint: Option, +} + +impl Default for ReporterConfig { + fn default() -> Self { + Self { + enabled: true, + report_interval: Duration::from_secs(60), // 1 minute + max_reports_in_memory: 1000, + alert_thresholds: AlertThresholds::default(), + default_format: ReportFormat::Json, + enable_alerts: true, + max_alerts_in_memory: 1000, + output_directory: None, + enable_http_reporting: false, + http_endpoint: None, + } + } +} + +/// Alert thresholds for metrics reporting +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AlertThresholds { + /// CPU usage threshold (percentage) + pub cpu_usage_threshold: f64, + /// Memory usage threshold (percentage) + pub memory_usage_threshold: f64, + /// Disk usage threshold (percentage) + pub disk_usage_threshold: f64, + /// System load threshold + pub system_load_threshold: f64, + /// Repair failure rate threshold (percentage) + pub repair_failure_rate_threshold: f64, + /// Health issues threshold (count) + pub health_issues_threshold: u64, +} + +impl Default for AlertThresholds { + fn default() -> Self { + Self { + cpu_usage_threshold: 80.0, + memory_usage_threshold: 85.0, + disk_usage_threshold: 90.0, + system_load_threshold: 5.0, + repair_failure_rate_threshold: 20.0, + health_issues_threshold: 10, + } + } +} + +/// Metrics reporter that generates and outputs metrics reports +pub struct Reporter { + config: ReporterConfig, + reports: Arc>>, + alerts: Arc>>, + last_report_time: Arc>, + 
report_count: Arc>, + alert_count: Arc>, +} + +impl Reporter { + /// Create a new metrics reporter + pub async fn new(config: ReporterConfig) -> Result { + Ok(Self { + config, + reports: Arc::new(RwLock::new(Vec::new())), + alerts: Arc::new(RwLock::new(Vec::new())), + last_report_time: Arc::new(RwLock::new(SystemTime::now())), + report_count: Arc::new(RwLock::new(0)), + alert_count: Arc::new(RwLock::new(0)), + }) + } + + /// Get the configuration + pub fn config(&self) -> &ReporterConfig { + &self.config + } + + /// Generate a metrics report + pub async fn generate_report(&self, metrics: &SystemMetrics) -> Result { + let start_time = SystemTime::now(); + + let report = MetricsReport { + timestamp: start_time, + metrics: metrics.clone(), + alerts: self.check_alerts(metrics).await?, + summary: self.generate_summary(metrics).await?, + format: self.config.default_format, + }; + + // Store report + { + let mut reports = self.reports.write().await; + reports.push(report.clone()); + + // Trim old reports if we exceed the limit + if reports.len() > self.config.max_reports_in_memory { + let excess = reports.len() - self.config.max_reports_in_memory; + reports.drain(0..excess); + } + } + + // Update statistics + { + let mut last_time = self.last_report_time.write().await; + *last_time = start_time; + + let mut count = self.report_count.write().await; + *count += 1; + } + + info!("Generated metrics report #{}", *self.report_count.read().await); + Ok(report) + } + + /// Generate a comprehensive report from aggregated metrics + pub async fn generate_comprehensive_report(&self, aggregated: &AggregatedMetrics) -> Result { + let start_time = SystemTime::now(); + + let report = ComprehensiveReport { + timestamp: start_time, + query: aggregated.query.clone(), + data_points: aggregated.data_points.len(), + summary: aggregated.summary.clone(), + alerts: self.check_aggregated_alerts(aggregated).await?, + trends: self.analyze_trends(aggregated).await?, + recommendations: 
self.generate_recommendations(aggregated).await?, + }; + + info!("Generated comprehensive report with {} data points", report.data_points); + Ok(report) + } + + /// Output a report in the specified format + pub async fn output_report(&self, report: &MetricsReport, format: ReportFormat) -> Result<()> { + match format { + ReportFormat::Console => self.output_to_console(report).await?, + ReportFormat::File => self.output_to_file(report).await?, + ReportFormat::Http => self.output_to_http(report).await?, + ReportFormat::Prometheus => self.output_prometheus(report).await?, + ReportFormat::Json => self.output_json(report).await?, + ReportFormat::Csv => self.output_csv(report).await?, + } + + Ok(()) + } + + /// Check for alerts based on metrics + async fn check_alerts(&self, metrics: &SystemMetrics) -> Result> { + let mut alerts = Vec::new(); + + // Check CPU usage + if metrics.cpu_usage > self.config.alert_thresholds.cpu_usage_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Warning, + category: AlertCategory::System, + message: format!("High CPU usage: {:.1}%", metrics.cpu_usage), + metric_value: metrics.cpu_usage, + threshold: self.config.alert_thresholds.cpu_usage_threshold, + }); + } + + // Check memory usage + if metrics.memory_usage > self.config.alert_thresholds.memory_usage_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Warning, + category: AlertCategory::System, + message: format!("High memory usage: {:.1}%", metrics.memory_usage), + metric_value: metrics.memory_usage, + threshold: self.config.alert_thresholds.memory_usage_threshold, + }); + } + + // Check disk usage + if metrics.disk_usage > self.config.alert_thresholds.disk_usage_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Critical, + category: AlertCategory::System, + message: format!("High disk usage: {:.1}%", metrics.disk_usage), + metric_value: metrics.disk_usage, + 
threshold: self.config.alert_thresholds.disk_usage_threshold, + }); + } + + // Check system load + if metrics.system_load > self.config.alert_thresholds.system_load_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Warning, + category: AlertCategory::System, + message: format!("High system load: {:.2}", metrics.system_load), + metric_value: metrics.system_load, + threshold: self.config.alert_thresholds.system_load_threshold, + }); + } + + // Check repair failure rate + if metrics.heal_metrics.total_repairs > 0 { + let failure_rate = (metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64) * 100.0; + if failure_rate > self.config.alert_thresholds.repair_failure_rate_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Critical, + category: AlertCategory::Heal, + message: format!("High repair failure rate: {:.1}%", failure_rate), + metric_value: failure_rate, + threshold: self.config.alert_thresholds.repair_failure_rate_threshold, + }); + } + } + + // Check health issues + let total_health_issues: u64 = metrics.health_issues.values().sum(); + if total_health_issues > self.config.alert_thresholds.health_issues_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Warning, + category: AlertCategory::Health, + message: format!("High number of health issues: {}", total_health_issues), + metric_value: total_health_issues as f64, + threshold: self.config.alert_thresholds.health_issues_threshold as f64, + }); + } + + // Store alerts + if !alerts.is_empty() { + let mut alert_store = self.alerts.write().await; + alert_store.extend(alerts.clone()); + + let mut count = self.alert_count.write().await; + *count += alerts.len() as u64; + } + + Ok(alerts) + } + + /// Check for alerts based on aggregated metrics + async fn check_aggregated_alerts(&self, aggregated: &AggregatedMetrics) -> Result> { + let mut alerts = Vec::new(); + + 
// Check summary statistics + if aggregated.summary.avg_cpu_usage > self.config.alert_thresholds.cpu_usage_threshold { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Warning, + category: AlertCategory::System, + message: format!("High average CPU usage: {:.1}%", aggregated.summary.avg_cpu_usage), + metric_value: aggregated.summary.avg_cpu_usage, + threshold: self.config.alert_thresholds.cpu_usage_threshold, + }); + } + + if aggregated.summary.repair_success_rate < (100.0 - self.config.alert_thresholds.repair_failure_rate_threshold) { + alerts.push(Alert { + timestamp: SystemTime::now(), + severity: AlertSeverity::Critical, + category: AlertCategory::Heal, + message: format!("Low repair success rate: {:.1}%", aggregated.summary.repair_success_rate * 100.0), + metric_value: aggregated.summary.repair_success_rate * 100.0, + threshold: 100.0 - self.config.alert_thresholds.repair_failure_rate_threshold, + }); + } + + Ok(alerts) + } + + /// Generate summary for metrics + async fn generate_summary(&self, metrics: &SystemMetrics) -> Result { + Ok(ReportSummary { + system_health: self.calculate_system_health(metrics), + performance_score: self.calculate_performance_score(metrics), + resource_utilization: self.calculate_resource_utilization(metrics), + operational_status: self.determine_operational_status(metrics), + key_metrics: self.extract_key_metrics(metrics), + }) + } + + /// Analyze trends in aggregated data + async fn analyze_trends(&self, aggregated: &AggregatedMetrics) -> Result> { + let mut trends = Vec::new(); + + if aggregated.data_points.len() < 2 { + return Ok(trends); + } + + // Analyze CPU usage trend + let cpu_values: Vec = aggregated + .data_points + .iter() + .filter_map(|p| p.system.as_ref().map(|s| s.cpu_usage)) + .collect(); + + if cpu_values.len() >= 2 { + let trend = self.calculate_trend(&cpu_values, "CPU Usage"); + trends.push(trend); + } + + // Analyze memory usage trend + let memory_values: Vec = aggregated + 
.data_points + .iter() + .filter_map(|p| p.system.as_ref().map(|s| s.memory_usage)) + .collect(); + + if memory_values.len() >= 2 { + let trend = self.calculate_trend(&memory_values, "Memory Usage"); + trends.push(trend); + } + + Ok(trends) + } + + /// Generate recommendations based on metrics + async fn generate_recommendations(&self, aggregated: &AggregatedMetrics) -> Result> { + let mut recommendations = Vec::new(); + + // Check for high resource usage + if aggregated.summary.avg_cpu_usage > 70.0 { + recommendations.push(Recommendation { + priority: RecommendationPriority::High, + category: RecommendationCategory::Performance, + title: "High CPU Usage".to_string(), + description: "Consider scaling up CPU resources or optimizing workload distribution".to_string(), + action: "Monitor CPU usage patterns and consider resource allocation adjustments".to_string(), + }); + } + + if aggregated.summary.avg_memory_usage > 80.0 { + recommendations.push(Recommendation { + priority: RecommendationPriority::High, + category: RecommendationCategory::Performance, + title: "High Memory Usage".to_string(), + description: "Memory usage is approaching critical levels".to_string(), + action: "Consider increasing memory allocation or optimizing memory usage".to_string(), + }); + } + + // Check for repair issues + if aggregated.summary.repair_success_rate < 0.8 { + recommendations.push(Recommendation { + priority: RecommendationPriority::Critical, + category: RecommendationCategory::Reliability, + title: "Low Repair Success Rate".to_string(), + description: "Data repair operations are failing frequently".to_string(), + action: "Investigate repair failures and check system health".to_string(), + }); + } + + Ok(recommendations) + } + + /// Calculate trend for a series of values + fn calculate_trend(&self, values: &[f64], metric_name: &str) -> TrendAnalysis { + if values.len() < 2 { + return TrendAnalysis { + metric_name: metric_name.to_string(), + trend_direction: 
TrendDirection::Stable, + change_rate: 0.0, + confidence: 0.0, + }; + } + + let first = values[0]; + let last = values[values.len() - 1]; + let change_rate = ((last - first) / first) * 100.0; + + let trend_direction = if change_rate > 5.0 { + TrendDirection::Increasing + } else if change_rate < -5.0 { + TrendDirection::Decreasing + } else { + TrendDirection::Stable + }; + + // Simple confidence calculation based on data points + let confidence = (values.len() as f64 / 10.0).min(1.0); + + TrendAnalysis { + metric_name: metric_name.to_string(), + trend_direction, + change_rate, + confidence, + } + } + + /// Calculate system health score + fn calculate_system_health(&self, metrics: &SystemMetrics) -> f64 { + let mut score = 100.0; + + // Deduct points for high resource usage + if metrics.cpu_usage > 80.0 { + score -= (metrics.cpu_usage - 80.0) * 0.5; + } + if metrics.memory_usage > 85.0 { + score -= (metrics.memory_usage - 85.0) * 0.5; + } + if metrics.disk_usage > 90.0 { + score -= (metrics.disk_usage - 90.0) * 1.0; + } + + // Deduct points for health issues + let total_health_issues: u64 = metrics.health_issues.values().sum(); + score -= total_health_issues as f64 * 5.0; + + // Deduct points for repair failures + if metrics.heal_metrics.total_repairs > 0 { + let failure_rate = metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64; + score -= failure_rate * 20.0; + } + + score.max(0.0) + } + + /// Calculate performance score + fn calculate_performance_score(&self, metrics: &SystemMetrics) -> f64 { + let mut score = 100.0; + + // Base score on resource efficiency + score -= metrics.cpu_usage * 0.3; + score -= metrics.memory_usage * 0.3; + score -= metrics.disk_usage * 0.2; + score -= metrics.system_load * 10.0; + + score.max(0.0) + } + + /// Calculate resource utilization + fn calculate_resource_utilization(&self, metrics: &SystemMetrics) -> f64 { + (metrics.cpu_usage + metrics.memory_usage + metrics.disk_usage) / 3.0 + } + + /// 
Determine operational status + fn determine_operational_status(&self, metrics: &SystemMetrics) -> OperationalStatus { + let health_score = self.calculate_system_health(metrics); + + if health_score >= 90.0 { + OperationalStatus::Excellent + } else if health_score >= 75.0 { + OperationalStatus::Good + } else if health_score >= 50.0 { + OperationalStatus::Fair + } else { + OperationalStatus::Poor + } + } + + /// Extract key metrics + fn extract_key_metrics(&self, metrics: &SystemMetrics) -> HashMap { + let mut key_metrics = HashMap::new(); + key_metrics.insert("cpu_usage".to_string(), metrics.cpu_usage); + key_metrics.insert("memory_usage".to_string(), metrics.memory_usage); + key_metrics.insert("disk_usage".to_string(), metrics.disk_usage); + key_metrics.insert("system_load".to_string(), metrics.system_load); + key_metrics.insert("active_operations".to_string(), metrics.active_operations as f64); + key_metrics.insert("objects_scanned".to_string(), metrics.scan_metrics.objects_scanned as f64); + key_metrics.insert("total_repairs".to_string(), metrics.heal_metrics.total_repairs as f64); + key_metrics.insert("successful_repairs".to_string(), metrics.heal_metrics.successful_repairs as f64); + + key_metrics + } + + /// Output methods (simulated) + async fn output_to_console(&self, report: &MetricsReport) -> Result<()> { + if self.config.enabled { + info!("=== Metrics Report ==="); + info!("Timestamp: {:?}", report.timestamp); + info!("System Health: {:.1}%", report.summary.system_health); + info!("Performance Score: {:.1}%", report.summary.performance_score); + info!("Operational Status: {:?}", report.summary.operational_status); + + if !report.alerts.is_empty() { + info!("=== Alerts ==="); + for alert in &report.alerts { + info!("[{}] {}: {}", alert.severity, alert.category, alert.message); + } + } + } + Ok(()) + } + + async fn output_to_file(&self, _report: &MetricsReport) -> Result<()> { + if self.config.enabled { + // In a real implementation, this would write to a 
file + debug!("Would write report to file: {}", self.config.output_directory.as_ref().unwrap_or(&String::new())); + } + Ok(()) + } + + async fn output_to_http(&self, _report: &MetricsReport) -> Result<()> { + if self.config.enable_http_reporting { + // In a real implementation, this would serve via HTTP + debug!("Would serve report via HTTP on endpoint: {}", self.config.http_endpoint.as_ref().unwrap_or(&String::new())); + } + Ok(()) + } + + async fn output_prometheus(&self, _report: &MetricsReport) -> Result<()> { + if self.config.enabled { + // In a real implementation, this would output Prometheus format + debug!("Would output Prometheus format"); + } + Ok(()) + } + + async fn output_json(&self, _report: &MetricsReport) -> Result<()> { + if self.config.enabled { + // In a real implementation, this would output JSON format + debug!("Would output JSON format"); + } + Ok(()) + } + + async fn output_csv(&self, _report: &MetricsReport) -> Result<()> { + if self.config.enabled { + // In a real implementation, this would output CSV format + debug!("Would output CSV format"); + } + Ok(()) + } + + /// Get reporting statistics + pub async fn get_statistics(&self) -> ReporterStatistics { + let report_count = *self.report_count.read().await; + let alert_count = *self.alert_count.read().await; + let last_report_time = *self.last_report_time.read().await; + let reports_count = self.reports.read().await.len(); + let alerts_count = self.alerts.read().await.len(); + + ReporterStatistics { + total_reports: report_count, + total_alerts: alert_count, + reports_in_memory: reports_count, + alerts_in_memory: alerts_count, + last_report_time, + config: self.config.clone(), + } + } + + /// Get recent alerts + pub async fn get_recent_alerts(&self, hours: u64) -> Result> { + let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600); + let alerts = self.alerts.read().await; + + let recent_alerts: Vec = alerts + .iter() + .filter(|alert| alert.timestamp >= cutoff_time) + 
.cloned() + .collect(); + + Ok(recent_alerts) + } + + /// Clear old alerts + pub async fn clear_old_alerts(&self, hours: u64) -> Result<()> { + let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600); + let mut alerts = self.alerts.write().await; + alerts.retain(|alert| alert.timestamp >= cutoff_time); + + info!("Cleared alerts older than {} hours", hours); + Ok(()) + } +} + +/// Metrics report +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsReport { + pub timestamp: SystemTime, + pub metrics: SystemMetrics, + pub alerts: Vec, + pub summary: ReportSummary, + pub format: ReportFormat, +} + +/// Comprehensive report +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ComprehensiveReport { + pub timestamp: SystemTime, + pub query: MetricsQuery, + pub data_points: usize, + pub summary: MetricsSummary, + pub alerts: Vec, + pub trends: Vec, + pub recommendations: Vec, +} + +/// Report summary +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReportSummary { + pub system_health: f64, + pub performance_score: f64, + pub resource_utilization: f64, + pub operational_status: OperationalStatus, + pub key_metrics: HashMap, +} + +/// Alert +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Alert { + pub timestamp: SystemTime, + pub severity: AlertSeverity, + pub category: AlertCategory, + pub message: String, + pub metric_value: f64, + pub threshold: f64, +} + +/// Alert severity +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum AlertSeverity { + Info, + Warning, + Critical, +} + +impl fmt::Display for AlertSeverity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + AlertSeverity::Info => write!(f, "INFO"), + AlertSeverity::Warning => write!(f, "WARNING"), + AlertSeverity::Critical => write!(f, "CRITICAL"), + } + } +} + +/// Alert category +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum AlertCategory { + System, + 
Performance, + Health, + Heal, + Security, +} + +impl fmt::Display for AlertCategory { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + AlertCategory::System => write!(f, "SYSTEM"), + AlertCategory::Performance => write!(f, "PERFORMANCE"), + AlertCategory::Health => write!(f, "HEALTH"), + AlertCategory::Heal => write!(f, "HEAL"), + AlertCategory::Security => write!(f, "SECURITY"), + } + } +} + +/// Report format +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ReportFormat { + Console, + File, + Http, + Prometheus, + Json, + Csv, +} + +/// Operational status +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum OperationalStatus { + Excellent, + Good, + Fair, + Poor, +} + +/// Trend analysis +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TrendAnalysis { + pub metric_name: String, + pub trend_direction: TrendDirection, + pub change_rate: f64, + pub confidence: f64, +} + +/// Trend direction +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum TrendDirection { + Increasing, + Decreasing, + Stable, +} + +/// Recommendation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Recommendation { + pub priority: RecommendationPriority, + pub category: RecommendationCategory, + pub title: String, + pub description: String, + pub action: String, +} + +/// Recommendation priority +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum RecommendationPriority { + Low, + Medium, + High, + Critical, +} + +/// Recommendation category +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum RecommendationCategory { + Performance, + Reliability, + Security, + Maintenance, +} + +/// Reporter statistics +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReporterStatistics { + pub total_reports: u64, + pub total_alerts: u64, + pub reports_in_memory: usize, + pub alerts_in_memory: usize, + pub 
last_report_time: SystemTime, + pub config: ReporterConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_reporter_creation() { + let config = ReporterConfig::default(); + let reporter = Reporter::new(config).await.unwrap(); + + assert_eq!(reporter.config().report_interval, Duration::from_secs(60)); + assert!(reporter.config().enabled); + } + + #[tokio::test] + async fn test_report_generation() { + let config = ReporterConfig::default(); + let reporter = Reporter::new(config).await.unwrap(); + + let metrics = SystemMetrics::default(); + let report = reporter.generate_report(&metrics).await.unwrap(); + + assert_eq!(report.metrics.cpu_usage, 0.0); + assert_eq!(report.alerts.len(), 0); + } + + #[tokio::test] + async fn test_alert_generation() { + let config = ReporterConfig { + alert_thresholds: AlertThresholds { + cpu_usage_threshold: 50.0, + ..Default::default() + }, + ..Default::default() + }; + let reporter = Reporter::new(config).await.unwrap(); + + let mut metrics = SystemMetrics::default(); + metrics.cpu_usage = 75.0; // Above threshold + + let report = reporter.generate_report(&metrics).await.unwrap(); + assert!(!report.alerts.is_empty()); + assert_eq!(report.alerts[0].severity, AlertSeverity::Warning); + } + + #[tokio::test] + async fn test_comprehensive_report() { + let config = ReporterConfig::default(); + let reporter = Reporter::new(config).await.unwrap(); + + let aggregated = AggregatedMetrics { + query: MetricsQuery { + start_time: SystemTime::now(), + end_time: SystemTime::now() + Duration::from_secs(3600), + interval: Duration::from_secs(60), + metrics: vec![], + severity_filter: None, + limit: None, + }, + data_points: vec![], + summary: MetricsSummary::default(), + }; + + let report = reporter.generate_comprehensive_report(&aggregated).await.unwrap(); + assert_eq!(report.data_points, 0); + assert!(report.recommendations.is_empty()); + } + + #[tokio::test] + async fn test_reporter_statistics() { + let config = 
ReporterConfig::default(); + let reporter = Reporter::new(config).await.unwrap(); + + let stats = reporter.get_statistics().await; + assert_eq!(stats.total_reports, 0); + assert_eq!(stats.total_alerts, 0); + } + + #[tokio::test] + async fn test_alert_clearing() { + let config = ReporterConfig::default(); + let reporter = Reporter::new(config).await.unwrap(); + + // Generate some alerts + let mut metrics = SystemMetrics::default(); + metrics.cpu_usage = 90.0; // Above threshold + + let _report = reporter.generate_report(&metrics).await.unwrap(); + + // Clear old alerts + reporter.clear_old_alerts(1).await.unwrap(); + + let stats = reporter.get_statistics().await; + assert_eq!(stats.alerts_in_memory, 0); + } +} \ No newline at end of file diff --git a/crates/ahm/src/metrics/storage.rs b/crates/ahm/src/metrics/storage.rs new file mode 100644 index 00000000..66520ff7 --- /dev/null +++ b/crates/ahm/src/metrics/storage.rs @@ -0,0 +1,573 @@ +// Copyright 2024 RustFS Team + +use std::{ + collections::HashMap, + sync::Arc, + path::PathBuf, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, error, info, warn}; + +use crate::error::Result; + +use super::{ + AggregatedMetrics, MetricsDataPoint, MetricsQuery, MetricsSummary, SystemMetrics, +}; + +/// Configuration for metrics storage +#[derive(Debug, Clone)] +pub struct StorageConfig { + /// Storage directory path + pub storage_path: PathBuf, + /// Maximum file size for metrics files + pub max_file_size: u64, + /// Compression enabled + pub compression_enabled: bool, + /// Retention period for metrics data + pub retention_period: Duration, + /// Batch size for writes + pub batch_size: usize, + /// Flush interval + pub flush_interval: Duration, + /// Whether to enable data validation + pub enable_validation: bool, + /// Whether to enable data encryption + pub enable_encryption: bool, + /// Encryption key (if enabled) + pub encryption_key: Option, +} + +impl Default for StorageConfig { 
+ fn default() -> Self { + Self { + storage_path: PathBuf::from("/tmp/rustfs/metrics"), + max_file_size: 100 * 1024 * 1024, // 100 MB + compression_enabled: true, + retention_period: Duration::from_secs(86400 * 30), // 30 days + batch_size: 1000, + flush_interval: Duration::from_secs(60), // 1 minute + enable_validation: true, + enable_encryption: false, + encryption_key: None, + } + } +} + +/// Metrics storage that persists metrics data to disk +pub struct Storage { + config: StorageConfig, + metrics_buffer: Arc>>, + aggregated_buffer: Arc>>, + file_handles: Arc>>, + last_flush_time: Arc>, + total_writes: Arc>, + total_reads: Arc>, +} + +impl Storage { + /// Create a new metrics storage + pub async fn new(config: StorageConfig) -> Result { + // Create storage directory if it doesn't exist + tokio::fs::create_dir_all(&config.storage_path).await?; + + Ok(Self { + config, + metrics_buffer: Arc::new(RwLock::new(Vec::new())), + aggregated_buffer: Arc::new(RwLock::new(Vec::new())), + file_handles: Arc::new(RwLock::new(HashMap::new())), + last_flush_time: Arc::new(RwLock::new(SystemTime::now())), + total_writes: Arc::new(RwLock::new(0)), + total_reads: Arc::new(RwLock::new(0)), + }) + } + + /// Get the configuration + pub fn config(&self) -> &StorageConfig { + &self.config + } + + /// Store system metrics + pub async fn store_metrics(&self, metrics: SystemMetrics) -> Result<()> { + let mut buffer = self.metrics_buffer.write().await; + buffer.push(metrics); + + // Flush if buffer is full + if buffer.len() >= self.config.batch_size { + self.flush_metrics_buffer().await?; + } + + // Update write count + { + let mut writes = self.total_writes.write().await; + *writes += 1; + } + + Ok(()) + } + + /// Store aggregated metrics + pub async fn store_aggregated_metrics(&self, aggregated: AggregatedMetrics) -> Result<()> { + let mut buffer = self.aggregated_buffer.write().await; + buffer.push(aggregated); + + // Flush if buffer is full + if buffer.len() >= self.config.batch_size { 
+ self.flush_aggregated_buffer().await?; + } + + Ok(()) + } + + /// Retrieve metrics for a time range + pub async fn retrieve_metrics(&self, query: &MetricsQuery) -> Result> { + let start_time = Instant::now(); + + // Update read count + { + let mut reads = self.total_reads.write().await; + *reads += 1; + } + + // In a real implementation, this would read from disk files + // For now, we'll return data from the buffer + let buffer = self.metrics_buffer.read().await; + let filtered_metrics: Vec = buffer + .iter() + .filter(|m| m.timestamp >= query.start_time && m.timestamp <= query.end_time) + .cloned() + .collect(); + + let retrieval_time = start_time.elapsed(); + debug!("Metrics retrieval completed in {:?}", retrieval_time); + + Ok(filtered_metrics) + } + + /// Retrieve aggregated metrics + pub async fn retrieve_aggregated_metrics(&self, query: &MetricsQuery) -> Result> { + let buffer = self.aggregated_buffer.read().await; + let filtered_metrics: Vec = buffer + .iter() + .filter(|m| { + if let Some(first_point) = m.data_points.first() { + first_point.timestamp >= query.start_time + } else { + false + } + }) + .filter(|m| { + if let Some(last_point) = m.data_points.last() { + last_point.timestamp <= query.end_time + } else { + false + } + }) + .cloned() + .collect(); + + Ok(filtered_metrics) + } + + /// Flush metrics buffer to disk + async fn flush_metrics_buffer(&self) -> Result<()> { + let mut buffer = self.metrics_buffer.write().await; + if buffer.is_empty() { + return Ok(()); + } + + let metrics_to_write = buffer.drain(..).collect::>(); + drop(buffer); // Release lock + + // Write to file + self.write_metrics_to_file(&metrics_to_write).await?; + + // Update flush time + { + let mut last_flush = self.last_flush_time.write().await; + *last_flush = SystemTime::now(); + } + + info!("Flushed {} metrics to disk", metrics_to_write.len()); + Ok(()) + } + + /// Flush aggregated buffer to disk + async fn flush_aggregated_buffer(&self) -> Result<()> { + let mut buffer = 
self.aggregated_buffer.write().await;
        if buffer.is_empty() {
            return Ok(());
        }

        let aggregated_to_write = buffer.drain(..).collect::<Vec<_>>();
        drop(buffer); // Release lock

        // Write to file
        self.write_aggregated_to_file(&aggregated_to_write).await?;

        info!("Flushed {} aggregated metrics to disk", aggregated_to_write.len());
        Ok(())
    }

    /// Write metrics to file (simulated — see debug log)
    async fn write_metrics_to_file(&self, metrics: &[SystemMetrics]) -> Result<()> {
        let filename = format!(
            "metrics_{}.json",
            SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()
        );
        let filepath = self.config.storage_path.join(filename);

        // In a real implementation, this would write to a file
        // For now, we'll just simulate the write
        debug!("Would write {} metrics to {}", metrics.len(), filepath.display());

        Ok(())
    }

    /// Write aggregated metrics to file (simulated — see debug log)
    async fn write_aggregated_to_file(&self, aggregated: &[AggregatedMetrics]) -> Result<()> {
        let filename = format!(
            "aggregated_{}.json",
            SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()
        );
        let filepath = self.config.storage_path.join(filename);

        // In a real implementation, this would write to a file
        // For now, we'll just simulate the write
        debug!("Would write {} aggregated metrics to {}", aggregated.len(), filepath.display());

        Ok(())
    }

    /// Force flush all buffers
    pub async fn force_flush(&self) -> Result<()> {
        self.flush_metrics_buffer().await?;
        self.flush_aggregated_buffer().await?;

        info!("Force flush completed");
        Ok(())
    }

    /// Clean up old data based on retention policy.
    ///
    /// FIX: `SystemTime::now() - retention_period` panics when the
    /// subtraction underflows the platform's representable range (documented
    /// behavior of `Sub<Duration> for SystemTime`). `checked_sub` degrades
    /// gracefully to `UNIX_EPOCH` — i.e. "retain everything" — instead of
    /// aborting the task for a pathological retention period.
    pub async fn cleanup_old_data(&self) -> Result<()> {
        let cutoff_time = SystemTime::now()
            .checked_sub(self.config.retention_period)
            .unwrap_or(SystemTime::UNIX_EPOCH);

        // Clean up metrics buffer
        {
            let mut buffer = self.metrics_buffer.write().await;
            buffer.retain(|m| m.timestamp >= cutoff_time);
        }

        // Clean up aggregated buffer
        {
            let mut buffer = self.aggregated_buffer.write().await;
            buffer.retain(|m| {
                if let Some(first_point) = m.data_points.first() {
                    first_point.timestamp >= cutoff_time
                } else {
                    false
                }
            });
        }

        // In a real implementation, this would also clean up old files
        info!("Cleanup completed, removed data older than {:?}", cutoff_time);
        Ok(())
    }

    /// Get storage statistics
    pub async fn get_statistics(&self) -> StorageStatistics {
        let metrics_count = self.metrics_buffer.read().await.len();
        let aggregated_count = self.aggregated_buffer.read().await.len();
        let total_writes = *self.total_writes.read().await;
        let total_reads = *self.total_reads.read().await;
        let last_flush_time = *self.last_flush_time.read().await;

        StorageStatistics {
            metrics_in_buffer: metrics_count,
            aggregated_in_buffer: aggregated_count,
            total_writes,
            total_reads,
            last_flush_time,
            config: self.config.clone(),
        }
    }

    /// Validate stored data integrity. Returns a trivially-valid report when
    /// validation is disabled in the configuration.
    pub async fn validate_data_integrity(&self) -> Result<DataIntegrityReport> {
        if !self.config.enable_validation {
            return Ok(DataIntegrityReport {
                is_valid: true,
                total_records: 0,
                corrupted_records: 0,
                validation_time: Duration::ZERO,
                errors: Vec::new(),
            });
        }

        let start_time = Instant::now();
        let mut errors = Vec::new();
        let mut corrupted_records = 0;

        // Validate metrics buffer
        {
            let buffer = self.metrics_buffer.read().await;
            for (i, metric) in buffer.iter().enumerate() {
                if !self.validate_metric(metric) {
                    errors.push(format!("Invalid metric at index {}: {:?}", i, metric));
                    corrupted_records += 1;
                }
            }
        }

        // Validate aggregated buffer
        {
            let buffer = self.aggregated_buffer.read().await;
            for (i, aggregated) in buffer.iter().enumerate() {
                if !self.validate_aggregated(aggregated) {
                    errors.push(format!("Invalid aggregated metrics at index {}: {:?}", i, aggregated));
                    corrupted_records += 1;
                }
            }
        }

        let validation_time = start_time.elapsed();
        let total_records = {
            let metrics_count = 
self.metrics_buffer.read().await.len(); + let aggregated_count = self.aggregated_buffer.read().await.len(); + metrics_count + aggregated_count + }; + + let is_valid = corrupted_records == 0; + + Ok(DataIntegrityReport { + is_valid, + total_records, + corrupted_records, + validation_time, + errors, + }) + } + + /// Validate a single metric + fn validate_metric(&self, metric: &SystemMetrics) -> bool { + // Basic validation checks + metric.cpu_usage >= 0.0 && metric.cpu_usage <= 100.0 + && metric.memory_usage >= 0.0 && metric.memory_usage <= 100.0 + && metric.disk_usage >= 0.0 && metric.disk_usage <= 100.0 + && metric.system_load >= 0.0 + } + + /// Validate aggregated metrics + fn validate_aggregated(&self, aggregated: &AggregatedMetrics) -> bool { + // Basic validation checks + !aggregated.data_points.is_empty() + && aggregated.query.start_time <= aggregated.query.end_time + && aggregated.summary.total_points > 0 + } + + /// Backup metrics data + pub async fn backup_data(&self, backup_path: &PathBuf) -> Result { + let start_time = Instant::now(); + + // Create backup directory + tokio::fs::create_dir_all(backup_path).await?; + + // In a real implementation, this would copy files to backup location + // For now, we'll just simulate the backup + let metrics_count = self.metrics_buffer.read().await.len(); + let aggregated_count = self.aggregated_buffer.read().await.len(); + + let backup_time = start_time.elapsed(); + + Ok(BackupReport { + backup_path: backup_path.clone(), + metrics_backed_up: metrics_count, + aggregated_backed_up: aggregated_count, + backup_time, + success: true, + }) + } + + /// Restore metrics data from backup + pub async fn restore_data(&self, backup_path: &PathBuf) -> Result { + let start_time = Instant::now(); + + // In a real implementation, this would restore from backup files + // For now, we'll just simulate the restore + debug!("Would restore data from {}", backup_path.display()); + + let restore_time = start_time.elapsed(); + + 
Ok(RestoreReport { + backup_path: backup_path.clone(), + metrics_restored: 0, + aggregated_restored: 0, + restore_time, + success: true, + }) + } +} + +/// Storage statistics +#[derive(Debug, Clone)] +pub struct StorageStatistics { + pub metrics_in_buffer: usize, + pub aggregated_in_buffer: usize, + pub total_writes: u64, + pub total_reads: u64, + pub last_flush_time: SystemTime, + pub config: StorageConfig, +} + +/// Data integrity validation report +#[derive(Debug, Clone)] +pub struct DataIntegrityReport { + pub is_valid: bool, + pub total_records: usize, + pub corrupted_records: usize, + pub validation_time: Duration, + pub errors: Vec, +} + +/// Backup report +#[derive(Debug, Clone)] +pub struct BackupReport { + pub backup_path: PathBuf, + pub metrics_backed_up: usize, + pub aggregated_backed_up: usize, + pub backup_time: Duration, + pub success: bool, +} + +/// Restore report +#[derive(Debug, Clone)] +pub struct RestoreReport { + pub backup_path: PathBuf, + pub metrics_restored: usize, + pub aggregated_restored: usize, + pub restore_time: Duration, + pub success: bool, +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Instant; + + #[tokio::test] + async fn test_storage_creation() { + let config = StorageConfig::default(); + let storage = Storage::new(config).await.unwrap(); + + assert_eq!(storage.config().batch_size, 1000); + assert!(storage.config().compression_enabled); + } + + #[tokio::test] + async fn test_metrics_storage() { + let config = StorageConfig::default(); + let storage = Storage::new(config).await.unwrap(); + + let metrics = SystemMetrics::default(); + storage.store_metrics(metrics).await.unwrap(); + + let stats = storage.get_statistics().await; + assert_eq!(stats.metrics_in_buffer, 1); + assert_eq!(stats.total_writes, 1); + } + + #[tokio::test] + async fn test_aggregated_storage() { + let config = StorageConfig::default(); + let storage = Storage::new(config).await.unwrap(); + + let aggregated = AggregatedMetrics { + query: 
MetricsQuery { + start_time: SystemTime::now(), + end_time: SystemTime::now() + Duration::from_secs(3600), + interval: Duration::from_secs(60), + metrics: vec![], + severity_filter: None, + limit: None, + }, + data_points: vec![], + summary: MetricsSummary::default(), + }; + + storage.store_aggregated_metrics(aggregated).await.unwrap(); + + let stats = storage.get_statistics().await; + assert_eq!(stats.aggregated_in_buffer, 1); + } + + #[tokio::test] + async fn test_metrics_retrieval() { + let config = StorageConfig::default(); + let storage = Storage::new(config).await.unwrap(); + + // Store some metrics + for i in 0..5 { + let mut metrics = SystemMetrics::default(); + metrics.timestamp = SystemTime::now() + Duration::from_secs(i * 60); + storage.store_metrics(metrics).await.unwrap(); + } + + let query = MetricsQuery { + start_time: SystemTime::now(), + end_time: SystemTime::now() + Duration::from_secs(300), + interval: Duration::from_secs(60), + metrics: vec![], + severity_filter: None, + limit: None, + }; + + let retrieved = storage.retrieve_metrics(&query).await.unwrap(); + assert_eq!(retrieved.len(), 5); + } + + #[tokio::test] + async fn test_data_integrity_validation() { + let config = StorageConfig { + enable_validation: true, + ..Default::default() + }; + let storage = Storage::new(config).await.unwrap(); + + let report = storage.validate_data_integrity().await.unwrap(); + assert!(report.is_valid); + assert_eq!(report.corrupted_records, 0); + } + + #[tokio::test] + async fn test_force_flush() { + let config = StorageConfig::default(); + let storage = Storage::new(config).await.unwrap(); + + // Add some data + storage.store_metrics(SystemMetrics::default()).await.unwrap(); + + // Force flush + storage.force_flush().await.unwrap(); + + let stats = storage.get_statistics().await; + assert_eq!(stats.metrics_in_buffer, 0); + } + + #[tokio::test] + async fn test_cleanup_old_data() { + let config = StorageConfig::default(); + let storage = 
Storage::new(config).await.unwrap(); + + // Add some old data + let mut old_metrics = SystemMetrics::default(); + old_metrics.timestamp = SystemTime::now() - Duration::from_secs(86400 * 31); // 31 days old + storage.store_metrics(old_metrics).await.unwrap(); + + // Add some recent data + let mut recent_metrics = SystemMetrics::default(); + recent_metrics.timestamp = SystemTime::now(); + storage.store_metrics(recent_metrics).await.unwrap(); + + // Cleanup + storage.cleanup_old_data().await.unwrap(); + + let stats = storage.get_statistics().await; + assert_eq!(stats.metrics_in_buffer, 1); // Only recent data should remain + } +} \ No newline at end of file diff --git a/crates/ahm/src/policy/heal_policy.rs b/crates/ahm/src/policy/heal_policy.rs new file mode 100644 index 00000000..8342f089 --- /dev/null +++ b/crates/ahm/src/policy/heal_policy.rs @@ -0,0 +1,508 @@ +// Copyright 2024 RustFS Team + +use std::time::{Duration, SystemTime}; + +use crate::scanner::{HealthIssue, Severity}; + +use super::{PolicyContext, PolicyResult, ResourceUsage}; + +/// Configuration for heal policies +#[derive(Debug, Clone)] +pub struct HealPolicyConfig { + /// Maximum number of concurrent repairs + pub max_concurrent_repairs: usize, + /// Maximum repair duration per operation + pub max_repair_duration: Duration, + /// Minimum interval between repairs + pub min_repair_interval: Duration, + /// Maximum system load threshold for healing + pub max_system_load: f64, + /// Minimum available disk space percentage for healing + pub min_disk_space: f64, + /// Maximum number of active operations for healing + pub max_active_operations: u64, + /// Whether to enable automatic healing + pub auto_heal_enabled: bool, + /// Priority-based healing configuration + pub priority_config: HealPriorityConfig, + /// Resource-based healing configuration + pub resource_config: HealResourceConfig, + /// Retry configuration + pub retry_config: HealRetryConfig, +} + +/// Priority-based healing configuration 
+#[derive(Debug, Clone)] +pub struct HealPriorityConfig { + /// Whether to enable priority-based healing + pub enabled: bool, + /// Critical issues heal immediately + pub critical_immediate: bool, + /// High priority issues heal within + pub high_timeout: Duration, + /// Medium priority issues heal within + pub medium_timeout: Duration, + /// Low priority issues heal within + pub low_timeout: Duration, +} + +/// Resource-based healing configuration +#[derive(Debug, Clone)] +pub struct HealResourceConfig { + /// Maximum CPU usage for healing + pub max_cpu_usage: f64, + /// Maximum memory usage for healing + pub max_memory_usage: f64, + /// Maximum disk I/O usage for healing + pub max_disk_io_usage: f64, + /// Maximum network I/O usage for healing + pub max_network_io_usage: f64, + /// Whether to enable resource-based throttling + pub enable_throttling: bool, +} + +/// Retry configuration for healing +#[derive(Debug, Clone)] +pub struct HealRetryConfig { + /// Maximum number of retry attempts + pub max_retry_attempts: u32, + /// Initial backoff delay + pub initial_backoff: Duration, + /// Maximum backoff delay + pub max_backoff: Duration, + /// Backoff multiplier + pub backoff_multiplier: f64, + /// Whether to use exponential backoff + pub exponential_backoff: bool, +} + +impl Default for HealPolicyConfig { + fn default() -> Self { + Self { + max_concurrent_repairs: 4, + max_repair_duration: Duration::from_secs(1800), // 30 minutes + min_repair_interval: Duration::from_secs(60), // 1 minute + max_system_load: 0.7, + min_disk_space: 15.0, // 15% minimum disk space + max_active_operations: 50, + auto_heal_enabled: true, + priority_config: HealPriorityConfig::default(), + resource_config: HealResourceConfig::default(), + retry_config: HealRetryConfig::default(), + } + } +} + +impl Default for HealPriorityConfig { + fn default() -> Self { + Self { + enabled: true, + critical_immediate: true, + high_timeout: Duration::from_secs(300), // 5 minutes + medium_timeout: 
Duration::from_secs(1800), // 30 minutes + low_timeout: Duration::from_secs(3600), // 1 hour + } + } +} + +impl Default for HealResourceConfig { + fn default() -> Self { + Self { + max_cpu_usage: 80.0, + max_memory_usage: 80.0, + max_disk_io_usage: 70.0, + max_network_io_usage: 70.0, + enable_throttling: true, + } + } +} + +impl Default for HealRetryConfig { + fn default() -> Self { + Self { + max_retry_attempts: 3, + initial_backoff: Duration::from_secs(30), + max_backoff: Duration::from_secs(300), + backoff_multiplier: 2.0, + exponential_backoff: true, + } + } +} + +/// Heal policy engine +pub struct HealPolicyEngine { + config: HealPolicyConfig, + last_repair_time: SystemTime, + repair_count: u64, + active_repairs: u64, +} + +impl HealPolicyEngine { + /// Create a new heal policy engine + pub fn new(config: HealPolicyConfig) -> Self { + Self { + config, + last_repair_time: SystemTime::now(), + repair_count: 0, + active_repairs: 0, + } + } + + /// Get the configuration + pub fn config(&self) -> &HealPolicyConfig { + &self.config + } + + /// Evaluate heal policy + pub async fn evaluate(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult { + let mut reasons = Vec::new(); + let mut allowed = true; + + // Check if auto-heal is enabled + if !self.config.auto_heal_enabled { + allowed = false; + reasons.push("Auto-heal is disabled".to_string()); + } + + // Check system load + if context.system_load > self.config.max_system_load { + allowed = false; + reasons.push(format!( + "System load too high: {:.2} > {:.2}", + context.system_load, self.config.max_system_load + )); + } + + // Check disk space + if context.disk_space_available < self.config.min_disk_space { + allowed = false; + reasons.push(format!( + "Disk space too low: {:.1}% < {:.1}%", + context.disk_space_available, self.config.min_disk_space + )); + } + + // Check active operations + if context.active_operations > self.config.max_active_operations { + allowed = false; + reasons.push(format!( + 
"Too many active operations: {} > {}", + context.active_operations, self.config.max_active_operations + )); + } + + // Check repair interval + let time_since_last_repair = context.current_time + .duration_since(self.last_repair_time) + .unwrap_or(Duration::ZERO); + + if time_since_last_repair < self.config.min_repair_interval { + allowed = false; + reasons.push(format!( + "Repair interval too short: {:?} < {:?}", + time_since_last_repair, self.config.min_repair_interval + )); + } + + // Check resource usage + if self.config.resource_config.enable_throttling { + if context.resource_usage.cpu_usage > self.config.resource_config.max_cpu_usage { + allowed = false; + reasons.push(format!( + "CPU usage too high: {:.1}% > {:.1}%", + context.resource_usage.cpu_usage, self.config.resource_config.max_cpu_usage + )); + } + + if context.resource_usage.memory_usage > self.config.resource_config.max_memory_usage { + allowed = false; + reasons.push(format!( + "Memory usage too high: {:.1}% > {:.1}%", + context.resource_usage.memory_usage, self.config.resource_config.max_memory_usage + )); + } + + if context.resource_usage.disk_io_usage > self.config.resource_config.max_disk_io_usage { + allowed = false; + reasons.push(format!( + "Disk I/O usage too high: {:.1}% > {:.1}%", + context.resource_usage.disk_io_usage, self.config.resource_config.max_disk_io_usage + )); + } + + if context.resource_usage.network_io_usage > self.config.resource_config.max_network_io_usage { + allowed = false; + reasons.push(format!( + "Network I/O usage too high: {:.1}% > {:.1}%", + context.resource_usage.network_io_usage, self.config.resource_config.max_network_io_usage + )); + } + } + + // Check priority-based policies + if self.config.priority_config.enabled { + match issue.severity { + Severity::Critical => { + if self.config.priority_config.critical_immediate { + // Critical issues should always be allowed unless resource constraints prevent it + if allowed { + reasons.clear(); + 
reasons.push("Critical issue - immediate repair allowed".to_string()); + } + } + } + Severity::High => { + // Check if we're within the high priority timeout + if time_since_last_repair > self.config.priority_config.high_timeout { + allowed = false; + reasons.push(format!( + "High priority issue timeout exceeded: {:?} > {:?}", + time_since_last_repair, self.config.priority_config.high_timeout + )); + } + } + Severity::Medium => { + // Check if we're within the medium priority timeout + if time_since_last_repair > self.config.priority_config.medium_timeout { + allowed = false; + reasons.push(format!( + "Medium priority issue timeout exceeded: {:?} > {:?}", + time_since_last_repair, self.config.priority_config.medium_timeout + )); + } + } + Severity::Low => { + // Check if we're within the low priority timeout + if time_since_last_repair > self.config.priority_config.low_timeout { + allowed = false; + reasons.push(format!( + "Low priority issue timeout exceeded: {:?} > {:?}", + time_since_last_repair, self.config.priority_config.low_timeout + )); + } + } + } + } + + let reason = if reasons.is_empty() { + "Heal allowed".to_string() + } else { + reasons.join("; ") + }; + + PolicyResult { + allowed, + reason, + metadata: Some(serde_json::json!({ + "repair_count": self.repair_count, + "active_repairs": self.active_repairs, + "time_since_last_repair": time_since_last_repair.as_secs(), + "issue_severity": format!("{:?}", issue.severity), + "issue_type": format!("{:?}", issue.issue_type), + "system_load": context.system_load, + "disk_space_available": context.disk_space_available, + "active_operations": context.active_operations, + })), + evaluated_at: context.current_time, + } + } + + /// Get repair timeout based on priority + pub fn get_repair_timeout(&self, severity: Severity) -> Duration { + if !self.config.priority_config.enabled { + return self.config.max_repair_duration; + } + + match severity { + Severity::Critical => Duration::from_secs(300), // 5 minutes for 
critical + Severity::High => self.config.priority_config.high_timeout, + Severity::Medium => self.config.priority_config.medium_timeout, + Severity::Low => self.config.priority_config.low_timeout, + } + } + + /// Get retry configuration + pub fn get_retry_config(&self) -> &HealRetryConfig { + &self.config.retry_config + } + + /// Update repair statistics + pub fn record_repair(&mut self) { + self.last_repair_time = SystemTime::now(); + self.repair_count += 1; + } + + /// Increment active repairs + pub fn increment_active_repairs(&mut self) { + self.active_repairs += 1; + } + + /// Decrement active repairs + pub fn decrement_active_repairs(&mut self) { + if self.active_repairs > 0 { + self.active_repairs -= 1; + } + } + + /// Get heal statistics + pub fn get_statistics(&self) -> HealPolicyStatistics { + HealPolicyStatistics { + total_repairs: self.repair_count, + active_repairs: self.active_repairs, + last_repair_time: self.last_repair_time, + config: self.config.clone(), + } + } +} + +/// Heal policy statistics +#[derive(Debug, Clone)] +pub struct HealPolicyStatistics { + pub total_repairs: u64, + pub active_repairs: u64, + pub last_repair_time: SystemTime, + pub config: HealPolicyConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::{HealthIssue, HealthIssueType, Severity}; + + #[tokio::test] + async fn test_heal_policy_creation() { + let config = HealPolicyConfig::default(); + let engine = HealPolicyEngine::new(config); + + assert_eq!(engine.config().max_concurrent_repairs, 4); + assert_eq!(engine.config().max_system_load, 0.7); + assert_eq!(engine.config().min_disk_space, 15.0); + } + + #[tokio::test] + async fn test_heal_policy_evaluation() { + let config = HealPolicyConfig::default(); + let engine = HealPolicyEngine::new(config); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Medium, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test 
issue".to_string(), + metadata: None, + }; + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&issue, &context).await; + assert!(result.allowed); + assert!(result.reason.contains("Heal allowed")); + } + + #[tokio::test] + async fn test_heal_policy_critical_immediate() { + let config = HealPolicyConfig::default(); + let engine = HealPolicyEngine::new(config); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&issue, &context).await; + assert!(result.allowed); + assert!(result.reason.contains("Critical issue - immediate repair allowed")); + } + + #[tokio::test] + async fn test_heal_policy_system_load_limit() { + let config = HealPolicyConfig::default(); + let engine = HealPolicyEngine::new(config); + + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Medium, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + let context = PolicyContext { + system_load: 0.8, // Above threshold + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&issue, &context).await; + assert!(!result.allowed); 
+ assert!(result.reason.contains("System load too high")); + } + + #[tokio::test] + async fn test_repair_timeouts() { + let config = HealPolicyConfig::default(); + let engine = HealPolicyEngine::new(config); + + assert_eq!( + engine.get_repair_timeout(Severity::Critical), + Duration::from_secs(300) + ); + assert_eq!( + engine.get_repair_timeout(Severity::High), + Duration::from_secs(300) + ); + assert_eq!( + engine.get_repair_timeout(Severity::Medium), + Duration::from_secs(1800) + ); + assert_eq!( + engine.get_repair_timeout(Severity::Low), + Duration::from_secs(3600) + ); + } + + #[tokio::test] + async fn test_heal_statistics() { + let config = HealPolicyConfig::default(); + let mut engine = HealPolicyEngine::new(config); + + assert_eq!(engine.get_statistics().total_repairs, 0); + assert_eq!(engine.get_statistics().active_repairs, 0); + + engine.record_repair(); + engine.increment_active_repairs(); + engine.increment_active_repairs(); + + let stats = engine.get_statistics(); + assert_eq!(stats.total_repairs, 1); + assert_eq!(stats.active_repairs, 2); + + engine.decrement_active_repairs(); + assert_eq!(engine.get_statistics().active_repairs, 1); + } +} \ No newline at end of file diff --git a/crates/ahm/src/policy/mod.rs b/crates/ahm/src/policy/mod.rs new file mode 100644 index 00000000..507113cc --- /dev/null +++ b/crates/ahm/src/policy/mod.rs @@ -0,0 +1,258 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Policy system for AHM operations +//! +//! Defines configurable policies for: +//! - Scanning behavior and frequency +//! - Healing priorities and strategies +//! - Data retention and lifecycle management + +pub mod scan_policy; +pub mod heal_policy; +pub mod retention_policy; + +pub use scan_policy::{ScanPolicyConfig, ScanPolicyEngine}; +pub use heal_policy::{HealPolicyConfig, HealPolicyEngine}; +pub use retention_policy::{RetentionPolicyConfig, RetentionPolicyEngine}; + +use std::time::{Duration, SystemTime}; +use serde::{Deserialize, Serialize}; + +use crate::scanner::{HealthIssue, Severity}; + +/// Policy evaluation result +#[derive(Debug, Clone)] +pub struct PolicyResult { + /// Whether the policy allows the action + pub allowed: bool, + /// Reason for the decision + pub reason: String, + /// Additional metadata + pub metadata: Option, + /// When the policy was evaluated + pub evaluated_at: SystemTime, +} + +/// Policy evaluation context +#[derive(Debug, Clone)] +pub struct PolicyContext { + /// Current system load + pub system_load: f64, + /// Available disk space percentage + pub disk_space_available: f64, + /// Number of active operations + pub active_operations: u64, + /// Current time + pub current_time: SystemTime, + /// Health issues count by severity + pub health_issues: std::collections::HashMap, + /// Resource usage metrics + pub resource_usage: ResourceUsage, +} + +/// Resource usage information +#[derive(Debug, Clone)] +pub struct ResourceUsage { + /// CPU usage percentage + pub cpu_usage: f64, + /// Memory usage percentage + pub memory_usage: f64, + /// Disk I/O usage percentage + pub disk_io_usage: f64, + /// Network I/O usage percentage + pub network_io_usage: f64, +} + +impl Default for ResourceUsage { + fn default() -> Self { + Self { + cpu_usage: 0.0, + memory_usage: 0.0, + disk_io_usage: 0.0, + network_io_usage: 0.0, + } + } +} + +/// Policy manager that coordinates all policies +pub struct PolicyManager { + scan_policy: ScanPolicyEngine, + 
heal_policy: HealPolicyEngine, + retention_policy: RetentionPolicyEngine, +} + +impl PolicyManager { + /// Create a new policy manager + pub fn new( + scan_config: ScanPolicyConfig, + heal_config: HealPolicyConfig, + retention_config: RetentionPolicyConfig, + ) -> Self { + Self { + scan_policy: ScanPolicyEngine::new(scan_config), + heal_policy: HealPolicyEngine::new(heal_config), + retention_policy: RetentionPolicyEngine::new(retention_config), + } + } + + /// Evaluate scan policy + pub async fn evaluate_scan_policy(&self, context: &PolicyContext) -> PolicyResult { + self.scan_policy.evaluate(context).await + } + + /// Evaluate heal policy + pub async fn evaluate_heal_policy(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult { + self.heal_policy.evaluate(issue, context).await + } + + /// Evaluate retention policy + pub async fn evaluate_retention_policy(&self, object_age: Duration, context: &PolicyContext) -> PolicyResult { + self.retention_policy.evaluate(object_age, context).await + } + + /// Get scan policy engine + pub fn scan_policy(&self) -> &ScanPolicyEngine { + &self.scan_policy + } + + /// Get heal policy engine + pub fn heal_policy(&self) -> &HealPolicyEngine { + &self.heal_policy + } + + /// Get retention policy engine + pub fn retention_policy(&self) -> &RetentionPolicyEngine { + &self.retention_policy + } + + /// Update scan policy configuration + pub async fn update_scan_policy(&mut self, config: ScanPolicyConfig) { + self.scan_policy = ScanPolicyEngine::new(config); + } + + /// Update heal policy configuration + pub async fn update_heal_policy(&mut self, config: HealPolicyConfig) { + self.heal_policy = HealPolicyEngine::new(config); + } + + /// Update retention policy configuration + pub async fn update_retention_policy(&mut self, config: RetentionPolicyConfig) { + self.retention_policy = RetentionPolicyEngine::new(config); + } + + /// List all policies + pub async fn list_policies(&self) -> crate::error::Result> { + // In a real 
implementation, this would return actual policy names + Ok(vec![ + "scan_policy".to_string(), + "heal_policy".to_string(), + "retention_policy".to_string(), + ]) + } + + /// Get a specific policy + pub async fn get_policy(&self, name: &str) -> crate::error::Result { + // In a real implementation, this would return the actual policy + Ok(format!("Policy configuration for: {}", name)) + } + + /// Get engine configuration + pub async fn get_config(&self) -> PolicyConfig { + PolicyConfig::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::{HealthIssue, HealthIssueType}; + + #[tokio::test] + async fn test_policy_manager_creation() { + let scan_config = ScanPolicyConfig::default(); + let heal_config = HealPolicyConfig::default(); + let retention_config = RetentionPolicyConfig::default(); + + let manager = PolicyManager::new(scan_config, heal_config, retention_config); + + // Test that all policy engines are available + assert!(manager.scan_policy().config().max_concurrent_scans > 0); + assert!(manager.heal_policy().config().max_concurrent_repairs > 0); + assert!(manager.retention_policy().config().default_retention_days > 0); + } + + #[tokio::test] + async fn test_policy_evaluation() { + let scan_config = ScanPolicyConfig::default(); + let heal_config = HealPolicyConfig::default(); + let retention_config = RetentionPolicyConfig::default(); + + let manager = PolicyManager::new(scan_config, heal_config, retention_config); + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + // Test scan policy evaluation + let scan_result = manager.evaluate_scan_policy(&context).await; + assert!(scan_result.allowed); + + // Test heal policy evaluation + let issue = HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: 
"test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + let heal_result = manager.evaluate_heal_policy(&issue, &context).await; + assert!(heal_result.allowed); + + // Test retention policy evaluation + let retention_result = manager.evaluate_retention_policy(Duration::from_secs(86400), &context).await; + assert!(retention_result.allowed); + } +} + +/// Master policy configuration +#[derive(Debug, Clone)] +pub struct PolicyConfig { + pub scan: ScanPolicyConfig, + pub heal: HealPolicyConfig, + pub retention: RetentionPolicyConfig, +} + +impl Default for PolicyConfig { + fn default() -> Self { + Self { + scan: ScanPolicyConfig::default(), + heal: HealPolicyConfig::default(), + retention: RetentionPolicyConfig::default(), + } + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct PolicyManagerConfig { + #[serde(default)] + pub default_scan_interval: Duration, +} \ No newline at end of file diff --git a/crates/ahm/src/policy/retention_policy.rs b/crates/ahm/src/policy/retention_policy.rs new file mode 100644 index 00000000..f54591fa --- /dev/null +++ b/crates/ahm/src/policy/retention_policy.rs @@ -0,0 +1,487 @@ +// Copyright 2024 RustFS Team + +use std::time::{Duration, SystemTime}; + +use super::{PolicyContext, PolicyResult, ResourceUsage}; + +/// Configuration for retention policies +#[derive(Debug, Clone)] +pub struct RetentionPolicyConfig { + /// Default retention period in days + pub default_retention_days: u32, + /// Whether to enable retention policies + pub enabled: bool, + /// Maximum system load threshold for retention operations + pub max_system_load: f64, + /// Minimum available disk space percentage for retention operations + pub min_disk_space: f64, + /// Maximum number of active operations for retention + pub max_active_operations: u64, + /// Retention rules by object type + pub retention_rules: Vec, + /// Whether to enable automatic cleanup + pub 
auto_cleanup_enabled: bool, + /// Cleanup interval + pub cleanup_interval: Duration, + /// Maximum objects to delete per cleanup cycle + pub max_objects_per_cleanup: u64, +} + +/// Retention rule for specific object types +#[derive(Debug, Clone)] +pub struct RetentionRule { + /// Object type pattern (e.g., "*.log", "temp/*") + pub pattern: String, + /// Retention period in days + pub retention_days: u32, + /// Whether this rule is enabled + pub enabled: bool, + /// Priority of this rule (higher = more important) + pub priority: u32, + /// Whether to apply this rule recursively + pub recursive: bool, +} + +impl Default for RetentionPolicyConfig { + fn default() -> Self { + Self { + default_retention_days: 30, + enabled: true, + max_system_load: 0.6, + min_disk_space: 20.0, // 20% minimum disk space + max_active_operations: 20, + retention_rules: vec![ + RetentionRule { + pattern: "*.log".to_string(), + retention_days: 7, + enabled: true, + priority: 1, + recursive: false, + }, + RetentionRule { + pattern: "temp/*".to_string(), + retention_days: 1, + enabled: true, + priority: 2, + recursive: true, + }, + RetentionRule { + pattern: "cache/*".to_string(), + retention_days: 3, + enabled: true, + priority: 3, + recursive: true, + }, + ], + auto_cleanup_enabled: true, + cleanup_interval: Duration::from_secs(3600), // 1 hour + max_objects_per_cleanup: 1000, + } + } +} + +/// Retention policy engine +pub struct RetentionPolicyEngine { + config: RetentionPolicyConfig, + last_cleanup_time: SystemTime, + cleanup_count: u64, + objects_deleted: u64, +} + +impl RetentionPolicyEngine { + /// Create a new retention policy engine + pub fn new(config: RetentionPolicyConfig) -> Self { + Self { + config, + last_cleanup_time: SystemTime::now(), + cleanup_count: 0, + objects_deleted: 0, + } + } + + /// Get the configuration + pub fn config(&self) -> &RetentionPolicyConfig { + &self.config + } + + /// Evaluate retention policy + pub async fn evaluate(&self, object_age: Duration, context: 
&PolicyContext) -> PolicyResult { + let mut reasons = Vec::new(); + let mut allowed = false; + + // Check if retention policies are enabled + if !self.config.enabled { + allowed = false; + reasons.push("Retention policies are disabled".to_string()); + } else { + // Check if object should be retained based on age + let retention_days = self.get_retention_days_for_object("default"); + let retention_duration = Duration::from_secs(retention_days as u64 * 24 * 3600); + + if object_age > retention_duration { + allowed = true; + reasons.push(format!( + "Object age exceeds retention period: {:?} > {:?}", + object_age, retention_duration + )); + } else { + allowed = false; + reasons.push(format!( + "Object within retention period: {:?} <= {:?}", + object_age, retention_duration + )); + } + } + + // Check system constraints + if context.system_load > self.config.max_system_load { + allowed = false; + reasons.push(format!( + "System load too high: {:.2} > {:.2}", + context.system_load, self.config.max_system_load + )); + } + + if context.disk_space_available < self.config.min_disk_space { + allowed = false; + reasons.push(format!( + "Disk space too low: {:.1}% < {:.1}%", + context.disk_space_available, self.config.min_disk_space + )); + } + + if context.active_operations > self.config.max_active_operations { + allowed = false; + reasons.push(format!( + "Too many active operations: {} > {}", + context.active_operations, self.config.max_active_operations + )); + } + + let reason = if reasons.is_empty() { + "Retention evaluation completed".to_string() + } else { + reasons.join("; ") + }; + + PolicyResult { + allowed, + reason, + metadata: Some(serde_json::json!({ + "object_age_seconds": object_age.as_secs(), + "cleanup_count": self.cleanup_count, + "objects_deleted": self.objects_deleted, + "system_load": context.system_load, + "disk_space_available": context.disk_space_available, + "active_operations": context.active_operations, + })), + evaluated_at: context.current_time, + } 
+    }
+
+    /// Decide whether an automatic cleanup cycle may run now.
+    ///
+    /// Cleanup is allowed only when auto-cleanup is enabled, the configured
+    /// interval has elapsed since the last recorded cleanup, and the system
+    /// load / free-disk constraints are satisfied.
+    pub async fn evaluate_cleanup(&self, context: &PolicyContext) -> PolicyResult {
+        let mut reasons = Vec::new();
+        let mut allowed = false;
+
+        // Check if auto-cleanup is enabled
+        if !self.config.auto_cleanup_enabled {
+            allowed = false;
+            reasons.push("Auto-cleanup is disabled".to_string());
+        } else {
+            // Check cleanup interval; a clock that moved backwards yields ZERO.
+            let time_since_last_cleanup = context.current_time
+                .duration_since(self.last_cleanup_time)
+                .unwrap_or(Duration::ZERO);
+
+            if time_since_last_cleanup >= self.config.cleanup_interval {
+                allowed = true;
+                reasons.push("Cleanup interval reached".to_string());
+            } else {
+                allowed = false;
+                reasons.push(format!(
+                    "Cleanup interval not reached: {:?} < {:?}",
+                    time_since_last_cleanup, self.config.cleanup_interval
+                ));
+            }
+        }
+
+        // System constraints can veto a cleanup that the interval allowed.
+        if context.system_load > self.config.max_system_load {
+            allowed = false;
+            reasons.push(format!(
+                "System load too high: {:.2} > {:.2}",
+                context.system_load, self.config.max_system_load
+            ));
+        }
+
+        if context.disk_space_available < self.config.min_disk_space {
+            allowed = false;
+            reasons.push(format!(
+                "Disk space too low: {:.1}% < {:.1}%",
+                context.disk_space_available, self.config.min_disk_space
+            ));
+        }
+
+        let reason = if reasons.is_empty() {
+            "Cleanup evaluation completed".to_string()
+        } else {
+            reasons.join("; ")
+        };
+
+        PolicyResult {
+            allowed,
+            reason,
+            metadata: Some(serde_json::json!({
+                "cleanup_count": self.cleanup_count,
+                "objects_deleted": self.objects_deleted,
+                "max_objects_per_cleanup": self.config.max_objects_per_cleanup,
+                "system_load": context.system_load,
+                "disk_space_available": context.disk_space_available,
+            })),
+            evaluated_at: context.current_time,
+        }
+    }
+
+    /// Resolve the retention period (in days) for a specific object path.
+    ///
+    /// The enabled rule with the highest priority whose pattern matches wins;
+    /// when nothing matches, `default_retention_days` applies.
+    /// NOTE(review): the comparison starts at 0 with a strict `>`, so a rule
+    /// with `priority == 0` can never be selected — confirm this is intended.
+    pub fn get_retention_days_for_object(&self, object_path: &str) -> u32 {
+        // Find the highest-priority matching rule; on a priority tie the
+        // earliest rule in the list wins (strict `>` keeps the first).
+        let mut best_rule: Option<&RetentionRule> = None;
+        let mut best_priority = 0;
+
+        for rule in &self.config.retention_rules {
+            // Skip disabled rules and non-matching patterns in one guard.
+            if rule.enabled && self.matches_pattern(object_path, &rule.pattern) && rule.priority > best_priority {
+                best_rule = Some(rule);
+                best_priority = rule.priority;
+            }
+        }
+
+        best_rule
+            .map(|rule| rule.retention_days)
+            .unwrap_or(self.config.default_retention_days)
+    }
+
+    /// Check if an object path matches a pattern.
+    ///
+    /// Supports exact matches and patterns containing exactly one `*`
+    /// wildcard ("pre*suf" matches paths starting with "pre" and ending with
+    /// "suf"). Patterns with two or more `*` never match.
+    fn matches_pattern(&self, object_path: &str, pattern: &str) -> bool {
+        if pattern.contains('*') {
+            // Wildcard matching: exactly one '*' splits into prefix/suffix.
+            let pattern_parts: Vec<&str> = pattern.split('*').collect();
+            if pattern_parts.len() == 2 {
+                let prefix = pattern_parts[0];
+                let suffix = pattern_parts[1];
+                object_path.starts_with(prefix) && object_path.ends_with(suffix)
+            } else {
+                false
+            }
+        } else {
+            // Exact match
+            object_path == pattern
+        }
+    }
+
+    /// Get all retention rules
+    pub fn get_retention_rules(&self) -> &[RetentionRule] {
+        &self.config.retention_rules
+    }
+
+    /// Add a new retention rule
+    pub fn add_retention_rule(&mut self, rule: RetentionRule) {
+        self.config.retention_rules.push(rule);
+    }
+
+    /// Remove every retention rule whose pattern equals `pattern` exactly.
+    /// Returns `true` if at least one rule was removed.
+    pub fn remove_retention_rule(&mut self, pattern: &str) -> bool {
+        let initial_len = self.config.retention_rules.len();
+        self.config.retention_rules.retain(|rule| rule.pattern != pattern);
+        self.config.retention_rules.len() < initial_len
+    }
+
+    /// Record a completed cleanup cycle in the engine statistics.
+    pub fn record_cleanup(&mut self, objects_deleted: u64) {
+        self.last_cleanup_time = SystemTime::now();
+        self.cleanup_count += 1;
+        self.objects_deleted += objects_deleted;
+    }
+
+    /// Get retention statistics
+    pub fn get_statistics(&self) -> RetentionPolicyStatistics {
+        RetentionPolicyStatistics {
+            total_cleanups: self.cleanup_count,
+            total_objects_deleted: self.objects_deleted,
+            last_cleanup_time: self.last_cleanup_time,
+            config: self.config.clone(),
+        }
+    }
+}
+
+/// Retention policy
statistics +#[derive(Debug, Clone)] +pub struct RetentionPolicyStatistics { + pub total_cleanups: u64, + pub total_objects_deleted: u64, + pub last_cleanup_time: SystemTime, + pub config: RetentionPolicyConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_retention_policy_creation() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + assert_eq!(engine.config().default_retention_days, 30); + assert_eq!(engine.config().max_system_load, 0.6); + assert_eq!(engine.config().min_disk_space, 20.0); + } + + #[tokio::test] + async fn test_retention_policy_evaluation() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + // Test object within retention period + let object_age = Duration::from_secs(7 * 24 * 3600); // 7 days + let result = engine.evaluate(object_age, &context).await; + assert!(!result.allowed); + assert!(result.reason.contains("Object within retention period")); + + // Test object exceeding retention period + let object_age = Duration::from_secs(40 * 24 * 3600); // 40 days + let result = engine.evaluate(object_age, &context).await; + assert!(result.allowed); + assert!(result.reason.contains("Object age exceeds retention period")); + } + + #[tokio::test] + async fn test_retention_policy_system_constraints() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.7, // Above threshold + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let 
object_age = Duration::from_secs(40 * 24 * 3600); // 40 days + let result = engine.evaluate(object_age, &context).await; + assert!(!result.allowed); + assert!(result.reason.contains("System load too high")); + } + + #[tokio::test] + async fn test_retention_rules() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + // Test default retention + assert_eq!(engine.get_retention_days_for_object("unknown.txt"), 30); + + // Test log file retention + assert_eq!(engine.get_retention_days_for_object("app.log"), 7); + + // Test temp file retention + assert_eq!(engine.get_retention_days_for_object("temp/file.txt"), 1); + + // Test cache file retention + assert_eq!(engine.get_retention_days_for_object("cache/data.bin"), 3); + } + + #[tokio::test] + async fn test_pattern_matching() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + // Test wildcard matching + assert!(engine.matches_pattern("app.log", "*.log")); + assert!(engine.matches_pattern("error.log", "*.log")); + assert!(!engine.matches_pattern("app.txt", "*.log")); + + // Test exact matching + assert!(engine.matches_pattern("temp/file.txt", "temp/file.txt")); + assert!(!engine.matches_pattern("temp/file.txt", "temp/other.txt")); + } + + #[tokio::test] + async fn test_cleanup_evaluation() { + let config = RetentionPolicyConfig::default(); + let engine = RetentionPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate_cleanup(&context).await; + // Should be allowed if enough time has passed since last cleanup + assert!(result.allowed || result.reason.contains("Cleanup interval not reached")); + } + + #[tokio::test] + async fn test_retention_statistics() { + let config = 
RetentionPolicyConfig::default(); + let mut engine = RetentionPolicyEngine::new(config); + + assert_eq!(engine.get_statistics().total_cleanups, 0); + assert_eq!(engine.get_statistics().total_objects_deleted, 0); + + engine.record_cleanup(50); + assert_eq!(engine.get_statistics().total_cleanups, 1); + assert_eq!(engine.get_statistics().total_objects_deleted, 50); + + engine.record_cleanup(30); + assert_eq!(engine.get_statistics().total_cleanups, 2); + assert_eq!(engine.get_statistics().total_objects_deleted, 80); + } + + #[tokio::test] + async fn test_retention_rule_management() { + let config = RetentionPolicyConfig::default(); + let mut engine = RetentionPolicyEngine::new(config); + + let initial_rules = engine.get_retention_rules().len(); + + // Add a new rule + let new_rule = RetentionRule { + pattern: "backup/*".to_string(), + retention_days: 90, + enabled: true, + priority: 4, + recursive: true, + }; + engine.add_retention_rule(new_rule); + + assert_eq!(engine.get_retention_rules().len(), initial_rules + 1); + + // Remove a rule + let removed = engine.remove_retention_rule("*.log"); + assert!(removed); + assert_eq!(engine.get_retention_rules().len(), initial_rules); + } +} \ No newline at end of file diff --git a/crates/ahm/src/policy/scan_policy.rs b/crates/ahm/src/policy/scan_policy.rs new file mode 100644 index 00000000..44e3fc14 --- /dev/null +++ b/crates/ahm/src/policy/scan_policy.rs @@ -0,0 +1,373 @@ +// Copyright 2024 RustFS Team + +use std::time::{Duration, SystemTime}; + +use crate::scanner::Severity; + +use super::{PolicyContext, PolicyResult, ResourceUsage}; + +/// Configuration for scan policies +#[derive(Debug, Clone)] +pub struct ScanPolicyConfig { + /// Maximum number of concurrent scans + pub max_concurrent_scans: usize, + /// Maximum scan duration per cycle + pub max_scan_duration: Duration, + /// Minimum interval between scans + pub min_scan_interval: Duration, + /// Maximum system load threshold for scanning + pub max_system_load: f64, + /// 
/// Configuration for scan policies.
#[derive(Debug, Clone)]
pub struct ScanPolicyConfig {
    /// Maximum number of concurrent scans
    pub max_concurrent_scans: usize,
    /// Maximum scan duration per cycle
    pub max_scan_duration: Duration,
    /// Minimum interval between scans
    pub min_scan_interval: Duration,
    /// Maximum system load threshold for scanning
    pub max_system_load: f64,
    /// Minimum available disk space percentage for scanning
    pub min_disk_space: f64,
    /// Maximum number of active operations for scanning
    pub max_active_operations: u64,
    /// Whether to enable deep scanning
    pub enable_deep_scan: bool,
    /// Deep scan interval (how often to perform deep scans)
    pub deep_scan_interval: Duration,
    /// Bandwidth limit for scanning in bytes per second; `None` disables limiting.
    // NOTE(review): the type parameter was lost in extraction (`Option,`);
    // `u64` matches the 100 MB/s default below — confirm against the original.
    pub bandwidth_limit: Option<u64>,
    /// Priority-based scanning configuration
    pub priority_config: ScanPriorityConfig,
}

/// Priority-based scanning configuration.
#[derive(Debug, Clone)]
pub struct ScanPriorityConfig {
    /// Whether to enable priority-based scanning
    pub enabled: bool,
    /// Critical issues scan interval
    pub critical_interval: Duration,
    /// High priority issues scan interval
    pub high_interval: Duration,
    /// Medium priority issues scan interval
    pub medium_interval: Duration,
    /// Low priority issues scan interval
    pub low_interval: Duration,
}

impl Default for ScanPolicyConfig {
    fn default() -> Self {
        Self {
            max_concurrent_scans: 4,
            max_scan_duration: Duration::from_secs(3600), // 1 hour
            min_scan_interval: Duration::from_secs(300),  // 5 minutes
            max_system_load: 0.8,
            min_disk_space: 10.0, // 10% minimum disk space
            max_active_operations: 100,
            enable_deep_scan: true,
            deep_scan_interval: Duration::from_secs(86400), // 24 hours
            bandwidth_limit: Some(100 * 1024 * 1024),       // 100 MB/s
            priority_config: ScanPriorityConfig::default(),
        }
    }
}

impl Default for ScanPriorityConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            critical_interval: Duration::from_secs(60), // 1 minute
            high_interval: Duration::from_secs(300),    // 5 minutes
            medium_interval: Duration::from_secs(1800), // 30 minutes
            low_interval: Duration::from_secs(3600),    // 1 hour
        }
    }
}

/// Scan policy engine: holds the policy configuration plus the bookkeeping
/// (last scan times, scan counter) used when evaluating whether a scan may run.
pub struct ScanPolicyEngine {
    config: ScanPolicyConfig,
    last_scan_time: SystemTime,
    last_deep_scan_time: SystemTime,
    scan_count: u64,
}
+ +impl ScanPolicyEngine { + /// Create a new scan policy engine + pub fn new(config: ScanPolicyConfig) -> Self { + Self { + config, + last_scan_time: SystemTime::now(), + last_deep_scan_time: SystemTime::now(), + scan_count: 0, + } + } + + /// Get the configuration + pub fn config(&self) -> &ScanPolicyConfig { + &self.config + } + + /// Evaluate scan policy + pub async fn evaluate(&self, context: &PolicyContext) -> PolicyResult { + let mut reasons = Vec::new(); + let mut allowed = true; + + // Check system load + if context.system_load > self.config.max_system_load { + allowed = false; + reasons.push(format!( + "System load too high: {:.2} > {:.2}", + context.system_load, self.config.max_system_load + )); + } + + // Check disk space + if context.disk_space_available < self.config.min_disk_space { + allowed = false; + reasons.push(format!( + "Disk space too low: {:.1}% < {:.1}%", + context.disk_space_available, self.config.min_disk_space + )); + } + + // Check active operations + if context.active_operations > self.config.max_active_operations { + allowed = false; + reasons.push(format!( + "Too many active operations: {} > {}", + context.active_operations, self.config.max_active_operations + )); + } + + // Check scan interval + let time_since_last_scan = context.current_time + .duration_since(self.last_scan_time) + .unwrap_or(Duration::ZERO); + + if time_since_last_scan < self.config.min_scan_interval { + allowed = false; + reasons.push(format!( + "Scan interval too short: {:?} < {:?}", + time_since_last_scan, self.config.min_scan_interval + )); + } + + // Check resource usage + if context.resource_usage.cpu_usage > 90.0 { + allowed = false; + reasons.push("CPU usage too high".to_string()); + } + + if context.resource_usage.memory_usage > 90.0 { + allowed = false; + reasons.push("Memory usage too high".to_string()); + } + + let reason = if reasons.is_empty() { + "Scan allowed".to_string() + } else { + reasons.join("; ") + }; + + PolicyResult { + allowed, + reason, 
+ metadata: Some(serde_json::json!({ + "scan_count": self.scan_count, + "time_since_last_scan": time_since_last_scan.as_secs(), + "system_load": context.system_load, + "disk_space_available": context.disk_space_available, + "active_operations": context.active_operations, + })), + evaluated_at: context.current_time, + } + } + + /// Evaluate deep scan policy + pub async fn evaluate_deep_scan(&self, context: &PolicyContext) -> PolicyResult { + let mut base_result = self.evaluate(context).await; + + if !base_result.allowed { + return base_result; + } + + // Check deep scan interval + let time_since_last_deep_scan = context.current_time + .duration_since(self.last_deep_scan_time) + .unwrap_or(Duration::ZERO); + + if time_since_last_deep_scan < self.config.deep_scan_interval { + base_result.allowed = false; + base_result.reason = format!( + "Deep scan interval too short: {:?} < {:?}", + time_since_last_deep_scan, self.config.deep_scan_interval + ); + } else { + base_result.reason = "Deep scan allowed".to_string(); + } + + // Add deep scan metadata + if let Some(ref mut metadata) = base_result.metadata { + if let Some(obj) = metadata.as_object_mut() { + obj.insert( + "time_since_last_deep_scan".to_string(), + serde_json::Value::Number(serde_json::Number::from(time_since_last_deep_scan.as_secs())), + ); + obj.insert( + "deep_scan_enabled".to_string(), + serde_json::Value::Bool(self.config.enable_deep_scan), + ); + } + } + + base_result + } + + /// Get scan interval based on priority + pub fn get_priority_interval(&self, severity: Severity) -> Duration { + if !self.config.priority_config.enabled { + return self.config.min_scan_interval; + } + + match severity { + Severity::Critical => self.config.priority_config.critical_interval, + Severity::High => self.config.priority_config.high_interval, + Severity::Medium => self.config.priority_config.medium_interval, + Severity::Low => self.config.priority_config.low_interval, + } + } + + /// Update scan statistics + pub fn 
record_scan(&mut self) { + self.last_scan_time = SystemTime::now(); + self.scan_count += 1; + } + + /// Update deep scan statistics + pub fn record_deep_scan(&mut self) { + self.last_deep_scan_time = SystemTime::now(); + } + + /// Get scan statistics + pub fn get_statistics(&self) -> ScanPolicyStatistics { + ScanPolicyStatistics { + total_scans: self.scan_count, + last_scan_time: self.last_scan_time, + last_deep_scan_time: self.last_deep_scan_time, + config: self.config.clone(), + } + } +} + +/// Scan policy statistics +#[derive(Debug, Clone)] +pub struct ScanPolicyStatistics { + pub total_scans: u64, + pub last_scan_time: SystemTime, + pub last_deep_scan_time: SystemTime, + pub config: ScanPolicyConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scanner::Severity; + + #[tokio::test] + async fn test_scan_policy_creation() { + let config = ScanPolicyConfig::default(); + let engine = ScanPolicyEngine::new(config); + + assert_eq!(engine.config().max_concurrent_scans, 4); + assert_eq!(engine.config().max_system_load, 0.8); + assert_eq!(engine.config().min_disk_space, 10.0); + } + + #[tokio::test] + async fn test_scan_policy_evaluation() { + let config = ScanPolicyConfig::default(); + let engine = ScanPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&context).await; + assert!(result.allowed); + assert!(result.reason.contains("Scan allowed")); + } + + #[tokio::test] + async fn test_scan_policy_system_load_limit() { + let config = ScanPolicyConfig::default(); + let engine = ScanPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.9, // Above threshold + disk_space_available: 80.0, + active_operations: 10, + current_time: SystemTime::now(), + health_issues: 
std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&context).await; + assert!(!result.allowed); + assert!(result.reason.contains("System load too high")); + } + + #[tokio::test] + async fn test_scan_policy_disk_space_limit() { + let config = ScanPolicyConfig::default(); + let engine = ScanPolicyEngine::new(config); + + let context = PolicyContext { + system_load: 0.5, + disk_space_available: 5.0, // Below threshold + active_operations: 10, + current_time: SystemTime::now(), + health_issues: std::collections::HashMap::new(), + resource_usage: ResourceUsage::default(), + }; + + let result = engine.evaluate(&context).await; + assert!(!result.allowed); + assert!(result.reason.contains("Disk space too low")); + } + + #[tokio::test] + async fn test_priority_intervals() { + let config = ScanPolicyConfig::default(); + let engine = ScanPolicyEngine::new(config); + + assert_eq!( + engine.get_priority_interval(Severity::Critical), + Duration::from_secs(60) + ); + assert_eq!( + engine.get_priority_interval(Severity::High), + Duration::from_secs(300) + ); + assert_eq!( + engine.get_priority_interval(Severity::Medium), + Duration::from_secs(1800) + ); + assert_eq!( + engine.get_priority_interval(Severity::Low), + Duration::from_secs(3600) + ); + } + + #[tokio::test] + async fn test_scan_statistics() { + let config = ScanPolicyConfig::default(); + let mut engine = ScanPolicyEngine::new(config); + + assert_eq!(engine.get_statistics().total_scans, 0); + + engine.record_scan(); + assert_eq!(engine.get_statistics().total_scans, 1); + + engine.record_deep_scan(); + let stats = engine.get_statistics(); + assert_eq!(stats.total_scans, 1); + assert!(stats.last_deep_scan_time > stats.last_scan_time); + } +} \ No newline at end of file diff --git a/crates/ahm/src/scanner/bandwidth_limiter.rs b/crates/ahm/src/scanner/bandwidth_limiter.rs new file mode 100644 index 00000000..cf0e6a8d --- /dev/null +++ 
b/crates/ahm/src/scanner/bandwidth_limiter.rs @@ -0,0 +1,353 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +use tokio::{ + sync::RwLock, + time::{sleep, sleep_until, Instant as TokioInstant}, +}; +use tracing::{debug, info, warn}; + +use crate::error::Result; + +/// Configuration for bandwidth limiting +#[derive(Debug, Clone)] +pub struct BandwidthConfig { + /// Maximum bytes per second + pub bytes_per_second: u64, + /// Maximum operations per second + pub operations_per_second: u64, + /// Burst allowance multiplier + pub burst_multiplier: f64, + /// Whether to enable adaptive throttling + pub adaptive_throttling: bool, + /// Minimum sleep duration between operations + pub min_sleep_duration: Duration, + /// Maximum sleep duration between operations + pub max_sleep_duration: Duration, +} + +impl Default for BandwidthConfig { + fn default() -> Self { + Self { + bytes_per_second: 100 * 1024 * 1024, // 100 MB/s + operations_per_second: 1000, // 1000 ops/s + burst_multiplier: 2.0, + adaptive_throttling: true, + min_sleep_duration: Duration::from_micros(100), + max_sleep_duration: Duration::from_millis(100), + } + } +} + +/// Bandwidth limiter for controlling scan I/O rates +pub struct BandwidthLimiter { + config: BandwidthConfig, + bytes_this_second: Arc, + operations_this_second: Arc, + last_reset: Arc>, + 
adaptive_sleep_duration: Arc>, + total_bytes_processed: Arc, + total_operations_processed: Arc, + start_time: Instant, +} + +impl BandwidthLimiter { + /// Create a new bandwidth limiter + pub fn new(config: BandwidthConfig) -> Self { + let adaptive_sleep = if config.adaptive_throttling { + config.min_sleep_duration + } else { + Duration::from_micros(1000) // 1ms default + }; + + Self { + config, + bytes_this_second: Arc::new(AtomicU64::new(0)), + operations_this_second: Arc::new(AtomicU64::new(0)), + last_reset: Arc::new(RwLock::new(Instant::now())), + adaptive_sleep_duration: Arc::new(RwLock::new(adaptive_sleep)), + total_bytes_processed: Arc::new(AtomicU64::new(0)), + total_operations_processed: Arc::new(AtomicU64::new(0)), + start_time: Instant::now(), + } + } + + /// Wait for bandwidth allowance before processing bytes + pub async fn wait_for_bytes(&self, bytes: u64) -> Result<()> { + if self.config.bytes_per_second == 0 { + return Ok(()); + } + + let mut total_wait_time = Duration::ZERO; + let mut remaining_bytes = bytes; + + while remaining_bytes > 0 { + // Reset counters if a second has passed + self.reset_counters_if_needed().await; + + let current_bytes = self.bytes_this_second.load(Ordering::Relaxed); + let burst_limit = (self.config.bytes_per_second as f64 * self.config.burst_multiplier) as u64; + + if current_bytes >= burst_limit { + // We're over the burst limit, wait + let wait_time = self.calculate_wait_time(current_bytes, self.config.bytes_per_second).await; + sleep(wait_time).await; + total_wait_time += wait_time; + continue; + } + + let bytes_to_process = std::cmp::min(remaining_bytes, burst_limit - current_bytes); + self.bytes_this_second.fetch_add(bytes_to_process, Ordering::Relaxed); + self.total_bytes_processed.fetch_add(bytes_to_process, Ordering::Relaxed); + remaining_bytes -= bytes_to_process; + + // Adaptive throttling + if self.config.adaptive_throttling { + self.update_adaptive_sleep(bytes_to_process).await; + } + } + + if 
total_wait_time > Duration::ZERO { + debug!("Bandwidth limiter waited {:?} for {} bytes", total_wait_time, bytes); + } + + Ok(()) + } + + /// Wait for bandwidth allowance before processing an operation + pub async fn wait_for_operation(&self) -> Result<()> { + if self.config.operations_per_second == 0 { + return Ok(()); + } + + // Reset counters if a second has passed + self.reset_counters_if_needed().await; + + let current_ops = self.operations_this_second.load(Ordering::Relaxed); + let burst_limit = (self.config.operations_per_second as f64 * self.config.burst_multiplier) as u64; + + if current_ops >= burst_limit { + // We're over the burst limit, wait + let wait_time = self.calculate_wait_time(current_ops, self.config.operations_per_second).await; + sleep(wait_time).await; + debug!("Bandwidth limiter waited {:?} for operation", wait_time); + } + + self.operations_this_second.fetch_add(1, Ordering::Relaxed); + self.total_operations_processed.fetch_add(1, Ordering::Relaxed); + + Ok(()) + } + + /// Wait for bandwidth allowance before processing both bytes and operations + pub async fn wait_for_bytes_and_operation(&self, bytes: u64) -> Result<()> { + self.wait_for_bytes(bytes).await?; + self.wait_for_operation().await?; + Ok(()) + } + + /// Reset counters if a second has passed + async fn reset_counters_if_needed(&self) { + let mut last_reset = self.last_reset.write().await; + let now = Instant::now(); + + if now.duration_since(*last_reset) >= Duration::from_secs(1) { + self.bytes_this_second.store(0, Ordering::Relaxed); + self.operations_this_second.store(0, Ordering::Relaxed); + *last_reset = now; + } + } + + /// Calculate wait time based on current usage and limit + async fn calculate_wait_time(&self, current: u64, limit: u64) -> Duration { + if current == 0 || limit == 0 { + return self.config.min_sleep_duration; + } + + let utilization = current as f64 / limit as f64; + let base_sleep = self.config.min_sleep_duration.as_micros() as f64; + let max_sleep = 
self.config.max_sleep_duration.as_micros() as f64; + + // Exponential backoff based on utilization + let sleep_micros = base_sleep * (utilization * utilization); + let sleep_micros = sleep_micros.min(max_sleep).max(base_sleep); + + Duration::from_micros(sleep_micros as u64) + } + + /// Update adaptive sleep duration based on recent activity + async fn update_adaptive_sleep(&self, bytes_processed: u64) { + let mut sleep_duration = self.adaptive_sleep_duration.write().await; + + // Simple adaptive algorithm: increase sleep if we're processing too much + let current_rate = bytes_processed as f64 / sleep_duration.as_secs_f64(); + let target_rate = self.config.bytes_per_second as f64; + + if current_rate > target_rate * 1.1 { + // We're going too fast, increase sleep + *sleep_duration = Duration::from_micros( + (sleep_duration.as_micros() as f64 * 1.1) as u64 + ).min(self.config.max_sleep_duration); + } else if current_rate < target_rate * 0.9 { + // We're going too slow, decrease sleep + *sleep_duration = Duration::from_micros( + (sleep_duration.as_micros() as f64 * 0.9) as u64 + ).max(self.config.min_sleep_duration); + } + } + + /// Get current bandwidth statistics + pub async fn statistics(&self) -> BandwidthStatistics { + let elapsed = self.start_time.elapsed(); + let total_bytes = self.total_bytes_processed.load(Ordering::Relaxed); + let total_ops = self.total_operations_processed.load(Ordering::Relaxed); + let current_bytes = self.bytes_this_second.load(Ordering::Relaxed); + let current_ops = self.operations_this_second.load(Ordering::Relaxed); + let adaptive_sleep = *self.adaptive_sleep_duration.read().await; + + BandwidthStatistics { + total_bytes_processed: total_bytes, + total_operations_processed: total_ops, + current_bytes_per_second: current_bytes, + current_operations_per_second: current_ops, + average_bytes_per_second: if elapsed.as_secs() > 0 { + total_bytes / elapsed.as_secs() + } else { + 0 + }, + average_operations_per_second: if elapsed.as_secs() > 0 
{ + total_ops / elapsed.as_secs() + } else { + 0 + }, + adaptive_sleep_duration: adaptive_sleep, + uptime: elapsed, + } + } + + /// Reset all statistics + pub async fn reset_statistics(&self) { + self.total_bytes_processed.store(0, Ordering::Relaxed); + self.total_operations_processed.store(0, Ordering::Relaxed); + self.bytes_this_second.store(0, Ordering::Relaxed); + self.operations_this_second.store(0, Ordering::Relaxed); + *self.last_reset.write().await = Instant::now(); + *self.adaptive_sleep_duration.write().await = self.config.min_sleep_duration; + } + + /// Update configuration + pub async fn update_config(&self, new_config: BandwidthConfig) { + info!("Updating bandwidth limiter config: {:?}", new_config); + + // Reset adaptive sleep if adaptive throttling is disabled + if !new_config.adaptive_throttling { + *self.adaptive_sleep_duration.write().await = new_config.min_sleep_duration; + } + + // Note: We can't update the config struct itself since it's not wrapped in Arc + // In a real implementation, you might want to wrap the config in Arc as well + warn!("Config update not fully implemented - config struct is not mutable"); + } +} + +/// Statistics for bandwidth limiting +#[derive(Debug, Clone)] +pub struct BandwidthStatistics { + pub total_bytes_processed: u64, + pub total_operations_processed: u64, + pub current_bytes_per_second: u64, + pub current_operations_per_second: u64, + pub average_bytes_per_second: u64, + pub average_operations_per_second: u64, + pub adaptive_sleep_duration: Duration, + pub uptime: Duration, +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::time::Instant as TokioInstant; + + #[tokio::test] + async fn test_bandwidth_limiter_creation() { + let config = BandwidthConfig::default(); + let limiter = BandwidthLimiter::new(config); + let stats = limiter.statistics().await; + assert_eq!(stats.total_bytes_processed, 0); + assert_eq!(stats.total_operations_processed, 0); + } + + #[tokio::test] + async fn test_bytes_limiting() { 
+ let config = BandwidthConfig { + bytes_per_second: 1000, // 1KB/s + operations_per_second: 1000, + ..Default::default() + }; + let limiter = BandwidthLimiter::new(config); + + let start = TokioInstant::now(); + + // Process 500 bytes (should not be limited) + limiter.wait_for_bytes(500).await.unwrap(); + + // Process another 600 bytes (should be limited) + limiter.wait_for_bytes(600).await.unwrap(); + + let elapsed = start.elapsed(); + assert!(elapsed >= Duration::from_millis(100)); // Should take some time due to limiting + } + + #[tokio::test] + async fn test_operation_limiting() { + let config = BandwidthConfig { + bytes_per_second: 1000000, // 1MB/s + operations_per_second: 10, // 10 ops/s + ..Default::default() + }; + let limiter = BandwidthLimiter::new(config); + + let start = TokioInstant::now(); + + // Process 15 operations (should be limited) + for _ in 0..15 { + limiter.wait_for_operation().await.unwrap(); + } + + let elapsed = start.elapsed(); + assert!(elapsed >= Duration::from_millis(500)); // Should take some time due to limiting + } + + #[tokio::test] + async fn test_statistics() { + let config = BandwidthConfig::default(); + let limiter = BandwidthLimiter::new(config); + + limiter.wait_for_bytes(1000).await.unwrap(); + limiter.wait_for_operation().await.unwrap(); + + let stats = limiter.statistics().await; + assert_eq!(stats.total_bytes_processed, 1000); + assert_eq!(stats.total_operations_processed, 1); + assert!(stats.uptime > Duration::ZERO); + } +} \ No newline at end of file diff --git a/crates/ahm/src/scanner/disk_scanner.rs b/crates/ahm/src/scanner/disk_scanner.rs new file mode 100644 index 00000000..8d636107 --- /dev/null +++ b/crates/ahm/src/scanner/disk_scanner.rs @@ -0,0 +1,591 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{error, info}; +use anyhow; + +use crate::error::Result; +use super::{HealthIssue, HealthIssueType, Severity}; + +/// Configuration for disk scanning +#[derive(Debug, Clone)] +pub struct DiskScannerConfig { + /// Scan interval for disk health checks + pub scan_interval: Duration, + /// Minimum free space threshold (percentage) + pub min_free_space_percent: f64, + /// Maximum disk usage threshold (percentage) + pub max_disk_usage_percent: f64, + /// Minimum inode usage threshold (percentage) + pub min_inode_usage_percent: f64, + /// Maximum inode usage threshold (percentage) + pub max_inode_usage_percent: f64, + /// Whether to check disk I/O performance + pub check_io_performance: bool, + /// Whether to check disk temperature (if available) + pub check_temperature: bool, + /// Whether to check disk SMART status (if available) + pub check_smart_status: bool, + /// Timeout for individual disk operations + pub operation_timeout: Duration, + /// Maximum number of concurrent disk scans + pub max_concurrent_scans: usize, +} + +impl Default for DiskScannerConfig { + fn default() -> Self { + Self { + scan_interval: Duration::from_secs(300), // 5 minutes + min_free_space_percent: 10.0, // 10% minimum free space + max_disk_usage_percent: 90.0, // 90% maximum usage + min_inode_usage_percent: 5.0, // 5% minimum inode usage + max_inode_usage_percent: 95.0, // 95% maximum inode usage + check_io_performance: true, 
+ check_temperature: false, // Disabled by default + check_smart_status: false, // Disabled by default + operation_timeout: Duration::from_secs(30), + max_concurrent_scans: 4, + } + } +} + +/// Disk information and health status +#[derive(Debug, Clone)] +pub struct DiskInfo { + pub device_path: String, + pub mount_point: String, + pub filesystem_type: String, + pub total_space: u64, + pub used_space: u64, + pub free_space: u64, + pub available_space: u64, + pub usage_percent: f64, + pub inode_total: Option, + pub inode_used: Option, + pub inode_free: Option, + pub inode_usage_percent: Option, + pub last_scan_time: SystemTime, + pub health_status: DiskHealthStatus, + pub performance_metrics: Option, + pub temperature: Option, + pub smart_status: Option, +} + +/// Disk health status +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DiskHealthStatus { + Healthy, + Warning, + Critical, + Unknown, +} + +/// Disk performance metrics +#[derive(Debug, Clone)] +pub struct DiskPerformanceMetrics { + pub read_bytes_per_sec: f64, + pub write_bytes_per_sec: f64, + pub read_operations_per_sec: f64, + pub write_operations_per_sec: f64, + pub average_response_time_ms: f64, + pub queue_depth: f64, + pub utilization_percent: f64, + pub last_updated: SystemTime, +} + +/// SMART status information +#[derive(Debug, Clone)] +pub struct SmartStatus { + pub overall_health: SmartHealthStatus, + pub temperature: Option, + pub power_on_hours: Option, + pub reallocated_sectors: Option, + pub pending_sectors: Option, + pub uncorrectable_sectors: Option, + pub attributes: HashMap, +} + +/// SMART health status +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SmartHealthStatus { + Passed, + Failed, + Unknown, +} + +/// SMART attribute +#[derive(Debug, Clone)] +pub struct SmartAttribute { + pub name: String, + pub value: u64, + pub worst: u64, + pub threshold: u64, + pub status: SmartAttributeStatus, +} + +/// SMART attribute status +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum 
SmartAttributeStatus { + Good, + Warning, + Critical, + Unknown, +} + +/// Result of scanning a single disk +#[derive(Debug, Clone)] +pub struct DiskScanResult { + pub disk_info: DiskInfo, + pub health_issues: Vec, + pub scan_duration: Duration, + pub success: bool, + pub error_message: Option, +} + +/// Disk scanner for monitoring disk health and performance +pub struct DiskScanner { + config: DiskScannerConfig, + statistics: Arc>, + last_scan_results: Arc>>, +} + +/// Statistics for disk scanning +#[derive(Debug, Clone, Default)] +pub struct DiskScannerStatistics { + pub disks_scanned: u64, + pub disks_with_issues: u64, + pub total_issues_found: u64, + pub total_scan_time: Duration, + pub average_scan_time: Duration, + pub last_scan_time: Option, + pub scan_cycles_completed: u64, + pub scan_cycles_failed: u64, +} + +impl DiskScanner { + /// Create a new disk scanner + pub fn new(config: DiskScannerConfig) -> Self { + Self { + config, + statistics: Arc::new(RwLock::new(DiskScannerStatistics::default())), + last_scan_results: Arc::new(RwLock::new(HashMap::new())), + } + } + + /// Scan all mounted disks + pub async fn scan_all_disks(&self) -> Result> { + let scan_start = Instant::now(); + let mut results = Vec::new(); + + // Get list of mounted filesystems + let mount_points = self.get_mount_points().await?; + + info!("Starting disk scan for {} mount points", mount_points.len()); + + // Scan each mount point + for mount_point in mount_points { + match self.scan_disk(&mount_point).await { + Ok(result) => { + results.push(result.clone()); + + // Store result for later reference + let mut last_results = self.last_scan_results.write().await; + last_results.insert(mount_point.clone(), result); + } + Err(e) => { + error!("Failed to scan disk at {}: {}", mount_point, e); + + // Create error result + let error_result = DiskScanResult { + disk_info: DiskInfo { + device_path: "unknown".to_string(), + mount_point: mount_point.clone(), + filesystem_type: "unknown".to_string(), 
+ total_space: 0, + used_space: 0, + free_space: 0, + available_space: 0, + usage_percent: 0.0, + inode_total: None, + inode_used: None, + inode_free: None, + inode_usage_percent: None, + last_scan_time: SystemTime::now(), + health_status: DiskHealthStatus::Unknown, + performance_metrics: None, + temperature: None, + smart_status: None, + }, + health_issues: vec![HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: Severity::High, + bucket: "system".to_string(), + object: mount_point.clone(), + description: format!("Failed to scan disk: {}", e), + metadata: None, + }], + scan_duration: scan_start.elapsed(), + success: false, + error_message: Some(e.to_string()), + }; + + results.push(error_result); + } + } + } + + // Update statistics + self.update_statistics(|stats| { + stats.disks_scanned += results.len() as u64; + stats.disks_with_issues += results.iter().filter(|r| !r.health_issues.is_empty()).count() as u64; + stats.total_issues_found += results.iter().map(|r| r.health_issues.len() as u64).sum::(); + stats.total_scan_time += scan_start.elapsed(); + stats.average_scan_time = Duration::from_millis( + stats.total_scan_time.as_millis() as u64 / stats.disks_scanned.max(1) + ); + stats.last_scan_time = Some(SystemTime::now()); + stats.scan_cycles_completed += 1; + }).await; + + info!( + "Disk scan completed: {} disks, {} issues found in {:?}", + results.len(), + results.iter().map(|r| r.health_issues.len()).sum::(), + scan_start.elapsed() + ); + + Ok(results) + } + + /// Scan a single disk + pub async fn scan_disk(&self, mount_point: &str) -> Result { + let scan_start = Instant::now(); + let mut health_issues = Vec::new(); + + // Get disk space information + let disk_info = self.get_disk_info(mount_point).await?; + + // Check disk space usage + if disk_info.usage_percent > self.config.max_disk_usage_percent { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskFull, + severity: if disk_info.usage_percent > 95.0 { 
Severity::Critical } else { Severity::High }, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("Disk usage is {}%, exceeds threshold of {}%", + disk_info.usage_percent, self.config.max_disk_usage_percent), + metadata: None, + }); + } + + if disk_info.usage_percent < self.config.min_free_space_percent { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskFull, + severity: Severity::Medium, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("Free space is only {}%, below threshold of {}%", + 100.0 - disk_info.usage_percent, self.config.min_free_space_percent), + metadata: None, + }); + } + + // Check inode usage if available + if let Some(inode_usage) = disk_info.inode_usage_percent { + if inode_usage > self.config.max_inode_usage_percent { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskFull, + severity: if inode_usage > 95.0 { Severity::Critical } else { Severity::High }, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("Inode usage is {}%, exceeds threshold of {}%", + inode_usage, self.config.max_inode_usage_percent), + metadata: None, + }); + } + } + + // Check I/O performance if enabled + if self.config.check_io_performance { + if let Some(metrics) = &disk_info.performance_metrics { + if metrics.utilization_percent > 90.0 { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: Severity::Medium, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("High disk utilization: {}%", metrics.utilization_percent), + metadata: None, + }); + } + + if metrics.average_response_time_ms > 100.0 { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: Severity::Medium, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("High disk response time: {}ms", 
metrics.average_response_time_ms), + metadata: None, + }); + } + } + } + + // Check temperature if enabled + if self.config.check_temperature { + if let Some(temp) = disk_info.temperature { + if temp > 60.0 { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: if temp > 70.0 { Severity::Critical } else { Severity::High }, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: format!("High disk temperature: {}°C", temp), + metadata: None, + }); + } + } + } + + // Check SMART status if enabled + if self.config.check_smart_status { + if let Some(smart) = &disk_info.smart_status { + if smart.overall_health == SmartHealthStatus::Failed { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: Severity::Critical, + bucket: "system".to_string(), + object: mount_point.to_string(), + description: "SMART health check failed".to_string(), + metadata: None, + }); + } + } + } + + let scan_duration = scan_start.elapsed(); + let success = health_issues.is_empty(); + + Ok(DiskScanResult { + disk_info, + health_issues, + scan_duration, + success, + error_message: None, + }) + } + + /// Get list of mounted filesystems + async fn get_mount_points(&self) -> Result> { + // TODO: Implement actual mount point detection + // For now, return common mount points + Ok(vec![ + "/".to_string(), + "/data".to_string(), + "/var".to_string(), + ]) + } + + /// Get disk information for a mount point + async fn get_disk_info(&self, mount_point: &str) -> Result { + let path = Path::new(mount_point); + + // Get filesystem statistics using std::fs instead of nix for now + let _metadata = match std::fs::metadata(path) { + Ok(metadata) => metadata, + Err(e) => { + return Err(crate::error::Error::Other(anyhow::anyhow!("Failed to get filesystem stats: {}", e))); + } + }; + + // For now, use placeholder values since we can't easily get filesystem stats + let total_space = 1000000000; // 1GB placeholder + 
let free_space = 500000000; // 500MB placeholder + let available_space = 450000000; // 450MB placeholder + let used_space = total_space - free_space; + let usage_percent = (used_space as f64 / total_space as f64) * 100.0; + + // Get inode information (placeholder) + let inode_total = Some(1000000); + let inode_free = Some(500000); + let inode_used = Some(500000); + let inode_usage_percent = Some(50.0); + + // Get filesystem type + let filesystem_type = self.get_filesystem_type(mount_point).await.unwrap_or_else(|_| "unknown".to_string()); + + // Get device path + let device_path = self.get_device_path(mount_point).await.unwrap_or_else(|_| "unknown".to_string()); + + // Get performance metrics if enabled + let performance_metrics = if self.config.check_io_performance { + self.get_performance_metrics(&device_path).await.ok() + } else { + None + }; + + // Get temperature if enabled + let temperature = if self.config.check_temperature { + self.get_disk_temperature(&device_path).await.ok().flatten() + } else { + None + }; + + // Get SMART status if enabled + let smart_status = if self.config.check_smart_status { + self.get_smart_status(&device_path).await.ok().flatten() + } else { + None + }; + + // Determine health status (placeholder - will be set by scan_disk method) + let health_status = DiskHealthStatus::Healthy; + + Ok(DiskInfo { + device_path, + mount_point: mount_point.to_string(), + filesystem_type, + total_space, + used_space, + free_space, + available_space, + usage_percent, + inode_total, + inode_used, + inode_free, + inode_usage_percent, + last_scan_time: SystemTime::now(), + health_status, + performance_metrics, + temperature, + smart_status, + }) + } + + /// Get filesystem type for a mount point + async fn get_filesystem_type(&self, _mount_point: &str) -> Result { + // TODO: Implement filesystem type detection + // For now, return a placeholder + Ok("ext4".to_string()) + } + + /// Get device path for a mount point + async fn get_device_path(&self, 
_mount_point: &str) -> Result { + // TODO: Implement device path detection + // For now, return a placeholder + Ok("/dev/sda1".to_string()) + } + + /// Get disk performance metrics + async fn get_performance_metrics(&self, _device_path: &str) -> Result { + // TODO: Implement performance metrics collection + // For now, return placeholder metrics + Ok(DiskPerformanceMetrics { + read_bytes_per_sec: 1000000.0, // 1MB/s + write_bytes_per_sec: 500000.0, // 500KB/s + read_operations_per_sec: 100.0, + write_operations_per_sec: 50.0, + average_response_time_ms: 5.0, + queue_depth: 1.0, + utilization_percent: 10.0, + last_updated: SystemTime::now(), + }) + } + + /// Get disk temperature + async fn get_disk_temperature(&self, _device_path: &str) -> Result> { + // TODO: Implement temperature monitoring + // For now, return None (temperature not available) + Ok(None) + } + + /// Get SMART status + async fn get_smart_status(&self, _device_path: &str) -> Result> { + // TODO: Implement SMART status checking + // For now, return None (SMART not available) + Ok(None) + } + + /// Update scanner statistics + async fn update_statistics(&self, update_fn: F) + where + F: FnOnce(&mut DiskScannerStatistics), + { + let mut stats = self.statistics.write().await; + update_fn(&mut stats); + } + + /// Get current statistics + pub async fn statistics(&self) -> DiskScannerStatistics { + self.statistics.read().await.clone() + } + + /// Get last scan results + pub async fn last_scan_results(&self) -> HashMap { + self.last_scan_results.read().await.clone() + } + + /// Reset statistics + pub async fn reset_statistics(&self) { + let mut stats = self.statistics.write().await; + *stats = DiskScannerStatistics::default(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_disk_scanner_creation() { + let config = DiskScannerConfig::default(); + let scanner = DiskScanner::new(config); + assert_eq!(scanner.statistics().await.disks_scanned, 0); + } + + #[tokio::test] + 
async fn test_disk_info_creation() { + let disk_info = DiskInfo { + device_path: "/dev/sda1".to_string(), + mount_point: "/".to_string(), + filesystem_type: "ext4".to_string(), + total_space: 1000000000, + used_space: 500000000, + free_space: 500000000, + available_space: 450000000, + usage_percent: 50.0, + inode_total: Some(1000000), + inode_used: Some(500000), + inode_free: Some(500000), + inode_usage_percent: Some(50.0), + last_scan_time: SystemTime::now(), + health_status: DiskHealthStatus::Healthy, + performance_metrics: None, + temperature: None, + smart_status: None, + }; + + assert_eq!(disk_info.usage_percent, 50.0); + assert_eq!(disk_info.health_status, DiskHealthStatus::Healthy); + } +} \ No newline at end of file diff --git a/crates/ahm/src/scanner/engine.rs b/crates/ahm/src/scanner/engine.rs new file mode 100644 index 00000000..129b21a7 --- /dev/null +++ b/crates/ahm/src/scanner/engine.rs @@ -0,0 +1,536 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + collections::HashMap, + path::{Path, PathBuf}, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::{ + sync::{broadcast, RwLock}, + time::sleep, +}; +use tracing::{error, info, warn}; +use tokio_util::sync::CancellationToken; + +use crate::{core, error::Result, metrics, SystemEvent}; +use crate::core::Status; +use super::{HealthIssue, HealthIssueType, Severity}; + +/// Represents a discovered object during scanning +#[derive(Debug, Clone)] +pub struct ScannedObject { + pub bucket: String, + pub object: String, + pub version_id: Option, + pub path: PathBuf, + pub size: u64, + pub modified_time: SystemTime, + pub metadata: HashMap, + pub health_issues: Vec, +} + +/// Configuration for the scanner engine +#[derive(Debug, Clone)] +pub struct EngineConfig { + /// Root directory to scan + pub root_path: String, + /// Maximum number of concurrent scan workers + pub max_workers: usize, + /// Scan interval between cycles + pub scan_interval: Duration, + /// Bandwidth limit for scanning (bytes per second) + pub bandwidth_limit: Option, + /// Whether to enable deep scanning (bitrot detection) + pub enable_deep_scan: bool, + /// Probability of healing objects during scan (1 in N) + pub heal_probability: u32, + /// Maximum folders to scan before compacting + pub max_folders_before_compact: u64, + /// Sleep duration between folder scans + pub folder_sleep_duration: Duration, +} + +impl Default for EngineConfig { + fn default() -> Self { + Self { + root_path: "/data".to_string(), + max_workers: 4, + scan_interval: Duration::from_secs(300), // 5 minutes + bandwidth_limit: None, + enable_deep_scan: false, + heal_probability: 1024, // 1 in 1024 objects + max_folders_before_compact: 10000, + folder_sleep_duration: Duration::from_millis(1), + } + } +} + +/// Scanner statistics +#[derive(Debug, Clone, Default)] +pub struct ScannerStatistics { + pub objects_scanned: u64, + pub bytes_scanned: u64, + pub issues_found: u64, + pub scan_duration: Duration, 
+ pub scan_rate_objects_per_sec: f64, + pub scan_rate_bytes_per_sec: f64, + pub folders_scanned: u64, + pub objects_with_issues: u64, +} + +/// Main scanner engine +pub struct Engine { + config: EngineConfig, + coordinator: Arc, + metrics: Arc, + cancel_token: CancellationToken, + status: Arc>, + statistics: Arc>, + scan_cycle: Arc>, +} + +impl Engine { + /// Create a new scanner engine + pub async fn new( + config: EngineConfig, + coordinator: Arc, + metrics: Arc, + cancel_token: CancellationToken, + ) -> Result { + let engine = Self { + config, + coordinator, + metrics, + cancel_token, + status: Arc::new(RwLock::new(Status::Initializing)), + statistics: Arc::new(RwLock::new(ScannerStatistics::default())), + scan_cycle: Arc::new(RwLock::new(0)), + }; + + info!("Scanner engine created with config: {:?}", engine.config); + Ok(engine) + } + + /// Start the scanner engine + pub async fn start(&self) -> Result<()> { + info!("Starting scanner engine"); + *self.status.write().await = Status::Running; + + let engine = self.clone_for_background(); + tokio::spawn(async move { + if let Err(e) = engine.run_scan_loop().await { + error!("Scanner engine error: {}", e); + } + }); + + Ok(()) + } + + /// Stop the scanner engine + pub async fn stop(&self) -> Result<()> { + info!("Stopping scanner engine"); + *self.status.write().await = Status::Stopping; + self.cancel_token.cancel(); + *self.status.write().await = Status::Stopped; + Ok(()) + } + + /// Get current status + pub async fn status(&self) -> Status { + self.status.read().await.clone() + } + + /// Get current statistics + pub async fn statistics(&self) -> ScannerStatistics { + self.statistics.read().await.clone() + } + + /// Clone the engine for background tasks + fn clone_for_background(&self) -> Arc { + Arc::new(Self { + config: self.config.clone(), + coordinator: self.coordinator.clone(), + metrics: self.metrics.clone(), + cancel_token: self.cancel_token.clone(), + status: self.status.clone(), + statistics: 
self.statistics.clone(), + scan_cycle: self.scan_cycle.clone(), + }) + } + + /// Main scan loop + async fn run_scan_loop(&self) -> Result<()> { + info!("Scanner engine loop started"); + + loop { + tokio::select! { + _ = self.cancel_token.cancelled() => { + info!("Scanner engine received cancellation signal"); + break; + } + _ = sleep(self.config.scan_interval) => { + if let Err(e) = self.run_scan_cycle().await { + error!("Scan cycle failed: {}", e); + } + } + } + } + + Ok(()) + } + + /// Run a single scan cycle + async fn run_scan_cycle(&self) -> Result<()> { + let cycle_start = Instant::now(); + let cycle = { + let mut cycle_guard = self.scan_cycle.write().await; + *cycle_guard += 1; + *cycle_guard + }; + + info!("Starting scan cycle {}", cycle); + + // Reset statistics for new cycle + { + let mut stats = self.statistics.write().await; + *stats = ScannerStatistics::default(); + } + + // Scan the root directory + let scan_result = self.scan_directory(&self.config.root_path).await?; + + // Update statistics + { + let mut stats = self.statistics.write().await; + stats.scan_duration = cycle_start.elapsed(); + stats.objects_scanned = scan_result.objects.len() as u64; + stats.bytes_scanned = scan_result.total_size; + stats.issues_found = scan_result.total_issues; + stats.folders_scanned = scan_result.folders_scanned; + stats.objects_with_issues = scan_result.objects_with_issues; + + if stats.scan_duration.as_secs() > 0 { + stats.scan_rate_objects_per_sec = stats.objects_scanned as f64 / stats.scan_duration.as_secs() as f64; + stats.scan_rate_bytes_per_sec = stats.bytes_scanned as f64 / stats.scan_duration.as_secs() as f64; + } + } + + // Publish scan completion event + let scan_report = crate::scanner::ScanReport { + scan_id: cycle.to_string(), + status: "completed".to_string(), + summary: format!("Scanned {} objects, found {} issues", scan_result.objects.len(), scan_result.total_issues), + issues_found: scan_result.total_issues, + }; + + 
self.coordinator.publish_event(SystemEvent::ScanCompleted(scan_report)).await?; + + info!( + "Scan cycle {} completed: {} objects, {} bytes, {} issues in {:?}", + cycle, + scan_result.objects.len(), + scan_result.total_size, + scan_result.total_issues, + cycle_start.elapsed() + ); + + Ok(()) + } + + /// Scan a directory recursively + async fn scan_directory(&self, path: &str) -> Result { + let mut result = ScanResult::default(); + let path_buf = PathBuf::from(path); + + if !path_buf.exists() { + warn!("Scan path does not exist: {}", path); + return Ok(result); + } + + if !path_buf.is_dir() { + warn!("Scan path is not a directory: {}", path); + return Ok(result); + } + + self.scan_directory_recursive(&path_buf, &mut result).await?; + Ok(result) + } + + /// Recursively scan a directory + async fn scan_directory_recursive(&self, dir_path: &Path, result: &mut ScanResult) -> Result<()> { + result.folders_scanned += 1; + + // Check for cancellation + if self.cancel_token.is_cancelled() { + return Ok(()); + } + + let entries = match std::fs::read_dir(dir_path) { + Ok(entries) => entries, + Err(e) => { + warn!("Failed to read directory {}: {}", dir_path.display(), e); + return Ok(()); + } + }; + + for entry in entries { + if self.cancel_token.is_cancelled() { + break; + } + + let entry = match entry { + Ok(entry) => entry, + Err(e) => { + warn!("Failed to read directory entry: {}", e); + continue; + } + }; + + let file_path = entry.path(); + let _path_str = file_path.to_string_lossy(); + let entry_name = file_path.file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + + // Skip hidden files and system files + if entry_name.starts_with('.') || entry_name == ".." || entry_name == "." { + continue; + } + + if file_path.is_dir() { + // Recursively scan subdirectories + Box::pin(self.scan_directory_recursive(&file_path, result)).await?; + } else if file_path.is_file() { + // Scan individual file + if let Some(scanned_object) = self.scan_object(&file_path).await? 
{ + result.objects.push(scanned_object.clone()); + result.total_size += scanned_object.size; + + if !scanned_object.health_issues.is_empty() { + result.objects_with_issues += 1; + result.total_issues += scanned_object.health_issues.len() as u64; + + // Publish health issues + for issue in &scanned_object.health_issues { + let health_issue = crate::scanner::HealthIssue { + issue_type: issue.issue_type.clone(), + severity: issue.severity, + bucket: scanned_object.bucket.clone(), + object: scanned_object.object.clone(), + description: issue.description.clone(), + metadata: None, // TODO: Convert HashMap to ObjectMetadata + }; + + self.coordinator.publish_event(SystemEvent::HealthIssueDetected(health_issue)).await?; + } + } + + // Publish object discovered event + let metadata = crate::ObjectMetadata { + size: scanned_object.size, + mod_time: scanned_object.modified_time.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + content_type: "application/octet-stream".to_string(), + etag: "".to_string(), // TODO: Calculate actual ETag + }; + + self.coordinator.publish_event(SystemEvent::ObjectDiscovered { + bucket: scanned_object.bucket.clone(), + object: scanned_object.object.clone(), + version_id: scanned_object.version_id.clone(), + metadata, + }).await?; + } + } + + // Sleep between items to avoid overwhelming the system + sleep(self.config.folder_sleep_duration).await; + } + + Ok(()) + } + + /// Scan a single object file + async fn scan_object(&self, file_path: &Path) -> Result> { + let metadata = match std::fs::metadata(file_path) { + Ok(metadata) => metadata, + Err(e) => { + warn!("Failed to read file metadata {}: {}", file_path.display(), e); + return Ok(None); + } + }; + + // Extract bucket and object from path + let (bucket, object) = self.extract_bucket_object_from_path(file_path)?; + if bucket.is_empty() || object.is_empty() { + return Ok(None); + } + + // Check for health issues + let health_issues = 
self.check_object_health(file_path, &metadata).await?; + + let scanned_object = ScannedObject { + bucket, + object, + version_id: None, // TODO: Extract version ID from path + path: file_path.to_path_buf(), + size: metadata.len(), + modified_time: metadata.modified().unwrap_or(SystemTime::now()), + metadata: HashMap::new(), // TODO: Extract metadata + health_issues, + }; + + Ok(Some(scanned_object)) + } + + /// Extract bucket and object name from file path + fn extract_bucket_object_from_path(&self, file_path: &Path) -> Result<(String, String)> { + let _path_str = file_path.to_string_lossy(); + let root_path = Path::new(&self.config.root_path); + + if let Ok(relative_path) = file_path.strip_prefix(root_path) { + let components: Vec<&str> = relative_path.components() + .filter_map(|c| c.as_os_str().to_str()) + .collect(); + + if components.len() >= 2 { + let bucket = components[0].to_string(); + let object = components[1..].join("/"); + return Ok((bucket, object)); + } + } + + Ok((String::new(), String::new())) + } + + /// Check object health and detect issues + async fn check_object_health(&self, file_path: &Path, metadata: &std::fs::Metadata) -> Result> { + let mut issues = Vec::new(); + + // Extract bucket and object from path for health issues + let (bucket, object) = self.extract_bucket_object_from_path(file_path)?; + + // Check file size + if metadata.len() == 0 { + issues.push(HealthIssue { + issue_type: HealthIssueType::ObjectTooSmall, + severity: Severity::Low, + bucket: bucket.clone(), + object: object.clone(), + description: "Object has zero size".to_string(), + metadata: None, + }); + } + + // Check file permissions + if !metadata.permissions().readonly() { + issues.push(HealthIssue { + issue_type: HealthIssueType::PolicyViolation, + severity: Severity::Medium, + bucket: bucket.clone(), + object: object.clone(), + description: "Object is not read-only".to_string(), + metadata: None, + }); + } + + // TODO: Add more health checks: + // - Checksum 
verification + // - Replication status + // - Encryption status + // - Metadata consistency + // - Disk health + + Ok(issues) + } + + /// Start scanning operations + pub async fn start_scan(&self) -> Result<()> { + let mut status = self.status.write().await; + *status = Status::Running; + info!("Scanning operations started"); + Ok(()) + } + + /// Stop scanning operations + pub async fn stop_scan(&self) -> Result<()> { + let mut status = self.status.write().await; + *status = Status::Stopped; + info!("Scanning operations stopped"); + Ok(()) + } + + /// Get engine configuration + pub async fn get_config(&self) -> ScanConfig { + self.config.clone() + } +} + +/// Result of a scan operation +#[derive(Debug, Clone, Default)] +pub struct ScanResult { + pub objects: Vec, + pub total_size: u64, + pub total_issues: u64, + pub folders_scanned: u64, + pub objects_with_issues: u64, +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::time::Duration; + + #[tokio::test] + async fn test_engine_creation() { + let config = EngineConfig::default(); + let coordinator = Arc::new(core::Coordinator::new( + core::CoordinatorConfig::default(), + Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()), + CancellationToken::new(), + ).await.unwrap()); + let metrics = Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()); + let cancel_token = CancellationToken::new(); + + let engine = Engine::new(config, coordinator, metrics, cancel_token).await; + assert!(engine.is_ok()); + } + + #[tokio::test] + async fn test_path_extraction() { + let config = EngineConfig { + root_path: "/data".to_string(), + ..Default::default() + }; + let coordinator = Arc::new(core::Coordinator::new( + core::CoordinatorConfig::default(), + Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()), + CancellationToken::new(), + ).await.unwrap()); + let metrics = 
Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()); + let cancel_token = CancellationToken::new(); + + let engine = Engine::new(config, coordinator, metrics, cancel_token).await.unwrap(); + + let test_path = Path::new("/data/bucket1/object1.txt"); + let (bucket, object) = engine.extract_bucket_object_from_path(test_path).unwrap(); + + assert_eq!(bucket, "bucket1"); + assert_eq!(object, "object1.txt"); + } +} \ No newline at end of file diff --git a/crates/ahm/src/scanner/metrics_collector.rs b/crates/ahm/src/scanner/metrics_collector.rs new file mode 100644 index 00000000..45c0d4b1 --- /dev/null +++ b/crates/ahm/src/scanner/metrics_collector.rs @@ -0,0 +1,526 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::{debug, error, info, warn}; + +use crate::error::Result; +use super::{HealthIssue, HealthIssueType, Severity}; + +/// Configuration for metrics collection +#[derive(Debug, Clone)] +pub struct MetricsConfig { + /// Collection interval for metrics + pub collection_interval: Duration, + /// Retention period for historical metrics + pub retention_period: Duration, + /// Maximum number of data points to keep in memory + pub max_data_points: usize, + /// Whether to enable detailed metrics collection + pub enable_detailed_metrics: bool, + /// Whether to enable performance profiling + pub enable_profiling: bool, + /// Whether to enable resource usage tracking + pub enable_resource_tracking: bool, +} + +impl Default for MetricsConfig { + fn default() -> Self { + Self { + collection_interval: Duration::from_secs(60), // 1 minute + retention_period: Duration::from_secs(3600 * 24), // 24 hours + max_data_points: 1440, // 24 hours worth of minute-level data + enable_detailed_metrics: true, + enable_profiling: false, + enable_resource_tracking: true, + } + } +} + +/// Scanner performance metrics +#[derive(Debug, Clone)] +pub struct ScannerMetrics { + /// Objects scanned per second + pub objects_per_second: f64, + /// Bytes scanned per second + pub bytes_per_second: f64, + /// Average scan time per object + pub avg_scan_time_per_object: Duration, + /// Total objects scanned in current cycle + pub total_objects_scanned: u64, + /// Total bytes scanned in current cycle + pub total_bytes_scanned: u64, + /// Number of health issues detected + pub health_issues_detected: u64, + /// Scan success rate (percentage) + pub success_rate: f64, + /// Current scan cycle duration + pub current_cycle_duration: Duration, + /// Average scan cycle duration + pub avg_cycle_duration: Duration, + /// Last scan completion time + pub last_scan_completion: Option, +} + 
+/// Resource usage metrics +#[derive(Debug, Clone)] +pub struct ResourceMetrics { + /// CPU usage percentage + pub cpu_usage_percent: f64, + /// Memory usage in bytes + pub memory_usage_bytes: u64, + /// Memory usage percentage + pub memory_usage_percent: f64, + /// Disk I/O operations per second + pub disk_io_ops_per_sec: f64, + /// Disk I/O bytes per second + pub disk_io_bytes_per_sec: f64, + /// Network I/O bytes per second + pub network_io_bytes_per_sec: f64, + /// Number of active threads + pub active_threads: u32, + /// Number of open file descriptors + pub open_file_descriptors: u32, +} + +/// Health metrics summary +#[derive(Debug, Clone)] +pub struct HealthMetrics { + /// Total health issues by severity + pub issues_by_severity: HashMap, + /// Total health issues by type + pub issues_by_type: HashMap, + /// Objects with health issues + pub objects_with_issues: u64, + /// Percentage of objects with issues + pub objects_with_issues_percent: f64, + /// Last health check time + pub last_health_check: SystemTime, + /// Health score (0-100, higher is better) + pub health_score: f64, +} + +/// Historical metrics data point +#[derive(Debug, Clone)] +pub struct MetricsDataPoint { + pub timestamp: SystemTime, + pub scanner_metrics: ScannerMetrics, + pub resource_metrics: ResourceMetrics, + pub health_metrics: HealthMetrics, +} + +/// Metrics collector for scanner system +pub struct MetricsCollector { + config: MetricsConfig, + current_metrics: Arc>, + historical_data: Arc>>, + collection_start_time: Instant, +} + +/// Current metrics state +#[derive(Debug, Clone)] +pub struct CurrentMetrics { + pub scanner_metrics: ScannerMetrics, + pub resource_metrics: ResourceMetrics, + pub health_metrics: HealthMetrics, + pub last_update: SystemTime, +} + +impl MetricsCollector { + /// Create a new metrics collector + pub fn new(config: MetricsConfig) -> Self { + let collector = Self { + config, + current_metrics: Arc::new(RwLock::new(CurrentMetrics { + scanner_metrics: 
ScannerMetrics { + objects_per_second: 0.0, + bytes_per_second: 0.0, + avg_scan_time_per_object: Duration::ZERO, + total_objects_scanned: 0, + total_bytes_scanned: 0, + health_issues_detected: 0, + success_rate: 100.0, + current_cycle_duration: Duration::ZERO, + avg_cycle_duration: Duration::ZERO, + last_scan_completion: None, + }, + resource_metrics: ResourceMetrics { + cpu_usage_percent: 0.0, + memory_usage_bytes: 0, + memory_usage_percent: 0.0, + disk_io_ops_per_sec: 0.0, + disk_io_bytes_per_sec: 0.0, + network_io_bytes_per_sec: 0.0, + active_threads: 0, + open_file_descriptors: 0, + }, + health_metrics: HealthMetrics { + issues_by_severity: HashMap::new(), + issues_by_type: HashMap::new(), + objects_with_issues: 0, + objects_with_issues_percent: 0.0, + last_health_check: SystemTime::now(), + health_score: 100.0, + }, + last_update: SystemTime::now(), + })), + historical_data: Arc::new(RwLock::new(Vec::new())), + collection_start_time: Instant::now(), + }; + + info!("Metrics collector created with config: {:?}", collector.config); + collector + } + + /// Start metrics collection + pub async fn start_collection(&self) -> Result<()> { + info!("Starting metrics collection"); + + let collector = self.clone_for_background(); + tokio::spawn(async move { + if let Err(e) = collector.run_collection_loop().await { + error!("Metrics collection error: {}", e); + } + }); + + Ok(()) + } + + /// Stop metrics collection + pub async fn stop_collection(&self) -> Result<()> { + info!("Stopping metrics collection"); + Ok(()) + } + + /// Update scanner metrics + pub async fn update_scanner_metrics(&self, metrics: ScannerMetrics) -> Result<()> { + let mut current = self.current_metrics.write().await; + current.scanner_metrics = metrics; + current.last_update = SystemTime::now(); + Ok(()) + } + + /// Update resource metrics + pub async fn update_resource_metrics(&self, metrics: ResourceMetrics) -> Result<()> { + let mut current = self.current_metrics.write().await; + 
current.resource_metrics = metrics; + current.last_update = SystemTime::now(); + Ok(()) + } + + /// Update health metrics + pub async fn update_health_metrics(&self, metrics: HealthMetrics) -> Result<()> { + let mut current = self.current_metrics.write().await; + current.health_metrics = metrics; + current.last_update = SystemTime::now(); + Ok(()) + } + + /// Record a health issue + pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> { + let mut current = self.current_metrics.write().await; + + // Update severity count + *current.health_metrics.issues_by_severity.entry(issue.severity).or_insert(0) += 1; + + // Update type count + *current.health_metrics.issues_by_type.entry(issue.issue_type.clone()).or_insert(0) += 1; + + // Update scanner metrics + current.scanner_metrics.health_issues_detected += 1; + + current.last_update = SystemTime::now(); + Ok(()) + } + + /// Get current metrics + pub async fn current_metrics(&self) -> CurrentMetrics { + self.current_metrics.read().await.clone() + } + + /// Get historical metrics + pub async fn historical_metrics(&self, duration: Duration) -> Vec { + let historical = self.historical_data.read().await; + let cutoff_time = SystemTime::now() - duration; + + historical.iter() + .filter(|point| point.timestamp >= cutoff_time) + .cloned() + .collect() + } + + /// Get metrics summary + pub async fn metrics_summary(&self) -> MetricsSummary { + let current = self.current_metrics.read().await; + let historical = self.historical_data.read().await; + + let uptime = self.collection_start_time.elapsed(); + let total_data_points = historical.len(); + + // Calculate averages from historical data + let avg_objects_per_sec = if !historical.is_empty() { + historical.iter() + .map(|point| point.scanner_metrics.objects_per_second) + .sum::() / historical.len() as f64 + } else { + 0.0 + }; + + let avg_bytes_per_sec = if !historical.is_empty() { + historical.iter() + .map(|point| point.scanner_metrics.bytes_per_second) + 
.sum::() / historical.len() as f64 + } else { + 0.0 + }; + + let avg_cpu_usage = if !historical.is_empty() { + historical.iter() + .map(|point| point.resource_metrics.cpu_usage_percent) + .sum::() / historical.len() as f64 + } else { + 0.0 + }; + + let avg_memory_usage = if !historical.is_empty() { + historical.iter() + .map(|point| point.resource_metrics.memory_usage_percent) + .sum::() / historical.len() as f64 + } else { + 0.0 + }; + + MetricsSummary { + uptime, + total_data_points, + current_scanner_metrics: current.scanner_metrics.clone(), + current_resource_metrics: current.resource_metrics.clone(), + current_health_metrics: current.health_metrics.clone(), + avg_objects_per_sec, + avg_bytes_per_sec, + avg_cpu_usage, + avg_memory_usage, + last_update: current.last_update, + } + } + + /// Clone the collector for background tasks + fn clone_for_background(&self) -> Arc { + Arc::new(Self { + config: self.config.clone(), + current_metrics: self.current_metrics.clone(), + historical_data: self.historical_data.clone(), + collection_start_time: self.collection_start_time, + }) + } + + /// Main collection loop + async fn run_collection_loop(&self) -> Result<()> { + info!("Metrics collection loop started"); + + loop { + // Collect current metrics + self.collect_current_metrics().await?; + + // Store historical data point + self.store_historical_data_point().await?; + + // Clean up old data + self.cleanup_old_data().await?; + + // Wait for next collection interval + tokio::time::sleep(self.config.collection_interval).await; + } + } + + /// Collect current system metrics + async fn collect_current_metrics(&self) -> Result<()> { + if self.config.enable_resource_tracking { + let resource_metrics = self.collect_resource_metrics().await?; + self.update_resource_metrics(resource_metrics).await?; + } + + Ok(()) + } + + /// Collect resource usage metrics + async fn collect_resource_metrics(&self) -> Result { + // TODO: Implement actual resource metrics collection + // For now, 
return placeholder metrics + Ok(ResourceMetrics { + cpu_usage_percent: 0.0, + memory_usage_bytes: 0, + memory_usage_percent: 0.0, + disk_io_ops_per_sec: 0.0, + disk_io_bytes_per_sec: 0.0, + network_io_bytes_per_sec: 0.0, + active_threads: 0, + open_file_descriptors: 0, + }) + } + + /// Store current metrics as historical data point + async fn store_historical_data_point(&self) -> Result<()> { + let current = self.current_metrics.read().await; + let data_point = MetricsDataPoint { + timestamp: SystemTime::now(), + scanner_metrics: current.scanner_metrics.clone(), + resource_metrics: current.resource_metrics.clone(), + health_metrics: current.health_metrics.clone(), + }; + + let mut historical = self.historical_data.write().await; + historical.push(data_point); + + // Limit the number of data points + if historical.len() > self.config.max_data_points { + historical.remove(0); + } + + Ok(()) + } + + /// Clean up old historical data + async fn cleanup_old_data(&self) -> Result<()> { + let cutoff_time = SystemTime::now() - self.config.retention_period; + let mut historical = self.historical_data.write().await; + + historical.retain(|point| point.timestamp >= cutoff_time); + + Ok(()) + } + + /// Reset all metrics + pub async fn reset_metrics(&self) -> Result<()> { + let mut current = self.current_metrics.write().await; + *current = CurrentMetrics { + scanner_metrics: ScannerMetrics { + objects_per_second: 0.0, + bytes_per_second: 0.0, + avg_scan_time_per_object: Duration::ZERO, + total_objects_scanned: 0, + total_bytes_scanned: 0, + health_issues_detected: 0, + success_rate: 100.0, + current_cycle_duration: Duration::ZERO, + avg_cycle_duration: Duration::ZERO, + last_scan_completion: None, + }, + resource_metrics: ResourceMetrics { + cpu_usage_percent: 0.0, + memory_usage_bytes: 0, + memory_usage_percent: 0.0, + disk_io_ops_per_sec: 0.0, + disk_io_bytes_per_sec: 0.0, + network_io_bytes_per_sec: 0.0, + active_threads: 0, + open_file_descriptors: 0, + }, + health_metrics: 
HealthMetrics { + issues_by_severity: HashMap::new(), + issues_by_type: HashMap::new(), + objects_with_issues: 0, + objects_with_issues_percent: 0.0, + last_health_check: SystemTime::now(), + health_score: 100.0, + }, + last_update: SystemTime::now(), + }; + + let mut historical = self.historical_data.write().await; + historical.clear(); + + Ok(()) + } +} + +/// Summary of all metrics +#[derive(Debug, Clone)] +pub struct MetricsSummary { + pub uptime: Duration, + pub total_data_points: usize, + pub current_scanner_metrics: ScannerMetrics, + pub current_resource_metrics: ResourceMetrics, + pub current_health_metrics: HealthMetrics, + pub avg_objects_per_sec: f64, + pub avg_bytes_per_sec: f64, + pub avg_cpu_usage: f64, + pub avg_memory_usage: f64, + pub last_update: SystemTime, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_metrics_collector_creation() { + let config = MetricsConfig::default(); + let collector = MetricsCollector::new(config); + let metrics = collector.current_metrics().await; + assert_eq!(metrics.scanner_metrics.total_objects_scanned, 0); + } + + #[tokio::test] + async fn test_metrics_update() { + let config = MetricsConfig::default(); + let collector = MetricsCollector::new(config); + + let scanner_metrics = ScannerMetrics { + objects_per_second: 100.0, + bytes_per_second: 1024.0, + avg_scan_time_per_object: Duration::from_millis(10), + total_objects_scanned: 1000, + total_bytes_scanned: 1024000, + health_issues_detected: 5, + success_rate: 99.5, + current_cycle_duration: Duration::from_secs(60), + avg_cycle_duration: Duration::from_secs(65), + last_scan_completion: Some(SystemTime::now()), + }; + + collector.update_scanner_metrics(scanner_metrics).await.unwrap(); + + let current = collector.current_metrics().await; + assert_eq!(current.scanner_metrics.total_objects_scanned, 1000); + assert_eq!(current.scanner_metrics.health_issues_detected, 5); + } + + #[tokio::test] + async fn test_health_issue_recording() { + 
let config = MetricsConfig::default(); + let collector = MetricsCollector::new(config); + + let issue = HealthIssue { + issue_type: HealthIssueType::DiskFull, + severity: Severity::High, + bucket: "test-bucket".to_string(), + object: "test-object".to_string(), + description: "Test issue".to_string(), + metadata: None, + }; + + collector.record_health_issue(&issue).await.unwrap(); + + let current = collector.current_metrics().await; + assert_eq!(current.scanner_metrics.health_issues_detected, 1); + assert_eq!(current.health_metrics.issues_by_severity.get(&Severity::High), Some(&1)); + } +} \ No newline at end of file diff --git a/crates/ahm/src/scanner/object_scanner.rs b/crates/ahm/src/scanner/object_scanner.rs new file mode 100644 index 00000000..fd438d8f --- /dev/null +++ b/crates/ahm/src/scanner/object_scanner.rs @@ -0,0 +1,419 @@ +// Copyright 2024 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use tokio::sync::RwLock; +use tracing::info; + +use crate::error::Result; +use super::{HealthIssue, HealthIssueType, Severity}; + +/// Configuration for object scanning +#[derive(Debug, Clone)] +pub struct ObjectScannerConfig { + /// Whether to perform checksum verification + pub verify_checksum: bool, + /// Whether to check replication status + pub check_replication: bool, + /// Whether to validate metadata consistency + pub validate_metadata: bool, + /// Maximum object size to scan (bytes) + pub max_object_size: u64, + /// Minimum object size (bytes) + pub min_object_size: u64, + /// Timeout for individual object scans + pub scan_timeout: Duration, + /// Whether to enable deep scanning (bitrot detection) + pub enable_deep_scan: bool, +} + +impl Default for ObjectScannerConfig { + fn default() -> Self { + Self { + verify_checksum: true, + check_replication: true, + validate_metadata: true, + max_object_size: 1024 * 1024 * 1024 * 1024, // 1TB + min_object_size: 0, + scan_timeout: Duration::from_secs(30), + enable_deep_scan: false, + } + } +} + +/// Result of scanning a single object +#[derive(Debug, Clone)] +pub struct ObjectScanResult { + /// Object identifier + pub bucket: String, + pub object: String, + pub version_id: Option, + /// Scan success status + pub success: bool, + /// Object metadata discovered + pub metadata: Option, + /// Health issues detected + pub health_issues: Vec, + /// Time taken to scan this object + pub scan_duration: Duration, + /// Error message if scan failed + pub error_message: Option, +} + +/// Object metadata +#[derive(Debug, Clone)] +pub struct ObjectMetadata { + pub size: u64, + pub modified_time: SystemTime, + pub content_type: String, + pub etag: String, + pub checksum: Option, + pub replication_status: Option, + pub encryption_status: Option, + pub custom_metadata: HashMap, +} + +/// Object scanner for individual object health 
checking +pub struct ObjectScanner { + config: ObjectScannerConfig, + statistics: Arc>, +} + +/// Statistics for object scanning +#[derive(Debug, Clone, Default)] +pub struct ObjectScannerStatistics { + pub objects_scanned: u64, + pub objects_with_issues: u64, + pub total_issues_found: u64, + pub total_scan_time: Duration, + pub average_scan_time: Duration, + pub checksum_verifications: u64, + pub checksum_failures: u64, + pub replication_checks: u64, + pub replication_failures: u64, +} + +impl ObjectScanner { + /// Create a new object scanner + pub fn new(config: ObjectScannerConfig) -> Self { + Self { + config, + statistics: Arc::new(RwLock::new(ObjectScannerStatistics::default())), + } + } + + /// Scan a single object for health issues + pub async fn scan_object(&self, bucket: &str, object: &str, version_id: Option<&str>, path: &Path) -> Result { + let scan_start = std::time::Instant::now(); + let mut health_issues = Vec::new(); + let mut error_message = None; + + // Check if file exists + if !path.exists() { + return Ok(ObjectScanResult { + bucket: bucket.to_string(), + object: object.to_string(), + version_id: version_id.map(|v| v.to_string()), + success: false, + metadata: None, + health_issues: vec![HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::Critical, + bucket: bucket.to_string(), + object: object.to_string(), + description: "Object file does not exist".to_string(), + metadata: None, + }], + scan_duration: scan_start.elapsed(), + error_message: Some("Object file not found".to_string()), + }); + } + + // Get file metadata + let metadata = match std::fs::metadata(path) { + Ok(metadata) => metadata, + Err(e) => { + error_message = Some(format!("Failed to read file metadata: {}", e)); + health_issues.push(HealthIssue { + issue_type: HealthIssueType::DiskReadError, + severity: Severity::High, + bucket: bucket.to_string(), + object: object.to_string(), + description: "Failed to read file metadata".to_string(), + metadata: 
None, + }); + return Ok(ObjectScanResult { + bucket: bucket.to_string(), + object: object.to_string(), + version_id: version_id.map(|v| v.to_string()), + success: false, + metadata: None, + health_issues, + scan_duration: scan_start.elapsed(), + error_message, + }); + } + }; + + // Check file size + let file_size = metadata.len(); + if file_size < self.config.min_object_size { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::ObjectTooSmall, + severity: Severity::Low, + bucket: bucket.to_string(), + object: object.to_string(), + description: format!("Object size {} is below minimum {}", file_size, self.config.min_object_size), + metadata: None, + }); + } + + if file_size > self.config.max_object_size { + health_issues.push(HealthIssue { + issue_type: HealthIssueType::ObjectTooLarge, + severity: Severity::Medium, + bucket: bucket.to_string(), + object: object.to_string(), + description: format!("Object size {} exceeds maximum {}", file_size, self.config.max_object_size), + metadata: None, + }); + } + + // Verify checksum if enabled + let checksum = if self.config.verify_checksum { + match self.verify_checksum(path).await { + Ok(cs) => { + self.update_statistics(|stats| stats.checksum_verifications += 1).await; + Some(cs) + } + Err(_e) => { + self.update_statistics(|stats| stats.checksum_failures += 1).await; + health_issues.push(HealthIssue { + issue_type: HealthIssueType::ChecksumMismatch, + severity: Severity::High, + bucket: bucket.to_string(), + object: object.to_string(), + description: "Checksum verification failed".to_string(), + metadata: None, + }); + None + } + } + } else { + None + }; + + // Check replication status if enabled + let replication_status = if self.config.check_replication { + match self.check_replication_status(bucket, object).await { + Ok(status) => { + self.update_statistics(|stats| stats.replication_checks += 1).await; + Some(status) + } + Err(_e) => { + self.update_statistics(|stats| stats.replication_failures += 
1).await; + health_issues.push(HealthIssue { + issue_type: HealthIssueType::MissingReplica, + severity: Severity::High, + bucket: bucket.to_string(), + object: object.to_string(), + description: "Replication status check failed".to_string(), + metadata: None, + }); + None + } + } + } else { + None + }; + + // Validate metadata if enabled + if self.config.validate_metadata { + if let Some(issue) = self.validate_metadata(bucket, object, &metadata).await? { + health_issues.push(issue); + } + } + + // Create object metadata + let object_metadata = ObjectMetadata { + size: file_size, + modified_time: metadata.modified().unwrap_or(SystemTime::now()), + content_type: self.detect_content_type(path), + etag: self.calculate_etag(path).await?, + checksum, + replication_status, + encryption_status: None, // TODO: Implement encryption status check + custom_metadata: HashMap::new(), // TODO: Extract custom metadata + }; + + let scan_duration = scan_start.elapsed(); + let success = health_issues.is_empty(); + + // Update statistics + self.update_statistics(|stats| { + stats.objects_scanned += 1; + if !health_issues.is_empty() { + stats.objects_with_issues += 1; + stats.total_issues_found += health_issues.len() as u64; + } + stats.total_scan_time += scan_duration; + stats.average_scan_time = Duration::from_millis( + stats.total_scan_time.as_millis() as u64 / stats.objects_scanned.max(1) + ); + }).await; + + Ok(ObjectScanResult { + bucket: bucket.to_string(), + object: object.to_string(), + version_id: version_id.map(|v| v.to_string()), + success, + metadata: Some(object_metadata), + health_issues, + scan_duration, + error_message, + }) + } + + /// Verify object checksum + async fn verify_checksum(&self, _path: &Path) -> Result { + // TODO: Implement actual checksum verification + // For now, return a placeholder checksum + Ok("placeholder_checksum".to_string()) + } + + /// Check object replication status + async fn check_replication_status(&self, _bucket: &str, _object: &str) -> 
Result { + // TODO: Implement actual replication status checking + // For now, return a placeholder status + Ok("replicated".to_string()) + } + + /// Validate object metadata + async fn validate_metadata(&self, _bucket: &str, _object: &str, _metadata: &std::fs::Metadata) -> Result> { + // TODO: Implement actual metadata validation + // For now, return None (no issues) + Ok(None) + } + + /// Detect content type from file extension + fn detect_content_type(&self, path: &Path) -> String { + if let Some(extension) = path.extension() { + match extension.to_str().unwrap_or("").to_lowercase().as_str() { + "txt" => "text/plain", + "json" => "application/json", + "xml" => "application/xml", + "html" | "htm" => "text/html", + "css" => "text/css", + "js" => "application/javascript", + "png" => "image/png", + "jpg" | "jpeg" => "image/jpeg", + "gif" => "image/gif", + "pdf" => "application/pdf", + "zip" => "application/zip", + "tar" => "application/x-tar", + "gz" => "application/gzip", + _ => "application/octet-stream", + }.to_string() + } else { + "application/octet-stream".to_string() + } + } + + /// Calculate object ETag + async fn calculate_etag(&self, _path: &Path) -> Result { + // TODO: Implement actual ETag calculation + // For now, return a placeholder ETag + Ok("placeholder_etag".to_string()) + } + + /// Update scanner statistics + async fn update_statistics(&self, update_fn: F) + where + F: FnOnce(&mut ObjectScannerStatistics), + { + let mut stats = self.statistics.write().await; + update_fn(&mut stats); + } + + /// Get current statistics + pub async fn statistics(&self) -> ObjectScannerStatistics { + self.statistics.read().await.clone() + } + + /// Reset statistics + pub async fn reset_statistics(&self) { + let mut stats = self.statistics.write().await; + *stats = ObjectScannerStatistics::default(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + use std::fs::File; + use std::io::Write; + + #[tokio::test] + async fn 
test_object_scanner_creation() { + let config = ObjectScannerConfig::default(); + let scanner = ObjectScanner::new(config); + assert_eq!(scanner.statistics().await.objects_scanned, 0); + } + + #[tokio::test] + async fn test_content_type_detection() { + let config = ObjectScannerConfig::default(); + let scanner = ObjectScanner::new(config); + + let path = Path::new("test.txt"); + assert_eq!(scanner.detect_content_type(path), "text/plain"); + + let path = Path::new("test.json"); + assert_eq!(scanner.detect_content_type(path), "application/json"); + + let path = Path::new("test.unknown"); + assert_eq!(scanner.detect_content_type(path), "application/octet-stream"); + } + + #[tokio::test] + async fn test_object_scanning() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + + // Create a test file + let mut file = File::create(&test_file).unwrap(); + writeln!(file, "test content").unwrap(); + + let config = ObjectScannerConfig::default(); + let scanner = ObjectScanner::new(config); + + let result = scanner.scan_object("test-bucket", "test.txt", None, &test_file).await.unwrap(); + + assert!(result.success); + assert_eq!(result.bucket, "test-bucket"); + assert_eq!(result.object, "test.txt"); + assert!(result.metadata.is_some()); + + let metadata = result.metadata.unwrap(); + assert!(metadata.size > 0); + assert_eq!(metadata.content_type, "text/plain"); + } +} \ No newline at end of file