mirror of https://github.com/rustfs/rustfs.git
synced 2026-01-17 01:30:33 +00:00

fix: Refactor heal and scanner design

Signed-off-by: junxiang Mu <1948535941@qq.com>

crates/ahm/architecture.md (new file, 557 lines)
@@ -0,0 +1,557 @@
# RustFS Advanced Health & Metrics (AHM) System Architecture

## Overview

The RustFS AHM system is a newly designed distributed storage health monitoring and repair system that provides intelligent scanning, automatic repair, rich metrics, and policy-driven management capabilities.

## System Architecture

### Overall Architecture Diagram

```
┌─────────────────────────────────────┐
│        API Layer (REST/gRPC)        │
├─────────────────────────────────────┤
│       Policy & Configuration        │
├─────────────────────────────────────┤
│      Core Coordination Engine       │
├─────────────────────────────────────┤
│  Scanner Engine   │   Heal Engine   │
├─────────────────────────────────────┤
│      Metrics & Observability        │
├─────────────────────────────────────┤
│        Storage Abstraction          │
└─────────────────────────────────────┘
```

### Module Structure

```
rustfs/crates/ecstore/src/ahm/
├── mod.rs                     # Module entry point and public interfaces
├── core/                      # Core engines
│   ├── coordinator.rs         # Distributed coordinator - event routing and state management
│   ├── scheduler.rs           # Task scheduler - priority queue and work assignment
│   └── lifecycle.rs           # Lifecycle manager - system startup/shutdown control
├── scanner/                   # Scanning system
│   ├── engine.rs              # Scan engine - scan process control
│   ├── object_scanner.rs      # Object scanner - object-level integrity checks
│   ├── disk_scanner.rs        # Disk scanner - disk-level health checks
│   ├── metrics_collector.rs   # Metrics collector - scan process data collection
│   └── bandwidth_limiter.rs   # Bandwidth limiter - I/O resource control
├── heal/                      # Repair system
│   ├── engine.rs              # Heal engine - repair process control
│   ├── priority_queue.rs      # Priority queue - repair task ordering
│   ├── repair_worker.rs       # Repair worker - actual repair execution
│   └── validation.rs          # Repair validator - repair result verification
├── metrics/                   # Metrics system
│   ├── collector.rs           # Metrics collector - real-time data collection
│   ├── aggregator.rs          # Metrics aggregator - data aggregation and computation
│   ├── storage.rs             # Metrics storage - time-series data storage
│   └── reporter.rs            # Metrics reporter - external system export
├── policy/                    # Policy system
│   ├── scan_policy.rs         # Scan policy - scan behavior configuration
│   ├── heal_policy.rs         # Heal policy - repair priority and strategy
│   └── retention_policy.rs    # Retention policy - data lifecycle management
└── api/                       # API interfaces
    ├── admin_api.rs           # Admin API - system management operations
    ├── metrics_api.rs         # Metrics API - metrics query and export
    └── status_api.rs          # Status API - system status monitoring
```
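The `bandwidth_limiter.rs` entry above is responsible for I/O resource control. As a concrete picture of that role, here is a minimal token-bucket sketch using only the standard library; the `BandwidthLimiter` type and its methods are illustrative assumptions, not the actual API of that module:

```rust
use std::time::{Duration, Instant};

/// Minimal token bucket: `capacity` bytes of burst, refilled at
/// `rate_bytes_per_sec`. Callers spend `n` bytes before each I/O.
pub struct BandwidthLimiter {
    capacity: f64,
    tokens: f64, // may go negative when callers overdraw the bucket
    rate_bytes_per_sec: f64,
    last_refill: Instant,
}

impl BandwidthLimiter {
    pub fn new(rate_bytes_per_sec: f64, capacity: f64) -> Self {
        Self { capacity, tokens: capacity, rate_bytes_per_sec, last_refill: Instant::now() }
    }

    fn refill(&mut self) {
        let elapsed = self.last_refill.elapsed().as_secs_f64();
        self.tokens = (self.tokens + elapsed * self.rate_bytes_per_sec).min(self.capacity);
        self.last_refill = Instant::now();
    }

    /// Returns how long the caller should sleep before issuing an `n`-byte read.
    pub fn acquire(&mut self, n: f64) -> Duration {
        self.refill();
        self.tokens -= n;
        if self.tokens >= 0.0 {
            Duration::ZERO
        } else {
            // Negative balance: wait until the deficit refills.
            Duration::from_secs_f64(-self.tokens / self.rate_bytes_per_sec)
        }
    }
}

fn main() {
    // 8 MiB/s sustained with a 1 MiB burst allowance.
    let mut limiter = BandwidthLimiter::new(8.0 * 1024.0 * 1024.0, 1024.0 * 1024.0);
    let wait = limiter.acquire(4.0 * 1024.0 * 1024.0); // a 4 MiB read
    println!("sleep {wait:?} before issuing the read");
}
```

Returning a wait duration instead of sleeping internally keeps the limiter free of any async-runtime dependency; scanner tasks can apply the delay however they like.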
## Core Design Principles

### 1. Event-Driven Architecture

```rust
pub enum SystemEvent {
    ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata },
    HealthIssueDetected { issue_type: HealthIssueType, severity: Severity },
    HealCompleted { result: HealResult },
    ScanCycleCompleted { statistics: ScanStatistics },
    ResourceUsageUpdated { usage: ResourceUsage },
}
```

- **Scanner** generates discovery events
- **Heal** responds to repair events
- **Metrics** collects statistics from all events
- **Policy** controls event processing strategies
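To make the event flow concrete, below is a minimal sketch of how a coordinator might fan these events out. The stub types and the `dispatch`/`observe_for_metrics` functions are illustrative assumptions for the example, not the coordinator's real interface:

```rust
// Stub payload types so the sketch compiles standalone; the real
// definitions live in the scanner, heal, and metrics modules.
type ObjectMetadata = ();
type HealthIssueType = String;
type Severity = u8;
type HealResult = ();
type ScanStatistics = ();
type ResourceUsage = ();

pub enum SystemEvent {
    ObjectDiscovered { bucket: String, object: String, metadata: ObjectMetadata },
    HealthIssueDetected { issue_type: HealthIssueType, severity: Severity },
    HealCompleted { result: HealResult },
    ScanCycleCompleted { statistics: ScanStatistics },
    ResourceUsageUpdated { usage: ResourceUsage },
}

/// One possible dispatch loop: each event is routed to the subsystems
/// that care about it, and metrics observes everything.
fn dispatch(event: SystemEvent) {
    observe_for_metrics(&event); // Metrics taps every event first
    match event {
        SystemEvent::ObjectDiscovered { bucket, object, .. } => {
            println!("scanner discovered {bucket}/{object}");
        }
        SystemEvent::HealthIssueDetected { issue_type, severity } => {
            // Severity would feed the heal priority queue (see section 4).
            println!("queueing heal task: {issue_type} (severity {severity})");
        }
        SystemEvent::HealCompleted { .. } | SystemEvent::ScanCycleCompleted { .. } => {
            println!("cycle bookkeeping");
        }
        SystemEvent::ResourceUsageUpdated { .. } => {
            println!("policy may throttle scanners based on usage");
        }
    }
}

fn observe_for_metrics(_event: &SystemEvent) { /* counters, histograms, ... */ }

fn main() {
    dispatch(SystemEvent::HealthIssueDetected { issue_type: "MissingShards".into(), severity: 2 });
}
```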
### 2. Layered Modular Design

#### **API Layer**: REST/gRPC interfaces
- Unified response format
- Comprehensive error handling
- Authentication and authorization support

#### **Policy Layer**: Configurable business rules
- Scan frequency and depth control
- Repair priority policies
- Data retention rules

#### **Coordination Layer**: System coordination and scheduling
- Event routing and distribution
- Resource management and allocation
- Task scheduling and execution

#### **Engine Layer**: Core business logic
- Intelligent scanning algorithms
- Adaptive repair strategies
- Performance optimization control

#### **Metrics Layer**: Observability support
- Real-time metrics collection
- Historical trend analysis
- Multi-format export

### 3. Multi-Mode Scanning Strategies

```rust
pub enum ScanStrategy {
    Full { mode: ScanMode, scope: ScanScope },               // Full scan
    Incremental { since: Instant, mode: ScanMode },          // Incremental scan
    Smart { sample_rate: f64, favor_unscanned: bool },       // Smart sampling
    Targeted { targets: Vec<ObjectTarget>, mode: ScanMode }, // Targeted scan
}

pub enum ScanMode {
    Quick,  // Quick scan - metadata only
    Normal, // Normal scan - basic integrity verification
    Deep,   // Deep scan - includes bit-rot detection
}
```
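As an illustration of the `Smart` variant, the sketch below shows one deterministic way to honor `sample_rate` and `favor_unscanned` using only the standard library; the `should_scan` function and its signature are assumptions made for the example:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Decide whether a Smart scan cycle should visit this object.
fn should_scan(
    bucket: &str,
    object: &str,
    cycle: u64,          // current scan cycle number
    sample_rate: f64,    // fraction of objects to visit, in [0, 1]
    never_scanned: bool, // object has no scan history yet
    favor_unscanned: bool,
) -> bool {
    if favor_unscanned && never_scanned {
        return true; // always visit objects with no scan history
    }
    let mut h = DefaultHasher::new();
    (bucket, object, cycle).hash(&mut h);
    // Map the hash to [0, 1) and compare against the sampling rate.
    (h.finish() as f64 / u64::MAX as f64) < sample_rate
}

fn main() {
    let picked = should_scan("user-uploads", "photos/IMG_001.jpg", 42, 0.25, false, true);
    println!("visit this cycle: {picked}");
}
```

Hashing `(bucket, object, cycle)` rather than drawing a random number keeps each cycle's sample stable and reproducible, which makes partial scans auditable.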
### 4. Priority-Based Repair System

```rust
pub enum HealPriority {
    Low = 0,
    Normal = 1,
    High = 2,
    Critical = 3,
    Emergency = 4,
}

pub enum HealMode {
    RealTime,   // Real-time repair - triggered on GET/PUT
    Background, // Background repair - scheduled tasks
    OnDemand,   // On-demand repair - admin triggered
    Emergency,  // Emergency repair - critical issues
}
```
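Because `HealPriority` carries explicit discriminants in ascending order, deriving `Ord` yields exactly this ordering, and a `std::collections::BinaryHeap` can serve as the repair priority queue. A minimal sketch follows; the `HealTask` struct is a hypothetical stand-in for the real task type:

```rust
use std::collections::BinaryHeap;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum HealPriority { Low = 0, Normal = 1, High = 2, Critical = 3, Emergency = 4 }

#[derive(PartialEq, Eq)]
struct HealTask {
    priority: HealPriority,
    seq: u64, // monotonically increasing submission counter
    object: String,
}

// BinaryHeap is a max-heap, so the highest priority pops first; comparing
// `seq` in reverse breaks ties FIFO among tasks of equal priority.
impl Ord for HealTask {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.priority.cmp(&other.priority).then(other.seq.cmp(&self.seq))
    }
}
impl PartialOrd for HealTask {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut queue = BinaryHeap::new();
    queue.push(HealTask { priority: HealPriority::Normal, seq: 0, object: "a".into() });
    queue.push(HealTask { priority: HealPriority::Emergency, seq: 1, object: "b".into() });
    assert_eq!(queue.pop().unwrap().object, "b"); // Emergency dequeues first
}
```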
## API Usage Guide

### 1. System Management API

#### Start AHM System

```http
POST /admin/system/start
Content-Type: application/json

{
  "coordinator": {
    "event_buffer_size": 10000,
    "max_concurrent_operations": 1000
  },
  "scanner": {
    "default_scan_mode": "Normal",
    "scan_interval": "24h"
  },
  "heal": {
    "max_workers": 16,
    "queue_capacity": 50000
  }
}
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "system_id": "ahm-001",
    "status": "Running",
    "started_at": "2024-01-15T10:30:00Z"
  },
  "timestamp": "2024-01-15T10:30:00Z"
}
```

#### Get System Status

```http
GET /status/health
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "status": "Running",
    "version": "1.0.0",
    "uptime_seconds": 3600,
    "subsystems": {
      "scanner": {
        "status": "Scanning",
        "last_check": "2024-01-15T10:29:00Z",
        "error_message": null
      },
      "heal": {
        "status": "Idle",
        "last_check": "2024-01-15T10:29:00Z",
        "error_message": null
      },
      "metrics": {
        "status": "Running",
        "last_check": "2024-01-15T10:29:00Z",
        "error_message": null
      }
    }
  },
  "timestamp": "2024-01-15T10:30:00Z"
}
```

### 2. Scan Management API

#### Start Scan Task

```http
POST /admin/scan/start
Content-Type: application/json

{
  "strategy": {
    "type": "Full",
    "mode": "Normal",
    "scope": {
      "buckets": ["important-data", "user-uploads"],
      "include_system_objects": false,
      "max_objects": 1000000
    }
  },
  "priority": "High"
}
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "scan_id": "scan-12345",
    "status": "Started",
    "estimated_duration": "2h30m",
    "estimated_objects": 850000
  },
  "timestamp": "2024-01-15T10:30:00Z"
}
```

#### Query Scan Status

```http
GET /admin/scan/{scan_id}/status
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "scan_id": "scan-12345",
    "status": "Scanning",
    "progress": {
      "objects_scanned": 425000,
      "bytes_scanned": 1073741824000,
      "issues_detected": 23,
      "completion_percentage": 50.0,
      "scan_rate_ops": 117.5,
      "scan_rate_bps": 268435456,
      "elapsed_time": "1h15m",
      "estimated_remaining": "1h15m"
    },
    "issues": [
      {
        "issue_type": "MissingShards",
        "severity": "High",
        "bucket": "user-uploads",
        "object": "photos/IMG_001.jpg",
        "description": "Missing 1 data shard",
        "detected_at": "2024-01-15T11:15:00Z"
      }
    ]
  },
  "timestamp": "2024-01-15T11:45:00Z"
}
```

### 3. Heal Management API

#### Submit Heal Request

```http
POST /admin/heal/request
Content-Type: application/json

{
  "bucket": "user-uploads",
  "object": "photos/IMG_001.jpg",
  "version_id": null,
  "priority": "High",
  "mode": "OnDemand",
  "max_retries": 3
}
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "heal_request_id": "heal-67890",
    "status": "Queued",
    "priority": "High",
    "estimated_start": "2024-01-15T11:50:00Z",
    "queue_position": 5
  },
  "timestamp": "2024-01-15T11:45:00Z"
}
```

#### Query Heal Status

```http
GET /admin/heal/{heal_request_id}/status
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "heal_request_id": "heal-67890",
    "status": "Completed",
    "result": {
      "success": true,
      "shards_repaired": 1,
      "total_shards": 8,
      "duration": "45s",
      "strategy_used": "ParityShardRepair",
      "validation_results": [
        {
          "validation_type": "Checksum",
          "passed": true,
          "details": "Object checksum verified",
          "duration": "2s"
        },
        {
          "validation_type": "ShardCount",
          "passed": true,
          "details": "All 8 shards present",
          "duration": "1s"
        }
      ]
    }
  },
  "timestamp": "2024-01-15T11:46:00Z"
}
```

### 4. Metrics Query API

#### Get System Metrics

```http
GET /metrics/system?period=1h&metrics=objects_total,scan_rate,heal_success_rate
```

**Response Example:**
```json
{
  "success": true,
  "data": {
    "period": "1h",
    "timestamp_range": {
      "start": "2024-01-15T10:45:00Z",
      "end": "2024-01-15T11:45:00Z"
    },
    "metrics": {
      "objects_total": {
        "value": 2500000,
        "unit": "count",
        "labels": {}
      },
      "scan_rate_objects_per_second": {
        "value": 117.5,
        "unit": "ops",
        "labels": {}
      },
      "heal_success_rate": {
        "value": 0.98,
        "unit": "ratio",
        "labels": {}
      }
    }
  },
  "timestamp": "2024-01-15T11:45:00Z"
}
```

#### Export Prometheus Format Metrics

```http
GET /metrics/prometheus
```

**Response Example:**
```
# HELP rustfs_objects_total Total number of objects in the system
# TYPE rustfs_objects_total gauge
rustfs_objects_total 2500000

# HELP rustfs_scan_rate_objects_per_second Object scanning rate
# TYPE rustfs_scan_rate_objects_per_second gauge
rustfs_scan_rate_objects_per_second 117.5

# HELP rustfs_heal_success_rate Healing operation success rate
# TYPE rustfs_heal_success_rate gauge
rustfs_heal_success_rate 0.98

# HELP rustfs_health_issues_total Total health issues detected
# TYPE rustfs_health_issues_total counter
rustfs_health_issues_total{severity="critical"} 0
rustfs_health_issues_total{severity="high"} 3
rustfs_health_issues_total{severity="medium"} 15
rustfs_health_issues_total{severity="low"} 45
```
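The output above is the plain Prometheus text exposition format, so a reporter needs little more than careful string formatting. Below is a minimal sketch of rendering one gauge; `render_gauge` is an illustrative helper, not the actual `reporter.rs` API:

```rust
/// Render one gauge in Prometheus text exposition format.
fn render_gauge(name: &str, help: &str, labels: &[(&str, &str)], value: f64) -> String {
    let label_str = if labels.is_empty() {
        String::new()
    } else {
        // severity="high",bucket="user-uploads" style label pairs
        let inner: Vec<String> = labels.iter().map(|(k, v)| format!("{k}=\"{v}\"")).collect();
        format!("{{{}}}", inner.join(","))
    };
    format!("# HELP {name} {help}\n# TYPE {name} gauge\n{name}{label_str} {value}\n")
}

fn main() {
    print!(
        "{}",
        render_gauge("rustfs_heal_success_rate", "Healing operation success rate", &[], 0.98)
    );
}
```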
### 5. Policy Configuration API

#### Update Scan Policy

```http
PUT /admin/policy/scan
Content-Type: application/json

{
  "default_scan_interval": "12h",
  "deep_scan_probability": 0.1,
  "bandwidth_limit_mbps": 100,
  "concurrent_scanners": 4,
  "skip_system_objects": true,
  "priority_buckets": ["critical-data", "user-data"]
}
```

#### Update Heal Policy

```http
PUT /admin/policy/heal
Content-Type: application/json

{
  "max_concurrent_heals": 8,
  "emergency_heal_timeout": "5m",
  "auto_heal_enabled": true,
  "heal_verification_required": true,
  "priority_mapping": {
    "critical_buckets": "Emergency",
    "important_buckets": "High",
    "standard_buckets": "Normal"
  }
}
```

## Usage Examples

### Complete Monitoring and Repair Workflow

```bash
# 1. Start AHM system
curl -X POST http://localhost:9000/admin/system/start \
  -H "Content-Type: application/json" \
  -d '{"scanner": {"default_scan_mode": "Normal"}}'

# 2. Start full scan
SCAN_ID=$(curl -X POST http://localhost:9000/admin/scan/start \
  -H "Content-Type: application/json" \
  -d '{"strategy": {"type": "Full", "mode": "Normal"}}' | \
  jq -r '.data.scan_id')

# 3. Monitor scan progress
watch "curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | jq '.data.progress'"

# 4. View discovered issues
curl -s http://localhost:9000/admin/scan/$SCAN_ID/status | \
  jq '.data.issues[]'

# 5. Start repair for discovered issues
HEAL_ID=$(curl -X POST http://localhost:9000/admin/heal/request \
  -H "Content-Type: application/json" \
  -d '{
    "bucket": "user-uploads",
    "object": "photos/IMG_001.jpg",
    "priority": "High"
  }' | jq -r '.data.heal_request_id')

# 6. Monitor repair progress
watch "curl -s http://localhost:9000/admin/heal/$HEAL_ID/status | jq '.data'"

# 7. View system metrics
curl -s http://localhost:9000/metrics/system?period=1h | jq '.data.metrics'

# 8. Export Prometheus metrics
curl -s http://localhost:9000/metrics/prometheus
```

## Key Features

### 1. Intelligent Scanning
- **Multi-level scan modes**: three scan depths (Quick, Normal, Deep)
- **Adaptive sampling**: intelligent object selection based on historical data
- **Bandwidth control**: configurable I/O resource limits
- **Incremental scanning**: timestamp-based change detection

### 2. Intelligent Repair
- **Priority queue**: repair ordering based on business importance
- **Multiple repair strategies**: data-shard, parity-shard, and hybrid repair
- **Real-time validation**: post-repair integrity verification
- **Retry mechanism**: configurable failure retry policies

### 3. Rich Metrics
- **Real-time statistics**: object counts, storage usage, performance metrics
- **Historical trends**: time-series data storage and analysis
- **Multi-format export**: Prometheus, JSON, and CSV formats
- **Custom metrics**: extensible metrics definition framework

### 4. Policy-Driven
- **Configurable policies**: independent configuration for scan, heal, and retention policies
- **Dynamic adjustment**: runtime policy updates without restart
- **Business alignment**: differentiated handling based on business importance

## Deployment Recommendations

### 1. Resource Configuration
- **CPU**: 16+ cores recommended for parallel scanning and repair
- **Memory**: 32 GB+ recommended for the metrics cache and task queues
- **Network**: gigabit or faster links recommended for cross-node data sync
- **Storage**: SSDs recommended for metrics data storage

### 2. Monitoring Integration
- **Prometheus**: metrics collection and alerting
- **Grafana**: visualization dashboards
- **ELK Stack**: log aggregation and analysis
- **Jaeger**: distributed tracing

### 3. High Availability Deployment
- **Multi-instance deployment**: avoid single points of failure
- **Load balancing**: API request distribution
- **Data backup**: back up metrics and configuration data
- **Failover**: automatic failure detection and switching

This architecture gives RustFS modern, scalable, and highly observable health monitoring and repair capabilities that meet the operational requirements of enterprise-grade distributed storage systems.
crates/ahm/architecture_ch.md (new file, 557 lines)
@@ -0,0 +1,557 @@

[Chinese translation of crates/ahm/architecture.md; its 557 lines mirror the English document above section for section.]
crates/ahm/src/api/admin_api.rs (new file, 843 lines)
@@ -0,0 +1,843 @@
// Copyright 2024 RustFS Team

use std::sync::Arc;

use tracing::{debug, error, info};

use crate::{
    error::Result,
    heal::HealEngine,
    policy::ScanPolicyEngine as PolicyEngine,
    scanner::Engine as ScanEngine,
};

use super::{HttpRequest, HttpResponse};

/// Configuration for the admin API
#[derive(Debug, Clone)]
pub struct AdminApiConfig {
    /// Whether the admin API is enabled
    pub enabled: bool,
    /// Path prefix for admin endpoints
    pub prefix: String,
    /// Whether authentication is required
    pub require_auth: bool,
    /// Bearer token accepted for admin access
    pub admin_token: Option<String>,
    /// Rate limit for admin endpoints
    pub rate_limit_requests_per_minute: u32,
    /// Maximum request body size in bytes
    pub max_request_size: usize,
    /// Whether to write audit log entries
    pub enable_audit_logging: bool,
    /// Audit log path
    pub audit_log_path: Option<String>,
}

impl Default for AdminApiConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            prefix: "/admin".to_string(),
            require_auth: true,
            admin_token: Some("admin-secret-token".to_string()),
            rate_limit_requests_per_minute: 100,
            max_request_size: 1024 * 1024, // 1 MB
            enable_audit_logging: true,
            audit_log_path: Some("/tmp/rustfs/admin-audit.log".to_string()),
        }
    }
}

/// Build a JSON response with the given status code.
fn json_response(status_code: u16, body: serde_json::Value) -> HttpResponse {
    HttpResponse {
        status_code,
        headers: vec![("Content-Type".to_string(), "application/json".to_string())],
        body: body.to_string(),
    }
}

/// Admin API that provides administrative operations
pub struct AdminApi {
    config: AdminApiConfig,
    scan_engine: Arc<ScanEngine>,
    heal_engine: Arc<HealEngine>,
    policy_engine: Arc<PolicyEngine>,
}

impl AdminApi {
    /// Create a new admin API
    pub async fn new(
        config: AdminApiConfig,
        scan_engine: Arc<ScanEngine>,
        heal_engine: Arc<HealEngine>,
        policy_engine: Arc<PolicyEngine>,
    ) -> Result<Self> {
        Ok(Self {
            config,
            scan_engine,
            heal_engine,
            policy_engine,
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &AdminApiConfig {
        &self.config
    }

    /// Handle an HTTP request
    pub async fn handle_request(&self, request: HttpRequest) -> Result<HttpResponse> {
        // Check authentication if required
        if self.config.require_auth && !self.authenticate_request(&request).await? {
            return Ok(json_response(
                401,
                serde_json::json!({
                    "error": "Unauthorized",
                    "message": "Authentication required"
                }),
            ));
        }

        // Write an audit entry if enabled
        if self.config.enable_audit_logging {
            self.log_audit(&request).await?;
        }

        match request.path.as_str() {
            // Scan operations. Note: the guarded (method-specific) arm must
            // precede the unguarded arm for the same path, otherwise the
            // unguarded arm matches first and the guard is unreachable.
            "/admin/scan/start" => self.start_scan(request).await,
            "/admin/scan/stop" => self.stop_scan(request).await,
            "/admin/scan/status" => self.get_scan_status(request).await,
            "/admin/scan/config" if request.method == "PUT" => self.update_scan_config(request).await,
            "/admin/scan/config" => self.get_scan_config(request).await,

            // Heal operations
            "/admin/heal/start" => self.start_heal(request).await,
            "/admin/heal/stop" => self.stop_heal(request).await,
            "/admin/heal/status" => self.get_heal_status(request).await,
            "/admin/heal/config" if request.method == "PUT" => self.update_heal_config(request).await,
            "/admin/heal/config" => self.get_heal_config(request).await,

            // Policy operations
            "/admin/policy/list" => self.list_policies(request).await,
            "/admin/policy/get" => self.get_policy(request).await,
            "/admin/policy/create" => self.create_policy(request).await,
            "/admin/policy/update" => self.update_policy(request).await,
            "/admin/policy/delete" => self.delete_policy(request).await,

            // System operations
            "/admin/system/status" => self.get_system_status(request).await,
            "/admin/system/config" => self.get_system_config(request).await,
            "/admin/system/restart" => self.restart_system(request).await,
            "/admin/system/shutdown" => self.shutdown_system(request).await,

            // Default 404
            _ => Ok(json_response(
                404,
                serde_json::json!({
                    "error": "Not Found",
                    "message": "Admin endpoint not found"
                }),
            )),
        }
    }

    /// Authenticate a request against the configured admin token
    async fn authenticate_request(&self, request: &HttpRequest) -> Result<bool> {
        if let Some(token) = &self.config.admin_token {
            // Check the Authorization header
            if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") {
                if auth_header.1 == format!("Bearer {}", token) {
                    return Ok(true);
                }
            }

            // Fall back to a token in the query parameters
            if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") {
                if token_param.1 == *token {
                    return Ok(true);
                }
            }
        }

        Ok(false)
    }

    /// Write an audit log entry
    async fn log_audit(&self, request: &HttpRequest) -> Result<()> {
        let audit_entry = serde_json::json!({
            "timestamp": chrono::Utc::now().to_rfc3339(),
            "method": request.method,
            "path": request.path,
            "ip": "127.0.0.1",         // In a real implementation, taken from the request
            "user_agent": "admin-api", // In a real implementation, taken from the headers
        });

        if self.config.audit_log_path.is_some() {
            // In a real implementation, this would append to the audit log file
            debug!("Audit log entry: {}", audit_entry);
        }

        Ok(())
    }

    /// Start scan operation
    async fn start_scan(&self, _request: HttpRequest) -> Result<HttpResponse> {
        match self.scan_engine.start_scan().await {
            Ok(_) => {
                info!("Scan started via admin API");
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Scan started successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => {
                error!("Failed to start scan: {}", e);
                Ok(json_response(
                    500,
                    serde_json::json!({
                        "error": "Internal Server Error",
                        "message": format!("Failed to start scan: {}", e)
                    }),
                ))
            }
        }
    }

    /// Stop scan operation
    async fn stop_scan(&self, _request: HttpRequest) -> Result<HttpResponse> {
        match self.scan_engine.stop_scan().await {
            Ok(_) => {
                info!("Scan stopped via admin API");
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Scan stopped successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => {
                error!("Failed to stop scan: {}", e);
                Ok(json_response(
                    500,
                    serde_json::json!({
                        "error": "Internal Server Error",
                        "message": format!("Failed to stop scan: {}", e)
                    }),
                ))
            }
        }
    }

    /// Get scan status
    async fn get_scan_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let status = self.scan_engine.get_status().await;
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "scan_status": status,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Get scan configuration
    async fn get_scan_config(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let config = self.scan_engine.get_config().await;
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "scan_config": config,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Update scan configuration
    async fn update_scan_config(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(body) = request.body else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Request body required" }),
            ));
        };
        match serde_json::from_str::<serde_json::Value>(&body) {
            Ok(config_json) => {
                // In a real implementation, this would update the scan configuration
                info!("Scan config updated via admin API: {:?}", config_json);
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Scan configuration updated successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": format!("Invalid JSON: {}", e) }),
            )),
        }
    }

    /// Start heal operation
    async fn start_heal(&self, _request: HttpRequest) -> Result<HttpResponse> {
        match self.heal_engine.start_healing().await {
            Ok(_) => {
                info!("Healing started via admin API");
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Healing started successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => {
                error!("Failed to start healing: {}", e);
                Ok(json_response(
                    500,
                    serde_json::json!({
                        "error": "Internal Server Error",
                        "message": format!("Failed to start healing: {}", e)
                    }),
                ))
            }
        }
    }

    /// Stop heal operation
    async fn stop_heal(&self, _request: HttpRequest) -> Result<HttpResponse> {
        match self.heal_engine.stop_healing().await {
            Ok(_) => {
                info!("Healing stopped via admin API");
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Healing stopped successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => {
                error!("Failed to stop healing: {}", e);
                Ok(json_response(
                    500,
                    serde_json::json!({
                        "error": "Internal Server Error",
                        "message": format!("Failed to stop healing: {}", e)
                    }),
                ))
            }
        }
    }

    /// Get heal status
    async fn get_heal_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let status = self.heal_engine.get_status().await;
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "heal_status": status,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Get heal configuration
    async fn get_heal_config(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let config = self.heal_engine.get_config().await;
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "heal_config": config,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Update heal configuration
    async fn update_heal_config(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(body) = request.body else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Request body required" }),
            ));
        };
        match serde_json::from_str::<serde_json::Value>(&body) {
            Ok(config_json) => {
                // In a real implementation, this would update the heal configuration
                info!("Heal config updated via admin API: {:?}", config_json);
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Heal configuration updated successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": format!("Invalid JSON: {}", e) }),
            )),
        }
    }

    /// List policies
    async fn list_policies(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let policies = self.policy_engine.list_policies().await?;
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "policies": policies,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Get a policy by name
    async fn get_policy(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Policy name parameter required" }),
            ));
        };
        match self.policy_engine.get_policy(&policy_name.1).await {
            Ok(policy) => Ok(json_response(
                200,
                serde_json::json!({
                    "status": "success",
                    "policy": policy,
                    "timestamp": chrono::Utc::now().to_rfc3339()
                }),
            )),
            Err(e) => Ok(json_response(
                404,
                serde_json::json!({ "error": "Not Found", "message": format!("Policy not found: {}", e) }),
            )),
        }
    }

    /// Create a policy
    async fn create_policy(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(body) = request.body else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Request body required" }),
            ));
        };
        match serde_json::from_str::<serde_json::Value>(&body) {
            Ok(policy_json) => {
                // In a real implementation, this would create the policy
                info!("Policy created via admin API: {:?}", policy_json);
                Ok(json_response(
                    201,
                    serde_json::json!({
                        "status": "success",
                        "message": "Policy created successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": format!("Invalid JSON: {}", e) }),
            )),
        }
    }

    /// Update a policy
    async fn update_policy(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(body) = request.body else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Request body required" }),
            ));
        };
        match serde_json::from_str::<serde_json::Value>(&body) {
            Ok(policy_json) => {
                // In a real implementation, this would update the policy
                info!("Policy updated via admin API: {:?}", policy_json);
                Ok(json_response(
                    200,
                    serde_json::json!({
                        "status": "success",
                        "message": "Policy updated successfully",
                        "timestamp": chrono::Utc::now().to_rfc3339()
                    }),
                ))
            }
            Err(e) => Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": format!("Invalid JSON: {}", e) }),
            )),
        }
    }

    /// Delete a policy by name
    async fn delete_policy(&self, request: HttpRequest) -> Result<HttpResponse> {
        let Some(policy_name) = request.query_params.iter().find(|(k, _)| k == "name") else {
            return Ok(json_response(
                400,
                serde_json::json!({ "error": "Bad Request", "message": "Policy name parameter required" }),
            ));
        };
        // In a real implementation, this would delete the policy
        info!("Policy deleted via admin API: {}", policy_name.1);
        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "message": "Policy deleted successfully",
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Get system status
    async fn get_system_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.get_status().await;
        let heal_status = self.heal_engine.get_status().await;

        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "system_status": {
                    "scan": scan_status,
                    "heal": heal_status,
                    "overall": "healthy"
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Get system configuration
    async fn get_system_config(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_config = self.scan_engine.get_config().await;
        let heal_config = self.heal_engine.get_config().await;

        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "system_config": {
                    "scan": scan_config,
                    "heal": heal_config
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Restart system
    async fn restart_system(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // In a real implementation, this would restart the system
        info!("System restart requested via admin API");

        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "message": "System restart initiated",
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }

    /// Shutdown system
    async fn shutdown_system(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // In a real implementation, this would shut down the system
        info!("System shutdown requested via admin API");

        Ok(json_response(
            200,
            serde_json::json!({
                "status": "success",
                "message": "System shutdown initiated",
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        ))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        heal::HealEngineConfig,
        policy::PolicyEngineConfig,
        scanner::ScanEngineConfig,
    };

    #[tokio::test]
    async fn test_admin_api_creation() {
        let config = AdminApiConfig::default();
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());

        let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap();

        assert!(admin_api.config().enabled);
        assert_eq!(admin_api.config().prefix, "/admin");
    }

    #[tokio::test]
    async fn test_authentication() {
        let config = AdminApiConfig {
            admin_token: Some("test-token".to_string()),
            ..Default::default()
        };
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());

        let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap();

        // Test with valid token in header
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/admin/scan/status".to_string(),
            headers: vec![("Authorization".to_string(), "Bearer test-token".to_string())],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);

        // Test with valid token in query
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/admin/scan/status".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![("token".to_string(), "test-token".to_string())],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);

        // Test with invalid token
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/admin/scan/status".to_string(),
            headers: vec![("Authorization".to_string(), "Bearer invalid-token".to_string())],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 401);
    }

    #[tokio::test]
    async fn test_scan_operations() {
        let config = AdminApiConfig {
            require_auth: false, // Disable auth for testing
            ..Default::default()
        };
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());

        let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap();

        // Test start scan
        let request = HttpRequest {
            method: "POST".to_string(),
            path: "/admin/scan/start".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);

        // Test get scan status
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/admin/scan/status".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);
    }

    #[tokio::test]
    async fn test_heal_operations() {
        let config = AdminApiConfig {
            require_auth: false, // Disable auth for testing
            ..Default::default()
        };
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());

        let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap();

        // Test start heal
        let request = HttpRequest {
            method: "POST".to_string(),
            path: "/admin/heal/start".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);

        // Test get heal status
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/admin/heal/status".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };

        let response = admin_api.handle_request(request).await.unwrap();
        assert_eq!(response.status_code, 200);
    }

    #[tokio::test]
    async fn test_system_operations() {
        let config = AdminApiConfig {
            require_auth: false, // Disable auth for testing
            ..Default::default()
        };
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());

        let admin_api = AdminApi::new(config, scan_engine, heal_engine, policy_engine).await.unwrap();
|
||||
|
||||
// Test get system status
|
||||
let request = HttpRequest {
|
||||
method: "GET".to_string(),
|
||||
path: "/admin/system/status".to_string(),
|
||||
headers: vec![],
|
||||
body: None,
|
||||
query_params: vec![],
|
||||
};
|
||||
|
||||
let response = admin_api.handle_request(request).await.unwrap();
|
||||
assert_eq!(response.status_code, 200);
|
||||
assert!(response.body.contains("system_status"));
|
||||
}
|
||||
}
|
||||
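The tests above double as the usage reference for admin authentication: the token is accepted either as an `Authorization: Bearer` header or as a `token` query parameter, and anything else is rejected with 401. A minimal caller sketch, assuming the `AdminApi`, `HttpRequest`, and crate `Result` types introduced by this commit (`trigger_scan` itself is illustrative, not part of the crate):

```rust
// Illustrative sketch: drives AdminApi in-process, the way the unit tests do.
// Engine construction is elided; see test_authentication above for full setup.
async fn trigger_scan(admin_api: &AdminApi, token: &str) -> Result<bool> {
    let request = HttpRequest {
        method: "POST".to_string(),
        path: "/admin/scan/start".to_string(),
        // Header-based auth; pushing ("token", token) into query_params works too.
        headers: vec![("Authorization".to_string(), format!("Bearer {}", token))],
        body: None,
        query_params: vec![],
    };
    let response = admin_api.handle_request(request).await?;
    Ok(response.status_code == 200)
}
```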
1180    crates/ahm/src/api/metrics_api.rs    (new file; diff suppressed because it is too large)
504    crates/ahm/src/api/mod.rs    (new file)
@@ -0,0 +1,504 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! API interfaces for the AHM system
//!
//! Provides REST and gRPC endpoints for:
//! - Administrative operations
//! - Metrics and monitoring
//! - System status and control

pub mod admin_api;
pub mod metrics_api;
pub mod status_api;

pub use admin_api::{AdminApi, AdminApiConfig};
pub use metrics_api::{MetricsApi, MetricsApiConfig};
pub use status_api::{StatusApi, StatusApiConfig};

use std::sync::Arc;

use crate::{
    error::Result,
    heal::HealEngine,
    metrics::{Collector, Reporter, Storage},
    policy::ScanPolicyEngine as PolicyEngine,
    scanner::Engine as ScanEngine,
};

/// Configuration for the API server
#[derive(Debug, Clone)]
pub struct ApiConfig {
    /// Admin API configuration
    pub admin: AdminApiConfig,
    /// Metrics API configuration
    pub metrics: MetricsApiConfig,
    /// Status API configuration
    pub status: StatusApiConfig,
    /// Server address
    pub address: String,
    /// Server port
    pub port: u16,
    /// Enable HTTPS
    pub enable_https: bool,
    /// SSL certificate path
    pub ssl_cert_path: Option<String>,
    /// SSL key path
    pub ssl_key_path: Option<String>,
    /// Request timeout
    pub request_timeout: std::time::Duration,
    /// Maximum request size
    pub max_request_size: usize,
    /// Enable CORS
    pub enable_cors: bool,
    /// CORS origins
    pub cors_origins: Vec<String>,
    /// Enable rate limiting
    pub enable_rate_limiting: bool,
    /// Rate limit requests per minute
    pub rate_limit_requests_per_minute: u32,
}

impl Default for ApiConfig {
    fn default() -> Self {
        Self {
            admin: AdminApiConfig::default(),
            metrics: MetricsApiConfig::default(),
            status: StatusApiConfig::default(),
            address: "127.0.0.1".to_string(),
            port: 8080,
            enable_https: false,
            ssl_cert_path: None,
            ssl_key_path: None,
            request_timeout: std::time::Duration::from_secs(30),
            max_request_size: 1024 * 1024, // 1 MB
            enable_cors: true,
            cors_origins: vec!["*".to_string()],
            enable_rate_limiting: true,
            rate_limit_requests_per_minute: 1000,
        }
    }
}

/// API server that provides HTTP endpoints for AHM functionality
pub struct ApiServer {
    config: ApiConfig,
    admin_api: Arc<AdminApi>,
    metrics_api: Arc<MetricsApi>,
    status_api: Arc<StatusApi>,
    scan_engine: Arc<ScanEngine>,
    heal_engine: Arc<HealEngine>,
    policy_engine: Arc<PolicyEngine>,
    metrics_collector: Arc<Collector>,
    metrics_reporter: Arc<Reporter>,
    metrics_storage: Arc<Storage>,
}

impl ApiServer {
    /// Create a new API server
    pub async fn new(
        config: ApiConfig,
        scan_engine: Arc<ScanEngine>,
        heal_engine: Arc<HealEngine>,
        policy_engine: Arc<PolicyEngine>,
        metrics_collector: Arc<Collector>,
        metrics_reporter: Arc<Reporter>,
        metrics_storage: Arc<Storage>,
    ) -> Result<Self> {
        let admin_api = Arc::new(
            AdminApi::new(config.admin.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?,
        );
        let metrics_api = Arc::new(
            MetricsApi::new(config.metrics.clone(), metrics_collector.clone(), metrics_reporter.clone(), metrics_storage.clone()).await?,
        );
        let status_api = Arc::new(
            StatusApi::new(config.status.clone(), scan_engine.clone(), heal_engine.clone(), policy_engine.clone()).await?,
        );

        Ok(Self {
            config,
            admin_api,
            metrics_api,
            status_api,
            scan_engine,
            heal_engine,
            policy_engine,
            metrics_collector,
            metrics_reporter,
            metrics_storage,
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &ApiConfig {
        &self.config
    }

    /// Start the API server
    pub async fn start(&self) -> Result<()> {
        // In a real implementation, this would start an HTTP server
        // For now, we'll just simulate the server startup
        tracing::info!("API server starting on {}:{}", self.config.address, self.config.port);

        if self.config.enable_https {
            tracing::info!("HTTPS enabled");
        }

        if self.config.enable_cors {
            tracing::info!("CORS enabled with origins: {:?}", self.config.cors_origins);
        }

        if self.config.enable_rate_limiting {
            tracing::info!("Rate limiting enabled: {} requests/minute", self.config.rate_limit_requests_per_minute);
        }

        tracing::info!("API server started successfully");
        Ok(())
    }

    /// Stop the API server
    pub async fn stop(&self) -> Result<()> {
        tracing::info!("API server stopping");
        tracing::info!("API server stopped successfully");
        Ok(())
    }

    /// Get server status
    pub async fn status(&self) -> ServerStatus {
        ServerStatus {
            address: self.config.address.clone(),
            port: self.config.port,
            https_enabled: self.config.enable_https,
            cors_enabled: self.config.enable_cors,
            rate_limiting_enabled: self.config.enable_rate_limiting,
            admin_api_enabled: true,
            metrics_api_enabled: true,
            status_api_enabled: true,
        }
    }

    /// Get admin API
    pub fn admin_api(&self) -> &Arc<AdminApi> {
        &self.admin_api
    }

    /// Get metrics API
    pub fn metrics_api(&self) -> &Arc<MetricsApi> {
        &self.metrics_api
    }

    /// Get status API
    pub fn status_api(&self) -> &Arc<StatusApi> {
        &self.status_api
    }

    /// Handle HTTP request
    pub async fn handle_request(&self, request: HttpRequest) -> Result<HttpResponse> {
        match request.path.as_str() {
            // Admin API routes
            path if path.starts_with("/admin") => self.admin_api.handle_request(request).await,
            // Metrics API routes
            path if path.starts_with("/metrics") => self.metrics_api.handle_request(request).await,
            // Status API routes
            path if path.starts_with("/status") => self.status_api.handle_request(request).await,
            // Health check
            "/health" => Ok(HttpResponse {
                status_code: 200,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "status": "healthy",
                    "timestamp": chrono::Utc::now().to_rfc3339(),
                    "version": env!("CARGO_PKG_VERSION")
                }).to_string(),
            }),
            // Root endpoint
            "/" => Ok(HttpResponse {
                status_code: 200,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "service": "RustFS AHM API",
                    "version": env!("CARGO_PKG_VERSION"),
                    "endpoints": {
                        "admin": "/admin",
                        "metrics": "/metrics",
                        "status": "/status",
                        "health": "/health"
                    }
                }).to_string(),
            }),
            // 404 for unknown routes
            _ => Ok(HttpResponse {
                status_code: 404,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "error": "Not Found",
                    "message": "The requested endpoint does not exist"
                }).to_string(),
            }),
        }
    }
}

/// HTTP request
#[derive(Debug, Clone)]
pub struct HttpRequest {
    pub method: String,
    pub path: String,
    pub headers: Vec<(String, String)>,
    pub body: Option<String>,
    pub query_params: Vec<(String, String)>,
}

/// HTTP response
#[derive(Debug, Clone)]
pub struct HttpResponse {
    pub status_code: u16,
    pub headers: Vec<(String, String)>,
    pub body: String,
}

/// Server status
#[derive(Debug, Clone)]
pub struct ServerStatus {
    pub address: String,
    pub port: u16,
    pub https_enabled: bool,
    pub cors_enabled: bool,
    pub rate_limiting_enabled: bool,
    pub admin_api_enabled: bool,
    pub metrics_api_enabled: bool,
    pub status_api_enabled: bool,
}

/// API endpoint information
#[derive(Debug, Clone)]
pub struct EndpointInfo {
    pub path: String,
    pub method: String,
    pub description: String,
    pub parameters: Vec<ParameterInfo>,
    pub response_type: String,
}

/// Parameter information
#[derive(Debug, Clone)]
pub struct ParameterInfo {
    pub name: String,
    pub parameter_type: String,
    pub required: bool,
    pub description: String,
}

/// API documentation
#[derive(Debug, Clone)]
pub struct ApiDocumentation {
    pub title: String,
    pub version: String,
    pub description: String,
    pub endpoints: Vec<EndpointInfo>,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        heal::HealEngineConfig,
        metrics::{CollectorConfig, ReporterConfig, StorageConfig},
        policy::PolicyEngineConfig,
        scanner::ScanEngineConfig,
    };

    /// Build an ApiServer with default engines and metrics components.
    async fn test_server(config: ApiConfig) -> ApiServer {
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());
        let metrics_collector = Arc::new(Collector::new(CollectorConfig::default()).await.unwrap());
        let metrics_reporter = Arc::new(Reporter::new(ReporterConfig::default()).await.unwrap());
        let metrics_storage = Arc::new(Storage::new(StorageConfig::default()).await.unwrap());

        ApiServer::new(
            config,
            scan_engine,
            heal_engine,
            policy_engine,
            metrics_collector,
            metrics_reporter,
            metrics_storage,
        )
        .await
        .unwrap()
    }

    /// Build a bare GET request for the given path.
    fn get(path: &str) -> HttpRequest {
        HttpRequest {
            method: "GET".to_string(),
            path: path.to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        }
    }

    #[tokio::test]
    async fn test_api_server_creation() {
        let server = test_server(ApiConfig::default()).await;

        assert_eq!(server.config().port, 8080);
        assert_eq!(server.config().address, "127.0.0.1");
    }

    #[tokio::test]
    async fn test_api_server_start_stop() {
        let server = test_server(ApiConfig::default()).await;

        server.start().await.unwrap();
        server.stop().await.unwrap();
    }

    #[tokio::test]
    async fn test_api_server_status() {
        let server = test_server(ApiConfig::default()).await;

        let status = server.status().await;
        assert_eq!(status.port, 8080);
        assert_eq!(status.address, "127.0.0.1");
        assert!(status.admin_api_enabled);
        assert!(status.metrics_api_enabled);
        assert!(status.status_api_enabled);
    }

    #[tokio::test]
    async fn test_health_endpoint() {
        let server = test_server(ApiConfig::default()).await;

        let response = server.handle_request(get("/health")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("healthy"));
    }

    #[tokio::test]
    async fn test_root_endpoint() {
        let server = test_server(ApiConfig::default()).await;

        let response = server.handle_request(get("/")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("RustFS AHM API"));
    }

    #[tokio::test]
    async fn test_404_endpoint() {
        let server = test_server(ApiConfig::default()).await;

        let response = server.handle_request(get("/unknown")).await.unwrap();
        assert_eq!(response.status_code, 404);
        assert!(response.body.contains("Not Found"));
    }
}
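Because `start()` and `stop()` only log for now (the real HTTP listener is still to come), the routing contract lives entirely in `ApiServer::handle_request`: the `/admin`, `/metrics`, and `/status` prefixes are forwarded to their sub-APIs with the path untouched, so each sub-API matches on full paths such as `/admin/scan/start`. A minimal sketch of exercising that dispatch in-process, with server construction elided as in the tests above (`probe_routes` is illustrative, not part of the crate):

```rust
// Illustrative sketch: verifies the top-level prefix dispatch of handle_request.
async fn probe_routes(server: &ApiServer) -> Result<()> {
    for (path, expected) in [("/", 200), ("/health", 200), ("/does-not-exist", 404)] {
        let request = HttpRequest {
            method: "GET".to_string(),
            path: path.to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };
        let response = server.handle_request(request).await?;
        assert_eq!(response.status_code, expected, "unexpected status for {}", path);
    }
    Ok(())
}
```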
761    crates/ahm/src/api/status_api.rs    (new file)
@@ -0,0 +1,761 @@
// Copyright 2024 RustFS Team

use std::sync::Arc;

use tracing::info;

use crate::{
    error::Result,
    heal::HealEngine,
    policy::ScanPolicyEngine as PolicyEngine,
    scanner::Engine as ScanEngine,
};

use super::{HttpRequest, HttpResponse};

use serde::{Deserialize, Serialize};

/// Configuration for the status API
#[derive(Debug, Clone)]
pub struct StatusApiConfig {
    /// Whether to enable status API
    pub enabled: bool,
    /// Status API prefix
    pub prefix: String,
    /// Authentication required
    pub require_auth: bool,
    /// Status token
    pub status_token: Option<String>,
    /// Rate limiting for status endpoints
    pub rate_limit_requests_per_minute: u32,
    /// Maximum request body size
    pub max_request_size: usize,
    /// Enable detailed status information
    pub enable_detailed_status: bool,
    /// Status cache TTL in seconds
    pub status_cache_ttl_seconds: u64,
    /// Enable health checks
    pub enable_health_checks: bool,
    /// Health check timeout
    pub health_check_timeout: std::time::Duration,
}

impl Default for StatusApiConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            prefix: "/status".to_string(),
            require_auth: false,
            status_token: None,
            rate_limit_requests_per_minute: 1000,
            max_request_size: 1024 * 1024, // 1 MB
            enable_detailed_status: true,
            status_cache_ttl_seconds: 30, // 30 seconds
            enable_health_checks: true,
            health_check_timeout: std::time::Duration::from_secs(5),
        }
    }
}

/// Status API that provides system status and health information
pub struct StatusApi {
    config: StatusApiConfig,
    scan_engine: Arc<ScanEngine>,
    heal_engine: Arc<HealEngine>,
    policy_engine: Arc<PolicyEngine>,
}

impl StatusApi {
    /// Create a new status API
    pub async fn new(
        config: StatusApiConfig,
        scan_engine: Arc<ScanEngine>,
        heal_engine: Arc<HealEngine>,
        policy_engine: Arc<PolicyEngine>,
    ) -> Result<Self> {
        Ok(Self {
            config,
            scan_engine,
            heal_engine,
            policy_engine,
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &StatusApiConfig {
        &self.config
    }

    /// Handle HTTP request
    pub async fn handle_request(&self, request: HttpRequest) -> Result<HttpResponse> {
        // Check authentication if required
        if self.config.require_auth && !self.authenticate_request(&request).await? {
            return Ok(HttpResponse {
                status_code: 401,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "error": "Unauthorized",
                    "message": "Authentication required"
                }).to_string(),
            });
        }

        match request.path.as_str() {
            // Basic status
            "/status" => self.get_status(request).await,
            "/status/health" => self.get_health_status(request).await,
            "/status/overview" => self.get_overview_status(request).await,

            // Component status
            "/status/scan" => self.get_scan_status(request).await,
            "/status/heal" => self.get_heal_status(request).await,
            "/status/policy" => self.get_policy_status(request).await,

            // Detailed status
            "/status/detailed" => self.get_detailed_status(request).await,
            "/status/components" => self.get_components_status(request).await,
            "/status/resources" => self.get_resources_status(request).await,

            // Health checks
            "/status/health/check" => self.perform_health_check(request).await,
            "/status/health/readiness" => self.get_readiness_status(request).await,
            "/status/health/liveness" => self.get_liveness_status(request).await,

            // System information
            "/status/info" => self.get_system_info(request).await,
            "/status/version" => self.get_version_info(request).await,
            "/status/uptime" => self.get_uptime_info(request).await,

            // Default 404
            _ => Ok(HttpResponse {
                status_code: 404,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "error": "Not Found",
                    "message": "Status endpoint not found"
                }).to_string(),
            }),
        }
    }

    /// Authenticate request
    async fn authenticate_request(&self, request: &HttpRequest) -> Result<bool> {
        if let Some(token) = &self.config.status_token {
            // Check for Authorization header
            if let Some(auth_header) = request.headers.iter().find(|(k, _)| k.to_lowercase() == "authorization") {
                if auth_header.1 == format!("Bearer {}", token) {
                    return Ok(true);
                }
            }

            // Check for token in query parameters
            if let Some(token_param) = request.query_params.iter().find(|(k, _)| k == "token") {
                if token_param.1 == *token {
                    return Ok(true);
                }
            }
        }

        Ok(false)
    }

    /// Get basic status
    async fn get_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;

        let overall_status = if scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running {
            "healthy"
        } else if scan_status == crate::scanner::Status::Stopped && heal_status == crate::heal::Status::Stopped {
            "stopped"
        } else {
            "degraded"
        };

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "overall_status": overall_status,
                "components": {
                    "scan": scan_status,
                    "heal": heal_status
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get health status
    async fn get_health_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;

        let is_healthy = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running;
        let status_code = if is_healthy { 200 } else { 503 };

        Ok(HttpResponse {
            status_code,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": if is_healthy { "healthy" } else { "unhealthy" },
                "components": {
                    "scan": {
                        "status": scan_status,
                        "healthy": scan_status == crate::scanner::Status::Running
                    },
                    "heal": {
                        "status": heal_status,
                        "healthy": heal_status == crate::heal::Status::Running
                    }
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get overview status
    async fn get_overview_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;

        let scan_config = self.scan_engine.get_config().await;
        let heal_config = self.heal_engine.get_config().await;

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "overview": {
                    "scan": {
                        "status": scan_status,
                        "enabled": scan_config.enabled,
                        "scan_interval": scan_config.scan_interval.as_secs()
                    },
                    "heal": {
                        "status": heal_status,
                        "enabled": heal_config.auto_heal_enabled,
                        "max_workers": heal_config.max_workers
                    }
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get scan status
    async fn get_scan_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let status = self.scan_engine.status().await;
        let config = self.scan_engine.get_config().await;

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "scan": {
                    "status": status,
                    "enabled": config.enabled,
                    "scan_interval": config.scan_interval.as_secs(),
                    "max_concurrent_scans": config.max_concurrent_scans,
                    "scan_paths": config.scan_paths,
                    "bandwidth_limit": config.bandwidth_limit
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get heal status
    async fn get_heal_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let status = self.heal_engine.get_status().await;
        let config = self.heal_engine.get_config().await;

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "heal": {
                    "status": status,
                    "enabled": config.auto_heal_enabled,
                    "max_workers": config.max_workers,
                    "repair_timeout": config.repair_timeout.as_secs(),
                    "retry_attempts": config.max_retry_attempts,
                    "priority_queue_size": config.max_queue_size
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get policy status
    async fn get_policy_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let policies = self.policy_engine.list_policies().await?;
        let config = self.policy_engine.get_config().await;

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "policy": {
                    "enabled": config.enabled,
                    "total_policies": policies.len(),
                    "policies": policies,
                    "evaluation_timeout": config.evaluation_timeout.as_secs(),
                    "cache_enabled": config.cache_enabled
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get detailed status
    async fn get_detailed_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        if !self.config.enable_detailed_status {
            return Ok(HttpResponse {
                status_code: 403,
                headers: vec![("Content-Type".to_string(), "application/json".to_string())],
                body: serde_json::json!({
                    "error": "Forbidden",
                    "message": "Detailed status is disabled"
                }).to_string(),
            });
        }

        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;
        let scan_config = self.scan_engine.get_config().await;
        let heal_config = self.heal_engine.get_config().await;
        let policy_config = self.policy_engine.get_config().await;

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "detailed_status": {
                    "scan": {
                        "status": scan_status,
                        "config": scan_config
                    },
                    "heal": {
                        "status": heal_status,
                        "config": heal_config
                    },
                    "policy": {
                        "config": policy_config
                    }
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get components status
    async fn get_components_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;

        let components = vec![
            serde_json::json!({
                "name": "scan_engine",
                "status": scan_status,
                "healthy": scan_status == crate::scanner::Status::Running,
                "type": "scanner"
            }),
            serde_json::json!({
                "name": "heal_engine",
                "status": heal_status,
                "healthy": heal_status == crate::heal::Status::Running,
                "type": "healer"
            }),
            serde_json::json!({
                "name": "policy_engine",
                "status": "running",
                "healthy": true,
                "type": "policy"
            }),
        ];

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "components": components,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get resources status
    async fn get_resources_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // In a real implementation, this would collect actual resource usage
        // For now, we'll return simulated data
        let resources = serde_json::json!({
            "cpu": {
                "usage_percent": 25.5,
                "cores": 8,
                "load_average": 0.75
            },
            "memory": {
                "usage_percent": 60.2,
                "total_bytes": 8589934592u64, // 8 GB
                "available_bytes": 3422552064u64 // ~3.2 GB
            },
            "disk": {
                "usage_percent": 45.8,
                "total_bytes": 107374182400u64, // 100 GB
                "available_bytes": 58133032960u64 // ~54 GB
            },
            "network": {
                "bytes_received_per_sec": 1048576, // 1 MB/s
                "bytes_sent_per_sec": 524288 // 512 KB/s
            }
        });

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "resources": resources,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Perform health checks
    async fn perform_health_checks(&self) -> Result<Vec<HealthCheckResult>> {
        let mut checks = Vec::new();
        let start_time = std::time::Instant::now();

        // Check scan engine
        let scan_start = std::time::Instant::now();
        let scan_status = self.scan_engine.status().await;
        let scan_duration = scan_start.elapsed();
        checks.push(HealthCheckResult {
            name: "scan_engine".to_string(),
            healthy: scan_status == crate::scanner::Status::Running,
            message: format!("Scan engine status: {:?}", scan_status),
            duration_ms: scan_duration.as_millis() as u64,
        });

        // Check heal engine
        let heal_start = std::time::Instant::now();
        let heal_status = self.heal_engine.get_status().await;
        let heal_duration = heal_start.elapsed();
        checks.push(HealthCheckResult {
            name: "heal_engine".to_string(),
            healthy: heal_status == crate::heal::Status::Running,
            message: format!("Heal engine status: {:?}", heal_status),
            duration_ms: heal_duration.as_millis() as u64,
        });

        // Check policy engine
        let policy_start = std::time::Instant::now();
        let policy_result = self.policy_engine.list_policies().await;
        let policy_duration = policy_start.elapsed();
        checks.push(HealthCheckResult {
            name: "policy_engine".to_string(),
            healthy: policy_result.is_ok(),
            message: if policy_result.is_ok() {
                "Policy engine is responding".to_string()
            } else {
                format!("Policy engine error: {:?}", policy_result.unwrap_err())
            },
            duration_ms: policy_duration.as_millis() as u64,
        });

        let total_duration = start_time.elapsed();
        info!("Health checks completed in {:?}", total_duration);

        Ok(checks)
    }
    /// Perform health check (HTTP wrapper around `perform_health_checks`)
    async fn perform_health_check(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // Measure the wall-clock time of the whole check run
        let start = std::time::Instant::now();
        let checks = self.perform_health_checks().await?;
        let all_healthy = checks.iter().all(|check| check.healthy);
        let check_time = start.elapsed();

        Ok(HttpResponse {
            status_code: if all_healthy { 200 } else { 503 },
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": if all_healthy { "healthy" } else { "unhealthy" },
                "checks": checks,
                "check_time_ms": check_time.as_millis() as u64,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }
    /// Get readiness status
    async fn get_readiness_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let scan_status = self.scan_engine.status().await;
        let heal_status = self.heal_engine.get_status().await;

        let is_ready = scan_status == crate::scanner::Status::Running && heal_status == crate::heal::Status::Running;
        let status_code = if is_ready { 200 } else { 503 };

        Ok(HttpResponse {
            status_code,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": if is_ready { "ready" } else { "not_ready" },
                "components": {
                    "scan_engine": scan_status == crate::scanner::Status::Running,
                    "heal_engine": heal_status == crate::heal::Status::Running
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get liveness status
    async fn get_liveness_status(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // Liveness check is simple: if we can respond, we're alive
        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "alive",
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get system information
    async fn get_system_info(&self, _request: HttpRequest) -> Result<HttpResponse> {
        let system_info = serde_json::json!({
            "service": "RustFS AHM",
            "version": env!("CARGO_PKG_VERSION"),
            "system_info": {
                "rust_version": option_env!("RUST_VERSION").unwrap_or("unknown"),
                "target_arch": option_env!("TARGET_ARCH").unwrap_or("unknown"),
                "target_os": option_env!("TARGET_OS").unwrap_or("unknown"),
                "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"),
                "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"),
                "git_branch": option_env!("VERGEN_GIT_BRANCH").unwrap_or("unknown"),
            },
        });

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "system_info": system_info,
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get version information
    async fn get_version_info(&self, _request: HttpRequest) -> Result<HttpResponse> {
        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "version": env!("CARGO_PKG_VERSION"),
                "build_time": option_env!("VERGEN_BUILD_TIMESTAMP").unwrap_or("unknown"),
                "git_commit": option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"),
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }

    /// Get uptime information
    async fn get_uptime_info(&self, _request: HttpRequest) -> Result<HttpResponse> {
        // In a real implementation, this would track actual uptime
        // For now, we'll return simulated data
        let uptime_seconds = 3600; // 1 hour
        let uptime_duration = std::time::Duration::from_secs(uptime_seconds);

        Ok(HttpResponse {
            status_code: 200,
            headers: vec![("Content-Type".to_string(), "application/json".to_string())],
            body: serde_json::json!({
                "status": "success",
                "uptime": {
                    "seconds": uptime_seconds,
                    "duration": format!("{:?}", uptime_duration),
                    "start_time": chrono::Utc::now() - chrono::Duration::seconds(uptime_seconds as i64)
                },
                "timestamp": chrono::Utc::now().to_rfc3339()
            }).to_string(),
        })
    }
}

/// Health check result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckResult {
    pub name: String,
    pub healthy: bool,
    pub message: String,
    pub duration_ms: u64,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        heal::HealEngineConfig,
        policy::PolicyEngineConfig,
        scanner::ScanEngineConfig,
    };

    /// Build a StatusApi with default config and engines.
    async fn test_api() -> StatusApi {
        let scan_engine = Arc::new(ScanEngine::new(ScanEngineConfig::default()).await.unwrap());
        let heal_engine = Arc::new(HealEngine::new(HealEngineConfig::default()).await.unwrap());
        let policy_engine = Arc::new(PolicyEngine::new(PolicyEngineConfig::default()).await.unwrap());
        StatusApi::new(StatusApiConfig::default(), scan_engine, heal_engine, policy_engine).await.unwrap()
    }

    /// Build a bare GET request for the given path.
    fn get(path: &str) -> HttpRequest {
        HttpRequest {
            method: "GET".to_string(),
            path: path.to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        }
    }

    #[tokio::test]
    async fn test_status_api_creation() {
        let status_api = test_api().await;

        assert!(status_api.config().enabled);
        assert_eq!(status_api.config().prefix, "/status");
    }

    #[tokio::test]
    async fn test_basic_status() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("overall_status"));
    }

    #[tokio::test]
    async fn test_health_status() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status/health")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("status"));
    }

    #[tokio::test]
    async fn test_scan_status() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status/scan")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("scan"));
    }

    #[tokio::test]
    async fn test_heal_status() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status/heal")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("heal"));
    }

    #[tokio::test]
    async fn test_version_info() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status/version")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("version"));
    }

    #[tokio::test]
    async fn test_liveness_status() {
        let status_api = test_api().await;

        let response = status_api.handle_request(get("/status/health/liveness")).await.unwrap();
        assert_eq!(response.status_code, 200);
        assert!(response.body.contains("alive"));
    }
}
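The readiness and liveness routes are shaped like orchestrator probes: liveness answers 200 whenever the process can respond at all, while readiness flips between 200 and 503 with the scan and heal engine states. A minimal polling sketch against the in-process API, assuming the `StatusApi` and `HttpRequest` types from this commit (the `wait_until_ready` helper is illustrative, not part of the crate):

```rust
// Illustrative sketch: poll readiness until both engines report Running,
// roughly what an external readiness probe would observe.
async fn wait_until_ready(status_api: &StatusApi, attempts: u32) -> Result<bool> {
    for _ in 0..attempts {
        let request = HttpRequest {
            method: "GET".to_string(),
            path: "/status/health/readiness".to_string(),
            headers: vec![],
            body: None,
            query_params: vec![],
        };
        let response = status_api.handle_request(request).await?;
        if response.status_code == 200 {
            return Ok(true); // both scan_engine and heal_engine are Running
        }
        // Still 503 (not_ready): back off briefly before retrying.
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
    }
    Ok(false)
}
```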
448    crates/ahm/src/core/coordinator.rs    (new file)
@@ -0,0 +1,448 @@
|
||||
// Copyright 2024 RustFS Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Core coordinator for the AHM system
|
||||
//!
|
||||
//! The coordinator is responsible for:
|
||||
//! - Event routing and distribution between subsystems
|
||||
//! - Resource management and allocation
|
||||
//! - Global state coordination
|
||||
//! - Cross-system communication
|
||||
|
||||
use std::{
|
||||
sync::{Arc, atomic::{AtomicU64, Ordering}},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use tokio::{
|
||||
sync::{broadcast, RwLock},
|
||||
task::JoinHandle,
|
||||
time::interval,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::{SystemEvent, metrics};
|
||||
use super::{Status, Scheduler, SchedulerConfig};
|
||||
use crate::scanner;
|
||||
use crate::error::Result;
|
||||
use crate::scanner::{HealthIssue, HealthIssueType, Severity};
|
||||
|
||||
/// Configuration for the coordinator
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CoordinatorConfig {
|
||||
/// Event channel buffer size
|
||||
pub event_buffer_size: usize,
|
||||
/// Resource monitoring interval
|
||||
pub resource_monitor_interval: Duration,
|
||||
/// Maximum number of concurrent operations
|
||||
pub max_concurrent_operations: usize,
|
||||
/// Scheduler configuration
|
||||
pub scheduler: SchedulerConfig,
|
||||
/// Event channel capacity
|
||||
pub event_channel_capacity: usize,
|
||||
/// Health check interval
|
||||
pub health_check_interval: Duration,
|
||||
/// Metrics update interval
|
||||
pub metrics_update_interval: Duration,
|
||||
}
|
||||
|
||||
impl Default for CoordinatorConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
event_buffer_size: 10000,
|
||||
resource_monitor_interval: Duration::from_secs(30),
|
||||
max_concurrent_operations: 100,
|
||||
scheduler: SchedulerConfig::default(),
|
||||
event_channel_capacity: 1024,
|
||||
health_check_interval: Duration::from_secs(300),
|
||||
metrics_update_interval: Duration::from_secs(60),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Core coordinator for the AHM system
|
||||
#[derive(Debug)]
|
||||
pub struct Coordinator {
|
||||
/// Configuration
|
||||
config: CoordinatorConfig,
|
||||
/// Current status
|
||||
status: Arc<RwLock<Status>>,
|
||||
/// Event broadcaster
|
||||
event_tx: broadcast::Sender<SystemEvent>,
|
||||
/// Resource monitor handle
|
||||
resource_monitor_handle: Arc<RwLock<Option<JoinHandle<()>>>>,
|
||||
/// Event processor handle
|
||||
event_processor_handle: Arc<RwLock<Option<JoinHandle<()>>>>,
|
||||
/// Task scheduler
|
||||
scheduler: Arc<Scheduler>,
|
||||
/// Metrics collector reference
|
||||
metrics: Arc<metrics::Collector>,
|
||||
/// Active operations counter
|
||||
active_operations: AtomicU64,
|
||||
/// Cancellation token
|
||||
cancel_token: CancellationToken,
|
||||
/// Operation statistics
|
||||
operation_stats: Arc<RwLock<OperationStatistics>>,
|
||||
}
|
||||
|
||||
impl Coordinator {
|
||||
/// Create a new coordinator
|
||||
pub async fn new(
|
||||
config: CoordinatorConfig,
|
||||
metrics: Arc<metrics::Collector>,
|
||||
cancel_token: CancellationToken,
|
||||
) -> Result<Self> {
|
||||
let (event_tx, _) = broadcast::channel(config.event_buffer_size);
|
||||
let scheduler = Arc::new(Scheduler::new(config.scheduler.clone()).await?);
|
||||
|
||||
Ok(Self {
|
||||
config,
|
||||
status: Arc::new(RwLock::new(Status::Initializing)),
|
||||
event_tx,
|
||||
resource_monitor_handle: Arc::new(RwLock::new(None)),
|
||||
event_processor_handle: Arc::new(RwLock::new(None)),
|
||||
scheduler,
|
||||
metrics,
|
||||
active_operations: AtomicU64::new(0),
|
||||
cancel_token,
|
||||
operation_stats: Arc::new(RwLock::new(OperationStatistics::default())),
|
||||
})
|
||||
}
|
||||
|
||||
/// Start the coordinator
|
||||
pub async fn start(&self) -> Result<()> {
|
||||
info!("Starting AHM coordinator");
|
||||
|
||||
// Update status
|
||||
*self.status.write().await = Status::Running;
|
||||
|
||||
// Start resource monitor
|
||||
self.start_resource_monitor().await?;
|
||||
|
||||
// Start event processor
|
||||
self.start_event_processor().await?;
|
||||
|
||||
// Start scheduler
|
||||
self.scheduler.start().await?;
|
||||
|
||||
info!("AHM coordinator started successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stop the coordinator
|
||||
pub async fn stop(&self) -> Result<()> {
|
||||
info!("Stopping AHM coordinator");
|
||||
|
||||
// Update status
|
||||
*self.status.write().await = Status::Stopping;
|
||||
|
||||
// Stop scheduler
|
||||
self.scheduler.stop().await?;
|
||||
|
||||
// Stop resource monitor
|
||||
if let Some(handle) = self.resource_monitor_handle.write().await.take() {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
// Stop event processor
|
||||
if let Some(handle) = self.event_processor_handle.write().await.take() {
|
||||
handle.abort();
|
||||
}
|
||||
|
||||
*self.status.write().await = Status::Stopped;
|
||||
info!("AHM coordinator stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get current status
|
||||
pub async fn status(&self) -> Status {
|
||||
self.status.read().await.clone()
|
||||
}
|
||||
|
||||
/// Subscribe to system events
|
||||
pub fn subscribe_events(&self) -> broadcast::Receiver<SystemEvent> {
|
||||
self.event_tx.subscribe()
|
||||
}
|
||||
|
||||
/// Publish a system event
|
||||
pub async fn publish_event(&self, event: SystemEvent) -> Result<()> {
|
||||
debug!("Publishing system event: {:?}", event);
|
||||
|
||||
// Update operation statistics
|
||||
self.update_operation_stats(&event).await;
|
||||
|
||||
// Send to all subscribers
|
||||
if let Err(e) = self.event_tx.send(event.clone()) {
|
||||
warn!("Failed to publish event: {:?}", e);
|
||||
}
|
||||
|
||||
// Record the event in metrics
|
||||
self.metrics.record_health_issue(&HealthIssue {
|
||||
issue_type: HealthIssueType::Unknown,
|
||||
severity: Severity::Low,
|
||||
bucket: "system".to_string(),
|
||||
object: "coordinator".to_string(),
|
||||
description: format!("System event: {:?}", event),
|
||||
metadata: None,
|
||||
}).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
    /// Get system resource usage
    ///
    /// NOTE: the values below are static placeholders until real OS-level
    /// collection is wired in (see `start_resource_monitor`).
    pub async fn get_resource_usage(&self) -> metrics::ResourceUsage {
        metrics::ResourceUsage {
            disk_usage: metrics::DiskUsage {
                total_bytes: 1_000_000_000,
                used_bytes: 500_000_000,
                available_bytes: 500_000_000,
                usage_percentage: 50.0,
            },
            memory_usage: metrics::MemoryUsage {
                total_bytes: 16_000_000_000,
                used_bytes: 4_000_000_000,
                available_bytes: 12_000_000_000,
                usage_percentage: 25.0,
            },
            network_usage: metrics::NetworkUsage {
                bytes_received: 1_000_000,
                bytes_sent: 500_000,
                packets_received: 1000,
                packets_sent: 500,
            },
            cpu_usage: metrics::CpuUsage {
                usage_percentage: 25.0,
                cores: 8,
                load_average: 1.5,
            },
        }
    }

    /// Get operation statistics
    pub async fn get_operation_statistics(&self) -> OperationStatistics {
        self.operation_stats.read().await.clone()
    }

    /// Get active operations count
    pub fn get_active_operations_count(&self) -> u64 {
        self.active_operations.load(Ordering::Relaxed)
    }

    /// Register an active operation
    pub fn register_operation(&self) -> OperationGuard {
        // `OperationGuard::new` already increments the counter (and `Drop`
        // decrements it), so incrementing here as well would double-count.
        let guard = OperationGuard::new(&self.active_operations);
        debug!(
            "Registered operation, active count: {}",
            self.active_operations.load(Ordering::Relaxed)
        );
        guard
    }
    /// Start the resource monitor
    async fn start_resource_monitor(&self) -> Result<()> {
        let cancel_token = self.cancel_token.clone();
        let _event_tx = self.event_tx.clone();
        let interval_duration = self.config.resource_monitor_interval;

        let handle = tokio::spawn(async move {
            let mut interval = interval(interval_duration);

            loop {
                tokio::select! {
                    _ = cancel_token.cancelled() => {
                        debug!("Resource monitor cancelled");
                        break;
                    }
                    _ = interval.tick() => {
                        // This would collect real resource metrics
                        // For now, we'll skip the actual collection
                        debug!("Resource monitor tick");
                    }
                }
            }
        });

        *self.resource_monitor_handle.write().await = Some(handle);
        Ok(())
    }

    /// Start the event processor
    async fn start_event_processor(&self) -> Result<()> {
        let mut event_rx = self.event_tx.subscribe();
        let cancel_token = self.cancel_token.clone();

        let handle = tokio::spawn(async move {
            loop {
                tokio::select! {
                    _ = cancel_token.cancelled() => {
                        debug!("Event processor cancelled");
                        break;
                    }
                    event = event_rx.recv() => {
                        match event {
                            Ok(event) => {
                                debug!("Processing system event: {:?}", event);
                                // Process the event (e.g., route to specific handlers)
                            }
                            Err(e) => {
                                warn!("Event processor error: {:?}", e);
                            }
                        }
                    }
                }
            }
        });

        *self.event_processor_handle.write().await = Some(handle);
        Ok(())
    }

    /// Update operation statistics based on events
    async fn update_operation_stats(&self, event: &SystemEvent) {
        let mut stats = self.operation_stats.write().await;

        match event {
            SystemEvent::ObjectDiscovered { .. } => {
                stats.objects_discovered += 1;
            }
            SystemEvent::HealthIssueDetected(issue) => {
                stats.health_issues_detected += 1;
                match issue.severity {
                    scanner::Severity::Critical => stats.critical_issues += 1,
                    scanner::Severity::High => stats.high_priority_issues += 1,
                    scanner::Severity::Medium => stats.medium_priority_issues += 1,
                    scanner::Severity::Low => stats.low_priority_issues += 1,
                }
            }
            SystemEvent::HealCompleted(result) => {
                if result.success {
                    stats.heal_operations_succeeded += 1;
                } else {
                    stats.heal_operations_failed += 1;
                }
            }
            SystemEvent::ScanCompleted(_) => {
                stats.scan_cycles_completed += 1;
            }
            SystemEvent::ResourceUsageUpdated { .. } => {
                stats.resource_updates += 1;
            }
        }

        stats.last_updated = Instant::now();
    }
}
/// RAII guard for tracking active operations
pub struct OperationGuard<'a> {
    active_operations: &'a AtomicU64,
}

impl<'a> OperationGuard<'a> {
    pub fn new(active_operations: &'a AtomicU64) -> Self {
        active_operations.fetch_add(1, Ordering::Relaxed);
        Self { active_operations }
    }
}

impl Drop for OperationGuard<'_> {
    fn drop(&mut self) {
        self.active_operations.fetch_sub(1, Ordering::Relaxed);
    }
}

/// Operation statistics tracked by the coordinator
#[derive(Debug, Clone)]
pub struct OperationStatistics {
    pub objects_discovered: u64,
    pub health_issues_detected: u64,
    pub heal_operations_succeeded: u64,
    pub heal_operations_failed: u64,
    pub scan_cycles_completed: u64,
    pub resource_updates: u64,
    pub critical_issues: u64,
    pub high_priority_issues: u64,
    pub medium_priority_issues: u64,
    pub low_priority_issues: u64,
    pub last_updated: Instant,
}

impl Default for OperationStatistics {
    fn default() -> Self {
        Self {
            objects_discovered: 0,
            health_issues_detected: 0,
            heal_operations_succeeded: 0,
            heal_operations_failed: 0,
            scan_cycles_completed: 0,
            resource_updates: 0,
            critical_issues: 0,
            high_priority_issues: 0,
            medium_priority_issues: 0,
            low_priority_issues: 0,
            last_updated: Instant::now(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::CollectorConfig;

    #[tokio::test]
    async fn test_coordinator_lifecycle() {
        let config = CoordinatorConfig::default();
        let metrics_config = CollectorConfig::default();
        let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap());
        let cancel_token = CancellationToken::new();

        let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap();

        // Test initial status
        assert_eq!(coordinator.status().await, Status::Initializing);

        // Start coordinator
        coordinator.start().await.unwrap();
        assert_eq!(coordinator.status().await, Status::Running);

        // Stop coordinator
        coordinator.stop().await.unwrap();
        assert_eq!(coordinator.status().await, Status::Stopped);
    }

    #[tokio::test]
    async fn test_operation_guard() {
        let config = CoordinatorConfig::default();
        let metrics_config = CollectorConfig::default();
        let metrics = Arc::new(metrics::Collector::new(metrics_config).await.unwrap());
        let cancel_token = CancellationToken::new();

        let coordinator = Coordinator::new(config, metrics, cancel_token).await.unwrap();

        assert_eq!(coordinator.get_active_operations_count(), 0);

        {
            let _guard1 = coordinator.register_operation();
            assert_eq!(coordinator.get_active_operations_count(), 1);

            {
                let _guard2 = coordinator.register_operation();
                assert_eq!(coordinator.get_active_operations_count(), 2);
            }

            assert_eq!(coordinator.get_active_operations_count(), 1);
        }

        assert_eq!(coordinator.get_active_operations_count(), 0);
    }
}
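A minimal usage sketch of the coordinator API above, mirroring the tests; the event-routing body is illustrative and error handling is elided:

```rust
use std::sync::Arc;
use tokio_util::sync::CancellationToken;

async fn coordinator_example() -> crate::error::Result<()> {
    let metrics = Arc::new(metrics::Collector::new(crate::metrics::CollectorConfig::default()).await?);
    let coordinator =
        Coordinator::new(CoordinatorConfig::default(), metrics, CancellationToken::new()).await?;
    coordinator.start().await?;

    // Each subscriber gets an independent broadcast receiver.
    let mut events = coordinator.subscribe_events();
    tokio::spawn(async move {
        while let Ok(event) = events.recv().await {
            // Route the event to scanner/heal handlers here.
            tracing::debug!("received event: {:?}", event);
        }
    });

    {
        // RAII guard: the active-operation counter rises on creation and
        // falls again when `_op` is dropped at the end of this block.
        let _op = coordinator.register_operation();
        // ... perform a scan or heal operation ...
    }

    coordinator.stop().await
}
```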
22
crates/ahm/src/core/lifecycle.rs
Normal file
@@ -0,0 +1,22 @@
// Copyright 2024 RustFS Team

use crate::error::Result;

#[derive(Debug, Clone, Default)]
pub struct LifecycleConfig {}

pub struct LifecycleManager {}

impl LifecycleManager {
    pub async fn new(_config: LifecycleConfig) -> Result<Self> {
        Ok(Self {})
    }

    pub async fn start(&self) -> Result<()> {
        Ok(())
    }

    pub async fn stop(&self) -> Result<()> {
        Ok(())
    }
}
40
crates/ahm/src/core/mod.rs
Normal file
@@ -0,0 +1,40 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Core coordination and lifecycle management for the AHM system

pub mod coordinator;
pub mod scheduler;
pub mod lifecycle;

pub use coordinator::{Coordinator, CoordinatorConfig};
pub use scheduler::{Scheduler, SchedulerConfig, Task, TaskPriority};
pub use lifecycle::{LifecycleManager, LifecycleConfig};

/// Status of the core coordination system
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Status {
    /// System is initializing
    Initializing,
    /// System is running normally
    Running,
    /// System is degraded but operational
    Degraded,
    /// System is shutting down
    Stopping,
    /// System has stopped
    Stopped,
    /// System encountered an error
    Error(String),
}
226
crates/ahm/src/core/scheduler.rs
Normal file
@@ -0,0 +1,226 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Task scheduler for the AHM system

use std::{
    collections::{BinaryHeap, HashMap},
    sync::{Arc, atomic::{AtomicU64, Ordering}},
    time::{Duration, Instant},
};

use tokio::{
    sync::RwLock,
    task::JoinHandle,
};
use uuid::Uuid;

use crate::error::Result;

/// Task scheduler configuration
#[derive(Debug, Clone)]
pub struct SchedulerConfig {
    /// Maximum number of concurrent tasks
    pub max_concurrent_tasks: usize,
    /// Default task timeout
    pub default_timeout: Duration,
    /// Queue capacity
    pub queue_capacity: usize,
    /// Default priority assigned to newly created tasks
    pub default_task_priority: TaskPriority,
}

impl Default for SchedulerConfig {
    fn default() -> Self {
        Self {
            max_concurrent_tasks: 10,
            default_timeout: Duration::from_secs(300), // 5 minutes
            queue_capacity: 1000,
            default_task_priority: TaskPriority::Normal,
        }
    }
}

/// Task priority levels
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum TaskPriority {
    Low = 0,
    Normal = 1,
    High = 2,
    Critical = 3,
}

/// A scheduled task
#[derive(Debug, Clone)]
pub struct Task {
    pub id: Uuid,
    pub priority: TaskPriority,
    pub scheduled_time: Instant,
    pub timeout: Duration,
    pub task_type: TaskType,
    pub payload: TaskPayload,
}

impl Task {
    pub fn new(task_type: TaskType, payload: TaskPayload) -> Self {
        Self {
            id: Uuid::new_v4(),
            priority: TaskPriority::Normal,
            scheduled_time: Instant::now(),
            timeout: Duration::from_secs(300),
            task_type,
            payload,
        }
    }

    pub fn with_priority(mut self, priority: TaskPriority) -> Self {
        self.priority = priority;
        self
    }

    pub fn with_timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    pub fn with_delay(mut self, delay: Duration) -> Self {
        self.scheduled_time = Instant::now() + delay;
        self
    }
}

/// Types of tasks that can be scheduled
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TaskType {
    Scan,
    Heal,
    Cleanup,
    Maintenance,
    Report,
}

/// Task payload data
#[derive(Debug, Clone)]
pub enum TaskPayload {
    Scan {
        bucket: Option<String>,
        object_prefix: Option<String>,
        deep_scan: bool,
    },
    Heal {
        bucket: String,
        object: String,
        version_id: Option<String>,
    },
    Cleanup {
        older_than: Duration,
    },
    Maintenance {
        operation: String,
    },
    Report {
        report_type: String,
    },
}

/// Task scheduler
#[allow(dead_code)]
#[derive(Debug)]
pub struct Scheduler {
    config: SchedulerConfig,
    task_queue: Arc<RwLock<BinaryHeap<PrioritizedTask>>>,
    active_tasks: Arc<RwLock<HashMap<Uuid, JoinHandle<()>>>>,
    task_counter: AtomicU64,
    worker_handles: Arc<RwLock<Vec<JoinHandle<()>>>>,
}

impl Scheduler {
    pub async fn new(config: SchedulerConfig) -> Result<Self> {
        Ok(Self {
            config,
            task_queue: Arc::new(RwLock::new(BinaryHeap::new())),
            active_tasks: Arc::new(RwLock::new(HashMap::new())),
            task_counter: AtomicU64::new(0),
            worker_handles: Arc::new(RwLock::new(Vec::new())),
        })
    }

    pub async fn start(&self) -> Result<()> {
        // Start worker tasks
        // Implementation would go here
        Ok(())
    }

    pub async fn stop(&self) -> Result<()> {
        // Stop all workers and drain queues
        // Implementation would go here
        Ok(())
    }

    pub async fn schedule_task(&self, task: Task) -> Result<Uuid> {
        let task_id = task.id;
        let prioritized_task = PrioritizedTask {
            task,
            sequence: self.task_counter.fetch_add(1, Ordering::Relaxed),
        };

        self.task_queue.write().await.push(prioritized_task);
        Ok(task_id)
    }

    pub async fn cancel_task(&self, task_id: Uuid) -> Result<bool> {
        if let Some(handle) = self.active_tasks.write().await.remove(&task_id) {
            handle.abort();
            Ok(true)
        } else {
            Ok(false)
        }
    }
}
/// Task wrapper for priority queue ordering
#[derive(Debug)]
struct PrioritizedTask {
    task: Task,
    sequence: u64,
}

impl PartialEq for PrioritizedTask {
    fn eq(&self, other: &Self) -> bool {
        self.task.priority == other.task.priority && self.sequence == other.sequence
    }
}

impl Eq for PrioritizedTask {}

impl PartialOrd for PrioritizedTask {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for PrioritizedTask {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // BinaryHeap is a max-heap, so the task that should run first must
        // compare as greatest: higher `TaskPriority` (Critical = 3) wins, and
        // within the same priority the lower sequence number (the older task)
        // wins, giving FIFO fairness.
        self.task.priority.cmp(&other.task.priority)
            .then_with(|| other.sequence.cmp(&self.sequence))
    }
}
#[derive(Debug, Clone)]
pub struct ScheduledTask {
    pub id: Uuid,
    pub task_type: TaskType,
    pub priority: TaskPriority,
    pub created_at: Instant,
}
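An illustrative use of the builder-style helpers on `Task`; the bucket and object names are placeholders, and since the scheduler does not yet expose a public drain API this only demonstrates construction and scheduling:

```rust
use std::time::Duration;

async fn schedule_example(scheduler: &Scheduler) -> crate::error::Result<()> {
    let task = Task::new(
        TaskType::Heal,
        TaskPayload::Heal {
            bucket: "photos".to_string(),
            object: "2024/a.jpg".to_string(),
            version_id: None,
        },
    )
    .with_priority(TaskPriority::Critical) // pops before Normal tasks
    .with_timeout(Duration::from_secs(60)) // override the 5-minute default
    .with_delay(Duration::from_secs(5));   // not eligible before +5s

    let id = scheduler.schedule_task(task).await?;
    tracing::debug!("scheduled heal task {}", id);
    Ok(())
}
```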
438
crates/ahm/src/heal/engine.rs
Normal file
@@ -0,0 +1,438 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::{
    sync::{mpsc, RwLock},
    time::sleep,
};
use tracing::{error, info, warn};

use crate::error::Result;
use super::{HealConfig, HealResult, HealStatistics, HealTask, Status};

/// Main healing engine that coordinates repair operations
pub struct HealEngine {
    config: HealConfig,
    status: Arc<RwLock<Status>>,
    statistics: Arc<RwLock<HealStatistics>>,
    task_queue: Arc<RwLock<Vec<HealTask>>>,
    active_tasks: Arc<RwLock<HashMap<String, HealTask>>>,
    completed_tasks: Arc<RwLock<Vec<HealResult>>>,
    shutdown_tx: Option<mpsc::Sender<()>>,
}
impl HealEngine {
    /// Create a new healing engine
    pub fn new(config: HealConfig) -> Self {
        Self {
            config,
            status: Arc::new(RwLock::new(Status::Initializing)),
            statistics: Arc::new(RwLock::new(HealStatistics::default())),
            task_queue: Arc::new(RwLock::new(Vec::new())),
            active_tasks: Arc::new(RwLock::new(HashMap::new())),
            completed_tasks: Arc::new(RwLock::new(Vec::new())),
            shutdown_tx: None,
        }
    }

    /// Start the healing engine
    pub async fn start(&mut self) -> Result<()> {
        info!("Starting heal engine");

        let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1);
        self.shutdown_tx = Some(shutdown_tx);

        // Update status
        {
            let mut status = self.status.write().await;
            *status = Status::Idle;
        }

        let config = self.config.clone();
        let status = Arc::clone(&self.status);
        let statistics = Arc::clone(&self.statistics);
        let task_queue = Arc::clone(&self.task_queue);
        let active_tasks = Arc::clone(&self.active_tasks);
        let completed_tasks = Arc::clone(&self.completed_tasks);

        // Start the main healing loop
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(config.heal_interval);

            loop {
                tokio::select! {
                    _ = interval.tick() => {
                        if let Err(e) = Self::process_healing_cycle(
                            &config,
                            &status,
                            &statistics,
                            &task_queue,
                            &active_tasks,
                            &completed_tasks,
                        ).await {
                            error!("Healing cycle failed: {}", e);
                        }
                    }
                    _ = shutdown_rx.recv() => {
                        info!("Shutdown signal received, stopping heal engine");
                        break;
                    }
                }
            }

            // Update status to stopped
            let mut status = status.write().await;
            *status = Status::Stopped;
        });

        info!("Heal engine started successfully");
        Ok(())
    }
    /// Stop the healing engine
    pub async fn stop(&mut self) -> Result<()> {
        info!("Stopping heal engine");

        // Update status
        {
            let mut status = self.status.write().await;
            *status = Status::Stopping;
        }

        // Send shutdown signal
        if let Some(shutdown_tx) = &self.shutdown_tx {
            let _ = shutdown_tx.send(()).await;
        }

        // Wait for the background loop to acknowledge, polling up to ~1 second
        let mut attempts = 0;
        while attempts < 10 {
            let status = self.status.read().await;
            if *status == Status::Stopped {
                break;
            }
            drop(status);
            sleep(Duration::from_millis(100)).await;
            attempts += 1;
        }

        info!("Heal engine stopped");
        Ok(())
    }

    /// Add a healing task to the queue
    pub async fn add_task(&self, task: HealTask) -> Result<()> {
        let task_id = task.id.clone();

        // Add task to the queue
        self.task_queue.write().await.push(task);

        // Keep the queued-task gauge current immediately rather than waiting
        // for the next healing cycle to refresh it.
        {
            let queued = self.task_queue.read().await.len() as u64;
            let mut stats = self.statistics.write().await;
            stats.queued_tasks = queued;
        }

        info!("Added healing task to queue: {}", task_id);
        Ok(())
    }
    /// Get current engine status
    pub async fn status(&self) -> Status {
        self.status.read().await.clone()
    }

    /// Get current engine status (alias for `status`)
    pub async fn get_status(&self) -> Status {
        self.status.read().await.clone()
    }

    /// Get engine configuration
    pub async fn get_config(&self) -> HealConfig {
        self.config.clone()
    }

    /// Get healing statistics
    pub async fn statistics(&self) -> HealStatistics {
        self.statistics.read().await.clone()
    }

    /// Get completed healing results
    pub async fn completed_results(&self) -> Vec<HealResult> {
        self.completed_tasks.read().await.clone()
    }
    /// Process a single healing cycle
    async fn process_healing_cycle(
        config: &HealConfig,
        status: &Arc<RwLock<Status>>,
        statistics: &Arc<RwLock<HealStatistics>>,
        task_queue: &Arc<RwLock<Vec<HealTask>>>,
        active_tasks: &Arc<RwLock<HashMap<String, HealTask>>>,
        completed_tasks: &Arc<RwLock<Vec<HealResult>>>,
    ) -> Result<()> {
        // Update status to healing
        {
            let mut status = status.write().await;
            *status = Status::Healing;
        }

        // Get ready tasks from queue
        let mut queue = task_queue.write().await;
        let mut ready_tasks = Vec::new();
        let mut remaining_tasks = Vec::new();

        for task in queue.drain(..) {
            if task.is_ready() {
                ready_tasks.push(task);
            } else {
                remaining_tasks.push(task);
            }
        }

        // Sort ready tasks by priority (Critical = 0 sorts first)
        ready_tasks.sort_by(|a, b| a.priority.cmp(&b.priority));

        // Process up to `max_concurrent` ready tasks
        let active_count = active_tasks.read().await.len();
        let max_concurrent = config.max_workers.saturating_sub(active_count);

        let mut ready_iter = ready_tasks.into_iter();
        for task in ready_iter.by_ref().take(max_concurrent) {
            if let Err(e) = Self::process_task(
                config,
                statistics,
                active_tasks,
                completed_tasks,
                task,
            ).await {
                error!("Failed to process healing task: {}", e);
            }
        }

        // Re-queue any ready tasks beyond the concurrency limit instead of
        // dropping them, along with the not-yet-ready tasks.
        remaining_tasks.extend(ready_iter);
        queue.extend(remaining_tasks);

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.queued_tasks = queue.len() as u64;
            stats.active_workers = active_tasks.read().await.len() as u64;
        }

        // Update status back to idle
        {
            let mut status = status.write().await;
            *status = Status::Idle;
        }

        Ok(())
    }
    /// Process a single healing task
    async fn process_task(
        config: &HealConfig,
        statistics: &Arc<RwLock<HealStatistics>>,
        active_tasks: &Arc<RwLock<HashMap<String, HealTask>>>,
        completed_tasks: &Arc<RwLock<Vec<HealResult>>>,
        task: HealTask,
    ) -> Result<()> {
        let task_id = task.id.clone();

        // Add task to active tasks
        {
            let mut active = active_tasks.write().await;
            active.insert(task_id.clone(), task.clone());
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.total_repairs += 1;
            stats.active_workers = active_tasks.read().await.len() as u64;
        }

        info!("Processing healing task: {}", task_id);

        // Simulate healing operation
        let start_time = Instant::now();
        let result = Self::perform_healing_operation(&task, config).await;
        let duration = start_time.elapsed();

        // Create heal result
        let heal_result = HealResult {
            success: result.is_ok(),
            original_issue: task.issue.clone(),
            repair_duration: duration,
            retry_attempts: task.retry_count,
            error_message: result.err().map(|e| e.to_string()),
            metadata: None,
            completed_at: SystemTime::now(),
        };

        // Update statistics
        {
            let mut stats = statistics.write().await;
            if heal_result.success {
                stats.successful_repairs += 1;
            } else {
                stats.failed_repairs += 1;
            }
            stats.total_repair_time += duration;
            stats.average_repair_time = if stats.total_repairs > 0 {
                Duration::from_secs_f64(
                    stats.total_repair_time.as_secs_f64() / stats.total_repairs as f64
                )
            } else {
                Duration::ZERO
            };
            stats.last_repair_time = Some(SystemTime::now());
            stats.total_retry_attempts += task.retry_count as u64;
        }

        // Add result to completed tasks
        {
            let mut completed = completed_tasks.write().await;
            completed.push(heal_result.clone());
        }

        // Remove task from active tasks
        {
            let mut active = active_tasks.write().await;
            active.remove(&task_id);
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.active_workers = active_tasks.read().await.len() as u64;
        }

        if heal_result.success {
            info!("Healing task completed successfully: {}", task_id);
        } else {
            warn!("Healing task failed: {}", task_id);
        }

        Ok(())
    }
    /// Perform the actual healing operation
    async fn perform_healing_operation(task: &HealTask, _config: &HealConfig) -> Result<()> {
        // Simulate healing operation based on issue type
        match task.issue.issue_type {
            crate::scanner::HealthIssueType::MissingReplica => {
                // Simulate replica repair
                sleep(Duration::from_millis(100)).await;
                info!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object);
            }
            crate::scanner::HealthIssueType::ChecksumMismatch => {
                // Simulate checksum repair
                sleep(Duration::from_millis(200)).await;
                info!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object);
            }
            crate::scanner::HealthIssueType::DiskReadError => {
                // Simulate disk error recovery
                sleep(Duration::from_millis(300)).await;
                info!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object);
            }
            _ => {
                // Generic repair for other issue types
                sleep(Duration::from_millis(150)).await;
                info!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object);
            }
        }

        // Simulate occasional failures for testing
        if task.retry_count > 0 && task.retry_count % 3 == 0 {
            return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated healing failure")));
        }

        Ok(())
    }

    /// Start healing operations
    pub async fn start_healing(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Running;
        info!("Healing operations started");
        Ok(())
    }

    /// Stop healing operations
    pub async fn stop_healing(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Stopped;
        info!("Healing operations stopped");
        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_heal_engine_creation() {
        let config = HealConfig::default();
        let engine = HealEngine::new(config);

        assert_eq!(engine.status().await, Status::Initializing);
    }

    #[tokio::test]
    async fn test_heal_engine_start_stop() {
        let config = HealConfig::default();
        let mut engine = HealEngine::new(config);

        // Start engine
        engine.start().await.unwrap();
        sleep(Duration::from_millis(100)).await;

        // Check status
        let status = engine.status().await;
        assert!(matches!(status, Status::Idle | Status::Healing));

        // Stop engine
        engine.stop().await.unwrap();
        sleep(Duration::from_millis(100)).await;

        // Check status
        let status = engine.status().await;
        assert_eq!(status, Status::Stopped);
    }

    #[tokio::test]
    async fn test_add_healing_task() {
        let config = HealConfig::default();
        let engine = HealEngine::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue);
        engine.add_task(task).await.unwrap();

        let stats = engine.statistics().await;
        assert_eq!(stats.queued_tasks, 1);
    }
}
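An end-to-end sketch of driving the heal engine, modeled on the tests above; the bucket and object names are placeholders, and the sleep simply waits out one `heal_interval` so the background loop can run:

```rust
async fn heal_engine_example() -> crate::error::Result<()> {
    let mut engine = HealEngine::new(HealConfig::default());
    engine.start().await?;

    let issue = HealthIssue {
        issue_type: HealthIssueType::ChecksumMismatch,
        severity: Severity::High,
        bucket: "my-bucket".to_string(),
        object: "my-object".to_string(),
        description: "checksum mismatch found during scan".to_string(),
        metadata: None,
    };
    engine.add_task(HealTask::new(issue)).await?;

    // The spawned loop drains ready tasks once per `heal_interval`.
    tokio::time::sleep(engine.get_config().await.heal_interval).await;
    let stats = engine.statistics().await;
    tracing::info!("repairs attempted so far: {}", stats.total_repairs);

    engine.stop().await
}
```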
360
crates/ahm/src/heal/mod.rs
Normal file
@@ -0,0 +1,360 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Healing subsystem for the AHM system
//!
//! The heal subsystem provides intelligent repair capabilities:
//! - Priority-based healing queue
//! - Real-time and background healing modes
//! - Comprehensive repair validation
//! - Adaptive healing strategies

pub mod engine;
pub mod priority_queue;
pub mod repair_worker;
pub mod validation;

pub use engine::HealEngine;
pub use priority_queue::PriorityQueue;
pub use repair_worker::RepairWorker;
pub use validation::HealValidator;

use std::time::{Duration, SystemTime};
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use crate::scanner::{HealthIssue, HealthIssueType, Severity};

/// Configuration for the healing system
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealConfig {
    /// Maximum number of concurrent repair workers
    pub max_workers: usize,
    /// Maximum number of tasks in the priority queue
    pub max_queue_size: usize,
    /// Timeout for individual repair operations
    pub repair_timeout: Duration,
    /// Interval between healing cycles
    pub heal_interval: Duration,
    /// Whether to enable automatic healing
    pub auto_heal_enabled: bool,
    /// Maximum number of retry attempts for failed repairs
    pub max_retry_attempts: u32,
    /// Backoff delay between retry attempts
    pub retry_backoff_delay: Duration,
    /// Whether to validate repairs after completion
    pub validate_after_repair: bool,
}
impl Default for HealConfig {
    fn default() -> Self {
        Self {
            max_workers: 4,
            max_queue_size: 1000,
            repair_timeout: Duration::from_secs(300), // 5 minutes
            heal_interval: Duration::from_secs(60),   // 1 minute
            auto_heal_enabled: true,
            max_retry_attempts: 3,
            retry_backoff_delay: Duration::from_secs(30),
            validate_after_repair: true,
        }
    }
}

/// Result of a healing operation
#[derive(Debug, Clone)]
pub struct HealResult {
    /// Whether the healing operation was successful
    pub success: bool,
    /// The original health issue that was addressed
    pub original_issue: HealthIssue,
    /// Time taken to complete the repair
    pub repair_duration: Duration,
    /// Number of retry attempts made
    pub retry_attempts: u32,
    /// Error message if repair failed
    pub error_message: Option<String>,
    /// Additional metadata about the repair
    pub metadata: Option<serde_json::Value>,
    /// Timestamp when the repair was completed
    pub completed_at: SystemTime,
}

/// Statistics for the healing system
#[derive(Debug, Clone, Default)]
pub struct HealStatistics {
    /// Total number of repair tasks processed
    pub total_repairs: u64,
    /// Number of successful repairs
    pub successful_repairs: u64,
    /// Number of failed repairs
    pub failed_repairs: u64,
    /// Number of tasks currently in queue
    pub queued_tasks: u64,
    /// Number of active workers
    pub active_workers: u64,
    /// Total time spent on repairs
    pub total_repair_time: Duration,
    /// Average repair time
    pub average_repair_time: Duration,
    /// Last repair completion time
    pub last_repair_time: Option<SystemTime>,
    /// Number of retry attempts made
    pub total_retry_attempts: u64,
}

/// Priority levels for healing tasks
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum HealPriority {
    /// Critical issues that need immediate attention
    Critical = 0,
    /// High priority issues
    High = 1,
    /// Medium priority issues
    Medium = 2,
    /// Low priority issues
    Low = 3,
}

impl From<Severity> for HealPriority {
    fn from(severity: Severity) -> Self {
        match severity {
            Severity::Critical => HealPriority::Critical,
            Severity::High => HealPriority::High,
            Severity::Medium => HealPriority::Medium,
            Severity::Low => HealPriority::Low,
        }
    }
}

/// A healing task to be processed
#[derive(Debug, Clone)]
pub struct HealTask {
    /// Unique identifier for the task
    pub id: String,
    /// The health issue to be repaired
    pub issue: HealthIssue,
    /// Priority level for this task
    pub priority: HealPriority,
    /// When the task was created
    pub created_at: SystemTime,
    /// When the task should be processed (for delayed tasks)
    pub scheduled_at: Option<SystemTime>,
    /// Number of retry attempts made
    pub retry_count: u32,
    /// Maximum number of retry attempts allowed
    pub max_retries: u32,
    /// Additional context for the repair operation
    pub context: Option<serde_json::Value>,
}
impl HealTask {
    /// Create a new healing task
    pub fn new(issue: HealthIssue) -> Self {
        let priority = HealPriority::from(issue.severity);
        Self {
            id: Uuid::new_v4().to_string(),
            issue,
            priority,
            created_at: SystemTime::now(),
            scheduled_at: None,
            retry_count: 0,
            max_retries: 3,
            context: None,
        }
    }

    /// Create a delayed healing task
    pub fn delayed(issue: HealthIssue, delay: Duration) -> Self {
        let mut task = Self::new(issue);
        task.scheduled_at = Some(SystemTime::now() + delay);
        task
    }

    /// Check if the task is ready to be processed
    pub fn is_ready(&self) -> bool {
        if let Some(scheduled_at) = self.scheduled_at {
            SystemTime::now() >= scheduled_at
        } else {
            true
        }
    }

    /// Check if the task can be retried
    pub fn can_retry(&self) -> bool {
        self.retry_count < self.max_retries
    }

    /// Increment the retry count
    pub fn increment_retry(&mut self) {
        self.retry_count += 1;
    }
}
/// Heal engine status
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum Status {
    /// Heal engine is initializing
    Initializing,
    /// Heal engine is idle
    Idle,
    /// Heal engine is running normally
    Running,
    /// Heal engine is actively healing
    Healing,
    /// Heal engine is paused
    Paused,
    /// Heal engine is stopping
    Stopping,
    /// Heal engine has stopped
    Stopped,
    /// Heal engine encountered an error
    Error(String),
}

/// Healing operation modes
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealMode {
    /// Real-time healing during GET/PUT operations
    RealTime,
    /// Background healing during scheduled scans
    Background,
    /// On-demand healing triggered by admin
    OnDemand,
    /// Emergency healing for critical issues
    Emergency,
}

/// Validation result for a repaired object
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationResult {
    /// Type of validation performed
    pub validation_type: ValidationType,
    /// Whether validation passed
    pub passed: bool,
    /// Details about the validation
    pub details: String,
    /// Time taken for validation
    pub duration: Duration,
}

/// Types of validation that can be performed
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ValidationType {
    /// Checksum verification
    Checksum,
    /// Shard count verification
    ShardCount,
    /// Data integrity check
    DataIntegrity,
    /// Metadata consistency check
    MetadataConsistency,
    /// Cross-shard redundancy check
    RedundancyCheck,
}

/// Healing strategies for different scenarios
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HealStrategy {
    /// Repair using available data shards
    DataShardRepair,
    /// Repair using parity shards
    ParityShardRepair,
    /// Hybrid repair using both data and parity
    HybridRepair,
    /// Metadata-only repair
    MetadataRepair,
    /// Full object reconstruction
    FullReconstruction,
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_heal_priority_from_severity() {
        assert_eq!(HealPriority::from(Severity::Critical), HealPriority::Critical);
        assert_eq!(HealPriority::from(Severity::High), HealPriority::High);
        assert_eq!(HealPriority::from(Severity::Medium), HealPriority::Medium);
        assert_eq!(HealPriority::from(Severity::Low), HealPriority::Low);
    }

    #[test]
    fn test_heal_task_creation() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue.clone());
        assert_eq!(task.priority, HealPriority::Critical);
        assert_eq!(task.issue.bucket, issue.bucket);
        assert_eq!(task.issue.object, issue.object);
        assert_eq!(task.retry_count, 0);
        assert_eq!(task.max_retries, 3);
        assert!(task.is_ready());
    }

    #[test]
    fn test_delayed_heal_task() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Medium,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let delay = Duration::from_secs(1);
        let task = HealTask::delayed(issue, delay);

        assert!(task.scheduled_at.is_some());
        assert!(!task.is_ready()); // Should not be ready immediately

        // Wait for the delay to pass
        std::thread::sleep(delay + Duration::from_millis(100));
        assert!(task.is_ready());
    }

    #[test]
    fn test_heal_task_retry_logic() {
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let mut task = HealTask::new(issue);
        assert!(task.can_retry());

        task.increment_retry();
        assert_eq!(task.retry_count, 1);
        assert!(task.can_retry());

        task.increment_retry();
        task.increment_retry();
        assert_eq!(task.retry_count, 3);
        assert!(!task.can_retry());
    }
}
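`HealStrategy` is declared above but not yet mapped to issue types anywhere in this module; a policy hook could look like the following sketch, where the specific pairings are assumptions for illustration rather than behavior defined elsewhere in the crate:

```rust
fn choose_strategy(issue: &HealthIssue) -> HealStrategy {
    // Hypothetical mapping: pick a repair strategy from the detected issue.
    match issue.issue_type {
        HealthIssueType::MissingReplica => HealStrategy::DataShardRepair,
        HealthIssueType::ChecksumMismatch => HealStrategy::ParityShardRepair,
        HealthIssueType::DiskReadError => HealStrategy::FullReconstruction,
        _ => HealStrategy::HybridRepair,
    }
}
```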
413
crates/ahm/src/heal/priority_queue.rs
Normal file
@@ -0,0 +1,413 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::BinaryHeap,
    sync::Arc,
    time::{Duration, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{debug, info, warn};

use crate::error::Result;
use super::{HealPriority, HealTask};

/// Priority queue for healing tasks
pub struct PriorityQueue {
    tasks: Arc<RwLock<BinaryHeap<HealTask>>>,
    max_size: usize,
    statistics: Arc<RwLock<QueueStatistics>>,
}

/// Statistics for the priority queue
#[derive(Debug, Clone, Default)]
pub struct QueueStatistics {
    /// Total number of tasks added to the queue
    pub total_tasks_added: u64,
    /// Total number of tasks removed from the queue
    pub total_tasks_removed: u64,
    /// Current number of tasks in the queue
    pub current_queue_size: u64,
    /// Maximum queue size reached
    pub max_queue_size_reached: u64,
    /// Number of tasks rejected due to queue being full
    pub tasks_rejected: u64,
    /// Average time tasks spend in queue
    pub average_queue_time: Duration,
    /// Total time all tasks have spent in queue
    pub total_queue_time: Duration,
}
impl PriorityQueue {
    /// Create a new priority queue
    pub fn new(max_size: usize) -> Self {
        Self {
            tasks: Arc::new(RwLock::new(BinaryHeap::new())),
            max_size,
            statistics: Arc::new(RwLock::new(QueueStatistics::default())),
        }
    }

    /// Add a task to the queue
    pub async fn push(&self, task: HealTask) -> Result<()> {
        let mut tasks = self.tasks.write().await;
        let mut stats = self.statistics.write().await;

        if tasks.len() >= self.max_size {
            stats.tasks_rejected += 1;
            warn!("Priority queue is full, rejecting task: {}", task.id);
            return Err(crate::error::Error::Other(anyhow::anyhow!("Queue is full")));
        }

        let task_id = task.id.clone();
        let priority = task.priority.clone();
        tasks.push(task);
        stats.total_tasks_added += 1;
        stats.current_queue_size = tasks.len() as u64;
        stats.max_queue_size_reached = stats.max_queue_size_reached.max(tasks.len() as u64);

        debug!("Added task to priority queue: {} (priority: {:?})", task_id, priority);
        Ok(())
    }

    /// Remove and return the highest priority task
    pub async fn pop(&self) -> Option<HealTask> {
        let mut tasks = self.tasks.write().await;
        let mut stats = self.statistics.write().await;

        if let Some(task) = tasks.pop() {
            stats.total_tasks_removed += 1;
            stats.current_queue_size = tasks.len() as u64;

            // Update queue time statistics
            let queue_time = SystemTime::now().duration_since(task.created_at).unwrap_or(Duration::ZERO);
            stats.total_queue_time += queue_time;
            stats.average_queue_time = if stats.total_tasks_removed > 0 {
                Duration::from_secs_f64(
                    stats.total_queue_time.as_secs_f64() / stats.total_tasks_removed as f64
                )
            } else {
                Duration::ZERO
            };

            debug!("Removed task from priority queue: {} (priority: {:?})", task.id, task.priority);
            Some(task)
        } else {
            None
        }
    }

    /// Peek at the highest priority task without removing it
    pub async fn peek(&self) -> Option<HealTask> {
        let tasks = self.tasks.read().await;
        tasks.peek().cloned()
    }

    /// Get the current size of the queue
    pub async fn len(&self) -> usize {
        self.tasks.read().await.len()
    }

    /// Check if the queue is empty
    pub async fn is_empty(&self) -> bool {
        self.tasks.read().await.is_empty()
    }

    /// Get queue statistics
    pub async fn statistics(&self) -> QueueStatistics {
        self.statistics.read().await.clone()
    }

    /// Clear all tasks from the queue
    pub async fn clear(&self) {
        let mut tasks = self.tasks.write().await;
        let mut stats = self.statistics.write().await;

        let cleared_count = tasks.len();
        tasks.clear();
        stats.current_queue_size = 0;

        info!("Cleared {} tasks from priority queue", cleared_count);
    }

    /// Get all tasks that are ready to be processed
    pub async fn get_ready_tasks(&self, max_count: usize) -> Vec<HealTask> {
        let mut tasks = self.tasks.write().await;
        let mut ready_tasks = Vec::new();
        let mut remaining_tasks = Vec::new();

        while let Some(task) = tasks.pop() {
            if task.is_ready() && ready_tasks.len() < max_count {
                ready_tasks.push(task);
            } else {
                remaining_tasks.push(task);
            }
        }

        // Put remaining tasks back
        for task in remaining_tasks {
            tasks.push(task);
        }

        ready_tasks
    }

    /// Remove a specific task by ID
    pub async fn remove_task(&self, task_id: &str) -> bool {
        let mut tasks = self.tasks.write().await;
        let mut stats = self.statistics.write().await;

        let mut temp_tasks = Vec::new();
        let mut found = false;

        while let Some(task) = tasks.pop() {
            if task.id == task_id {
                found = true;
                stats.total_tasks_removed += 1;
                debug!("Removed specific task from queue: {}", task_id);
            } else {
                temp_tasks.push(task);
            }
        }

        // Put remaining tasks back
        for task in temp_tasks {
            tasks.push(task);
        }

        stats.current_queue_size = tasks.len() as u64;
        found
    }

    /// Get tasks by priority level
    pub async fn get_tasks_by_priority(&self, priority: HealPriority) -> Vec<HealTask> {
        let mut tasks = self.tasks.write().await;
        let mut matching_tasks = Vec::new();
        let mut other_tasks = Vec::new();

        while let Some(task) = tasks.pop() {
            if task.priority == priority {
                matching_tasks.push(task);
            } else {
                other_tasks.push(task);
            }
        }

        // Put other tasks back
        for task in other_tasks {
            tasks.push(task);
        }

        matching_tasks
    }

    /// Update task priority
    pub async fn update_priority(&self, task_id: &str, new_priority: HealPriority) -> bool {
        let mut tasks = self.tasks.write().await;

        let mut temp_tasks = Vec::new();
        let mut found = false;

        while let Some(mut task) = tasks.pop() {
            if task.id == task_id {
                task.priority = new_priority.clone();
                found = true;
                debug!("Updated task priority: {} -> {:?}", task_id, new_priority);
            }
            temp_tasks.push(task);
        }

        // Put all tasks back
        for task in temp_tasks {
            tasks.push(task);
        }

        found
    }
}
// Implement Ord for HealTask to enable priority queue functionality
impl std::cmp::Ord for HealTask {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // BinaryHeap is a max-heap and HealPriority uses lower enum values for
        // higher priority (Critical = 0), so the comparison must be reversed:
        // the more urgent task has to compare as greater to pop first. Ties go
        // to the older task, with the id as a final tie-breaker so the
        // ordering stays consistent with `PartialEq`.
        other.priority.cmp(&self.priority)
            .then_with(|| other.created_at.cmp(&self.created_at))
            .then_with(|| self.id.cmp(&other.id))
    }
}

impl std::cmp::PartialOrd for HealTask {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl std::cmp::PartialEq for HealTask {
    fn eq(&self, other: &Self) -> bool {
        self.id == other.id
    }
}

impl std::cmp::Eq for HealTask {}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_priority_queue_creation() {
        let queue = PriorityQueue::new(100);
        assert_eq!(queue.len().await, 0);
        assert!(queue.is_empty().await);
    }

    #[tokio::test]
    async fn test_priority_queue_push_pop() {
        let queue = PriorityQueue::new(10);

        let issue1 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "bucket1".to_string(),
            object: "object1".to_string(),
            description: "Test issue 1".to_string(),
            metadata: None,
        };

        let issue2 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "bucket2".to_string(),
            object: "object2".to_string(),
            description: "Test issue 2".to_string(),
            metadata: None,
        };

        let task1 = HealTask::new(issue1);
        let task2 = HealTask::new(issue2);

        // Add tasks
        queue.push(task1.clone()).await.unwrap();
        queue.push(task2.clone()).await.unwrap();

        assert_eq!(queue.len().await, 2);

        // Critical task should come first
        let first_task = queue.pop().await.unwrap();
        assert_eq!(first_task.priority, HealPriority::Critical);
        assert_eq!(first_task.id, task2.id);

        let second_task = queue.pop().await.unwrap();
        assert_eq!(second_task.priority, HealPriority::Low);
        assert_eq!(second_task.id, task1.id);

        assert!(queue.is_empty().await);
    }

    #[tokio::test]
    async fn test_priority_queue_full() {
        let queue = PriorityQueue::new(1);

        let issue1 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "bucket1".to_string(),
            object: "object1".to_string(),
            description: "Test issue 1".to_string(),
            metadata: None,
        };

        let issue2 = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "bucket2".to_string(),
            object: "object2".to_string(),
            description: "Test issue 2".to_string(),
            metadata: None,
        };

        let task1 = HealTask::new(issue1);
        let task2 = HealTask::new(issue2);

        // First task should succeed
        queue.push(task1).await.unwrap();
        assert_eq!(queue.len().await, 1);

        // Second task should fail
        let result = queue.push(task2).await;
        assert!(result.is_err());
        assert_eq!(queue.len().await, 1);

        let stats = queue.statistics().await;
        assert_eq!(stats.tasks_rejected, 1);
    }

    #[tokio::test]
    async fn test_priority_queue_remove_task() {
        let queue = PriorityQueue::new(10);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Medium,
            bucket: "bucket1".to_string(),
            object: "object1".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue);
        let task_id = task.id.clone();

        queue.push(task).await.unwrap();
        assert_eq!(queue.len().await, 1);

        // Remove the task
        let removed = queue.remove_task(&task_id).await;
        assert!(removed);
        assert_eq!(queue.len().await, 0);

        // Try to remove non-existent task
        let removed = queue.remove_task("non-existent").await;
        assert!(!removed);
    }

    #[tokio::test]
    async fn test_priority_queue_update_priority() {
        let queue = PriorityQueue::new(10);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Low,
            bucket: "bucket1".to_string(),
            object: "object1".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue);
        let task_id = task.id.clone();

        queue.push(task).await.unwrap();

        // Update priority
        let updated = queue.update_priority(&task_id, HealPriority::Critical).await;
        assert!(updated);

        // Check that the task now has higher priority
        let popped_task = queue.pop().await.unwrap();
        assert_eq!(popped_task.priority, HealPriority::Critical);
        assert_eq!(popped_task.id, task_id);
    }
}
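A small sketch of handling backpressure from the bounded queue above: `push` fails when the queue is full, so a caller can retry once after a short pause instead of losing the issue (the single-retry policy and the 250 ms delay are assumptions, not part of `PriorityQueue`):

```rust
async fn push_with_one_retry(queue: &PriorityQueue, task: HealTask) -> crate::error::Result<()> {
    match queue.push(task.clone()).await {
        Ok(()) => Ok(()),
        Err(_) => {
            // Queue was full; wait briefly for the heal workers to drain it.
            tokio::time::sleep(std::time::Duration::from_millis(250)).await;
            queue.push(task).await
        }
    }
}
```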
505
crates/ahm/src/heal/repair_worker.rs
Normal file
@@ -0,0 +1,505 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::{
    sync::{mpsc, RwLock},
    time::{sleep, timeout},
};
use tracing::{debug, error, info, warn};

use crate::error::Result;
use super::{HealConfig, HealResult, HealTask, Status};

/// Configuration for repair workers
#[derive(Debug, Clone)]
pub struct RepairWorkerConfig {
    /// Worker ID
    pub worker_id: String,
    /// Maximum time to spend on a single repair operation
    pub operation_timeout: Duration,
    /// Whether to enable detailed logging
    pub enable_detailed_logging: bool,
    /// Maximum number of concurrent operations
    pub max_concurrent_operations: usize,
    /// Retry configuration
    pub retry_config: RetryConfig,
}

/// Retry configuration for repair operations
#[derive(Debug, Clone)]
pub struct RetryConfig {
    /// Maximum number of retry attempts
    pub max_attempts: u32,
    /// Initial backoff delay
    pub initial_backoff: Duration,
    /// Maximum backoff delay
    pub max_backoff: Duration,
    /// Backoff multiplier
    pub backoff_multiplier: f64,
    /// Whether to use exponential backoff
    pub exponential_backoff: bool,
}

impl Default for RepairWorkerConfig {
    fn default() -> Self {
        Self {
            worker_id: "worker-1".to_string(),
            operation_timeout: Duration::from_secs(300), // 5 minutes
            enable_detailed_logging: true,
            max_concurrent_operations: 1,
            retry_config: RetryConfig::default(),
        }
    }
}

impl Default for RetryConfig {
    fn default() -> Self {
        Self {
            max_attempts: 3,
            initial_backoff: Duration::from_secs(1),
            max_backoff: Duration::from_secs(60),
            backoff_multiplier: 2.0,
            exponential_backoff: true,
        }
    }
}
|
||||
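// Illustrative sketch (not part of the original file): the retry delay that
// `process_task` below derives from this config. With the defaults above the
// schedule is 2s before attempt 2 and 4s before attempt 3, always clamped to
// `max_backoff`. The helper name is hypothetical.
#[allow(dead_code)]
fn example_backoff_delay(cfg: &RetryConfig, attempt: u32) -> Duration {
    if cfg.exponential_backoff && attempt > 1 {
        cfg.initial_backoff
            .mul_f64(cfg.backoff_multiplier.powi((attempt - 1) as i32))
            .min(cfg.max_backoff)
    } else {
        cfg.initial_backoff
    }
}
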
/// Statistics for a repair worker
#[derive(Debug, Clone, Default)]
pub struct WorkerStatistics {
    /// Total number of tasks processed
    pub total_tasks_processed: u64,
    /// Number of successful repairs
    pub successful_repairs: u64,
    /// Number of failed repairs
    pub failed_repairs: u64,
    /// Total time spent on repairs
    pub total_repair_time: Duration,
    /// Average repair time
    pub average_repair_time: Duration,
    /// Number of retry attempts made
    pub total_retry_attempts: u64,
    /// Current worker status
    pub status: WorkerStatus,
    /// Last task completion time
    pub last_task_time: Option<SystemTime>,
}

/// Worker status
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WorkerStatus {
    /// Worker is idle
    Idle,
    /// Worker is processing a task
    Processing,
    /// Worker is retrying a failed task
    Retrying,
    /// Worker is stopping
    Stopping,
    /// Worker has stopped
    Stopped,
    /// Worker encountered an error
    Error(String),
}

impl Default for WorkerStatus {
    fn default() -> Self {
        WorkerStatus::Idle
    }
}

/// Repair worker that executes healing tasks
pub struct RepairWorker {
    config: RepairWorkerConfig,
    statistics: Arc<RwLock<WorkerStatistics>>,
    status: Arc<RwLock<WorkerStatus>>,
    result_tx: mpsc::Sender<HealResult>,
    shutdown_tx: Option<mpsc::Sender<()>>,
}

impl RepairWorker {
    /// Create a new repair worker
    pub fn new(config: RepairWorkerConfig, result_tx: mpsc::Sender<HealResult>) -> Self {
        Self {
            config,
            statistics: Arc::new(RwLock::new(WorkerStatistics::default())),
            status: Arc::new(RwLock::new(WorkerStatus::Idle)),
            result_tx,
            shutdown_tx: None,
        }
    }

    /// Start the repair worker
    pub async fn start(&mut self) -> Result<()> {
        info!("Starting repair worker: {}", self.config.worker_id);

        // NOTE: the task sender is dropped immediately, so the select! arm
        // below stays disabled until `submit_task` is implemented and keeps
        // hold of it. The channel is a placeholder for the task feed.
        let (_task_tx, task_rx) = mpsc::channel(100);
        let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1);

        self.shutdown_tx = Some(shutdown_tx);

        // Update status
        {
            let mut status = self.status.write().await;
            *status = WorkerStatus::Idle;
        }

        let config = self.config.clone();
        let statistics = Arc::clone(&self.statistics);
        let status = Arc::clone(&self.status);
        let result_tx = self.result_tx.clone();

        // Start the worker loop
        tokio::spawn(async move {
            let mut task_rx = task_rx;

            loop {
                tokio::select! {
                    Some(task) = task_rx.recv() => {
                        if let Err(e) = Self::process_task(
                            &config,
                            &statistics,
                            &status,
                            &result_tx,
                            task,
                        ).await {
                            error!("Failed to process task: {}", e);
                        }
                    }
                    _ = shutdown_rx.recv() => {
                        info!("Shutdown signal received, stopping worker: {}", config.worker_id);
                        break;
                    }
                }
            }

            // Update status to stopped
            let mut status = status.write().await;
            *status = WorkerStatus::Stopped;
        });

        info!("Repair worker started: {}", self.config.worker_id);
        Ok(())
    }

    /// Stop the repair worker
    pub async fn stop(&mut self) -> Result<()> {
        info!("Stopping repair worker: {}", self.config.worker_id);

        // Update status
        {
            let mut status = self.status.write().await;
            *status = WorkerStatus::Stopping;
        }

        // Send shutdown signal
        if let Some(shutdown_tx) = &self.shutdown_tx {
            let _ = shutdown_tx.send(()).await;
        }

        // Wait for the worker to stop (up to ~1 second)
        let mut attempts = 0;
        while attempts < 10 {
            let status = self.status.read().await;
            if *status == WorkerStatus::Stopped {
                break;
            }
            drop(status);
            sleep(Duration::from_millis(100)).await;
            attempts += 1;
        }

        info!("Repair worker stopped: {}", self.config.worker_id);
        Ok(())
    }

    /// Submit a task to the worker
    pub async fn submit_task(&self, _task: HealTask) -> Result<()> {
        // TODO: Implement task submission
        Err(crate::error::Error::Other(anyhow::anyhow!("Task submission not implemented")))
    }

    /// Get worker statistics
    pub async fn statistics(&self) -> WorkerStatistics {
        self.statistics.read().await.clone()
    }

    /// Get worker status
    pub async fn status(&self) -> WorkerStatus {
        self.status.read().await.clone()
    }

    /// Process a single task
    async fn process_task(
        config: &RepairWorkerConfig,
        statistics: &Arc<RwLock<WorkerStatistics>>,
        status: &Arc<RwLock<WorkerStatus>>,
        result_tx: &mpsc::Sender<HealResult>,
        task: HealTask,
    ) -> Result<()> {
        let task_id = task.id.clone();

        // Update status to processing
        {
            let mut status = status.write().await;
            *status = WorkerStatus::Processing;
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.total_tasks_processed += 1;
            stats.status = WorkerStatus::Processing;
        }

        info!("Processing repair task: {} (worker: {})", task_id, config.worker_id);

        let start_time = Instant::now();
        let mut attempt = 0;
        let mut last_error = None;

        // Retry loop
        while attempt < config.retry_config.max_attempts {
            attempt += 1;

            if attempt > 1 {
                // Update status to retrying
                {
                    let mut status = status.write().await;
                    *status = WorkerStatus::Retrying;
                }

                // Calculate the backoff delay; `mul_f64` avoids truncating a
                // fractional multiplier to an integer.
                let backoff_delay = if config.retry_config.exponential_backoff {
                    config
                        .retry_config
                        .initial_backoff
                        .mul_f64(config.retry_config.backoff_multiplier.powi((attempt - 1) as i32))
                        .min(config.retry_config.max_backoff)
                } else {
                    config.retry_config.initial_backoff
                };

                warn!(
                    "Retrying task {} (attempt {}/{}), waiting {:?}",
                    task_id, attempt, config.retry_config.max_attempts, backoff_delay
                );
                sleep(backoff_delay).await;
            }

            // Attempt the repair operation
            let result = timeout(config.operation_timeout, Self::perform_repair_operation(&task, config)).await;

            match result {
                Ok(Ok(())) => {
                    // Success
                    let duration = start_time.elapsed();
                    let heal_result = HealResult {
                        success: true,
                        original_issue: task.issue.clone(),
                        repair_duration: duration,
                        retry_attempts: attempt - 1,
                        error_message: None,
                        metadata: None,
                        completed_at: SystemTime::now(),
                    };

                    // Send the result
                    if let Err(e) = result_tx.send(heal_result).await {
                        error!("Failed to send heal result: {}", e);
                    }

                    // Update statistics
                    {
                        let mut stats = statistics.write().await;
                        stats.successful_repairs += 1;
                        stats.total_repair_time += duration;
                        stats.average_repair_time = if stats.total_tasks_processed > 0 {
                            Duration::from_secs_f64(
                                stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64,
                            )
                        } else {
                            Duration::ZERO
                        };
                        stats.total_retry_attempts += (attempt - 1) as u64;
                        stats.last_task_time = Some(SystemTime::now());
                        stats.status = WorkerStatus::Idle;
                    }

                    info!("Successfully completed repair task: {} (worker: {})", task_id, config.worker_id);
                    return Ok(());
                }
                Ok(Err(e)) => {
                    // Operation failed
                    let error_msg = e.to_string();
                    last_error = Some(e);
                    warn!(
                        "Repair operation failed for task {} (attempt {}/{}): {}",
                        task_id, attempt, config.retry_config.max_attempts, error_msg
                    );
                }
                Err(_) => {
                    // Operation timed out
                    last_error = Some(crate::error::Error::Other(anyhow::anyhow!("Operation timed out")));
                    warn!(
                        "Repair operation timed out for task {} (attempt {}/{})",
                        task_id, attempt, config.retry_config.max_attempts
                    );
                }
            }
        }

        // All attempts failed
        let duration = start_time.elapsed();
        let heal_result = HealResult {
            success: false,
            original_issue: task.issue.clone(),
            repair_duration: duration,
            retry_attempts: attempt - 1,
            error_message: last_error.map(|e| e.to_string()),
            metadata: None,
            completed_at: SystemTime::now(),
        };

        // Send the result
        if let Err(e) = result_tx.send(heal_result).await {
            error!("Failed to send heal result: {}", e);
        }

        // Update statistics
        {
            let mut stats = statistics.write().await;
            stats.failed_repairs += 1;
            stats.total_repair_time += duration;
            stats.average_repair_time = if stats.total_tasks_processed > 0 {
                Duration::from_secs_f64(
                    stats.total_repair_time.as_secs_f64() / stats.total_tasks_processed as f64,
                )
            } else {
                Duration::ZERO
            };
            stats.total_retry_attempts += (attempt - 1) as u64;
            stats.last_task_time = Some(SystemTime::now());
            stats.status = WorkerStatus::Idle;
        }

        error!(
            "Failed to complete repair task after {} attempts: {} (worker: {})",
            attempt, task_id, config.worker_id
        );
        Ok(())
    }

    /// Perform the actual repair operation
    async fn perform_repair_operation(task: &HealTask, config: &RepairWorkerConfig) -> Result<()> {
        if config.enable_detailed_logging {
            debug!("Starting repair operation for task: {} (worker: {})", task.id, config.worker_id);
        }

        // Simulate the repair operation based on the issue type
        match task.issue.issue_type {
            crate::scanner::HealthIssueType::MissingReplica => {
                // Simulate replica repair
                sleep(Duration::from_millis(100)).await;
                if config.enable_detailed_logging {
                    debug!("Repaired missing replica for {}/{}", task.issue.bucket, task.issue.object);
                }
            }
            crate::scanner::HealthIssueType::ChecksumMismatch => {
                // Simulate checksum repair
                sleep(Duration::from_millis(200)).await;
                if config.enable_detailed_logging {
                    debug!("Repaired checksum mismatch for {}/{}", task.issue.bucket, task.issue.object);
                }
            }
            crate::scanner::HealthIssueType::DiskReadError => {
                // Simulate disk error recovery
                sleep(Duration::from_millis(300)).await;
                if config.enable_detailed_logging {
                    debug!("Recovered from disk read error for {}/{}", task.issue.bucket, task.issue.object);
                }
            }
            _ => {
                // Generic repair for other issue types
                sleep(Duration::from_millis(150)).await;
                if config.enable_detailed_logging {
                    debug!("Performed generic repair for {}/{}", task.issue.bucket, task.issue.object);
                }
            }
        }

        // Simulate occasional failures for testing
        if task.retry_count > 0 && task.retry_count % 3 == 0 {
            return Err(crate::error::Error::Other(anyhow::anyhow!("Simulated repair failure")));
        }

        if config.enable_detailed_logging {
            debug!("Completed repair operation for task: {} (worker: {})", task.id, config.worker_id);
        }

        Ok(())
    }
}

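// Illustrative usage sketch (not part of the original file): wiring a worker to
// a result channel and draining completed `HealResult`s. Task submission is
// still a TODO above, so this only exercises the lifecycle; the function name
// is hypothetical.
#[allow(dead_code)]
async fn example_worker_lifecycle() -> Result<()> {
    let (result_tx, mut result_rx) = mpsc::channel::<HealResult>(100);
    let mut worker = RepairWorker::new(RepairWorkerConfig::default(), result_tx);
    worker.start().await?;
    // Drain any results that have arrived (none yet, since submit_task is unimplemented).
    while let Ok(result) = result_rx.try_recv() {
        info!("heal result: success={}", result.success);
    }
    worker.stop().await
}
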
#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_repair_worker_creation() {
        let config = RepairWorkerConfig::default();
        let (result_tx, _result_rx) = mpsc::channel(100);
        let worker = RepairWorker::new(config, result_tx);

        assert_eq!(worker.status().await, WorkerStatus::Idle);
    }

    #[tokio::test]
    async fn test_repair_worker_start_stop() {
        let config = RepairWorkerConfig::default();
        let (result_tx, _result_rx) = mpsc::channel(100);
        let mut worker = RepairWorker::new(config, result_tx);

        // Start worker
        worker.start().await.unwrap();
        sleep(Duration::from_millis(100)).await;

        // Check status
        let status = worker.status().await;
        assert_eq!(status, WorkerStatus::Idle);

        // Stop worker
        worker.stop().await.unwrap();
        sleep(Duration::from_millis(100)).await;

        // Check status
        let status = worker.status().await;
        assert_eq!(status, WorkerStatus::Stopped);
    }

    #[tokio::test]
    async fn test_repair_worker_statistics() {
        let config = RepairWorkerConfig::default();
        let (result_tx, _result_rx) = mpsc::channel(100);
        let worker = RepairWorker::new(config, result_tx);

        let stats = worker.statistics().await;
        assert_eq!(stats.total_tasks_processed, 0);
        assert_eq!(stats.successful_repairs, 0);
        assert_eq!(stats.failed_repairs, 0);
        assert_eq!(stats.status, WorkerStatus::Idle);
    }
}
453  crates/ahm/src/heal/validation.rs  Normal file
@@ -0,0 +1,453 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{debug, info};

use crate::error::Result;
use super::{HealResult, HealTask};

/// Configuration for validation operations
#[derive(Debug, Clone)]
pub struct ValidationConfig {
    /// Whether to enable validation after repair
    pub enable_post_repair_validation: bool,
    /// Timeout for validation operations
    pub validation_timeout: Duration,
    /// Whether to enable detailed validation logging
    pub enable_detailed_logging: bool,
    /// Maximum number of validation retries
    pub max_validation_retries: u32,
    /// Validation retry delay
    pub validation_retry_delay: Duration,
}

impl Default for ValidationConfig {
    fn default() -> Self {
        Self {
            enable_post_repair_validation: true,
            validation_timeout: Duration::from_secs(60), // 1 minute
            max_validation_retries: 3,
            validation_retry_delay: Duration::from_secs(5),
            enable_detailed_logging: true,
        }
    }
}

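// Illustrative sketch (not part of the original file): how the retry knobs
// above could be consumed. A caller re-runs a validation up to
// `max_validation_retries` times, sleeping `validation_retry_delay` in
// between. The helper name and shape are assumptions.
#[allow(dead_code)]
async fn example_retry_validation<F, Fut>(cfg: &ValidationConfig, mut check: F) -> bool
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = bool>,
{
    for attempt in 0..=cfg.max_validation_retries {
        if check().await {
            return true;
        }
        if attempt < cfg.max_validation_retries {
            tokio::time::sleep(cfg.validation_retry_delay).await;
        }
    }
    false
}
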
/// Validation result for a repair operation
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Whether validation passed
    pub passed: bool,
    /// Validation type
    pub validation_type: ValidationType,
    /// Detailed validation message
    pub message: String,
    /// Time taken for validation
    pub duration: Duration,
    /// Validation timestamp
    pub timestamp: SystemTime,
    /// Additional validation metadata
    pub metadata: Option<serde_json::Value>,
}

/// Types of validation that can be performed
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationType {
    /// Checksum validation
    Checksum,
    /// File existence validation
    FileExistence,
    /// File size validation
    FileSize,
    /// File permissions validation
    FilePermissions,
    /// Metadata consistency validation
    MetadataConsistency,
    /// Replication status validation
    ReplicationStatus,
    /// Data integrity validation
    DataIntegrity,
    /// Custom validation
    Custom(String),
}

/// Statistics for validation operations
#[derive(Debug, Clone, Default)]
pub struct ValidationStatistics {
    /// Total number of validations performed
    pub total_validations: u64,
    /// Number of successful validations
    pub successful_validations: u64,
    /// Number of failed validations
    pub failed_validations: u64,
    /// Total time spent on validation
    pub total_validation_time: Duration,
    /// Average validation time
    pub average_validation_time: Duration,
    /// Number of validation retries
    pub total_validation_retries: u64,
    /// Last validation time
    pub last_validation_time: Option<SystemTime>,
}

/// Validator for repair operations
pub struct HealValidator {
    config: ValidationConfig,
    statistics: Arc<RwLock<ValidationStatistics>>,
}

impl HealValidator {
    /// Create a new validator
    pub fn new(config: ValidationConfig) -> Self {
        Self {
            config,
            statistics: Arc::new(RwLock::new(ValidationStatistics::default())),
        }
    }

    /// Validate a repair operation
    pub async fn validate_repair(&self, task: &HealTask, result: &HealResult) -> Result<Vec<ValidationResult>> {
        if !self.config.enable_post_repair_validation {
            return Ok(Vec::new());
        }

        let start_time = Instant::now();
        let mut validation_results = Vec::new();

        info!("Starting validation for repair task: {}", task.id);

        // Perform different types of validation based on the issue type
        match task.issue.issue_type {
            crate::scanner::HealthIssueType::MissingReplica => {
                validation_results.extend(self.validate_replica_repair(task, result).await?);
            }
            crate::scanner::HealthIssueType::ChecksumMismatch => {
                validation_results.extend(self.validate_checksum_repair(task, result).await?);
            }
            crate::scanner::HealthIssueType::DiskReadError => {
                validation_results.extend(self.validate_disk_repair(task, result).await?);
            }
            _ => {
                validation_results.extend(self.validate_generic_repair(task, result).await?);
            }
        }

        let duration = start_time.elapsed();

        // Update statistics
        {
            let mut stats = self.statistics.write().await;
            stats.total_validations += validation_results.len() as u64;
            stats.total_validation_time += duration;
            stats.average_validation_time = if stats.total_validations > 0 {
                Duration::from_secs_f64(
                    stats.total_validation_time.as_secs_f64() / stats.total_validations as f64,
                )
            } else {
                Duration::ZERO
            };
            stats.last_validation_time = Some(SystemTime::now());

            let successful_count = validation_results.iter().filter(|r| r.passed).count();
            let failed_count = validation_results.len() - successful_count;
            stats.successful_validations += successful_count as u64;
            stats.failed_validations += failed_count as u64;
        }

        if self.config.enable_detailed_logging {
            debug!(
                "Validation completed for task {}: {} passed, {} failed",
                task.id,
                validation_results.iter().filter(|r| r.passed).count(),
                validation_results.iter().filter(|r| !r.passed).count()
            );
        }

        Ok(validation_results)
    }

    /// Validate replica repair
    async fn validate_replica_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        let mut results = Vec::new();

        // Validate file existence
        let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await;
        results.push(existence_result);

        // Validate replication status
        let replication_result = self.validate_replication_status(&task.issue.bucket, &task.issue.object).await;
        results.push(replication_result);

        Ok(results)
    }

    /// Validate checksum repair
    async fn validate_checksum_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        let mut results = Vec::new();

        // Validate checksum
        let checksum_result = self.validate_checksum(&task.issue.bucket, &task.issue.object).await;
        results.push(checksum_result);

        // Validate data integrity
        let integrity_result = self.validate_data_integrity(&task.issue.bucket, &task.issue.object).await;
        results.push(integrity_result);

        Ok(results)
    }

    /// Validate disk repair
    async fn validate_disk_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        let mut results = Vec::new();

        // Validate file existence
        let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await;
        results.push(existence_result);

        // Validate file permissions
        let permissions_result = self.validate_file_permissions(&task.issue.bucket, &task.issue.object).await;
        results.push(permissions_result);

        Ok(results)
    }

    /// Validate generic repair
    async fn validate_generic_repair(&self, task: &HealTask, _result: &HealResult) -> Result<Vec<ValidationResult>> {
        let mut results = Vec::new();

        // Validate file existence
        let existence_result = self.validate_file_existence(&task.issue.bucket, &task.issue.object).await;
        results.push(existence_result);

        // Validate metadata consistency
        let metadata_result = self.validate_metadata_consistency(&task.issue.bucket, &task.issue.object).await;
        results.push(metadata_result);

        Ok(results)
    }

    /// Validate file existence
    async fn validate_file_existence(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate a file existence check
        tokio::time::sleep(Duration::from_millis(10)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::FileExistence,
            message: format!("File existence validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate checksum
    async fn validate_checksum(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate checksum validation
        tokio::time::sleep(Duration::from_millis(20)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::Checksum,
            message: format!("Checksum validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate replication status
    async fn validate_replication_status(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate replication status validation
        tokio::time::sleep(Duration::from_millis(15)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::ReplicationStatus,
            message: format!("Replication status validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate file permissions
    async fn validate_file_permissions(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate file permissions validation
        tokio::time::sleep(Duration::from_millis(5)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::FilePermissions,
            message: format!("File permissions validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate metadata consistency
    async fn validate_metadata_consistency(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate metadata consistency validation
        tokio::time::sleep(Duration::from_millis(25)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::MetadataConsistency,
            message: format!("Metadata consistency validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Validate data integrity
    async fn validate_data_integrity(&self, bucket: &str, object: &str) -> ValidationResult {
        let start_time = Instant::now();

        // Simulate data integrity validation
        tokio::time::sleep(Duration::from_millis(30)).await;

        let duration = start_time.elapsed();
        let passed = true; // Simulate successful validation

        ValidationResult {
            passed,
            validation_type: ValidationType::DataIntegrity,
            message: format!("Data integrity validation for {}/{}", bucket, object),
            duration,
            timestamp: SystemTime::now(),
            metadata: None,
        }
    }

    /// Get validation statistics
    pub async fn statistics(&self) -> ValidationStatistics {
        self.statistics.read().await.clone()
    }

    /// Reset validation statistics
    pub async fn reset_statistics(&self) {
        let mut stats = self.statistics.write().await;
        *stats = ValidationStatistics::default();
    }
}

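// Illustrative usage sketch (not part of the original file): running
// post-repair validation on a completed `HealResult` and logging failures.
// The function name is hypothetical.
#[allow(dead_code)]
async fn example_validate(validator: &HealValidator, task: &HealTask, result: &HealResult) -> Result<bool> {
    let outcomes = validator.validate_repair(task, result).await?;
    for outcome in &outcomes {
        if !outcome.passed {
            info!("validation failed: {:?} - {}", outcome.validation_type, outcome.message);
        }
    }
    Ok(outcomes.iter().all(|o| o.passed))
}
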
#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_validator_creation() {
        let config = ValidationConfig::default();
        let validator = HealValidator::new(config);

        let stats = validator.statistics().await;
        assert_eq!(stats.total_validations, 0);
    }

    #[tokio::test]
    async fn test_validate_repair() {
        let config = ValidationConfig::default();
        let validator = HealValidator::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue);
        let result = HealResult {
            success: true,
            original_issue: task.issue.clone(),
            repair_duration: Duration::from_secs(1),
            retry_attempts: 0,
            error_message: None,
            metadata: None,
            completed_at: SystemTime::now(),
        };

        let validation_results = validator.validate_repair(&task, &result).await.unwrap();
        assert!(!validation_results.is_empty());

        let stats = validator.statistics().await;
        assert_eq!(stats.total_validations, validation_results.len() as u64);
    }

    #[tokio::test]
    async fn test_validation_disabled() {
        let config = ValidationConfig {
            enable_post_repair_validation: false,
            ..Default::default()
        };
        let validator = HealValidator::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let task = HealTask::new(issue);
        let result = HealResult {
            success: true,
            original_issue: task.issue.clone(),
            repair_duration: Duration::from_secs(1),
            retry_attempts: 0,
            error_message: None,
            metadata: None,
            completed_at: SystemTime::now(),
        };

        let validation_results = validator.validate_repair(&task, &result).await.unwrap();
        assert!(validation_results.is_empty());
    }
}
739  crates/ahm/src/metrics/aggregator.rs  Normal file
@@ -0,0 +1,739 @@
// Copyright 2024 RustFS Team

use std::{
    collections::HashMap,
    time::{Duration, Instant, SystemTime},
};

use tracing::{debug, info, warn};

use crate::error::Result;

use super::{
    AggregatedMetrics, DiskMetrics, HealMetrics, MetricsDataPoint, MetricsQuery, MetricsSummary,
    MetricType, NetworkMetrics, PolicyMetrics, ScanMetrics, SystemMetrics,
};

/// Configuration for the metrics aggregator
#[derive(Debug, Clone)]
pub struct AggregatorConfig {
    /// Default aggregation interval
    pub default_interval: Duration,
    /// Maximum number of data points to keep in memory
    pub max_data_points: usize,
    /// Whether to enable automatic aggregation
    pub enable_auto_aggregation: bool,
    /// Aggregation window size
    pub aggregation_window: Duration,
    /// Whether to enable data compression
    pub enable_compression: bool,
    /// Compression threshold (number of points before compression)
    pub compression_threshold: usize,
    /// Whether to enable outlier detection
    pub enable_outlier_detection: bool,
    /// Outlier detection threshold (standard deviations)
    pub outlier_threshold: f64,
}

impl Default for AggregatorConfig {
    fn default() -> Self {
        Self {
            default_interval: Duration::from_secs(300), // 5 minutes
            max_data_points: 10000,
            enable_auto_aggregation: true,
            aggregation_window: Duration::from_secs(3600), // 1 hour
            enable_compression: true,
            compression_threshold: 1000,
            enable_outlier_detection: true,
            outlier_threshold: 2.0, // 2 standard deviations
        }
    }
}

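// Illustrative sketch (not part of the original file): one way the outlier
// knobs above could be applied. A value is flagged when it sits more than
// `outlier_threshold` standard deviations from the sample mean (a plain
// z-score test). The helper name is hypothetical.
#[allow(dead_code)]
fn example_is_outlier(samples: &[f64], value: f64, threshold: f64) -> bool {
    if samples.len() < 2 {
        return false;
    }
    let mean = samples.iter().sum::<f64>() / samples.len() as f64;
    let variance = samples.iter().map(|s| (s - mean).powi(2)).sum::<f64>() / samples.len() as f64;
    let std_dev = variance.sqrt();
    std_dev > 0.0 && ((value - mean).abs() / std_dev) > threshold
}
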
/// Metrics aggregator that processes and aggregates metrics data
#[derive(Debug, Clone)]
pub struct Aggregator {
    config: AggregatorConfig,
    data_points: Vec<MetricsDataPoint>,
    aggregation_cache: HashMap<String, AggregatedMetrics>,
    last_aggregation_time: SystemTime,
    aggregation_count: u64,
}

impl Aggregator {
    /// Create a new metrics aggregator
    pub async fn new(interval: Duration) -> Result<Self> {
        let config = AggregatorConfig {
            default_interval: interval,
            ..Default::default()
        };

        Ok(Self {
            config,
            data_points: Vec::new(),
            aggregation_cache: HashMap::new(),
            last_aggregation_time: SystemTime::now(),
            aggregation_count: 0,
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &AggregatorConfig {
        &self.config
    }

    /// Add a metrics data point
    pub async fn add_data_point(&mut self, data_point: MetricsDataPoint) -> Result<()> {
        self.data_points.push(data_point);

        // Trim old data points if we exceed the limit
        if self.data_points.len() > self.config.max_data_points {
            let excess = self.data_points.len() - self.config.max_data_points;
            self.data_points.drain(0..excess);
        }

        // Auto-aggregate if enabled
        if self.config.enable_auto_aggregation {
            self.auto_aggregate().await?;
        }

        Ok(())
    }

    /// Aggregate metrics based on a query
    pub async fn aggregate_metrics(&mut self, query: MetricsQuery) -> Result<AggregatedMetrics> {
        // Instant (not SystemTime) is the right clock for measuring elapsed time.
        let start_time = Instant::now();

        // Check the cache first
        let cache_key = self.generate_cache_key(&query);
        if let Some(cached) = self.aggregation_cache.get(&cache_key) {
            debug!("Returning cached aggregation result");
            return Ok(cached.clone());
        }

        // Filter data points by time range
        let filtered_points: Vec<&MetricsDataPoint> = self
            .data_points
            .iter()
            .filter(|point| point.timestamp >= query.start_time && point.timestamp <= query.end_time)
            .collect();

        if filtered_points.is_empty() {
            warn!("No data points found for the specified time range");
            let result = AggregatedMetrics {
                query,
                data_points: Vec::new(),
                summary: MetricsSummary::default(),
            };
            // Cache the empty result as well, so repeated queries over a quiet
            // window do not rescan and the cache statistics stay consistent.
            self.aggregation_cache.insert(cache_key, result.clone());
            return Ok(result);
        }

        // Aggregate the data points
        let aggregated_points = self.aggregate_data_points(&filtered_points, &query).await?;

        // Generate the summary
        let summary = self.generate_summary(&aggregated_points, &query).await?;

        let result = AggregatedMetrics {
            query,
            data_points: aggregated_points,
            summary,
        };

        // Cache the result
        self.aggregation_cache.insert(cache_key, result.clone());

        let aggregation_time = start_time.elapsed();
        debug!("Metrics aggregation completed in {:?}", aggregation_time);

        Ok(result)
    }

    /// Auto-aggregate data points
    async fn auto_aggregate(&mut self) -> Result<()> {
        let now = SystemTime::now();

        // Check whether it's time to aggregate; a clock that moved backwards
        // counts as "not yet" instead of panicking.
        let since_last = now
            .duration_since(self.last_aggregation_time)
            .unwrap_or(Duration::ZERO);
        if since_last < self.config.aggregation_window {
            return Ok(());
        }

        // Perform the aggregation over the trailing window
        let window_start = now - self.config.aggregation_window;
        let query = MetricsQuery {
            start_time: window_start,
            end_time: now,
            interval: self.config.default_interval,
            metrics: vec![], // All metrics
            severity_filter: None,
            limit: None,
        };

        let _aggregated = self.aggregate_metrics(query).await?;

        self.last_aggregation_time = now;
        self.aggregation_count += 1;

        info!("Auto-aggregation completed, count: {}", self.aggregation_count);

        Ok(())
    }

    /// Aggregate data points into fixed-size time buckets
    async fn aggregate_data_points(
        &self,
        points: &[&MetricsDataPoint],
        query: &MetricsQuery,
    ) -> Result<Vec<MetricsDataPoint>> {
        if points.is_empty() {
            return Ok(Vec::new());
        }

        let mut aggregated_points = Vec::new();
        let mut current_bucket_start = query.start_time;
        let mut current_bucket_points = Vec::new();

        for point in points {
            // Advance the bucket window until this point fits; this also closes
            // the current bucket when a point skips several intervals ahead.
            while point.timestamp >= current_bucket_start + query.interval {
                if !current_bucket_points.is_empty() {
                    let aggregated = self.aggregate_bucket(&current_bucket_points, current_bucket_start).await?;
                    aggregated_points.push(aggregated);
                    current_bucket_points.clear();
                }
                current_bucket_start = current_bucket_start + query.interval;
            }

            current_bucket_points.push(*point);
        }

        // Process the last bucket
        if !current_bucket_points.is_empty() {
            let aggregated = self.aggregate_bucket(&current_bucket_points, current_bucket_start).await?;
            aggregated_points.push(aggregated);
        }

        Ok(aggregated_points)
    }

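    // Illustrative sketch (not part of the original file): the bucket a
    // timestamp falls into under the scheme used by `aggregate_data_points`
    // above, expressed as an index relative to the query start. With an
    // interval of 60s, points at t = 0s, 60s, 120s map to buckets 0, 1, 2.
    // The helper name is hypothetical.
    #[allow(dead_code)]
    fn example_bucket_index(start: SystemTime, interval: Duration, ts: SystemTime) -> u64 {
        let offset = ts.duration_since(start).unwrap_or(Duration::ZERO);
        offset.as_secs() / interval.as_secs().max(1)
    }
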
    /// Aggregate a bucket of data points
    async fn aggregate_bucket(
        &self,
        points: &[&MetricsDataPoint],
        bucket_start: SystemTime,
    ) -> Result<MetricsDataPoint> {
        let mut aggregated = MetricsDataPoint {
            timestamp: bucket_start,
            system: None,
            network: None,
            disk_io: None,
            scan: None,
            heal: None,
            policy: None,
        };

        // Aggregate system metrics
        let system_metrics: Vec<&SystemMetrics> = points.iter().filter_map(|p| p.system.as_ref()).collect();
        if !system_metrics.is_empty() {
            aggregated.system = Some(self.aggregate_system_metrics(&system_metrics).await?);
        }

        // Aggregate network metrics
        let network_metrics: Vec<&NetworkMetrics> = points.iter().filter_map(|p| p.network.as_ref()).collect();
        if !network_metrics.is_empty() {
            aggregated.network = Some(self.aggregate_network_metrics(&network_metrics).await?);
        }

        // Aggregate disk I/O metrics
        let disk_metrics: Vec<&DiskMetrics> = points.iter().filter_map(|p| p.disk_io.as_ref()).collect();
        if !disk_metrics.is_empty() {
            aggregated.disk_io = Some(self.aggregate_disk_metrics(&disk_metrics).await?);
        }

        // Aggregate scan metrics
        let scan_metrics: Vec<&ScanMetrics> = points.iter().filter_map(|p| p.scan.as_ref()).collect();
        if !scan_metrics.is_empty() {
            aggregated.scan = Some(self.aggregate_scan_metrics(&scan_metrics).await?);
        }

        // Aggregate heal metrics
        let heal_metrics: Vec<&HealMetrics> = points.iter().filter_map(|p| p.heal.as_ref()).collect();
        if !heal_metrics.is_empty() {
            aggregated.heal = Some(self.aggregate_heal_metrics(&heal_metrics).await?);
        }

        // Aggregate policy metrics
        let policy_metrics: Vec<&PolicyMetrics> = points.iter().filter_map(|p| p.policy.as_ref()).collect();
        if !policy_metrics.is_empty() {
            aggregated.policy = Some(self.aggregate_policy_metrics(&policy_metrics).await?);
        }

        Ok(aggregated)
    }

    /// Aggregate system metrics (averages across the bucket)
    async fn aggregate_system_metrics(&self, metrics: &[&SystemMetrics]) -> Result<SystemMetrics> {
        if metrics.is_empty() {
            return Ok(SystemMetrics::default());
        }

        let cpu_usage: f64 = metrics.iter().map(|m| m.cpu_usage).sum::<f64>() / metrics.len() as f64;
        let memory_usage: f64 = metrics.iter().map(|m| m.memory_usage).sum::<f64>() / metrics.len() as f64;
        let disk_usage: f64 = metrics.iter().map(|m| m.disk_usage).sum::<f64>() / metrics.len() as f64;
        let system_load: f64 = metrics.iter().map(|m| m.system_load).sum::<f64>() / metrics.len() as f64;
        let active_operations: u64 = metrics.iter().map(|m| m.active_operations).sum::<u64>() / metrics.len() as u64;

        // Aggregate health issue counts by severity
        let mut health_issues = HashMap::new();
        for metric in metrics {
            for (severity, count) in &metric.health_issues {
                *health_issues.entry(*severity).or_insert(0) += count;
            }
        }

        Ok(SystemMetrics {
            timestamp: SystemTime::now(),
            cpu_usage,
            memory_usage,
            disk_usage,
            network_io: NetworkMetrics::default(), // Aggregated separately
            disk_io: DiskMetrics::default(),       // Aggregated separately
            active_operations,
            system_load,
            health_issues,
            scan_metrics: ScanMetrics::default(),     // Aggregated separately
            heal_metrics: HealMetrics::default(),     // Aggregated separately
            policy_metrics: PolicyMetrics::default(), // Aggregated separately
        })
    }

    /// Aggregate network metrics (average rates)
    async fn aggregate_network_metrics(&self, metrics: &[&NetworkMetrics]) -> Result<NetworkMetrics> {
        if metrics.is_empty() {
            return Ok(NetworkMetrics::default());
        }

        let bytes_received_per_sec: u64 =
            metrics.iter().map(|m| m.bytes_received_per_sec).sum::<u64>() / metrics.len() as u64;
        let bytes_sent_per_sec: u64 = metrics.iter().map(|m| m.bytes_sent_per_sec).sum::<u64>() / metrics.len() as u64;
        let packets_received_per_sec: u64 =
            metrics.iter().map(|m| m.packets_received_per_sec).sum::<u64>() / metrics.len() as u64;
        let packets_sent_per_sec: u64 =
            metrics.iter().map(|m| m.packets_sent_per_sec).sum::<u64>() / metrics.len() as u64;

        Ok(NetworkMetrics {
            bytes_received_per_sec,
            bytes_sent_per_sec,
            packets_received_per_sec,
            packets_sent_per_sec,
        })
    }

    /// Aggregate disk metrics (average rates and latencies)
    async fn aggregate_disk_metrics(&self, metrics: &[&DiskMetrics]) -> Result<DiskMetrics> {
        if metrics.is_empty() {
            return Ok(DiskMetrics::default());
        }

        let bytes_read_per_sec: u64 = metrics.iter().map(|m| m.bytes_read_per_sec).sum::<u64>() / metrics.len() as u64;
        let bytes_written_per_sec: u64 =
            metrics.iter().map(|m| m.bytes_written_per_sec).sum::<u64>() / metrics.len() as u64;
        let read_ops_per_sec: u64 = metrics.iter().map(|m| m.read_ops_per_sec).sum::<u64>() / metrics.len() as u64;
        let write_ops_per_sec: u64 = metrics.iter().map(|m| m.write_ops_per_sec).sum::<u64>() / metrics.len() as u64;
        let avg_read_latency_ms: f64 =
            metrics.iter().map(|m| m.avg_read_latency_ms).sum::<f64>() / metrics.len() as f64;
        let avg_write_latency_ms: f64 =
            metrics.iter().map(|m| m.avg_write_latency_ms).sum::<f64>() / metrics.len() as f64;

        Ok(DiskMetrics {
            bytes_read_per_sec,
            bytes_written_per_sec,
            read_ops_per_sec,
            write_ops_per_sec,
            avg_read_latency_ms,
            avg_write_latency_ms,
        })
    }

    /// Aggregate scan metrics (totals plus derived rates)
    async fn aggregate_scan_metrics(&self, metrics: &[&ScanMetrics]) -> Result<ScanMetrics> {
        if metrics.is_empty() {
            return Ok(ScanMetrics::default());
        }

        let objects_scanned: u64 = metrics.iter().map(|m| m.objects_scanned).sum();
        let bytes_scanned: u64 = metrics.iter().map(|m| m.bytes_scanned).sum();
        let scan_duration: Duration = metrics.iter().map(|m| m.scan_duration).sum();
        let health_issues_found: u64 = metrics.iter().map(|m| m.health_issues_found).sum();
        let scan_cycles_completed: u64 = metrics.iter().map(|m| m.scan_cycles_completed).sum();

        // Calculate rates
        let total_duration_secs = scan_duration.as_secs_f64();
        let scan_rate_objects_per_sec = if total_duration_secs > 0.0 {
            objects_scanned as f64 / total_duration_secs
        } else {
            0.0
        };

        let scan_rate_bytes_per_sec = if total_duration_secs > 0.0 {
            bytes_scanned as f64 / total_duration_secs
        } else {
            0.0
        };

        Ok(ScanMetrics {
            objects_scanned,
            bytes_scanned,
            scan_duration,
            scan_rate_objects_per_sec,
            scan_rate_bytes_per_sec,
            health_issues_found,
            scan_cycles_completed,
            last_scan_time: metrics.last().and_then(|m| m.last_scan_time),
        })
    }

    /// Aggregate heal metrics (totals plus derived averages)
    async fn aggregate_heal_metrics(&self, metrics: &[&HealMetrics]) -> Result<HealMetrics> {
        if metrics.is_empty() {
            return Ok(HealMetrics::default());
        }

        let total_repairs: u64 = metrics.iter().map(|m| m.total_repairs).sum();
        let successful_repairs: u64 = metrics.iter().map(|m| m.successful_repairs).sum();
        let failed_repairs: u64 = metrics.iter().map(|m| m.failed_repairs).sum();
        let total_repair_time: Duration = metrics.iter().map(|m| m.total_repair_time).sum();
        let total_retry_attempts: u64 = metrics.iter().map(|m| m.total_retry_attempts).sum();

        // Calculate the average repair time
        let average_repair_time = if total_repairs > 0 {
            let total_ms = total_repair_time.as_millis() as u64;
            Duration::from_millis(total_ms / total_repairs)
        } else {
            Duration::ZERO
        };

        // Take the latest values for current-state gauges
        let active_repair_workers = metrics.last().map(|m| m.active_repair_workers).unwrap_or(0);
        let queued_repair_tasks = metrics.last().map(|m| m.queued_repair_tasks).unwrap_or(0);
        let last_repair_time = metrics.last().and_then(|m| m.last_repair_time);

        Ok(HealMetrics {
            total_repairs,
            successful_repairs,
            failed_repairs,
            total_repair_time,
            average_repair_time,
            active_repair_workers,
            queued_repair_tasks,
            last_repair_time,
            total_retry_attempts,
        })
    }

    /// Aggregate policy metrics
    async fn aggregate_policy_metrics(&self, metrics: &[&PolicyMetrics]) -> Result<PolicyMetrics> {
        if metrics.is_empty() {
            return Ok(PolicyMetrics::default());
        }

        let total_evaluations: u64 = metrics.iter().map(|m| m.total_evaluations).sum();
        let allowed_operations: u64 = metrics.iter().map(|m| m.allowed_operations).sum();
        let denied_operations: u64 = metrics.iter().map(|m| m.denied_operations).sum();
        let scan_policy_evaluations: u64 = metrics.iter().map(|m| m.scan_policy_evaluations).sum();
        let heal_policy_evaluations: u64 = metrics.iter().map(|m| m.heal_policy_evaluations).sum();
        let retention_policy_evaluations: u64 = metrics.iter().map(|m| m.retention_policy_evaluations).sum();
        // Note: only per-window averages are available here, so summing them and
        // dividing by the total evaluation count is an approximation.
        let total_evaluation_time: Duration = metrics.iter().map(|m| m.average_evaluation_time).sum();

        let average_evaluation_time = if total_evaluations > 0 {
            let total_ms = total_evaluation_time.as_millis() as u64;
            Duration::from_millis(total_ms / total_evaluations)
        } else {
            Duration::ZERO
        };

        Ok(PolicyMetrics {
            total_evaluations,
            allowed_operations,
            denied_operations,
            scan_policy_evaluations,
            heal_policy_evaluations,
            retention_policy_evaluations,
            average_evaluation_time,
        })
    }

    /// Generate summary statistics
    async fn generate_summary(
        &self,
        data_points: &[MetricsDataPoint],
        query: &MetricsQuery,
    ) -> Result<MetricsSummary> {
        let total_points = data_points.len() as u64;
        let time_range = query.end_time.duration_since(query.start_time).unwrap_or(Duration::ZERO);

        // Calculate averages from system metrics
        let system_metrics: Vec<&SystemMetrics> = data_points.iter().filter_map(|p| p.system.as_ref()).collect();

        let avg_cpu_usage = if !system_metrics.is_empty() {
            system_metrics.iter().map(|m| m.cpu_usage).sum::<f64>() / system_metrics.len() as f64
        } else {
            0.0
        };

        let avg_memory_usage = if !system_metrics.is_empty() {
            system_metrics.iter().map(|m| m.memory_usage).sum::<f64>() / system_metrics.len() as f64
        } else {
            0.0
        };

        let avg_disk_usage = if !system_metrics.is_empty() {
            system_metrics.iter().map(|m| m.disk_usage).sum::<f64>() / system_metrics.len() as f64
        } else {
            0.0
        };

        // Calculate totals from scan and heal metrics
        let scan_metrics: Vec<&ScanMetrics> = data_points.iter().filter_map(|p| p.scan.as_ref()).collect();

        let total_objects_scanned = scan_metrics.iter().map(|m| m.objects_scanned).sum();
        let total_health_issues = scan_metrics.iter().map(|m| m.health_issues_found).sum();

        let heal_metrics: Vec<&HealMetrics> = data_points.iter().filter_map(|p| p.heal.as_ref()).collect();

        let total_repairs = heal_metrics.iter().map(|m| m.total_repairs).sum();
        let successful_repairs: u64 = heal_metrics.iter().map(|m| m.successful_repairs).sum();
        let repair_success_rate = if total_repairs > 0 {
            successful_repairs as f64 / total_repairs as f64
        } else {
            0.0
        };

        Ok(MetricsSummary {
            total_points,
            time_range,
            avg_cpu_usage,
            avg_memory_usage,
            avg_disk_usage,
            total_objects_scanned,
            total_repairs,
            repair_success_rate,
            total_health_issues,
        })
    }

    /// Generate a cache key for a query
    fn generate_cache_key(&self, query: &MetricsQuery) -> String {
        format!(
            "{:?}_{:?}_{:?}_{:?}",
            query.start_time, query.end_time, query.interval, query.metrics
        )
    }

    /// Clear old cache entries
    pub async fn clear_old_cache(&mut self) -> Result<()> {
        let now = SystemTime::now();
        let retention_period = Duration::from_secs(3600); // 1 hour

        self.aggregation_cache.retain(|_key, value| {
            if let Some(latest_point) = value.data_points.last() {
                now.duration_since(latest_point.timestamp).unwrap_or(Duration::ZERO) < retention_period
            } else {
                false
            }
        });

        info!("Cleared old cache entries, remaining: {}", self.aggregation_cache.len());
        Ok(())
    }

    /// Get aggregation statistics
    pub fn get_statistics(&self) -> AggregatorStatistics {
        AggregatorStatistics {
            total_data_points: self.data_points.len(),
            total_aggregations: self.aggregation_count,
            cache_size: self.aggregation_cache.len(),
            last_aggregation_time: self.last_aggregation_time,
            config: self.config.clone(),
        }
    }
}

/// Aggregator statistics
#[derive(Debug, Clone)]
pub struct AggregatorStatistics {
    pub total_data_points: usize,
    pub total_aggregations: u64,
    pub cache_size: usize,
    pub last_aggregation_time: SystemTime,
    pub config: AggregatorConfig,
}

impl Default for MetricsSummary {
    fn default() -> Self {
        Self {
            total_points: 0,
            time_range: Duration::ZERO,
            avg_cpu_usage: 0.0,
            avg_memory_usage: 0.0,
            avg_disk_usage: 0.0,
            total_objects_scanned: 0,
            total_repairs: 0,
            repair_success_rate: 0.0,
            total_health_issues: 0,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_aggregator_creation() {
        let aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap();

        assert_eq!(aggregator.config().default_interval, Duration::from_secs(300));
        assert!(aggregator.config().enable_auto_aggregation);
    }

    #[tokio::test]
    async fn test_data_point_addition() {
        let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap();

        let data_point = MetricsDataPoint {
            timestamp: SystemTime::now(),
            system: Some(SystemMetrics::default()),
            network: None,
            disk_io: None,
            scan: None,
            heal: None,
            policy: None,
        };

        aggregator.add_data_point(data_point).await.unwrap();

        let stats = aggregator.get_statistics();
        assert_eq!(stats.total_data_points, 1);
    }

    #[tokio::test]
    async fn test_metrics_aggregation() {
        let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap();

        // Anchor the points and the query to the same base time so the first
        // point cannot fall just before the query window.
        let base_time = SystemTime::now();

        // Add some test data points, one per minute
        for i in 0..5u64 {
            let system_metrics = SystemMetrics {
                cpu_usage: i as f64 * 10.0,
                memory_usage: i as f64 * 20.0,
                ..Default::default()
            };

            let data_point = MetricsDataPoint {
                timestamp: base_time + Duration::from_secs(i * 60),
                system: Some(system_metrics),
                network: None,
                disk_io: None,
                scan: None,
                heal: None,
                policy: None,
            };

            aggregator.add_data_point(data_point).await.unwrap();
        }

        let query = MetricsQuery {
            start_time: base_time,
            end_time: base_time + Duration::from_secs(300),
            interval: Duration::from_secs(60),
            metrics: vec![MetricType::System],
            severity_filter: None,
            limit: None,
        };

        let result = aggregator.aggregate_metrics(query).await.unwrap();
        assert_eq!(result.data_points.len(), 5);
        assert_eq!(result.summary.total_points, 5);
    }

    #[tokio::test]
    async fn test_system_metrics_aggregation() {
        let aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap();

        let metrics = vec![
            SystemMetrics {
                cpu_usage: 10.0,
                memory_usage: 20.0,
                disk_usage: 30.0,
                ..Default::default()
            },
            SystemMetrics {
                cpu_usage: 20.0,
                memory_usage: 40.0,
                disk_usage: 60.0,
                ..Default::default()
            },
        ];

        let aggregated = aggregator
            .aggregate_system_metrics(&metrics.iter().collect::<Vec<_>>())
            .await
            .unwrap();

        assert_eq!(aggregated.cpu_usage, 15.0);
        assert_eq!(aggregated.memory_usage, 30.0);
        assert_eq!(aggregated.disk_usage, 45.0);
    }

    #[tokio::test]
    async fn test_cache_clearing() {
        let mut aggregator = Aggregator::new(Duration::from_secs(300)).await.unwrap();

        // Aggregate over an empty window in the past to populate the cache
        let query = MetricsQuery {
            start_time: SystemTime::now() - Duration::from_secs(3600),
            end_time: SystemTime::now() - Duration::from_secs(3000),
            interval: Duration::from_secs(60),
            metrics: vec![],
            severity_filter: None,
            limit: None,
        };

        let _result = aggregator.aggregate_metrics(query).await.unwrap();

        let stats_before = aggregator.get_statistics();
        assert_eq!(stats_before.cache_size, 1);

        aggregator.clear_old_cache().await.unwrap();

        let stats_after = aggregator.get_statistics();
        assert_eq!(stats_after.cache_size, 0);
    }
}
426  crates/ahm/src/metrics/collector.rs  Normal file
@@ -0,0 +1,426 @@
// Copyright 2024 RustFS Team

use std::{
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{debug, error, info, warn};

use crate::{
    error::Result,
    scanner::{HealthIssue, Severity},
};

use super::{
    AggregatedMetrics, Aggregator, DiskMetrics, HealMetrics, MetricsQuery, MetricType,
    NetworkMetrics, PolicyMetrics, ScanMetrics, SystemMetrics,
};

/// Configuration for the metrics collector
#[derive(Debug, Clone)]
pub struct CollectorConfig {
    /// Collection interval
    pub collection_interval: Duration,
    /// Whether to enable detailed metrics collection
    pub enable_detailed_metrics: bool,
    /// Maximum number of metrics to keep in memory
    pub max_metrics_in_memory: usize,
    /// Whether to enable automatic aggregation
    pub enable_auto_aggregation: bool,
    /// Aggregation interval
    pub aggregation_interval: Duration,
    /// Whether to enable resource monitoring
    pub enable_resource_monitoring: bool,
    /// Resource monitoring interval
    pub resource_monitoring_interval: Duration,
    /// Whether to enable health issue tracking
    pub enable_health_issue_tracking: bool,
    /// Metrics retention period
    pub metrics_retention_period: Duration,
}

impl Default for CollectorConfig {
    fn default() -> Self {
        Self {
            collection_interval: Duration::from_secs(30), // 30 seconds
            enable_detailed_metrics: true,
            max_metrics_in_memory: 10000,
            enable_auto_aggregation: true,
            aggregation_interval: Duration::from_secs(300), // 5 minutes
            enable_resource_monitoring: true,
            resource_monitoring_interval: Duration::from_secs(10), // 10 seconds
            enable_health_issue_tracking: true,
            metrics_retention_period: Duration::from_secs(86400 * 7), // 7 days
        }
    }
}

/// Metrics collector that gathers system metrics
|
||||
#[derive(Debug)]
|
||||
pub struct Collector {
|
||||
config: CollectorConfig,
|
||||
metrics: Arc<RwLock<Vec<SystemMetrics>>>,
|
||||
aggregator: Arc<Aggregator>,
|
||||
last_collection_time: Arc<RwLock<SystemTime>>,
|
||||
collection_count: Arc<RwLock<u64>>,
|
||||
health_issues: Arc<RwLock<std::collections::HashMap<Severity, u64>>>,
|
||||
}
|
||||
|
||||
impl Collector {
    /// Create a new metrics collector
    pub async fn new(config: CollectorConfig) -> Result<Self> {
        let aggregator = Arc::new(RwLock::new(Aggregator::new(config.aggregation_interval).await?));

        Ok(Self {
            config,
            metrics: Arc::new(RwLock::new(Vec::new())),
            aggregator,
            last_collection_time: Arc::new(RwLock::new(SystemTime::now())),
            collection_count: Arc::new(RwLock::new(0)),
            health_issues: Arc::new(RwLock::new(std::collections::HashMap::new())),
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &CollectorConfig {
        &self.config
    }

    /// Collect current system metrics
    pub async fn collect_metrics(&self) -> Result<SystemMetrics> {
        let start_time = Instant::now();

        let mut metrics = SystemMetrics {
            timestamp: SystemTime::now(),
            ..Default::default()
        };

        // Collect system resource metrics
        if self.config.enable_resource_monitoring {
            self.collect_system_resources(&mut metrics).await?;
        }

        // Collect scan metrics
        self.collect_scan_metrics(&mut metrics).await?;

        // Collect heal metrics
        self.collect_heal_metrics(&mut metrics).await?;

        // Collect policy metrics
        self.collect_policy_metrics(&mut metrics).await?;

        // Collect health issues
        if self.config.enable_health_issue_tracking {
            self.collect_health_issues(&mut metrics).await?;
        }

        // Store metrics
        {
            let mut metrics_store = self.metrics.write().await;
            metrics_store.push(metrics.clone());

            // Trim old metrics if we exceed the limit
            if metrics_store.len() > self.config.max_metrics_in_memory {
                let excess = metrics_store.len() - self.config.max_metrics_in_memory;
                metrics_store.drain(0..excess);
            }
        }

        // Update collection statistics
        {
            let mut last_time = self.last_collection_time.write().await;
            *last_time = metrics.timestamp;

            let mut count = self.collection_count.write().await;
            *count += 1;
        }

        let collection_time = start_time.elapsed();
        debug!("Metrics collection completed in {:?}", collection_time);

        Ok(metrics)
    }

    /// Collect system resource metrics
    async fn collect_system_resources(&self, metrics: &mut SystemMetrics) -> Result<()> {
        // Simulate system resource collection.
        // In a real implementation, this would use system APIs.

        metrics.cpu_usage = self.get_cpu_usage().await?;
        metrics.memory_usage = self.get_memory_usage().await?;
        metrics.disk_usage = self.get_disk_usage().await?;
        metrics.system_load = self.get_system_load().await?;
        metrics.active_operations = self.get_active_operations().await?;

        // Collect network metrics
        metrics.network_io = self.get_network_metrics().await?;

        // Collect disk I/O metrics
        metrics.disk_io = self.get_disk_io_metrics().await?;

        Ok(())
    }

    /// Collect scan metrics
    async fn collect_scan_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> {
        // In a real implementation, this would get data from the scanner.
        // Simulate some scan metrics for now.
        metrics.scan_metrics = ScanMetrics {
            objects_scanned: 1000,
            bytes_scanned: 1024 * 1024 * 100, // 100 MiB
            scan_duration: Duration::from_secs(60),
            scan_rate_objects_per_sec: 16.67,      // ~1000 objects / 60 s
            scan_rate_bytes_per_sec: 1_747_626.67, // ~100 MiB / 60 s
            health_issues_found: 5,
            scan_cycles_completed: 1,
            last_scan_time: Some(SystemTime::now()),
        };

        Ok(())
    }

    /// Collect heal metrics
    async fn collect_heal_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> {
        // In a real implementation, this would get data from the heal system.
        // Simulate some heal metrics for now.
        metrics.heal_metrics = HealMetrics {
            total_repairs: 10,
            successful_repairs: 8,
            failed_repairs: 2,
            total_repair_time: Duration::from_secs(300),
            average_repair_time: Duration::from_secs(30),
            active_repair_workers: 2,
            queued_repair_tasks: 5,
            last_repair_time: Some(SystemTime::now()),
            total_retry_attempts: 3,
        };

        Ok(())
    }

    /// Collect policy metrics
    async fn collect_policy_metrics(&self, metrics: &mut SystemMetrics) -> Result<()> {
        // In a real implementation, this would get data from the policy system.
        // Simulate some policy metrics for now.
        metrics.policy_metrics = PolicyMetrics {
            total_evaluations: 50,
            allowed_operations: 45,
            denied_operations: 5,
            scan_policy_evaluations: 20,
            heal_policy_evaluations: 20,
            retention_policy_evaluations: 10,
            average_evaluation_time: Duration::from_millis(10),
        };

        Ok(())
    }

    /// Collect health issues
    async fn collect_health_issues(&self, metrics: &mut SystemMetrics) -> Result<()> {
        let health_issues = self.health_issues.read().await;
        metrics.health_issues = health_issues.clone();
        Ok(())
    }
    /// Record a health issue
    pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> {
        let mut issues = self.health_issues.write().await;
        let count = issues.entry(issue.severity).or_insert(0);
        *count += 1;

        info!("Recorded health issue: {:?} - {}", issue.severity, issue.description);
        Ok(())
    }

    /// Record an event (alias for record_health_issue)
    pub async fn record_event(&self, issue: &HealthIssue) -> Result<()> {
        self.record_health_issue(issue).await
    }

    /// Clear health issues
    pub async fn clear_health_issues(&self) -> Result<()> {
        let mut health_issues = self.health_issues.write().await;
        health_issues.clear();

        info!("Cleared all health issues");
        Ok(())
    }

    /// Query metrics with aggregation
    pub async fn query_metrics(&self, query: MetricsQuery) -> Result<AggregatedMetrics> {
        // Delegate to the shared aggregator; the RwLock serializes mutable access.
        let mut aggregator = self.aggregator.write().await;
        aggregator.aggregate_metrics(query).await
    }

    /// Get metrics for a specific time range
    pub async fn get_metrics_range(&self, start_time: SystemTime, end_time: SystemTime) -> Result<Vec<SystemMetrics>> {
        let metrics = self.metrics.read().await;
        let filtered_metrics: Vec<SystemMetrics> = metrics
            .iter()
            .filter(|m| m.timestamp >= start_time && m.timestamp <= end_time)
            .cloned()
            .collect();

        Ok(filtered_metrics)
    }

    /// Get latest metrics
    pub async fn get_latest_metrics(&self) -> Result<Option<SystemMetrics>> {
        let metrics = self.metrics.read().await;
        Ok(metrics.last().cloned())
    }

    /// Get collection statistics
    pub async fn get_collection_statistics(&self) -> CollectionStatistics {
        let collection_count = *self.collection_count.read().await;
        let last_collection_time = *self.last_collection_time.read().await;
        let metrics_count = self.metrics.read().await.len();

        CollectionStatistics {
            total_collections: collection_count,
            last_collection_time,
            metrics_in_memory: metrics_count,
            config: self.config.clone(),
        }
    }

    /// Simulated system resource collection methods
    async fn get_cpu_usage(&self) -> Result<f64> {
        // Simulate CPU usage collection
        Ok(25.5) // 25.5%
    }

    async fn get_memory_usage(&self) -> Result<f64> {
        // Simulate memory usage collection
        Ok(60.2) // 60.2%
    }

    async fn get_disk_usage(&self) -> Result<f64> {
        // Simulate disk usage collection
        Ok(45.8) // 45.8%
    }

    async fn get_system_load(&self) -> Result<f64> {
        // Simulate system load collection
        Ok(0.75) // 0.75
    }

    async fn get_active_operations(&self) -> Result<u64> {
        // Simulate active operations count
        Ok(15)
    }

    async fn get_network_metrics(&self) -> Result<NetworkMetrics> {
        // Simulate network metrics collection
        Ok(NetworkMetrics {
            bytes_received_per_sec: 1024 * 1024, // 1 MB/s
            bytes_sent_per_sec: 512 * 1024,      // 512 KB/s
            packets_received_per_sec: 1000,
            packets_sent_per_sec: 500,
        })
    }

    async fn get_disk_io_metrics(&self) -> Result<DiskMetrics> {
        // Simulate disk I/O metrics collection
        Ok(DiskMetrics {
            bytes_read_per_sec: 2 * 1024 * 1024, // 2 MB/s
            bytes_written_per_sec: 1024 * 1024,  // 1 MB/s
            read_ops_per_sec: 200,
            write_ops_per_sec: 100,
            avg_read_latency_ms: 5.0,
            avg_write_latency_ms: 8.0,
        })
    }
}
/// Collection statistics
#[derive(Debug, Clone)]
pub struct CollectionStatistics {
    pub total_collections: u64,
    pub last_collection_time: SystemTime,
    pub metrics_in_memory: usize,
    pub config: CollectorConfig,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType};

    #[tokio::test]
    async fn test_collector_creation() {
        let config = CollectorConfig::default();
        let collector = Collector::new(config).await.unwrap();

        assert_eq!(collector.config().collection_interval, Duration::from_secs(30));
        assert!(collector.config().enable_detailed_metrics);
    }

    #[tokio::test]
    async fn test_metrics_collection() {
        let config = CollectorConfig::default();
        let collector = Collector::new(config).await.unwrap();

        let metrics = collector.collect_metrics().await.unwrap();
        assert_eq!(metrics.cpu_usage, 25.5);
        assert_eq!(metrics.memory_usage, 60.2);
        assert_eq!(metrics.disk_usage, 45.8);
    }

    #[tokio::test]
    async fn test_health_issue_recording() {
        let config = CollectorConfig::default();
        let collector = Collector::new(config).await.unwrap();

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        collector.record_health_issue(&issue).await.unwrap();

        let stats = collector.get_collection_statistics().await;
        assert_eq!(stats.total_collections, 0); // No collection yet
    }

    #[tokio::test]
    async fn test_latest_metrics() {
        let config = CollectorConfig::default();
        let collector = Collector::new(config).await.unwrap();

        // Initially no metrics
        let latest = collector.get_latest_metrics().await.unwrap();
        assert!(latest.is_none());

        // Collect metrics
        collector.collect_metrics().await.unwrap();

        // Now should have metrics
        let latest = collector.get_latest_metrics().await.unwrap();
        assert!(latest.is_some());
    }

    #[tokio::test]
    async fn test_collection_statistics() {
        let config = CollectorConfig::default();
        let collector = Collector::new(config).await.unwrap();

        let stats = collector.get_collection_statistics().await;
        assert_eq!(stats.total_collections, 0);
        assert_eq!(stats.metrics_in_memory, 0);

        // Collect metrics
        collector.collect_metrics().await.unwrap();

        let stats = collector.get_collection_statistics().await;
        assert_eq!(stats.total_collections, 1);
        assert_eq!(stats.metrics_in_memory, 1);
    }
}
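
A periodic driver for the collector is not part of this hunk; below is a minimal sketch of scheduling `collect_metrics` on `collection_interval` (assumed wiring; the `rustfs_ahm::metrics` import path is an assumption):

use std::sync::Arc;
use rustfs_ahm::metrics::Collector; // crate path assumed

async fn run_collection_loop(collector: Arc<Collector>) {
    // One SystemMetrics snapshot is gathered and stored per tick.
    let mut ticker = tokio::time::interval(collector.config().collection_interval);
    loop {
        ticker.tick().await;
        if let Err(e) = collector.collect_metrics().await {
            tracing::warn!("metrics collection failed: {:?}", e);
        }
    }
}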
617
crates/ahm/src/metrics/mod.rs
Normal file
@@ -0,0 +1,617 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Metrics collection and aggregation system
//!
//! The metrics subsystem provides comprehensive data collection and analysis:
//! - Real-time metrics collection from all subsystems
//! - Time-series data storage and aggregation
//! - Export capabilities for external monitoring systems
//! - Performance analytics and trend analysis

pub mod collector;
pub mod aggregator;
pub mod storage;
pub mod reporter;

pub use collector::{Collector, CollectorConfig};
pub use aggregator::{Aggregator, AggregatorConfig};
pub use storage::{Storage, StorageConfig};
pub use reporter::{Reporter, ReporterConfig};

use std::collections::HashMap;
use std::time::{Duration, SystemTime};

use serde::{Deserialize, Serialize};

use crate::scanner::Severity;
/// Metrics subsystem status
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Status {
    /// Metrics system is initializing
    Initializing,
    /// Metrics system is running normally
    Running,
    /// Metrics system is degraded (some exporters failing)
    Degraded,
    /// Metrics system is stopping
    Stopping,
    /// Metrics system has stopped
    Stopped,
    /// Metrics system encountered an error
    Error(String),
}

/// Metric data point with timestamp and value
#[derive(Debug, Clone)]
pub struct MetricPoint {
    /// Metric name
    pub name: String,
    /// Metric value
    pub value: MetricValue,
    /// Timestamp when metric was collected
    pub timestamp: SystemTime,
    /// Additional labels/tags
    pub labels: HashMap<String, String>,
}

/// Different types of metric values
#[derive(Debug, Clone)]
pub enum MetricValue {
    /// Counter that only increases
    Counter(u64),
    /// Gauge that can go up or down
    Gauge(f64),
    /// Histogram with buckets
    Histogram {
        count: u64,
        sum: f64,
        buckets: Vec<HistogramBucket>,
    },
    /// Summary with quantiles
    Summary {
        count: u64,
        sum: f64,
        quantiles: Vec<Quantile>,
    },
}

/// Histogram bucket
#[derive(Debug, Clone)]
pub struct HistogramBucket {
    /// Upper bound of the bucket
    pub le: f64,
    /// Count of observations in this bucket
    pub count: u64,
}

/// Summary quantile
#[derive(Debug, Clone)]
pub struct Quantile {
    /// Quantile value (e.g., 0.5 for median)
    pub quantile: f64,
    /// Value at this quantile
    pub value: f64,
}
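
// Example (assumed usage, illustrative only; `example_histogram_point` is not
// part of the original file): a histogram-valued MetricPoint recording three
// operation-duration observations in cumulative Prometheus-style buckets.
#[allow(dead_code)]
fn example_histogram_point() -> MetricPoint {
    MetricPoint {
        name: system_metrics::OPERATION_DURATION_SECONDS.to_string(),
        value: MetricValue::Histogram {
            count: 3,
            sum: 0.42, // total of the three observed durations, in seconds
            buckets: vec![
                HistogramBucket { le: 0.1, count: 1 }, // one observation <= 0.1 s
                HistogramBucket { le: 0.5, count: 3 }, // all three <= 0.5 s (cumulative)
            ],
        },
        timestamp: SystemTime::now(),
        labels: HashMap::from([("subsystem".to_string(), "scanner".to_string())]),
    }
}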
/// Aggregation functions for metrics
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AggregationFunction {
    Sum,
    Average,
    Min,
    Max,
    Count,
    Rate,
    Percentile(u8),
}

/// Time window for aggregation
#[derive(Debug, Clone)]
pub struct TimeWindow {
    /// Duration of the window
    pub duration: Duration,
    /// How often to create new windows
    pub step: Duration,
}

/// Metric export configuration
#[derive(Debug, Clone)]
pub struct ExportConfig {
    /// Export format
    pub format: ExportFormat,
    /// Export destination
    pub destination: ExportDestination,
    /// Export interval
    pub interval: Duration,
    /// Metric filters
    pub filters: Vec<MetricFilter>,
}

/// Supported export formats
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExportFormat {
    /// Prometheus format
    Prometheus,
    /// JSON format
    Json,
    /// CSV format
    Csv,
    /// Custom format
    Custom(String),
}

/// Export destinations
#[derive(Debug, Clone)]
pub enum ExportDestination {
    /// HTTP endpoint
    Http { url: String, headers: HashMap<String, String> },
    /// File system
    File { path: String },
    /// Standard output
    Stdout,
    /// Custom destination
    Custom(String),
}

/// Metric filtering rules
#[derive(Debug, Clone)]
pub struct MetricFilter {
    /// Metric name pattern (regex)
    pub name_pattern: String,
    /// Label filters
    pub label_filters: HashMap<String, String>,
    /// Include or exclude matching metrics
    pub include: bool,
}
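
// Example (assumed usage, illustrative only; `example_export_config` is not
// part of the original file): emit scanner metrics to stdout in Prometheus
// format every 60 seconds.
#[allow(dead_code)]
fn example_export_config() -> ExportConfig {
    ExportConfig {
        format: ExportFormat::Prometheus,
        destination: ExportDestination::Stdout,
        interval: Duration::from_secs(60),
        filters: vec![MetricFilter {
            name_pattern: "^rustfs_scan_.*".to_string(), // regex, per the doc above
            label_filters: HashMap::new(),
            include: true,
        }],
    }
}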
/// System-wide metrics that are automatically collected
pub mod system_metrics {
    /// Object-related metrics
    pub const OBJECTS_TOTAL: &str = "rustfs_objects_total";
    pub const OBJECTS_SIZE_BYTES: &str = "rustfs_objects_size_bytes";
    pub const OBJECTS_SCANNED_TOTAL: &str = "rustfs_objects_scanned_total";
    pub const OBJECTS_HEAL_OPERATIONS_TOTAL: &str = "rustfs_objects_heal_operations_total";

    /// Scanner metrics
    pub const SCAN_CYCLES_TOTAL: &str = "rustfs_scan_cycles_total";
    pub const SCAN_DURATION_SECONDS: &str = "rustfs_scan_duration_seconds";
    pub const SCAN_RATE_OBJECTS_PER_SECOND: &str = "rustfs_scan_rate_objects_per_second";
    pub const SCAN_RATE_BYTES_PER_SECOND: &str = "rustfs_scan_rate_bytes_per_second";

    /// Health metrics
    pub const HEALTH_ISSUES_TOTAL: &str = "rustfs_health_issues_total";
    pub const HEALTH_ISSUES_BY_SEVERITY: &str = "rustfs_health_issues_by_severity";
    pub const HEAL_SUCCESS_RATE: &str = "rustfs_heal_success_rate";

    /// System resource metrics
    pub const DISK_USAGE_BYTES: &str = "rustfs_disk_usage_bytes";
    pub const DISK_IOPS: &str = "rustfs_disk_iops";
    pub const MEMORY_USAGE_BYTES: &str = "rustfs_memory_usage_bytes";
    pub const CPU_USAGE_PERCENT: &str = "rustfs_cpu_usage_percent";

    /// Performance metrics
    pub const OPERATION_DURATION_SECONDS: &str = "rustfs_operation_duration_seconds";
    pub const ACTIVE_OPERATIONS: &str = "rustfs_active_operations";
    pub const THROUGHPUT_BYTES_PER_SECOND: &str = "rustfs_throughput_bytes_per_second";
}

/// System metrics collected by AHM
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Timestamp when metrics were collected
    pub timestamp: SystemTime,
    /// CPU usage percentage
    pub cpu_usage: f64,
    /// Memory usage percentage
    pub memory_usage: f64,
    /// Disk usage percentage
    pub disk_usage: f64,
    /// Network I/O rates
    pub network_io: NetworkMetrics,
    /// Disk I/O rates
    pub disk_io: DiskMetrics,
    /// Active operations count
    pub active_operations: u64,
    /// System load average
    pub system_load: f64,
    /// Health issues count by severity
    pub health_issues: HashMap<Severity, u64>,
    /// Scan metrics
    pub scan_metrics: ScanMetrics,
    /// Heal metrics
    pub heal_metrics: HealMetrics,
    /// Policy metrics
    pub policy_metrics: PolicyMetrics,
}

/// Network I/O metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkMetrics {
    /// Bytes received per second
    pub bytes_received_per_sec: u64,
    /// Bytes sent per second
    pub bytes_sent_per_sec: u64,
    /// Packets received per second
    pub packets_received_per_sec: u64,
    /// Packets sent per second
    pub packets_sent_per_sec: u64,
}

/// Disk I/O metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskMetrics {
    /// Bytes read per second
    pub bytes_read_per_sec: u64,
    /// Bytes written per second
    pub bytes_written_per_sec: u64,
    /// Read operations per second
    pub read_ops_per_sec: u64,
    /// Write operations per second
    pub write_ops_per_sec: u64,
    /// Average read latency in milliseconds
    pub avg_read_latency_ms: f64,
    /// Average write latency in milliseconds
    pub avg_write_latency_ms: f64,
}

/// Scan operation metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanMetrics {
    /// Total objects scanned
    pub objects_scanned: u64,
    /// Total bytes scanned
    pub bytes_scanned: u64,
    /// Scan duration
    pub scan_duration: Duration,
    /// Scan rate (objects per second)
    pub scan_rate_objects_per_sec: f64,
    /// Scan rate (bytes per second)
    pub scan_rate_bytes_per_sec: f64,
    /// Health issues found
    pub health_issues_found: u64,
    /// Scan cycles completed
    pub scan_cycles_completed: u64,
    /// Last scan time
    pub last_scan_time: Option<SystemTime>,
}

/// Heal operation metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealMetrics {
    /// Total repair operations
    pub total_repairs: u64,
    /// Successful repairs
    pub successful_repairs: u64,
    /// Failed repairs
    pub failed_repairs: u64,
    /// Total repair time
    pub total_repair_time: Duration,
    /// Average repair time
    pub average_repair_time: Duration,
    /// Active repair workers
    pub active_repair_workers: u64,
    /// Queued repair tasks
    pub queued_repair_tasks: u64,
    /// Last repair time
    pub last_repair_time: Option<SystemTime>,
    /// Retry attempts
    pub total_retry_attempts: u64,
}

/// Policy evaluation metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PolicyMetrics {
    /// Total policy evaluations
    pub total_evaluations: u64,
    /// Allowed operations
    pub allowed_operations: u64,
    /// Denied operations
    pub denied_operations: u64,
    /// Scan policy evaluations
    pub scan_policy_evaluations: u64,
    /// Heal policy evaluations
    pub heal_policy_evaluations: u64,
    /// Retention policy evaluations
    pub retention_policy_evaluations: u64,
    /// Average evaluation time
    pub average_evaluation_time: Duration,
}

impl Default for SystemMetrics {
    fn default() -> Self {
        Self {
            timestamp: SystemTime::now(),
            cpu_usage: 0.0,
            memory_usage: 0.0,
            disk_usage: 0.0,
            network_io: NetworkMetrics::default(),
            disk_io: DiskMetrics::default(),
            active_operations: 0,
            system_load: 0.0,
            health_issues: HashMap::new(),
            scan_metrics: ScanMetrics::default(),
            heal_metrics: HealMetrics::default(),
            policy_metrics: PolicyMetrics::default(),
        }
    }
}

impl Default for NetworkMetrics {
    fn default() -> Self {
        Self {
            bytes_received_per_sec: 0,
            bytes_sent_per_sec: 0,
            packets_received_per_sec: 0,
            packets_sent_per_sec: 0,
        }
    }
}

impl Default for DiskMetrics {
    fn default() -> Self {
        Self {
            bytes_read_per_sec: 0,
            bytes_written_per_sec: 0,
            read_ops_per_sec: 0,
            write_ops_per_sec: 0,
            avg_read_latency_ms: 0.0,
            avg_write_latency_ms: 0.0,
        }
    }
}

impl Default for ScanMetrics {
    fn default() -> Self {
        Self {
            objects_scanned: 0,
            bytes_scanned: 0,
            scan_duration: Duration::ZERO,
            scan_rate_objects_per_sec: 0.0,
            scan_rate_bytes_per_sec: 0.0,
            health_issues_found: 0,
            scan_cycles_completed: 0,
            last_scan_time: None,
        }
    }
}

impl Default for HealMetrics {
    fn default() -> Self {
        Self {
            total_repairs: 0,
            successful_repairs: 0,
            failed_repairs: 0,
            total_repair_time: Duration::ZERO,
            average_repair_time: Duration::ZERO,
            active_repair_workers: 0,
            queued_repair_tasks: 0,
            last_repair_time: None,
            total_retry_attempts: 0,
        }
    }
}

impl Default for PolicyMetrics {
    fn default() -> Self {
        Self {
            total_evaluations: 0,
            allowed_operations: 0,
            denied_operations: 0,
            scan_policy_evaluations: 0,
            heal_policy_evaluations: 0,
            retention_policy_evaluations: 0,
            average_evaluation_time: Duration::ZERO,
        }
    }
}
/// Metrics query parameters
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsQuery {
    /// Start time for the query
    pub start_time: SystemTime,
    /// End time for the query
    pub end_time: SystemTime,
    /// Metrics aggregation interval
    pub interval: Duration,
    /// Metrics to include in the query
    pub metrics: Vec<MetricType>,
    /// Filter by severity
    pub severity_filter: Option<Severity>,
    /// Limit number of results
    pub limit: Option<u64>,
}

/// Types of metrics that can be queried
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum MetricType {
    /// System metrics (CPU, memory, disk)
    System,
    /// Network metrics
    Network,
    /// Disk I/O metrics
    DiskIo,
    /// Scan metrics
    Scan,
    /// Heal metrics
    Heal,
    /// Policy metrics
    Policy,
    /// Health issues
    HealthIssues,
}

/// Aggregated metrics data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregatedMetrics {
    /// Query parameters used
    pub query: MetricsQuery,
    /// Aggregated data points
    pub data_points: Vec<MetricsDataPoint>,
    /// Summary statistics
    pub summary: MetricsSummary,
}

/// Individual metrics data point
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsDataPoint {
    /// Timestamp for this data point
    pub timestamp: SystemTime,
    /// System metrics
    pub system: Option<SystemMetrics>,
    /// Network metrics
    pub network: Option<NetworkMetrics>,
    /// Disk I/O metrics
    pub disk_io: Option<DiskMetrics>,
    /// Scan metrics
    pub scan: Option<ScanMetrics>,
    /// Heal metrics
    pub heal: Option<HealMetrics>,
    /// Policy metrics
    pub policy: Option<PolicyMetrics>,
}

/// Summary statistics for aggregated metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsSummary {
    /// Total data points
    pub total_points: u64,
    /// Time range covered
    pub time_range: Duration,
    /// Average CPU usage
    pub avg_cpu_usage: f64,
    /// Average memory usage
    pub avg_memory_usage: f64,
    /// Average disk usage
    pub avg_disk_usage: f64,
    /// Total objects scanned
    pub total_objects_scanned: u64,
    /// Total repairs performed
    pub total_repairs: u64,
    /// Success rate for repairs, as a fraction in 0.0..=1.0
    pub repair_success_rate: f64,
    /// Total health issues
    pub total_health_issues: u64,
}
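
// Note: `repair_success_rate` is treated as a fraction in 0.0..=1.0 throughout
// (e.g. 8 successful out of 10 total repairs => 0.8); consumers that report
// percentages multiply by 100.0. A minimal sketch of how a summary value could
// be derived (assumed, not the actual aggregator implementation):
#[allow(dead_code)]
fn repair_success_rate(heal: &HealMetrics) -> f64 {
    if heal.total_repairs == 0 {
        1.0 // no repairs attempted counts as fully healthy
    } else {
        heal.successful_repairs as f64 / heal.total_repairs as f64
    }
}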
/// Resource usage information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceUsage {
    /// Disk usage information
    pub disk_usage: DiskUsage,
    /// Memory usage information
    pub memory_usage: MemoryUsage,
    /// Network usage information
    pub network_usage: NetworkUsage,
    /// CPU usage information
    pub cpu_usage: CpuUsage,
}

/// Disk usage information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskUsage {
    /// Total disk space in bytes
    pub total_bytes: u64,
    /// Used disk space in bytes
    pub used_bytes: u64,
    /// Available disk space in bytes
    pub available_bytes: u64,
    /// Usage percentage
    pub usage_percentage: f64,
}

/// Memory usage information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryUsage {
    /// Total memory in bytes
    pub total_bytes: u64,
    /// Used memory in bytes
    pub used_bytes: u64,
    /// Available memory in bytes
    pub available_bytes: u64,
    /// Usage percentage
    pub usage_percentage: f64,
}

/// Network usage information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkUsage {
    /// Bytes received
    pub bytes_received: u64,
    /// Bytes sent
    pub bytes_sent: u64,
    /// Packets received
    pub packets_received: u64,
    /// Packets sent
    pub packets_sent: u64,
}

/// CPU usage information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CpuUsage {
    /// CPU usage percentage
    pub usage_percentage: f64,
    /// Number of CPU cores
    pub cores: u32,
    /// Load average
    pub load_average: f64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_system_metrics_creation() {
        let metrics = SystemMetrics::default();
        assert_eq!(metrics.cpu_usage, 0.0);
        assert_eq!(metrics.memory_usage, 0.0);
        assert_eq!(metrics.active_operations, 0);
    }

    #[test]
    fn test_scan_metrics_creation() {
        let metrics = ScanMetrics::default();
        assert_eq!(metrics.objects_scanned, 0);
        assert_eq!(metrics.bytes_scanned, 0);
        assert_eq!(metrics.scan_cycles_completed, 0);
    }

    #[test]
    fn test_heal_metrics_creation() {
        let metrics = HealMetrics::default();
        assert_eq!(metrics.total_repairs, 0);
        assert_eq!(metrics.successful_repairs, 0);
        assert_eq!(metrics.failed_repairs, 0);
    }

    #[test]
    fn test_metrics_query_creation() {
        let start_time = SystemTime::now();
        let end_time = start_time + Duration::from_secs(3600);
        let query = MetricsQuery {
            start_time,
            end_time,
            interval: Duration::from_secs(60),
            metrics: vec![MetricType::System, MetricType::Scan],
            severity_filter: Some(Severity::Critical),
            limit: Some(100),
        };

        assert_eq!(query.metrics.len(), 2);
        assert_eq!(query.interval, Duration::from_secs(60));
        assert_eq!(query.limit, Some(100));
    }
}
861
crates/ahm/src/metrics/reporter.rs
Normal file
@@ -0,0 +1,861 @@
// Copyright 2024 RustFS Team

use std::{
    collections::HashMap,
    fmt,
    sync::Arc,
    time::{Duration, SystemTime},
};

use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
use tracing::{debug, info};

use crate::error::Result;

use super::{AggregatedMetrics, MetricsQuery, MetricsSummary, SystemMetrics};
/// Configuration for the metrics reporter
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReporterConfig {
    /// Whether to enable reporting
    pub enabled: bool,
    /// Report generation interval
    pub report_interval: Duration,
    /// Maximum number of reports to keep in memory
    pub max_reports_in_memory: usize,
    /// Alert thresholds
    pub alert_thresholds: AlertThresholds,
    /// Report output format
    pub default_format: ReportFormat,
    /// Whether to enable alerting
    pub enable_alerts: bool,
    /// Maximum number of alerts to keep in memory
    pub max_alerts_in_memory: usize,
    /// Report output directory
    pub output_directory: Option<String>,
    /// Whether to enable HTTP reporting
    pub enable_http_reporting: bool,
    /// HTTP reporting endpoint
    pub http_endpoint: Option<String>,
}

impl Default for ReporterConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            report_interval: Duration::from_secs(60), // 1 minute
            max_reports_in_memory: 1000,
            alert_thresholds: AlertThresholds::default(),
            default_format: ReportFormat::Json,
            enable_alerts: true,
            max_alerts_in_memory: 1000,
            output_directory: None,
            enable_http_reporting: false,
            http_endpoint: None,
        }
    }
}

/// Alert thresholds for metrics reporting
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlertThresholds {
    /// CPU usage threshold (percentage)
    pub cpu_usage_threshold: f64,
    /// Memory usage threshold (percentage)
    pub memory_usage_threshold: f64,
    /// Disk usage threshold (percentage)
    pub disk_usage_threshold: f64,
    /// System load threshold
    pub system_load_threshold: f64,
    /// Repair failure rate threshold (percentage)
    pub repair_failure_rate_threshold: f64,
    /// Health issues threshold (count)
    pub health_issues_threshold: u64,
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            cpu_usage_threshold: 80.0,
            memory_usage_threshold: 85.0,
            disk_usage_threshold: 90.0,
            system_load_threshold: 5.0,
            repair_failure_rate_threshold: 20.0,
            health_issues_threshold: 10,
        }
    }
}
/// Metrics reporter that generates and outputs metrics reports
pub struct Reporter {
    config: ReporterConfig,
    reports: Arc<RwLock<Vec<MetricsReport>>>,
    alerts: Arc<RwLock<Vec<Alert>>>,
    last_report_time: Arc<RwLock<SystemTime>>,
    report_count: Arc<RwLock<u64>>,
    alert_count: Arc<RwLock<u64>>,
}

impl Reporter {
    /// Create a new metrics reporter
    pub async fn new(config: ReporterConfig) -> Result<Self> {
        Ok(Self {
            config,
            reports: Arc::new(RwLock::new(Vec::new())),
            alerts: Arc::new(RwLock::new(Vec::new())),
            last_report_time: Arc::new(RwLock::new(SystemTime::now())),
            report_count: Arc::new(RwLock::new(0)),
            alert_count: Arc::new(RwLock::new(0)),
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &ReporterConfig {
        &self.config
    }

    /// Generate a metrics report
    pub async fn generate_report(&self, metrics: &SystemMetrics) -> Result<MetricsReport> {
        let start_time = SystemTime::now();

        let report = MetricsReport {
            timestamp: start_time,
            metrics: metrics.clone(),
            alerts: self.check_alerts(metrics).await?,
            summary: self.generate_summary(metrics).await?,
            format: self.config.default_format,
        };

        // Store report
        {
            let mut reports = self.reports.write().await;
            reports.push(report.clone());

            // Trim old reports if we exceed the limit
            if reports.len() > self.config.max_reports_in_memory {
                let excess = reports.len() - self.config.max_reports_in_memory;
                reports.drain(0..excess);
            }
        }

        // Update statistics
        {
            let mut last_time = self.last_report_time.write().await;
            *last_time = start_time;

            let mut count = self.report_count.write().await;
            *count += 1;
        }

        info!("Generated metrics report #{}", *self.report_count.read().await);
        Ok(report)
    }

    /// Generate a comprehensive report from aggregated metrics
    pub async fn generate_comprehensive_report(&self, aggregated: &AggregatedMetrics) -> Result<ComprehensiveReport> {
        let start_time = SystemTime::now();

        let report = ComprehensiveReport {
            timestamp: start_time,
            query: aggregated.query.clone(),
            data_points: aggregated.data_points.len(),
            summary: aggregated.summary.clone(),
            alerts: self.check_aggregated_alerts(aggregated).await?,
            trends: self.analyze_trends(aggregated).await?,
            recommendations: self.generate_recommendations(aggregated).await?,
        };

        info!("Generated comprehensive report with {} data points", report.data_points);
        Ok(report)
    }

    /// Output a report in the specified format
    pub async fn output_report(&self, report: &MetricsReport, format: ReportFormat) -> Result<()> {
        match format {
            ReportFormat::Console => self.output_to_console(report).await?,
            ReportFormat::File => self.output_to_file(report).await?,
            ReportFormat::Http => self.output_to_http(report).await?,
            ReportFormat::Prometheus => self.output_prometheus(report).await?,
            ReportFormat::Json => self.output_json(report).await?,
            ReportFormat::Csv => self.output_csv(report).await?,
        }

        Ok(())
    }
    /// Check for alerts based on metrics
    async fn check_alerts(&self, metrics: &SystemMetrics) -> Result<Vec<Alert>> {
        let mut alerts = Vec::new();

        // Check CPU usage
        if metrics.cpu_usage > self.config.alert_thresholds.cpu_usage_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Warning,
                category: AlertCategory::System,
                message: format!("High CPU usage: {:.1}%", metrics.cpu_usage),
                metric_value: metrics.cpu_usage,
                threshold: self.config.alert_thresholds.cpu_usage_threshold,
            });
        }

        // Check memory usage
        if metrics.memory_usage > self.config.alert_thresholds.memory_usage_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Warning,
                category: AlertCategory::System,
                message: format!("High memory usage: {:.1}%", metrics.memory_usage),
                metric_value: metrics.memory_usage,
                threshold: self.config.alert_thresholds.memory_usage_threshold,
            });
        }

        // Check disk usage
        if metrics.disk_usage > self.config.alert_thresholds.disk_usage_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Critical,
                category: AlertCategory::System,
                message: format!("High disk usage: {:.1}%", metrics.disk_usage),
                metric_value: metrics.disk_usage,
                threshold: self.config.alert_thresholds.disk_usage_threshold,
            });
        }

        // Check system load
        if metrics.system_load > self.config.alert_thresholds.system_load_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Warning,
                category: AlertCategory::System,
                message: format!("High system load: {:.2}", metrics.system_load),
                metric_value: metrics.system_load,
                threshold: self.config.alert_thresholds.system_load_threshold,
            });
        }

        // Check repair failure rate
        if metrics.heal_metrics.total_repairs > 0 {
            let failure_rate = (metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64) * 100.0;
            if failure_rate > self.config.alert_thresholds.repair_failure_rate_threshold {
                alerts.push(Alert {
                    timestamp: SystemTime::now(),
                    severity: AlertSeverity::Critical,
                    category: AlertCategory::Heal,
                    message: format!("High repair failure rate: {:.1}%", failure_rate),
                    metric_value: failure_rate,
                    threshold: self.config.alert_thresholds.repair_failure_rate_threshold,
                });
            }
        }

        // Check health issues
        let total_health_issues: u64 = metrics.health_issues.values().sum();
        if total_health_issues > self.config.alert_thresholds.health_issues_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Warning,
                category: AlertCategory::Health,
                message: format!("High number of health issues: {}", total_health_issues),
                metric_value: total_health_issues as f64,
                threshold: self.config.alert_thresholds.health_issues_threshold as f64,
            });
        }

        // Store alerts
        if !alerts.is_empty() {
            let mut alert_store = self.alerts.write().await;
            alert_store.extend(alerts.clone());

            let mut count = self.alert_count.write().await;
            *count += alerts.len() as u64;
        }

        Ok(alerts)
    }
    /// Check for alerts based on aggregated metrics
    async fn check_aggregated_alerts(&self, aggregated: &AggregatedMetrics) -> Result<Vec<Alert>> {
        let mut alerts = Vec::new();

        // Check summary statistics
        if aggregated.summary.avg_cpu_usage > self.config.alert_thresholds.cpu_usage_threshold {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Warning,
                category: AlertCategory::System,
                message: format!("High average CPU usage: {:.1}%", aggregated.summary.avg_cpu_usage),
                metric_value: aggregated.summary.avg_cpu_usage,
                threshold: self.config.alert_thresholds.cpu_usage_threshold,
            });
        }

        // `repair_success_rate` is a fraction in 0.0..=1.0, so convert it to a
        // percentage before comparing against the percentage-based threshold.
        let success_rate_pct = aggregated.summary.repair_success_rate * 100.0;
        if success_rate_pct < (100.0 - self.config.alert_thresholds.repair_failure_rate_threshold) {
            alerts.push(Alert {
                timestamp: SystemTime::now(),
                severity: AlertSeverity::Critical,
                category: AlertCategory::Heal,
                message: format!("Low repair success rate: {:.1}%", success_rate_pct),
                metric_value: success_rate_pct,
                threshold: 100.0 - self.config.alert_thresholds.repair_failure_rate_threshold,
            });
        }

        Ok(alerts)
    }
    /// Generate summary for metrics
    async fn generate_summary(&self, metrics: &SystemMetrics) -> Result<ReportSummary> {
        Ok(ReportSummary {
            system_health: self.calculate_system_health(metrics),
            performance_score: self.calculate_performance_score(metrics),
            resource_utilization: self.calculate_resource_utilization(metrics),
            operational_status: self.determine_operational_status(metrics),
            key_metrics: self.extract_key_metrics(metrics),
        })
    }

    /// Analyze trends in aggregated data
    async fn analyze_trends(&self, aggregated: &AggregatedMetrics) -> Result<Vec<TrendAnalysis>> {
        let mut trends = Vec::new();

        if aggregated.data_points.len() < 2 {
            return Ok(trends);
        }

        // Analyze CPU usage trend
        let cpu_values: Vec<f64> = aggregated
            .data_points
            .iter()
            .filter_map(|p| p.system.as_ref().map(|s| s.cpu_usage))
            .collect();

        if cpu_values.len() >= 2 {
            let trend = self.calculate_trend(&cpu_values, "CPU Usage");
            trends.push(trend);
        }

        // Analyze memory usage trend
        let memory_values: Vec<f64> = aggregated
            .data_points
            .iter()
            .filter_map(|p| p.system.as_ref().map(|s| s.memory_usage))
            .collect();

        if memory_values.len() >= 2 {
            let trend = self.calculate_trend(&memory_values, "Memory Usage");
            trends.push(trend);
        }

        Ok(trends)
    }
    /// Generate recommendations based on metrics
    async fn generate_recommendations(&self, aggregated: &AggregatedMetrics) -> Result<Vec<Recommendation>> {
        let mut recommendations = Vec::new();

        // Check for high resource usage
        if aggregated.summary.avg_cpu_usage > 70.0 {
            recommendations.push(Recommendation {
                priority: RecommendationPriority::High,
                category: RecommendationCategory::Performance,
                title: "High CPU Usage".to_string(),
                description: "Consider scaling up CPU resources or optimizing workload distribution".to_string(),
                action: "Monitor CPU usage patterns and consider resource allocation adjustments".to_string(),
            });
        }

        if aggregated.summary.avg_memory_usage > 80.0 {
            recommendations.push(Recommendation {
                priority: RecommendationPriority::High,
                category: RecommendationCategory::Performance,
                title: "High Memory Usage".to_string(),
                description: "Memory usage is approaching critical levels".to_string(),
                action: "Consider increasing memory allocation or optimizing memory usage".to_string(),
            });
        }

        // Check for repair issues (success rate is a fraction in 0.0..=1.0)
        if aggregated.summary.repair_success_rate < 0.8 {
            recommendations.push(Recommendation {
                priority: RecommendationPriority::Critical,
                category: RecommendationCategory::Reliability,
                title: "Low Repair Success Rate".to_string(),
                description: "Data repair operations are failing frequently".to_string(),
                action: "Investigate repair failures and check system health".to_string(),
            });
        }

        Ok(recommendations)
    }
    /// Calculate trend for a series of values
    fn calculate_trend(&self, values: &[f64], metric_name: &str) -> TrendAnalysis {
        if values.len() < 2 {
            return TrendAnalysis {
                metric_name: metric_name.to_string(),
                trend_direction: TrendDirection::Stable,
                change_rate: 0.0,
                confidence: 0.0,
            };
        }

        let first = values[0];
        let last = values[values.len() - 1];
        // Guard against division by zero when the first sample is 0.
        let change_rate = if first.abs() < f64::EPSILON {
            0.0
        } else {
            ((last - first) / first) * 100.0
        };

        let trend_direction = if change_rate > 5.0 {
            TrendDirection::Increasing
        } else if change_rate < -5.0 {
            TrendDirection::Decreasing
        } else {
            TrendDirection::Stable
        };

        // Simple confidence calculation based on data points
        let confidence = (values.len() as f64 / 10.0).min(1.0);

        TrendAnalysis {
            metric_name: metric_name.to_string(),
            trend_direction,
            change_rate,
            confidence,
        }
    }
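
    // Worked example (illustrative): values = [50.0, 60.0] gives
    // change_rate = ((60 - 50) / 50) * 100 = 20.0% => TrendDirection::Increasing,
    // and confidence = (2 / 10).min(1.0) = 0.2 with only two data points.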
    /// Calculate system health score
    fn calculate_system_health(&self, metrics: &SystemMetrics) -> f64 {
        let mut score = 100.0;

        // Deduct points for high resource usage
        if metrics.cpu_usage > 80.0 {
            score -= (metrics.cpu_usage - 80.0) * 0.5;
        }
        if metrics.memory_usage > 85.0 {
            score -= (metrics.memory_usage - 85.0) * 0.5;
        }
        if metrics.disk_usage > 90.0 {
            score -= (metrics.disk_usage - 90.0) * 1.0;
        }

        // Deduct points for health issues
        let total_health_issues: u64 = metrics.health_issues.values().sum();
        score -= total_health_issues as f64 * 5.0;

        // Deduct points for repair failures
        if metrics.heal_metrics.total_repairs > 0 {
            let failure_rate = metrics.heal_metrics.failed_repairs as f64 / metrics.heal_metrics.total_repairs as f64;
            score -= failure_rate * 20.0;
        }

        score.max(0.0)
    }
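
    // Worked example (illustrative): cpu 90%, memory 90%, disk 95%, two health
    // issues, and 10 repairs with 2 failures score as
    // 100 - (90-80)*0.5 - (90-85)*0.5 - (95-90)*1.0 - 2*5.0 - (2/10)*20.0 = 73.5.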
    /// Calculate performance score
    fn calculate_performance_score(&self, metrics: &SystemMetrics) -> f64 {
        let mut score = 100.0;

        // Base score on resource efficiency
        score -= metrics.cpu_usage * 0.3;
        score -= metrics.memory_usage * 0.3;
        score -= metrics.disk_usage * 0.2;
        score -= metrics.system_load * 10.0;

        score.max(0.0)
    }

    /// Calculate resource utilization
    fn calculate_resource_utilization(&self, metrics: &SystemMetrics) -> f64 {
        (metrics.cpu_usage + metrics.memory_usage + metrics.disk_usage) / 3.0
    }

    /// Determine operational status
    fn determine_operational_status(&self, metrics: &SystemMetrics) -> OperationalStatus {
        let health_score = self.calculate_system_health(metrics);

        if health_score >= 90.0 {
            OperationalStatus::Excellent
        } else if health_score >= 75.0 {
            OperationalStatus::Good
        } else if health_score >= 50.0 {
            OperationalStatus::Fair
        } else {
            OperationalStatus::Poor
        }
    }

    /// Extract key metrics
    fn extract_key_metrics(&self, metrics: &SystemMetrics) -> HashMap<String, f64> {
        let mut key_metrics = HashMap::new();
        key_metrics.insert("cpu_usage".to_string(), metrics.cpu_usage);
        key_metrics.insert("memory_usage".to_string(), metrics.memory_usage);
        key_metrics.insert("disk_usage".to_string(), metrics.disk_usage);
        key_metrics.insert("system_load".to_string(), metrics.system_load);
        key_metrics.insert("active_operations".to_string(), metrics.active_operations as f64);
        key_metrics.insert("objects_scanned".to_string(), metrics.scan_metrics.objects_scanned as f64);
        key_metrics.insert("total_repairs".to_string(), metrics.heal_metrics.total_repairs as f64);
        key_metrics.insert("successful_repairs".to_string(), metrics.heal_metrics.successful_repairs as f64);

        key_metrics
    }

    /// Output methods (simulated)
    async fn output_to_console(&self, report: &MetricsReport) -> Result<()> {
        if self.config.enabled {
            info!("=== Metrics Report ===");
            info!("Timestamp: {:?}", report.timestamp);
            info!("System Health: {:.1}%", report.summary.system_health);
            info!("Performance Score: {:.1}%", report.summary.performance_score);
            info!("Operational Status: {:?}", report.summary.operational_status);

            if !report.alerts.is_empty() {
                info!("=== Alerts ===");
                for alert in &report.alerts {
                    info!("[{}] {}: {}", alert.severity, alert.category, alert.message);
                }
            }
        }
        Ok(())
    }

    async fn output_to_file(&self, _report: &MetricsReport) -> Result<()> {
        if self.config.enabled {
            // In a real implementation, this would write to a file
            debug!("Would write report to file: {}", self.config.output_directory.as_deref().unwrap_or(""));
        }
        Ok(())
    }

    async fn output_to_http(&self, _report: &MetricsReport) -> Result<()> {
        if self.config.enable_http_reporting {
            // In a real implementation, this would serve via HTTP
            debug!("Would serve report via HTTP on endpoint: {}", self.config.http_endpoint.as_deref().unwrap_or(""));
        }
        Ok(())
    }

    async fn output_prometheus(&self, _report: &MetricsReport) -> Result<()> {
        if self.config.enabled {
            // In a real implementation, this would output Prometheus format
            debug!("Would output Prometheus format");
        }
        Ok(())
    }

    async fn output_json(&self, _report: &MetricsReport) -> Result<()> {
        if self.config.enabled {
            // In a real implementation, this would output JSON format
            debug!("Would output JSON format");
        }
        Ok(())
    }

    async fn output_csv(&self, _report: &MetricsReport) -> Result<()> {
        if self.config.enabled {
            // In a real implementation, this would output CSV format
            debug!("Would output CSV format");
        }
        Ok(())
    }

    /// Get reporting statistics
    pub async fn get_statistics(&self) -> ReporterStatistics {
        let report_count = *self.report_count.read().await;
        let alert_count = *self.alert_count.read().await;
        let last_report_time = *self.last_report_time.read().await;
        let reports_count = self.reports.read().await.len();
        let alerts_count = self.alerts.read().await.len();

        ReporterStatistics {
            total_reports: report_count,
            total_alerts: alert_count,
            reports_in_memory: reports_count,
            alerts_in_memory: alerts_count,
            last_report_time,
            config: self.config.clone(),
        }
    }

    /// Get recent alerts
    pub async fn get_recent_alerts(&self, hours: u64) -> Result<Vec<Alert>> {
        let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600);
        let alerts = self.alerts.read().await;

        let recent_alerts: Vec<Alert> = alerts
            .iter()
            .filter(|alert| alert.timestamp >= cutoff_time)
            .cloned()
            .collect();

        Ok(recent_alerts)
    }

    /// Clear old alerts
    pub async fn clear_old_alerts(&self, hours: u64) -> Result<()> {
        let cutoff_time = SystemTime::now() - Duration::from_secs(hours * 3600);
        let mut alerts = self.alerts.write().await;
        alerts.retain(|alert| alert.timestamp >= cutoff_time);

        info!("Cleared alerts older than {} hours", hours);
        Ok(())
    }
}
/// Metrics report
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricsReport {
|
||||
pub timestamp: SystemTime,
|
||||
pub metrics: SystemMetrics,
|
||||
pub alerts: Vec<Alert>,
|
||||
pub summary: ReportSummary,
|
||||
pub format: ReportFormat,
|
||||
}
|
||||
|
||||
/// Comprehensive report
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ComprehensiveReport {
|
||||
pub timestamp: SystemTime,
|
||||
pub query: MetricsQuery,
|
||||
pub data_points: usize,
|
||||
pub summary: MetricsSummary,
|
||||
pub alerts: Vec<Alert>,
|
||||
pub trends: Vec<TrendAnalysis>,
|
||||
pub recommendations: Vec<Recommendation>,
|
||||
}
|
||||
|
||||
/// Report summary
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ReportSummary {
|
||||
pub system_health: f64,
|
||||
pub performance_score: f64,
|
||||
pub resource_utilization: f64,
|
||||
pub operational_status: OperationalStatus,
|
||||
pub key_metrics: HashMap<String, f64>,
|
||||
}
|
||||
|
||||
/// Alert
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Alert {
|
||||
pub timestamp: SystemTime,
|
||||
pub severity: AlertSeverity,
|
||||
pub category: AlertCategory,
|
||||
pub message: String,
|
||||
pub metric_value: f64,
|
||||
pub threshold: f64,
|
||||
}
|
||||
|
||||
/// Alert severity
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AlertSeverity {
|
||||
Info,
|
||||
Warning,
|
||||
Critical,
|
||||
}
|
||||
|
||||
impl fmt::Display for AlertSeverity {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
AlertSeverity::Info => write!(f, "INFO"),
|
||||
AlertSeverity::Warning => write!(f, "WARNING"),
|
||||
AlertSeverity::Critical => write!(f, "CRITICAL"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Alert category
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum AlertCategory {
|
||||
System,
|
||||
Performance,
|
||||
Health,
|
||||
Heal,
|
||||
Security,
|
||||
}
|
||||
|
||||
impl fmt::Display for AlertCategory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
AlertCategory::System => write!(f, "SYSTEM"),
|
||||
AlertCategory::Performance => write!(f, "PERFORMANCE"),
|
||||
AlertCategory::Health => write!(f, "HEALTH"),
|
||||
AlertCategory::Heal => write!(f, "HEAL"),
|
||||
AlertCategory::Security => write!(f, "SECURITY"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Report format
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum ReportFormat {
|
||||
Console,
|
||||
File,
|
||||
Http,
|
||||
Prometheus,
|
||||
Json,
|
||||
Csv,
|
||||
}
|
||||
|
||||
/// Operational status
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum OperationalStatus {
|
||||
Excellent,
|
||||
Good,
|
||||
Fair,
|
||||
Poor,
|
||||
}
|
||||
|
||||
/// Trend analysis
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TrendAnalysis {
|
||||
pub metric_name: String,
|
||||
pub trend_direction: TrendDirection,
|
||||
pub change_rate: f64,
|
||||
pub confidence: f64,
|
||||
}
|
||||
|
||||
/// Trend direction
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum TrendDirection {
|
||||
Increasing,
|
||||
Decreasing,
|
||||
Stable,
|
||||
}
|
||||
|
||||
/// Recommendation
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Recommendation {
|
||||
pub priority: RecommendationPriority,
|
||||
pub category: RecommendationCategory,
|
||||
pub title: String,
|
||||
pub description: String,
|
||||
pub action: String,
|
||||
}
|
||||
|
||||
/// Recommendation priority
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum RecommendationPriority {
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
Critical,
|
||||
}
|
||||
|
||||
/// Recommendation category
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum RecommendationCategory {
|
||||
Performance,
|
||||
Reliability,
|
||||
Security,
|
||||
Maintenance,
|
||||
}
|
||||
|
||||
/// Reporter statistics
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ReporterStatistics {
|
||||
pub total_reports: u64,
|
||||
pub total_alerts: u64,
|
||||
pub reports_in_memory: usize,
|
||||
pub alerts_in_memory: usize,
|
||||
pub last_report_time: SystemTime,
|
||||
pub config: ReporterConfig,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_reporter_creation() {
|
||||
let config = ReporterConfig::default();
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
assert_eq!(reporter.config().report_interval, Duration::from_secs(60));
|
||||
assert!(reporter.config().enabled);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_report_generation() {
|
||||
let config = ReporterConfig::default();
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
let metrics = SystemMetrics::default();
|
||||
let report = reporter.generate_report(&metrics).await.unwrap();
|
||||
|
||||
assert_eq!(report.metrics.cpu_usage, 0.0);
|
||||
assert_eq!(report.alerts.len(), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alert_generation() {
|
||||
let config = ReporterConfig {
|
||||
alert_thresholds: AlertThresholds {
|
||||
cpu_usage_threshold: 50.0,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
let mut metrics = SystemMetrics::default();
|
||||
metrics.cpu_usage = 75.0; // Above threshold
|
||||
|
||||
let report = reporter.generate_report(&metrics).await.unwrap();
|
||||
assert!(!report.alerts.is_empty());
|
||||
assert_eq!(report.alerts[0].severity, AlertSeverity::Warning);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_comprehensive_report() {
|
||||
let config = ReporterConfig::default();
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
let aggregated = AggregatedMetrics {
|
||||
query: MetricsQuery {
|
||||
start_time: SystemTime::now(),
|
||||
end_time: SystemTime::now() + Duration::from_secs(3600),
|
||||
interval: Duration::from_secs(60),
|
||||
metrics: vec![],
|
||||
severity_filter: None,
|
||||
limit: None,
|
||||
},
|
||||
data_points: vec![],
|
||||
summary: MetricsSummary::default(),
|
||||
};
|
||||
|
||||
let report = reporter.generate_comprehensive_report(&aggregated).await.unwrap();
|
||||
assert_eq!(report.data_points, 0);
|
||||
assert!(report.recommendations.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_reporter_statistics() {
|
||||
let config = ReporterConfig::default();
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
let stats = reporter.get_statistics().await;
|
||||
assert_eq!(stats.total_reports, 0);
|
||||
assert_eq!(stats.total_alerts, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_alert_clearing() {
|
||||
let config = ReporterConfig::default();
|
||||
let reporter = Reporter::new(config).await.unwrap();
|
||||
|
||||
// Generate some alerts
|
||||
let mut metrics = SystemMetrics::default();
|
||||
metrics.cpu_usage = 90.0; // Above threshold
|
||||
|
||||
let _report = reporter.generate_report(&metrics).await.unwrap();
|
||||
|
||||
// Clear old alerts
|
||||
reporter.clear_old_alerts(1).await.unwrap();
|
||||
|
||||
let stats = reporter.get_statistics().await;
|
||||
assert_eq!(stats.alerts_in_memory, 0);
|
||||
}
|
||||
}
|
||||
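The reporter's sinks above are simulated, but its public surface (`generate_report`, `clear_old_alerts`, `get_statistics`) is enough to drive a periodic reporting loop. A minimal sketch of such a loop; the module paths and the `SystemMetrics::default()` snapshot are assumptions for illustration, not part of this commit:

```rust
use std::time::Duration;

use crate::metrics::reporter::Reporter; // path assumed from this commit's module layout
use crate::metrics::SystemMetrics;      // assumed re-export from metrics/mod.rs

/// Illustrative reporting loop; not part of the commit.
async fn reporting_loop(reporter: Reporter) -> crate::error::Result<()> {
    loop {
        // Placeholder snapshot; a real loop would pull from the metrics collector.
        let metrics = SystemMetrics::default();
        let report = reporter.generate_report(&metrics).await?;

        // Surface whatever the alert thresholds flagged.
        for alert in &report.alerts {
            println!("[{}] {}: {}", alert.severity, alert.category, alert.message);
        }

        // Keep at most the last 24 hours of alerts in memory.
        reporter.clear_old_alerts(24).await?;
        tokio::time::sleep(Duration::from_secs(60)).await;
    }
}
```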
573
crates/ahm/src/metrics/storage.rs
Normal file
@@ -0,0 +1,573 @@
// Copyright 2024 RustFS Team

use std::{
    collections::HashMap,
    sync::Arc,
    path::PathBuf,
    time::{Duration, Instant, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{debug, info};

use crate::error::Result;

use super::{AggregatedMetrics, MetricsQuery, MetricsSummary, SystemMetrics};

/// Configuration for metrics storage
#[derive(Debug, Clone)]
pub struct StorageConfig {
    /// Storage directory path
    pub storage_path: PathBuf,
    /// Maximum file size for metrics files
    pub max_file_size: u64,
    /// Compression enabled
    pub compression_enabled: bool,
    /// Retention period for metrics data
    pub retention_period: Duration,
    /// Batch size for writes
    pub batch_size: usize,
    /// Flush interval
    pub flush_interval: Duration,
    /// Whether to enable data validation
    pub enable_validation: bool,
    /// Whether to enable data encryption
    pub enable_encryption: bool,
    /// Encryption key (if enabled)
    pub encryption_key: Option<String>,
}

impl Default for StorageConfig {
    fn default() -> Self {
        Self {
            storage_path: PathBuf::from("/tmp/rustfs/metrics"),
            max_file_size: 100 * 1024 * 1024, // 100 MB
            compression_enabled: true,
            retention_period: Duration::from_secs(86400 * 30), // 30 days
            batch_size: 1000,
            flush_interval: Duration::from_secs(60), // 1 minute
            enable_validation: true,
            enable_encryption: false,
            encryption_key: None,
        }
    }
}

/// Metrics storage that persists metrics data to disk
pub struct Storage {
    config: StorageConfig,
    metrics_buffer: Arc<RwLock<Vec<SystemMetrics>>>,
    aggregated_buffer: Arc<RwLock<Vec<AggregatedMetrics>>>,
    file_handles: Arc<RwLock<HashMap<String, std::fs::File>>>,
    last_flush_time: Arc<RwLock<SystemTime>>,
    total_writes: Arc<RwLock<u64>>,
    total_reads: Arc<RwLock<u64>>,
}

impl Storage {
    /// Create a new metrics storage
    pub async fn new(config: StorageConfig) -> Result<Self> {
        // Create storage directory if it doesn't exist
        tokio::fs::create_dir_all(&config.storage_path).await?;

        Ok(Self {
            config,
            metrics_buffer: Arc::new(RwLock::new(Vec::new())),
            aggregated_buffer: Arc::new(RwLock::new(Vec::new())),
            file_handles: Arc::new(RwLock::new(HashMap::new())),
            last_flush_time: Arc::new(RwLock::new(SystemTime::now())),
            total_writes: Arc::new(RwLock::new(0)),
            total_reads: Arc::new(RwLock::new(0)),
        })
    }

    /// Get the configuration
    pub fn config(&self) -> &StorageConfig {
        &self.config
    }

    /// Store system metrics
    pub async fn store_metrics(&self, metrics: SystemMetrics) -> Result<()> {
        // Push under the lock, but release it before flushing:
        // flush_metrics_buffer re-acquires the write lock and would
        // otherwise deadlock against the guard held here.
        let should_flush = {
            let mut buffer = self.metrics_buffer.write().await;
            buffer.push(metrics);
            buffer.len() >= self.config.batch_size
        };

        // Flush if buffer is full
        if should_flush {
            self.flush_metrics_buffer().await?;
        }

        // Update write count
        {
            let mut writes = self.total_writes.write().await;
            *writes += 1;
        }

        Ok(())
    }

    /// Store aggregated metrics
    pub async fn store_aggregated_metrics(&self, aggregated: AggregatedMetrics) -> Result<()> {
        // Same locking pattern as store_metrics: flush outside the guard.
        let should_flush = {
            let mut buffer = self.aggregated_buffer.write().await;
            buffer.push(aggregated);
            buffer.len() >= self.config.batch_size
        };

        // Flush if buffer is full
        if should_flush {
            self.flush_aggregated_buffer().await?;
        }

        Ok(())
    }

    /// Retrieve metrics for a time range
    pub async fn retrieve_metrics(&self, query: &MetricsQuery) -> Result<Vec<SystemMetrics>> {
        let start_time = Instant::now();

        // Update read count
        {
            let mut reads = self.total_reads.write().await;
            *reads += 1;
        }

        // In a real implementation, this would read from disk files
        // For now, we'll return data from the buffer
        let buffer = self.metrics_buffer.read().await;
        let filtered_metrics: Vec<SystemMetrics> = buffer
            .iter()
            .filter(|m| m.timestamp >= query.start_time && m.timestamp <= query.end_time)
            .cloned()
            .collect();

        let retrieval_time = start_time.elapsed();
        debug!("Metrics retrieval completed in {:?}", retrieval_time);

        Ok(filtered_metrics)
    }

    /// Retrieve aggregated metrics
    pub async fn retrieve_aggregated_metrics(&self, query: &MetricsQuery) -> Result<Vec<AggregatedMetrics>> {
        let buffer = self.aggregated_buffer.read().await;
        let filtered_metrics: Vec<AggregatedMetrics> = buffer
            .iter()
            .filter(|m| {
                if let Some(first_point) = m.data_points.first() {
                    first_point.timestamp >= query.start_time
                } else {
                    false
                }
            })
            .filter(|m| {
                if let Some(last_point) = m.data_points.last() {
                    last_point.timestamp <= query.end_time
                } else {
                    false
                }
            })
            .cloned()
            .collect();

        Ok(filtered_metrics)
    }

    /// Flush metrics buffer to disk
    async fn flush_metrics_buffer(&self) -> Result<()> {
        let mut buffer = self.metrics_buffer.write().await;
        if buffer.is_empty() {
            return Ok(());
        }

        let metrics_to_write = buffer.drain(..).collect::<Vec<_>>();
        drop(buffer); // Release lock

        // Write to file
        self.write_metrics_to_file(&metrics_to_write).await?;

        // Update flush time
        {
            let mut last_flush = self.last_flush_time.write().await;
            *last_flush = SystemTime::now();
        }

        info!("Flushed {} metrics to disk", metrics_to_write.len());
        Ok(())
    }

    /// Flush aggregated buffer to disk
    async fn flush_aggregated_buffer(&self) -> Result<()> {
        let mut buffer = self.aggregated_buffer.write().await;
        if buffer.is_empty() {
            return Ok(());
        }

        let aggregated_to_write = buffer.drain(..).collect::<Vec<_>>();
        drop(buffer); // Release lock

        // Write to file
        self.write_aggregated_to_file(&aggregated_to_write).await?;

        info!("Flushed {} aggregated metrics to disk", aggregated_to_write.len());
        Ok(())
    }

    /// Write metrics to file
    async fn write_metrics_to_file(&self, metrics: &[SystemMetrics]) -> Result<()> {
        let filename = format!(
            "metrics_{}.json",
            SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()
        );
        let filepath = self.config.storage_path.join(filename);

        // In a real implementation, this would write to a file
        // For now, we'll just simulate the write
        debug!("Would write {} metrics to {}", metrics.len(), filepath.display());

        Ok(())
    }

    /// Write aggregated metrics to file
    async fn write_aggregated_to_file(&self, aggregated: &[AggregatedMetrics]) -> Result<()> {
        let filename = format!(
            "aggregated_{}.json",
            SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs()
        );
        let filepath = self.config.storage_path.join(filename);

        // In a real implementation, this would write to a file
        // For now, we'll just simulate the write
        debug!("Would write {} aggregated metrics to {}", aggregated.len(), filepath.display());

        Ok(())
    }

    /// Force flush all buffers
    pub async fn force_flush(&self) -> Result<()> {
        self.flush_metrics_buffer().await?;
        self.flush_aggregated_buffer().await?;

        info!("Force flush completed");
        Ok(())
    }

    /// Clean up old data based on retention policy
    pub async fn cleanup_old_data(&self) -> Result<()> {
        let cutoff_time = SystemTime::now() - self.config.retention_period;

        // Clean up metrics buffer
        {
            let mut buffer = self.metrics_buffer.write().await;
            buffer.retain(|m| m.timestamp >= cutoff_time);
        }

        // Clean up aggregated buffer
        {
            let mut buffer = self.aggregated_buffer.write().await;
            buffer.retain(|m| {
                if let Some(first_point) = m.data_points.first() {
                    first_point.timestamp >= cutoff_time
                } else {
                    false
                }
            });
        }

        // In a real implementation, this would also clean up old files
        info!("Cleanup completed, removed data older than {:?}", cutoff_time);
        Ok(())
    }

    /// Get storage statistics
    pub async fn get_statistics(&self) -> StorageStatistics {
        let metrics_count = self.metrics_buffer.read().await.len();
        let aggregated_count = self.aggregated_buffer.read().await.len();
        let total_writes = *self.total_writes.read().await;
        let total_reads = *self.total_reads.read().await;
        let last_flush_time = *self.last_flush_time.read().await;

        StorageStatistics {
            metrics_in_buffer: metrics_count,
            aggregated_in_buffer: aggregated_count,
            total_writes,
            total_reads,
            last_flush_time,
            config: self.config.clone(),
        }
    }

    /// Validate stored data integrity
    pub async fn validate_data_integrity(&self) -> Result<DataIntegrityReport> {
        if !self.config.enable_validation {
            return Ok(DataIntegrityReport {
                is_valid: true,
                total_records: 0,
                corrupted_records: 0,
                validation_time: Duration::ZERO,
                errors: Vec::new(),
            });
        }

        let start_time = Instant::now();
        let mut errors = Vec::new();
        let mut corrupted_records = 0;

        // Validate metrics buffer
        {
            let buffer = self.metrics_buffer.read().await;
            for (i, metric) in buffer.iter().enumerate() {
                if !self.validate_metric(metric) {
                    errors.push(format!("Invalid metric at index {}: {:?}", i, metric));
                    corrupted_records += 1;
                }
            }
        }

        // Validate aggregated buffer
        {
            let buffer = self.aggregated_buffer.read().await;
            for (i, aggregated) in buffer.iter().enumerate() {
                if !self.validate_aggregated(aggregated) {
                    errors.push(format!("Invalid aggregated metrics at index {}: {:?}", i, aggregated));
                    corrupted_records += 1;
                }
            }
        }

        let validation_time = start_time.elapsed();
        let total_records = {
            let metrics_count = self.metrics_buffer.read().await.len();
            let aggregated_count = self.aggregated_buffer.read().await.len();
            metrics_count + aggregated_count
        };

        let is_valid = corrupted_records == 0;

        Ok(DataIntegrityReport {
            is_valid,
            total_records,
            corrupted_records,
            validation_time,
            errors,
        })
    }

    /// Validate a single metric
    fn validate_metric(&self, metric: &SystemMetrics) -> bool {
        // Basic validation checks
        metric.cpu_usage >= 0.0 && metric.cpu_usage <= 100.0
            && metric.memory_usage >= 0.0 && metric.memory_usage <= 100.0
            && metric.disk_usage >= 0.0 && metric.disk_usage <= 100.0
            && metric.system_load >= 0.0
    }

    /// Validate aggregated metrics
    fn validate_aggregated(&self, aggregated: &AggregatedMetrics) -> bool {
        // Basic validation checks
        !aggregated.data_points.is_empty()
            && aggregated.query.start_time <= aggregated.query.end_time
            && aggregated.summary.total_points > 0
    }

    /// Backup metrics data
    pub async fn backup_data(&self, backup_path: &PathBuf) -> Result<BackupReport> {
        let start_time = Instant::now();

        // Create backup directory
        tokio::fs::create_dir_all(backup_path).await?;

        // In a real implementation, this would copy files to backup location
        // For now, we'll just simulate the backup
        let metrics_count = self.metrics_buffer.read().await.len();
        let aggregated_count = self.aggregated_buffer.read().await.len();

        let backup_time = start_time.elapsed();

        Ok(BackupReport {
            backup_path: backup_path.clone(),
            metrics_backed_up: metrics_count,
            aggregated_backed_up: aggregated_count,
            backup_time,
            success: true,
        })
    }

    /// Restore metrics data from backup
    pub async fn restore_data(&self, backup_path: &PathBuf) -> Result<RestoreReport> {
        let start_time = Instant::now();

        // In a real implementation, this would restore from backup files
        // For now, we'll just simulate the restore
        debug!("Would restore data from {}", backup_path.display());

        let restore_time = start_time.elapsed();

        Ok(RestoreReport {
            backup_path: backup_path.clone(),
            metrics_restored: 0,
            aggregated_restored: 0,
            restore_time,
            success: true,
        })
    }
}

/// Storage statistics
#[derive(Debug, Clone)]
pub struct StorageStatistics {
    pub metrics_in_buffer: usize,
    pub aggregated_in_buffer: usize,
    pub total_writes: u64,
    pub total_reads: u64,
    pub last_flush_time: SystemTime,
    pub config: StorageConfig,
}

/// Data integrity validation report
#[derive(Debug, Clone)]
pub struct DataIntegrityReport {
    pub is_valid: bool,
    pub total_records: usize,
    pub corrupted_records: usize,
    pub validation_time: Duration,
    pub errors: Vec<String>,
}

/// Backup report
#[derive(Debug, Clone)]
pub struct BackupReport {
    pub backup_path: PathBuf,
    pub metrics_backed_up: usize,
    pub aggregated_backed_up: usize,
    pub backup_time: Duration,
    pub success: bool,
}

/// Restore report
#[derive(Debug, Clone)]
pub struct RestoreReport {
    pub backup_path: PathBuf,
    pub metrics_restored: usize,
    pub aggregated_restored: usize,
    pub restore_time: Duration,
    pub success: bool,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_storage_creation() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        assert_eq!(storage.config().batch_size, 1000);
        assert!(storage.config().compression_enabled);
    }

    #[tokio::test]
    async fn test_metrics_storage() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        let metrics = SystemMetrics::default();
        storage.store_metrics(metrics).await.unwrap();

        let stats = storage.get_statistics().await;
        assert_eq!(stats.metrics_in_buffer, 1);
        assert_eq!(stats.total_writes, 1);
    }

    #[tokio::test]
    async fn test_aggregated_storage() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        let aggregated = AggregatedMetrics {
            query: MetricsQuery {
                start_time: SystemTime::now(),
                end_time: SystemTime::now() + Duration::from_secs(3600),
                interval: Duration::from_secs(60),
                metrics: vec![],
                severity_filter: None,
                limit: None,
            },
            data_points: vec![],
            summary: MetricsSummary::default(),
        };

        storage.store_aggregated_metrics(aggregated).await.unwrap();

        let stats = storage.get_statistics().await;
        assert_eq!(stats.aggregated_in_buffer, 1);
    }

    #[tokio::test]
    async fn test_metrics_retrieval() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        // Use one base timestamp for both the stores and the query so the
        // range filter cannot be tripped up by the clock moving between calls
        let base_time = SystemTime::now();
        for i in 0..5 {
            let mut metrics = SystemMetrics::default();
            metrics.timestamp = base_time + Duration::from_secs(i * 60);
            storage.store_metrics(metrics).await.unwrap();
        }

        let query = MetricsQuery {
            start_time: base_time,
            end_time: base_time + Duration::from_secs(300),
            interval: Duration::from_secs(60),
            metrics: vec![],
            severity_filter: None,
            limit: None,
        };

        let retrieved = storage.retrieve_metrics(&query).await.unwrap();
        assert_eq!(retrieved.len(), 5);
    }

    #[tokio::test]
    async fn test_data_integrity_validation() {
        let config = StorageConfig {
            enable_validation: true,
            ..Default::default()
        };
        let storage = Storage::new(config).await.unwrap();

        let report = storage.validate_data_integrity().await.unwrap();
        assert!(report.is_valid);
        assert_eq!(report.corrupted_records, 0);
    }

    #[tokio::test]
    async fn test_force_flush() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        // Add some data
        storage.store_metrics(SystemMetrics::default()).await.unwrap();

        // Force flush
        storage.force_flush().await.unwrap();

        let stats = storage.get_statistics().await;
        assert_eq!(stats.metrics_in_buffer, 0);
    }

    #[tokio::test]
    async fn test_cleanup_old_data() {
        let config = StorageConfig::default();
        let storage = Storage::new(config).await.unwrap();

        // Add some old data
        let mut old_metrics = SystemMetrics::default();
        old_metrics.timestamp = SystemTime::now() - Duration::from_secs(86400 * 31); // 31 days old
        storage.store_metrics(old_metrics).await.unwrap();

        // Add some recent data
        let mut recent_metrics = SystemMetrics::default();
        recent_metrics.timestamp = SystemTime::now();
        storage.store_metrics(recent_metrics).await.unwrap();

        // Cleanup
        storage.cleanup_old_data().await.unwrap();

        let stats = storage.get_statistics().await;
        assert_eq!(stats.metrics_in_buffer, 1); // Only recent data should remain
    }
}
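The storage type buffers writes and flushes in batches, so the typical call pattern is: store snapshots, force a flush at shutdown, and query by time range. A minimal sketch of that flow; the module paths are assumed from this commit's layout, and the `MetricsQuery` fields mirror the tests above:

```rust
use std::time::{Duration, SystemTime};

use crate::metrics::storage::Storage;          // path assumed from the file layout
use crate::metrics::{MetricsQuery, SystemMetrics}; // assumed re-exports

/// Illustrative store-flush-query round trip; not part of the commit.
async fn store_and_query(storage: &Storage) -> crate::error::Result<()> {
    // Buffered write; flushes automatically once batch_size entries accumulate.
    storage.store_metrics(SystemMetrics::default()).await?;

    // Make the buffered data durable now rather than waiting for the batch threshold.
    storage.force_flush().await?;

    // Query the last hour of snapshots.
    let query = MetricsQuery {
        start_time: SystemTime::now() - Duration::from_secs(3600),
        end_time: SystemTime::now(),
        interval: Duration::from_secs(60),
        metrics: vec![],
        severity_filter: None,
        limit: None,
    };
    let recent = storage.retrieve_metrics(&query).await?;
    println!("retrieved {} snapshots", recent.len());
    Ok(())
}
```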
508
crates/ahm/src/policy/heal_policy.rs
Normal file
@@ -0,0 +1,508 @@
// Copyright 2024 RustFS Team

use std::time::{Duration, SystemTime};

use crate::scanner::{HealthIssue, Severity};

use super::{PolicyContext, PolicyResult, ResourceUsage};

/// Configuration for heal policies
#[derive(Debug, Clone)]
pub struct HealPolicyConfig {
    /// Maximum number of concurrent repairs
    pub max_concurrent_repairs: usize,
    /// Maximum repair duration per operation
    pub max_repair_duration: Duration,
    /// Minimum interval between repairs
    pub min_repair_interval: Duration,
    /// Maximum system load threshold for healing
    pub max_system_load: f64,
    /// Minimum available disk space percentage for healing
    pub min_disk_space: f64,
    /// Maximum number of active operations for healing
    pub max_active_operations: u64,
    /// Whether to enable automatic healing
    pub auto_heal_enabled: bool,
    /// Priority-based healing configuration
    pub priority_config: HealPriorityConfig,
    /// Resource-based healing configuration
    pub resource_config: HealResourceConfig,
    /// Retry configuration
    pub retry_config: HealRetryConfig,
}

/// Priority-based healing configuration
#[derive(Debug, Clone)]
pub struct HealPriorityConfig {
    /// Whether to enable priority-based healing
    pub enabled: bool,
    /// Whether critical issues heal immediately
    pub critical_immediate: bool,
    /// High priority issues heal within this window
    pub high_timeout: Duration,
    /// Medium priority issues heal within this window
    pub medium_timeout: Duration,
    /// Low priority issues heal within this window
    pub low_timeout: Duration,
}

/// Resource-based healing configuration
#[derive(Debug, Clone)]
pub struct HealResourceConfig {
    /// Maximum CPU usage for healing
    pub max_cpu_usage: f64,
    /// Maximum memory usage for healing
    pub max_memory_usage: f64,
    /// Maximum disk I/O usage for healing
    pub max_disk_io_usage: f64,
    /// Maximum network I/O usage for healing
    pub max_network_io_usage: f64,
    /// Whether to enable resource-based throttling
    pub enable_throttling: bool,
}

/// Retry configuration for healing
#[derive(Debug, Clone)]
pub struct HealRetryConfig {
    /// Maximum number of retry attempts
    pub max_retry_attempts: u32,
    /// Initial backoff delay
    pub initial_backoff: Duration,
    /// Maximum backoff delay
    pub max_backoff: Duration,
    /// Backoff multiplier
    pub backoff_multiplier: f64,
    /// Whether to use exponential backoff
    pub exponential_backoff: bool,
}

impl Default for HealPolicyConfig {
    fn default() -> Self {
        Self {
            max_concurrent_repairs: 4,
            max_repair_duration: Duration::from_secs(1800), // 30 minutes
            min_repair_interval: Duration::from_secs(60),   // 1 minute
            max_system_load: 0.7,
            min_disk_space: 15.0, // 15% minimum disk space
            max_active_operations: 50,
            auto_heal_enabled: true,
            priority_config: HealPriorityConfig::default(),
            resource_config: HealResourceConfig::default(),
            retry_config: HealRetryConfig::default(),
        }
    }
}

impl Default for HealPriorityConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            critical_immediate: true,
            high_timeout: Duration::from_secs(300),    // 5 minutes
            medium_timeout: Duration::from_secs(1800), // 30 minutes
            low_timeout: Duration::from_secs(3600),    // 1 hour
        }
    }
}

impl Default for HealResourceConfig {
    fn default() -> Self {
        Self {
            max_cpu_usage: 80.0,
            max_memory_usage: 80.0,
            max_disk_io_usage: 70.0,
            max_network_io_usage: 70.0,
            enable_throttling: true,
        }
    }
}

impl Default for HealRetryConfig {
    fn default() -> Self {
        Self {
            max_retry_attempts: 3,
            initial_backoff: Duration::from_secs(30),
            max_backoff: Duration::from_secs(300),
            backoff_multiplier: 2.0,
            exponential_backoff: true,
        }
    }
}

/// Heal policy engine
pub struct HealPolicyEngine {
    config: HealPolicyConfig,
    last_repair_time: SystemTime,
    repair_count: u64,
    active_repairs: u64,
}

impl HealPolicyEngine {
    /// Create a new heal policy engine
    pub fn new(config: HealPolicyConfig) -> Self {
        Self {
            config,
            last_repair_time: SystemTime::now(),
            repair_count: 0,
            active_repairs: 0,
        }
    }

    /// Get the configuration
    pub fn config(&self) -> &HealPolicyConfig {
        &self.config
    }

    /// Evaluate heal policy
    pub async fn evaluate(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult {
        let mut reasons = Vec::new();
        let mut allowed = true;

        // Check if auto-heal is enabled
        if !self.config.auto_heal_enabled {
            allowed = false;
            reasons.push("Auto-heal is disabled".to_string());
        }

        // Check system load
        if context.system_load > self.config.max_system_load {
            allowed = false;
            reasons.push(format!(
                "System load too high: {:.2} > {:.2}",
                context.system_load, self.config.max_system_load
            ));
        }

        // Check disk space
        if context.disk_space_available < self.config.min_disk_space {
            allowed = false;
            reasons.push(format!(
                "Disk space too low: {:.1}% < {:.1}%",
                context.disk_space_available, self.config.min_disk_space
            ));
        }

        // Check active operations
        if context.active_operations > self.config.max_active_operations {
            allowed = false;
            reasons.push(format!(
                "Too many active operations: {} > {}",
                context.active_operations, self.config.max_active_operations
            ));
        }

        // Check repair interval
        let time_since_last_repair = context.current_time
            .duration_since(self.last_repair_time)
            .unwrap_or(Duration::ZERO);

        if time_since_last_repair < self.config.min_repair_interval {
            allowed = false;
            reasons.push(format!(
                "Repair interval too short: {:?} < {:?}",
                time_since_last_repair, self.config.min_repair_interval
            ));
        }

        // Check resource usage
        if self.config.resource_config.enable_throttling {
            if context.resource_usage.cpu_usage > self.config.resource_config.max_cpu_usage {
                allowed = false;
                reasons.push(format!(
                    "CPU usage too high: {:.1}% > {:.1}%",
                    context.resource_usage.cpu_usage, self.config.resource_config.max_cpu_usage
                ));
            }

            if context.resource_usage.memory_usage > self.config.resource_config.max_memory_usage {
                allowed = false;
                reasons.push(format!(
                    "Memory usage too high: {:.1}% > {:.1}%",
                    context.resource_usage.memory_usage, self.config.resource_config.max_memory_usage
                ));
            }

            if context.resource_usage.disk_io_usage > self.config.resource_config.max_disk_io_usage {
                allowed = false;
                reasons.push(format!(
                    "Disk I/O usage too high: {:.1}% > {:.1}%",
                    context.resource_usage.disk_io_usage, self.config.resource_config.max_disk_io_usage
                ));
            }

            if context.resource_usage.network_io_usage > self.config.resource_config.max_network_io_usage {
                allowed = false;
                reasons.push(format!(
                    "Network I/O usage too high: {:.1}% > {:.1}%",
                    context.resource_usage.network_io_usage, self.config.resource_config.max_network_io_usage
                ));
            }
        }

        // Check priority-based policies
        if self.config.priority_config.enabled {
            match issue.severity {
                Severity::Critical => {
                    if self.config.priority_config.critical_immediate {
                        // Critical issues should always be allowed unless resource constraints prevent it
                        if allowed {
                            reasons.clear();
                            reasons.push("Critical issue - immediate repair allowed".to_string());
                        }
                    }
                }
                Severity::High => {
                    // Check if we're within the high priority timeout
                    if time_since_last_repair > self.config.priority_config.high_timeout {
                        allowed = false;
                        reasons.push(format!(
                            "High priority issue timeout exceeded: {:?} > {:?}",
                            time_since_last_repair, self.config.priority_config.high_timeout
                        ));
                    }
                }
                Severity::Medium => {
                    // Check if we're within the medium priority timeout
                    if time_since_last_repair > self.config.priority_config.medium_timeout {
                        allowed = false;
                        reasons.push(format!(
                            "Medium priority issue timeout exceeded: {:?} > {:?}",
                            time_since_last_repair, self.config.priority_config.medium_timeout
                        ));
                    }
                }
                Severity::Low => {
                    // Check if we're within the low priority timeout
                    if time_since_last_repair > self.config.priority_config.low_timeout {
                        allowed = false;
                        reasons.push(format!(
                            "Low priority issue timeout exceeded: {:?} > {:?}",
                            time_since_last_repair, self.config.priority_config.low_timeout
                        ));
                    }
                }
            }
        }

        let reason = if reasons.is_empty() {
            "Heal allowed".to_string()
        } else {
            reasons.join("; ")
        };

        PolicyResult {
            allowed,
            reason,
            metadata: Some(serde_json::json!({
                "repair_count": self.repair_count,
                "active_repairs": self.active_repairs,
                "time_since_last_repair": time_since_last_repair.as_secs(),
                "issue_severity": format!("{:?}", issue.severity),
                "issue_type": format!("{:?}", issue.issue_type),
                "system_load": context.system_load,
                "disk_space_available": context.disk_space_available,
                "active_operations": context.active_operations,
            })),
            evaluated_at: context.current_time,
        }
    }

    /// Get repair timeout based on priority
    pub fn get_repair_timeout(&self, severity: Severity) -> Duration {
        if !self.config.priority_config.enabled {
            return self.config.max_repair_duration;
        }

        match severity {
            Severity::Critical => Duration::from_secs(300), // 5 minutes for critical
            Severity::High => self.config.priority_config.high_timeout,
            Severity::Medium => self.config.priority_config.medium_timeout,
            Severity::Low => self.config.priority_config.low_timeout,
        }
    }

    /// Get retry configuration
    pub fn get_retry_config(&self) -> &HealRetryConfig {
        &self.config.retry_config
    }

    /// Update repair statistics
    pub fn record_repair(&mut self) {
        self.last_repair_time = SystemTime::now();
        self.repair_count += 1;
    }

    /// Increment active repairs
    pub fn increment_active_repairs(&mut self) {
        self.active_repairs += 1;
    }

    /// Decrement active repairs
    pub fn decrement_active_repairs(&mut self) {
        if self.active_repairs > 0 {
            self.active_repairs -= 1;
        }
    }

    /// Get heal statistics
    pub fn get_statistics(&self) -> HealPolicyStatistics {
        HealPolicyStatistics {
            total_repairs: self.repair_count,
            active_repairs: self.active_repairs,
            last_repair_time: self.last_repair_time,
            config: self.config.clone(),
        }
    }
}

/// Heal policy statistics
#[derive(Debug, Clone)]
pub struct HealPolicyStatistics {
    pub total_repairs: u64,
    pub active_repairs: u64,
    pub last_repair_time: SystemTime,
    pub config: HealPolicyConfig,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType, Severity};

    #[tokio::test]
    async fn test_heal_policy_creation() {
        let config = HealPolicyConfig::default();
        let engine = HealPolicyEngine::new(config);

        assert_eq!(engine.config().max_concurrent_repairs, 4);
        assert_eq!(engine.config().max_system_load, 0.7);
        assert_eq!(engine.config().min_disk_space, 15.0);
    }

    #[tokio::test]
    async fn test_heal_policy_evaluation() {
        let config = HealPolicyConfig::default();
        let engine = HealPolicyEngine::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Medium,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&issue, &context).await;
        assert!(result.allowed);
        assert!(result.reason.contains("Heal allowed"));
    }

    #[tokio::test]
    async fn test_heal_policy_critical_immediate() {
        let config = HealPolicyConfig::default();
        let engine = HealPolicyEngine::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&issue, &context).await;
        assert!(result.allowed);
        assert!(result.reason.contains("Critical issue - immediate repair allowed"));
    }

    #[tokio::test]
    async fn test_heal_policy_system_load_limit() {
        let config = HealPolicyConfig::default();
        let engine = HealPolicyEngine::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Medium,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let context = PolicyContext {
            system_load: 0.8, // Above threshold
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&issue, &context).await;
        assert!(!result.allowed);
        assert!(result.reason.contains("System load too high"));
    }

    #[tokio::test]
    async fn test_repair_timeouts() {
        let config = HealPolicyConfig::default();
        let engine = HealPolicyEngine::new(config);

        assert_eq!(engine.get_repair_timeout(Severity::Critical), Duration::from_secs(300));
        assert_eq!(engine.get_repair_timeout(Severity::High), Duration::from_secs(300));
        assert_eq!(engine.get_repair_timeout(Severity::Medium), Duration::from_secs(1800));
        assert_eq!(engine.get_repair_timeout(Severity::Low), Duration::from_secs(3600));
    }

    #[tokio::test]
    async fn test_heal_statistics() {
        let config = HealPolicyConfig::default();
        let mut engine = HealPolicyEngine::new(config);

        assert_eq!(engine.get_statistics().total_repairs, 0);
        assert_eq!(engine.get_statistics().active_repairs, 0);

        engine.record_repair();
        engine.increment_active_repairs();
        engine.increment_active_repairs();

        let stats = engine.get_statistics();
        assert_eq!(stats.total_repairs, 1);
        assert_eq!(stats.active_repairs, 2);

        engine.decrement_active_repairs();
        assert_eq!(engine.get_statistics().active_repairs, 1);
    }
}
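One part of the configuration above that is easy to get wrong is the retry schedule. With the defaults (initial backoff 30s, multiplier 2.0, cap 300s, 3 attempts), exponential backoff yields delays of 30s, 60s, 120s and then gives up. A sketch of how a repair worker might derive the nth delay from `HealRetryConfig`; the helper function is illustrative and not part of this commit:

```rust
use std::time::Duration;

/// Illustrative helper: delay before retry attempt `attempt` (0-based),
/// computed from the HealRetryConfig fields defined above.
fn retry_delay(config: &HealRetryConfig, attempt: u32) -> Option<Duration> {
    if attempt >= config.max_retry_attempts {
        return None; // retries exhausted
    }
    let delay = if config.exponential_backoff {
        // initial_backoff * multiplier^attempt
        let factor = config.backoff_multiplier.powi(attempt as i32);
        config.initial_backoff.mul_f64(factor)
    } else {
        config.initial_backoff
    };
    // Never exceed the configured cap.
    Some(delay.min(config.max_backoff))
}

// With the defaults: attempt 0 -> 30s, 1 -> 60s, 2 -> 120s, 3 -> None.
```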
258
crates/ahm/src/policy/mod.rs
Normal file
@@ -0,0 +1,258 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Policy system for AHM operations
//!
//! Defines configurable policies for:
//! - Scanning behavior and frequency
//! - Healing priorities and strategies
//! - Data retention and lifecycle management

pub mod scan_policy;
pub mod heal_policy;
pub mod retention_policy;

pub use scan_policy::{ScanPolicyConfig, ScanPolicyEngine};
pub use heal_policy::{HealPolicyConfig, HealPolicyEngine};
pub use retention_policy::{RetentionPolicyConfig, RetentionPolicyEngine};

use std::time::{Duration, SystemTime};
use serde::{Deserialize, Serialize};

use crate::scanner::{HealthIssue, Severity};

/// Policy evaluation result
#[derive(Debug, Clone)]
pub struct PolicyResult {
    /// Whether the policy allows the action
    pub allowed: bool,
    /// Reason for the decision
    pub reason: String,
    /// Additional metadata
    pub metadata: Option<serde_json::Value>,
    /// When the policy was evaluated
    pub evaluated_at: SystemTime,
}

/// Policy evaluation context
#[derive(Debug, Clone)]
pub struct PolicyContext {
    /// Current system load
    pub system_load: f64,
    /// Available disk space percentage
    pub disk_space_available: f64,
    /// Number of active operations
    pub active_operations: u64,
    /// Current time
    pub current_time: SystemTime,
    /// Health issues count by severity
    pub health_issues: std::collections::HashMap<Severity, u64>,
    /// Resource usage metrics
    pub resource_usage: ResourceUsage,
}

/// Resource usage information
#[derive(Debug, Clone)]
pub struct ResourceUsage {
    /// CPU usage percentage
    pub cpu_usage: f64,
    /// Memory usage percentage
    pub memory_usage: f64,
    /// Disk I/O usage percentage
    pub disk_io_usage: f64,
    /// Network I/O usage percentage
    pub network_io_usage: f64,
}

impl Default for ResourceUsage {
    fn default() -> Self {
        Self {
            cpu_usage: 0.0,
            memory_usage: 0.0,
            disk_io_usage: 0.0,
            network_io_usage: 0.0,
        }
    }
}

/// Policy manager that coordinates all policies
pub struct PolicyManager {
    scan_policy: ScanPolicyEngine,
    heal_policy: HealPolicyEngine,
    retention_policy: RetentionPolicyEngine,
}

impl PolicyManager {
    /// Create a new policy manager
    pub fn new(
        scan_config: ScanPolicyConfig,
        heal_config: HealPolicyConfig,
        retention_config: RetentionPolicyConfig,
    ) -> Self {
        Self {
            scan_policy: ScanPolicyEngine::new(scan_config),
            heal_policy: HealPolicyEngine::new(heal_config),
            retention_policy: RetentionPolicyEngine::new(retention_config),
        }
    }

    /// Evaluate scan policy
    pub async fn evaluate_scan_policy(&self, context: &PolicyContext) -> PolicyResult {
        self.scan_policy.evaluate(context).await
    }

    /// Evaluate heal policy
    pub async fn evaluate_heal_policy(&self, issue: &HealthIssue, context: &PolicyContext) -> PolicyResult {
        self.heal_policy.evaluate(issue, context).await
    }

    /// Evaluate retention policy
    pub async fn evaluate_retention_policy(&self, object_age: Duration, context: &PolicyContext) -> PolicyResult {
        self.retention_policy.evaluate(object_age, context).await
    }

    /// Get scan policy engine
    pub fn scan_policy(&self) -> &ScanPolicyEngine {
        &self.scan_policy
    }

    /// Get heal policy engine
    pub fn heal_policy(&self) -> &HealPolicyEngine {
        &self.heal_policy
    }

    /// Get retention policy engine
    pub fn retention_policy(&self) -> &RetentionPolicyEngine {
        &self.retention_policy
    }

    /// Update scan policy configuration
    pub async fn update_scan_policy(&mut self, config: ScanPolicyConfig) {
        self.scan_policy = ScanPolicyEngine::new(config);
    }

    /// Update heal policy configuration
    pub async fn update_heal_policy(&mut self, config: HealPolicyConfig) {
        self.heal_policy = HealPolicyEngine::new(config);
    }

    /// Update retention policy configuration
    pub async fn update_retention_policy(&mut self, config: RetentionPolicyConfig) {
        self.retention_policy = RetentionPolicyEngine::new(config);
    }

    /// List all policies
    pub async fn list_policies(&self) -> crate::error::Result<Vec<String>> {
        // In a real implementation, this would return actual policy names
        Ok(vec![
            "scan_policy".to_string(),
            "heal_policy".to_string(),
            "retention_policy".to_string(),
        ])
    }

    /// Get a specific policy
    pub async fn get_policy(&self, name: &str) -> crate::error::Result<String> {
        // In a real implementation, this would return the actual policy
        Ok(format!("Policy configuration for: {}", name))
    }

    /// Get engine configuration
    pub async fn get_config(&self) -> PolicyConfig {
        PolicyConfig::default()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::{HealthIssue, HealthIssueType};

    #[tokio::test]
    async fn test_policy_manager_creation() {
        let scan_config = ScanPolicyConfig::default();
        let heal_config = HealPolicyConfig::default();
        let retention_config = RetentionPolicyConfig::default();

        let manager = PolicyManager::new(scan_config, heal_config, retention_config);

        // Test that all policy engines are available
        assert!(manager.scan_policy().config().max_concurrent_scans > 0);
        assert!(manager.heal_policy().config().max_concurrent_repairs > 0);
        assert!(manager.retention_policy().config().default_retention_days > 0);
    }

    #[tokio::test]
    async fn test_policy_evaluation() {
        let scan_config = ScanPolicyConfig::default();
        let heal_config = HealPolicyConfig::default();
        let retention_config = RetentionPolicyConfig::default();

        let manager = PolicyManager::new(scan_config, heal_config, retention_config);

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        // Test scan policy evaluation
        let scan_result = manager.evaluate_scan_policy(&context).await;
        assert!(scan_result.allowed);

        // Test heal policy evaluation
        let issue = HealthIssue {
            issue_type: HealthIssueType::MissingReplica,
            severity: Severity::Critical,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        let heal_result = manager.evaluate_heal_policy(&issue, &context).await;
        assert!(heal_result.allowed);

        // Test retention policy evaluation
        let retention_result = manager.evaluate_retention_policy(Duration::from_secs(86400), &context).await;
        assert!(retention_result.allowed);
    }
}

/// Master policy configuration
#[derive(Debug, Clone)]
pub struct PolicyConfig {
    pub scan: ScanPolicyConfig,
    pub heal: HealPolicyConfig,
    pub retention: RetentionPolicyConfig,
}

impl Default for PolicyConfig {
    fn default() -> Self {
        Self {
            scan: ScanPolicyConfig::default(),
            heal: HealPolicyConfig::default(),
            retention: RetentionPolicyConfig::default(),
        }
    }
}

#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PolicyManagerConfig {
    #[serde(default)]
    pub default_scan_interval: Duration,
}
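The `PolicyManager` above is the single entry point the coordinator consults before scheduling work. A minimal sketch of a gatekeeping call, assuming the default configurations and a context filled from live metrics (the numeric values here are placeholders, not recommended settings):

```rust
use std::time::SystemTime;

/// Illustrative only: asks the policy manager whether a scan may start now.
async fn may_scan_now() -> bool {
    let manager = PolicyManager::new(
        ScanPolicyConfig::default(),
        HealPolicyConfig::default(),
        RetentionPolicyConfig::default(),
    );

    let context = PolicyContext {
        system_load: 0.4,           // placeholder: would come from the metrics collector
        disk_space_available: 75.0, // percent free
        active_operations: 3,
        current_time: SystemTime::now(),
        health_issues: std::collections::HashMap::new(),
        resource_usage: ResourceUsage::default(),
    };

    manager.evaluate_scan_policy(&context).await.allowed
}
```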
487
crates/ahm/src/policy/retention_policy.rs
Normal file
@@ -0,0 +1,487 @@
|
||||
// Copyright 2024 RustFS Team
|
||||
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use super::{PolicyContext, PolicyResult, ResourceUsage};
|
||||
|
||||
/// Configuration for retention policies
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RetentionPolicyConfig {
|
||||
/// Default retention period in days
|
||||
pub default_retention_days: u32,
|
||||
/// Whether to enable retention policies
|
||||
pub enabled: bool,
|
||||
/// Maximum system load threshold for retention operations
|
||||
pub max_system_load: f64,
|
||||
/// Minimum available disk space percentage for retention operations
|
||||
pub min_disk_space: f64,
|
||||
/// Maximum number of active operations for retention
|
||||
pub max_active_operations: u64,
|
||||
/// Retention rules by object type
|
||||
pub retention_rules: Vec<RetentionRule>,
|
||||
/// Whether to enable automatic cleanup
|
||||
pub auto_cleanup_enabled: bool,
|
||||
/// Cleanup interval
|
||||
pub cleanup_interval: Duration,
|
||||
/// Maximum objects to delete per cleanup cycle
|
||||
pub max_objects_per_cleanup: u64,
|
||||
}
|
||||
|
||||
/// Retention rule for specific object types
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RetentionRule {
|
||||
/// Object type pattern (e.g., "*.log", "temp/*")
|
||||
pub pattern: String,
|
||||
/// Retention period in days
|
||||
pub retention_days: u32,
|
||||
/// Whether this rule is enabled
|
||||
pub enabled: bool,
|
||||
/// Priority of this rule (higher = more important)
|
||||
pub priority: u32,
|
||||
/// Whether to apply this rule recursively
|
||||
pub recursive: bool,
|
||||
}
|
||||
|
||||
impl Default for RetentionPolicyConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
default_retention_days: 30,
|
||||
enabled: true,
|
||||
max_system_load: 0.6,
|
||||
min_disk_space: 20.0, // 20% minimum disk space
|
||||
max_active_operations: 20,
|
||||
retention_rules: vec![
|
||||
RetentionRule {
|
||||
pattern: "*.log".to_string(),
|
||||
retention_days: 7,
|
||||
enabled: true,
|
||||
priority: 1,
|
||||
recursive: false,
|
||||
},
|
||||
RetentionRule {
|
||||
pattern: "temp/*".to_string(),
|
||||
retention_days: 1,
|
||||
enabled: true,
|
||||
priority: 2,
|
||||
recursive: true,
|
||||
},
|
||||
RetentionRule {
|
||||
pattern: "cache/*".to_string(),
|
||||
retention_days: 3,
|
||||
enabled: true,
|
||||
priority: 3,
|
||||
recursive: true,
|
||||
},
|
||||
],
|
||||
auto_cleanup_enabled: true,
|
||||
cleanup_interval: Duration::from_secs(3600), // 1 hour
|
||||
max_objects_per_cleanup: 1000,
|
||||
}
|
||||
}
|
||||
}

/// Retention policy engine
pub struct RetentionPolicyEngine {
    config: RetentionPolicyConfig,
    last_cleanup_time: SystemTime,
    cleanup_count: u64,
    objects_deleted: u64,
}

impl RetentionPolicyEngine {
    /// Create a new retention policy engine
    pub fn new(config: RetentionPolicyConfig) -> Self {
        Self {
            config,
            last_cleanup_time: SystemTime::now(),
            cleanup_count: 0,
            objects_deleted: 0,
        }
    }

    /// Get the configuration
    pub fn config(&self) -> &RetentionPolicyConfig {
        &self.config
    }

    /// Evaluate retention policy
    pub async fn evaluate(&self, object_age: Duration, context: &PolicyContext) -> PolicyResult {
        let mut reasons = Vec::new();
        let mut allowed = false;

        // Check if retention policies are enabled
        if !self.config.enabled {
            reasons.push("Retention policies are disabled".to_string());
        } else {
            // Check whether the object should be retained based on age.
            // No object path is available in this signature, so "default"
            // matches no rule pattern and resolves to `default_retention_days`.
            let retention_days = self.get_retention_days_for_object("default");
            let retention_duration = Duration::from_secs(retention_days as u64 * 24 * 3600);

            if object_age > retention_duration {
                allowed = true;
                reasons.push(format!(
                    "Object age exceeds retention period: {:?} > {:?}",
                    object_age, retention_duration
                ));
            } else {
                reasons.push(format!(
                    "Object within retention period: {:?} <= {:?}",
                    object_age, retention_duration
                ));
            }
        }

        // Check system constraints
        if context.system_load > self.config.max_system_load {
            allowed = false;
            reasons.push(format!(
                "System load too high: {:.2} > {:.2}",
                context.system_load, self.config.max_system_load
            ));
        }

        if context.disk_space_available < self.config.min_disk_space {
            allowed = false;
            reasons.push(format!(
                "Disk space too low: {:.1}% < {:.1}%",
                context.disk_space_available, self.config.min_disk_space
            ));
        }

        if context.active_operations > self.config.max_active_operations {
            allowed = false;
            reasons.push(format!(
                "Too many active operations: {} > {}",
                context.active_operations, self.config.max_active_operations
            ));
        }

        let reason = if reasons.is_empty() {
            "Retention evaluation completed".to_string()
        } else {
            reasons.join("; ")
        };

        PolicyResult {
            allowed,
            reason,
            metadata: Some(serde_json::json!({
                "object_age_seconds": object_age.as_secs(),
                "cleanup_count": self.cleanup_count,
                "objects_deleted": self.objects_deleted,
                "system_load": context.system_load,
                "disk_space_available": context.disk_space_available,
                "active_operations": context.active_operations,
            })),
            evaluated_at: context.current_time,
        }
    }

    /// Evaluate cleanup policy
    pub async fn evaluate_cleanup(&self, context: &PolicyContext) -> PolicyResult {
        let mut reasons = Vec::new();
        let mut allowed = false;

        // Check if auto-cleanup is enabled
        if !self.config.auto_cleanup_enabled {
            reasons.push("Auto-cleanup is disabled".to_string());
        } else {
            // Check cleanup interval
            let time_since_last_cleanup = context
                .current_time
                .duration_since(self.last_cleanup_time)
                .unwrap_or(Duration::ZERO);

            if time_since_last_cleanup >= self.config.cleanup_interval {
                allowed = true;
                reasons.push("Cleanup interval reached".to_string());
            } else {
                reasons.push(format!(
                    "Cleanup interval not reached: {:?} < {:?}",
                    time_since_last_cleanup, self.config.cleanup_interval
                ));
            }
        }

        // Check system constraints
        if context.system_load > self.config.max_system_load {
            allowed = false;
            reasons.push(format!(
                "System load too high: {:.2} > {:.2}",
                context.system_load, self.config.max_system_load
            ));
        }

        if context.disk_space_available < self.config.min_disk_space {
            allowed = false;
            reasons.push(format!(
                "Disk space too low: {:.1}% < {:.1}%",
                context.disk_space_available, self.config.min_disk_space
            ));
        }

        let reason = if reasons.is_empty() {
            "Cleanup evaluation completed".to_string()
        } else {
            reasons.join("; ")
        };

        PolicyResult {
            allowed,
            reason,
            metadata: Some(serde_json::json!({
                "cleanup_count": self.cleanup_count,
                "objects_deleted": self.objects_deleted,
                "max_objects_per_cleanup": self.config.max_objects_per_cleanup,
                "system_load": context.system_load,
                "disk_space_available": context.disk_space_available,
            })),
            evaluated_at: context.current_time,
        }
    }

    /// Get retention days for a specific object
    pub fn get_retention_days_for_object(&self, object_path: &str) -> u32 {
        // Find the highest-priority matching rule
        let mut best_rule: Option<&RetentionRule> = None;
        let mut best_priority = 0;

        for rule in &self.config.retention_rules {
            if !rule.enabled {
                continue;
            }

            if self.matches_pattern(object_path, &rule.pattern) && rule.priority > best_priority {
                best_rule = Some(rule);
                best_priority = rule.priority;
            }
        }

        best_rule
            .map(|rule| rule.retention_days)
            .unwrap_or(self.config.default_retention_days)
    }

    /// Check if an object path matches a pattern
    fn matches_pattern(&self, object_path: &str, pattern: &str) -> bool {
        // Simple pattern matching - can be enhanced with regex
        if pattern.contains('*') {
            // Wildcard matching
            let pattern_parts: Vec<&str> = pattern.split('*').collect();
            if pattern_parts.len() == 2 {
                let prefix = pattern_parts[0];
                let suffix = pattern_parts[1];
                object_path.starts_with(prefix) && object_path.ends_with(suffix)
            } else {
                false
            }
        } else {
            // Exact match
            object_path == pattern
        }
    }
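
    // Note (illustrative, not in the original file): this scheme supports at
    // most one '*' per pattern. "*.log" is prefix "" + suffix ".log", and
    // "temp/*" is prefix "temp/" + suffix ""; a pattern such as
    // "logs/*/old/*" splits into more than two parts and never matches.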

    /// Get all retention rules
    pub fn get_retention_rules(&self) -> &[RetentionRule] {
        &self.config.retention_rules
    }

    /// Add a new retention rule
    pub fn add_retention_rule(&mut self, rule: RetentionRule) {
        self.config.retention_rules.push(rule);
    }

    /// Remove a retention rule by pattern
    pub fn remove_retention_rule(&mut self, pattern: &str) -> bool {
        let initial_len = self.config.retention_rules.len();
        self.config.retention_rules.retain(|rule| rule.pattern != pattern);
        self.config.retention_rules.len() < initial_len
    }

    /// Update cleanup statistics
    pub fn record_cleanup(&mut self, objects_deleted: u64) {
        self.last_cleanup_time = SystemTime::now();
        self.cleanup_count += 1;
        self.objects_deleted += objects_deleted;
    }

    /// Get retention statistics
    pub fn get_statistics(&self) -> RetentionPolicyStatistics {
        RetentionPolicyStatistics {
            total_cleanups: self.cleanup_count,
            total_objects_deleted: self.objects_deleted,
            last_cleanup_time: self.last_cleanup_time,
            config: self.config.clone(),
        }
    }
}
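
// Usage sketch (illustrative; `sample_context()` stands in for a hypothetical
// helper that fills a `PolicyContext` from live system metrics):
//
//     let engine = RetentionPolicyEngine::new(RetentionPolicyConfig::default());
//     let result = engine.evaluate(object_age, &sample_context()).await;
//     if result.allowed {
//         // Object is past retention and the system is idle enough to delete it.
//     }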

/// Retention policy statistics
#[derive(Debug, Clone)]
pub struct RetentionPolicyStatistics {
    pub total_cleanups: u64,
    pub total_objects_deleted: u64,
    pub last_cleanup_time: SystemTime,
    pub config: RetentionPolicyConfig,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_retention_policy_creation() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        assert_eq!(engine.config().default_retention_days, 30);
        assert_eq!(engine.config().max_system_load, 0.6);
        assert_eq!(engine.config().min_disk_space, 20.0);
    }

    #[tokio::test]
    async fn test_retention_policy_evaluation() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        // Test object within retention period
        let object_age = Duration::from_secs(7 * 24 * 3600); // 7 days
        let result = engine.evaluate(object_age, &context).await;
        assert!(!result.allowed);
        assert!(result.reason.contains("Object within retention period"));

        // Test object exceeding retention period
        let object_age = Duration::from_secs(40 * 24 * 3600); // 40 days
        let result = engine.evaluate(object_age, &context).await;
        assert!(result.allowed);
        assert!(result.reason.contains("Object age exceeds retention period"));
    }

    #[tokio::test]
    async fn test_retention_policy_system_constraints() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.7, // Above threshold
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let object_age = Duration::from_secs(40 * 24 * 3600); // 40 days
        let result = engine.evaluate(object_age, &context).await;
        assert!(!result.allowed);
        assert!(result.reason.contains("System load too high"));
    }

    #[tokio::test]
    async fn test_retention_rules() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        // Test default retention
        assert_eq!(engine.get_retention_days_for_object("unknown.txt"), 30);

        // Test log file retention
        assert_eq!(engine.get_retention_days_for_object("app.log"), 7);

        // Test temp file retention
        assert_eq!(engine.get_retention_days_for_object("temp/file.txt"), 1);

        // Test cache file retention
        assert_eq!(engine.get_retention_days_for_object("cache/data.bin"), 3);
    }
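
    // Illustrative addition (not in the original suite): a catch-all "*"
    // pattern splits into an empty prefix and suffix, so it matches every
    // object; given top priority it overrides the built-in rules.
    #[tokio::test]
    async fn test_catch_all_rule_overrides() {
        let config = RetentionPolicyConfig::default();
        let mut engine = RetentionPolicyEngine::new(config);

        engine.add_retention_rule(RetentionRule {
            pattern: "*".to_string(),
            retention_days: 365,
            enabled: true,
            priority: 100, // higher than all default rules
            recursive: true,
        });

        assert_eq!(engine.get_retention_days_for_object("app.log"), 365);
        assert_eq!(engine.get_retention_days_for_object("anything.bin"), 365);
    }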

    #[tokio::test]
    async fn test_pattern_matching() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        // Test wildcard matching
        assert!(engine.matches_pattern("app.log", "*.log"));
        assert!(engine.matches_pattern("error.log", "*.log"));
        assert!(!engine.matches_pattern("app.txt", "*.log"));

        // Test exact matching
        assert!(engine.matches_pattern("temp/file.txt", "temp/file.txt"));
        assert!(!engine.matches_pattern("temp/file.txt", "temp/other.txt"));
    }

    #[tokio::test]
    async fn test_cleanup_evaluation() {
        let config = RetentionPolicyConfig::default();
        let engine = RetentionPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate_cleanup(&context).await;
        // Should be allowed if enough time has passed since the last cleanup
        assert!(result.allowed || result.reason.contains("Cleanup interval not reached"));
    }

    #[tokio::test]
    async fn test_retention_statistics() {
        let config = RetentionPolicyConfig::default();
        let mut engine = RetentionPolicyEngine::new(config);

        assert_eq!(engine.get_statistics().total_cleanups, 0);
        assert_eq!(engine.get_statistics().total_objects_deleted, 0);

        engine.record_cleanup(50);
        assert_eq!(engine.get_statistics().total_cleanups, 1);
        assert_eq!(engine.get_statistics().total_objects_deleted, 50);

        engine.record_cleanup(30);
        assert_eq!(engine.get_statistics().total_cleanups, 2);
        assert_eq!(engine.get_statistics().total_objects_deleted, 80);
    }

    #[tokio::test]
    async fn test_retention_rule_management() {
        let config = RetentionPolicyConfig::default();
        let mut engine = RetentionPolicyEngine::new(config);

        let initial_rules = engine.get_retention_rules().len();

        // Add a new rule
        let new_rule = RetentionRule {
            pattern: "backup/*".to_string(),
            retention_days: 90,
            enabled: true,
            priority: 4,
            recursive: true,
        };
        engine.add_retention_rule(new_rule);

        assert_eq!(engine.get_retention_rules().len(), initial_rules + 1);

        // Remove a rule
        let removed = engine.remove_retention_rule("*.log");
        assert!(removed);
        assert_eq!(engine.get_retention_rules().len(), initial_rules);
    }
}
373
crates/ahm/src/policy/scan_policy.rs
Normal file
@@ -0,0 +1,373 @@
// Copyright 2024 RustFS Team

use std::time::{Duration, SystemTime};

use crate::scanner::Severity;

use super::{PolicyContext, PolicyResult, ResourceUsage};

/// Configuration for scan policies
#[derive(Debug, Clone)]
pub struct ScanPolicyConfig {
    /// Maximum number of concurrent scans
    pub max_concurrent_scans: usize,
    /// Maximum scan duration per cycle
    pub max_scan_duration: Duration,
    /// Minimum interval between scans
    pub min_scan_interval: Duration,
    /// Maximum system load threshold for scanning
    pub max_system_load: f64,
    /// Minimum available disk space percentage for scanning
    pub min_disk_space: f64,
    /// Maximum number of active operations for scanning
    pub max_active_operations: u64,
    /// Whether to enable deep scanning
    pub enable_deep_scan: bool,
    /// Deep scan interval (how often to perform deep scans)
    pub deep_scan_interval: Duration,
    /// Bandwidth limit for scanning (bytes per second)
    pub bandwidth_limit: Option<u64>,
    /// Priority-based scanning configuration
    pub priority_config: ScanPriorityConfig,
}

/// Priority-based scanning configuration
#[derive(Debug, Clone)]
pub struct ScanPriorityConfig {
    /// Whether to enable priority-based scanning
    pub enabled: bool,
    /// Critical issues scan interval
    pub critical_interval: Duration,
    /// High priority issues scan interval
    pub high_interval: Duration,
    /// Medium priority issues scan interval
    pub medium_interval: Duration,
    /// Low priority issues scan interval
    pub low_interval: Duration,
}

impl Default for ScanPolicyConfig {
    fn default() -> Self {
        Self {
            max_concurrent_scans: 4,
            max_scan_duration: Duration::from_secs(3600), // 1 hour
            min_scan_interval: Duration::from_secs(300),  // 5 minutes
            max_system_load: 0.8,
            min_disk_space: 10.0, // 10% minimum disk space
            max_active_operations: 100,
            enable_deep_scan: true,
            deep_scan_interval: Duration::from_secs(86400), // 24 hours
            bandwidth_limit: Some(100 * 1024 * 1024),       // 100 MB/s
            priority_config: ScanPriorityConfig::default(),
        }
    }
}

impl Default for ScanPriorityConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            critical_interval: Duration::from_secs(60), // 1 minute
            high_interval: Duration::from_secs(300),    // 5 minutes
            medium_interval: Duration::from_secs(1800), // 30 minutes
            low_interval: Duration::from_secs(3600),    // 1 hour
        }
    }
}

/// Scan policy engine
pub struct ScanPolicyEngine {
    config: ScanPolicyConfig,
    last_scan_time: SystemTime,
    last_deep_scan_time: SystemTime,
    scan_count: u64,
}

impl ScanPolicyEngine {
    /// Create a new scan policy engine
    pub fn new(config: ScanPolicyConfig) -> Self {
        Self {
            config,
            last_scan_time: SystemTime::now(),
            last_deep_scan_time: SystemTime::now(),
            scan_count: 0,
        }
    }

    /// Get the configuration
    pub fn config(&self) -> &ScanPolicyConfig {
        &self.config
    }

    /// Evaluate scan policy
    pub async fn evaluate(&self, context: &PolicyContext) -> PolicyResult {
        let mut reasons = Vec::new();
        let mut allowed = true;

        // Check system load
        if context.system_load > self.config.max_system_load {
            allowed = false;
            reasons.push(format!(
                "System load too high: {:.2} > {:.2}",
                context.system_load, self.config.max_system_load
            ));
        }

        // Check disk space
        if context.disk_space_available < self.config.min_disk_space {
            allowed = false;
            reasons.push(format!(
                "Disk space too low: {:.1}% < {:.1}%",
                context.disk_space_available, self.config.min_disk_space
            ));
        }

        // Check active operations
        if context.active_operations > self.config.max_active_operations {
            allowed = false;
            reasons.push(format!(
                "Too many active operations: {} > {}",
                context.active_operations, self.config.max_active_operations
            ));
        }

        // Check scan interval
        let time_since_last_scan = context
            .current_time
            .duration_since(self.last_scan_time)
            .unwrap_or(Duration::ZERO);

        if time_since_last_scan < self.config.min_scan_interval {
            allowed = false;
            reasons.push(format!(
                "Scan interval too short: {:?} < {:?}",
                time_since_last_scan, self.config.min_scan_interval
            ));
        }

        // Check resource usage
        if context.resource_usage.cpu_usage > 90.0 {
            allowed = false;
            reasons.push("CPU usage too high".to_string());
        }

        if context.resource_usage.memory_usage > 90.0 {
            allowed = false;
            reasons.push("Memory usage too high".to_string());
        }

        let reason = if reasons.is_empty() {
            "Scan allowed".to_string()
        } else {
            reasons.join("; ")
        };

        PolicyResult {
            allowed,
            reason,
            metadata: Some(serde_json::json!({
                "scan_count": self.scan_count,
                "time_since_last_scan": time_since_last_scan.as_secs(),
                "system_load": context.system_load,
                "disk_space_available": context.disk_space_available,
                "active_operations": context.active_operations,
            })),
            evaluated_at: context.current_time,
        }
    }
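
    // Typical call pattern (sketch; `build_context()` stands in for a
    // hypothetical helper that samples live metrics into a `PolicyContext`):
    //
    //     let result = engine.evaluate(&build_context()).await;
    //     if result.allowed {
    //         run_scan().await;
    //         engine.record_scan();
    //     }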

    /// Evaluate deep scan policy
    pub async fn evaluate_deep_scan(&self, context: &PolicyContext) -> PolicyResult {
        let mut base_result = self.evaluate(context).await;

        if !base_result.allowed {
            return base_result;
        }

        // Check deep scan interval
        let time_since_last_deep_scan = context
            .current_time
            .duration_since(self.last_deep_scan_time)
            .unwrap_or(Duration::ZERO);

        if time_since_last_deep_scan < self.config.deep_scan_interval {
            base_result.allowed = false;
            base_result.reason = format!(
                "Deep scan interval too short: {:?} < {:?}",
                time_since_last_deep_scan, self.config.deep_scan_interval
            );
        } else {
            base_result.reason = "Deep scan allowed".to_string();
        }

        // Add deep scan metadata
        if let Some(ref mut metadata) = base_result.metadata {
            if let Some(obj) = metadata.as_object_mut() {
                obj.insert(
                    "time_since_last_deep_scan".to_string(),
                    serde_json::Value::Number(serde_json::Number::from(time_since_last_deep_scan.as_secs())),
                );
                obj.insert(
                    "deep_scan_enabled".to_string(),
                    serde_json::Value::Bool(self.config.enable_deep_scan),
                );
            }
        }

        base_result
    }

    /// Get scan interval based on priority
    pub fn get_priority_interval(&self, severity: Severity) -> Duration {
        if !self.config.priority_config.enabled {
            return self.config.min_scan_interval;
        }

        match severity {
            Severity::Critical => self.config.priority_config.critical_interval,
            Severity::High => self.config.priority_config.high_interval,
            Severity::Medium => self.config.priority_config.medium_interval,
            Severity::Low => self.config.priority_config.low_interval,
        }
    }

    /// Update scan statistics
    pub fn record_scan(&mut self) {
        self.last_scan_time = SystemTime::now();
        self.scan_count += 1;
    }

    /// Update deep scan statistics
    pub fn record_deep_scan(&mut self) {
        self.last_deep_scan_time = SystemTime::now();
    }

    /// Get scan statistics
    pub fn get_statistics(&self) -> ScanPolicyStatistics {
        ScanPolicyStatistics {
            total_scans: self.scan_count,
            last_scan_time: self.last_scan_time,
            last_deep_scan_time: self.last_deep_scan_time,
            config: self.config.clone(),
        }
    }
}

/// Scan policy statistics
#[derive(Debug, Clone)]
pub struct ScanPolicyStatistics {
    pub total_scans: u64,
    pub last_scan_time: SystemTime,
    pub last_deep_scan_time: SystemTime,
    pub config: ScanPolicyConfig,
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::scanner::Severity;

    #[tokio::test]
    async fn test_scan_policy_creation() {
        let config = ScanPolicyConfig::default();
        let engine = ScanPolicyEngine::new(config);

        assert_eq!(engine.config().max_concurrent_scans, 4);
        assert_eq!(engine.config().max_system_load, 0.8);
        assert_eq!(engine.config().min_disk_space, 10.0);
    }

    #[tokio::test]
    async fn test_scan_policy_evaluation() {
        let config = ScanPolicyConfig::default();
        let engine = ScanPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 80.0,
            active_operations: 10,
            // The engine starts with `last_scan_time = now`, so the context
            // must be past `min_scan_interval` for the interval check to pass.
            current_time: SystemTime::now() + Duration::from_secs(600),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&context).await;
        assert!(result.allowed);
        assert!(result.reason.contains("Scan allowed"));
    }

    #[tokio::test]
    async fn test_scan_policy_system_load_limit() {
        let config = ScanPolicyConfig::default();
        let engine = ScanPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.9, // Above threshold
            disk_space_available: 80.0,
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&context).await;
        assert!(!result.allowed);
        assert!(result.reason.contains("System load too high"));
    }

    #[tokio::test]
    async fn test_scan_policy_disk_space_limit() {
        let config = ScanPolicyConfig::default();
        let engine = ScanPolicyEngine::new(config);

        let context = PolicyContext {
            system_load: 0.5,
            disk_space_available: 5.0, // Below threshold
            active_operations: 10,
            current_time: SystemTime::now(),
            health_issues: std::collections::HashMap::new(),
            resource_usage: ResourceUsage::default(),
        };

        let result = engine.evaluate(&context).await;
        assert!(!result.allowed);
        assert!(result.reason.contains("Disk space too low"));
    }

    #[tokio::test]
    async fn test_priority_intervals() {
        let config = ScanPolicyConfig::default();
        let engine = ScanPolicyEngine::new(config);

        assert_eq!(engine.get_priority_interval(Severity::Critical), Duration::from_secs(60));
        assert_eq!(engine.get_priority_interval(Severity::High), Duration::from_secs(300));
        assert_eq!(engine.get_priority_interval(Severity::Medium), Duration::from_secs(1800));
        assert_eq!(engine.get_priority_interval(Severity::Low), Duration::from_secs(3600));
    }
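
    // Illustrative addition (not in the original suite): with priority-based
    // scanning disabled, every severity falls back to `min_scan_interval`.
    #[tokio::test]
    async fn test_priority_disabled_falls_back() {
        let mut config = ScanPolicyConfig::default();
        config.priority_config.enabled = false;
        let engine = ScanPolicyEngine::new(config);

        // Default min_scan_interval is 5 minutes.
        assert_eq!(engine.get_priority_interval(Severity::Critical), Duration::from_secs(300));
        assert_eq!(engine.get_priority_interval(Severity::Low), Duration::from_secs(300));
    }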

    #[tokio::test]
    async fn test_scan_statistics() {
        let config = ScanPolicyConfig::default();
        let mut engine = ScanPolicyEngine::new(config);

        assert_eq!(engine.get_statistics().total_scans, 0);

        engine.record_scan();
        assert_eq!(engine.get_statistics().total_scans, 1);

        engine.record_deep_scan();
        let stats = engine.get_statistics();
        assert_eq!(stats.total_scans, 1);
        // `>=` rather than `>`: both timestamps come from SystemTime::now()
        // and can be identical on coarse clocks.
        assert!(stats.last_deep_scan_time >= stats.last_scan_time);
    }
}
353
crates/ahm/src/scanner/bandwidth_limiter.rs
Normal file
@@ -0,0 +1,353 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    sync::{
        atomic::{AtomicU64, Ordering},
        Arc,
    },
    time::{Duration, Instant},
};

use tokio::{sync::RwLock, time::sleep};
use tracing::{debug, info, warn};

use crate::error::Result;

/// Configuration for bandwidth limiting
#[derive(Debug, Clone)]
pub struct BandwidthConfig {
    /// Maximum bytes per second
    pub bytes_per_second: u64,
    /// Maximum operations per second
    pub operations_per_second: u64,
    /// Burst allowance multiplier
    pub burst_multiplier: f64,
    /// Whether to enable adaptive throttling
    pub adaptive_throttling: bool,
    /// Minimum sleep duration between operations
    pub min_sleep_duration: Duration,
    /// Maximum sleep duration between operations
    pub max_sleep_duration: Duration,
}

impl Default for BandwidthConfig {
    fn default() -> Self {
        Self {
            bytes_per_second: 100 * 1024 * 1024, // 100 MB/s
            operations_per_second: 1000,         // 1000 ops/s
            burst_multiplier: 2.0,
            adaptive_throttling: true,
            min_sleep_duration: Duration::from_micros(100),
            max_sleep_duration: Duration::from_millis(100),
        }
    }
}

/// Bandwidth limiter for controlling scan I/O rates
pub struct BandwidthLimiter {
    config: BandwidthConfig,
    bytes_this_second: Arc<AtomicU64>,
    operations_this_second: Arc<AtomicU64>,
    last_reset: Arc<RwLock<Instant>>,
    adaptive_sleep_duration: Arc<RwLock<Duration>>,
    total_bytes_processed: Arc<AtomicU64>,
    total_operations_processed: Arc<AtomicU64>,
    start_time: Instant,
}

impl BandwidthLimiter {
    /// Create a new bandwidth limiter
    pub fn new(config: BandwidthConfig) -> Self {
        let adaptive_sleep = if config.adaptive_throttling {
            config.min_sleep_duration
        } else {
            Duration::from_micros(1000) // 1ms default
        };

        Self {
            config,
            bytes_this_second: Arc::new(AtomicU64::new(0)),
            operations_this_second: Arc::new(AtomicU64::new(0)),
            last_reset: Arc::new(RwLock::new(Instant::now())),
            adaptive_sleep_duration: Arc::new(RwLock::new(adaptive_sleep)),
            total_bytes_processed: Arc::new(AtomicU64::new(0)),
            total_operations_processed: Arc::new(AtomicU64::new(0)),
            start_time: Instant::now(),
        }
    }

    /// Wait for bandwidth allowance before processing bytes
    pub async fn wait_for_bytes(&self, bytes: u64) -> Result<()> {
        if self.config.bytes_per_second == 0 {
            return Ok(());
        }

        let mut total_wait_time = Duration::ZERO;
        let mut remaining_bytes = bytes;

        while remaining_bytes > 0 {
            // Reset counters if a second has passed
            self.reset_counters_if_needed().await;

            let current_bytes = self.bytes_this_second.load(Ordering::Relaxed);
            let burst_limit = (self.config.bytes_per_second as f64 * self.config.burst_multiplier) as u64;

            if current_bytes >= burst_limit {
                // We're over the burst limit, wait
                let wait_time = self.calculate_wait_time(current_bytes, self.config.bytes_per_second).await;
                sleep(wait_time).await;
                total_wait_time += wait_time;
                continue;
            }

            let bytes_to_process = std::cmp::min(remaining_bytes, burst_limit - current_bytes);
            self.bytes_this_second.fetch_add(bytes_to_process, Ordering::Relaxed);
            self.total_bytes_processed.fetch_add(bytes_to_process, Ordering::Relaxed);
            remaining_bytes -= bytes_to_process;

            // Adaptive throttling
            if self.config.adaptive_throttling {
                self.update_adaptive_sleep(bytes_to_process).await;
            }
        }

        if total_wait_time > Duration::ZERO {
            debug!("Bandwidth limiter waited {:?} for {} bytes", total_wait_time, bytes);
        }

        Ok(())
    }

    /// Wait for bandwidth allowance before processing an operation
    pub async fn wait_for_operation(&self) -> Result<()> {
        if self.config.operations_per_second == 0 {
            return Ok(());
        }

        // Reset counters if a second has passed
        self.reset_counters_if_needed().await;

        let current_ops = self.operations_this_second.load(Ordering::Relaxed);
        let burst_limit = (self.config.operations_per_second as f64 * self.config.burst_multiplier) as u64;

        if current_ops >= burst_limit {
            // We're over the burst limit, wait
            let wait_time = self.calculate_wait_time(current_ops, self.config.operations_per_second).await;
            sleep(wait_time).await;
            debug!("Bandwidth limiter waited {:?} for operation", wait_time);
        }

        self.operations_this_second.fetch_add(1, Ordering::Relaxed);
        self.total_operations_processed.fetch_add(1, Ordering::Relaxed);

        Ok(())
    }

    /// Wait for bandwidth allowance before processing both bytes and operations
    pub async fn wait_for_bytes_and_operation(&self, bytes: u64) -> Result<()> {
        self.wait_for_bytes(bytes).await?;
        self.wait_for_operation().await?;
        Ok(())
    }
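
    // Typical scan-loop usage (sketch; `read_chunk` and `process` are
    // hypothetical helpers). Each chunk is throttled against both the byte
    // and the operation budget:
    //
    //     while let Some(chunk) = read_chunk().await? {
    //         limiter.wait_for_bytes_and_operation(chunk.len() as u64).await?;
    //         process(chunk).await?;
    //     }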

    /// Reset counters if a second has passed
    async fn reset_counters_if_needed(&self) {
        let mut last_reset = self.last_reset.write().await;
        let now = Instant::now();

        if now.duration_since(*last_reset) >= Duration::from_secs(1) {
            self.bytes_this_second.store(0, Ordering::Relaxed);
            self.operations_this_second.store(0, Ordering::Relaxed);
            *last_reset = now;
        }
    }

    /// Calculate wait time based on current usage and limit
    async fn calculate_wait_time(&self, current: u64, limit: u64) -> Duration {
        if current == 0 || limit == 0 {
            return self.config.min_sleep_duration;
        }

        let utilization = current as f64 / limit as f64;
        let base_sleep = self.config.min_sleep_duration.as_micros() as f64;
        let max_sleep = self.config.max_sleep_duration.as_micros() as f64;

        // Quadratic backoff based on utilization, clamped to [min, max]
        let sleep_micros = base_sleep * (utilization * utilization);
        let sleep_micros = sleep_micros.min(max_sleep).max(base_sleep);

        Duration::from_micros(sleep_micros as u64)
    }
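
    // Worked example of the backoff above: with min_sleep = 100µs and
    // max_sleep = 100ms, utilization 1.5 (50% over budget) yields
    // 100µs * 1.5² = 225µs, while utilization 32 would give ~102ms and be
    // clamped to the 100ms cap.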

    /// Update adaptive sleep duration based on recent activity
    async fn update_adaptive_sleep(&self, bytes_processed: u64) {
        let mut sleep_duration = self.adaptive_sleep_duration.write().await;

        // Simple adaptive algorithm: increase sleep if we're processing too much
        let current_rate = bytes_processed as f64 / sleep_duration.as_secs_f64();
        let target_rate = self.config.bytes_per_second as f64;

        if current_rate > target_rate * 1.1 {
            // We're going too fast, increase sleep
            *sleep_duration = Duration::from_micros((sleep_duration.as_micros() as f64 * 1.1) as u64)
                .min(self.config.max_sleep_duration);
        } else if current_rate < target_rate * 0.9 {
            // We're going too slow, decrease sleep
            *sleep_duration = Duration::from_micros((sleep_duration.as_micros() as f64 * 0.9) as u64)
                .max(self.config.min_sleep_duration);
        }
    }

    /// Get current bandwidth statistics
    pub async fn statistics(&self) -> BandwidthStatistics {
        let elapsed = self.start_time.elapsed();
        let total_bytes = self.total_bytes_processed.load(Ordering::Relaxed);
        let total_ops = self.total_operations_processed.load(Ordering::Relaxed);
        let current_bytes = self.bytes_this_second.load(Ordering::Relaxed);
        let current_ops = self.operations_this_second.load(Ordering::Relaxed);
        let adaptive_sleep = *self.adaptive_sleep_duration.read().await;

        BandwidthStatistics {
            total_bytes_processed: total_bytes,
            total_operations_processed: total_ops,
            current_bytes_per_second: current_bytes,
            current_operations_per_second: current_ops,
            average_bytes_per_second: if elapsed.as_secs() > 0 {
                total_bytes / elapsed.as_secs()
            } else {
                0
            },
            average_operations_per_second: if elapsed.as_secs() > 0 {
                total_ops / elapsed.as_secs()
            } else {
                0
            },
            adaptive_sleep_duration: adaptive_sleep,
            uptime: elapsed,
        }
    }

    /// Reset all statistics
    pub async fn reset_statistics(&self) {
        self.total_bytes_processed.store(0, Ordering::Relaxed);
        self.total_operations_processed.store(0, Ordering::Relaxed);
        self.bytes_this_second.store(0, Ordering::Relaxed);
        self.operations_this_second.store(0, Ordering::Relaxed);
        *self.last_reset.write().await = Instant::now();
        *self.adaptive_sleep_duration.write().await = self.config.min_sleep_duration;
    }

    /// Update configuration
    pub async fn update_config(&self, new_config: BandwidthConfig) {
        info!("Updating bandwidth limiter config: {:?}", new_config);

        // Reset adaptive sleep if adaptive throttling is disabled
        if !new_config.adaptive_throttling {
            *self.adaptive_sleep_duration.write().await = new_config.min_sleep_duration;
        }

        // Note: We can't update the config struct itself since it's not wrapped in Arc<RwLock>.
        // In a real implementation, you might want to wrap the config in Arc<RwLock> as well.
        warn!("Config update not fully implemented - config struct is not mutable");
    }
}

/// Statistics for bandwidth limiting
#[derive(Debug, Clone)]
pub struct BandwidthStatistics {
    pub total_bytes_processed: u64,
    pub total_operations_processed: u64,
    pub current_bytes_per_second: u64,
    pub current_operations_per_second: u64,
    pub average_bytes_per_second: u64,
    pub average_operations_per_second: u64,
    pub adaptive_sleep_duration: Duration,
    pub uptime: Duration,
}

#[cfg(test)]
mod tests {
    use super::*;
    use tokio::time::Instant as TokioInstant;

    #[tokio::test]
    async fn test_bandwidth_limiter_creation() {
        let config = BandwidthConfig::default();
        let limiter = BandwidthLimiter::new(config);
        let stats = limiter.statistics().await;
        assert_eq!(stats.total_bytes_processed, 0);
        assert_eq!(stats.total_operations_processed, 0);
    }

    #[tokio::test]
    async fn test_bytes_limiting() {
        let config = BandwidthConfig {
            bytes_per_second: 1000, // 1 KB/s, so the 2x burst limit is 2000 bytes
            operations_per_second: 1000,
            ..Default::default()
        };
        let limiter = BandwidthLimiter::new(config);

        let start = TokioInstant::now();

        // Process 1500 bytes (within the burst allowance, should not be limited)
        limiter.wait_for_bytes(1500).await.unwrap();

        // Process another 600 bytes (pushes past the burst limit, so the
        // limiter must block until the per-second window resets)
        limiter.wait_for_bytes(600).await.unwrap();

        let elapsed = start.elapsed();
        assert!(elapsed >= Duration::from_millis(100)); // Limiting must have introduced a delay
    }

    #[tokio::test]
    async fn test_operation_limiting() {
        let config = BandwidthConfig {
            bytes_per_second: 1000000, // 1 MB/s
            operations_per_second: 10, // 10 ops/s, so the 2x burst limit is 20 ops
            ..Default::default()
        };
        let limiter = BandwidthLimiter::new(config);

        // Process 25 operations; the last five exceed the burst limit and each
        // incurs a throttling sleep. The per-operation sleeps are sub-millisecond,
        // so assert on the recorded statistics rather than on wall-clock time.
        for _ in 0..25 {
            limiter.wait_for_operation().await.unwrap();
        }

        let stats = limiter.statistics().await;
        assert_eq!(stats.total_operations_processed, 25);
    }

    #[tokio::test]
    async fn test_statistics() {
        let config = BandwidthConfig::default();
        let limiter = BandwidthLimiter::new(config);

        limiter.wait_for_bytes(1000).await.unwrap();
        limiter.wait_for_operation().await.unwrap();

        let stats = limiter.statistics().await;
        assert_eq!(stats.total_bytes_processed, 1000);
        assert_eq!(stats.total_operations_processed, 1);
        assert!(stats.uptime > Duration::ZERO);
    }
}
591
crates/ahm/src/scanner/disk_scanner.rs
Normal file
@@ -0,0 +1,591 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    path::Path,
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{error, info};

use crate::error::Result;
use super::{HealthIssue, HealthIssueType, Severity};

/// Configuration for disk scanning
#[derive(Debug, Clone)]
pub struct DiskScannerConfig {
    /// Scan interval for disk health checks
    pub scan_interval: Duration,
    /// Minimum free space threshold (percentage)
    pub min_free_space_percent: f64,
    /// Maximum disk usage threshold (percentage)
    pub max_disk_usage_percent: f64,
    /// Minimum inode usage threshold (percentage)
    pub min_inode_usage_percent: f64,
    /// Maximum inode usage threshold (percentage)
    pub max_inode_usage_percent: f64,
    /// Whether to check disk I/O performance
    pub check_io_performance: bool,
    /// Whether to check disk temperature (if available)
    pub check_temperature: bool,
    /// Whether to check disk SMART status (if available)
    pub check_smart_status: bool,
    /// Timeout for individual disk operations
    pub operation_timeout: Duration,
    /// Maximum number of concurrent disk scans
    pub max_concurrent_scans: usize,
}

impl Default for DiskScannerConfig {
    fn default() -> Self {
        Self {
            scan_interval: Duration::from_secs(300), // 5 minutes
            min_free_space_percent: 10.0,            // 10% minimum free space
            max_disk_usage_percent: 90.0,            // 90% maximum usage
            min_inode_usage_percent: 5.0,            // 5% minimum inode usage
            max_inode_usage_percent: 95.0,           // 95% maximum inode usage
            check_io_performance: true,
            check_temperature: false,  // Disabled by default
            check_smart_status: false, // Disabled by default
            operation_timeout: Duration::from_secs(30),
            max_concurrent_scans: 4,
        }
    }
}

/// Disk information and health status
#[derive(Debug, Clone)]
pub struct DiskInfo {
    pub device_path: String,
    pub mount_point: String,
    pub filesystem_type: String,
    pub total_space: u64,
    pub used_space: u64,
    pub free_space: u64,
    pub available_space: u64,
    pub usage_percent: f64,
    pub inode_total: Option<u64>,
    pub inode_used: Option<u64>,
    pub inode_free: Option<u64>,
    pub inode_usage_percent: Option<f64>,
    pub last_scan_time: SystemTime,
    pub health_status: DiskHealthStatus,
    pub performance_metrics: Option<DiskPerformanceMetrics>,
    pub temperature: Option<f64>,
    pub smart_status: Option<SmartStatus>,
}

/// Disk health status
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DiskHealthStatus {
    Healthy,
    Warning,
    Critical,
    Unknown,
}

/// Disk performance metrics
#[derive(Debug, Clone)]
pub struct DiskPerformanceMetrics {
    pub read_bytes_per_sec: f64,
    pub write_bytes_per_sec: f64,
    pub read_operations_per_sec: f64,
    pub write_operations_per_sec: f64,
    pub average_response_time_ms: f64,
    pub queue_depth: f64,
    pub utilization_percent: f64,
    pub last_updated: SystemTime,
}

/// SMART status information
#[derive(Debug, Clone)]
pub struct SmartStatus {
    pub overall_health: SmartHealthStatus,
    pub temperature: Option<f64>,
    pub power_on_hours: Option<u64>,
    pub reallocated_sectors: Option<u64>,
    pub pending_sectors: Option<u64>,
    pub uncorrectable_sectors: Option<u64>,
    pub attributes: HashMap<String, SmartAttribute>,
}

/// SMART health status
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SmartHealthStatus {
    Passed,
    Failed,
    Unknown,
}

/// SMART attribute
#[derive(Debug, Clone)]
pub struct SmartAttribute {
    pub name: String,
    pub value: u64,
    pub worst: u64,
    pub threshold: u64,
    pub status: SmartAttributeStatus,
}

/// SMART attribute status
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SmartAttributeStatus {
    Good,
    Warning,
    Critical,
    Unknown,
}

/// Result of scanning a single disk
#[derive(Debug, Clone)]
pub struct DiskScanResult {
    pub disk_info: DiskInfo,
    pub health_issues: Vec<HealthIssue>,
    pub scan_duration: Duration,
    pub success: bool,
    pub error_message: Option<String>,
}

/// Disk scanner for monitoring disk health and performance
pub struct DiskScanner {
    config: DiskScannerConfig,
    statistics: Arc<RwLock<DiskScannerStatistics>>,
    last_scan_results: Arc<RwLock<HashMap<String, DiskScanResult>>>,
}

/// Statistics for disk scanning
#[derive(Debug, Clone, Default)]
pub struct DiskScannerStatistics {
    pub disks_scanned: u64,
    pub disks_with_issues: u64,
    pub total_issues_found: u64,
    pub total_scan_time: Duration,
    pub average_scan_time: Duration,
    pub last_scan_time: Option<SystemTime>,
    pub scan_cycles_completed: u64,
    pub scan_cycles_failed: u64,
}

impl DiskScanner {
    /// Create a new disk scanner
    pub fn new(config: DiskScannerConfig) -> Self {
        Self {
            config,
            statistics: Arc::new(RwLock::new(DiskScannerStatistics::default())),
            last_scan_results: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Scan all mounted disks
    pub async fn scan_all_disks(&self) -> Result<Vec<DiskScanResult>> {
        let scan_start = Instant::now();
        let mut results = Vec::new();

        // Get list of mounted filesystems
        let mount_points = self.get_mount_points().await?;

        info!("Starting disk scan for {} mount points", mount_points.len());

        // Scan each mount point
        for mount_point in mount_points {
            match self.scan_disk(&mount_point).await {
                Ok(result) => {
                    results.push(result.clone());

                    // Store result for later reference
                    let mut last_results = self.last_scan_results.write().await;
                    last_results.insert(mount_point.clone(), result);
                }
                Err(e) => {
                    error!("Failed to scan disk at {}: {}", mount_point, e);

                    // Create error result
                    let error_result = DiskScanResult {
                        disk_info: DiskInfo {
                            device_path: "unknown".to_string(),
                            mount_point: mount_point.clone(),
                            filesystem_type: "unknown".to_string(),
                            total_space: 0,
                            used_space: 0,
                            free_space: 0,
                            available_space: 0,
                            usage_percent: 0.0,
                            inode_total: None,
                            inode_used: None,
                            inode_free: None,
                            inode_usage_percent: None,
                            last_scan_time: SystemTime::now(),
                            health_status: DiskHealthStatus::Unknown,
                            performance_metrics: None,
                            temperature: None,
                            smart_status: None,
                        },
                        health_issues: vec![HealthIssue {
                            issue_type: HealthIssueType::DiskReadError,
                            severity: Severity::High,
                            bucket: "system".to_string(),
                            object: mount_point.clone(),
                            description: format!("Failed to scan disk: {}", e),
                            metadata: None,
                        }],
                        scan_duration: scan_start.elapsed(),
                        success: false,
                        error_message: Some(e.to_string()),
                    };

                    results.push(error_result);
                }
            }
        }

        // Update statistics
        self.update_statistics(|stats| {
            stats.disks_scanned += results.len() as u64;
            stats.disks_with_issues += results.iter().filter(|r| !r.health_issues.is_empty()).count() as u64;
            stats.total_issues_found += results.iter().map(|r| r.health_issues.len() as u64).sum::<u64>();
            stats.total_scan_time += scan_start.elapsed();
            stats.average_scan_time =
                Duration::from_millis(stats.total_scan_time.as_millis() as u64 / stats.disks_scanned.max(1));
            stats.last_scan_time = Some(SystemTime::now());
            stats.scan_cycles_completed += 1;
        })
        .await;

        info!(
            "Disk scan completed: {} disks, {} issues found in {:?}",
            results.len(),
            results.iter().map(|r| r.health_issues.len()).sum::<usize>(),
            scan_start.elapsed()
        );

        Ok(results)
    }

    /// Scan a single disk
    pub async fn scan_disk(&self, mount_point: &str) -> Result<DiskScanResult> {
        let scan_start = Instant::now();
        let mut health_issues = Vec::new();

        // Get disk space information
        let disk_info = self.get_disk_info(mount_point).await?;

        // Check disk space usage
        if disk_info.usage_percent > self.config.max_disk_usage_percent {
            health_issues.push(HealthIssue {
                issue_type: HealthIssueType::DiskFull,
                severity: if disk_info.usage_percent > 95.0 { Severity::Critical } else { Severity::High },
                bucket: "system".to_string(),
                object: mount_point.to_string(),
                description: format!(
                    "Disk usage is {:.1}%, exceeds threshold of {:.1}%",
                    disk_info.usage_percent, self.config.max_disk_usage_percent
                ),
                metadata: None,
            });
        }

        // Check remaining free space (free = 100% - usage)
        let free_space_percent = 100.0 - disk_info.usage_percent;
        if free_space_percent < self.config.min_free_space_percent {
            health_issues.push(HealthIssue {
                issue_type: HealthIssueType::DiskFull,
                severity: Severity::Medium,
                bucket: "system".to_string(),
                object: mount_point.to_string(),
                description: format!(
                    "Free space is only {:.1}%, below threshold of {:.1}%",
                    free_space_percent, self.config.min_free_space_percent
                ),
                metadata: None,
            });
        }

        // Check inode usage if available
        if let Some(inode_usage) = disk_info.inode_usage_percent {
            if inode_usage > self.config.max_inode_usage_percent {
                health_issues.push(HealthIssue {
                    issue_type: HealthIssueType::DiskFull,
                    severity: if inode_usage > 95.0 { Severity::Critical } else { Severity::High },
                    bucket: "system".to_string(),
                    object: mount_point.to_string(),
                    description: format!(
                        "Inode usage is {:.1}%, exceeds threshold of {:.1}%",
                        inode_usage, self.config.max_inode_usage_percent
                    ),
                    metadata: None,
                });
            }
        }

        // Check I/O performance if enabled
        if self.config.check_io_performance {
            if let Some(metrics) = &disk_info.performance_metrics {
                if metrics.utilization_percent > 90.0 {
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::DiskReadError,
                        severity: Severity::Medium,
                        bucket: "system".to_string(),
                        object: mount_point.to_string(),
                        description: format!("High disk utilization: {:.1}%", metrics.utilization_percent),
                        metadata: None,
                    });
                }

                if metrics.average_response_time_ms > 100.0 {
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::DiskReadError,
                        severity: Severity::Medium,
                        bucket: "system".to_string(),
                        object: mount_point.to_string(),
                        description: format!("High disk response time: {:.1}ms", metrics.average_response_time_ms),
                        metadata: None,
                    });
                }
            }
        }

        // Check temperature if enabled
        if self.config.check_temperature {
            if let Some(temp) = disk_info.temperature {
                if temp > 60.0 {
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::DiskReadError,
                        severity: if temp > 70.0 { Severity::Critical } else { Severity::High },
                        bucket: "system".to_string(),
                        object: mount_point.to_string(),
                        description: format!("High disk temperature: {:.1}°C", temp),
                        metadata: None,
                    });
                }
            }
        }

        // Check SMART status if enabled
        if self.config.check_smart_status {
            if let Some(smart) = &disk_info.smart_status {
                if smart.overall_health == SmartHealthStatus::Failed {
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::DiskReadError,
                        severity: Severity::Critical,
                        bucket: "system".to_string(),
                        object: mount_point.to_string(),
                        description: "SMART health check failed".to_string(),
                        metadata: None,
                    });
                }
            }
        }

        let scan_duration = scan_start.elapsed();
        // A scan is reported as fully successful only when no issues were found;
        // scan errors themselves surface through `error_message`.
        let success = health_issues.is_empty();

        Ok(DiskScanResult {
            disk_info,
            health_issues,
            scan_duration,
            success,
            error_message: None,
        })
    }
|
||||
|
||||
/// Get list of mounted filesystems
|
||||
async fn get_mount_points(&self) -> Result<Vec<String>> {
|
||||
// TODO: Implement actual mount point detection
|
||||
// For now, return common mount points
|
||||
Ok(vec![
|
||||
"/".to_string(),
|
||||
"/data".to_string(),
|
||||
"/var".to_string(),
|
||||
])
|
||||
}
|
||||
|
||||
/// Get disk information for a mount point
|
||||
async fn get_disk_info(&self, mount_point: &str) -> Result<DiskInfo> {
|
||||
let path = Path::new(mount_point);
|
||||
|
||||
// Get filesystem statistics using std::fs instead of nix for now
|
||||
let _metadata = match std::fs::metadata(path) {
|
||||
Ok(metadata) => metadata,
|
||||
Err(e) => {
|
||||
return Err(crate::error::Error::Other(anyhow::anyhow!("Failed to get filesystem stats: {}", e)));
|
||||
}
|
||||
};
|
||||
|
||||
// For now, use placeholder values since we can't easily get filesystem stats
|
||||
let total_space = 1000000000; // 1GB placeholder
|
||||
let free_space = 500000000; // 500MB placeholder
|
||||
let available_space = 450000000; // 450MB placeholder
|
||||
let used_space = total_space - free_space;
|
||||
let usage_percent = (used_space as f64 / total_space as f64) * 100.0;
|
||||
|
||||
// Get inode information (placeholder)
|
||||
let inode_total = Some(1000000);
|
||||
let inode_free = Some(500000);
|
||||
let inode_used = Some(500000);
|
||||
let inode_usage_percent = Some(50.0);
|
||||
|
||||
// Get filesystem type
|
||||
let filesystem_type = self.get_filesystem_type(mount_point).await.unwrap_or_else(|_| "unknown".to_string());
|
||||
|
||||
// Get device path
|
||||
let device_path = self.get_device_path(mount_point).await.unwrap_or_else(|_| "unknown".to_string());
|
||||
|
||||
// Get performance metrics if enabled
|
||||
let performance_metrics = if self.config.check_io_performance {
|
||||
self.get_performance_metrics(&device_path).await.ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Get temperature if enabled
|
||||
let temperature = if self.config.check_temperature {
|
||||
self.get_disk_temperature(&device_path).await.ok().flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Get SMART status if enabled
|
||||
let smart_status = if self.config.check_smart_status {
|
||||
self.get_smart_status(&device_path).await.ok().flatten()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Determine health status (placeholder - will be set by scan_disk method)
|
||||
let health_status = DiskHealthStatus::Healthy;
|
||||
|
||||
Ok(DiskInfo {
|
||||
device_path,
|
||||
mount_point: mount_point.to_string(),
|
||||
filesystem_type,
|
||||
total_space,
|
||||
used_space,
|
||||
free_space,
|
||||
available_space,
|
||||
usage_percent,
|
||||
inode_total,
|
||||
inode_used,
|
||||
inode_free,
|
||||
inode_usage_percent,
|
||||
last_scan_time: SystemTime::now(),
|
||||
health_status,
|
||||
performance_metrics,
|
||||
temperature,
|
||||
smart_status,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get filesystem type for a mount point
|
||||
async fn get_filesystem_type(&self, _mount_point: &str) -> Result<String> {
|
||||
// TODO: Implement filesystem type detection
|
||||
// For now, return a placeholder
|
||||
Ok("ext4".to_string())
|
||||
}
|
||||
|
||||
/// Get device path for a mount point
|
||||
async fn get_device_path(&self, _mount_point: &str) -> Result<String> {
|
||||
// TODO: Implement device path detection
|
||||
// For now, return a placeholder
|
||||
Ok("/dev/sda1".to_string())
|
||||
}
|
||||
|
||||
/// Get disk performance metrics
|
||||
async fn get_performance_metrics(&self, _device_path: &str) -> Result<DiskPerformanceMetrics> {
|
||||
// TODO: Implement performance metrics collection
|
||||
// For now, return placeholder metrics
|
||||
Ok(DiskPerformanceMetrics {
|
||||
read_bytes_per_sec: 1000000.0, // 1MB/s
|
||||
write_bytes_per_sec: 500000.0, // 500KB/s
|
||||
read_operations_per_sec: 100.0,
|
||||
write_operations_per_sec: 50.0,
|
||||
average_response_time_ms: 5.0,
|
||||
queue_depth: 1.0,
|
||||
utilization_percent: 10.0,
|
||||
last_updated: SystemTime::now(),
|
||||
})
|
||||
}

    /// Get disk temperature
    async fn get_disk_temperature(&self, _device_path: &str) -> Result<Option<f64>> {
        // TODO: Implement temperature monitoring
        // For now, return None (temperature not available)
        Ok(None)
    }

    /// Get SMART status
    async fn get_smart_status(&self, _device_path: &str) -> Result<Option<SmartStatus>> {
        // TODO: Implement SMART status checking
        // For now, return None (SMART not available)
        Ok(None)
    }
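
    // Both TODOs above could be backed by smartmontools, which reports SMART
    // health and drive temperature as JSON. A minimal sketch, assuming
    // `smartctl` is installed and a `serde_json` dependency exists (neither is
    // part of this commit); `query_smart_temperature` is a hypothetical helper:
    fn query_smart_temperature(device_path: &str) -> Option<f64> {
        let output = std::process::Command::new("smartctl")
            .args(["-j", "-A", device_path])
            .output()
            .ok()?;
        if !output.status.success() {
            return None;
        }
        let json: serde_json::Value = serde_json::from_slice(&output.stdout).ok()?;
        // smartctl's JSON schema nests the current reading under
        // "temperature" -> "current".
        json["temperature"]["current"].as_f64()
    }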

    /// Update scanner statistics
    async fn update_statistics<F>(&self, update_fn: F)
    where
        F: FnOnce(&mut DiskScannerStatistics),
    {
        let mut stats = self.statistics.write().await;
        update_fn(&mut stats);
    }

    /// Get current statistics
    pub async fn statistics(&self) -> DiskScannerStatistics {
        self.statistics.read().await.clone()
    }

    /// Get last scan results
    pub async fn last_scan_results(&self) -> HashMap<String, DiskScanResult> {
        self.last_scan_results.read().await.clone()
    }

    /// Reset statistics
    pub async fn reset_statistics(&self) {
        let mut stats = self.statistics.write().await;
        *stats = DiskScannerStatistics::default();
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_disk_scanner_creation() {
        let config = DiskScannerConfig::default();
        let scanner = DiskScanner::new(config);
        assert_eq!(scanner.statistics().await.disks_scanned, 0);
    }

    #[tokio::test]
    async fn test_disk_info_creation() {
        let disk_info = DiskInfo {
            device_path: "/dev/sda1".to_string(),
            mount_point: "/".to_string(),
            filesystem_type: "ext4".to_string(),
            total_space: 1000000000,
            used_space: 500000000,
            free_space: 500000000,
            available_space: 450000000,
            usage_percent: 50.0,
            inode_total: Some(1000000),
            inode_used: Some(500000),
            inode_free: Some(500000),
            inode_usage_percent: Some(50.0),
            last_scan_time: SystemTime::now(),
            health_status: DiskHealthStatus::Healthy,
            performance_metrics: None,
            temperature: None,
            smart_status: None,
        };

        assert_eq!(disk_info.usage_percent, 50.0);
        assert_eq!(disk_info.health_status, DiskHealthStatus::Healthy);
    }
}

536
crates/ahm/src/scanner/engine.rs
Normal file
@@ -0,0 +1,536 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    path::{Path, PathBuf},
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::{sync::RwLock, time::sleep};
use tracing::{error, info, warn};
use tokio_util::sync::CancellationToken;

use crate::core::Status;
use crate::{core, error::Result, metrics, SystemEvent};
use super::{HealthIssue, HealthIssueType, Severity};

/// Represents a discovered object during scanning
#[derive(Debug, Clone)]
pub struct ScannedObject {
    pub bucket: String,
    pub object: String,
    pub version_id: Option<String>,
    pub path: PathBuf,
    pub size: u64,
    pub modified_time: SystemTime,
    pub metadata: HashMap<String, String>,
    pub health_issues: Vec<HealthIssue>,
}

/// Configuration for the scanner engine
#[derive(Debug, Clone)]
pub struct EngineConfig {
    /// Root directory to scan
    pub root_path: String,
    /// Maximum number of concurrent scan workers
    pub max_workers: usize,
    /// Scan interval between cycles
    pub scan_interval: Duration,
    /// Bandwidth limit for scanning (bytes per second)
    pub bandwidth_limit: Option<u64>,
    /// Whether to enable deep scanning (bitrot detection)
    pub enable_deep_scan: bool,
    /// Probability of healing objects during scan (1 in N)
    pub heal_probability: u32,
    /// Maximum folders to scan before compacting
    pub max_folders_before_compact: u64,
    /// Sleep duration between folder scans
    pub folder_sleep_duration: Duration,
}

impl Default for EngineConfig {
    fn default() -> Self {
        Self {
            root_path: "/data".to_string(),
            max_workers: 4,
            scan_interval: Duration::from_secs(300), // 5 minutes
            bandwidth_limit: None,
            enable_deep_scan: false,
            heal_probability: 1024, // 1 in 1024 objects
            max_folders_before_compact: 10000,
            folder_sleep_duration: Duration::from_millis(1),
        }
    }
}

/// Scanner statistics
#[derive(Debug, Clone, Default)]
pub struct ScannerStatistics {
    pub objects_scanned: u64,
    pub bytes_scanned: u64,
    pub issues_found: u64,
    pub scan_duration: Duration,
    pub scan_rate_objects_per_sec: f64,
    pub scan_rate_bytes_per_sec: f64,
    pub folders_scanned: u64,
    pub objects_with_issues: u64,
}

/// Main scanner engine
pub struct Engine {
    config: EngineConfig,
    coordinator: Arc<core::Coordinator>,
    metrics: Arc<metrics::Collector>,
    cancel_token: CancellationToken,
    status: Arc<RwLock<Status>>,
    statistics: Arc<RwLock<ScannerStatistics>>,
    scan_cycle: Arc<RwLock<u64>>,
}

impl Engine {
    /// Create a new scanner engine
    pub async fn new(
        config: EngineConfig,
        coordinator: Arc<core::Coordinator>,
        metrics: Arc<metrics::Collector>,
        cancel_token: CancellationToken,
    ) -> Result<Self> {
        let engine = Self {
            config,
            coordinator,
            metrics,
            cancel_token,
            status: Arc::new(RwLock::new(Status::Initializing)),
            statistics: Arc::new(RwLock::new(ScannerStatistics::default())),
            scan_cycle: Arc::new(RwLock::new(0)),
        };

        info!("Scanner engine created with config: {:?}", engine.config);
        Ok(engine)
    }

    /// Start the scanner engine
    pub async fn start(&self) -> Result<()> {
        info!("Starting scanner engine");
        *self.status.write().await = Status::Running;

        let engine = self.clone_for_background();
        tokio::spawn(async move {
            if let Err(e) = engine.run_scan_loop().await {
                error!("Scanner engine error: {}", e);
            }
        });

        Ok(())
    }

    /// Stop the scanner engine
    pub async fn stop(&self) -> Result<()> {
        info!("Stopping scanner engine");
        *self.status.write().await = Status::Stopping;
        // Signal the background loop; it exits at its next select point.
        self.cancel_token.cancel();
        *self.status.write().await = Status::Stopped;
        Ok(())
    }

    /// Get current status
    pub async fn status(&self) -> Status {
        self.status.read().await.clone()
    }

    /// Get current statistics
    pub async fn statistics(&self) -> ScannerStatistics {
        self.statistics.read().await.clone()
    }

    /// Clone the engine for background tasks
    fn clone_for_background(&self) -> Arc<Self> {
        Arc::new(Self {
            config: self.config.clone(),
            coordinator: self.coordinator.clone(),
            metrics: self.metrics.clone(),
            cancel_token: self.cancel_token.clone(),
            status: self.status.clone(),
            statistics: self.statistics.clone(),
            scan_cycle: self.scan_cycle.clone(),
        })
    }

    /// Main scan loop
    async fn run_scan_loop(&self) -> Result<()> {
        info!("Scanner engine loop started");

        loop {
            tokio::select! {
                _ = self.cancel_token.cancelled() => {
                    info!("Scanner engine received cancellation signal");
                    break;
                }
                _ = sleep(self.config.scan_interval) => {
                    if let Err(e) = self.run_scan_cycle().await {
                        error!("Scan cycle failed: {}", e);
                    }
                }
            }
        }

        Ok(())
    }

    /// Run a single scan cycle
    async fn run_scan_cycle(&self) -> Result<()> {
        let cycle_start = Instant::now();
        let cycle = {
            let mut cycle_guard = self.scan_cycle.write().await;
            *cycle_guard += 1;
            *cycle_guard
        };

        info!("Starting scan cycle {}", cycle);

        // Reset statistics for new cycle
        {
            let mut stats = self.statistics.write().await;
            *stats = ScannerStatistics::default();
        }

        // Scan the root directory
        let scan_result = self.scan_directory(&self.config.root_path).await?;

        // Update statistics
        {
            let mut stats = self.statistics.write().await;
            stats.scan_duration = cycle_start.elapsed();
            stats.objects_scanned = scan_result.objects.len() as u64;
            stats.bytes_scanned = scan_result.total_size;
            stats.issues_found = scan_result.total_issues;
            stats.folders_scanned = scan_result.folders_scanned;
            stats.objects_with_issues = scan_result.objects_with_issues;

            // Use fractional seconds so sub-second cycles still produce rates
            let secs = stats.scan_duration.as_secs_f64();
            if secs > 0.0 {
                stats.scan_rate_objects_per_sec = stats.objects_scanned as f64 / secs;
                stats.scan_rate_bytes_per_sec = stats.bytes_scanned as f64 / secs;
            }
        }

        // Publish scan completion event
        let scan_report = crate::scanner::ScanReport {
            scan_id: cycle.to_string(),
            status: "completed".to_string(),
            summary: format!("Scanned {} objects, found {} issues", scan_result.objects.len(), scan_result.total_issues),
            issues_found: scan_result.total_issues,
        };

        self.coordinator.publish_event(SystemEvent::ScanCompleted(scan_report)).await?;

        info!(
            "Scan cycle {} completed: {} objects, {} bytes, {} issues in {:?}",
            cycle,
            scan_result.objects.len(),
            scan_result.total_size,
            scan_result.total_issues,
            cycle_start.elapsed()
        );

        Ok(())
    }

    /// Scan a directory recursively
    async fn scan_directory(&self, path: &str) -> Result<ScanResult> {
        let mut result = ScanResult::default();
        let path_buf = PathBuf::from(path);

        if !path_buf.exists() {
            warn!("Scan path does not exist: {}", path);
            return Ok(result);
        }

        if !path_buf.is_dir() {
            warn!("Scan path is not a directory: {}", path);
            return Ok(result);
        }

        self.scan_directory_recursive(&path_buf, &mut result).await?;
        Ok(result)
    }

    /// Recursively scan a directory
    async fn scan_directory_recursive(&self, dir_path: &Path, result: &mut ScanResult) -> Result<()> {
        result.folders_scanned += 1;

        // Check for cancellation
        if self.cancel_token.is_cancelled() {
            return Ok(());
        }

        let entries = match std::fs::read_dir(dir_path) {
            Ok(entries) => entries,
            Err(e) => {
                warn!("Failed to read directory {}: {}", dir_path.display(), e);
                return Ok(());
            }
        };

        for entry in entries {
            if self.cancel_token.is_cancelled() {
                break;
            }

            let entry = match entry {
                Ok(entry) => entry,
                Err(e) => {
                    warn!("Failed to read directory entry: {}", e);
                    continue;
                }
            };

            let file_path = entry.path();
            let entry_name = file_path.file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("unknown");

            // Skip hidden files and system files (this also covers "." and "..")
            if entry_name.starts_with('.') {
                continue;
            }

            if file_path.is_dir() {
                // Recursively scan subdirectories
                Box::pin(self.scan_directory_recursive(&file_path, result)).await?;
            } else if file_path.is_file() {
                // Scan individual file
                if let Some(scanned_object) = self.scan_object(&file_path).await? {
                    result.objects.push(scanned_object.clone());
                    result.total_size += scanned_object.size;

                    if !scanned_object.health_issues.is_empty() {
                        result.objects_with_issues += 1;
                        result.total_issues += scanned_object.health_issues.len() as u64;

                        // Publish health issues
                        for issue in &scanned_object.health_issues {
                            let health_issue = crate::scanner::HealthIssue {
                                issue_type: issue.issue_type.clone(),
                                severity: issue.severity,
                                bucket: scanned_object.bucket.clone(),
                                object: scanned_object.object.clone(),
                                description: issue.description.clone(),
                                metadata: None, // TODO: Convert HashMap to ObjectMetadata
                            };

                            self.coordinator.publish_event(SystemEvent::HealthIssueDetected(health_issue)).await?;
                        }
                    }

                    // Publish object discovered event
                    let metadata = crate::ObjectMetadata {
                        size: scanned_object.size,
                        mod_time: scanned_object.modified_time.duration_since(SystemTime::UNIX_EPOCH)
                            .unwrap_or_default()
                            .as_secs() as i64,
                        content_type: "application/octet-stream".to_string(),
                        etag: "".to_string(), // TODO: Calculate actual ETag
                    };

                    self.coordinator.publish_event(SystemEvent::ObjectDiscovered {
                        bucket: scanned_object.bucket.clone(),
                        object: scanned_object.object.clone(),
                        version_id: scanned_object.version_id.clone(),
                        metadata,
                    }).await?;
                }
            }

            // Sleep between items to avoid overwhelming the system
            sleep(self.config.folder_sleep_duration).await;
        }

        Ok(())
    }

    /// Scan a single object file
    async fn scan_object(&self, file_path: &Path) -> Result<Option<ScannedObject>> {
        let metadata = match std::fs::metadata(file_path) {
            Ok(metadata) => metadata,
            Err(e) => {
                warn!("Failed to read file metadata {}: {}", file_path.display(), e);
                return Ok(None);
            }
        };

        // Extract bucket and object from path
        let (bucket, object) = self.extract_bucket_object_from_path(file_path)?;
        if bucket.is_empty() || object.is_empty() {
            return Ok(None);
        }

        // Check for health issues
        let health_issues = self.check_object_health(file_path, &metadata).await?;

        let scanned_object = ScannedObject {
            bucket,
            object,
            version_id: None, // TODO: Extract version ID from path
            path: file_path.to_path_buf(),
            size: metadata.len(),
            modified_time: metadata.modified().unwrap_or_else(|_| SystemTime::now()),
            metadata: HashMap::new(), // TODO: Extract metadata
            health_issues,
        };

        Ok(Some(scanned_object))
    }

    /// Extract bucket and object name from file path
    fn extract_bucket_object_from_path(&self, file_path: &Path) -> Result<(String, String)> {
        let root_path = Path::new(&self.config.root_path);

        if let Ok(relative_path) = file_path.strip_prefix(root_path) {
            let components: Vec<&str> = relative_path.components()
                .filter_map(|c| c.as_os_str().to_str())
                .collect();

            if components.len() >= 2 {
                let bucket = components[0].to_string();
                let object = components[1..].join("/");
                return Ok((bucket, object));
            }
        }

        Ok((String::new(), String::new()))
    }

    /// Check object health and detect issues
    async fn check_object_health(&self, file_path: &Path, metadata: &std::fs::Metadata) -> Result<Vec<HealthIssue>> {
        let mut issues = Vec::new();

        // Extract bucket and object from path for health issues
        let (bucket, object) = self.extract_bucket_object_from_path(file_path)?;

        // Check file size
        if metadata.len() == 0 {
            issues.push(HealthIssue {
                issue_type: HealthIssueType::ObjectTooSmall,
                severity: Severity::Low,
                bucket: bucket.clone(),
                object: object.clone(),
                description: "Object has zero size".to_string(),
                metadata: None,
            });
        }

        // Check file permissions
        if !metadata.permissions().readonly() {
            issues.push(HealthIssue {
                issue_type: HealthIssueType::PolicyViolation,
                severity: Severity::Medium,
                bucket: bucket.clone(),
                object: object.clone(),
                description: "Object is not read-only".to_string(),
                metadata: None,
            });
        }

        // TODO: Add more health checks:
        // - Checksum verification
        // - Replication status
        // - Encryption status
        // - Metadata consistency
        // - Disk health

        Ok(issues)
    }

    /// Start scanning operations
    pub async fn start_scan(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Running;
        info!("Scanning operations started");
        Ok(())
    }

    /// Stop scanning operations
    pub async fn stop_scan(&self) -> Result<()> {
        let mut status = self.status.write().await;
        *status = Status::Stopped;
        info!("Scanning operations stopped");
        Ok(())
    }

    /// Get engine configuration
    pub async fn get_config(&self) -> EngineConfig {
        self.config.clone()
    }
}

/// Result of a scan operation
#[derive(Debug, Clone, Default)]
pub struct ScanResult {
    pub objects: Vec<ScannedObject>,
    pub total_size: u64,
    pub total_issues: u64,
    pub folders_scanned: u64,
    pub objects_with_issues: u64,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_engine_creation() {
        let config = EngineConfig::default();
        let coordinator = Arc::new(core::Coordinator::new(
            core::CoordinatorConfig::default(),
            Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()),
            CancellationToken::new(),
        ).await.unwrap());
        let metrics = Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap());
        let cancel_token = CancellationToken::new();

        let engine = Engine::new(config, coordinator, metrics, cancel_token).await;
        assert!(engine.is_ok());
    }

    #[tokio::test]
    async fn test_path_extraction() {
        let config = EngineConfig {
            root_path: "/data".to_string(),
            ..Default::default()
        };
        let coordinator = Arc::new(core::Coordinator::new(
            core::CoordinatorConfig::default(),
            Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap()),
            CancellationToken::new(),
        ).await.unwrap());
        let metrics = Arc::new(metrics::Collector::new(metrics::CollectorConfig::default()).await.unwrap());
        let cancel_token = CancellationToken::new();

        let engine = Engine::new(config, coordinator, metrics, cancel_token).await.unwrap();

        let test_path = Path::new("/data/bucket1/object1.txt");
        let (bucket, object) = engine.extract_bucket_object_from_path(test_path).unwrap();

        assert_eq!(bucket, "bucket1");
        assert_eq!(object, "object1.txt");
    }
}

526
crates/ahm/src/scanner/metrics_collector.rs
Normal file
@@ -0,0 +1,526 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use tokio::sync::RwLock;
use tracing::{error, info};

use crate::error::Result;
use super::{HealthIssue, HealthIssueType, Severity};

/// Configuration for metrics collection
#[derive(Debug, Clone)]
pub struct MetricsConfig {
    /// Collection interval for metrics
    pub collection_interval: Duration,
    /// Retention period for historical metrics
    pub retention_period: Duration,
    /// Maximum number of data points to keep in memory
    pub max_data_points: usize,
    /// Whether to enable detailed metrics collection
    pub enable_detailed_metrics: bool,
    /// Whether to enable performance profiling
    pub enable_profiling: bool,
    /// Whether to enable resource usage tracking
    pub enable_resource_tracking: bool,
}

impl Default for MetricsConfig {
    fn default() -> Self {
        Self {
            collection_interval: Duration::from_secs(60),     // 1 minute
            retention_period: Duration::from_secs(3600 * 24), // 24 hours
            max_data_points: 1440, // 24 hours worth of minute-level data
            enable_detailed_metrics: true,
            enable_profiling: false,
            enable_resource_tracking: true,
        }
    }
}

/// Scanner performance metrics
#[derive(Debug, Clone)]
pub struct ScannerMetrics {
    /// Objects scanned per second
    pub objects_per_second: f64,
    /// Bytes scanned per second
    pub bytes_per_second: f64,
    /// Average scan time per object
    pub avg_scan_time_per_object: Duration,
    /// Total objects scanned in current cycle
    pub total_objects_scanned: u64,
    /// Total bytes scanned in current cycle
    pub total_bytes_scanned: u64,
    /// Number of health issues detected
    pub health_issues_detected: u64,
    /// Scan success rate (percentage)
    pub success_rate: f64,
    /// Current scan cycle duration
    pub current_cycle_duration: Duration,
    /// Average scan cycle duration
    pub avg_cycle_duration: Duration,
    /// Last scan completion time
    pub last_scan_completion: Option<SystemTime>,
}

/// Resource usage metrics
#[derive(Debug, Clone)]
pub struct ResourceMetrics {
    /// CPU usage percentage
    pub cpu_usage_percent: f64,
    /// Memory usage in bytes
    pub memory_usage_bytes: u64,
    /// Memory usage percentage
    pub memory_usage_percent: f64,
    /// Disk I/O operations per second
    pub disk_io_ops_per_sec: f64,
    /// Disk I/O bytes per second
    pub disk_io_bytes_per_sec: f64,
    /// Network I/O bytes per second
    pub network_io_bytes_per_sec: f64,
    /// Number of active threads
    pub active_threads: u32,
    /// Number of open file descriptors
    pub open_file_descriptors: u32,
}

/// Health metrics summary
#[derive(Debug, Clone)]
pub struct HealthMetrics {
    /// Total health issues by severity
    pub issues_by_severity: HashMap<Severity, u64>,
    /// Total health issues by type
    pub issues_by_type: HashMap<HealthIssueType, u64>,
    /// Objects with health issues
    pub objects_with_issues: u64,
    /// Percentage of objects with issues
    pub objects_with_issues_percent: f64,
    /// Last health check time
    pub last_health_check: SystemTime,
    /// Health score (0-100, higher is better)
    pub health_score: f64,
}

/// Historical metrics data point
#[derive(Debug, Clone)]
pub struct MetricsDataPoint {
    pub timestamp: SystemTime,
    pub scanner_metrics: ScannerMetrics,
    pub resource_metrics: ResourceMetrics,
    pub health_metrics: HealthMetrics,
}

/// Metrics collector for scanner system
pub struct MetricsCollector {
    config: MetricsConfig,
    current_metrics: Arc<RwLock<CurrentMetrics>>,
    historical_data: Arc<RwLock<Vec<MetricsDataPoint>>>,
    collection_start_time: Instant,
}

/// Current metrics state
#[derive(Debug, Clone)]
pub struct CurrentMetrics {
    pub scanner_metrics: ScannerMetrics,
    pub resource_metrics: ResourceMetrics,
    pub health_metrics: HealthMetrics,
    pub last_update: SystemTime,
}

impl Default for CurrentMetrics {
    /// Zeroed state, shared by `MetricsCollector::new` and `reset_metrics`.
    fn default() -> Self {
        Self {
            scanner_metrics: ScannerMetrics {
                objects_per_second: 0.0,
                bytes_per_second: 0.0,
                avg_scan_time_per_object: Duration::ZERO,
                total_objects_scanned: 0,
                total_bytes_scanned: 0,
                health_issues_detected: 0,
                success_rate: 100.0,
                current_cycle_duration: Duration::ZERO,
                avg_cycle_duration: Duration::ZERO,
                last_scan_completion: None,
            },
            resource_metrics: ResourceMetrics {
                cpu_usage_percent: 0.0,
                memory_usage_bytes: 0,
                memory_usage_percent: 0.0,
                disk_io_ops_per_sec: 0.0,
                disk_io_bytes_per_sec: 0.0,
                network_io_bytes_per_sec: 0.0,
                active_threads: 0,
                open_file_descriptors: 0,
            },
            health_metrics: HealthMetrics {
                issues_by_severity: HashMap::new(),
                issues_by_type: HashMap::new(),
                objects_with_issues: 0,
                objects_with_issues_percent: 0.0,
                last_health_check: SystemTime::now(),
                health_score: 100.0,
            },
            last_update: SystemTime::now(),
        }
    }
}

impl MetricsCollector {
    /// Create a new metrics collector
    pub fn new(config: MetricsConfig) -> Self {
        let collector = Self {
            config,
            current_metrics: Arc::new(RwLock::new(CurrentMetrics::default())),
            historical_data: Arc::new(RwLock::new(Vec::new())),
            collection_start_time: Instant::now(),
        };

        info!("Metrics collector created with config: {:?}", collector.config);
        collector
    }

    /// Start metrics collection
    pub async fn start_collection(&self) -> Result<()> {
        info!("Starting metrics collection");

        let collector = self.clone_for_background();
        tokio::spawn(async move {
            if let Err(e) = collector.run_collection_loop().await {
                error!("Metrics collection error: {}", e);
            }
        });

        Ok(())
    }

    /// Stop metrics collection
    pub async fn stop_collection(&self) -> Result<()> {
        // NOTE: currently a no-op; the background loop carries no stop signal
        // yet (see the cancellable sketch after `run_collection_loop` below).
        info!("Stopping metrics collection");
        Ok(())
    }

    /// Update scanner metrics
    pub async fn update_scanner_metrics(&self, metrics: ScannerMetrics) -> Result<()> {
        let mut current = self.current_metrics.write().await;
        current.scanner_metrics = metrics;
        current.last_update = SystemTime::now();
        Ok(())
    }

    /// Update resource metrics
    pub async fn update_resource_metrics(&self, metrics: ResourceMetrics) -> Result<()> {
        let mut current = self.current_metrics.write().await;
        current.resource_metrics = metrics;
        current.last_update = SystemTime::now();
        Ok(())
    }

    /// Update health metrics
    pub async fn update_health_metrics(&self, metrics: HealthMetrics) -> Result<()> {
        let mut current = self.current_metrics.write().await;
        current.health_metrics = metrics;
        current.last_update = SystemTime::now();
        Ok(())
    }

    /// Record a health issue
    pub async fn record_health_issue(&self, issue: &HealthIssue) -> Result<()> {
        let mut current = self.current_metrics.write().await;

        // Update severity count
        *current.health_metrics.issues_by_severity.entry(issue.severity).or_insert(0) += 1;

        // Update type count
        *current.health_metrics.issues_by_type.entry(issue.issue_type.clone()).or_insert(0) += 1;

        // Update scanner metrics
        current.scanner_metrics.health_issues_detected += 1;

        current.last_update = SystemTime::now();
        Ok(())
    }

    /// Get current metrics
    pub async fn current_metrics(&self) -> CurrentMetrics {
        self.current_metrics.read().await.clone()
    }

    /// Get historical metrics
    pub async fn historical_metrics(&self, duration: Duration) -> Vec<MetricsDataPoint> {
        let historical = self.historical_data.read().await;
        let cutoff_time = SystemTime::now() - duration;

        historical.iter()
            .filter(|point| point.timestamp >= cutoff_time)
            .cloned()
            .collect()
    }
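
    // Usage sketch for the query API above: fetch the data points recorded in
    // the last hour, e.g. for a dashboard or an admin endpoint.
    //
    //     let recent = collector.historical_metrics(Duration::from_secs(3600)).await;
    //     for point in &recent {
    //         println!("{:?}: {:.1} obj/s", point.timestamp, point.scanner_metrics.objects_per_second);
    //     }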

    /// Get metrics summary
    pub async fn metrics_summary(&self) -> MetricsSummary {
        let current = self.current_metrics.read().await;
        let historical = self.historical_data.read().await;

        let uptime = self.collection_start_time.elapsed();
        let total_data_points = historical.len();

        // Average a field across the historical data points (0.0 if empty)
        let avg_of = |value: fn(&MetricsDataPoint) -> f64| -> f64 {
            if historical.is_empty() {
                0.0
            } else {
                historical.iter().map(value).sum::<f64>() / historical.len() as f64
            }
        };

        let avg_objects_per_sec = avg_of(|p| p.scanner_metrics.objects_per_second);
        let avg_bytes_per_sec = avg_of(|p| p.scanner_metrics.bytes_per_second);
        let avg_cpu_usage = avg_of(|p| p.resource_metrics.cpu_usage_percent);
        let avg_memory_usage = avg_of(|p| p.resource_metrics.memory_usage_percent);

        MetricsSummary {
            uptime,
            total_data_points,
            current_scanner_metrics: current.scanner_metrics.clone(),
            current_resource_metrics: current.resource_metrics.clone(),
            current_health_metrics: current.health_metrics.clone(),
            avg_objects_per_sec,
            avg_bytes_per_sec,
            avg_cpu_usage,
            avg_memory_usage,
            last_update: current.last_update,
        }
    }

    /// Clone the collector for background tasks
    fn clone_for_background(&self) -> Arc<Self> {
        Arc::new(Self {
            config: self.config.clone(),
            current_metrics: self.current_metrics.clone(),
            historical_data: self.historical_data.clone(),
            collection_start_time: self.collection_start_time,
        })
    }

    /// Main collection loop
    async fn run_collection_loop(&self) -> Result<()> {
        info!("Metrics collection loop started");

        loop {
            // Collect current metrics
            self.collect_current_metrics().await?;

            // Store historical data point
            self.store_historical_data_point().await?;

            // Clean up old data
            self.cleanup_old_data().await?;

            // Wait for next collection interval
            tokio::time::sleep(self.config.collection_interval).await;
        }
    }
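
    // The loop above runs until its task is dropped; `stop_collection` cannot
    // end it. A minimal sketch of a cancellable variant, mirroring the
    // CancellationToken pattern the scanner engine uses. The method name and
    // the token parameter are illustrative assumptions, not committed API:
    async fn run_collection_loop_cancellable(&self, cancel: tokio_util::sync::CancellationToken) -> Result<()> {
        loop {
            tokio::select! {
                // Exit promptly when the token is cancelled
                _ = cancel.cancelled() => break,
                _ = tokio::time::sleep(self.config.collection_interval) => {
                    self.collect_current_metrics().await?;
                    self.store_historical_data_point().await?;
                    self.cleanup_old_data().await?;
                }
            }
        }
        Ok(())
    }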

    /// Collect current system metrics
    async fn collect_current_metrics(&self) -> Result<()> {
        if self.config.enable_resource_tracking {
            let resource_metrics = self.collect_resource_metrics().await?;
            self.update_resource_metrics(resource_metrics).await?;
        }

        Ok(())
    }

    /// Collect resource usage metrics
    async fn collect_resource_metrics(&self) -> Result<ResourceMetrics> {
        // TODO: Implement actual resource metrics collection
        // For now, return placeholder metrics
        Ok(ResourceMetrics {
            cpu_usage_percent: 0.0,
            memory_usage_bytes: 0,
            memory_usage_percent: 0.0,
            disk_io_ops_per_sec: 0.0,
            disk_io_bytes_per_sec: 0.0,
            network_io_bytes_per_sec: 0.0,
            active_threads: 0,
            open_file_descriptors: 0,
        })
    }

    /// Store current metrics as historical data point
    async fn store_historical_data_point(&self) -> Result<()> {
        let current = self.current_metrics.read().await;
        let data_point = MetricsDataPoint {
            timestamp: SystemTime::now(),
            scanner_metrics: current.scanner_metrics.clone(),
            resource_metrics: current.resource_metrics.clone(),
            health_metrics: current.health_metrics.clone(),
        };

        let mut historical = self.historical_data.write().await;
        historical.push(data_point);

        // Limit the number of data points (front removal is O(n); a VecDeque
        // would make this O(1) if it ever shows up in profiles)
        if historical.len() > self.config.max_data_points {
            historical.remove(0);
        }

        Ok(())
    }

    /// Clean up old historical data
    async fn cleanup_old_data(&self) -> Result<()> {
        let cutoff_time = SystemTime::now() - self.config.retention_period;
        let mut historical = self.historical_data.write().await;

        historical.retain(|point| point.timestamp >= cutoff_time);

        Ok(())
    }

    /// Reset all metrics
    pub async fn reset_metrics(&self) -> Result<()> {
        let mut current = self.current_metrics.write().await;
        *current = CurrentMetrics::default();

        let mut historical = self.historical_data.write().await;
        historical.clear();

        Ok(())
    }
}

/// Summary of all metrics
#[derive(Debug, Clone)]
pub struct MetricsSummary {
    pub uptime: Duration,
    pub total_data_points: usize,
    pub current_scanner_metrics: ScannerMetrics,
    pub current_resource_metrics: ResourceMetrics,
    pub current_health_metrics: HealthMetrics,
    pub avg_objects_per_sec: f64,
    pub avg_bytes_per_sec: f64,
    pub avg_cpu_usage: f64,
    pub avg_memory_usage: f64,
    pub last_update: SystemTime,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_metrics_collector_creation() {
        let config = MetricsConfig::default();
        let collector = MetricsCollector::new(config);
        let metrics = collector.current_metrics().await;
        assert_eq!(metrics.scanner_metrics.total_objects_scanned, 0);
    }

    #[tokio::test]
    async fn test_metrics_update() {
        let config = MetricsConfig::default();
        let collector = MetricsCollector::new(config);

        let scanner_metrics = ScannerMetrics {
            objects_per_second: 100.0,
            bytes_per_second: 1024.0,
            avg_scan_time_per_object: Duration::from_millis(10),
            total_objects_scanned: 1000,
            total_bytes_scanned: 1024000,
            health_issues_detected: 5,
            success_rate: 99.5,
            current_cycle_duration: Duration::from_secs(60),
            avg_cycle_duration: Duration::from_secs(65),
            last_scan_completion: Some(SystemTime::now()),
        };

        collector.update_scanner_metrics(scanner_metrics).await.unwrap();

        let current = collector.current_metrics().await;
        assert_eq!(current.scanner_metrics.total_objects_scanned, 1000);
        assert_eq!(current.scanner_metrics.health_issues_detected, 5);
    }

    #[tokio::test]
    async fn test_health_issue_recording() {
        let config = MetricsConfig::default();
        let collector = MetricsCollector::new(config);

        let issue = HealthIssue {
            issue_type: HealthIssueType::DiskFull,
            severity: Severity::High,
            bucket: "test-bucket".to_string(),
            object: "test-object".to_string(),
            description: "Test issue".to_string(),
            metadata: None,
        };

        collector.record_health_issue(&issue).await.unwrap();

        let current = collector.current_metrics().await;
        assert_eq!(current.scanner_metrics.health_issues_detected, 1);
        assert_eq!(current.health_metrics.issues_by_severity.get(&Severity::High), Some(&1));
    }
}

419
crates/ahm/src/scanner/object_scanner.rs
Normal file
@@ -0,0 +1,419 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{
    collections::HashMap,
    path::Path,
    sync::Arc,
    time::{Duration, SystemTime},
};

use tokio::sync::RwLock;

use crate::error::Result;
use super::{HealthIssue, HealthIssueType, Severity};

/// Configuration for object scanning
#[derive(Debug, Clone)]
pub struct ObjectScannerConfig {
    /// Whether to perform checksum verification
    pub verify_checksum: bool,
    /// Whether to check replication status
    pub check_replication: bool,
    /// Whether to validate metadata consistency
    pub validate_metadata: bool,
    /// Maximum object size to scan (bytes)
    pub max_object_size: u64,
    /// Minimum object size (bytes)
    pub min_object_size: u64,
    /// Timeout for individual object scans
    pub scan_timeout: Duration,
    /// Whether to enable deep scanning (bitrot detection)
    pub enable_deep_scan: bool,
}

impl Default for ObjectScannerConfig {
    fn default() -> Self {
        Self {
            verify_checksum: true,
            check_replication: true,
            validate_metadata: true,
            max_object_size: 1024 * 1024 * 1024 * 1024, // 1TB
            min_object_size: 0,
            scan_timeout: Duration::from_secs(30),
            enable_deep_scan: false,
        }
    }
}

/// Result of scanning a single object
#[derive(Debug, Clone)]
pub struct ObjectScanResult {
    /// Object identifier
    pub bucket: String,
    pub object: String,
    pub version_id: Option<String>,
    /// Scan success status
    pub success: bool,
    /// Object metadata discovered
    pub metadata: Option<ObjectMetadata>,
    /// Health issues detected
    pub health_issues: Vec<HealthIssue>,
    /// Time taken to scan this object
    pub scan_duration: Duration,
    /// Error message if scan failed
    pub error_message: Option<String>,
}

/// Object metadata
#[derive(Debug, Clone)]
pub struct ObjectMetadata {
    pub size: u64,
    pub modified_time: SystemTime,
    pub content_type: String,
    pub etag: String,
    pub checksum: Option<String>,
    pub replication_status: Option<String>,
    pub encryption_status: Option<String>,
    pub custom_metadata: HashMap<String, String>,
}

/// Object scanner for individual object health checking
pub struct ObjectScanner {
    config: ObjectScannerConfig,
    statistics: Arc<RwLock<ObjectScannerStatistics>>,
}

/// Statistics for object scanning
#[derive(Debug, Clone, Default)]
pub struct ObjectScannerStatistics {
    pub objects_scanned: u64,
    pub objects_with_issues: u64,
    pub total_issues_found: u64,
    pub total_scan_time: Duration,
    pub average_scan_time: Duration,
    pub checksum_verifications: u64,
    pub checksum_failures: u64,
    pub replication_checks: u64,
    pub replication_failures: u64,
}

impl ObjectScanner {
    /// Create a new object scanner
    pub fn new(config: ObjectScannerConfig) -> Self {
        Self {
            config,
            statistics: Arc::new(RwLock::new(ObjectScannerStatistics::default())),
        }
    }

    /// Scan a single object for health issues
    pub async fn scan_object(&self, bucket: &str, object: &str, version_id: Option<&str>, path: &Path) -> Result<ObjectScanResult> {
        let scan_start = std::time::Instant::now();
        let mut health_issues = Vec::new();
        let mut error_message = None;

        // Check if file exists
        if !path.exists() {
            return Ok(ObjectScanResult {
                bucket: bucket.to_string(),
                object: object.to_string(),
                version_id: version_id.map(|v| v.to_string()),
                success: false,
                metadata: None,
                health_issues: vec![HealthIssue {
                    issue_type: HealthIssueType::MissingReplica,
                    severity: Severity::Critical,
                    bucket: bucket.to_string(),
                    object: object.to_string(),
                    description: "Object file does not exist".to_string(),
                    metadata: None,
                }],
                scan_duration: scan_start.elapsed(),
                error_message: Some("Object file not found".to_string()),
            });
        }

        // Get file metadata
        let metadata = match std::fs::metadata(path) {
            Ok(metadata) => metadata,
            Err(e) => {
                error_message = Some(format!("Failed to read file metadata: {}", e));
                health_issues.push(HealthIssue {
                    issue_type: HealthIssueType::DiskReadError,
                    severity: Severity::High,
                    bucket: bucket.to_string(),
                    object: object.to_string(),
                    description: "Failed to read file metadata".to_string(),
                    metadata: None,
                });
                return Ok(ObjectScanResult {
                    bucket: bucket.to_string(),
                    object: object.to_string(),
                    version_id: version_id.map(|v| v.to_string()),
                    success: false,
                    metadata: None,
                    health_issues,
                    scan_duration: scan_start.elapsed(),
                    error_message,
                });
            }
        };

        // Check file size
        let file_size = metadata.len();
        if file_size < self.config.min_object_size {
            health_issues.push(HealthIssue {
                issue_type: HealthIssueType::ObjectTooSmall,
                severity: Severity::Low,
                bucket: bucket.to_string(),
                object: object.to_string(),
                description: format!("Object size {} is below minimum {}", file_size, self.config.min_object_size),
                metadata: None,
            });
        }

        if file_size > self.config.max_object_size {
            health_issues.push(HealthIssue {
                issue_type: HealthIssueType::ObjectTooLarge,
                severity: Severity::Medium,
                bucket: bucket.to_string(),
                object: object.to_string(),
                description: format!("Object size {} exceeds maximum {}", file_size, self.config.max_object_size),
                metadata: None,
            });
        }

        // Verify checksum if enabled
        let checksum = if self.config.verify_checksum {
            match self.verify_checksum(path).await {
                Ok(cs) => {
                    self.update_statistics(|stats| stats.checksum_verifications += 1).await;
                    Some(cs)
                }
                Err(_e) => {
                    self.update_statistics(|stats| stats.checksum_failures += 1).await;
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::ChecksumMismatch,
                        severity: Severity::High,
                        bucket: bucket.to_string(),
                        object: object.to_string(),
                        description: "Checksum verification failed".to_string(),
                        metadata: None,
                    });
                    None
                }
            }
        } else {
            None
        };

        // Check replication status if enabled
        let replication_status = if self.config.check_replication {
            match self.check_replication_status(bucket, object).await {
                Ok(status) => {
                    self.update_statistics(|stats| stats.replication_checks += 1).await;
                    Some(status)
                }
                Err(_e) => {
                    self.update_statistics(|stats| stats.replication_failures += 1).await;
                    health_issues.push(HealthIssue {
                        issue_type: HealthIssueType::MissingReplica,
                        severity: Severity::High,
                        bucket: bucket.to_string(),
                        object: object.to_string(),
                        description: "Replication status check failed".to_string(),
                        metadata: None,
                    });
                    None
                }
            }
        } else {
            None
        };

        // Validate metadata if enabled
        if self.config.validate_metadata {
            if let Some(issue) = self.validate_metadata(bucket, object, &metadata).await? {
                health_issues.push(issue);
            }
        }

        // Create object metadata
        let object_metadata = ObjectMetadata {
            size: file_size,
            modified_time: metadata.modified().unwrap_or_else(|_| SystemTime::now()),
            content_type: self.detect_content_type(path),
            etag: self.calculate_etag(path).await?,
            checksum,
            replication_status,
            encryption_status: None, // TODO: Implement encryption status check
            custom_metadata: HashMap::new(), // TODO: Extract custom metadata
        };

        let scan_duration = scan_start.elapsed();
        let success = health_issues.is_empty();

        // Update statistics
        self.update_statistics(|stats| {
            stats.objects_scanned += 1;
            if !health_issues.is_empty() {
                stats.objects_with_issues += 1;
                stats.total_issues_found += health_issues.len() as u64;
            }
            stats.total_scan_time += scan_duration;
            stats.average_scan_time = Duration::from_millis(
                stats.total_scan_time.as_millis() as u64 / stats.objects_scanned.max(1)
            );
        }).await;

        Ok(ObjectScanResult {
            bucket: bucket.to_string(),
            object: object.to_string(),
            version_id: version_id.map(|v| v.to_string()),
            success,
            metadata: Some(object_metadata),
            health_issues,
            scan_duration,
            error_message,
        })
    }

    /// Verify object checksum
    async fn verify_checksum(&self, _path: &Path) -> Result<String> {
        // TODO: Implement actual checksum verification
        // For now, return a placeholder checksum
        Ok("placeholder_checksum".to_string())
    }
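
    // A minimal sketch of real verification: stream the file through SHA-256
    // and compare against a stored digest. Assumes `sha2` and `hex` crate
    // dependencies, which this commit does not declare; `hash_file_sha256` is
    // a hypothetical helper name:
    fn hash_file_sha256(path: &Path) -> std::io::Result<String> {
        use sha2::{Digest, Sha256};
        use std::io::Read;

        let mut file = std::fs::File::open(path)?;
        let mut hasher = Sha256::new();
        let mut buf = [0u8; 64 * 1024];
        loop {
            let n = file.read(&mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
        }
        Ok(hex::encode(hasher.finalize()))
    }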

    /// Check object replication status
    async fn check_replication_status(&self, _bucket: &str, _object: &str) -> Result<String> {
        // TODO: Implement actual replication status checking
        // For now, return a placeholder status
        Ok("replicated".to_string())
    }

    /// Validate object metadata
    async fn validate_metadata(&self, _bucket: &str, _object: &str, _metadata: &std::fs::Metadata) -> Result<Option<HealthIssue>> {
        // TODO: Implement actual metadata validation
        // For now, return None (no issues)
        Ok(None)
    }

    /// Detect content type from file extension
    fn detect_content_type(&self, path: &Path) -> String {
        if let Some(extension) = path.extension() {
            match extension.to_str().unwrap_or("").to_lowercase().as_str() {
                "txt" => "text/plain",
                "json" => "application/json",
                "xml" => "application/xml",
                "html" | "htm" => "text/html",
                "css" => "text/css",
                "js" => "application/javascript",
                "png" => "image/png",
                "jpg" | "jpeg" => "image/jpeg",
                "gif" => "image/gif",
                "pdf" => "application/pdf",
                "zip" => "application/zip",
                "tar" => "application/x-tar",
                "gz" => "application/gzip",
                _ => "application/octet-stream",
            }.to_string()
        } else {
            "application/octet-stream".to_string()
        }
    }
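
    // If the extension table grows unwieldy, the same lookup is available from
    // the `mime_guess` crate (an added dependency this commit does not declare):
    //
    //     fn detect_content_type(path: &Path) -> String {
    //         mime_guess::from_path(path).first_or_octet_stream().essence_str().to_string()
    //     }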

    /// Calculate object ETag
    async fn calculate_etag(&self, _path: &Path) -> Result<String> {
        // TODO: Implement actual ETag calculation
        // For now, return a placeholder ETag
        Ok("placeholder_etag".to_string())
    }
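
    // For S3 compatibility, the ETag of a single-part object is the hex MD5 of
    // its bytes (multipart uploads use a different scheme). A minimal sketch,
    // assuming `md-5` and `hex` crate dependencies that this commit does not
    // declare; `md5_etag` is a hypothetical helper name:
    fn md5_etag(path: &Path) -> std::io::Result<String> {
        use md5::{Digest, Md5};
        use std::io::Read;

        let mut file = std::fs::File::open(path)?;
        let mut hasher = Md5::new();
        let mut buf = [0u8; 64 * 1024];
        loop {
            let n = file.read(&mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
        }
        Ok(hex::encode(hasher.finalize()))
    }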

    /// Update scanner statistics
    async fn update_statistics<F>(&self, update_fn: F)
    where
        F: FnOnce(&mut ObjectScannerStatistics),
    {
        let mut stats = self.statistics.write().await;
        update_fn(&mut stats);
    }

    /// Get current statistics
    pub async fn statistics(&self) -> ObjectScannerStatistics {
        self.statistics.read().await.clone()
    }

    /// Reset statistics
    pub async fn reset_statistics(&self) {
        let mut stats = self.statistics.write().await;
        *stats = ObjectScannerStatistics::default();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::TempDir;

    #[tokio::test]
    async fn test_object_scanner_creation() {
        let config = ObjectScannerConfig::default();
        let scanner = ObjectScanner::new(config);
        assert_eq!(scanner.statistics().await.objects_scanned, 0);
    }

    #[tokio::test]
    async fn test_content_type_detection() {
        let config = ObjectScannerConfig::default();
        let scanner = ObjectScanner::new(config);

        let path = Path::new("test.txt");
        assert_eq!(scanner.detect_content_type(path), "text/plain");

        let path = Path::new("test.json");
        assert_eq!(scanner.detect_content_type(path), "application/json");

        let path = Path::new("test.unknown");
        assert_eq!(scanner.detect_content_type(path), "application/octet-stream");
    }

    #[tokio::test]
    async fn test_object_scanning() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");

        // Create a test file
        let mut file = File::create(&test_file).unwrap();
        writeln!(file, "test content").unwrap();

        let config = ObjectScannerConfig::default();
        let scanner = ObjectScanner::new(config);

        let result = scanner.scan_object("test-bucket", "test.txt", None, &test_file).await.unwrap();

        assert!(result.success);
        assert_eq!(result.bucket, "test-bucket");
        assert_eq!(result.object, "test.txt");
        assert!(result.metadata.is_some());

        let metadata = result.metadata.unwrap();
        assert!(metadata.size > 0);
        assert_eq!(metadata.content_type, "text/plain");
    }
}