From 60635aeb65c99ae4fd04958f011f2bf914caeb3b Mon Sep 17 00:00:00 2001 From: houseme Date: Thu, 15 May 2025 23:34:36 +0800 Subject: [PATCH] add metrics --- Cargo.lock | 202 ++++++++++- Cargo.toml | 9 +- cli/rustfs-gui/Cargo.toml | 1 + crates/obs/src/metrics/audit.rs | 4 +- crates/obs/src/metrics/bucket.rs | 3 +- crates/obs/src/metrics/bucket_replication.rs | 2 +- crates/obs/src/metrics/cluster_erasure_set.rs | 96 +++++ crates/obs/src/metrics/cluster_health.rs | 28 ++ crates/obs/src/metrics/cluster_iam.rs | 84 +++++ .../obs/src/metrics/cluster_notification.rs | 36 ++ crates/obs/src/metrics/cluster_usage.rs | 131 +++++++ crates/obs/src/metrics/entry/descriptor.rs | 1 + crates/obs/src/metrics/entry/metric_name.rs | 329 +++++++++++++++++- crates/obs/src/metrics/entry/metric_type.rs | 1 + crates/obs/src/metrics/entry/namespace.rs | 1 + crates/obs/src/metrics/entry/path_utils.rs | 3 +- crates/obs/src/metrics/entry/subsystem.rs | 48 +-- crates/obs/src/metrics/ilm.rs | 44 +++ crates/obs/src/metrics/logger_webhook.rs | 35 ++ crates/obs/src/metrics/mod.rs | 14 + crates/obs/src/metrics/replication.rs | 108 ++++++ crates/obs/src/metrics/scanner.rs | 52 +++ crates/obs/src/metrics/system_cpu.rs | 68 ++++ crates/obs/src/metrics/system_drive.rs | 192 ++++++++++ crates/obs/src/metrics/system_memory.rs | 68 ++++ crates/obs/src/metrics/system_network.rs | 44 +++ crates/obs/src/metrics/system_process.rs | 140 ++++++++ 27 files changed, 1702 insertions(+), 42 deletions(-) create mode 100644 crates/obs/src/metrics/cluster_erasure_set.rs create mode 100644 crates/obs/src/metrics/cluster_health.rs create mode 100644 crates/obs/src/metrics/cluster_iam.rs create mode 100644 crates/obs/src/metrics/cluster_notification.rs create mode 100644 crates/obs/src/metrics/cluster_usage.rs create mode 100644 crates/obs/src/metrics/ilm.rs create mode 100644 crates/obs/src/metrics/logger_webhook.rs create mode 100644 crates/obs/src/metrics/replication.rs create mode 100644 
crates/obs/src/metrics/scanner.rs create mode 100644 crates/obs/src/metrics/system_cpu.rs create mode 100644 crates/obs/src/metrics/system_drive.rs create mode 100644 crates/obs/src/metrics/system_memory.rs create mode 100644 crates/obs/src/metrics/system_network.rs create mode 100644 crates/obs/src/metrics/system_process.rs diff --git a/Cargo.lock b/Cargo.lock index 9a073fe0..48c3c13c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -835,6 +835,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "base62" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10e52a7bcb1d6beebee21fb5053af9e3cbb7a7ed1a4909e534040e676437ab1f" +dependencies = [ + "rustversion", +] + [[package]] name = "base64" version = "0.21.7" @@ -1012,6 +1021,16 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.17.0" @@ -1291,9 +1310,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.37" +version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" dependencies = [ "clap_builder", "clap_derive", @@ -1301,9 +1320,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.37" +version = "4.5.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" dependencies = [ "anstream", "anstyle", @@ -1722,6 +1741,25 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -3795,6 +3833,30 @@ dependencies = [ "x11-dl", ] +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "globwalk" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93e3af942408868f6934a7b85134a3230832b9977cf66125df2f9edcfce4ddcc" +dependencies = [ + "bitflags 1.3.2", + "ignore", + "walkdir", +] + [[package]] name = "gloo-net" version = "0.6.0" @@ -4389,6 +4451,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.9", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -4471,6 +4549,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -5284,6 +5371,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "normpath" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8911957c4b1549ac0dc74e30db9c8b0e66ddcd6d7acc33098f4c63a64a6d7ed" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "ntapi" version = "0.4.1" @@ -7219,6 +7315,60 @@ dependencies = [ "walkdir", ] +[[package]] +name = "rust-i18n" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b6307cde881492032919adf26e254981604a6657b339ae23cce8358e9ee203" +dependencies = [ + "globwalk", + "once_cell", + "regex", + "rust-i18n-macro", + "rust-i18n-support", + "smallvec", +] + +[[package]] +name = "rust-i18n-macro" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c0dc724669fe2ddbbec5ed9daea8147a9030de87ebb46fdc7bb9315701d9912" +dependencies = [ + "glob", + "once_cell", + "proc-macro2", + "quote", + "rust-i18n-support", + "serde", + "serde_json", + "serde_yaml", + "syn 2.0.100", +] + +[[package]] +name = "rust-i18n-support" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b47501de04282525d0192c4b4133f9e3574e1fab3542ddc7bb109ff773dc108b" +dependencies = [ + "arc-swap", + "base62", + "globwalk", + "itertools 0.11.0", + "lazy_static", + "normpath", + "once_cell", + "proc-macro2", + "regex", + "serde", + "serde_json", + "serde_yaml", + "siphasher 1.0.1", + "toml", + "triomphe", +] + [[package]] name = "rust-ini" version = "0.21.1" @@ -7375,6 +7525,7 @@ dependencies = [ "lazy_static", "rfd 0.15.3", "rust-embed", + "rust-i18n", "serde", "serde_json", "sha2 0.10.9", @@ -7508,11 +7659,12 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.11.0" +version = "1.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" dependencies = [ "web-time", + "zeroize", ] [[package]] @@ -7841,6 +7993,19 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.9.0", + "itoa 1.0.15", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "server_fn" version = "0.6.15" @@ -8455,9 +8620,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.19.1" +version = "3.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", "getrandom 0.3.2", @@ -8923,9 +9088,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.2" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" +checksum = "0fdb0c213ca27a9f57ab69ddb290fd80d970922355b83ae380b395d3986b8a2e" dependencies = [ "async-compression", "bitflags 2.9.0", @@ -9116,6 +9281,17 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343e926fc669bc8cde4fa3129ab681c63671bae288b1f1081ceee6d9d37904fc" +[[package]] +name = "triomphe" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8f7726da4807b58ea5c96fdc122f80702030edc33b35aff9190a51148ccc85" +dependencies = [ + "arc-swap", + "serde", + "stable_deref_trait", +] + [[package]] name = "try-lock" 
version = "0.2.5" @@ -9239,6 +9415,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index 809c13a7..22e2b9e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,7 +67,7 @@ blake2 = "0.10.6" bytes = "1.10.1" bytesize = "2.0.1" chrono = { version = "0.4.41", features = ["serde"] } -clap = { version = "4.5.37", features = ["derive", "env"] } +clap = { version = "4.5.38", features = ["derive", "env"] } config = "0.15.11" const-str = { version = "0.6.2", features = ["std", "proc"] } datafusion = "46.0.1" @@ -147,8 +147,9 @@ rmp = "0.8.14" rmp-serde = "1.3.0" rumqttc = { version = "0.24" } rust-embed = { version = "8.7.1" } +rust-i18n = { version = "3.1.4" } rustls = { version = "0.23.27" } -rustls-pki-types = "1.11.0" +rustls-pki-types = "1.12.0" rustls-pemfile = "2.2.0" s3s = { git = "https://github.com/Nugine/s3s.git", rev = "4733cdfb27b2713e832967232cbff413bb768c10" } s3s-policy = { git = "https://github.com/Nugine/s3s.git", rev = "4733cdfb27b2713e832967232cbff413bb768c10" } @@ -163,7 +164,7 @@ snafu = "0.8.5" socket2 = "0.5.9" strum = { version = "0.27.1", features = ["derive"] } sysinfo = "0.34.2" -tempfile = "3.19.1" +tempfile = "3.20.0" test-case = "3.3.1" thiserror = "2.0.12" time = { version = "0.3.41", features = [ @@ -181,7 +182,7 @@ tokio-util = { version = "0.7.15", features = ["io", "compat"] } tonic = { version = "0.13.1", features = ["gzip"] } tonic-build = { version = "0.13.1" } tower = { version = "0.5.2", features = ["timeout"] } -tower-http = { version = "0.6.2", features = ["cors"] } +tower-http = { version = "0.6.4", features = ["cors"] } tracing = "0.1.41" tracing-core = "0.1.33" tracing-error = "0.2.1" diff --git a/cli/rustfs-gui/Cargo.toml b/cli/rustfs-gui/Cargo.toml index 
faca71ee..218c8d10 100644 --- a/cli/rustfs-gui/Cargo.toml +++ b/cli/rustfs-gui/Cargo.toml @@ -15,6 +15,7 @@ keyring = { workspace = true } lazy_static = { workspace = true } rfd = { workspace = true } rust-embed = { workspace = true, features = ["interpolate-folder-path"] } +rust-i18n = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sha2 = { workspace = true } diff --git a/crates/obs/src/metrics/audit.rs b/crates/obs/src/metrics/audit.rs index a92c9802..b365607c 100644 --- a/crates/obs/src/metrics/audit.rs +++ b/crates/obs/src/metrics/audit.rs @@ -1,8 +1,10 @@ +/// audit related metric descriptors +/// +/// This module contains the metric descriptors for the audit subsystem. use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; const TARGET_ID: &str = "target_id"; -/// audit related metric descriptors lazy_static::lazy_static! { pub static ref AUDIT_FAILED_MESSAGES_MD: MetricDescriptor = new_counter_md( diff --git a/crates/obs/src/metrics/bucket.rs b/crates/obs/src/metrics/bucket.rs index fda40347..091f2ae5 100644 --- a/crates/obs/src/metrics/bucket.rs +++ b/crates/obs/src/metrics/bucket.rs @@ -1,6 +1,7 @@ +/// bucket level s3 metric descriptor use crate::metrics::{new_counter_md, new_gauge_md, new_histogram_md, subsystems, MetricDescriptor, MetricName}; -/// Bucket 级别 S3 指标描述符 + lazy_static::lazy_static! 
{ pub static ref BUCKET_API_TRAFFIC_SENT_BYTES_MD: MetricDescriptor = new_counter_md( diff --git a/crates/obs/src/metrics/bucket_replication.rs b/crates/obs/src/metrics/bucket_replication.rs index acfbd2dd..da45ba27 100644 --- a/crates/obs/src/metrics/bucket_replication.rs +++ b/crates/obs/src/metrics/bucket_replication.rs @@ -1,3 +1,4 @@ +/// Bucket copy metric descriptor use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; // Label constants @@ -6,7 +7,6 @@ pub const OPERATION_L: &str = "operation"; pub const TARGET_ARN_L: &str = "targetArn"; pub const RANGE_L: &str = "range"; -/// Bucket copy metric descriptor lazy_static::lazy_static! { pub static ref BUCKET_REPL_LAST_HR_FAILED_BYTES_MD: MetricDescriptor = new_gauge_md( diff --git a/crates/obs/src/metrics/cluster_erasure_set.rs b/crates/obs/src/metrics/cluster_erasure_set.rs new file mode 100644 index 00000000..bc6050d1 --- /dev/null +++ b/crates/obs/src/metrics/cluster_erasure_set.rs @@ -0,0 +1,96 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +// 定义标签常量 +pub const POOL_ID_L: &str = "pool_id"; +pub const SET_ID_L: &str = "set_id"; + +/// 纠删码集合相关指标描述符 +lazy_static::lazy_static! 
{ + pub static ref ERASURE_SET_OVERALL_WRITE_QUORUM_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetOverallWriteQuorum, + "Overall write quorum across pools and sets", + &[], // 无标签 + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_OVERALL_HEALTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetOverallHealth, + "Overall health across pools and sets (1=healthy, 0=unhealthy)", + &[], // 无标签 + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_READ_QUORUM_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetReadQuorum, + "Read quorum for the erasure set in a pool", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_WRITE_QUORUM_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetWriteQuorum, + "Write quorum for the erasure set in a pool", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_ONLINE_DRIVES_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetOnlineDrivesCount, + "Count of online drives in the erasure set in a pool", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_HEALING_DRIVES_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetHealingDrivesCount, + "Count of healing drives in the erasure set in a pool", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_HEALTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetHealth, + "Health of the erasure set in a pool (1=healthy, 0=unhealthy)", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_READ_TOLERANCE_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetReadTolerance, + "No of drive failures that can be tolerated without disrupting read operations", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref 
ERASURE_SET_WRITE_TOLERANCE_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetWriteTolerance, + "No of drive failures that can be tolerated without disrupting write operations", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_READ_HEALTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetReadHealth, + "Health of the erasure set in a pool for read operations (1=healthy, 0=unhealthy)", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); + + pub static ref ERASURE_SET_WRITE_HEALTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::ErasureSetWriteHealth, + "Health of the erasure set in a pool for write operations (1=healthy, 0=unhealthy)", + &[POOL_ID_L, SET_ID_L], + subsystems::CLUSTER_ERASURE_SET + ); +} diff --git a/crates/obs/src/metrics/cluster_health.rs b/crates/obs/src/metrics/cluster_health.rs new file mode 100644 index 00000000..ebaba255 --- /dev/null +++ b/crates/obs/src/metrics/cluster_health.rs @@ -0,0 +1,28 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// 集群健康相关指标描述符 +lazy_static::lazy_static! 
{ + pub static ref HEALTH_DRIVES_OFFLINE_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::HealthDrivesOfflineCount, + "Count of offline drives in the cluster", + &[], // 无标签 + subsystems::CLUSTER_HEALTH + ); + + pub static ref HEALTH_DRIVES_ONLINE_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::HealthDrivesOnlineCount, + "Count of online drives in the cluster", + &[], // 无标签 + subsystems::CLUSTER_HEALTH + ); + + pub static ref HEALTH_DRIVES_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::HealthDrivesCount, + "Count of all drives in the cluster", + &[], // 无标签 + subsystems::CLUSTER_HEALTH + ); +} diff --git a/crates/obs/src/metrics/cluster_iam.rs b/crates/obs/src/metrics/cluster_iam.rs new file mode 100644 index 00000000..aa02d36d --- /dev/null +++ b/crates/obs/src/metrics/cluster_iam.rs @@ -0,0 +1,84 @@ +use crate::metrics::{new_counter_md, subsystems, MetricDescriptor, MetricName}; + +/// IAM 相关指标描述符 +lazy_static::lazy_static! { + pub static ref LAST_SYNC_DURATION_MILLIS_MD: MetricDescriptor = + new_counter_md( + MetricName::LastSyncDurationMillis, + "Last successful IAM data sync duration in milliseconds", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_FAILED_REQUESTS_MINUTE_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceFailedRequestsMinute, + "When plugin authentication is configured, returns failed requests count in the last full minute", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_LAST_FAIL_SECONDS_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceLastFailSeconds, + "When plugin authentication is configured, returns time (in seconds) since the last failed request to the service", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_LAST_SUCC_SECONDS_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceLastSuccSeconds, + "When plugin authentication is 
configured, returns time (in seconds) since the last successful request to the service", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_SUCC_AVG_RTT_MS_MINUTE_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceSuccAvgRttMsMinute, + "When plugin authentication is configured, returns average round-trip-time of successful requests in the last full minute", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_SUCC_MAX_RTT_MS_MINUTE_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceSuccMaxRttMsMinute, + "When plugin authentication is configured, returns maximum round-trip-time of successful requests in the last full minute", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref PLUGIN_AUTHN_SERVICE_TOTAL_REQUESTS_MINUTE_MD: MetricDescriptor = + new_counter_md( + MetricName::PluginAuthnServiceTotalRequestsMinute, + "When plugin authentication is configured, returns total requests count in the last full minute", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref SINCE_LAST_SYNC_MILLIS_MD: MetricDescriptor = + new_counter_md( + MetricName::SinceLastSyncMillis, + "Time (in milliseconds) since last successful IAM data sync.", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref SYNC_FAILURES_MD: MetricDescriptor = + new_counter_md( + MetricName::SyncFailures, + "Number of failed IAM data syncs since server start.", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); + + pub static ref SYNC_SUCCESSES_MD: MetricDescriptor = + new_counter_md( + MetricName::SyncSuccesses, + "Number of successful IAM data syncs since server start.", + &[], // 无标签 + subsystems::CLUSTER_IAM + ); +} diff --git a/crates/obs/src/metrics/cluster_notification.rs b/crates/obs/src/metrics/cluster_notification.rs new file mode 100644 index 00000000..1428ff3d --- /dev/null +++ b/crates/obs/src/metrics/cluster_notification.rs @@ -0,0 +1,36 @@ +use 
crate::metrics::{new_counter_md, subsystems, MetricDescriptor, MetricName}; + +/// 通知相关指标描述符 +lazy_static::lazy_static! { + pub static ref NOTIFICATION_CURRENT_SEND_IN_PROGRESS_MD: MetricDescriptor = + new_counter_md( + MetricName::NotificationCurrentSendInProgress, + "Number of concurrent async Send calls active to all targets", + &[], // 无标签 + subsystems::NOTIFICATION + ); + + pub static ref NOTIFICATION_EVENTS_ERRORS_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::NotificationEventsErrorsTotal, + "Events that were failed to be sent to the targets", + &[], // 无标签 + subsystems::NOTIFICATION + ); + + pub static ref NOTIFICATION_EVENTS_SENT_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::NotificationEventsSentTotal, + "Total number of events sent to the targets", + &[], // 无标签 + subsystems::NOTIFICATION + ); + + pub static ref NOTIFICATION_EVENTS_SKIPPED_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::NotificationEventsSkippedTotal, + "Events that were skipped to be sent to the targets due to the in-memory queue being full", + &[], // 无标签 + subsystems::NOTIFICATION + ); +} diff --git a/crates/obs/src/metrics/cluster_usage.rs b/crates/obs/src/metrics/cluster_usage.rs new file mode 100644 index 00000000..eadb81a9 --- /dev/null +++ b/crates/obs/src/metrics/cluster_usage.rs @@ -0,0 +1,131 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// 集群对象使用情况相关指标描述符 +lazy_static::lazy_static! 
{ + pub static ref USAGE_SINCE_LAST_UPDATE_SECONDS_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageSinceLastUpdateSeconds, + "Time since last update of usage metrics in seconds", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_TOTAL_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageTotalBytes, + "Total cluster usage in bytes", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_OBJECTS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageObjectsCount, + "Total cluster objects count", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_VERSIONS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageVersionsCount, + "Total cluster object versions (including delete markers) count", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_DELETE_MARKERS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageDeleteMarkersCount, + "Total cluster delete markers count", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_BUCKETS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketsCount, + "Total cluster buckets count", + &[], // 无标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_OBJECTS_DISTRIBUTION_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageSizeDistribution, + "Cluster object size distribution", + &["range"], // 标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); + + pub static ref USAGE_VERSIONS_DISTRIBUTION_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageVersionCountDistribution, + "Cluster object version count distribution", + &["range"], // 标签 + subsystems::CLUSTER_USAGE_OBJECTS + ); +} + +/// 定义常量 +pub const BUCKET_LABEL: &str = "bucket"; +pub const RANGE_LABEL: &str = "range"; + +/// 桶使用情况相关指标描述符 +lazy_static::lazy_static! 
{ + pub static ref USAGE_BUCKET_TOTAL_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketTotalBytes, + "Total bucket size in bytes", + &[BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_OBJECTS_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketObjectsCount, + "Total objects count in bucket", + &[BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_VERSIONS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketVersionsCount, + "Total object versions (including delete markers) count in bucket", + &[BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_DELETE_MARKERS_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketDeleteMarkersCount, + "Total delete markers count in bucket", + &[BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_QUOTA_TOTAL_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketQuotaTotalBytes, + "Total bucket quota in bytes", + &[BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_OBJECT_SIZE_DISTRIBUTION_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketObjectSizeDistribution, + "Bucket object size distribution", + &[RANGE_LABEL, BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); + + pub static ref USAGE_BUCKET_OBJECT_VERSION_COUNT_DISTRIBUTION_MD: MetricDescriptor = + new_gauge_md( + MetricName::UsageBucketObjectVersionCountDistribution, + "Bucket object version count distribution", + &[RANGE_LABEL, BUCKET_LABEL], + subsystems::CLUSTER_USAGE_BUCKETS + ); +} diff --git a/crates/obs/src/metrics/entry/descriptor.rs b/crates/obs/src/metrics/entry/descriptor.rs index a5232c87..e318dc43 100644 --- a/crates/obs/src/metrics/entry/descriptor.rs +++ b/crates/obs/src/metrics/entry/descriptor.rs @@ -38,6 +38,7 @@ impl MetricDescriptor { } /// 获取完整的指标名称,包含前缀和格式化路径 + 
#[allow(dead_code)] pub fn get_full_metric_name(&self) -> String { let prefix = self.metric_type.to_prom(); let namespace = self.namespace.as_str(); diff --git a/crates/obs/src/metrics/entry/metric_name.rs b/crates/obs/src/metrics/entry/metric_name.rs index 62b9f19f..4269099e 100644 --- a/crates/obs/src/metrics/entry/metric_name.rs +++ b/crates/obs/src/metrics/entry/metric_name.rs @@ -168,11 +168,175 @@ pub enum MetricName { ConfigRRSParity, ConfigStandardParity, + // 纠删码集合相关指标 + ErasureSetOverallWriteQuorum, + ErasureSetOverallHealth, + ErasureSetReadQuorum, + ErasureSetWriteQuorum, + ErasureSetOnlineDrivesCount, + ErasureSetHealingDrivesCount, + ErasureSetHealth, + ErasureSetReadTolerance, + ErasureSetWriteTolerance, + ErasureSetReadHealth, + ErasureSetWriteHealth, + + // 集群健康相关指标 + HealthDrivesOfflineCount, + HealthDrivesOnlineCount, + HealthDrivesCount, + + // IAM 相关指标 + LastSyncDurationMillis, + PluginAuthnServiceFailedRequestsMinute, + PluginAuthnServiceLastFailSeconds, + PluginAuthnServiceLastSuccSeconds, + PluginAuthnServiceSuccAvgRttMsMinute, + PluginAuthnServiceSuccMaxRttMsMinute, + PluginAuthnServiceTotalRequestsMinute, + SinceLastSyncMillis, + SyncFailures, + SyncSuccesses, + + // 通知相关指标 + NotificationCurrentSendInProgress, + NotificationEventsErrorsTotal, + NotificationEventsSentTotal, + NotificationEventsSkippedTotal, + + // 集群对象使用情况相关指标 + UsageSinceLastUpdateSeconds, + UsageTotalBytes, + UsageObjectsCount, + UsageVersionsCount, + UsageDeleteMarkersCount, + UsageBucketsCount, + UsageSizeDistribution, + UsageVersionCountDistribution, + + // 桶使用情况相关指标 + UsageBucketQuotaTotalBytes, + UsageBucketTotalBytes, + UsageBucketObjectsCount, + UsageBucketVersionsCount, + UsageBucketDeleteMarkersCount, + UsageBucketObjectSizeDistribution, + UsageBucketObjectVersionCountDistribution, + + // ILM 相关指标 + IlmExpiryPendingTasks, + IlmTransitionActiveTasks, + IlmTransitionPendingTasks, + IlmTransitionMissedImmediateTasks, + IlmVersionsScanned, + + // Webhook 日志相关指标 + 
WebhookQueueLength, + WebhookTotalMessages, + WebhookFailedMessages, + + // 复制相关指标 + ReplicationAverageActiveWorkers, + ReplicationAverageQueuedBytes, + ReplicationAverageQueuedCount, + ReplicationAverageDataTransferRate, + ReplicationCurrentActiveWorkers, + ReplicationCurrentDataTransferRate, + ReplicationLastMinuteQueuedBytes, + ReplicationLastMinuteQueuedCount, + ReplicationMaxActiveWorkers, + ReplicationMaxQueuedBytes, + ReplicationMaxQueuedCount, + ReplicationMaxDataTransferRate, + ReplicationRecentBacklogCount, + + // 扫描器相关指标 + ScannerBucketScansFinished, + ScannerBucketScansStarted, + ScannerDirectoriesScanned, + ScannerObjectsScanned, + ScannerVersionsScanned, + ScannerLastActivitySeconds, + + // CPU 系统相关指标 + SysCPUAvgIdle, + SysCPUAvgIOWait, + SysCPULoad, + SysCPULoadPerc, + SysCPUNice, + SysCPUSteal, + SysCPUSystem, + SysCPUUser, + + // 驱动器相关指标 + DriveUsedBytes, + DriveFreeBytes, + DriveTotalBytes, + DriveUsedInodes, + DriveFreeInodes, + DriveTotalInodes, + DriveTimeoutErrorsTotal, + DriveIOErrorsTotal, + DriveAvailabilityErrorsTotal, + DriveWaitingIO, + DriveAPILatencyMicros, + DriveHealth, + + DriveOfflineCount, + DriveOnlineCount, + DriveCount, + + // iostat 相关指标 + DriveReadsPerSec, + DriveReadsKBPerSec, + DriveReadsAwait, + DriveWritesPerSec, + DriveWritesKBPerSec, + DriveWritesAwait, + DrivePercUtil, + + // 内存相关指标 + MemTotal, + MemUsed, + MemUsedPerc, + MemFree, + MemBuffers, + MemCache, + MemShared, + MemAvailable, + + // 网络相关指标 + InternodeErrorsTotal, + InternodeDialErrorsTotal, + InternodeDialAvgTimeNanos, + InternodeSentBytesTotal, + InternodeRecvBytesTotal, + + // 进程相关指标 + ProcessLocksReadTotal, + ProcessLocksWriteTotal, + ProcessCPUTotalSeconds, + ProcessGoRoutineTotal, + ProcessIORCharBytes, + ProcessIOReadBytes, + ProcessIOWCharBytes, + ProcessIOWriteBytes, + ProcessStartTimeSeconds, + ProcessUptimeSeconds, + ProcessFileDescriptorLimitTotal, + ProcessFileDescriptorOpenTotal, + ProcessSyscallReadTotal, + ProcessSyscallWriteTotal, + 
ProcessResidentMemoryBytes, + ProcessVirtualMemoryBytes, + ProcessVirtualMemoryMaxBytes, + // 自定义指标 Custom(String), } impl MetricName { + #[allow(dead_code)] pub fn as_str(&self) -> String { match self { Self::AuthTotal => "auth_total".to_string(), @@ -317,10 +481,173 @@ impl MetricName { Self::AuditTargetQueueLength => "target_queue_length".to_string(), Self::AuditTotalMessages => "total_messages".to_string(), - /// metrics related to cluster configurations + // metrics related to cluster configurations Self::ConfigRRSParity => "rrs_parity".to_string(), Self::ConfigStandardParity => "standard_parity".to_string(), + // 纠删码集合相关指标 + Self::ErasureSetOverallWriteQuorum => "overall_write_quorum".to_string(), + Self::ErasureSetOverallHealth => "overall_health".to_string(), + Self::ErasureSetReadQuorum => "read_quorum".to_string(), + Self::ErasureSetWriteQuorum => "write_quorum".to_string(), + Self::ErasureSetOnlineDrivesCount => "online_drives_count".to_string(), + Self::ErasureSetHealingDrivesCount => "healing_drives_count".to_string(), + Self::ErasureSetHealth => "health".to_string(), + Self::ErasureSetReadTolerance => "read_tolerance".to_string(), + Self::ErasureSetWriteTolerance => "write_tolerance".to_string(), + Self::ErasureSetReadHealth => "read_health".to_string(), + Self::ErasureSetWriteHealth => "write_health".to_string(), + + // 集群健康相关指标 + Self::HealthDrivesOfflineCount => "drives_offline_count".to_string(), + Self::HealthDrivesOnlineCount => "drives_online_count".to_string(), + Self::HealthDrivesCount => "drives_count".to_string(), + + // IAM 相关指标 + Self::LastSyncDurationMillis => "last_sync_duration_millis".to_string(), + Self::PluginAuthnServiceFailedRequestsMinute => "plugin_authn_service_failed_requests_minute".to_string(), + Self::PluginAuthnServiceLastFailSeconds => "plugin_authn_service_last_fail_seconds".to_string(), + Self::PluginAuthnServiceLastSuccSeconds => "plugin_authn_service_last_succ_seconds".to_string(), + 
Self::PluginAuthnServiceSuccAvgRttMsMinute => "plugin_authn_service_succ_avg_rtt_ms_minute".to_string(), + Self::PluginAuthnServiceSuccMaxRttMsMinute => "plugin_authn_service_succ_max_rtt_ms_minute".to_string(), + Self::PluginAuthnServiceTotalRequestsMinute => "plugin_authn_service_total_requests_minute".to_string(), + Self::SinceLastSyncMillis => "since_last_sync_millis".to_string(), + Self::SyncFailures => "sync_failures".to_string(), + Self::SyncSuccesses => "sync_successes".to_string(), + + // metrics related to notifications + Self::NotificationCurrentSendInProgress => "current_send_in_progress".to_string(), + Self::NotificationEventsErrorsTotal => "events_errors_total".to_string(), + Self::NotificationEventsSentTotal => "events_sent_total".to_string(), + Self::NotificationEventsSkippedTotal => "events_skipped_total".to_string(), + + // metrics related to cluster object usage + Self::UsageSinceLastUpdateSeconds => "since_last_update_seconds".to_string(), + Self::UsageTotalBytes => "total_bytes".to_string(), + Self::UsageObjectsCount => "count".to_string(), + Self::UsageVersionsCount => "versions_count".to_string(), + Self::UsageDeleteMarkersCount => "delete_markers_count".to_string(), + Self::UsageBucketsCount => "buckets_count".to_string(), + Self::UsageSizeDistribution => "size_distribution".to_string(), + Self::UsageVersionCountDistribution => "version_count_distribution".to_string(), + + // metrics related to bucket usage + Self::UsageBucketQuotaTotalBytes => "quota_total_bytes".to_string(), + Self::UsageBucketTotalBytes => "total_bytes".to_string(), + Self::UsageBucketObjectsCount => "objects_count".to_string(), + Self::UsageBucketVersionsCount => "versions_count".to_string(), + Self::UsageBucketDeleteMarkersCount => "delete_markers_count".to_string(), + Self::UsageBucketObjectSizeDistribution => "object_size_distribution".to_string(), + Self::UsageBucketObjectVersionCountDistribution => "object_version_count_distribution".to_string(), + + // metrics related to ILM + Self::IlmExpiryPendingTasks => "expiry_pending_tasks".to_string(), + 
Self::IlmTransitionActiveTasks => "transition_active_tasks".to_string(), + Self::IlmTransitionPendingTasks => "transition_pending_tasks".to_string(), + Self::IlmTransitionMissedImmediateTasks => "transition_missed_immediate_tasks".to_string(), + Self::IlmVersionsScanned => "versions_scanned".to_string(), + + // metrics related to webhook logging + Self::WebhookQueueLength => "queue_length".to_string(), + Self::WebhookTotalMessages => "total_messages".to_string(), + Self::WebhookFailedMessages => "failed_messages".to_string(), + + // metrics related to replication + Self::ReplicationAverageActiveWorkers => "average_active_workers".to_string(), + Self::ReplicationAverageQueuedBytes => "average_queued_bytes".to_string(), + Self::ReplicationAverageQueuedCount => "average_queued_count".to_string(), + Self::ReplicationAverageDataTransferRate => "average_data_transfer_rate".to_string(), + Self::ReplicationCurrentActiveWorkers => "current_active_workers".to_string(), + Self::ReplicationCurrentDataTransferRate => "current_data_transfer_rate".to_string(), + Self::ReplicationLastMinuteQueuedBytes => "last_minute_queued_bytes".to_string(), + Self::ReplicationLastMinuteQueuedCount => "last_minute_queued_count".to_string(), + Self::ReplicationMaxActiveWorkers => "max_active_workers".to_string(), + Self::ReplicationMaxQueuedBytes => "max_queued_bytes".to_string(), + Self::ReplicationMaxQueuedCount => "max_queued_count".to_string(), + Self::ReplicationMaxDataTransferRate => "max_data_transfer_rate".to_string(), + Self::ReplicationRecentBacklogCount => "recent_backlog_count".to_string(), + + // metrics related to the scanner + Self::ScannerBucketScansFinished => "bucket_scans_finished".to_string(), + Self::ScannerBucketScansStarted => "bucket_scans_started".to_string(), + Self::ScannerDirectoriesScanned => "directories_scanned".to_string(), + Self::ScannerObjectsScanned => "objects_scanned".to_string(), + Self::ScannerVersionsScanned => "versions_scanned".to_string(), + Self::ScannerLastActivitySeconds => "last_activity_seconds".to_string(), + + 
// metrics related to system CPU + Self::SysCPUAvgIdle => "avg_idle".to_string(), + Self::SysCPUAvgIOWait => "avg_iowait".to_string(), + Self::SysCPULoad => "load".to_string(), + Self::SysCPULoadPerc => "load_perc".to_string(), + Self::SysCPUNice => "nice".to_string(), + Self::SysCPUSteal => "steal".to_string(), + Self::SysCPUSystem => "system".to_string(), + Self::SysCPUUser => "user".to_string(), + + // metrics related to drives + Self::DriveUsedBytes => "used_bytes".to_string(), + Self::DriveFreeBytes => "free_bytes".to_string(), + Self::DriveTotalBytes => "total_bytes".to_string(), + Self::DriveUsedInodes => "used_inodes".to_string(), + Self::DriveFreeInodes => "free_inodes".to_string(), + Self::DriveTotalInodes => "total_inodes".to_string(), + Self::DriveTimeoutErrorsTotal => "timeout_errors_total".to_string(), + Self::DriveIOErrorsTotal => "io_errors_total".to_string(), + Self::DriveAvailabilityErrorsTotal => "availability_errors_total".to_string(), + Self::DriveWaitingIO => "waiting_io".to_string(), + Self::DriveAPILatencyMicros => "api_latency_micros".to_string(), + Self::DriveHealth => "health".to_string(), + + Self::DriveOfflineCount => "offline_count".to_string(), + Self::DriveOnlineCount => "online_count".to_string(), + Self::DriveCount => "count".to_string(), + + // metrics related to iostat + Self::DriveReadsPerSec => "reads_per_sec".to_string(), + Self::DriveReadsKBPerSec => "reads_kb_per_sec".to_string(), + Self::DriveReadsAwait => "reads_await".to_string(), + Self::DriveWritesPerSec => "writes_per_sec".to_string(), + Self::DriveWritesKBPerSec => "writes_kb_per_sec".to_string(), + Self::DriveWritesAwait => "writes_await".to_string(), + Self::DrivePercUtil => "perc_util".to_string(), + + // metrics related to memory + Self::MemTotal => "total".to_string(), + Self::MemUsed => "used".to_string(), + Self::MemUsedPerc => "used_perc".to_string(), + Self::MemFree => "free".to_string(), + Self::MemBuffers => "buffers".to_string(), + Self::MemCache => "cache".to_string(), + Self::MemShared => "shared".to_string(), + 
Self::MemAvailable => "available".to_string(), + + // metrics related to the network + Self::InternodeErrorsTotal => "errors_total".to_string(), + Self::InternodeDialErrorsTotal => "dial_errors_total".to_string(), + Self::InternodeDialAvgTimeNanos => "dial_avg_time_nanos".to_string(), + Self::InternodeSentBytesTotal => "sent_bytes_total".to_string(), + Self::InternodeRecvBytesTotal => "recv_bytes_total".to_string(), + + // metrics related to the process + Self::ProcessLocksReadTotal => "locks_read_total".to_string(), + Self::ProcessLocksWriteTotal => "locks_write_total".to_string(), + Self::ProcessCPUTotalSeconds => "cpu_total_seconds".to_string(), + Self::ProcessGoRoutineTotal => "go_routine_total".to_string(), + Self::ProcessIORCharBytes => "io_rchar_bytes".to_string(), + Self::ProcessIOReadBytes => "io_read_bytes".to_string(), + Self::ProcessIOWCharBytes => "io_wchar_bytes".to_string(), + Self::ProcessIOWriteBytes => "io_write_bytes".to_string(), + Self::ProcessStartTimeSeconds => "start_time_seconds".to_string(), + Self::ProcessUptimeSeconds => "uptime_seconds".to_string(), + Self::ProcessFileDescriptorLimitTotal => "file_descriptor_limit_total".to_string(), + Self::ProcessFileDescriptorOpenTotal => "file_descriptor_open_total".to_string(), + Self::ProcessSyscallReadTotal => "syscall_read_total".to_string(), + Self::ProcessSyscallWriteTotal => "syscall_write_total".to_string(), + Self::ProcessResidentMemoryBytes => "resident_memory_bytes".to_string(), + Self::ProcessVirtualMemoryBytes => "virtual_memory_bytes".to_string(), + Self::ProcessVirtualMemoryMaxBytes => "virtual_memory_max_bytes".to_string(), + Self::Custom(name) => name.clone(), } } diff --git a/crates/obs/src/metrics/entry/metric_type.rs b/crates/obs/src/metrics/entry/metric_type.rs index 614a8539..67634a4e 100644 --- a/crates/obs/src/metrics/entry/metric_type.rs +++ b/crates/obs/src/metrics/entry/metric_type.rs @@ -20,6 +20,7 @@ impl MetricType { /// Convert the metric type to the Prometheus value type /// In a Rust implementation, this might 
return the corresponding Prometheus Rust client type + #[allow(dead_code)] pub fn to_prom(&self) -> &'static str { match self { Self::Counter => "counter.", diff --git a/crates/obs/src/metrics/entry/namespace.rs b/crates/obs/src/metrics/entry/namespace.rs index 851ba0b8..0f4db118 100644 --- a/crates/obs/src/metrics/entry/namespace.rs +++ b/crates/obs/src/metrics/entry/namespace.rs @@ -5,6 +5,7 @@ pub enum MetricNamespace { } impl MetricNamespace { + #[allow(dead_code)] pub fn as_str(&self) -> &'static str { match self { Self::RustFS => "rustfs", diff --git a/crates/obs/src/metrics/entry/path_utils.rs b/crates/obs/src/metrics/entry/path_utils.rs index 3b83df3e..1275a826 100644 --- a/crates/obs/src/metrics/entry/path_utils.rs +++ b/crates/obs/src/metrics/entry/path_utils.rs @@ -1,7 +1,8 @@ /// Format the path to the metric name format /// Replace '/' and '-' with '_' +#[allow(dead_code)] pub fn format_path_to_metric_name(path: &str) -> String { - path.trim_start_matches('/').replace('/', "_").replace('-', "_") + path.trim_start_matches('/').replace(['/', '-'], "_") } #[cfg(test)] diff --git a/crates/obs/src/metrics/entry/subsystem.rs b/crates/obs/src/metrics/entry/subsystem.rs index 71508829..fafaf0f8 100644 --- a/crates/obs/src/metrics/entry/subsystem.rs +++ b/crates/obs/src/metrics/entry/subsystem.rs @@ -1,27 +1,27 @@ use crate::metrics::entry::path_utils::format_path_to_metric_name; /// The metrics subsystem is a subgroup of metrics within a namespace -/// 指标子系统,表示命名空间内指标的子分组 +/// The metrics subsystem, which represents a subgroup of metrics within a namespace #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum MetricSubsystem { - // API 相关子系统 + // API related subsystems ApiRequests, - // 桶相关子系统 + // bucket related subsystems BucketApi, BucketReplication, - // 系统相关子系统 + // system related subsystems SystemNetworkInternode, SystemDrive, SystemMemory, SystemCpu, SystemProcess, - // 调试相关子系统 + // debug related subsystems DebugGo, - // 集群相关子系统 + // cluster related 
subsystems ClusterHealth, ClusterUsageObjects, ClusterUsageBuckets, @@ -29,7 +29,7 @@ pub enum MetricSubsystem { ClusterIam, ClusterConfig, - // 其他服务相关子系统 + // other service related subsystems Ilm, Audit, LoggerWebhook, @@ -37,32 +37,32 @@ pub enum MetricSubsystem { Notification, Scanner, - // 自定义路径 + // Custom paths Custom(String), } impl MetricSubsystem { - /// 获取原始路径字符串 + /// Gets the original path string pub fn path(&self) -> &str { match self { - // API 相关子系统 + // api related subsystems Self::ApiRequests => "/api/requests", - // 桶相关子系统 + // bucket related subsystems Self::BucketApi => "/bucket/api", Self::BucketReplication => "/bucket/replication", - // 系统相关子系统 + // system related subsystems Self::SystemNetworkInternode => "/system/network/internode", Self::SystemDrive => "/system/drive", Self::SystemMemory => "/system/memory", Self::SystemCpu => "/system/cpu", Self::SystemProcess => "/system/process", - // 调试相关子系统 + // debug related subsystems Self::DebugGo => "/debug/go", - // 集群相关子系统 + // cluster related subsystems Self::ClusterHealth => "/cluster/health", Self::ClusterUsageObjects => "/cluster/usage/objects", Self::ClusterUsageBuckets => "/cluster/usage/buckets", @@ -70,7 +70,7 @@ impl MetricSubsystem { Self::ClusterIam => "/cluster/iam", Self::ClusterConfig => "/cluster/config", - // 其他服务相关子系统 + // other service related subsystems Self::Ilm => "/ilm", Self::Audit => "/audit", Self::LoggerWebhook => "/logger/webhook", @@ -78,17 +78,18 @@ impl MetricSubsystem { Self::Notification => "/notification", Self::Scanner => "/scanner", - // 自定义路径 + // Custom paths Self::Custom(path) => path, } } - /// 获取格式化后的指标名称格式字符串 + /// Get the formatted metric name format string + #[allow(dead_code)] pub fn as_str(&self) -> String { format_path_to_metric_name(self.path()) } - /// 从路径字符串创建子系统枚举 + /// Create a subsystem enumeration from a path string pub fn from_path(path: &str) -> Self { match path { // API 相关子系统 @@ -129,13 +130,14 @@ impl MetricSubsystem { } } - // 
便利方法,直接创建自定义子系统 + /// A convenient way to create custom subsystems directly + #[allow(dead_code)] pub fn new(path: impl Into) -> Self { Self::Custom(path.into()) } } -// 便于与字符串相互转换的实现 +/// Implementations that facilitate conversion to and from strings impl From<&str> for MetricSubsystem { fn from(s: &str) -> Self { Self::from_path(s) @@ -158,10 +160,10 @@ impl std::fmt::Display for MetricSubsystem { pub mod subsystems { use super::MetricSubsystem; - // 集群基本路径常量 + // cluster base path constant pub const CLUSTER_BASE_PATH: &str = "/cluster"; - // 快捷访问各子系统的常量 + // Quick access to constants for each subsystem pub const API_REQUESTS: MetricSubsystem = MetricSubsystem::ApiRequests; pub const BUCKET_API: MetricSubsystem = MetricSubsystem::BucketApi; pub const BUCKET_REPLICATION: MetricSubsystem = MetricSubsystem::BucketReplication; @@ -198,7 +200,7 @@ mod tests { assert_eq!(MetricSubsystem::BucketApi.as_str(), "bucket_api"); assert_eq!(MetricSubsystem::ClusterHealth.as_str(), "cluster_health"); - // 测试自定义路径 + // Test custom paths let custom = MetricSubsystem::new("/custom/path-test"); assert_eq!(custom.as_str(), "custom_path_test"); } diff --git a/crates/obs/src/metrics/ilm.rs b/crates/obs/src/metrics/ilm.rs new file mode 100644 index 00000000..0c770ba3 --- /dev/null +++ b/crates/obs/src/metrics/ilm.rs @@ -0,0 +1,44 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to ILM +lazy_static::lazy_static! 
{ + pub static ref ILM_EXPIRY_PENDING_TASKS_MD: MetricDescriptor = + new_gauge_md( + MetricName::IlmExpiryPendingTasks, + "Number of pending ILM expiry tasks in the queue", + &[], // no labels + subsystems::ILM + ); + + pub static ref ILM_TRANSITION_ACTIVE_TASKS_MD: MetricDescriptor = + new_gauge_md( + MetricName::IlmTransitionActiveTasks, + "Number of active ILM transition tasks", + &[], // no labels + subsystems::ILM + ); + + pub static ref ILM_TRANSITION_PENDING_TASKS_MD: MetricDescriptor = + new_gauge_md( + MetricName::IlmTransitionPendingTasks, + "Number of pending ILM transition tasks in the queue", + &[], // no labels + subsystems::ILM + ); + + pub static ref ILM_TRANSITION_MISSED_IMMEDIATE_TASKS_MD: MetricDescriptor = + new_counter_md( + MetricName::IlmTransitionMissedImmediateTasks, + "Number of missed immediate ILM transition tasks", + &[], // no labels + subsystems::ILM + ); + + pub static ref ILM_VERSIONS_SCANNED_MD: MetricDescriptor = + new_counter_md( + MetricName::IlmVersionsScanned, + "Total number of object versions checked for ILM actions since server start", + &[], // no labels + subsystems::ILM + ); +} diff --git a/crates/obs/src/metrics/logger_webhook.rs b/crates/obs/src/metrics/logger_webhook.rs new file mode 100644 index 00000000..2dec2cb0 --- /dev/null +++ b/crates/obs/src/metrics/logger_webhook.rs @@ -0,0 +1,35 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Label name constants +pub const NAME_LABEL: &str = "name"; +pub const ENDPOINT_LABEL: &str = "endpoint"; + +/// Metric descriptors related to webhook logging +lazy_static::lazy_static! 
{ + // Labels used by all webhook metrics + static ref ALL_WEBHOOK_LABELS: [&'static str; 2] = [NAME_LABEL, ENDPOINT_LABEL]; + + pub static ref WEBHOOK_FAILED_MESSAGES_MD: MetricDescriptor = + new_counter_md( + MetricName::WebhookFailedMessages, + "Number of messages that failed to send", + &ALL_WEBHOOK_LABELS[..], + subsystems::LOGGER_WEBHOOK + ); + + pub static ref WEBHOOK_QUEUE_LENGTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::WebhookQueueLength, + "Webhook queue length", + &ALL_WEBHOOK_LABELS[..], + subsystems::LOGGER_WEBHOOK + ); + + pub static ref WEBHOOK_TOTAL_MESSAGES_MD: MetricDescriptor = + new_counter_md( + MetricName::WebhookTotalMessages, + "Total number of messages sent to this target", + &ALL_WEBHOOK_LABELS[..], + subsystems::LOGGER_WEBHOOK + ); +} diff --git a/crates/obs/src/metrics/mod.rs b/crates/obs/src/metrics/mod.rs index ad1cba98..10bfcd07 100644 --- a/crates/obs/src/metrics/mod.rs +++ b/crates/obs/src/metrics/mod.rs @@ -1,8 +1,22 @@ mod audit; mod bucket; mod bucket_replication; +mod cluster_erasure_set; +mod cluster_health; +mod cluster_iam; +mod cluster_notification; +mod cluster_usage; mod entry; +mod ilm; +mod logger_webhook; +mod replication; mod request; +mod scanner; +mod system_cpu; +mod system_drive; +mod system_memory; +mod system_network; +mod system_process; pub use entry::descriptor::MetricDescriptor; pub use entry::metric_name::MetricName; diff --git a/crates/obs/src/metrics/replication.rs b/crates/obs/src/metrics/replication.rs new file mode 100644 index 00000000..c704d6ff --- /dev/null +++ b/crates/obs/src/metrics/replication.rs @@ -0,0 +1,108 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to replication +lazy_static::lazy_static! 
{ + pub static ref REPLICATION_AVERAGE_ACTIVE_WORKERS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationAverageActiveWorkers, + "Average number of active replication workers", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_AVERAGE_QUEUED_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationAverageQueuedBytes, + "Average number of bytes queued for replication since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_AVERAGE_QUEUED_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationAverageQueuedCount, + "Average number of objects queued for replication since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_AVERAGE_DATA_TRANSFER_RATE_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationAverageDataTransferRate, + "Average replication data transfer rate in bytes/sec", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_CURRENT_ACTIVE_WORKERS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationCurrentActiveWorkers, + "Total number of active replication workers", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_CURRENT_DATA_TRANSFER_RATE_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationCurrentDataTransferRate, + "Current replication data transfer rate in bytes/sec", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_LAST_MINUTE_QUEUED_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationLastMinuteQueuedBytes, + "Number of bytes queued for replication in the last full minute", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_LAST_MINUTE_QUEUED_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationLastMinuteQueuedCount, + "Number of objects queued for replication in the last full minute", + &[], // no labels + subsystems::REPLICATION + ); + + pub 
static ref REPLICATION_MAX_ACTIVE_WORKERS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationMaxActiveWorkers, + "Maximum number of active replication workers seen since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_MAX_QUEUED_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationMaxQueuedBytes, + "Maximum number of bytes queued for replication since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_MAX_QUEUED_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationMaxQueuedCount, + "Maximum number of objects queued for replication since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_MAX_DATA_TRANSFER_RATE_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationMaxDataTransferRate, + "Maximum replication data transfer rate in bytes/sec seen since server start", + &[], // no labels + subsystems::REPLICATION + ); + + pub static ref REPLICATION_RECENT_BACKLOG_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::ReplicationRecentBacklogCount, + "Total number of objects seen in replication backlog in the last 5 minutes", + &[], // no labels + subsystems::REPLICATION + ); +} diff --git a/crates/obs/src/metrics/scanner.rs b/crates/obs/src/metrics/scanner.rs new file mode 100644 index 00000000..ee23f11f --- /dev/null +++ b/crates/obs/src/metrics/scanner.rs @@ -0,0 +1,52 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to the scanner +lazy_static::lazy_static! 
{ + pub static ref SCANNER_BUCKET_SCANS_FINISHED_MD: MetricDescriptor = + new_counter_md( + MetricName::ScannerBucketScansFinished, + "Total number of bucket scans finished since server start", + &[], // no labels + subsystems::SCANNER + ); + + pub static ref SCANNER_BUCKET_SCANS_STARTED_MD: MetricDescriptor = + new_counter_md( + MetricName::ScannerBucketScansStarted, + "Total number of bucket scans started since server start", + &[], // no labels + subsystems::SCANNER + ); + + pub static ref SCANNER_DIRECTORIES_SCANNED_MD: MetricDescriptor = + new_counter_md( + MetricName::ScannerDirectoriesScanned, + "Total number of directories scanned since server start", + &[], // no labels + subsystems::SCANNER + ); + + pub static ref SCANNER_OBJECTS_SCANNED_MD: MetricDescriptor = + new_counter_md( + MetricName::ScannerObjectsScanned, + "Total number of unique objects scanned since server start", + &[], // no labels + subsystems::SCANNER + ); + + pub static ref SCANNER_VERSIONS_SCANNED_MD: MetricDescriptor = + new_counter_md( + MetricName::ScannerVersionsScanned, + "Total number of object versions scanned since server start", + &[], // no labels + subsystems::SCANNER + ); + + pub static ref SCANNER_LAST_ACTIVITY_SECONDS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ScannerLastActivitySeconds, + "Time elapsed (in seconds) since last scan activity.", + &[], // no labels + subsystems::SCANNER + ); +} diff --git a/crates/obs/src/metrics/system_cpu.rs b/crates/obs/src/metrics/system_cpu.rs new file mode 100644 index 00000000..101d031b --- /dev/null +++ b/crates/obs/src/metrics/system_cpu.rs @@ -0,0 +1,68 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to system CPU +lazy_static::lazy_static! 
{ + pub static ref SYS_CPU_AVG_IDLE_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUAvgIdle, + "Average CPU idle time", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_AVG_IOWAIT_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUAvgIOWait, + "Average CPU IOWait time", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_LOAD_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPULoad, + "CPU load average 1min", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_LOAD_PERC_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPULoadPerc, + "CPU load average 1min (percentage)", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_NICE_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUNice, + "CPU nice time", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_STEAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUSteal, + "CPU steal time", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_SYSTEM_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUSystem, + "CPU system time", + &[], // no labels + subsystems::SYSTEM_CPU + ); + + pub static ref SYS_CPU_USER_MD: MetricDescriptor = + new_gauge_md( + MetricName::SysCPUUser, + "CPU user time", + &[], // no labels + subsystems::SYSTEM_CPU + ); +} diff --git a/crates/obs/src/metrics/system_drive.rs b/crates/obs/src/metrics/system_drive.rs new file mode 100644 index 00000000..512a5317 --- /dev/null +++ b/crates/obs/src/metrics/system_drive.rs @@ -0,0 +1,192 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Label name constants +pub const DRIVE_LABEL: &str = "drive"; +pub const POOL_INDEX_LABEL: &str = "pool_index"; +pub const SET_INDEX_LABEL: &str = "set_index"; +pub const DRIVE_INDEX_LABEL: &str = "drive_index"; +pub const API_LABEL: &str = "api"; + +/// All drive-related labels +lazy_static::lazy_static! 
{ + static ref ALL_DRIVE_LABELS: [&'static str; 4] = [DRIVE_LABEL, POOL_INDEX_LABEL, SET_INDEX_LABEL, DRIVE_INDEX_LABEL]; +} + +/// Metric descriptors related to drives +lazy_static::lazy_static! { + pub static ref DRIVE_USED_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveUsedBytes, + "Total storage used on a drive in bytes", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_FREE_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveFreeBytes, + "Total storage free on a drive in bytes", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_TOTAL_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveTotalBytes, + "Total storage available on a drive in bytes", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_USED_INODES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveUsedInodes, + "Total used inodes on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_FREE_INODES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveFreeInodes, + "Total free inodes on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_TOTAL_INODES_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveTotalInodes, + "Total inodes available on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_TIMEOUT_ERRORS_MD: MetricDescriptor = + new_counter_md( + MetricName::DriveTimeoutErrorsTotal, + "Total timeout errors on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_IO_ERRORS_MD: MetricDescriptor = + new_counter_md( + MetricName::DriveIOErrorsTotal, + "Total I/O errors on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_AVAILABILITY_ERRORS_MD: MetricDescriptor = + new_counter_md( + MetricName::DriveAvailabilityErrorsTotal, + "Total availability errors (I/O errors, timeouts) on a 
drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_WAITING_IO_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveWaitingIO, + "Total waiting I/O operations on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_API_LATENCY_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveAPILatencyMicros, + "Average last minute latency in µs for drive API storage operations", + &[&ALL_DRIVE_LABELS[..], &[API_LABEL]].concat(), + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_HEALTH_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveHealth, + "Drive health (0 = offline, 1 = healthy, 2 = healing)", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_OFFLINE_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveOfflineCount, + "Count of offline drives", + &[], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_ONLINE_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveOnlineCount, + "Count of online drives", + &[], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_COUNT_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveCount, + "Count of all drives", + &[], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_READS_PER_SEC_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveReadsPerSec, + "Reads per second on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_READS_KB_PER_SEC_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveReadsKBPerSec, + "Kilobytes read per second on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_READS_AWAIT_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveReadsAwait, + "Average time for read requests served on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_WRITES_PER_SEC_MD: MetricDescriptor = + new_gauge_md( + 
MetricName::DriveWritesPerSec, + "Writes per second on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_WRITES_KB_PER_SEC_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveWritesKBPerSec, + "Kilobytes written per second on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_WRITES_AWAIT_MD: MetricDescriptor = + new_gauge_md( + MetricName::DriveWritesAwait, + "Average time for write requests served on a drive", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); + + pub static ref DRIVE_PERC_UTIL_MD: MetricDescriptor = + new_gauge_md( + MetricName::DrivePercUtil, + "Percentage of time the disk was busy", + &ALL_DRIVE_LABELS[..], + subsystems::SYSTEM_DRIVE + ); +} diff --git a/crates/obs/src/metrics/system_memory.rs b/crates/obs/src/metrics/system_memory.rs new file mode 100644 index 00000000..c3447ba1 --- /dev/null +++ b/crates/obs/src/metrics/system_memory.rs @@ -0,0 +1,68 @@ +use crate::metrics::{new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to memory +lazy_static::lazy_static! 
{ + pub static ref MEM_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemTotal, + "Total memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_USED_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemUsed, + "Used memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_USED_PERC_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemUsedPerc, + "Used memory percentage on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_FREE_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemFree, + "Free memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_BUFFERS_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemBuffers, + "Buffers memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_CACHE_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemCache, + "Cache memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_SHARED_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemShared, + "Shared memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); + + pub static ref MEM_AVAILABLE_MD: MetricDescriptor = + new_gauge_md( + MetricName::MemAvailable, + "Available memory on the node", + &[], // no labels + subsystems::SYSTEM_MEMORY + ); +} diff --git a/crates/obs/src/metrics/system_network.rs b/crates/obs/src/metrics/system_network.rs new file mode 100644 index 00000000..9d050760 --- /dev/null +++ b/crates/obs/src/metrics/system_network.rs @@ -0,0 +1,44 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// Metric descriptors related to the network +lazy_static::lazy_static! 
{ + pub static ref INTERNODE_ERRORS_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::InternodeErrorsTotal, + "Total number of failed internode calls", + &[], // no labels + subsystems::SYSTEM_NETWORK_INTERNODE + ); + + pub static ref INTERNODE_DIAL_ERRORS_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::InternodeDialErrorsTotal, + "Total number of internode TCP dial timeouts and errors", + &[], // no labels + subsystems::SYSTEM_NETWORK_INTERNODE + ); + + pub static ref INTERNODE_DIAL_AVG_TIME_NANOS_MD: MetricDescriptor = + new_gauge_md( + MetricName::InternodeDialAvgTimeNanos, + "Average dial time of internode TCP calls in nanoseconds", + &[], // no labels + subsystems::SYSTEM_NETWORK_INTERNODE + ); + + pub static ref INTERNODE_SENT_BYTES_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::InternodeSentBytesTotal, + "Total number of bytes sent to other peer nodes", + &[], // no labels + subsystems::SYSTEM_NETWORK_INTERNODE + ); + + pub static ref INTERNODE_RECV_BYTES_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::InternodeRecvBytesTotal, + "Total number of bytes received from other peer nodes", + &[], // no labels + subsystems::SYSTEM_NETWORK_INTERNODE + ); +} diff --git a/crates/obs/src/metrics/system_process.rs b/crates/obs/src/metrics/system_process.rs new file mode 100644 index 00000000..5534b2a1 --- /dev/null +++ b/crates/obs/src/metrics/system_process.rs @@ -0,0 +1,140 @@ +use crate::metrics::{new_counter_md, new_gauge_md, subsystems, MetricDescriptor, MetricName}; + +/// process related metric descriptors +lazy_static::lazy_static! 
{ // Process-level descriptors (locks, CPU time, I/O, file descriptors, syscalls, memory); all use subsystems::SYSTEM_PROCESS and no labels. + pub static ref PROCESS_LOCKS_READ_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessLocksReadTotal, + "Number of current READ locks on this peer", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_LOCKS_WRITE_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessLocksWriteTotal, + "Number of current WRITE locks on this peer", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_CPU_TOTAL_SECONDS_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessCPUTotalSeconds, + "Total user and system CPU time spent in seconds", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_GO_ROUTINE_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessGoRoutineTotal, + "Total number of go routines running", + &[], // no labels; NOTE(review): help text says "go routines" while sibling help strings describe a RustFS process - confirm intended wording + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_IO_RCHAR_BYTES_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessIORCharBytes, + "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_IO_READ_BYTES_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessIOReadBytes, + "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_IO_WCHAR_BYTES_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessIOWCharBytes, + "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_IO_WRITE_BYTES_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessIOWriteBytes, + "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref 
PROCESS_START_TIME_SECONDS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessStartTimeSeconds, + "Start time for RustFS process in seconds since Unix epoch", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_UPTIME_SECONDS_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessUptimeSeconds, + "Uptime for RustFS process in seconds", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_FILE_DESCRIPTOR_LIMIT_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessFileDescriptorLimitTotal, + "Limit on total number of open file descriptors for the RustFS Server process", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_FILE_DESCRIPTOR_OPEN_TOTAL_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessFileDescriptorOpenTotal, + "Total number of open file descriptors by the RustFS Server process", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_SYSCALL_READ_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessSyscallReadTotal, + "Total read SysCalls to the kernel. /proc/[pid]/io syscr", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_SYSCALL_WRITE_TOTAL_MD: MetricDescriptor = + new_counter_md( + MetricName::ProcessSyscallWriteTotal, + "Total write SysCalls to the kernel. 
/proc/[pid]/io syscw", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_RESIDENT_MEMORY_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessResidentMemoryBytes, + "Resident memory size in bytes", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_VIRTUAL_MEMORY_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessVirtualMemoryBytes, + "Virtual memory size in bytes", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); + + pub static ref PROCESS_VIRTUAL_MEMORY_MAX_BYTES_MD: MetricDescriptor = + new_gauge_md( + MetricName::ProcessVirtualMemoryMaxBytes, + "Maximum virtual memory size in bytes", + &[], // no labels + subsystems::SYSTEM_PROCESS + ); +}