feat(obs): add advanced log management configuration (#2016)

Co-authored-by: houseme <housemecn@gmail.com>
Co-authored-by: 安正超 <anzhengchao@gmail.com>
Co-authored-by: 唐小鸭 <tangtang1251@qq.com>
This commit is contained in:
heihutu
2026-03-01 03:23:48 +08:00
committed by GitHub
parent e7466eb1cc
commit 2c01b8c49d
26 changed files with 2560 additions and 1305 deletions

362
Cargo.lock generated
View File

@@ -1724,17 +1724,6 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "core_affinity"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a034b3a7b624016c6e13f5df875747cc25f884156aad2abd12b6c46797971342"
dependencies = [
"libc",
"num_cpus",
"winapi",
]
[[package]]
name = "cpp_demangle"
version = "0.5.1"
@@ -3300,17 +3289,6 @@ dependencies = [
"serde",
]
[[package]]
name = "erased-serde"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89e8918065695684b2b0702da20382d5ae6065cf3327bc2d6436bd49a71ce9f3"
dependencies = [
"serde",
"serde_core",
"typeid",
]
[[package]]
name = "errno"
version = "0.3.14"
@@ -3440,30 +3418,6 @@ dependencies = [
"zlib-rs",
]
[[package]]
name = "flexi_logger"
version = "0.31.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aea7feddba9b4e83022270d49a58d4a1b3fdad04b34f78cf1ce471f698e42672"
dependencies = [
"chrono",
"core_affinity",
"crossbeam-channel",
"crossbeam-queue",
"flate2",
"log",
"notify-debouncer-mini",
"nu-ansi-term",
"regex",
"serde",
"serde_derive",
"serde_json",
"thiserror 2.0.18",
"toml",
"tracing",
"tracing-subscriber",
]
[[package]]
name = "flume"
version = "0.11.1"
@@ -3508,15 +3462,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsevent-sys"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2"
dependencies = [
"libc",
]
[[package]]
name = "futures"
version = "0.3.32"
@@ -4440,9 +4385,9 @@ dependencies = [
[[package]]
name = "inferno"
version = "0.12.4"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d35223c50fdd26419a4ccea2c73be68bd2b29a3d7d6123ffe101c17f4c20a52a"
checksum = "20dd69640582458beceefcf045f8de34263d45194999c9a49fcd53e5b503d522"
dependencies = [
"ahash",
"clap",
@@ -4455,31 +4400,11 @@ dependencies = [
"log",
"num-format",
"once_cell",
"quick-xml 0.38.4",
"quick-xml 0.39.2",
"rgb",
"str_stack",
]
[[package]]
name = "inotify"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3"
dependencies = [
"bitflags 2.11.0",
"inotify-sys",
"libc",
]
[[package]]
name = "inotify-sys"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb"
dependencies = [
"libc",
]
[[package]]
name = "inout"
version = "0.1.4"
@@ -4702,26 +4627,6 @@ dependencies = [
"simple_asn1",
]
[[package]]
name = "kqueue"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eac30106d7dce88daf4a3fcb4879ea939476d5074a9b7ddd0fb97fa4bed5596a"
dependencies = [
"kqueue-sys",
"libc",
]
[[package]]
name = "kqueue-sys"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b"
dependencies = [
"bitflags 1.3.2",
"libc",
]
[[package]]
name = "lazy-regex"
version = "3.6.0"
@@ -4877,12 +4782,13 @@ dependencies = [
[[package]]
name = "libredox"
version = "0.1.12"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [
"bitflags 2.11.0",
"libc",
"plain",
"redox_syscall 0.7.3",
]
@@ -4979,10 +4885,6 @@ name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
dependencies = [
"serde_core",
"value-bag",
]
[[package]]
name = "lru"
@@ -5178,7 +5080,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc"
dependencies = [
"libc",
"log",
"wasi",
"windows-sys 0.61.2",
]
@@ -5303,45 +5204,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "notify"
version = "8.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d3d07927151ff8575b7087f245456e549fea62edf0ec4e565a5ee50c8402bc3"
dependencies = [
"bitflags 2.11.0",
"fsevent-sys",
"inotify",
"kqueue",
"libc",
"log",
"mio",
"notify-types",
"walkdir",
"windows-sys 0.60.2",
]
[[package]]
name = "notify-debouncer-mini"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17849edfaabd9a5fef1c606d99cfc615a8e99f7ac4366406d86c7942a3184cf2"
dependencies = [
"log",
"notify",
"notify-types",
"tempfile",
]
[[package]]
name = "notify-types"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42b8cfee0e339a0337359f3c88165702ac6e600dc01c0cc9579a92d62b08477a"
dependencies = [
"bitflags 2.11.0",
]
[[package]]
name = "ntapi"
version = "0.4.3"
@@ -6117,6 +5979,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]]
name = "plotters"
version = "0.3.7"
@@ -6241,7 +6109,7 @@ dependencies = [
"anyhow",
"backtrace",
"flate2",
"inferno 0.12.4",
"inferno 0.12.5",
"num",
"paste",
"prost",
@@ -6493,15 +6361,6 @@ dependencies = [
"serde",
]
[[package]]
name = "quick-xml"
version = "0.38.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.39.2"
@@ -7506,8 +7365,6 @@ dependencies = [
name = "rustfs-keystone"
version = "0.0.5"
dependencies = [
"anyhow",
"axum",
"bytes",
"futures",
"http 1.4.0",
@@ -7516,7 +7373,6 @@ dependencies = [
"hyper",
"moka",
"reqwest 0.13.2",
"rustfs-common",
"rustfs-credentials",
"rustfs-policy",
"serde",
@@ -7526,7 +7382,6 @@ dependencies = [
"tokio",
"tower",
"tracing",
"uuid",
]
[[package]]
@@ -7652,9 +7507,9 @@ dependencies = [
name = "rustfs-obs"
version = "0.0.5"
dependencies = [
"flexi_logger",
"flate2",
"glob",
"metrics",
"nu-ansi-term",
"nvml-wrapper",
"opentelemetry",
"opentelemetry-appender-tracing",
@@ -7667,6 +7522,7 @@ dependencies = [
"serde",
"smallvec",
"sysinfo",
"tempfile",
"thiserror 2.0.18",
"tokio",
"tracing",
@@ -7674,6 +7530,7 @@ dependencies = [
"tracing-error",
"tracing-opentelemetry",
"tracing-subscriber",
"walkdir",
]
[[package]]
@@ -8388,15 +8245,6 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "serde_fmt"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e497af288b3b95d067a23a4f749f2861121ffcb2f6d8379310dcda040c345ed"
dependencies = [
"serde_core",
]
[[package]]
name = "serde_json"
version = "1.0.149"
@@ -8430,15 +8278,6 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_spanned"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
dependencies = [
"serde_core",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@@ -8683,7 +8522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b3b8565691b22d2bdfc066426ed48f837fc0c5f2c8cad8d9718f7f99d6995c1"
dependencies = [
"anyhow",
"erased-serde 0.3.31",
"erased-serde",
"rustversion",
"serde_core",
]
@@ -8973,84 +8812,6 @@ dependencies = [
"tokio-rustls",
]
[[package]]
name = "sval"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1aaf178a50bbdd86043fce9bf0a5867007d9b382db89d1c96ccae4601ff1ff9"
[[package]]
name = "sval_buffer"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f89273e48f03807ebf51c4d81c52f28d35ffa18a593edf97e041b52de143df89"
dependencies = [
"sval",
"sval_ref",
]
[[package]]
name = "sval_dynamic"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0430f4e18e7eba21a49d10d25a8dec3ce0e044af40b162347e99a8e3c3ced864"
dependencies = [
"sval",
]
[[package]]
name = "sval_fmt"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "835f51b9d7331b9d7fc48fc716c02306fa88c4a076b1573531910c91a525882d"
dependencies = [
"itoa",
"ryu",
"sval",
]
[[package]]
name = "sval_json"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13cbfe3ef406ee2366e7e8ab3678426362085fa9eaedf28cb878a967159dced3"
dependencies = [
"itoa",
"ryu",
"sval",
]
[[package]]
name = "sval_nested"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b20358af4af787c34321a86618c3cae12eabdd0e9df22cd9dd2c6834214c518"
dependencies = [
"sval",
"sval_buffer",
"sval_ref",
]
[[package]]
name = "sval_ref"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb5e500f8eb2efa84f75e7090f7fc43f621b9f8b6cde571c635b3855f97b332a"
dependencies = [
"sval",
]
[[package]]
name = "sval_serde"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2032ae39b11dcc6c18d5fbc50a661ea191cac96484c59ccf49b002261ca2c1"
dependencies = [
"serde_core",
"sval",
"sval_nested",
]
[[package]]
name = "symbolic-common"
version = "12.17.2"
@@ -9468,45 +9229,6 @@ dependencies = [
"tokio",
]
[[package]]
name = "toml"
version = "0.9.12+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863"
dependencies = [
"indexmap 2.13.0",
"serde_core",
"serde_spanned",
"toml_datetime",
"toml_parser",
"toml_writer",
"winnow",
]
[[package]]
name = "toml_datetime"
version = "0.7.5+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
dependencies = [
"serde_core",
]
[[package]]
name = "toml_parser"
version = "1.0.9+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
dependencies = [
"winnow",
]
[[package]]
name = "toml_writer"
version = "1.0.6+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
[[package]]
name = "tonic"
version = "0.14.5"
@@ -9775,12 +9497,6 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
[[package]]
name = "typeid"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
[[package]]
name = "typenum"
version = "1.19.0"
@@ -9931,42 +9647,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "value-bag"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ba6f5989077681266825251a52748b8c1d8a4ad098cc37e440103d0ea717fc0"
dependencies = [
"value-bag-serde1",
"value-bag-sval2",
]
[[package]]
name = "value-bag-serde1"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16530907bfe2999a1773ca5900a65101e092c70f642f25cc23ca0c43573262c5"
dependencies = [
"erased-serde 0.4.9",
"serde_core",
"serde_fmt",
]
[[package]]
name = "value-bag-sval2"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d00ae130edd690eaa877e4f40605d534790d1cf1d651e7685bd6a144521b251f"
dependencies = [
"sval",
"sval_buffer",
"sval_dynamic",
"sval_fmt",
"sval_json",
"sval_ref",
"sval_serde",
]
[[package]]
name = "vaultrs"
version = "0.7.4"
@@ -10590,12 +10270,6 @@ version = "0.53.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
[[package]]
name = "winnow"
version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
[[package]]
name = "wit-bindgen"
version = "0.51.0"

View File

@@ -200,7 +200,6 @@ derive_builder = "0.20.2"
enumset = "1.1.10"
faster-hex = "0.10.0"
flate2 = "1.1.9"
flexi_logger = { version = "0.31.8", features = ["trc", "dont_minimize_extra_stacks", "compress", "kv", "json"] }
glob = "0.3.3"
google-cloud-storage = "1.8.0"
google-cloud-auth = "1.6.0"
@@ -219,7 +218,6 @@ md5 = "0.8.0"
mime_guess = "2.0.5"
moka = { version = "0.12.13", features = ["future"] }
netif = "0.1.6"
nu-ansi-term = "0.50.3"
num_cpus = { version = "1.17.0" }
nvml-wrapper = "0.12.0"
object_store = "0.12.5"

View File

@@ -164,7 +164,7 @@ pub const DEFAULT_LOG_FILENAME: &str = "rustfs";
/// This is the default log filename for OBS.
/// It is used to store the logs of the application.
/// Default value: rustfs.log
pub const DEFAULT_OBS_LOG_FILENAME: &str = concat!(DEFAULT_LOG_FILENAME, "");
pub const DEFAULT_OBS_LOG_FILENAME: &str = concat!(DEFAULT_LOG_FILENAME, ".log");
/// Default log directory for rustfs
/// This is the default log directory for rustfs.
@@ -183,9 +183,9 @@ pub const DEFAULT_LOG_ROTATION_SIZE_MB: u64 = 100;
/// Default log rotation time for rustfs
/// This is the default log rotation time for rustfs.
/// It is used to rotate the logs of the application.
/// Default value: hour, eg: day,hour,minute,second
/// Default value: hourly, e.g. daily, hourly, minutely
/// Environment variable: RUSTFS_OBS_LOG_ROTATION_TIME
pub const DEFAULT_LOG_ROTATION_TIME: &str = "hour";
pub const DEFAULT_LOG_ROTATION_TIME: &str = "hourly";
/// Default log keep files for rustfs
/// This is the default log keep files for rustfs.

View File

@@ -41,23 +41,30 @@ pub const ENV_OBS_LOG_ROTATION_SIZE_MB: &str = "RUSTFS_OBS_LOG_ROTATION_SIZE_MB"
pub const ENV_OBS_LOG_ROTATION_TIME: &str = "RUSTFS_OBS_LOG_ROTATION_TIME";
pub const ENV_OBS_LOG_KEEP_FILES: &str = "RUSTFS_OBS_LOG_KEEP_FILES";
/// Log pool capacity for async logging
pub const ENV_OBS_LOG_POOL_CAPA: &str = "RUSTFS_OBS_LOG_POOL_CAPA";
/// Log cleanup related configurations
pub const ENV_OBS_LOG_KEEP_COUNT: &str = "RUSTFS_OBS_LOG_KEEP_COUNT";
pub const ENV_OBS_LOG_MAX_TOTAL_SIZE_BYTES: &str = "RUSTFS_OBS_LOG_MAX_TOTAL_SIZE_BYTES";
pub const ENV_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES: &str = "RUSTFS_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES";
pub const ENV_OBS_LOG_COMPRESS_OLD_FILES: &str = "RUSTFS_OBS_LOG_COMPRESS_OLD_FILES";
pub const ENV_OBS_LOG_GZIP_COMPRESSION_LEVEL: &str = "RUSTFS_OBS_LOG_GZIP_COMPRESSION_LEVEL";
pub const ENV_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS: &str = "RUSTFS_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS";
pub const ENV_OBS_LOG_EXCLUDE_PATTERNS: &str = "RUSTFS_OBS_LOG_EXCLUDE_PATTERNS";
pub const ENV_OBS_LOG_DELETE_EMPTY_FILES: &str = "RUSTFS_OBS_LOG_DELETE_EMPTY_FILES";
pub const ENV_OBS_LOG_MIN_FILE_AGE_SECONDS: &str = "RUSTFS_OBS_LOG_MIN_FILE_AGE_SECONDS";
pub const ENV_OBS_LOG_CLEANUP_INTERVAL_SECONDS: &str = "RUSTFS_OBS_LOG_CLEANUP_INTERVAL_SECONDS";
pub const ENV_OBS_LOG_DRY_RUN: &str = "RUSTFS_OBS_LOG_DRY_RUN";
/// Log message capacity for async logging
pub const ENV_OBS_LOG_MESSAGE_CAPA: &str = "RUSTFS_OBS_LOG_MESSAGE_CAPA";
/// Log flush interval in milliseconds for async logging
pub const ENV_OBS_LOG_FLUSH_MS: &str = "RUSTFS_OBS_LOG_FLUSH_MS";
/// Default values for log pool
pub const DEFAULT_OBS_LOG_POOL_CAPA: usize = 10240;
/// Default values for message capacity
pub const DEFAULT_OBS_LOG_MESSAGE_CAPA: usize = 32768;
/// Default values for flush interval in milliseconds
pub const DEFAULT_OBS_LOG_FLUSH_MS: u64 = 200;
/// Default values for log cleanup
pub const DEFAULT_OBS_LOG_KEEP_COUNT: usize = 10;
pub const DEFAULT_OBS_LOG_MAX_TOTAL_SIZE_BYTES: u64 = 2 * 1024 * 1024 * 1024; // 2 GiB
pub const DEFAULT_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES: u64 = 0; // No single file limit
pub const DEFAULT_OBS_LOG_COMPRESS_OLD_FILES: bool = true;
pub const DEFAULT_OBS_LOG_GZIP_COMPRESSION_LEVEL: u32 = 6;
pub const DEFAULT_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS: u64 = 30;
pub const DEFAULT_OBS_LOG_DELETE_EMPTY_FILES: bool = true;
pub const DEFAULT_OBS_LOG_MIN_FILE_AGE_SECONDS: u64 = 3600; // 1 hour
pub const DEFAULT_OBS_LOG_CLEANUP_INTERVAL_SECONDS: u64 = 6 * 3600; // 6 hours
pub const DEFAULT_OBS_LOG_DRY_RUN: bool = false;
/// Default values for observability configuration
// ### Supported Environment Values
@@ -96,6 +103,21 @@ mod tests {
assert_eq!(ENV_OBS_TRACES_EXPORT_ENABLED, "RUSTFS_OBS_TRACES_EXPORT_ENABLED");
assert_eq!(ENV_OBS_METRICS_EXPORT_ENABLED, "RUSTFS_OBS_METRICS_EXPORT_ENABLED");
assert_eq!(ENV_OBS_LOGS_EXPORT_ENABLED, "RUSTFS_OBS_LOGS_EXPORT_ENABLED");
// Test log cleanup related env keys
assert_eq!(ENV_OBS_LOG_KEEP_COUNT, "RUSTFS_OBS_LOG_KEEP_COUNT");
assert_eq!(ENV_OBS_LOG_MAX_TOTAL_SIZE_BYTES, "RUSTFS_OBS_LOG_MAX_TOTAL_SIZE_BYTES");
assert_eq!(ENV_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES, "RUSTFS_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES");
assert_eq!(ENV_OBS_LOG_COMPRESS_OLD_FILES, "RUSTFS_OBS_LOG_COMPRESS_OLD_FILES");
assert_eq!(ENV_OBS_LOG_GZIP_COMPRESSION_LEVEL, "RUSTFS_OBS_LOG_GZIP_COMPRESSION_LEVEL");
assert_eq!(
ENV_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS,
"RUSTFS_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS"
);
assert_eq!(ENV_OBS_LOG_EXCLUDE_PATTERNS, "RUSTFS_OBS_LOG_EXCLUDE_PATTERNS");
assert_eq!(ENV_OBS_LOG_DELETE_EMPTY_FILES, "RUSTFS_OBS_LOG_DELETE_EMPTY_FILES");
assert_eq!(ENV_OBS_LOG_MIN_FILE_AGE_SECONDS, "RUSTFS_OBS_LOG_MIN_FILE_AGE_SECONDS");
assert_eq!(ENV_OBS_LOG_CLEANUP_INTERVAL_SECONDS, "RUSTFS_OBS_LOG_CLEANUP_INTERVAL_SECONDS");
assert_eq!(ENV_OBS_LOG_DRY_RUN, "RUSTFS_OBS_LOG_DRY_RUN");
}
#[test]

View File

@@ -33,12 +33,9 @@ serde_json = { workspace = true }
thiserror = { workspace = true }
tracing = { workspace = true }
time = { workspace = true }
uuid = { workspace = true }
moka = { workspace = true }
rustfs-common = { workspace = true }
rustfs-credentials = { workspace = true }
rustfs-policy = { workspace = true }
anyhow = { workspace = true }
# Middleware dependencies
tower = { workspace = true }
http = { workspace = true }
@@ -51,7 +48,6 @@ futures = { workspace = true }
[dev-dependencies]
tokio = { workspace = true, features = ["test-util"] }
tower = { workspace = true, features = ["util"] }
axum = { workspace = true }
hyper = { workspace = true, features = ["server"] }
serde_json = { workspace = true }

View File

@@ -36,9 +36,9 @@ full = ["gpu"]
[dependencies]
rustfs-config = { workspace = true, features = ["constants", "observability"] }
rustfs-utils = { workspace = true, features = ["ip", "path"] }
flexi_logger = { workspace = true }
flate2 = { workspace = true }
glob = { workspace = true }
metrics = { workspace = true }
nu-ansi-term = { workspace = true }
nvml-wrapper = { workspace = true, optional = true }
opentelemetry = { workspace = true }
opentelemetry-appender-tracing = { workspace = true, features = ["experimental_use_tracing_span_context", "experimental_metadata_attributes"] }
@@ -56,7 +56,9 @@ tracing-subscriber = { workspace = true, features = ["registry", "std", "fmt", "
tokio = { workspace = true, features = ["sync", "fs", "rt-multi-thread", "rt", "time", "macros"] }
sysinfo = { workspace = true }
thiserror = { workspace = true }
walkdir = { workspace = true }
[dev-dependencies]
tokio = { workspace = true, features = ["full"] }
tempfile = { workspace = true }

View File

@@ -1,82 +1,290 @@
[![RustFS](https://rustfs.com/images/rustfs-github.png)](https://rustfs.com)
# rustfs-obs
# RustFS Obs - Observability & Monitoring
<p align="center">
<strong>Comprehensive observability and monitoring system for RustFS distributed object storage</strong>
</p>
<p align="center">
<a href="https://github.com/rustfs/rustfs/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/rustfs/rustfs/actions/workflows/ci.yml/badge.svg" /></a>
<a href="https://docs.rustfs.com/">📖 Documentation</a>
· <a href="https://github.com/rustfs/rustfs/issues">🐛 Bug Reports</a>
· <a href="https://github.com/rustfs/rustfs/discussions">💬 Discussions</a>
</p>
Observability library for [RustFS](https://github.com/rustfs/rustfs) providing structured JSON
logging, distributed tracing, and metrics via OpenTelemetry.
---
## 📖 Overview
## Features
**RustFS Obs** provides comprehensive observability and monitoring capabilities for the [RustFS](https://rustfs.com) distributed object storage system. For the complete RustFS experience, please visit the [main RustFS repository](https://github.com/rustfs/rustfs).
| Feature | Description |
|---------|-------------|
| **Structured logging** | JSON-formatted logs via `tracing-subscriber` |
| **Rolling-file logging** | Daily / hourly rotation with automatic cleanup |
| **Distributed tracing** | OTLP/HTTP export to Jaeger, Tempo, or any OTel collector |
| **Metrics** | OTLP/HTTP export, bridged from the `metrics` crate facade |
| **Log cleanup** | Background task: size limits, gzip compression, retention policies |
| **GPU metrics** *(optional)* | Enable with the `gpu` feature flag |
## ✨ Features
---
- **Environment-Aware Logging**: Automatically configures logging behavior based on deployment environment
- Production: File-only logging (stdout disabled by default for security and log aggregation)
- Development/Test: Full logging with stdout support for debugging
- OpenTelemetry integration for distributed tracing
- Prometheus metrics collection and exposition
- Structured logging with configurable levels and rotation
- Performance profiling and analytics
- Real-time health checks and status monitoring
- Custom dashboards and alerting integration
- Enhanced error handling and resilience
## Quick Start
## 🚀 Environment-Aware Logging
```toml
# Cargo.toml
[dependencies]
rustfs-obs = { version = "0.0.5" }
The obs module automatically adapts logging behavior based on your deployment environment:
### Production Environment
```bash
# Set production environment - disables stdout logging by default
export RUSTFS_OBS_ENVIRONMENT=production
# All logs go to files only (no stdout) for security and log aggregation
# Enhanced error handling with clear failure diagnostics
# GPU metrics support
rustfs-obs = { version = "0.0.5", features = ["gpu"] }
```
### Development/Test Environment
```bash
# Set development environment - enables stdout logging
export RUSTFS_OBS_ENVIRONMENT=development
# Logs appear both in files and stdout for easier debugging
# Full span tracking and verbose error messages
```
### Configuration Override
You can always override the environment defaults:
```rust
use rustfs_obs::OtelConfig;
use rustfs_obs::init_obs;
let config = OtelConfig {
endpoint: "".to_string(),
use_stdout: Some(true), // Explicit override - forces stdout even in production
environment: Some("production".to_string()),
..Default::default()
};
#[tokio::main]
async fn main() {
// Build config from environment variables, then initialise all backends.
let _guard = init_obs(None).await.expect("failed to initialise observability");
tracing::info!("RustFS started");
// _guard is dropped here — all providers are flushed and shut down.
}
```
### Supported Environment Values
- `production` - Secure file-only logging
- `development` - Full debugging with stdout
- `test` - Test environment with stdout support
- `staging` - Staging environment with stdout support
> **Keep `_guard` alive** for the lifetime of your application. Dropping it
> triggers an ordered shutdown of every OpenTelemetry provider.
## 📚 Documentation
---
For comprehensive documentation, examples, and usage guides, please visit the main [RustFS repository](https://github.com/rustfs/rustfs).
## Initialisation
## 📄 License
### With an explicit OTLP endpoint
This project is licensed under the Apache License 2.0 - see the [LICENSE](../../LICENSE) file for details.
```rust
use rustfs_obs::init_obs;
let _guard = init_obs(Some("http://otel-collector:4318".to_string()))
.await
.expect("observability init failed");
```
### With a custom config struct
```rust
use rustfs_obs::{AppConfig, OtelConfig, init_obs_with_config};
let config = AppConfig::new_with_endpoint(Some("http://localhost:4318".to_string()));
let _guard = init_obs_with_config(&config.observability)
.await
.expect("observability init failed");
```
---
## Routing Logic
The library selects a backend automatically based on configuration:
```
1. Any OTLP endpoint set?
└─ YES → Full OTLP/HTTP pipeline (traces + metrics + logs)
2. RUSTFS_OBS_LOG_DIRECTORY set to a non-empty path?
└─ YES → Rolling-file JSON logging
+ Stdout mirror enabled if:
- RUSTFS_OBS_LOG_STDOUT_ENABLED=true (explicit), OR
- RUSTFS_OBS_ENVIRONMENT != "production" (automatic)
3. Default → Stdout-only JSON logging (all signals)
```
**Key Points:**
- When **no log directory** is configured, logs automatically go to **stdout only** (perfect for development)
- When a **log directory** is set, logs go to **rolling files** in that directory
- In **non-production environments**, stdout is automatically mirrored alongside file logging for visibility
- In **production** mode, you must explicitly set `RUSTFS_OBS_LOG_STDOUT_ENABLED=true` to see stdout in addition to files
---
## Environment Variables
All configuration is read from environment variables at startup.
### OTLP / Export
| Variable | Default | Description |
|----------|---------|-------------|
| `RUSTFS_OBS_ENDPOINT` | _(empty)_ | Root OTLP/HTTP endpoint, e.g. `http://otel-collector:4318` |
| `RUSTFS_OBS_TRACE_ENDPOINT` | _(empty)_ | Dedicated trace endpoint (overrides root + `/v1/traces`) |
| `RUSTFS_OBS_METRIC_ENDPOINT` | _(empty)_ | Dedicated metrics endpoint |
| `RUSTFS_OBS_LOG_ENDPOINT` | _(empty)_ | Dedicated log endpoint |
| `RUSTFS_OBS_TRACES_EXPORT_ENABLED` | `true` | Toggle trace export |
| `RUSTFS_OBS_METRICS_EXPORT_ENABLED` | `true` | Toggle metrics export |
| `RUSTFS_OBS_LOGS_EXPORT_ENABLED` | `true` | Toggle OTLP log export |
| `RUSTFS_OBS_USE_STDOUT` | `false` | Mirror all signals to stdout alongside OTLP |
| `RUSTFS_OBS_SAMPLE_RATIO` | `0.1` | Trace sampling ratio `0.0`–`1.0` |
| `RUSTFS_OBS_METER_INTERVAL` | `15` | Metrics export interval (seconds) |
### Service identity
| Variable | Default | Description |
|----------|---------|-------------|
| `RUSTFS_OBS_SERVICE_NAME` | `rustfs` | OTel `service.name` |
| `RUSTFS_OBS_SERVICE_VERSION` | _(crate version)_ | OTel `service.version` |
| `RUSTFS_OBS_ENVIRONMENT` | `development` | Deployment environment (`production`, `development`, …) |
### Local logging
| Variable | Default | Description |
|----------|---------|-------------|
| `RUSTFS_OBS_LOGGER_LEVEL` | `info` | Log level; `RUST_LOG` syntax supported |
| `RUSTFS_OBS_LOG_STDOUT_ENABLED` | `false` | When file logging is active, also mirror to stdout |
| `RUSTFS_OBS_LOG_DIRECTORY` | _(empty)_ | **Directory for rolling log files. When empty, logs go to stdout only** |
| `RUSTFS_OBS_LOG_FILENAME` | `rustfs` | Base filename for rolling logs (date suffix added automatically) |
| `RUSTFS_OBS_LOG_ROTATION_TIME` | `hourly` | Rotation granularity: `minutely`, `hourly`, or `daily` |
| `RUSTFS_OBS_LOG_KEEP_FILES` | `30` | Number of rolling files to keep |
### Log cleanup
| Variable | Default | Description |
|----------|---------|-------------|
| `RUSTFS_OBS_LOG_KEEP_COUNT` | `10` | Minimum files the cleaner must always preserve |
| `RUSTFS_OBS_LOG_MAX_TOTAL_SIZE_BYTES` | `2147483648` | Hard cap on total log directory size (2 GiB) |
| `RUSTFS_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES` | `0` | Per-file size cap; `0` = unlimited |
| `RUSTFS_OBS_LOG_COMPRESS_OLD_FILES` | `true` | Gzip-compress files before deleting |
| `RUSTFS_OBS_LOG_GZIP_COMPRESSION_LEVEL` | `6` | Gzip level `1` (fastest) – `9` (best) |
| `RUSTFS_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS` | `30` | Delete `.gz` archives older than N days; `0` = keep forever |
| `RUSTFS_OBS_LOG_EXCLUDE_PATTERNS` | _(empty)_ | Comma-separated glob patterns to never clean up |
| `RUSTFS_OBS_LOG_DELETE_EMPTY_FILES` | `true` | Remove zero-byte files |
| `RUSTFS_OBS_LOG_MIN_FILE_AGE_SECONDS` | `3600` | Minimum file age (seconds) before cleanup |
| `RUSTFS_OBS_LOG_CLEANUP_INTERVAL_SECONDS` | `21600` | How often the cleanup task runs (6 hours) |
| `RUSTFS_OBS_LOG_DRY_RUN` | `false` | Report deletions without actually removing files |
---
## Examples
### Stdout-only (development default)
```bash
# No RUSTFS_OBS_LOG_DIRECTORY set → stdout JSON
RUSTFS_OBS_LOGGER_LEVEL=debug ./rustfs
```
### Rolling-file logging
```bash
export RUSTFS_OBS_LOG_DIRECTORY=/var/log/rustfs
export RUSTFS_OBS_LOGGER_LEVEL=info
export RUSTFS_OBS_LOG_KEEP_FILES=30
export RUSTFS_OBS_LOG_MAX_TOTAL_SIZE_BYTES=5368709120 # 5 GiB
./rustfs
```
### Full OTLP pipeline (production)
```bash
export RUSTFS_OBS_ENDPOINT=http://otel-collector:4318
export RUSTFS_OBS_ENVIRONMENT=production
export RUSTFS_OBS_SAMPLE_RATIO=0.05 # 5% trace sampling
export RUSTFS_OBS_LOG_DIRECTORY=/var/log/rustfs
export RUSTFS_OBS_LOG_STDOUT_ENABLED=false
./rustfs
```
### Separate per-signal endpoints
```bash
export RUSTFS_OBS_TRACE_ENDPOINT=http://tempo:4318/v1/traces
export RUSTFS_OBS_METRIC_ENDPOINT=http://prometheus-otel:4318/v1/metrics
export RUSTFS_OBS_LOG_ENDPOINT=http://loki-otel:4318/v1/logs
./rustfs
```
### Dry-run cleanup audit
```bash
export RUSTFS_OBS_LOG_DIRECTORY=/var/log/rustfs
export RUSTFS_OBS_LOG_DRY_RUN=true
./rustfs
# Observe log output — no files will actually be deleted.
```
---
## Module Structure
```
rustfs-obs/src/
├── lib.rs # Crate root; public re-exports
├── config.rs # OtelConfig + AppConfig; env-var loading
├── error.rs # TelemetryError type
├── global.rs # init_obs / init_obs_with_config entry points
├── telemetry/ # Backend initialisation
│ ├── mod.rs # init_telemetry routing logic
│ ├── guard.rs # OtelGuard RAII lifecycle manager
│ ├── filter.rs # EnvFilter construction helpers
│ ├── resource.rs # OTel Resource builder
│ ├── local.rs # Stdout-only and rolling-file backends
│ ├── otel.rs # Full OTLP/HTTP pipeline
│ └── recorder.rs # metrics-crate → OTel bridge (Recorder)
├── log_cleanup/ # Background log-file cleanup subsystem
│ ├── mod.rs # LogCleaner public API + tests
│ ├── types.rs # FileInfo shared type
│ ├── scanner.rs # Filesystem discovery
│ ├── compress.rs # Gzip compression helper
│ └── cleaner.rs # Selection, compression, deletion logic
└── system/ # Host metrics (CPU, memory, disk, GPU)
├── mod.rs
├── attributes.rs
├── collector.rs
├── metrics.rs
└── gpu.rs # GPU metrics (feature = "gpu")
```
---
## Using `LogCleaner` Directly
```rust
use std::path::PathBuf;
use rustfs_obs::LogCleaner;
let cleaner = LogCleaner::new(
PathBuf::from("/var/log/rustfs"),
"rustfs.log.".to_string(), // file_prefix
10, // keep_count
2 * 1024 * 1024 * 1024, // max_total_size_bytes (2 GiB)
0, // max_single_file_size_bytes (unlimited)
true, // compress_old_files
6, // gzip_compression_level
30, // compressed_file_retention_days
vec!["current.log".to_string()], // exclude_patterns
true, // delete_empty_files
3600, // min_file_age_seconds (1 hour)
false, // dry_run
);
let (deleted, freed_bytes) = cleaner.cleanup().expect("cleanup failed");
println!("Deleted {deleted} files, freed {freed_bytes} bytes");
```
---
## Feature Flags
| Flag | Description |
|------|-------------|
| _(default)_ | Core logging, tracing, and metrics |
| `gpu` | GPU utilisation metrics via `nvml` |
| `full` | All features enabled |
```toml
# Enable GPU monitoring
rustfs-obs = { version = "0.0.5", features = ["gpu"] }
# Enable everything
rustfs-obs = { version = "0.0.5", features = ["full"] }
```
---
## License
Apache 2.0 — see [LICENSE](../../LICENSE).

View File

@@ -12,93 +12,209 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Observability configuration for RustFS.
//!
//! All configuration is read from environment variables. The canonical list of
//! variable names and their defaults lives in `rustfs-config/src/observability/mod.rs`.
//!
//! Two public structs are provided:
//! - [`OtelConfig`] — the primary flat configuration that drives every backend.
//! - [`AppConfig`] — a thin wrapper used when the config is embedded inside a
//! larger application configuration struct.
use rustfs_config::observability::{
DEFAULT_OBS_ENVIRONMENT_PRODUCTION, ENV_OBS_ENDPOINT, ENV_OBS_ENVIRONMENT, ENV_OBS_LOG_DIRECTORY, ENV_OBS_LOG_ENDPOINT,
ENV_OBS_LOG_FILENAME, ENV_OBS_LOG_KEEP_FILES, ENV_OBS_LOG_ROTATION_SIZE_MB, ENV_OBS_LOG_ROTATION_TIME,
ENV_OBS_LOG_STDOUT_ENABLED, ENV_OBS_LOGGER_LEVEL, ENV_OBS_LOGS_EXPORT_ENABLED, ENV_OBS_METER_INTERVAL,
ENV_OBS_METRIC_ENDPOINT, ENV_OBS_METRICS_EXPORT_ENABLED, ENV_OBS_SAMPLE_RATIO, ENV_OBS_SERVICE_NAME, ENV_OBS_SERVICE_VERSION,
ENV_OBS_TRACE_ENDPOINT, ENV_OBS_TRACES_EXPORT_ENABLED, ENV_OBS_USE_STDOUT,
DEFAULT_OBS_ENVIRONMENT_PRODUCTION, DEFAULT_OBS_LOG_CLEANUP_INTERVAL_SECONDS, DEFAULT_OBS_LOG_COMPRESS_OLD_FILES,
DEFAULT_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS, DEFAULT_OBS_LOG_DELETE_EMPTY_FILES, DEFAULT_OBS_LOG_DRY_RUN,
DEFAULT_OBS_LOG_GZIP_COMPRESSION_LEVEL, DEFAULT_OBS_LOG_KEEP_COUNT, DEFAULT_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES,
DEFAULT_OBS_LOG_MAX_TOTAL_SIZE_BYTES, DEFAULT_OBS_LOG_MIN_FILE_AGE_SECONDS, ENV_OBS_ENDPOINT, ENV_OBS_ENVIRONMENT,
ENV_OBS_LOG_CLEANUP_INTERVAL_SECONDS, ENV_OBS_LOG_COMPRESS_OLD_FILES, ENV_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS,
ENV_OBS_LOG_DELETE_EMPTY_FILES, ENV_OBS_LOG_DIRECTORY, ENV_OBS_LOG_DRY_RUN, ENV_OBS_LOG_ENDPOINT,
ENV_OBS_LOG_EXCLUDE_PATTERNS, ENV_OBS_LOG_FILENAME, ENV_OBS_LOG_GZIP_COMPRESSION_LEVEL, ENV_OBS_LOG_KEEP_COUNT,
ENV_OBS_LOG_KEEP_FILES, ENV_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES, ENV_OBS_LOG_MAX_TOTAL_SIZE_BYTES,
ENV_OBS_LOG_MIN_FILE_AGE_SECONDS, ENV_OBS_LOG_ROTATION_TIME, ENV_OBS_LOG_STDOUT_ENABLED, ENV_OBS_LOGGER_LEVEL,
ENV_OBS_LOGS_EXPORT_ENABLED, ENV_OBS_METER_INTERVAL, ENV_OBS_METRIC_ENDPOINT, ENV_OBS_METRICS_EXPORT_ENABLED,
ENV_OBS_SAMPLE_RATIO, ENV_OBS_SERVICE_NAME, ENV_OBS_SERVICE_VERSION, ENV_OBS_TRACE_ENDPOINT, ENV_OBS_TRACES_EXPORT_ENABLED,
ENV_OBS_USE_STDOUT,
};
use rustfs_config::{
APP_NAME, DEFAULT_LOG_KEEP_FILES, DEFAULT_LOG_LEVEL, DEFAULT_LOG_ROTATION_SIZE_MB, DEFAULT_LOG_ROTATION_TIME,
DEFAULT_OBS_LOG_FILENAME, DEFAULT_OBS_LOG_STDOUT_ENABLED, DEFAULT_OBS_LOGS_EXPORT_ENABLED,
DEFAULT_OBS_METRICS_EXPORT_ENABLED, DEFAULT_OBS_TRACES_EXPORT_ENABLED, ENVIRONMENT, METER_INTERVAL, SAMPLE_RATIO,
SERVICE_VERSION, USE_STDOUT,
APP_NAME, DEFAULT_LOG_KEEP_FILES, DEFAULT_LOG_LEVEL, DEFAULT_LOG_ROTATION_TIME, DEFAULT_OBS_LOG_FILENAME,
DEFAULT_OBS_LOG_STDOUT_ENABLED, DEFAULT_OBS_LOGS_EXPORT_ENABLED, DEFAULT_OBS_METRICS_EXPORT_ENABLED,
DEFAULT_OBS_TRACES_EXPORT_ENABLED, ENVIRONMENT, METER_INTERVAL, SAMPLE_RATIO, SERVICE_VERSION, USE_STDOUT,
};
use rustfs_utils::dirs::get_log_directory_to_string;
use rustfs_utils::{get_env_bool, get_env_f64, get_env_opt_str, get_env_str, get_env_u64, get_env_usize};
use serde::{Deserialize, Serialize};
use std::env;
/// Observability: OpenTelemetry configuration
/// # Fields
/// * `endpoint`: Endpoint for metric collection
/// * `use_stdout`: Output to stdout
/// * `sample_ratio`: Trace sampling ratio
/// * `meter_interval`: Metric collection interval
/// * `service_name`: Service name
/// * `service_version`: Service version
/// * `environment`: Environment
/// * `logger_level`: Logger level
/// * `local_logging_enabled`: Local logging enabled
/// # Added flexi_logger related configurations
/// * `log_directory`: Log file directory
/// * `log_filename`: The name of the log file
/// * `log_rotation_size_mb`: Log file size cut threshold (MB)
/// * `log_rotation_time`: Logs are cut by time (Hour,Day,Minute,Second)
/// * `log_keep_files`: Number of log files to be retained
/// # Returns
/// A new instance of OtelConfig
/// Full observability configuration used by all telemetry backends.
///
/// Fields are grouped into three logical sections:
///
/// ## OpenTelemetry / OTLP export
/// Controls whether and where traces, metrics, and logs are exported over the
/// wire using the OTLP/HTTP protocol.
///
/// ## Local logging
/// Controls the rolling-file appender: directory, filename, rotation policy,
/// and the number of files to retain.
///
/// ## Log cleanup
/// Controls the background cleanup task: size limits, compression, retention
/// of compressed archives, exclusion patterns, and dry-run mode.
///
/// # Design Notes
///
/// - All fields are `Option<T>` to allow partial configuration via environment
/// variables with sensible defaults provided by constants in `rustfs-config`.
/// - `log_keep_count` represents the cleaner's minimum retention; `log_keep_files`
/// controls the rolling-appender's file limit (both typically set to the same value).
///
/// # Example
/// ```no_run
/// use rustfs_obs::OtelConfig;
///
/// // Build from environment variables (typical production usage).
/// let config = OtelConfig::new();
///
/// // Build with an explicit OTLP endpoint.
/// let config = OtelConfig::extract_otel_config_from_env(
/// Some("http://otel-collector:4318".to_string())
/// );
/// ```
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct OtelConfig {
pub endpoint: String, // Endpoint for otel collection
pub trace_endpoint: Option<String>, // Endpoint for trace collection
pub metric_endpoint: Option<String>, // Endpoint for metric collection
pub log_endpoint: Option<String>, // Endpoint for log collection
pub traces_export_enabled: Option<bool>, // Enable/disable trace export
pub metrics_export_enabled: Option<bool>, // Enable/disable metric export
pub logs_export_enabled: Option<bool>, // Enable/disable log export
pub use_stdout: Option<bool>, // Output to stdout
pub sample_ratio: Option<f64>, // Trace sampling ratio
pub meter_interval: Option<u64>, // Metric collection interval
pub service_name: Option<String>, // Service name
pub service_version: Option<String>, // Service version
pub environment: Option<String>, // Environment
pub logger_level: Option<String>, // Logger level
pub log_stdout_enabled: Option<bool>, // Stdout logging enabled
// Added flexi_logger related configurations
pub log_directory: Option<String>, // LOG FILE DIRECTORY
pub log_filename: Option<String>, // The name of the log file
pub log_rotation_size_mb: Option<u64>, // Log file size cut threshold (MB)
pub log_rotation_time: Option<String>, // Logs are cut by time (Hour DayMinute Second)
pub log_keep_files: Option<usize>, // Number of log files to be retained
// ── OTLP export ──────────────────────────────────────────────────────────
/// Root OTLP/HTTP endpoint (e.g. `http://otel-collector:4318`).
/// Per-signal endpoints below take precedence when set.
pub endpoint: String,
/// Dedicated trace endpoint; overrides `endpoint` + `/v1/traces` fallback.
pub trace_endpoint: Option<String>,
/// Dedicated metrics endpoint; overrides `endpoint` + `/v1/metrics` fallback.
pub metric_endpoint: Option<String>,
/// Dedicated log endpoint; overrides `endpoint` + `/v1/logs` fallback.
pub log_endpoint: Option<String>,
/// Whether to export distributed traces (default: `true`).
pub traces_export_enabled: Option<bool>,
/// Whether to export metrics (default: `true`).
pub metrics_export_enabled: Option<bool>,
/// Whether to export logs via OTLP (default: `true`).
pub logs_export_enabled: Option<bool>,
/// **[OTLP-only]** Mirror all signals to stdout in addition to OTLP export.
/// Only applies when an OTLP endpoint is configured.
pub use_stdout: Option<bool>,
    /// Fraction of traces to sample, `0.0`–`1.0` (default: `0.1`).
pub sample_ratio: Option<f64>,
/// Metrics export interval in seconds (default: `15`).
pub meter_interval: Option<u64>,
/// OTel `service.name` attribute (default: `APP_NAME`).
pub service_name: Option<String>,
/// OTel `service.version` attribute (default: `SERVICE_VERSION`).
pub service_version: Option<String>,
/// Deployment environment tag, e.g. `production` or `development`.
pub environment: Option<String>,
// ── Local logging ─────────────────────────────────────────────────────────
/// Minimum log level directive (default: `info`).
/// Respects `RUST_LOG` syntax when set via environment.
pub logger_level: Option<String>,
/// When `true`, a stdout JSON layer is always attached regardless of the
/// active backend (default: `false` in production, `true` otherwise).
pub log_stdout_enabled: Option<bool>,
/// Directory where rolling log files are written.
/// When absent or empty, logging falls back to stdout-only mode.
pub log_directory: Option<String>,
/// Base name for log files (without date suffix), e.g. `rustfs`.
/// Used for both rolling-appender naming and cleanup scanning.
pub log_filename: Option<String>,
/// Rotation time granularity: `"hourly"` or `"daily"` (default: `"daily"`).
pub log_rotation_time: Option<String>,
/// Number of rolling log files to retain (default: `30`).
/// The rolling-appender will delete the oldest file when this limit is exceeded.
pub log_keep_files: Option<usize>,
// ── Log cleanup ───────────────────────────────────────────────────────────
/// Minimum number of files the cleaner must always preserve.
/// Typically set to the same value as `log_keep_files`.
pub log_keep_count: Option<usize>,
/// Hard ceiling on the total size (bytes) of all log files (default: 2 GiB).
pub log_max_total_size_bytes: Option<u64>,
/// Per-file size ceiling (bytes); `0` means unlimited (default: `0`).
pub log_max_single_file_size_bytes: Option<u64>,
/// Compress eligible files with gzip before deletion (default: `true`).
pub log_compress_old_files: Option<bool>,
    /// Gzip compression level `1`–`9` (default: `6`).
pub log_gzip_compression_level: Option<u32>,
/// Delete compressed archives older than this many days; `0` = keep forever
/// (default: `30`).
pub log_compressed_file_retention_days: Option<u64>,
/// Comma-separated glob patterns for files that must never be cleaned up.
pub log_exclude_patterns: Option<String>,
/// Delete zero-byte log files during cleanup (default: `true`).
pub log_delete_empty_files: Option<bool>,
/// A file younger than this many seconds is never touched (default: `3600`).
pub log_min_file_age_seconds: Option<u64>,
/// How often the background cleanup task runs, in seconds (default: `21600`).
pub log_cleanup_interval_seconds: Option<u64>,
/// Log what *would* be deleted without actually removing anything
/// (default: `false`).
pub log_dry_run: Option<bool>,
}
impl OtelConfig {
/// Helper function: Extract observable configuration from environment variables
/// Build an [`OtelConfig`] from environment variables.
///
/// The optional `endpoint` argument sets the root OTLP endpoint. If it is
/// `None` or an empty string the value is read from the
/// `RUSTFS_OBS_ENDPOINT` environment variable instead.
///
/// When no endpoint is configured at all, `use_stdout` is forced to `true`
/// so that logs are still visible during development.
///
/// # Example
/// ```no_run
/// use rustfs_obs::OtelConfig;
///
/// // Read everything from env vars.
/// let config = OtelConfig::extract_otel_config_from_env(None);
///
/// // Override the endpoint programmatically.
/// let config = OtelConfig::extract_otel_config_from_env(
/// Some("http://localhost:4318".to_string())
/// );
/// ```
pub fn extract_otel_config_from_env(endpoint: Option<String>) -> OtelConfig {
let endpoint = if let Some(endpoint) = endpoint {
if endpoint.is_empty() {
env::var(ENV_OBS_ENDPOINT).unwrap_or_else(|_| "".to_string())
} else {
endpoint
}
} else {
env::var(ENV_OBS_ENDPOINT).unwrap_or_else(|_| "".to_string())
let endpoint = match endpoint {
Some(ep) if !ep.is_empty() => ep,
_ => env::var(ENV_OBS_ENDPOINT).unwrap_or_default(),
};
let mut use_stdout = get_env_bool(ENV_OBS_USE_STDOUT, USE_STDOUT);
if endpoint.is_empty() {
use_stdout = true;
}
// Force stdout when there is no remote endpoint so that operators
// always have *some* log output in the default configuration.
let use_stdout = if endpoint.is_empty() {
true
} else {
get_env_bool(ENV_OBS_USE_STDOUT, USE_STDOUT)
};
// The canonical log directory is resolved only when explicitly set via
// environment variable. When absent or empty, logging falls back to
// stdout-only mode (not file-rolling).
let log_directory = match std::env::var(ENV_OBS_LOG_DIRECTORY) {
Ok(val) if !val.is_empty() => Some(val),
_ => None,
};
// `log_keep_files` (legacy) and `log_keep_count` (new) share the same
// environment variables but have slightly different semantics.
// `log_keep_files` is the rolling-appender retention count; `log_keep_count`
// is the cleaner's minimum-keep threshold. Both default to the same value.
let log_keep_files = Some(get_env_usize(ENV_OBS_LOG_KEEP_FILES, DEFAULT_LOG_KEEP_FILES));
let log_keep_count = Some(get_env_usize(ENV_OBS_LOG_KEEP_COUNT, DEFAULT_OBS_LOG_KEEP_COUNT));
// `log_rotation_time` drives the rolling-appender rotation period.
let log_rotation_time = Some(get_env_str(ENV_OBS_LOG_ROTATION_TIME, DEFAULT_LOG_ROTATION_TIME));
OtelConfig {
// OTLP
endpoint,
trace_endpoint: get_env_opt_str(ENV_OBS_TRACE_ENDPOINT),
metric_endpoint: get_env_opt_str(ENV_OBS_METRIC_ENDPOINT),
@@ -112,25 +228,47 @@ impl OtelConfig {
service_name: Some(get_env_str(ENV_OBS_SERVICE_NAME, APP_NAME)),
service_version: Some(get_env_str(ENV_OBS_SERVICE_VERSION, SERVICE_VERSION)),
environment: Some(get_env_str(ENV_OBS_ENVIRONMENT, ENVIRONMENT)),
// Local logging
logger_level: Some(get_env_str(ENV_OBS_LOGGER_LEVEL, DEFAULT_LOG_LEVEL)),
log_stdout_enabled: Some(get_env_bool(ENV_OBS_LOG_STDOUT_ENABLED, DEFAULT_OBS_LOG_STDOUT_ENABLED)),
log_directory: Some(get_log_directory_to_string(ENV_OBS_LOG_DIRECTORY)),
log_directory,
log_filename: Some(get_env_str(ENV_OBS_LOG_FILENAME, DEFAULT_OBS_LOG_FILENAME)),
log_rotation_size_mb: Some(get_env_u64(ENV_OBS_LOG_ROTATION_SIZE_MB, DEFAULT_LOG_ROTATION_SIZE_MB)), // Default to 100 MB
log_rotation_time: Some(get_env_str(ENV_OBS_LOG_ROTATION_TIME, DEFAULT_LOG_ROTATION_TIME)), // Default to "Hour"
log_keep_files: Some(get_env_usize(ENV_OBS_LOG_KEEP_FILES, DEFAULT_LOG_KEEP_FILES)), // Default to keeping 30 log files
log_rotation_time,
log_keep_files,
// Log cleanup
log_keep_count,
log_max_total_size_bytes: Some(get_env_u64(ENV_OBS_LOG_MAX_TOTAL_SIZE_BYTES, DEFAULT_OBS_LOG_MAX_TOTAL_SIZE_BYTES)),
log_max_single_file_size_bytes: Some(get_env_u64(
ENV_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES,
DEFAULT_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES,
)),
log_compress_old_files: Some(get_env_bool(ENV_OBS_LOG_COMPRESS_OLD_FILES, DEFAULT_OBS_LOG_COMPRESS_OLD_FILES)),
log_gzip_compression_level: Some(get_env_u64(
ENV_OBS_LOG_GZIP_COMPRESSION_LEVEL,
DEFAULT_OBS_LOG_GZIP_COMPRESSION_LEVEL as u64,
) as u32),
log_compressed_file_retention_days: Some(get_env_u64(
ENV_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS,
DEFAULT_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS,
)),
log_exclude_patterns: get_env_opt_str(ENV_OBS_LOG_EXCLUDE_PATTERNS),
log_delete_empty_files: Some(get_env_bool(ENV_OBS_LOG_DELETE_EMPTY_FILES, DEFAULT_OBS_LOG_DELETE_EMPTY_FILES)),
log_min_file_age_seconds: Some(get_env_u64(ENV_OBS_LOG_MIN_FILE_AGE_SECONDS, DEFAULT_OBS_LOG_MIN_FILE_AGE_SECONDS)),
log_cleanup_interval_seconds: Some(get_env_u64(
ENV_OBS_LOG_CLEANUP_INTERVAL_SECONDS,
DEFAULT_OBS_LOG_CLEANUP_INTERVAL_SECONDS,
)),
log_dry_run: Some(get_env_bool(ENV_OBS_LOG_DRY_RUN, DEFAULT_OBS_LOG_DRY_RUN)),
}
}
/// Create a new instance of OtelConfig with default values
/// Create a new [`OtelConfig`] populated entirely from environment variables.
///
/// # Returns
/// A new instance of OtelConfig
/// Equivalent to `OtelConfig::extract_otel_config_from_env(None)`.
///
/// # Example
/// ```no_run
/// use rustfs_obs::OtelConfig;
///
/// let config = OtelConfig::new();
/// ```
pub fn new() -> Self {
@@ -138,26 +276,17 @@ impl OtelConfig {
}
}
/// Implement Default trait for OtelConfig
/// This allows creating a default instance of OtelConfig using OtelConfig::default()
/// which internally calls OtelConfig::new()
///
/// # Example
/// ```no_run
/// use rustfs_obs::OtelConfig;
///
/// let config = OtelConfig::default();
/// ```
impl Default for OtelConfig {
fn default() -> Self {
Self::new()
}
}
/// Overall application configuration
/// Add observability configuration
/// Top-level application configuration that embeds [`OtelConfig`].
///
/// Observability: OpenTelemetry configuration
/// Use this when the observability config lives inside a larger `AppConfig`
/// struct, e.g. when deserialising from a config file that also contains other
/// application settings.
///
/// # Example
/// ```
@@ -171,23 +300,18 @@ pub struct AppConfig {
}
impl AppConfig {
/// Create a new instance of AppConfig with default values
///
/// # Returns
/// A new instance of AppConfig
/// Create an [`AppConfig`] with all observability settings read from the
/// environment (no explicit endpoint override).
pub fn new() -> Self {
Self {
observability: OtelConfig::default(),
}
}
/// Create a new instance of AppConfig with specified endpoint
/// Create an [`AppConfig`] with an explicit OTLP endpoint.
///
/// # Arguments
/// * `endpoint` - An optional string representing the endpoint for metric collection
///
/// # Returns
/// A new instance of AppConfig
/// * `endpoint` - Root OTLP/HTTP endpoint URL, or `None` to read from env.
///
/// # Example
/// ```no_run
@@ -202,27 +326,16 @@ impl AppConfig {
}
}
/// Implement Default trait for AppConfig
/// This allows creating a default instance of AppConfig using AppConfig::default()
/// which internally calls AppConfig::new()
///
/// # Example
/// ```no_run
/// use rustfs_obs::AppConfig;
///
/// let config = AppConfig::default();
/// ```
impl Default for AppConfig {
fn default() -> Self {
Self::new()
}
}
/// Check if the current environment is production
///
/// # Returns
/// true if production, false otherwise
/// Returns `true` when the current runtime environment is `production`.
///
/// Reads the `RUSTFS_OBS_ENVIRONMENT` environment variable and compares it
/// case-insensitively against the string `"production"`.
pub fn is_production_environment() -> bool {
get_env_str(ENV_OBS_ENVIRONMENT, ENVIRONMENT).eq_ignore_ascii_case(DEFAULT_OBS_ENVIRONMENT_PRODUCTION)
}

View File

@@ -30,39 +30,39 @@
//! # use all functions
//! rustfs-obs = { version = "0.1.0", features = ["full"] }
//! ```
///
/// ## Usage
///
/// ```no_run
/// use rustfs_obs::init_obs;
///
/// # #[tokio::main]
/// # async fn main() {
/// # let _guard = match init_obs(None).await {
/// # Ok(g) => g,
/// # Err(e) => {
/// # panic!("Failed to initialize observability: {:?}", e);
/// # }
/// # };
/// # // Application logic here
/// # {
/// # // Simulate some work
/// # tokio::time::sleep(std::time::Duration::from_secs(2)).await;
/// # println!("Application is running...");
/// # }
/// # // Guard will be dropped here, flushing telemetry data
/// # }
/// ```
//!
//! ## Usage
//!
//! ```no_run
//! use rustfs_obs::init_obs;
//!
//! # #[tokio::main]
//! # async fn main() {
//! # let _guard = match init_obs(None).await {
//! # Ok(g) => g,
//! # Err(e) => {
//! # panic!("Failed to initialize observability: {:?}", e);
//! # }
//! # };
//! # // Application logic here
//! # {
//! # // Simulate some work
//! # tokio::time::sleep(std::time::Duration::from_secs(2)).await;
//! # println!("Application is running...");
//! # }
//! # // Guard will be dropped here, flushing telemetry data
//! # }
//! ```
mod config;
mod error;
mod global;
mod recorder;
mod log_cleanup;
mod system;
mod telemetry;
pub use config::*;
pub use error::*;
pub use global::*;
pub use recorder::*;
pub use log_cleanup::*;
pub use system::SystemObserver;
pub use telemetry::OtelGuard;
pub use telemetry::{OtelGuard, Recorder};

View File

@@ -0,0 +1,276 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Core log-file cleanup orchestration.
//!
//! [`LogCleaner`] is the public entry point for the cleanup subsystem.
//! Construct it with [`LogCleaner::new`] and call [`LogCleaner::cleanup`]
//! periodically (e.g. from a `tokio::spawn`-ed loop).
//!
//! Internally the cleaner delegates to:
//! - [`super::scanner`] — to discover which files exist and which are eligible,
//! - [`super::compress`] — to gzip-compress files before they are deleted,
//! - [`LogCleaner::select_files_to_delete`] — to apply count / size limits.
use super::compress::compress_file;
use super::scanner::{collect_expired_compressed_files, collect_log_files};
use super::types::FileInfo;
use std::path::PathBuf;
use tracing::{debug, error, info};
/// Log-file lifecycle manager.
///
/// Holds all cleanup policy parameters and exposes a single [`cleanup`] method
/// that performs one full cleanup pass over `log_dir`.
///
/// # Thread-safety
/// `LogCleaner` is `Send + Sync`. Multiple callers can share a reference
/// (e.g. via `Arc`) and call `cleanup` concurrently without data races,
/// because no state is mutated after construction.
pub struct LogCleaner {
    /// Directory containing the managed log files.
    pub(super) log_dir: PathBuf,
    /// Filename prefix that identifies managed files (e.g. `"rustfs.log."`).
    pub(super) file_prefix: String,
    /// The cleaner will never delete files if doing so would leave fewer than
    /// this many files in the directory.
    pub(super) keep_count: usize,
    /// Hard ceiling on the total bytes of all managed files; `0` = no limit.
    pub(super) max_total_size_bytes: u64,
    /// Hard ceiling on a single file's size; `0` = no per-file limit.
    pub(super) max_single_file_size_bytes: u64,
    /// Compress eligible files with gzip before removing them.
    pub(super) compress_old_files: bool,
    /// Gzip compression level (`1`–`9`, clamped on construction).
    pub(super) gzip_compression_level: u32,
    /// Delete compressed archives older than this many days; `0` = keep forever.
    pub(super) compressed_file_retention_days: u64,
    /// Compiled glob patterns for files that must never be cleaned up.
    /// Invalid patterns supplied at construction are silently dropped.
    pub(super) exclude_patterns: Vec<glob::Pattern>,
    /// Delete zero-byte files even when they are younger than `min_file_age_seconds`.
    pub(super) delete_empty_files: bool,
    /// Files younger than this threshold (in seconds) are never touched.
    pub(super) min_file_age_seconds: u64,
    /// When `true`, log what would be done without performing any destructive
    /// filesystem operations.
    pub(super) dry_run: bool,
}
impl LogCleaner {
    /// Build a new [`LogCleaner`] with the supplied policy parameters.
    ///
    /// `exclude_patterns` is a list of glob strings (e.g. `"*.lock"`). Invalid
    /// glob patterns are silently ignored.
    ///
    /// `gzip_compression_level` is clamped to the range `[1, 9]`.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        log_dir: PathBuf,
        file_prefix: String,
        keep_count: usize,
        max_total_size_bytes: u64,
        max_single_file_size_bytes: u64,
        compress_old_files: bool,
        gzip_compression_level: u32,
        compressed_file_retention_days: u64,
        exclude_patterns: Vec<String>,
        delete_empty_files: bool,
        min_file_age_seconds: u64,
        dry_run: bool,
    ) -> Self {
        // Drop invalid glob strings rather than failing construction, so one
        // bad pattern cannot disable the whole cleanup subsystem.
        let patterns = exclude_patterns
            .into_iter()
            .filter_map(|p| glob::Pattern::new(&p).ok())
            .collect();
        Self {
            log_dir,
            file_prefix,
            keep_count,
            max_total_size_bytes,
            max_single_file_size_bytes,
            compress_old_files,
            gzip_compression_level: gzip_compression_level.clamp(1, 9),
            compressed_file_retention_days,
            exclude_patterns: patterns,
            delete_empty_files,
            min_file_age_seconds,
            dry_run,
        }
    }
    /// Perform one full cleanup pass.
    ///
    /// Steps:
    /// 1. Scan the log directory for managed files.
    /// 2. Apply count/size policies to select files for deletion.
    /// 3. Compress selected files (when enabled) and delete the ones that
    ///    were archived successfully; files whose compression failed are
    ///    retained for the next pass.
    /// 4. Collect and delete expired compressed archives.
    ///
    /// # Returns
    /// A tuple `(deleted_count, freed_bytes)` covering all deletions in this
    /// pass (both regular files and expired compressed archives).
    ///
    /// # Errors
    /// Returns an [`std::io::Error`] if the log directory cannot be read.
    pub fn cleanup(&self) -> Result<(usize, u64), std::io::Error> {
        if !self.log_dir.exists() {
            debug!("Log directory does not exist: {:?}", self.log_dir);
            return Ok((0, 0));
        }
        let mut total_deleted = 0usize;
        let mut total_freed = 0u64;
        // ── 1. Discover active log files ──────────────────────────────────────
        let mut files = collect_log_files(
            &self.log_dir,
            &self.file_prefix,
            &self.exclude_patterns,
            self.min_file_age_seconds,
            self.delete_empty_files,
            self.dry_run,
        )?;
        if files.is_empty() {
            debug!("No log files found in directory: {:?}", self.log_dir);
        } else {
            // Oldest-first ordering is what `select_files_to_delete` relies on.
            files.sort_by_key(|f| f.modified);
            let total_size: u64 = files.iter().map(|f| f.size).sum();
            info!(
                "Found {} log files, total size: {} bytes ({:.2} MB)",
                files.len(),
                total_size,
                total_size as f64 / 1024.0 / 1024.0
            );
            // ── 2. Select + compress + delete ─────────────────────────────────
            let to_delete = self.select_files_to_delete(&files, total_size);
            if !to_delete.is_empty() {
                let (d, f) = self.compress_and_delete(&to_delete)?;
                total_deleted += d;
                total_freed += f;
            }
        }
        // ── 3. Remove expired compressed archives ─────────────────────────────
        let expired_gz = collect_expired_compressed_files(&self.log_dir, &self.file_prefix, self.compressed_file_retention_days)?;
        if !expired_gz.is_empty() {
            let (d, f) = self.delete_files(&expired_gz)?;
            total_deleted += d;
            total_freed += f;
        }
        if total_deleted > 0 || total_freed > 0 {
            info!(
                "Cleanup completed: deleted {} files, freed {} bytes ({:.2} MB)",
                total_deleted,
                total_freed,
                total_freed as f64 / 1024.0 / 1024.0
            );
        }
        Ok((total_deleted, total_freed))
    }
    // ─── Selection ────────────────────────────────────────────────────────────
    /// Choose which files from `files` (sorted oldest-first) should be deleted.
    ///
    /// The algorithm respects three constraints:
    /// 1. Always keep at least `keep_count` files (only the oldest
    ///    `len - keep_count` files are even considered).
    /// 2. Delete old files while the total size exceeds `max_total_size_bytes`.
    /// 3. Delete any considered file whose individual size exceeds
    ///    `max_single_file_size_bytes`.
    pub(super) fn select_files_to_delete(&self, files: &[FileInfo], total_size: u64) -> Vec<FileInfo> {
        let mut to_delete = Vec::new();
        // Constraint 1: never drop below `keep_count` files.
        if files.len() <= self.keep_count {
            return to_delete;
        }
        let mut current_size = total_size;
        let deletable = files.len() - self.keep_count;
        for file in files.iter().take(deletable) {
            let over_total = self.max_total_size_bytes > 0 && current_size > self.max_total_size_bytes;
            let over_single = self.max_single_file_size_bytes > 0 && file.size > self.max_single_file_size_bytes;
            if !(over_total || over_single) {
                // This file breaches no limit: keep it, but keep scanning.
                // A *later* deletable file may still exceed the per-file size
                // limit; an early `break` here would silently exempt it from
                // cleanup. (`over_total` can only go from true to false as
                // `current_size` shrinks, so continuing never over-deletes
                // on the total-size constraint.)
                continue;
            }
            if over_single {
                debug!(
                    "File exceeds single-file size limit: {:?} ({} > {} bytes)",
                    file.path, file.size, self.max_single_file_size_bytes
                );
            }
            current_size = current_size.saturating_sub(file.size);
            to_delete.push(file.clone());
        }
        to_delete
    }
    // ─── Compression + deletion ───────────────────────────────────────────────
    /// Optionally compress and then delete the given files.
    ///
    /// When compression is enabled, only files that were compressed
    /// successfully are deleted; a file whose compression fails is kept on
    /// disk (with a warning) so its contents are not lost, and it will be
    /// retried on the next cleanup pass. This matches `compress_file`'s
    /// contract: deletion happens only after compression succeeds.
    fn compress_and_delete(&self, files: &[FileInfo]) -> Result<(usize, u64), std::io::Error> {
        if !self.compress_old_files {
            return self.delete_files(files);
        }
        let mut archived = Vec::with_capacity(files.len());
        for f in files {
            match compress_file(&f.path, self.gzip_compression_level, self.dry_run) {
                Ok(()) => archived.push(f.clone()),
                Err(e) => {
                    // Do NOT delete the original: doing so would discard log
                    // data that was never archived.
                    tracing::warn!("Failed to compress {:?}: {} (keeping original for next pass)", f.path, e);
                }
            }
        }
        self.delete_files(&archived)
    }
    /// Delete all files in `files`, logging each operation.
    ///
    /// Errors on individual files are logged but do **not** abort the loop.
    ///
    /// # Returns
    /// `(deleted_count, freed_bytes)`.
    pub(super) fn delete_files(&self, files: &[FileInfo]) -> Result<(usize, u64), std::io::Error> {
        let mut deleted = 0usize;
        let mut freed = 0u64;
        for f in files {
            if self.dry_run {
                info!("[DRY RUN] Would delete: {:?} ({} bytes)", f.path, f.size);
                deleted += 1;
                freed += f.size;
            } else {
                match std::fs::remove_file(&f.path) {
                    Ok(()) => {
                        debug!("Deleted: {:?}", f.path);
                        deleted += 1;
                        freed += f.size;
                    }
                    Err(e) => {
                        error!("Failed to delete {:?}: {}", f.path, e);
                    }
                }
            }
        }
        Ok((deleted, freed))
    }
}

View File

@@ -0,0 +1,76 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Gzip compression helper for old log files.
//!
//! Files are compressed in place: `<name>` → `<name>.gz`. The original file
//! is **not** deleted here — deletion is handled by the caller after
//! compression succeeds.
use flate2::Compression;
use flate2::write::GzEncoder;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::Path;
use tracing::{debug, info};
/// Compress `path` to `<path>.gz` using gzip.
///
/// The archive name is formed by **appending** `.gz` to the full filename
/// (e.g. `rustfs.log.2024-01-01` → `rustfs.log.2024-01-01.gz`). Using
/// `Path::with_extension` here would instead *replace* the trailing
/// extension, so every rotated file would collide on the same archive name
/// and only the first would ever be compressed.
///
/// If a `.gz` file for the given path already exists the function returns
/// `Ok(())` immediately without overwriting the existing archive.
///
/// # Arguments
/// * `path` - Path to the uncompressed log file.
/// * `level` - Gzip compression level (`1`–`9`); clamped automatically.
/// * `dry_run` - When `true`, log what would be done without writing anything.
///
/// # Errors
/// Propagates any I/O error encountered while opening, reading, writing, or
/// flushing files.
pub(super) fn compress_file(path: &Path, level: u32, dry_run: bool) -> Result<(), std::io::Error> {
    // Append ".gz" rather than replacing the last extension (see doc above).
    let mut gz_name = path.as_os_str().to_os_string();
    gz_name.push(".gz");
    let gz_path = std::path::PathBuf::from(gz_name);
    if gz_path.exists() {
        debug!("Compressed file already exists, skipping: {:?}", gz_path);
        return Ok(());
    }
    if dry_run {
        info!("[DRY RUN] Would compress file: {:?} -> {:?}", path, gz_path);
        return Ok(());
    }
    let original_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
    let mut reader = BufReader::new(File::open(path)?);
    // Stream straight into the encoder so compression runs in constant
    // memory, instead of buffering the whole compressed output in a Vec.
    let mut encoder = GzEncoder::new(BufWriter::new(File::create(&gz_path)?), Compression::new(level.clamp(1, 9)));
    std::io::copy(&mut reader, &mut encoder)?;
    // `finish` flushes the gzip trailer and hands back the inner writer.
    let mut writer = encoder.finish()?;
    writer.flush()?;
    debug!(
        "Compressed {:?} -> {:?} ({} bytes -> {} bytes)",
        path,
        gz_path,
        original_size,
        std::fs::metadata(&gz_path).map(|m| m.len()).unwrap_or(0)
    );
    Ok(())
}

View File

@@ -0,0 +1,178 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Log-file cleanup subsystem.
//!
//! This module provides [`LogCleaner`], a configurable manager that
//! periodically removes, compresses, or archives old rolling log files.
//!
//! ## Sub-modules
//!
//! | Module | Responsibility |
//! |-------------|----------------------------------------------------------|
//! | `types` | Shared data types (`FileInfo`) |
//! | `scanner` | Filesystem traversal — discovers eligible files |
//! | `compress` | Gzip compression helper |
//! | `cleaner` | Core orchestration — selection, compression, deletion |
//!
//! ## Usage
//!
//! ```no_run
//! use std::path::PathBuf;
//! use rustfs_obs::LogCleaner;
//!
//! let cleaner = LogCleaner::new(
//! PathBuf::from("/var/log/rustfs"),
//! "rustfs.log.".to_string(),
//! 10, // keep_count
//! 2 * 1024 * 1024 * 1024, // max_total_size_bytes (2 GiB)
//! 0, // max_single_file_size_bytes (unlimited)
//! true, // compress_old_files
//! 6, // gzip_compression_level
//! 30, // compressed_file_retention_days
//! vec![], // exclude_patterns
//! true, // delete_empty_files
//! 3600, // min_file_age_seconds (1 hour)
//! false, // dry_run
//! );
//!
//! let (deleted, freed_bytes) = cleaner.cleanup().expect("cleanup failed");
//! println!("Deleted {deleted} files, freed {freed_bytes} bytes");
//! ```
mod cleaner;
mod compress;
mod scanner;
mod types;
pub use cleaner::LogCleaner;
#[cfg(test)]
mod tests {
    use super::cleaner::LogCleaner;
    use super::scanner;
    use std::path::Path;
    use tempfile::TempDir;

    /// Write a log file named `name` under `dir`, filled with `size` `'X'` bytes.
    fn create_log_file(dir: &Path, name: &str, size: usize) -> std::io::Result<()> {
        std::fs::write(dir.join(name), vec![b'X'; size])
    }

    /// Build a cleaner with sensible test defaults (no compression, no age gate).
    fn make_cleaner(dir: std::path::PathBuf, keep: usize, max_bytes: u64) -> LogCleaner {
        LogCleaner::new(
            dir,
            "app.log.".to_string(),
            keep,
            max_bytes,
            0,          // max_single_file_size_bytes
            false,      // compress_old_files
            6,          // gzip_compression_level
            30,         // compressed_file_retention_days
            Vec::new(), // exclude_patterns
            true,       // delete_empty_files
            0,          // min_file_age_seconds (0 = no age gate in tests)
            false,      // dry_run
        )
    }

    #[test]
    fn test_cleanup_removes_oldest_when_over_size() -> std::io::Result<()> {
        let tmp = TempDir::new()?;
        let dir = tmp.path().to_path_buf();
        for name in ["app.log.2024-01-01", "app.log.2024-01-02", "app.log.2024-01-03"] {
            create_log_file(&dir, name, 1024)?;
        }
        create_log_file(&dir, "other.log", 1024)?; // not managed
        // Managed total is 3072 bytes against a 2048-byte limit with
        // keep_count = 2, so exactly one file must be removed.
        let cleaner = make_cleaner(dir.clone(), 2, 2048);
        let (deleted, freed) = cleaner.cleanup()?;
        assert_eq!(deleted, 1, "should delete exactly one file");
        assert_eq!(freed, 1024);
        Ok(())
    }

    #[test]
    fn test_cleanup_respects_keep_count() -> std::io::Result<()> {
        let tmp = TempDir::new()?;
        let dir = tmp.path().to_path_buf();
        for i in 1..=5 {
            create_log_file(&dir, &format!("app.log.2024-01-0{i}"), 1024)?;
        }
        // max_bytes = 0 disables the size limit, so keep_count = 3 alone must
        // not trigger any deletion even though five files exist.
        let cleaner = make_cleaner(dir.clone(), 3, 0);
        let (deleted, _) = cleaner.cleanup()?;
        assert_eq!(deleted, 0, "keep_count prevents deletion when no size limit");
        Ok(())
    }

    #[test]
    fn test_cleanup_ignores_unrelated_files() -> std::io::Result<()> {
        let tmp = TempDir::new()?;
        let dir = tmp.path().to_path_buf();
        create_log_file(&dir, "app.log.2024-01-01", 1024)?;
        create_log_file(&dir, "app.log.2024-01-02", 1024)?;
        create_log_file(&dir, "other.log", 512)?; // different prefix
        let cleaner = make_cleaner(dir.clone(), 1, 512);
        let (deleted, _) = cleaner.cleanup()?;
        // "other.log" must not be counted or deleted.
        assert_eq!(deleted, 1, "only managed files should be deleted");
        Ok(())
    }

    #[test]
    fn test_collect_log_files_counts_correctly() -> std::io::Result<()> {
        let tmp = TempDir::new()?;
        let dir = tmp.path().to_path_buf();
        create_log_file(&dir, "app.log.2024-01-01", 1024)?;
        create_log_file(&dir, "app.log.2024-01-02", 2048)?;
        create_log_file(&dir, "other.log", 512)?;
        let files = scanner::collect_log_files(&dir, "app.log.", &[], 0, true, false)?;
        assert_eq!(files.len(), 2, "scanner should find exactly 2 managed files");
        Ok(())
    }

    #[test]
    fn test_dry_run_does_not_delete() -> std::io::Result<()> {
        let tmp = TempDir::new()?;
        let dir = tmp.path().to_path_buf();
        for name in ["app.log.2024-01-01", "app.log.2024-01-02", "app.log.2024-01-03"] {
            create_log_file(&dir, name, 1024)?;
        }
        let cleaner = LogCleaner::new(dir.clone(), "app.log.".to_string(), 1, 1024, 0, false, 6, 30, vec![], true, 0, true);
        let (deleted, _freed) = cleaner.cleanup()?;
        // dry_run=true reports deletions but doesn't actually remove files.
        assert!(deleted > 0, "dry_run should report files as deleted");
        assert_eq!(std::fs::read_dir(&dir)?.count(), 3, "no files should actually be removed");
        Ok(())
    }
}

View File

@@ -0,0 +1,197 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Filesystem scanner for discovering log files eligible for cleanup.
//!
//! This module is mostly read-only: apart from the optional, immediate removal
//! of zero-byte files (when `delete_empty_files` is enabled), it does **not**
//! delete or compress any files — it only reports what it found.
use super::types::FileInfo;
use std::path::Path;
use std::time::{Duration, SystemTime};
use tracing::debug;
use walkdir::WalkDir;
/// Collect all log files in `log_dir` whose name starts with `file_prefix`.
///
/// The following files are skipped and not returned in the result list:
/// - files that are already compressed (`.gz` extension) — those are handled
///   by [`collect_expired_compressed_files`],
/// - files matching one of the `exclude_patterns`,
/// - files modified more recently than `min_file_age_seconds` seconds ago,
/// - zero-byte files when `delete_empty_files` is `true` — these are deleted
///   on the spot (or only logged when `dry_run` is `true`).
///
/// # Arguments
/// * `log_dir` - Root directory to scan (depth 1 only, no recursion).
/// * `file_prefix` - Only filenames starting with this string are considered.
/// * `exclude_patterns` - Compiled glob patterns; matching files are skipped.
/// * `min_file_age_seconds` - Files younger than this threshold are skipped.
/// * `delete_empty_files` - When `true`, zero-byte files are removed immediately
///   during the scan instead of being returned.
/// * `dry_run` - When `true`, the empty-file deletion above is only logged and
///   nothing is removed from disk.
///
/// # Errors
/// In practice this function always returns `Ok`: unreadable directory entries
/// and entries whose metadata cannot be read are silently skipped rather than
/// failing the scan. The `Result` return type is kept for caller symmetry.
pub(super) fn collect_log_files(
    log_dir: &Path,
    file_prefix: &str,
    exclude_patterns: &[glob::Pattern],
    min_file_age_seconds: u64,
    delete_empty_files: bool,
    dry_run: bool,
) -> Result<Vec<FileInfo>, std::io::Error> {
    let mut files = Vec::new();
    let now = SystemTime::now();
    for entry in WalkDir::new(log_dir)
        .max_depth(1)
        .follow_links(false)
        .into_iter()
        .filter_map(|e| e.ok())
    {
        let path = entry.path();
        if !path.is_file() {
            continue;
        }
        // Skip entries whose name is not valid UTF-8 — prefix/pattern matching
        // below needs a &str.
        let filename = match path.file_name().and_then(|n| n.to_str()) {
            Some(f) => f,
            None => continue,
        };
        // Only manage files that carry our prefix.
        if !filename.starts_with(file_prefix) {
            continue;
        }
        // Compressed files are handled by collect_compressed_files.
        if filename.ends_with(".gz") {
            continue;
        }
        // Honour exclusion patterns.
        if is_excluded(filename, exclude_patterns) {
            debug!("Excluding file from cleanup: {:?}", filename);
            continue;
        }
        let metadata = match entry.metadata() {
            Ok(m) => m,
            Err(_) => continue,
        };
        let modified = match metadata.modified() {
            Ok(t) => t,
            Err(_) => continue,
        };
        let file_size = metadata.len();
        // Delete zero-byte files immediately (outside the normal selection
        // logic) when the feature is enabled.
        if file_size == 0 && delete_empty_files {
            if !dry_run {
                if let Err(e) = std::fs::remove_file(path) {
                    tracing::warn!("Failed to delete empty file {:?}: {}", path, e);
                } else {
                    debug!("Deleted empty file: {:?}", path);
                }
            } else {
                tracing::info!("[DRY RUN] Would delete empty file: {:?}", path);
            }
            continue;
        }
        // Skip files that are too young.
        if let Ok(age) = now.duration_since(modified)
            && age.as_secs() < min_file_age_seconds
        {
            debug!(
                "Skipping file (too new): {:?}, age: {}s, min_age: {}s",
                filename,
                age.as_secs(),
                min_file_age_seconds
            );
            continue;
        }
        files.push(FileInfo {
            path: path.to_path_buf(),
            size: file_size,
            modified,
        });
    }
    Ok(files)
}
/// Collect compressed `.gz` log files whose age exceeds the retention period.
///
/// A `compressed_file_retention_days` of `0` disables expiry entirely: the
/// function returns an empty list and archives are kept indefinitely.
///
/// # Arguments
/// * `log_dir` - Root directory to scan (depth 1, symlinks not followed).
/// * `file_prefix` - Only `.gz` files that also start with this prefix are considered.
/// * `compressed_file_retention_days` - Files strictly older than this many
///   days are returned as deletion candidates; `0` means never delete.
pub(super) fn collect_expired_compressed_files(
    log_dir: &Path,
    file_prefix: &str,
    compressed_file_retention_days: u64,
) -> Result<Vec<FileInfo>, std::io::Error> {
    // Retention of 0 means "keep forever".
    if compressed_file_retention_days == 0 {
        return Ok(Vec::new());
    }
    let cutoff = Duration::from_secs(compressed_file_retention_days * 24 * 3600);
    let scan_time = SystemTime::now();
    let mut expired = Vec::new();
    let walker = WalkDir::new(log_dir).max_depth(1).follow_links(false);
    for entry in walker.into_iter().flatten() {
        let path = entry.path();
        if !path.is_file() {
            continue;
        }
        let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
            continue;
        };
        let is_managed_archive = name.starts_with(file_prefix) && name.ends_with(".gz");
        if !is_managed_archive {
            continue;
        }
        // Entries with unreadable metadata or clock skew are skipped silently.
        let Ok(metadata) = entry.metadata() else { continue };
        let Ok(modified) = metadata.modified() else { continue };
        let Ok(elapsed) = scan_time.duration_since(modified) else { continue };
        if elapsed > cutoff {
            expired.push(FileInfo {
                path: path.to_path_buf(),
                size: metadata.len(),
                modified,
            });
        }
    }
    Ok(expired)
}
/// Returns `true` if `filename` matches any of the compiled exclusion patterns.
pub(super) fn is_excluded(filename: &str, patterns: &[glob::Pattern]) -> bool {
    for pattern in patterns {
        if pattern.matches(filename) {
            return true;
        }
    }
    false
}

View File

@@ -0,0 +1,33 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Shared types used across the log-cleanup sub-modules.
use std::path::PathBuf;
use std::time::SystemTime;
/// Metadata for a single log file discovered by the scanner.
///
/// Carries enough information to make cleanup decisions (sort by age, compare
/// size against limits, etc.) without re-reading filesystem metadata on every
/// operation. `size` and `modified` are snapshots taken at scan time; the file
/// may change on disk afterwards.
#[derive(Debug, Clone)]
pub(super) struct FileInfo {
    /// Path to the file as produced by the scanner (absolute whenever the
    /// scan root directory was absolute).
    pub path: PathBuf,
    /// File size in bytes at the time of discovery.
    pub size: u64,
    /// Last-modification timestamp from the filesystem.
    pub modified: SystemTime,
}

View File

@@ -1,725 +0,0 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use crate::config::OtelConfig;
use crate::global::OBSERVABILITY_METRIC_ENABLED;
use crate::{Recorder, TelemetryError};
use flexi_logger::{DeferredNow, Record, WriteMode, WriteMode::AsyncWith, style};
use metrics::counter;
use nu_ansi_term::Color;
use opentelemetry::{KeyValue, global, trace::TracerProvider};
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
use opentelemetry_otlp::{Compression, Protocol, WithExportConfig, WithHttpConfig};
use opentelemetry_sdk::propagation::TraceContextPropagator;
use opentelemetry_sdk::{
Resource,
logs::SdkLoggerProvider,
metrics::{PeriodicReader, SdkMeterProvider},
trace::{RandomIdGenerator, Sampler, SdkTracerProvider},
};
use opentelemetry_semantic_conventions::{
SCHEMA_URL,
attribute::{DEPLOYMENT_ENVIRONMENT_NAME, NETWORK_LOCAL_ADDRESS, SERVICE_VERSION as OTEL_SERVICE_VERSION},
};
use rustfs_config::{
APP_NAME, DEFAULT_LOG_KEEP_FILES, DEFAULT_LOG_LEVEL, DEFAULT_OBS_LOG_STDOUT_ENABLED, DEFAULT_OBS_LOGS_EXPORT_ENABLED,
DEFAULT_OBS_METRICS_EXPORT_ENABLED, DEFAULT_OBS_TRACES_EXPORT_ENABLED, ENVIRONMENT, METER_INTERVAL, SAMPLE_RATIO,
SERVICE_VERSION,
observability::{
DEFAULT_OBS_ENVIRONMENT_PRODUCTION, DEFAULT_OBS_LOG_FLUSH_MS, DEFAULT_OBS_LOG_MESSAGE_CAPA, DEFAULT_OBS_LOG_POOL_CAPA,
ENV_OBS_LOG_DIRECTORY, ENV_OBS_LOG_FLUSH_MS, ENV_OBS_LOG_MESSAGE_CAPA, ENV_OBS_LOG_POOL_CAPA,
},
};
use rustfs_utils::{get_env_opt_str, get_env_u64, get_env_usize, get_local_ip_with_default};
use smallvec::SmallVec;
use std::{borrow::Cow, fs, io::IsTerminal, time::Duration};
use tracing::info;
use tracing_error::ErrorLayer;
use tracing_opentelemetry::{MetricsLayer, OpenTelemetryLayer};
use tracing_subscriber::{
EnvFilter, Layer,
fmt::{format::FmtSpan, time::LocalTime},
layer::SubscriberExt,
util::SubscriberInitExt,
};
/// A guard object that manages the lifecycle of OpenTelemetry components.
///
/// This struct holds references to the created OpenTelemetry providers and ensures
/// they are properly shut down when the guard is dropped. It implements the RAII
/// (Resource Acquisition Is Initialization) pattern for managing telemetry resources.
///
/// When this guard goes out of scope, it will automatically shut down:
/// - The tracer provider (for distributed tracing)
/// - The meter provider (for metrics collection)
/// - The logger provider (for structured logging)
///
/// Implement Debug trait correctly, rather than using derive, as some fields may not have implemented Debug
pub struct OtelGuard {
    /// Tracer provider; shut down on drop. `None` when tracing export was not set up.
    tracer_provider: Option<SdkTracerProvider>,
    /// Meter provider; shut down on drop. `None` when metrics export was not set up.
    meter_provider: Option<SdkMeterProvider>,
    /// Logger provider; shut down on drop. `None` when log export was not set up.
    logger_provider: Option<SdkLoggerProvider>,
    /// Handle to the flexi_logger backend (rolling-file mode); explicitly shut
    /// down on drop to flush pending writes.
    flexi_logger_handles: Option<flexi_logger::LoggerHandle>,
    /// Worker guard for the non-blocking stdout writer (stdout mode); dropping
    /// it flushes buffered log lines.
    tracing_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
}
impl std::fmt::Debug for OtelGuard {
    // Hand-rolled `Debug`: some held components may not implement `Debug`
    // themselves (see the struct docs), so only presence flags are reported.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut dbg = f.debug_struct("OtelGuard");
        dbg.field("tracer_provider", &self.tracer_provider.is_some());
        dbg.field("meter_provider", &self.meter_provider.is_some());
        dbg.field("logger_provider", &self.logger_provider.is_some());
        dbg.field("flexi_logger_handles", &self.flexi_logger_handles.is_some());
        dbg.field("tracing_guard", &self.tracing_guard.is_some());
        dbg.finish()
    }
}
impl Drop for OtelGuard {
fn drop(&mut self) {
if let Some(provider) = self.tracer_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Tracer shutdown error: {err:?}");
}
if let Some(provider) = self.meter_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Meter shutdown error: {err:?}");
}
if let Some(provider) = self.logger_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Logger shutdown error: {err:?}");
}
if let Some(handle) = self.flexi_logger_handles.take() {
handle.shutdown();
println!("flexi_logger shutdown completed");
}
if let Some(guard) = self.tracing_guard.take() {
drop(guard);
println!("Tracing guard dropped, flushing logs.");
}
}
}
/// Create the OpenTelemetry [`Resource`] describing this service instance.
///
/// Service name, version, and deployment environment fall back to the
/// crate-wide defaults when not set in `config`; the local network address
/// is always attached.
fn resource(config: &OtelConfig) -> Resource {
    // Note: the previous `Cow::Borrowed(x).to_string()` was a no-op wrapper;
    // converting the `&str` directly is equivalent.
    Resource::builder()
        .with_service_name(config.service_name.as_deref().unwrap_or(APP_NAME).to_string())
        .with_schema_url(
            [
                KeyValue::new(
                    OTEL_SERVICE_VERSION,
                    config.service_version.as_deref().unwrap_or(SERVICE_VERSION).to_string(),
                ),
                KeyValue::new(
                    DEPLOYMENT_ENVIRONMENT_NAME,
                    config.environment.as_deref().unwrap_or(ENVIRONMENT).to_string(),
                ),
                KeyValue::new(NETWORK_LOCAL_ADDRESS, get_local_ip_with_default()),
            ],
            SCHEMA_URL,
        )
        .build()
}
/// Creates a periodic reader that exports metrics to stdout every `interval` seconds.
fn create_periodic_reader(interval: u64) -> PeriodicReader<opentelemetry_stdout::MetricExporter> {
    let exporter = opentelemetry_stdout::MetricExporter::default();
    let period = Duration::from_secs(interval);
    PeriodicReader::builder(exporter).with_interval(period).build()
}
/// Build the asynchronous flexi_logger write mode from environment variables,
/// falling back to the crate defaults for pool size, message capacity, and
/// flush interval.
fn get_env_async_with() -> WriteMode {
    AsyncWith {
        pool_capa: get_env_usize(ENV_OBS_LOG_POOL_CAPA, DEFAULT_OBS_LOG_POOL_CAPA),
        message_capa: get_env_usize(ENV_OBS_LOG_MESSAGE_CAPA, DEFAULT_OBS_LOG_MESSAGE_CAPA),
        flush_interval: Duration::from_millis(get_env_u64(ENV_OBS_LOG_FLUSH_MS, DEFAULT_OBS_LOG_FLUSH_MS)),
    }
}
/// Build an [`EnvFilter`] from `RUST_LOG` when set, otherwise from
/// `default_level` (falling back to `logger_level`).
///
/// Unless the effective logger level is `trace` or `debug`, chatty transport
/// crates (hyper, tonic, h2, reqwest, tower) are silenced so application logs
/// stay readable.
fn build_env_filter(logger_level: &str, default_level: Option<&str>) -> EnvFilter {
    let level = default_level.unwrap_or(logger_level);
    let mut filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level));
    if !matches!(logger_level, "trace" | "debug") {
        // A plain array is enough here — a heap-capable SmallVec buys nothing
        // for a fixed, static list of directives.
        for directive in ["hyper", "tonic", "h2", "reqwest", "tower"] {
            filter = filter.add_directive(format!("{directive}=off").parse().expect("static directive is valid"));
        }
    }
    filter
}
/// Custom Log Formatter Function - Terminal Output (with Color)
///
/// Renders `[timestamp] LEVEL [target] [file:line] [thread-name:thread-id] message`
/// with per-level styling and colored fields.
#[inline(never)]
fn format_with_color(w: &mut dyn std::io::Write, now: &mut DeferredNow, record: &Record) -> Result<(), std::io::Error> {
    let level = record.level();
    let level_style = style(level);
    // Fetch the thread handle once and reuse it for both name and id
    // (previously `std::thread::current()` was called a second time for the id).
    let current = std::thread::current();
    let thread_name = current.name().unwrap_or("unnamed");
    let thread_id = format!("{:?}", current.id());
    write!(
        w,
        "[{}] {} [{}] [{}:{}] [{}:{}] {}",
        now.now().format(flexi_logger::TS_DASHES_BLANK_COLONS_DOT_BLANK),
        level_style.paint(level.to_string()),
        Color::Magenta.paint(record.target()),
        Color::Blue.paint(record.file().unwrap_or("unknown")),
        Color::Blue.paint(record.line().unwrap_or(0).to_string()),
        Color::Green.paint(thread_name),
        Color::Green.paint(thread_id),
        record.args()
    )
}
/// Custom Log Formatter - File Output (No Color)
///
/// Same layout as [`format_with_color`] but without ANSI styling, suitable
/// for rolling log files.
#[inline(never)]
fn format_for_file(w: &mut dyn std::io::Write, now: &mut DeferredNow, record: &Record) -> Result<(), std::io::Error> {
    let level = record.level();
    // Fetch the thread handle once and reuse it for both name and id
    // (previously `std::thread::current()` was called a second time for the id).
    let current = std::thread::current();
    let thread_name = current.name().unwrap_or("unnamed");
    let thread_id = format!("{:?}", current.id());
    write!(
        w,
        "[{}] {} [{}] [{}:{}] [{}:{}] {}",
        now.now().format(flexi_logger::TS_DASHES_BLANK_COLONS_DOT_BLANK),
        level,
        record.target(),
        record.file().unwrap_or("unknown"),
        record.line().unwrap_or(0),
        thread_name,
        thread_id,
        record.args()
    )
}
/// stdout + span information (fix: retain WorkerGuard to avoid releasing after initialization)
fn init_stdout_logging(_config: &OtelConfig, logger_level: &str, is_production: bool) -> OtelGuard {
    let env_filter = build_env_filter(logger_level, None);
    let enable_color = std::io::stdout().is_terminal();
    // The worker guard must stay alive inside the returned OtelGuard, otherwise
    // the non-blocking writer would stop flushing as soon as this function returns.
    let (writer, worker_guard) = tracing_appender::non_blocking(std::io::stdout());
    let span_events = if is_production { FmtSpan::CLOSE } else { FmtSpan::FULL };
    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_timer(LocalTime::rfc_3339())
        .with_target(true)
        .with_ansi(enable_color)
        .with_thread_names(true)
        .with_thread_ids(true)
        .with_file(true)
        .with_line_number(true)
        .with_writer(writer)
        .json()
        .with_current_span(true)
        .with_span_list(true)
        .with_span_events(span_events);
    tracing_subscriber::registry()
        .with(env_filter)
        .with(ErrorLayer::default())
        .with(fmt_layer)
        .init();
    OBSERVABILITY_METRIC_ENABLED.set(false).ok();
    counter!("rustfs.start.total").increment(1);
    info!("Init stdout logging (level: {})", logger_level);
    OtelGuard {
        tracer_provider: None,
        meter_provider: None,
        logger_provider: None,
        flexi_logger_handles: None,
        tracing_guard: Some(worker_guard),
    }
}
/// Parse a rotation-age string from the config; unknown values fall back to daily.
fn parse_rotation_age(time: &str) -> flexi_logger::Age {
    use flexi_logger::Age;
    match time.to_lowercase().as_str() {
        "hour" => Age::Hour,
        "day" => Age::Day,
        "minute" => Age::Minute,
        "second" => Age::Second,
        _ => Age::Day, // The default is by day
    }
}

/// File rolling log (size switching + number retained)
///
/// Creates the log directory (tightening its permissions to at most 0755 on
/// Unix), builds the rotation criterion from the configured time and/or size,
/// and starts flexi_logger in async write mode. A failed logger start is
/// best-effort: it is reported on stderr and the handle stays `None`.
///
/// # Errors
/// Returns a [`TelemetryError`] when the log directory cannot be created or
/// its permissions cannot be adjusted/verified.
fn init_file_logging(config: &OtelConfig, logger_level: &str, is_production: bool) -> Result<OtelGuard, TelemetryError> {
    use flexi_logger::{Age, Cleanup, Criterion, FileSpec, LogSpecification, Naming};
    let service_name = config.service_name.as_deref().unwrap_or(APP_NAME);
    let default_log_directory = rustfs_utils::dirs::get_log_directory_to_string(ENV_OBS_LOG_DIRECTORY);
    let log_directory = config.log_directory.as_deref().unwrap_or(default_log_directory.as_str());
    let log_filename = config.log_filename.as_deref().unwrap_or(service_name);
    let keep_files = config.log_keep_files.unwrap_or(DEFAULT_LOG_KEEP_FILES);
    if let Err(e) = fs::create_dir_all(log_directory) {
        return Err(TelemetryError::Io(e.to_string()));
    }
    #[cfg(unix)]
    {
        use std::fs::Permissions;
        use std::os::unix::fs::PermissionsExt;
        let desired: u32 = 0o755;
        match fs::metadata(log_directory) {
            Ok(meta) => {
                let current = meta.permissions().mode() & 0o777;
                // Only tighten to 0755 if existing permissions are looser than target, avoid loosening
                if (current & !desired) != 0 {
                    if let Err(e) = fs::set_permissions(log_directory, Permissions::from_mode(desired)) {
                        return Err(TelemetryError::SetPermissions(format!(
                            "dir='{log_directory}', want={desired:#o}, have={current:#o}, err={e}"
                        )));
                    }
                    // Second verification
                    if let Ok(meta2) = fs::metadata(log_directory) {
                        let after = meta2.permissions().mode() & 0o777;
                        if after != desired {
                            return Err(TelemetryError::SetPermissions(format!(
                                "dir='{log_directory}', want={desired:#o}, after={after:#o}"
                            )));
                        }
                    }
                }
            }
            Err(e) => {
                return Err(TelemetryError::Io(format!("stat '{log_directory}' failed: {e}")));
            }
        }
    }
    // parsing level
    let log_spec = LogSpecification::parse(logger_level)
        .unwrap_or_else(|_| LogSpecification::parse(DEFAULT_LOG_LEVEL).unwrap_or(LogSpecification::error()));
    // Build the log cutting conditions from the configured time and/or size;
    // the previously duplicated age-string match now lives in `parse_rotation_age`.
    let rotation_criterion = match (config.log_rotation_time.as_deref(), config.log_rotation_size_mb) {
        // Cut by time and size at the same time (MB -> bytes)
        (Some(time), Some(size)) => Criterion::AgeOrSize(parse_rotation_age(time), size * 1024 * 1024),
        // Cut by time only
        (Some(time), None) => Criterion::Age(parse_rotation_age(time)),
        // Cut by size only (MB -> bytes)
        (None, Some(size)) => Criterion::Size(size * 1024 * 1024),
        // By default, it is cut by the day
        _ => Criterion::Age(Age::Day),
    };
    // write mode
    let write_mode = get_env_async_with();
    // Build the logger. `unwrap_or_else` keeps the fallback `Logger` from being
    // constructed eagerly and drops the needless `log_spec.clone()`.
    let mut builder = flexi_logger::Logger::try_with_env_or_str(logger_level)
        .unwrap_or_else(|_| flexi_logger::Logger::with(log_spec))
        .format_for_stderr(format_with_color)
        .format_for_stdout(format_with_color)
        .format_for_files(format_for_file)
        .log_to_file(
            FileSpec::default()
                .directory(log_directory)
                .basename(log_filename)
                .suppress_timestamp(),
        )
        .rotate(rotation_criterion, Naming::TimestampsDirect, Cleanup::KeepLogFiles(keep_files))
        .write_mode(write_mode)
        .append()
        .use_utc();
    // Optional copy to stdout (for local observation)
    if config.log_stdout_enabled.unwrap_or(DEFAULT_OBS_LOG_STDOUT_ENABLED) || !is_production {
        builder = builder.duplicate_to_stdout(flexi_logger::Duplicate::All);
    } else {
        builder = builder.duplicate_to_stdout(flexi_logger::Duplicate::None);
    }
    // Best-effort start: logging must not take the process down.
    let handle = match builder.start() {
        Ok(h) => Some(h),
        Err(e) => {
            eprintln!("ERROR: start flexi_logger failed: {e}");
            None
        }
    };
    OBSERVABILITY_METRIC_ENABLED.set(false).ok();
    info!(
        "Init file logging at '{}', roll size {:?}MB, keep {}",
        log_directory, config.log_rotation_size_mb, keep_files
    );
    Ok(OtelGuard {
        tracer_provider: None,
        meter_provider: None,
        logger_provider: None,
        flexi_logger_handles: handle,
        tracing_guard: None,
    })
}
/// Observability (HTTP export, supports three sub-endpoints; if not, fallback to unified endpoint)
///
/// Builds OTLP/HTTP exporters for traces, metrics, and logs. Each signal uses
/// its own endpoint when configured; otherwise `<endpoint>/v1/{traces,metrics,logs}`
/// is derived from the unified endpoint. Each pipeline can be disabled
/// individually via the `*_export_enabled` config flags. Installs the global
/// tracer/meter providers, the metrics recorder, and the tracing subscriber.
///
/// # Errors
/// Returns a [`TelemetryError`] if building any exporter fails or if the
/// global metrics recorder cannot be installed.
fn init_observability_http(config: &OtelConfig, logger_level: &str, is_production: bool) -> Result<OtelGuard, TelemetryError> {
    // Resources and sampling
    let res = resource(config);
    let service_name = config.service_name.as_deref().unwrap_or(APP_NAME).to_owned();
    // Default: mirror telemetry to stdout everywhere except production.
    let use_stdout = config.use_stdout.unwrap_or(!is_production);
    let sample_ratio = config.sample_ratio.unwrap_or(SAMPLE_RATIO);
    // Ratios in [0, 1) sample probabilistically; anything else (incl. 1.0) samples everything.
    let sampler = if (0.0..1.0).contains(&sample_ratio) {
        Sampler::TraceIdRatioBased(sample_ratio)
    } else {
        Sampler::AlwaysOn
    };
    // Endpoint resolution: a non-empty per-signal endpoint wins, otherwise
    // derive the standard OTLP path from the unified endpoint.
    let root_ep = config.endpoint.clone(); // owned String
    let trace_ep: String = config
        .trace_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/traces"));
    let metric_ep: String = config
        .metric_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/metrics"));
    let log_ep: String = config
        .log_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/logs"));
    // Tracer (OTLP over HTTP); skipped when the endpoint is empty or export is disabled.
    let tracer_provider = {
        if trace_ep.is_empty() || !config.traces_export_enabled.unwrap_or(DEFAULT_OBS_TRACES_EXPORT_ENABLED) {
            None
        } else {
            let exporter = opentelemetry_otlp::SpanExporter::builder()
                .with_http()
                .with_endpoint(trace_ep.as_str())
                .with_protocol(Protocol::HttpBinary)
                .with_compression(Compression::Gzip)
                .build()
                .map_err(|e| TelemetryError::BuildSpanExporter(e.to_string()))?;
            let mut builder = SdkTracerProvider::builder()
                .with_sampler(sampler)
                .with_id_generator(RandomIdGenerator::default())
                .with_resource(res.clone())
                .with_batch_exporter(exporter);
            if use_stdout {
                builder = builder.with_batch_exporter(opentelemetry_stdout::SpanExporter::default());
            }
            let provider = builder.build();
            // Register globally so spans created anywhere use this provider,
            // and propagate trace context via the W3C TraceContext format.
            global::set_tracer_provider(provider.clone());
            global::set_text_map_propagator(TraceContextPropagator::new());
            Some(provider)
        }
    };
    // Meter (OTLP over HTTP); skipped when the endpoint is empty or export is disabled.
    let meter_provider = {
        if metric_ep.is_empty() || !config.metrics_export_enabled.unwrap_or(DEFAULT_OBS_METRICS_EXPORT_ENABLED) {
            None
        } else {
            let exporter = opentelemetry_otlp::MetricExporter::builder()
                .with_http()
                .with_endpoint(metric_ep.as_str())
                .with_temporality(opentelemetry_sdk::metrics::Temporality::default())
                .with_protocol(Protocol::HttpBinary)
                .with_compression(Compression::Gzip)
                .build()
                .map_err(|e| TelemetryError::BuildMetricExporter(e.to_string()))?;
            let meter_interval = config.meter_interval.unwrap_or(METER_INTERVAL);
            let (provider, recorder) = Recorder::builder(service_name.clone())
                .with_meter_provider(|b| {
                    let b = b.with_resource(res.clone()).with_reader(
                        PeriodicReader::builder(exporter)
                            .with_interval(Duration::from_secs(meter_interval))
                            .build(),
                    );
                    if use_stdout {
                        b.with_reader(create_periodic_reader(meter_interval))
                    } else {
                        b
                    }
                })
                .build();
            global::set_meter_provider(provider.clone());
            metrics::set_global_recorder(recorder).map_err(|e| TelemetryError::InstallMetricsRecorder(e.to_string()))?;
            OBSERVABILITY_METRIC_ENABLED.set(true).ok();
            Some(provider)
        }
    };
    // Logger (OTLP over HTTP); skipped when the endpoint is empty or export is disabled.
    let logger_provider = {
        if log_ep.is_empty() || !config.logs_export_enabled.unwrap_or(DEFAULT_OBS_LOGS_EXPORT_ENABLED) {
            None
        } else {
            let exporter = opentelemetry_otlp::LogExporter::builder()
                .with_http()
                .with_endpoint(log_ep.as_str())
                .with_protocol(Protocol::HttpBinary)
                .with_compression(Compression::Gzip)
                .build()
                .map_err(|e| TelemetryError::BuildLogExporter(e.to_string()))?;
            let mut builder = SdkLoggerProvider::builder().with_resource(res);
            builder = builder.with_batch_exporter(exporter);
            if use_stdout {
                builder = builder.with_batch_exporter(opentelemetry_stdout::LogExporter::default());
            }
            Some(builder.build())
        }
    };
    // Tracing subscriber layers: optional JSON stdout layer, plus OTel bridges
    // for whichever providers were actually built above.
    let fmt_layer_opt = {
        if config.log_stdout_enabled.unwrap_or(DEFAULT_OBS_LOG_STDOUT_ENABLED) {
            let enable_color = std::io::stdout().is_terminal();
            let mut layer = tracing_subscriber::fmt::layer()
                .with_timer(LocalTime::rfc_3339())
                .with_target(true)
                .with_ansi(enable_color)
                .with_thread_names(true)
                .with_thread_ids(true)
                .with_file(true)
                .with_line_number(true)
                .json()
                .with_current_span(true)
                .with_span_list(true);
            let span_event = if is_production { FmtSpan::CLOSE } else { FmtSpan::FULL };
            layer = layer.with_span_events(span_event);
            Some(layer.with_filter(build_env_filter(logger_level, None)))
        } else {
            None
        }
    };
    let filter = build_env_filter(logger_level, None);
    let otel_bridge = logger_provider
        .as_ref()
        .map(|p| OpenTelemetryTracingBridge::new(p).with_filter(build_env_filter(logger_level, None)));
    let tracer_layer = tracer_provider
        .as_ref()
        .map(|p| OpenTelemetryLayer::new(p.tracer(service_name.to_string())));
    let metrics_layer = meter_provider.as_ref().map(|p| MetricsLayer::new(p.clone()));
    tracing_subscriber::registry()
        .with(filter)
        .with(ErrorLayer::default())
        .with(fmt_layer_opt)
        .with(tracer_layer)
        .with(otel_bridge)
        .with(metrics_layer)
        .init();
    counter!("rustfs.start.total").increment(1);
    info!(
        "Init observability (HTTP): trace='{}', metric='{}', log='{}'",
        trace_ep, metric_ep, log_ep
    );
    Ok(OtelGuard {
        tracer_provider,
        meter_provider,
        logger_provider,
        flexi_logger_handles: None,
        tracing_guard: None,
    })
}
/// Initialize telemetry. Entry point choosing between three modes:
/// 1. default stdout logging,
/// 2. rolling file logging (when the log directory env variable is set),
/// 3. full observability export over HTTP (when any OTLP endpoint is configured).
pub(crate) fn init_telemetry(config: &OtelConfig) -> Result<OtelGuard, TelemetryError> {
    let environment = config.environment.as_deref().unwrap_or(ENVIRONMENT);
    let is_production = environment.eq_ignore_ascii_case(DEFAULT_OBS_ENVIRONMENT_PRODUCTION);
    let logger_level = config.logger_level.as_deref().unwrap_or(DEFAULT_LOG_LEVEL);
    // Rule 3: observability wins whenever any endpoint (unified or per-signal) is non-empty.
    let endpoint_set = |ep: &Option<String>| ep.as_deref().is_some_and(|s| !s.is_empty());
    let has_obs = !config.endpoint.is_empty()
        || endpoint_set(&config.trace_endpoint)
        || endpoint_set(&config.metric_endpoint)
        || endpoint_set(&config.log_endpoint);
    if has_obs {
        return init_observability_http(config, logger_level, is_production);
    }
    // Rule 2: the user explicitly customized the log directory
    // (ENV_OBS_LOG_DIRECTORY is set and non-empty).
    if get_env_opt_str(ENV_OBS_LOG_DIRECTORY).is_some_and(|d| !d.is_empty()) {
        return init_file_logging(config, logger_level, is_production);
    }
    // Rule 1: default stdout logging.
    // NOTE(review): this deliberately passes DEFAULT_LOG_LEVEL instead of the
    // configured `logger_level` — confirm that is intended.
    Ok(init_stdout_logging(config, DEFAULT_LOG_LEVEL, is_production))
}
#[cfg(test)]
mod tests {
    use super::*;
    use rustfs_config::USE_STDOUT;

    #[test]
    fn test_production_environment_detection() {
        // Consistency fix: mirror the implementation's case-insensitive,
        // allocation-free comparison (`eq_ignore_ascii_case`) instead of the
        // allocating `to_lowercase() == "production"` form.
        let production_envs = ["production", "PRODUCTION", "Production"];
        for env_value in production_envs {
            let is_production = env_value.eq_ignore_ascii_case("production");
            assert!(is_production, "Should detect '{env_value}' as production environment");
        }
    }

    #[test]
    fn test_non_production_environment_detection() {
        // Non-production values must not be misclassified, regardless of case.
        let non_production_envs = ["development", "test", "staging", "dev", "local"];
        for env_value in non_production_envs {
            let is_production = env_value.eq_ignore_ascii_case("production");
            assert!(!is_production, "Should not detect '{env_value}' as production environment");
        }
    }

    #[test]
    fn test_stdout_behavior_logic() {
        // Test the stdout behavior logic without environment manipulation.
        struct TestCase {
            is_production: bool,
            config_use_stdout: Option<bool>,
            expected_use_stdout: bool,
            description: &'static str,
        }
        // Fixed table: plain array instead of `vec!` (no heap allocation needed).
        let test_cases = [
            TestCase {
                is_production: true,
                config_use_stdout: None,
                expected_use_stdout: false,
                description: "Production with no config should disable stdout",
            },
            TestCase {
                is_production: false,
                config_use_stdout: None,
                expected_use_stdout: USE_STDOUT,
                description: "Non-production with no config should use default",
            },
            TestCase {
                is_production: true,
                config_use_stdout: Some(true),
                expected_use_stdout: true,
                description: "Production with explicit true should enable stdout",
            },
            TestCase {
                is_production: true,
                config_use_stdout: Some(false),
                expected_use_stdout: false,
                description: "Production with explicit false should disable stdout",
            },
            TestCase {
                is_production: false,
                config_use_stdout: Some(true),
                expected_use_stdout: true,
                description: "Non-production with explicit true should enable stdout",
            },
        ];
        for case in &test_cases {
            // Production disables stdout by default; non-production follows USE_STDOUT.
            let default_use_stdout = if case.is_production { false } else { USE_STDOUT };
            let actual_use_stdout = case.config_use_stdout.unwrap_or(default_use_stdout);
            assert_eq!(actual_use_stdout, case.expected_use_stdout, "Test case failed: {}", case.description);
        }
    }

    #[test]
    fn test_log_level_filter_mapping_logic() {
        // Test the log level mapping logic used in the real implementation.
        let test_cases = [
            ("trace", "Trace"),
            ("debug", "Debug"),
            ("info", "Info"),
            ("warn", "Warn"),
            ("warning", "Warn"),
            ("error", "Error"),
            ("off", "None"),
            ("invalid_level", "Info"), // Should default to Info
        ];
        for (input_level, expected_variant) in test_cases {
            let filter_variant = match input_level.to_lowercase().as_str() {
                "trace" => "Trace",
                "debug" => "Debug",
                "info" => "Info",
                "warn" | "warning" => "Warn",
                "error" => "Error",
                "off" => "None",
                _ => "Info", // default case
            };
            assert_eq!(
                filter_variant, expected_variant,
                "Log level '{input_level}' should map to '{expected_variant}'"
            );
        }
    }

    #[test]
    fn test_otel_config_environment_defaults() {
        // Test that OtelConfig properly handles environment detection logic.
        let config = OtelConfig {
            endpoint: "".to_string(),
            use_stdout: None,
            environment: Some("production".to_string()),
            ..Default::default()
        };
        // Simulate the logic from init_telemetry: explicit value wins over default.
        let environment = config.environment.as_deref().unwrap_or(ENVIRONMENT);
        assert_eq!(environment, "production");
        // Test with development environment.
        let dev_config = OtelConfig {
            endpoint: "".to_string(),
            use_stdout: None,
            environment: Some("development".to_string()),
            ..Default::default()
        };
        let dev_environment = dev_config.environment.as_deref().unwrap_or(ENVIRONMENT);
        assert_eq!(dev_environment, "development");
    }
}

View File

@@ -0,0 +1,94 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Log filtering utilities for tracing subscribers.
//!
//! This module provides helper functions for building `EnvFilter` instances
//! used across different logging backends (stdout, file, OpenTelemetry).
use smallvec::SmallVec;
use tracing_subscriber::EnvFilter;
/// Build an `EnvFilter` from the given log level string.
///
/// If the `RUST_LOG` environment variable is set, it takes precedence over the
/// provided `logger_level`. For non-verbose levels (`info`, `warn`, `error`),
/// noisy internal crates (`hyper`, `tonic`, `h2`, `reqwest`, `tower`) are
/// automatically silenced to reduce log noise.
///
/// # Arguments
/// * `logger_level` - The desired log level string (e.g., `"info"`, `"debug"`).
/// * `default_level` - An optional override that replaces `logger_level` as the
///   base directive; useful when the caller wants to force a specific level
///   regardless of what is stored in config.
///
/// # Returns
/// A configured `EnvFilter` ready to be attached to a `tracing_subscriber` registry.
pub(super) fn build_env_filter(logger_level: &str, default_level: Option<&str>) -> EnvFilter {
    let level = default_level.unwrap_or(logger_level);
    let mut filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level));
    // Suppress chatty infrastructure crates unless the operator explicitly
    // requests trace/debug output. Compare case-insensitively so that values
    // like "DEBUG" or "Trace" are honoured the same as their lowercase forms.
    let verbose = logger_level.eq_ignore_ascii_case("trace") || logger_level.eq_ignore_ascii_case("debug");
    if !verbose {
        let directives: SmallVec<[&str; 5]> = smallvec::smallvec!["hyper", "tonic", "h2", "reqwest", "tower"];
        for directive in directives {
            // The directive strings are static and known-valid; a parse failure
            // would be a programming error, so `expect` documents the invariant.
            filter = filter.add_directive(format!("{directive}=off").parse().expect("static log directive must parse"));
        }
    }
    filter
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Render the filter's internal state via its `Debug` representation so
    /// tests can inspect which level directives were installed.
    fn debug_repr(filter: &EnvFilter) -> String {
        format!("{filter:?}")
    }

    #[test]
    fn test_build_env_filter_default_level_overrides() {
        // An explicit default_level must win over logger_level.
        let dbg = debug_repr(&build_env_filter("debug", Some("error")));
        assert!(
            dbg.contains("LevelFilter::ERROR"),
            "Expected 'LevelFilter::ERROR' in filter debug output: {dbg}"
        );
    }

    #[test]
    fn test_build_env_filter_suppresses_noisy_crates() {
        // Info level should inject OFF directives for hyper/tonic/etc.
        let dbg = debug_repr(&build_env_filter("info", None));
        assert!(
            dbg.contains("LevelFilter::OFF"),
            "Expected 'LevelFilter::OFF' suppression directives in filter: {dbg}"
        );
    }

    #[test]
    fn test_build_env_filter_debug_no_suppression() {
        // Debug level keeps infrastructure crates audible; no OFF directives.
        let dbg = debug_repr(&build_env_filter("debug", None));
        assert!(!dbg.is_empty());
        assert!(
            dbg.contains("LevelFilter::DEBUG"),
            "Expected 'LevelFilter::DEBUG' in filter debug output: {dbg}"
        );
    }
}

View File

@@ -0,0 +1,103 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! RAII guard for OpenTelemetry provider lifecycle management.
//!
//! [`OtelGuard`] holds all runtime resources created during telemetry
//! initialisation. Dropping it triggers an ordered shutdown:
//!
//! 1. Tracer provider — flushes pending spans.
//! 2. Meter provider — flushes pending metrics.
//! 3. Logger provider — flushes pending log records.
//! 4. Cleanup task — aborted to prevent lingering background work.
//! 5. Tracing worker guard — flushes buffered log lines written by
//! `tracing_appender`.
use opentelemetry_sdk::{logs::SdkLoggerProvider, metrics::SdkMeterProvider, trace::SdkTracerProvider};
/// RAII guard that owns all active OpenTelemetry providers and the
/// `tracing_appender` worker guard.
///
/// Construct this via the `init_*` functions in [`crate::telemetry`] rather
/// than directly. The guard must be kept alive for the entire duration of the
/// application — once dropped, all telemetry pipelines are shut down.
///
/// Every field is an `Option` because each backend (stdout-only, file, OTLP)
/// initialises only the subset of resources it needs; `None` means the
/// resource is not active under the current configuration.
pub struct OtelGuard {
    /// Optional tracer provider for distributed tracing.
    pub(crate) tracer_provider: Option<SdkTracerProvider>,
    /// Optional meter provider for metrics collection.
    pub(crate) meter_provider: Option<SdkMeterProvider>,
    /// Optional logger provider for OTLP log export.
    pub(crate) logger_provider: Option<SdkLoggerProvider>,
    /// Worker guard that keeps the non-blocking `tracing_appender` thread
    /// alive. Dropping it blocks until all buffered records are flushed.
    pub(crate) tracing_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
    /// Optional guard for stdout logging; kept separate to allow independent flushing and shutdown.
    pub(crate) stdout_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
    /// Handle to the background log-cleanup task; aborted on drop.
    pub(crate) cleanup_handle: Option<tokio::task::JoinHandle<()>>,
}
impl std::fmt::Debug for OtelGuard {
    /// Print presence flags for each owned resource rather than the resources
    /// themselves (the provider types carry no useful `Debug` output).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut dbg = f.debug_struct("OtelGuard");
        dbg.field("tracer_provider", &self.tracer_provider.is_some());
        dbg.field("meter_provider", &self.meter_provider.is_some());
        dbg.field("logger_provider", &self.logger_provider.is_some());
        dbg.field("tracing_guard", &self.tracing_guard.is_some());
        dbg.field("stdout_guard", &self.stdout_guard.is_some());
        dbg.field("cleanup_handle", &self.cleanup_handle.is_some());
        dbg.finish()
    }
}
impl Drop for OtelGuard {
/// Shut down all telemetry providers in order.
///
/// Errors during shutdown are printed to `stderr` so they are visible even
/// after the tracing subscriber has been torn down.
fn drop(&mut self) {
if let Some(provider) = self.tracer_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Tracer shutdown error: {err:?}");
}
if let Some(provider) = self.meter_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Meter shutdown error: {err:?}");
}
if let Some(provider) = self.logger_provider.take()
&& let Err(err) = provider.shutdown()
{
eprintln!("Logger shutdown error: {err:?}");
}
if let Some(handle) = self.cleanup_handle.take() {
handle.abort();
eprintln!("Log cleanup task stopped");
}
if let Some(guard) = self.tracing_guard.take() {
drop(guard);
eprintln!("Tracing guard dropped, flushing logs.");
}
if let Some(guard) = self.stdout_guard.take() {
drop(guard);
eprintln!("Stdout guard dropped, flushing logs.");
}
}
}

View File

@@ -0,0 +1,385 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Local logging backend: stdout-only or file-rolling with optional stdout mirror.
//!
//! # Behaviour
//!
//! | Condition | Result |
//! |----------------------------------|----------------------------------------------|
//! | No log directory configured | JSON logs written to **stdout only** |
//! | Log directory configured | JSON logs written to **rolling file**; |
//! | | stdout mirror enabled when `log_stdout_enabled` |
//! | | is `true` or environment is non-production |
//!
//! The function [`init_local_logging`] is the single entry point for both
//! cases; callers do **not** need to distinguish between stdout and file modes.
use crate::TelemetryError;
use crate::config::OtelConfig;
use crate::global::OBSERVABILITY_METRIC_ENABLED;
use crate::log_cleanup::LogCleaner;
use crate::telemetry::filter::build_env_filter;
use metrics::counter;
use rustfs_config::observability::{
DEFAULT_OBS_LOG_CLEANUP_INTERVAL_SECONDS, DEFAULT_OBS_LOG_COMPRESS_OLD_FILES, DEFAULT_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS,
DEFAULT_OBS_LOG_DELETE_EMPTY_FILES, DEFAULT_OBS_LOG_DRY_RUN, DEFAULT_OBS_LOG_GZIP_COMPRESSION_LEVEL,
DEFAULT_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES, DEFAULT_OBS_LOG_MAX_TOTAL_SIZE_BYTES, DEFAULT_OBS_LOG_MIN_FILE_AGE_SECONDS,
};
use rustfs_config::{APP_NAME, DEFAULT_LOG_KEEP_FILES, DEFAULT_LOG_ROTATION_TIME, DEFAULT_OBS_LOG_STDOUT_ENABLED};
use std::{fs, io::IsTerminal, time::Duration};
use tracing::info;
use tracing_error::ErrorLayer;
use tracing_subscriber::{
fmt::{format::FmtSpan, time::LocalTime},
layer::SubscriberExt,
util::SubscriberInitExt,
};
use super::guard::OtelGuard;
/// Initialize local logging (stdout-only or file-rolling).
///
/// With no configured log directory (`log_directory` absent or empty) this
/// installs a non-blocking JSON subscriber writing to **stdout** and returns
/// immediately — no file I/O, no cleanup task.
///
/// With a log directory configured it additionally:
/// 1. Creates the directory (and, on Unix, tightens its permissions).
/// 2. Attaches a rolling-file appender (minutely, hourly, or daily based on
///    `log_rotation_time`).
/// 3. Optionally mirrors output to stdout based on `log_stdout_enabled`.
/// 4. Spawns a background cleanup task driven by the cleanup settings in
///    [`OtelConfig`].
///
/// # Arguments
/// * `config` - Observability configuration, fully populated from environment variables.
/// * `logger_level` - Effective log level string (e.g., `"info"`).
/// * `is_production` - Whether the runtime environment is production; controls
///   span verbosity and stdout mirroring defaults.
///
/// # Returns
/// An [`OtelGuard`] that keeps the `tracing_appender` worker alive and holds
/// a handle to the cleanup task (if started). Dropping the guard flushes
/// in-flight logs and stops the cleanup task.
///
/// # Errors
/// Returns [`TelemetryError`] if the log directory cannot be created or its
/// permissions cannot be set (Unix only).
pub(super) fn init_local_logging(
    config: &OtelConfig,
    logger_level: &str,
    is_production: bool,
) -> Result<OtelGuard, TelemetryError> {
    // An absent or empty directory selects stdout-only mode: skip file setup
    // entirely and let the stdout path handle everything.
    match config.log_directory.as_deref().filter(|dir| !dir.is_empty()) {
        Some(dir) => init_file_logging_internal(config, dir, logger_level, is_production),
        None => Ok(init_stdout_only(config, logger_level, is_production)),
    }
}
// ─── Stdout-only ─────────────────────────────────────────────────────────────
/// Set up a non-blocking stdout JSON subscriber with no file I/O.
///
/// Used when no log directory has been configured. Every record is emitted as
/// a JSON line with an RFC-3339 timestamp, thread identifiers, file/line
/// information, and span context.
///
/// # Arguments
/// * `_config` - Unused at the moment; reserved for future configuration.
/// * `logger_level` - Effective log level string.
/// * `is_production` - Controls span event verbosity (CLOSE only vs FULL).
fn init_stdout_only(_config: &OtelConfig, logger_level: &str, is_production: bool) -> OtelGuard {
    let (writer, worker_guard) = tracing_appender::non_blocking(std::io::stdout());
    // Production keeps span noise down by logging only span-close events.
    let span_events = if is_production { FmtSpan::CLOSE } else { FmtSpan::FULL };
    let json_layer = tracing_subscriber::fmt::layer()
        .with_timer(LocalTime::rfc_3339())
        .with_target(true)
        .with_ansi(std::io::stdout().is_terminal())
        .with_thread_names(true)
        .with_thread_ids(true)
        .with_file(true)
        .with_line_number(true)
        .with_writer(writer)
        .json()
        .with_current_span(true)
        .with_span_list(true)
        .with_span_events(span_events);
    tracing_subscriber::registry()
        .with(build_env_filter(logger_level, None))
        .with(ErrorLayer::default())
        .with(json_layer)
        .init();
    // Local mode: no OTLP metric pipeline is active.
    OBSERVABILITY_METRIC_ENABLED.set(false).ok();
    counter!("rustfs.start.total").increment(1);
    info!("Init stdout logging (level: {})", logger_level);
    OtelGuard {
        tracer_provider: None,
        meter_provider: None,
        logger_provider: None,
        tracing_guard: Some(worker_guard),
        stdout_guard: None,
        cleanup_handle: None,
    }
}
// ─── File-rolling ─────────────────────────────────────────────────────────────
/// Internal implementation for file-based rolling log setup.
///
/// Called by [`init_local_logging`] when a log directory is present.
/// Handles directory creation, permission enforcement (Unix), file appender
/// setup, optional stdout mirror, and log-cleanup task spawning.
///
/// # Errors
/// Returns [`TelemetryError::Io`] if the directory cannot be created or the
/// rolling-file appender cannot be built, and [`TelemetryError::SetPermissions`]
/// (via `ensure_dir_permissions`) on Unix permission failures.
fn init_file_logging_internal(
    config: &OtelConfig,
    log_directory: &str,
    logger_level: &str,
    is_production: bool,
) -> Result<OtelGuard, TelemetryError> {
    let service_name = config.service_name.as_deref().unwrap_or(APP_NAME);
    let log_filename = config.log_filename.as_deref().unwrap_or(service_name);
    let keep_files = config.log_keep_files.unwrap_or(DEFAULT_LOG_KEEP_FILES);

    // ── 1. Ensure the log directory exists ───────────────────────────────────
    if let Err(e) = fs::create_dir_all(log_directory) {
        return Err(TelemetryError::Io(e.to_string()));
    }

    // ── 2. Enforce directory permissions (Unix only) ─────────────────────────
    #[cfg(unix)]
    ensure_dir_permissions(log_directory)?;

    // ── 3. Choose rotation strategy ──────────────────────────────────────────
    // `log_rotation_time` drives the rolling-appender rotation period:
    // "minutely", "hourly", or (the default for any other value) daily.
    let rotation = config
        .log_rotation_time
        .as_deref()
        .unwrap_or(DEFAULT_LOG_ROTATION_TIME)
        .to_lowercase();
    use tracing_appender::rolling::{RollingFileAppender, Rotation};
    let file_appender = RollingFileAppender::builder()
        .rotation(match rotation.as_str() {
            "minutely" => Rotation::MINUTELY,
            "hourly" => Rotation::HOURLY,
            _ => Rotation::DAILY,
        })
        .filename_suffix(log_filename)
        .max_log_files(keep_files)
        .build(log_directory)
        // A builder failure (e.g. invalid directory) is recoverable: surface
        // it as a TelemetryError instead of panicking during startup.
        .map_err(|e| TelemetryError::Io(format!("failed to initialize rolling file appender: {e}")))?;
    let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);

    // ── 4. Build subscriber layers ────────────────────────────────────────────
    let env_filter = build_env_filter(logger_level, None);
    let span_events = if is_production { FmtSpan::CLOSE } else { FmtSpan::FULL };
    // File layer writes JSON without ANSI codes.
    let file_layer = tracing_subscriber::fmt::layer()
        .with_timer(LocalTime::rfc_3339())
        .with_target(true)
        .with_ansi(false)
        .with_thread_names(true)
        .with_thread_ids(true)
        .with_file(true)
        .with_line_number(true)
        .with_writer(non_blocking)
        .json()
        .with_current_span(true)
        .with_span_list(true)
        .with_span_events(span_events.clone());
    // Optional stdout mirror: enabled explicitly via `log_stdout_enabled`, or
    // unconditionally in non-production environments. Unlike the file layer,
    // the mirror uses human-readable (non-JSON) formatting.
    let (stdout_layer, stdout_guard) = if config.log_stdout_enabled.unwrap_or(DEFAULT_OBS_LOG_STDOUT_ENABLED) || !is_production {
        let (stdout_nb, stdout_guard) = tracing_appender::non_blocking(std::io::stdout());
        let enable_color = std::io::stdout().is_terminal();
        let layer = tracing_subscriber::fmt::layer()
            .with_timer(LocalTime::rfc_3339())
            .with_target(true)
            .with_ansi(enable_color)
            .with_thread_names(true)
            .with_thread_ids(true)
            .with_file(true)
            .with_line_number(true)
            .with_writer(stdout_nb)
            .with_span_events(span_events);
        (Some(layer), Some(stdout_guard))
    } else {
        (None, None)
    };
    tracing_subscriber::registry()
        .with(env_filter)
        .with(ErrorLayer::default())
        .with(file_layer)
        .with(stdout_layer)
        .init();
    // Local mode: no OTLP metric pipeline is active.
    OBSERVABILITY_METRIC_ENABLED.set(false).ok();

    // ── 5. Start background cleanup task ─────────────────────────────────────
    let cleanup_handle = spawn_cleanup_task(config, log_directory, log_filename, keep_files);
    info!(
        "Init file logging at '{}', rotation: {}, keep {} files",
        log_directory, rotation, keep_files
    );
    Ok(OtelGuard {
        tracer_provider: None,
        meter_provider: None,
        logger_provider: None,
        tracing_guard: Some(guard),
        stdout_guard,
        cleanup_handle: Some(cleanup_handle),
    })
}
// ─── Directory permissions (Unix) ─────────────────────────────────────────────
/// Ensure the log directory has at most `0755` permissions (Unix only).
///
/// Clears any bits outside `rwxr-xr-x` (i.e. group/other write) while keeping
/// the rest of the existing mode untouched. The previous behaviour of forcing
/// the mode to exactly `0755` could *widen* an already-strict mode (e.g.
/// `0720` would gain read/execute for others); masking with `current & 0o755`
/// only ever tightens. No-ops if permissions are already `0755` or stricter.
///
/// # Errors
/// Returns [`TelemetryError::Io`] if the directory cannot be stat'ed, or
/// [`TelemetryError::SetPermissions`] if tightening fails or does not stick.
#[cfg(unix)]
fn ensure_dir_permissions(log_directory: &str) -> Result<(), TelemetryError> {
    use std::fs::Permissions;
    use std::os::unix::fs::PermissionsExt;
    // Maximum acceptable permission bits: rwxr-xr-x.
    let allowed: u32 = 0o755;
    let meta =
        fs::metadata(log_directory).map_err(|e| TelemetryError::Io(format!("stat '{log_directory}' failed: {e}")))?;
    let current = meta.permissions().mode() & 0o777;
    // Nothing to do when no disallowed bits are set.
    if (current & !allowed) == 0 {
        return Ok(());
    }
    // Strictly tighten: drop only the disallowed bits rather than forcing
    // the full 0755 mask (which could grant permissions the directory
    // never had).
    let target = current & allowed;
    if let Err(e) = fs::set_permissions(log_directory, Permissions::from_mode(target)) {
        return Err(TelemetryError::SetPermissions(format!(
            "dir='{log_directory}', want={target:#o}, have={current:#o}, err={e}"
        )));
    }
    // Second verification pass to confirm the change took effect.
    if let Ok(meta2) = fs::metadata(log_directory) {
        let after = meta2.permissions().mode() & 0o777;
        if after != target {
            return Err(TelemetryError::SetPermissions(format!(
                "dir='{log_directory}', want={target:#o}, after={after:#o}"
            )));
        }
    }
    Ok(())
}
// ─── Cleanup task ─────────────────────────────────────────────────────────────
/// Spawn a background task that periodically cleans up old log files.
///
/// Every cleanup parameter is resolved from [`OtelConfig`] up front (falling
/// back to compiled-in defaults when a field is absent) so the spawned task
/// owns plain data and holds no reference back to the config. The task runs
/// on the current Tokio runtime and should be aborted (via the returned
/// `JoinHandle`) when the application shuts down.
///
/// # Arguments
/// * `config` - Observability config containing cleanup parameters.
/// * `log_directory` - Directory path of the rolling log files.
/// * `log_filename` - Base filename (used as the file prefix for matching).
/// * `keep_files` - Legacy keep-files count; used as fallback when the new
///   `log_keep_count` field is absent.
///
/// # Returns
/// A [`tokio::task::JoinHandle`] for the spawned cleanup loop.
fn spawn_cleanup_task(
    config: &OtelConfig,
    log_directory: &str,
    log_filename: &str,
    keep_files: usize,
) -> tokio::task::JoinHandle<()> {
    let dir = std::path::PathBuf::from(log_directory);
    let prefix = config.log_filename.as_deref().unwrap_or(log_filename).to_string();
    let keep_count = config.log_keep_count.unwrap_or(keep_files);
    // Total-size budget defaults to the per-file default times the keep count.
    let total_size_cap = config
        .log_max_total_size_bytes
        .unwrap_or(DEFAULT_OBS_LOG_MAX_TOTAL_SIZE_BYTES * keep_count as u64);
    let single_file_cap = config
        .log_max_single_file_size_bytes
        .unwrap_or(DEFAULT_OBS_LOG_MAX_SINGLE_FILE_SIZE_BYTES);
    let compress_old = config.log_compress_old_files.unwrap_or(DEFAULT_OBS_LOG_COMPRESS_OLD_FILES);
    let gzip_level = config
        .log_gzip_compression_level
        .unwrap_or(DEFAULT_OBS_LOG_GZIP_COMPRESSION_LEVEL);
    let compressed_retention_days = config
        .log_compressed_file_retention_days
        .unwrap_or(DEFAULT_OBS_LOG_COMPRESSED_FILE_RETENTION_DAYS);
    // Comma-separated exclude list → trimmed pattern vector.
    let exclude_patterns = config
        .log_exclude_patterns
        .as_deref()
        .map(|raw| raw.split(',').map(|p| p.trim().to_string()).collect())
        .unwrap_or_default();
    let delete_empty = config.log_delete_empty_files.unwrap_or(DEFAULT_OBS_LOG_DELETE_EMPTY_FILES);
    let min_file_age = config
        .log_min_file_age_seconds
        .unwrap_or(DEFAULT_OBS_LOG_MIN_FILE_AGE_SECONDS);
    let dry_run = config.log_dry_run.unwrap_or(DEFAULT_OBS_LOG_DRY_RUN);
    let interval_secs = config
        .log_cleanup_interval_seconds
        .unwrap_or(DEFAULT_OBS_LOG_CLEANUP_INTERVAL_SECONDS);
    let cleaner = LogCleaner::new(
        dir,
        prefix,
        keep_count,
        total_size_cap,
        single_file_cap,
        compress_old,
        gzip_level,
        compressed_retention_days,
        exclude_patterns,
        delete_empty,
        min_file_age,
        dry_run,
    );
    tokio::spawn(async move {
        let mut ticker = tokio::time::interval(Duration::from_secs(interval_secs));
        loop {
            ticker.tick().await;
            if let Err(e) = cleaner.cleanup() {
                tracing::warn!("Log cleanup failed: {}", e);
            }
        }
    })
}

View File

@@ -0,0 +1,243 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Telemetry initialisation module for RustFS.
//!
//! This module is the single entry point for all observability backends.
//! Callers should use [`init_telemetry`] and keep the returned [`OtelGuard`]
//! alive for the lifetime of the application.
//!
//! ## Architecture
//!
//! The module is split into focused sub-modules:
//!
//! | Sub-module | Responsibility |
//! |--------------|---------------------------------------------------------|
//! | `guard` | [`OtelGuard`] RAII type for provider lifecycle |
//! | `filter` | `EnvFilter` construction helpers |
//! | `resource` | OpenTelemetry `Resource` builder |
//! | `local` | Local logging: stdout-only **or** rolling-file |
//! | `otel` | Full OTLP/HTTP pipeline (traces + metrics + logs) |
//!
//! ## Routing rules (evaluated in order)
//!
//! 1. **OpenTelemetry** — if any OTLP endpoint is configured, the full HTTP
//! pipeline is initialised via [`otel::init_observability_http`].
//! 2. **File logging** — if `RUSTFS_OBS_LOG_DIRECTORY` (or `log_directory` /
//! `log_dir` in config) is set to a non-empty value, rolling-file logging is
//! initialised together with an optional stdout mirror.
//! 3. **Stdout only** — default fallback; no file I/O, no remote export.
mod filter;
mod guard;
mod local;
mod otel;
mod recorder;
mod resource;
use crate::TelemetryError;
use crate::config::OtelConfig;
pub use guard::OtelGuard;
pub use recorder::Recorder;
use rustfs_config::observability::ENV_OBS_LOG_DIRECTORY;
use rustfs_config::{DEFAULT_LOG_LEVEL, ENVIRONMENT, observability::DEFAULT_OBS_ENVIRONMENT_PRODUCTION};
use rustfs_utils::get_env_opt_str;
/// Initialize the telemetry subsystem according to the provided configuration.
///
/// Evaluates three routing rules in priority order and delegates to the
/// appropriate backend:
///
/// 1. If any OTLP endpoint is set, initialises the full
///    OpenTelemetry HTTP pipeline (traces + metrics + logs).
/// 2. If a log directory is explicitly configured via the
///    `RUSTFS_OBS_LOG_DIRECTORY` environment variable, initialises
///    rolling-file logging with an optional stdout mirror.
/// 3. Otherwise, falls back to stdout-only JSON logging.
///
/// # Arguments
/// * `config` - Observability configuration, typically built from environment
///   variables via [`OtelConfig::extract_otel_config_from_env`].
///
/// # Returns
/// An [`OtelGuard`] that must be kept alive for the duration of the
/// application. Dropping it triggers ordered shutdown of all providers.
///
/// # Errors
/// Returns [`TelemetryError`] when a backend fails to initialise (e.g., cannot
/// create the log directory, or an OTLP exporter cannot connect).
pub(crate) fn init_telemetry(config: &OtelConfig) -> Result<OtelGuard, TelemetryError> {
    let environment = config.environment.as_deref().unwrap_or(ENVIRONMENT);
    let is_production = environment.eq_ignore_ascii_case(DEFAULT_OBS_ENVIRONMENT_PRODUCTION);
    let logger_level = config.logger_level.as_deref().unwrap_or(DEFAULT_LOG_LEVEL);

    // ── Rule 1: OpenTelemetry HTTP pipeline ───────────────────────────────────
    // Activated when at least one OTLP endpoint is non-empty.
    let endpoint_set = |ep: Option<&str>| ep.is_some_and(|s| !s.is_empty());
    let has_obs = !config.endpoint.is_empty()
        || endpoint_set(config.trace_endpoint.as_deref())
        || endpoint_set(config.metric_endpoint.as_deref())
        || endpoint_set(config.log_endpoint.as_deref());
    if has_obs {
        return otel::init_observability_http(config, logger_level, is_production);
    }

    // ── Rule 2 & 3: Local logging (file or stdout) ────────────────────────────
    // `init_local_logging` internally decides between file and stdout mode
    // based on whether a log directory is configured.
    //
    // The environment variable is consulted here (rather than relying solely
    // on the config struct) to honour dynamic overrides set after config
    // construction.
    let env_log_dir = get_env_opt_str(ENV_OBS_LOG_DIRECTORY).filter(|d| !d.is_empty());
    let effective_config = match env_log_dir {
        // Env var is set: make sure the config reflects it so that
        // `init_local_logging` picks up the value even if the struct was
        // built before the env var was set.
        Some(dir) => std::borrow::Cow::Owned(OtelConfig {
            log_directory: Some(dir),
            ..config.clone()
        }),
        None => std::borrow::Cow::Borrowed(config),
    };
    local::init_local_logging(&effective_config, logger_level, is_production)
}
#[cfg(test)]
mod tests {
    use rustfs_config::observability::DEFAULT_OBS_ENVIRONMENT_PRODUCTION;
    use rustfs_config::{ENVIRONMENT, USE_STDOUT};

    /// Mirror of the production-detection comparison used by `init_telemetry`.
    fn detect_production(value: &str) -> bool {
        value.eq_ignore_ascii_case(DEFAULT_OBS_ENVIRONMENT_PRODUCTION)
    }

    #[test]
    fn test_production_environment_detection() {
        // Case-insensitive comparison must identify every spelling of "production".
        for env_value in ["production", "PRODUCTION", "Production"] {
            assert!(detect_production(env_value), "Should detect '{env_value}' as production environment");
        }
    }

    #[test]
    fn test_non_production_environment_detection() {
        // Non-production values must never be misidentified.
        for env_value in ["development", "test", "staging", "dev", "local"] {
            assert!(!detect_production(env_value), "Should not detect '{env_value}' as production environment");
        }
    }

    #[test]
    fn test_stdout_behavior_logic() {
        // Each row: (is_production, explicit config value, expected, description).
        let cases = [
            (true, None, false, "Production with no config should disable stdout"),
            (false, None, USE_STDOUT, "Non-production with no config should use default"),
            (true, Some(true), true, "Production with explicit true should enable stdout"),
            (true, Some(false), false, "Production with explicit false should disable stdout"),
            (false, Some(true), true, "Non-production with explicit true should enable stdout"),
        ];
        for (is_production, config_use_stdout, expected, description) in cases {
            // Production defaults stdout off; non-production follows USE_STDOUT.
            let fallback = if is_production { false } else { USE_STDOUT };
            let actual = config_use_stdout.unwrap_or(fallback);
            assert_eq!(actual, expected, "Test case failed: {description}");
        }
    }

    #[test]
    fn test_log_level_filter_mapping_logic() {
        // Validate the log level string → tracing level mapping used in filters.
        let map_level = |input: &str| match input.to_lowercase().as_str() {
            "trace" => "Trace",
            "debug" => "Debug",
            "info" => "Info",
            "warn" | "warning" => "Warn",
            "error" => "Error",
            "off" => "None",
            _ => "Info",
        };
        let expectations = [
            ("trace", "Trace"),
            ("debug", "Debug"),
            ("info", "Info"),
            ("warn", "Warn"),
            ("warning", "Warn"),
            ("error", "Error"),
            ("off", "None"),
            ("invalid_level", "Info"),
        ];
        for (input, expected) in expectations {
            assert_eq!(map_level(input), expected, "Log level '{input}' should map to '{expected}'");
        }
    }

    #[test]
    fn test_otel_config_environment_defaults() {
        // An explicitly-set environment value must win over the compiled-in default.
        use crate::config::OtelConfig;
        for env_name in ["production", "development"] {
            let config = OtelConfig {
                endpoint: "".to_string(),
                use_stdout: None,
                environment: Some(env_name.to_string()),
                ..Default::default()
            };
            let resolved = config.environment.as_deref().unwrap_or(ENVIRONMENT);
            assert_eq!(resolved, env_name);
        }
    }
}

View File

@@ -0,0 +1,317 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! OpenTelemetry HTTP exporter initialisation.
//!
//! This module sets up full OTLP/HTTP pipelines for:
//! - **Traces** via [`opentelemetry_otlp::SpanExporter`]
//! - **Metrics** via [`opentelemetry_otlp::MetricExporter`]
//! - **Logs** via [`opentelemetry_otlp::LogExporter`]
//!
//! Each signal has a dedicated endpoint field in [`OtelConfig`]. When a
//! per-signal endpoint is absent, the function falls back to appending the
//! standard OTLP path suffix to the root `endpoint` field:
//!
//! | Signal | Fallback path |
//! |---------|-----------------|
//! | Traces | `/v1/traces` |
//! | Metrics | `/v1/metrics` |
//! | Logs | `/v1/logs` |
//!
//! All exporters use **HTTP binary** (Protobuf) encoding with **gzip**
//! compression for efficiency over the wire.
use crate::TelemetryError;
use crate::config::OtelConfig;
use crate::global::OBSERVABILITY_METRIC_ENABLED;
use crate::telemetry::filter::build_env_filter;
use crate::telemetry::guard::OtelGuard;
use crate::telemetry::recorder::Recorder;
use crate::telemetry::resource::build_resource;
use metrics::counter;
use opentelemetry::{global, trace::TracerProvider};
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
use opentelemetry_otlp::{Compression, Protocol, WithExportConfig, WithHttpConfig};
use opentelemetry_sdk::propagation::TraceContextPropagator;
use opentelemetry_sdk::{
logs::SdkLoggerProvider,
metrics::{PeriodicReader, SdkMeterProvider},
trace::{RandomIdGenerator, Sampler, SdkTracerProvider},
};
use rustfs_config::{
APP_NAME, DEFAULT_OBS_LOG_STDOUT_ENABLED, DEFAULT_OBS_LOGS_EXPORT_ENABLED, DEFAULT_OBS_METRICS_EXPORT_ENABLED,
DEFAULT_OBS_TRACES_EXPORT_ENABLED, METER_INTERVAL, SAMPLE_RATIO,
};
use std::{io::IsTerminal, time::Duration};
use tracing::info;
use tracing_error::ErrorLayer;
use tracing_opentelemetry::{MetricsLayer, OpenTelemetryLayer};
use tracing_subscriber::{
Layer,
fmt::{format::FmtSpan, time::LocalTime},
layer::SubscriberExt,
util::SubscriberInitExt,
};
/// Initialize the full OpenTelemetry HTTP pipeline (traces + metrics + logs).
///
/// This function is invoked when at least one OTLP endpoint has been
/// configured. It creates exporters, wires them into SDK providers, installs
/// a global tracer/meter, and builds a `tracing_subscriber` registry that
/// bridges Rust's `tracing` macros to the OTLP pipelines.
///
/// # Arguments
/// * `config` - Fully populated observability configuration.
/// * `logger_level` - Effective log level string (e.g., `"info"`).
/// * `is_production` - Controls span verbosity and stdout layer defaults.
///
/// # Returns
/// An [`OtelGuard`] owning all created providers. Dropping it triggers an
/// ordered shutdown and flushes all pending telemetry data.
///
/// # Errors
/// Returns [`TelemetryError`] if any exporter or provider fails to build.
///
/// # Note
/// This function is intentionally kept unchanged from the pre-refactor
/// implementation to preserve existing OTLP behaviour.
pub(super) fn init_observability_http(
    config: &OtelConfig,
    logger_level: &str,
    is_production: bool,
) -> Result<OtelGuard, TelemetryError> {
    // ── Resource & sampling ──────────────────────────────────────────────────
    let res = build_resource(config);
    // Service name defaults to the application name when not configured.
    let service_name = config.service_name.as_deref().unwrap_or(APP_NAME).to_owned();
    // Stdout mirroring of exported telemetry defaults to ON outside production
    // and OFF in production (matches the use_stdout default tests).
    let use_stdout = config.use_stdout.unwrap_or(!is_production);
    let sample_ratio = config.sample_ratio.unwrap_or(SAMPLE_RATIO);
    // Ratios in the half-open range [0.0, 1.0) use probabilistic sampling.
    // Anything else — including exactly 1.0, negatives, or >1 — falls back to
    // AlwaysOn (for 1.0 that is behaviourally equivalent to ratio sampling).
    let sampler = if (0.0..1.0).contains(&sample_ratio) {
        Sampler::TraceIdRatioBased(sample_ratio)
    } else {
        Sampler::AlwaysOn
    };
    // ── Endpoint resolution ───────────────────────────────────────────────────
    // Each signal may have a dedicated endpoint; if absent, fall back to the
    // root endpoint with the standard OTLP path suffix appended.
    let root_ep = config.endpoint.clone();
    let trace_ep: String = config
        .trace_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/traces"));
    let metric_ep: String = config
        .metric_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/metrics"));
    let log_ep: String = config
        .log_endpoint
        .as_deref()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .unwrap_or_else(|| format!("{root_ep}/v1/logs"));
    // ── Tracer provider (HTTP) ────────────────────────────────────────────────
    // Each builder returns None when its endpoint is empty or the signal is
    // disabled; the builders also install their provider globally as a side
    // effect, so they must run before the subscriber registry is initialised.
    let tracer_provider = build_tracer_provider(&trace_ep, config, res.clone(), sampler, use_stdout)?;
    // ── Meter provider (HTTP) ─────────────────────────────────────────────────
    let meter_provider = build_meter_provider(&metric_ep, config, res.clone(), &service_name, use_stdout)?;
    // ── Logger provider (HTTP) ────────────────────────────────────────────────
    // `res` is moved here (last use), so no clone is needed.
    let logger_provider = build_logger_provider(&log_ep, config, res, use_stdout)?;
    // ── Tracing subscriber registry ───────────────────────────────────────────
    // Build an optional stdout formatting layer. When `log_stdout_enabled` is
    // false the field is `None` and tracing-subscriber will skip it.
    let fmt_layer_opt = if config.log_stdout_enabled.unwrap_or(DEFAULT_OBS_LOG_STDOUT_ENABLED) {
        // Only emit ANSI colour codes when stdout is an interactive terminal.
        let enable_color = std::io::stdout().is_terminal();
        // Production logs only span close events; development gets the full
        // span lifecycle for easier debugging.
        let span_event = if is_production { FmtSpan::CLOSE } else { FmtSpan::FULL };
        let layer = tracing_subscriber::fmt::layer()
            .with_timer(LocalTime::rfc_3339())
            .with_target(true)
            .with_ansi(enable_color)
            .with_thread_names(true)
            .with_thread_ids(true)
            .with_file(true)
            .with_line_number(true)
            .json()
            .with_current_span(true)
            .with_span_list(true)
            .with_span_events(span_event)
            .with_filter(build_env_filter(logger_level, None))
        Some(layer)
    } else {
        None
    };
    // A fresh filter instance is built for the registry and for each layer.
    // NOTE(review): presumably the filter type is not Clone, hence the
    // repeated construction — confirm against `build_env_filter`.
    let filter = build_env_filter(logger_level, None);
    // Bridge `log`/`tracing` events into the OTLP log pipeline when present.
    let otel_bridge = logger_provider
        .as_ref()
        .map(|p| OpenTelemetryTracingBridge::new(p).with_filter(build_env_filter(logger_level, None)));
    let tracer_layer = tracer_provider
        .as_ref()
        .map(|p| OpenTelemetryLayer::new(p.tracer(service_name.to_string())));
    let metrics_layer = meter_provider.as_ref().map(|p| MetricsLayer::new(p.clone()));
    // Install the composed subscriber as the process-global default.
    // `Option` layers are no-ops when `None`.
    tracing_subscriber::registry()
        .with(filter)
        .with(ErrorLayer::default())
        .with(fmt_layer_opt)
        .with(tracer_layer)
        .with(otel_bridge)
        .with(metrics_layer)
        .init();
    // Record process start so the metrics pipeline has at least one data point.
    counter!("rustfs.start.total").increment(1);
    info!(
        "Init observability (HTTP): trace='{}', metric='{}', log='{}'",
        trace_ep, metric_ep, log_ep
    );
    // The guard owns all providers; dropping it flushes and shuts them down.
    Ok(OtelGuard {
        tracer_provider,
        meter_provider,
        logger_provider,
        tracing_guard: None,
        stdout_guard: None,
        cleanup_handle: None,
    })
}
// ─── Private builder helpers ──────────────────────────────────────────────────
/// Build an optional [`SdkTracerProvider`] for the given trace endpoint.
///
/// Returns `None` when the endpoint is empty or trace export is disabled.
fn build_tracer_provider(
    trace_ep: &str,
    config: &OtelConfig,
    res: opentelemetry_sdk::Resource,
    sampler: Sampler,
    use_stdout: bool,
) -> Result<Option<SdkTracerProvider>, TelemetryError> {
    // Skip entirely when no endpoint is configured or trace export is disabled.
    let traces_enabled = config.traces_export_enabled.unwrap_or(DEFAULT_OBS_TRACES_EXPORT_ENABLED);
    if trace_ep.is_empty() || !traces_enabled {
        return Ok(None);
    }
    // OTLP/HTTP span exporter: Protobuf body, gzip-compressed on the wire.
    let span_exporter = opentelemetry_otlp::SpanExporter::builder()
        .with_http()
        .with_endpoint(trace_ep)
        .with_protocol(Protocol::HttpBinary)
        .with_compression(Compression::Gzip)
        .build()
        .map_err(|e| TelemetryError::BuildSpanExporter(e.to_string()))?;
    let base_builder = SdkTracerProvider::builder()
        .with_sampler(sampler)
        .with_id_generator(RandomIdGenerator::default())
        .with_resource(res)
        .with_batch_exporter(span_exporter);
    // Optionally mirror spans to stdout for local debugging.
    let provider = if use_stdout {
        base_builder
            .with_batch_exporter(opentelemetry_stdout::SpanExporter::default())
            .build()
    } else {
        base_builder.build()
    };
    // Install globally so `opentelemetry::global` helpers resolve this
    // provider, and enable W3C trace-context propagation across services.
    global::set_tracer_provider(provider.clone());
    global::set_text_map_propagator(TraceContextPropagator::new());
    Ok(Some(provider))
}
/// Build an optional [`SdkMeterProvider`] for the given metrics endpoint.
///
/// Returns `None` when the endpoint is empty or metric export is disabled.
fn build_meter_provider(
    metric_ep: &str,
    config: &OtelConfig,
    res: opentelemetry_sdk::Resource,
    service_name: &str,
    use_stdout: bool,
) -> Result<Option<SdkMeterProvider>, TelemetryError> {
    // Skip entirely when no endpoint is configured or metric export is disabled.
    if metric_ep.is_empty() || !config.metrics_export_enabled.unwrap_or(DEFAULT_OBS_METRICS_EXPORT_ENABLED) {
        return Ok(None);
    }
    // OTLP/HTTP metric exporter: Protobuf body, gzip-compressed, SDK-default
    // temporality.
    let exporter = opentelemetry_otlp::MetricExporter::builder()
        .with_http()
        .with_endpoint(metric_ep)
        .with_temporality(opentelemetry_sdk::metrics::Temporality::default())
        .with_protocol(Protocol::HttpBinary)
        .with_compression(Compression::Gzip)
        .build()
        .map_err(|e| TelemetryError::BuildMetricExporter(e.to_string()))?;
    let meter_interval = config.meter_interval.unwrap_or(METER_INTERVAL);
    // The Recorder bridges the `metrics` facade crate to the OpenTelemetry SDK;
    // its builder is handed a closure that attaches our readers/resource to
    // the underlying MeterProviderBuilder.
    let (provider, recorder) = Recorder::builder(service_name.to_string())
        .with_meter_provider(|b: opentelemetry_sdk::metrics::MeterProviderBuilder| {
            let b = b.with_resource(res).with_reader(
                PeriodicReader::builder(exporter)
                    .with_interval(Duration::from_secs(meter_interval))
                    .build(),
            );
            // Optionally add a second, stdout-backed periodic reader for
            // local debugging.
            if use_stdout {
                b.with_reader(create_periodic_reader(meter_interval))
            } else {
                b
            }
        })
        .build();
    // NOTE(review): `provider.clone() as SdkMeterProvider` looks like a
    // non-primitive `as` cast; if `build()` already yields SdkMeterProvider
    // the cast is redundant — confirm against Recorder's builder signature.
    global::set_meter_provider(provider.clone() as SdkMeterProvider);
    // Install the recorder as the global sink for the `metrics` macros
    // (e.g. `counter!`); fails if a recorder was already installed.
    metrics::set_global_recorder(recorder).map_err(|e| TelemetryError::InstallMetricsRecorder(e.to_string()))?;
    // Best-effort flag flip; `.ok()` ignores the error if already set.
    OBSERVABILITY_METRIC_ENABLED.set(true).ok();
    Ok(Some(provider))
}
/// Build an optional [`SdkLoggerProvider`] for the given log endpoint.
///
/// Returns `None` when the endpoint is empty or log export is disabled.
fn build_logger_provider(
    log_ep: &str,
    config: &OtelConfig,
    res: opentelemetry_sdk::Resource,
    use_stdout: bool,
) -> Result<Option<SdkLoggerProvider>, TelemetryError> {
    // Skip entirely when no endpoint is configured or log export is disabled.
    let logs_enabled = config.logs_export_enabled.unwrap_or(DEFAULT_OBS_LOGS_EXPORT_ENABLED);
    if log_ep.is_empty() || !logs_enabled {
        return Ok(None);
    }
    // OTLP/HTTP log exporter: Protobuf body, gzip-compressed on the wire.
    let log_exporter = opentelemetry_otlp::LogExporter::builder()
        .with_http()
        .with_endpoint(log_ep)
        .with_protocol(Protocol::HttpBinary)
        .with_compression(Compression::Gzip)
        .build()
        .map_err(|e| TelemetryError::BuildLogExporter(e.to_string()))?;
    let base_builder = SdkLoggerProvider::builder()
        .with_resource(res)
        .with_batch_exporter(log_exporter);
    // Optionally mirror log records to stdout for local debugging.
    let provider = if use_stdout {
        base_builder
            .with_batch_exporter(opentelemetry_stdout::LogExporter::default())
            .build()
    } else {
        base_builder.build()
    };
    Ok(Some(provider))
}
/// Create a stdout periodic metrics reader for the given interval.
/// Create a stdout periodic metrics reader flushing at the given interval (seconds).
fn create_periodic_reader(interval: u64) -> PeriodicReader<opentelemetry_stdout::MetricExporter> {
    let stdout_exporter = opentelemetry_stdout::MetricExporter::default();
    let flush_every = Duration::from_secs(interval);
    PeriodicReader::builder(stdout_exporter).with_interval(flush_every).build()
}

View File

@@ -0,0 +1,64 @@
// Copyright 2024 RustFS Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! OpenTelemetry [`Resource`] construction for RustFS.
//!
//! A `Resource` describes the entity producing telemetry data. The resource
//! built here includes the service name, service version, deployment
//! environment, and the local machine IP address so that data can be
//! correlated across services in a distributed system.
use crate::config::OtelConfig;
use opentelemetry::KeyValue;
use opentelemetry_sdk::Resource;
use opentelemetry_semantic_conventions::{
SCHEMA_URL,
attribute::{DEPLOYMENT_ENVIRONMENT_NAME, NETWORK_LOCAL_ADDRESS, SERVICE_VERSION as OTEL_SERVICE_VERSION},
};
use rustfs_config::{APP_NAME, ENVIRONMENT, SERVICE_VERSION};
use rustfs_utils::get_local_ip_with_default;
use std::borrow::Cow;
/// Build an OpenTelemetry [`Resource`] populated from the provided config.
///
/// The resource carries the following attributes:
/// - `service.name` — from `config.service_name`, defaulting to [`APP_NAME`].
/// - `service.version` — from `config.service_version`, defaulting to
/// [`SERVICE_VERSION`].
/// - `deployment.environment` — from `config.environment`, defaulting to
/// [`ENVIRONMENT`].
/// - `network.local.address` — the primary local IP of the current host,
/// useful for identifying individual nodes in a cluster.
///
/// All attributes are attached to the resource using the semantic conventions
/// schema URL to ensure compatibility with standard OTLP backends.
pub(super) fn build_resource(config: &OtelConfig) -> Resource {
    // Resolve each attribute from config, falling back to compile-time
    // defaults. The original wrapped each `&str` in `Cow::Borrowed(..)` only
    // to immediately call `.to_string()` — a no-op detour around a plain
    // `&str -> String` conversion; `.to_owned()` is equivalent and clearer.
    let service_name = config.service_name.as_deref().unwrap_or(APP_NAME).to_owned();
    let service_version = config.service_version.as_deref().unwrap_or(SERVICE_VERSION).to_owned();
    let environment = config.environment.as_deref().unwrap_or(ENVIRONMENT).to_owned();
    Resource::builder()
        .with_service_name(service_name)
        .with_schema_url(
            [
                KeyValue::new(OTEL_SERVICE_VERSION, service_version),
                KeyValue::new(DEPLOYMENT_ENVIRONMENT_NAME, environment),
                // Attach the node's primary local IP so telemetry can be
                // attributed to individual hosts in a cluster.
                KeyValue::new(NETWORK_LOCAL_ADDRESS, get_local_ip_with_default()),
            ],
            SCHEMA_URL,
        )
        .build()
}

View File

@@ -89,7 +89,7 @@ static GLOBAL: profiling::allocator::TracingAllocator<mimalloc::MiMalloc> =
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
fn main() {
let runtime = server::get_tokio_runtime_builder()
let runtime = server::tokio_runtime_builder()
.build()
.expect("Failed to build Tokio runtime");
let result = runtime.block_on(async_main());
@@ -283,7 +283,7 @@ async fn run(config: config::Config) -> Result<()> {
// // Initialize global configuration system
let mut retry_count = 0;
while let Err(e) = ecconfig::init_global_config_sys(store.clone()).await {
error!("ecconfig::init_global_config_sys failed {:?}", e);
error!("ecstore config::init_global_config_sys failed {:?}", e);
// TODO: check error type
retry_count += 1;
if retry_count > 15 {

View File

@@ -31,7 +31,7 @@ pub(crate) use event::{init_event_notifier, shutdown_event_notifier};
pub(crate) use http::start_http_server;
pub(crate) use prefix::*;
pub(crate) use readiness::ReadinessGateLayer;
pub(crate) use runtime::get_tokio_runtime_builder;
pub(crate) use runtime::tokio_runtime_builder;
pub(crate) use service_state::SHUTDOWN_TIMEOUT;
pub(crate) use service_state::ServiceState;
pub(crate) use service_state::ServiceStateManager;

View File

@@ -80,11 +80,11 @@ fn compute_default_max_blocking_threads() -> usize {
/// Panics if environment variable values are invalid
/// # Examples
/// ```no_run
/// use rustfs_server::get_tokio_runtime_builder;
/// let builder = get_tokio_runtime_builder();
/// use rustfs_server::tokio_runtime_builder;
/// let builder = tokio_runtime_builder();
/// let runtime = builder.build().unwrap();
/// ```
pub(crate) fn get_tokio_runtime_builder() -> tokio::runtime::Builder {
pub(crate) fn tokio_runtime_builder() -> tokio::runtime::Builder {
let mut builder = tokio::runtime::Builder::new_multi_thread();
// Worker threads(Default physical cores)
@@ -136,7 +136,10 @@ pub(crate) fn get_tokio_runtime_builder() -> tokio::runtime::Builder {
});
}
if !rustfs_obs::is_production_environment() {
tracing::debug!(
println!(
"Starting Tokio runtime with configured parameters: worker_threads={}, max_blocking_threads={}, \
thread_stack_size={}, thread_keep_alive={}, global_queue_interval={}, event_interval={}, \
max_io_events_per_tick={}, thread_name={}",
worker_threads,
max_blocking_threads,
thread_stack_size,
@@ -144,8 +147,7 @@ pub(crate) fn get_tokio_runtime_builder() -> tokio::runtime::Builder {
global_queue_interval,
event_interval,
max_io_events_per_tick,
thread_name,
"Starting Tokio runtime with configured parameters"
thread_name
);
}
builder

View File

@@ -68,11 +68,10 @@ export RUSTFS_CONSOLE_ADDRESS=":9001"
#export RUSTFS_OBS_SERVICE_VERSION=0.1.0 # Service version
export RUSTFS_OBS_ENVIRONMENT=develop # Environment name
export RUSTFS_OBS_LOGGER_LEVEL=info # Log level, supports trace, debug, info, warn, error
export RUSTFS_OBS_LOG_STDOUT_ENABLED=false # Whether to enable local stdout logging
export RUSTFS_OBS_LOG_STDOUT_ENABLED=true # Whether to enable local stdout logging
export RUSTFS_OBS_LOG_DIRECTORY="$current_dir/deploy/logs" # Log directory
export RUSTFS_OBS_LOG_ROTATION_TIME="hour" # Log rotation time unit, can be "second", "minute", "hour", "day"
export RUSTFS_OBS_LOG_ROTATION_SIZE_MB=100 # Log rotation size in MB
export RUSTFS_OBS_LOG_POOL_CAPA=10240 # Log pool capacity
export RUSTFS_OBS_LOG_ROTATION_TIME="minutely" # Log rotation time unit, can be "minutely", "hourly", "daily"
export RUSTFS_OBS_LOG_KEEP_FILES=30 # Number of log files to keep
export RUSTFS_OBS_LOG_MESSAGE_CAPA=32768 # Log message capacity
export RUSTFS_OBS_LOG_FLUSH_MS=300 # Log flush interval in milliseconds