diff --git a/seaweed-volume/src/metrics.rs b/seaweed-volume/src/metrics.rs index 5d65ea442..251b7e6f2 100644 --- a/seaweed-volume/src/metrics.rs +++ b/seaweed-volume/src/metrics.rs @@ -142,6 +142,35 @@ lazy_static::lazy_static! { &["code"], ).expect("metric can be created"); + // ---- Scrubbing metrics (Go: VolumeServerScrub*) ---- + + /// Last scrub execution time, as seconds since UNIX epoch, with label `mode`. + pub static ref SCRUB_LAST_TIME_SECONDS: GaugeVec = GaugeVec::new( + Opts::new( + "SeaweedFS_volumeServer_scrub_last_time_seconds", + "Last scrub execution time, as seconds since UNIX epoch.", + ), + &["mode"], + ).expect("metric can be created"); + + /// Counter of overall volumes with issues detected during scrubbing, with label `mode`. + pub static ref SCRUB_VOLUME_FAILURES: IntCounterVec = IntCounterVec::new( + Opts::new( + "SeaweedFS_volumeServer_scrub_volume_failures", + "Counter of overall volumes with issues detected during scrubbing.", + ), + &["mode"], + ).expect("metric can be created"); + + /// Counter of overall EC shards with issues detected during scrubbing, with label `mode`. + pub static ref SCRUB_SHARD_FAILURES: IntCounterVec = IntCounterVec::new( + Opts::new( + "SeaweedFS_volumeServer_scrub_shard_failures", + "Counter of overall EC shards with issues detected during scrubbing.", + ), + &["mode"], + ).expect("metric can be created"); + // ---- Legacy aliases for backward compat with existing code ---- /// Total number of volumes on this server (flat gauge). @@ -251,6 +280,9 @@ pub fn register_metrics() { Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()), Box::new(INFLIGHT_UPLOAD_SIZE.clone()), Box::new(UPLOAD_ERROR_COUNTER.clone()), + Box::new(SCRUB_LAST_TIME_SECONDS.clone()), + Box::new(SCRUB_VOLUME_FAILURES.clone()), + Box::new(SCRUB_SHARD_FAILURES.clone()), // Legacy metrics Box::new(VOLUMES_TOTAL.clone()), Box::new(DISK_SIZE_BYTES.clone()), diff --git a/seaweed-volume/src/server/grpc_server.rs b/seaweed-volume/src/server/grpc_server.rs index 03deb012d..838269d24 100644 --- a/seaweed-volume/src/server/grpc_server.rs +++ b/seaweed-volume/src/server/grpc_server.rs @@ -29,6 +29,42 @@ fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool { has_remote_file && !std::path::Path::new(dat_path).exists() } +/// Map a numeric `VolumeScrubMode` to its proto enum name, matching Go's +/// `req.GetMode().String()` used for the Prometheus `mode` label. +fn scrub_mode_label(mode: i32) -> &'static str { + match mode { + 0 => "UNKNOWN", + 1 => "INDEX", + 2 => "FULL", + 3 => "LOCAL", + _ => "UNKNOWN", + } +} + +fn unix_now_seconds() -> f64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs() as f64) + .unwrap_or(0.0) +} + +/// Record scrub metrics. `broken_shards` is `Some` only for EC scrubs so the +/// shard-failures family stays untouched on regular volume scrubs (matching Go). +fn emit_scrub_metrics(mode: i32, broken_volumes: usize, broken_shards: Option) { + let mode_label = scrub_mode_label(mode); + crate::metrics::SCRUB_LAST_TIME_SECONDS + .with_label_values(&[mode_label]) + .set(unix_now_seconds()); + crate::metrics::SCRUB_VOLUME_FAILURES + .with_label_values(&[mode_label]) + .inc_by(broken_volumes as u64); + if let Some(n) = broken_shards { + crate::metrics::SCRUB_SHARD_FAILURES + .with_label_values(&[mode_label]) + .inc_by(n as u64); + } +} + /// Persist VolumeServerState to a state.pb file (matches Go's State.save). fn save_state_file( path: &str, @@ -3419,8 +3455,8 @@ impl VolumeServer for VolumeGrpcService { // Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume. // Collect errors via errors.Join semantics (return joined error if any fail). + let mut errs: Vec = Vec::new(); if req.mark_broken_volumes_readonly { - let mut errs: Vec = Vec::new(); for vid in &broken_vids { match self.make_volume_readonly(*vid, true).await { Ok(()) => { @@ -3432,9 +3468,14 @@ impl VolumeServer for VolumeGrpcService { } } } - if !errs.is_empty() { - return Err(Status::internal(errs.join("\n"))); - } + } + + // Record metrics before the post-scrub error check so scrub failures are + // persisted even when a follow-up admin action (mark-readonly) fails. + emit_scrub_metrics(mode, broken_vids.len(), None); + + if !errs.is_empty() { + return Err(Status::internal(errs.join("\n"))); } Ok(Response::new(volume_server_pb::ScrubVolumeResponse { @@ -3560,6 +3601,12 @@ impl VolumeServer for VolumeGrpcService { } } + emit_scrub_metrics( + mode, + broken_volume_ids.len(), + Some(broken_shard_infos.len()), + ); + Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse { total_volumes, total_files,