rust(volume): export Prometheus metrics for scrubbing operations (#9266)

* rust(volume): export Prometheus metrics for scrubbing operations

  Mirrors #9264 in the Rust volume server. Adds three metrics that match the Go names so the same dashboards/alerts work against either binary:

  - SeaweedFS_volumeServer_scrub_last_time_seconds (gauge)
  - SeaweedFS_volumeServer_scrub_volume_failures (counter)
  - SeaweedFS_volumeServer_scrub_shard_failures (counter)

  Metrics are aggregated at the volume / EC shard level, labelled by VolumeScrubMode (UNKNOWN/INDEX/FULL/LOCAL) to match Go's req.GetMode().String().

* rust(volume): record scrub metrics before post-scrub error check

  Address PR feedback:

  - Move metric emission before the mark_broken_volumes_readonly error check so scrub failures are persisted even when the follow-up mark-readonly admin action fails (matches Go's volume_grpc_scrub.go).
  - Extract the duplicated metric block into emit_scrub_metrics() shared by both ScrubVolume and ScrubEcVolume. The shard-failures family stays untouched on regular volume scrubs to mirror Go.
2026-06-13 23:36:45 +03:00 · 2026-04-28 13:29:32 -07:00
parent 3f3aaa7cc8
commit 08d59750ef
2 changed files with 83 additions and 4 deletions
@@ -142,6 +142,35 @@ lazy_static::lazy_static! {
        &["code"],
    ).expect("metric can be created");

+    // ---- Scrubbing metrics (Go: VolumeServerScrub*) ----
+
+    /// Gauge: time of the most recent scrub, as seconds since UNIX epoch, one series per `mode` label.
+    pub static ref SCRUB_LAST_TIME_SECONDS: GaugeVec = GaugeVec::new(
+        Opts::new(
+            "SeaweedFS_volumeServer_scrub_last_time_seconds", // name chosen to match the Go volume server
+            "Last scrub execution time, as seconds since UNIX epoch.",
+        ),
+        &["mode"], // scrub mode name: UNKNOWN / INDEX / FULL / LOCAL
+    ).expect("metric can be created");
+
+    /// Counter of volumes found to have issues during scrubbing, one series per `mode` label.
+    pub static ref SCRUB_VOLUME_FAILURES: IntCounterVec = IntCounterVec::new(
+        Opts::new(
+            "SeaweedFS_volumeServer_scrub_volume_failures", // name chosen to match the Go volume server
+            "Counter of overall volumes with issues detected during scrubbing.",
+        ),
+        &["mode"], // scrub mode name: UNKNOWN / INDEX / FULL / LOCAL
+    ).expect("metric can be created");
+
+    /// Counter of EC shards found to have issues during scrubbing, one series per `mode` label.
+    pub static ref SCRUB_SHARD_FAILURES: IntCounterVec = IntCounterVec::new(
+        Opts::new(
+            "SeaweedFS_volumeServer_scrub_shard_failures", // name chosen to match the Go volume server
+            "Counter of overall EC shards with issues detected during scrubbing.",
+        ),
+        &["mode"], // scrub mode name: UNKNOWN / INDEX / FULL / LOCAL
+    ).expect("metric can be created");
+
    // ---- Legacy aliases for backward compat with existing code ----

    /// Total number of volumes on this server (flat gauge).
@@ -251,6 +280,9 @@ pub fn register_metrics() {
            Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()),
            Box::new(INFLIGHT_UPLOAD_SIZE.clone()),
            Box::new(UPLOAD_ERROR_COUNTER.clone()),
+            Box::new(SCRUB_LAST_TIME_SECONDS.clone()),
+            Box::new(SCRUB_VOLUME_FAILURES.clone()),
+            Box::new(SCRUB_SHARD_FAILURES.clone()),
            // Legacy metrics
            Box::new(VOLUMES_TOTAL.clone()),
            Box::new(DISK_SIZE_BYTES.clone()),
@@ -29,6 +29,42 @@ fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool {
    has_remote_file && !std::path::Path::new(dat_path).exists()
 }

+/// Map a numeric `VolumeScrubMode` to its proto enum name, matching Go's `req.GetMode().String()`
+/// used for the Prometheus `mode` label. Any value outside 0..=3 is reported as "UNKNOWN".
+fn scrub_mode_label(mode: i32) -> &'static str {
+    match mode {
+        0 => "UNKNOWN",
+        1 => "INDEX",
+        2 => "FULL",
+        3 => "LOCAL",
+        _ => "UNKNOWN", // unrecognised numbers share mode 0's label; NOTE(review): confirm Go does the same
+    }
+}
+
+fn unix_now_seconds() -> f64 { // current system time as seconds since the UNIX epoch
+    std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map(|d| d.as_secs() as f64) // whole seconds only; the sub-second part is discarded
+        .unwrap_or(0.0) // duration_since failed (clock reads before the epoch): report 0.0
+}
+
+/// Record one scrub run: stamp the `mode` gauge with the current time and add the failure counts.
+/// Pass `broken_shards` as `Some` only for EC scrubs; `None` leaves the shard counter untouched (as Go).
+fn emit_scrub_metrics(mode: i32, broken_volumes: usize, broken_shards: Option<usize>) {
+    let mode_label = scrub_mode_label(mode); // same label value is used for all three metrics
+    crate::metrics::SCRUB_LAST_TIME_SECONDS
+        .with_label_values(&[mode_label])
+        .set(unix_now_seconds()); // overwritten on every call, whether or not anything was broken
+    crate::metrics::SCRUB_VOLUME_FAILURES
+        .with_label_values(&[mode_label])
+        .inc_by(broken_volumes as u64); // adds 0 when no volume was broken
+    if let Some(n) = broken_shards { // None: the shard counter is not referenced at all
+        crate::metrics::SCRUB_SHARD_FAILURES
+            .with_label_values(&[mode_label])
+            .inc_by(n as u64);
+    }
+}
+
 /// Persist VolumeServerState to a state.pb file (matches Go's State.save).
 fn save_state_file(
    path: &str,
@@ -3419,8 +3455,8 @@ impl VolumeServer for VolumeGrpcService {

        // Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume.
        // Collect errors via errors.Join semantics (return joined error if any fail).
+        let mut errs: Vec<String> = Vec::new();
        if req.mark_broken_volumes_readonly {
-            let mut errs: Vec<String> = Vec::new();
            for vid in &broken_vids {
                match self.make_volume_readonly(*vid, true).await {
                    Ok(()) => {
@@ -3432,9 +3468,14 @@ impl VolumeServer for VolumeGrpcService {
                    }
                }
            }
-            if !errs.is_empty() {
-                return Err(Status::internal(errs.join("\n")));
-            }
+        }
+
+        // Record metrics before the post-scrub error check so scrub failures are
+        // persisted even when a follow-up admin action (mark-readonly) fails.
+        emit_scrub_metrics(mode, broken_vids.len(), None);
+
+        if !errs.is_empty() {
+            return Err(Status::internal(errs.join("\n")));
        }

        Ok(Response::new(volume_server_pb::ScrubVolumeResponse {
@@ -3560,6 +3601,12 @@ impl VolumeServer for VolumeGrpcService {
            }
        }

+        emit_scrub_metrics(
+            mode,
+            broken_volume_ids.len(),
+            Some(broken_shard_infos.len()),
+        );
+
        Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse {
            total_volumes,
            total_files,