rust(volume): export Prometheus metrics for scrubbing operations (#9266)

* rust(volume): export Prometheus metrics for scrubbing operations

Mirrors #9264 in the Rust volume server. Adds three metrics that match
the Go names so the same dashboards/alerts work against either binary:

  - SeaweedFS_volumeServer_scrub_last_time_seconds (gauge)
  - SeaweedFS_volumeServer_scrub_volume_failures   (counter)
  - SeaweedFS_volumeServer_scrub_shard_failures    (counter)

Metrics are aggregated at the volume / EC shard level, labelled by
VolumeScrubMode (UNKNOWN/INDEX/FULL/LOCAL) to match Go's
req.GetMode().String().

* rust(volume): record scrub metrics before post-scrub error check

Address PR feedback:
  - Move metric emission before the mark_broken_volumes_readonly error
    check so scrub failures are persisted even when the follow-up
    mark-readonly admin action fails (matches Go's volume_grpc_scrub.go).
  - Extract the duplicated metric block into emit_scrub_metrics() shared
    by both ScrubVolume and ScrubEcVolume. The shard-failures family
    stays untouched on regular volume scrubs to mirror Go.
This commit is contained in:
Chris Lu
2026-04-28 13:29:32 -07:00
committed by GitHub
parent 3f3aaa7cc8
commit 08d59750ef
2 changed files with 83 additions and 4 deletions
+32
View File
@@ -142,6 +142,35 @@ lazy_static::lazy_static! {
&["code"],
).expect("metric can be created");
// ---- Scrubbing metrics (Go: VolumeServerScrub*) ----
/// Gauge holding the last scrub execution time, as seconds since the UNIX epoch.
///
/// Exposed as `SeaweedFS_volumeServer_scrub_last_time_seconds` with a single
/// `mode` label. Construction goes through `expect`, so a failure to build the
/// metric panics instead of being reported as an error.
pub static ref SCRUB_LAST_TIME_SECONDS: GaugeVec = GaugeVec::new(
Opts::new(
"SeaweedFS_volumeServer_scrub_last_time_seconds",
"Last scrub execution time, as seconds since UNIX epoch.",
),
&["mode"],
).expect("metric can be created");
/// Counter of volumes found to have issues during scrubbing.
///
/// Exposed as `SeaweedFS_volumeServer_scrub_volume_failures` with a single
/// `mode` label. Construction goes through `expect`, so a failure to build the
/// metric panics instead of being reported as an error.
pub static ref SCRUB_VOLUME_FAILURES: IntCounterVec = IntCounterVec::new(
Opts::new(
"SeaweedFS_volumeServer_scrub_volume_failures",
"Counter of overall volumes with issues detected during scrubbing.",
),
&["mode"],
).expect("metric can be created");
/// Counter of EC shards found to have issues during scrubbing.
///
/// Exposed as `SeaweedFS_volumeServer_scrub_shard_failures` with a single
/// `mode` label. Construction goes through `expect`, so a failure to build the
/// metric panics instead of being reported as an error.
pub static ref SCRUB_SHARD_FAILURES: IntCounterVec = IntCounterVec::new(
Opts::new(
"SeaweedFS_volumeServer_scrub_shard_failures",
"Counter of overall EC shards with issues detected during scrubbing.",
),
&["mode"],
).expect("metric can be created");
// ---- Legacy aliases for backward compat with existing code ----
/// Total number of volumes on this server (flat gauge).
@@ -251,6 +280,9 @@ pub fn register_metrics() {
Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()),
Box::new(INFLIGHT_UPLOAD_SIZE.clone()),
Box::new(UPLOAD_ERROR_COUNTER.clone()),
Box::new(SCRUB_LAST_TIME_SECONDS.clone()),
Box::new(SCRUB_VOLUME_FAILURES.clone()),
Box::new(SCRUB_SHARD_FAILURES.clone()),
// Legacy metrics
Box::new(VOLUMES_TOTAL.clone()),
Box::new(DISK_SIZE_BYTES.clone()),
+51 -4
View File
@@ -29,6 +29,42 @@ fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool {
has_remote_file && !std::path::Path::new(dat_path).exists()
}
/// Translate a raw `VolumeScrubMode` number into its proto enum name, the
/// string used for the Prometheus `mode` label (Go uses
/// `req.GetMode().String()` for the same purpose).
///
/// Any value outside `0..=3`, including negative numbers, yields `"UNKNOWN"`.
fn scrub_mode_label(mode: i32) -> &'static str {
    const FALLBACK: &str = "UNKNOWN";
    // Position in the table equals the enum's numeric value.
    const MODE_NAMES: [&str; 4] = [FALLBACK, "INDEX", "FULL", "LOCAL"];

    if mode < 0 {
        return FALLBACK;
    }
    // `mode` is non-negative here, so the cast cannot wrap to a valid index;
    // anything past the end of the table falls back to the default.
    MODE_NAMES.get(mode as usize).copied().unwrap_or(FALLBACK)
}
/// Current wall-clock time as whole seconds since the UNIX epoch.
///
/// Sub-second precision is dropped (`as_secs`). If the system clock reads
/// earlier than the epoch, `0.0` is returned instead of an error.
fn unix_now_seconds() -> f64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as f64,
        // Clock is set before 1970-01-01; report zero rather than failing.
        Err(_) => 0.0,
    }
}
/// Publish the Prometheus metrics for one finished scrub request.
///
/// * `mode` - raw `VolumeScrubMode` value; converted to the `mode` label via
///   `scrub_mode_label`.
/// * `broken_volumes` - amount added to the volume-failures counter.
/// * `broken_shards` - amount added to the shard-failures counter. Pass `None`
///   for regular volume scrubs so that counter family is left untouched
///   (matching Go); EC scrubs pass `Some`.
///
/// The last-scrub-time gauge for the mode is always set to the current time.
fn emit_scrub_metrics(mode: i32, broken_volumes: usize, broken_shards: Option<usize>) {
    let labels = [scrub_mode_label(mode)];

    let last_scrub_gauge = crate::metrics::SCRUB_LAST_TIME_SECONDS.with_label_values(&labels);
    last_scrub_gauge.set(unix_now_seconds());

    let volume_failures = crate::metrics::SCRUB_VOLUME_FAILURES.with_label_values(&labels);
    volume_failures.inc_by(broken_volumes as u64);

    // Only EC scrubs report shard-level failures.
    if let Some(shard_count) = broken_shards {
        let shard_failures = crate::metrics::SCRUB_SHARD_FAILURES.with_label_values(&labels);
        shard_failures.inc_by(shard_count as u64);
    }
}
/// Persist VolumeServerState to a state.pb file (matches Go's State.save).
fn save_state_file(
path: &str,
@@ -3419,8 +3455,8 @@ impl VolumeServer for VolumeGrpcService {
// Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume.
// Collect errors via errors.Join semantics (return joined error if any fail).
let mut errs: Vec<String> = Vec::new();
if req.mark_broken_volumes_readonly {
let mut errs: Vec<String> = Vec::new();
for vid in &broken_vids {
match self.make_volume_readonly(*vid, true).await {
Ok(()) => {
@@ -3432,9 +3468,14 @@ impl VolumeServer for VolumeGrpcService {
}
}
}
if !errs.is_empty() {
return Err(Status::internal(errs.join("\n")));
}
}
// Record metrics before the post-scrub error check so scrub failures are
// persisted even when a follow-up admin action (mark-readonly) fails.
emit_scrub_metrics(mode, broken_vids.len(), None);
if !errs.is_empty() {
return Err(Status::internal(errs.join("\n")));
}
Ok(Response::new(volume_server_pb::ScrubVolumeResponse {
@@ -3560,6 +3601,12 @@ impl VolumeServer for VolumeGrpcService {
}
}
emit_scrub_metrics(
mode,
broken_volume_ids.len(),
Some(broken_shard_infos.len()),
);
Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse {
total_volumes,
total_files,