mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
rust(volume): export Prometheus metrics for scrubbing operations (#9266)
* rust(volume): export Prometheus metrics for scrubbing operations

  Mirrors #9264 in the Rust volume server. Adds three metrics that match the Go names so the same dashboards/alerts work against either binary:
  - SeaweedFS_volumeServer_scrub_last_time_seconds (gauge)
  - SeaweedFS_volumeServer_scrub_volume_failures (counter)
  - SeaweedFS_volumeServer_scrub_shard_failures (counter)

  Metrics are aggregated at the volume / EC shard level, labelled by VolumeScrubMode (UNKNOWN/INDEX/FULL/LOCAL) to match Go's req.GetMode().String().

* rust(volume): record scrub metrics before post-scrub error check

  Address PR feedback:
  - Move metric emission before the mark_broken_volumes_readonly error check so scrub failures are persisted even when the follow-up mark-readonly admin action fails (matches Go's volume_grpc_scrub.go).
  - Extract the duplicated metric block into emit_scrub_metrics() shared by both ScrubVolume and ScrubEcVolume. The shard-failures family stays untouched on regular volume scrubs to mirror Go.
This commit is contained in:
@@ -142,6 +142,35 @@ lazy_static::lazy_static! {
|
||||
&["code"],
|
||||
).expect("metric can be created");
|
||||
|
||||
// ---- Scrubbing metrics (Go: VolumeServerScrub*) ----
|
||||
|
||||
/// Last scrub execution time, as seconds since UNIX epoch, with label `mode`.
|
||||
pub static ref SCRUB_LAST_TIME_SECONDS: GaugeVec = GaugeVec::new(
|
||||
Opts::new(
|
||||
"SeaweedFS_volumeServer_scrub_last_time_seconds",
|
||||
"Last scrub execution time, as seconds since UNIX epoch.",
|
||||
),
|
||||
&["mode"],
|
||||
).expect("metric can be created");
|
||||
|
||||
/// Counter of overall volumes with issues detected during scrubbing, with label `mode`.
|
||||
pub static ref SCRUB_VOLUME_FAILURES: IntCounterVec = IntCounterVec::new(
|
||||
Opts::new(
|
||||
"SeaweedFS_volumeServer_scrub_volume_failures",
|
||||
"Counter of overall volumes with issues detected during scrubbing.",
|
||||
),
|
||||
&["mode"],
|
||||
).expect("metric can be created");
|
||||
|
||||
/// Counter of overall EC shards with issues detected during scrubbing, with label `mode`.
|
||||
pub static ref SCRUB_SHARD_FAILURES: IntCounterVec = IntCounterVec::new(
|
||||
Opts::new(
|
||||
"SeaweedFS_volumeServer_scrub_shard_failures",
|
||||
"Counter of overall EC shards with issues detected during scrubbing.",
|
||||
),
|
||||
&["mode"],
|
||||
).expect("metric can be created");
|
||||
|
||||
// ---- Legacy aliases for backward compat with existing code ----
|
||||
|
||||
/// Total number of volumes on this server (flat gauge).
|
||||
@@ -251,6 +280,9 @@ pub fn register_metrics() {
|
||||
Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()),
|
||||
Box::new(INFLIGHT_UPLOAD_SIZE.clone()),
|
||||
Box::new(UPLOAD_ERROR_COUNTER.clone()),
|
||||
Box::new(SCRUB_LAST_TIME_SECONDS.clone()),
|
||||
Box::new(SCRUB_VOLUME_FAILURES.clone()),
|
||||
Box::new(SCRUB_SHARD_FAILURES.clone()),
|
||||
// Legacy metrics
|
||||
Box::new(VOLUMES_TOTAL.clone()),
|
||||
Box::new(DISK_SIZE_BYTES.clone()),
|
||||
|
||||
@@ -29,6 +29,42 @@ fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool {
|
||||
has_remote_file && !std::path::Path::new(dat_path).exists()
|
||||
}
|
||||
|
||||
/// Translate a raw `VolumeScrubMode` discriminant into the proto enum name
/// used as the Prometheus `mode` label.
///
/// The names are meant to match what Go emits via `req.GetMode().String()`,
/// so both server implementations produce the same label values.
///
/// Returns `"INDEX"` for `1`, `"FULL"` for `2`, `"LOCAL"` for `3`, and
/// `"UNKNOWN"` for `0` as well as for any unrecognised value (negatives
/// included).
fn scrub_mode_label(mode: i32) -> &'static str {
    match mode {
        1 => "INDEX",
        2 => "FULL",
        3 => "LOCAL",
        // 0 is the enum's own UNKNOWN value; out-of-range inputs share that label.
        _ => "UNKNOWN",
    }
}
|
||||
|
||||
/// Current wall-clock time as whole seconds since the UNIX epoch.
///
/// The value is truncated to whole seconds before being converted to `f64`,
/// so the result never has a fractional part. If the system clock reports a
/// time earlier than the epoch, `0.0` is returned instead of an error.
fn unix_now_seconds() -> f64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        // Best-effort: a pre-epoch clock yields 0.0 rather than failing the caller.
        .map_or(0.0, |elapsed| elapsed.as_secs() as f64)
}
|
||||
|
||||
/// Record scrub metrics. `broken_shards` is `Some` only for EC scrubs so the
|
||||
/// shard-failures family stays untouched on regular volume scrubs (matching Go).
|
||||
fn emit_scrub_metrics(mode: i32, broken_volumes: usize, broken_shards: Option<usize>) {
|
||||
let mode_label = scrub_mode_label(mode);
|
||||
crate::metrics::SCRUB_LAST_TIME_SECONDS
|
||||
.with_label_values(&[mode_label])
|
||||
.set(unix_now_seconds());
|
||||
crate::metrics::SCRUB_VOLUME_FAILURES
|
||||
.with_label_values(&[mode_label])
|
||||
.inc_by(broken_volumes as u64);
|
||||
if let Some(n) = broken_shards {
|
||||
crate::metrics::SCRUB_SHARD_FAILURES
|
||||
.with_label_values(&[mode_label])
|
||||
.inc_by(n as u64);
|
||||
}
|
||||
}
|
||||
|
||||
/// Persist VolumeServerState to a state.pb file (matches Go's State.save).
|
||||
fn save_state_file(
|
||||
path: &str,
|
||||
@@ -3419,8 +3455,8 @@ impl VolumeServer for VolumeGrpcService {
|
||||
|
||||
// Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume.
|
||||
// Collect errors via errors.Join semantics (return joined error if any fail).
|
||||
let mut errs: Vec<String> = Vec::new();
|
||||
if req.mark_broken_volumes_readonly {
|
||||
let mut errs: Vec<String> = Vec::new();
|
||||
for vid in &broken_vids {
|
||||
match self.make_volume_readonly(*vid, true).await {
|
||||
Ok(()) => {
|
||||
@@ -3432,9 +3468,14 @@ impl VolumeServer for VolumeGrpcService {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !errs.is_empty() {
|
||||
return Err(Status::internal(errs.join("\n")));
|
||||
}
|
||||
}
|
||||
|
||||
// Record metrics before the post-scrub error check so scrub failures are
|
||||
// persisted even when a follow-up admin action (mark-readonly) fails.
|
||||
emit_scrub_metrics(mode, broken_vids.len(), None);
|
||||
|
||||
if !errs.is_empty() {
|
||||
return Err(Status::internal(errs.join("\n")));
|
||||
}
|
||||
|
||||
Ok(Response::new(volume_server_pb::ScrubVolumeResponse {
|
||||
@@ -3560,6 +3601,12 @@ impl VolumeServer for VolumeGrpcService {
|
||||
}
|
||||
}
|
||||
|
||||
emit_scrub_metrics(
|
||||
mode,
|
||||
broken_volume_ids.len(),
|
||||
Some(broken_shard_infos.len()),
|
||||
);
|
||||
|
||||
Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse {
|
||||
total_volumes,
|
||||
total_files,
|
||||
|
||||
Reference in New Issue
Block a user