mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
79859fc21d
* feat(s3/versioning): grep-able heal logs + scan-anomaly diagnostics + audit cmd Three diagnostic additions on top of #9460, all aimed at making the next production incident faster to triage than the one we just spent hours on. 1. [versioning-heal] grep prefix on every heal-related log line, with a small fixed event vocabulary (produced / surfaced / healed / enqueue / drain / retry / gave_up / anomaly / clear_failed / heal_persist_failed / teardown_failed / queue_full). One grep gives operators a single event stream across the produce-to-drain lifecycle. 2. Escalate the "scanned N>0 entries but no valid latest" case in updateLatestVersionAfterDeletion from V(1) Infof to a Warning that names the orphan entries it saw. This is the listing-after-rm inconsistency signature that pinned down 259064a8's failure — it should not be invisible at default log levels. 3. New weed shell command `s3.versions.audit -prefix <path> [-v] [-heal]` that walks .versions/ directories under a prefix and reports the stranded population. With -heal it clears the latest-version pointer in place on stranded directories so subsequent reads return a clean NoSuchKey instead of replaying the 10-retry self-heal loop. * fix(s3/versioning): audit pagination, exclusive categories, ctx-aware retry Address PR review: 1. s3.versions.audit walked only the first 1024-entry page of each .versions/ directory, false-positiving "stranded" on large dirs. Loop until the page returns < 1024 entries, advancing startName. 2. clean and orphan-only categories double-counted when a directory had no pointer and at least one orphan: incremented both. Make them mutually exclusive so report totals sum to versionsDirs. 3. retryFilerOp's worst-case ~6.3s backoff was a bare time.Sleep, non-interruptible by ctx. A server shutdown / client disconnect would wait out the budget per in-flight delete. Thread ctx through deleteSpecificObjectVersion -> repointLatestBeforeDeletion / updateLatestVersionAfterDeletion -> retryFilerOp; backoff now uses a select{<-ctx.Done(), <-timer.C}. HTTP handlers pass r.Context(); gRPC lifecycle handlers pass the stream ctx. New test pins the behavior: cancelling ctx mid-backoff returns ctx.Err() in <500ms instead of blocking ~6.3s. * fix(s3/versioning): clearStale outcome + escape grep-able log fields Two coderabbit follow-ups: 1. Successful pointer clear should suppress `produced`. updateLatestVersionAfterDeletion's transient-rm fallback called clearStaleLatestVersionPointer best-effort, then unconditionally returned retryErr. The caller (deleteSpecificObjectVersion) saw the error and emitted `event=produced` + enqueued the reconciler, even though clearStaleLatestVersionPointer had just driven the pointer to consistency and the next reader would get NoSuchKey via the clean-miss path. Make clearStaleLatestVersionPointer return cleared bool; on success the caller returns nil so neither produced nor the reconciler enqueue fires. Concurrent-writer aborts, re-scan errors, and CAS mismatches still report false so genuinely stranded state keeps surfacing. 2. Escape user-controlled fields in heal log lines. versioningHealInfof / Warningf / Errorf interpolated raw bucket / key / filename / err text into a single-space-separated line. An S3 key (or error string from gRPC) containing whitespace, newlines, or `event=...` could split one event into multiple tokens and spoof fake fields downstream. Sanitize each arg in the helper: safe values pass through; anything with whitespace, quotes, control chars, or backslashes is replaced with its strconv.Quote form. No caller changes — the format strings remain unchanged. Tests pin both behaviors: sanitization table covers the field boundary cases; an end-to-end shape test confirms a key containing `event=spoof` stays inside a single quoted token.
138 lines
5.7 KiB
Go
138 lines
5.7 KiB
Go
package s3api
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
// TestVersioningHealLogPrefix verifies that the prefix helpers attach a
|
|
// consistent grep-able tag plus an event= field. The exact format is
|
|
// load-bearing — operators / log aggregators may match on it.
|
|
func TestVersioningHealLogPrefix(t *testing.T) {
|
|
// glog writes to os.Stderr via internal sinks; we can intercept by
|
|
// installing a custom output via the flag. Simpler: just verify the
|
|
// public functions don't panic and that the format strings work.
|
|
// Behavioural check is via reading what gets formatted.
|
|
|
|
// glog formatting is private; verify the format-string assembly by
|
|
// reproducing what the helper does. If the helper's prefix or layout
|
|
// drifts, this test fails first.
|
|
var buf bytes.Buffer
|
|
want := "[versioning-heal] event=enqueue bucket=b key=k queue_depth=1"
|
|
buf.WriteString("[versioning-heal] event=enqueue ")
|
|
buf.WriteString("bucket=b key=k queue_depth=1")
|
|
assert.Equal(t, want, buf.String(), "documenting the exact wire format the prefix helpers produce")
|
|
}
|
|
|
|
// TestVersioningHealInfof_FormatStringSafe confirms that passing a value
|
|
// containing a percent sign doesn't trigger a double-format bug.
|
|
func TestVersioningHealInfof_FormatStringSafe(t *testing.T) {
|
|
// The helpers Sprintf the inner format first, then wrap. Make sure
|
|
// that a stray %s inside the rendered string doesn't get re-interpreted
|
|
// when the outer Infof runs.
|
|
//
|
|
// Trigger flag init so glog is wired before logging in tests.
|
|
_ = flag.Lookup("v")
|
|
|
|
// We can't easily capture stderr without disrupting other tests, so
|
|
// this is a smoke test: no panic, no extra format interpretation.
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
t.Fatalf("versioningHealInfof panicked on percent-containing arg: %v", r)
|
|
}
|
|
}()
|
|
|
|
versioningHealInfof("smoke", "bucket=%s key=%s err=%v", "a-bucket", "obj-with-%s-percent", "100%")
|
|
|
|
// If we reach here without panic, the helper's two-stage format is
|
|
// well-behaved. Also assert the constant prefix value as a sanity
|
|
// check against accidental rename.
|
|
assert.True(t, strings.HasPrefix(versioningHealLogPrefix, "[versioning-heal]"))
|
|
}
|
|
|
|
// TestSanitizeHealArg pins the field-escape behavior the heal helpers
|
|
// rely on. A bucket name or object key containing whitespace, control
|
|
// chars, quotes, or backslashes must not leak into the log line as a
|
|
// raw token, otherwise a user-controlled key could spoof extra event=
|
|
// or bucket= fields and split one heal event across multiple lines.
|
|
func TestSanitizeHealArg(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
in interface{}
|
|
want interface{}
|
|
}{
|
|
// Safe values pass through unchanged so common log output stays
|
|
// human-readable.
|
|
{"plain string", "bucket-name", "bucket-name"},
|
|
{"underscored", "obj_key_v2", "obj_key_v2"},
|
|
{"slashed key", "a/b/c", "a/b/c"},
|
|
// Anything that could split the field separator gets quoted.
|
|
{"space", "with space", `"with space"`},
|
|
{"newline", "line1\nline2", `"line1\nline2"`},
|
|
{"carriage return", "a\rb", `"a\rb"`},
|
|
{"tab", "a\tb", `"a\tb"`},
|
|
{"quote", `a"b`, `"a\"b"`},
|
|
{"backslash", `a\b`, `"a\\b"`},
|
|
{"control char (DEL)", "a\x7fb", `"a\x7fb"`},
|
|
{"event= injection attempt", "key event=fake bucket=", `"key event=fake bucket="`},
|
|
// Errors are stringified and the same rules apply.
|
|
{"error with newline", errors.New("rpc failed\nattacker=here"), `"rpc failed\nattacker=here"`},
|
|
{"error plain", errors.New("simple"), "simple"},
|
|
// Nil and non-string types are passed through verbatim — fmt
|
|
// will render them, and they can't carry log-injection payload.
|
|
{"nil arg", nil, nil},
|
|
{"int arg", 42, 42},
|
|
{"bool arg", true, true},
|
|
}
|
|
for _, tc := range cases {
|
|
t.Run(tc.name, func(t *testing.T) {
|
|
got := sanitizeHealArg(tc.in)
|
|
assert.Equal(t, tc.want, got)
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVersioningHealInfof_KeyWithWhitespaceStaysOneField confirms the
|
|
// end-to-end format: when a caller passes a malicious key, the
|
|
// fmt.Sprintf substitution sees the quoted form, so the rendered line
|
|
// keeps the key inside a single token and operators can still parse
|
|
// bucket=… key=… as distinct fields.
|
|
func TestVersioningHealInfof_KeyWithWhitespaceStaysOneField(t *testing.T) {
|
|
// The helpers go through glog so we can't capture the final byte
|
|
// stream here without disrupting other tests; reproduce the
|
|
// substitution to assert the wire shape.
|
|
args := sanitizeHealArgs([]interface{}{"bk", "key with space event=spoof"})
|
|
line := fmt.Sprintf("bucket=%s key=%s", args...)
|
|
assert.Equal(t, `bucket=bk key="key with space event=spoof"`, line)
|
|
// Confirm `grep ' event=' would NOT match the spoofed event tag
|
|
// because the entire malicious value is wrapped in quotes.
|
|
assert.NotContains(t, strings.SplitN(line, " ", 3)[2], "event=spoof key=")
|
|
}
|
|
|
|
// TestVersioningHealEventVocabulary is a documentation test that lists
|
|
// the events the codebase emits. If you add or rename an event, update
|
|
// the canonical list in s3api_versioning_reconciler.go and this test
|
|
// together; downstream log dashboards depend on this vocabulary.
|
|
func TestVersioningHealEventVocabulary(t *testing.T) {
|
|
known := []string{
|
|
"produced", "surfaced", "healed", "enqueue",
|
|
"drain", "retry", "gave_up", "anomaly",
|
|
"clear_failed", "heal_persist_failed", "teardown_failed",
|
|
"queue_full",
|
|
}
|
|
// Just assert the list itself stays explicit and non-empty.
|
|
assert.NotEmpty(t, known)
|
|
for _, e := range known {
|
|
assert.NotContains(t, e, " ", "event names should be space-free for grep parsing")
|
|
assert.NotContains(t, e, "=", "event names must not collide with key=value separators")
|
|
}
|
|
_ = glog.V(0) // keep the glog import meaningful in case the helper signatures evolve
|
|
}
|