mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-15 23:56:48 +03:00
c2591b4395
* volume: verify before destroy in VolumeCopy and replication repair Four data-safety fixes around copy/repair paths that could destroy or resurrect data before verifying the source or survivors. (a) VolumeCopy no longer deletes a pre-existing local replica up front. The delete is deferred until ReadVolumeFileStatus on the source succeeds, so a transient source outage (or a retry after one) can no longer wipe a healthy destination replica. Gated on source readability only; size/count comparisons are intentionally not used because they invert legitimately after divergent vacuum/compaction. Mirrored in the Rust volume server. (b) volume.check.disk no longer resurrects vacuumed-deleted needles. A key present-and-live on the source but entirely absent on the target is ambiguous: it may be a genuine missing write, or a needle deleted on the target and then vacuumed (its index entry and any tombstone are gone). An individual needle AppendAtNs has no monotonic relation to a vacuum watermark, so the old cutoff heuristic could not tell them apart. Without positive proof the absence is a missing write, the safe default is to NOT push it back. Tradeoff: a real missing write may go unrepaired until a tombstone-aware path exists, but we never raise back deleted data. (c) Over-replication trim no longer resurrects needles or removes the wrong replica. The pre-delete sync now runs read-only (divergence check only) instead of writing the doomed replica's needles into the survivor. pickOneReplicaToDelete only ever removes the smallest of multiple healthy writable replicas; it refuses the trim when doing so would leave only read-only/integrity-flagged survivors, since file_count>0 alone cannot prove the survivor's .dat is readable. (d) Incomplete-volume (.note) cleanup keeps the shared .vif when an .ecx for the same vid coexists on the disk, so removing an interrupted regular copy cannot strip a coexisting EC volume's info file. VolumeCopy now surfaces .note write/remove errors instead of ignoring them. In the Rust volume server (where a persisting note is actually reachable) the .note check moves below the empty-stub sweep and EC validation, keeps the .vif on EC coexistence, and the mount path fails when a .note still persists. * shell: scope the over-replication writable-survivor guard to the trim path only The writable-survivor guard (never trim down to a read-only survivor) lived inside the shared pickOneReplicaToDelete, so it also gated the misplaced-volume relocation via pickOneMisplacedVolume -- a misplaced read-only volume (e.g. a full one) would silently stop being rebalanced. Extract pickSmallestReplica for the relocation path (which deletes-and-recreates and must act on read-only replicas), and keep the writable-survivor guard only in pickOneReplicaToDelete used by the over-replication trim. * seaweed-volume: recompute keep_vif after invalid-EC cleanup in the .note path keep_vif used the pre-validation ecx_exists snapshot, so when the EC-validation step above removed the invalid .ecx/shards, the .note cleanup still preserved a now-orphaned .vif. Re-check .ecx existence at cleanup time, matching the Go hasEcxFile re-check. * shell: keep placement when picking an over-replication victim to delete The trim picked the smallest writable replica without regard to placement, so it could delete the only replica in a required failure domain (e.g. with "100" and replicas dc1 + two in dc2, deleting dc1 leaves both survivors in dc2). Prefer a writable replica whose removal still satisfies placement, falling back to the smallest writable only when none does.
359 lines
12 KiB
Go
359 lines
12 KiB
Go
package shell
|
|
|
|
import (
|
|
"bytes"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
|
"google.golang.org/grpc"
|
|
"google.golang.org/grpc/credentials/insecure"
|
|
)
|
|
|
|
// TestDoVolumeCheckDiskDoesNotResurrectAbsentNeedle verifies that a needle
|
|
// present-and-live on the source but entirely absent on the target is NOT
|
|
// pushed back by default. Such an absence is indistinguishable from a needle
|
|
// that was deleted on the target and then vacuumed away, so resurrecting it
|
|
// would raise back data the operator deleted.
|
|
func TestDoVolumeCheckDiskDoesNotResurrectAbsentNeedle(t *testing.T) {
|
|
sourceDB, targetDB := needle_map.NewMemDb(), needle_map.NewMemDb()
|
|
defer sourceDB.Close()
|
|
defer targetDB.Close()
|
|
|
|
// Source has a live needle; target lacks it (e.g. deleted+vacuumed there).
|
|
if err := sourceDB.Set(types.NeedleId(1001), types.ToOffset(8), types.Size(123)); err != nil {
|
|
t.Fatalf("seed source: %v", err)
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
vcd := &volumeCheckDisk{
|
|
commandEnv: &CommandEnv{
|
|
option: &ShellOptions{
|
|
GrpcDialOption: grpc.WithTransportCredentials(insecure.NewCredentials()),
|
|
},
|
|
},
|
|
writer: &buf,
|
|
now: time.Now(),
|
|
applyChanges: false, // simulation: should not even read the source blob
|
|
nonRepairThreshold: 1,
|
|
// resurrectMissingNeedles left false: the safe default.
|
|
}
|
|
|
|
// Source points at an unreachable address: with the fix no blob read is
|
|
// attempted, so this is never contacted. The pre-fix code queues the
|
|
// absent needle and tries to read it from here, surfacing as an error.
|
|
source := &VolumeReplica{
|
|
location: &location{"dc1", "r1", &master_pb.DataNodeInfo{Id: "127.0.0.1:1"}},
|
|
info: &master_pb.VolumeInformationMessage{Id: 7},
|
|
}
|
|
target := &VolumeReplica{
|
|
location: &location{"dc1", "r2", &master_pb.DataNodeInfo{Id: "127.0.0.1:2"}},
|
|
info: &master_pb.VolumeInformationMessage{Id: 7},
|
|
}
|
|
|
|
// With the safe default, the absent needle is skipped before any network
|
|
// read, so this returns cleanly with no changes. On the pre-fix code the
|
|
// needle is queued and a source blob read is attempted, which has no server
|
|
// to reach and surfaces as an error (or a resurrection) instead.
|
|
hasChanges, err := vcd.doVolumeCheckDisk(sourceDB, targetDB, source, target)
|
|
if err != nil {
|
|
t.Fatalf("doVolumeCheckDisk returned error: %v", err)
|
|
}
|
|
if hasChanges {
|
|
t.Fatalf("absent-on-target needle was resurrected; expected no changes")
|
|
}
|
|
}
|
|
|
|
type testCommandVolumeCheckDisk struct {
|
|
commandVolumeCheckDisk
|
|
}
|
|
|
|
type shouldSkipVolume struct {
|
|
name string
|
|
a VolumeReplica
|
|
b VolumeReplica
|
|
pulseTimeAtSecond int64
|
|
syncDeletions bool
|
|
shouldSkipVolume bool
|
|
}
|
|
|
|
func TestShouldSkipVolume(t *testing.T) {
|
|
var tests = []shouldSkipVolume{
|
|
{
|
|
name: "identical volumes should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true,
|
|
},
|
|
{
|
|
name: "different file counts should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1001,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false,
|
|
},
|
|
{
|
|
name: "different delete counts with syncDeletions enabled should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 101,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false,
|
|
},
|
|
{
|
|
name: "different delete counts with syncDeletions disabled should be skipped if file counts match",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 101,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: false,
|
|
shouldSkipVolume: true,
|
|
},
|
|
// Edge case: Zero file and delete counts
|
|
{
|
|
name: "volumes with zero file counts should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 0,
|
|
DeleteCount: 0,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 0,
|
|
DeleteCount: 0,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true,
|
|
},
|
|
{
|
|
name: "volumes with zero and non-zero file counts should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1,
|
|
DeleteCount: 0,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 0,
|
|
DeleteCount: 0,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false,
|
|
},
|
|
// Edge case: Recently modified volumes (after pulse time)
|
|
// Note: VolumePulsePeriod is 10 seconds, so pulse cutoff is now - 20 seconds
|
|
// When both volumes are recently modified, skip check to avoid false positives
|
|
{
|
|
name: "recently modified volumes with same file counts should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583395}, // Modified 5 seconds ago
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583390}, // Modified 10 seconds ago
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true, // Same counts = skip
|
|
},
|
|
{
|
|
name: "one volume modified before pulse cutoff with different file counts should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583370}, // Modified 30 seconds ago (before cutoff at -20s)
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 999,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583370}, // Same modification time
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false, // Different counts + old enough = needs sync
|
|
},
|
|
// Edge case: Different ModifiedAtSecond values, same file counts
|
|
{
|
|
name: "different modification times with same file counts should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300}, // 100 seconds before pulse time
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583350}, // 50 seconds before pulse time
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true, // Same counts, both before cutoff
|
|
},
|
|
// Edge case: Very close to pulse time boundary
|
|
{
|
|
name: "volumes modified exactly at pulse cutoff boundary with different counts should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1001,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583379}, // Just before cutoff (pulseTime - 21s)
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583379}, // Just before cutoff
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false, // At boundary with different counts - needs sync
|
|
},
|
|
{
|
|
name: "volumes modified just after pulse cutoff boundary with same counts should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583381}, // Just after cutoff (pulseTime - 19s)
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583381}, // Just after cutoff
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true, // Same counts + recent = skip to avoid false positive
|
|
},
|
|
// Edge case: Large file count differences
|
|
{
|
|
name: "large file count difference with old modification time should not be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 10000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 5000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583300},
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: false, // Large difference requires sync
|
|
},
|
|
// Edge case: Both volumes modified AFTER pulse cutoff time
|
|
// When ModifiedAtSecond >= pulseTimeAtSecond for both volumes with same counts,
|
|
// the condition (a.info.FileCount != b.info.FileCount) is false, so we skip
|
|
// without calling eqVolumeFileCount
|
|
{
|
|
name: "both volumes modified after pulse cutoff with same file counts should be skipped",
|
|
a: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583405}, // After pulse cutoff (1696583380)
|
|
},
|
|
b: VolumeReplica{nil, &master_pb.VolumeInformationMessage{
|
|
FileCount: 1000,
|
|
DeleteCount: 100,
|
|
ModifiedAtSecond: 1696583410}, // After pulse cutoff
|
|
},
|
|
pulseTimeAtSecond: 1696583400,
|
|
syncDeletions: true,
|
|
shouldSkipVolume: true, // Same counts = skip without calling eqVolumeFileCount
|
|
},
|
|
}
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
var buf bytes.Buffer
|
|
vcd := &volumeCheckDisk{
|
|
writer: &buf,
|
|
now: time.Unix(tt.pulseTimeAtSecond, 0),
|
|
verbose: false, // reduce noise in tests
|
|
syncDeletions: tt.syncDeletions,
|
|
}
|
|
result, err := vcd.shouldSkipVolume(&tt.a, &tt.b)
|
|
if err != nil {
|
|
// In unit tests, we expect no errors from shouldSkipVolume
|
|
// since we're using test data without actual network calls
|
|
t.Errorf("shouldSkipVolume() returned unexpected error: %v", err)
|
|
return
|
|
}
|
|
if result != tt.shouldSkipVolume {
|
|
t.Errorf("shouldSkipVolume() = %v, want %v\nFileCount A=%d B=%d, DeleteCount A=%d B=%d",
|
|
result, tt.shouldSkipVolume,
|
|
tt.a.info.FileCount, tt.b.info.FileCount,
|
|
tt.a.info.DeleteCount, tt.b.info.DeleteCount)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestVolumeCheckDiskHelperMethods tests the helper methods on volumeCheckDisk
|
|
func TestVolumeCheckDiskHelperMethods(t *testing.T) {
|
|
var buf bytes.Buffer
|
|
vcd := &volumeCheckDisk{
|
|
writer: &buf,
|
|
verbose: true,
|
|
}
|
|
|
|
// Test write method
|
|
vcd.write("test %s", "message")
|
|
if buf.String() != "test message\n" {
|
|
t.Errorf("write() output = %q, want %q", buf.String(), "test message\n")
|
|
}
|
|
|
|
// Test writeVerbose with verbose=true
|
|
buf.Reset()
|
|
vcd.writeVerbose("verbose %d", 123)
|
|
if buf.String() != "verbose 123\n" {
|
|
t.Errorf("writeVerbose() with verbose=true output = %q, want %q", buf.String(), "verbose 123\n")
|
|
}
|
|
|
|
// Test writeVerbose with verbose=false
|
|
buf.Reset()
|
|
vcd.verbose = false
|
|
vcd.writeVerbose("should not appear")
|
|
if buf.String() != "" {
|
|
t.Errorf("writeVerbose() with verbose=false output = %q, want empty", buf.String())
|
|
}
|
|
}
|