mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
fix(ec): never delete recoverable EC shards on startup/reconcile (the non-empty-.dat sibling of the stub bug) (#9941)
* fix(ec): never delete recoverable shards on startup/reconcile (size-direction + byte-exact .dat)
EC startup validation and the cross-disk reconcile could delete the only
copy of distributed-EC shards whenever a non-empty .dat sat beside them.
This is the same data-loss class as the empty-.dat-stub fix, now for a
real (non-empty) stale or partial .dat.
validateEcVolume: the discriminating signal is the shard size relative to
the .dat's full encode, not the shard count.
- shards smaller than expected: an interrupted local encode left partial
shards and the .dat is the complete source -> reclaim the .dat.
- shards equal to expected: a valid (or still-distributing) EC volume ->
keep; the shards may be the only copy.
- shards larger than expected: the .dat is the stale/partial side (e.g. an
interrupted decode left a half-written .dat next to the real shards) ->
keep.
Previously any size mismatch, a low shard count beside a .dat, or a
transient stat error returned "delete", wiping sole-copy shards. Now every
ambiguity (size mismatch in either direction, inconsistent shard sizes,
transient I/O error, partial shard set) keeps the data; only a credible
full source .dat with no partial set to lose is reclaimed.
handleFoundEcxFile: a shard load failure (corrupt/locked .ecx, EMFILE
during a mass restart, transient I/O) no longer deletes the EC files when a
.dat exists -- it only unloads and keeps the files for retry. All deletion
authority now flows through validateEcVolume.
pruneIncompleteEcWithSiblingDat: count shards NODE-WIDE (a set split across
sibling disks summing to >= dataShards is independently recoverable and is
left alone), and require the sibling .dat to byte-exactly match the size
.vif recorded at encode time before deleting -- the prior "at least this
big, or bigger than a superblock" gate could trust a stale .dat and wipe
sole-copy shards. EC encode records the source size in .vif, so this gate
works for real volumes; older volumes without it fail safe (kept).
Rust volume server mirrors all of the above: size-direction + keep-on-
ambiguity in validate_ec_volume, keep-on-load-failure in
handle_found_ecx_file, and the node-wide + byte-exact gate in the prune.
The Rust validate/prune paths now resolve the data-shard count from the
volume's own .vif instead of hardcoding 10+4, so custom-ratio volumes are
not mis-sized and wrongly deleted on reboot.
Existing tests that encoded the old (unsafe) "delete on low count / size
mismatch" behavior are updated to the safe expectation, and new regression
tests cover the partial-decode-.dat-keeps-shards and transient-error-keeps
cases (Go and Rust); they fail on the pre-fix code.
* fix(ec): record DatFileSize in planted EC .vif for the prune test; trim comments
The multi-disk lifecycle e2e test planted a partial EC leftover with an
empty .vif, so the byte-exact prune gate (which a real encoded volume
satisfies via its recorded source size) kept it instead of cleaning up.
Record DatFileSize + the EC ratio in the planted .vif, matching production.
Also condense the verbose comments added in this change to the repo's
concise style.
This commit is contained in:
@@ -371,20 +371,12 @@ func (l *DiskLocation) handleFoundEcxFile(shards []string, collection string, vo
|
||||
return
|
||||
}
|
||||
|
||||
// Attempt to load the EC shards
|
||||
// A load failure (corrupt/locked .ecx, EMFILE, transient I/O) is not proof
|
||||
// the shards are disposable -- validateEcVolume already decided they may be
|
||||
// the only copy. Release FDs but keep the files for retry; never delete here.
|
||||
if err := l.loadEcShards(shards, collection, volumeId, onShardLoad); err != nil {
|
||||
// If EC shards failed to load and .dat still exists, clean up EC files to allow .dat file to be used
|
||||
// If .dat is gone, log error but don't clean up (may be waiting for shards from other servers)
|
||||
if datExists {
|
||||
glog.Warningf("Failed to load EC shards for volume %d and .dat exists: %v, cleaning up EC files to use .dat...", volumeId, err)
|
||||
// Unload first to release FDs, then remove files
|
||||
l.unloadEcVolume(volumeId)
|
||||
l.removeEcVolumeFiles(collection, volumeId)
|
||||
} else {
|
||||
glog.Warningf("Failed to load EC shards for volume %d: %v (this may be normal for distributed EC volumes)", volumeId, err)
|
||||
// Clean up any partially loaded in-memory state. This does not delete files.
|
||||
l.unloadEcVolume(volumeId)
|
||||
}
|
||||
glog.Warningf("Failed to load EC shards for volume %d: %v; keeping files for retry", volumeId, err)
|
||||
l.unloadEcVolume(volumeId)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -458,98 +450,70 @@ func calculateExpectedShardSize(datFileSize int64, dataShardCount int) int64 {
|
||||
return shardSize
|
||||
}
|
||||
|
||||
// validateEcVolume checks if EC volume has enough shards to be functional
|
||||
// For distributed EC volumes (where .dat is deleted), any number of shards is valid
|
||||
// For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards
|
||||
// Also validates that all shards have the same size (required for Reed-Solomon EC)
|
||||
// If .dat exists, it also validates shards match the expected size based on .dat file size
|
||||
// validateEcVolume reports whether the EC files for (collection, vid) on this
|
||||
// disk may be deleted to reclaim the local .dat. It returns false (delete)
|
||||
// only when that provably loses no data; every ambiguity returns true (keep),
|
||||
// since the shards may be the only copy of distributed-EC data.
|
||||
func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool {
|
||||
baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
|
||||
datFileName := baseFileName + ".dat"
|
||||
|
||||
var expectedShardSize int64 = -1
|
||||
datExists := false
|
||||
|
||||
// Resolve the data-shard count from the volume's own .vif, which is
|
||||
// written at encode time and travels with the volume. The volume
|
||||
// server never loads the cluster EC config into memory, so the OSS
|
||||
// default (10) would size a custom-ratio volume's shards (e.g. 9+3)
|
||||
// for 10 data shards, fail the size check, and wrongly delete a
|
||||
// healthy volume on reboot. Fall back to the default only when the
|
||||
// .vif carries no EC shard config.
|
||||
// Custom ratio comes from the volume's own .vif; the server holds no
|
||||
// cluster EC config in memory.
|
||||
dataShards := l.ecDataShardsFromVif(collection, vid)
|
||||
|
||||
// If .dat file exists, compute exact expected shard size from it.
|
||||
// An empty .dat (<= a superblock, zero needles) cannot be the encode
|
||||
// source -- it is a leftover stub -- so treat it as absent rather than
|
||||
// letting it mark a healthy distributed EC volume as an interrupted
|
||||
// local encode, which would delete its shards.
|
||||
// On-disk .dat size, or -1 when absent (an empty <= superblock .dat is a
|
||||
// stub). A transient stat error keeps the shards rather than deleting.
|
||||
var expectedShardSize int64 = -1
|
||||
datExists := false
|
||||
if datFileInfo, err := os.Stat(datFileName); err == nil {
|
||||
if datFileInfo.Size() > int64(super_block.SuperBlockSize) {
|
||||
datExists = true
|
||||
expectedShardSize = calculateExpectedShardSize(datFileInfo.Size(), dataShards)
|
||||
}
|
||||
} else if !os.IsNotExist(err) {
|
||||
// If stat fails with unexpected error (permission, I/O), fail validation
|
||||
// Don't treat this as "distributed EC" - it could be a temporary error
|
||||
glog.Warningf("Failed to stat .dat file %s: %v", datFileName, err)
|
||||
return false
|
||||
glog.Warningf("EC volume %d: cannot stat .dat %s (%v); keeping EC shards", vid, datFileName, err)
|
||||
return true
|
||||
}
|
||||
|
||||
// Count local shards; a transient stat error or inconsistent sizes -> keep.
|
||||
shardCount := 0
|
||||
var actualShardSize int64 = -1
|
||||
|
||||
// Count shards and validate they all have the same size (required for Reed-Solomon EC)
|
||||
// Check up to MaxShardCount (32) to support custom EC ratios
|
||||
for i := 0; i < erasure_coding.MaxShardCount; i++ {
|
||||
shardFileName := baseFileName + erasure_coding.ToExt(i)
|
||||
fi, err := os.Stat(shardFileName)
|
||||
|
||||
if err == nil {
|
||||
// Check if file has non-zero size
|
||||
if fi.Size() > 0 {
|
||||
// Validate all shards are the same size (required for Reed-Solomon EC)
|
||||
if actualShardSize == -1 {
|
||||
actualShardSize = fi.Size()
|
||||
} else if fi.Size() != actualShardSize {
|
||||
glog.Warningf("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)",
|
||||
vid, i, fi.Size(), actualShardSize)
|
||||
return false
|
||||
glog.Warningf("EC volume %d shard %d size %d != %d; keeping EC shards", vid, i, fi.Size(), actualShardSize)
|
||||
return true
|
||||
}
|
||||
shardCount++
|
||||
}
|
||||
} else if !os.IsNotExist(err) {
|
||||
// If stat fails with unexpected error (permission, I/O), fail validation
|
||||
// This is consistent with .dat file error handling
|
||||
glog.Warningf("Failed to stat shard file %s: %v", shardFileName, err)
|
||||
return false
|
||||
glog.Warningf("EC volume %d: cannot stat shard %s (%v); keeping EC shards", vid, shardFileName, err)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// If .dat file exists, validate shard size matches expected size
|
||||
if datExists && actualShardSize > 0 && expectedShardSize > 0 {
|
||||
if actualShardSize != expectedShardSize {
|
||||
glog.Warningf("EC volume %d: shard size %d doesn't match expected size %d (based on .dat file size)",
|
||||
vid, actualShardSize, expectedShardSize)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// If .dat file is gone, this is a distributed EC volume - any shard count is valid
|
||||
if !datExists {
|
||||
glog.V(1).Infof("EC volume %d: distributed EC (.dat removed) with %d shards", vid, shardCount)
|
||||
return true
|
||||
return true // distributed EC; any shard count is valid
|
||||
}
|
||||
|
||||
// If .dat file exists, we need at least DataShards shards locally
|
||||
// for the volume's configured ratio. Otherwise it's an incomplete
|
||||
// EC encoding that should be cleaned up.
|
||||
if shardCount < dataShards {
|
||||
glog.Warningf("EC volume %d has .dat file but only %d shards (need at least %d for local EC)",
|
||||
vid, shardCount, dataShards)
|
||||
// Reclaim only when it loses no data. Shards smaller than this .dat's full
|
||||
// encode are an interrupted encode whose .dat is the complete source ->
|
||||
// reclaim. Shards >= expected (valid/distributing EC, or a stale/partial
|
||||
// .dat beside larger real shards) may be the only copy -> keep.
|
||||
if shardCount == 0 {
|
||||
return false
|
||||
}
|
||||
if expectedShardSize > 0 && actualShardSize > 0 && actualShardSize < expectedShardSize {
|
||||
glog.Warningf("EC volume %d: %d shards of %d bytes are smaller than the .dat's full encode (%d bytes); reclaiming the complete .dat",
|
||||
vid, shardCount, actualShardSize, expectedShardSize)
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
|
||||
@@ -54,14 +54,18 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) {
|
||||
expectLoadSuccess: false,
|
||||
},
|
||||
{
|
||||
name: "Incomplete EC: shards with .ecx but < 10 shards, .dat exists - should cleanup",
|
||||
// Full-size shards beside a .dat are NOT an interrupted local
|
||||
// encode (which leaves equally-truncated shards smaller than the
|
||||
// .dat); they may be sole copies of a distributed volume, so the
|
||||
// safe behavior is to keep them rather than delete on a low count.
|
||||
name: "Distributed EC: full-size shards with .ecx, < 10 of them, .dat exists - keep",
|
||||
volumeId: 102,
|
||||
collection: "",
|
||||
createDatFile: true,
|
||||
createEcxFile: true,
|
||||
createEcjFile: false,
|
||||
numShards: 7, // Less than DataShardsCount (10)
|
||||
expectCleanup: true,
|
||||
numShards: 7, // Less than DataShardsCount (10), but full size
|
||||
expectCleanup: false,
|
||||
expectLoadSuccess: false,
|
||||
},
|
||||
{
|
||||
@@ -275,12 +279,16 @@ func TestValidateEcVolume(t *testing.T) {
|
||||
expectValid: true,
|
||||
},
|
||||
{
|
||||
name: "Invalid: .dat exists with < 10 shards",
|
||||
// Full-size shards smaller in count than dataShards may be sole
|
||||
// copies of a distributed volume (a real interrupted local encode
|
||||
// leaves equally-truncated shards, not full-size ones), so they
|
||||
// are kept rather than deleted in favor of the .dat.
|
||||
name: "Keep: .dat exists with < 10 full-size shards (possible distributed sole copies)",
|
||||
volumeId: 201,
|
||||
collection: "",
|
||||
createDatFile: true,
|
||||
numShards: 9,
|
||||
expectValid: false,
|
||||
expectValid: true,
|
||||
},
|
||||
{
|
||||
name: "Valid: .dat deleted (distributed EC) with any shards",
|
||||
@@ -307,12 +315,15 @@ func TestValidateEcVolume(t *testing.T) {
|
||||
expectValid: false,
|
||||
},
|
||||
{
|
||||
name: "Invalid: .dat exists with different size shards",
|
||||
// Inconsistent shard sizes signal corruption or mixed generations,
|
||||
// not a clean interrupted encode; deleting them could destroy the
|
||||
// only copy, so validation keeps them.
|
||||
name: "Keep: .dat exists with different size shards (inconsistent, not trusted for deletion)",
|
||||
volumeId: 205,
|
||||
collection: "",
|
||||
createDatFile: true,
|
||||
numShards: 10, // Will create shards with varying sizes
|
||||
expectValid: false,
|
||||
expectValid: true,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
|
||||
)
|
||||
|
||||
func writeSizedFile(t *testing.T, path string, size int64) {
|
||||
t.Helper()
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
t.Fatalf("create %s: %v", path, err)
|
||||
}
|
||||
if size > 0 {
|
||||
if err := f.Truncate(size); err != nil {
|
||||
f.Close()
|
||||
t.Fatalf("truncate %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
t.Fatalf("close %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateEcVolume_PartialDatNextToFullShardsKeeps reproduces the headline
|
||||
// data-loss geometry: an interrupted decode (or any stale/partial .dat) leaves
|
||||
// a .dat SMALLER than the volume's real source next to the full-size EC shards
|
||||
// that are the only copy. validateEcVolume must keep the shards, not delete
|
||||
// them in favor of the partial .dat.
|
||||
func TestValidateEcVolume_PartialDatNextToFullShardsKeeps(t *testing.T) {
|
||||
l := newTestDiskLocation(t.TempDir())
|
||||
base := erasure_coding.EcShardFileName("", l.Directory, 70)
|
||||
|
||||
// Real shards sized for a 30 MB source volume.
|
||||
fullShardSize := calculateExpectedShardSize(30*1024*1024, erasure_coding.DataShardsCount)
|
||||
for i := 0; i < erasure_coding.DataShardsCount; i++ {
|
||||
writeSizedFile(t, base+erasure_coding.ToExt(i), fullShardSize)
|
||||
}
|
||||
// A partial .dat (e.g. a decode crashed mid-write) far smaller than the
|
||||
// shards' real source — bigger than a superblock so it is not swept as a
|
||||
// stub, but smaller than what these shards encode.
|
||||
writeSizedFile(t, base+".dat", 5*1024*1024)
|
||||
|
||||
if !l.validateEcVolume("", 70) {
|
||||
t.Fatal("validateEcVolume deleted full-size shards beside a smaller (stale/partial) .dat; the shards may be the only copy")
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateEcVolume_InterruptedEncodeReclaimsDat is the legitimate cleanup:
|
||||
// a full source .dat next to shards SMALLER than it would encode (an encode
|
||||
// interrupted mid-shard-write). The .dat is the complete source, so reclaiming
|
||||
// it (deleting the partial shards) loses no data.
|
||||
func TestValidateEcVolume_InterruptedEncodeReclaimsDat(t *testing.T) {
|
||||
l := newTestDiskLocation(t.TempDir())
|
||||
base := erasure_coding.EcShardFileName("", l.Directory, 71)
|
||||
|
||||
datSize := int64(30 * 1024 * 1024)
|
||||
writeSizedFile(t, base+".dat", datSize)
|
||||
// Shards truncated well below the .dat's full encode (partial writes).
|
||||
partialShardSize := calculateExpectedShardSize(datSize, erasure_coding.DataShardsCount) / 3
|
||||
if partialShardSize <= 0 {
|
||||
t.Fatal("test setup: partial shard size must be positive")
|
||||
}
|
||||
for i := 0; i < erasure_coding.DataShardsCount; i++ {
|
||||
writeSizedFile(t, base+erasure_coding.ToExt(i), partialShardSize)
|
||||
}
|
||||
|
||||
if l.validateEcVolume("", 71) {
|
||||
t.Fatal("validateEcVolume kept shards smaller than the full source .dat; an interrupted local encode should be reclaimable")
|
||||
}
|
||||
}
|
||||
|
||||
// TestValidateEcVolume_TransientStatErrorKeeps confirms a non-ENOENT stat error
|
||||
// never authorizes deletion: a shard directory that cannot be read (EACCES)
|
||||
// must keep the data, not delete it.
|
||||
func TestValidateEcVolume_TransientStatErrorKeeps(t *testing.T) {
|
||||
if os.Getuid() == 0 {
|
||||
t.Skip("root bypasses directory permission checks")
|
||||
}
|
||||
parent := t.TempDir()
|
||||
sub := filepath.Join(parent, "locked")
|
||||
if err := os.Mkdir(sub, 0o755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
base := erasure_coding.EcShardFileName("", sub, 73)
|
||||
writeSizedFile(t, base+".dat", int64(super_block.SuperBlockSize)+1024)
|
||||
writeSizedFile(t, base+erasure_coding.ToExt(0), 1024)
|
||||
if err := os.Chmod(sub, 0o000); err != nil {
|
||||
t.Fatalf("chmod: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { os.Chmod(sub, 0o755) })
|
||||
|
||||
l := newTestDiskLocation(sub)
|
||||
// stat under the 0000 dir fails with EACCES (not ENOENT); validateEcVolume
|
||||
// must keep (return true), never delete on a transient error.
|
||||
if !l.validateEcVolume("", 73) {
|
||||
t.Fatal("validateEcVolume returned delete on a transient stat error; must keep on ambiguity")
|
||||
}
|
||||
}
|
||||
@@ -5,9 +5,11 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
|
||||
"github.com/seaweedfs/seaweedfs/weed/util"
|
||||
)
|
||||
|
||||
@@ -85,8 +87,17 @@ func TestIssue9478_PartialEcOnSiblingDiskOfHealthyDat(t *testing.T) {
|
||||
if f, err := os.Create(ecBase + ".ecj"); err == nil {
|
||||
f.Close()
|
||||
}
|
||||
if f, err := os.Create(ecBase + ".vif"); err == nil {
|
||||
f.Close()
|
||||
// A credible EC .vif records the encode-time source size and ratio. The
|
||||
// prune now deletes a partial leftover only when the sibling .dat matches
|
||||
// that recorded size byte-for-byte (a real encoded volume records it), so
|
||||
// the test reflects production rather than a loose "any .dat > superblock"
|
||||
// gate.
|
||||
if err := volume_info.SaveVolumeInfo(ecBase+".vif", &volume_server_pb.VolumeInfo{
|
||||
Version: uint32(needle.Version3),
|
||||
DatFileSize: datFileSize,
|
||||
EcShardConfig: &volume_server_pb.EcShardConfig{DataShards: 10, ParityShards: 4},
|
||||
}); err != nil {
|
||||
t.Fatalf("save ec .vif: %v", err)
|
||||
}
|
||||
|
||||
minFreeSpace := util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"}
|
||||
|
||||
@@ -10,7 +10,6 @@ import (
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
|
||||
)
|
||||
|
||||
// datOwnerInfo records both the disk that holds a .dat for a given
|
||||
@@ -226,6 +225,24 @@ func (s *Store) indexEcxOwners() map[ecKeyForReconcile]ecxOwnerInfo {
|
||||
// to forget the registrations the per-disk pass already emitted on
|
||||
// NewEcShardsChan during startup, instead of waiting for the first
|
||||
// periodic heartbeat to reconcile.
|
||||
// countEcShardsNodeWide returns the distinct EC shard ids for (collection, vid)
|
||||
// across every disk on this store. Shards can be split across sibling disks, so
|
||||
// a per-disk count understates a node-wide-recoverable set. Caller must not hold
|
||||
// any DiskLocation.ecVolumesLock (this takes them).
|
||||
func (s *Store) countEcShardsNodeWide(collection string, vid needle.VolumeId) int {
|
||||
seen := make(map[erasure_coding.ShardId]struct{})
|
||||
for _, loc := range s.Locations {
|
||||
loc.ecVolumesLock.RLock()
|
||||
if ev, ok := loc.ecVolumes[vid]; ok && ev.Collection == collection {
|
||||
for _, sh := range ev.Shards {
|
||||
seen[sh.ShardId] = struct{}{}
|
||||
}
|
||||
}
|
||||
loc.ecVolumesLock.RUnlock()
|
||||
}
|
||||
return len(seen)
|
||||
}
|
||||
|
||||
func (s *Store) pruneIncompleteEcWithSiblingDat() {
|
||||
if len(s.Locations) < 2 {
|
||||
return
|
||||
@@ -251,10 +268,8 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
|
||||
loc.ecVolumesLock.RLock()
|
||||
for vid, ev := range loc.ecVolumes {
|
||||
shardCount := len(ev.Shards)
|
||||
// Use the volume's configured data-shard count, not the OSS
|
||||
// default: a disk holding a full data set for a custom ratio
|
||||
// (e.g. 9 shards of a 9+3 volume) is independently recoverable
|
||||
// and must not be mistaken for a partial leftover and wiped.
|
||||
// Use the volume's own ratio, not the OSS default, so a full
|
||||
// custom-ratio data set (e.g. 9 of a 9+3) is not mistaken for a leftover.
|
||||
dataShards := erasure_coding.DataShardsCount
|
||||
if ev.ECContext != nil && ev.ECContext.DataShards > 0 {
|
||||
dataShards = ev.ECContext.DataShards
|
||||
@@ -267,16 +282,13 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
|
||||
if !hasDat || owner.location == loc {
|
||||
continue
|
||||
}
|
||||
// Credible source size: prefer .vif's encode-time size; when
|
||||
// unknown (0) require more than a bare superblock so an empty
|
||||
// 8-byte stub (e.g. a phantom .dat) can't pass.
|
||||
requiredDatSize := ev.DatFileSize()
|
||||
if requiredDatSize <= 0 {
|
||||
requiredDatSize = int64(super_block.SuperBlockSize) + 1
|
||||
}
|
||||
if owner.size < requiredDatSize {
|
||||
glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards but sibling .dat on %s is %d bytes (need >= %d); leaving partial EC in place so distributed reconstruction is still possible",
|
||||
vid, ev.Collection, loc.Directory, shardCount, owner.location.Directory, owner.size, requiredDatSize)
|
||||
// Delete only against a byte-exact committed source: the sibling
|
||||
// .dat must equal the size .vif recorded at encode time. An unknown
|
||||
// (0) or mismatched size cannot prove the .dat holds this data.
|
||||
datFileSize := ev.DatFileSize()
|
||||
if datFileSize <= 0 || owner.size != datFileSize {
|
||||
glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards; sibling .dat on %s is %d bytes but .vif recorded %d (need byte-exact match); leaving partial EC in place",
|
||||
vid, ev.Collection, loc.Directory, shardCount, owner.location.Directory, owner.size, datFileSize)
|
||||
continue
|
||||
}
|
||||
victims = append(victims, victim{
|
||||
@@ -291,7 +303,15 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
|
||||
loc.ecVolumesLock.RUnlock()
|
||||
|
||||
for _, v := range victims {
|
||||
glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards (need %d) while a healthy .dat exists on sibling disk %s; cleaning up leftover EC files (issue 9478)",
|
||||
// Never prune when the shards are recoverable node-wide (a set
|
||||
// split across sibling disks summing to >= dataShards); they may
|
||||
// be sole copies of a distributed volume.
|
||||
if nodeWide := s.countEcShardsNodeWide(v.collection, v.vid); nodeWide >= v.dataShards {
|
||||
glog.Warningf("ec volume %d (collection=%q): %d shards present node-wide (>= %d) are independently recoverable; leaving EC in place despite a sibling .dat",
|
||||
v.vid, v.collection, nodeWide, v.dataShards)
|
||||
continue
|
||||
}
|
||||
glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards (need %d) while a byte-exact source .dat exists on sibling disk %s; cleaning up leftover EC files",
|
||||
v.vid, v.collection, loc.Directory, v.shardCount, v.dataShards, v.datDir)
|
||||
loc.unloadEcVolume(v.vid)
|
||||
loc.removeEcVolumeFiles(v.collection, v.vid)
|
||||
|
||||
Reference in New Issue
Block a user