fix(ec): never delete recoverable EC shards on startup/reconcile (the non-empty-.dat sibling of the stub bug) (#9941)

* fix(ec): never delete recoverable shards on startup/reconcile (size-direction + byte-exact .dat) EC startup validation and the cross-disk reconcile could delete the only copy of distributed-EC shards whenever a non-empty .dat sat beside them. This is the same data-loss class as the empty-.dat-stub fix, now for a real (non-empty) stale or partial .dat. validateEcVolume: the discriminating signal is the shard size relative to the .dat's full encode, not the shard count. - shards smaller than expected: an interrupted local encode left partial shards and the .dat is the complete source -> reclaim the .dat. - shards equal to expected: a valid (or still-distributing) EC volume -> keep; the shards may be the only copy. - shards larger than expected: the .dat is the stale/partial side (e.g. an interrupted decode left a half-written .dat next to the real shards) -> keep. Previously any size mismatch, a low shard count beside a .dat, or a transient stat error returned "delete", wiping sole-copy shards. Now every ambiguity (size mismatch in either direction, inconsistent shard sizes, transient I/O error, partial shard set) keeps the data; only a credible full source .dat with no partial set to lose is reclaimed. handleFoundEcxFile: a shard load failure (corrupt/locked .ecx, EMFILE during a mass restart, transient I/O) no longer deletes the EC files when a .dat exists -- it only unloads and keeps the files for retry. All deletion authority now flows through validateEcVolume. pruneIncompleteEcWithSiblingDat: count shards NODE-WIDE (a set split across sibling disks summing to >= dataShards is independently recoverable and is left alone), and require the sibling .dat to byte-exactly match the size .vif recorded at encode time before deleting -- the prior "at least this big, or bigger than a superblock" gate could trust a stale .dat and wipe sole-copy shards. EC encode records the source size in .vif, so this gate works for real volumes; older volumes without it fail safe (kept). Rust volume server mirrors all of the above: size-direction + keep-on- ambiguity in validate_ec_volume, keep-on-load-failure in handle_found_ecx_file, and the node-wide + byte-exact gate in the prune. The Rust validate/prune paths now resolve the data-shard count from the volume's own .vif instead of hardcoding 10+4, so custom-ratio volumes are not mis-sized and wrongly deleted on reboot. Existing tests that encoded the old (unsafe) "delete on low count / size mismatch" behavior are updated to the safe expectation, and new regression tests cover the partial-decode-.dat-keeps-shards and transient-error-keeps cases (Go and Rust); they fail on the pre-fix code. * fix(ec): record DatFileSize in planted EC .vif for the prune test; trim comments The multi-disk lifecycle e2e test planted a partial EC leftover with an empty .vif, so the byte-exact prune gate (which a real encoded volume satisfies via its recorded source size) kept it instead of cleaning up. Record DatFileSize + the EC ratio in the planted .vif, matching production. Also condense the verbose comments added in this change to the repo's concise style.
2026-06-13 23:36:45 +03:00 · 2026-06-12 23:51:29 -07:00
parent 3718301599
commit f724828bcb
8 changed files with 393 additions and 186 deletions
@@ -371,20 +371,12 @@ func (l *DiskLocation) handleFoundEcxFile(shards []string, collection string, vo
 		return
 	}

-	// Attempt to load the EC shards
+	// A load failure (corrupt/locked .ecx, EMFILE, transient I/O) is not proof
+	// the shards are disposable -- validateEcVolume already decided they may be
+	// the only copy. Release FDs but keep the files for retry; never delete here.
 	if err := l.loadEcShards(shards, collection, volumeId, onShardLoad); err != nil {
-		// If EC shards failed to load and .dat still exists, clean up EC files to allow .dat file to be used
-		// If .dat is gone, log error but don't clean up (may be waiting for shards from other servers)
-		if datExists {
-			glog.Warningf("Failed to load EC shards for volume %d and .dat exists: %v, cleaning up EC files to use .dat...", volumeId, err)
-			// Unload first to release FDs, then remove files
-			l.unloadEcVolume(volumeId)
-			l.removeEcVolumeFiles(collection, volumeId)
-		} else {
-			glog.Warningf("Failed to load EC shards for volume %d: %v (this may be normal for distributed EC volumes)", volumeId, err)
-			// Clean up any partially loaded in-memory state. This does not delete files.
-			l.unloadEcVolume(volumeId)
-		}
+		glog.Warningf("Failed to load EC shards for volume %d: %v; keeping files for retry", volumeId, err)
+		l.unloadEcVolume(volumeId)
 		return
 	}
 }
@@ -458,98 +450,70 @@ func calculateExpectedShardSize(datFileSize int64, dataShardCount int) int64 {
 	return shardSize
 }

-// validateEcVolume checks if EC volume has enough shards to be functional
-// For distributed EC volumes (where .dat is deleted), any number of shards is valid
-// For incomplete EC encoding (where .dat still exists), we need at least DataShardsCount shards
-// Also validates that all shards have the same size (required for Reed-Solomon EC)
-// If .dat exists, it also validates shards match the expected size based on .dat file size
+// validateEcVolume reports whether the EC files for (collection, vid) on this
+// disk may be deleted to reclaim the local .dat. It returns false (delete)
+// only when that provably loses no data; every ambiguity returns true (keep),
+// since the shards may be the only copy of distributed-EC data.
 func (l *DiskLocation) validateEcVolume(collection string, vid needle.VolumeId) bool {
 	baseFileName := erasure_coding.EcShardFileName(collection, l.Directory, int(vid))
 	datFileName := baseFileName + ".dat"

-	var expectedShardSize int64 = -1
-	datExists := false
-
-	// Resolve the data-shard count from the volume's own .vif, which is
-	// written at encode time and travels with the volume. The volume
-	// server never loads the cluster EC config into memory, so the OSS
-	// default (10) would size a custom-ratio volume's shards (e.g. 9+3)
-	// for 10 data shards, fail the size check, and wrongly delete a
-	// healthy volume on reboot. Fall back to the default only when the
-	// .vif carries no EC shard config.
+	// Custom ratio comes from the volume's own .vif; the server holds no
+	// cluster EC config in memory.
 	dataShards := l.ecDataShardsFromVif(collection, vid)

-	// If .dat file exists, compute exact expected shard size from it.
-	// An empty .dat (<= a superblock, zero needles) cannot be the encode
-	// source -- it is a leftover stub -- so treat it as absent rather than
-	// letting it mark a healthy distributed EC volume as an interrupted
-	// local encode, which would delete its shards.
+	// On-disk .dat size, or -1 when absent (an empty <= superblock .dat is a
+	// stub). A transient stat error keeps the shards rather than deleting.
+	var expectedShardSize int64 = -1
+	datExists := false
 	if datFileInfo, err := os.Stat(datFileName); err == nil {
 		if datFileInfo.Size() > int64(super_block.SuperBlockSize) {
 			datExists = true
 			expectedShardSize = calculateExpectedShardSize(datFileInfo.Size(), dataShards)
 		}
 	} else if !os.IsNotExist(err) {
-		// If stat fails with unexpected error (permission, I/O), fail validation
-		// Don't treat this as "distributed EC" - it could be a temporary error
-		glog.Warningf("Failed to stat .dat file %s: %v", datFileName, err)
-		return false
+		glog.Warningf("EC volume %d: cannot stat .dat %s (%v); keeping EC shards", vid, datFileName, err)
+		return true
 	}

+	// Count local shards; a transient stat error or inconsistent sizes -> keep.
 	shardCount := 0
 	var actualShardSize int64 = -1
-
-	// Count shards and validate they all have the same size (required for Reed-Solomon EC)
-	// Check up to MaxShardCount (32) to support custom EC ratios
 	for i := 0; i < erasure_coding.MaxShardCount; i++ {
 		shardFileName := baseFileName + erasure_coding.ToExt(i)
 		fi, err := os.Stat(shardFileName)
-
 		if err == nil {
-			// Check if file has non-zero size
 			if fi.Size() > 0 {
-				// Validate all shards are the same size (required for Reed-Solomon EC)
 				if actualShardSize == -1 {
 					actualShardSize = fi.Size()
 				} else if fi.Size() != actualShardSize {
-					glog.Warningf("EC volume %d shard %d has size %d, expected %d (all EC shards must be same size)",
-						vid, i, fi.Size(), actualShardSize)
-					return false
+					glog.Warningf("EC volume %d shard %d size %d != %d; keeping EC shards", vid, i, fi.Size(), actualShardSize)
+					return true
 				}
 				shardCount++
 			}
 		} else if !os.IsNotExist(err) {
-			// If stat fails with unexpected error (permission, I/O), fail validation
-			// This is consistent with .dat file error handling
-			glog.Warningf("Failed to stat shard file %s: %v", shardFileName, err)
-			return false
+			glog.Warningf("EC volume %d: cannot stat shard %s (%v); keeping EC shards", vid, shardFileName, err)
+			return true
 		}
 	}

-	// If .dat file exists, validate shard size matches expected size
-	if datExists && actualShardSize > 0 && expectedShardSize > 0 {
-		if actualShardSize != expectedShardSize {
-			glog.Warningf("EC volume %d: shard size %d doesn't match expected size %d (based on .dat file size)",
-				vid, actualShardSize, expectedShardSize)
-			return false
-		}
-	}
-
-	// If .dat file is gone, this is a distributed EC volume - any shard count is valid
 	if !datExists {
-		glog.V(1).Infof("EC volume %d: distributed EC (.dat removed) with %d shards", vid, shardCount)
-		return true
+		return true // distributed EC; any shard count is valid
 	}

-	// If .dat file exists, we need at least DataShards shards locally
-	// for the volume's configured ratio. Otherwise it's an incomplete
-	// EC encoding that should be cleaned up.
-	if shardCount < dataShards {
-		glog.Warningf("EC volume %d has .dat file but only %d shards (need at least %d for local EC)",
-			vid, shardCount, dataShards)
+	// Reclaim only when it loses no data. Shards smaller than this .dat's full
+	// encode are an interrupted encode whose .dat is the complete source ->
+	// reclaim. Shards >= expected (valid/distributing EC, or a stale/partial
+	// .dat beside larger real shards) may be the only copy -> keep.
+	if shardCount == 0 {
+		return false
+	}
+	if expectedShardSize > 0 && actualShardSize > 0 && actualShardSize < expectedShardSize {
+		glog.Warningf("EC volume %d: %d shards of %d bytes are smaller than the .dat's full encode (%d bytes); reclaiming the complete .dat",
+			vid, shardCount, actualShardSize, expectedShardSize)
 		return false
 	}
-
 	return true
 }

@@ -54,14 +54,18 @@ func TestIncompleteEcEncodingCleanup(t *testing.T) {
 			expectLoadSuccess: false,
 		},
 		{
-			name:              "Incomplete EC: shards with .ecx but < 10 shards, .dat exists - should cleanup",
+			// Full-size shards beside a .dat are NOT an interrupted local
+			// encode (which leaves equally-truncated shards smaller than the
+			// .dat); they may be sole copies of a distributed volume, so the
+			// safe behavior is to keep them rather than delete on a low count.
+			name:              "Distributed EC: full-size shards with .ecx, < 10 of them, .dat exists - keep",
 			volumeId:          102,
 			collection:        "",
 			createDatFile:     true,
 			createEcxFile:     true,
 			createEcjFile:     false,
-			numShards:         7, // Less than DataShardsCount (10)
-			expectCleanup:     true,
+			numShards:         7, // Less than DataShardsCount (10), but full size
+			expectCleanup:     false,
 			expectLoadSuccess: false,
 		},
 		{
@@ -275,12 +279,16 @@ func TestValidateEcVolume(t *testing.T) {
 			expectValid:   true,
 		},
 		{
-			name:          "Invalid: .dat exists with < 10 shards",
+			// Full-size shards smaller in count than dataShards may be sole
+			// copies of a distributed volume (a real interrupted local encode
+			// leaves equally-truncated shards, not full-size ones), so they
+			// are kept rather than deleted in favor of the .dat.
+			name:          "Keep: .dat exists with < 10 full-size shards (possible distributed sole copies)",
 			volumeId:      201,
 			collection:    "",
 			createDatFile: true,
 			numShards:     9,
-			expectValid:   false,
+			expectValid:   true,
 		},
 		{
 			name:          "Valid: .dat deleted (distributed EC) with any shards",
@@ -307,12 +315,15 @@ func TestValidateEcVolume(t *testing.T) {
 			expectValid:   false,
 		},
 		{
-			name:          "Invalid: .dat exists with different size shards",
+			// Inconsistent shard sizes signal corruption or mixed generations,
+			// not a clean interrupted encode; deleting them could destroy the
+			// only copy, so validation keeps them.
+			name:          "Keep: .dat exists with different size shards (inconsistent, not trusted for deletion)",
 			volumeId:      205,
 			collection:    "",
 			createDatFile: true,
 			numShards:     10, // Will create shards with varying sizes
-			expectValid:   false,
+			expectValid:   true,
 		},
 	}

@@ -0,0 +1,103 @@
+package storage
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
+	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
+)
+
+func writeSizedFile(t *testing.T, path string, size int64) {
+	t.Helper()
+	f, err := os.Create(path)
+	if err != nil {
+		t.Fatalf("create %s: %v", path, err)
+	}
+	if size > 0 {
+		if err := f.Truncate(size); err != nil {
+			f.Close()
+			t.Fatalf("truncate %s: %v", path, err)
+		}
+	}
+	if err := f.Close(); err != nil {
+		t.Fatalf("close %s: %v", path, err)
+	}
+}
+
+// TestValidateEcVolume_PartialDatNextToFullShardsKeeps reproduces the headline
+// data-loss geometry: an interrupted decode (or any stale/partial .dat) leaves
+// a .dat SMALLER than the volume's real source next to the full-size EC shards
+// that are the only copy. validateEcVolume must keep the shards, not delete
+// them in favor of the partial .dat.
+func TestValidateEcVolume_PartialDatNextToFullShardsKeeps(t *testing.T) {
+	l := newTestDiskLocation(t.TempDir())
+	base := erasure_coding.EcShardFileName("", l.Directory, 70)
+
+	// Real shards sized for a 30 MB source volume.
+	fullShardSize := calculateExpectedShardSize(30*1024*1024, erasure_coding.DataShardsCount)
+	for i := 0; i < erasure_coding.DataShardsCount; i++ {
+		writeSizedFile(t, base+erasure_coding.ToExt(i), fullShardSize)
+	}
+	// A partial .dat (e.g. a decode crashed mid-write) far smaller than the
+	// shards' real source — bigger than a superblock so it is not swept as a
+	// stub, but smaller than what these shards encode.
+	writeSizedFile(t, base+".dat", 5*1024*1024)
+
+	if !l.validateEcVolume("", 70) {
+		t.Fatal("validateEcVolume deleted full-size shards beside a smaller (stale/partial) .dat; the shards may be the only copy")
+	}
+}
+
+// TestValidateEcVolume_InterruptedEncodeReclaimsDat is the legitimate cleanup:
+// a full source .dat next to shards SMALLER than it would encode (an encode
+// interrupted mid-shard-write). The .dat is the complete source, so reclaiming
+// it (deleting the partial shards) loses no data.
+func TestValidateEcVolume_InterruptedEncodeReclaimsDat(t *testing.T) {
+	l := newTestDiskLocation(t.TempDir())
+	base := erasure_coding.EcShardFileName("", l.Directory, 71)
+
+	datSize := int64(30 * 1024 * 1024)
+	writeSizedFile(t, base+".dat", datSize)
+	// Shards truncated well below the .dat's full encode (partial writes).
+	partialShardSize := calculateExpectedShardSize(datSize, erasure_coding.DataShardsCount) / 3
+	if partialShardSize <= 0 {
+		t.Fatal("test setup: partial shard size must be positive")
+	}
+	for i := 0; i < erasure_coding.DataShardsCount; i++ {
+		writeSizedFile(t, base+erasure_coding.ToExt(i), partialShardSize)
+	}
+
+	if l.validateEcVolume("", 71) {
+		t.Fatal("validateEcVolume kept shards smaller than the full source .dat; an interrupted local encode should be reclaimable")
+	}
+}
+
+// TestValidateEcVolume_TransientStatErrorKeeps confirms a non-ENOENT stat error
+// never authorizes deletion: a shard directory that cannot be read (EACCES)
+// must keep the data, not delete it.
+func TestValidateEcVolume_TransientStatErrorKeeps(t *testing.T) {
+	if os.Getuid() == 0 {
+		t.Skip("root bypasses directory permission checks")
+	}
+	parent := t.TempDir()
+	sub := filepath.Join(parent, "locked")
+	if err := os.Mkdir(sub, 0o755); err != nil {
+		t.Fatalf("mkdir: %v", err)
+	}
+	base := erasure_coding.EcShardFileName("", sub, 73)
+	writeSizedFile(t, base+".dat", int64(super_block.SuperBlockSize)+1024)
+	writeSizedFile(t, base+erasure_coding.ToExt(0), 1024)
+	if err := os.Chmod(sub, 0o000); err != nil {
+		t.Fatalf("chmod: %v", err)
+	}
+	t.Cleanup(func() { os.Chmod(sub, 0o755) })
+
+	l := newTestDiskLocation(sub)
+	// stat under the 0000 dir fails with EACCES (not ENOENT); validateEcVolume
+	// must keep (return true), never delete on a transient error.
+	if !l.validateEcVolume("", 73) {
+		t.Fatal("validateEcVolume returned delete on a transient stat error; must keep on ambiguity")
+	}
+}
@@ -5,9 +5,11 @@ import (
 	"testing"

 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
 	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
 	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+	"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
 	"github.com/seaweedfs/seaweedfs/weed/util"
 )

@@ -85,8 +87,17 @@ func TestIssue9478_PartialEcOnSiblingDiskOfHealthyDat(t *testing.T) {
 	if f, err := os.Create(ecBase + ".ecj"); err == nil {
 		f.Close()
 	}
-	if f, err := os.Create(ecBase + ".vif"); err == nil {
-		f.Close()
+	// A credible EC .vif records the encode-time source size and ratio. The
+	// prune now deletes a partial leftover only when the sibling .dat matches
+	// that recorded size byte-for-byte (a real encoded volume records it), so
+	// the test reflects production rather than a loose "any .dat > superblock"
+	// gate.
+	if err := volume_info.SaveVolumeInfo(ecBase+".vif", &volume_server_pb.VolumeInfo{
+		Version:       uint32(needle.Version3),
+		DatFileSize:   datFileSize,
+		EcShardConfig: &volume_server_pb.EcShardConfig{DataShards: 10, ParityShards: 4},
+	}); err != nil {
+		t.Fatalf("save ec .vif: %v", err)
 	}

 	minFreeSpace := util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"}
@@ -10,7 +10,6 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
 	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
-	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
 )

 // datOwnerInfo records both the disk that holds a .dat for a given
@@ -226,6 +225,24 @@ func (s *Store) indexEcxOwners() map[ecKeyForReconcile]ecxOwnerInfo {
 // to forget the registrations the per-disk pass already emitted on
 // NewEcShardsChan during startup, instead of waiting for the first
 // periodic heartbeat to reconcile.
+// countEcShardsNodeWide returns the distinct EC shard ids for (collection, vid)
+// across every disk on this store. Shards can be split across sibling disks, so
+// a per-disk count understates a node-wide-recoverable set. Caller must not hold
+// any DiskLocation.ecVolumesLock (this takes them).
+func (s *Store) countEcShardsNodeWide(collection string, vid needle.VolumeId) int {
+	seen := make(map[erasure_coding.ShardId]struct{})
+	for _, loc := range s.Locations {
+		loc.ecVolumesLock.RLock()
+		if ev, ok := loc.ecVolumes[vid]; ok && ev.Collection == collection {
+			for _, sh := range ev.Shards {
+				seen[sh.ShardId] = struct{}{}
+			}
+		}
+		loc.ecVolumesLock.RUnlock()
+	}
+	return len(seen)
+}
+
 func (s *Store) pruneIncompleteEcWithSiblingDat() {
 	if len(s.Locations) < 2 {
 		return
@@ -251,10 +268,8 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
 		loc.ecVolumesLock.RLock()
 		for vid, ev := range loc.ecVolumes {
 			shardCount := len(ev.Shards)
-			// Use the volume's configured data-shard count, not the OSS
-			// default: a disk holding a full data set for a custom ratio
-			// (e.g. 9 shards of a 9+3 volume) is independently recoverable
-			// and must not be mistaken for a partial leftover and wiped.
+			// Use the volume's own ratio, not the OSS default, so a full
+			// custom-ratio data set (e.g. 9 of a 9+3) is not mistaken for a leftover.
 			dataShards := erasure_coding.DataShardsCount
 			if ev.ECContext != nil && ev.ECContext.DataShards > 0 {
 				dataShards = ev.ECContext.DataShards
@@ -267,16 +282,13 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
 			if !hasDat || owner.location == loc {
 				continue
 			}
-			// Credible source size: prefer .vif's encode-time size; when
-			// unknown (0) require more than a bare superblock so an empty
-			// 8-byte stub (e.g. a phantom .dat) can't pass.
-			requiredDatSize := ev.DatFileSize()
-			if requiredDatSize <= 0 {
-				requiredDatSize = int64(super_block.SuperBlockSize) + 1
-			}
-			if owner.size < requiredDatSize {
-				glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards but sibling .dat on %s is %d bytes (need >= %d); leaving partial EC in place so distributed reconstruction is still possible",
-					vid, ev.Collection, loc.Directory, shardCount, owner.location.Directory, owner.size, requiredDatSize)
+			// Delete only against a byte-exact committed source: the sibling
+			// .dat must equal the size .vif recorded at encode time. An unknown
+			// (0) or mismatched size cannot prove the .dat holds this data.
+			datFileSize := ev.DatFileSize()
+			if datFileSize <= 0 || owner.size != datFileSize {
+				glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards; sibling .dat on %s is %d bytes but .vif recorded %d (need byte-exact match); leaving partial EC in place",
+					vid, ev.Collection, loc.Directory, shardCount, owner.location.Directory, owner.size, datFileSize)
 				continue
 			}
 			victims = append(victims, victim{
@@ -291,7 +303,15 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
 		loc.ecVolumesLock.RUnlock()

 		for _, v := range victims {
-			glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards (need %d) while a healthy .dat exists on sibling disk %s; cleaning up leftover EC files (issue 9478)",
+			// Never prune when the shards are recoverable node-wide (a set
+			// split across sibling disks summing to >= dataShards); they may
+			// be sole copies of a distributed volume.
+			if nodeWide := s.countEcShardsNodeWide(v.collection, v.vid); nodeWide >= v.dataShards {
+				glog.Warningf("ec volume %d (collection=%q): %d shards present node-wide (>= %d) are independently recoverable; leaving EC in place despite a sibling .dat",
+					v.vid, v.collection, nodeWide, v.dataShards)
+				continue
+			}
+			glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards (need %d) while a byte-exact source .dat exists on sibling disk %s; cleaning up leftover EC files",
 				v.vid, v.collection, loc.Directory, v.shardCount, v.dataShards, v.datDir)
 			loc.unloadEcVolume(v.vid)
 			loc.removeEcVolumeFiles(v.collection, v.vid)