shell: stop ec.encode/ec.rebuild from destroying live EC shards (no crash needed) (#9939)

* shell: stop ec.encode/ec.rebuild from destroying live EC shards Three operator-triggered shell paths could destroy data with no crash: ec.encode -volumeId on an already-EC volume tore down its shards before failing. The volume-id path never checked the id was a regular volume: the collection lookup scans only VolumeInfos (so an EC-only id maps to ""), and volumeLocations succeeds via the EC-location fallback, so clearPreexistingEcShards full-teardown-deleted every shard cluster-wide before doEcEncode failed. An EC volume has no .dat, so this is its only copy. Add assertEncodableRegularVolumes: each requested id must be a regular volume in the topology snapshot; an EC-only or unknown id is refused before any teardown. A volume present as both a regular .dat and stale orphan shards (a failed-encode retry) still passes. This closes the operator-rerun/script-retry path; a worker racing the snapshot is a fencing problem handled separately. ec.rebuild dry-run (the default, without -apply) still issued real VolumeEcShardsDelete RPCs: prepareDataToRecover appended every would-copy shard to copiedShardIds even though the copy was skipped, and the cleanup defer deleted that set unconditionally. Now a dry-run copies nothing and records nothing to delete (a separate would-copy counter drives the recoverability check so the dry-run still reports its plan), and the cleanup runs only under -apply. ec.rebuild could also self-destruct a live shard: localShardsInfo was overwritten per disk instead of unioned, so a shard the rebuilder holds on a non-last disk looked remote, got copied onto itself (in-place O_TRUNC) and then node-wide deleted. Union local shards across all disks, and never copy/delete a shard whose only listed holder is the rebuilder itself. * shell: address ec destructive-guards review comments - countLocalShards: union shards across all of the rebuilder's disks so slot accounting matches what prepareDataToRecover treats as local; first-match counting overstated slotsNeeded on multi-disk rebuilders - VolumeEcShardsCopy: resolve SourceDataNode via pb.NewServerAddressFromDataNode instead of the raw node id, which may not be a dialable host:port - assertEncodableRegularVolumes: skip nil DiskInfo map entries, matching the other topology walks in this file; rename ecOnly to hasEcShards since the map marks any volume with shards, not only shard-only ones
2026-06-13 23:36:45 +03:00 · 2026-06-12 22:30:17 -07:00
parent 18cdb3819b
commit 3718301599
4 changed files with 309 additions and 33 deletions
@@ -219,6 +219,52 @@ func collectEcNodes(commandEnv *CommandEnv, diskType types.DiskType) (ecNodes []
 	return collectEcNodesForDC(commandEnv, "", diskType)
 }

+// assertEncodableRegularVolumes rejects volume ids that are not encodable
+// regular volumes in the topology snapshot: an already-EC volume (present only
+// as EC shards, with no .dat) or an id absent from the cluster. Encoding an
+// already-EC volume would clear its shards before failing, destroying the only
+// copy. A volume present as BOTH a regular .dat and stale orphan shards (a
+// failed-encode retry) passes, so the retry + orphan sweep still works.
+func assertEncodableRegularVolumes(t *master_pb.TopologyInfo, vids []needle.VolumeId) error {
+	want := make(map[needle.VolumeId]bool, len(vids))
+	for _, vid := range vids {
+		want[vid] = true
+	}
+	regular := make(map[needle.VolumeId]bool)
+	hasEcShards := make(map[needle.VolumeId]bool)
+	for _, dc := range t.DataCenterInfos {
+		for _, r := range dc.RackInfos {
+			for _, dn := range r.DataNodeInfos {
+				for _, diskInfo := range dn.DiskInfos {
+					if diskInfo == nil {
+						continue
+					}
+					for _, vi := range diskInfo.VolumeInfos {
+						if want[needle.VolumeId(vi.Id)] {
+							regular[needle.VolumeId(vi.Id)] = true
+						}
+					}
+					for _, ec := range diskInfo.EcShardInfos {
+						if want[needle.VolumeId(ec.Id)] {
+							hasEcShards[needle.VolumeId(ec.Id)] = true
+						}
+					}
+				}
+			}
+		}
+	}
+	for _, vid := range vids {
+		if regular[vid] {
+			continue
+		}
+		if hasEcShards[vid] {
+			return fmt.Errorf("volume %d is already EC-encoded (no .dat replica); refusing to re-encode, which would destroy its shards", vid)
+		}
+		return fmt.Errorf("volume %d not found as a regular volume in the cluster; refusing to encode", vid)
+	}
+	return nil
+}
+
 // collectVolumeIdToCollection returns a map from volume ID to its collection name
 func collectVolumeIdToCollection(t *master_pb.TopologyInfo, vids []needle.VolumeId) map[needle.VolumeId]string {
 	result := make(map[needle.VolumeId]string)
@@ -0,0 +1,193 @@
+package shell
+
+import (
+	"bytes"
+	"strings"
+	"testing"
+
+	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
+	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
+	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+)
+
+// topoWith builds a one-node topology snapshot where regularVids appear as
+// regular volumes (.dat) and ecVids appear as EC shards.
+func topoWith(regularVids, ecVids []uint32) *master_pb.TopologyInfo {
+	di := &master_pb.DiskInfo{Type: "hdd"}
+	for _, v := range regularVids {
+		di.VolumeInfos = append(di.VolumeInfos, &master_pb.VolumeInformationMessage{Id: v})
+	}
+	for _, v := range ecVids {
+		di.EcShardInfos = append(di.EcShardInfos, &master_pb.VolumeEcShardInformationMessage{Id: v})
+	}
+	return &master_pb.TopologyInfo{
+		DataCenterInfos: []*master_pb.DataCenterInfo{{
+			RackInfos: []*master_pb.RackInfo{{
+				DataNodeInfos: []*master_pb.DataNodeInfo{{
+					Id:        "n1",
+					DiskInfos: map[string]*master_pb.DiskInfo{"hdd": di},
+				}},
+			}},
+		}},
+	}
+}
+
+// TestAssertEncodableRegularVolumes: ec.encode must refuse an already-EC volume
+// (no .dat — re-encoding would destroy its shards) and an unknown id, while
+// allowing a regular volume and a failed-encode retry (regular .dat + orphans).
+func TestAssertEncodableRegularVolumes(t *testing.T) {
+	cases := []struct {
+		name          string
+		regular, ec   []uint32
+		vids          []needle.VolumeId
+		wantErrSubstr string
+	}{
+		{"regular volume", []uint32{1}, nil, []needle.VolumeId{1}, ""},
+		{"already EC", nil, []uint32{1}, []needle.VolumeId{1}, "already EC"},
+		{"regular plus stale orphan shards", []uint32{1}, []uint32{1}, []needle.VolumeId{1}, ""},
+		{"unknown id", nil, nil, []needle.VolumeId{1}, "not found"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := assertEncodableRegularVolumes(topoWith(tc.regular, tc.ec), tc.vids)
+			if tc.wantErrSubstr == "" {
+				if err != nil {
+					t.Fatalf("want nil error, got %v", err)
+				}
+				return
+			}
+			if err == nil || !strings.Contains(err.Error(), tc.wantErrSubstr) {
+				t.Fatalf("want error containing %q, got %v", tc.wantErrSubstr, err)
+			}
+		})
+	}
+}
+
+func newDryRunRebuilder(log *bytes.Buffer, nodes ...*EcNode) *ecRebuilder {
+	return &ecRebuilder{
+		commandEnv:   &CommandEnv{env: make(map[string]string), noLock: true},
+		ecNodes:      nodes,
+		writer:       log,
+		applyChanges: false,
+	}
+}
+
+// TestPrepareDataToRecover_DryRunRecoverableNoSideEffects: a dry-run on a
+// recoverable degraded volume the rebuilder holds none of must still succeed
+// and report its plan, while recording zero shards to delete (so the deferred
+// cleanup issues no VolumeEcShardsDelete RPC).
+func TestPrepareDataToRecover_DryRunRecoverableNoSideEffects(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100)
+	remote := newEcNode("dc1", "rack1", "remote", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11})
+	erb := newDryRunRebuilder(&log, rebuilder, remote)
+
+	locations := make(EcShardLocations, erasure_coding.MaxShardCount)
+	for i := 0; i < 12; i++ {
+		locations[i] = []*EcNode{remote}
+	}
+
+	copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
+	if err != nil {
+		t.Fatalf("dry-run on a recoverable volume must not error: %v", err)
+	}
+	if len(copied) != 0 {
+		t.Errorf("dry-run must record no copied shards (deletion set), got %v", copied)
+	}
+	if len(local) != 0 {
+		t.Errorf("rebuilder holds no shards locally, got %v", local)
+	}
+	if !strings.Contains(log.String(), "would copy") {
+		t.Errorf("dry-run should report the would-copy plan, got:\n%s", log.String())
+	}
+}
+
+// TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks: shards the rebuilder
+// holds on more than one disk must all count as local (not copied), so the
+// per-disk overwrite that made one disk's shards look remote (then self-copied
+// with O_TRUNC and node-wide deleted) cannot happen.
+func TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
+		addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
+	remote := newEcNode("dc1", "rack1", "remote", 100)
+	erb := newDryRunRebuilder(&log, rebuilder)
+
+	locations := make(EcShardLocations, erasure_coding.MaxShardCount)
+	for i := 0; i < 12; i++ {
+		locations[i] = []*EcNode{remote}
+	}
+	locations[0] = []*EcNode{rebuilder}
+	locations[1] = []*EcNode{rebuilder}
+	locations[5] = []*EcNode{rebuilder}
+
+	copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	missing := map[erasure_coding.ShardId]bool{0: true, 1: true, 5: true}
+	for _, s := range local {
+		delete(missing, s)
+	}
+	if len(missing) != 0 {
+		t.Errorf("shards %v should be local (union across disks); local=%v", missing, local)
+	}
+	for _, s := range copied {
+		if s == 0 || s == 1 || s == 5 {
+			t.Errorf("local shard %d must never be in the copy/delete set", s)
+		}
+	}
+}
+
+// TestCountLocalShards_UnionsAcrossDisks: slot accounting must union shards
+// across the node's disks like prepareDataToRecover does, or slotsNeeded is
+// overstated and selectAndReserveRebuilder can reject a valid node.
+func TestCountLocalShards_UnionsAcrossDisks(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
+		addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
+	erb := newDryRunRebuilder(&log, rebuilder)
+
+	if got := erb.countLocalShards(rebuilder, "c1", needle.VolumeId(1)); got != 3 {
+		t.Errorf("countLocalShards must union across disks: want 3, got %d", got)
+	}
+}
+
+// TestPrepareDataToRecover_SelfSourceNotCopied: if the rebuilder is the only
+// listed holder of a shard it does not report locally, it must be treated as
+// local rather than copied onto itself and then deleted.
+func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 6, 7, 8, 9, 10})
+	erb := newDryRunRebuilder(&log, rebuilder)
+
+	locations := make(EcShardLocations, erasure_coding.MaxShardCount)
+	for _, s := range []int{0, 1, 2, 3, 4, 6, 7, 8, 9, 10} {
+		locations[s] = []*EcNode{rebuilder}
+	}
+	locations[5] = []*EcNode{rebuilder} // sole holder is the rebuilder, not in its reported shards
+
+	copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	for _, s := range copied {
+		if s == 5 {
+			t.Errorf("self-sourced shard 5 must not be copied/deleted")
+		}
+	}
+	found := false
+	for _, s := range local {
+		if s == 5 {
+			found = true
+		}
+	}
+	if !found {
+		t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local)
+	}
+}
@@ -160,6 +160,15 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
 		return nil
 	}

+	// Refuse to encode a volume that is already EC (present only as shards):
+	// an EC volume has no .dat, so re-encoding it would tear down its only
+	// copy before failing. A regular volume (with a .dat) passes. This closes
+	// the operator-rerun / script-retry path; a worker racing the snapshot is
+	// handled by encode fencing, not here.
+	if err = assertEncodableRegularVolumes(topologyInfo, volumeIds); err != nil {
+		return err
+	}
+
 	// Collect volume ID to collection name mapping for the sync operation
 	volumeIdToCollection := collectVolumeIdToCollection(topologyInfo, volumeIds)

@@ -145,15 +145,18 @@ func (erb *ecRebuilder) isLocked() bool {
 }

 // countLocalShards returns the number of shards already present locally on the node for the given volume.
+// Unions across all of the node's disks, like prepareDataToRecover, so slot
+// accounting matches what the rebuild will actually treat as local.
 func (erb *ecRebuilder) countLocalShards(node *EcNode, collection string, volumeId needle.VolumeId) int {
+	localShardsInfo := erasure_coding.NewShardsInfo()
 	for _, diskInfo := range node.info.DiskInfos {
 		for _, ecShardInfo := range diskInfo.EcShardInfos {
 			if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
-				return erasure_coding.GetShardCount(ecShardInfo)
+				localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
 			}
 		}
 	}
-	return 0
+	return localShardsInfo.Count()
 }

 // selectAndReserveRebuilder atomically selects a rebuilder node with sufficient free slots
@@ -266,14 +269,16 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
 		return err
 	}
 	defer func() {
-		// clean up working files
-
-		// ask the rebuilder to delete the copied shards
-		err = sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds)
-		if err != nil {
-			erb.write("%s delete copied ec shards %s %d.%v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds)
+		// Clean up the working copies this run actually made. In dry-run
+		// nothing was copied (copiedShardIds is empty), so this issues no
+		// delete RPC. Use a local error so a cleanup failure cannot mask a
+		// successful rebuild's return value.
+		if !erb.applyChanges || len(copiedShardIds) == 0 {
+			return
+		}
+		if derr := sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds); derr != nil {
+			erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr)
 		}
-
 	}()

 	if !erb.applyChanges {
@@ -323,7 +328,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		for _, ecShardInfo := range diskInfo.EcShardInfos {
 			if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
 				needEcxFile = false
-				localShardsInfo = erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo)
+				// Union across disks: the rebuilder may hold this volume's
+				// shards on more than one disk. Overwriting per-disk would
+				// make a shard on a non-last disk look remote and get copied
+				// onto itself (O_TRUNC) and then node-wide deleted.
+				localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
 			}
 		}
 	}
@@ -335,6 +344,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		}
 	}

+	// wouldCopy counts shards available on a remote holder that the rebuilder
+	// could pull. It drives the recoverability gate below independently of
+	// whether we actually copy (dry-run copies nothing), so a dry-run still
+	// reports the plan instead of erroring "not enough shards".
+	wouldCopy := 0
 	for i := 0; i < targetShardCount; i++ {
 		ecNodes := locations[i]
 		shardId := erasure_coding.ShardId(i)
@@ -349,9 +363,23 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 			continue
 		}

-		var copyErr error
-		if erb.applyChanges {
-			copyErr = operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+		// The rebuilder is itself the only listed holder: never copy a shard
+		// onto itself (the in-place O_TRUNC would destroy it) nor schedule it
+		// for the post-rebuild delete. Treat it as already local.
+		if ecNodes[0].info.Id == rebuilder.info.Id {
+			localShardIds = append(localShardIds, shardId)
+			erb.write("use existing shard %d.%d (already on rebuilder)\n", volumeId, shardId)
+			continue
+		}
+
+		wouldCopy++
+
+		if !erb.applyChanges {
+			erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id)
+			continue
+		}
+
+		copyErr := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
 			_, copyErr := volumeServerClient.VolumeEcShardsCopy(context.Background(), &volume_server_pb.VolumeEcShardsCopyRequest{
 				VolumeId:       uint32(volumeId),
 				Collection:     collection,
@@ -359,28 +387,28 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 				CopyEcxFile:    needEcxFile,
 				CopyEcjFile:    true,
 				CopyVifFile:    needEcxFile,
-					SourceDataNode: ecNodes[0].info.Id,
+				SourceDataNode: string(pb.NewServerAddressFromDataNode(ecNodes[0].info)),
 			})
 			return copyErr
 		})
-			if copyErr == nil && needEcxFile {
-				needEcxFile = false
-			}
-		}
 		if copyErr != nil {
 			erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
-		} else {
+			continue
+		}
+		if needEcxFile {
+			needEcxFile = false
+		}
 		erb.write("%s copied %d.%d from %s\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id)
+		// Only shards this run actually copied are temp working files to be
+		// deleted afterward; never a pre-existing local or remote shard.
 		copiedShardIds = append(copiedShardIds, shardId)
 	}

-	}
-
-	if len(copiedShardIds)+len(localShardIds) >= erasure_coding.DataShardsCount {
+	if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount {
 		return copiedShardIds, localShardIds, nil
 	}

-	return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(copiedShardIds)+len(localShardIds), volumeId)
+	return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId)

 }