shell: stop ec.encode/ec.rebuild from destroying live EC shards (no crash needed) (#9939)

* shell: stop ec.encode/ec.rebuild from destroying live EC shards

Three operator-triggered shell paths could destroy data with no crash:

ec.encode -volumeId on an already-EC volume tore down its shards before
failing. The volume-id path never checked the id was a regular volume:
the collection lookup scans only VolumeInfos (so an EC-only id maps to
""), and volumeLocations succeeds via the EC-location fallback, so
clearPreexistingEcShards full-teardown-deleted every shard cluster-wide
before doEcEncode failed. An EC volume has no .dat, so this is its only
copy. Add assertEncodableRegularVolumes: each requested id must be a
regular volume in the topology snapshot; an EC-only or unknown id is
refused before any teardown. A volume present as both a regular .dat and
stale orphan shards (a failed-encode retry) still passes. This closes
the operator-rerun/script-retry path; a worker racing the snapshot is a
fencing problem handled separately.

ec.rebuild dry-run (the default, without -apply) still issued real
VolumeEcShardsDelete RPCs: prepareDataToRecover appended every
would-copy shard to copiedShardIds even though the copy was skipped, and
the cleanup defer deleted that set unconditionally. Now a dry-run copies
nothing and records nothing to delete (a separate would-copy counter
drives the recoverability check so the dry-run still reports its plan),
and the cleanup runs only under -apply.

ec.rebuild could also self-destruct a live shard: localShardsInfo was
overwritten per disk instead of unioned, so a shard the rebuilder holds
on a non-last disk looked remote, got copied onto itself (in-place
O_TRUNC) and then node-wide deleted. Union local shards across all
disks, and never copy/delete a shard whose only listed holder is the
rebuilder itself.

* shell: address ec destructive-guards review comments

- countLocalShards: union shards across all of the rebuilder's disks so
  slot accounting matches what prepareDataToRecover treats as local;
  first-match counting overstated slotsNeeded on multi-disk rebuilders
- VolumeEcShardsCopy: resolve SourceDataNode via
  pb.NewServerAddressFromDataNode instead of the raw node id, which may
  not be a dialable host:port
- assertEncodableRegularVolumes: skip nil DiskInfo map entries, matching
  the other topology walks in this file; rename ecOnly to hasEcShards
  since the map marks any volume with shards, not only shard-only ones
This commit is contained in:
Chris Lu
2026-06-12 22:30:17 -07:00
committed by GitHub
parent 18cdb3819b
commit 3718301599
4 changed files with 309 additions and 33 deletions
+46
View File
@@ -219,6 +219,52 @@ func collectEcNodes(commandEnv *CommandEnv, diskType types.DiskType) (ecNodes []
return collectEcNodesForDC(commandEnv, "", diskType)
}
// assertEncodableRegularVolumes rejects volume ids that are not encodable
// regular volumes in the topology snapshot: an already-EC volume (present only
// as EC shards, with no .dat) or an id absent from the cluster. Encoding an
// already-EC volume would clear its shards before failing, destroying the only
// copy. A volume present as BOTH a regular .dat and stale orphan shards (a
// failed-encode retry) passes, so the retry + orphan sweep still works.
func assertEncodableRegularVolumes(t *master_pb.TopologyInfo, vids []needle.VolumeId) error {
want := make(map[needle.VolumeId]bool, len(vids))
for _, vid := range vids {
want[vid] = true
}
regular := make(map[needle.VolumeId]bool)
hasEcShards := make(map[needle.VolumeId]bool)
for _, dc := range t.DataCenterInfos {
for _, r := range dc.RackInfos {
for _, dn := range r.DataNodeInfos {
for _, diskInfo := range dn.DiskInfos {
if diskInfo == nil {
continue
}
for _, vi := range diskInfo.VolumeInfos {
if want[needle.VolumeId(vi.Id)] {
regular[needle.VolumeId(vi.Id)] = true
}
}
for _, ec := range diskInfo.EcShardInfos {
if want[needle.VolumeId(ec.Id)] {
hasEcShards[needle.VolumeId(ec.Id)] = true
}
}
}
}
}
}
for _, vid := range vids {
if regular[vid] {
continue
}
if hasEcShards[vid] {
return fmt.Errorf("volume %d is already EC-encoded (no .dat replica); refusing to re-encode, which would destroy its shards", vid)
}
return fmt.Errorf("volume %d not found as a regular volume in the cluster; refusing to encode", vid)
}
return nil
}
// collectVolumeIdToCollection returns a map from volume ID to its collection name
func collectVolumeIdToCollection(t *master_pb.TopologyInfo, vids []needle.VolumeId) map[needle.VolumeId]string {
result := make(map[needle.VolumeId]string)
@@ -0,0 +1,193 @@
package shell
import (
"bytes"
"strings"
"testing"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
// topoWith builds a one-node topology snapshot where regularVids appear as
// regular volumes (.dat) and ecVids appear as EC shards.
func topoWith(regularVids, ecVids []uint32) *master_pb.TopologyInfo {
di := &master_pb.DiskInfo{Type: "hdd"}
for _, v := range regularVids {
di.VolumeInfos = append(di.VolumeInfos, &master_pb.VolumeInformationMessage{Id: v})
}
for _, v := range ecVids {
di.EcShardInfos = append(di.EcShardInfos, &master_pb.VolumeEcShardInformationMessage{Id: v})
}
return &master_pb.TopologyInfo{
DataCenterInfos: []*master_pb.DataCenterInfo{{
RackInfos: []*master_pb.RackInfo{{
DataNodeInfos: []*master_pb.DataNodeInfo{{
Id: "n1",
DiskInfos: map[string]*master_pb.DiskInfo{"hdd": di},
}},
}},
}},
}
}
// TestAssertEncodableRegularVolumes: ec.encode must refuse an already-EC volume
// (no .dat — re-encoding would destroy its shards) and an unknown id, while
// allowing a regular volume and a failed-encode retry (regular .dat + orphans).
func TestAssertEncodableRegularVolumes(t *testing.T) {
cases := []struct {
name string
regular, ec []uint32
vids []needle.VolumeId
wantErrSubstr string
}{
{"regular volume", []uint32{1}, nil, []needle.VolumeId{1}, ""},
{"already EC", nil, []uint32{1}, []needle.VolumeId{1}, "already EC"},
{"regular plus stale orphan shards", []uint32{1}, []uint32{1}, []needle.VolumeId{1}, ""},
{"unknown id", nil, nil, []needle.VolumeId{1}, "not found"},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
err := assertEncodableRegularVolumes(topoWith(tc.regular, tc.ec), tc.vids)
if tc.wantErrSubstr == "" {
if err != nil {
t.Fatalf("want nil error, got %v", err)
}
return
}
if err == nil || !strings.Contains(err.Error(), tc.wantErrSubstr) {
t.Fatalf("want error containing %q, got %v", tc.wantErrSubstr, err)
}
})
}
}
func newDryRunRebuilder(log *bytes.Buffer, nodes ...*EcNode) *ecRebuilder {
return &ecRebuilder{
commandEnv: &CommandEnv{env: make(map[string]string), noLock: true},
ecNodes: nodes,
writer: log,
applyChanges: false,
}
}
// TestPrepareDataToRecover_DryRunRecoverableNoSideEffects: a dry-run on a
// recoverable degraded volume the rebuilder holds none of must still succeed
// and report its plan, while recording zero shards to delete (so the deferred
// cleanup issues no VolumeEcShardsDelete RPC).
func TestPrepareDataToRecover_DryRunRecoverableNoSideEffects(t *testing.T) {
var log bytes.Buffer
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100)
remote := newEcNode("dc1", "rack1", "remote", 100).
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11})
erb := newDryRunRebuilder(&log, rebuilder, remote)
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
for i := 0; i < 12; i++ {
locations[i] = []*EcNode{remote}
}
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
if err != nil {
t.Fatalf("dry-run on a recoverable volume must not error: %v", err)
}
if len(copied) != 0 {
t.Errorf("dry-run must record no copied shards (deletion set), got %v", copied)
}
if len(local) != 0 {
t.Errorf("rebuilder holds no shards locally, got %v", local)
}
if !strings.Contains(log.String(), "would copy") {
t.Errorf("dry-run should report the would-copy plan, got:\n%s", log.String())
}
}
// TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks: shards the rebuilder
// holds on more than one disk must all count as local (not copied), so the
// per-disk overwrite that made one disk's shards look remote (then self-copied
// with O_TRUNC and node-wide deleted) cannot happen.
func TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks(t *testing.T) {
var log bytes.Buffer
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
remote := newEcNode("dc1", "rack1", "remote", 100)
erb := newDryRunRebuilder(&log, rebuilder)
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
for i := 0; i < 12; i++ {
locations[i] = []*EcNode{remote}
}
locations[0] = []*EcNode{rebuilder}
locations[1] = []*EcNode{rebuilder}
locations[5] = []*EcNode{rebuilder}
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
missing := map[erasure_coding.ShardId]bool{0: true, 1: true, 5: true}
for _, s := range local {
delete(missing, s)
}
if len(missing) != 0 {
t.Errorf("shards %v should be local (union across disks); local=%v", missing, local)
}
for _, s := range copied {
if s == 0 || s == 1 || s == 5 {
t.Errorf("local shard %d must never be in the copy/delete set", s)
}
}
}
// TestCountLocalShards_UnionsAcrossDisks: slot accounting must union shards
// across the node's disks like prepareDataToRecover does, or slotsNeeded is
// overstated and selectAndReserveRebuilder can reject a valid node.
func TestCountLocalShards_UnionsAcrossDisks(t *testing.T) {
var log bytes.Buffer
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
erb := newDryRunRebuilder(&log, rebuilder)
if got := erb.countLocalShards(rebuilder, "c1", needle.VolumeId(1)); got != 3 {
t.Errorf("countLocalShards must union across disks: want 3, got %d", got)
}
}
// TestPrepareDataToRecover_SelfSourceNotCopied: if the rebuilder is the only
// listed holder of a shard it does not report locally, it must be treated as
// local rather than copied onto itself and then deleted.
func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) {
var log bytes.Buffer
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 6, 7, 8, 9, 10})
erb := newDryRunRebuilder(&log, rebuilder)
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
for _, s := range []int{0, 1, 2, 3, 4, 6, 7, 8, 9, 10} {
locations[s] = []*EcNode{rebuilder}
}
locations[5] = []*EcNode{rebuilder} // sole holder is the rebuilder, not in its reported shards
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
for _, s := range copied {
if s == 5 {
t.Errorf("self-sourced shard 5 must not be copied/deleted")
}
}
found := false
for _, s := range local {
if s == 5 {
found = true
}
}
if !found {
t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local)
}
}
+9
View File
@@ -160,6 +160,15 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
return nil
}
// Refuse to encode a volume that is already EC (present only as shards):
// an EC volume has no .dat, so re-encoding it would tear down its only
// copy before failing. A regular volume (with a .dat) passes. This closes
// the operator-rerun / script-retry path; a worker racing the snapshot is
// handled by encode fencing, not here.
if err = assertEncodableRegularVolumes(topologyInfo, volumeIds); err != nil {
return err
}
// Collect volume ID to collection name mapping for the sync operation
volumeIdToCollection := collectVolumeIdToCollection(topologyInfo, volumeIds)
+51 -23
View File
@@ -145,15 +145,18 @@ func (erb *ecRebuilder) isLocked() bool {
}
// countLocalShards returns the number of shards already present locally on the node for the given volume.
// Unions across all of the node's disks, like prepareDataToRecover, so slot
// accounting matches what the rebuild will actually treat as local.
func (erb *ecRebuilder) countLocalShards(node *EcNode, collection string, volumeId needle.VolumeId) int {
localShardsInfo := erasure_coding.NewShardsInfo()
for _, diskInfo := range node.info.DiskInfos {
for _, ecShardInfo := range diskInfo.EcShardInfos {
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
return erasure_coding.GetShardCount(ecShardInfo)
localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
}
}
}
return 0
return localShardsInfo.Count()
}
// selectAndReserveRebuilder atomically selects a rebuilder node with sufficient free slots
@@ -266,14 +269,16 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
return err
}
defer func() {
// clean up working files
// ask the rebuilder to delete the copied shards
err = sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds)
if err != nil {
erb.write("%s delete copied ec shards %s %d.%v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds)
// Clean up the working copies this run actually made. In dry-run
// nothing was copied (copiedShardIds is empty), so this issues no
// delete RPC. Use a local error so a cleanup failure cannot mask a
// successful rebuild's return value.
if !erb.applyChanges || len(copiedShardIds) == 0 {
return
}
if derr := sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds); derr != nil {
erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr)
}
}()
if !erb.applyChanges {
@@ -323,7 +328,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
for _, ecShardInfo := range diskInfo.EcShardInfos {
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
needEcxFile = false
localShardsInfo = erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo)
// Union across disks: the rebuilder may hold this volume's
// shards on more than one disk. Overwriting per-disk would
// make a shard on a non-last disk look remote and get copied
// onto itself (O_TRUNC) and then node-wide deleted.
localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
}
}
}
@@ -335,6 +344,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
}
}
// wouldCopy counts shards available on a remote holder that the rebuilder
// could pull. It drives the recoverability gate below independently of
// whether we actually copy (dry-run copies nothing), so a dry-run still
// reports the plan instead of erroring "not enough shards".
wouldCopy := 0
for i := 0; i < targetShardCount; i++ {
ecNodes := locations[i]
shardId := erasure_coding.ShardId(i)
@@ -349,9 +363,23 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
continue
}
var copyErr error
if erb.applyChanges {
copyErr = operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
// The rebuilder is itself the only listed holder: never copy a shard
// onto itself (the in-place O_TRUNC would destroy it) nor schedule it
// for the post-rebuild delete. Treat it as already local.
if ecNodes[0].info.Id == rebuilder.info.Id {
localShardIds = append(localShardIds, shardId)
erb.write("use existing shard %d.%d (already on rebuilder)\n", volumeId, shardId)
continue
}
wouldCopy++
if !erb.applyChanges {
erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id)
continue
}
copyErr := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
_, copyErr := volumeServerClient.VolumeEcShardsCopy(context.Background(), &volume_server_pb.VolumeEcShardsCopyRequest{
VolumeId: uint32(volumeId),
Collection: collection,
@@ -359,28 +387,28 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
CopyEcxFile: needEcxFile,
CopyEcjFile: true,
CopyVifFile: needEcxFile,
SourceDataNode: ecNodes[0].info.Id,
SourceDataNode: string(pb.NewServerAddressFromDataNode(ecNodes[0].info)),
})
return copyErr
})
if copyErr == nil && needEcxFile {
needEcxFile = false
}
}
if copyErr != nil {
erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
} else {
continue
}
if needEcxFile {
needEcxFile = false
}
erb.write("%s copied %d.%d from %s\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id)
// Only shards this run actually copied are temp working files to be
// deleted afterward; never a pre-existing local or remote shard.
copiedShardIds = append(copiedShardIds, shardId)
}
}
if len(copiedShardIds)+len(localShardIds) >= erasure_coding.DataShardsCount {
if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount {
return copiedShardIds, localShardIds, nil
}
return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(copiedShardIds)+len(localShardIds), volumeId)
return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId)
}