mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
shell: stop ec.encode/ec.rebuild from destroying live EC shards (no crash needed) (#9939)
* shell: stop ec.encode/ec.rebuild from destroying live EC shards Three operator-triggered shell paths could destroy data with no crash: ec.encode -volumeId on an already-EC volume tore down its shards before failing. The volume-id path never checked the id was a regular volume: the collection lookup scans only VolumeInfos (so an EC-only id maps to ""), and volumeLocations succeeds via the EC-location fallback, so clearPreexistingEcShards full-teardown-deleted every shard cluster-wide before doEcEncode failed. An EC volume has no .dat, so this is its only copy. Add assertEncodableRegularVolumes: each requested id must be a regular volume in the topology snapshot; an EC-only or unknown id is refused before any teardown. A volume present as both a regular .dat and stale orphan shards (a failed-encode retry) still passes. This closes the operator-rerun/script-retry path; a worker racing the snapshot is a fencing problem handled separately. ec.rebuild dry-run (the default, without -apply) still issued real VolumeEcShardsDelete RPCs: prepareDataToRecover appended every would-copy shard to copiedShardIds even though the copy was skipped, and the cleanup defer deleted that set unconditionally. Now a dry-run copies nothing and records nothing to delete (a separate would-copy counter drives the recoverability check so the dry-run still reports its plan), and the cleanup runs only under -apply. ec.rebuild could also self-destruct a live shard: localShardsInfo was overwritten per disk instead of unioned, so a shard the rebuilder holds on a non-last disk looked remote, got copied onto itself (in-place O_TRUNC) and then node-wide deleted. Union local shards across all disks, and never copy/delete a shard whose only listed holder is the rebuilder itself. * shell: address ec destructive-guards review comments - countLocalShards: union shards across all of the rebuilder's disks so slot accounting matches what prepareDataToRecover treats as local; first-match counting overstated slotsNeeded on multi-disk rebuilders - VolumeEcShardsCopy: resolve SourceDataNode via pb.NewServerAddressFromDataNode instead of the raw node id, which may not be a dialable host:port - assertEncodableRegularVolumes: skip nil DiskInfo map entries, matching the other topology walks in this file; rename ecOnly to hasEcShards since the map marks any volume with shards, not only shard-only ones
This commit is contained in:
@@ -219,6 +219,52 @@ func collectEcNodes(commandEnv *CommandEnv, diskType types.DiskType) (ecNodes []
|
||||
return collectEcNodesForDC(commandEnv, "", diskType)
|
||||
}
|
||||
|
||||
// assertEncodableRegularVolumes rejects volume ids that are not encodable
|
||||
// regular volumes in the topology snapshot: an already-EC volume (present only
|
||||
// as EC shards, with no .dat) or an id absent from the cluster. Encoding an
|
||||
// already-EC volume would clear its shards before failing, destroying the only
|
||||
// copy. A volume present as BOTH a regular .dat and stale orphan shards (a
|
||||
// failed-encode retry) passes, so the retry + orphan sweep still works.
|
||||
func assertEncodableRegularVolumes(t *master_pb.TopologyInfo, vids []needle.VolumeId) error {
|
||||
want := make(map[needle.VolumeId]bool, len(vids))
|
||||
for _, vid := range vids {
|
||||
want[vid] = true
|
||||
}
|
||||
regular := make(map[needle.VolumeId]bool)
|
||||
hasEcShards := make(map[needle.VolumeId]bool)
|
||||
for _, dc := range t.DataCenterInfos {
|
||||
for _, r := range dc.RackInfos {
|
||||
for _, dn := range r.DataNodeInfos {
|
||||
for _, diskInfo := range dn.DiskInfos {
|
||||
if diskInfo == nil {
|
||||
continue
|
||||
}
|
||||
for _, vi := range diskInfo.VolumeInfos {
|
||||
if want[needle.VolumeId(vi.Id)] {
|
||||
regular[needle.VolumeId(vi.Id)] = true
|
||||
}
|
||||
}
|
||||
for _, ec := range diskInfo.EcShardInfos {
|
||||
if want[needle.VolumeId(ec.Id)] {
|
||||
hasEcShards[needle.VolumeId(ec.Id)] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, vid := range vids {
|
||||
if regular[vid] {
|
||||
continue
|
||||
}
|
||||
if hasEcShards[vid] {
|
||||
return fmt.Errorf("volume %d is already EC-encoded (no .dat replica); refusing to re-encode, which would destroy its shards", vid)
|
||||
}
|
||||
return fmt.Errorf("volume %d not found as a regular volume in the cluster; refusing to encode", vid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectVolumeIdToCollection returns a map from volume ID to its collection name
|
||||
func collectVolumeIdToCollection(t *master_pb.TopologyInfo, vids []needle.VolumeId) map[needle.VolumeId]string {
|
||||
result := make(map[needle.VolumeId]string)
|
||||
|
||||
@@ -0,0 +1,193 @@
|
||||
package shell
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
||||
)
|
||||
|
||||
// topoWith builds a one-node topology snapshot where regularVids appear as
|
||||
// regular volumes (.dat) and ecVids appear as EC shards.
|
||||
func topoWith(regularVids, ecVids []uint32) *master_pb.TopologyInfo {
|
||||
di := &master_pb.DiskInfo{Type: "hdd"}
|
||||
for _, v := range regularVids {
|
||||
di.VolumeInfos = append(di.VolumeInfos, &master_pb.VolumeInformationMessage{Id: v})
|
||||
}
|
||||
for _, v := range ecVids {
|
||||
di.EcShardInfos = append(di.EcShardInfos, &master_pb.VolumeEcShardInformationMessage{Id: v})
|
||||
}
|
||||
return &master_pb.TopologyInfo{
|
||||
DataCenterInfos: []*master_pb.DataCenterInfo{{
|
||||
RackInfos: []*master_pb.RackInfo{{
|
||||
DataNodeInfos: []*master_pb.DataNodeInfo{{
|
||||
Id: "n1",
|
||||
DiskInfos: map[string]*master_pb.DiskInfo{"hdd": di},
|
||||
}},
|
||||
}},
|
||||
}},
|
||||
}
|
||||
}
|
||||
|
||||
// TestAssertEncodableRegularVolumes: ec.encode must refuse an already-EC volume
|
||||
// (no .dat — re-encoding would destroy its shards) and an unknown id, while
|
||||
// allowing a regular volume and a failed-encode retry (regular .dat + orphans).
|
||||
func TestAssertEncodableRegularVolumes(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
regular, ec []uint32
|
||||
vids []needle.VolumeId
|
||||
wantErrSubstr string
|
||||
}{
|
||||
{"regular volume", []uint32{1}, nil, []needle.VolumeId{1}, ""},
|
||||
{"already EC", nil, []uint32{1}, []needle.VolumeId{1}, "already EC"},
|
||||
{"regular plus stale orphan shards", []uint32{1}, []uint32{1}, []needle.VolumeId{1}, ""},
|
||||
{"unknown id", nil, nil, []needle.VolumeId{1}, "not found"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := assertEncodableRegularVolumes(topoWith(tc.regular, tc.ec), tc.vids)
|
||||
if tc.wantErrSubstr == "" {
|
||||
if err != nil {
|
||||
t.Fatalf("want nil error, got %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err == nil || !strings.Contains(err.Error(), tc.wantErrSubstr) {
|
||||
t.Fatalf("want error containing %q, got %v", tc.wantErrSubstr, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func newDryRunRebuilder(log *bytes.Buffer, nodes ...*EcNode) *ecRebuilder {
|
||||
return &ecRebuilder{
|
||||
commandEnv: &CommandEnv{env: make(map[string]string), noLock: true},
|
||||
ecNodes: nodes,
|
||||
writer: log,
|
||||
applyChanges: false,
|
||||
}
|
||||
}
|
||||
|
||||
// TestPrepareDataToRecover_DryRunRecoverableNoSideEffects: a dry-run on a
|
||||
// recoverable degraded volume the rebuilder holds none of must still succeed
|
||||
// and report its plan, while recording zero shards to delete (so the deferred
|
||||
// cleanup issues no VolumeEcShardsDelete RPC).
|
||||
func TestPrepareDataToRecover_DryRunRecoverableNoSideEffects(t *testing.T) {
|
||||
var log bytes.Buffer
|
||||
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100)
|
||||
remote := newEcNode("dc1", "rack1", "remote", 100).
|
||||
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11})
|
||||
erb := newDryRunRebuilder(&log, rebuilder, remote)
|
||||
|
||||
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
|
||||
for i := 0; i < 12; i++ {
|
||||
locations[i] = []*EcNode{remote}
|
||||
}
|
||||
|
||||
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
|
||||
if err != nil {
|
||||
t.Fatalf("dry-run on a recoverable volume must not error: %v", err)
|
||||
}
|
||||
if len(copied) != 0 {
|
||||
t.Errorf("dry-run must record no copied shards (deletion set), got %v", copied)
|
||||
}
|
||||
if len(local) != 0 {
|
||||
t.Errorf("rebuilder holds no shards locally, got %v", local)
|
||||
}
|
||||
if !strings.Contains(log.String(), "would copy") {
|
||||
t.Errorf("dry-run should report the would-copy plan, got:\n%s", log.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks: shards the rebuilder
|
||||
// holds on more than one disk must all count as local (not copied), so the
|
||||
// per-disk overwrite that made one disk's shards look remote (then self-copied
|
||||
// with O_TRUNC and node-wide deleted) cannot happen.
|
||||
func TestPrepareDataToRecover_UnionsLocalShardsAcrossDisks(t *testing.T) {
|
||||
var log bytes.Buffer
|
||||
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
|
||||
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
|
||||
addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
|
||||
remote := newEcNode("dc1", "rack1", "remote", 100)
|
||||
erb := newDryRunRebuilder(&log, rebuilder)
|
||||
|
||||
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
|
||||
for i := 0; i < 12; i++ {
|
||||
locations[i] = []*EcNode{remote}
|
||||
}
|
||||
locations[0] = []*EcNode{rebuilder}
|
||||
locations[1] = []*EcNode{rebuilder}
|
||||
locations[5] = []*EcNode{rebuilder}
|
||||
|
||||
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
missing := map[erasure_coding.ShardId]bool{0: true, 1: true, 5: true}
|
||||
for _, s := range local {
|
||||
delete(missing, s)
|
||||
}
|
||||
if len(missing) != 0 {
|
||||
t.Errorf("shards %v should be local (union across disks); local=%v", missing, local)
|
||||
}
|
||||
for _, s := range copied {
|
||||
if s == 0 || s == 1 || s == 5 {
|
||||
t.Errorf("local shard %d must never be in the copy/delete set", s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestCountLocalShards_UnionsAcrossDisks: slot accounting must union shards
|
||||
// across the node's disks like prepareDataToRecover does, or slotsNeeded is
|
||||
// overstated and selectAndReserveRebuilder can reject a valid node.
|
||||
func TestCountLocalShards_UnionsAcrossDisks(t *testing.T) {
|
||||
var log bytes.Buffer
|
||||
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
|
||||
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1}).
|
||||
addEcVolumeShards(needle.VolumeId(1), "c1", []erasure_coding.ShardId{5}, types.SsdType)
|
||||
erb := newDryRunRebuilder(&log, rebuilder)
|
||||
|
||||
if got := erb.countLocalShards(rebuilder, "c1", needle.VolumeId(1)); got != 3 {
|
||||
t.Errorf("countLocalShards must union across disks: want 3, got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestPrepareDataToRecover_SelfSourceNotCopied: if the rebuilder is the only
|
||||
// listed holder of a shard it does not report locally, it must be treated as
|
||||
// local rather than copied onto itself and then deleted.
|
||||
func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) {
|
||||
var log bytes.Buffer
|
||||
rebuilder := newEcNode("dc1", "rack1", "rebuilder", 100).
|
||||
addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 6, 7, 8, 9, 10})
|
||||
erb := newDryRunRebuilder(&log, rebuilder)
|
||||
|
||||
locations := make(EcShardLocations, erasure_coding.MaxShardCount)
|
||||
for _, s := range []int{0, 1, 2, 3, 4, 6, 7, 8, 9, 10} {
|
||||
locations[s] = []*EcNode{rebuilder}
|
||||
}
|
||||
locations[5] = []*EcNode{rebuilder} // sole holder is the rebuilder, not in its reported shards
|
||||
|
||||
copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
for _, s := range copied {
|
||||
if s == 5 {
|
||||
t.Errorf("self-sourced shard 5 must not be copied/deleted")
|
||||
}
|
||||
}
|
||||
found := false
|
||||
for _, s := range local {
|
||||
if s == 5 {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local)
|
||||
}
|
||||
}
|
||||
@@ -160,6 +160,15 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
|
||||
return nil
|
||||
}
|
||||
|
||||
// Refuse to encode a volume that is already EC (present only as shards):
|
||||
// an EC volume has no .dat, so re-encoding it would tear down its only
|
||||
// copy before failing. A regular volume (with a .dat) passes. This closes
|
||||
// the operator-rerun / script-retry path; a worker racing the snapshot is
|
||||
// handled by encode fencing, not here.
|
||||
if err = assertEncodableRegularVolumes(topologyInfo, volumeIds); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Collect volume ID to collection name mapping for the sync operation
|
||||
volumeIdToCollection := collectVolumeIdToCollection(topologyInfo, volumeIds)
|
||||
|
||||
|
||||
@@ -145,15 +145,18 @@ func (erb *ecRebuilder) isLocked() bool {
|
||||
}
|
||||
|
||||
// countLocalShards returns the number of shards already present locally on the node for the given volume.
|
||||
// Unions across all of the node's disks, like prepareDataToRecover, so slot
|
||||
// accounting matches what the rebuild will actually treat as local.
|
||||
func (erb *ecRebuilder) countLocalShards(node *EcNode, collection string, volumeId needle.VolumeId) int {
|
||||
localShardsInfo := erasure_coding.NewShardsInfo()
|
||||
for _, diskInfo := range node.info.DiskInfos {
|
||||
for _, ecShardInfo := range diskInfo.EcShardInfos {
|
||||
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
|
||||
return erasure_coding.GetShardCount(ecShardInfo)
|
||||
localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
return localShardsInfo.Count()
|
||||
}
|
||||
|
||||
// selectAndReserveRebuilder atomically selects a rebuilder node with sufficient free slots
|
||||
@@ -266,14 +269,16 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
|
||||
return err
|
||||
}
|
||||
defer func() {
|
||||
// clean up working files
|
||||
|
||||
// ask the rebuilder to delete the copied shards
|
||||
err = sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds)
|
||||
if err != nil {
|
||||
erb.write("%s delete copied ec shards %s %d.%v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds)
|
||||
// Clean up the working copies this run actually made. In dry-run
|
||||
// nothing was copied (copiedShardIds is empty), so this issues no
|
||||
// delete RPC. Use a local error so a cleanup failure cannot mask a
|
||||
// successful rebuild's return value.
|
||||
if !erb.applyChanges || len(copiedShardIds) == 0 {
|
||||
return
|
||||
}
|
||||
if derr := sourceServerDeleteEcShards(erb.commandEnv.option.GrpcDialOption, collection, volumeId, pb.NewServerAddressFromDataNode(rebuilder.info), copiedShardIds); derr != nil {
|
||||
erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr)
|
||||
}
|
||||
|
||||
}()
|
||||
|
||||
if !erb.applyChanges {
|
||||
@@ -323,7 +328,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
|
||||
for _, ecShardInfo := range diskInfo.EcShardInfos {
|
||||
if ecShardInfo.Collection == collection && needle.VolumeId(ecShardInfo.Id) == volumeId {
|
||||
needEcxFile = false
|
||||
localShardsInfo = erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo)
|
||||
// Union across disks: the rebuilder may hold this volume's
|
||||
// shards on more than one disk. Overwriting per-disk would
|
||||
// make a shard on a non-last disk look remote and get copied
|
||||
// onto itself (O_TRUNC) and then node-wide deleted.
|
||||
localShardsInfo.Add(erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(ecShardInfo))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -335,6 +344,11 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
|
||||
}
|
||||
}
|
||||
|
||||
// wouldCopy counts shards available on a remote holder that the rebuilder
|
||||
// could pull. It drives the recoverability gate below independently of
|
||||
// whether we actually copy (dry-run copies nothing), so a dry-run still
|
||||
// reports the plan instead of erroring "not enough shards".
|
||||
wouldCopy := 0
|
||||
for i := 0; i < targetShardCount; i++ {
|
||||
ecNodes := locations[i]
|
||||
shardId := erasure_coding.ShardId(i)
|
||||
@@ -349,9 +363,23 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
|
||||
continue
|
||||
}
|
||||
|
||||
var copyErr error
|
||||
if erb.applyChanges {
|
||||
copyErr = operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
|
||||
// The rebuilder is itself the only listed holder: never copy a shard
|
||||
// onto itself (the in-place O_TRUNC would destroy it) nor schedule it
|
||||
// for the post-rebuild delete. Treat it as already local.
|
||||
if ecNodes[0].info.Id == rebuilder.info.Id {
|
||||
localShardIds = append(localShardIds, shardId)
|
||||
erb.write("use existing shard %d.%d (already on rebuilder)\n", volumeId, shardId)
|
||||
continue
|
||||
}
|
||||
|
||||
wouldCopy++
|
||||
|
||||
if !erb.applyChanges {
|
||||
erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id)
|
||||
continue
|
||||
}
|
||||
|
||||
copyErr := operation.WithVolumeServerClient(false, pb.NewServerAddressFromDataNode(rebuilder.info), erb.commandEnv.option.GrpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
|
||||
_, copyErr := volumeServerClient.VolumeEcShardsCopy(context.Background(), &volume_server_pb.VolumeEcShardsCopyRequest{
|
||||
VolumeId: uint32(volumeId),
|
||||
Collection: collection,
|
||||
@@ -359,28 +387,28 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
|
||||
CopyEcxFile: needEcxFile,
|
||||
CopyEcjFile: true,
|
||||
CopyVifFile: needEcxFile,
|
||||
SourceDataNode: ecNodes[0].info.Id,
|
||||
SourceDataNode: string(pb.NewServerAddressFromDataNode(ecNodes[0].info)),
|
||||
})
|
||||
return copyErr
|
||||
})
|
||||
if copyErr == nil && needEcxFile {
|
||||
needEcxFile = false
|
||||
}
|
||||
}
|
||||
if copyErr != nil {
|
||||
erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
if needEcxFile {
|
||||
needEcxFile = false
|
||||
}
|
||||
erb.write("%s copied %d.%d from %s\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id)
|
||||
// Only shards this run actually copied are temp working files to be
|
||||
// deleted afterward; never a pre-existing local or remote shard.
|
||||
copiedShardIds = append(copiedShardIds, shardId)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if len(copiedShardIds)+len(localShardIds) >= erasure_coding.DataShardsCount {
|
||||
if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount {
|
||||
return copiedShardIds, localShardIds, nil
|
||||
}
|
||||
|
||||
return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(copiedShardIds)+len(localShardIds), volumeId)
|
||||
return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId)
|
||||
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user