mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
fix(ec): re-sweep recovered nodes before generation so they aren't stranded
A node skipped as down by the pre-encode sweep is excluded from the rebalance, but it can recover and become the generation host — mounting all shards locally, then being excluded from distribution. Union-only verification accepts all shards on one node and deletes the originals: a single point of failure. Re-sweep the skipped nodes just before generation; one whose teardown now succeeds leaves the skipped set and rebalances normally, while a node still down stays skipped.
This commit is contained in:
@@ -320,6 +320,18 @@ func doEcEncode(commandEnv *CommandEnv, writer io.Writer, volumeIdToCollection m
|
||||
bestReplicas[vid] = bestLoc
|
||||
}
|
||||
|
||||
// Re-attempt the orphan sweep on the nodes skipped as unreachable, now that
|
||||
// any node that recovered during readonly-marking and replica sync answers
|
||||
// again. A node whose teardown now succeeds is clean (and the generation host
|
||||
// re-wipes its own disks regardless), so it leaves the skipped set and can be
|
||||
// a balance source/target — otherwise its shards would never distribute off
|
||||
// it. A node that is still down stays skipped and excluded, preserving the
|
||||
// leniency for a genuinely-down node; such a node also cannot be the
|
||||
// generation host below, since VolumeEcShardsGenerate would fail to read .dat.
|
||||
if err := resweepSkippedNodes(commandEnv, skippedNodes, volumeIds, volumeIdToCollection, maxParallelization); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// generate ec shards using the best replica for each volume
|
||||
ewg.Reset()
|
||||
for _, vid := range volumeIds {
|
||||
@@ -452,6 +464,75 @@ func clearPreexistingEcShards(commandEnv *CommandEnv, topologyInfo *master_pb.To
|
||||
return skipped, nil
|
||||
}
|
||||
|
||||
// resweepSkippedNodes re-attempts the orphan teardown on the nodes that the
|
||||
// initial sweep skipped as unreachable, just before shard generation. A node
|
||||
// that recovered in the meantime — and is therefore eligible to host this
|
||||
// encode's generation — has its teardown retried; if it now fully succeeds it is
|
||||
// removed from skipped so the rebalance can use it as a source and move its
|
||||
// shards off, instead of stranding all shards on the single generation host and
|
||||
// collapsing fault tolerance. A node still transport-down stays skipped (the
|
||||
// same leniency the initial sweep grants), and a node that came back reachable
|
||||
// but whose delete genuinely failed is fatal, exactly as in the initial sweep,
|
||||
// so a stale generation is never silently left behind. Mutates skipped in place.
|
||||
func resweepSkippedNodes(commandEnv *CommandEnv, skipped map[pb.ServerAddress]struct{}, volumeIds []needle.VolumeId, volumeIdToCollection map[needle.VolumeId]string, maxParallelization int) error {
|
||||
if len(skipped) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
allShardIds := make([]erasure_coding.ShardId, erasure_coding.MaxShardCount)
|
||||
for i := range allShardIds {
|
||||
allShardIds[i] = erasure_coding.ShardId(i)
|
||||
}
|
||||
|
||||
addrs := make([]pb.ServerAddress, 0, len(skipped))
|
||||
for addr := range skipped {
|
||||
addrs = append(addrs, addr)
|
||||
}
|
||||
|
||||
fmt.Printf("re-checking %d node(s) skipped by the orphan sweep before generating shards...\n", len(addrs))
|
||||
|
||||
// A node still down on every retried vid stays skipped; one that fully
|
||||
// succeeds is un-skipped. Track per-node whether any retry still failed
|
||||
// (down) so a node whose state is mixed across vids never gets un-skipped.
|
||||
stillDown := make(map[pb.ServerAddress]struct{})
|
||||
var mu sync.Mutex
|
||||
ewg := NewErrorWaitGroup(maxParallelization)
|
||||
for _, addr := range addrs {
|
||||
for _, vid := range volumeIds {
|
||||
collection := volumeIdToCollection[vid]
|
||||
ewg.Add(func() error {
|
||||
if err := unmountAndDeleteEcShardsQuiet(commandEnv.option.GrpcDialOption, collection, vid, addr, allShardIds); err != nil {
|
||||
// Same decision as the initial sweep: a reachable node whose delete
|
||||
// genuinely failed (or did not ack a full teardown, or whose liveness is
|
||||
// inconclusive) is fatal, since it could hold an orphan a later copy
|
||||
// re-stamps into this generation. Only a node still transport-down stays
|
||||
// skipped.
|
||||
if errors.Is(err, errFullTeardownNotAcked) || !isNodeUnreachable(err) ||
|
||||
classifyNodeLiveness(pingVolumeServer(commandEnv.option.GrpcDialOption, addr)) != nodeDown {
|
||||
return fmt.Errorf("re-clear stale ec shards for volume %d on %s: %w", vid, addr, err)
|
||||
}
|
||||
glog.V(1).Infof("orphan re-sweep: volume %d on %s still skipped (unreachable): %v", vid, addr, err)
|
||||
mu.Lock()
|
||||
stillDown[addr] = struct{}{}
|
||||
mu.Unlock()
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
}
|
||||
if err := ewg.Wait(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, addr := range addrs {
|
||||
if _, down := stillDown[addr]; !down {
|
||||
delete(skipped, addr)
|
||||
glog.V(0).Infof("orphan re-sweep: node %s recovered and was cleaned; it will participate in the EC rebalance", addr)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isNodeUnreachable reports whether err means the volume server could not be
|
||||
// reached at all, as opposed to an RPC that reached the node and failed. Only an
|
||||
// unreachable node is safe to skip in the orphan sweep. A dead peer surfaces as
|
||||
|
||||
Reference in New Issue
Block a user