Files
seaweedfs/weed/topology/data_node_ec_multi_disk_test.go
T
Chris Lu f1f720f5da fix(master): register EC shards per physical disk on full heartbeat sync (#9212) (#9219)
* refactor(types): add DiskId type for physical-disk identifiers

Names the uint32 physical-disk index that volume servers carry in
VolumeEcShardInformationMessage / VolumeInformationMessage, so EC shard
tracking that needs to distinguish disks within a DataNode can use a
dedicated type instead of an untyped uint32. No behaviour change.

* fix(master): register EC shards per physical disk on full heartbeat sync (#9212)

When a volume's EC shards are spread across multiple physical disks on the
same volume server (common after ec.balance / ec.rebuild on multi-disk
nodes), the volume server emits one VolumeEcShardInformationMessage per
(disk, volume) in its heartbeat. The master's DataNode.UpdateEcShards was
building a `map[VolumeId]*EcVolumeInfo` with last-write-wins, and
doUpdateEcShards then overwrote `disk.ecShards[vid]` once per message, so
all but the final disk's shards were silently dropped. Only the
topology-global ecShardMap (built via RegisterEcShards in a per-message
loop) stayed correct, which hid the problem from `topo.LookupEcShards`
but broke everything that reads the DataNode/Disk view — volume.list,
admin UI, ec.rebuild dry-run ("only 6 shards, skipping"), and
`DiskInfo.EcShardInfos` which the shell's ec.balance / ec.rebuild
planners group by `eci.DiskId`.

Change the shape of `Disk.ecShards` from
    map[VolumeId]*EcVolumeInfo
to
    map[VolumeId]map[types.DiskId]*EcVolumeInfo

so every physical disk keeps its own entry. UpdateEcShards aggregates
incoming messages by (vid, diskId) rather than vid alone; Add/Delete/
HasVolumesById and HasEcShards consult the nested map; doUpdateEcShards
rewrites the nested structure from the aggregated map. Per-physical-disk
attribution survives through DataNode.ToDataNodeInfo ->
DiskInfo.EcShardInfos, matching the wire format the volume server
produces and what downstream admin tooling expects.

Delta sync (AddOrUpdateEcShard / DeleteEcShard) already merged via
ShardsInfo.Add, so this only affects the full-sync path that runs on
heartbeat reconnect.

Adds data_node_ec_multi_disk_test.go with two regression tests that fail
on pre-fix master:
- TestEcShardsAcrossMultipleDisksOnSameNode: volume 15 spread over 3
  disks (matches the bug report's volume-2 row); asserts every shard
  visible via LookupEcShards, DataNode.GetEcShards, and ToDataNodeInfo's
  per-disk EcShardInfos entries.
- TestEcShardsAfterRestartHeartbeat: minimal 2-disk full sync case.

* fix(topology): tighten locking around EC shard map access

Addresses review comments on #9219:

* DataNode.UpdateEcShards now holds dn.Lock for the full read-diff-write
  cycle, matching UpdateVolumes' model, so concurrent heartbeats can no
  longer interleave their getOrCreateDisk / UpAdjustDiskUsageDelta
  updates with each other. Introduces a private getEcShardsLocked helper
  for reads under the held lock; renames doUpdateEcShards to
  doUpdateEcShardsLocked for the same reason.

* DataNode.HasEcShards now takes each disk's ecShardsLock while reading
  disk.ecShards, closing a pre-existing map race with concurrent
  Add/Delete/Update writers.

* doUpdateEcShardsLocked takes each disk's ecShardsLock around the
  reset-and-rewrite so readers (GetEcShards, HasEcShards) see a
  consistent map state rather than a partially-rebuilt one.

* Disk.GetEcShards' slice-capacity hint now accounts for the nested
  per-physical-disk entries (sum of inner lengths) instead of
  underestimating by the unique-volume count.
2026-04-24 14:01:09 -07:00

178 lines
6.4 KiB
Go

package topology
import (
"testing"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/sequence"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
)
// TestEcShardsAcrossMultipleDisksOnSameNode reproduces issue #9212.
// When a volume server reports EC shards of the same volume spread across
// multiple physical disks on the same node, the master must register ALL
// shards, not only a subset.
//
// The same full sync is verified through three views:
//   - topo.LookupEcShards (topology-wide shard locations),
//   - DataNode.GetEcShards (the node's own view), and
//   - DataNode.ToDataNodeInfo (per-disk EcShardInfos in the protobuf form).
func TestEcShardsAcrossMultipleDisksOnSameNode(t *testing.T) {
	topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
	dc := topo.GetOrCreateDataCenter("dc1")
	rack := dc.GetOrCreateRack("rack1")
	maxVolumeCounts := map[string]uint32{"": 100}
	dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)

	const vid = uint32(15)
	const collection = "grafana-loki"
	const diskType = "" // HDD / default type — all 4 disks share this type

	// This node holds 8 EC shards of volume 15, spread across 3 of its
	// physical disks. Mirrors the "volume-2" row from issue #9212:
	// /data1 (diskId 0): ec02, ec06, ec10
	// /data2 (diskId 1): ec01, ec04, ec09
	// /data4 (diskId 3): ec08, ec12
	disk0 := buildEcShardMessage(vid, collection, diskType, 0, []erasure_coding.ShardId{2, 6, 10})
	disk1 := buildEcShardMessage(vid, collection, diskType, 1, []erasure_coding.ShardId{1, 4, 9})
	disk3 := buildEcShardMessage(vid, collection, diskType, 3, []erasure_coding.ShardId{8, 12})
	msgs := []*master_pb.VolumeEcShardInformationMessage{disk0, disk1, disk3}

	// The exact bitmap the node must end up reporting is the union of what
	// was sent. Captured before the sync so the expectation is independent of
	// anything the sync does with the messages.
	wantBitmap := erasure_coding.ShardBits(disk0.EcIndexBits | disk1.EcIndexBits | disk3.EcIndexBits)

	topo.SyncDataNodeEcShards(msgs, dn)

	locs, ok := topo.LookupEcShards(needle.VolumeId(vid))
	if !ok {
		t.Fatalf("volume %d: no ec shard locations registered at all", vid)
	}

	// All 8 shards should be visible to the master: 1,2,4,6,8,9,10,12.
	wantShards := []erasure_coding.ShardId{1, 2, 4, 6, 8, 9, 10, 12}
	var gotShards []erasure_coding.ShardId
	for shardId, dataNodes := range locs.Locations {
		if len(dataNodes) > 0 {
			gotShards = append(gotShards, erasure_coding.ShardId(shardId))
		}
	}
	if len(gotShards) != len(wantShards) {
		t.Errorf("volume %d: topology.LookupEcShards sees %d shards %v, want %d shards %v",
			vid, len(gotShards), gotShards, len(wantShards), wantShards)
	}
	for _, want := range wantShards {
		if len(locs.Locations[want]) == 0 {
			t.Errorf("volume %d: shard %d missing from topology (bug #9212)", vid, want)
		}
	}

	// The DataNode's own view (what drives volume.list output, admin UI, and
	// ec.rebuild dry-run diagnostics) must also see all shards across all disks.
	dnShards := dn.GetEcShards()
	dnShardCount := 0
	dnShardBitmap := erasure_coding.ShardBits(0)
	for _, ev := range dnShards {
		if ev.VolumeId == needle.VolumeId(vid) {
			dnShardCount += ev.ShardsInfo.Count()
			dnShardBitmap |= erasure_coding.ShardBits(ev.ShardsInfo.Bitmap())
		}
	}
	// The summed count catches dropped or double-registered shards ...
	if dnShardCount != len(wantShards) {
		t.Errorf("volume %d: DataNode.GetEcShards reports %d shards (bitmap=0b%b), want %d shards %v (bug #9212)",
			vid, dnShardCount, dnShardBitmap, len(wantShards), wantShards)
	}
	// ... while the bitmap comparison pins down WHICH shards are visible, so a
	// result with the right count but the wrong shard ids cannot pass.
	if dnShardBitmap != wantBitmap {
		t.Errorf("volume %d: DataNode.GetEcShards reports shard bitmap 0b%b, want 0b%b for shards %v (bug #9212)",
			vid, dnShardBitmap, wantBitmap, wantShards)
	}

	// Per-physical-disk attribution must survive all the way to the protobuf
	// DiskInfo.EcShardInfos consumed by ec.balance / ec.rebuild planners. The
	// admin shell groups shards by eci.DiskId, so each physical disk needs a
	// separate message with its own shard subset.
	wantPerDisk := map[uint32][]erasure_coding.ShardId{
		0: {2, 6, 10},
		1: {1, 4, 9},
		3: {8, 12},
	}
	dnInfo := dn.ToDataNodeInfo()
	gotPerDisk := map[uint32][]erasure_coding.ShardId{}
	for _, diskInfo := range dnInfo.DiskInfos {
		for _, eci := range diskInfo.EcShardInfos {
			if eci.Id != vid {
				continue
			}
			si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(eci)
			gotPerDisk[eci.DiskId] = append(gotPerDisk[eci.DiskId], si.Ids()...)
		}
	}
	// Every expected disk must report exactly its own shards ...
	for diskId, want := range wantPerDisk {
		got := gotPerDisk[diskId]
		if !shardSetEqual(got, want) {
			t.Errorf("volume %d diskId %d: DiskInfo.EcShardInfos report shards %v, want %v (per-physical-disk attribution lost)",
				vid, diskId, got, want)
		}
	}
	// ... and no shards may be attributed to a disk that never sent any.
	for diskId := range gotPerDisk {
		if _, ok := wantPerDisk[diskId]; !ok {
			t.Errorf("volume %d: unexpected diskId %d in DiskInfo.EcShardInfos, shards=%v",
				vid, diskId, gotPerDisk[diskId])
		}
	}
}
// shardSetEqual reports whether a and b contain the same shard ids with the
// same multiplicities, ignoring order. Slices of different lengths are never
// equal.
func shardSetEqual(a, b []erasure_coding.ShardId) bool {
	if len(a) != len(b) {
		return false
	}

	// Tally every id in a, then consume the tally with the ids in b.
	remaining := make(map[erasure_coding.ShardId]int, len(a))
	for i := range a {
		remaining[a[i]]++
	}
	for i := range b {
		left := remaining[b[i]]
		if left == 0 {
			// b holds an id more often than a does (or one a lacks entirely).
			return false
		}
		remaining[b[i]] = left - 1
	}

	// Lengths match and b never over-consumed, so the tallies cancel exactly.
	return true
}
// TestEcShardsAfterRestartHeartbeat simulates the exact issue #9212 flow:
// the volume server starts up, loads shards from each disk, and sends a
// single full-sync heartbeat containing one VolumeEcShardInformationMessage
// per (disk, volume). The master must end up with all shards visible per
// DataNode, not just the subset belonging to whichever disk happened to be
// iterated last.
func TestEcShardsAfterRestartHeartbeat(t *testing.T) {
	topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
	dc := topo.GetOrCreateDataCenter("dc1")
	rack := dc.GetOrCreateRack("rack1")
	dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", map[string]uint32{"": 100})

	const vid = uint32(15)

	// Matches "volume-1" from the bug report: ec13 on /data1, ec03 on /data3.
	msgs := []*master_pb.VolumeEcShardInformationMessage{
		buildEcShardMessage(vid, "grafana-loki", "", 0, []erasure_coding.ShardId{13}),
		buildEcShardMessage(vid, "grafana-loki", "", 2, []erasure_coding.ShardId{3}),
	}

	// The node must report exactly the union of the bitmaps that were sent.
	// Captured before the sync so the expectation is independent of anything
	// the sync does with the messages.
	want := erasure_coding.ShardBits(msgs[0].EcIndexBits | msgs[1].EcIndexBits)

	topo.SyncDataNodeEcShards(msgs, dn)

	var combined erasure_coding.ShardBits
	for _, ev := range dn.GetEcShards() {
		if ev.VolumeId == needle.VolumeId(vid) {
			combined |= erasure_coding.ShardBits(ev.ShardsInfo.Bitmap())
		}
	}

	// Compare whole bitmaps rather than just the shard count, so that two
	// visible shards with the wrong ids cannot satisfy the assertion.
	if combined != want {
		t.Errorf("volume %d: DataNode sees %d shards (bitmap=0b%b) after full sync of 2 disks; want both shards 3 and 13 visible (bitmap=0b%b) (bug #9212)",
			vid, combined.Count(), combined, want)
	}
}
// buildEcShardMessage assembles the VolumeEcShardInformationMessage for one
// (volume, physical disk) pair, as used by the full-sync tests in this file.
//
// vid, collection, diskType and diskId are copied into the message verbatim.
// Each id in shardIds is registered with a fixed ShardSize of 1024, and the
// resulting bitmap and size list populate EcIndexBits and ShardSizes.
func buildEcShardMessage(vid uint32, collection, diskType string, diskId uint32, shardIds []erasure_coding.ShardId) *master_pb.VolumeEcShardInformationMessage {
	shards := erasure_coding.NewShardsInfo()
	for i := range shardIds {
		// The size is arbitrary test data; every shard gets the same value.
		info := erasure_coding.NewShardInfo(shardIds[i], erasure_coding.ShardSize(1024))
		shards.Set(info)
	}

	bitmap := shards.Bitmap()
	sizes := shards.SizesInt64()

	return &master_pb.VolumeEcShardInformationMessage{
		Id:          vid,
		Collection:  collection,
		DiskType:    diskType,
		DiskId:      diskId,
		EcIndexBits: bitmap,
		ShardSizes:  sizes,
	}
}