mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
f1f720f5da
* refactor(types): add DiskId type for physical-disk identifiers Names the uint32 physical-disk index that volume servers carry in VolumeEcShardInformationMessage / VolumeInformationMessage, so EC shard tracking that needs to distinguish disks within a DataNode can use a dedicated type instead of an untyped uint32. No behaviour change. * fix(master): register EC shards per physical disk on full heartbeat sync (#9212) When a volume's EC shards are spread across multiple physical disks on the same volume server (common after ec.balance / ec.rebuild on multi-disk nodes), the volume server emits one VolumeEcShardInformationMessage per (disk, volume) in its heartbeat. The master's DataNode.UpdateEcShards was building a `map[VolumeId]*EcVolumeInfo` with last-write-wins, and doUpdateEcShards then overwrote `disk.ecShards[vid]` once per message, so all but the final disk's shards were silently dropped. Only the topology-global ecShardMap (built via RegisterEcShards in a per-message loop) stayed correct, which hid the problem from `topo.LookupEcShards` but broke everything that reads the DataNode/Disk view — volume.list, admin UI, ec.rebuild dry-run ("only 6 shards, skipping"), and `DiskInfo.EcShardInfos` which the shell's ec.balance / ec.rebuild planners group by `eci.DiskId`. Change the shape of `Disk.ecShards` from map[VolumeId]*EcVolumeInfo to map[VolumeId]map[types.DiskId]*EcVolumeInfo so every physical disk keeps its own entry. UpdateEcShards aggregates incoming messages by (vid, diskId) rather than vid alone; Add/Delete/ HasVolumesById and HasEcShards consult the nested map; doUpdateEcShards rewrites the nested structure from the aggregated map. Per-physical-disk attribution survives through DataNode.ToDataNodeInfo -> DiskInfo.EcShardInfos, matching the wire format the volume server produces and what downstream admin tooling expects. Delta sync (AddOrUpdateEcShard / DeleteEcShard) already merged via ShardsInfo.Add, so this only affects the full-sync path that runs on heartbeat reconnect. Adds data_node_ec_multi_disk_test.go with two regression tests that fail on pre-fix master: - TestEcShardsAcrossMultipleDisksOnSameNode: volume 15 spread over 3 disks (matches the bug report's volume-2 row); asserts every shard visible via LookupEcShards, DataNode.GetEcShards, and ToDataNodeInfo's per-disk EcShardInfos entries. - TestEcShardsAfterRestartHeartbeat: minimal 2-disk full sync case. * fix(topology): tighten locking around EC shard map access Addresses review comments on #9219: * DataNode.UpdateEcShards now holds dn.Lock for the full read-diff-write cycle, matching UpdateVolumes' model, so concurrent heartbeats can no longer interleave their getOrCreateDisk / UpAdjustDiskUsageDelta updates with each other. Introduces a private getEcShardsLocked helper for reads under the held lock; renames doUpdateEcShards to doUpdateEcShardsLocked for the same reason. * DataNode.HasEcShards now takes each disk's ecShardsLock while reading disk.ecShards, closing a pre-existing map race with concurrent Add/Delete/Update writers. * doUpdateEcShardsLocked takes each disk's ecShardsLock around the reset-and-rewrite so readers (GetEcShards, HasEcShards) see a consistent map state rather than a partially-rebuilt one. * Disk.GetEcShards' slice-capacity hint now accounts for the nested per-physical-disk entries (sum of inner lengths) instead of underestimating by the unique-volume count.
178 lines
6.4 KiB
Go
178 lines
6.4 KiB
Go
package topology
|
|
|
|
import (
|
|
"testing"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/sequence"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
|
)
|
|
|
|
// TestEcShardsAcrossMultipleDisksOnSameNode reproduces issue #9212.
|
|
// When a volume server reports EC shards of the same volume spread across
|
|
// multiple physical disks on the same node, the master must register ALL
|
|
// shards, not only a subset.
|
|
func TestEcShardsAcrossMultipleDisksOnSameNode(t *testing.T) {
|
|
topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
|
|
dc := topo.GetOrCreateDataCenter("dc1")
|
|
rack := dc.GetOrCreateRack("rack1")
|
|
maxVolumeCounts := map[string]uint32{"": 100}
|
|
dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", maxVolumeCounts)
|
|
|
|
const vid = uint32(15)
|
|
const collection = "grafana-loki"
|
|
const diskType = "" // HDD / default type — all 4 disks share this type
|
|
|
|
// Volume 15 has its 14 EC shards spread across 3 physical disks on this node.
|
|
// Mirrors the "volume-2" row from issue #9212:
|
|
// /data1 (diskId 0): ec02, ec06, ec10
|
|
// /data2 (diskId 1): ec01, ec04, ec09
|
|
// /data4 (diskId 3): ec08, ec12
|
|
disk0 := buildEcShardMessage(vid, collection, diskType, 0, []erasure_coding.ShardId{2, 6, 10})
|
|
disk1 := buildEcShardMessage(vid, collection, diskType, 1, []erasure_coding.ShardId{1, 4, 9})
|
|
disk3 := buildEcShardMessage(vid, collection, diskType, 3, []erasure_coding.ShardId{8, 12})
|
|
|
|
msgs := []*master_pb.VolumeEcShardInformationMessage{disk0, disk1, disk3}
|
|
topo.SyncDataNodeEcShards(msgs, dn)
|
|
|
|
locs, ok := topo.LookupEcShards(needle.VolumeId(vid))
|
|
if !ok {
|
|
t.Fatalf("volume %d: no ec shard locations registered at all", vid)
|
|
}
|
|
|
|
// All 8 shards should be visible to the master: 1,2,4,6,8,9,10,12.
|
|
wantShards := []erasure_coding.ShardId{1, 2, 4, 6, 8, 9, 10, 12}
|
|
var gotShards []erasure_coding.ShardId
|
|
for shardId, dataNodes := range locs.Locations {
|
|
if len(dataNodes) > 0 {
|
|
gotShards = append(gotShards, erasure_coding.ShardId(shardId))
|
|
}
|
|
}
|
|
|
|
if len(gotShards) != len(wantShards) {
|
|
t.Errorf("volume %d: topology.LookupEcShards sees %d shards %v, want %d shards %v",
|
|
vid, len(gotShards), gotShards, len(wantShards), wantShards)
|
|
}
|
|
|
|
for _, want := range wantShards {
|
|
if len(locs.Locations[want]) == 0 {
|
|
t.Errorf("volume %d: shard %d missing from topology (bug #9212)", vid, want)
|
|
}
|
|
}
|
|
|
|
// The DataNode's own view (what drives volume.list output, admin UI, and
|
|
// ec.rebuild dry-run diagnostics) must also see all shards across all disks.
|
|
dnShards := dn.GetEcShards()
|
|
dnShardCount := 0
|
|
dnShardBitmap := erasure_coding.ShardBits(0)
|
|
for _, ev := range dnShards {
|
|
if ev.VolumeId == needle.VolumeId(vid) {
|
|
dnShardCount += ev.ShardsInfo.Count()
|
|
dnShardBitmap |= erasure_coding.ShardBits(ev.ShardsInfo.Bitmap())
|
|
}
|
|
}
|
|
if dnShardCount != len(wantShards) {
|
|
t.Errorf("volume %d: DataNode.GetEcShards reports %d shards (bitmap=0b%b), want %d shards %v (bug #9212)",
|
|
vid, dnShardCount, dnShardBitmap, len(wantShards), wantShards)
|
|
}
|
|
|
|
// Per-physical-disk attribution must survive all the way to the protobuf
|
|
// DiskInfo.EcShardInfos consumed by ec.balance / ec.rebuild planners. The
|
|
// admin shell groups shards by eci.DiskId, so each physical disk needs a
|
|
// separate message with its own shard subset.
|
|
wantPerDisk := map[uint32][]erasure_coding.ShardId{
|
|
0: {2, 6, 10},
|
|
1: {1, 4, 9},
|
|
3: {8, 12},
|
|
}
|
|
dnInfo := dn.ToDataNodeInfo()
|
|
gotPerDisk := map[uint32][]erasure_coding.ShardId{}
|
|
for _, diskInfo := range dnInfo.DiskInfos {
|
|
for _, eci := range diskInfo.EcShardInfos {
|
|
if eci.Id != vid {
|
|
continue
|
|
}
|
|
si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(eci)
|
|
gotPerDisk[eci.DiskId] = append(gotPerDisk[eci.DiskId], si.Ids()...)
|
|
}
|
|
}
|
|
for diskId, want := range wantPerDisk {
|
|
got := gotPerDisk[diskId]
|
|
if !shardSetEqual(got, want) {
|
|
t.Errorf("volume %d diskId %d: DiskInfo.EcShardInfos report shards %v, want %v (per-physical-disk attribution lost)",
|
|
vid, diskId, got, want)
|
|
}
|
|
}
|
|
for diskId := range gotPerDisk {
|
|
if _, ok := wantPerDisk[diskId]; !ok {
|
|
t.Errorf("volume %d: unexpected diskId %d in DiskInfo.EcShardInfos, shards=%v",
|
|
vid, diskId, gotPerDisk[diskId])
|
|
}
|
|
}
|
|
}
|
|
|
|
func shardSetEqual(a, b []erasure_coding.ShardId) bool {
|
|
if len(a) != len(b) {
|
|
return false
|
|
}
|
|
seen := make(map[erasure_coding.ShardId]int, len(a))
|
|
for _, id := range a {
|
|
seen[id]++
|
|
}
|
|
for _, id := range b {
|
|
seen[id]--
|
|
if seen[id] < 0 {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// TestEcShardsAfterRestartHeartbeat simulates the exact issue #9212 flow:
|
|
// the volume server starts up, loads shards from each disk, and sends a
|
|
// single full-sync heartbeat containing one VolumeEcShardInformationMessage
|
|
// per (disk, volume). The master must end up with all shards visible per
|
|
// DataNode, not just the subset belonging to whichever disk happened to be
|
|
// iterated last.
|
|
func TestEcShardsAfterRestartHeartbeat(t *testing.T) {
|
|
topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
|
|
dc := topo.GetOrCreateDataCenter("dc1")
|
|
rack := dc.GetOrCreateRack("rack1")
|
|
dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", map[string]uint32{"": 100})
|
|
|
|
// Matches "volume-1" from the bug report: ec13 on /data1, ec03 on /data3.
|
|
msgs := []*master_pb.VolumeEcShardInformationMessage{
|
|
buildEcShardMessage(15, "grafana-loki", "", 0, []erasure_coding.ShardId{13}),
|
|
buildEcShardMessage(15, "grafana-loki", "", 2, []erasure_coding.ShardId{3}),
|
|
}
|
|
topo.SyncDataNodeEcShards(msgs, dn)
|
|
|
|
dnShards := dn.GetEcShards()
|
|
var combined erasure_coding.ShardBits
|
|
for _, ev := range dnShards {
|
|
if ev.VolumeId == 15 {
|
|
combined |= erasure_coding.ShardBits(ev.ShardsInfo.Bitmap())
|
|
}
|
|
}
|
|
if combined.Count() != 2 {
|
|
t.Errorf("volume 15: DataNode sees %d shards (bitmap=0b%b) after full sync of 2 disks; want both shards 3 and 13 visible (bug #9212)",
|
|
combined.Count(), combined)
|
|
}
|
|
}
|
|
|
|
func buildEcShardMessage(vid uint32, collection, diskType string, diskId uint32, shardIds []erasure_coding.ShardId) *master_pb.VolumeEcShardInformationMessage {
|
|
si := erasure_coding.NewShardsInfo()
|
|
for _, sid := range shardIds {
|
|
si.Set(erasure_coding.NewShardInfo(sid, erasure_coding.ShardSize(1024)))
|
|
}
|
|
return &master_pb.VolumeEcShardInformationMessage{
|
|
Id: vid,
|
|
Collection: collection,
|
|
DiskType: diskType,
|
|
DiskId: diskId,
|
|
EcIndexBits: si.Bitmap(),
|
|
ShardSizes: si.SizesInt64(),
|
|
}
|
|
}
|