Files
seaweedfs/weed/topology/data_node_ec.go
T
Chris Lu f1f720f5da fix(master): register EC shards per physical disk on full heartbeat sync (#9212) (#9219)
* refactor(types): add DiskId type for physical-disk identifiers

Names the uint32 physical-disk index that volume servers carry in
VolumeEcShardInformationMessage / VolumeInformationMessage, so EC shard
tracking that needs to distinguish disks within a DataNode can use a
dedicated type instead of an untyped uint32. No behaviour change.

* fix(master): register EC shards per physical disk on full heartbeat sync (#9212)

When a volume's EC shards are spread across multiple physical disks on the
same volume server (common after ec.balance / ec.rebuild on multi-disk
nodes), the volume server emits one VolumeEcShardInformationMessage per
(disk, volume) in its heartbeat. The master's DataNode.UpdateEcShards was
building a `map[VolumeId]*EcVolumeInfo` with last-write-wins, and
doUpdateEcShards then overwrote `disk.ecShards[vid]` once per message, so
all but the final disk's shards were silently dropped. Only the
topology-global ecShardMap (built via RegisterEcShards in a per-message
loop) stayed correct, which hid the problem from `topo.LookupEcShards`
but broke everything that reads the DataNode/Disk view — volume.list,
admin UI, ec.rebuild dry-run ("only 6 shards, skipping"), and
`DiskInfo.EcShardInfos` which the shell's ec.balance / ec.rebuild
planners group by `eci.DiskId`.

Change the shape of `Disk.ecShards` from
    map[VolumeId]*EcVolumeInfo
to
    map[VolumeId]map[types.DiskId]*EcVolumeInfo

so every physical disk keeps its own entry. UpdateEcShards aggregates
incoming messages by (vid, diskId) rather than vid alone; Add/Delete/
HasVolumesById and HasEcShards consult the nested map; doUpdateEcShards
rewrites the nested structure from the aggregated map. Per-physical-disk
attribution survives through DataNode.ToDataNodeInfo ->
DiskInfo.EcShardInfos, matching the wire format the volume server
produces and what downstream admin tooling expects.

Delta sync (AddOrUpdateEcShard / DeleteEcShard) already merged via
ShardsInfo.Add, so this only affects the full-sync path that runs on
heartbeat reconnect.

Adds data_node_ec_multi_disk_test.go with two regression tests that fail
on pre-fix master:
- TestEcShardsAcrossMultipleDisksOnSameNode: volume 15 spread over 3
  disks (matches the bug report's volume-2 row); asserts every shard
  visible via LookupEcShards, DataNode.GetEcShards, and ToDataNodeInfo's
  per-disk EcShardInfos entries.
- TestEcShardsAfterRestartHeartbeat: minimal 2-disk full sync case.

* fix(topology): tighten locking around EC shard map access

Addresses review comments on #9219:

* DataNode.UpdateEcShards now holds dn.Lock for the full read-diff-write
  cycle, matching UpdateVolumes' model, so concurrent heartbeats can no
  longer interleave their getOrCreateDisk / UpAdjustDiskUsageDelta
  updates with each other. Introduces a private getEcShardsLocked helper
  for reads under the held lock; renames doUpdateEcShards to
  doUpdateEcShardsLocked for the same reason.

* DataNode.HasEcShards now takes each disk's ecShardsLock while reading
  disk.ecShards, closing a pre-existing map race with concurrent
  Add/Delete/Update writers.

* doUpdateEcShardsLocked takes each disk's ecShardsLock around the
  reset-and-rewrite so readers (GetEcShards, HasEcShards) see a
  consistent map state rather than a partially-rebuilt one.

* Disk.GetEcShards' slice-capacity hint now accounts for the nested
  per-physical-disk entries (sum of inner lengths) instead of using the
  unique-volume count, which underestimated the number of entries.
2026-04-24 14:01:09 -07:00

200 lines
6.2 KiB
Go

package topology
import (
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
// GetEcShards returns the EC shard entries of every child disk of this
// DataNode as one flat slice. The slice is nil when no child reports any
// entry.
//
// The node's read lock is held while the children are walked.
func (dn *DataNode) GetEcShards() (ret []*erasure_coding.EcVolumeInfo) {
	dn.RLock()
	// Deferred, like HasEcShards and HasVolumesById, so the read lock is
	// released even if walking the children panics (for example on the
	// *Disk type assertion).
	defer dn.RUnlock()
	return dn.getEcShardsLocked()
}
// ecShardKey identifies a per-physical-disk EC shard entry on a DataNode.
// A single volume's EC shards can live on multiple physical disks of one
// DataNode (e.g. a 10+4 volume spread across 4 mount points), so entries
// must be tracked per (volume, physical disk) rather than per volume alone.
// See issue #9212.
//
// Both fields are comparable, so the struct is used directly as a map key.
type ecShardKey struct {
// vid is the EC volume the entry's shards belong to.
vid needle.VolumeId
// diskId is the physical disk, within the DataNode, that the entry was
// reported for.
diskId types.DiskId
}
// UpdateEcShards reconciles the EC shards registered on this DataNode with
// actualShards, the complete set being reported for the node.
//
// Entries are compared per (volume, physical disk). newShards lists what is
// reported but was not registered; deletedShards lists what was registered but
// is no longer reported. Per-disk EC shard usage counters are adjusted by the
// net difference, and the registered maps are rewritten only when at least one
// of the two result slices is non-empty.
func (dn *DataNode) UpdateEcShards(actualShards []*erasure_coding.EcVolumeInfo) (newShards, deletedShards []*erasure_coding.EcVolumeInfo) {
	// Fold the report into one entry per (volume, physical disk). Repeated
	// messages for the same key are merged, while the same volume on different
	// disks stays separate so per-physical-disk attribution survives into
	// DiskInfo.EcShardInfos, which the admin shell's ec.balance / ec.rebuild
	// planners read via eci.DiskId.
	reported := make(map[ecShardKey]*erasure_coding.EcVolumeInfo, len(actualShards))
	for _, incoming := range actualShards {
		key := ecShardKey{vid: incoming.VolumeId, diskId: types.DiskId(incoming.DiskId)}
		merged, seen := reported[key]
		if !seen {
			// Store a private copy so later merges for this key never
			// touch the caller's EcVolumeInfo and the diff below is stable.
			dup := *incoming
			dup.ShardsInfo = incoming.ShardsInfo.Copy()
			reported[key] = &dup
			continue
		}
		merged.ShardsInfo.Add(incoming.ShardsInfo)
	}

	// dn.Lock is held for the whole read-diff-write cycle so concurrent
	// heartbeats, getOrCreateDisk calls, and UpAdjustDiskUsageDelta updates on
	// this data node are serialized, matching UpdateVolumes' locking model.
	// The helpers called below assume the caller holds dn.Lock.
	dn.Lock()
	defer dn.Unlock()

	registered := dn.getEcShardsLocked()
	registeredKeys := make(map[ecShardKey]struct{}, len(registered))

	// Pass 1: diff every registered entry against the report.
	for _, current := range registered {
		disk := dn.getOrCreateDisk(current.DiskType)
		key := ecShardKey{vid: current.VolumeId, diskId: types.DiskId(current.DiskId)}
		registeredKeys[key] = struct{}{}

		added, removed := 0, 0
		if latest, ok := reported[key]; ok {
			// Same (volume, disk) on both sides: compare shard by shard.
			if gained := latest.Minus(current); gained.ShardsInfo.Count() > 0 {
				newShards = append(newShards, gained)
				added = gained.ShardsInfo.Count()
			}
			if lost := current.Minus(latest); lost.ShardsInfo.Count() > 0 {
				deletedShards = append(deletedShards, lost)
				removed = lost.ShardsInfo.Count()
			}
		} else {
			// Registered on the node but absent from the report.
			deletedShards = append(deletedShards, current)
			removed = current.ShardsInfo.Count()
		}

		if delta := added - removed; delta != 0 {
			disk.UpAdjustDiskUsageDelta(types.ToDiskType(current.DiskType), &DiskUsageCounts{
				ecShardCount: int64(delta),
			})
		}
	}

	// Pass 2: reported entries whose key was never registered are entirely new.
	for key, fresh := range reported {
		if _, known := registeredKeys[key]; known {
			continue
		}
		newShards = append(newShards, fresh)
		disk := dn.getOrCreateDisk(fresh.DiskType)
		disk.UpAdjustDiskUsageDelta(types.ToDiskType(fresh.DiskType), &DiskUsageCounts{
			ecShardCount: int64(fresh.ShardsInfo.Count()),
		})
	}

	if len(newShards) == 0 && len(deletedShards) == 0 {
		// Nothing changed; leave the registered maps untouched.
		return newShards, deletedShards
	}
	dn.doUpdateEcShardsLocked(reported)
	return newShards, deletedShards
}
// getEcShardsLocked gathers the EC shard entries of all child disks into a
// single flat slice (nil when there are none).
//
// It does not lock the DataNode itself: the caller MUST already hold dn.Lock
// or dn.RLock. Each disk's entries are obtained through Disk.GetEcShards,
// which takes that disk's ecShardsLock internally.
func (dn *DataNode) getEcShardsLocked() (ret []*erasure_coding.EcVolumeInfo) {
	for _, child := range dn.children {
		ret = append(ret, child.(*Disk).GetEcShards()...)
	}
	return ret
}
// HasEcShards reports whether any child disk of this DataNode has at least one
// EC shard entry registered for volumeId.
//
// The node's read lock is held for the whole scan; each disk's ecShards map is
// read under that disk's own ecShardsLock.
func (dn *DataNode) HasEcShards(volumeId needle.VolumeId) (found bool) {
	dn.RLock()
	defer dn.RUnlock()

	for _, child := range dn.children {
		d := child.(*Disk)

		d.ecShardsLock.RLock()
		// A volume that is not in the map yields a nil inner map, whose
		// length is 0, so no separate presence check is needed.
		entries := len(d.ecShards[volumeId])
		d.ecShardsLock.RUnlock()

		if entries > 0 {
			return true
		}
	}
	return false
}
// doUpdateEcShardsLocked replaces the ecShards map of every child disk with
// the contents of actualByKey, grouped by disk type, then volume, then
// physical disk. Caller MUST hold dn.Lock.
//
// The replacement maps are assembled off to the side and installed with a
// single assignment per disk under that disk's ecShardsLock. Code that reads a
// disk's ecShards while holding only ecShardsLock therefore sees either the
// previous map or the complete new one, never an emptied or half-filled map.
func (dn *DataNode) doUpdateEcShardsLocked(actualByKey map[ecShardKey]*erasure_coding.EcVolumeInfo) {
	type volumeShards = map[needle.VolumeId]map[types.DiskId]*erasure_coding.EcVolumeInfo

	// Every existing child starts from an empty map, so a disk that no longer
	// holds any reported shard ends up cleared.
	rebuilt := make(map[*Disk]volumeShards, len(dn.children))
	for _, c := range dn.children {
		rebuilt[c.(*Disk)] = make(volumeShards)
	}

	for _, shard := range actualByKey {
		// getOrCreateDisk may hand back a child that was not present when
		// the loop above ran, so the lookup below must tolerate a miss.
		disk := dn.getOrCreateDisk(shard.DiskType)
		byVolume, ok := rebuilt[disk]
		if !ok {
			byVolume = make(volumeShards)
			rebuilt[disk] = byVolume
		}
		byDisk, ok := byVolume[shard.VolumeId]
		if !ok {
			byDisk = make(map[types.DiskId]*erasure_coding.EcVolumeInfo, 1)
			byVolume[shard.VolumeId] = byDisk
		}
		byDisk[types.DiskId(shard.DiskId)] = shard
	}

	// Install each finished map in one step under the disk's own lock.
	for disk, byVolume := range rebuilt {
		disk.ecShardsLock.Lock()
		disk.ecShards = byVolume
		disk.ecShardsLock.Unlock()
	}
}
// DeltaUpdateEcShards applies an incremental change to this DataNode: every
// entry in newShards is passed to AddOrUpdateEcShard, and afterwards every
// entry in deletedShards is passed to DeleteEcShard.
func (dn *DataNode) DeltaUpdateEcShards(newShards, deletedShards []*erasure_coding.EcVolumeInfo) {
	// Additions are applied before removals, in slice order.
	for i := range newShards {
		dn.AddOrUpdateEcShard(newShards[i])
	}
	for i := range deletedShards {
		dn.DeleteEcShard(deletedShards[i])
	}
}
// AddOrUpdateEcShard forwards s to the child disk selected by s.DiskType,
// obtained through getOrCreateDisk, by calling that disk's AddOrUpdateEcShard.
//
// NOTE(review): this method does not take dn's lock itself around
// getOrCreateDisk; confirm whether callers are expected to serialize access.
func (dn *DataNode) AddOrUpdateEcShard(s *erasure_coding.EcVolumeInfo) {
	dn.getOrCreateDisk(s.DiskType).AddOrUpdateEcShard(s)
}
// DeleteEcShard forwards s to the child disk selected by s.DiskType, obtained
// through getOrCreateDisk, by calling that disk's DeleteEcShard.
//
// NOTE(review): this method does not take dn's lock itself around
// getOrCreateDisk; confirm whether callers are expected to serialize access.
func (dn *DataNode) DeleteEcShard(s *erasure_coding.EcVolumeInfo) {
	dn.getOrCreateDisk(s.DiskType).DeleteEcShard(s)
}
// HasVolumesById reports whether any child disk of this DataNode answers true
// to Disk.HasVolumesById for volumeId. The node's read lock is held while the
// children are checked, and the scan stops at the first match.
func (dn *DataNode) HasVolumesById(volumeId needle.VolumeId) (hasVolumeId bool) {
	dn.RLock()
	defer dn.RUnlock()

	for _, child := range dn.children {
		if d := child.(*Disk); d.HasVolumesById(volumeId) {
			hasVolumeId = true
			break
		}
	}
	return hasVolumeId
}