master: grow rack-spanning volumes once per DC, capped at copy_N (#9835)

* master: grow rack-spanning volumes once per DC, capped at copy_N

The periodic rack-aware growth scan grew once per rack. For rack-spanning
replication (DiffRackCount > 0) a single logical volume already covers every
rack the placement needs, so a crowded volume made every rack report
should-grow and the scan created racks × step volumes instead of step: with
"010" across two racks that is 2 racks × step 2 = 4 logical (8 physical) volumes.

Plan one DC-wide grow for rack-spanning replication, and cap the per-event
step at master.volume_growth.copy_N so lowering it reduces periodic growth.

* master: distribute lastGrowCount evenly across uneven DCs

The non-rack-spanning grow divisor used the current DC's rack count, so DCs
with different rack counts each over-grew. Sum every rack up front and divide
lastGrowCount by that global count instead.
This commit is contained in:
Chris Lu
2026-06-05 12:39:59 -07:00
committed by GitHub
parent ab7be7867d
commit 6bd0091c72
4 changed files with 253 additions and 28 deletions
+6 -17
View File
@@ -3,7 +3,6 @@ package weed_server
import (
"context"
"fmt"
"math"
"math/rand/v2"
"strings"
"sync"
@@ -89,22 +88,12 @@ func (ms *MasterServer) ProcessGrowRequest() {
if err != nil {
glog.V(0).Infof("volume grow request failed: %+v", err)
}
writableVolumes := vl.CloneWritableVolumes()
for dcId, racks := range dcs {
for _, rackId := range racks {
if vl.ShouldGrowVolumesByDcAndRack(&writableVolumes, dcId, rackId) {
vgr.DataCenter = string(dcId)
vgr.Rack = string(rackId)
if lastGrowCount > 0 {
vgr.WritableVolumeCount = uint32(math.Ceil(float64(lastGrowCount) / float64(len(dcs)*len(racks))))
} else {
vgr.WritableVolumeCount = volumeGrowStepCount
}
if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", dcId, rackId, err)
}
}
for _, plan := range vl.PlanRackAwareGrowth(dcs, lastGrowCount, volumeGrowStepCount) {
vgr.DataCenter = plan.DataCenter
vgr.Rack = plan.Rack
vgr.WritableVolumeCount = plan.WritableVolumeCount
if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", plan.DataCenter, plan.Rack, err)
}
}
}
+16 -11
View File
@@ -98,20 +98,25 @@ func NewDefaultVolumeGrowth() *VolumeGrowth {
return &VolumeGrowth{}
}
// VolumeGrowthCountForCopies reports how many logical volumes should be created
// at once for a replication setting with copyCount replica copies. The value is
// read from VolumeGrowStrategy (master.volume_growth.copy_N): copy counts 1, 2
// and 3 have their own setting and every other copy count shares CopyOtherCount.
func VolumeGrowthCountForCopies(copyCount int) uint32 {
	var count uint32
	switch {
	case copyCount == 1:
		count = VolumeGrowStrategy.Copy1Count
	case copyCount == 2:
		count = VolumeGrowStrategy.Copy2Count
	case copyCount == 3:
		count = VolumeGrowStrategy.Copy3Count
	default:
		count = VolumeGrowStrategy.CopyOtherCount
	}
	return count
}
// findVolumeCount reports, for the given copyCount, how many logical volumes
// to create at once. One replication type may need rp.GetCopyCount() actual
// volumes per logical volume.
//
// The lookup is delegated to VolumeGrowthCountForCopies so the copy_N mapping
// lives in exactly one place.
func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count uint32) {
	return VolumeGrowthCountForCopies(copyCount)
}
func (vg *VolumeGrowth) AutomaticGrowByType(option *VolumeGrowOption, grpcDialOption grpc.DialOption, topo *Topology, targetCount uint32) (result []*master_pb.VolumeLocation, err error) {
+64
View File
@@ -696,6 +696,70 @@ func (vl *VolumeLayout) ShouldGrowVolumesByDcAndRack(writables *[]needle.VolumeI
return true
}
// RackGrowPlan describes a single volume grow action emitted by the periodic
// rack-aware growth scan.
type RackGrowPlan struct {
	// DataCenter and Rack name the location to grow in. An empty Rack means
	// the grow is DC-wide.
	DataCenter, Rack string
	// WritableVolumeCount is the writable volume count requested by this action.
	WritableVolumeCount uint32
}
// PlanRackAwareGrowth computes the grow actions needed so that every location
// able to serve writes keeps a non-crowded writable volume.
//
// dcs maps each data center to its racks. stepCount is the default per-event
// increment; it is lowered to the configured copy_N for this layout's copy
// count, so reducing master.volume_growth.copy_N reduces periodic growth.
// When lastGrowCount is non-zero it replaces the default increment and is
// spread evenly over the grow targets (rounded up).
//
// For rack-spanning replication (DiffRackCount > 0) a single logical volume
// already covers the racks the placement requires, so
// ShouldGrowVolumesByDcAndRack gives the same answer for every rack in a DC.
// Growing once per rack would multiply the number of new volumes by the rack
// count, so one DC-wide grow (empty Rack) is planned per data center instead.
// Otherwise one grow is planned for each rack that reports should-grow.
func (vl *VolumeLayout) PlanRackAwareGrowth(dcs map[NodeId][]NodeId, lastGrowCount, stepCount uint32) (plans []RackGrowPlan) {
	writables := vl.CloneWritableVolumes()

	// The default increment never exceeds the configured copy_N.
	if limit := VolumeGrowthCountForCopies(vl.rp.GetCopyCount()); stepCount > limit {
		stepCount = limit
	}
	rackSpanning := vl.rp.DiffRackCount > 0

	// Count every rack in every DC so the per-rack divisor is global; a
	// per-DC divisor would make DCs with different rack counts over-grow.
	totalRacks := uint32(0)
	for _, racks := range dcs {
		totalRacks += uint32(len(racks))
	}

	// Both increments are loop-invariant: the default step, or an even share
	// of lastGrowCount when one was supplied.
	perDcCount, perRackCount := stepCount, stepCount
	if lastGrowCount > 0 {
		perDcCount = ceilDiv(lastGrowCount, uint32(len(dcs)))
		perRackCount = ceilDiv(lastGrowCount, totalRacks)
	}

	for dcId, racks := range dcs {
		if rackSpanning {
			if vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, "") {
				plans = append(plans, RackGrowPlan{
					DataCenter:          string(dcId),
					WritableVolumeCount: perDcCount,
				})
			}
			continue
		}
		for _, rackId := range racks {
			if vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, rackId) {
				plans = append(plans, RackGrowPlan{
					DataCenter:          string(dcId),
					Rack:                string(rackId),
					WritableVolumeCount: perRackCount,
				})
			}
		}
	}
	return plans
}
// ceilDiv returns a divided by b, rounded up to the next integer. It returns
// 0 when b is 0 rather than panicking on a division by zero.
func ceilDiv(a, b uint32) uint32 {
	if b == 0 {
		return 0
	}
	// Adjust the truncated quotient instead of computing (a + b - 1) / b:
	// that sum wraps around in uint32 for large a and yields a wrong result.
	q := a / b
	if a%b != 0 {
		q++
	}
	return q
}
func (vl *VolumeLayout) GetWritableVolumeCount() (active, crowded int) {
vl.accessLock.RLock()
defer vl.accessLock.RUnlock()
+167
View File
@@ -77,3 +77,170 @@ func TestShouldGrowVolumesByDcAndRack_Issue8986(t *testing.T) {
t.Error("rack3 should NOT need growth — the DC already has non-crowded writable volumes that can serve writes")
}
}
// topologyLayout9832 is the topology from
// https://github.com/seaweedfs/seaweedfs/issues/9832:
// 1 DC, 2 racks, 1 server each, replication 010. Both servers hold a replica
// of the same volume (id 12), which is crowded (28.65 GB) near the 30 GB size
// limit.
var topologyLayout9832 = `
{
"datacenter1":{
"rack1":{
"node-a":{
"ip":"10.0.0.1",
"volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
"limit":30
}
},
"rack2":{
"node-b":{
"ip":"10.0.0.2",
"volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
"limit":30
}
}
}
}
`
// TestPlanRackAwareGrowth_Issue9832 reproduces
// https://github.com/seaweedfs/seaweedfs/issues/9832
//
// A crowded "010" volume made the periodic rack-aware scan return true for
// every rack in the DC, so it grew once per rack with a hardcoded step of 2 —
// 2 racks × 2 = 4 logical volumes (8 physical) — and ignored a lowered
// master.volume_growth.copy_2. PlanRackAwareGrowth now plans a single DC-wide
// grow capped at copy_N.
func TestPlanRackAwareGrowth_Issue9832(t *testing.T) {
	defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
	VolumeGrowStrategy.Copy2Count = 1 // user's master.volume_growth.copy_2

	topo := setupWithLimit(t, topologyLayout9832, 30000)
	rp, err := super_block.NewReplicaPlacementFromString("010")
	if err != nil {
		t.Fatalf("parse replica placement: %v", err)
	}
	vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType)

	writables := vl.CloneWritableVolumes()
	if len(writables) != 1 {
		t.Fatalf("expected 1 writable volume, got %d", len(writables))
	}

	// The crowded volume makes both racks report "should grow" — the source of
	// the old per-rack multiplication.
	if !vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack1") ||
		!vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack2") {
		t.Fatal("expected both racks to report should-grow for the crowded volume")
	}

	plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
	if len(plans) != 1 {
		t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
	}
	if plans[0].Rack != "" {
		t.Errorf("expected DC-wide grow (empty rack), got rack %q", plans[0].Rack)
	}
	if plans[0].WritableVolumeCount != 1 {
		t.Errorf("expected copy_2=1 logical volume, got %d", plans[0].WritableVolumeCount)
	}
}
// TestPlanRackAwareGrowth_DefaultStepNotMultiplied checks that with the
// default copy_2 the per-event step is preserved (not increased): the fix only
// removes the per-rack multiplication.
func TestPlanRackAwareGrowth_DefaultStepNotMultiplied(t *testing.T) {
	defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
	VolumeGrowStrategy.Copy2Count = 6 // default

	topo := setupWithLimit(t, topologyLayout9832, 30000)
	rp, err := super_block.NewReplicaPlacementFromString("010")
	if err != nil {
		t.Fatalf("parse replica placement: %v", err)
	}
	vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType)

	plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
	if len(plans) != 1 {
		t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
	}
	if plans[0].WritableVolumeCount != 2 {
		t.Errorf("expected step count 2 (min of step 2 and copy_2 6), got %d", plans[0].WritableVolumeCount)
	}
}
// TestPlanRackAwareGrowth_NotCrowdedNoGrowth checks that a non-crowded "010"
// volume needs no growth at all.
func TestPlanRackAwareGrowth_NotCrowdedNoGrowth(t *testing.T) {
	layout := `
{
"datacenter1":{
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } },
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } }
}
}
`
	topo := setupWithLimit(t, layout, 30000)
	rp, err := super_block.NewReplicaPlacementFromString("010")
	if err != nil {
		t.Fatalf("parse replica placement: %v", err)
	}
	vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)

	if plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2); len(plans) != 0 {
		t.Fatalf("expected no growth for non-crowded volume, got %+v", plans)
	}
}
// TestPlanRackAwareGrowth_PerRackForNonRackSpanning checks that
// non-rack-spanning replication ("000") still grows per rack: a rack without a
// writable volume gets its own grow.
func TestPlanRackAwareGrowth_PerRackForNonRackSpanning(t *testing.T) {
	layout := `
{
"datacenter1":{
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":1000, "replication":"000", "collection":"c"} ], "limit":30 } },
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[], "limit":30 } }
}
}
`
	topo := setupWithLimit(t, layout, 30000)
	rp, err := super_block.NewReplicaPlacementFromString("000")
	if err != nil {
		t.Fatalf("parse replica placement: %v", err)
	}
	vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)

	plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
	if len(plans) != 1 {
		t.Fatalf("expected 1 per-rack grow, got %d: %+v", len(plans), plans)
	}
	if plans[0].Rack != "rack2" {
		t.Errorf("expected grow pinned to empty rack2, got %q", plans[0].Rack)
	}
}
// TestPlanRackAwareGrowth_EvenDistributionAcrossUnevenDCs checks that
// lastGrowCount is spread evenly across all grow targets even when DCs have
// different rack counts: each crowded "000" rack gets ceilDiv(lastGrowCount,
// totalRacks), so the total matches the request rather than over-growing per DC.
func TestPlanRackAwareGrowth_EvenDistributionAcrossUnevenDCs(t *testing.T) {
	layout := `
{
"dc1":{
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":2, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
},
"dc2":{
"rack3":{ "node-c":{ "ip":"10.0.0.3", "volumes":[ {"id":3, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
"rack4":{ "node-d":{ "ip":"10.0.0.4", "volumes":[ {"id":4, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
"rack5":{ "node-e":{ "ip":"10.0.0.5", "volumes":[ {"id":5, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
}
}
`
	topo := setupWithLimit(t, layout, 30000)
	rp, err := super_block.NewReplicaPlacementFromString("000")
	if err != nil {
		t.Fatalf("parse replica placement: %v", err)
	}
	vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)

	plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 10, 2)
	if len(plans) != 5 {
		t.Fatalf("expected a grow per crowded rack (5), got %d: %+v", len(plans), plans)
	}

	total := uint32(0)
	for _, p := range plans {
		if p.WritableVolumeCount != 2 { // ceilDiv(10, 5 racks)
			t.Errorf("expected even per-rack count 2, got %d for %s/%s", p.WritableVolumeCount, p.DataCenter, p.Rack)
		}
		total += p.WritableVolumeCount
	}
	if total != 10 {
		t.Errorf("expected total grow 10, got %d", total)
	}
}
// restoreCopyCounts writes the given values back into
// VolumeGrowStrategy.Copy1Count and VolumeGrowStrategy.Copy2Count. Tests defer
// it with the values captured before they override the strategy.
func restoreCopyCounts(copy1, copy2 uint32) {
	VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count = copy1, copy2
}