master: grow rack-spanning volumes once per DC, capped at copy_N (#9835)

* master: grow rack-spanning volumes once per DC, capped at copy_N

  The periodic rack-aware growth scan grew once per rack. For rack-spanning replication (DiffRackCount > 0) a single logical volume already covers every rack the placement needs, so a crowded volume made every rack report should-grow and the scan created racks×step volumes instead of step: with "010" across two racks that is 2 racks × step 2 = 4 logical (8 physical) volumes. Plan one DC-wide grow for rack-spanning replication, and cap the per-event step at master.volume_growth.copy_N so lowering it reduces periodic growth.

* master: distribute lastGrowCount evenly across uneven DCs

  The non-rack-spanning grow divisor used the current DC's rack count, so DCs with different rack counts each over-grew. Sum every rack up front and divide lastGrowCount by that global count instead.
2026-06-13 23:36:45 +03:00 · 2026-06-05 12:39:59 -07:00
parent ab7be7867d
commit 6bd0091c72
4 changed files with 253 additions and 28 deletions
@@ -3,7 +3,6 @@ package weed_server
 import (
 	"context"
 	"fmt"
-	"math"
 	"math/rand/v2"
 	"strings"
 	"sync"
@@ -89,22 +88,12 @@ func (ms *MasterServer) ProcessGrowRequest() {
 				if err != nil {
 					glog.V(0).Infof("volume grow request failed: %+v", err)
 				}
-				writableVolumes := vl.CloneWritableVolumes()
-				for dcId, racks := range dcs {
-					for _, rackId := range racks {
-						if vl.ShouldGrowVolumesByDcAndRack(&writableVolumes, dcId, rackId) {
-							vgr.DataCenter = string(dcId)
-							vgr.Rack = string(rackId)
-							if lastGrowCount > 0 {
-								vgr.WritableVolumeCount = uint32(math.Ceil(float64(lastGrowCount) / float64(len(dcs)*len(racks))))
-							} else {
-								vgr.WritableVolumeCount = volumeGrowStepCount
-							}
-
-							if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
-								glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", dcId, rackId, err)
-							}
-						}
+				for _, plan := range vl.PlanRackAwareGrowth(dcs, lastGrowCount, volumeGrowStepCount) {
+					vgr.DataCenter = plan.DataCenter
+					vgr.Rack = plan.Rack
+					vgr.WritableVolumeCount = plan.WritableVolumeCount
+					if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
+						glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", plan.DataCenter, plan.Rack, err)
 					}
 				}
 			}
@@ -98,20 +98,25 @@ func NewDefaultVolumeGrowth() *VolumeGrowth {
 	return &VolumeGrowth{}
 }

+// VolumeGrowthCountForCopies reports how many logical volumes one grow event
+// should create for a replication setting with copyCount replicas, as
+// configured through master.volume_growth.copy_N. Copy counts other than 1, 2,
+// or 3 share the CopyOtherCount setting.
+func VolumeGrowthCountForCopies(copyCount int) uint32 {
+	// Start from the catch-all setting and override it for the copy counts
+	// that have a dedicated one.
+	count := VolumeGrowStrategy.CopyOtherCount
+	switch copyCount {
+	case 1:
+		count = VolumeGrowStrategy.Copy1Count
+	case 2:
+		count = VolumeGrowStrategy.Copy2Count
+	case 3:
+		count = VolumeGrowStrategy.Copy3Count
+	}
+	return count
+}
+
 // one replication type may need rp.GetCopyCount() actual volumes
 // given copyCount, how many logical volumes to create
 func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count uint32) {
-	switch copyCount {
-	case 1:
-		count = VolumeGrowStrategy.Copy1Count
-	case 2:
-		count = VolumeGrowStrategy.Copy2Count
-	case 3:
-		count = VolumeGrowStrategy.Copy3Count
-	default:
-		count = VolumeGrowStrategy.CopyOtherCount
-	}
-	return
+	return VolumeGrowthCountForCopies(copyCount)
 }

 func (vg *VolumeGrowth) AutomaticGrowByType(option *VolumeGrowOption, grpcDialOption grpc.DialOption, topo *Topology, targetCount uint32) (result []*master_pb.VolumeLocation, err error) {
@@ -696,6 +696,70 @@ func (vl *VolumeLayout) ShouldGrowVolumesByDcAndRack(writables *[]needle.VolumeI
 	return true
 }

+// RackGrowPlan is one volume grow action produced by the periodic rack-aware
+// growth scan: grow WritableVolumeCount writable volumes in DataCenter, pinned
+// to Rack when Rack is set. An empty Rack means the grow is DC-wide.
+type RackGrowPlan struct {
+	DataCenter          string
+	Rack                string
+	WritableVolumeCount uint32
+}
+
+// PlanRackAwareGrowth builds the list of grow actions for the periodic
+// rack-aware scan, so that every location able to serve writes keeps a
+// non-crowded writable volume.
+//
+// dcs maps each data center to its racks. stepCount is the default per-event
+// increment; it is lowered to the configured master.volume_growth.copy_N when
+// that is smaller, so reducing copy_N reduces periodic growth. A non-zero
+// lastGrowCount replaces the step: it is split, rounding up, across the data
+// centers for rack-spanning replication, or across all racks of all data
+// centers otherwise.
+//
+// For rack-spanning replication (DiffRackCount > 0) a single logical volume
+// already covers the racks the placement requires, so
+// ShouldGrowVolumesByDcAndRack gives the same answer for every rack in a DC.
+// One grow per rack would create racks×count volumes where count is enough,
+// so a single DC-wide plan (empty Rack) is emitted per data center instead.
+func (vl *VolumeLayout) PlanRackAwareGrowth(dcs map[NodeId][]NodeId, lastGrowCount, stepCount uint32) (plans []RackGrowPlan) {
+	writableSnapshot := vl.CloneWritableVolumes()
+	stepCount = min(stepCount, VolumeGrowthCountForCopies(vl.rp.GetCopyCount()))
+
+	// Count every rack of every DC up front: the per-rack divisor has to be
+	// global, otherwise DCs with different rack counts would each over-grow.
+	var totalRacks uint32
+	for _, racks := range dcs {
+		totalRacks += uint32(len(racks))
+	}
+
+	// Both per-plan sizes are loop invariant, so resolve them once.
+	perDcCount, perRackCount := stepCount, stepCount
+	if lastGrowCount > 0 {
+		perDcCount = ceilDiv(lastGrowCount, uint32(len(dcs)))
+		perRackCount = ceilDiv(lastGrowCount, totalRacks)
+	}
+
+	rackSpanning := vl.rp.DiffRackCount > 0
+	for dcId, racks := range dcs {
+		dcName := string(dcId)
+		if rackSpanning {
+			// One DC-wide check and at most one DC-wide plan.
+			if vl.ShouldGrowVolumesByDcAndRack(&writableSnapshot, dcId, "") {
+				plans = append(plans, RackGrowPlan{DataCenter: dcName, WritableVolumeCount: perDcCount})
+			}
+			continue
+		}
+		for _, rackId := range racks {
+			if vl.ShouldGrowVolumesByDcAndRack(&writableSnapshot, dcId, rackId) {
+				plans = append(plans, RackGrowPlan{DataCenter: dcName, Rack: string(rackId), WritableVolumeCount: perRackCount})
+			}
+		}
+	}
+	return plans
+}
+
+// ceilDiv returns a divided by b, rounded up to the next whole number.
+// It returns 0 when b is 0 so callers never divide by zero.
+//
+// The result is built from the quotient and the remainder rather than the
+// usual (a + b - 1) / b, because that sum wraps around uint32 for large a and
+// would silently yield a far too small result.
+func ceilDiv(a, b uint32) uint32 {
+	if b == 0 {
+		return 0
+	}
+	q := a / b
+	if a%b != 0 {
+		// Cannot overflow: a non-zero remainder implies b >= 2, so q is at
+		// most half of the uint32 range.
+		q++
+	}
+	return q
+}
+
 func (vl *VolumeLayout) GetWritableVolumeCount() (active, crowded int) {
 	vl.accessLock.RLock()
 	defer vl.accessLock.RUnlock()
@@ -77,3 +77,170 @@ func TestShouldGrowVolumesByDcAndRack_Issue8986(t *testing.T) {
 		t.Error("rack3 should NOT need growth — the DC already has non-crowded writable volumes that can serve writes")
 	}
 }
+
+// Topology from https://github.com/seaweedfs/seaweedfs/issues/9832
+// 1 DC, 2 racks, 1 server each, replication 010. Both servers hold a replica
+// of the same volume (id 12), which is crowded: its size of 28650 (28.65 GB)
+// is near the 30 GB size limit (the tests below call setupWithLimit with 30000).
+var topologyLayout9832 = `
+{
+  "datacenter1":{
+    "rack1":{
+      "node-a":{
+        "ip":"10.0.0.1",
+        "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
+        "limit":30
+      }
+    },
+    "rack2":{
+      "node-b":{
+        "ip":"10.0.0.2",
+        "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
+        "limit":30
+      }
+    }
+  }
+}
+`
+
+// Regression test for https://github.com/seaweedfs/seaweedfs/issues/9832
+//
+// A crowded "010" volume made the periodic rack-aware scan return true for
+// every rack in the DC, so it grew once per rack with a hardcoded step of 2:
+// 2 racks × 2 = 4 logical volumes (8 physical). It also ignored a lowered
+// master.volume_growth.copy_2. PlanRackAwareGrowth must instead plan a single
+// DC-wide grow capped at copy_N.
+func TestPlanRackAwareGrowth_Issue9832(t *testing.T) {
+	defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
+	VolumeGrowStrategy.Copy2Count = 1 // user's master.volume_growth.copy_2
+
+	topo := setupWithLimit(t, topologyLayout9832, 30000)
+	placement, _ := super_block.NewReplicaPlacementFromString("010")
+	layout := topo.GetVolumeLayout("bucket-nexus", placement, needle.EMPTY_TTL, types.HardDriveType)
+
+	if n := len(layout.CloneWritableVolumes()); n != 1 {
+		t.Fatalf("expected 1 writable volume, got %d", n)
+	}
+
+	// Precondition: the crowded volume makes each rack report "should grow".
+	// This is what the old per-rack loop multiplied.
+	snapshot := layout.CloneWritableVolumes()
+	for _, rack := range []NodeId{"rack1", "rack2"} {
+		if !layout.ShouldGrowVolumesByDcAndRack(&snapshot, "datacenter1", rack) {
+			t.Fatal("expected both racks to report should-grow for the crowded volume")
+		}
+	}
+
+	plans := layout.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
+	if len(plans) != 1 {
+		t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
+	}
+	plan := plans[0]
+	if plan.Rack != "" {
+		t.Errorf("expected DC-wide grow (empty rack), got rack %q", plan.Rack)
+	}
+	if plan.WritableVolumeCount != 1 {
+		t.Errorf("expected copy_2=1 logical volume, got %d", plan.WritableVolumeCount)
+	}
+}
+
+// With copy_2 at its default of 6, the per-event step of 2 is kept as is (it
+// is not raised to copy_2): the fix only removes the per-rack multiplication.
+func TestPlanRackAwareGrowth_DefaultStepNotMultiplied(t *testing.T) {
+	defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
+	VolumeGrowStrategy.Copy2Count = 6 // default
+
+	topo := setupWithLimit(t, topologyLayout9832, 30000)
+	placement, _ := super_block.NewReplicaPlacementFromString("010")
+	layout := topo.GetVolumeLayout("bucket-nexus", placement, needle.EMPTY_TTL, types.HardDriveType)
+
+	plans := layout.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
+	if len(plans) != 1 {
+		t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
+	}
+	if got := plans[0].WritableVolumeCount; got != 2 {
+		t.Errorf("expected step count 2 (min of step 2 and copy_2 6), got %d", got)
+	}
+}
+
+// When the only "010" volume is not crowded, the scan must plan nothing.
+func TestPlanRackAwareGrowth_NotCrowdedNoGrowth(t *testing.T) {
+	fixture := `
+{
+  "datacenter1":{
+    "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } },
+    "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } }
+  }
+}
+`
+	topo := setupWithLimit(t, fixture, 30000)
+	placement, _ := super_block.NewReplicaPlacementFromString("010")
+	layout := topo.GetVolumeLayout("c", placement, needle.EMPTY_TTL, types.HardDriveType)
+
+	plans := layout.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
+	if len(plans) != 0 {
+		t.Fatalf("expected no growth for non-crowded volume, got %+v", plans)
+	}
+}
+
+// Replication that does not span racks ("000") is still planned rack by rack:
+// the rack that has no writable volume gets a grow pinned to it.
+func TestPlanRackAwareGrowth_PerRackForNonRackSpanning(t *testing.T) {
+	fixture := `
+{
+  "datacenter1":{
+    "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":1000, "replication":"000", "collection":"c"} ], "limit":30 } },
+    "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[], "limit":30 } }
+  }
+}
+`
+	topo := setupWithLimit(t, fixture, 30000)
+	placement, _ := super_block.NewReplicaPlacementFromString("000")
+	layout := topo.GetVolumeLayout("c", placement, needle.EMPTY_TTL, types.HardDriveType)
+
+	plans := layout.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
+	if len(plans) != 1 {
+		t.Fatalf("expected 1 per-rack grow, got %d: %+v", len(plans), plans)
+	}
+	if rack := plans[0].Rack; rack != "rack2" {
+		t.Errorf("expected grow pinned to empty rack2, got %q", rack)
+	}
+}
+
+// lastGrowCount is split evenly over all grow targets even when the DCs have
+// different numbers of racks: every crowded "000" rack receives
+// ceilDiv(lastGrowCount, totalRacks), so the sum matches the request instead
+// of over-growing per DC.
+func TestPlanRackAwareGrowth_EvenDistributionAcrossUnevenDCs(t *testing.T) {
+	fixture := `
+{
+  "dc1":{
+    "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
+    "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":2, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
+  },
+  "dc2":{
+    "rack3":{ "node-c":{ "ip":"10.0.0.3", "volumes":[ {"id":3, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
+    "rack4":{ "node-d":{ "ip":"10.0.0.4", "volumes":[ {"id":4, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
+    "rack5":{ "node-e":{ "ip":"10.0.0.5", "volumes":[ {"id":5, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
+  }
+}
+`
+	topo := setupWithLimit(t, fixture, 30000)
+	placement, _ := super_block.NewReplicaPlacementFromString("000")
+	layout := topo.GetVolumeLayout("c", placement, needle.EMPTY_TTL, types.HardDriveType)
+
+	plans := layout.PlanRackAwareGrowth(topo.ListDCAndRacks(), 10, 2)
+	if len(plans) != 5 {
+		t.Fatalf("expected a grow per crowded rack (5), got %d: %+v", len(plans), plans)
+	}
+
+	var sum uint32
+	for i := range plans {
+		plan := plans[i]
+		// ceilDiv(10, 5 racks) == 2 for every rack.
+		if plan.WritableVolumeCount != 2 {
+			t.Errorf("expected even per-rack count 2, got %d for %s/%s", plan.WritableVolumeCount, plan.DataCenter, plan.Rack)
+		}
+		sum += plan.WritableVolumeCount
+	}
+	if sum != 10 {
+		t.Errorf("expected total grow 10, got %d", sum)
+	}
+}
+
+// restoreCopyCounts writes copy1 and copy2 back into the package-level
+// VolumeGrowStrategy. Tests defer it with the values captured before they
+// override Copy1Count or Copy2Count, so the override does not outlive the test.
+func restoreCopyCounts(copy1, copy2 uint32) {
+	VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count = copy1, copy2
+}