diff --git a/weed/server/master_grpc_server_volume.go b/weed/server/master_grpc_server_volume.go index 2e990b1a0..ca30287c6 100644 --- a/weed/server/master_grpc_server_volume.go +++ b/weed/server/master_grpc_server_volume.go @@ -3,7 +3,6 @@ package weed_server import ( "context" "fmt" - "math" "math/rand/v2" "strings" "sync" @@ -89,22 +88,12 @@ func (ms *MasterServer) ProcessGrowRequest() { if err != nil { glog.V(0).Infof("volume grow request failed: %+v", err) } - writableVolumes := vl.CloneWritableVolumes() - for dcId, racks := range dcs { - for _, rackId := range racks { - if vl.ShouldGrowVolumesByDcAndRack(&writableVolumes, dcId, rackId) { - vgr.DataCenter = string(dcId) - vgr.Rack = string(rackId) - if lastGrowCount > 0 { - vgr.WritableVolumeCount = uint32(math.Ceil(float64(lastGrowCount) / float64(len(dcs)*len(racks)))) - } else { - vgr.WritableVolumeCount = volumeGrowStepCount - } - - if _, err = ms.VolumeGrow(ctx, vgr); err != nil { - glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", dcId, rackId, err) - } - } + for _, plan := range vl.PlanRackAwareGrowth(dcs, lastGrowCount, volumeGrowStepCount) { + vgr.DataCenter = plan.DataCenter + vgr.Rack = plan.Rack + vgr.WritableVolumeCount = plan.WritableVolumeCount + if _, err = ms.VolumeGrow(ctx, vgr); err != nil { + glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", plan.DataCenter, plan.Rack, err) } } } diff --git a/weed/topology/volume_growth.go b/weed/topology/volume_growth.go index 7f607d1d5..92382c3d6 100644 --- a/weed/topology/volume_growth.go +++ b/weed/topology/volume_growth.go @@ -98,20 +98,25 @@ func NewDefaultVolumeGrowth() *VolumeGrowth { return &VolumeGrowth{} } +// VolumeGrowthCountForCopies returns the configured number of logical volumes +// to create at once for a given replica copy count (master.volume_growth.copy_N). +func VolumeGrowthCountForCopies(copyCount int) uint32 { + switch copyCount { + case 1: + return VolumeGrowStrategy.Copy1Count + case 2: + return VolumeGrowStrategy.Copy2Count + case 3: + return VolumeGrowStrategy.Copy3Count + default: + return VolumeGrowStrategy.CopyOtherCount + } +} + // one replication type may need rp.GetCopyCount() actual volumes // given copyCount, how many logical volumes to create func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count uint32) { - switch copyCount { - case 1: - count = VolumeGrowStrategy.Copy1Count - case 2: - count = VolumeGrowStrategy.Copy2Count - case 3: - count = VolumeGrowStrategy.Copy3Count - default: - count = VolumeGrowStrategy.CopyOtherCount - } - return + return VolumeGrowthCountForCopies(copyCount) } func (vg *VolumeGrowth) AutomaticGrowByType(option *VolumeGrowOption, grpcDialOption grpc.DialOption, topo *Topology, targetCount uint32) (result []*master_pb.VolumeLocation, err error) { diff --git a/weed/topology/volume_layout.go b/weed/topology/volume_layout.go index 1a0a60ed6..cdb4c5814 100644 --- a/weed/topology/volume_layout.go +++ b/weed/topology/volume_layout.go @@ -696,6 +696,70 @@ func (vl *VolumeLayout) ShouldGrowVolumesByDcAndRack(writables *[]needle.VolumeI return true } +// RackGrowPlan is one volume grow action produced by the periodic rack-aware +// growth scan. An empty Rack means the grow is DC-wide. +type RackGrowPlan struct { + DataCenter string + Rack string + WritableVolumeCount uint32 +} + +// PlanRackAwareGrowth returns the grow actions needed so every location that +// can serve writes keeps a non-crowded writable volume. stepCount is the +// default per-event increment. +// +// For rack-spanning replication (DiffRackCount > 0) a single logical volume +// already covers the racks the placement requires, so ShouldGrowVolumesByDcAndRack +// returns the same result for every rack in a DC. Planning one grow per rack +// would create racks×count too many volumes; plan one DC-wide grow instead. +// The default increment is capped at the configured copy_N so lowering +// master.volume_growth.copy_N reduces periodic growth. +func (vl *VolumeLayout) PlanRackAwareGrowth(dcs map[NodeId][]NodeId, lastGrowCount, stepCount uint32) (plans []RackGrowPlan) { + writables := vl.CloneWritableVolumes() + if c := VolumeGrowthCountForCopies(vl.rp.GetCopyCount()); c < stepCount { + stepCount = c + } + growOncePerDc := vl.rp.DiffRackCount > 0 + // Spread lastGrowCount evenly across all grow targets. Summing every rack + // up front keeps the divisor global, so DCs with different rack counts do + // not each over-grow from a per-DC divisor. + var rackPairs uint32 + for _, racks := range dcs { + rackPairs += uint32(len(racks)) + } + for dcId, racks := range dcs { + if growOncePerDc { + if !vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, "") { + continue + } + count := stepCount + if lastGrowCount > 0 { + count = ceilDiv(lastGrowCount, uint32(len(dcs))) + } + plans = append(plans, RackGrowPlan{DataCenter: string(dcId), WritableVolumeCount: count}) + continue + } + for _, rackId := range racks { + if !vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, rackId) { + continue + } + count := stepCount + if lastGrowCount > 0 { + count = ceilDiv(lastGrowCount, rackPairs) + } + plans = append(plans, RackGrowPlan{DataCenter: string(dcId), Rack: string(rackId), WritableVolumeCount: count}) + } + } + return plans +} + +func ceilDiv(a, b uint32) uint32 { + if b == 0 { + return 0 + } + return (a + b - 1) / b +} + func (vl *VolumeLayout) GetWritableVolumeCount() (active, crowded int) { vl.accessLock.RLock() defer vl.accessLock.RUnlock() diff --git a/weed/topology/volume_layout_grow_test.go b/weed/topology/volume_layout_grow_test.go index 999faeebc..7e00375c2 100644 --- a/weed/topology/volume_layout_grow_test.go +++ b/weed/topology/volume_layout_grow_test.go @@ -77,3 +77,170 @@ func TestShouldGrowVolumesByDcAndRack_Issue8986(t *testing.T) { t.Error("rack3 should NOT need growth — the DC already has non-crowded writable volumes that can serve writes") } } + +// Topology from https://github.com/seaweedfs/seaweedfs/issues/9832 +// 1 DC, 2 racks, 1 server each, replication 010, one crowded volume (28.65 GB) +// near the 30 GB size limit. +var topologyLayout9832 = ` +{ + "datacenter1":{ + "rack1":{ + "node-a":{ + "ip":"10.0.0.1", + "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ], + "limit":30 + } + }, + "rack2":{ + "node-b":{ + "ip":"10.0.0.2", + "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ], + "limit":30 + } + } + } +} +` + +// Reproduces https://github.com/seaweedfs/seaweedfs/issues/9832 +// +// A crowded "010" volume made the periodic rack-aware scan return true for +// every rack in the DC, so it grew once per rack with a hardcoded step of 2 — +// 2 racks × 2 = 4 logical volumes (8 physical) — and ignored a lowered +// master.volume_growth.copy_2. PlanRackAwareGrowth now plans a single DC-wide +// grow capped at copy_N. +func TestPlanRackAwareGrowth_Issue9832(t *testing.T) { + defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count) + VolumeGrowStrategy.Copy2Count = 1 // user's master.volume_growth.copy_2 + + topo := setupWithLimit(t, topologyLayout9832, 30000) + rp, _ := super_block.NewReplicaPlacementFromString("010") + vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType) + + if writables := vl.CloneWritableVolumes(); len(writables) != 1 { + t.Fatalf("expected 1 writable volume, got %d", len(writables)) + } + + // The crowded volume makes both racks report "should grow" — the source of + // the old per-rack multiplication. + writables := vl.CloneWritableVolumes() + if !vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack1") || + !vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack2") { + t.Fatal("expected both racks to report should-grow for the crowded volume") + } + + plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2) + if len(plans) != 1 { + t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans) + } + if plans[0].Rack != "" { + t.Errorf("expected DC-wide grow (empty rack), got rack %q", plans[0].Rack) + } + if plans[0].WritableVolumeCount != 1 { + t.Errorf("expected copy_2=1 logical volume, got %d", plans[0].WritableVolumeCount) + } +} + +// With the default copy_2 the per-event step is preserved (not increased): the +// fix only removes the per-rack multiplication. +func TestPlanRackAwareGrowth_DefaultStepNotMultiplied(t *testing.T) { + defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count) + VolumeGrowStrategy.Copy2Count = 6 // default + + topo := setupWithLimit(t, topologyLayout9832, 30000) + rp, _ := super_block.NewReplicaPlacementFromString("010") + vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType) + + plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2) + if len(plans) != 1 { + t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans) + } + if plans[0].WritableVolumeCount != 2 { + t.Errorf("expected step count 2 (min of step 2 and copy_2 6), got %d", plans[0].WritableVolumeCount) + } +} + +// A non-crowded "010" volume needs no growth at all. +func TestPlanRackAwareGrowth_NotCrowdedNoGrowth(t *testing.T) { + layout := ` +{ + "datacenter1":{ + "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } }, + "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } } + } +} +` + topo := setupWithLimit(t, layout, 30000) + rp, _ := super_block.NewReplicaPlacementFromString("010") + vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType) + + if plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2); len(plans) != 0 { + t.Fatalf("expected no growth for non-crowded volume, got %+v", plans) + } +} + +// Non-rack-spanning replication ("000") still grows per rack: a rack without a +// writable volume gets its own grow. +func TestPlanRackAwareGrowth_PerRackForNonRackSpanning(t *testing.T) { + layout := ` +{ + "datacenter1":{ + "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":1000, "replication":"000", "collection":"c"} ], "limit":30 } }, + "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[], "limit":30 } } + } +} +` + topo := setupWithLimit(t, layout, 30000) + rp, _ := super_block.NewReplicaPlacementFromString("000") + vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType) + + plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2) + if len(plans) != 1 { + t.Fatalf("expected 1 per-rack grow, got %d: %+v", len(plans), plans) + } + if plans[0].Rack != "rack2" { + t.Errorf("expected grow pinned to empty rack2, got %q", plans[0].Rack) + } +} + +// lastGrowCount is spread evenly across all grow targets even when DCs have +// different rack counts: each crowded "000" rack gets ceilDiv(lastGrowCount, +// totalRacks), so the total matches the request rather than over-growing per DC. +func TestPlanRackAwareGrowth_EvenDistributionAcrossUnevenDCs(t *testing.T) { + layout := ` +{ + "dc1":{ + "rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }, + "rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":2, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } } + }, + "dc2":{ + "rack3":{ "node-c":{ "ip":"10.0.0.3", "volumes":[ {"id":3, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }, + "rack4":{ "node-d":{ "ip":"10.0.0.4", "volumes":[ {"id":4, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }, + "rack5":{ "node-e":{ "ip":"10.0.0.5", "volumes":[ {"id":5, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } } + } +} +` + topo := setupWithLimit(t, layout, 30000) + rp, _ := super_block.NewReplicaPlacementFromString("000") + vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType) + + plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 10, 2) + if len(plans) != 5 { + t.Fatalf("expected a grow per crowded rack (5), got %d: %+v", len(plans), plans) + } + total := uint32(0) + for _, p := range plans { + if p.WritableVolumeCount != 2 { // ceilDiv(10, 5 racks) + t.Errorf("expected even per-rack count 2, got %d for %s/%s", p.WritableVolumeCount, p.DataCenter, p.Rack) + } + total += p.WritableVolumeCount + } + if total != 10 { + t.Errorf("expected total grow 10, got %d", total) + } +} + +func restoreCopyCounts(copy1, copy2 uint32) { + VolumeGrowStrategy.Copy1Count = copy1 + VolumeGrowStrategy.Copy2Count = copy2 +}