mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
master: grow rack-spanning volumes once per DC, capped at copy_N (#9835)
* master: grow rack-spanning volumes once per DC, capped at copy_N The periodic rack-aware growth scan grew once per rack. For rack-spanning replication (DiffRackCount > 0) a single logical volume already covers every rack the placement needs, so a crowded volume made every rack report should-grow and the scan created racks×step volumes instead of step: with "010" across two racks that is 2 racks × step 2 = 4 logical (8 physical) volumes. Plan one DC-wide grow for rack-spanning replication, and cap the per-event step at master.volume_growth.copy_N so lowering it reduces periodic growth. * master: distribute lastGrowCount evenly across uneven DCs The non-rack-spanning grow divisor used the current DC's rack count, so DCs with different rack counts each over-grew. Sum every rack up front and divide lastGrowCount by that global count instead.
This commit is contained in:
@@ -3,7 +3,6 @@ package weed_server
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"math/rand/v2"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -89,22 +88,12 @@ func (ms *MasterServer) ProcessGrowRequest() {
|
||||
if err != nil {
|
||||
glog.V(0).Infof("volume grow request failed: %+v", err)
|
||||
}
|
||||
writableVolumes := vl.CloneWritableVolumes()
|
||||
for dcId, racks := range dcs {
|
||||
for _, rackId := range racks {
|
||||
if vl.ShouldGrowVolumesByDcAndRack(&writableVolumes, dcId, rackId) {
|
||||
vgr.DataCenter = string(dcId)
|
||||
vgr.Rack = string(rackId)
|
||||
if lastGrowCount > 0 {
|
||||
vgr.WritableVolumeCount = uint32(math.Ceil(float64(lastGrowCount) / float64(len(dcs)*len(racks))))
|
||||
} else {
|
||||
vgr.WritableVolumeCount = volumeGrowStepCount
|
||||
}
|
||||
|
||||
if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
|
||||
glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", dcId, rackId, err)
|
||||
}
|
||||
}
|
||||
for _, plan := range vl.PlanRackAwareGrowth(dcs, lastGrowCount, volumeGrowStepCount) {
|
||||
vgr.DataCenter = plan.DataCenter
|
||||
vgr.Rack = plan.Rack
|
||||
vgr.WritableVolumeCount = plan.WritableVolumeCount
|
||||
if _, err = ms.VolumeGrow(ctx, vgr); err != nil {
|
||||
glog.V(0).Infof("volume grow request for dc:%s rack:%s failed: %+v", plan.DataCenter, plan.Rack, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,20 +98,25 @@ func NewDefaultVolumeGrowth() *VolumeGrowth {
|
||||
return &VolumeGrowth{}
|
||||
}
|
||||
|
||||
// VolumeGrowthCountForCopies returns the configured number of logical volumes
|
||||
// to create at once for a given replica copy count (master.volume_growth.copy_N).
|
||||
func VolumeGrowthCountForCopies(copyCount int) uint32 {
|
||||
switch copyCount {
|
||||
case 1:
|
||||
return VolumeGrowStrategy.Copy1Count
|
||||
case 2:
|
||||
return VolumeGrowStrategy.Copy2Count
|
||||
case 3:
|
||||
return VolumeGrowStrategy.Copy3Count
|
||||
default:
|
||||
return VolumeGrowStrategy.CopyOtherCount
|
||||
}
|
||||
}
|
||||
|
||||
// one replication type may need rp.GetCopyCount() actual volumes
|
||||
// given copyCount, how many logical volumes to create
|
||||
func (vg *VolumeGrowth) findVolumeCount(copyCount int) (count uint32) {
|
||||
switch copyCount {
|
||||
case 1:
|
||||
count = VolumeGrowStrategy.Copy1Count
|
||||
case 2:
|
||||
count = VolumeGrowStrategy.Copy2Count
|
||||
case 3:
|
||||
count = VolumeGrowStrategy.Copy3Count
|
||||
default:
|
||||
count = VolumeGrowStrategy.CopyOtherCount
|
||||
}
|
||||
return
|
||||
return VolumeGrowthCountForCopies(copyCount)
|
||||
}
|
||||
|
||||
func (vg *VolumeGrowth) AutomaticGrowByType(option *VolumeGrowOption, grpcDialOption grpc.DialOption, topo *Topology, targetCount uint32) (result []*master_pb.VolumeLocation, err error) {
|
||||
|
||||
@@ -696,6 +696,70 @@ func (vl *VolumeLayout) ShouldGrowVolumesByDcAndRack(writables *[]needle.VolumeI
|
||||
return true
|
||||
}
|
||||
|
||||
// RackGrowPlan describes a single volume grow action emitted by the periodic
// rack-aware growth scan.
type RackGrowPlan struct {
	// DataCenter names the data center the grow targets.
	DataCenter string
	// Rack names the rack the grow targets; an empty Rack means the grow is
	// DC-wide.
	Rack string
	// WritableVolumeCount is the number of writable volumes requested by
	// this grow action.
	WritableVolumeCount uint32
}
|
||||
|
||||
// PlanRackAwareGrowth returns the grow actions needed so every location that
|
||||
// can serve writes keeps a non-crowded writable volume. stepCount is the
|
||||
// default per-event increment.
|
||||
//
|
||||
// For rack-spanning replication (DiffRackCount > 0) a single logical volume
|
||||
// already covers the racks the placement requires, so ShouldGrowVolumesByDcAndRack
|
||||
// returns the same result for every rack in a DC. Planning one grow per rack
|
||||
// would create racks×count too many volumes; plan one DC-wide grow instead.
|
||||
// The default increment is capped at the configured copy_N so lowering
|
||||
// master.volume_growth.copy_N reduces periodic growth.
|
||||
func (vl *VolumeLayout) PlanRackAwareGrowth(dcs map[NodeId][]NodeId, lastGrowCount, stepCount uint32) (plans []RackGrowPlan) {
|
||||
writables := vl.CloneWritableVolumes()
|
||||
if c := VolumeGrowthCountForCopies(vl.rp.GetCopyCount()); c < stepCount {
|
||||
stepCount = c
|
||||
}
|
||||
growOncePerDc := vl.rp.DiffRackCount > 0
|
||||
// Spread lastGrowCount evenly across all grow targets. Summing every rack
|
||||
// up front keeps the divisor global, so DCs with different rack counts do
|
||||
// not each over-grow from a per-DC divisor.
|
||||
var rackPairs uint32
|
||||
for _, racks := range dcs {
|
||||
rackPairs += uint32(len(racks))
|
||||
}
|
||||
for dcId, racks := range dcs {
|
||||
if growOncePerDc {
|
||||
if !vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, "") {
|
||||
continue
|
||||
}
|
||||
count := stepCount
|
||||
if lastGrowCount > 0 {
|
||||
count = ceilDiv(lastGrowCount, uint32(len(dcs)))
|
||||
}
|
||||
plans = append(plans, RackGrowPlan{DataCenter: string(dcId), WritableVolumeCount: count})
|
||||
continue
|
||||
}
|
||||
for _, rackId := range racks {
|
||||
if !vl.ShouldGrowVolumesByDcAndRack(&writables, dcId, rackId) {
|
||||
continue
|
||||
}
|
||||
count := stepCount
|
||||
if lastGrowCount > 0 {
|
||||
count = ceilDiv(lastGrowCount, rackPairs)
|
||||
}
|
||||
plans = append(plans, RackGrowPlan{DataCenter: string(dcId), Rack: string(rackId), WritableVolumeCount: count})
|
||||
}
|
||||
}
|
||||
return plans
|
||||
}
|
||||
|
||||
// ceilDiv returns a divided by b, rounded up to the next integer. It returns 0
// when b is 0 instead of dividing by zero.
func ceilDiv(a, b uint32) uint32 {
	if b == 0 {
		return 0
	}
	// Do not compute (a + b - 1) / b: the sum wraps around uint32 for large
	// operands and yields a quotient that is far too small.
	quotient := a / b
	if a%b != 0 {
		quotient++
	}
	return quotient
}
|
||||
|
||||
func (vl *VolumeLayout) GetWritableVolumeCount() (active, crowded int) {
|
||||
vl.accessLock.RLock()
|
||||
defer vl.accessLock.RUnlock()
|
||||
|
||||
@@ -77,3 +77,170 @@ func TestShouldGrowVolumesByDcAndRack_Issue8986(t *testing.T) {
|
||||
t.Error("rack3 should NOT need growth — the DC already has non-crowded writable volumes that can serve writes")
|
||||
}
|
||||
}
|
||||
|
||||
// topologyLayout9832 is the topology reported in
// https://github.com/seaweedfs/seaweedfs/issues/9832: one DC with two racks,
// one server per rack, replication 010, and a single crowded volume
// (28.65 GB) near the 30 GB size limit, replicated on both servers.
var topologyLayout9832 = `
{
  "datacenter1":{
    "rack1":{
      "node-a":{
        "ip":"10.0.0.1",
        "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
        "limit":30
      }
    },
    "rack2":{
      "node-b":{
        "ip":"10.0.0.2",
        "volumes":[ {"id":12, "size":28650, "replication":"010", "collection":"bucket-nexus"} ],
        "limit":30
      }
    }
  }
}
`
|
||||
|
||||
// Reproduces https://github.com/seaweedfs/seaweedfs/issues/9832
|
||||
//
|
||||
// A crowded "010" volume made the periodic rack-aware scan return true for
|
||||
// every rack in the DC, so it grew once per rack with a hardcoded step of 2 —
|
||||
// 2 racks × 2 = 4 logical volumes (8 physical) — and ignored a lowered
|
||||
// master.volume_growth.copy_2. PlanRackAwareGrowth now plans a single DC-wide
|
||||
// grow capped at copy_N.
|
||||
func TestPlanRackAwareGrowth_Issue9832(t *testing.T) {
|
||||
defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
|
||||
VolumeGrowStrategy.Copy2Count = 1 // user's master.volume_growth.copy_2
|
||||
|
||||
topo := setupWithLimit(t, topologyLayout9832, 30000)
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("010")
|
||||
vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
|
||||
if writables := vl.CloneWritableVolumes(); len(writables) != 1 {
|
||||
t.Fatalf("expected 1 writable volume, got %d", len(writables))
|
||||
}
|
||||
|
||||
// The crowded volume makes both racks report "should grow" — the source of
|
||||
// the old per-rack multiplication.
|
||||
writables := vl.CloneWritableVolumes()
|
||||
if !vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack1") ||
|
||||
!vl.ShouldGrowVolumesByDcAndRack(&writables, "datacenter1", "rack2") {
|
||||
t.Fatal("expected both racks to report should-grow for the crowded volume")
|
||||
}
|
||||
|
||||
plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
|
||||
if len(plans) != 1 {
|
||||
t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
|
||||
}
|
||||
if plans[0].Rack != "" {
|
||||
t.Errorf("expected DC-wide grow (empty rack), got rack %q", plans[0].Rack)
|
||||
}
|
||||
if plans[0].WritableVolumeCount != 1 {
|
||||
t.Errorf("expected copy_2=1 logical volume, got %d", plans[0].WritableVolumeCount)
|
||||
}
|
||||
}
|
||||
|
||||
// With the default copy_2 the per-event step is preserved (not increased): the
|
||||
// fix only removes the per-rack multiplication.
|
||||
func TestPlanRackAwareGrowth_DefaultStepNotMultiplied(t *testing.T) {
|
||||
defer restoreCopyCounts(VolumeGrowStrategy.Copy1Count, VolumeGrowStrategy.Copy2Count)
|
||||
VolumeGrowStrategy.Copy2Count = 6 // default
|
||||
|
||||
topo := setupWithLimit(t, topologyLayout9832, 30000)
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("010")
|
||||
vl := topo.GetVolumeLayout("bucket-nexus", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
|
||||
plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
|
||||
if len(plans) != 1 {
|
||||
t.Fatalf("expected 1 DC-wide grow, got %d: %+v", len(plans), plans)
|
||||
}
|
||||
if plans[0].WritableVolumeCount != 2 {
|
||||
t.Errorf("expected step count 2 (min of step 2 and copy_2 6), got %d", plans[0].WritableVolumeCount)
|
||||
}
|
||||
}
|
||||
|
||||
// A non-crowded "010" volume needs no growth at all.
|
||||
func TestPlanRackAwareGrowth_NotCrowdedNoGrowth(t *testing.T) {
|
||||
layout := `
|
||||
{
|
||||
"datacenter1":{
|
||||
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } },
|
||||
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":12, "size":1000, "replication":"010", "collection":"c"} ], "limit":30 } }
|
||||
}
|
||||
}
|
||||
`
|
||||
topo := setupWithLimit(t, layout, 30000)
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("010")
|
||||
vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
|
||||
if plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2); len(plans) != 0 {
|
||||
t.Fatalf("expected no growth for non-crowded volume, got %+v", plans)
|
||||
}
|
||||
}
|
||||
|
||||
// Non-rack-spanning replication ("000") still grows per rack: a rack without a
|
||||
// writable volume gets its own grow.
|
||||
func TestPlanRackAwareGrowth_PerRackForNonRackSpanning(t *testing.T) {
|
||||
layout := `
|
||||
{
|
||||
"datacenter1":{
|
||||
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":1000, "replication":"000", "collection":"c"} ], "limit":30 } },
|
||||
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[], "limit":30 } }
|
||||
}
|
||||
}
|
||||
`
|
||||
topo := setupWithLimit(t, layout, 30000)
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("000")
|
||||
vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
|
||||
plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 0, 2)
|
||||
if len(plans) != 1 {
|
||||
t.Fatalf("expected 1 per-rack grow, got %d: %+v", len(plans), plans)
|
||||
}
|
||||
if plans[0].Rack != "rack2" {
|
||||
t.Errorf("expected grow pinned to empty rack2, got %q", plans[0].Rack)
|
||||
}
|
||||
}
|
||||
|
||||
// lastGrowCount is spread evenly across all grow targets even when DCs have
|
||||
// different rack counts: each crowded "000" rack gets ceilDiv(lastGrowCount,
|
||||
// totalRacks), so the total matches the request rather than over-growing per DC.
|
||||
func TestPlanRackAwareGrowth_EvenDistributionAcrossUnevenDCs(t *testing.T) {
|
||||
layout := `
|
||||
{
|
||||
"dc1":{
|
||||
"rack1":{ "node-a":{ "ip":"10.0.0.1", "volumes":[ {"id":1, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
|
||||
"rack2":{ "node-b":{ "ip":"10.0.0.2", "volumes":[ {"id":2, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
|
||||
},
|
||||
"dc2":{
|
||||
"rack3":{ "node-c":{ "ip":"10.0.0.3", "volumes":[ {"id":3, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
|
||||
"rack4":{ "node-d":{ "ip":"10.0.0.4", "volumes":[ {"id":4, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } },
|
||||
"rack5":{ "node-e":{ "ip":"10.0.0.5", "volumes":[ {"id":5, "size":28650, "replication":"000", "collection":"c"} ], "limit":30 } }
|
||||
}
|
||||
}
|
||||
`
|
||||
topo := setupWithLimit(t, layout, 30000)
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("000")
|
||||
vl := topo.GetVolumeLayout("c", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
|
||||
plans := vl.PlanRackAwareGrowth(topo.ListDCAndRacks(), 10, 2)
|
||||
if len(plans) != 5 {
|
||||
t.Fatalf("expected a grow per crowded rack (5), got %d: %+v", len(plans), plans)
|
||||
}
|
||||
total := uint32(0)
|
||||
for _, p := range plans {
|
||||
if p.WritableVolumeCount != 2 { // ceilDiv(10, 5 racks)
|
||||
t.Errorf("expected even per-rack count 2, got %d for %s/%s", p.WritableVolumeCount, p.DataCenter, p.Rack)
|
||||
}
|
||||
total += p.WritableVolumeCount
|
||||
}
|
||||
if total != 10 {
|
||||
t.Errorf("expected total grow 10, got %d", total)
|
||||
}
|
||||
}
|
||||
|
||||
func restoreCopyCounts(copy1, copy2 uint32) {
|
||||
VolumeGrowStrategy.Copy1Count = copy1
|
||||
VolumeGrowStrategy.Copy2Count = copy2
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user