diff --git a/.github/workflows/ec-integration.yml b/.github/workflows/ec-integration.yml deleted file mode 100644 index 285c4be1f..000000000 --- a/.github/workflows/ec-integration.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: EC Integration Tests - -on: - push: - branches: [ master ] - paths: - - 'weed/admin/**' - - 'weed/worker/**' - - 'test/erasure_coding/admin_dockertest/**' - - '.github/workflows/ec-integration.yml' - pull_request: - branches: [ master ] - paths: - - 'weed/admin/**' - - 'weed/worker/**' - - 'test/erasure_coding/admin_dockertest/**' - - '.github/workflows/ec-integration.yml' - -jobs: - ec-integration-test: - runs-on: ubuntu-latest - timeout-minutes: 15 - - steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Set up Go - uses: actions/setup-go@v6 - with: - go-version-file: 'go.mod' - - - name: Build weed binary - run: | - cd weed - go build -o ../weed_bin - - - name: Run EC integration tests - run: | - cd test/erasure_coding/admin_dockertest - go test -v -timeout 15m ec_integration_test.go - - - name: Upload test logs on failure - if: failure() - uses: actions/upload-artifact@v7 - with: - name: ec-test-logs - path: test/erasure_coding/admin_dockertest/tmp/logs/ - retention-days: 7 diff --git a/test/erasure_coding/multidisk_shardloss_test.go b/test/erasure_coding/multidisk_shardloss_test.go new file mode 100644 index 000000000..5fceb8965 --- /dev/null +++ b/test/erasure_coding/multidisk_shardloss_test.go @@ -0,0 +1,190 @@ +package erasure_coding + +import ( + "context" + "fmt" + "path/filepath" + "regexp" + "strconv" + "strings" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/weed/shell" + "github.com/seaweedfs/seaweedfs/weed/storage/needle" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" +) + +// TestMultiDiskECBalanceNoShardLoss is the end-to-end regression for issue 9593. 
+// It runs a real cluster of multi-disk volume servers (3 servers x 4 disks), +// EC-encodes a volume, then runs ec.balance, asserting hard invariants the older +// integration tests only logged: +// +// - after encode the full set of 14 EC shards exists, +// - ec.balance never loses a shard (still 14 distinct shards afterwards), +// - shards end up spread across more than one disk per node, and +// - cluster.status counts physical disks (not one per node) and matches the +// real on-disk distribution. +func TestMultiDiskECBalanceNoShardLoss(t *testing.T) { + if testing.Short() { + t.Skip("Skipping multi-disk EC integration test in short mode") + } + + testDir := t.TempDir() + ctx, cancel := context.WithTimeout(context.Background(), 240*time.Second) + defer cancel() + + cluster, err := startMultiDiskCluster(ctx, testDir) + require.NoError(t, err) + defer cluster.Stop() + + require.NoError(t, waitForServer("127.0.0.1:9334", 30*time.Second)) + for i := 0; i < 3; i++ { + require.NoError(t, waitForServer(fmt.Sprintf("127.0.0.1:809%d", i), 30*time.Second)) + } + t.Log("waiting for multi-disk volume servers to register...") + time.Sleep(10 * time.Second) + + commandEnv := shell.NewCommandEnv(&shell.ShellOptions{ + Masters: stringPtr("127.0.0.1:9334"), + GrpcDialOption: grpc.WithInsecure(), + FilerGroup: stringPtr("default"), + }) + connectToMasterAndSync(ctx, t, commandEnv) + + // Upload enough small files that the volume holds real data to encode. 
+ var volumeId needle.VolumeId + for retry := 0; retry < 5; retry++ { + volumeId, err = uploadTestDataToMaster([]byte(strings.Repeat("multidisk-ec-9593 ", 64)), "127.0.0.1:9334") + if err == nil { + break + } + time.Sleep(3 * time.Second) + } + require.NoError(t, err, "failed to upload test data") + for i := 0; i < 40; i++ { + if _, e := uploadTestDataToMaster([]byte(strings.Repeat("filler ", 128)), "127.0.0.1:9334"); e != nil { + break + } + } + t.Logf("using volume %d", volumeId) + time.Sleep(3 * time.Second) + + locked, unlock := tryLockWithTimeout(t, commandEnv, 15*time.Second) + require.True(t, locked, "could not acquire shell lock") + defer unlock() + + // EC-encode the volume. + out, err := captureCommandOutput(t, shell.Commands[findCommandIndex("ec.encode")], + []string{"-volumeId", fmt.Sprintf("%d", volumeId), "-collection", "test", "-force"}, commandEnv) + t.Logf("ec.encode output:\n%s", out) + require.NoError(t, err, "ec.encode failed") + + // All 14 shards must exist after encoding. + require.Eventually(t, func() bool { + return len(collectDistinctShardIDs(testDir, uint32(volumeId))) == erasureShardCount + }, 30*time.Second, time.Second, "expected all %d EC shards after encode, got %v", + erasureShardCount, collectDistinctShardIDs(testDir, uint32(volumeId))) + + beforeBalance := collectDistinctShardIDs(testDir, uint32(volumeId)) + t.Logf("after encode: %d distinct shards on %d disks", len(beforeBalance), disksWithShards(testDir, uint32(volumeId))) + + // Run ec.balance. + out, err = captureCommandOutput(t, shell.Commands[findCommandIndex("ec.balance")], + []string{"-collection", "test", "-force"}, commandEnv) + t.Logf("ec.balance output:\n%s", out) + require.NoError(t, err, "ec.balance failed") + time.Sleep(3 * time.Second) + + // The core regression: ec.balance must not lose any shard. 
+ afterBalance := collectDistinctShardIDs(testDir, uint32(volumeId)) + require.Equal(t, erasureShardCount, len(afterBalance), + "ec.balance lost shards on multi-disk nodes: had %v, now %v", sortedKeysOf(beforeBalance), sortedKeysOf(afterBalance)) + + // Shards must occupy more physical disks than there are nodes (3), so at least one node spreads its shards across several disks. + usedDisks := disksWithShards(testDir, uint32(volumeId)) + assert.Greater(t, usedDisks, 3, "EC shards should span more than one disk per node (got %d disks across 3 nodes)", usedDisks) + + // cluster.status must count physical disks, not collapse to one per node: it + // must report at least the disks actually holding this volume's shards (which + // is already >3 across the 3 nodes). Before the fix it reported 3 (node count). + require.Eventually(t, func() bool { + n, ok := clusterStatusDiskCount(t, commandEnv) + return ok && n >= usedDisks + }, 30*time.Second, 2*time.Second, "cluster.status never reported the >=%d physical disks holding shards (multi-disk count)", usedDisks) + + n, _ := clusterStatusDiskCount(t, commandEnv) + t.Logf("cluster.status reports %d physical disks (>= %d holding this volume's shards)", n, usedDisks) +} + +const erasureShardCount = 14 // 10 data + 4 parity + +// collectDistinctShardIDs returns the set of EC shard ids present for a volume +// across every disk of every server in the multi-disk test layout. 
+func collectDistinctShardIDs(testDir string, volumeId uint32) map[int]bool { + ids := map[int]bool{} + for server := 0; server < 3; server++ { + for disk := 0; disk < 4; disk++ { + diskDir := filepath.Join(testDir, fmt.Sprintf("server%d_disk%d", server, disk)) + files, err := listECShardFiles(diskDir, volumeId) + if err != nil { + continue + } + for _, f := range files { + i := strings.LastIndex(f, ".ec") + if i < 0 { + continue + } + if n, err := strconv.Atoi(f[i+3:]); err == nil && n >= 0 && n < erasureShardCount { + ids[n] = true + } + } + } + } + return ids +} + +// disksWithShards counts how many physical disks hold at least one shard. +func disksWithShards(testDir string, volumeId uint32) int { + n := 0 + for _, disks := range countShardsPerDisk(testDir, volumeId) { + for _, c := range disks { + if c > 0 { + n++ + } + } + } + return n +} + +var diskCountRe = regexp.MustCompile(`(\d+)\s+disks?`) + +// clusterStatusDiskCount runs cluster.status and parses the reported disk count. +func clusterStatusDiskCount(t *testing.T, commandEnv *shell.CommandEnv) (int, bool) { + t.Helper() + out, err := captureCommandOutput(t, shell.Commands[findCommandIndex("cluster.status")], []string{}, commandEnv) + if err != nil { + return 0, false + } + m := diskCountRe.FindStringSubmatch(out) + if m == nil { + return 0, false + } + n, err := strconv.Atoi(m[1]) + return n, err == nil +} + +func sortedKeysOf(m map[int]bool) []int { + out := make([]int, 0, len(m)) + for k := range m { + out = append(out, k) + } + for i := 1; i < len(out); i++ { + for j := i; j > 0 && out[j-1] > out[j]; j-- { + out[j-1], out[j] = out[j], out[j-1] + } + } + return out +} diff --git a/weed/admin/maintenance/maintenance_integration.go b/weed/admin/maintenance/maintenance_integration.go index be5875a72..68e72ff0b 100644 --- a/weed/admin/maintenance/maintenance_integration.go +++ b/weed/admin/maintenance/maintenance_integration.go @@ -26,6 +26,10 @@ type MaintenanceIntegration struct { // Active topology 
for task detection and target selection activeTopology *topology.ActiveTopology + // Master's default replication, refreshed by the scanner each cycle and + // passed to detectors as the replica-placement fallback (matches the shell). + defaultReplicaPlacement string + // Type conversion maps taskTypeMap map[types.TaskType]MaintenanceTaskType revTaskTypeMap map[MaintenanceTaskType]types.TaskType @@ -219,9 +223,10 @@ func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.Vo // Create cluster info clusterInfo := &types.ClusterInfo{ - TotalVolumes: len(filteredMetrics), - LastUpdated: time.Now(), - ActiveTopology: s.activeTopology, // Provide ActiveTopology for destination planning + TotalVolumes: len(filteredMetrics), + LastUpdated: time.Now(), + ActiveTopology: s.activeTopology, // Provide ActiveTopology for destination planning + DefaultReplicaPlacement: s.defaultReplicaPlacement, } // Run detection for each registered task type @@ -271,6 +276,12 @@ func (s *MaintenanceIntegration) ScanWithTaskDetectors(volumeMetrics []*types.Vo return allResults, nil } +// SetDefaultReplicaPlacement records the master's default replication so detectors +// can use it as the replica-placement fallback (matching the shell). 
+func (s *MaintenanceIntegration) SetDefaultReplicaPlacement(replicaPlacement string) { + s.defaultReplicaPlacement = replicaPlacement +} + // UpdateTopologyInfo updates the volume shard tracker with topology information for empty servers func (s *MaintenanceIntegration) UpdateTopologyInfo(topologyInfo *master_pb.TopologyInfo) error { // Log topology details before update for diagnostics diff --git a/weed/admin/maintenance/maintenance_scanner.go b/weed/admin/maintenance/maintenance_scanner.go index ceeed85e5..5b963c1e9 100644 --- a/weed/admin/maintenance/maintenance_scanner.go +++ b/weed/admin/maintenance/maintenance_scanner.go @@ -51,6 +51,10 @@ func (ms *MaintenanceScanner) ScanForMaintenanceTasks() ([]*TaskDetectionResult, } } + // Refresh the master's default replication so detectors can use it as the + // replica-placement fallback (matches the shell ec.balance default). + ms.integration.SetDefaultReplicaPlacement(ms.getDefaultReplicaPlacement()) + // Use task detection system with complete cluster information results, err := ms.integration.ScanWithTaskDetectors(taskMetrics) if err != nil { @@ -67,6 +71,26 @@ func (ms *MaintenanceScanner) ScanForMaintenanceTasks() ([]*TaskDetectionResult, return []*TaskDetectionResult{}, nil } +// getDefaultReplicaPlacement reads the master's configured default replication, +// used by detectors as the replica-placement fallback. Returns "" on error so +// detectors fall back to even spread rather than failing the scan. 
+func (ms *MaintenanceScanner) getDefaultReplicaPlacement() string { + var replicaPlacement string + err := ms.adminClient.WithMasterClient(func(client master_pb.SeaweedClient) error { + resp, err := client.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{}) + if err != nil { + return err + } + replicaPlacement = resp.DefaultReplication + return nil + }) + if err != nil { + glog.V(1).Infof("could not fetch master default replication: %v", err) + return "" + } + return replicaPlacement +} + // getVolumeHealthMetrics collects health information for all volumes. // Returns metrics in task-system format directly (no intermediate copy) and // the topology info for updating the active topology. diff --git a/weed/admin/topology/count_topology_resources_test.go b/weed/admin/topology/count_topology_resources_test.go new file mode 100644 index 000000000..26cf7b698 --- /dev/null +++ b/weed/admin/topology/count_topology_resources_test.go @@ -0,0 +1,54 @@ +package topology + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" +) + +// TestCountTopologyResources_multiDiskPerNode covers the case where the master +// keys DiskInfos by disk type, so several same-type physical disks on a node +// collapse into a single DiskInfo entry. Counting len(DiskInfos) under-reports +// the physical disk count and disagrees with the per-disk activeDisk map that +// the rest of the admin topology builds via SplitByPhysicalDisk. 
+func TestCountTopologyResources_multiDiskPerNode(t *testing.T) { + makeNode := func(id string) *master_pb.DataNodeInfo { + var ecShardInfos []*master_pb.VolumeEcShardInformationMessage + for diskId := uint32(0); diskId < 6; diskId++ { + ecShardInfos = append(ecShardInfos, &master_pb.VolumeEcShardInformationMessage{ + Id: diskId + 1, + DiskId: diskId, + EcIndexBits: 1, + }) + } + return &master_pb.DataNodeInfo{ + Id: id, + DiskInfos: map[string]*master_pb.DiskInfo{ + "": {Type: "", MaxVolumeCount: 60, EcShardInfos: ecShardInfos}, + }, + } + } + topo := &master_pb.TopologyInfo{ + Id: "multi_disk_topo", + DataCenterInfos: []*master_pb.DataCenterInfo{{ + Id: "dc1", + RackInfos: []*master_pb.RackInfo{{ + Id: "rack1", + DataNodeInfos: []*master_pb.DataNodeInfo{ + makeNode("node1"), makeNode("node2"), makeNode("node3"), + }, + }}, + }}, + } + + dcCount, nodeCount, diskCount := CountTopologyResources(topo) + if dcCount != 1 { + t.Errorf("dcCount = %d, want 1", dcCount) + } + if nodeCount != 3 { + t.Errorf("nodeCount = %d, want 3", nodeCount) + } + if diskCount != 18 { + t.Errorf("diskCount = %d, want 18 (6 physical disks x 3 nodes)", diskCount) + } +} diff --git a/weed/admin/topology/topology_management.go b/weed/admin/topology/topology_management.go index 62f957860..05acecde8 100644 --- a/weed/admin/topology/topology_management.go +++ b/weed/admin/topology/topology_management.go @@ -19,7 +19,12 @@ func CountTopologyResources(topologyInfo *master_pb.TopologyInfo) (dcCount, node for _, rack := range dc.RackInfos { nodeCount += len(rack.DataNodeInfos) for _, node := range rack.DataNodeInfos { - diskCount += len(node.DiskInfos) + // DiskInfos is keyed by disk type, so same-type physical disks + // collapse into one entry. Count physical disks so the number + // matches the per-disk activeDisk map. 
+ for _, diskInfo := range node.DiskInfos { + diskCount += len(diskInfo.SplitByPhysicalDisk()) + } } } } diff --git a/weed/pb/worker.proto b/weed/pb/worker.proto index 2d2c6b2d1..a1ba6c5fa 100644 --- a/weed/pb/worker.proto +++ b/weed/pb/worker.proto @@ -413,6 +413,7 @@ message EcBalanceTaskConfig { string collection_filter = 3; // Collection filter string disk_type = 4; // Disk type filter repeated string preferred_tags = 5; // Preferred disk tags for placement + string replica_placement = 6; // EC shard replica placement (e.g. "020"); empty falls back to master default replication } // ========== Task Persistence Messages ========== diff --git a/weed/pb/worker_pb/worker.pb.go b/weed/pb/worker_pb/worker.pb.go index a978455da..e0284dee7 100644 --- a/weed/pb/worker_pb/worker.pb.go +++ b/weed/pb/worker_pb/worker.pb.go @@ -3297,6 +3297,7 @@ type EcBalanceTaskConfig struct { CollectionFilter string `protobuf:"bytes,3,opt,name=collection_filter,json=collectionFilter,proto3" json:"collection_filter,omitempty"` // Collection filter DiskType string `protobuf:"bytes,4,opt,name=disk_type,json=diskType,proto3" json:"disk_type,omitempty"` // Disk type filter PreferredTags []string `protobuf:"bytes,5,rep,name=preferred_tags,json=preferredTags,proto3" json:"preferred_tags,omitempty"` // Preferred disk tags for placement + ReplicaPlacement string `protobuf:"bytes,6,opt,name=replica_placement,json=replicaPlacement,proto3" json:"replica_placement,omitempty"` // EC shard replica placement (e.g. 
"020"); empty falls back to master default replication unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -3366,6 +3367,13 @@ func (x *EcBalanceTaskConfig) GetPreferredTags() []string { return nil } +func (x *EcBalanceTaskConfig) GetReplicaPlacement() string { + if x != nil { + return x.ReplicaPlacement + } + return "" +} + // MaintenanceTaskData represents complete task state for persistence type MaintenanceTaskData struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -4238,13 +4246,14 @@ const file_worker_proto_rawDesc = "" + "\x0esource_disk_id\x18\x05 \x01(\rR\fsourceDiskId\x12\x1f\n" + "\vtarget_node\x18\x06 \x01(\tR\n" + "targetNode\x12$\n" + - "\x0etarget_disk_id\x18\a \x01(\rR\ftargetDiskId\"\xe1\x01\n" + + "\x0etarget_disk_id\x18\a \x01(\rR\ftargetDiskId\"\x8e\x02\n" + "\x13EcBalanceTaskConfig\x12/\n" + "\x13imbalance_threshold\x18\x01 \x01(\x01R\x12imbalanceThreshold\x12(\n" + "\x10min_server_count\x18\x02 \x01(\x05R\x0eminServerCount\x12+\n" + "\x11collection_filter\x18\x03 \x01(\tR\x10collectionFilter\x12\x1b\n" + "\tdisk_type\x18\x04 \x01(\tR\bdiskType\x12%\n" + - "\x0epreferred_tags\x18\x05 \x03(\tR\rpreferredTags\"\xae\a\n" + + "\x0epreferred_tags\x18\x05 \x03(\tR\rpreferredTags\x12+\n" + + "\x11replica_placement\x18\x06 \x01(\tR\x10replicaPlacement\"\xae\a\n" + "\x13MaintenanceTaskData\x12\x0e\n" + "\x02id\x18\x01 \x01(\tR\x02id\x12\x12\n" + "\x04type\x18\x02 \x01(\tR\x04type\x12\x1a\n" + diff --git a/weed/plugin/worker/volume_metrics.go b/weed/plugin/worker/volume_metrics.go index 4ceda7b2f..8452f40c9 100644 --- a/weed/plugin/worker/volume_metrics.go +++ b/weed/plugin/worker/volume_metrics.go @@ -59,6 +59,38 @@ func CollectVolumeMetricsFromMasters( 
+// FetchDefaultReplicaPlacement returns the master's configured default replication +// (GetMasterConfiguration), used by detectors as the replica-placement fallback so +// the plugin path matches the shell. Returns "" if it cannot be fetched, so callers +// fall back to even spread rather than failing detection. +func FetchDefaultReplicaPlacement(ctx context.Context, masterAddresses []string, grpcDialOption grpc.DialOption) string { + if grpcDialOption == nil { + return "" + } + for _, address := range masterAddresses { + for _, candidate := range MasterAddressCandidates(address) { + if ctx.Err() != nil { + return "" + } + dialCtx, cancelDial := context.WithTimeout(ctx, 5*time.Second) + conn, err := pb.GrpcDial(dialCtx, candidate, false, grpcDialOption) + cancelDial() + if err != nil { + continue + } + client := master_pb.NewSeaweedClient(conn) + callCtx, cancelCall := context.WithTimeout(ctx, 10*time.Second) + resp, callErr := client.GetMasterConfiguration(callCtx, &master_pb.GetMasterConfigurationRequest{}) + cancelCall() + _ = conn.Close() + if callErr == nil { + return resp.DefaultReplication + } + } + } + return "" +} + // FetchVolumeList dials the given master address (trying both the address as // given and the gRPC port variant) and returns the master's volume list. Used // by detection helpers that already know which master address to talk to. 
func FetchVolumeList(ctx context.Context, address string, grpcDialOption grpc.DialOption) (*master_pb.VolumeListResponse, error) { var lastErr error for _, candidate := range MasterAddressCandidates(address) { diff --git a/weed/shell/command_cluster_status.go b/weed/shell/command_cluster_status.go index 2f0de7703..68a8cd3aa 100644 --- a/weed/shell/command_cluster_status.go +++ b/weed/shell/command_cluster_status.go @@ -327,7 +327,13 @@ func (sp *ClusterStatusPrinter) printClusterInfo() { for _, ri := range dci.RackInfos { for _, dni := range ri.DataNodeInfos { nodes++ - disks += len(dni.DiskInfos) + // The master keys DiskInfos by disk type, so multiple + // same-type physical disks on one node collapse into a single + // entry. Count physical disks via SplitByPhysicalDisk so a node + // with N disks of one type reports N, not 1. 
+ for _, di := range dni.DiskInfos { + disks += len(di.SplitByPhysicalDisk()) + } } } } diff --git a/weed/shell/command_cluster_status_test.go b/weed/shell/command_cluster_status_test.go index 69ed3f5a9..93756075e 100644 --- a/weed/shell/command_cluster_status_test.go +++ b/weed/shell/command_cluster_status_test.go @@ -51,6 +51,65 @@ func TestPrintClusterInfo(t *testing.T) { } } +// TestPrintClusterInfo_multiDiskPerNode covers a node whose several physical +// disks of the same type collapse into a single DiskInfo on the wire (keyed by +// disk type), so counting len(DiskInfos) under-reports the physical disk count. +// Three nodes with six disks each must report 18 disks, not 3. +func TestPrintClusterInfo_multiDiskPerNode(t *testing.T) { + makeNode := func(id string) *master_pb.DataNodeInfo { + var ecShardInfos []*master_pb.VolumeEcShardInformationMessage + // One EC volume per physical disk, each carrying its own DiskId 0..5. + for diskId := uint32(0); diskId < 6; diskId++ { + ecShardInfos = append(ecShardInfos, &master_pb.VolumeEcShardInformationMessage{ + Id: diskId + 1, + DiskId: diskId, + EcIndexBits: 1, // a single shard present + }) + } + return &master_pb.DataNodeInfo{ + Id: id, + DiskInfos: map[string]*master_pb.DiskInfo{ + "": { + Type: "", + MaxVolumeCount: 60, + EcShardInfos: ecShardInfos, + }, + }, + } + } + topo := &master_pb.TopologyInfo{ + Id: "multi_disk_topo", + DataCenterInfos: []*master_pb.DataCenterInfo{{ + Id: "dc1", + RackInfos: []*master_pb.RackInfo{{ + Id: "rack1", + DataNodeInfos: []*master_pb.DataNodeInfo{ + makeNode("node1"), makeNode("node2"), makeNode("node3"), + }, + }}, + }}, + } + + var buf bytes.Buffer + sp := &ClusterStatusPrinter{ + writer: &buf, + humanize: true, + topology: topo, + } + sp.printClusterInfo() + got := buf.String() + want := `cluster: + id: multi_disk_topo + status: unlocked + nodes: 3 + topology: 1 DC, 18 disks on 1 rack + +` + if got != want { + t.Errorf("multi-disk cluster info:\ngot:\n%s\nwant:\n%s", got, 
want) + } +} + func TestPrintVolumeInfo(t *testing.T) { testCases := []struct { topology *master_pb.TopologyInfo diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go index de3271548..9f3cbbfb8 100644 --- a/weed/shell/command_ec_common.go +++ b/weed/shell/command_ec_common.go @@ -2,9 +2,7 @@ package shell import ( "context" - "errors" "fmt" - "math/rand/v2" "regexp" "slices" "sort" @@ -16,6 +14,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/ecbalancer" "github.com/seaweedfs/seaweedfs/weed/storage/needle" "github.com/seaweedfs/seaweedfs/weed/storage/super_block" "github.com/seaweedfs/seaweedfs/weed/storage/types" @@ -649,751 +648,6 @@ func (ecNode *EcNode) deleteEcVolumeShards(vid needle.VolumeId, shardIds []erasu return ecNode } -func groupByCount(data []*EcNode, identifierFn func(*EcNode) (id string, count int)) map[string]int { - countMap := make(map[string]int) - for _, d := range data { - id, count := identifierFn(d) - countMap[id] += count - } - return countMap -} - -func groupBy(data []*EcNode, identifierFn func(*EcNode) (id string)) map[string][]*EcNode { - groupMap := make(map[string][]*EcNode) - for _, d := range data { - id := identifierFn(d) - groupMap[id] = append(groupMap[id], d) - } - return groupMap -} - -type ecBalancer struct { - commandEnv *CommandEnv - ecNodes []*EcNode - replicaPlacement *super_block.ReplicaPlacement - applyBalancing bool - maxParallelization int - diskType types.DiskType // target disk type for EC shards (default: HardDriveType) - // EC configuration for shard distribution (defaults to 10+4) - dataShardCount int - parityShardCount int -} - -// getDataShardCount returns the configured data shard count, defaulting to standard 10 -func (ecb *ecBalancer) getDataShardCount() int { - if ecb.dataShardCount > 0 { - 
return ecb.dataShardCount - } - return erasure_coding.DataShardsCount -} - -// getParityShardCount returns the configured parity shard count, defaulting to standard 4 -func (ecb *ecBalancer) getParityShardCount() int { - if ecb.parityShardCount > 0 { - return ecb.parityShardCount - } - return erasure_coding.ParityShardsCount -} - -func (ecb *ecBalancer) errorWaitGroup() *ErrorWaitGroup { - return NewErrorWaitGroup(ecb.maxParallelization) -} - -func (ecb *ecBalancer) racks() map[RackId]*EcRack { - racks := make(map[RackId]*EcRack) - for _, ecNode := range ecb.ecNodes { - if racks[ecNode.rack] == nil { - racks[ecNode.rack] = &EcRack{ - ecNodes: make(map[EcNodeId]*EcNode), - } - } - racks[ecNode.rack].ecNodes[EcNodeId(ecNode.info.Id)] = ecNode - racks[ecNode.rack].freeEcSlot += ecNode.freeEcSlot - } - return racks -} - -func (ecb *ecBalancer) balanceEcVolumes(collection string) error { - - fmt.Printf("balanceEcVolumes %s\n", collection) - - if err := ecb.deleteDuplicatedEcShards(collection); err != nil { - return fmt.Errorf("delete duplicated collection %s ec shards: %v", collection, err) - } - - if err := ecb.balanceEcShardsAcrossRacks(collection); err != nil { - return fmt.Errorf("balance across racks collection %s ec shards: %v", collection, err) - } - - if err := ecb.balanceEcShardsWithinRacks(collection); err != nil { - return fmt.Errorf("balance within racks collection %s ec shards: %v", collection, err) - } - - return nil -} - -func (ecb *ecBalancer) deleteDuplicatedEcShards(collection string) error { - vidLocations := ecb.collectVolumeIdToEcNodes(collection) - - ewg := ecb.errorWaitGroup() - for vid, locations := range vidLocations { - ewg.Add(func() error { - return ecb.doDeduplicateEcShards(collection, vid, locations) - }) - } - return ewg.Wait() -} - -func (ecb *ecBalancer) doDeduplicateEcShards(collection string, vid needle.VolumeId, locations []*EcNode) error { - // check whether this volume has ecNodes that are over average - // Use MaxShardCount (32) to 
support custom EC ratios - shardToLocations := make([][]*EcNode, erasure_coding.MaxShardCount) - for _, ecNode := range locations { - si := findEcVolumeShardsInfo(ecNode, vid, ecb.diskType) - for _, shardId := range si.Ids() { - shardToLocations[shardId] = append(shardToLocations[shardId], ecNode) - } - } - for shardId, ecNodes := range shardToLocations { - if len(ecNodes) <= 1 { - continue - } - sortEcNodesByFreeslotsAscending(ecNodes) - fmt.Printf("ec shard %d.%d has %d copies, keeping %v\n", vid, shardId, len(ecNodes), ecNodes[0].info.Id) - if !ecb.applyBalancing { - continue - } - - duplicatedShardIds := []erasure_coding.ShardId{erasure_coding.ShardId(shardId)} - for _, ecNode := range ecNodes[1:] { - if err := unmountEcShards(ecb.commandEnv.option.GrpcDialOption, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil { - return err - } - if err := sourceServerDeleteEcShards(ecb.commandEnv.option.GrpcDialOption, collection, vid, pb.NewServerAddressFromDataNode(ecNode.info), duplicatedShardIds); err != nil { - return err - } - ecNode.deleteEcVolumeShards(vid, duplicatedShardIds, ecb.diskType) - } - } - return nil -} - -func (ecb *ecBalancer) balanceEcShardsAcrossRacks(collection string) error { - // collect vid => []ecNode, since previous steps can change the locations - vidLocations := ecb.collectVolumeIdToEcNodes(collection) - - // spread the ec shards evenly - ewg := ecb.errorWaitGroup() - for vid, locations := range vidLocations { - ewg.Add(func() error { - return ecb.doBalanceEcShardsAcrossRacks(collection, vid, locations) - }) - } - return ewg.Wait() -} - -func countShardsByRack(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType) map[string]int { - return groupByCount(locations, func(ecNode *EcNode) (id string, count int) { - id = string(ecNode.rack) - if si := findEcVolumeShardsInfo(ecNode, vid, diskType); si != nil { - count = si.Count() - } - return - }) -} - -// shardsByType is a generic helper that counts 
data and parity shards per group -func shardsByType(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int, keyExtractor func(*EcNode) string) (dataPerGroup, parityPerGroup map[string][]erasure_coding.ShardId) { - dataPerGroup = make(map[string][]erasure_coding.ShardId) - parityPerGroup = make(map[string][]erasure_coding.ShardId) - for _, ecNode := range locations { - si := findEcVolumeShardsInfo(ecNode, vid, diskType) - groupKey := keyExtractor(ecNode) - for _, shardId := range si.Ids() { - if int(shardId) < dataShards { - dataPerGroup[groupKey] = append(dataPerGroup[groupKey], shardId) - } else { - parityPerGroup[groupKey] = append(parityPerGroup[groupKey], shardId) - } - } - } - return -} - -// shardsByTypePerRack counts data shards (< dataShards) and parity shards (>= dataShards) per rack -func shardsByTypePerRack(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int) (dataPerRack, parityPerRack map[string][]erasure_coding.ShardId) { - return shardsByType(vid, locations, diskType, dataShards, func(ecNode *EcNode) string { - return string(ecNode.rack) - }) -} - -// shardsByTypePerNode counts data shards and parity shards per node -func shardsByTypePerNode(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType, dataShards int) (dataPerNode, parityPerNode map[string][]erasure_coding.ShardId) { - return shardsByType(vid, locations, diskType, dataShards, func(ecNode *EcNode) string { - return ecNode.info.Id - }) -} - -func countShardsByNode(vid needle.VolumeId, locations []*EcNode, diskType types.DiskType) map[string]int { - return groupByCount(locations, func(ecNode *EcNode) (id string, count int) { - id = ecNode.info.Id - if si := findEcVolumeShardsInfo(ecNode, vid, diskType); si != nil { - count = si.Count() - } - return - }) -} - -func (ecb *ecBalancer) doBalanceEcShardsAcrossRacks(collection string, vid needle.VolumeId, locations []*EcNode) error { - racks := ecb.racks() - numRacks := len(racks) 
- - // Use configured EC scheme for shard type classification (defaults to 10+4) - dataShardCount := ecb.getDataShardCount() - parityShardCount := ecb.getParityShardCount() - - // Get current distribution of data shards per rack (parity computed after data balancing) - dataPerRack, _ := shardsByTypePerRack(vid, locations, ecb.diskType, dataShardCount) - - // Calculate max shards per rack for each type to ensure even spread - // Data: 10 shards / 6 racks = max 2 per rack - // Parity: 4 shards / 6 racks = max 1 per rack (with 2 racks having 0) - maxDataPerRack := ceilDivide(dataShardCount, numRacks) - maxParityPerRack := ceilDivide(parityShardCount, numRacks) - - rackEcNodesWithVid := groupBy(locations, func(ecNode *EcNode) string { - return string(ecNode.rack) - }) - - // Track total shard count per rack for slot management - rackToShardCount := countShardsByRack(vid, locations, ecb.diskType) - - // First pass: Balance data shards across racks - if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, dataPerRack, rackToShardCount, maxDataPerRack, "data", nil); err != nil { - return err - } - - // Refresh locations after data shard moves and get parity distribution - locations = ecb.collectVolumeIdToEcNodes(collection)[vid] - dataPerRack, parityPerRack := shardsByTypePerRack(vid, locations, ecb.diskType, dataShardCount) - rackEcNodesWithVid = groupBy(locations, func(ecNode *EcNode) string { - return string(ecNode.rack) - }) - rackToShardCount = countShardsByRack(vid, locations, ecb.diskType) - - // Identify racks containing data shards to avoid for parity placement. - // We call this "antiAffinityRacks" because we want parity shards to have anti-affinity - // with racks that hold data shards, to ensure better fault tolerance. 
- antiAffinityRacks := make(map[string]bool) - for rackId, shards := range dataPerRack { - if len(shards) > 0 { - antiAffinityRacks[rackId] = true - } - } - - // Second pass: Balance parity shards across racks, ignoring racks with data shards if possible - if err := ecb.balanceShardTypeAcrossRacks(collection, vid, racks, rackEcNodesWithVid, parityPerRack, rackToShardCount, maxParityPerRack, "parity", antiAffinityRacks); err != nil { - return err - } - - return nil -} - -// balanceShardTypeAcrossRacks spreads shards of a specific type (data or parity) evenly across racks -func (ecb *ecBalancer) balanceShardTypeAcrossRacks( - collection string, - vid needle.VolumeId, - racks map[RackId]*EcRack, - rackEcNodesWithVid map[string][]*EcNode, - shardsPerRack map[string][]erasure_coding.ShardId, - rackToShardCount map[string]int, - maxPerRack int, - shardType string, - antiAffinityRacks map[string]bool, -) error { - // Find racks with too many shards of this type - shardsToMove := make(map[erasure_coding.ShardId]*EcNode) - for rackId, shards := range shardsPerRack { - if len(shards) <= maxPerRack { - continue - } - // Pick excess shards to move - excess := len(shards) - maxPerRack - ecNodes := rackEcNodesWithVid[rackId] - for i := 0; i < excess && i < len(shards); i++ { - shardId := shards[i] - // Find which node has this shard - for _, ecNode := range ecNodes { - si := findEcVolumeShardsInfo(ecNode, vid, ecb.diskType) - if si.Has(shardId) { - shardsToMove[shardId] = ecNode - break - } - } - } - } - - // Move shards to racks that have fewer than maxPerRack of this type - for shardId, ecNode := range shardsToMove { - // Find destination rack with room for this shard type - destRackId, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, antiAffinityRacks) - if err != nil { - fmt.Printf("ec %s shard %d.%d at %s can not find a destination rack:\n%s\n", shardType, vid, shardId, ecNode.info.Id, err.Error()) - continue - } - - var 
possibleDestinationEcNodes []*EcNode - for _, n := range racks[destRackId].ecNodes { - possibleDestinationEcNodes = append(possibleDestinationEcNodes, n) - } - err = ecb.pickOneEcNodeAndMoveOneShard(ecNode, collection, vid, shardId, possibleDestinationEcNodes) - if err != nil { - return err - } - - // Update tracking - shardsPerRack[string(destRackId)] = append(shardsPerRack[string(destRackId)], shardId) - // Remove from source rack - srcRack := string(ecNode.rack) - for i, s := range shardsPerRack[srcRack] { - if s == shardId { - shardsPerRack[srcRack] = append(shardsPerRack[srcRack][:i], shardsPerRack[srcRack][i+1:]...) - break - } - } - rackToShardCount[string(destRackId)] += 1 - rackToShardCount[srcRack] -= 1 - racks[destRackId].freeEcSlot -= 1 - racks[RackId(srcRack)].freeEcSlot += 1 - } - - return nil -} - -// twoPassSelector implements two-pass selection with anti-affinity -// Pass 1: Select from candidates NOT in antiAffinity set -// Pass 2: Fallback to any valid candidate if Pass 1 yields no results -type twoPassSelector[T any] struct { - candidates []T - shardsPerTarget map[string][]erasure_coding.ShardId - maxPerTarget int - targetToShardCount map[string]int - antiAffinity map[string]bool - - // Functions to extract info from candidate - getKey func(T) string - hasFreeSlots func(T) bool - checkLimit func(T) bool // replica placement or other limits -} - -func (s *twoPassSelector[T]) selectCandidate() (T, error) { - var selected []T - minShards := s.maxPerTarget + 1 - - // Pass 1: Try candidates NOT in anti-affinity set - for _, candidate := range s.candidates { - if !s.hasFreeSlots(candidate) { - continue - } - key := s.getKey(candidate) - currentCount := len(s.shardsPerTarget[key]) - if currentCount >= s.maxPerTarget { - continue - } - if !s.checkLimit(candidate) { - continue - } - - // Skip anti-affinity targets in pass 1 - if s.antiAffinity != nil && s.antiAffinity[key] { - continue - } - - if currentCount < minShards { - selected = nil - minShards = 
currentCount - } - if currentCount == minShards { - selected = append(selected, candidate) - } - } - - // Pass 2: Fallback if no candidates found - if len(selected) == 0 { - minShards = s.maxPerTarget + 1 - for _, candidate := range s.candidates { - if !s.hasFreeSlots(candidate) { - continue - } - key := s.getKey(candidate) - currentCount := len(s.shardsPerTarget[key]) - if currentCount >= s.maxPerTarget { - continue - } - if !s.checkLimit(candidate) { - continue - } - - if currentCount < minShards { - selected = nil - minShards = currentCount - } - if currentCount == minShards { - selected = append(selected, candidate) - } - } - } - - if len(selected) == 0 { - var zero T - return zero, errors.New("no valid candidate available") - } - return selected[rand.IntN(len(selected))], nil -} - -// pickRackForShardType selects a rack that has room for more shards of a specific type -func (ecb *ecBalancer) pickRackForShardType( - rackToEcNodes map[RackId]*EcRack, - shardsPerRack map[string][]erasure_coding.ShardId, - maxPerRack int, - rackToShardCount map[string]int, - antiAffinityRacks map[string]bool, -) (RackId, error) { - // Convert map to slice for iteration - var rackCandidates []struct { - id RackId - rack *EcRack - } - for id, rack := range rackToEcNodes { - rackCandidates = append(rackCandidates, struct { - id RackId - rack *EcRack - }{id, rack}) - } - - selector := &twoPassSelector[struct { - id RackId - rack *EcRack - }]{ - candidates: rackCandidates, - shardsPerTarget: shardsPerRack, - maxPerTarget: maxPerRack, - targetToShardCount: rackToShardCount, - antiAffinity: antiAffinityRacks, - getKey: func(c struct { - id RackId - rack *EcRack - }) string { - return string(c.id) - }, - hasFreeSlots: func(c struct { - id RackId - rack *EcRack - }) bool { - return c.rack.freeEcSlot > 0 - }, - checkLimit: func(c struct { - id RackId - rack *EcRack - }) bool { - // For EC shards, replica placement constraint only applies when DiffRackCount > 0. 
- if ecb.replicaPlacement != nil && ecb.replicaPlacement.DiffRackCount > 0 { - return rackToShardCount[string(c.id)] < ecb.replicaPlacement.DiffRackCount - } - return true - }, - } - - selected, err := selector.selectCandidate() - if err != nil { - return "", errors.New("no rack available for shard type balancing") - } - return selected.id, nil -} - -func (ecb *ecBalancer) balanceEcShardsWithinRacks(collection string) error { - // collect vid => []ecNode, since previous steps can change the locations - vidLocations := ecb.collectVolumeIdToEcNodes(collection) - racks := ecb.racks() - - // spread the ec shards evenly - ewg := ecb.errorWaitGroup() - for vid, locations := range vidLocations { - - // see the volume's shards are in how many racks, and how many in each rack - rackToShardCount := countShardsByRack(vid, locations, ecb.diskType) - - for rackId := range rackToShardCount { - var possibleDestinationEcNodes []*EcNode - for _, n := range racks[RackId(rackId)].ecNodes { - if _, found := n.info.DiskInfos[string(ecb.diskType)]; found { - possibleDestinationEcNodes = append(possibleDestinationEcNodes, n) - } - } - ewg.Add(func() error { - return ecb.doBalanceEcShardsWithinOneRack(collection, vid, possibleDestinationEcNodes) - }) - } - } - return ewg.Wait() -} - -func (ecb *ecBalancer) doBalanceEcShardsWithinOneRack(collection string, vid needle.VolumeId, possibleDestinationEcNodes []*EcNode) error { - // Use configured EC scheme - dataShardCount := ecb.getDataShardCount() - - // Get current distribution of data shards per node - dataPerNode, parityPerNode := shardsByTypePerNode(vid, possibleDestinationEcNodes, ecb.diskType, dataShardCount) - - // Calculate max shards per node for each type - numNodes := len(possibleDestinationEcNodes) - if numNodes == 0 { - return nil - } - - // Calculate totals based on actual shards present in the rack (subset of all shards) - totalData := 0 - for _, shards := range dataPerNode { - totalData += len(shards) - } - totalParity := 0 - 
for _, shards := range parityPerNode { - totalParity += len(shards) - } - - maxDataPerNode := ceilDivide(totalData, numNodes) - maxParityPerNode := ceilDivide(totalParity, numNodes) - - // Track total shard count per node - nodeToShardCount := countShardsByNode(vid, possibleDestinationEcNodes, ecb.diskType) - - // First pass: Balance data shards across nodes - if err := ecb.balanceShardTypeAcrossNodes(collection, vid, possibleDestinationEcNodes, dataPerNode, nodeToShardCount, maxDataPerNode, "data", nil); err != nil { - return err - } - - // Refresh locations after data shard moves - // We need to re-scan because moving shards changes node states - dataPerNode, parityPerNode = shardsByTypePerNode(vid, possibleDestinationEcNodes, ecb.diskType, dataShardCount) - nodeToShardCount = countShardsByNode(vid, possibleDestinationEcNodes, ecb.diskType) - - // Identify nodes containing data shards to avoid for parity placement - antiAffinityNodes := make(map[string]bool) - for nodeId, shards := range dataPerNode { - if len(shards) > 0 { - antiAffinityNodes[nodeId] = true - } - } - - // Second pass: Balance parity shards across nodes, avoiding nodes with data shards if possible - if err := ecb.balanceShardTypeAcrossNodes(collection, vid, possibleDestinationEcNodes, parityPerNode, nodeToShardCount, maxParityPerNode, "parity", antiAffinityNodes); err != nil { - return err - } - - return nil -} - -func (ecb *ecBalancer) balanceEcRacks() error { - // balance one rack for all ec shards - ewg := ecb.errorWaitGroup() - for _, ecRack := range ecb.racks() { - ewg.Add(func() error { - return ecb.doBalanceEcRack(ecRack) - }) - } - return ewg.Wait() -} - -func (ecb *ecBalancer) doBalanceEcRack(ecRack *EcRack) error { - if len(ecRack.ecNodes) <= 1 { - return nil - } - - var rackEcNodes []*EcNode - for _, node := range ecRack.ecNodes { - rackEcNodes = append(rackEcNodes, node) - } - - ecNodeIdToShardCount := groupByCount(rackEcNodes, func(ecNode *EcNode) (id string, count int) { - diskInfo, 
found := ecNode.info.DiskInfos[string(ecb.diskType)] - if !found { - return - } - for _, ecShardInfo := range diskInfo.EcShardInfos { - count += erasure_coding.GetShardCount(ecShardInfo) - } - return ecNode.info.Id, count - }) - - var totalShardCount int - for _, count := range ecNodeIdToShardCount { - totalShardCount += count - } - - averageShardCount := ceilDivide(totalShardCount, len(rackEcNodes)) - - hasMove := true - for hasMove { - hasMove = false - // Sort by shard count (ascending) so emptyNode has fewest shards, fullNode has most. - // Using freeEcSlot would be incorrect when nodes have different total capacities. - slices.SortFunc(rackEcNodes, func(a, b *EcNode) int { - return ecNodeIdToShardCount[a.info.Id] - ecNodeIdToShardCount[b.info.Id] - }) - emptyNode, fullNode := rackEcNodes[0], rackEcNodes[len(rackEcNodes)-1] - emptyNodeShardCount, fullNodeShardCount := ecNodeIdToShardCount[emptyNode.info.Id], ecNodeIdToShardCount[fullNode.info.Id] - if fullNodeShardCount > averageShardCount && emptyNodeShardCount+1 <= averageShardCount { - - // Build a map of volume ID -> shard bits on the empty node - emptyNodeVolumeShards := make(map[uint32]erasure_coding.ShardBits) - if emptyDiskInfo, found := emptyNode.info.DiskInfos[string(ecb.diskType)]; found { - for _, shards := range emptyDiskInfo.EcShardInfos { - emptyNodeVolumeShards[shards.Id] = erasure_coding.ShardBits(shards.EcIndexBits) - } - } - - if fullDiskInfo, found := fullNode.info.DiskInfos[string(ecb.diskType)]; found { - // Pass 0: prefer moving shards of volumes not on the empty node (best diversity) - // Pass 1: move shards of shared volumes where the specific shard ID differs - for pass := 0; pass < 2 && !hasMove; pass++ { - for _, shards := range fullDiskInfo.EcShardInfos { - emptyBits, volumeOnEmpty := emptyNodeVolumeShards[shards.Id] - if pass == 0 && volumeOnEmpty { - continue // pass 0: skip volumes already on the empty node - } - if pass == 1 && !volumeOnEmpty { - continue // pass 1: only 
consider volumes already on the empty node - } - si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(shards) - for _, shardId := range si.Ids() { - if pass == 1 && emptyBits.Has(shardId) { - continue // Skip shard IDs already on the empty node - } - vid := needle.VolumeId(shards.Id) - // For balancing, strictly require matching disk type and apply anti-affinity - dataShardCount := ecb.getDataShardCount() - destDiskId := pickBestDiskOnNode(emptyNode, vid, ecb.diskType, true, shardId, dataShardCount) - - if destDiskId > 0 { - fmt.Printf("%s moves ec shards %d.%d to %s (disk %d)\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id, destDiskId) - } else { - fmt.Printf("%s moves ec shards %d.%d to %s\n", fullNode.info.Id, shards.Id, shardId, emptyNode.info.Id) - } - - err := moveMountedShardToEcNode(ecb.commandEnv, fullNode, shards.Collection, vid, shardId, emptyNode, destDiskId, ecb.applyBalancing, ecb.diskType) - if err != nil { - return err - } - - ecNodeIdToShardCount[emptyNode.info.Id]++ - ecNodeIdToShardCount[fullNode.info.Id]-- - hasMove = true - break - } - if hasMove { - break - } - } - } - } - } - } - - return nil -} - -func (ecb *ecBalancer) pickEcNodeToBalanceShardsInto(vid needle.VolumeId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, error) { - if existingLocation == nil { - return nil, fmt.Errorf("INTERNAL: missing source nodes") - } - if len(possibleDestinations) == 0 { - return nil, fmt.Errorf("INTERNAL: missing destination nodes") - } - - nodeShards := map[*EcNode]int{} - for _, node := range possibleDestinations { - count := 0 - if si := findEcVolumeShardsInfo(node, vid, ecb.diskType); si != nil { - count = si.Count() - } - nodeShards[node] = count - } - - targets := []*EcNode{} - targetShards := -1 - for _, shards := range nodeShards { - if shards > targetShards { - targetShards = shards - } - } - - details := "" - for _, node := range possibleDestinations { - if node.info.Id == existingLocation.info.Id { - 
continue - } - if node.freeEcSlot <= 0 { - details += fmt.Sprintf(" Skipped %s because it has no free slots\n", node.info.Id) - continue - } - - shards := nodeShards[node] - // For EC shards, replica placement constraint only applies when SameRackCount > 0. - // When SameRackCount = 0 (e.g., replica placement "000"), EC shards should be - // distributed freely within racks - the "000" means "no volume replication needed" - // because erasure coding provides redundancy. - if ecb.replicaPlacement != nil && ecb.replicaPlacement.SameRackCount > 0 && shards > ecb.replicaPlacement.SameRackCount+1 { - details += fmt.Sprintf(" Skipped %s because shards %d > replica placement limit for the rack (%d + 1)\n", node.info.Id, shards, ecb.replicaPlacement.SameRackCount) - continue - } - - if shards < targetShards { - // Favor nodes with less shards, to ensure an uniform distribution. - targets = nil - targetShards = shards - } - if shards == targetShards { - targets = append(targets, node) - } - } - - if len(targets) == 0 { - return nil, errors.New(details) - } - - // When multiple nodes have the same shard count, prefer nodes with better disk distribution - // (i.e., nodes with more disks that have fewer shards of this volume) - if len(targets) > 1 { - slices.SortFunc(targets, func(a, b *EcNode) int { - aScore := diskDistributionScore(a, vid) - bScore := diskDistributionScore(b, vid) - return aScore - bScore // Lower score is better - }) - return targets[0], nil - } - - return targets[rand.IntN(len(targets))], nil -} - -// diskDistributionScore calculates a score for how well-distributed shards are on the node's disks -// Lower score is better (means more room for balanced distribution) -func diskDistributionScore(ecNode *EcNode, vid needle.VolumeId) int { - if len(ecNode.disks) == 0 { - return 0 - } - - // Sum the existing shard count for this volume on each disk - // Lower total means more room for new shards - score := 0 - for _, disk := range ecNode.disks { - if si, ok := 
disk.ecShards[vid]; ok { - score += si.Count() * 10 // Weight shards of this volume heavily - } - score += disk.ecShardCount // Also consider total shards on disk - } - return score -} - // pickBestDiskOnNode selects the best disk on a node for placing a new EC shard // It prefers disks of the specified type with fewer shards and more free slots // When shardId is provided and dataShardCount > 0, it applies anti-affinity: @@ -1473,49 +727,17 @@ func pickBestDiskOnNode(ecNode *EcNode, vid needle.VolumeId, diskType types.Disk return fallbackDiskId } -// pickEcNodeAndDiskToBalanceShardsInto picks both a destination node and specific disk -func (ecb *ecBalancer) pickEcNodeAndDiskToBalanceShardsInto(vid needle.VolumeId, shardId erasure_coding.ShardId, existingLocation *EcNode, possibleDestinations []*EcNode) (*EcNode, uint32, error) { - node, err := ecb.pickEcNodeToBalanceShardsInto(vid, existingLocation, possibleDestinations) - if err != nil { - return nil, 0, err - } - - // For balancing, strictly require matching disk type and apply anti-affinity - dataShardCount := ecb.getDataShardCount() - diskId := pickBestDiskOnNode(node, vid, ecb.diskType, true, shardId, dataShardCount) - return node, diskId, nil -} - -func (ecb *ecBalancer) pickOneEcNodeAndMoveOneShard(existingLocation *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, possibleDestinationEcNodes []*EcNode) error { - destNode, destDiskId, err := ecb.pickEcNodeAndDiskToBalanceShardsInto(vid, shardId, existingLocation, possibleDestinationEcNodes) - if err != nil { - fmt.Printf("WARNING: Could not find suitable target node for %d.%d:\n%s", vid, shardId, err.Error()) - return nil - } - - if destDiskId > 0 { - fmt.Printf("%s moves ec shard %d.%d to %s (disk %d)\n", existingLocation.info.Id, vid, shardId, destNode.info.Id, destDiskId) - } else { - fmt.Printf("%s moves ec shard %d.%d to %s\n", existingLocation.info.Id, vid, shardId, destNode.info.Id) - } - return 
moveMountedShardToEcNode(ecb.commandEnv, existingLocation, collection, vid, shardId, destNode, destDiskId, ecb.applyBalancing, ecb.diskType) -} - -func (ecb *ecBalancer) collectVolumeIdToEcNodes(collection string) map[needle.VolumeId][]*EcNode { - vidLocations := make(map[needle.VolumeId][]*EcNode) - for _, ecNode := range ecb.ecNodes { - diskInfo, found := ecNode.info.DiskInfos[string(ecb.diskType)] - if !found { - continue - } - for _, shardInfo := range diskInfo.EcShardInfos { - // ignore if not in current collection - if shardInfo.Collection == collection { - vidLocations[needle.VolumeId(shardInfo.Id)] = append(vidLocations[needle.VolumeId(shardInfo.Id)], ecNode) - } - } - } - return vidLocations +// ecBalancer drives an EC balance run: it collects the cluster's EC nodes, hands +// them to the shared ecbalancer planner, and executes the planned shard moves. +// The balancing policy lives in weed/storage/erasure_coding/ecbalancer, shared +// with the EC balance worker so the two cannot drift. +type ecBalancer struct { + commandEnv *CommandEnv + ecNodes []*EcNode + replicaPlacement *super_block.ReplicaPlacement + applyBalancing bool + maxParallelization int + diskType types.DiskType } func EcBalance(commandEnv *CommandEnv, collections []string, dc string, ecReplicaPlacement *super_block.ReplicaPlacement, diskType types.DiskType, maxParallelization int, applyBalancing bool) (err error) { @@ -1540,19 +762,179 @@ func EcBalance(commandEnv *CommandEnv, collections []string, dc string, ecReplic if len(collections) == 0 { glog.V(1).Infof("WARNING: No collections to balance EC volumes across.\n") } + return ecb.balance(collections) +} + +// shellECRatio resolves a collection's EC data/parity counts, defaulting to the +// standard scheme. This is the shell's plug-in point for custom ratios. +func shellECRatio(_ string) (int, int) { + // Custom EC ratios are an enterprise feature; OSS uses the standard scheme. 
+ return erasure_coding.DataShardsCount, erasure_coding.ParityShardsCount +} + +// balance plans EC shard moves with the shared planner and executes them. When +// collections is empty all collections present are balanced. +func (ecb *ecBalancer) balance(collections []string) error { + topo := toBalancerTopology(ecb.ecNodes, collections, ecb.diskType) + moves := ecbalancer.Plan(topo, ecbalancer.Options{ + DiskType: string(ecb.diskType), + ImbalanceThreshold: 0, // the shell balances to an even distribution + ReplicaPlacement: ecb.replicaPlacement, + Ratio: shellECRatio, + }) + return ecb.executeMoves(moves) +} + +// toBalancerTopology builds an ecbalancer.Topology from the shell's EcNode model, +// including the shards of the requested collections (all collections when empty). +func toBalancerTopology(ecNodes []*EcNode, collections []string, diskType types.DiskType) *ecbalancer.Topology { + allowed := make(map[string]bool, len(collections)) for _, c := range collections { - if err = ecb.balanceEcVolumes(c); err != nil { - return err + allowed[c] = true + } + + topo := ecbalancer.NewTopology() + for _, en := range ecNodes { + rackKey := string(en.dc) + ":" + string(en.rack) + node := topo.AddNode(en.info.Id, string(en.dc), rackKey, en.freeEcSlot) + for diskId, d := range en.disks { + node.AddDisk(diskId, d.diskType, d.freeEcSlots, d.ecShardCount) + } + diskInfo, found := en.info.DiskInfos[string(diskType)] + if !found { + continue + } + for _, eci := range diskInfo.EcShardInfos { + if len(allowed) > 0 && !allowed[eci.Collection] { + continue + } + node.AddShards(eci.Id, eci.Collection, eci.DiskId, erasure_coding.ShardBits(eci.EcIndexBits)) } } + return topo +} - if err := ecb.balanceEcRacks(); err != nil { - return fmt.Errorf("balance ec racks: %w", err) +// executeMoves carries out the planned moves. 
Phases run in order (a within-rack +// move can depend on a cross-rack move's result), and the independent moves +// within a phase run with up to maxParallelization concurrency. Apply mode does +// only the RPCs; dry-run mode runs sequentially and mutates the in-memory EcNode +// model so callers/tests can inspect the planned end state. +func (ecb *ecBalancer) executeMoves(moves []ecbalancer.Move) error { + byID := make(map[string]*EcNode, len(ecb.ecNodes)) + for _, en := range ecb.ecNodes { + byID[en.info.Id] = en } + // Plan emits moves grouped by phase; run each contiguous same-phase group + // together, waiting before the next so cross-phase dependencies hold. + for i := 0; i < len(moves); { + j := i + for j < len(moves) && moves[j].Phase == moves[i].Phase { + j++ + } + if err := ecb.executePhase(byID, moves[i:j]); err != nil { + return err + } + i = j + } return nil } +func (ecb *ecBalancer) executePhase(byID map[string]*EcNode, moves []ecbalancer.Move) error { + if !ecb.applyBalancing { + // Dry-run: sequential so the in-memory model updates are race-free and + // reflect the full plan for inspection. + for _, m := range moves { + if err := ecb.executeMove(byID, m); err != nil { + return err + } + } + return nil + } + // Apply mode: parallelize across volumes, but run one volume's moves within a + // phase sequentially. Concurrent moves of the same volume to a node can race + // on its shared .ecx/.ecj/.vif sidecar files. 
+ var order []uint32 + byVol := make(map[uint32][]ecbalancer.Move) + for _, m := range moves { + if _, ok := byVol[m.VolumeID]; !ok { + order = append(order, m.VolumeID) + } + byVol[m.VolumeID] = append(byVol[m.VolumeID], m) + } + ewg := NewErrorWaitGroup(ecb.maxParallelization) + for _, vid := range order { + group := byVol[vid] + ewg.Add(func() error { + for _, m := range group { + if err := ecb.executeMove(byID, m); err != nil { + return err + } + } + return nil + }) + } + return ewg.Wait() +} + +func (ecb *ecBalancer) executeMove(byID map[string]*EcNode, m ecbalancer.Move) error { + src := byID[m.SourceNode] + if src == nil { + return nil + } + vid := needle.VolumeId(m.VolumeID) + shardId := erasure_coding.ShardId(m.ShardID) + shardIds := []erasure_coding.ShardId{shardId} + + if m.Phase == "dedup" { + fmt.Printf("dedup: delete ec shard %d.%d on %s\n", vid, shardId, m.SourceNode) + if !ecb.applyBalancing { + src.deleteEcVolumeShards(vid, shardIds, ecb.diskType) + return nil + } + grpcDialOption := ecb.commandEnv.option.GrpcDialOption + addr := pb.NewServerAddressFromDataNode(src.info) + if err := unmountEcShards(grpcDialOption, vid, addr, shardIds); err != nil { + return err + } + return sourceServerDeleteEcShards(grpcDialOption, m.Collection, vid, addr, shardIds) + } + + dst := byID[m.TargetNode] + if dst == nil { + return nil + } + if m.TargetDisk > 0 { + fmt.Printf("%s moves ec shard %d.%d to %s (disk %d)\n", m.SourceNode, vid, shardId, m.TargetNode, m.TargetDisk) + } else { + fmt.Printf("%s moves ec shard %d.%d to %s\n", m.SourceNode, vid, shardId, m.TargetNode) + } + if !ecb.applyBalancing { + // Dry-run: update the in-memory model only. + return moveMountedShardToEcNode(ecb.commandEnv, src, m.Collection, vid, shardId, dst, m.TargetDisk, false, ecb.diskType) + } + return ecb.applyShardMoveRPC(src, dst, m.Collection, vid, shardId, m.TargetDisk) +} + +// applyShardMoveRPC copies a shard to the destination disk, then unmounts and +// deletes it on the source. 
It does not touch the in-memory model, so it is safe +// to run concurrently across the moves of a phase. +func (ecb *ecBalancer) applyShardMoveRPC(src, dst *EcNode, collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, destDiskId uint32) error { + grpcDialOption := ecb.commandEnv.option.GrpcDialOption + srcAddr := pb.NewServerAddressFromDataNode(src.info) + copiedShardIds, err := oneServerCopyAndMountEcShardsFromSource(grpcDialOption, dst, []erasure_coding.ShardId{shardId}, vid, collection, srcAddr, destDiskId) + if err != nil { + return err + } + if len(copiedShardIds) == 0 { + return nil + } + if err := unmountEcShards(grpcDialOption, vid, srcAddr, copiedShardIds); err != nil { + return err + } + return sourceServerDeleteEcShards(grpcDialOption, collection, vid, srcAddr, copiedShardIds) +} + // compileCollectionPattern compiles a regex pattern for collection matching. // Empty patterns match empty collections only. // The special keyword CollectionDefault ("_default") matches empty collections. 
@@ -1567,114 +949,3 @@ func compileCollectionPattern(pattern string) (*regexp.Regexp, error) { } return regexp.Compile(pattern) } - -// balanceShardTypeAcrossNodes spreads shards of a specific type (data or parity) evenly across nodes -func (ecb *ecBalancer) balanceShardTypeAcrossNodes( - collection string, - vid needle.VolumeId, - possibleDestinationEcNodes []*EcNode, - shardsPerNode map[string][]erasure_coding.ShardId, - nodeToShardCount map[string]int, - maxPerNode int, - shardType string, - antiAffinityNodes map[string]bool, -) error { - // Map ID to EcNode for lookup - nodeMap := make(map[string]*EcNode) - for _, n := range possibleDestinationEcNodes { - nodeMap[n.info.Id] = n - } - - // Find nodes with too many shards of this type - shardsToMove := make(map[erasure_coding.ShardId]*EcNode) - for nodeId, shards := range shardsPerNode { - if len(shards) <= maxPerNode { - continue - } - // Pick excess shards to move - excess := len(shards) - maxPerNode - ecNode := nodeMap[nodeId] - if ecNode == nil { - continue - } - - for i := 0; i < excess && i < len(shards); i++ { - shardId := shards[i] - // Verify node has this shard - si := findEcVolumeShardsInfo(ecNode, vid, ecb.diskType) - if si.Has(shardId) { - shardsToMove[shardId] = ecNode - } - } - } - - // Move shards to nodes that have fewer than maxPerNode of this type - for shardId, ecNode := range shardsToMove { - // Find destination node with room for this shard type - destNode, err := ecb.pickNodeForShardType(possibleDestinationEcNodes, shardsPerNode, maxPerNode, nodeToShardCount, antiAffinityNodes) - if err != nil { - fmt.Printf("ec %s shard %d.%d at %s can not find a destination node:\n%s\n", shardType, vid, shardId, ecNode.info.Id, err.Error()) - continue - } - - err = ecb.pickOneEcNodeAndMoveOneShard(ecNode, collection, vid, shardId, []*EcNode{destNode}) - if err != nil { - return err - } - - // Update tracking - destNodeId := destNode.info.Id - shardsPerNode[destNodeId] = append(shardsPerNode[destNodeId], 
shardId) - - // Remove from source node - srcNodeId := ecNode.info.Id - for i, s := range shardsPerNode[srcNodeId] { - if s == shardId { - shardsPerNode[srcNodeId] = append(shardsPerNode[srcNodeId][:i], shardsPerNode[srcNodeId][i+1:]...) - break - } - } - nodeToShardCount[destNodeId] += 1 - nodeToShardCount[srcNodeId] -= 1 - destNode.freeEcSlot -= 1 - ecNode.freeEcSlot += 1 - } - - return nil -} - -// pickNodeForShardType selects a node that has room for more shards of a specific type -func (ecb *ecBalancer) pickNodeForShardType( - nodes []*EcNode, - shardsPerNode map[string][]erasure_coding.ShardId, - maxPerNode int, - nodeToShardCount map[string]int, - antiAffinityNodes map[string]bool, -) (*EcNode, error) { - selector := &twoPassSelector[*EcNode]{ - candidates: nodes, - shardsPerTarget: shardsPerNode, - maxPerTarget: maxPerNode, - targetToShardCount: nodeToShardCount, - antiAffinity: antiAffinityNodes, - getKey: func(n *EcNode) string { - return n.info.Id - }, - hasFreeSlots: func(n *EcNode) bool { - return n.freeEcSlot > 0 - }, - checkLimit: func(n *EcNode) bool { - // For EC shards, replica placement constraint only applies when SameRackCount > 0. 
- if ecb.replicaPlacement != nil && ecb.replicaPlacement.SameRackCount > 0 { - return nodeToShardCount[n.info.Id] < ecb.replicaPlacement.SameRackCount+1 - } - return true - }, - } - - selected, err := selector.selectCandidate() - if err != nil { - return nil, errors.New("no node available for shard type balancing") - } - return selected, nil -} diff --git a/weed/shell/command_ec_common_avoid_test.go b/weed/shell/command_ec_common_avoid_test.go deleted file mode 100644 index b420eec5b..000000000 --- a/weed/shell/command_ec_common_avoid_test.go +++ /dev/null @@ -1,377 +0,0 @@ -package shell - -import ( - "testing" - - "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" - "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" - "github.com/seaweedfs/seaweedfs/weed/storage/needle" - "github.com/seaweedfs/seaweedfs/weed/storage/super_block" - "github.com/seaweedfs/seaweedfs/weed/storage/types" -) - -func TestPickRackForShardType_AntiAffinityRacks(t *testing.T) { - // Setup topology with 3 racks, each with 1 node, enough free slots - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - buildRackWithEcShards("rack0", "node0:8080", 100, nil), - buildRackWithEcShards("rack1", "node1:8080", 100, nil), - buildRackWithEcShards("rack2", "node2:8080", 100, nil), - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - racks := ecb.racks() - rackToShardCount := make(map[string]int) - shardsPerRack := make(map[string][]erasure_coding.ShardId) - maxPerRack := 2 - - // Case 1: Avoid rack0 - antiAffinityRacks := map[string]bool{"rack0": true} - - // Try multiple times to ensure randomness doesn't accidentally pass - for i := 0; i < 20; i++ { - picked, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, antiAffinityRacks) - if 
err != nil { - t.Fatalf("unexpected error: %v", err) - } - if picked == "rack0" { - t.Errorf("picked avoided rack rack0") - } - } - - // Case 2: Fallback - avoid all racks - avoidAll := map[string]bool{"rack0": true, "rack1": true, "rack2": true} - picked, err := ecb.pickRackForShardType(racks, shardsPerRack, maxPerRack, rackToShardCount, avoidAll) - if err != nil { - t.Fatalf("fallback failed: %v", err) - } - if picked == "" { - t.Errorf("expected some rack to be picked in fallback") - } -} - -func TestPickRackForShardType_EdgeCases(t *testing.T) { - t.Run("NoFreeSlots", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - buildRackWithEcShards("rack0", "node0:8080", 0, nil), // maxVolumes=0 - buildRackWithEcShards("rack1", "node1:8080", 0, nil), - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - racks := ecb.racks() - _, err := ecb.pickRackForShardType(racks, make(map[string][]erasure_coding.ShardId), 2, make(map[string]int), nil) - if err == nil { - t.Error("expected error when no free slots, got nil") - } - }) - - t.Run("AllRacksAtMaxCapacity", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - buildRackWithEcShards("rack0", "node0:8080", 100, nil), - buildRackWithEcShards("rack1", "node1:8080", 100, nil), - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - racks := ecb.racks() - shardsPerRack := map[string][]erasure_coding.ShardId{ - "rack0": {0, 1}, // 2 shards - "rack1": {2, 3}, // 2 shards - } - maxPerRack := 2 - - _, err := ecb.pickRackForShardType(racks, 
shardsPerRack, maxPerRack, make(map[string]int), nil) - if err == nil { - t.Error("expected error when all racks at max capacity, got nil") - } - }) - - t.Run("ReplicaPlacementLimit", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - buildRackWithEcShards("rack0", "node0:8080", 100, nil), - buildRackWithEcShards("rack1", "node1:8080", 100, nil), - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - rp, _ := super_block.NewReplicaPlacementFromString("012") // DiffRackCount = 1 - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - replicaPlacement: rp, - } - - racks := ecb.racks() - rackToShardCount := map[string]int{ - "rack0": 1, // At limit - "rack1": 0, - } - - picked, err := ecb.pickRackForShardType(racks, make(map[string][]erasure_coding.ShardId), 5, rackToShardCount, nil) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if picked != "rack1" { - t.Errorf("expected rack1 (not at limit), got %v", picked) - } - }) - - t.Run("PreferFewerShards", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - buildRackWithEcShards("rack0", "node0:8080", 100, nil), - buildRackWithEcShards("rack1", "node1:8080", 100, nil), - buildRackWithEcShards("rack2", "node2:8080", 100, nil), - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - racks := ecb.racks() - shardsPerRack := map[string][]erasure_coding.ShardId{ - "rack0": {0, 1}, // 2 shards - "rack1": {2}, // 1 shard - "rack2": {}, // 0 shards - } - - // Should pick rack2 (fewest shards) - picked, err := ecb.pickRackForShardType(racks, shardsPerRack, 5, make(map[string]int), nil) - 
if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if picked != "rack2" { - t.Errorf("expected rack2 (fewest shards), got %v", picked) - } - }) -} - -func TestPickNodeForShardType_AntiAffinityNodes(t *testing.T) { - // Setup topology with 1 rack, 3 nodes - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - { - Id: "rack0", - DataNodeInfos: []*master_pb.DataNodeInfo{ - buildDataNode("node0:8080", 100), - buildDataNode("node1:8080", 100), - buildDataNode("node2:8080", 100), - }, - }, - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - nodeToShardCount := make(map[string]int) - shardsPerNode := make(map[string][]erasure_coding.ShardId) - maxPerNode := 2 - - // Case 1: Avoid node0 - antiAffinityNodes := map[string]bool{"node0:8080": true} - - for i := 0; i < 20; i++ { - picked, err := ecb.pickNodeForShardType(ecNodes, shardsPerNode, maxPerNode, nodeToShardCount, antiAffinityNodes) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if picked.info.Id == "node0:8080" { - t.Errorf("picked avoided node node0") - } - } -} - -func TestPickNodeForShardType_EdgeCases(t *testing.T) { - t.Run("NoFreeSlots", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - { - Id: "rack0", - DataNodeInfos: []*master_pb.DataNodeInfo{ - buildDataNode("node0:8080", 0), // No capacity - }, - }, - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - } - - _, err := ecb.pickNodeForShardType(ecNodes, make(map[string][]erasure_coding.ShardId), 2, make(map[string]int), nil) - if err == nil { - t.Error("expected 
error when no free slots, got nil") - } - }) - - t.Run("ReplicaPlacementSameRackLimit", func(t *testing.T) { - topo := &master_pb.TopologyInfo{ - Id: "test_topo", - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - { - Id: "rack0", - DataNodeInfos: []*master_pb.DataNodeInfo{ - buildDataNode("node0:8080", 100), - buildDataNode("node1:8080", 100), - }, - }, - }, - }, - }, - } - - ecNodes, _ := collectEcVolumeServersByDc(topo, "", types.HardDriveType) - rp, _ := super_block.NewReplicaPlacementFromString("021") // SameRackCount = 1 - ecb := &ecBalancer{ - ecNodes: ecNodes, - diskType: types.HardDriveType, - replicaPlacement: rp, - } - - nodeToShardCount := map[string]int{ - "node0:8080": 3, // Exceeds SameRackCount + 1 - "node1:8080": 0, - } - - picked, err := ecb.pickNodeForShardType(ecNodes, make(map[string][]erasure_coding.ShardId), 5, nodeToShardCount, nil) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if picked.info.Id != "node1:8080" { - t.Errorf("expected node1 (not at limit), got %v", picked.info.Id) - } - }) -} - -func TestShardsByType(t *testing.T) { - vid := needle.VolumeId(123) - - // Create mock nodes with shards - nodes := []*EcNode{ - { - info: &master_pb.DataNodeInfo{ - Id: "node1", - DiskInfos: map[string]*master_pb.DiskInfo{ - string(types.HardDriveType): { - EcShardInfos: []*master_pb.VolumeEcShardInformationMessage{ - { - Id: uint32(vid), - EcIndexBits: uint32((1 << 0) | (1 << 1) | (1 << 10) | (1 << 11)), // data: 0,1 parity: 10,11 - }, - }, - }, - }, - }, - rack: "rack1", - }, - } - - t.Run("Standard10Plus4", func(t *testing.T) { - dataPerRack, parityPerRack := shardsByTypePerRack(vid, nodes, types.HardDriveType, 10) - - if len(dataPerRack["rack1"]) != 2 { - t.Errorf("expected 2 data shards, got %d", len(dataPerRack["rack1"])) - } - if len(parityPerRack["rack1"]) != 2 { - t.Errorf("expected 2 parity shards, got %d", len(parityPerRack["rack1"])) - } - }) - - t.Run("NodeGrouping", 
func(t *testing.T) { - dataPerNode, parityPerNode := shardsByTypePerNode(vid, nodes, types.HardDriveType, 10) - - if len(dataPerNode["node1"]) != 2 { - t.Errorf("expected 2 data shards on node1, got %d", len(dataPerNode["node1"])) - } - if len(parityPerNode["node1"]) != 2 { - t.Errorf("expected 2 parity shards on node1, got %d", len(parityPerNode["node1"])) - } - }) -} - -func buildDataNode(nodeId string, maxVolumes int64) *master_pb.DataNodeInfo { - return &master_pb.DataNodeInfo{ - Id: nodeId, - DiskInfos: map[string]*master_pb.DiskInfo{ - string(types.HardDriveType): { - Type: string(types.HardDriveType), - MaxVolumeCount: maxVolumes, - VolumeCount: 0, - }, - }, - } -} diff --git a/weed/shell/command_ec_decode.go b/weed/shell/command_ec_decode.go index 9da96a69a..62b201b05 100644 --- a/weed/shell/command_ec_decode.go +++ b/weed/shell/command_ec_decode.go @@ -405,9 +405,19 @@ func collectEcNodeShardsInfo(topoInfo *master_pb.TopologyInfo, vid needle.Volume res := make(map[pb.ServerAddress]*erasure_coding.ShardsInfo) eachDataNode(topoInfo, func(dc DataCenterId, rack RackId, dn *master_pb.DataNodeInfo) { if diskInfo, found := dn.DiskInfos[string(diskType)]; found { + // A node may report several EcShardInfos for one volume — one per + // physical disk holding shards of it (multi-disk nodes). Union them + // rather than overwriting, or only the last disk's shards survive and + // the node looks like it is missing shards it actually has. 
for _, v := range diskInfo.EcShardInfos { if v.Id == uint32(vid) { - res[pb.NewServerAddressFromDataNode(dn)] = erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(v) + addr := pb.NewServerAddressFromDataNode(dn) + si := erasure_coding.ShardsInfoFromVolumeEcShardInformationMessage(v) + if existing, ok := res[addr]; ok { + existing.Add(si) + } else { + res[addr] = si + } } } } diff --git a/weed/shell/command_ec_encode.go b/weed/shell/command_ec_encode.go index 3824a38ef..6d6d9eb3d 100644 --- a/weed/shell/command_ec_encode.go +++ b/weed/shell/command_ec_encode.go @@ -340,36 +340,58 @@ func doEcEncode(commandEnv *CommandEnv, writer io.Writer, volumeIdToCollection m } func verifyEcShardsBeforeDelete(commandEnv *CommandEnv, volumeIds []needle.VolumeId, diskType types.DiskType) error { - topoInfo, _, err := collectTopologyInfo(commandEnv, 0) - if err != nil { - return fmt.Errorf("fetch topology for shard verification: %w", err) - } + // Shard relocations from the preceding EC balance reach the master via + // volume-server heartbeats, so freshly distributed shards may not all be + // visible in the master topology immediately. Poll a few times before + // concluding the shard set is incomplete, so a heartbeat-propagation lag is + // not mistaken for missing data (which would abort the encode). Genuine loss + // still fails after the retries are exhausted. 
+ const maxAttempts = 10 + const retryInterval = 2 * time.Second - for _, vid := range volumeIds { - nodeShards := collectEcNodeShardsInfo(topoInfo, vid, diskType) - - var union erasure_coding.ShardBits - for _, info := range nodeShards { - union = erasure_coding.ShardBits(uint32(union) | info.Bitmap()) + var lastErr error + for attempt := 0; attempt < maxAttempts; attempt++ { + topoInfo, _, err := collectTopologyInfo(commandEnv, 0) + if err != nil { + return fmt.Errorf("fetch topology for shard verification: %w", err) } - totalShards := erasure_coding.TotalShardsCount - if err := erasure_coding.RequireFullShardSet(uint32(vid), union, totalShards); err != nil { - summary := make([]string, 0, len(nodeShards)) - for node, info := range nodeShards { - summary = append(summary, fmt.Sprintf("%s=%s", node, info.String())) + lastErr = nil + for _, vid := range volumeIds { + nodeShards := collectEcNodeShardsInfo(topoInfo, vid, diskType) + + var union erasure_coding.ShardBits + for _, info := range nodeShards { + union = erasure_coding.ShardBits(uint32(union) | info.Bitmap()) } - sort.Strings(summary) - glog.Errorf("EC shard verification failed for volume %d on diskType %q: %v; observed: %v", - vid, diskType.ReadableString(), err, summary) - return fmt.Errorf("volume %d: %w (observed: %v)", vid, err, summary) + + totalShards := erasure_coding.TotalShardsCount + if err := erasure_coding.RequireFullShardSet(uint32(vid), union, totalShards); err != nil { + summary := make([]string, 0, len(nodeShards)) + for node, info := range nodeShards { + summary = append(summary, fmt.Sprintf("%s=%s", node, info.String())) + } + sort.Strings(summary) + lastErr = fmt.Errorf("volume %d: %w (observed: %v)", vid, err, summary) + break + } + + glog.V(0).Infof("EC shard verification ok for volume %d on diskType %q: %d/%d shards present across %d nodes", + vid, diskType.ReadableString(), union.Count(), totalShards, len(nodeShards)) } - glog.V(0).Infof("EC shard verification ok for volume %d on 
diskType %q: %d/%d shards present across %d nodes", - vid, diskType.ReadableString(), union.Count(), totalShards, len(nodeShards)) + if lastErr == nil { + return nil + } + if attempt < maxAttempts-1 { + glog.V(0).Infof("EC shard verification incomplete (attempt %d/%d), waiting for shard locations to propagate: %v", + attempt+1, maxAttempts, lastErr) + time.Sleep(retryInterval) + } } - return nil + glog.Errorf("EC shard verification failed after %d attempts: %v", maxAttempts, lastErr) + return lastErr } // doDeleteVolumesWithLocations deletes volumes using pre-collected location information diff --git a/weed/shell/command_ec_test.go b/weed/shell/command_ec_test.go index 4a422b7ea..af9ee3da9 100644 --- a/weed/shell/command_ec_test.go +++ b/weed/shell/command_ec_test.go @@ -19,7 +19,7 @@ func TestCommandEcBalanceSmall(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) } func TestCommandEcBalanceNothingToMove(t *testing.T) { @@ -36,7 +36,7 @@ func TestCommandEcBalanceNothingToMove(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) } func TestCommandEcBalanceAddNewServers(t *testing.T) { @@ -55,7 +55,7 @@ func TestCommandEcBalanceAddNewServers(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) } func TestCommandEcBalanceAddNewRacks(t *testing.T) { @@ -74,7 +74,7 @@ func TestCommandEcBalanceAddNewRacks(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) } func TestCommandEcBalanceVolumeEvenButRackUneven(t *testing.T) { @@ -118,8 +118,7 @@ func TestCommandEcBalanceVolumeEvenButRackUneven(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") - ecb.balanceEcRacks() + ecb.balance([]string{"c1"}) } func newEcNode(dc string, rack string, dataNodeId string, freeEcSlot int) *EcNode { @@ -158,7 +157,7 @@ func 
TestCommandEcBalanceEvenDataAndParityDistribution(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) // After balancing (dry-run), verify the PLANNED distribution by checking what moves were proposed // The ecb.ecNodes state is updated during dry-run to track planned moves @@ -262,7 +261,7 @@ func TestCommandEcBalanceMultipleVolumesEvenDistribution(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") + ecb.balance([]string{"c1"}) // Check both volumes for _, vid := range []needle.VolumeId{1, 2} { @@ -316,8 +315,7 @@ func TestCommandEcBalanceAllNodesShareAllVolumes(t *testing.T) { diskType: types.HardDriveType, } - ecb.balanceEcVolumes("c1") - ecb.balanceEcRacks() + ecb.balance([]string{"c1"}) // Count total shards per node after balancing for _, node := range ecb.ecNodes { @@ -396,8 +394,7 @@ func TestCommandEcBalanceIssue8793Topology(t *testing.T) { t.Logf("BEFORE node %s (max %d): %d shards", node.info.Id, node.freeEcSlot+count, count) } - ecb.balanceEcVolumes("cldata") - ecb.balanceEcRacks() + ecb.balance([]string{"cldata"}) // Verify: no node should exceed the average totalShards := 0 diff --git a/weed/storage/erasure_coding/ecbalancer/balancer.go b/weed/storage/erasure_coding/ecbalancer/balancer.go new file mode 100644 index 000000000..3cf7aaca9 --- /dev/null +++ b/weed/storage/erasure_coding/ecbalancer/balancer.go @@ -0,0 +1,1058 @@ +// Package ecbalancer holds the EC-shard rebalancing policy shared by the shell +// ec.balance command and the admin EC balance worker. It is pure: callers build +// a Topology snapshot from their own structures, call Plan to get the list of +// shard Moves, and execute them their own way (inline RPCs in the shell, task +// proposals in the worker). Keeping the policy here stops the two callers from +// drifting apart. 
package ecbalancer

import (
	"math"
	"sort"

	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
)

// Topology is a snapshot of EC shard placement to plan against. Build it with
// NewTopology + AddNode + (*Node).AddDisk/AddShards.
type Topology struct {
	nodes map[string]*Node // keyed by node id; a later AddNode with the same id replaces the entry
}

// volKey identifies an EC volume by (collection, id). A numeric volume id can be
// reused across collections, so the collection is part of the identity (see
// weed/storage/store_ec_attach_reservation.go); keying shards by id alone would
// merge unrelated volumes and could dedup/move shards across collections.
type volKey struct {
	collection string
	vid        uint32
}

// Node is a volume server in the snapshot. Fields are set through the builder
// methods; only its identity is read back (via Move).
type Node struct {
	id        string
	dc        string
	rack      string                   // composite rack key (e.g. "dc1:rack1")
	freeSlots int                      // node-wide free EC shard slots; adjusted in place as moves are planned
	disks     map[uint32]*disk         // disk id -> disk, filled by AddDisk
	shards    map[volKey]*volumeShards // volume -> shards of it held by this node, filled by AddShards
}

// disk is one physical disk of a Node, as registered by AddDisk.
type disk struct {
	diskID     uint32
	diskType   string
	freeSlots  int // free EC shard slots on this disk
	shardCount int // total EC shards on this disk across all volumes
}

// volumeShards records which shards of a single volume a node holds, both as a
// node-wide union and broken down by disk.
type volumeShards struct {
	collection    string
	shardBits     erasure_coding.ShardBits            // union across disks
	diskShardBits map[uint32]erasure_coding.ShardBits // disk_id -> shards of this volume on that disk
}

// rack groups the nodes that share a rack key. freeSlots starts as the sum of
// the member nodes' freeSlots and is adjusted as cross-rack moves are planned.
type rack struct {
	nodes     map[string]*Node // keyed by node id
	freeSlots int
}

// Move is a planned shard relocation. For a dedup deletion SourceNode==TargetNode
// and SourceDisk==TargetDisk (unmount+delete only, no copy).
type Move struct {
	VolumeID   uint32
	ShardID    int
	Collection string
	SourceNode string
	SourceDisk uint32
	SourceRack string
	TargetNode string
	TargetDisk uint32
	TargetRack string
	Phase      string // "dedup", "cross_rack", "within_rack", "global"
}

// Options tunes a Plan run.
+type Options struct { + DiskType string // "" matches any disk type + ImbalanceThreshold float64 // skip rack/node balancing below this skew + ReplicaPlacement *super_block.ReplicaPlacement // nil = even spread, no per-rack/node cap + // Ratio returns a collection's (dataShards, parityShards); nil defaults to the + // standard scheme. This is where a caller plugs in custom-ratio resolution. + Ratio func(collection string) (dataShards, parityShards int) + // GlobalMaxMovesPerRack caps how many shards the global (cross-volume) phase + // moves out of one rack in a single Plan. 0 means unlimited (drain to balance + // in one pass), which the shell uses; the worker sets a small value to make + // incremental progress across repeated detection cycles. + GlobalMaxMovesPerRack int + // GlobalUtilizationBased selects the global phase's balance metric: when true, + // nodes are balanced by fractional fullness (shards/capacity), which suits + // heterogeneous-capacity racks; when false, by raw shard count. The worker + // uses utilization; the shell uses raw count. + GlobalUtilizationBased bool +} + +// move is the internal form carrying node pointers; converted to Move at the end. +type move struct { + volumeID uint32 + shardID int + collection string + source *Node + sourceDisk uint32 + target *Node + targetDisk uint32 + phase string +} + +// NewTopology returns an empty topology to populate. +func NewTopology() *Topology { + return &Topology{nodes: make(map[string]*Node)} +} + +// AddNode registers a volume server. freeSlots is the node's total free EC shard +// slots; per-disk free slots are supplied via AddDisk. +func (t *Topology) AddNode(id, dc, rackKey string, freeSlots int) *Node { + n := &Node{ + id: id, + dc: dc, + rack: rackKey, + freeSlots: freeSlots, + disks: make(map[uint32]*disk), + shards: make(map[volKey]*volumeShards), + } + t.nodes[id] = n + return n +} + +// AddDisk registers a physical disk. 
shardCount is the disk's total EC shard +// count across all volumes (used for disk scoring); freeSlots is the per-disk +// free EC shard slots. +func (n *Node) AddDisk(diskID uint32, diskType string, freeSlots, shardCount int) { + n.disks[diskID] = &disk{diskID: diskID, diskType: diskType, freeSlots: freeSlots, shardCount: shardCount} +} + +// AddShards records that the volume's shards in bits live on diskID. Call it +// only for the volumes that should be balanced; the disk's overall occupancy is +// reported separately via AddDisk. +func (n *Node) AddShards(vid uint32, collection string, diskID uint32, bits erasure_coding.ShardBits) { + key := volKey{collection: collection, vid: vid} + vs, ok := n.shards[key] + if !ok { + vs = &volumeShards{collection: collection, diskShardBits: make(map[uint32]erasure_coding.ShardBits)} + n.shards[key] = vs + } + vs.shardBits |= bits + vs.diskShardBits[diskID] |= bits +} + +// Plan runs the full multi-phase EC balance policy and returns the proposed +// moves: per collection it deduplicates, then spreads data and parity shards +// across racks and within racks (two-pass, with anti-affinity), and finally +// balances total shard load across nodes in each rack. +func Plan(topo *Topology, opts Options) []Move { + if topo == nil || len(topo.nodes) == 0 { + return nil + } + ratio := opts.Ratio + if ratio == nil { + ratio = func(string) (int, int) { + return erasure_coding.DataShardsCount, erasure_coding.ParityShardsCount + } + } + + nodes := topo.nodes + racks := buildRacks(nodes) + + // Group volumes by collection (deterministic order), keyed by (collection, id) + // so volumes that reuse a numeric id across collections stay distinct. Resolve + // each collection's data-shard count once for the global phase's disk scoring. 
+ byCollection := make(map[string][]volKey) + seen := make(map[volKey]bool) + for _, n := range nodes { + for vk := range n.shards { + if !seen[vk] { + seen[vk] = true + byCollection[vk.collection] = append(byCollection[vk.collection], vk) + } + } + } + collections := make([]string, 0, len(byCollection)) + dataShardsByCollection := make(map[string]int) + for c := range byCollection { + collections = append(collections, c) + sort.Slice(byCollection[c], func(i, j int) bool { return byCollection[c][i].vid < byCollection[c][j].vid }) + d, _ := ratio(c) + dataShardsByCollection[c] = d + } + sort.Strings(collections) + + var all []*move + for _, collection := range collections { + dataShards, parityShards := ratio(collection) + + for _, vk := range byCollection[collection] { + m := detectDuplicateShards(vk, nodes) + applyMovesToTopology(m, racks) + all = append(all, m...) + } + for _, vk := range byCollection[collection] { + m := detectCrossRackImbalance(vk, nodes, racks, opts.DiskType, opts.ImbalanceThreshold, dataShards, parityShards, opts.ReplicaPlacement) + applyMovesToTopology(m, racks) + all = append(all, m...) + } + for _, vk := range byCollection[collection] { + m := detectWithinRackImbalance(vk, nodes, racks, opts.DiskType, opts.ImbalanceThreshold, dataShards, parityShards, opts.ReplicaPlacement) + applyMovesToTopology(m, racks) + all = append(all, m...) + } + } + + all = append(all, detectGlobalImbalance(nodes, racks, opts.DiskType, opts.ImbalanceThreshold, dataShardsByCollection, opts.GlobalMaxMovesPerRack, opts.GlobalUtilizationBased)...) 
+ + out := make([]Move, 0, len(all)) + for _, m := range all { + out = append(out, Move{ + VolumeID: m.volumeID, + ShardID: m.shardID, + Collection: m.collection, + SourceNode: m.source.id, + SourceDisk: m.sourceDisk, + SourceRack: m.source.rack, + TargetNode: m.target.id, + TargetDisk: m.targetDisk, + TargetRack: m.target.rack, + Phase: m.phase, + }) + } + return out +} + +func buildRacks(nodes map[string]*Node) map[string]*rack { + racks := make(map[string]*rack) + for _, n := range nodes { + r, ok := racks[n.rack] + if !ok { + r = &rack{nodes: make(map[string]*Node)} + racks[n.rack] = r + } + r.nodes[n.id] = n + r.freeSlots += n.freeSlots + } + return racks +} + +// detectDuplicateShards finds shards present on more than one node and proposes +// deleting all copies but the one on the node with the most free slots. +func detectDuplicateShards(vk volKey, nodes map[string]*Node) []*move { + shardLocations := make(map[int][]*Node) + for _, node := range nodes { + info, ok := node.shards[vk] + if !ok { + continue + } + for shardID := 0; shardID < erasure_coding.MaxShardCount; shardID++ { + if info.shardBits.Has(erasure_coding.ShardId(shardID)) { + shardLocations[shardID] = append(shardLocations[shardID], node) + } + } + } + + var moves []*move + for shardID, locs := range shardLocations { + if len(locs) <= 1 { + continue + } + // Keep the copy on the node with the most free slots and delete the + // duplicates from the more-constrained nodes, relieving capacity pressure + // where it is tightest. Sort ascending by free slots (tie-break on node id + // for determinism) and keep the last entry. 
+ sort.Slice(locs, func(i, j int) bool { + if locs[i].freeSlots != locs[j].freeSlots { + return locs[i].freeSlots < locs[j].freeSlots + } + return locs[i].id < locs[j].id + }) + for _, node := range locs[:len(locs)-1] { + moves = append(moves, &move{ + volumeID: vk.vid, + shardID: shardID, + collection: vk.collection, + source: node, + sourceDisk: shardDiskID(node, vk, shardID), + target: node, + targetDisk: shardDiskID(node, vk, shardID), + phase: "dedup", + }) + } + } + return moves +} + +// detectCrossRackImbalance spreads a volume's shards across racks in two passes +// (data, then parity with anti-affinity to data-bearing racks). Returns nil if +// the overall distribution is below the imbalance threshold. +func detectCrossRackImbalance(vk volKey, nodes map[string]*Node, racks map[string]*rack, diskType string, threshold float64, dataShards, parityShards int, rp *super_block.ReplicaPlacement) []*move { + numRacks := len(racks) + if numRacks <= 1 { + return nil + } + + // Gate on per-type spread: act when data OR parity shards are unevenly + // distributed across racks, even if the per-rack totals happen to be even. + gateData, gateParity := shardsByGroup(vk, nodes, dataShards, func(n *Node) string { return n.rack }) + if !typeImbalanced(gateData, numRacks, threshold) && !typeImbalanced(gateParity, numRacks, threshold) { + return nil + } + + rackShardCount := countShardsByRack(vk, nodes) + var moves []*move + + dataPerRack, _ := shardsByGroup(vk, nodes, dataShards, func(n *Node) string { return n.rack }) + moves = append(moves, balanceShardTypeAcrossRacks(vk, nodes, racks, diskType, dataShards, + dataPerRack, rackShardCount, ceilDivide(dataShards, numRacks), nil, rp)...) 
+ + dataPerRack, parityPerRack := shardsByGroup(vk, nodes, dataShards, func(n *Node) string { return n.rack }) + antiAffinity := make(map[string]bool) + for rackID, shards := range dataPerRack { + if len(shards) > 0 { + antiAffinity[rackID] = true + } + } + moves = append(moves, balanceShardTypeAcrossRacks(vk, nodes, racks, diskType, dataShards, + parityPerRack, rackShardCount, ceilDivide(parityShards, numRacks), antiAffinity, rp)...) + + return moves +} + +func balanceShardTypeAcrossRacks(vk volKey, nodes map[string]*Node, racks map[string]*rack, diskType string, dataShards int, shardsPerRack map[string][]int, rackShardCount map[string]int, maxPerRack int, antiAffinity map[string]bool, rp *super_block.ReplicaPlacement) []*move { + if maxPerRack < 1 { + maxPerRack = 1 + } + rackKeys := sortedKeys(racks) + + type pending struct { + shardID int + src *Node + } + var toMove []pending + for _, rackID := range rackKeys { + shards := append([]int(nil), shardsPerRack[rackID]...) + if len(shards) <= maxPerRack { + continue + } + sort.Ints(shards) + for i := 0; i < len(shards)-maxPerRack; i++ { + if src := nodeInRackHoldingShard(nodes, rackID, vk, shards[i]); src != nil { + toMove = append(toMove, pending{shards[i], src}) + } + } + } + + var moves []*move + for _, pm := range toMove { + destRack, ok := pickTarget(rackKeys, shardsPerRack, maxPerRack, antiAffinity, + func(r string) bool { return racks[r].freeSlots > 0 }, + func(r string) bool { + if rp != nil && rp.DiffRackCount > 0 { + return rackShardCount[r] < rp.DiffRackCount + } + return true + }) + if !ok { + continue + } + destNode := pickNodeInRack(racks[destRack], vk, rp) + if destNode == nil { + continue + } + destDisk := pickBestDiskOnNode(destNode, vk, diskType, pm.shardID, dataShards) + moves = append(moves, &move{ + volumeID: vk.vid, + shardID: pm.shardID, + collection: vk.collection, + source: pm.src, + sourceDisk: shardDiskID(pm.src, vk, pm.shardID), + target: destNode, + targetDisk: destDisk, + phase: 
"cross_rack", + }) + releaseShard(pm.src, vk, pm.shardID) + reserveShard(destNode, vk, pm.shardID, destDisk) + srcRack := pm.src.rack + shardsPerRack[destRack] = append(shardsPerRack[destRack], pm.shardID) + shardsPerRack[srcRack] = removeInt(shardsPerRack[srcRack], pm.shardID) + rackShardCount[destRack]++ + rackShardCount[srcRack]-- + racks[destRack].freeSlots-- + racks[srcRack].freeSlots++ + // Account at the node level too, so pickNodeInRack does not over-plan a + // limited-capacity destination across successive moves. + destNode.freeSlots-- + pm.src.freeSlots++ + } + return moves +} + +func pickNodeInRack(r *rack, vk volKey, rp *super_block.ReplicaPlacement) *Node { + var best *Node + bestCount := -1 + for _, id := range sortedNodeKeys(r.nodes) { + node := r.nodes[id] + if node.freeSlots <= 0 { + continue + } + count := volumeShardCount(node, vk) + if rp != nil && rp.SameRackCount > 0 && count >= rp.SameRackCount+1 { + continue + } + if best == nil || count < bestCount { + best, bestCount = node, count + } + } + return best +} + +// detectWithinRackImbalance spreads a volume's shards across the nodes of each +// rack, again data then parity with anti-affinity. +func detectWithinRackImbalance(vk volKey, nodes map[string]*Node, racks map[string]*rack, diskType string, threshold float64, dataShards, parityShards int, rp *super_block.ReplicaPlacement) []*move { + var moves []*move + + for _, rackID := range sortedKeys(racks) { + r := racks[rackID] + if len(r.nodes) <= 1 { + continue + } + + numNodes := len(r.nodes) + // Gate on per-type spread across the rack's nodes (see cross-rack phase). 
+ gateData, gateParity := shardsByGroup(vk, r.nodes, dataShards, func(n *Node) string { return n.id }) + if !typeImbalanced(gateData, numNodes, threshold) && !typeImbalanced(gateParity, numNodes, threshold) { + continue + } + nodeShardCount := countShardsByNode(vk, r.nodes) + + dataPerNode, _ := shardsByGroup(vk, r.nodes, dataShards, func(n *Node) string { return n.id }) + moves = append(moves, balanceShardTypeAcrossNodes(vk, r, diskType, dataShards, + dataPerNode, nodeShardCount, ceilDivide(sumLens(dataPerNode), numNodes), nil, rp)...) + + dataPerNode, parityPerNode := shardsByGroup(vk, r.nodes, dataShards, func(n *Node) string { return n.id }) + antiAffinity := make(map[string]bool) + for nodeID, shards := range dataPerNode { + if len(shards) > 0 { + antiAffinity[nodeID] = true + } + } + moves = append(moves, balanceShardTypeAcrossNodes(vk, r, diskType, dataShards, + parityPerNode, nodeShardCount, ceilDivide(sumLens(parityPerNode), numNodes), antiAffinity, rp)...) + } + + return moves +} + +func balanceShardTypeAcrossNodes(vk volKey, r *rack, diskType string, dataShards int, shardsPerNode map[string][]int, nodeShardCount map[string]int, maxPerNode int, antiAffinity map[string]bool, rp *super_block.ReplicaPlacement) []*move { + if maxPerNode < 1 { + maxPerNode = 1 + } + nodeKeys := sortedNodeKeys(r.nodes) + + type pending struct { + shardID int + src *Node + } + var toMove []pending + for _, nodeID := range nodeKeys { + shards := append([]int(nil), shardsPerNode[nodeID]...) 
+ if len(shards) <= maxPerNode { + continue + } + sort.Ints(shards) + src := r.nodes[nodeID] + for i := 0; i < len(shards)-maxPerNode; i++ { + toMove = append(toMove, pending{shards[i], src}) + } + } + + var moves []*move + for _, pm := range toMove { + destID, ok := pickTarget(nodeKeys, shardsPerNode, maxPerNode, antiAffinity, + func(n string) bool { return n != pm.src.id && r.nodes[n].freeSlots > 0 }, + func(n string) bool { + if rp != nil && rp.SameRackCount > 0 { + return nodeShardCount[n] < rp.SameRackCount+1 + } + return true + }) + if !ok { + continue + } + destNode := r.nodes[destID] + destDisk := pickBestDiskOnNode(destNode, vk, diskType, pm.shardID, dataShards) + moves = append(moves, &move{ + volumeID: vk.vid, + shardID: pm.shardID, + collection: vk.collection, + source: pm.src, + sourceDisk: shardDiskID(pm.src, vk, pm.shardID), + target: destNode, + targetDisk: destDisk, + phase: "within_rack", + }) + releaseShard(pm.src, vk, pm.shardID) + reserveShard(destNode, vk, pm.shardID, destDisk) + shardsPerNode[destID] = append(shardsPerNode[destID], pm.shardID) + shardsPerNode[pm.src.id] = removeInt(shardsPerNode[pm.src.id], pm.shardID) + nodeShardCount[destID]++ + nodeShardCount[pm.src.id]-- + pm.src.freeSlots++ + destNode.freeSlots-- + } + return moves +} + +// detectGlobalImbalance balances total EC shard load across the nodes of each +// rack (across all volumes), using utilization ratios so heterogeneous-capacity +// nodes are compared fairly. 
// Moves stay inside a rack: each iteration moves one shard from the rack's
// fullest node to its emptiest node that still has free slots. maxMovesPerRack
// caps the iterations per rack (<= 0 means up to the rack's shard total).
// Racks with a single node, with no shards, or below threshold are skipped.
// The nodes parameter is not read by this function; only racks is used.
func detectGlobalImbalance(nodes map[string]*Node, racks map[string]*rack, diskType string, threshold float64, dataShardsByCollection map[string]int, maxMovesPerRack int, byUtilization bool) []*move {
	var moves []*move

	for _, rackID := range sortedKeys(racks) {
		r := racks[rackID]
		if len(r.nodes) <= 1 {
			continue
		}

		// Count every shard of every volume on each node of the rack.
		nodeShardCounts := make(map[string]int)
		totalShards := 0
		for nodeID, node := range r.nodes {
			count := 0
			for _, info := range node.shards {
				count += info.shardBits.Count()
			}
			nodeShardCounts[nodeID] = count
			totalShards += count
		}
		if totalShards == 0 {
			continue
		}

		// The balance metric is shards/capacity. For utilization balancing the
		// capacity is the node's real shard-slot capacity; for raw-count balancing
		// it is a constant 1, so the metric reduces to the raw shard count. Either
		// way a node can only receive while its real freeSlots remain.
		nodeCapacity := make(map[string]int, len(r.nodes))
		for nodeID, count := range nodeShardCounts {
			if byUtilization {
				nodeCapacity[nodeID] = count + r.nodes[nodeID].freeSlots
			} else {
				nodeCapacity[nodeID] = 1
			}
		}
		if !exceedsUtilImbalanceThreshold(nodeShardCounts, nodeCapacity, threshold) {
			continue
		}

		// Each iteration moves one shard. 0 means unlimited (drain to balance in
		// one pass) — bounded by totalShards since the convergence guard stops
		// once no beneficial move remains.
		iterations := maxMovesPerRack
		if iterations <= 0 {
			iterations = totalShards
		}
		for i := 0; i < iterations; i++ {
			// Pick the least- and most-loaded nodes. Nodes are scanned in sorted id
			// order and comparisons are strict, so ties go to the lowest id.
			var minNode, maxNode *Node
			minUtil := math.Inf(1)
			maxUtil := -1.0
			var minCount, maxCount int
			for _, nodeID := range sortedNodeKeys(r.nodes) {
				count := nodeShardCounts[nodeID]
				node := r.nodes[nodeID]
				capacity := nodeCapacity[nodeID]
				if capacity <= 0 {
					continue
				}
				util := float64(count) / float64(capacity)
				// Only a node with free slots can be the receiver.
				if util < minUtil && node.freeSlots > 0 {
					minUtil, minCount, minNode = util, count, node
				}
				if util > maxUtil {
					maxUtil, maxCount, maxNode = util, count, node
				}
			}
			if maxNode == nil || minNode == nil || maxNode.id == minNode.id {
				break
			}

			maxCap := nodeCapacity[maxNode.id]
			minCap := nodeCapacity[minNode.id]
			if maxCap <= 0 || minCap <= 0 {
				break
			}
			// Convergence guard: stop if moving one shard would leave the receiver
			// more loaded than the sender.
			if float64(minCount+1)/float64(minCap) > float64(maxCount-1)/float64(maxCap) {
				break
			}

			// Prefer moving a shard of a volume the destination does not hold at
			// all (pass 0) before adding another shard of an already-present volume
			// (pass 1), to keep each volume's shards spread across nodes.
			moved := false
			for pass := 0; pass < 2 && !moved; pass++ {
				for _, vk := range sortedVolumeKeys(maxNode.shards) {
					if moved {
						break
					}
					info := maxNode.shards[vk]
					minInfo := minNode.shards[vk]
					volumeOnMin := minInfo != nil && minInfo.shardBits != 0
					if pass == 0 && volumeOnMin {
						continue // pass 0: only volumes absent from the destination
					}
					if pass == 1 && !volumeOnMin {
						continue // pass 1: only volumes already on the destination
					}
					// Iterate the full shard-id space so custom ratios with more than
					// the standard total (ids 14..MaxShardCount-1) are candidates too.
					for shardID := 0; shardID < erasure_coding.MaxShardCount; shardID++ {
						sid := erasure_coding.ShardId(shardID)
						if !info.shardBits.Has(sid) {
							continue
						}
						// Never create a duplicate of a shard the receiver already has.
						if minInfo != nil && minInfo.shardBits.Has(sid) {
							continue
						}
						// Fall back to the standard data-shard count for collections
						// with no (or a non-positive) recorded ratio.
						dataShards := dataShardsByCollection[vk.collection]
						if dataShards <= 0 {
							dataShards = erasure_coding.DataShardsCount
						}
						destDisk := pickBestDiskOnNode(minNode, vk, diskType, shardID, dataShards)
						// sourceDisk is resolved inside the literal, i.e. before the
						// source's bits are cleared below.
						moves = append(moves, &move{
							volumeID:   vk.vid,
							shardID:    shardID,
							collection: vk.collection,
							source:     maxNode,
							sourceDisk: shardDiskID(maxNode, vk, shardID),
							target:     minNode,
							targetDisk: destDisk,
							phase:      "global",
						})
						// NOTE(review): the source side is cleared by hand here, whereas
						// the cross-rack and within-rack phases call releaseShard. If
						// releaseShard also adjusts per-disk counters, the source disk's
						// counters go stale in this phase — confirm against releaseShard.
						info.shardBits = info.shardBits.Clear(sid)
						for diskID := range info.diskShardBits {
							info.diskShardBits[diskID] = info.diskShardBits[diskID].Clear(sid)
						}
						reserveShard(minNode, vk, shardID, destDisk)
						nodeShardCounts[maxNode.id]--
						nodeShardCounts[minNode.id]++
						maxNode.freeSlots++
						minNode.freeSlots--
						moved = true
						break
					}
				}
			}
			// No movable shard between this pair: nothing more to do in this rack.
			if !moved {
				break
			}
		}
	}

	return moves
}

// shardsByGroup classifies a volume's shards into data (id < dataShards) and
// parity buckets, grouped by key(node). Nodes that do not hold the volume are
// skipped. Within one node shard ids are appended in ascending order, but nodes
// are visited in map order, so a bucket fed by several nodes is not globally
// sorted.
func shardsByGroup(vk volKey, nodes map[string]*Node, dataShards int, key func(*Node) string) (dataPer, parityPer map[string][]int) {
	dataPer = make(map[string][]int)
	parityPer = make(map[string][]int)
	for _, node := range nodes {
		info, ok := node.shards[vk]
		if !ok {
			continue
		}
		k := key(node)
		for s := 0; s < erasure_coding.MaxShardCount; s++ {
			if !info.shardBits.Has(erasure_coding.ShardId(s)) {
				continue
			}
			if s < dataShards {
				dataPer[k] = append(dataPer[k], s)
			} else {
				parityPer[k] = append(parityPer[k], s)
			}
		}
	}
	return
}

// pickTarget selects a destination key with room for another shard of a type, in
// two passes: first excluding anti-affinity targets, then any valid target.
// Among valid targets it prefers the fewest shards of this type; ties break on
// the order of candidates, so with sorted candidates selection is deterministic.
//
// A candidate is valid when hasFreeSlots accepts it, it holds fewer than
// maxPerTarget entries in shardsPerTarget, and withinLimit accepts it. The
// second result is false when no candidate qualifies. The empty string doubles
// as the "nothing chosen" marker, so a candidate named "" is never returned as
// a match.
func pickTarget(candidates []string, shardsPerTarget map[string][]int, maxPerTarget int, antiAffinity map[string]bool, hasFreeSlots, withinLimit func(string) bool) (string, bool) {
	// scan walks the candidates once and keeps the first one with the strictly
	// smallest shard count; honorAntiAffinity drops anti-affinity keys entirely.
	scan := func(honorAntiAffinity bool) (string, bool) {
		chosen, fewest := "", maxPerTarget+1
		for _, key := range candidates {
			if honorAntiAffinity && antiAffinity[key] {
				continue
			}
			if !hasFreeSlots(key) {
				continue
			}
			held := len(shardsPerTarget[key])
			if held >= maxPerTarget || !withinLimit(key) {
				continue
			}
			if held < fewest {
				chosen, fewest = key, held
			}
		}
		return chosen, chosen != ""
	}

	// The strict pass only makes sense when there is something to avoid.
	if len(antiAffinity) != 0 {
		if key, found := scan(true); found {
			return key, true
		}
	}
	return scan(false)
}

// pickBestDiskOnNode chooses the physical disk to place a new shard of the
// volume: matching disk type with free capacity, preferring fewer total shards,
// far fewer shards of the same volume, and data/parity anti-affinity. Returns 0
// ("server picks") when no disk info or no disk with capacity.
+func pickBestDiskOnNode(node *Node, vk volKey, diskType string, shardID, dataShardCount int) uint32 { + if len(node.disks) == 0 { + return 0 + } + isDataShard := dataShardCount > 0 && shardID < dataShardCount + info := node.shards[vk] + + var bestDiskID uint32 + bestScore := -1 + for _, diskID := range sortedDiskKeys(node.disks) { + d := node.disks[diskID] + if diskType != "" && d.diskType != diskType { + continue + } + if d.freeSlots <= 0 { + continue + } + + existingShards := 0 + hasData := false + hasParity := false + if info != nil { + bits := info.diskShardBits[diskID] + existingShards = bits.Count() + if dataShardCount > 0 { + for s := 0; s < erasure_coding.MaxShardCount; s++ { + if !bits.Has(erasure_coding.ShardId(s)) { + continue + } + if s < dataShardCount { + hasData = true + } else { + hasParity = true + } + } + } + } + + score := d.shardCount*10 + existingShards*100 + if dataShardCount > 0 { + if isDataShard && hasParity { + score += 1000 + } else if !isDataShard && hasData { + score += 1000 + } + } + if bestScore == -1 || score < bestScore { + bestScore = score + bestDiskID = diskID + } + } + return bestDiskID +} + +// shardDiskID returns the disk holding the given shard of the volume, or 0. +func shardDiskID(node *Node, vk volKey, shardID int) uint32 { + info, ok := node.shards[vk] + if !ok { + return 0 + } + sid := erasure_coding.ShardId(shardID) + for _, diskID := range sortedDiskKeys(info.diskShardBits) { + if info.diskShardBits[diskID].Has(sid) { + return diskID + } + } + return 0 +} + +// reserveShard records a just-planned placement on the destination so later picks +// in the same run spread across disks/nodes. 
+func reserveShard(node *Node, vk volKey, shardID int, diskID uint32) { + info, ok := node.shards[vk] + if !ok { + info = &volumeShards{collection: vk.collection, diskShardBits: make(map[uint32]erasure_coding.ShardBits)} + node.shards[vk] = info + } + if info.diskShardBits == nil { + info.diskShardBits = make(map[uint32]erasure_coding.ShardBits) + } + sid := erasure_coding.ShardId(shardID) + info.shardBits = info.shardBits.Set(sid) + info.diskShardBits[diskID] = info.diskShardBits[diskID].Set(sid) + if d, ok := node.disks[diskID]; ok { + d.shardCount++ + if d.freeSlots > 0 { + d.freeSlots-- + } + } +} + +// releaseShard removes a shard of the volume from a node's model. +func releaseShard(node *Node, vk volKey, shardID int) { + info, ok := node.shards[vk] + if !ok { + return + } + sid := erasure_coding.ShardId(shardID) + for diskID, bits := range info.diskShardBits { + if bits.Has(sid) { + info.diskShardBits[diskID] = bits.Clear(sid) + if d, ok := node.disks[diskID]; ok { + d.shardCount-- + d.freeSlots++ + } + } + } + info.shardBits = info.shardBits.Clear(sid) +} + +// applyMovesToTopology simulates moves so later phases see updated placement. +// Dedup moves (source==target) are deletions that this helper alone applies, so +// it also credits the freed disk/node/rack capacity — otherwise a slot opened by +// dedup could not be used by the cross-rack/within-rack/global phases in the same +// run. Non-dedup moves already had their slots accounted inline by the phase that +// produced them, so only their shard bits are (idempotently) re-asserted here. 
+func applyMovesToTopology(moves []*move, racks map[string]*rack) { + for _, m := range moves { + sid := erasure_coding.ShardId(m.shardID) + vk := volKey{collection: m.collection, vid: m.volumeID} + dedup := m.source.id == m.target.id + + if srcInfo, ok := m.source.shards[vk]; ok { + srcInfo.shardBits = srcInfo.shardBits.Clear(sid) + for diskID := range srcInfo.diskShardBits { + if !srcInfo.diskShardBits[diskID].Has(sid) { + continue + } + srcInfo.diskShardBits[diskID] = srcInfo.diskShardBits[diskID].Clear(sid) + if dedup { + if d, ok := m.source.disks[diskID]; ok { + d.shardCount-- + d.freeSlots++ + } + } + } + } + + if dedup { + m.source.freeSlots++ + if r, ok := racks[m.source.rack]; ok { + r.freeSlots++ + } + continue + } + + dstInfo, ok := m.target.shards[vk] + if !ok { + dstInfo = &volumeShards{collection: m.collection, diskShardBits: make(map[uint32]erasure_coding.ShardBits)} + m.target.shards[vk] = dstInfo + } + if dstInfo.diskShardBits == nil { + dstInfo.diskShardBits = make(map[uint32]erasure_coding.ShardBits) + } + dstInfo.shardBits = dstInfo.shardBits.Set(sid) + dstInfo.diskShardBits[m.targetDisk] = dstInfo.diskShardBits[m.targetDisk].Set(sid) + } +} + +func volumeShardCount(node *Node, vk volKey) int { + if info, ok := node.shards[vk]; ok { + return info.shardBits.Count() + } + return 0 +} + +func nodeInRackHoldingShard(nodes map[string]*Node, rackID string, vk volKey, shardID int) *Node { + sid := erasure_coding.ShardId(shardID) + for _, id := range sortedNodeKeys(nodes) { + node := nodes[id] + if node.rack != rackID { + continue + } + if info, ok := node.shards[vk]; ok && info.shardBits.Has(sid) { + return node + } + } + return nil +} + +func countShardsByRack(vk volKey, nodes map[string]*Node) map[string]int { + m := make(map[string]int) + for _, node := range nodes { + if info, ok := node.shards[vk]; ok { + m[node.rack] += info.shardBits.Count() + } + } + return m +} + +func countShardsByNode(vk volKey, nodes map[string]*Node) map[string]int { + m 
:= make(map[string]int) + for id, node := range nodes { + if info, ok := node.shards[vk]; ok { + m[id] = info.shardBits.Count() + } + } + return m +} + +func sortedKeys[T any](m map[string]T) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +func sortedNodeKeys(nodes map[string]*Node) []string { + return sortedKeys(nodes) +} + +func sortedDiskKeys[T any](m map[uint32]T) []uint32 { + keys := make([]uint32, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] }) + return keys +} + +func sortedVolumeKeys(m map[volKey]*volumeShards) []volKey { + keys := make([]volKey, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { + if keys[i].vid != keys[j].vid { + return keys[i].vid < keys[j].vid + } + return keys[i].collection < keys[j].collection + }) + return keys +} + +func sumLens(m map[string][]int) int { + total := 0 + for _, v := range m { + total += len(v) + } + return total +} + +func removeInt(s []int, v int) []int { + for i, x := range s { + if x == v { + return append(s[:i], s[i+1:]...) + } + } + return s +} + +func ceilDivide(a, b int) int { + if b == 0 { + return 0 + } + return (a + b - 1) / b +} + +// typeImbalanced reports whether the shards of one type (data or parity), +// grouped by rack or node, are spread unevenly enough across numGroups to exceed +// the threshold. Gating per type (rather than on combined totals) ensures a +// data/parity skew is acted on even when the per-group totals are even. 
func typeImbalanced(perGroup map[string][]int, numGroups int, threshold float64) bool {
	// Reduce each group's shard list to its size, then defer to the generic
	// count-based check. An empty input is never imbalanced.
	sizes := make(map[string]int, len(perGroup))
	sum := 0
	for group, shardIDs := range perGroup {
		n := len(shardIDs)
		sizes[group] = n
		sum += n
	}
	return sum != 0 && exceedsImbalanceThreshold(sizes, sum, numGroups, threshold)
}

// exceedsImbalanceThreshold reports whether the spread between the largest and
// smallest group, relative to the mean over numGroups, is strictly greater than
// threshold. When counts lists fewer than numGroups groups, the missing ones
// count as zero and therefore set the minimum. A single group or an empty total
// is never imbalanced.
func exceedsImbalanceThreshold(counts map[string]int, total, numGroups int, threshold float64) bool {
	if total == 0 || numGroups <= 1 {
		return false
	}
	mean := float64(total) / float64(numGroups)
	if mean == 0 {
		return false
	}

	// One pass for both extremes; the starting values sit outside any real count.
	lowest, highest := total+1, -1
	for _, c := range counts {
		if c < lowest {
			lowest = c
		}
		if c > highest {
			highest = c
		}
	}
	if len(counts) < numGroups {
		lowest = 0 // at least one group holds nothing
	}
	return float64(highest-lowest)/mean > threshold
}

// exceedsUtilImbalanceThreshold compares fractional fullness (count/capacity) so
// heterogeneous-capacity nodes are evaluated fairly.
+func exceedsUtilImbalanceThreshold(counts, capacities map[string]int, threshold float64) bool { + minUtil := math.Inf(1) + maxUtil := -1.0 + seen := 0 + for nodeID, count := range counts { + capacity := capacities[nodeID] + if capacity <= 0 { + continue + } + util := float64(count) / float64(capacity) + if util < minUtil { + minUtil = util + } + if util > maxUtil { + maxUtil = util + } + seen++ + } + if seen < 2 || maxUtil <= 0 { + return false + } + avg := (maxUtil + minUtil) / 2 + if avg == 0 { + return false + } + return (maxUtil-minUtil)/avg > threshold +} diff --git a/weed/storage/erasure_coding/ecbalancer/balancer_test.go b/weed/storage/erasure_coding/ecbalancer/balancer_test.go new file mode 100644 index 000000000..2fac3a5ae --- /dev/null +++ b/weed/storage/erasure_coding/ecbalancer/balancer_test.go @@ -0,0 +1,431 @@ +package ecbalancer + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/super_block" +) + +func bits(ids ...int) erasure_coding.ShardBits { + var b erasure_coding.ShardBits + for _, id := range ids { + b = b.Set(erasure_coding.ShardId(id)) + } + return b +} + +// addEmptyNode adds an EC-empty destination node with six disks and capacity. 
func addEmptyNode(t *Topology, id, rackKey string) {
	// Node registered in "dc1" with 600 as its last AddNode argument
	// (presumably free slots — confirm against Topology.AddNode).
	n := t.AddNode(id, "dc1", rackKey, 600)
	// Disks 1..6, untyped, each added with (100, 0); going by how the tests use
	// AddDisk these look like free slots and current shard count — TODO confirm.
	for d := uint32(1); d <= 6; d++ {
		n.AddDisk(d, "", 100, 0)
	}
}

// ratio returns a Ratio callback that answers (d, p) for every collection.
func ratio(d, p int) func(string) (int, int) {
	return func(string) (int, int) { return d, p }
}

// TestPickBestDiskOnNode exercises pickBestDiskOnNode's disk filters (free
// slots, disk type) and its scoring preferences (spread a volume across disks,
// data/parity anti-affinity at the supplied ratio boundary).
func TestPickBestDiskOnNode(t *testing.T) {
	const vid = uint32(100)
	const ds = erasure_coding.DataShardsCount
	vk := volKey{collection: "c", vid: vid}

	t.Run("skips disks with no free slots", func(t *testing.T) {
		topo := NewTopology()
		n := topo.AddNode("n1", "dc1", "dc1:r1", 100)
		n.AddDisk(1, "", 0, 0)
		n.AddDisk(2, "", 10, 0)
		if got := pickBestDiskOnNode(n, vk, "", 0, ds); got != 2 {
			t.Errorf("got disk %d, want 2", got)
		}
	})

	t.Run("spreads a volume's shards across disks", func(t *testing.T) {
		topo := NewTopology()
		n := topo.AddNode("n1", "dc1", "dc1:r1", 100)
		n.AddDisk(1, "", 10, 1)
		n.AddDisk(2, "", 10, 0)
		n.AddShards(vid, "c", 1, bits(0))
		if got := pickBestDiskOnNode(n, vk, "", 5, ds); got != 2 {
			t.Errorf("got disk %d, want 2 (disk 1 already holds this volume)", got)
		}
	})

	t.Run("data shard avoids disk holding parity", func(t *testing.T) {
		topo := NewTopology()
		n := topo.AddNode("n1", "dc1", "dc1:r1", 100)
		n.AddDisk(1, "", 10, 1)
		n.AddDisk(2, "", 10, 0)
		n.AddShards(vid, "c", 1, bits(ds)) // parity on disk 1
		if got := pickBestDiskOnNode(n, vk, "", 0, ds); got != 2 {
			t.Errorf("got disk %d, want 2 (anti-affinity)", got)
		}
	})

	t.Run("anti-affinity follows the supplied ratio boundary", func(t *testing.T) {
		topo := NewTopology()
		n := topo.AddNode("n1", "dc1", "dc1:r1", 100)
		n.AddDisk(1, "", 10, 1)
		n.AddDisk(2, "", 10, 2)
		n.AddShards(vid, "c", 1, bits(7)) // parity at 6+3
		n.AddShards(vid, "c", 2, bits(2)) // data
		// With 6 data shards, shard 7 is parity, so data shard 1 avoids disk 1.
		if got := pickBestDiskOnNode(n, vk, "", 1, 6); got != 2 {
			t.Errorf("ratio 6: got disk %d, want 2", got)
		}
		// With the default boundary both held shards are data; no anti-affinity
		// penalty applies and the less loaded disk 1 wins.
		if got := pickBestDiskOnNode(n, vk, "", 1, erasure_coding.DataShardsCount); got != 1 {
			t.Errorf("boundary 10: got disk %d, want 1", got)
		}
	})

	t.Run("only matching disk type when set", func(t *testing.T) {
		topo := NewTopology()
		n := topo.AddNode("n1", "dc1", "dc1:r1", 100)
		n.AddDisk(1, "ssd", 10, 0)
		n.AddDisk(2, "hdd", 10, 0)
		if got := pickBestDiskOnNode(n, vk, "hdd", 0, ds); got != 2 {
			t.Errorf("got disk %d, want 2 (only hdd)", got)
		}
	})
}

// TestPlanSourceDiskAttribution spreads one volume's 14 shards over six disks
// of a single node and checks that every cross_rack move names, as SourceDisk,
// the disk the fixture placed that shard on.
func TestPlanSourceDiskAttribution(t *testing.T) {
	shardsByDisk := map[uint32][]int{0: {0, 1, 2}, 1: {3, 4, 5}, 2: {6, 7}, 3: {8, 9}, 4: {10, 11}, 5: {12, 13}}
	// Reverse index: shard id -> disk id, used as the expected SourceDisk.
	shardToDisk := map[int]uint32{}
	for d, ss := range shardsByDisk {
		for _, s := range ss {
			shardToDisk[s] = d
		}
	}
	topo := NewTopology()
	src := topo.AddNode("node1", "dc1", "dc1:rack1", 0)
	for d, ss := range shardsByDisk {
		src.AddDisk(d, "", 0, len(ss))
		src.AddShards(100, "col1", d, bits(ss...))
	}
	addEmptyNode(topo, "node2", "dc1:rack2")

	moves := Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(10, 4)})
	if len(moves) == 0 {
		t.Fatal("expected moves")
	}
	for _, m := range moves {
		if m.Phase != "cross_rack" {
			continue
		}
		if want := shardToDisk[m.ShardID]; m.SourceDisk != want {
			t.Errorf("shard %d: source disk %d, want %d", m.ShardID, m.SourceDisk, want)
		}
	}
}

// TestPlanSpreadsAcrossDestinationDisks puts all 14 shards on one node and
// expects 7 cross_rack moves onto the six-disk empty node, using all six disks.
func TestPlanSpreadsAcrossDestinationDisks(t *testing.T) {
	topo := NewTopology()
	src := topo.AddNode("node1", "dc1", "dc1:rack1", 0)
	src.AddDisk(0, "", 0, 14)
	src.AddShards(100, "col1", 0, bits(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13))
	addEmptyNode(topo, "node2", "dc1:rack2")

	moves := Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(10, 4)})
	distinct := map[uint32]bool{}
	crossRack := 0
	for _, m := range moves {
		if m.Phase == "cross_rack" {
			crossRack++
			distinct[m.TargetDisk] = true
		}
	}
	if crossRack != 7 {
		t.Fatalf("got %d cross-rack moves, want 7", crossRack)
	}
	if len(distinct) != 6 {
		t.Errorf("cross-rack moves used %d distinct disks, want 6: %v", len(distinct), distinct)
	}
}

// TestPlanCrossRackParityAntiAffinity uses a 1+2 ratio with all three shards on
// node1 and checks that only parity shards (id >= 1) are moved, and never onto
// node1.
func TestPlanCrossRackParityAntiAffinity(t *testing.T) {
	topo := NewTopology()
	src := topo.AddNode("node1", "dc1", "dc1:rack1", 0)
	src.AddDisk(0, "", 0, 3)
	src.AddShards(100, "col1", 0, bits(0, 1, 2)) // 1 data + 2 parity at ratio 1+2
	addEmptyNode(topo, "node2", "dc1:rack2")
	addEmptyNode(topo, "node3", "dc1:rack3")

	moves := Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(1, 2)})
	if len(moves) == 0 {
		t.Fatal("expected parity moves across racks")
	}
	for _, m := range moves {
		if m.ShardID < 1 {
			t.Errorf("data shard %d moved; it fits in rack1", m.ShardID)
		}
		if m.TargetNode == "node1" {
			t.Errorf("parity shard %d moved onto data-bearing node1", m.ShardID)
		}
	}
}

// TestWithinRackParityAntiAffinity is the single-rack counterpart of the
// cross-rack test above, calling detectWithinRackImbalance directly.
func TestWithinRackParityAntiAffinity(t *testing.T) {
	// Test the within-rack phase in isolation (the global phase, which balances
	// total load, would otherwise also act on this single rack).
	topo := NewTopology()
	src := topo.AddNode("node1", "dc1", "dc1:rack1", 600)
	src.AddDisk(0, "", 600, 3)
	src.AddShards(100, "col1", 0, bits(0, 1, 2))
	addEmptyNode(topo, "node2", "dc1:rack1")
	addEmptyNode(topo, "node3", "dc1:rack1")

	racks := buildRacks(topo.nodes)
	moves := detectWithinRackImbalance(volKey{collection: "col1", vid: 100}, topo.nodes, racks, "", 0.01, 1, 2, nil)
	if len(moves) == 0 {
		t.Fatal("expected parity moves within rack")
	}
	for _, m := range moves {
		if m.shardID < 1 {
			t.Errorf("data shard %d moved; it fits on node1", m.shardID)
		}
		if m.target.id == "node1" {
			t.Errorf("parity shard %d moved onto data-bearing node1", m.shardID)
		}
	}
}

// TestPlanReplicaPlacementCapsPerRack plans the same topology with and without
// a ReplicaPlacement of DiffRackCount=1 and expects the placement to reduce the
// cross_rack moves from 4 to 2.
func TestPlanReplicaPlacementCapsPerRack(t *testing.T) {
	// build returns a fresh topology per Plan call so the two runs do not share
	// state.
	build := func() *Topology {
		topo := NewTopology()
		src := topo.AddNode("node1", "dc1", "dc1:rack1", 0)
		src.AddDisk(0, "", 0, 6)
		src.AddShards(100, "col1", 0, bits(0, 1, 2, 3, 4, 5)) // all data at ratio 6+0
		addEmptyNode(topo, "node2", "dc1:rack2")
		addEmptyNode(topo, "node3", "dc1:rack3")
		return topo
	}

	countCross := func(moves []Move) int {
		n := 0
		for _, m := range moves {
			if m.Phase == "cross_rack" {
				n++
			}
		}
		return n
	}

	if got := countCross(Plan(build(), Options{ImbalanceThreshold: 0.01, Ratio: ratio(6, 0)})); got != 4 {
		t.Fatalf("without replica placement: %d cross-rack moves, want 4", got)
	}
	rp := &super_block.ReplicaPlacement{DiffRackCount: 1}
	if got := countCross(Plan(build(), Options{ImbalanceThreshold: 0.01, ReplicaPlacement: rp, Ratio: ratio(6, 0)})); got != 2 {
		t.Errorf("with DiffRackCount=1: %d cross-rack moves, want 2", got)
	}
}

// TestPlanDedup duplicates shard 0 on two nodes and expects exactly one dedup
// move, expressed as node1 -> node1 for shard 0.
func TestPlanDedup(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 5)
	n1.AddDisk(0, "", 5, 2)
	n1.AddShards(100, "col1", 0, bits(0, 1))
	n2 := topo.AddNode("node2", "dc1", "dc1:rack2", 10)
	n2.AddDisk(0, "", 10, 1)
	n2.AddShards(100, "col1", 0, bits(0)) // shard 0 duplicated on node2

	var dedup []Move
	for _, m := range Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(10, 4)}) {
		if m.Phase == "dedup" {
			dedup = append(dedup, m)
		}
	}
	if len(dedup) != 1 {
		t.Fatalf("got %d dedup moves, want 1", len(dedup))
	}
	if dedup[0].ShardID != 0 || dedup[0].SourceNode != "node1" || dedup[0].TargetNode != "node1" {
		t.Errorf("dedup move = %+v, want shard 0 deleted on node1 (fewer free slots)", dedup[0])
	}
}

// TestCeilDivide covers rounding up, exact zero numerator and the b == 0 guard.
func TestCeilDivide(t *testing.T) {
	for _, tc := range []struct{ a, b, want int }{{14, 3, 5}, {10, 3, 4}, {0, 5, 0}, {5, 0, 0}} {
		if got := ceilDivide(tc.a, tc.b); got != tc.want {
			t.Errorf("ceilDivide(%d,%d)=%d want %d", tc.a, tc.b, got, tc.want)
		}
	}
}

// allBits returns a ShardBits with shard ids 0..n-1 set.
func allBits(n int) erasure_coding.ShardBits {
	var b erasure_coding.ShardBits
	for i := 0; i < n; i++ {
		b = b.Set(erasure_coding.ShardId(i))
	}
	return b
}

// TestGlobalImbalanceMovesFromFullToEmpty checks that every move planned by the
// global phase goes from the heavily loaded node1 to the lightly loaded node2.
func TestGlobalImbalanceMovesFromFullToEmpty(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 5)
	n1.AddShards(100, "col1", 0, allBits(14))
	n1.AddShards(200, "col1", 0, allBits(6))
	n2 := topo.AddNode("node2", "dc1", "dc1:rack1", 30)
	n2.AddShards(300, "col1", 0, allBits(2))

	moves := detectGlobalImbalance(topo.nodes, buildRacks(topo.nodes), "", 0.01, nil, 0, true)
	if len(moves) == 0 {
		t.Fatal("expected global balance moves")
	}
	for _, m := range moves {
		if m.phase != "global" || m.source.id != "node1" || m.target.id != "node2" {
			t.Errorf("move = %+v, want global node1->node2", m)
		}
	}
}

// TestGlobalImbalanceHeterogeneousCapacity: node1 holds more shards but is less
// utilized (high capacity); moves must drain the more-utilized node2.
func TestGlobalImbalanceHeterogeneousCapacity(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 90)
	n1.AddShards(100, "col1", 0, allBits(10))
	n2 := topo.AddNode("node2", "dc1", "dc1:rack1", 2)
	n2.AddShards(200, "col1", 0, allBits(3))

	moves := detectGlobalImbalance(topo.nodes, buildRacks(topo.nodes), "", 0.01, nil, 0, true)
	if len(moves) == 0 {
		t.Fatal("expected moves from high-util node2 to low-util node1")
	}
	// Also guard against the same (volume, shard) being planned twice.
	seen := map[[2]int]bool{}
	for _, m := range moves {
		if m.source.id != "node2" || m.target.id != "node1" {
			t.Errorf("move = %+v, want node2->node1", m)
		}
		key := [2]int{int(m.volumeID), m.shardID}
		if seen[key] {
			t.Errorf("duplicate move for volume %d shard %d", m.volumeID, m.shardID)
		}
		seen[key] = true
	}
}

// TestGlobalImbalanceSkipsFullNodes checks that no move is planned when the
// only possible destination was added with 0 as its last AddNode argument.
func TestGlobalImbalanceSkipsFullNodes(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 10)
	n1.AddShards(100, "col1", 0, allBits(14))
	n2 := topo.AddNode("node2", "dc1", "dc1:rack1", 0) // full, cannot receive
	n2.AddShards(200, "col1", 0, allBits(2))

	if moves := detectGlobalImbalance(topo.nodes, buildRacks(topo.nodes), "", 0.01, nil, 0, true); len(moves) != 0 {
		t.Fatalf("expected 0 moves (node2 full), got %d", len(moves))
	}
}

// TestPlanBalancesSkewedDataParityWithEvenTotals guards the per-type gate: two
// racks hold equal shard totals (7 each) but the data shards are skewed (7 vs 3).
// A total-count gate would skip balancing; the per-type gate must still act.
func TestPlanBalancesSkewedDataParityWithEvenTotals(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 100)
	n1.AddDisk(0, "", 100, 7)
	n1.AddShards(100, "col1", 0, bits(0, 1, 2, 3, 4, 5, 6)) // 7 data shards
	n2 := topo.AddNode("node2", "dc1", "dc1:rack2", 100)
	n2.AddDisk(0, "", 100, 7)
	n2.AddShards(100, "col1", 0, bits(7, 8, 9, 10, 11, 12, 13)) // 3 data + 4 parity

	moves := Plan(topo, Options{ImbalanceThreshold: 0, Ratio: ratio(10, 4)})

	// Count cross_rack moves and how many of them carry a data shard (id < 10).
	crossRack, dataMoved := 0, 0
	for _, m := range moves {
		if m.Phase == "cross_rack" {
			crossRack++
			if m.ShardID < 10 {
				dataMoved++
			}
		}
	}
	if crossRack == 0 {
		t.Fatal("even totals masked a data/parity skew: no cross-rack moves produced")
	}
	if dataMoved == 0 {
		t.Error("expected skewed data shards to rebalance across racks")
	}
}

// TestGlobalPrefersVolumeAbsentFromDestination guards the global phase's
// volume-diversity preference: when draining a node, move a shard of a volume the
// destination does not hold at all before piling a second shard of an
// already-present volume onto it. node1 (full) holds vol100 and vol200; node2
// (empty) holds only vol100, so the first global move should be a vol200 shard.
func TestGlobalPrefersVolumeAbsentFromDestination(t *testing.T) {
	topo := NewTopology()
	// node1 holds two shards each of volumes 100 and 200.
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 0)
	n1.AddShards(100, "col1", 0, bits(0, 1))
	n1.AddShards(200, "col1", 0, bits(0, 1))
	// node2 holds a single shard of volume 100 only.
	n2 := topo.AddNode("node2", "dc1", "dc1:rack1", 3)
	n2.AddShards(100, "col1", 0, bits(2))

	moves := detectGlobalImbalance(topo.nodes, buildRacks(topo.nodes), "", 0.01, nil, 0, true)
	if len(moves) == 0 {
		t.Fatal("expected a global move from the full node")
	}
	// The first planned move must come from the volume node2 lacks entirely.
	if moves[0].volumeID != 200 {
		t.Errorf("first global move is volume %d, want 200 (the volume absent from node2)", moves[0].volumeID)
	}
	for _, m := range moves {
		if m.source.id != "node1" || m.target.id != "node2" {
			t.Errorf("move %+v, want node1->node2", m)
		}
	}
}

// TestPlanKeepsCollectionsWithSameVolumeIdDistinct guards EC identity: a numeric
// volume id reused across collections must not be merged. (A,5) on node1 and
// (B,5) on node2 are different volumes, so neither dedup nor any move should
// treat them as copies of one another.
func TestPlanKeepsCollectionsWithSameVolumeIdDistinct(t *testing.T) {
	topo := NewTopology()
	n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 100)
	n1.AddDisk(0, "", 100, 1)
	n1.AddShards(5, "A", 0, bits(0))
	n2 := topo.AddNode("node2", "dc1", "dc1:rack2", 100)
	n2.AddDisk(0, "", 100, 1)
	n2.AddShards(5, "B", 0, bits(0))

	// Only the dedup phase is asserted on here: any dedup move would mean the
	// two collections' shard 0 were treated as duplicates.
	for _, m := range Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(10, 4)}) {
		if m.Phase == "dedup" {
			t.Errorf("dedup %+v: (A,5) and (B,5) are different volumes and must not be deduped", m)
		}
	}
}

// TestDedupFreesCapacityForLaterPhases checks that capacity opened by deleting a
// duplicate is usable by a later phase in the same Plan. node2 is full (0 free)
// but holds a duplicate of node1's shard 0; node1 is roomier, so dedup deletes
// node2's copy, freeing a slot. The within-rack phase must then be able to move a
// shard onto node2.
+func TestDedupFreesCapacityForLaterPhases(t *testing.T) { + topo := NewTopology() + n1 := topo.AddNode("node1", "dc1", "dc1:rack1", 5) + n1.AddDisk(0, "", 5, 7) + n1.AddShards(100, "col1", 0, bits(0, 1, 2, 3, 4, 5, 6)) + n2 := topo.AddNode("node2", "dc1", "dc1:rack1", 0) // full + n2.AddDisk(0, "", 0, 1) + n2.AddShards(100, "col1", 0, bits(0)) // duplicate of node1's shard 0 + + moves := Plan(topo, Options{ImbalanceThreshold: 0.01, Ratio: ratio(10, 4)}) + + dedup := false + toNode2 := 0 + for _, m := range moves { + if m.Phase == "dedup" { + dedup = true + continue + } + if m.TargetNode == "node2" { + toNode2++ + } + } + if !dedup { + t.Fatal("expected a dedup move for the duplicated shard 0") + } + if toNode2 == 0 { + t.Error("slot freed by dedup on node2 was not usable by a later phase") + } +} diff --git a/weed/worker/tasks/ec_balance/config.go b/weed/worker/tasks/ec_balance/config.go index 3ddad226b..16515bae4 100644 --- a/weed/worker/tasks/ec_balance/config.go +++ b/weed/worker/tasks/ec_balance/config.go @@ -17,7 +17,8 @@ type Config struct { CollectionFilter string `json:"collection_filter"` DiskType string `json:"disk_type"` PreferredTags []string `json:"preferred_tags"` - DataCenterFilter string `json:"-"` // per-detection-run, not persisted + ReplicaPlacement string `json:"replica_placement"` // e.g. "020"; empty falls back to the master default replication (even spread only when that default is empty or zero) + DataCenterFilter string `json:"-"` // per-detection-run, not persisted } // NewDefaultConfig creates a new default EC balance configuration @@ -155,6 +156,19 @@ func GetConfigSpec() base.ConfigSpec { InputType: "text", CSSClasses: "form-control", }, + { + Name: "replica_placement", + JSONName: "replica_placement", + Type: config.FieldTypeString, + DefaultValue: "", + Required: false, + DisplayName: "Replica Placement", + Description: "EC shard replica placement constraint (e.g. 
020)", + HelpText: "Leave empty to use the master default replication (even spread only when that default is empty or zero). When set, limits shards per rack/node per the placement digits (dc/rack/node)", + Placeholder: "020", + InputType: "text", + CSSClasses: "form-control", + }, }, } } @@ -174,6 +188,7 @@ func (c *Config) ToTaskPolicy() *worker_pb.TaskPolicy { CollectionFilter: c.CollectionFilter, DiskType: c.DiskType, PreferredTags: preferredTagsCopy, + ReplicaPlacement: c.ReplicaPlacement, }, }, } @@ -195,6 +210,7 @@ func (c *Config) FromTaskPolicy(policy *worker_pb.TaskPolicy) error { c.CollectionFilter = ecbConfig.CollectionFilter c.DiskType = ecbConfig.DiskType c.PreferredTags = append([]string(nil), ecbConfig.PreferredTags...) + c.ReplicaPlacement = ecbConfig.ReplicaPlacement } return nil diff --git a/weed/worker/tasks/ec_balance/detection.go b/weed/worker/tasks/ec_balance/detection.go index 3d2a2da9e..d472b8936 100644 --- a/weed/worker/tasks/ec_balance/detection.go +++ b/weed/worker/tasks/ec_balance/detection.go @@ -3,56 +3,25 @@ package ec_balance import ( "context" "fmt" - "math" - "sort" "time" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/ecbalancer" + "github.com/seaweedfs/seaweedfs/weed/storage/super_block" + storagetypes "github.com/seaweedfs/seaweedfs/weed/storage/types" "github.com/seaweedfs/seaweedfs/weed/util/wildcard" "github.com/seaweedfs/seaweedfs/weed/worker/tasks/base" "github.com/seaweedfs/seaweedfs/weed/worker/types" ) -// ecNodeInfo represents a volume server with EC shard information for detection -type ecNodeInfo struct { - nodeID string - address string - dc string - rack string // dc:rack composite key - freeSlots int - // volumeID -> shardBits (bitmask of shard IDs present on this node) - ecShards 
map[uint32]*ecVolumeInfo -} - -type ecVolumeInfo struct { - collection string - shardBits uint32 // bitmask - diskID uint32 -} - -// ecRackInfo represents a rack with EC node information -type ecRackInfo struct { - nodes map[string]*ecNodeInfo - freeSlots int -} - -// shardMove represents a proposed EC shard move -type shardMove struct { - volumeID uint32 - shardID int - collection string - source *ecNodeInfo - sourceDisk uint32 - target *ecNodeInfo - targetDisk uint32 - phase string // "dedup", "cross_rack", "within_rack", "global" -} - -// Detection implements the multi-phase EC shard balance detection algorithm. -// It analyzes EC shard distribution and proposes moves to achieve even distribution. +// Detection builds an EC balance topology snapshot from the cluster's active +// topology, runs the shared ecbalancer planner, and converts the planned moves +// into worker task proposals. The balancing policy lives in +// weed/storage/erasure_coding/ecbalancer, shared with the shell ec.balance +// command so the two cannot drift. 
func Detection( ctx context.Context, metrics []*types.VolumeHealthMetrics, @@ -72,139 +41,114 @@ func Detection( if clusterInfo == nil || clusterInfo.ActiveTopology == nil { return nil, false, fmt.Errorf("active topology not available for EC balance detection") } - topoInfo := clusterInfo.ActiveTopology.GetTopologyInfo() if topoInfo == nil { return nil, false, fmt.Errorf("topology info not available") } - // Build EC topology view - nodes, racks := buildECTopology(topoInfo, ecConfig) - - if len(nodes) < ecConfig.MinServerCount { - glog.V(1).Infof("EC balance: only %d servers, need at least %d", len(nodes), ecConfig.MinServerCount) + topo, nodeCount := buildBalancerTopology(topoInfo, ecConfig) + if nodeCount < ecConfig.MinServerCount { + glog.V(1).Infof("EC balance: only %d servers, need at least %d", nodeCount, ecConfig.MinServerCount) return nil, false, nil } - // Collect all EC volumes grouped by collection - collections := collectECCollections(nodes, ecConfig) - if len(collections) == 0 { - glog.V(1).Infof("EC balance: no EC volumes found matching filters") + replicaPlacement := resolveReplicaPlacement(ecConfig, clusterInfo) + + if ctx != nil { + if err := ctx.Err(); err != nil { + return nil, false, err + } + } + + // Canonical disk type for placement/execution: "hdd" -> "" (HardDriveType), + // matching the topology's disk keys and the volume server's move RPCs. + normalizedDiskType := storagetypes.ToDiskType(ecConfig.DiskType).String() + + moves := ecbalancer.Plan(topo, ecbalancer.Options{ + DiskType: normalizedDiskType, + ImbalanceThreshold: ecConfig.ImbalanceThreshold, + ReplicaPlacement: replicaPlacement, + Ratio: func(collection string) (int, int) { + return resolveECRatio(clusterInfo, collection) + }, + // Move incrementally across detection cycles rather than draining a rack + // in one batch; the scheduler re-evaluates each cycle. + GlobalMaxMovesPerRack: 10, + // Balance heterogeneous-capacity racks by fractional fullness. 
+ GlobalUtilizationBased: true, + }) + if len(moves) == 0 { return nil, false, nil } - threshold := ecConfig.ImbalanceThreshold - var allMoves []*shardMove - - // Build set of allowed collections for global phase filtering - allowedVids := make(map[uint32]bool) - for _, volumeIDs := range collections { - for _, vid := range volumeIDs { - allowedVids[vid] = true - } - } - - for collection, volumeIDs := range collections { - if ctx != nil { - if err := ctx.Err(); err != nil { - return nil, false, err - } - } - - // Phase 1: Detect duplicate shards (always run, duplicates are errors not imbalance) - for _, vid := range volumeIDs { - moves := detectDuplicateShards(vid, collection, nodes, ecConfig.DiskType) - applyMovesToTopology(moves) - allMoves = append(allMoves, moves...) - } - - // Phase 2: Balance shards across racks (operates on updated topology from phase 1) - for _, vid := range volumeIDs { - moves := detectCrossRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold) - applyMovesToTopology(moves) - allMoves = append(allMoves, moves...) - } - - // Phase 3: Balance shards within racks (operates on updated topology from phases 1-2) - for _, vid := range volumeIDs { - moves := detectWithinRackImbalance(vid, collection, nodes, racks, ecConfig.DiskType, threshold) - applyMovesToTopology(moves) - allMoves = append(allMoves, moves...) - } - } - - // Phase 4: Global node balance across racks (only for volumes in allowed collections) - globalMoves := detectGlobalImbalance(nodes, racks, ecConfig, allowedVids) - allMoves = append(allMoves, globalMoves...) 
- - // Cap results hasMore := false - if maxResults > 0 && len(allMoves) > maxResults { - allMoves = allMoves[:maxResults] + if maxResults > 0 && len(moves) > maxResults { + moves = moves[:maxResults] hasMore = true } - // Convert moves to TaskDetectionResults now := time.Now() - results := make([]*types.TaskDetectionResult, 0, len(allMoves)) - for i, move := range allMoves { - // Include loop index and source/target in TaskID for uniqueness + results := make([]*types.TaskDetectionResult, 0, len(moves)) + for i, m := range moves { taskID := fmt.Sprintf("ec_balance_%d_%d_%s_%s_%d_%d", - move.volumeID, move.shardID, - move.source.nodeID, move.target.nodeID, - now.UnixNano(), i) - - result := &types.TaskDetectionResult{ + m.VolumeID, m.ShardID, m.SourceNode, m.TargetNode, now.UnixNano(), i) + results = append(results, &types.TaskDetectionResult{ TaskID: taskID, TaskType: types.TaskTypeECBalance, - VolumeID: move.volumeID, - Server: move.source.nodeID, - Collection: move.collection, - Priority: movePhasePriority(move.phase), - Reason: fmt.Sprintf("EC shard %d.%d %s: %s → %s (%s)", - move.volumeID, move.shardID, move.phase, - move.source.nodeID, move.target.nodeID, move.phase), + VolumeID: m.VolumeID, + Server: m.SourceNode, + Collection: m.Collection, + Priority: movePhasePriority(m.Phase), + Reason: fmt.Sprintf("EC shard %d.%d %s: %s → %s", + m.VolumeID, m.ShardID, m.Phase, m.SourceNode, m.TargetNode), ScheduleAt: now, TypedParams: &worker_pb.TaskParams{ TaskId: taskID, - VolumeId: move.volumeID, - Collection: move.collection, + VolumeId: m.VolumeID, + Collection: m.Collection, Sources: []*worker_pb.TaskSource{{ - Node: move.source.address, - DiskId: move.sourceDisk, - Rack: move.source.rack, - ShardIds: []uint32{uint32(move.shardID)}, + Node: m.SourceNode, + DiskId: m.SourceDisk, + Rack: m.SourceRack, + ShardIds: []uint32{uint32(m.ShardID)}, }}, Targets: []*worker_pb.TaskTarget{{ - Node: move.target.address, - DiskId: move.targetDisk, - Rack: move.target.rack, - 
ShardIds: []uint32{uint32(move.shardID)}, + Node: m.TargetNode, + DiskId: m.TargetDisk, + Rack: m.TargetRack, + ShardIds: []uint32{uint32(m.ShardID)}, }}, TaskParams: &worker_pb.TaskParams_EcBalanceParams{ EcBalanceParams: &worker_pb.EcBalanceTaskParams{ - DiskType: ecConfig.DiskType, + DiskType: normalizedDiskType, TimeoutSeconds: 600, }, }, }, - } - results = append(results, result) + }) } - glog.V(1).Infof("EC balance detection: %d moves proposed across %d collections", - len(results), len(collections)) - + glog.V(1).Infof("EC balance detection: %d moves proposed", len(results)) return results, hasMore, nil } -// buildECTopology constructs EC node and rack structures from topology info. -// Rack keys are dc:rack composites to avoid cross-DC name collisions. -// Only racks with eligible nodes (matching disk type, having EC shards or capacity) are included. -func buildECTopology(topoInfo *master_pb.TopologyInfo, config *Config) (map[string]*ecNodeInfo, map[string]*ecRackInfo) { - nodes := make(map[string]*ecNodeInfo) - racks := make(map[string]*ecRackInfo) +// buildBalancerTopology builds an ecbalancer.Topology from the master topology, +// applying the data-center, disk-type, and collection filters. Rack keys are +// dc:rack composites to avoid cross-DC name collisions. Per-disk free capacity +// is split evenly from the node total because the wire collapses same-type disks. +// Returns the topology and the number of eligible nodes (for MinServerCount). +func buildBalancerTopology(topoInfo *master_pb.TopologyInfo, config *Config) (*ecbalancer.Topology, int) { + topo := ecbalancer.NewTopology() + allowedCollections := wildcard.CompileWildcardMatchers(config.CollectionFilter) + // Normalize the disk-type filter: "hdd" (and the default "") map to the + // HardDriveType, which the topology reports under the empty-string key. 
Keep a + // separate "filter requested" flag so a configured "hdd" still filters to HDD + // disks instead of being mistaken for "all disk types". + filterByDiskType := config.DiskType != "" + wantDiskType := storagetypes.ToDiskType(config.DiskType).String() + + nodeCount := 0 for _, dc := range topoInfo.DataCenterInfos { if config.DataCenterFilter != "" { matchers := wildcard.CompileWildcardMatchers(config.DataCenterFilter) @@ -214,41 +158,38 @@ func buildECTopology(topoInfo *master_pb.TopologyInfo, config *Config) (map[stri } for _, rack := range dc.RackInfos { - // Use dc:rack composite key to avoid cross-DC name collisions rackKey := dc.Id + ":" + rack.Id for _, dn := range rack.DataNodeInfos { - node := &ecNodeInfo{ - nodeID: dn.Id, - address: dn.Id, - dc: dc.Id, - rack: rackKey, - ecShards: make(map[uint32]*ecVolumeInfo), - } - + freeSlots := 0 + diskTypeOf := make(map[uint32]string) // physical disk_id -> disk type + diskShardCount := make(map[uint32]int) hasMatchingDisk := false + for diskType, diskInfo := range dn.DiskInfos { - if config.DiskType != "" && diskType != config.DiskType { + if filterByDiskType && diskType != wantDiskType { continue } hasMatchingDisk = true - freeSlots := int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countEcShards(diskInfo.EcShardInfos) - if freeSlots > 0 { - node.freeSlots += freeSlots + fs := int(diskInfo.MaxVolumeCount-diskInfo.VolumeCount)*erasure_coding.DataShardsCount - countEcShards(diskInfo.EcShardInfos) + if fs > 0 { + freeSlots += fs } - - for _, ecShardInfo := range diskInfo.EcShardInfos { - vid := ecShardInfo.Id - existing, ok := node.ecShards[vid] - if !ok { - existing = &ecVolumeInfo{ - collection: ecShardInfo.Collection, - diskID: ecShardInfo.DiskId, - } - node.ecShards[vid] = existing + // Discover physical disks from regular volumes too, so an + // EC-empty disk is still a candidate destination. 
+ for _, vi := range diskInfo.VolumeInfos { + if _, ok := diskTypeOf[vi.DiskId]; !ok { + diskTypeOf[vi.DiskId] = diskType } - existing.shardBits |= ecShardInfo.EcIndexBits + } + for _, eci := range diskInfo.EcShardInfos { + if _, ok := diskTypeOf[eci.DiskId]; !ok { + diskTypeOf[eci.DiskId] = diskType + } + // Disk occupancy counts ALL volumes' shards (capacity model), + // independent of the collection filter below. + diskShardCount[eci.DiskId] += erasure_coding.GetShardCount(eci) } } @@ -256,583 +197,79 @@ func buildECTopology(topoInfo *master_pb.TopologyInfo, config *Config) (map[stri continue } - nodes[dn.Id] = node + node := topo.AddNode(dn.Id, dc.Id, rackKey, freeSlots) - // Only create rack entry when we have an eligible node - if _, ok := racks[rackKey]; !ok { - racks[rackKey] = &ecRackInfo{nodes: make(map[string]*ecNodeInfo)} + perDiskFree := 0 + if diskCount := len(diskTypeOf); diskCount > 0 && freeSlots > 0 { + perDiskFree = freeSlots / diskCount } - racks[rackKey].nodes[dn.Id] = node - racks[rackKey].freeSlots += node.freeSlots - } - } - } - - return nodes, racks -} - -// collectECCollections groups EC volume IDs by collection, applying filters -func collectECCollections(nodes map[string]*ecNodeInfo, config *Config) map[string][]uint32 { - allowedCollections := wildcard.CompileWildcardMatchers(config.CollectionFilter) - - // Collect unique volume IDs per collection - collectionVids := make(map[string]map[uint32]bool) - for _, node := range nodes { - for vid, info := range node.ecShards { - if len(allowedCollections) > 0 && !wildcard.MatchesAnyWildcard(allowedCollections, info.collection) { - continue - } - if _, ok := collectionVids[info.collection]; !ok { - collectionVids[info.collection] = make(map[uint32]bool) - } - collectionVids[info.collection][vid] = true - } - } - - // Convert to sorted slices - result := make(map[string][]uint32, len(collectionVids)) - for collection, vids := range collectionVids { - vidSlice := make([]uint32, 0, len(vids)) - 
for vid := range vids { - vidSlice = append(vidSlice, vid) - } - sort.Slice(vidSlice, func(i, j int) bool { return vidSlice[i] < vidSlice[j] }) - result[collection] = vidSlice - } - - return result -} - -// detectDuplicateShards finds shards that exist on multiple nodes. -// Duplicates are always returned regardless of threshold since they are data errors. -func detectDuplicateShards(vid uint32, collection string, nodes map[string]*ecNodeInfo, diskType string) []*shardMove { - // Build shard -> list of nodes mapping - shardLocations := make(map[int][]*ecNodeInfo) - for _, node := range nodes { - info, ok := node.ecShards[vid] - if !ok { - continue - } - for shardID := 0; shardID < erasure_coding.MaxShardCount; shardID++ { - if info.shardBits&(1< 0 { - rackShardCount[node.rack] += count - rackShardNodes[node.rack] = append(rackShardNodes[node.rack], node) - totalShards += count - } - } - - if totalShards == 0 { - return nil - } - - // Check if imbalance exceeds threshold - if !exceedsImbalanceThreshold(rackShardCount, totalShards, numRacks, threshold) { - return nil - } - - maxPerRack := ceilDivide(totalShards, numRacks) - - var moves []*shardMove - - // Find over-loaded racks and move excess shards to under-loaded racks - for rackID, count := range rackShardCount { - if count <= maxPerRack { - continue - } - excess := count - maxPerRack - movedFromRack := 0 - - // Find shards to move from this rack - for _, node := range rackShardNodes[rackID] { - if movedFromRack >= excess { - break - } - info := node.ecShards[vid] - for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ { - if movedFromRack >= excess { - break - } - if info.shardBits&(1<= excess { - break - } - if info.shardBits&(1< 0 && !allowedVids[vid] { - continue - } - count += shardBitCount(info.shardBits) - } - nodeShardCounts[nodeID] = count - totalShards += count - } - - if totalShards == 0 { - continue - } - - // Snapshot each node's total shard capacity (current shards from allowed - // 
volumes plus any remaining free slots). Capacity is fixed for the - // duration of this loop — moves conserve total shards across the rack, - // so the denominator does not change as nodeShardCounts shift. - nodeCapacity := make(map[string]int, len(rack.nodes)) - for nodeID, count := range nodeShardCounts { - nodeCapacity[nodeID] = count + rack.nodes[nodeID].freeSlots - } - - // Check if imbalance exceeds threshold using utilization ratios - // (count/capacity), not raw shard counts. Raw counts would say a - // cluster is imbalanced whenever a large-capacity node holds more - // shards than a small-capacity node, even when both are at the - // same fractional fullness. - if !exceedsUtilImbalanceThreshold(nodeShardCounts, nodeCapacity, config.ImbalanceThreshold) { - continue - } - - // Iteratively move shards from most-utilized to least-utilized - for i := 0; i < 10; i++ { // cap iterations to avoid infinite loops - // Find min and max nodes by utilization ratio. Min must have free - // slots so it can receive a shard; max can be any node with shards - // (we move shards out of it). Utilization-based selection is - // critical on heterogeneous racks: a large-capacity node with many - // shards in absolute terms may still be the LEAST utilized, and - // moving shards into it from a small, nearly-full node is the - // correct direction even though raw counts would suggest otherwise. 
- var minNode, maxNode *ecNodeInfo - minUtil := math.Inf(1) - maxUtil := -1.0 - var minCount, maxCount int - for nodeID, count := range nodeShardCounts { - node := rack.nodes[nodeID] - cap := nodeCapacity[nodeID] - if cap <= 0 { - continue - } - util := float64(count) / float64(cap) - if util < minUtil && node.freeSlots > 0 { - minUtil = util - minCount = count - minNode = node - } - if util > maxUtil { - maxUtil = util - maxCount = count - maxNode = rack.nodes[nodeID] - } - } - - if maxNode == nil || minNode == nil || maxNode.nodeID == minNode.nodeID { - break - } - - // Per-move convergence guard: reject any move where the - // destination's post-move utilization would strictly exceed the - // source's post-move utilization. This mirrors the guard in - // weed/worker/tasks/balance/detection.go and terminates the loop - // once no further beneficial move exists, preventing oscillation - // and overshoot on heterogeneous racks. - maxCap := nodeCapacity[maxNode.nodeID] - minCap := nodeCapacity[minNode.nodeID] - if maxCap <= 0 || minCap <= 0 { - break - } - newSrcUtil := float64(maxCount-1) / float64(maxCap) - newDstUtil := float64(minCount+1) / float64(minCap) - if newDstUtil > newSrcUtil { - break - } - - // Pick a shard from maxNode that doesn't already exist on minNode - moved := false - for vid, info := range maxNode.ecShards { - if moved { - break - } - if len(allowedVids) > 0 && !allowedVids[vid] { - continue - } - // Check minNode doesn't have this volume's shards already (avoid same-volume overlap) - minInfo := minNode.ecShards[vid] - for shardID := 0; shardID < erasure_coding.TotalShardsCount; shardID++ { - if info.shardBits&(1< 0 && !wildcard.MatchesAnyWildcard(allowedCollections, eci.Collection) { + continue } - minNode.ecShards[vid] = minInfo + node.AddShards(eci.Id, eci.Collection, eci.DiskId, erasure_coding.ShardBits(eci.EcIndexBits)) } - minInfo.shardBits |= shardBit - nodeShardCounts[maxNode.nodeID]-- - nodeShardCounts[minNode.nodeID]++ - 
maxNode.freeSlots++ - minNode.freeSlots-- - moved = true - break } - } - if !moved { - break + + nodeCount++ } } } - return moves + return topo, nodeCount } -// findDestNodeInUnderloadedRack finds a node in a rack that has fewer than maxPerRack shards -func findDestNodeInUnderloadedRack(vid uint32, racks map[string]*ecRackInfo, rackShardCount map[string]int, maxPerRack int, excludeRack string, nodes map[string]*ecNodeInfo) *ecNodeInfo { - var bestNode *ecNodeInfo - bestFreeSlots := -1 - - for rackID, rack := range racks { - if rackID == excludeRack { - continue - } - if rackShardCount[rackID] >= maxPerRack { - continue - } - if rack.freeSlots <= 0 { - continue - } - for _, node := range rack.nodes { - if node.freeSlots <= 0 { - continue - } - if node.freeSlots > bestFreeSlots { - bestFreeSlots = node.freeSlots - bestNode = node - } - } - } - - return bestNode +// resolveECRatio returns the (dataShards, parityShards) for a collection from the +// admin EC config snapshot when present, else the local default. This keeps the +// enterprise-only custom-ratio plumbing out of the shared planner. +func resolveECRatio(_ *types.ClusterInfo, _ string) (int, int) { + // Custom EC ratios are an enterprise feature; OSS uses the standard scheme. 
+ return normalizeECShardCounts(0, 0) } -// findLeastLoadedNodeInRack finds the node with fewest shards in a rack -func findLeastLoadedNodeInRack(vid uint32, rack *ecRackInfo, excludeNode string, nodeShardCount map[string]int, maxPerNode int) *ecNodeInfo { - var bestNode *ecNodeInfo - bestCount := maxPerNode + 1 - - for nodeID, node := range rack.nodes { - if nodeID == excludeNode { - continue - } - if node.freeSlots <= 0 { - continue - } - count := nodeShardCount[nodeID] - if count >= maxPerNode { - continue - } - if count < bestCount { - bestCount = count - bestNode = node - } +// resolveReplicaPlacement picks the EC shard replica placement constraint: an +// explicit config value wins; otherwise it falls back to the master's default +// replication (matching the shell ec.balance default). A missing, invalid, or +// zero-replication value yields nil, meaning even spread / no constraint. +func resolveReplicaPlacement(ecConfig *Config, clusterInfo *types.ClusterInfo) *super_block.ReplicaPlacement { + spec := ecConfig.ReplicaPlacement + if spec == "" && clusterInfo != nil { + spec = clusterInfo.DefaultReplicaPlacement } - - return bestNode + if spec == "" { + return nil + } + rp, err := super_block.NewReplicaPlacementFromString(spec) + if err != nil { + glog.Warningf("EC balance: ignoring invalid replica placement %q: %v", spec, err) + return nil + } + if !rp.HasReplication() { + return nil + } + return rp } -// exceedsImbalanceThreshold checks if the distribution of counts exceeds the threshold. -// numGroups is the total number of groups (including those with 0 shards that aren't in the map). 
-// imbalanceRatio = (maxCount - minCount) / avgCount -func exceedsImbalanceThreshold(counts map[string]int, total int, numGroups int, threshold float64) bool { - if numGroups <= 1 || total == 0 { - return false +func normalizeECShardCounts(dataShards, parityShards int) (int, int) { + if dataShards <= 0 { + dataShards = erasure_coding.DataShardsCount } - - minCount := 0 // groups not in map have 0 shards - if len(counts) >= numGroups { - // All groups have entries; find actual min - minCount = total + 1 - for _, count := range counts { - if count < minCount { - minCount = count - } - } + if parityShards <= 0 { + parityShards = erasure_coding.ParityShardsCount } - - maxCount := -1 - for _, count := range counts { - if count > maxCount { - maxCount = count - } - } - - avg := float64(total) / float64(numGroups) - if avg == 0 { - return false - } - - imbalanceRatio := float64(maxCount-minCount) / avg - return imbalanceRatio > threshold + return dataShards, parityShards } -// exceedsUtilImbalanceThreshold checks whether the per-node utilization ratio -// (shard count / shard slot capacity) is skewed beyond the given threshold. -// Unlike exceedsImbalanceThreshold, it compares fractional fullness rather -// than raw counts so that racks with heterogeneous MaxVolumeCount are -// evaluated correctly — a large-capacity node holding more shards than a -// small-capacity node is not considered imbalanced if both are at the same -// fractional fullness. Nodes with zero capacity are skipped. 
-func exceedsUtilImbalanceThreshold(counts map[string]int, capacities map[string]int, threshold float64) bool { - minUtil := math.Inf(1) - maxUtil := -1.0 - seen := 0 - for nodeID, count := range counts { - cap := capacities[nodeID] - if cap <= 0 { - continue - } - util := float64(count) / float64(cap) - if util < minUtil { - minUtil = util - } - if util > maxUtil { - maxUtil = util - } - seen++ - } - if seen < 2 || maxUtil <= 0 { - return false - } - avg := (maxUtil + minUtil) / 2 - if avg == 0 { - return false - } - return (maxUtil-minUtil)/avg > threshold -} - -// applyMovesToTopology simulates planned moves on the in-memory topology -// so subsequent detection phases see updated shard placement. -func applyMovesToTopology(moves []*shardMove) { - for _, move := range moves { - shardBit := uint32(1 << uint(move.shardID)) - - // Remove shard from source - if srcInfo, ok := move.source.ecShards[move.volumeID]; ok { - srcInfo.shardBits &^= shardBit - } - - // For non-dedup moves, add shard to target - if move.source.nodeID != move.target.nodeID { - dstInfo, ok := move.target.ecShards[move.volumeID] - if !ok { - dstInfo = &ecVolumeInfo{ - collection: move.collection, - diskID: move.targetDisk, - } - move.target.ecShards[move.volumeID] = dstInfo - } - dstInfo.shardBits |= shardBit - } - } -} - -// Helper functions - func countEcShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) int { count := 0 for _, eci := range ecShardInfos { @@ -841,29 +278,6 @@ func countEcShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) in return count } -func shardBitCount(bits uint32) int { - count := 0 - for bits != 0 { - count += int(bits & 1) - bits >>= 1 - } - return count -} - -func ecShardDiskID(node *ecNodeInfo, vid uint32) uint32 { - if info, ok := node.ecShards[vid]; ok { - return info.diskID - } - return 0 -} - -func ceilDivide(a, b int) int { - if b == 0 { - return 0 - } - return (a + b - 1) / b -} - func movePhasePriority(phase string) 
types.TaskPriority { switch phase { case "dedup": diff --git a/weed/worker/tasks/ec_balance/detection_test.go b/weed/worker/tasks/ec_balance/detection_test.go index 86b3d8b34..66d473952 100644 --- a/weed/worker/tasks/ec_balance/detection_test.go +++ b/weed/worker/tasks/ec_balance/detection_test.go @@ -6,519 +6,59 @@ import ( "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding/ecbalancer" "github.com/seaweedfs/seaweedfs/weed/worker/types" ) -func TestShardBitCount(t *testing.T) { - tests := []struct { - bits uint32 - expected int - }{ - {0, 0}, - {1, 1}, - {0b111, 3}, - {0x3FFF, 14}, // all 14 shards - {0b10101010, 4}, +// The EC balance policy itself is tested in the shared ecbalancer package; these +// tests cover the worker adapter: building the planner topology from the master +// topology (filters, capacity) and the Detection entry point. + +func ecTopo(node1Collection string) *master_pb.TopologyInfo { + node1 := &master_pb.DataNodeInfo{ + Id: "node1", + DiskInfos: map[string]*master_pb.DiskInfo{ + "": {Type: "", MaxVolumeCount: 100, EcShardInfos: []*master_pb.VolumeEcShardInformationMessage{ + {Id: 100, Collection: node1Collection, DiskId: 0, EcIndexBits: 0x3FFF}, // 14 shards + }}, + }, } - for _, tt := range tests { - got := shardBitCount(tt.bits) - if got != tt.expected { - t.Errorf("shardBitCount(%b) = %d, want %d", tt.bits, got, tt.expected) - } + node2 := &master_pb.DataNodeInfo{ + Id: "node2", + DiskInfos: map[string]*master_pb.DiskInfo{"": {Type: "", MaxVolumeCount: 100}}, + } + return &master_pb.TopologyInfo{ + DataCenterInfos: []*master_pb.DataCenterInfo{{ + Id: "dc1", + RackInfos: []*master_pb.RackInfo{ + {Id: "rack1", DataNodeInfos: []*master_pb.DataNodeInfo{node1}}, + {Id: "rack2", DataNodeInfos: []*master_pb.DataNodeInfo{node2}}, + }, + }}, } } -func TestCeilDivide(t *testing.T) { - tests := []struct { - a, b int - expected 
int - }{ - {14, 3, 5}, - {14, 7, 2}, - {10, 3, 4}, - {0, 5, 0}, - {5, 0, 0}, - } - for _, tt := range tests { - got := ceilDivide(tt.a, tt.b) - if got != tt.expected { - t.Errorf("ceilDivide(%d, %d) = %d, want %d", tt.a, tt.b, got, tt.expected) - } - } -} - -func TestDetectDuplicateShards(t *testing.T) { - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 5, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0b11}, // shard 0, 1 - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack2", freeSlots: 10, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0b01}, // shard 0 (duplicate) - }, - }, - } - - moves := detectDuplicateShards(100, "col1", nodes, "") - - if len(moves) != 1 { - t.Fatalf("expected 1 dedup move, got %d", len(moves)) - } - - move := moves[0] - if move.phase != "dedup" { - t.Errorf("expected phase 'dedup', got %q", move.phase) - } - if move.shardID != 0 { - t.Errorf("expected shard 0 to be deduplicated, got %d", move.shardID) - } - // node1 has fewer free slots, so the duplicate on node1 should be removed (keeper is node2) - if move.source.nodeID != "node1" { - t.Errorf("expected source node1 (fewer free slots), got %s", move.source.nodeID) - } - // Dedup moves set target=source so isDedupPhase recognizes unmount+delete only - if move.target.nodeID != "node1" { - t.Errorf("expected target node1 (same as source for dedup), got %s", move.target.nodeID) - } -} - -func TestDetectCrossRackImbalance(t *testing.T) { - // 14 shards all on rack1, 2 racks available — large imbalance - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 0, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0x3FFF}, // all 14 shards - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack2", freeSlots: 20, - ecShards: 
map[uint32]*ecVolumeInfo{}, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"]}, - freeSlots: 0, - }, - "dc1:rack2": { - nodes: map[string]*ecNodeInfo{"node2": nodes["node2"]}, - freeSlots: 20, - }, - } - - // Use very low threshold so this triggers - moves := detectCrossRackImbalance(100, "col1", nodes, racks, "", 0.01) - - // With 14 shards across 2 racks, max per rack = 7 - // rack1 has 14 -> excess = 7, should move 7 to rack2 - if len(moves) != 7 { - t.Fatalf("expected 7 cross-rack moves, got %d", len(moves)) - } - for _, move := range moves { - if move.phase != "cross_rack" { - t.Errorf("expected phase 'cross_rack', got %q", move.phase) - } - if move.source.rack != "dc1:rack1" { - t.Errorf("expected source dc1:rack1, got %s", move.source.rack) - } - if move.target.rack != "dc1:rack2" { - t.Errorf("expected target dc1:rack2, got %s", move.target.rack) - } - } -} - -func TestDetectCrossRackImbalanceBelowThreshold(t *testing.T) { - // Slight imbalance: rack1 has 8, rack2 has 6 — imbalance = 2/7 ≈ 0.29 - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 10, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0xFF}, // 8 shards - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack2", freeSlots: 10, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0x3F00}, // 6 shards - }, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"]}, - freeSlots: 10, - }, - "dc1:rack2": { - nodes: map[string]*ecNodeInfo{"node2": nodes["node2"]}, - freeSlots: 10, - }, - } - - // High threshold should skip this - moves := detectCrossRackImbalance(100, "col1", nodes, racks, "", 0.5) - if len(moves) != 0 { - t.Fatalf("expected 0 moves below threshold, got %d", len(moves)) - } -} - -func TestDetectWithinRackImbalance(t 
*testing.T) { - // rack1 has 2 nodes: node1 has 10 shards, node2 has 0 shards - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 5, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0b1111111111}, // shards 0-9 - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack1", freeSlots: 20, - ecShards: map[uint32]*ecVolumeInfo{}, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"], "node2": nodes["node2"]}, - freeSlots: 25, - }, - } - - moves := detectWithinRackImbalance(100, "col1", nodes, racks, "", 0.01) - - // 10 shards on 2 nodes, max per node = 5 - // node1 has 10 -> excess = 5, should move 5 to node2 - if len(moves) != 5 { - t.Fatalf("expected 5 within-rack moves, got %d", len(moves)) - } - for _, move := range moves { - if move.phase != "within_rack" { - t.Errorf("expected phase 'within_rack', got %q", move.phase) - } - if move.source.nodeID != "node1" { - t.Errorf("expected source node1, got %s", move.source.nodeID) - } - if move.target.nodeID != "node2" { - t.Errorf("expected target node2, got %s", move.target.nodeID) - } - } -} - -func TestDetectGlobalImbalance(t *testing.T) { - // node1 has 20 total shards, node2 has 2 total shards (same rack) - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 5, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0x3FFF}, // 14 shards - 200: {collection: "col1", shardBits: 0b111111}, // 6 shards - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack1", freeSlots: 30, - ecShards: map[uint32]*ecVolumeInfo{ - 300: {collection: "col1", shardBits: 0b11}, // 2 shards - }, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"], "node2": nodes["node2"]}, - freeSlots: 35, - 
}, - } - +func TestBuildBalancerTopology(t *testing.T) { config := NewDefaultConfig() - config.ImbalanceThreshold = 0.01 // low threshold to ensure moves happen - moves := detectGlobalImbalance(nodes, racks, config, nil) - - // Total = 22 shards, avg = 11. node1 has 20, node2 has 2. - // Should move shards until balanced (max 10 iterations) + topo, nodeCount := buildBalancerTopology(ecTopo("col1"), config) + if nodeCount != 2 { + t.Fatalf("nodeCount = %d, want 2", nodeCount) + } + moves := ecbalancer.Plan(topo, ecbalancer.Options{ImbalanceThreshold: 0.01}) if len(moves) == 0 { - t.Fatal("expected global balance moves, got 0") - } - for _, move := range moves { - if move.phase != "global" { - t.Errorf("expected phase 'global', got %q", move.phase) - } - if move.source.nodeID != "node1" { - t.Errorf("expected moves from node1, got %s", move.source.nodeID) - } - if move.target.nodeID != "node2" { - t.Errorf("expected moves to node2, got %s", move.target.nodeID) - } + t.Error("expected cross-rack moves for an all-on-one-rack volume") } } -// TestDetectGlobalImbalance_HeterogeneousCapacity is a regression test for -// the Phase 4 rebalancer on heterogeneous racks. node1 holds more shards in -// absolute terms but has much higher capacity, so it is actually the LESS -// utilized node; node2 holds fewer shards but is nearly full. The greedy -// algorithm must pick the most-utilized node as the source and move shards -// in the direction that reduces fractional fullness, NOT in the direction -// that would equalize raw counts (which here would overfill node2). -// -// Scenario: -// -// node1: 10 shards, freeSlots=90 → capacity 100, util 10% -// node2: 3 shards, freeSlots=2 → capacity 5, util 60% -// -// Correct behavior: move shards FROM node2 TO node1 (draining the -// most-utilized node), until no further improvement is possible. 
Also -// verifies that moves are de-duplicated — the inner loop must update -// shardBits between iterations so each proposed move refers to a distinct -// physical shard. -func TestDetectGlobalImbalance_HeterogeneousCapacity(t *testing.T) { - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 90, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0x3FF}, // 10 shards - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack1", freeSlots: 2, - ecShards: map[uint32]*ecVolumeInfo{ - 200: {collection: "col1", shardBits: 0b111}, // 3 shards - }, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"], "node2": nodes["node2"]}, - freeSlots: 92, - }, - } - +func TestBuildBalancerTopologyCollectionFilter(t *testing.T) { config := NewDefaultConfig() - config.ImbalanceThreshold = 0.01 - moves := detectGlobalImbalance(nodes, racks, config, nil) - - if len(moves) == 0 { - t.Fatal("expected moves from high-util node2 to low-util node1, got 0") + config.CollectionFilter = "other" // does not match the volume's collection + topo, nodeCount := buildBalancerTopology(ecTopo("col1"), config) + if nodeCount != 2 { + t.Fatalf("nodeCount = %d, want 2", nodeCount) } - - // Every move must drain the higher-util node (node2) and target the - // lower-util node (node1). A raw-count-based greedy algorithm would - // pick the opposite direction — that is the bug this test guards. - for _, move := range moves { - if move.source.nodeID != "node2" { - t.Errorf("expected source node2 (util 0.60), got %s", move.source.nodeID) - } - if move.target.nodeID != "node1" { - t.Errorf("expected target node1 (util 0.10), got %s", move.target.nodeID) - } - } - - // Verify no duplicate (volumeID, shardID) pairs — the inner loop must - // update shardBits between iterations so each move refers to a distinct - // physical shard. 
- seen := make(map[[2]int]bool, len(moves)) - for _, move := range moves { - key := [2]int{int(move.volumeID), move.shardID} - if seen[key] { - t.Errorf("duplicate move for volume %d shard %d", move.volumeID, move.shardID) - } - seen[key] = true - } -} - -func TestDetectGlobalImbalanceSkipsFullNodes(t *testing.T) { - // node2 has 0 free slots — should not be chosen as destination - nodes := map[string]*ecNodeInfo{ - "node1": { - nodeID: "node1", address: "node1:8080", rack: "dc1:rack1", freeSlots: 10, - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1", shardBits: 0x3FFF}, // 14 shards - }, - }, - "node2": { - nodeID: "node2", address: "node2:8080", rack: "dc1:rack1", freeSlots: 0, - ecShards: map[uint32]*ecVolumeInfo{ - 200: {collection: "col1", shardBits: 0b11}, // 2 shards - }, - }, - } - racks := map[string]*ecRackInfo{ - "dc1:rack1": { - nodes: map[string]*ecNodeInfo{"node1": nodes["node1"], "node2": nodes["node2"]}, - freeSlots: 10, - }, - } - - config := NewDefaultConfig() - config.ImbalanceThreshold = 0.01 - moves := detectGlobalImbalance(nodes, racks, config, nil) - - // node2 has no free slots so no moves should be proposed - if len(moves) != 0 { - t.Fatalf("expected 0 moves (node2 full), got %d", len(moves)) - } -} - -func TestBuildECTopology(t *testing.T) { - topoInfo := &master_pb.TopologyInfo{ - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{ - { - Id: "rack1", - DataNodeInfos: []*master_pb.DataNodeInfo{ - { - Id: "server1:8080", - DiskInfos: map[string]*master_pb.DiskInfo{ - "": { - MaxVolumeCount: 100, - VolumeCount: 50, - EcShardInfos: []*master_pb.VolumeEcShardInformationMessage{ - { - Id: 1, - Collection: "test", - EcIndexBits: 0x3FFF, // all 14 shards - }, - }, - }, - }, - }, - }, - }, - }, - }, - }, - } - - config := NewDefaultConfig() - nodes, racks := buildECTopology(topoInfo, config) - - if len(nodes) != 1 { - t.Fatalf("expected 1 node, got %d", len(nodes)) - } - if len(racks) 
!= 1 { - t.Fatalf("expected 1 rack, got %d", len(racks)) - } - - node := nodes["server1:8080"] - if node == nil { - t.Fatal("expected node server1:8080") - } - if node.dc != "dc1" { - t.Errorf("expected dc=dc1, got %s", node.dc) - } - // Rack key should be dc:rack composite - if node.rack != "dc1:rack1" { - t.Errorf("expected rack=dc1:rack1, got %s", node.rack) - } - - ecInfo, ok := node.ecShards[1] - if !ok { - t.Fatal("expected EC shard info for volume 1") - } - if ecInfo.collection != "test" { - t.Errorf("expected collection=test, got %s", ecInfo.collection) - } - if shardBitCount(ecInfo.shardBits) != 14 { - t.Errorf("expected 14 shards, got %d", shardBitCount(ecInfo.shardBits)) - } -} - -func TestBuildECTopologyCrossDCRackNames(t *testing.T) { - // Two DCs with identically-named racks should produce distinct rack keys - topoInfo := &master_pb.TopologyInfo{ - DataCenterInfos: []*master_pb.DataCenterInfo{ - { - Id: "dc1", - RackInfos: []*master_pb.RackInfo{{ - Id: "rack1", - DataNodeInfos: []*master_pb.DataNodeInfo{{ - Id: "node-dc1:8080", - DiskInfos: map[string]*master_pb.DiskInfo{ - "": {MaxVolumeCount: 10, VolumeCount: 0}, - }, - }}, - }}, - }, - { - Id: "dc2", - RackInfos: []*master_pb.RackInfo{{ - Id: "rack1", - DataNodeInfos: []*master_pb.DataNodeInfo{{ - Id: "node-dc2:8080", - DiskInfos: map[string]*master_pb.DiskInfo{ - "": {MaxVolumeCount: 10, VolumeCount: 0}, - }, - }}, - }}, - }, - }, - } - - config := NewDefaultConfig() - _, racks := buildECTopology(topoInfo, config) - - if len(racks) != 2 { - t.Fatalf("expected 2 distinct racks, got %d", len(racks)) - } - if _, ok := racks["dc1:rack1"]; !ok { - t.Error("expected dc1:rack1 rack key") - } - if _, ok := racks["dc2:rack1"]; !ok { - t.Error("expected dc2:rack1 rack key") - } -} - -func TestCollectECCollections(t *testing.T) { - nodes := map[string]*ecNodeInfo{ - "node1": { - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1"}, - 200: {collection: "col2"}, - }, - }, - "node2": { - ecShards: 
map[uint32]*ecVolumeInfo{ - 100: {collection: "col1"}, - 300: {collection: "col2"}, - }, - }, - } - - config := NewDefaultConfig() - collections := collectECCollections(nodes, config) - - if len(collections) != 2 { - t.Fatalf("expected 2 collections, got %d", len(collections)) - } - if len(collections["col1"]) != 1 { - t.Errorf("expected 1 volume in col1, got %d", len(collections["col1"])) - } - if len(collections["col2"]) != 2 { - t.Errorf("expected 2 volumes in col2, got %d", len(collections["col2"])) - } -} - -func TestCollectECCollectionsWithFilter(t *testing.T) { - nodes := map[string]*ecNodeInfo{ - "node1": { - ecShards: map[uint32]*ecVolumeInfo{ - 100: {collection: "col1"}, - 200: {collection: "col2"}, - }, - }, - } - - config := NewDefaultConfig() - config.CollectionFilter = "col1" - collections := collectECCollections(nodes, config) - - if len(collections) != 1 { - t.Fatalf("expected 1 collection, got %d", len(collections)) - } - if _, ok := collections["col1"]; !ok { - t.Error("expected col1 to be present") + if moves := ecbalancer.Plan(topo, ecbalancer.Options{ImbalanceThreshold: 0.01}); len(moves) != 0 { + t.Errorf("filtered-out collection should produce no moves, got %d", len(moves)) } } @@ -542,46 +82,24 @@ func TestDetectionNilTopology(t *testing.T) { config := NewDefaultConfig() clusterInfo := &types.ClusterInfo{ActiveTopology: nil} - _, _, err := Detection(context.Background(), nil, clusterInfo, config, 0) - if err == nil { + if _, _, err := Detection(context.Background(), nil, clusterInfo, config, 0); err == nil { t.Fatal("expected error for nil topology") } } func TestMovePhasePriority(t *testing.T) { - if movePhasePriority("dedup") != types.TaskPriorityHigh { - t.Error("dedup should be high priority") + cases := map[string]types.TaskPriority{ + "dedup": types.TaskPriorityHigh, + "cross_rack": types.TaskPriorityMedium, + "within_rack": types.TaskPriorityLow, + "global": types.TaskPriorityLow, } - if movePhasePriority("cross_rack") != 
types.TaskPriorityMedium { - t.Error("cross_rack should be medium priority") - } - if movePhasePriority("within_rack") != types.TaskPriorityLow { - t.Error("within_rack should be low priority") - } - if movePhasePriority("global") != types.TaskPriorityLow { - t.Error("global should be low priority") + for phase, want := range cases { + if got := movePhasePriority(phase); got != want { + t.Errorf("movePhasePriority(%q) = %v, want %v", phase, got, want) + } } } -func TestExceedsImbalanceThreshold(t *testing.T) { - // 14 vs 0 across 2 groups: imbalance = 14/7 = 2.0 > any reasonable threshold - counts := map[string]int{"a": 14, "b": 0} - if !exceedsImbalanceThreshold(counts, 14, 2, 0.2) { - t.Error("expected imbalance to exceed 0.2 threshold") - } - - // Only one group has shards but numGroups=2: min is 0 from absent group - counts2 := map[string]int{"a": 14} - if !exceedsImbalanceThreshold(counts2, 14, 2, 0.2) { - t.Error("expected imbalance with absent group to exceed 0.2 threshold") - } - - // 7 vs 7: perfectly balanced - counts3 := map[string]int{"a": 7, "b": 7} - if exceedsImbalanceThreshold(counts3, 14, 2, 0.01) { - t.Error("expected balanced distribution to not exceed threshold") - } -} - -// helper to avoid unused import +// keep the erasure_coding import meaningful for future adapter tests var _ = erasure_coding.DataShardsCount diff --git a/weed/worker/tasks/ec_balance/ec_balance_task.go b/weed/worker/tasks/ec_balance/ec_balance_task.go index 904b6db8e..f9692d7f2 100644 --- a/weed/worker/tasks/ec_balance/ec_balance_task.go +++ b/weed/worker/tasks/ec_balance/ec_balance_task.go @@ -73,6 +73,17 @@ func (t *ECBalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParam isDedupDelete := ecParams != nil && isDedupPhase(params) + // Guard against a same-node, cross-disk "move". 
copyAndMountShard skips the + // copy when source and target addresses match, but deleteShard is node-wide + // (it removes the shard from every disk on the node), so this sequence would + // erase the shard after never copying it. EC shards also cannot be relocated + // between disks of one node via these RPCs, so such a move is meaningless. + // Reject it rather than lose data. + if source.Node == target.Node && source.DiskId != target.DiskId { + return fmt.Errorf("refusing same-node cross-disk EC shard move for volume %d shard(s) %v on %s (source disk %d, target disk %d): EC shard delete is node-wide and would erase the shard after a skipped copy", + params.VolumeId, source.ShardIds, source.Node, source.DiskId, target.DiskId) + } + glog.Infof("EC balance: moving shard(s) %v of volume %d from %s to %s", source.ShardIds, params.VolumeId, source.Node, target.Node) @@ -199,6 +210,14 @@ func (t *ECBalanceTask) Validate(params *worker_pb.TaskParams) error { if len(params.Targets[0].ShardIds) == 0 { return fmt.Errorf("ECBalanceTask.Validate: Targets[0].ShardIds is empty") } + // A same-node, cross-disk move is unsafe: the node-wide EC shard delete would + // erase the shard after copyAndMountShard skips the same-address copy. Such a + // move cannot be expressed by these RPCs anyway. Dedup (same node and disk) is + // allowed. 
+ if params.Sources[0].Node == params.Targets[0].Node && params.Sources[0].DiskId != params.Targets[0].DiskId { + return fmt.Errorf("ECBalanceTask.Validate: refusing same-node cross-disk move on %s (source disk %d, target disk %d): EC shard delete is node-wide", + params.Sources[0].Node, params.Sources[0].DiskId, params.Targets[0].DiskId) + } return nil } @@ -223,10 +242,16 @@ func (t *ECBalanceTask) reportProgress(progress float64, stage string) { glog.Infof("EC balance volume %d: [%.2f] %s", t.volumeID, progress, stage) } -// isDedupPhase checks if this is a dedup-phase task (source and target are the same node) +// isDedupPhase checks if this is a dedup-phase task: an unmount+delete on a +// single location, encoded by detection as source==target on the same node AND +// the same disk. Comparing the disk too is essential — VolumeEcShardsDelete is +// node-wide (it removes the shard from every disk on the node), so a same-node +// but cross-disk request must NOT be treated as a benign dedup; see Validate +// and Execute, which reject it outright. 
func isDedupPhase(params *worker_pb.TaskParams) bool { if len(params.Sources) > 0 && len(params.Targets) > 0 { - return params.Sources[0].Node == params.Targets[0].Node + s, t := params.Sources[0], params.Targets[0] + return s.Node == t.Node && s.DiskId == t.DiskId } return false } diff --git a/weed/worker/tasks/ec_balance/multidisk_detection_test.go b/weed/worker/tasks/ec_balance/multidisk_detection_test.go new file mode 100644 index 000000000..3f2ab04a3 --- /dev/null +++ b/weed/worker/tasks/ec_balance/multidisk_detection_test.go @@ -0,0 +1,335 @@ +package ec_balance + +import ( + "context" + "sort" + "testing" + + "github.com/seaweedfs/seaweedfs/weed/admin/topology" + "github.com/seaweedfs/seaweedfs/weed/pb/master_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/worker/types" +) + +// In-process (no real cluster) tests of the detection/planning path against a +// topology shaped the way the master reports a multi-disk cluster: several +// same-type physical disks on a node collapse into a single DiskInfo, with each +// shard's real DiskId surviving only in the per-shard records (issue 9593). They +// run the master-wire-format -> ActiveTopology -> Detection path, then simulate +// executing the planned moves with the volume server's actual semantics +// (VolumeEcShardsDelete is node-wide) to assert no EC shard is ever lost. The +// real-cluster end-to-end equivalent lives in test/erasure_coding. + +// nodeSpec describes one volume server: its rack and the volume's shards per +// physical disk (diskID -> shard ids). 
+type nodeSpec struct { + id string + rack string + disks map[uint32][]int +} + +const integDisksPerNode = 6 + +// buildMasterTopology renders nodeSpecs into a *master_pb.TopologyInfo exactly as +// the master would: one DiskInfo per node keyed by disk type (""), carrying one +// EcShardInfo per (volume, physical disk) plus regular VolumeInfos so every +// physical disk is discoverable even when it holds no EC shards. +func buildMasterTopology(collection string, vid uint32, maxVolPerDisk int, specs []nodeSpec) *master_pb.TopologyInfo { + rackByID := map[string]*master_pb.RackInfo{} + var rackOrder []string + + for _, spec := range specs { + var ecShards []*master_pb.VolumeEcShardInformationMessage + for diskID, shards := range spec.disks { + var bits erasure_coding.ShardBits + for _, s := range shards { + bits = bits.Set(erasure_coding.ShardId(s)) + } + ecShards = append(ecShards, &master_pb.VolumeEcShardInformationMessage{ + Id: vid, + Collection: collection, + DiskId: diskID, + EcIndexBits: uint32(bits), + }) + } + // Expose all physical disks (incl. EC-empty ones) via regular volumes. 
+ var volInfos []*master_pb.VolumeInformationMessage + for d := uint32(0); d < integDisksPerNode; d++ { + volInfos = append(volInfos, &master_pb.VolumeInformationMessage{Id: 90000 + d, DiskId: d}) + } + dn := &master_pb.DataNodeInfo{ + Id: spec.id, + DiskInfos: map[string]*master_pb.DiskInfo{ + "": { + Type: "", + MaxVolumeCount: int64(maxVolPerDisk * integDisksPerNode), + VolumeCount: int64(integDisksPerNode), + EcShardInfos: ecShards, + VolumeInfos: volInfos, + }, + }, + } + r, ok := rackByID[spec.rack] + if !ok { + r = &master_pb.RackInfo{Id: spec.rack} + rackByID[spec.rack] = r + rackOrder = append(rackOrder, spec.rack) + } + r.DataNodeInfos = append(r.DataNodeInfos, dn) + } + + dc := &master_pb.DataCenterInfo{Id: "dc1"} + for _, rk := range rackOrder { + dc.RackInfos = append(dc.RackInfos, rackByID[rk]) + } + return &master_pb.TopologyInfo{Id: "integ", DataCenterInfos: []*master_pb.DataCenterInfo{dc}} +} + +// runDetection wires the topology through a real ActiveTopology and runs Detection. +func runDetection(t *testing.T, topoInfo *master_pb.TopologyInfo, cfg *Config) []*types.TaskDetectionResult { + t.Helper() + at := topology.NewActiveTopology(0) + if err := at.UpdateTopology(topoInfo); err != nil { + t.Fatalf("UpdateTopology: %v", err) + } + results, _, err := Detection(context.Background(), nil, &types.ClusterInfo{ActiveTopology: at}, cfg, 0) + if err != nil { + t.Fatalf("Detection: %v", err) + } + return results +} + +// shardModel is nodeID -> diskID -> set of shard ids for one volume. 
+type shardModel map[string]map[uint32]map[int]bool + +func modelFromSpecs(specs []nodeSpec) shardModel { + m := make(shardModel) + for _, spec := range specs { + m[spec.id] = make(map[uint32]map[int]bool) + for diskID, shards := range spec.disks { + set := make(map[int]bool) + for _, s := range shards { + set[s] = true + } + m[spec.id][diskID] = set + } + } + return m +} + +func (m shardModel) distinctShards() []int { + seen := map[int]bool{} + for _, disks := range m { + for _, set := range disks { + for s := range set { + seen[s] = true + } + } + } + out := make([]int, 0, len(seen)) + for s := range seen { + out = append(out, s) + } + sort.Ints(out) + return out +} + +// apply replays planned moves with the volume server's actual +// behavior: a move copies the shard to the destination disk, then deletes it on +// the source node — and EC shard delete is node-wide (removes the shard from +// every disk on the source node). A dedup move (same node+disk) is delete-only.
+func (m shardModel) apply(results []*types.TaskDetectionResult) { + nodeWideDelete := func(node string, shard int) { + for diskID := range m[node] { + delete(m[node][diskID], shard) + } + } + for _, r := range results { + p := r.TypedParams + if p == nil || len(p.Sources) == 0 || len(p.Targets) == 0 { + continue + } + src, dst := p.Sources[0], p.Targets[0] + if len(src.ShardIds) == 0 { + continue + } + shard := int(src.ShardIds[0]) + dedup := src.Node == dst.Node && src.DiskId == dst.DiskId + if !dedup { + if m[dst.Node] == nil { + m[dst.Node] = make(map[uint32]map[int]bool) + } + if m[dst.Node][dst.DiskId] == nil { + m[dst.Node][dst.DiskId] = make(map[int]bool) + } + m[dst.Node][dst.DiskId][shard] = true + } + nodeWideDelete(src.Node, shard) + } +} + +// TestMultiDiskBalanceNeverLosesShards is the core regression for +// issue 9593: a freshly-encoded volume on a 3-node, 6-disk-per-node cluster, +// balanced and then concentrated, must never lose a shard when the planned moves +// are executed with real node-wide-delete semantics. +func TestMultiDiskBalanceNeverLosesShards(t *testing.T) { + cfg := NewDefaultConfig() + cfg.ImbalanceThreshold = 0.0 // balance to even; surface any move the planner makes + + cases := []struct { + name string + specs []nodeSpec + }{ + { + // Healthy post-encode spread: 14 shards across 3 nodes (5/5/4), each + // node spreading its shards over distinct physical disks. + name: "balanced across disks", + specs: []nodeSpec{ + {id: "n1", rack: "rack1", disks: map[uint32][]int{0: {0}, 1: {1}, 2: {2}, 3: {3}, 4: {4}}}, + {id: "n2", rack: "rack1", disks: map[uint32][]int{0: {5}, 1: {6}, 2: {7}, 3: {8}, 4: {9}}}, + {id: "n3", rack: "rack1", disks: map[uint32][]int{0: {10}, 1: {11}, 2: {12}, 3: {13}}}, + }, + }, + { + // Worst case from the report: all 14 shards landed on one node's disks; + // the balancer must redistribute without losing any. 
+ name: "concentrated on one node", + specs: []nodeSpec{ + {id: "n1", rack: "rack1", disks: map[uint32][]int{0: {0, 1, 2}, 1: {3, 4, 5}, 2: {6, 7}, 3: {8, 9}, 4: {10, 11}, 5: {12, 13}}}, + {id: "n2", rack: "rack2", disks: map[uint32][]int{}}, + {id: "n3", rack: "rack3", disks: map[uint32][]int{}}, + }, + }, + { + // Multi-rack healthy spread. + name: "spread across racks", + specs: []nodeSpec{ + {id: "n1", rack: "rack1", disks: map[uint32][]int{0: {0, 1}, 1: {2, 3}, 2: {4}}}, + {id: "n2", rack: "rack2", disks: map[uint32][]int{0: {5, 6}, 1: {7, 8}, 2: {9}}}, + {id: "n3", rack: "rack3", disks: map[uint32][]int{0: {10, 11}, 1: {12, 13}}}, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + topoInfo := buildMasterTopology("col1", 28, 50, tc.specs) + results := runDetection(t, topoInfo, cfg) + + model := modelFromSpecs(tc.specs) + before := model.distinctShards() + + for _, r := range results { + // No move may be a same-node cross-disk move (node-wide delete + // would erase the shard after a skipped copy). + p := r.TypedParams + s, d := p.Sources[0], p.Targets[0] + if s.Node == d.Node && s.DiskId != d.DiskId { + t.Errorf("unsafe same-node cross-disk move: vol %d shard %v on %s disk %d->%d", p.VolumeId, s.ShardIds, s.Node, s.DiskId, d.DiskId) + } + // Source disk must actually hold the shard being moved. 
+ shard := int(s.ShardIds[0]) + if !model[s.Node][s.DiskId][shard] && s.Node != d.Node { + t.Errorf("move sources vol %d shard %d from %s disk %d, which does not hold it", p.VolumeId, shard, s.Node, s.DiskId) + } + } + + model.apply(results) + after := model.distinctShards() + + if len(after) != len(before) { + t.Errorf("[%s] shard loss: had %v (%d), now %v (%d) after %d moves", + tc.name, before, len(before), after, len(after), len(results)) + } + }) + } +} + +// TestConcentratedVolumeSpreadsAcrossNodesAndDisks asserts the +// remediation actually happens: a one-node-concentrated volume is redistributed +// to the other racks, landing on multiple distinct destination disks. +func TestConcentratedVolumeSpreadsAcrossNodesAndDisks(t *testing.T) { + cfg := NewDefaultConfig() + cfg.ImbalanceThreshold = 0.0 + + specs := []nodeSpec{ + {id: "n1", rack: "rack1", disks: map[uint32][]int{0: {0, 1, 2}, 1: {3, 4, 5}, 2: {6, 7}, 3: {8, 9}, 4: {10, 11}, 5: {12, 13}}}, + {id: "n2", rack: "rack2", disks: map[uint32][]int{}}, + {id: "n3", rack: "rack3", disks: map[uint32][]int{}}, + } + results := runDetection(t, buildMasterTopology("col1", 28, 50, specs), cfg) + if len(results) == 0 { + t.Fatal("expected redistribution moves for a one-node-concentrated volume") + } + + destNodes := map[string]bool{} + destDisksByNode := map[string]map[uint32]bool{} + for _, r := range results { + d := r.TypedParams.Targets[0] + if d.Node == "n1" { + continue + } + destNodes[d.Node] = true + if destDisksByNode[d.Node] == nil { + destDisksByNode[d.Node] = map[uint32]bool{} + } + destDisksByNode[d.Node][d.DiskId] = true + } + if len(destNodes) < 2 { + t.Errorf("shards spread to only %d destination nodes, want both other racks", len(destNodes)) + } + for node, disks := range destDisksByNode { + if len(disks) < 2 { + t.Errorf("destination %s received shards on only %d disk(s); expected spread across disks: %v", node, len(disks), disks) + } + } +} + +// TestBuildBalancerTopologyNormalizesHddDiskType 
guards the disk-type filter: +// the master reports default-HDD disks under the empty-string key, so a config of +// "hdd" must match them (not filter everything out), while "ssd" must exclude them. +func TestBuildBalancerTopologyNormalizesHddDiskType(t *testing.T) { + specs := []nodeSpec{ + {id: "n1", rack: "r1", disks: map[uint32][]int{0: {0, 1}}}, + {id: "n2", rack: "r1", disks: map[uint32][]int{0: {2}}}, + } + topoInfo := buildMasterTopology("c", 100, 50, specs) + + if _, n := buildBalancerTopology(topoInfo, &Config{DiskType: "hdd"}); n != 2 { + t.Errorf("disk_type=hdd matched %d nodes on an all-HDD cluster, want 2 (hdd must map to the empty HDD key)", n) + } + if _, n := buildBalancerTopology(topoInfo, &Config{DiskType: ""}); n != 2 { + t.Errorf("disk_type=empty matched %d nodes, want 2 (all)", n) + } + if _, n := buildBalancerTopology(topoInfo, &Config{DiskType: "ssd"}); n != 0 { + t.Errorf("disk_type=ssd matched %d nodes on an all-HDD cluster, want 0", n) + } +} + +// TestResolveReplicaPlacementFallsBackToMasterDefault verifies the worker mirrors +// the shell: explicit config wins, otherwise the master's default replication is +// the fallback, and an empty or zero-replication value means no constraint. 
+func TestResolveReplicaPlacementFallsBackToMasterDefault(t *testing.T) { + cases := []struct { + name string + configRP string + defaultRP string + wantApplied bool + }{ + {"explicit config used", "010", "", true}, + {"explicit config wins over default", "010", "100", true}, + {"falls back to master default", "", "010", true}, + {"zero master default = no constraint", "", "000", false}, + {"empty everywhere = no constraint", "", "", false}, + {"invalid value ignored", "", "nonsense", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rp := resolveReplicaPlacement(&Config{ReplicaPlacement: tc.configRP}, + &types.ClusterInfo{DefaultReplicaPlacement: tc.defaultRP}) + if (rp != nil) != tc.wantApplied { + t.Errorf("config=%q default=%q: applied=%v, want %v", tc.configRP, tc.defaultRP, rp != nil, tc.wantApplied) + } + }) + } +} diff --git a/weed/worker/tasks/ec_balance/multidisk_test.go b/weed/worker/tasks/ec_balance/multidisk_test.go new file mode 100644 index 000000000..ed3a2fadc --- /dev/null +++ b/weed/worker/tasks/ec_balance/multidisk_test.go @@ -0,0 +1,58 @@ +package ec_balance + +import ( + "testing" + + "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb" +) + +// These cover the worker execution layer (ec_balance_task.go). The shard-balance +// policy and multi-disk placement are tested in the shared ecbalancer package. + +// TestValidateRejectsSameNodeCrossDiskMove covers the data-loss trap of a +// same-node, cross-disk shard move. copyAndMountShard skips the copy when source +// and target addresses match, but VolumeEcShardsDelete is node-wide, so executing +// such a move would erase the shard. Validate must reject it, while allowing a +// cross-node move and a same-node/same-disk dedup. 
+func TestValidateRejectsSameNodeCrossDiskMove(t *testing.T) { + task := NewECBalanceTask("t1", 100, "col1", nil) + + mk := func(srcNode string, srcDisk uint32, dstNode string, dstDisk uint32) *worker_pb.TaskParams { + return &worker_pb.TaskParams{ + VolumeId: 100, + Sources: []*worker_pb.TaskSource{{Node: srcNode, DiskId: srcDisk, ShardIds: []uint32{3}}}, + Targets: []*worker_pb.TaskTarget{{Node: dstNode, DiskId: dstDisk, ShardIds: []uint32{3}}}, + } + } + + if err := task.Validate(mk("node1", 0, "node1", 3)); err == nil { + t.Error("Validate accepted a same-node cross-disk move; it must reject it to avoid node-wide delete data loss") + } + if err := task.Validate(mk("node1", 0, "node2", 0)); err != nil { + t.Errorf("Validate rejected a legitimate cross-node move: %v", err) + } + if err := task.Validate(mk("node1", 2, "node1", 2)); err != nil { + t.Errorf("Validate rejected a same-node/same-disk dedup: %v", err) + } +} + +// TestIsDedupPhaseRequiresSameDisk confirms dedup classification keys on both +// node and disk, so a same-node cross-disk request is never silently routed to +// the unmount+delete path. 
+func TestIsDedupPhaseRequiresSameDisk(t *testing.T) { + withParams := func(srcDisk, dstDisk uint32) *worker_pb.TaskParams { + return &worker_pb.TaskParams{ + Sources: []*worker_pb.TaskSource{{Node: "node1", DiskId: srcDisk}}, + Targets: []*worker_pb.TaskTarget{{Node: "node1", DiskId: dstDisk}}, + TaskParams: &worker_pb.TaskParams_EcBalanceParams{ + EcBalanceParams: &worker_pb.EcBalanceTaskParams{}, + }, + } + } + if !isDedupPhase(withParams(2, 2)) { + t.Error("same node and disk should be a dedup phase") + } + if isDedupPhase(withParams(0, 3)) { + t.Error("same node but different disk must NOT be a dedup phase") + } +} diff --git a/weed/worker/tasks/ec_balance/plugin_handler.go b/weed/worker/tasks/ec_balance/plugin_handler.go index 5146acc6f..68d97191c 100644 --- a/weed/worker/tasks/ec_balance/plugin_handler.go +++ b/weed/worker/tasks/ec_balance/plugin_handler.go @@ -102,13 +102,22 @@ func (h *ECBalanceHandler) Descriptor() *plugin_pb.JobTypeDescriptor { FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING, Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT, }, + { + Name: "shard_replica_placement", + Label: "Shard Replica Placement", + Description: "EC shard replica placement constraint (e.g. 020); empty uses the master default replication (even spread only when that default is empty or zero). 
Mirrors the ec.balance -shardReplicaPlacement flag.", + Placeholder: "master default", + FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING, + Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT, + }, }, }, }, DefaultValues: map[string]*plugin_pb.ConfigValue{ - "collection_filter": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, - "data_center_filter": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, - "disk_type": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, + "collection_filter": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, + "data_center_filter": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, + "disk_type": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, + "shard_replica_placement": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}}, }, }, WorkerConfigForm: &plugin_pb.ConfigForm{ @@ -207,6 +216,10 @@ func (h *ECBalanceHandler) Detect( if diskType != "" { workerConfig.TaskConfig.DiskType = diskType } + replicaPlacement := strings.TrimSpace(pluginworker.ReadStringConfig(request.GetAdminConfigValues(), "shard_replica_placement", "")) + if replicaPlacement != "" { + workerConfig.TaskConfig.ReplicaPlacement = replicaPlacement + } masters := make([]string, 0) if request.ClusterContext != nil { @@ -218,7 +231,10 @@ func (h *ECBalanceHandler) Detect( return err } - clusterInfo := &workertypes.ClusterInfo{ActiveTopology: activeTopology} + clusterInfo := &workertypes.ClusterInfo{ + ActiveTopology: activeTopology, + DefaultReplicaPlacement: pluginworker.FetchDefaultReplicaPlacement(ctx, masters, h.grpcDialOption), + } maxResults := int(request.MaxResults) if maxResults < 0 { maxResults = 0 @@ -427,15 +443,15 @@ func buildECBalanceProposal(result *workertypes.TaskDetectionResult) (*plugin_pb proposalID = fmt.Sprintf("ec-balance-%d-%d", result.VolumeID, time.Now().UnixNano()) } - // Dedupe key includes volume ID, shard ID, source node, and collection - // to distinguish 
moves of the same shard from different source nodes (e.g. dedup) + // Dedupe per (collection, volume), NOT per shard. A volume's EC shards can be + // spread across several disks on one server, and concurrent moves of the same + // volume race on its shared .ecx/.ecj/.vif sidecar files. Keying by volume + // makes the scheduler run only one of a volume's moves at a time — both within + // a detection run and against in-flight jobs — and because the planner emits a + // volume's moves in phase order (dedup, cross-rack, within-rack, global), the + // phases then execute in order across detection cycles. This mirrors the shell, + // which serializes a volume's moves and waits between phases. dedupeKey := fmt.Sprintf("ec_balance:%d", result.VolumeID) - if len(result.TypedParams.Sources) > 0 { - src := result.TypedParams.Sources[0] - if len(src.ShardIds) > 0 { - dedupeKey = fmt.Sprintf("ec_balance:%d:%d:%s", result.VolumeID, src.ShardIds[0], src.Node) - } - } if result.Collection != "" { dedupeKey += ":" + result.Collection } diff --git a/weed/worker/tasks/ec_balance/plugin_handler_test.go b/weed/worker/tasks/ec_balance/plugin_handler_test.go index 5c3b8c96e..bfabacc93 100644 --- a/weed/worker/tasks/ec_balance/plugin_handler_test.go +++ b/weed/worker/tasks/ec_balance/plugin_handler_test.go @@ -118,8 +118,11 @@ func TestBuildECBalanceProposal(t *testing.T) { if proposal.ProposalId != "test-task-123" { t.Errorf("expected proposal ID test-task-123, got %s", proposal.ProposalId) } - if proposal.DedupeKey != "ec_balance:42:5:source:8080:test-col" { - t.Errorf("expected dedupe key ec_balance:42:5:source:8080:test-col, got %s", proposal.DedupeKey) + // Dedupe is per (volume, collection) so a volume's shard moves serialize and + // run in phase order, rather than per shard which allowed concurrent same-volume + // moves that race on the volume's .ecx/.ecj/.vif sidecars. 
+ if proposal.DedupeKey != "ec_balance:42:test-col" { + t.Errorf("expected dedupe key ec_balance:42:test-col, got %s", proposal.DedupeKey) } if proposal.Labels["source_node"] != "source:8080" { t.Errorf("expected source_node label source:8080, got %s", proposal.Labels["source_node"]) diff --git a/weed/worker/types/data_types.go b/weed/worker/types/data_types.go index cc6de8feb..cd3040597 100644 --- a/weed/worker/types/data_types.go +++ b/weed/worker/types/data_types.go @@ -22,6 +22,10 @@ type ClusterInfo struct { LastUpdated time.Time ActiveTopology *topology.ActiveTopology // Added for destination planning in detection VolumeReplicaMap map[uint32][]ReplicaLocation + // DefaultReplicaPlacement is the master's configured default replication + // (GetMasterConfiguration). Detectors use it as the fallback when no explicit + // replica placement is set, matching the shell's behavior. Empty = none. + DefaultReplicaPlacement string // GrpcDialOption is set when a detector needs to make targeted gRPC calls // during detection (e.g., the EC detector auto-cleans up an orphaned // regular replica that survived a previous encode; see #9448). Optional: