diff --git a/weed/storage/erasure_coding/ec_locate.go b/weed/storage/erasure_coding/ec_locate.go index 84ba8e04f..4504124ef 100644 --- a/weed/storage/erasure_coding/ec_locate.go +++ b/weed/storage/erasure_coding/ec_locate.go @@ -64,7 +64,7 @@ func moveToNextBlock(blockIndex int, isLargeBlock bool, nLargeBlockRows int64) ( func locateOffset(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64) (blockIndex int, isLargeBlock bool, nLargeBlockRows int64, innerBlockOffset int64) { largeRowSize := largeBlockLength * DataShardsCount - nLargeBlockRows = (shardDatSize - 1) / largeBlockLength + nLargeBlockRows = shardDatSize / largeBlockLength // if offset is within the large block area if offset < nLargeBlockRows*largeRowSize { diff --git a/weed/storage/erasure_coding/ec_roundtrip_test.go b/weed/storage/erasure_coding/ec_roundtrip_test.go new file mode 100644 index 000000000..efbbad84e --- /dev/null +++ b/weed/storage/erasure_coding/ec_roundtrip_test.go @@ -0,0 +1,388 @@ +package erasure_coding + +import ( + "bytes" + "crypto/rand" + "fmt" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/seaweedfs/seaweedfs/weed/storage/types" +) + +// TestEcReadRoundTrip tests the EC encode→read cycle via LocateData for various +// .dat file sizes, paying special attention to the large/small block boundary. +// +// The nLargeBlockRows calculation must correctly distinguish between large and small +// blocks. A previous bug (issue #8947) caused an off-by-one error when +// shardDatSize was an exact multiple of largeBlockSize, leading to data corruption. +func TestEcReadRoundTrip(t *testing.T) { + const ( + large = largeBlockSize // 10000 + small = smallBlockSize // 100 + ) + + largeRowSize := large * DataShardsCount // 100000 + smallRowSize := small * DataShardsCount // 1000 + + testCases := []struct { + name string + datSize int64 + }{ + // Exact multiples of largeRowSize — triggers the nLargeBlockRows off-by-one bug + {"1_large_row_exact", int64(largeRowSize)}, + {"2_large_rows_exact", int64(2 * largeRowSize)}, + {"3_large_rows_exact", int64(3 * largeRowSize)}, + + // Just over a large row boundary — has small blocks + {"1_large_row_plus_1", int64(largeRowSize + 1)}, + {"2_large_rows_plus_small", int64(2*largeRowSize + smallRowSize)}, + {"1_large_row_plus_half_small", int64(largeRowSize + smallRowSize/2)}, + + // Just under a large row boundary — all small blocks + {"just_under_1_large_row", int64(largeRowSize - 1)}, + {"just_under_2_large_rows", int64(2*largeRowSize - 1)}, + + // Small data — no large blocks at all + {"small_only", int64(smallRowSize * 3)}, + {"small_single_row", int64(smallRowSize)}, + + // Boundary with mixed large and small + {"boundary_spanning", int64(largeRowSize + smallRowSize*5 + 50)}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + testEcRead(t, large, small, tc.datSize) + }) + } +} + +// testEcRead creates a .dat file, EC-encodes it, then verifies LocateData-based reads +// return correct data at positions throughout the file (especially near the large/small +// block boundary). +func testEcRead(t *testing.T, large, small, datSize int64) { + t.Helper() + + dir := t.TempDir() + baseFileName := fmt.Sprintf("%s/rt_%d", dir, datSize) + + // 1. Create a .dat file with deterministic random data + originalData := make([]byte, datSize) + _, err := rand.Read(originalData) + require.NoError(t, err, "generating random data") + + err = os.WriteFile(baseFileName+".dat", originalData, 0644) + require.NoError(t, err, "writing .dat file") + + ctx := NewDefaultECContext("", 0) + + // 2. EC encode with test block sizes + err = generateEcFiles(baseFileName, int(small), large, small, ctx) + require.NoError(t, err, "EC encoding") + + // 3. Open EC shard files for reading + ecFiles, err := openEcFiles(baseFileName, true, ctx) + require.NoError(t, err, "opening EC files") + defer closeEcFiles(ecFiles) + + ecStat, err := ecFiles[0].Stat() + require.NoError(t, err) + shardFileSize := ecStat.Size() + + // Compute shardDatSize as the production code does when datFileSize is known + shardDatSizeFromDat := datSize / int64(ctx.DataShards) + + // 4. Verify EC reads at various positions + largeRowSize := large * DataShardsCount + encoderLargeRows := datSize / int64(largeRowSize) + boundaryOffset := encoderLargeRows * int64(largeRowSize) + + readSize := types.Size(small / 2) // read half a small block + testOffsets := collectTestOffsets(datSize, int64(readSize), boundaryOffset, large, small) + + for _, offset := range testOffsets { + // Test with shardDatSize from datFileSize (the production path with fix) + intervals := LocateData(large, small, shardDatSizeFromDat, offset, readSize) + ecData, err := assembleFromIntervals(ecFiles, intervals, large, small) + require.NoError(t, err, "reading EC data at offset %d (datFileSize path)", offset) + + expected := originalData[offset : offset+int64(readSize)] + if !bytes.Equal(expected, ecData) { + t.Errorf("EC read mismatch at offset %d (datFileSize path, shardDatSize=%d, nLargeBlockRows=%d)", + offset, shardDatSizeFromDat, shardDatSizeFromDat/large) + } + + // Test with shardDatSize from ecdFileSize-1 (the fallback path for old volumes) + intervalsFallback := LocateData(large, small, shardFileSize-1, offset, readSize) + ecDataFallback, err := assembleFromIntervals(ecFiles, intervalsFallback, large, small) + if err == nil && !bytes.Equal(expected, ecDataFallback) { + // The fallback path may fail for exact multiples — log as warning + t.Logf("WARN: EC read mismatch at offset %d (fallback path, shardFileSize=%d)", + offset, shardFileSize) + } + } +} + +// locateOffsetBuggy reimplements locateOffset with the old buggy formula: +// +// nLargeBlockRows = (shardDatSize - 1) / largeBlockLength +// +// This caused an off-by-one error when shardDatSize was an exact multiple of +// largeBlockLength, miscounting the number of large block rows. +func locateOffsetBuggy(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64) (blockIndex int, isLargeBlock bool, nLargeBlockRows int64, innerBlockOffset int64) { + largeRowSize := largeBlockLength * DataShardsCount + nLargeBlockRows = (shardDatSize - 1) / largeBlockLength // THE BUG + + if offset < nLargeBlockRows*largeRowSize { + isLargeBlock = true + blockIndex = int(offset / largeBlockLength) + innerBlockOffset = offset % largeBlockLength + return + } + + isLargeBlock = false + offset -= nLargeBlockRows * largeRowSize + blockIndex = int(offset / smallBlockLength) + innerBlockOffset = offset % smallBlockLength + return +} + +// locateDataBuggy is LocateData using the old buggy locateOffset. +func locateDataBuggy(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64, size types.Size) []Interval { + blockIndex, isLargeBlock, nLargeBlockRows, innerBlockOffset := locateOffsetBuggy(largeBlockLength, smallBlockLength, shardDatSize, offset) + + var intervals []Interval + for size > 0 { + blockRemaining := largeBlockLength - innerBlockOffset + if !isLargeBlock { + blockRemaining = smallBlockLength - innerBlockOffset + } + if blockRemaining <= 0 { + blockIndex, isLargeBlock = moveToNextBlock(blockIndex, isLargeBlock, nLargeBlockRows) + innerBlockOffset = 0 + continue + } + interval := Interval{ + BlockIndex: blockIndex, + InnerBlockOffset: innerBlockOffset, + IsLargeBlock: isLargeBlock, + LargeBlockRowsCount: int(nLargeBlockRows), + } + if int64(size) <= blockRemaining { + interval.Size = size + intervals = append(intervals, interval) + return intervals + } + interval.Size = types.Size(blockRemaining) + intervals = append(intervals, interval) + size -= interval.Size + blockIndex, isLargeBlock = moveToNextBlock(blockIndex, isLargeBlock, nLargeBlockRows) + innerBlockOffset = 0 + } + return intervals +} + +// TestEcOffByOneBug_Issue8947 directly demonstrates the off-by-one bug. +// +// It creates a .dat file whose size is an exact multiple of (largeBlockSize * DataShards), +// EC-encodes it, then shows that: +// - The OLD buggy formula produces WRONG data (data corruption) +// - The FIXED formula produces CORRECT data +func TestEcOffByOneBug_Issue8947(t *testing.T) { + const ( + large = largeBlockSize // 10000 + small = smallBlockSize // 100 + ) + + // datSize is exactly 2 large rows — each shard gets exactly 2*largeBlockSize bytes. + // The encoder produces 2 large block rows and 0 small block rows. + datSize := int64(2 * large * DataShardsCount) // 200000 + + dir := t.TempDir() + baseFileName := fmt.Sprintf("%s/bug_%d", dir, datSize) + + originalData := make([]byte, datSize) + _, err := rand.Read(originalData) + require.NoError(t, err) + + err = os.WriteFile(baseFileName+".dat", originalData, 0644) + require.NoError(t, err) + + ctx := NewDefaultECContext("", 0) + err = generateEcFiles(baseFileName, int(small), large, small, ctx) + require.NoError(t, err, "EC encoding") + + ecFiles, err := openEcFiles(baseFileName, true, ctx) + require.NoError(t, err) + defer closeEcFiles(ecFiles) + + // shardDatSize = datFileSize / DataShards = 2 * largeBlockSize + // This is an EXACT multiple of largeBlockSize. + shardDatSize := datSize / int64(ctx.DataShards) // = 2 * large = 20000 + + // The encoder used 2 large block rows, 0 small block rows. + // Correct: nLargeBlockRows = 20000 / 10000 = 2 + // Buggy: nLargeBlockRows = (20000 - 1) / 10000 = 1 ← OFF BY ONE + fixedRows := shardDatSize / large + buggyRows := (shardDatSize - 1) / large + assert.Equal(t, int64(2), fixedRows, "fixed formula should give 2 large block rows") + assert.Equal(t, int64(1), buggyRows, "buggy formula gives only 1 (the bug)") + + // Test reading from the 2nd large block row (offsets 100000–199999). + // With the buggy formula (nLargeBlockRows=1), this region is incorrectly + // treated as small blocks, causing reads from the WRONG shard positions. + readSize := types.Size(small / 2) + + // Pick an offset well into the 2nd large block row so that the buggy formula + // computes a different (shard, offset) than the correct formula. + // At the very start of the 2nd row, both formulas coincidentally hit the same + // shard position. But further in, the small-block vs large-block addressing diverges. + offset := int64(large*DataShardsCount) + large + 50 // 110050: in 2nd large row, shard 1 + + // --- Fixed formula: reads correct data --- + fixedIntervals := LocateData(large, small, shardDatSize, offset, readSize) + fixedData, err := assembleFromIntervals(ecFiles, fixedIntervals, large, small) + require.NoError(t, err, "fixed LocateData read") + + expected := originalData[offset : offset+int64(readSize)] + assert.True(t, bytes.Equal(expected, fixedData), + "FIXED formula should read correct data from 2nd large block row") + + // --- Buggy formula: reads WRONG data --- + buggyIntervals := locateDataBuggy(large, small, shardDatSize, offset, readSize) + buggyData, err := assembleFromIntervals(ecFiles, buggyIntervals, large, small) + // The buggy formula might read from wrong offsets (possibly out of bounds), + // so an error is also evidence of the bug. + if err != nil { + t.Logf("Buggy formula caused read error (expected): %v", err) + } else { + assert.False(t, bytes.Equal(expected, buggyData), + "BUGGY formula should return WRONG data from 2nd large block row (demonstrating the corruption)") + n := 8 + if len(expected) < n { + n = len(expected) + } + t.Logf("Buggy formula returned wrong data: expected first bytes %x, got %x", + expected[:n], buggyData[:n]) + } + + // Verify the bug mechanism: buggy formula misclassifies the 2nd large row as small blocks + assert.True(t, fixedIntervals[0].IsLargeBlock, + "fixed: offset %d should be in large blocks", offset) + assert.False(t, buggyIntervals[0].IsLargeBlock, + "buggy: offset %d is incorrectly classified as small blocks (the bug)", offset) + + t.Logf("Fixed: nLargeBlockRows=%d, interval=%+v", fixedRows, fixedIntervals[0]) + t.Logf("Buggy: nLargeBlockRows=%d, interval=%+v", buggyRows, buggyIntervals[0]) +} + +// TestEcDecodeDatRoundTrip tests the full WriteDatFile decode path using the production +// block sizes with a small .dat file that fits within the small block region. +func TestEcDecodeDatRoundTrip(t *testing.T) { + // With production sizes, datFileSize must be < DataShardsCount * ErasureCodingLargeBlockSize (10GB) + // to avoid needing huge test files. We test small block decode only. + // Each shard gets datSize/DataShards bytes in small 1MB blocks. + datSizes := []int64{ + 1000, // tiny + int64(DataShardsCount) * ErasureCodingSmallBlockSize, // exactly 1 small row (10MB) + int64(DataShardsCount)*ErasureCodingSmallBlockSize + 500, // 1 small row + partial + } + + for _, datSize := range datSizes { + t.Run(fmt.Sprintf("size_%d", datSize), func(t *testing.T) { + testDecodeDat(t, datSize) + }) + } +} + +func testDecodeDat(t *testing.T, datSize int64) { + t.Helper() + + dir := t.TempDir() + baseFileName := fmt.Sprintf("%s/dec_%d", dir, datSize) + + // 1. Create .dat with random data + originalData := make([]byte, datSize) + _, err := rand.Read(originalData) + require.NoError(t, err) + + err = os.WriteFile(baseFileName+".dat", originalData, 0644) + require.NoError(t, err) + + ctx := NewDefaultECContext("", 0) + + // 2. EC encode with PRODUCTION block sizes + err = generateEcFiles(baseFileName, 256*1024, ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, ctx) + require.NoError(t, err, "EC encoding") + + // 3. Decode via WriteDatFile + decodedBase := baseFileName + "_decoded" + shardFileNames := make([]string, DataShardsCount) + for i := 0; i < DataShardsCount; i++ { + shardFileNames[i] = fmt.Sprintf("%s%s", baseFileName, ctx.ToExt(i)) + } + + err = WriteDatFile(decodedBase, datSize, shardFileNames) + require.NoError(t, err, "WriteDatFile") + + // 4. Verify decoded .dat matches original + decodedData, err := os.ReadFile(decodedBase + ".dat") + require.NoError(t, err) + + assert.Equal(t, len(originalData), len(decodedData), "decoded .dat size mismatch") + if !bytes.Equal(originalData, decodedData) { + for i := 0; i < len(originalData) && i < len(decodedData); i++ { + if originalData[i] != decodedData[i] { + t.Fatalf("decoded .dat mismatch at byte %d (datSize=%d)", i, datSize) + } + } + } +} + +// collectTestOffsets generates offsets to test, focusing on the large/small block boundary. +func collectTestOffsets(datSize, readSize, boundaryOffset, large, small int64) []int64 { + offsets := []int64{0} + + if datSize > readSize { + offsets = append(offsets, datSize/2) + } + + // Near the large/small block boundary + if boundaryOffset > 0 && boundaryOffset < datSize { + for _, delta := range []int64{-large, -small, -1, 0, 1, small, large} { + off := boundaryOffset + delta + if off >= 0 && off+readSize <= datSize { + offsets = append(offsets, off) + } + } + } + + // Near end of file + if datSize > readSize { + offsets = append(offsets, datSize-readSize) + } + + return offsets +} + +// assembleFromIntervals reads data from EC shard files according to the given intervals. +func assembleFromIntervals(ecFiles []*os.File, intervals []Interval, large, small int64) ([]byte, error) { + var data []byte + for _, interval := range intervals { + shardId, shardOffset := interval.ToShardIdAndOffset(large, small) + chunk := make([]byte, interval.Size) + n, err := ecFiles[shardId].ReadAt(chunk, shardOffset) + if err != nil { + return nil, fmt.Errorf("read shard %d offset %d size %d: %v", shardId, shardOffset, interval.Size, err) + } + if n != int(interval.Size) { + return nil, fmt.Errorf("short read from shard %d: got %d, want %d", shardId, n, interval.Size) + } + data = append(data, chunk...) + } + return data, nil +} diff --git a/weed/storage/erasure_coding/ec_test.go b/weed/storage/erasure_coding/ec_test.go index 0db9f30fa..3894d75d3 100644 --- a/weed/storage/erasure_coding/ec_test.go +++ b/weed/storage/erasure_coding/ec_test.go @@ -217,7 +217,9 @@ func (this Interval) sameAs(that Interval) bool { } func TestLocateData2(t *testing.T) { - intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472, 21479557912, 4194339) + // Use ecdFileSize-1 to simulate the fallback path in LocateEcShardNeedleInterval + // when datFileSize is not available (old EC volumes without .vif datFileSize). + intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472-1, 21479557912, 4194339) assert.Equal(t, intervals, []Interval{ {BlockIndex: 4, InnerBlockOffset: 527128, Size: 521448, IsLargeBlock: false, LargeBlockRowsCount: 2}, {BlockIndex: 5, InnerBlockOffset: 0, Size: 1048576, IsLargeBlock: false, LargeBlockRowsCount: 2}, @@ -228,7 +230,8 @@ func TestLocateData2(t *testing.T) { } func TestLocateData3(t *testing.T) { - intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472, 30782909808, 112568) + // Use ecdFileSize-1 to simulate the fallback path in LocateEcShardNeedleInterval + intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472-1, 30782909808, 112568) for _, interval := range intervals { fmt.Printf("%+v\n", interval) } @@ -237,13 +240,29 @@ func TestLocateData3(t *testing.T) { }) } +func TestLocateData_ExactMultiple_Issue8947(t *testing.T) { + // When datFileSize is available, shardDatSize = datFileSize / DataShards. + // For a 30GB volume with 10 data shards, shardDatSize = 3GB = 3 * ErasureCodingLargeBlockSize. + // The encoder produces 3 large block rows, 0 small block rows. + // nLargeBlockRows must be 3, not 2. + shardDatSize := int64(3) * ErasureCodingLargeBlockSize // 3GB per shard from datFileSize/DataShards + + // Reading from the 3rd large block row (offsets 20GB-30GB) should work + offset := int64(2) * ErasureCodingLargeBlockSize * DataShardsCount // 20GB + intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, shardDatSize, offset, 1024) + assert.Equal(t, 1, len(intervals)) + assert.True(t, intervals[0].IsLargeBlock, "data in 3rd large row should be in large blocks") + assert.Equal(t, 3, intervals[0].LargeBlockRowsCount) + assert.Equal(t, 20, intervals[0].BlockIndex) // block 20 = shard 0 of 3rd row +} + func TestLocateData_Issue8179(t *testing.T) { large := int64(10000) small := int64(100) shardSize := int64(259092) // Resulting in nLargeBlockRows = 25 as seen in panic log // Testing range through the large-to-small transition boundary - nLargeBlockRows := (shardSize - 1) / large + nLargeBlockRows := shardSize / large largeAreaSize := nLargeBlockRows * int64(DataShardsCount) * large for offset := largeAreaSize - 500; offset < largeAreaSize+500; offset++ { diff --git a/weed/storage/erasure_coding/ec_volume.go b/weed/storage/erasure_coding/ec_volume.go index effb78f28..0061f1222 100644 --- a/weed/storage/erasure_coding/ec_volume.go +++ b/weed/storage/erasure_coding/ec_volume.go @@ -294,14 +294,17 @@ func (ev *EcVolume) LocateEcShardNeedle(needleId types.NeedleId, version needle. func (ev *EcVolume) LocateEcShardNeedleInterval(version needle.Version, offset int64, size types.Size) (intervals []Interval) { shard := ev.Shards[0] - // Usually shard will be padded to round of ErasureCodingSmallBlockSize. - // So in most cases, if shardSize equals to n * ErasureCodingLargeBlockSize, - // the data would be in small blocks. - shardSize := shard.ecdFileSize - 1 + var shardSize int64 if ev.datFileSize > 0 { - // To get the correct LargeBlockRowsCount - // use datFileSize to calculate the shardSize to match the EC encoding logic. + // Use datFileSize to calculate the shardSize to match the EC encoding logic. + // This is the authoritative value stored in .vif during EC encoding. shardSize = ev.datFileSize / int64(ev.ECContext.DataShards) + } else { + // Fallback for old EC volumes without datFileSize in .vif. + // Subtract 1 to handle the ambiguous case where ecdFileSize is an exact + // multiple of ErasureCodingLargeBlockSize but the data is actually in small + // blocks (e.g., datFileSize was just under DataShards*ErasureCodingLargeBlockSize). + shardSize = shard.ecdFileSize - 1 } // calculate the locations in the ec shards intervals = LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, shardSize, offset, types.Size(needle.GetActualSize(size, version))) diff --git a/weed/storage/needle/needle_read.go b/weed/storage/needle/needle_read.go index 47918260e..7374dc0ca 100644 --- a/weed/storage/needle/needle_read.go +++ b/weed/storage/needle/needle_read.go @@ -55,7 +55,6 @@ func (n *Needle) ReadBytes(bytes []byte, offset int64, size Size, version Versio if n.Size != size { if OffsetSize == 4 && offset < int64(MaxPossibleVolumeSize) { stats.VolumeServerHandlerCounter.WithLabelValues(stats.ErrorSizeMismatchOffsetSize).Inc() - glog.Errorf("entry not found: offset %d found id %x size %d, expected size %d", offset, n.Id, n.Size, size) return ErrorSizeMismatch } stats.VolumeServerHandlerCounter.WithLabelValues(stats.ErrorSizeMismatch).Inc() diff --git a/weed/storage/store_ec.go b/weed/storage/store_ec.go index 605c0532b..accc7e6a7 100644 --- a/weed/storage/store_ec.go +++ b/weed/storage/store_ec.go @@ -167,9 +167,9 @@ func (s *Store) ReadEcShardNeedle(vid needle.VolumeId, n *needle.Needle, onReadS return 0, ErrorDeleted } - err = n.ReadBytes(bytes, offset.ToActualOffset(), size, localEcVolume.Version) + err = n.ReadBytes(bytes, 0, size, localEcVolume.Version) if err != nil { - return 0, fmt.Errorf("readbytes: %w", err) + return 0, fmt.Errorf("ec volume %d needle %s offset %d size %d: %w", vid, n.String(), offset.ToActualOffset(), size, err) } return len(bytes), nil