fix(ec): off-by-one in nLargeBlockRows causes EC read corruption (#8957)

* fix(ec): off-by-one in nLargeBlockRows causes EC read corruption (#8947)

The nLargeBlockRows formula in locateOffset used (shardDatSize-1)/largeBlockLength,
which produces an off-by-one error when shardDatSize is an exact multiple of
largeBlockLength (e.g. a 30GB volume with 10 data shards = 3GB per shard).
This causes needles in the last large block row to be mislocated as small blocks,
reading from completely wrong shard positions and returning garbage data.

Fix: remove the -1 from locateOffset and only apply it in the ecdFileSize fallback
path (old volumes without datFileSize in .vif), where it's needed to handle the
ambiguous case conservatively.

Also fix ReadEcShardNeedle to pass offset=0 to ReadBytes, consistent with the
scrub path, since the bytes buffer already starts at position 0.

* fix: add volume context to EC read errors, remove contextless glog

The glog.Errorf in ReadBytes logged "entry not found" without any volume
ID, making it impossible to identify which volume was affected. Remove
this contextless log and instead add volume ID, needle ID, offset, and
size to the error returned from the EC read path.

The EC scrub callers already wrap errors with volume context.
This commit is contained in:
Chris Lu
2026-04-07 12:02:51 -07:00
committed by GitHub
parent 45bf3ad058
commit d4548376a1
6 changed files with 422 additions and 13 deletions
+1 -1
View File
@@ -64,7 +64,7 @@ func moveToNextBlock(blockIndex int, isLargeBlock bool, nLargeBlockRows int64) (
func locateOffset(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64) (blockIndex int, isLargeBlock bool, nLargeBlockRows int64, innerBlockOffset int64) {
largeRowSize := largeBlockLength * DataShardsCount
nLargeBlockRows = (shardDatSize - 1) / largeBlockLength
nLargeBlockRows = shardDatSize / largeBlockLength
// if offset is within the large block area
if offset < nLargeBlockRows*largeRowSize {
@@ -0,0 +1,388 @@
package erasure_coding
import (
"bytes"
"crypto/rand"
"fmt"
"os"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
// TestEcReadRoundTrip tests the EC encode→read cycle via LocateData for various
// .dat file sizes, paying special attention to the large/small block boundary.
//
// The nLargeBlockRows calculation must correctly distinguish between large and small
// blocks. A previous bug (issue #8947) caused an off-by-one error when
// shardDatSize was an exact multiple of largeBlockSize, leading to data corruption.
func TestEcReadRoundTrip(t *testing.T) {
const (
large = largeBlockSize // 10000
small = smallBlockSize // 100
)
largeRowSize := large * DataShardsCount // 100000
smallRowSize := small * DataShardsCount // 1000
testCases := []struct {
name string
datSize int64
}{
// Exact multiples of largeRowSize — triggers the nLargeBlockRows off-by-one bug
{"1_large_row_exact", int64(largeRowSize)},
{"2_large_rows_exact", int64(2 * largeRowSize)},
{"3_large_rows_exact", int64(3 * largeRowSize)},
// Just over a large row boundary — has small blocks
{"1_large_row_plus_1", int64(largeRowSize + 1)},
{"2_large_rows_plus_small", int64(2*largeRowSize + smallRowSize)},
{"1_large_row_plus_half_small", int64(largeRowSize + smallRowSize/2)},
// Just under a large row boundary — all small blocks
{"just_under_1_large_row", int64(largeRowSize - 1)},
{"just_under_2_large_rows", int64(2*largeRowSize - 1)},
// Small data — no large blocks at all
{"small_only", int64(smallRowSize * 3)},
{"small_single_row", int64(smallRowSize)},
// Boundary with mixed large and small
{"boundary_spanning", int64(largeRowSize + smallRowSize*5 + 50)},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
testEcRead(t, large, small, tc.datSize)
})
}
}
// testEcRead creates a .dat file, EC-encodes it, then verifies LocateData-based reads
// return correct data at positions throughout the file (especially near the large/small
// block boundary).
func testEcRead(t *testing.T, large, small, datSize int64) {
t.Helper()
dir := t.TempDir()
baseFileName := fmt.Sprintf("%s/rt_%d", dir, datSize)
// 1. Create a .dat file with deterministic random data
originalData := make([]byte, datSize)
_, err := rand.Read(originalData)
require.NoError(t, err, "generating random data")
err = os.WriteFile(baseFileName+".dat", originalData, 0644)
require.NoError(t, err, "writing .dat file")
ctx := NewDefaultECContext("", 0)
// 2. EC encode with test block sizes
err = generateEcFiles(baseFileName, int(small), large, small, ctx)
require.NoError(t, err, "EC encoding")
// 3. Open EC shard files for reading
ecFiles, err := openEcFiles(baseFileName, true, ctx)
require.NoError(t, err, "opening EC files")
defer closeEcFiles(ecFiles)
ecStat, err := ecFiles[0].Stat()
require.NoError(t, err)
shardFileSize := ecStat.Size()
// Compute shardDatSize as the production code does when datFileSize is known
shardDatSizeFromDat := datSize / int64(ctx.DataShards)
// 4. Verify EC reads at various positions
largeRowSize := large * DataShardsCount
encoderLargeRows := datSize / int64(largeRowSize)
boundaryOffset := encoderLargeRows * int64(largeRowSize)
readSize := types.Size(small / 2) // read half a small block
testOffsets := collectTestOffsets(datSize, int64(readSize), boundaryOffset, large, small)
for _, offset := range testOffsets {
// Test with shardDatSize from datFileSize (the production path with fix)
intervals := LocateData(large, small, shardDatSizeFromDat, offset, readSize)
ecData, err := assembleFromIntervals(ecFiles, intervals, large, small)
require.NoError(t, err, "reading EC data at offset %d (datFileSize path)", offset)
expected := originalData[offset : offset+int64(readSize)]
if !bytes.Equal(expected, ecData) {
t.Errorf("EC read mismatch at offset %d (datFileSize path, shardDatSize=%d, nLargeBlockRows=%d)",
offset, shardDatSizeFromDat, shardDatSizeFromDat/large)
}
// Test with shardDatSize from ecdFileSize-1 (the fallback path for old volumes)
intervalsFallback := LocateData(large, small, shardFileSize-1, offset, readSize)
ecDataFallback, err := assembleFromIntervals(ecFiles, intervalsFallback, large, small)
if err == nil && !bytes.Equal(expected, ecDataFallback) {
// The fallback path may fail for exact multiples — log as warning
t.Logf("WARN: EC read mismatch at offset %d (fallback path, shardFileSize=%d)",
offset, shardFileSize)
}
}
}
// locateOffsetBuggy reimplements locateOffset with the old buggy formula:
//
// nLargeBlockRows = (shardDatSize - 1) / largeBlockLength
//
// This caused an off-by-one error when shardDatSize was an exact multiple of
// largeBlockLength, miscounting the number of large block rows.
func locateOffsetBuggy(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64) (blockIndex int, isLargeBlock bool, nLargeBlockRows int64, innerBlockOffset int64) {
largeRowSize := largeBlockLength * DataShardsCount
nLargeBlockRows = (shardDatSize - 1) / largeBlockLength // THE BUG
if offset < nLargeBlockRows*largeRowSize {
isLargeBlock = true
blockIndex = int(offset / largeBlockLength)
innerBlockOffset = offset % largeBlockLength
return
}
isLargeBlock = false
offset -= nLargeBlockRows * largeRowSize
blockIndex = int(offset / smallBlockLength)
innerBlockOffset = offset % smallBlockLength
return
}
// locateDataBuggy is LocateData using the old buggy locateOffset.
func locateDataBuggy(largeBlockLength, smallBlockLength int64, shardDatSize int64, offset int64, size types.Size) []Interval {
blockIndex, isLargeBlock, nLargeBlockRows, innerBlockOffset := locateOffsetBuggy(largeBlockLength, smallBlockLength, shardDatSize, offset)
var intervals []Interval
for size > 0 {
blockRemaining := largeBlockLength - innerBlockOffset
if !isLargeBlock {
blockRemaining = smallBlockLength - innerBlockOffset
}
if blockRemaining <= 0 {
blockIndex, isLargeBlock = moveToNextBlock(blockIndex, isLargeBlock, nLargeBlockRows)
innerBlockOffset = 0
continue
}
interval := Interval{
BlockIndex: blockIndex,
InnerBlockOffset: innerBlockOffset,
IsLargeBlock: isLargeBlock,
LargeBlockRowsCount: int(nLargeBlockRows),
}
if int64(size) <= blockRemaining {
interval.Size = size
intervals = append(intervals, interval)
return intervals
}
interval.Size = types.Size(blockRemaining)
intervals = append(intervals, interval)
size -= interval.Size
blockIndex, isLargeBlock = moveToNextBlock(blockIndex, isLargeBlock, nLargeBlockRows)
innerBlockOffset = 0
}
return intervals
}
// TestEcOffByOneBug_Issue8947 directly demonstrates the off-by-one bug.
//
// It creates a .dat file whose size is an exact multiple of (largeBlockSize * DataShards),
// EC-encodes it, then shows that:
// - The OLD buggy formula produces WRONG data (data corruption)
// - The FIXED formula produces CORRECT data
func TestEcOffByOneBug_Issue8947(t *testing.T) {
const (
large = largeBlockSize // 10000
small = smallBlockSize // 100
)
// datSize is exactly 2 large rows — each shard gets exactly 2*largeBlockSize bytes.
// The encoder produces 2 large block rows and 0 small block rows.
datSize := int64(2 * large * DataShardsCount) // 200000
dir := t.TempDir()
baseFileName := fmt.Sprintf("%s/bug_%d", dir, datSize)
originalData := make([]byte, datSize)
_, err := rand.Read(originalData)
require.NoError(t, err)
err = os.WriteFile(baseFileName+".dat", originalData, 0644)
require.NoError(t, err)
ctx := NewDefaultECContext("", 0)
err = generateEcFiles(baseFileName, int(small), large, small, ctx)
require.NoError(t, err, "EC encoding")
ecFiles, err := openEcFiles(baseFileName, true, ctx)
require.NoError(t, err)
defer closeEcFiles(ecFiles)
// shardDatSize = datFileSize / DataShards = 2 * largeBlockSize
// This is an EXACT multiple of largeBlockSize.
shardDatSize := datSize / int64(ctx.DataShards) // = 2 * large = 20000
// The encoder used 2 large block rows, 0 small block rows.
// Correct: nLargeBlockRows = 20000 / 10000 = 2
// Buggy: nLargeBlockRows = (20000 - 1) / 10000 = 1 ← OFF BY ONE
fixedRows := shardDatSize / large
buggyRows := (shardDatSize - 1) / large
assert.Equal(t, int64(2), fixedRows, "fixed formula should give 2 large block rows")
assert.Equal(t, int64(1), buggyRows, "buggy formula gives only 1 (the bug)")
// Test reading from the 2nd large block row (offsets 100000199999).
// With the buggy formula (nLargeBlockRows=1), this region is incorrectly
// treated as small blocks, causing reads from the WRONG shard positions.
readSize := types.Size(small / 2)
// Pick an offset well into the 2nd large block row so that the buggy formula
// computes a different (shard, offset) than the correct formula.
// At the very start of the 2nd row, both formulas coincidentally hit the same
// shard position. But further in, the small-block vs large-block addressing diverges.
offset := int64(large*DataShardsCount) + large + 50 // 110050: in 2nd large row, shard 1
// --- Fixed formula: reads correct data ---
fixedIntervals := LocateData(large, small, shardDatSize, offset, readSize)
fixedData, err := assembleFromIntervals(ecFiles, fixedIntervals, large, small)
require.NoError(t, err, "fixed LocateData read")
expected := originalData[offset : offset+int64(readSize)]
assert.True(t, bytes.Equal(expected, fixedData),
"FIXED formula should read correct data from 2nd large block row")
// --- Buggy formula: reads WRONG data ---
buggyIntervals := locateDataBuggy(large, small, shardDatSize, offset, readSize)
buggyData, err := assembleFromIntervals(ecFiles, buggyIntervals, large, small)
// The buggy formula might read from wrong offsets (possibly out of bounds),
// so an error is also evidence of the bug.
if err != nil {
t.Logf("Buggy formula caused read error (expected): %v", err)
} else {
assert.False(t, bytes.Equal(expected, buggyData),
"BUGGY formula should return WRONG data from 2nd large block row (demonstrating the corruption)")
n := 8
if len(expected) < n {
n = len(expected)
}
t.Logf("Buggy formula returned wrong data: expected first bytes %x, got %x",
expected[:n], buggyData[:n])
}
// Verify the bug mechanism: buggy formula misclassifies the 2nd large row as small blocks
assert.True(t, fixedIntervals[0].IsLargeBlock,
"fixed: offset %d should be in large blocks", offset)
assert.False(t, buggyIntervals[0].IsLargeBlock,
"buggy: offset %d is incorrectly classified as small blocks (the bug)", offset)
t.Logf("Fixed: nLargeBlockRows=%d, interval=%+v", fixedRows, fixedIntervals[0])
t.Logf("Buggy: nLargeBlockRows=%d, interval=%+v", buggyRows, buggyIntervals[0])
}
// TestEcDecodeDatRoundTrip tests the full WriteDatFile decode path using the production
// block sizes with a small .dat file that fits within the small block region.
func TestEcDecodeDatRoundTrip(t *testing.T) {
// With production sizes, datFileSize must be < DataShardsCount * ErasureCodingLargeBlockSize (10GB)
// to avoid needing huge test files. We test small block decode only.
// Each shard gets datSize/DataShards bytes in small 1MB blocks.
datSizes := []int64{
1000, // tiny
int64(DataShardsCount) * ErasureCodingSmallBlockSize, // exactly 1 small row (10MB)
int64(DataShardsCount)*ErasureCodingSmallBlockSize + 500, // 1 small row + partial
}
for _, datSize := range datSizes {
t.Run(fmt.Sprintf("size_%d", datSize), func(t *testing.T) {
testDecodeDat(t, datSize)
})
}
}
func testDecodeDat(t *testing.T, datSize int64) {
t.Helper()
dir := t.TempDir()
baseFileName := fmt.Sprintf("%s/dec_%d", dir, datSize)
// 1. Create .dat with random data
originalData := make([]byte, datSize)
_, err := rand.Read(originalData)
require.NoError(t, err)
err = os.WriteFile(baseFileName+".dat", originalData, 0644)
require.NoError(t, err)
ctx := NewDefaultECContext("", 0)
// 2. EC encode with PRODUCTION block sizes
err = generateEcFiles(baseFileName, 256*1024, ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, ctx)
require.NoError(t, err, "EC encoding")
// 3. Decode via WriteDatFile
decodedBase := baseFileName + "_decoded"
shardFileNames := make([]string, DataShardsCount)
for i := 0; i < DataShardsCount; i++ {
shardFileNames[i] = fmt.Sprintf("%s%s", baseFileName, ctx.ToExt(i))
}
err = WriteDatFile(decodedBase, datSize, shardFileNames)
require.NoError(t, err, "WriteDatFile")
// 4. Verify decoded .dat matches original
decodedData, err := os.ReadFile(decodedBase + ".dat")
require.NoError(t, err)
assert.Equal(t, len(originalData), len(decodedData), "decoded .dat size mismatch")
if !bytes.Equal(originalData, decodedData) {
for i := 0; i < len(originalData) && i < len(decodedData); i++ {
if originalData[i] != decodedData[i] {
t.Fatalf("decoded .dat mismatch at byte %d (datSize=%d)", i, datSize)
}
}
}
}
// collectTestOffsets generates offsets to test, focusing on the large/small block boundary.
func collectTestOffsets(datSize, readSize, boundaryOffset, large, small int64) []int64 {
offsets := []int64{0}
if datSize > readSize {
offsets = append(offsets, datSize/2)
}
// Near the large/small block boundary
if boundaryOffset > 0 && boundaryOffset < datSize {
for _, delta := range []int64{-large, -small, -1, 0, 1, small, large} {
off := boundaryOffset + delta
if off >= 0 && off+readSize <= datSize {
offsets = append(offsets, off)
}
}
}
// Near end of file
if datSize > readSize {
offsets = append(offsets, datSize-readSize)
}
return offsets
}
// assembleFromIntervals reads data from EC shard files according to the given intervals.
func assembleFromIntervals(ecFiles []*os.File, intervals []Interval, large, small int64) ([]byte, error) {
var data []byte
for _, interval := range intervals {
shardId, shardOffset := interval.ToShardIdAndOffset(large, small)
chunk := make([]byte, interval.Size)
n, err := ecFiles[shardId].ReadAt(chunk, shardOffset)
if err != nil {
return nil, fmt.Errorf("read shard %d offset %d size %d: %v", shardId, shardOffset, interval.Size, err)
}
if n != int(interval.Size) {
return nil, fmt.Errorf("short read from shard %d: got %d, want %d", shardId, n, interval.Size)
}
data = append(data, chunk...)
}
return data, nil
}
+22 -3
View File
@@ -217,7 +217,9 @@ func (this Interval) sameAs(that Interval) bool {
}
func TestLocateData2(t *testing.T) {
intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472, 21479557912, 4194339)
// Use ecdFileSize-1 to simulate the fallback path in LocateEcShardNeedleInterval
// when datFileSize is not available (old EC volumes without .vif datFileSize).
intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472-1, 21479557912, 4194339)
assert.Equal(t, intervals, []Interval{
{BlockIndex: 4, InnerBlockOffset: 527128, Size: 521448, IsLargeBlock: false, LargeBlockRowsCount: 2},
{BlockIndex: 5, InnerBlockOffset: 0, Size: 1048576, IsLargeBlock: false, LargeBlockRowsCount: 2},
@@ -228,7 +230,8 @@ func TestLocateData2(t *testing.T) {
}
func TestLocateData3(t *testing.T) {
intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472, 30782909808, 112568)
// Use ecdFileSize-1 to simulate the fallback path in LocateEcShardNeedleInterval
intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, 3221225472-1, 30782909808, 112568)
for _, interval := range intervals {
fmt.Printf("%+v\n", interval)
}
@@ -237,13 +240,29 @@ func TestLocateData3(t *testing.T) {
})
}
func TestLocateData_ExactMultiple_Issue8947(t *testing.T) {
// When datFileSize is available, shardDatSize = datFileSize / DataShards.
// For a 30GB volume with 10 data shards, shardDatSize = 3GB = 3 * ErasureCodingLargeBlockSize.
// The encoder produces 3 large block rows, 0 small block rows.
// nLargeBlockRows must be 3, not 2.
shardDatSize := int64(3) * ErasureCodingLargeBlockSize // 3GB per shard from datFileSize/DataShards
// Reading from the 3rd large block row (offsets 20GB-30GB) should work
offset := int64(2) * ErasureCodingLargeBlockSize * DataShardsCount // 20GB
intervals := LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, shardDatSize, offset, 1024)
assert.Equal(t, 1, len(intervals))
assert.True(t, intervals[0].IsLargeBlock, "data in 3rd large row should be in large blocks")
assert.Equal(t, 3, intervals[0].LargeBlockRowsCount)
assert.Equal(t, 20, intervals[0].BlockIndex) // block 20 = shard 0 of 3rd row
}
func TestLocateData_Issue8179(t *testing.T) {
large := int64(10000)
small := int64(100)
shardSize := int64(259092) // Resulting in nLargeBlockRows = 25 as seen in panic log
// Testing range through the large-to-small transition boundary
nLargeBlockRows := (shardSize - 1) / large
nLargeBlockRows := shardSize / large
largeAreaSize := nLargeBlockRows * int64(DataShardsCount) * large
for offset := largeAreaSize - 500; offset < largeAreaSize+500; offset++ {
+9 -6
View File
@@ -294,14 +294,17 @@ func (ev *EcVolume) LocateEcShardNeedle(needleId types.NeedleId, version needle.
func (ev *EcVolume) LocateEcShardNeedleInterval(version needle.Version, offset int64, size types.Size) (intervals []Interval) {
shard := ev.Shards[0]
// Usually shard will be padded to round of ErasureCodingSmallBlockSize.
// So in most cases, if shardSize equals to n * ErasureCodingLargeBlockSize,
// the data would be in small blocks.
shardSize := shard.ecdFileSize - 1
var shardSize int64
if ev.datFileSize > 0 {
// To get the correct LargeBlockRowsCount
// use datFileSize to calculate the shardSize to match the EC encoding logic.
// Use datFileSize to calculate the shardSize to match the EC encoding logic.
// This is the authoritative value stored in .vif during EC encoding.
shardSize = ev.datFileSize / int64(ev.ECContext.DataShards)
} else {
// Fallback for old EC volumes without datFileSize in .vif.
// Subtract 1 to handle the ambiguous case where ecdFileSize is an exact
// multiple of ErasureCodingLargeBlockSize but the data is actually in small
// blocks (e.g., datFileSize was just under DataShards*ErasureCodingLargeBlockSize).
shardSize = shard.ecdFileSize - 1
}
// calculate the locations in the ec shards
intervals = LocateData(ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, shardSize, offset, types.Size(needle.GetActualSize(size, version)))
-1
View File
@@ -55,7 +55,6 @@ func (n *Needle) ReadBytes(bytes []byte, offset int64, size Size, version Versio
if n.Size != size {
if OffsetSize == 4 && offset < int64(MaxPossibleVolumeSize) {
stats.VolumeServerHandlerCounter.WithLabelValues(stats.ErrorSizeMismatchOffsetSize).Inc()
glog.Errorf("entry not found: offset %d found id %x size %d, expected size %d", offset, n.Id, n.Size, size)
return ErrorSizeMismatch
}
stats.VolumeServerHandlerCounter.WithLabelValues(stats.ErrorSizeMismatch).Inc()
+2 -2
View File
@@ -167,9 +167,9 @@ func (s *Store) ReadEcShardNeedle(vid needle.VolumeId, n *needle.Needle, onReadS
return 0, ErrorDeleted
}
err = n.ReadBytes(bytes, offset.ToActualOffset(), size, localEcVolume.Version)
err = n.ReadBytes(bytes, 0, size, localEcVolume.Version)
if err != nil {
return 0, fmt.Errorf("readbytes: %w", err)
return 0, fmt.Errorf("ec volume %d needle %s offset %d size %d: %w", vid, n.String(), offset.ToActualOffset(), size, err)
}
return len(bytes), nil