mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
18cdb3819b
* fix(ec): make ecx-journal fold and shard rebuild crash-safe Two EC rebuild paths could silently lose or corrupt data: RebuildEcxFile folded the .ecj deletion journal into .ecx (in-place WriteAt tombstones) and then unlinked the journal without flushing the .ecx writes first. A crash could persist the unlink ahead of the tombstones, resurrecting deleted needles on the next load. It also read journal records with a bare n!=size break, so a torn tail silently dropped the remaining tombstones before the unlink. Now: read records with io.ReadFull (io.EOF ends cleanly, a torn tail aborts and leaves .ecj in place for retry), fsync .ecx before removing the journal. rebuildEcFiles treated a zero/short ReadAt as a clean end-of-input and discarded the read error, so a truncated or unreadable input shard produced truncated regenerated shards that were then published as restored redundancy; the regenerated shards were also never fsynced on the no-sidecar path. Now: derive the expected shard size from the present inputs up front (rejecting a divergent/zero-size input), drive the loop by that size, fail on any short read or short write, and fsync every regenerated shard before it is mounted/renamed. Rust volume server mirrors the rebuild fix: rebuild_ec_files now checks the read_at byte count (it previously discarded it, the same truncation bug). The Rust ecx fold already synced .ecx before removing the journal. Custom EC ratios are unaffected: the shard size derives from the input shards and the loop uses the .vif-resolved data/parity counts, never a hardcoded 10+4. * storage: close ecx journal files via defer in RebuildEcxFile Per review: a single deferred Close per file replaces the per-error-path manual closes, so new early returns cannot leak descriptors. The journal is still closed explicitly before its unlink since Windows cannot delete an open file; the deferred second Close is a harmless no-op.
618 lines
22 KiB
Go
618 lines
22 KiB
Go
package erasure_coding
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"github.com/klauspost/reedsolomon"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/idx"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
)
|
|
|
|
// Shard-count constants for the built-in 10+4 erasure-coding layout.
const (
	// DataShardsCount is the number of data shards in the built-in layout.
	DataShardsCount = 10
	// ParityShardsCount is the number of parity shards in the built-in layout.
	ParityShardsCount = 4
	// TotalShardsCount is the data plus parity shard count of the built-in layout.
	TotalShardsCount = DataShardsCount + ParityShardsCount
	// MaxShardCount is the maximum number of shards, since ShardBits is uint32 (bits 0-31).
	MaxShardCount = 32
	// MinTotalDisks is TotalShardsCount/ParityShardsCount + 1, using integer division.
	MinTotalDisks = TotalShardsCount/ParityShardsCount + 1
)

// Block sizes, in bytes, used when striping volume data across shards.
const (
	ErasureCodingLargeBlockSize = 1024 * 1024 * 1024 // 1GB
	ErasureCodingSmallBlockSize = 1024 * 1024        // 1MB
)
|
|
|
|
// WriteSortedFileFromIdx generates .ecx file from existing .idx file
|
|
// all keys are sorted in ascending order
|
|
func WriteSortedFileFromIdx(baseFileName string, ext string) (e error) {
|
|
|
|
nm, err := readNeedleMap(baseFileName)
|
|
if nm != nil {
|
|
defer nm.Close()
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("readNeedleMap: %w", err)
|
|
}
|
|
|
|
ecxFile, err := os.OpenFile(baseFileName+ext, os.O_TRUNC|os.O_CREATE|os.O_WRONLY, 0644)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open ecx file: %w", err)
|
|
}
|
|
defer ecxFile.Close()
|
|
|
|
err = nm.AscendingVisit(func(value needle_map.NeedleValue) error {
|
|
bytes := value.ToBytes()
|
|
_, writeErr := ecxFile.Write(bytes)
|
|
return writeErr
|
|
})
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("failed to visit idx file: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// WriteEcFiles generates .ec00 ~ .ec13 files from baseFileName.dat. Pass
|
|
// BackgroundECContext for the default ratio, or an explicit ctx for a configured
|
|
// (e.g. custom-ratio) layout. It returns the bitrot protection (per-shard block
|
|
// CRC32C) computed during the single encode pass; the caller persists it as a
|
|
// <base>.ecsum sidecar.
|
|
func WriteEcFiles(baseFileName string, ctx *ECContext) (*volume_server_pb.EcBitrotProtection, error) {
|
|
if ctx == nil || ctx.Total() == 0 {
|
|
ctx = NewDefaultECContext("", 0)
|
|
}
|
|
return generateEcFiles(baseFileName, 256*1024, ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, ctx)
|
|
}
|
|
|
|
// RebuildEcFiles rebuilds missing EC shard files. Pass BackgroundECContext to
|
|
// resolve the layout from the volume's .vif (falling back to the default ratio),
|
|
// or an explicit ctx when the caller already knows the shard layout.
|
|
// additionalDirs are extra directories to search for existing shard files,
|
|
// which handles multi-disk servers where shards may be spread across disks.
|
|
// When a bitrot checksum sidecar is present for the (generation-0) volume,
|
|
// present input shards are verified against it and corrupt ones are excluded
|
|
// from Reed-Solomon and regenerated; unsafeIgnoreSidecar bypasses that guard.
|
|
func RebuildEcFiles(baseFileName string, ctx *ECContext, unsafeIgnoreSidecar bool, additionalDirs ...string) ([]uint32, error) {
|
|
if ctx == nil || ctx.Total() == 0 {
|
|
// Resolve the layout from the .vif to preserve the original configuration.
|
|
volumeInfo, _, foundVif, vifErr := volume_info.MaybeLoadVolumeInfo(baseFileName + ".vif")
|
|
if vifErr != nil {
|
|
// The .vif exists but cannot be read or parsed. Fail closed rather
|
|
// than silently falling back to the default ratio, which would
|
|
// rebuild a custom-ratio volume with the wrong layout. Pass an
|
|
// explicit ctx to override.
|
|
return nil, fmt.Errorf("RebuildEcFiles %s: cannot load .vif: %w", baseFileName, vifErr)
|
|
}
|
|
if foundVif && volumeInfo.EcShardConfig != nil {
|
|
ds := int(volumeInfo.EcShardConfig.DataShards)
|
|
ps := int(volumeInfo.EcShardConfig.ParityShards)
|
|
|
|
// Validate EC config before using it
|
|
if ds > 0 && ps > 0 && ds+ps <= MaxShardCount {
|
|
ctx = &ECContext{
|
|
DataShards: ds,
|
|
ParityShards: ps,
|
|
}
|
|
glog.V(0).Infof("Rebuilding EC files for %s with config from .vif: %s", baseFileName, ctx.String())
|
|
} else {
|
|
glog.Warningf("Invalid EC config in .vif for %s (data=%d, parity=%d), using default", baseFileName, ds, ps)
|
|
ctx = NewDefaultECContext("", 0)
|
|
}
|
|
} else {
|
|
glog.V(0).Infof("Rebuilding EC files for %s with default config", baseFileName)
|
|
ctx = NewDefaultECContext("", 0)
|
|
}
|
|
}
|
|
|
|
return generateMissingEcFiles(baseFileName, 256*1024, ErasureCodingLargeBlockSize, ErasureCodingSmallBlockSize, ctx, unsafeIgnoreSidecar, additionalDirs)
|
|
}
|
|
|
|
// ToExt returns the shard file extension for ecIndex: ".ec" followed by the
// index zero-padded to at least two digits (e.g. 3 -> ".ec03").
func ToExt(ecIndex int) string {
	return ".ec" + fmt.Sprintf("%02d", ecIndex)
}
|
|
|
|
func generateEcFiles(baseFileName string, bufferSize int, largeBlockSize int64, smallBlockSize int64, ctx *ECContext) (*volume_server_pb.EcBitrotProtection, error) {
|
|
file, err := os.OpenFile(baseFileName+".dat", os.O_RDONLY, 0)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open dat file: %w", err)
|
|
}
|
|
defer file.Close()
|
|
|
|
fi, err := file.Stat()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to stat dat file: %w", err)
|
|
}
|
|
|
|
// One rolling-CRC builder per shard; fed as each shard's bytes are written.
|
|
builders := make([]*shardChecksumBuilder, ctx.Total())
|
|
for i := range builders {
|
|
builders[i] = newShardChecksumBuilder(BitrotBlockSize)
|
|
}
|
|
|
|
glog.V(0).Infof("encodeDatFile %s.dat size:%d with EC context %s", baseFileName, fi.Size(), ctx.String())
|
|
err = encodeDatFile(fi.Size(), baseFileName, bufferSize, largeBlockSize, file, smallBlockSize, ctx, builders)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("encodeDatFile: %w", err)
|
|
}
|
|
return buildProtectionFromBuilders(ctx, builders, BitrotBlockSize), nil
|
|
}
|
|
|
|
// findShardFile looks for a shard file at baseFileName+ext, then in additionalDirs.
|
|
func findShardFile(baseFileName string, ext string, additionalDirs []string) string {
|
|
primary := baseFileName + ext
|
|
if util.FileExists(primary) {
|
|
return primary
|
|
}
|
|
baseName := filepath.Base(baseFileName)
|
|
for _, dir := range additionalDirs {
|
|
candidate := filepath.Join(dir, baseName+ext)
|
|
if util.FileExists(candidate) {
|
|
return candidate
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func generateMissingEcFiles(baseFileName string, bufferSize int, largeBlockSize int64, smallBlockSize int64, ctx *ECContext, unsafeIgnoreSidecar bool, additionalDirs []string) (generatedShardIds []uint32, err error) {
|
|
|
|
// Pass 1: discover which shards exist and which are missing,
|
|
// opening input files but NOT creating output files yet.
|
|
shardHasData := make([]bool, ctx.Total())
|
|
shardPaths := make([]string, ctx.Total()) // non-empty for present shards (also the in-place output for a reclassified-corrupt shard)
|
|
inputFiles := make([]*os.File, ctx.Total())
|
|
presentCount := 0
|
|
for shardId := 0; shardId < ctx.Total(); shardId++ {
|
|
ext := ctx.ToExt(shardId)
|
|
shardPath := findShardFile(baseFileName, ext, additionalDirs)
|
|
if shardPath != "" {
|
|
shardHasData[shardId] = true
|
|
shardPaths[shardId] = shardPath
|
|
inputFiles[shardId], err = os.OpenFile(shardPath, os.O_RDONLY, 0)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer inputFiles[shardId].Close()
|
|
presentCount++
|
|
} else {
|
|
generatedShardIds = append(generatedShardIds, uint32(shardId))
|
|
}
|
|
}
|
|
|
|
// Bitrot verify-and-exclude: when a generation-0 checksum sidecar is present
|
|
// and valid, verify each present input shard against it and reclassify
|
|
// corrupt ones as missing so Reed-Solomon regenerates them instead of
|
|
// silently consuming corrupt bytes. corruptOwned marks shards whose
|
|
// (corrupt) original file must be replaced in place at its discovered path.
|
|
corruptOwned := make([]bool, ctx.Total())
|
|
prot, status := loadRebuildSidecar(baseFileName, ctx, additionalDirs)
|
|
switch status {
|
|
case BitrotInvalid:
|
|
if !unsafeIgnoreSidecar {
|
|
return nil, fmt.Errorf("bitrot sidecar for %s is malformed/unverifiable; refusing to rebuild (pass unsafeIgnoreSidecar to override)", baseFileName)
|
|
}
|
|
glog.Warningf("bitrot sidecar for %s is malformed/unverifiable; proceeding because unsafeIgnoreSidecar is set", baseFileName)
|
|
case BitrotOn:
|
|
corrupt := make([]int, 0, ctx.Total())
|
|
for shardId := 0; shardId < ctx.Total(); shardId++ {
|
|
if !shardHasData[shardId] {
|
|
continue
|
|
}
|
|
entry := shardChecksums(prot, uint32(shardId))
|
|
if entry == nil {
|
|
continue
|
|
}
|
|
mismatched, verr := verifyShardFileBlocks(shardPaths[shardId], entry, int64(prot.BlockSize))
|
|
if verr != nil {
|
|
// A read error means we cannot trust this shard as a Reed-Solomon
|
|
// input. Exclude it (treat as corrupt) rather than silently
|
|
// feeding possibly-corrupt bytes into reconstruction.
|
|
glog.Warningf("bitrot: failed to verify present shard %d for %s: %v; excluding it", shardId, baseFileName, verr)
|
|
corrupt = append(corrupt, shardId)
|
|
continue
|
|
}
|
|
if len(mismatched) > 0 {
|
|
corrupt = append(corrupt, shardId)
|
|
}
|
|
}
|
|
if len(corrupt) > 0 {
|
|
// Wholesale-mismatch guard (RS-arbiter conservative form): localized
|
|
// bitrot touches a few shards; a stale/wrong sidecar mismatches more
|
|
// than parity_shards. In that case refuse rather than excluding good
|
|
// shards en masse.
|
|
if len(corrupt) > ctx.ParityShards && !unsafeIgnoreSidecar {
|
|
return nil, fmt.Errorf("bitrot sidecar suspect for %s: %d/%d present shards mismatch (> parity %d); refusing to rebuild (pass unsafeIgnoreSidecar to override)",
|
|
baseFileName, len(corrupt), presentCount, ctx.ParityShards)
|
|
}
|
|
if presentCount-len(corrupt) < ctx.DataShards && !unsafeIgnoreSidecar {
|
|
return nil, fmt.Errorf("bitrot: only %d verified-good shards for %s, need %d data shards; sidecar may be stale (pass unsafeIgnoreSidecar to override)",
|
|
presentCount-len(corrupt), baseFileName, ctx.DataShards)
|
|
}
|
|
if !unsafeIgnoreSidecar {
|
|
for _, shardId := range corrupt {
|
|
glog.Warningf("bitrot: present shard %d for %s fails checksum; excluding from rebuild inputs and regenerating", shardId, baseFileName)
|
|
shardHasData[shardId] = false
|
|
corruptOwned[shardId] = true
|
|
generatedShardIds = append(generatedShardIds, uint32(shardId))
|
|
presentCount--
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pre-check: bail out before creating any output files.
|
|
if presentCount < ctx.DataShards {
|
|
return nil, fmt.Errorf("not enough shards to rebuild %s: found %d shards, need at least %d (data shards), missing shards: %v",
|
|
baseFileName, presentCount, ctx.DataShards, generatedShardIds)
|
|
}
|
|
|
|
glog.V(0).Infof("rebuilding %s: %d shards present, %d missing %v, config %s",
|
|
baseFileName, presentCount, len(generatedShardIds), generatedShardIds, ctx.String())
|
|
|
|
// Pass 2: create output files for missing shards. A genuinely-absent shard
|
|
// is written at baseFileName+ext; a reclassified-corrupt shard is written to
|
|
// a temp file beside its discovered location and atomically renamed over the
|
|
// corrupt original after the rebuild (and checksum) succeed, so we never
|
|
// leave a duplicate shard id or a half-written file.
|
|
outputFiles := make([]*os.File, ctx.Total())
|
|
writePaths := make([]string, ctx.Total())
|
|
finalPaths := make([]string, ctx.Total())
|
|
for shardId := 0; shardId < ctx.Total(); shardId++ {
|
|
if shardHasData[shardId] {
|
|
continue
|
|
}
|
|
finalPath := baseFileName + ctx.ToExt(shardId)
|
|
writePath := finalPath
|
|
if corruptOwned[shardId] && shardPaths[shardId] != "" {
|
|
finalPath = shardPaths[shardId]
|
|
writePath = shardPaths[shardId] + ".rebuilding"
|
|
}
|
|
outputFiles[shardId], err = os.OpenFile(writePath, os.O_TRUNC|os.O_WRONLY|os.O_CREATE, 0644)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer outputFiles[shardId].Close()
|
|
writePaths[shardId] = writePath
|
|
finalPaths[shardId] = finalPath
|
|
}
|
|
|
|
if err = rebuildEcFiles(shardHasData, inputFiles, outputFiles, ctx); err != nil {
|
|
return nil, fmt.Errorf("rebuildEcFiles: %w", err)
|
|
}
|
|
|
|
// Verify regenerated shards against the sidecar. Reed-Solomon is
|
|
// deterministic, so a regenerated shard that does NOT match the sidecar
|
|
// means the sidecar is wrong/stale (not the shard) — fail closed rather than
|
|
// publishing bytes we cannot trust. On ANY verification failure (sync, read
|
|
// error, or mismatch) remove every generated output so the rebuild publishes
|
|
// nothing: a genuinely-missing shard returns to missing; a reclassified-
|
|
// corrupt shard keeps its untouched original.
|
|
if status == BitrotOn && !unsafeIgnoreSidecar {
|
|
for shardId := 0; shardId < ctx.Total(); shardId++ {
|
|
if writePaths[shardId] == "" {
|
|
continue
|
|
}
|
|
entry := shardChecksums(prot, uint32(shardId))
|
|
if entry == nil {
|
|
continue
|
|
}
|
|
if err = outputFiles[shardId].Sync(); err != nil {
|
|
cleanupRebuildOutputs(outputFiles, writePaths)
|
|
return nil, fmt.Errorf("sync regenerated shard %d: %w", shardId, err)
|
|
}
|
|
mismatched, verr := verifyShardFileBlocks(writePaths[shardId], entry, int64(prot.BlockSize))
|
|
if verr != nil {
|
|
cleanupRebuildOutputs(outputFiles, writePaths)
|
|
return nil, fmt.Errorf("bitrot: verify regenerated shard %d for %s: %w", shardId, baseFileName, verr)
|
|
}
|
|
if len(mismatched) > 0 {
|
|
cleanupRebuildOutputs(outputFiles, writePaths)
|
|
return nil, fmt.Errorf("bitrot: regenerated shard %d for %s does not match sidecar (%d blocks differ); sidecar likely stale — aborting (pass unsafeIgnoreSidecar to override)",
|
|
shardId, baseFileName, len(mismatched))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Atomically move reclassified-corrupt rebuilds over their originals.
|
|
for shardId := 0; shardId < ctx.Total(); shardId++ {
|
|
if writePaths[shardId] != "" && writePaths[shardId] != finalPaths[shardId] {
|
|
outputFiles[shardId].Close()
|
|
if rerr := os.Rename(writePaths[shardId], finalPaths[shardId]); rerr != nil {
|
|
return nil, fmt.Errorf("bitrot: replace corrupt shard %d (%s -> %s): %w", shardId, writePaths[shardId], finalPaths[shardId], rerr)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// cleanupRebuildOutputs discards everything a failed fail-closed rebuild wrote,
// so that no unverified bytes are published. For every non-empty entry of
// writePaths it closes the matching output file (when one is open) and removes
// the file at that path.
//
// This covers both temp replacements and genuinely-missing shards written
// directly at their final path (writePath == finalPath), which thereby return
// to missing. The untouched original of a reclassified-corrupt shard lives at
// its finalPath, distinct from the temp writePath, and is therefore left in
// place. Close and Remove errors are ignored.
func cleanupRebuildOutputs(outputFiles []*os.File, writePaths []string) {
	for shardId, path := range writePaths {
		if path != "" {
			if out := outputFiles[shardId]; out != nil {
				out.Close()
			}
			os.Remove(path)
		}
	}
}
|
|
|
|
// loadRebuildSidecar loads and validates the generation-0 checksum sidecar for a
|
|
// rebuild. RebuildEcFiles operates on the un-suffixed (generation 0) shard
|
|
// names, so only the legacy sidecar is relevant here. Returns BitrotOff when
|
|
// absent or describing a different generation/config, BitrotInvalid on a
|
|
// self-integrity/manifest failure, BitrotOn when usable.
|
|
func loadRebuildSidecar(baseFileName string, ctx *ECContext, additionalDirs []string) (*volume_server_pb.EcBitrotProtection, BitrotStatus) {
|
|
path := findBitrotSidecar(0, baseFileName, baseFileName, additionalDirs...)
|
|
if path == "" {
|
|
return nil, BitrotOff
|
|
}
|
|
prot, err := LoadBitrotSidecar(path)
|
|
if err != nil {
|
|
glog.Warningf("bitrot: sidecar %s self-integrity failed: %v", path, err)
|
|
return nil, BitrotInvalid
|
|
}
|
|
if prot.Generation != 0 {
|
|
return nil, BitrotOff
|
|
}
|
|
if prot.EcShardConfig == nil ||
|
|
int(prot.EcShardConfig.DataShards) != ctx.DataShards ||
|
|
int(prot.EcShardConfig.ParityShards) != ctx.ParityShards {
|
|
return nil, BitrotOff
|
|
}
|
|
if err := ValidateBitrotManifest(prot, ctx.DataShards, ctx.ParityShards); err != nil {
|
|
glog.Warningf("bitrot: sidecar %s manifest invalid: %v", path, err)
|
|
return nil, BitrotInvalid
|
|
}
|
|
return prot, BitrotOn
|
|
}
|
|
|
|
func encodeData(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize int64, buffers [][]byte, outputs []*os.File, ctx *ECContext, builders []*shardChecksumBuilder) error {
|
|
|
|
bufferSize := int64(len(buffers[0]))
|
|
if bufferSize == 0 {
|
|
glog.Fatal("unexpected zero buffer size")
|
|
}
|
|
|
|
batchCount := blockSize / bufferSize
|
|
if blockSize%bufferSize != 0 {
|
|
glog.Fatalf("unexpected block size %d buffer size %d", blockSize, bufferSize)
|
|
}
|
|
|
|
for b := int64(0); b < batchCount; b++ {
|
|
err := encodeDataOneBatch(file, enc, startOffset+b*bufferSize, blockSize, buffers, outputs, ctx, builders)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func openEcFiles(baseFileName string, forRead bool, ctx *ECContext) (files []*os.File, err error) {
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
fname := baseFileName + ctx.ToExt(i)
|
|
openOption := os.O_TRUNC | os.O_CREATE | os.O_WRONLY
|
|
if forRead {
|
|
openOption = os.O_RDONLY
|
|
}
|
|
f, err := os.OpenFile(fname, openOption, 0644)
|
|
if err != nil {
|
|
return files, fmt.Errorf("failed to open file %s: %v", fname, err)
|
|
}
|
|
files = append(files, f)
|
|
}
|
|
return
|
|
}
|
|
|
|
// closeEcFiles closes every non-nil file in files. Close errors are ignored.
func closeEcFiles(files []*os.File) {
	for i := range files {
		if files[i] == nil {
			continue
		}
		files[i].Close()
	}
}
|
|
|
|
func encodeDataOneBatch(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize int64, buffers [][]byte, outputs []*os.File, ctx *ECContext, builders []*shardChecksumBuilder) error {
|
|
|
|
// read data into buffers
|
|
for i := 0; i < ctx.DataShards; i++ {
|
|
n, err := file.ReadAt(buffers[i], startOffset+blockSize*int64(i))
|
|
if err != nil {
|
|
if err != io.EOF {
|
|
return err
|
|
}
|
|
}
|
|
if n < len(buffers[i]) {
|
|
for t := len(buffers[i]) - 1; t >= n; t-- {
|
|
buffers[i][t] = 0
|
|
}
|
|
}
|
|
}
|
|
|
|
err := enc.Encode(buffers)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
_, err := outputs[i].Write(buffers[i])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Accumulate this shard's block CRC over exactly the bytes written.
|
|
if builders != nil && builders[i] != nil {
|
|
builders[i].write(buffers[i])
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func encodeDatFile(remainingSize int64, baseFileName string, bufferSize int, largeBlockSize int64, file *os.File, smallBlockSize int64, ctx *ECContext, builders []*shardChecksumBuilder) error {
|
|
|
|
var processedSize int64
|
|
|
|
enc, err := ctx.CreateEncoder()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create encoder: %w", err)
|
|
}
|
|
|
|
buffers := make([][]byte, ctx.Total())
|
|
for i := range buffers {
|
|
buffers[i] = make([]byte, bufferSize)
|
|
}
|
|
|
|
outputs, err := openEcFiles(baseFileName, false, ctx)
|
|
defer closeEcFiles(outputs)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open ec files %s: %v", baseFileName, err)
|
|
}
|
|
|
|
// Pre-calculate row sizes to avoid redundant calculations in loops
|
|
largeRowSize := largeBlockSize * int64(ctx.DataShards)
|
|
smallRowSize := smallBlockSize * int64(ctx.DataShards)
|
|
|
|
for remainingSize >= largeRowSize {
|
|
err = encodeData(file, enc, processedSize, largeBlockSize, buffers, outputs, ctx, builders)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to encode large chunk data: %w", err)
|
|
}
|
|
remainingSize -= largeRowSize
|
|
processedSize += largeRowSize
|
|
}
|
|
for remainingSize > 0 {
|
|
err = encodeData(file, enc, processedSize, smallBlockSize, buffers, outputs, ctx, builders)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to encode small chunk data: %w", err)
|
|
}
|
|
remainingSize -= smallRowSize
|
|
processedSize += smallRowSize
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func rebuildEcFiles(shardHasData []bool, inputFiles []*os.File, outputFiles []*os.File, ctx *ECContext) error {
|
|
|
|
enc, err := ctx.CreateEncoder()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create encoder: %w", err)
|
|
}
|
|
|
|
// The output shard size equals the present input shards' size (all EC
|
|
// shards are equal length). Deriving it up front turns a short read on a
|
|
// truncated/corrupt input into an error instead of a silent early return
|
|
// that would publish truncated shards as restored redundancy.
|
|
var expectedShardSize int64 = -1
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
if !shardHasData[i] {
|
|
continue
|
|
}
|
|
fi, statErr := inputFiles[i].Stat()
|
|
if statErr != nil {
|
|
return fmt.Errorf("stat input shard %d: %w", i, statErr)
|
|
}
|
|
if expectedShardSize < 0 {
|
|
expectedShardSize = fi.Size()
|
|
} else if fi.Size() != expectedShardSize {
|
|
return fmt.Errorf("ec rebuild: input shard %d size %d != %d (truncated input?)", i, fi.Size(), expectedShardSize)
|
|
}
|
|
}
|
|
if expectedShardSize <= 0 {
|
|
return fmt.Errorf("ec rebuild: no input shard data (expected shard size %d)", expectedShardSize)
|
|
}
|
|
|
|
buffers := make([][]byte, ctx.Total())
|
|
for i := range buffers {
|
|
if shardHasData[i] {
|
|
buffers[i] = make([]byte, ErasureCodingSmallBlockSize)
|
|
}
|
|
}
|
|
|
|
for startOffset := int64(0); startOffset < expectedShardSize; {
|
|
thisBlock := int64(ErasureCodingSmallBlockSize)
|
|
if remaining := expectedShardSize - startOffset; remaining < thisBlock {
|
|
thisBlock = remaining
|
|
}
|
|
|
|
// read the input data; a short read means a truncated input shard.
|
|
shards := make([][]byte, ctx.Total())
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
if !shardHasData[i] {
|
|
continue // nil: reconstructed below
|
|
}
|
|
b := buffers[i][:thisBlock]
|
|
n, readErr := inputFiles[i].ReadAt(b, startOffset)
|
|
if readErr != nil && readErr != io.EOF {
|
|
return fmt.Errorf("ec rebuild read shard %d at %d: %w", i, startOffset, readErr)
|
|
}
|
|
if int64(n) != thisBlock {
|
|
return fmt.Errorf("ec rebuild short read shard %d at %d: got %d want %d", i, startOffset, n, thisBlock)
|
|
}
|
|
shards[i] = b
|
|
}
|
|
|
|
if err = enc.Reconstruct(shards); err != nil {
|
|
return fmt.Errorf("reconstruct: %w", err)
|
|
}
|
|
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
if shardHasData[i] {
|
|
continue
|
|
}
|
|
n, writeErr := outputFiles[i].WriteAt(shards[i][:thisBlock], startOffset)
|
|
if writeErr != nil {
|
|
return fmt.Errorf("ec rebuild write shard %d at %d: %w", i, startOffset, writeErr)
|
|
}
|
|
if int64(n) != thisBlock {
|
|
return fmt.Errorf("ec rebuild short write shard %d at %d: got %d want %d", i, startOffset, n, thisBlock)
|
|
}
|
|
}
|
|
startOffset += thisBlock
|
|
}
|
|
|
|
// Flush every regenerated shard before it is mounted/renamed and published
|
|
// as restored redundancy, so a crash cannot leave a peer trusting a shard
|
|
// whose bytes never reached disk.
|
|
for i := 0; i < ctx.Total(); i++ {
|
|
if shardHasData[i] {
|
|
continue
|
|
}
|
|
if err = outputFiles[i].Sync(); err != nil {
|
|
return fmt.Errorf("ec rebuild sync shard %d: %w", i, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func readNeedleMap(baseFileName string) (*needle_map.MemDb, error) {
|
|
indexFile, err := os.OpenFile(baseFileName+".idx", os.O_RDONLY, 0644)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("cannot read Volume Index %s.idx: %v", baseFileName, err)
|
|
}
|
|
defer indexFile.Close()
|
|
|
|
cm := needle_map.NewMemDb()
|
|
err = idx.WalkIndexFile(indexFile, 0, func(key types.NeedleId, offset types.Offset, size types.Size) error {
|
|
if !offset.IsZero() && !size.IsDeleted() {
|
|
cm.Set(key, offset, size)
|
|
} else {
|
|
cm.Delete(key)
|
|
}
|
|
return nil
|
|
})
|
|
return cm, err
|
|
}
|