mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
fix(seaweed-volume): stop EC shard deletion from phantom .dat on restart (#9874)
* fix(seaweed-volume): stop EC shard deletion from phantom .dat on restart On startup load_existing_volumes() scans .vif/.idx entries (not just .dat). For distributed EC, a volume's .vif can be mirrored onto a disk whose .ecx lives on a sibling disk, so the per-disk ecx check is false and the loader falls through to Volume::new, which always creates the .dat if missing -> a phantom 8-byte superblock stub. The store-level prune_incomplete_ec_with_sibling_dat then treats that stub as the authoritative source and deletes the real EC shards on sibling disks. Go guards the same case (disk_location.go: 'Without this guard NewVolume below would create a phantom empty .dat') but only same-disk. Fix A (root cause): in load_existing_volumes, don't create a .dat during load. Skip the entry when there is no local .dat AND the .vif does not reference remote files -- remote-tiered volumes have no local .dat but must still load via the remote path. Uses the robust check_dat_file_exists helper so a transient stat error doesn't skip a real volume. New volumes go through create_volume(). Covers the cross-disk .vif/.ecx split Go's same-disk hasEcxFile() misses. Fix B (defense in depth, Go + Rust): when the EC .vif records no source size (dat_file_size==0), require the sibling .dat to be strictly larger than a bare superblock, so an empty 8-byte stub can't pass the credibility gate. Previously it fell back to SUPER_BLOCK_SIZE, which an 8-byte stub exactly meets. Adds regression tests reproducing the cross-disk lone-.vif phantom and the 8-byte stub gate; updates an existing prune test to use a real collection so its .ecx lookup matches the loaders. * fix(storage): don't create phantom .dat from lone .vif on Go volume load Mirror Fix A on the Go side. loadExistingVolume scans .vif/.idx entries, and for distributed EC a .vif can be mirrored onto a disk whose .ecx is on a sibling disk. The same-disk hasEcxFile() guard does not fire there, so the loader falls through to NewVolume(createDatIfMissing=true) and writes an 8-byte phantom .dat, which the sibling-.dat prune then uses to delete the real EC shards on sibling disks. Skip the entry when there is no local .dat AND the .vif has no remote file (via MaybeLoadVolumeInfo); remote-tiered volumes have no local .dat but must still load. Adds TestLoneVifDoesNotCreatePhantomDat (fails without the guard) and TestRemoteTier_DiskScanLoadsRemoteOnlyVolume (fails if the guard skips a remote-only volume).
This commit is contained in:
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
|
||||
"github.com/seaweedfs/seaweedfs/weed/util"
|
||||
)
|
||||
|
||||
@@ -224,6 +225,23 @@ func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind Ne
|
||||
return true
|
||||
}
|
||||
|
||||
// Load existing data only; never let NewVolume create a phantom .dat. A
|
||||
// lone .vif/.idx (e.g. an EC sidecar whose .ecx is on a sibling disk,
|
||||
// which the same-disk hasEcxFile() guard misses) would otherwise get an
|
||||
// 8-byte stub that the sibling-.dat prune deletes real shards against.
|
||||
// Remote-tiered volumes also have no local .dat, but their .vif points at
|
||||
// remote files and must still load via the remote path.
|
||||
if !util.FileExists(l.Directory + "/" + volumeName + ".dat") {
|
||||
_, hasRemote, _, _ := volume_info.MaybeLoadVolumeInfo(l.Directory + "/" + volumeName + ".vif")
|
||||
if !hasRemote && l.IdxDirectory != l.Directory {
|
||||
_, hasRemote, _, _ = volume_info.MaybeLoadVolumeInfo(l.IdxDirectory + "/" + volumeName + ".vif")
|
||||
}
|
||||
if !hasRemote {
|
||||
glog.V(1).Infof("loadExistingVolume: skipping volume %d (collection=%q); no .dat and no remote file", vid, collection)
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// load the volume
|
||||
v, e := NewVolume(l.Directory, l.IdxDirectory, collection, vid, needleMapKind, nil, nil, 0, needle.GetCurrentVersion(), 0, ldbTimeout)
|
||||
if e != nil {
|
||||
|
||||
@@ -16,6 +16,8 @@ import (
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
||||
"github.com/seaweedfs/seaweedfs/weed/util"
|
||||
)
|
||||
|
||||
// In-process integration tests for cloud-tiered ("remote") volumes.
|
||||
@@ -223,6 +225,40 @@ func reloadVolume(t *testing.T, dir string, vid needle.VolumeId) *Volume {
|
||||
return v
|
||||
}
|
||||
|
||||
// TestRemoteTier_DiskScanLoadsRemoteOnlyVolume locks in that the disk scan
|
||||
// (loadExistingVolume) loads a remote-only volume — a .vif pointing at remote
|
||||
// files with no local .dat — instead of skipping it as a lone sidecar. The
|
||||
// phantom-.dat guard must let remote volumes through.
|
||||
func TestRemoteTier_DiskScanLoadsRemoteOnlyVolume(t *testing.T) {
|
||||
b := newLocalDirBackend(t)
|
||||
registerTestBackend(t, b)
|
||||
|
||||
dir := t.TempDir()
|
||||
const vid = needle.VolumeId(44)
|
||||
tierUpVolume(t, dir, vid, b) // leaves .vif (remote) + .idx, no .dat
|
||||
|
||||
require.False(t, util.FileExists(filepath.Join(dir, "44.dat")), "tier-up should have removed the local .dat")
|
||||
|
||||
loc := &DiskLocation{
|
||||
Directory: dir,
|
||||
DirectoryUuid: "test-uuid",
|
||||
IdxDirectory: dir,
|
||||
DiskType: types.HddType,
|
||||
MaxVolumeCount: 100,
|
||||
OriginalMaxVolumeCount: 100,
|
||||
MinFreeSpace: util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"},
|
||||
}
|
||||
loc.volumes = make(map[needle.VolumeId]*Volume)
|
||||
loc.ecVolumes = make(map[needle.VolumeId]*erasure_coding.EcVolume)
|
||||
|
||||
loc.loadExistingVolumes(NeedleMapInMemory, 0)
|
||||
|
||||
v, ok := loc.volumes[vid]
|
||||
require.True(t, ok, "remote-only volume must be loaded by the disk scan, not skipped by the phantom-.dat guard")
|
||||
require.True(t, v.HasRemoteFile(), "loaded volume should be in remote mode")
|
||||
v.Close()
|
||||
}
|
||||
|
||||
// TestRemoteTier_Move_KeepsRemoteObject simulates the move-on-source-after-copy
|
||||
// step of a balance: Destroy(onlyEmpty=false, keepRemoteData=true). The remote
|
||||
// object must survive — the destination's freshly-copied .vif points at it.
|
||||
|
||||
@@ -0,0 +1,124 @@
|
||||
package storage
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/stats"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/types"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
|
||||
"github.com/seaweedfs/seaweedfs/weed/util"
|
||||
)
|
||||
|
||||
// TestLoneVifDoesNotCreatePhantomDat: a lone .vif whose .ecx is on a
|
||||
// sibling disk must not make loadExistingVolume create a phantom 8-byte
|
||||
// .dat, which the sibling-.dat prune would then use to delete the real EC shards
|
||||
// on the sibling. The same-disk hasEcxFile() guard misses this split.
|
||||
func TestLoneVifDoesNotCreatePhantomDat(t *testing.T) {
|
||||
tempDir := t.TempDir()
|
||||
dir0 := filepath.Join(tempDir, "data1")
|
||||
dir1 := filepath.Join(tempDir, "data2")
|
||||
for _, d := range []string{dir0, dir1} {
|
||||
if err := os.MkdirAll(d, 0o755); err != nil {
|
||||
t.Fatalf("mkdir %s: %v", d, err)
|
||||
}
|
||||
}
|
||||
|
||||
collection := "warp-loadtest"
|
||||
vid := needle.VolumeId(57)
|
||||
const dataShards, parityShards = 10, 4
|
||||
const datSize int64 = 10 * 1024 * 1024
|
||||
expectedShardSize := calculateExpectedShardSize(datSize, dataShards)
|
||||
|
||||
// dir0: a self-contained but partial (2 < 10) EC volume.
|
||||
base0 := erasure_coding.EcShardFileName(collection, dir0, int(vid))
|
||||
for _, sid := range []int{2, 4} {
|
||||
f, err := os.Create(base0 + erasure_coding.ToExt(sid))
|
||||
if err != nil {
|
||||
t.Fatalf("create shard %d: %v", sid, err)
|
||||
}
|
||||
if err := f.Truncate(expectedShardSize); err != nil {
|
||||
f.Close()
|
||||
t.Fatalf("truncate shard %d: %v", sid, err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
t.Fatalf("close shard %d: %v", sid, err)
|
||||
}
|
||||
}
|
||||
if err := os.WriteFile(base0+".ecx", make([]byte, 20), 0o644); err != nil {
|
||||
t.Fatalf("write .ecx: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(base0+".ecj", nil, 0o644); err != nil {
|
||||
t.Fatalf("write .ecj: %v", err)
|
||||
}
|
||||
// DatFileSize 0 mirrors the production .vif that triggered the bug and
|
||||
// makes the prune's credibility gate fall back to the superblock size.
|
||||
vif := &volume_server_pb.VolumeInfo{
|
||||
Version: uint32(needle.Version3),
|
||||
EcShardConfig: &volume_server_pb.EcShardConfig{
|
||||
DataShards: dataShards,
|
||||
ParityShards: parityShards,
|
||||
},
|
||||
}
|
||||
if err := volume_info.SaveVolumeInfo(base0+".vif", vif); err != nil {
|
||||
t.Fatalf("save .vif dir0: %v", err)
|
||||
}
|
||||
|
||||
// dir1: ONLY the mirrored .vif — no .ecx, no shard, no .dat.
|
||||
base1 := erasure_coding.EcShardFileName(collection, dir1, int(vid))
|
||||
if err := volume_info.SaveVolumeInfo(base1+".vif", vif); err != nil {
|
||||
t.Fatalf("save .vif dir1: %v", err)
|
||||
}
|
||||
|
||||
diskIOProbeConfig := stats.DefaultDiskIOProbeConfig()
|
||||
store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", "store-id",
|
||||
[]string{dir0, dir1},
|
||||
[]int32{100, 100},
|
||||
[]util.MinFreeSpace{{}, {}},
|
||||
"",
|
||||
NeedleMapInMemory,
|
||||
[]types.DiskType{types.HardDriveType, types.HardDriveType},
|
||||
nil,
|
||||
3,
|
||||
diskIOProbeConfig,
|
||||
)
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-store.NewVolumesChan:
|
||||
case <-store.NewEcShardsChan:
|
||||
case <-store.DeletedVolumesChan:
|
||||
case <-store.DeletedEcShardsChan:
|
||||
case <-store.StateUpdateChan:
|
||||
case <-done:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
t.Cleanup(func() {
|
||||
store.Close()
|
||||
close(done)
|
||||
})
|
||||
|
||||
// Fix A: the lone .vif on dir1 must not have spawned a phantom .dat.
|
||||
if util.FileExists(base1 + ".dat") {
|
||||
t.Errorf("phantom .dat was created on the lone-.vif disk %s", dir1)
|
||||
}
|
||||
|
||||
// The real EC shards on dir0 must survive: with no phantom .dat, the
|
||||
// sibling-.dat prune finds no .dat owner and deletes nothing.
|
||||
loc0 := store.Locations[0]
|
||||
for _, sid := range []erasure_coding.ShardId{2, 4} {
|
||||
if _, found := loc0.FindEcShard(vid, sid); !found {
|
||||
t.Errorf("EC shard %d on dir0 was deleted", sid)
|
||||
}
|
||||
if !util.FileExists(base0 + erasure_coding.ToExt(int(sid))) {
|
||||
t.Errorf("EC shard file %d on dir0 was removed from disk", sid)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,13 +216,11 @@ func (s *Store) indexEcxOwners() map[ecKeyForReconcile]ecxOwnerInfo {
|
||||
// also fall through unchanged because the lookup in the .dat index below
|
||||
// will simply not find a match.
|
||||
//
|
||||
// Before deleting any EC files we also check that the sibling .dat is
|
||||
// plausibly the encoding source: at least super_block.SuperBlockSize
|
||||
// bytes long, and — when the EC's .vif recorded a non-zero source size
|
||||
// in datFileSize — at least that many bytes. A zero-byte shell or a
|
||||
// truncated .dat does not justify wiping the partial EC, because that
|
||||
// EC shard may still combine usefully with shards on other servers in
|
||||
// a recoverable distributed-EC layout.
|
||||
// The sibling .dat must be a credible encoding source before we delete
|
||||
// anything: at least the size .vif recorded at encode time, or — when
|
||||
// unknown (0) — more than a bare superblock so an empty 8-byte stub
|
||||
// can't pass. A truncated .dat leaves the partial EC alone; those shards
|
||||
// may still reconstruct from other servers.
|
||||
//
|
||||
// We push DeletedEcShardsChan for every pruned shard so the master is told
|
||||
// to forget the registrations the per-disk pass already emitted on
|
||||
@@ -269,13 +267,12 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() {
|
||||
if !hasDat || owner.location == loc {
|
||||
continue
|
||||
}
|
||||
// Decide whether the sibling .dat is a credible source.
|
||||
// Prefer the size baked into .vif at encode time; fall
|
||||
// back to "at least a superblock" for old EC volumes
|
||||
// whose .vif predates the field.
|
||||
// Credible source size: prefer .vif's encode-time size; when
|
||||
// unknown (0) require more than a bare superblock so an empty
|
||||
// 8-byte stub (e.g. a phantom .dat) can't pass.
|
||||
requiredDatSize := ev.DatFileSize()
|
||||
if requiredDatSize <= 0 {
|
||||
requiredDatSize = int64(super_block.SuperBlockSize)
|
||||
requiredDatSize = int64(super_block.SuperBlockSize) + 1
|
||||
}
|
||||
if owner.size < requiredDatSize {
|
||||
glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards but sibling .dat on %s is %d bytes (need >= %d); leaving partial EC in place so distributed reconstruction is still possible",
|
||||
|
||||
Reference in New Issue
Block a user