From 1c9039d3ac97b2ab832a7545e851fbe222ab20e7 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 8 Jun 2026 22:10:16 -0700 Subject: [PATCH] fix(seaweed-volume): stop EC shard deletion from phantom .dat on restart (#9874) * fix(seaweed-volume): stop EC shard deletion from phantom .dat on restart On startup load_existing_volumes() scans .vif/.idx entries (not just .dat). For distributed EC, a volume's .vif can be mirrored onto a disk whose .ecx lives on a sibling disk, so the per-disk ecx check is false and the loader falls through to Volume::new, which always creates the .dat if missing -> a phantom 8-byte superblock stub. The store-level prune_incomplete_ec_with_sibling_dat then treats that stub as the authoritative source and deletes the real EC shards on sibling disks. Go guards the same case (disk_location.go: 'Without this guard NewVolume below would create a phantom empty .dat') but only same-disk. Fix A (root cause): in load_existing_volumes, don't create a .dat during load. Skip the entry when there is no local .dat AND the .vif does not reference remote files -- remote-tiered volumes have no local .dat but must still load via the remote path. Uses the robust check_dat_file_exists helper so a transient stat error doesn't skip a real volume. New volumes go through create_volume(). Covers the cross-disk .vif/.ecx split Go's same-disk hasEcxFile() misses. Fix B (defense in depth, Go + Rust): when the EC .vif records no source size (dat_file_size==0), require the sibling .dat to be strictly larger than a bare superblock, so an empty 8-byte stub can't pass the credibility gate. Previously it fell back to SUPER_BLOCK_SIZE, which an 8-byte stub exactly meets. Adds regression tests reproducing the cross-disk lone-.vif phantom and the 8-byte stub gate; updates an existing prune test to use a real collection so its .ecx lookup matches the loaders. * fix(storage): don't create phantom .dat from lone .vif on Go volume load Mirror Fix A on the Go side. loadExistingVolume scans .vif/.idx entries, and for distributed EC a .vif can be mirrored onto a disk whose .ecx is on a sibling disk. The same-disk hasEcxFile() guard does not fire there, so the loader falls through to NewVolume(createDatIfMissing=true) and writes an 8-byte phantom .dat, which the sibling-.dat prune then uses to delete the real EC shards on sibling disks. Skip the entry when there is no local .dat AND the .vif has no remote file (via MaybeLoadVolumeInfo); remote-tiered volumes have no local .dat but must still load. Adds TestLoneVifDoesNotCreatePhantomDat (fails without the guard) and TestRemoteTier_DiskScanLoadsRemoteOnlyVolume (fails if the guard skips a remote-only volume). --- seaweed-volume/src/storage/disk_location.rs | 29 +++- .../src/storage/store_ec_reconcile.rs | 127 +++++++++++++++--- weed/storage/disk_location.go | 18 +++ weed/storage/remote_tier_integration_test.go | 36 +++++ weed/storage/store_ec_phantom_dat_test.go | 124 +++++++++++++++++ weed/storage/store_ec_reconcile.go | 21 ++- 6 files changed, 322 insertions(+), 33 deletions(-) create mode 100644 weed/storage/store_ec_phantom_dat_test.go diff --git a/seaweed-volume/src/storage/disk_location.rs b/seaweed-volume/src/storage/disk_location.rs index 21d478b11..ec7244bdd 100644 --- a/seaweed-volume/src/storage/disk_location.rs +++ b/seaweed-volume/src/storage/disk_location.rs @@ -21,7 +21,9 @@ use crate::storage::erasure_coding::ec_volume::EcVolume; use crate::storage::needle_map::NeedleMapKind; use crate::storage::super_block::ReplicaPlacement; use crate::storage::types::*; -use crate::storage::volume::{remove_volume_files, volume_file_name, Volume, VolumeError}; +use crate::storage::volume::{ + remove_volume_files, volume_file_name, VifVolumeInfo, Volume, VolumeError, +}; /// A single disk location managing volumes in one directory. pub struct DiskLocation { @@ -178,6 +180,20 @@ impl DiskLocation { continue; } + // Load existing data only; never create a phantom `.dat`. A lone + // `.vif`/`.idx` (e.g. an EC sidecar whose `.ecx` is on a sibling + // disk) would otherwise have Volume::new write an 8-byte stub that + // the sibling-.dat prune deletes real shards against. Remote-tiered + // volumes also have no local `.dat`, but their `.vif` points at + // remote files and must still load via the remote path. + let dat_path = format!("{}.dat", volume_name); + if !check_dat_file_exists(&dat_path) + && !vif_references_remote_file(&format!("{}.vif", volume_name)) + && !vif_references_remote_file(&format!("{}.vif", idx_name)) + { + continue; + } + match Volume::new( &self.directory, &self.idx_directory, @@ -1018,6 +1034,17 @@ fn check_dat_file_exists(path: &str) -> bool { } } +/// True when a `.vif` references remote-tier files: a remote-only volume +/// that has no local `.dat` but must still load via the remote path, +/// rather than be skipped as a lone EC sidecar. +fn vif_references_remote_file(vif_path: &str) -> bool { + fs::read_to_string(vif_path) + .ok() + .and_then(|s| serde_json::from_str::(&s).ok()) + .map(|vif| !vif.files.is_empty()) + .unwrap_or(false) +} + fn parse_volume_filename(filename: &str) -> Option<(String, VolumeId)> { let stem = filename .strip_suffix(".dat") diff --git a/seaweed-volume/src/storage/store_ec_reconcile.rs b/seaweed-volume/src/storage/store_ec_reconcile.rs index f42934fae..88bda5854 100644 --- a/seaweed-volume/src/storage/store_ec_reconcile.rs +++ b/seaweed-volume/src/storage/store_ec_reconcile.rs @@ -203,14 +203,11 @@ impl Store { /// this server) also fall through unchanged because the `.dat` /// index never matches. /// - /// Before deleting any EC files we also check that the sibling - /// `.dat` is plausibly the encoding source: at least - /// `SUPER_BLOCK_SIZE` bytes long, and — when the EC's `.vif` - /// recorded a non-zero source size in `dat_file_size` — at least - /// that many bytes. A zero-byte shell or a truncated `.dat` does - /// not justify wiping the partial EC, because that EC shard may - /// still combine usefully with shards on other servers in a - /// recoverable distributed-EC layout. + /// The sibling `.dat` must be a credible encoding source before we + /// delete anything: at least the size `.vif` recorded at encode time, + /// or — when unknown (0) — more than a bare superblock so an empty + /// 8-byte stub can't pass. A truncated `.dat` leaves the partial EC + /// alone; those shards may still reconstruct from other servers. /// /// We don't have to push anything to a deleted-shards channel /// here: the Rust heartbeat path in `server/heartbeat.rs` diffs @@ -257,14 +254,13 @@ impl Store { // per-disk pass; don't second-guess it here. continue; } - // Decide whether the sibling .dat is credible. Prefer - // the size baked into .vif at encode time; fall back - // to "at least a superblock" for old EC volumes whose - // .vif predates the field. + // Credible source size: prefer .vif's encode-time size; when + // unknown (0) require more than a bare superblock so an empty + // 8-byte stub (e.g. a phantom .dat) can't pass. let required = if ev.dat_file_size > 0 { ev.dat_file_size as u64 } else { - SUPER_BLOCK_SIZE as u64 + SUPER_BLOCK_SIZE as u64 + 1 }; if owner.size < required { warn!( @@ -469,6 +465,97 @@ mod tests { .unwrap(); } + /// Regression: a lone `.vif` whose `.ecx` is on a sibling disk must not + /// make the loader create a phantom `.dat`, nor the sibling-.dat prune + /// delete the real shards on the sibling. + #[test] + fn test_lone_vif_does_not_create_phantom_dat_or_delete_shards() { + let tmp = TempDir::new().unwrap(); + let d0 = tmp.path().join("data0"); + let d1 = tmp.path().join("data1"); + std::fs::create_dir_all(&d0).unwrap(); + std::fs::create_dir_all(&d1).unwrap(); + let coll = "warp-loadtest"; + let vid = 57u32; + + // d0: a self-contained but partial (2 < 10) EC volume. + write_shard(d0.to_str().unwrap(), coll, vid, 2); + write_shard(d0.to_str().unwrap(), coll, vid, 4); + write_index_files(d0.to_str().unwrap(), coll, vid, 10, 4); + + // d1: ONLY the mirrored `.vif` — no `.ecx`, no shard, no `.dat`. + std::fs::copy( + d0.join(format!("{}_{}.vif", coll, vid)), + d1.join(format!("{}_{}.vif", coll, vid)), + ) + .unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + for d in [&d0, &d1] { + store + .add_location( + d.to_str().unwrap(), + d.to_str().unwrap(), + 100, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + } + + let phantom = d1.join(format!("{}_{}.dat", coll, vid)); + assert!( + !phantom.exists(), + "loader created a phantom .dat from a lone .vif" + ); + let total: usize = store.locations.iter().map(|l| l.ec_shard_count()).sum(); + assert_eq!(total, 2, "real EC shards were deleted (total={})", total); + } + + /// Regression for the prune credibility gate: when the EC `.vif` + /// records no source size (`dat_file_size == 0`), an empty 8-byte + /// sibling `.dat` stub must NOT justify deleting partial EC shards. + #[test] + fn test_prune_refuses_unverifiable_8byte_dat() { + let tmp = TempDir::new().unwrap(); + let d0 = tmp.path().join("data0"); + let d1 = tmp.path().join("data1"); + std::fs::create_dir_all(&d0).unwrap(); + std::fs::create_dir_all(&d1).unwrap(); + let coll = "warp-loadtest"; + let vid = 99u32; + + // d0: partial EC (2 shards); its `.vif` records no source size. + write_shard(d0.to_str().unwrap(), coll, vid, 0); + write_shard(d0.to_str().unwrap(), coll, vid, 1); + write_index_files(d0.to_str().unwrap(), coll, vid, 10, 4); + + // d1: a real but empty 8-byte (superblock-sized) `.dat` stub. + std::fs::write(d1.join(format!("{}_{}.dat", coll, vid)), vec![0u8; 8]).unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + for d in [&d0, &d1] { + store + .add_location( + d.to_str().unwrap(), + d.to_str().unwrap(), + 100, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + } + + let total: usize = store.locations.iter().map(|l| l.ec_shard_count()).sum(); + assert_eq!( + total, 2, + "prune deleted shards against an unverifiable 8-byte .dat (total={})", + total + ); + } + /// Reproduces the orphan-shard layout from issue #9212. Shards live /// on dir0; the .ecx / .ecj / .vif live on dir1. Without /// reconciliation, dir0's shards are silently dropped at startup. @@ -966,15 +1053,15 @@ mod tests { std::fs::create_dir_all(&dat_dir).unwrap(); std::fs::create_dir_all(&ec_dir).unwrap(); - let collection = ""; + // Real collection so the loader's volume_file_name-based `.ecx` + // lookup matches the helpers (empty collection emits `_122.*` vs the + // expected `122.*`). + let collection = "pics"; let vid = 122u32; - // Disk A (sdd): a non-trivial .dat plus the regular-volume - // sidecars. The .dat content doesn't matter for the prune — - // load_existing_volumes refuses to mount it because the - // superblock is absent — but its file name has to be present - // so index_dat_owners records this disk as the .dat owner. - let dat_path = dat_dir.join(format!("{}_{}.dat", collection, vid).trim_start_matches('_')); + // Disk A (sdd): a .dat whose name must be present so index_dat_owners + // records this disk as the .dat owner (content doesn't matter). + let dat_path = dat_dir.join(format!("{}_{}.dat", collection, vid)); std::fs::write(&dat_path, vec![0u8; 1024]).unwrap(); // Disk B (sdf): partial EC — one shard, plus .ecx / .ecj / .vif. diff --git a/weed/storage/disk_location.go b/weed/storage/disk_location.go index 46558b27a..ee32c87c7 100644 --- a/weed/storage/disk_location.go +++ b/weed/storage/disk_location.go @@ -18,6 +18,7 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" "github.com/seaweedfs/seaweedfs/weed/storage/needle" "github.com/seaweedfs/seaweedfs/weed/storage/types" + "github.com/seaweedfs/seaweedfs/weed/storage/volume_info" "github.com/seaweedfs/seaweedfs/weed/util" ) @@ -224,6 +225,23 @@ func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind Ne return true } + // Load existing data only; never let NewVolume create a phantom .dat. A + // lone .vif/.idx (e.g. an EC sidecar whose .ecx is on a sibling disk, + // which the same-disk hasEcxFile() guard misses) would otherwise get an + // 8-byte stub that the sibling-.dat prune deletes real shards against. + // Remote-tiered volumes also have no local .dat, but their .vif points at + // remote files and must still load via the remote path. + if !util.FileExists(l.Directory + "/" + volumeName + ".dat") { + _, hasRemote, _, _ := volume_info.MaybeLoadVolumeInfo(l.Directory + "/" + volumeName + ".vif") + if !hasRemote && l.IdxDirectory != l.Directory { + _, hasRemote, _, _ = volume_info.MaybeLoadVolumeInfo(l.IdxDirectory + "/" + volumeName + ".vif") + } + if !hasRemote { + glog.V(1).Infof("loadExistingVolume: skipping volume %d (collection=%q); no .dat and no remote file", vid, collection) + return false + } + } + // load the volume v, e := NewVolume(l.Directory, l.IdxDirectory, collection, vid, needleMapKind, nil, nil, 0, needle.GetCurrentVersion(), 0, ldbTimeout) if e != nil { diff --git a/weed/storage/remote_tier_integration_test.go b/weed/storage/remote_tier_integration_test.go index 11cbe2f91..59744a84c 100644 --- a/weed/storage/remote_tier_integration_test.go +++ b/weed/storage/remote_tier_integration_test.go @@ -16,6 +16,8 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" "github.com/seaweedfs/seaweedfs/weed/storage/needle" "github.com/seaweedfs/seaweedfs/weed/storage/super_block" + "github.com/seaweedfs/seaweedfs/weed/storage/types" + "github.com/seaweedfs/seaweedfs/weed/util" ) // In-process integration tests for cloud-tiered ("remote") volumes. @@ -223,6 +225,40 @@ func reloadVolume(t *testing.T, dir string, vid needle.VolumeId) *Volume { return v } +// TestRemoteTier_DiskScanLoadsRemoteOnlyVolume locks in that the disk scan +// (loadExistingVolume) loads a remote-only volume — a .vif pointing at remote +// files with no local .dat — instead of skipping it as a lone sidecar. The +// phantom-.dat guard must let remote volumes through. +func TestRemoteTier_DiskScanLoadsRemoteOnlyVolume(t *testing.T) { + b := newLocalDirBackend(t) + registerTestBackend(t, b) + + dir := t.TempDir() + const vid = needle.VolumeId(44) + tierUpVolume(t, dir, vid, b) // leaves .vif (remote) + .idx, no .dat + + require.False(t, util.FileExists(filepath.Join(dir, "44.dat")), "tier-up should have removed the local .dat") + + loc := &DiskLocation{ + Directory: dir, + DirectoryUuid: "test-uuid", + IdxDirectory: dir, + DiskType: types.HddType, + MaxVolumeCount: 100, + OriginalMaxVolumeCount: 100, + MinFreeSpace: util.MinFreeSpace{Type: util.AsPercent, Percent: 1, Raw: "1"}, + } + loc.volumes = make(map[needle.VolumeId]*Volume) + loc.ecVolumes = make(map[needle.VolumeId]*erasure_coding.EcVolume) + + loc.loadExistingVolumes(NeedleMapInMemory, 0) + + v, ok := loc.volumes[vid] + require.True(t, ok, "remote-only volume must be loaded by the disk scan, not skipped by the phantom-.dat guard") + require.True(t, v.HasRemoteFile(), "loaded volume should be in remote mode") + v.Close() +} + // TestRemoteTier_Move_KeepsRemoteObject simulates the move-on-source-after-copy // step of a balance: Destroy(onlyEmpty=false, keepRemoteData=true). The remote // object must survive — the destination's freshly-copied .vif points at it. diff --git a/weed/storage/store_ec_phantom_dat_test.go b/weed/storage/store_ec_phantom_dat_test.go new file mode 100644 index 000000000..fd47e5a64 --- /dev/null +++ b/weed/storage/store_ec_phantom_dat_test.go @@ -0,0 +1,124 @@ +package storage + +import ( + "os" + "path/filepath" + "testing" + + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/stats" + "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" + "github.com/seaweedfs/seaweedfs/weed/storage/needle" + "github.com/seaweedfs/seaweedfs/weed/storage/types" + "github.com/seaweedfs/seaweedfs/weed/storage/volume_info" + "github.com/seaweedfs/seaweedfs/weed/util" +) + +// TestLoneVifDoesNotCreatePhantomDat: a lone .vif whose .ecx is on a +// sibling disk must not make loadExistingVolume create a phantom 8-byte +// .dat, which the sibling-.dat prune would then use to delete the real EC shards +// on the sibling. The same-disk hasEcxFile() guard misses this split. +func TestLoneVifDoesNotCreatePhantomDat(t *testing.T) { + tempDir := t.TempDir() + dir0 := filepath.Join(tempDir, "data1") + dir1 := filepath.Join(tempDir, "data2") + for _, d := range []string{dir0, dir1} { + if err := os.MkdirAll(d, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", d, err) + } + } + + collection := "warp-loadtest" + vid := needle.VolumeId(57) + const dataShards, parityShards = 10, 4 + const datSize int64 = 10 * 1024 * 1024 + expectedShardSize := calculateExpectedShardSize(datSize, dataShards) + + // dir0: a self-contained but partial (2 < 10) EC volume. + base0 := erasure_coding.EcShardFileName(collection, dir0, int(vid)) + for _, sid := range []int{2, 4} { + f, err := os.Create(base0 + erasure_coding.ToExt(sid)) + if err != nil { + t.Fatalf("create shard %d: %v", sid, err) + } + if err := f.Truncate(expectedShardSize); err != nil { + f.Close() + t.Fatalf("truncate shard %d: %v", sid, err) + } + if err := f.Close(); err != nil { + t.Fatalf("close shard %d: %v", sid, err) + } + } + if err := os.WriteFile(base0+".ecx", make([]byte, 20), 0o644); err != nil { + t.Fatalf("write .ecx: %v", err) + } + if err := os.WriteFile(base0+".ecj", nil, 0o644); err != nil { + t.Fatalf("write .ecj: %v", err) + } + // DatFileSize 0 mirrors the production .vif that triggered the bug and + // makes the prune's credibility gate fall back to the superblock size. + vif := &volume_server_pb.VolumeInfo{ + Version: uint32(needle.Version3), + EcShardConfig: &volume_server_pb.EcShardConfig{ + DataShards: dataShards, + ParityShards: parityShards, + }, + } + if err := volume_info.SaveVolumeInfo(base0+".vif", vif); err != nil { + t.Fatalf("save .vif dir0: %v", err) + } + + // dir1: ONLY the mirrored .vif — no .ecx, no shard, no .dat. + base1 := erasure_coding.EcShardFileName(collection, dir1, int(vid)) + if err := volume_info.SaveVolumeInfo(base1+".vif", vif); err != nil { + t.Fatalf("save .vif dir1: %v", err) + } + + diskIOProbeConfig := stats.DefaultDiskIOProbeConfig() + store := NewStore(nil, "localhost", 8080, 18080, "http://localhost:8080", "store-id", + []string{dir0, dir1}, + []int32{100, 100}, + []util.MinFreeSpace{{}, {}}, + "", + NeedleMapInMemory, + []types.DiskType{types.HardDriveType, types.HardDriveType}, + nil, + 3, + diskIOProbeConfig, + ) + done := make(chan struct{}) + go func() { + for { + select { + case <-store.NewVolumesChan: + case <-store.NewEcShardsChan: + case <-store.DeletedVolumesChan: + case <-store.DeletedEcShardsChan: + case <-store.StateUpdateChan: + case <-done: + return + } + } + }() + t.Cleanup(func() { + store.Close() + close(done) + }) + + // Fix A: the lone .vif on dir1 must not have spawned a phantom .dat. + if util.FileExists(base1 + ".dat") { + t.Errorf("phantom .dat was created on the lone-.vif disk %s", dir1) + } + + // The real EC shards on dir0 must survive: with no phantom .dat, the + // sibling-.dat prune finds no .dat owner and deletes nothing. + loc0 := store.Locations[0] + for _, sid := range []erasure_coding.ShardId{2, 4} { + if _, found := loc0.FindEcShard(vid, sid); !found { + t.Errorf("EC shard %d on dir0 was deleted", sid) + } + if !util.FileExists(base0 + erasure_coding.ToExt(int(sid))) { + t.Errorf("EC shard file %d on dir0 was removed from disk", sid) + } + } +} diff --git a/weed/storage/store_ec_reconcile.go b/weed/storage/store_ec_reconcile.go index a5b36146d..3c35ba75e 100644 --- a/weed/storage/store_ec_reconcile.go +++ b/weed/storage/store_ec_reconcile.go @@ -216,13 +216,11 @@ func (s *Store) indexEcxOwners() map[ecKeyForReconcile]ecxOwnerInfo { // also fall through unchanged because the lookup in the .dat index below // will simply not find a match. // -// Before deleting any EC files we also check that the sibling .dat is -// plausibly the encoding source: at least super_block.SuperBlockSize -// bytes long, and — when the EC's .vif recorded a non-zero source size -// in datFileSize — at least that many bytes. A zero-byte shell or a -// truncated .dat does not justify wiping the partial EC, because that -// EC shard may still combine usefully with shards on other servers in -// a recoverable distributed-EC layout. +// The sibling .dat must be a credible encoding source before we delete +// anything: at least the size .vif recorded at encode time, or — when +// unknown (0) — more than a bare superblock so an empty 8-byte stub +// can't pass. A truncated .dat leaves the partial EC alone; those shards +// may still reconstruct from other servers. // // We push DeletedEcShardsChan for every pruned shard so the master is told // to forget the registrations the per-disk pass already emitted on @@ -269,13 +267,12 @@ func (s *Store) pruneIncompleteEcWithSiblingDat() { if !hasDat || owner.location == loc { continue } - // Decide whether the sibling .dat is a credible source. - // Prefer the size baked into .vif at encode time; fall - // back to "at least a superblock" for old EC volumes - // whose .vif predates the field. + // Credible source size: prefer .vif's encode-time size; when + // unknown (0) require more than a bare superblock so an empty + // 8-byte stub (e.g. a phantom .dat) can't pass. requiredDatSize := ev.DatFileSize() if requiredDatSize <= 0 { - requiredDatSize = int64(super_block.SuperBlockSize) + requiredDatSize = int64(super_block.SuperBlockSize) + 1 } if owner.size < requiredDatSize { glog.Warningf("ec volume %d (collection=%q) on %s has only %d shards but sibling .dat on %s is %d bytes (need >= %d); leaving partial EC in place so distributed reconstruction is still possible",