From cfc08fbf6c90f6fa5e33774f179ff8dc379521e2 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 19 May 2026 13:11:19 -0700 Subject: [PATCH] fix(volume): tombstone integrity check no longer flips volumes read-only (fixes #9563) (#9565) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(volume): pass on-disk tombstone size to ReadData in verifyDeletedNeedleIntegrity verifyDeletedNeedleIntegrity was forwarding TombstoneFileSize (-1) into Needle.ReadData. A deletion tombstone is appended to .dat with DataSize=0 so the on-disk needle header carries Size=0; TombstoneFileSize is only the .idx sentinel for "this entry is deleted" and is never written into a needle header. ReadBytes' size check therefore mismatched on every tombstone (-1 != 0), returned ErrorSizeMismatch, and triggered the 4-byte-offset wrap-around retry in ReadData (offset + 32 GB). On any volume large enough that offset+32 GB exceeds dat fileSize the retry read EOF, CheckVolumeDataIntegrity reported corruption, and the loader set noWriteOrDelete = true. Every volume whose last 10 .idx entries included a deletion went read-only on startup — i.e. any healthy volume where the most recent operations included a delete. Pass Size(0) so the size check matches the on-disk tombstone header. Add a regression test that writes three needles, deletes one, and asserts CheckVolumeDataIntegrity succeeds with a tombstone at the .idx tail. Without this fix the test reproduces the exact log shape from the bug report: read 0 dataSize 32 offset fileSize : EOF verifyDeletedNeedleIntegrity ...idx failed: read data [N,N+32) : EOF The Rust port guards its integrity-check size comparison with !size.is_deleted() (seaweed-volume/src/storage/volume.rs) and never hits this path, so no Rust mirror change is needed. * test(seaweed-volume): mirror Go regression for deletion-tombstone integrity The Rust integrity check already guards its size-mismatch comparison with !size.is_deleted() (volume.rs:1859) and reads tombstone AppendAtNs with body_size=0, so the Go regression fixed in the previous commit does not apply. Lock that guarantee in with a parallel reload test: write three needles, delete one, sync, reopen via Volume::new, assert the volume is not flipped read-only. Catches any future change that removes the deleted-entry guard or re-introduces a size-strict path in check_volume_data_integrity for tombstones. * fix(volume): propagate io.EOF and ErrorSizeMismatch from verifyDeletedNeedleIntegrity CheckVolumeDataIntegrity relies on identity comparison against io.EOF and ErrorSizeMismatch to walk back through the last ten .idx entries and tolerate a partial truncation at the tail (the "fix and continue" loop). The live-needle branch in doCheckAndFixVolumeData already returns those sentinels unwrapped; the deletion branch wrapped them in fmt.Errorf, so a genuine .dat truncation past a tombstone offset broke the recovery and flipped the volume read-only. Mirror the live-needle handling: both verifyDeletedNeedleIntegrity and doCheckAndFixVolumeData now short-circuit on io.EOF / ErrorSizeMismatch and pass them through unwrapped. Other errors keep their existing context wrapping. Also tighten the regression test to capture lastAppendAtNs and assert it's non-zero, so a future regression that skips the tombstone body (and therefore never populates AppendAtNs) is caught even when the err check still passes. --- seaweed-volume/src/storage/volume.rs | 51 +++++++++++++++++++++++++++ weed/storage/volume_checking.go | 19 ++++++++-- weed/storage/volume_checking_test.go | 52 ++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/seaweed-volume/src/storage/volume.rs b/seaweed-volume/src/storage/volume.rs index b01bf6b0a..a26509460 100644 --- a/seaweed-volume/src/storage/volume.rs +++ b/seaweed-volume/src/storage/volume.rs @@ -3592,6 +3592,57 @@ mod tests { assert!(matches!(err, VolumeError::Deleted)); } + // Guard the Rust integrity-check tombstone path against the Go regression + // where verifyDeletedNeedleIntegrity forwarded TombstoneFileSize into the + // needle-size check, mismatched against the on-disk Size=0 header, and + // sent every volume with a trailing deletion read-only on load. The Rust + // check guards its size comparison with !size.is_deleted(); this test + // keeps that guarantee from silently regressing. + #[test] + fn test_check_volume_data_integrity_with_deletion_tombstone() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let mut v = make_test_volume(dir); + for i in 1..=3 { + let data = format!("data {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.delete_needle(&mut Needle { + id: NeedleId(2), + cookie: Cookie(2), + ..Needle::default() + }) + .unwrap(); + v.sync_to_disk().unwrap(); + } + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert!( + !v.is_no_write_or_delete(), + "volume should not be read-only after reload with trailing deletion tombstone" + ); + } + #[test] fn test_volume_multiple_needles() { let tmp = TempDir::new().unwrap(); diff --git a/weed/storage/volume_checking.go b/weed/storage/volume_checking.go index 29d84f3a9..f398b82e2 100644 --- a/weed/storage/volume_checking.go +++ b/weed/storage/volume_checking.go @@ -154,8 +154,14 @@ func doCheckAndFixVolumeData(v *Volume, indexFile *os.File, indexOffset int64) ( return 0, nil } if size < 0 { - // read the deletion entry + // read the deletion entry. Pass io.EOF and ErrorSizeMismatch through + // unwrapped so CheckVolumeDataIntegrity can recognize them and run its + // trailing-truncation / wrap-around recovery loop, matching the live + // branch below. if lastAppendAtNs, err = verifyDeletedNeedleIntegrity(v.DataBackend, v.Version(), offset.ToActualOffset(), key); err != nil { + if err == io.EOF || err == ErrorSizeMismatch { + return lastAppendAtNs, err + } return lastAppendAtNs, fmt.Errorf("verifyDeletedNeedleIntegrity %s failed: %v", indexFile.Name(), err) } } else { @@ -243,8 +249,17 @@ func verifyNeedleIntegrity(datFile backend.BackendStorageFile, v needle.Version, func verifyDeletedNeedleIntegrity(datFile backend.BackendStorageFile, v needle.Version, offset int64, key types.NeedleId) (lastAppendAtNs uint64, err error) { n := new(needle.Needle) - size := types.TombstoneFileSize + // Tombstones are appended with DataSize=0, so the on-disk header carries + // Size=0. TombstoneFileSize (-1) lives only in the .idx; passing it to + // ReadData fails the size check and triggers the 32GB wrap-around retry, + // which reads past EOF and falsely marks the volume read-only. + size := types.Size(0) if err = n.ReadData(datFile, offset, size, v); err != nil { + // Preserve io.EOF and ErrorSizeMismatch as-is so CheckVolumeDataIntegrity + // can detect trailing truncation and trigger its wrap-around retry. + if err == io.EOF || err == ErrorSizeMismatch { + return n.AppendAtNs, err + } return n.AppendAtNs, fmt.Errorf("read data [%d,%d) : %v", offset, offset+needle.GetActualSize(size, v), err) } if n.Id != key { diff --git a/weed/storage/volume_checking_test.go b/weed/storage/volume_checking_test.go index 3da226d9d..2a57f3676 100644 --- a/weed/storage/volume_checking_test.go +++ b/weed/storage/volume_checking_test.go @@ -81,6 +81,58 @@ func TestScrubVolumeData(t *testing.T) { } } +// TestCheckVolumeDataIntegrityWithDeletionTombstone guards the size argument +// passed to ReadData when verifying a trailing deletion tombstone. The .idx +// entry carries TombstoneFileSize (-1) but the appended tombstone needle in +// .dat carries Size=0. Forwarding the .idx size into ReadData fails the size +// check, activates the 32GB 4-byte-offset wrap-around retry, and surfaces EOF; +// CheckVolumeDataIntegrity then treats the volume as corrupt and the loader +// marks every such volume read-only on startup. +func TestCheckVolumeDataIntegrityWithDeletionTombstone(t *testing.T) { + dir := t.TempDir() + + v, err := NewVolume(dir, dir, "", 1, NeedleMapInMemory, &super_block.ReplicaPlacement{}, &needle.TTL{}, 0, needle.GetCurrentVersion(), 0, 0) + if err != nil { + t.Fatalf("volume creation: %v", err) + } + defer v.Close() + + for i := 1; i <= 3; i++ { + n := newRandomNeedle(uint64(i)) + if _, _, _, err := v.writeNeedle2(n, true, false); err != nil { + t.Fatalf("write needle %d: %v", i, err) + } + } + // Delete a live needle so a tombstone (Size=0 on disk, -1 in idx) is the + // last entry the integrity check examines. + if _, err := v.doDeleteRequest(newEmptyNeedle(2)); err != nil { + t.Fatalf("delete needle: %v", err) + } + if err := v.DataBackend.Sync(); err != nil { + t.Fatalf("sync .dat: %v", err) + } + if err := v.nm.Sync(); err != nil { + t.Fatalf("sync .idx: %v", err) + } + + idxFile, err := os.OpenFile(v.FileName(".idx"), os.O_RDONLY, 0644) + if err != nil { + t.Fatalf("open .idx: %v", err) + } + defer idxFile.Close() + + lastAppendAtNs, err := CheckVolumeDataIntegrity(v, idxFile) + if err != nil { + t.Fatalf("CheckVolumeDataIntegrity should succeed with a trailing deletion tombstone: %v", err) + } + // V3 tombstones still record AppendAtNs; a zero return means ReadData + // bailed out before populating the needle and would mask future regressions + // that quietly skip the tombstone body. + if lastAppendAtNs == 0 { + t.Errorf("expected non-zero lastAppendAtNs for V3 deletion tombstone, got 0") + } +} + // TestMaxNeedleEnd ensures the needle map's MaxNeedleEnd accumulator lets // volume.load() detect an .idx that references bytes past the end of the .dat // — the deeper-than-tail corruption shape from issue #8928 that the existing