fix(volume): tombstone integrity check no longer flips volumes read-only (fixes #9563) (#9565)

* fix(volume): pass on-disk tombstone size to ReadData in verifyDeletedNeedleIntegrity

verifyDeletedNeedleIntegrity was forwarding TombstoneFileSize (-1) into
Needle.ReadData. A deletion tombstone is appended to .dat with DataSize=0
so the on-disk needle header carries Size=0; TombstoneFileSize is only
the .idx sentinel for "this entry is deleted" and is never written into
a needle header.

ReadBytes' size check therefore mismatched on every tombstone
(-1 != 0), returned ErrorSizeMismatch, and triggered the
4-byte-offset wrap-around retry in ReadData (offset + 32 GB). On any
volume large enough that offset+32 GB exceeds dat fileSize the retry
read EOF, CheckVolumeDataIntegrity reported corruption, and the loader
set noWriteOrDelete = true. Every volume whose last 10 .idx entries
included a deletion went read-only on startup — i.e. any healthy
volume where the most recent operations included a delete.

Pass Size(0) so the size check matches the on-disk tombstone header.

Add a regression test that writes three needles, deletes one, and
asserts CheckVolumeDataIntegrity succeeds with a tombstone at the .idx
tail. Without this fix the test reproduces the exact log shape from
the bug report:

  read 0 dataSize 32 offset <orig+32GB> fileSize <much smaller>: EOF
  verifyDeletedNeedleIntegrity ...idx failed: read data [N,N+32) : EOF

The Rust port guards its integrity-check size comparison with
!size.is_deleted() (seaweed-volume/src/storage/volume.rs) and never
hits this path, so no Rust mirror change is needed.

* test(seaweed-volume): mirror Go regression for deletion-tombstone integrity

The Rust integrity check already guards its size-mismatch comparison
with !size.is_deleted() (volume.rs:1859) and reads tombstone AppendAtNs
with body_size=0, so the Go regression fixed in the previous commit
does not apply. Lock that guarantee in with a parallel reload test:
write three needles, delete one, sync, reopen via Volume::new, assert
the volume is not flipped read-only.

Catches any future change that removes the deleted-entry guard or
re-introduces a size-strict path in check_volume_data_integrity for
tombstones.

* fix(volume): propagate io.EOF and ErrorSizeMismatch from verifyDeletedNeedleIntegrity

CheckVolumeDataIntegrity relies on identity comparison against io.EOF
and ErrorSizeMismatch to walk back through the last ten .idx entries
and tolerate a partial truncation at the tail (the "fix and continue"
loop). The live-needle branch in doCheckAndFixVolumeData already
returns those sentinels unwrapped; the deletion branch wrapped them
in fmt.Errorf, so a genuine .dat truncation past a tombstone offset
broke the recovery and flipped the volume read-only.

Mirror the live-needle handling: both verifyDeletedNeedleIntegrity
and doCheckAndFixVolumeData now short-circuit on io.EOF /
ErrorSizeMismatch and pass them through unwrapped. Other errors keep
their existing context wrapping.

Also tighten the regression test to capture lastAppendAtNs and assert
it's non-zero, so a future regression that skips the tombstone body
(and therefore never populates AppendAtNs) is caught even when the
err check still passes.
This commit is contained in:
Chris Lu
2026-05-19 13:11:19 -07:00
committed by GitHub
parent d57de6dc20
commit cfc08fbf6c
3 changed files with 120 additions and 2 deletions
+51
View File
@@ -3592,6 +3592,57 @@ mod tests {
assert!(matches!(err, VolumeError::Deleted));
}
// Guard the Rust integrity-check tombstone path against the Go regression
// where verifyDeletedNeedleIntegrity forwarded TombstoneFileSize into the
// needle-size check, mismatched against the on-disk Size=0 header, and
// sent every volume with a trailing deletion read-only on load. The Rust
// check guards its size comparison with !size.is_deleted(); this test
// keeps that guarantee from silently regressing.
#[test]
fn test_check_volume_data_integrity_with_deletion_tombstone() {
let tmp = TempDir::new().unwrap();
let dir = tmp.path().to_str().unwrap();
{
let mut v = make_test_volume(dir);
for i in 1..=3 {
let data = format!("data {}", i);
let mut n = Needle {
id: NeedleId(i),
cookie: Cookie(i as u32),
data: data.as_bytes().to_vec(),
data_size: data.len() as u32,
..Needle::default()
};
v.write_needle(&mut n, true).unwrap();
}
v.delete_needle(&mut Needle {
id: NeedleId(2),
cookie: Cookie(2),
..Needle::default()
})
.unwrap();
v.sync_to_disk().unwrap();
}
let v = Volume::new(
dir,
dir,
"",
VolumeId(1),
NeedleMapKind::InMemory,
None,
None,
0,
Version::current(),
)
.unwrap();
assert!(
!v.is_no_write_or_delete(),
"volume should not be read-only after reload with trailing deletion tombstone"
);
}
#[test]
fn test_volume_multiple_needles() {
let tmp = TempDir::new().unwrap();
+17 -2
View File
@@ -154,8 +154,14 @@ func doCheckAndFixVolumeData(v *Volume, indexFile *os.File, indexOffset int64) (
return 0, nil
}
if size < 0 {
// read the deletion entry
// read the deletion entry. Pass io.EOF and ErrorSizeMismatch through
// unwrapped so CheckVolumeDataIntegrity can recognize them and run its
// trailing-truncation / wrap-around recovery loop, matching the live
// branch below.
if lastAppendAtNs, err = verifyDeletedNeedleIntegrity(v.DataBackend, v.Version(), offset.ToActualOffset(), key); err != nil {
if err == io.EOF || err == ErrorSizeMismatch {
return lastAppendAtNs, err
}
return lastAppendAtNs, fmt.Errorf("verifyDeletedNeedleIntegrity %s failed: %v", indexFile.Name(), err)
}
} else {
@@ -243,8 +249,17 @@ func verifyNeedleIntegrity(datFile backend.BackendStorageFile, v needle.Version,
func verifyDeletedNeedleIntegrity(datFile backend.BackendStorageFile, v needle.Version, offset int64, key types.NeedleId) (lastAppendAtNs uint64, err error) {
n := new(needle.Needle)
size := types.TombstoneFileSize
// Tombstones are appended with DataSize=0, so the on-disk header carries
// Size=0. TombstoneFileSize (-1) lives only in the .idx; passing it to
// ReadData fails the size check and triggers the 32GB wrap-around retry,
// which reads past EOF and falsely marks the volume read-only.
size := types.Size(0)
if err = n.ReadData(datFile, offset, size, v); err != nil {
// Preserve io.EOF and ErrorSizeMismatch as-is so CheckVolumeDataIntegrity
// can detect trailing truncation and trigger its wrap-around retry.
if err == io.EOF || err == ErrorSizeMismatch {
return n.AppendAtNs, err
}
return n.AppendAtNs, fmt.Errorf("read data [%d,%d) : %v", offset, offset+needle.GetActualSize(size, v), err)
}
if n.Id != key {
+52
View File
@@ -81,6 +81,58 @@ func TestScrubVolumeData(t *testing.T) {
}
}
// TestCheckVolumeDataIntegrityWithDeletionTombstone guards the size argument
// passed to ReadData when verifying a trailing deletion tombstone. The .idx
// entry carries TombstoneFileSize (-1) but the appended tombstone needle in
// .dat carries Size=0. Forwarding the .idx size into ReadData fails the size
// check, activates the 32GB 4-byte-offset wrap-around retry, and surfaces EOF;
// CheckVolumeDataIntegrity then treats the volume as corrupt and the loader
// marks every such volume read-only on startup.
func TestCheckVolumeDataIntegrityWithDeletionTombstone(t *testing.T) {
dir := t.TempDir()
v, err := NewVolume(dir, dir, "", 1, NeedleMapInMemory, &super_block.ReplicaPlacement{}, &needle.TTL{}, 0, needle.GetCurrentVersion(), 0, 0)
if err != nil {
t.Fatalf("volume creation: %v", err)
}
defer v.Close()
for i := 1; i <= 3; i++ {
n := newRandomNeedle(uint64(i))
if _, _, _, err := v.writeNeedle2(n, true, false); err != nil {
t.Fatalf("write needle %d: %v", i, err)
}
}
// Delete a live needle so a tombstone (Size=0 on disk, -1 in idx) is the
// last entry the integrity check examines.
if _, err := v.doDeleteRequest(newEmptyNeedle(2)); err != nil {
t.Fatalf("delete needle: %v", err)
}
if err := v.DataBackend.Sync(); err != nil {
t.Fatalf("sync .dat: %v", err)
}
if err := v.nm.Sync(); err != nil {
t.Fatalf("sync .idx: %v", err)
}
idxFile, err := os.OpenFile(v.FileName(".idx"), os.O_RDONLY, 0644)
if err != nil {
t.Fatalf("open .idx: %v", err)
}
defer idxFile.Close()
lastAppendAtNs, err := CheckVolumeDataIntegrity(v, idxFile)
if err != nil {
t.Fatalf("CheckVolumeDataIntegrity should succeed with a trailing deletion tombstone: %v", err)
}
// V3 tombstones still record AppendAtNs; a zero return means ReadData
// bailed out before populating the needle and would mask future regressions
// that quietly skip the tombstone body.
if lastAppendAtNs == 0 {
t.Errorf("expected non-zero lastAppendAtNs for V3 deletion tombstone, got 0")
}
}
// TestMaxNeedleEnd ensures the needle map's MaxNeedleEnd accumulator lets
// volume.load() detect an .idx that references bytes past the end of the .dat
// — the deeper-than-tail corruption shape from issue #8928 that the existing