From d9b86fb495a2a3d2b077716d78f94b0a53b2f45a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 28 Apr 2026 21:02:52 -0700 Subject: [PATCH] fix(s3api): clear stale latest-version pointer when .versions dir cleanup is blocked (#9269) * fix(s3api): clear stale latest-version pointer when post-deletion cleanup is blocked In versioned buckets, when the only remaining children of a .versions directory are orphan entries (v_ files that lack the version-id extended attribute - e.g. left behind by older code paths or interrupted writes), updateLatestVersionAfterDeletion's selectLatestVersion finds zero promotable versions and falls through to s3a.rm. The non-recursive rm fails because the orphan blocks the directory deletion. Previously the code logged the failure and returned, leaving the LatestVersionId pointer pointing at the version we just deleted. For Veeam-style workloads that GET-PUT-GET-DELETE a small lock object on every checkpoint, the stale pointer poisons every subsequent run: the next GET re-enters getLatestObjectVersion's 13x retry loop on the missing pointer file plus the self-heal rescan, all to return the same 404. The cycle is self-perpetuating until the orphan is removed by hand. When rm fails, additionally clear the LatestVersionId / LatestVersionFile pointer fields on the .versions directory entry. The orphan files stay in place (an operator can audit and remove them); from the S3 API perspective the object is now correctly absent and subsequent reads short-circuit to ErrNotFound on the fast path instead of replaying the heal cycle. * fix(s3api): clear stale latest-version pointer on read-side self-heal failure healStaleLatestVersionPointer is invoked by getLatestObjectVersion when the pointed-at version file is missing. The rescan path can find no remaining version-id-tagged entries (e.g. when only orphan v_ files lacking the version-id extended attribute remain). Prior code returned ErrNotFound but left the stale pointer in place, so every subsequent read replayed the same 13x retry loop on the missing file and re-entered self-heal, all to return the same 404. Reuse the same pointer-clear logic introduced for updateLatestVersionAfterDeletion. The two call sites are now identical, so factor the body out into clearStaleLatestVersionPointer. The caller parameter carries the source function name so the log lines operators were already grepping (updateLatestVersionAfterDeletion: cleared stale ... and healStaleLatestVersionPointer: cleared stale ...) keep the same prefix. * fix(s3api): re-validate before clearing latest-version pointer (CAS) Reviewer feedback (gemini-code-assist, coderabbitai) on PR #9269 flagged a TOCTOU in clearStaleLatestVersionPointer: between the caller loading versionsEntry and this function persisting the cleared map, a concurrent PUT could promote a fresh version. Persisting the caller's snapshot then silently overwrites the live pointer and re-introduces the missing-pointer state on a now-existing object. Make the persist CAS-style: 1. Re-scan .versions for any version-id-tagged entry. If one now exists, abort - the concurrent writer has populated the directory and either already updated the pointer or the next read's self-heal will pick up the new entry. 2. Re-fetch the live .versions directory entry and only proceed if its latest-pointer fields still match the stale id the caller observed. A concurrent pointer update by another writer is detected here and the clear is skipped. 3. Persist with mkFile using the live Extended map (minus the two pointer fields and the cached metadata) so any other Extended fields written concurrently between (2) and the persist are preserved. A note on the literal suggestion of mutating updatedEntry.Extended in the mkFile callback: that does not work because mkFile constructs a fresh *filer_pb.Entry rather than reading the live entry first (weed/pb/filer_pb/filer_client.go:247). The callback's updatedEntry is nil at invocation, so a delete on it would be a no-op and we would lose all Extended fields, not just the two we want to clear. The correct shape - re-fetching the live entry before mkFile and carrying its Extended map into the persist - is what this change implements. True atomic CAS would require filer-level conditional update support; this change narrows the race window from the full caller scope to the ~ms gap between (2) and (3), which is the best we can do without that. --- weed/s3api/s3api_object_versioning.go | 108 ++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 7 deletions(-) diff --git a/weed/s3api/s3api_object_versioning.go b/weed/s3api/s3api_object_versioning.go index 5edb71082..8ecb9c968 100644 --- a/weed/s3api/s3api_object_versioning.go +++ b/weed/s3api/s3api_object_versioning.go @@ -1112,22 +1112,109 @@ func (s3a *S3ApiServer) updateLatestVersionAfterDeletion(bucket, object string) return fmt.Errorf("failed to update .versions directory metadata: %v", err) } } else { - // No version-tagged entries remain - delete the .versions directory. - // rm is non-recursive, so any stray non-version entries will cause - // rm to fail; we log and continue, treating the directory cleanup as - // best-effort since the version data is already gone. + // No version-tagged entries remain - try to delete the .versions + // directory. rm is non-recursive, so any stray non-version entries + // (orphan files from older code paths or interrupted writes that left + // behind a v_ file without the version-id extended attribute) + // will cause rm to fail. + // + // In that case, clear the stale latest-version pointer on the + // .versions directory entry so subsequent reads return a clean + // "object absent" instead of repeatedly chasing a missing version + // file through getLatestObjectVersion's retry loop and the self-heal + // path on every request. The orphan files remain in the directory + // (an operator can remove them); from the S3 API perspective, the + // object is correctly absent. glog.V(2).Infof("updateLatestVersionAfterDeletion: no versions left for %s/%s, deleting .versions directory", bucket, object) err = s3a.rm(bucketDir, versionsObjectPath, true, false) - if err != nil { - glog.Warningf("updateLatestVersionAfterDeletion: failed to delete .versions directory for %s/%s: %v", bucket, object, err) - // Don't return error - the versions are already deleted, this is just cleanup + if err == nil { + return nil } + glog.Warningf("updateLatestVersionAfterDeletion: failed to delete .versions directory for %s/%s: %v", bucket, object, err) + s3a.clearStaleLatestVersionPointer(bucket, object, bucketDir, versionsObjectPath, versionsEntry, "updateLatestVersionAfterDeletion") } return nil } +// clearStaleLatestVersionPointer best-effort clears the latest-version +// pointer on a .versions directory entry when no recoverable version +// remains and the directory cannot be removed (e.g. because of orphan +// entries that lack the version-id extended attribute). +// +// The persist is CAS-style to avoid wiping a pointer that a concurrent +// writer just promoted between the caller's snapshot and this function: +// +// 1. Re-scan .versions for any version-id-tagged entry. If one now +// exists, abort - either the concurrent writer already updated the +// pointer or the next read's self-heal will pick up the new entry. +// 2. Re-fetch the live .versions directory entry and require its +// latest-pointer fields to still match the stale id observed by the +// caller. If they have changed, abort. +// 3. Persist with mkFile using the live Extended map (with the two +// pointer fields and cached metadata removed) so any other Extended +// fields written concurrently between (2) and the persist are +// preserved. +// +// caller is the source-function name used in log lines so operators can +// trace which path ran the clear. +func (s3a *S3ApiServer) clearStaleLatestVersionPointer(bucket, object, bucketDir, versionsObjectPath string, versionsEntry *filer_pb.Entry, caller string) { + if versionsEntry == nil || versionsEntry.Extended == nil { + return + } + observedStaleId := string(versionsEntry.Extended[s3_constants.ExtLatestVersionIdKey]) + versionsDir := bucketDir + "/" + versionsObjectPath + + startFrom := "" + for { + entries, isLast, listErr := s3a.list(versionsDir, "", startFrom, false, filer.PaginationSize) + if listErr != nil { + glog.Warningf("%s: re-scan failed for %s/%s, leaving pointer untouched: %v", caller, bucket, object, listErr) + return + } + if pageEntry, _, _, _ := selectLatestVersion(entries); pageEntry != nil { + glog.V(1).Infof("%s: skipping pointer clear for %s/%s, concurrent writer added a tagged version", caller, bucket, object) + return + } + if isLast || len(entries) == 0 { + break + } + startFrom = entries[len(entries)-1].Name + } + + liveEntry, err := s3a.getEntry(bucketDir, versionsObjectPath) + if err != nil { + // Directory was concurrently removed - nothing to clear. + return + } + if liveEntry.Extended == nil { + return + } + currentIdBytes, hasId := liveEntry.Extended[s3_constants.ExtLatestVersionIdKey] + _, hasFile := liveEntry.Extended[s3_constants.ExtLatestVersionFileNameKey] + if !hasId && !hasFile { + return + } + if observedStaleId != "" && string(currentIdBytes) != observedStaleId { + glog.V(1).Infof("%s: skipping pointer clear for %s/%s, live pointer changed (observed=%s, current=%s)", caller, bucket, object, observedStaleId, string(currentIdBytes)) + return + } + + delete(liveEntry.Extended, s3_constants.ExtLatestVersionIdKey) + delete(liveEntry.Extended, s3_constants.ExtLatestVersionFileNameKey) + clearCachedVersionMetadata(liveEntry.Extended) + if mkErr := s3a.mkFile(bucketDir, versionsObjectPath, liveEntry.Chunks, func(updatedEntry *filer_pb.Entry) { + updatedEntry.Extended = liveEntry.Extended + updatedEntry.Attributes = liveEntry.Attributes + updatedEntry.Chunks = liveEntry.Chunks + }); mkErr != nil { + glog.Warningf("%s: failed to clear stale pointer for %s/%s: %v", caller, bucket, object, mkErr) + return + } + glog.V(1).Infof("%s: cleared stale latest-version pointer for %s/%s (orphan entries remain in .versions directory)", caller, bucket, object) +} + // ListObjectVersionsHandler handles the list object versions request // https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html func (s3a *S3ApiServer) ListObjectVersionsHandler(w http.ResponseWriter, r *http.Request) { @@ -1375,6 +1462,13 @@ func (s3a *S3ApiServer) healStaleLatestVersionPointer(bucket, normalizedObject s } if latestEntry == nil { + // Best-effort clear the stale latest-version pointer so subsequent + // reads short-circuit to ErrNotFound directly instead of replaying + // getLatestObjectVersion's read-retry loop and re-entering self-heal + // on every request. Orphan entries (files in .versions/ that lack + // the version-id extended attribute) remain in place; from the S3 + // API perspective the object is correctly absent. + s3a.clearStaleLatestVersionPointer(bucket, normalizedObject, bucketDir, versionsObjectPath, versionsEntry, "healStaleLatestVersionPointer") // Wrap filer_pb.ErrNotFound so callers can distinguish genuine // object-absence (nothing left to promote) from scan failures // (I/O errors during list) via errors.Is.