fix(mount): close inodeLocks cleanup race that let two flock holders coexist (#9128)

* fix(mount): close inodeLocks cleanup race that allowed two flock holders PosixLockTable.getOrCreateInodeLocks released plt.mu before the caller acquired il.mu. A concurrent maybeCleanupInode could delete the map entry in that window; the first caller would then insert its lock into the orphaned inodeLocks while a later caller created a fresh entry in the map, so findConflict never observed the orphaned lock and two owners could simultaneously believe they held the same exclusive flock. This matches the flaky CI failure seen in TestPosixFileLocking/ConcurrentLockContention: Error: Should be empty, but was [worker N: flock overlap detected with 2 holders] Mark removed inodeLocks as dead under plt.mu+il.mu, and have SetLk / SetLkw recheck the flag after locking il.mu, refetching the live entry from the map when orphaned. Also delete the map entry only if it still points to this il, so a racing recreate is not clobbered. Adds TestConcurrentFlockChurnPreservesMutualExclusion: 16 goroutines x 500 flock/unflock iterations on one inode. Reliably reports 500+ overlaps per run before the fix; clean across 100 race-enabled runs after. * fix(mount): extend dead-flag contract to GetLk and self-heal primitives Address review feedback on the initial cleanup-race fix: 1. GetLk had the same stale-pointer bug as SetLk. A caller could grab an inodeLocks pointer, have cleanup orphan it and a replacement il receive a conflicting lock, then answer F_UNLCK off the empty dead pointer. Add the same dead recheck + refetch loop. 2. getOrCreateInodeLocks and getInodeLocks now treat a dead map entry as defective: the former replaces it with a fresh inodeLocks, the latter drops it and returns nil. Production cannot reach that state (maybeCleanupInode atomically deletes under plt.mu when it sets dead), but the hardening guarantees the SetLk / SetLkw / GetLk retry loops always make progress even if a future refactor reorders those operations, and it lets the white-box tests set up a stale dead entry without spinning. 3. Strengthen the regression suite: - TestSetLkRetriesPastDeadInodeLocks: deterministic white-box test that installs a dead il in the map and asserts SetLk routes the new lock into a fresh il (not the orphan), that GetLk reports the resulting conflict, and that a different-owner acquire is rejected with EAGAIN. - TestGetInodeLocksEvictsDeadEntry: verifies both map-read primitives drop or replace dead entries. - TestConcurrentFlockChurnPreservesMutualExclusion: replace the timing-fragile Add(1)-and-check counter with a Swap+CAS detector. Each worker claims a slot after SetLk OK and releases it before UN, flagging both an observed predecessor and a lost CAS on release. Against a reverted fix the detector fires 1000+ times per run; with the fix clean across 100 race-enabled iterations. * test(mount): fail fast on unexpected SetLk statuses in churn loop The stress test blindly spun on any non-OK SetLk status and discarded the unlock return. If SetLk ever returns something other than OK or EAGAIN (e.g. after a future refactor introduces a new error), the acquire loop would spin forever and an unlock failure would be silently swallowed. Capture the acquire status, retry only on the expected EAGAIN, and assert unlock returns OK. Use t.Errorf + return (not t.Fatalf) because the checks run on worker goroutines where FailNow is unsafe. The Swap+CAS overlap detector is unchanged.
2026-06-13 23:36:45 +03:00 · 2026-04-17 23:12:26 -07:00
parent c1ccbe97dd
commit 1c130c2d47
2 changed files with 295 additions and 27 deletions
@@ -27,6 +27,12 @@ type inodeLocks struct {
 	locks    []lockRange   // currently held locks, sorted by Start
 	waiters  []*lockWaiter // blocked SetLkw callers
 	wakeRefs int           // woken waiters still retrying on this inodeLocks
+	// dead marks an inodeLocks that has been removed from the table's map.
+	// A caller holding a pointer returned by getInodeLocks/getOrCreateInodeLocks
+	// must recheck this flag after acquiring il.mu: if true, the pointer is
+	// orphaned and must be refreshed from the table so findConflict runs
+	// against the live state.
+	dead bool
 }

 // lockWaiter represents a blocked SetLkw caller.
@@ -48,12 +54,17 @@ func NewPosixLockTable() *PosixLockTable {
 	}
 }

-// getOrCreateInodeLocks returns the lock state for an inode, creating it if needed.
+// getOrCreateInodeLocks returns the lock state for an inode, creating it if
+// needed. A dead entry (which maybeCleanupInode normally evicts together
+// with setting dead) is replaced transparently so callers never observe one
+// via this path and the SetLk retry loop cannot spin on a stale map entry.
+// The il.dead read is safe without il.mu: dead transitions only under both
+// plt.mu and il.mu, so holding plt.mu here is sufficient happens-before.
 func (plt *PosixLockTable) getOrCreateInodeLocks(inode uint64) *inodeLocks {
 	plt.mu.Lock()
 	defer plt.mu.Unlock()
 	il, ok := plt.inodes[inode]
-	if !ok {
+	if !ok || il.dead {
 		il = &inodeLocks{}
 		plt.inodes[inode] = il
 	}
@@ -61,14 +72,29 @@ func (plt *PosixLockTable) getOrCreateInodeLocks(inode uint64) *inodeLocks {
 }

 // getInodeLocks returns the lock state for an inode, or nil if none exists.
+// A dead entry (which maybeCleanupInode normally evicts together with setting
+// dead) is dropped so callers never observe an orphaned inodeLocks via this
+// path. The il.dead read is safe without il.mu for the same reason as in
+// getOrCreateInodeLocks: transitions happen under both plt.mu and il.mu.
 func (plt *PosixLockTable) getInodeLocks(inode uint64) *inodeLocks {
 	plt.mu.Lock()
 	defer plt.mu.Unlock()
-	return plt.inodes[inode]
+	il, ok := plt.inodes[inode]
+	if !ok {
+		return nil
+	}
+	if il.dead {
+		delete(plt.inodes, inode)
+		return nil
+	}
+	return il
 }

 // maybeCleanupInode removes the inodeLocks entry if it has no locks, no waiters,
-// and no woken waiters still retrying against this inodeLocks.
+// and no woken waiters still retrying against this inodeLocks. The removed
+// inodeLocks is marked dead so any caller that is mid-way through acquiring
+// il.mu with a stale pointer will notice and refetch from the map instead of
+// mutating an orphaned instance (which would be invisible to future callers).
 func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
 	// Caller must NOT hold il.mu. We acquire both locks in the correct order.
 	plt.mu.Lock()
@@ -76,7 +102,10 @@ func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
 	il.mu.Lock()
 	defer il.mu.Unlock()
 	if len(il.locks) == 0 && len(il.waiters) == 0 && il.wakeRefs == 0 {
-		delete(plt.inodes, inode)
+		if plt.inodes[inode] == il {
+			delete(plt.inodes, inode)
+		}
+		il.dead = true
 	}
 }

@@ -247,22 +276,32 @@ func removeWaiter(il *inodeLocks, w *lockWaiter) {
 // GetLk checks for a conflicting lock. If found, it populates out with the
 // conflict details. If no conflict, out.Typ is set to F_UNLCK.
 func (plt *PosixLockTable) GetLk(inode uint64, proposed lockRange, out *fuse.LkOut) {
-	il := plt.getInodeLocks(inode)
-	if il == nil {
-		out.Lk.Typ = syscall.F_UNLCK
-		return
-	}
-	il.mu.Lock()
-	conflict, found := findConflict(il.locks, proposed)
-	il.mu.Unlock()
+	for {
+		il := plt.getInodeLocks(inode)
+		if il == nil {
+			out.Lk.Typ = syscall.F_UNLCK
+			return
+		}
+		il.mu.Lock()
+		if il.dead {
+			// Orphaned by a concurrent cleanup: the map may already hold a
+			// fresh inodeLocks with different state, so answering from this
+			// stale pointer would miss real conflicts. Refetch.
+			il.mu.Unlock()
+			continue
+		}
+		conflict, found := findConflict(il.locks, proposed)
+		il.mu.Unlock()

-	if found {
-		out.Lk.Start = conflict.Start
-		out.Lk.End = conflict.End
-		out.Lk.Typ = conflict.Typ
-		out.Lk.Pid = conflict.Pid
-	} else {
-		out.Lk.Typ = syscall.F_UNLCK
+		if found {
+			out.Lk.Start = conflict.Start
+			out.Lk.End = conflict.End
+			out.Lk.Typ = conflict.Typ
+			out.Lk.Pid = conflict.Pid
+		} else {
+			out.Lk.Typ = syscall.F_UNLCK
+		}
+		return
 	}
 }

@@ -276,6 +315,13 @@ func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
 			return fuse.OK
 		}
 		il.mu.Lock()
+		if il.dead {
+			// Orphaned by a concurrent cleanup: any lock we may have held
+			// is gone with it, and there is no point waking waiters on a
+			// detached inodeLocks.
+			il.mu.Unlock()
+			return fuse.OK
+		}
 		removeLocks(il, func(existing lockRange) bool {
 			return existing.Owner == lk.Owner && existing.IsFlock == lk.IsFlock
 		}, lk.Start, lk.End)
@@ -285,15 +331,21 @@ func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
 		return fuse.OK
 	}

-	il := plt.getOrCreateInodeLocks(inode)
-	il.mu.Lock()
-	if _, found := findConflict(il.locks, lk); found {
+	for {
+		il := plt.getOrCreateInodeLocks(inode)
+		il.mu.Lock()
+		if il.dead {
+			il.mu.Unlock()
+			continue
+		}
+		if _, found := findConflict(il.locks, lk); found {
+			il.mu.Unlock()
+			return fuse.EAGAIN
+		}
+		insertAndCoalesce(il, lk)
 		il.mu.Unlock()
-		return fuse.EAGAIN
+		return fuse.OK
 	}
-	insertAndCoalesce(il, lk)
-	il.mu.Unlock()
-	return fuse.OK
 }

 // SetLkw attempts a blocking lock. It waits until the lock can be acquired
@@ -307,6 +359,15 @@ func (plt *PosixLockTable) SetLkw(inode uint64, lk lockRange, cancel <-chan stru
 	var waiter *lockWaiter
 	for {
 		il.mu.Lock()
+		if il.dead {
+			// The inodeLocks we were holding was orphaned by a concurrent
+			// cleanup. Refetch the live instance from the table and retry.
+			// A dead il never has waiters or wakeRefs (the cleanup condition
+			// requires both to be zero), so waiter is guaranteed nil here.
+			il.mu.Unlock()
+			il = plt.getOrCreateInodeLocks(inode)
+			continue
+		}
 		releaseWakeRef(il, waiter)
 		if _, found := findConflict(il.locks, lk); !found {
 			insertAndCoalesce(il, lk)
@@ -2,6 +2,9 @@ package mount

 import (
 	"math"
+	"runtime"
+	"sync"
+	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
@@ -615,3 +618,207 @@ func TestAdjacencyNoOverflowAtMaxUint64(t *testing.T) {
 		t.Fatalf("expected 2 separate locks (no overflow merge), got %d", ownerLocks)
 	}
 }
+
+// TestSetLkRetriesPastDeadInodeLocks deterministically exercises the
+// getOrCreateInodeLocks vs maybeCleanupInode race: a caller holding a
+// pointer to an inodeLocks that is concurrently marked dead must refetch
+// from the map instead of mutating the orphaned instance (which would be
+// invisible to subsequent callers and let two exclusive flock holders
+// coexist). The test bypasses scheduling by hand-installing a dead il into
+// the table and asserting that the next SetLk routes around it.
+func TestSetLkRetriesPastDeadInodeLocks(t *testing.T) {
+	plt := NewPosixLockTable()
+	inode := uint64(42)
+
+	// Acquire and release a lock so maybeCleanupInode marks the il dead and
+	// removes it from the map.
+	lock := lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 1, IsFlock: true}
+	if s := plt.SetLk(inode, lock); s != fuse.OK {
+		t.Fatalf("prime SetLk: got %v", s)
+	}
+	dead := plt.getInodeLocks(inode)
+	unlock := lock
+	unlock.Typ = syscall.F_UNLCK
+	if s := plt.SetLk(inode, unlock); s != fuse.OK {
+		t.Fatalf("prime unlock: got %v", s)
+	}
+	if !dead.dead {
+		t.Fatal("expected il to be marked dead after unlock+cleanup")
+	}
+	plt.mu.Lock()
+	_, stillMapped := plt.inodes[inode]
+	plt.mu.Unlock()
+	if stillMapped {
+		t.Fatal("expected the dead il to be removed from the map")
+	}
+
+	// Simulate the race: the next caller's getOrCreateInodeLocks races with
+	// the cleanup and ends up holding a pointer to the dead il. We force that
+	// state by re-publishing `dead` into the map.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+
+	// SetLk must notice dead, refetch, and install the new lock in a fresh il.
+	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 2, IsFlock: true}); s != fuse.OK {
+		t.Fatalf("SetLk after dead: got %v", s)
+	}
+
+	dead.mu.Lock()
+	if n := len(dead.locks); n != 0 {
+		t.Fatalf("dead il should not have accepted the insert, found %d locks", n)
+	}
+	dead.mu.Unlock()
+
+	plt.mu.Lock()
+	live := plt.inodes[inode]
+	plt.mu.Unlock()
+	if live == nil || live == dead {
+		t.Fatalf("expected a fresh live il, got %v", live)
+	}
+
+	// A conflicting owner must see the new lock and be rejected.
+	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 3, IsFlock: true}); s != fuse.EAGAIN {
+		t.Fatalf("second owner should conflict with owner 2, got %v", s)
+	}
+
+	// GetLk must report the conflict as well: without the dead-recheck the
+	// GetLk path would answer F_UNLCK off the orphaned il.
+	var out fuse.LkOut
+	plt.GetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 4, IsFlock: true}, &out)
+	if out.Lk.Typ != syscall.F_WRLCK {
+		t.Fatalf("GetLk should report the live conflict, got Typ=%d", out.Lk.Typ)
+	}
+}
+
+// TestGetInodeLocksEvictsDeadEntry verifies that a dead inodeLocks which
+// somehow ends up in the map (e.g. through a future refactor that reorders
+// delete and dead=true) is dropped on read so callers never observe one.
+// This is the backstop that lets GetLk's and SetLk's retry loops terminate.
+func TestGetInodeLocksEvictsDeadEntry(t *testing.T) {
+	plt := NewPosixLockTable()
+	inode := uint64(42)
+
+	lock := lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 1, IsFlock: true}
+	if s := plt.SetLk(inode, lock); s != fuse.OK {
+		t.Fatalf("prime SetLk: got %v", s)
+	}
+	dead := plt.getInodeLocks(inode)
+	unlock := lock
+	unlock.Typ = syscall.F_UNLCK
+	if s := plt.SetLk(inode, unlock); s != fuse.OK {
+		t.Fatalf("prime unlock: got %v", s)
+	}
+	if !dead.dead {
+		t.Fatal("expected dead after cleanup")
+	}
+
+	// Force the broken state that production cannot reach but tests and
+	// future refactors might: dead entry still in the map.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+
+	if il := plt.getInodeLocks(inode); il != nil {
+		t.Fatalf("getInodeLocks should drop a dead map entry, got %p", il)
+	}
+	plt.mu.Lock()
+	_, stillMapped := plt.inodes[inode]
+	plt.mu.Unlock()
+	if stillMapped {
+		t.Fatal("expected dead entry to be removed from the map")
+	}
+
+	// getOrCreateInodeLocks must also self-heal (replace the dead entry with
+	// a fresh live one) so SetLk's retry path cannot spin.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+	fresh := plt.getOrCreateInodeLocks(inode)
+	if fresh == dead {
+		t.Fatal("getOrCreateInodeLocks should not return a dead entry")
+	}
+	if fresh.dead {
+		t.Fatal("fresh entry should not be dead")
+	}
+}
+
+// TestConcurrentFlockChurnPreservesMutualExclusion is a stress companion to
+// the deterministic tests above. It uses a Swap+CAS detector that flags
+// overlap at two points (on acquire and on release), so a second granted
+// holder is caught even if it sneaks in after the first goroutine's claim
+// but before its release. 16 goroutines churn whole-file exclusive flock on
+// one inode; with the race the detector fires hundreds of times per run,
+// with the fix it stays at zero.
+func TestConcurrentFlockChurnPreservesMutualExclusion(t *testing.T) {
+	plt := NewPosixLockTable()
+	const (
+		inode      = uint64(42)
+		numWorkers = 16
+		iterations = 500
+	)
+	var (
+		wg          sync.WaitGroup
+		holder      atomic.Int64 // 0 = nobody; otherwise = holder's claim token
+		overlapSeen atomic.Int32
+	)
+
+	for w := 0; w < numWorkers; w++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			owner := uint64(100 + id)
+			lock := lockRange{
+				Start:   0,
+				End:     math.MaxUint64,
+				Typ:     syscall.F_WRLCK,
+				Owner:   owner,
+				Pid:     uint32(id + 1),
+				IsFlock: true,
+			}
+			unlock := lock
+			unlock.Typ = syscall.F_UNLCK
+			token := int64(id + 1)
+			for i := 0; i < iterations; i++ {
+				// SetLk(WRLCK) may only return OK (granted) or EAGAIN
+				// (conflict); anything else indicates a bug and the test
+				// must fail rather than spin. Use Errorf + return because
+				// Fatalf is not safe from a non-test goroutine.
+				for {
+					s := plt.SetLk(inode, lock)
+					if s == fuse.OK {
+						break
+					}
+					if s != fuse.EAGAIN {
+						t.Errorf("worker %d iter %d: unexpected SetLk(WRLCK) status %v", id, i, s)
+						return
+					}
+					runtime.Gosched()
+				}
+				// Claim the slot. If Swap observes a non-zero predecessor,
+				// another goroutine already believes it holds the lock.
+				if prev := holder.Swap(token); prev != 0 {
+					overlapSeen.Add(1)
+				}
+				// Widen the window so a concurrently-granted peer has a
+				// chance to race into its own Swap before we release.
+				runtime.Gosched()
+				runtime.Gosched()
+				// Release the slot. If CAS fails someone else overwrote our
+				// claim, which only happens when two holders raced.
+				if !holder.CompareAndSwap(token, 0) {
+					overlapSeen.Add(1)
+				}
+				if s := plt.SetLk(inode, unlock); s != fuse.OK {
+					t.Errorf("worker %d iter %d: unexpected SetLk(UNLCK) status %v", id, i, s)
+					return
+				}
+			}
+		}(w)
+	}
+	wg.Wait()
+
+	if n := overlapSeen.Load(); n != 0 {
+		t.Fatalf("flock overlap detected %d times: two owners simultaneously granted the same exclusive lock", n)
+	}
+}