From 1c130c2d476c590a5eaa2001a885ebf77b315213 Mon Sep 17 00:00:00 2001
From: Chris Lu <chrislusf@users.noreply.github.com>
Date: Fri, 17 Apr 2026 23:12:26 -0700
Subject: [PATCH] fix(mount): close inodeLocks cleanup race that let two flock
 holders coexist (#9128)

* fix(mount): close inodeLocks cleanup race that allowed two flock holders

PosixLockTable.getOrCreateInodeLocks released plt.mu before the caller
acquired il.mu. A concurrent maybeCleanupInode could delete the map
entry in that window; the first caller would then insert its lock into
the orphaned inodeLocks while a later caller created a fresh entry in
the map, so findConflict never observed the orphaned lock and two
owners could simultaneously believe they held the same exclusive flock.

This matches the flaky CI failure seen in
TestPosixFileLocking/ConcurrentLockContention:

    Error: Should be empty, but was [worker N: flock overlap detected with 2 holders]

Mark removed inodeLocks as dead under plt.mu+il.mu, and have SetLk /
SetLkw recheck the flag after locking il.mu, refetching the live entry
from the map when orphaned. Also delete the map entry only if it still
points to this il, so a racing recreate is not clobbered.

Adds TestConcurrentFlockChurnPreservesMutualExclusion: 16 goroutines x
500 flock/unflock iterations on one inode. Reliably reports 500+
overlaps per run before the fix; clean across 100 race-enabled runs
after.

* fix(mount): extend dead-flag contract to GetLk and self-heal primitives

Address review feedback on the initial cleanup-race fix:

1. GetLk had the same stale-pointer bug as SetLk. A caller could grab
   an inodeLocks pointer, have cleanup orphan it and a replacement il
   receive a conflicting lock, then answer F_UNLCK off the empty dead
   pointer. Add the same dead recheck + refetch loop.

2. getOrCreateInodeLocks and getInodeLocks now treat a dead map entry
   as defective: the former replaces it with a fresh inodeLocks, the
   latter drops it and returns nil. Production cannot reach that state
   (maybeCleanupInode atomically deletes under plt.mu when it sets
   dead), but the hardening guarantees the SetLk / SetLkw / GetLk
   retry loops always make progress even if a future refactor reorders
   those operations, and it lets the white-box tests set up a stale
   dead entry without spinning.

3. Strengthen the regression suite:
   - TestSetLkRetriesPastDeadInodeLocks: deterministic white-box test
     that installs a dead il in the map and asserts SetLk routes the
     new lock into a fresh il (not the orphan), that GetLk reports the
     resulting conflict, and that a different-owner acquire is rejected
     with EAGAIN.
   - TestGetInodeLocksEvictsDeadEntry: verifies both map-read primitives
     drop or replace dead entries.
   - TestConcurrentFlockChurnPreservesMutualExclusion: replace the
     timing-fragile Add(1)-and-check counter with a Swap+CAS detector.
     Each worker claims a slot after SetLk OK and releases it before
     UN, flagging both an observed predecessor and a lost CAS on
     release. Against a reverted fix the detector fires 1000+ times per
     run; with the fix clean across 100 race-enabled iterations.

* test(mount): fail fast on unexpected SetLk statuses in churn loop

The stress test blindly spun on any non-OK SetLk status and discarded
the unlock return. If SetLk ever returns something other than OK or
EAGAIN (e.g. after a future refactor introduces a new error), the
acquire loop would spin forever and an unlock failure would be
silently swallowed.

Capture the acquire status, retry only on the expected EAGAIN, and
assert unlock returns OK. Use t.Errorf + return (not t.Fatalf) because
the checks run on worker goroutines where FailNow is unsafe. The
Swap+CAS overlap detector is unchanged.
---
 weed/mount/posix_file_lock.go      | 115 ++++++++++++----
 weed/mount/posix_file_lock_test.go | 207 +++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+), 27 deletions(-)

diff --git a/weed/mount/posix_file_lock.go b/weed/mount/posix_file_lock.go
index c277e5f88..075b97e12 100644
--- a/weed/mount/posix_file_lock.go
+++ b/weed/mount/posix_file_lock.go
@@ -27,6 +27,12 @@ type inodeLocks struct {
 	locks    []lockRange   // currently held locks, sorted by Start
 	waiters  []*lockWaiter // blocked SetLkw callers
 	wakeRefs int           // woken waiters still retrying on this inodeLocks
+	// dead marks an inodeLocks that has been removed from the table's map.
+	// A caller holding a pointer returned by getInodeLocks/getOrCreateInodeLocks
+	// must recheck this flag after acquiring il.mu: if true, the pointer is
+	// orphaned and must be refreshed from the table so findConflict runs
+	// against the live state.
+	dead bool
 }
 
 // lockWaiter represents a blocked SetLkw caller.
@@ -48,12 +54,17 @@ func NewPosixLockTable() *PosixLockTable {
 	}
 }
 
-// getOrCreateInodeLocks returns the lock state for an inode, creating it if needed.
+// getOrCreateInodeLocks returns the lock state for an inode, creating it if
+// needed. A dead entry (which maybeCleanupInode normally evicts together
+// with setting dead) is replaced transparently so callers never observe one
+// via this path and the SetLk retry loop cannot spin on a stale map entry.
+// The il.dead read is safe without il.mu: dead transitions only under both
+// plt.mu and il.mu, so holding plt.mu here is sufficient happens-before.
 func (plt *PosixLockTable) getOrCreateInodeLocks(inode uint64) *inodeLocks {
 	plt.mu.Lock()
 	defer plt.mu.Unlock()
 	il, ok := plt.inodes[inode]
-	if !ok {
+	if !ok || il.dead {
 		il = &inodeLocks{}
 		plt.inodes[inode] = il
 	}
@@ -61,14 +72,29 @@ func (plt *PosixLockTable) getOrCreateInodeLocks(inode uint64) *inodeLocks {
 }
 
 // getInodeLocks returns the lock state for an inode, or nil if none exists.
+// A dead entry (which maybeCleanupInode normally evicts together with setting
+// dead) is dropped so callers never observe an orphaned inodeLocks via this
+// path. The il.dead read is safe without il.mu for the same reason as in
+// getOrCreateInodeLocks: transitions happen under both plt.mu and il.mu.
 func (plt *PosixLockTable) getInodeLocks(inode uint64) *inodeLocks {
 	plt.mu.Lock()
 	defer plt.mu.Unlock()
-	return plt.inodes[inode]
+	il, ok := plt.inodes[inode]
+	if !ok {
+		return nil
+	}
+	if il.dead {
+		delete(plt.inodes, inode)
+		return nil
+	}
+	return il
 }
 
 // maybeCleanupInode removes the inodeLocks entry if it has no locks, no waiters,
-// and no woken waiters still retrying against this inodeLocks.
+// and no woken waiters still retrying against this inodeLocks. The removed
+// inodeLocks is marked dead so any caller that is mid-way through acquiring
+// il.mu with a stale pointer will notice and refetch from the map instead of
+// mutating an orphaned instance (which would be invisible to future callers).
 func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
 	// Caller must NOT hold il.mu. We acquire both locks in the correct order.
 	plt.mu.Lock()
@@ -76,7 +102,10 @@ func (plt *PosixLockTable) maybeCleanupInode(inode uint64, il *inodeLocks) {
 	il.mu.Lock()
 	defer il.mu.Unlock()
 	if len(il.locks) == 0 && len(il.waiters) == 0 && il.wakeRefs == 0 {
-		delete(plt.inodes, inode)
+		if plt.inodes[inode] == il {
+			delete(plt.inodes, inode)
+		}
+		il.dead = true
 	}
 }
 
@@ -247,22 +276,32 @@ func removeWaiter(il *inodeLocks, w *lockWaiter) {
 // GetLk checks for a conflicting lock. If found, it populates out with the
 // conflict details. If no conflict, out.Typ is set to F_UNLCK.
 func (plt *PosixLockTable) GetLk(inode uint64, proposed lockRange, out *fuse.LkOut) {
-	il := plt.getInodeLocks(inode)
-	if il == nil {
-		out.Lk.Typ = syscall.F_UNLCK
-		return
-	}
-	il.mu.Lock()
-	conflict, found := findConflict(il.locks, proposed)
-	il.mu.Unlock()
+	for {
+		il := plt.getInodeLocks(inode)
+		if il == nil {
+			out.Lk.Typ = syscall.F_UNLCK
+			return
+		}
+		il.mu.Lock()
+		if il.dead {
+			// Orphaned by a concurrent cleanup: the map may already hold a
+			// fresh inodeLocks with different state, so answering from this
+			// stale pointer would miss real conflicts. Refetch.
+			il.mu.Unlock()
+			continue
+		}
+		conflict, found := findConflict(il.locks, proposed)
+		il.mu.Unlock()
 
-	if found {
-		out.Lk.Start = conflict.Start
-		out.Lk.End = conflict.End
-		out.Lk.Typ = conflict.Typ
-		out.Lk.Pid = conflict.Pid
-	} else {
-		out.Lk.Typ = syscall.F_UNLCK
+		if found {
+			out.Lk.Start = conflict.Start
+			out.Lk.End = conflict.End
+			out.Lk.Typ = conflict.Typ
+			out.Lk.Pid = conflict.Pid
+		} else {
+			out.Lk.Typ = syscall.F_UNLCK
+		}
+		return
 	}
 }
 
@@ -276,6 +315,13 @@ func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
 			return fuse.OK
 		}
 		il.mu.Lock()
+		if il.dead {
+			// Orphaned by a concurrent cleanup: any lock we may have held
+			// is gone with it, and there is no point waking waiters on a
+			// detached inodeLocks.
+			il.mu.Unlock()
+			return fuse.OK
+		}
 		removeLocks(il, func(existing lockRange) bool {
 			return existing.Owner == lk.Owner && existing.IsFlock == lk.IsFlock
 		}, lk.Start, lk.End)
@@ -285,15 +331,21 @@ func (plt *PosixLockTable) SetLk(inode uint64, lk lockRange) fuse.Status {
 		return fuse.OK
 	}
 
-	il := plt.getOrCreateInodeLocks(inode)
-	il.mu.Lock()
-	if _, found := findConflict(il.locks, lk); found {
+	for {
+		il := plt.getOrCreateInodeLocks(inode)
+		il.mu.Lock()
+		if il.dead {
+			il.mu.Unlock()
+			continue
+		}
+		if _, found := findConflict(il.locks, lk); found {
+			il.mu.Unlock()
+			return fuse.EAGAIN
+		}
+		insertAndCoalesce(il, lk)
 		il.mu.Unlock()
-		return fuse.EAGAIN
+		return fuse.OK
 	}
-	insertAndCoalesce(il, lk)
-	il.mu.Unlock()
-	return fuse.OK
 }
 
 // SetLkw attempts a blocking lock. It waits until the lock can be acquired
@@ -307,6 +359,15 @@ func (plt *PosixLockTable) SetLkw(inode uint64, lk lockRange, cancel <-chan stru
 	var waiter *lockWaiter
 	for {
 		il.mu.Lock()
+		if il.dead {
+			// The inodeLocks we were holding was orphaned by a concurrent
+			// cleanup. Refetch the live instance from the table and retry.
+			// A dead il never has waiters or wakeRefs (the cleanup condition
+			// requires both to be zero), so waiter is guaranteed nil here.
+			il.mu.Unlock()
+			il = plt.getOrCreateInodeLocks(inode)
+			continue
+		}
 		releaseWakeRef(il, waiter)
 		if _, found := findConflict(il.locks, lk); !found {
 			insertAndCoalesce(il, lk)
diff --git a/weed/mount/posix_file_lock_test.go b/weed/mount/posix_file_lock_test.go
index 61d9a158f..37a4633dc 100644
--- a/weed/mount/posix_file_lock_test.go
+++ b/weed/mount/posix_file_lock_test.go
@@ -2,6 +2,9 @@ package mount
 
 import (
 	"math"
+	"runtime"
+	"sync"
+	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
@@ -615,3 +618,207 @@ func TestAdjacencyNoOverflowAtMaxUint64(t *testing.T) {
 		t.Fatalf("expected 2 separate locks (no overflow merge), got %d", ownerLocks)
 	}
 }
+
+// TestSetLkRetriesPastDeadInodeLocks deterministically exercises the
+// getOrCreateInodeLocks vs maybeCleanupInode race: a caller holding a
+// pointer to an inodeLocks that is concurrently marked dead must refetch
+// from the map instead of mutating the orphaned instance (which would be
+// invisible to subsequent callers and let two exclusive flock holders
+// coexist). The test bypasses scheduling by hand-installing a dead il into
+// the table and asserting that the next SetLk routes around it.
+func TestSetLkRetriesPastDeadInodeLocks(t *testing.T) {
+	plt := NewPosixLockTable()
+	inode := uint64(42)
+
+	// Acquire and release a lock so maybeCleanupInode marks the il dead and
+	// removes it from the map.
+	lock := lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 1, IsFlock: true}
+	if s := plt.SetLk(inode, lock); s != fuse.OK {
+		t.Fatalf("prime SetLk: got %v", s)
+	}
+	dead := plt.getInodeLocks(inode)
+	unlock := lock
+	unlock.Typ = syscall.F_UNLCK
+	if s := plt.SetLk(inode, unlock); s != fuse.OK {
+		t.Fatalf("prime unlock: got %v", s)
+	}
+	if !dead.dead {
+		t.Fatal("expected il to be marked dead after unlock+cleanup")
+	}
+	plt.mu.Lock()
+	_, stillMapped := plt.inodes[inode]
+	plt.mu.Unlock()
+	if stillMapped {
+		t.Fatal("expected the dead il to be removed from the map")
+	}
+
+	// Simulate the race: the next caller's getOrCreateInodeLocks races with
+	// the cleanup and ends up holding a pointer to the dead il. We force that
+	// state by re-publishing `dead` into the map.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+
+	// SetLk must notice dead, refetch, and install the new lock in a fresh il.
+	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 2, IsFlock: true}); s != fuse.OK {
+		t.Fatalf("SetLk after dead: got %v", s)
+	}
+
+	dead.mu.Lock()
+	if n := len(dead.locks); n != 0 {
+		t.Fatalf("dead il should not have accepted the insert, found %d locks", n)
+	}
+	dead.mu.Unlock()
+
+	plt.mu.Lock()
+	live := plt.inodes[inode]
+	plt.mu.Unlock()
+	if live == nil || live == dead {
+		t.Fatalf("expected a fresh live il, got %v", live)
+	}
+
+	// A conflicting owner must see the new lock and be rejected.
+	if s := plt.SetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 3, IsFlock: true}); s != fuse.EAGAIN {
+		t.Fatalf("second owner should conflict with owner 2, got %v", s)
+	}
+
+	// GetLk must report the conflict as well: without the dead-recheck the
+	// GetLk path would answer F_UNLCK off the orphaned il.
+	var out fuse.LkOut
+	plt.GetLk(inode, lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 4, IsFlock: true}, &out)
+	if out.Lk.Typ != syscall.F_WRLCK {
+		t.Fatalf("GetLk should report the live conflict, got Typ=%d", out.Lk.Typ)
+	}
+}
+
+// TestGetInodeLocksEvictsDeadEntry verifies that a dead inodeLocks which
+// somehow ends up in the map (e.g. through a future refactor that reorders
+// delete and dead=true) is dropped on read so callers never observe one.
+// This is the backstop that lets GetLk's and SetLk's retry loops terminate.
+func TestGetInodeLocksEvictsDeadEntry(t *testing.T) {
+	plt := NewPosixLockTable()
+	inode := uint64(42)
+
+	lock := lockRange{Start: 0, End: math.MaxUint64, Typ: syscall.F_WRLCK, Owner: 1, IsFlock: true}
+	if s := plt.SetLk(inode, lock); s != fuse.OK {
+		t.Fatalf("prime SetLk: got %v", s)
+	}
+	dead := plt.getInodeLocks(inode)
+	unlock := lock
+	unlock.Typ = syscall.F_UNLCK
+	if s := plt.SetLk(inode, unlock); s != fuse.OK {
+		t.Fatalf("prime unlock: got %v", s)
+	}
+	if !dead.dead {
+		t.Fatal("expected dead after cleanup")
+	}
+
+	// Force the broken state that production cannot reach but tests and
+	// future refactors might: dead entry still in the map.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+
+	if il := plt.getInodeLocks(inode); il != nil {
+		t.Fatalf("getInodeLocks should drop a dead map entry, got %p", il)
+	}
+	plt.mu.Lock()
+	_, stillMapped := plt.inodes[inode]
+	plt.mu.Unlock()
+	if stillMapped {
+		t.Fatal("expected dead entry to be removed from the map")
+	}
+
+	// getOrCreateInodeLocks must also self-heal (replace the dead entry with
+	// a fresh live one) so SetLk's retry path cannot spin.
+	plt.mu.Lock()
+	plt.inodes[inode] = dead
+	plt.mu.Unlock()
+	fresh := plt.getOrCreateInodeLocks(inode)
+	if fresh == dead {
+		t.Fatal("getOrCreateInodeLocks should not return a dead entry")
+	}
+	if fresh.dead {
+		t.Fatal("fresh entry should not be dead")
+	}
+}
+
+// TestConcurrentFlockChurnPreservesMutualExclusion is a stress companion to
+// the deterministic tests above. It uses a Swap+CAS detector that flags
+// overlap at two points (on acquire and on release), so a second granted
+// holder is caught even if it sneaks in after the first goroutine's claim
+// but before its release. 16 goroutines churn whole-file exclusive flock on
+// one inode; with the race the detector fires hundreds of times per run,
+// with the fix it stays at zero.
+func TestConcurrentFlockChurnPreservesMutualExclusion(t *testing.T) {
+	plt := NewPosixLockTable()
+	const (
+		inode      = uint64(42)
+		numWorkers = 16
+		iterations = 500
+	)
+	var (
+		wg          sync.WaitGroup
+		holder      atomic.Int64 // 0 = nobody; otherwise = holder's claim token
+		overlapSeen atomic.Int32
+	)
+
+	for w := 0; w < numWorkers; w++ {
+		wg.Add(1)
+		go func(id int) {
+			defer wg.Done()
+			owner := uint64(100 + id)
+			lock := lockRange{
+				Start:   0,
+				End:     math.MaxUint64,
+				Typ:     syscall.F_WRLCK,
+				Owner:   owner,
+				Pid:     uint32(id + 1),
+				IsFlock: true,
+			}
+			unlock := lock
+			unlock.Typ = syscall.F_UNLCK
+			token := int64(id + 1)
+			for i := 0; i < iterations; i++ {
+				// SetLk(WRLCK) may only return OK (granted) or EAGAIN
+				// (conflict); anything else indicates a bug and the test
+				// must fail rather than spin. Use Errorf + return because
+				// Fatalf is not safe from a non-test goroutine.
+				for {
+					s := plt.SetLk(inode, lock)
+					if s == fuse.OK {
+						break
+					}
+					if s != fuse.EAGAIN {
+						t.Errorf("worker %d iter %d: unexpected SetLk(WRLCK) status %v", id, i, s)
+						return
+					}
+					runtime.Gosched()
+				}
+				// Claim the slot. If Swap observes a non-zero predecessor,
+				// another goroutine already believes it holds the lock.
+				if prev := holder.Swap(token); prev != 0 {
+					overlapSeen.Add(1)
+				}
+				// Widen the window so a concurrently-granted peer has a
+				// chance to race into its own Swap before we release.
+				runtime.Gosched()
+				runtime.Gosched()
+				// Release the slot. If CAS fails someone else overwrote our
+				// claim, which only happens when two holders raced.
+				if !holder.CompareAndSwap(token, 0) {
+					overlapSeen.Add(1)
+				}
+				if s := plt.SetLk(inode, unlock); s != fuse.OK {
+					t.Errorf("worker %d iter %d: unexpected SetLk(UNLCK) status %v", id, i, s)
+					return
+				}
+			}
+		}(w)
+	}
+	wg.Wait()
+
+	if n := overlapSeen.Load(); n != 0 {
+		t.Fatalf("flock overlap detected %d times: two owners simultaneously granted the same exclusive lock", n)
+	}
+}