filer: warm-up + fail-closed cooling for POSIX locks on owner (re)start (#9673)

After a (re)start the owner defers would-be grants for posixLockWarmup
while mounts re-assert, trusting only locally-visible conflicts, so it
does not double-grant from empty state; a deferred grant is a retry for
SetLkw and EAGAIN for non-blocking SetLk, never a spurious grant. Cooling
now fail-closes: if the previous owner is unreachable during a ring
change, defer rather than risk a double-grant. readyAt is atomic so the
handler reads it without locking.
This commit is contained in:
Chris Lu
2026-05-25 13:14:05 -07:00
committed by GitHub
parent a3c0baa9b0
commit 85ca3cb757
3 changed files with 85 additions and 17 deletions
+52 -17
View File
@@ -20,12 +20,18 @@ const (
// posixCoolingProbeTimeout bounds the dual-read probe to the prior owner so a
// slow peer can't stall a non-blocking lock call during the cooling window.
posixCoolingProbeTimeout = 2 * time.Second
// posixLockWarmup is how long after a (re)start the owner defers would-be
// grants while mounts re-assert their held locks, so it does not double-grant
// a lock its fresh, still-empty state has not yet rebuilt. Must exceed the
// mount keepalive interval and stay below the TTL.
posixLockWarmup = 10 * time.Second
)
// startPosixLockSweeper periodically reaps the locks of leased sessions (mounts)
// that stopped sending keepalives. Sessions that never renew are never reaped, so
// this is inert until mounts run with -posixLock.
func (fs *FilerServer) startPosixLockSweeper() {
fs.posixLockReadyAt.Store(time.Now().UnixNano())
fs.posixLockSweeperStop = make(chan struct{})
go func() {
ticker := time.NewTicker(posixLockSweepInterval)
@@ -93,13 +99,18 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq
resp := &filer_pb.PosixLockResponse{}
switch req.Op {
case filer_pb.PosixLockOp_TRY_LOCK:
// During a ring change, a previous owner may still hold a conflicting lock
// this fresh owner has not rebuilt yet; consult it before granting.
// A fresh owner's state may be incomplete (post-restart warm-up, or a ring
// change whose previous owner can't be reached): report a known conflict,
// else defer the grant so the client retries rather than risk a
// double-grant. A deferred grant becomes EAGAIN for non-blocking SetLk and
// a retry for the blocking SetLkw poll — never a spurious grant.
if !req.CoolingProbe {
if c, found := fs.coolingConflict(ctx, req.Key, lk); found {
if c, deferGrant := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil {
resp.HasConflict = true
resp.Conflict = c
break
} else if deferGrant {
break
}
}
if c, granted := fs.posixLocks.TryLock(req.Key, lk); granted {
@@ -111,8 +122,9 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq
case filer_pb.PosixLockOp_UNLOCK:
fs.posixLocks.Unlock(req.Key, lk)
case filer_pb.PosixLockOp_GET_LK:
// A query is best-effort: report a known conflict but never defer.
if !req.CoolingProbe {
if c, found := fs.coolingConflict(ctx, req.Key, lk); found {
if c, _ := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil {
resp.HasConflict = true
resp.Conflict = c
break
@@ -147,13 +159,37 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq
return resp, nil
}
// coolingConflict asks the key's previous owner, during a ring-change cooling
// window, whether it still holds a lock that blocks lk — so a fresh owner does
// not double-grant before re-assertion rebuilds its local state. Best-effort: if
// the previous owner is unreachable (e.g. it left, which caused the change, so
// its locks are gone) the caller proceeds. The probe is marked cooling_probe so
// the previous owner answers from local state without itself cooling off.
func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posixlock.Range) (*filer_pb.PosixLockRange, bool) {
// posixWarmingUp tells the caller whether the warm-up window is still open,
// i.e. whether less than posixLockWarmup has elapsed since the instant stored
// (as unix nanoseconds) in posixLockReadyAt. A stored value of 0 means no
// start instant was recorded — as in tests that never set it — and is always
// reported as "not warming up".
func (fs *FilerServer) posixWarmingUp() bool {
	startedNanos := fs.posixLockReadyAt.Load()
	if startedNanos == 0 {
		// Nothing recorded: treat the owner as already warmed up.
		return false
	}
	elapsed := time.Since(time.Unix(0, startedNanos))
	return elapsed < posixLockWarmup
}
// posixCoolingOrWarmup decides whether a fresh owner can trust its local state
// for key before granting. It returns a known blocking lock (conflict != nil),
// or asks the caller to defer the grant (deferGrant) when the state may be
// incomplete and no conflict can be confirmed. It returns (nil, false) when the
// owner is authoritative and should consult its local table.
//
// - Warm-up: just after a (re)start the owner is still rebuilding from
// re-assertions, so only a locally-visible conflict is trustworthy; a
// would-be grant is deferred.
// - Ring change (cooling window): the previous owner may still hold a lock this
// owner hasn't rebuilt. Ask it (marked cooling_probe so it answers locally
// without recursing), under a short deadline so a slow peer can't stall the
// non-blocking lock path. If it is unreachable — typically because it
// crashed, which caused the change — we cannot confirm, so we defer rather
// than risk a double-grant; re-assertion rebuilds this owner before the
// window ends.
func (fs *FilerServer) posixCoolingOrWarmup(ctx context.Context, key string, lk posixlock.Range) (conflict *filer_pb.PosixLockRange, deferGrant bool) {
if fs.posixWarmingUp() {
if c, found := fs.posixLocks.GetLk(key, lk); found {
return posixRangeToPb(c), false
}
return nil, true
}
if fs.filer.Dlm == nil {
return nil, false
}
@@ -161,9 +197,6 @@ func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posix
if prior == "" || prior == fs.option.Host {
return nil, false
}
// Bound the probe with its own short deadline: it runs on the non-blocking
// lock path, so a slow prior owner must not stall the caller. On timeout the
// probe is treated as unreachable (best-effort).
probeCtx, cancel := context.WithTimeout(ctx, posixCoolingProbeTimeout)
defer cancel()
var resp *filer_pb.PosixLockResponse
@@ -179,11 +212,13 @@ func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posix
return e
})
if err != nil {
glog.V(2).InfofCtx(ctx, "posix cooling probe %s -> %s: %v", key, prior, err)
return nil, false
// Cannot confirm — defer rather than risk a double-grant (the prior owner
// likely crashed; re-assertion rebuilds this owner before the window ends).
glog.V(2).InfofCtx(ctx, "posix cooling probe %s -> %s: %v (deferring)", key, prior, err)
return nil, true
}
if resp.GetHasConflict() {
return resp.GetConflict(), true
return resp.GetConflict(), false
}
return nil, false
}
@@ -149,3 +149,30 @@ func TestPosixLockForwardsToOwner(t *testing.T) {
t.Fatal("sender must forward, not apply locally")
}
}
// TestPosixLockWarmupDefersGrants walks a server through its warm-up window:
// a TRY_LOCK with no locally-known conflict is neither granted nor reported as
// a conflict, a TRY_LOCK that overlaps a locally-held lock is reported as a
// conflict, and once the recorded start is older than posixLockWarmup a
// TRY_LOCK is granted again.
func TestPosixLockWarmupDefersGrants(t *testing.T) {
	fs := newPosixTestServer()

	// Record "now" as the start instant so the server is inside the window.
	fs.posixLockReadyAt.Store(time.Now().UnixNano())

	// Nothing is held locally, so the request must come back deferred:
	// not granted, and with no conflict, leaving the client to retry.
	deferred := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(0, 99, posixlock.Write, 1, 1, 7, false))
	switch {
	case deferred.Granted:
		t.Fatal("grant should be deferred during warm-up")
	case deferred.HasConflict:
		t.Fatalf("deferred grant should report no conflict, got %+v", deferred.Conflict)
	}

	// Seed the local table directly with a write lock held by another
	// session; an overlapping request must now surface it as a conflict.
	held := posixlock.Range{Start: 0, End: 99, Type: posixlock.Write, Sid: 9, Owner: 1, Pid: 5}
	fs.posixLocks.TryLock("s3.fuse.lock:/x", held)
	blocked := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(50, 60, posixlock.Write, 1, 1, 7, false))
	if blocked.Granted || !blocked.HasConflict {
		t.Fatalf("known conflict should be reported during warm-up: %+v", blocked)
	}

	// Release the seeded lock, then move the start instant far enough into
	// the past that the window has closed; the same request is now granted.
	posixOp(t, fs, filer_pb.PosixLockOp_UNLOCK, pbLock(0, 99, posixlock.Unlock, 9, 1, 5, false))
	pastWindow := time.Now().Add(-2 * posixLockWarmup)
	fs.posixLockReadyAt.Store(pastWindow.UnixNano())
	resumed := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(0, 99, posixlock.Write, 1, 1, 7, false))
	if !resumed.Granted {
		t.Fatal("grant should succeed after warm-up")
	}
}
+6
View File
@@ -141,6 +141,12 @@ type FilerServer struct {
posixLocks *posixlock.Manager
// posixLockSweeperStop stops the lease-reaping sweeper goroutine on Shutdown.
posixLockSweeperStop chan struct{}
// posixLockReadyAt is the unix-nanos when this filer began serving POSIX
// locks. For posixLockWarmup after it, the owner defers would-be grants while
// mounts re-assert, so a (re)started owner does not double-grant from empty
// state. Atomic so the handler reads it without locking; 0 means "not warming
// up" (e.g. in tests).
posixLockReadyAt atomic.Int64
}
func NewFilerServer(defaultMux, readonlyMux *http.ServeMux, option *FilerOption) (fs *FilerServer, err error) {