From 85ca3cb7571c71efd92361c4f685c51e0a55cdc3 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Mon, 25 May 2026 13:14:05 -0700 Subject: [PATCH] filer: warm-up + fail-closed cooling for POSIX locks on owner (re)start (#9673) After a (re)start the owner defers would-be grants for posixLockWarmup while mounts re-assert, trusting only locally-visible conflicts, so it does not double-grant from empty state; a deferred grant is a retry for SetLkw and EAGAIN for non-blocking SetLk, never a spurious grant. Cooling now fail-closes: if the previous owner is unreachable during a ring change, defer rather than risk a double-grant. readyAt is atomic so the handler reads it without locking. --- weed/server/filer_grpc_server_posix_lock.go | 69 ++++++++++++++----- .../filer_grpc_server_posix_lock_test.go | 27 ++++++++ weed/server/filer_server.go | 6 ++ 3 files changed, 85 insertions(+), 17 deletions(-) diff --git a/weed/server/filer_grpc_server_posix_lock.go b/weed/server/filer_grpc_server_posix_lock.go index 5036953d5..c7024a883 100644 --- a/weed/server/filer_grpc_server_posix_lock.go +++ b/weed/server/filer_grpc_server_posix_lock.go @@ -20,12 +20,18 @@ const ( // posixCoolingProbeTimeout bounds the dual-read probe to the prior owner so a // slow peer can't stall a non-blocking lock call during the cooling window. posixCoolingProbeTimeout = 2 * time.Second + // posixLockWarmup is how long after a (re)start the owner defers would-be + // grants while mounts re-assert their held locks, so it does not double-grant + // a lock its fresh, still-empty state has not yet rebuilt. Must exceed the + // mount keepalive interval and stay below the TTL. + posixLockWarmup = 10 * time.Second ) // startPosixLockSweeper periodically reaps the locks of leased sessions (mounts) // that stopped sending keepalives. Sessions that never renew are never reaped, so // this is inert until mounts run with -posixLock. func (fs *FilerServer) startPosixLockSweeper() { + fs.posixLockReadyAt.Store(time.Now().UnixNano()) fs.posixLockSweeperStop = make(chan struct{}) go func() { ticker := time.NewTicker(posixLockSweepInterval) @@ -93,13 +99,18 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq resp := &filer_pb.PosixLockResponse{} switch req.Op { case filer_pb.PosixLockOp_TRY_LOCK: - // During a ring change, a previous owner may still hold a conflicting lock - // this fresh owner has not rebuilt yet; consult it before granting. + // A fresh owner's state may be incomplete (post-restart warm-up, or a ring + // change whose previous owner can't be reached): report a known conflict, + // else defer the grant so the client retries rather than risk a + // double-grant. A deferred grant becomes EAGAIN for non-blocking SetLk and + // a retry for the blocking SetLkw poll — never a spurious grant. if !req.CoolingProbe { - if c, found := fs.coolingConflict(ctx, req.Key, lk); found { + if c, deferGrant := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil { resp.HasConflict = true resp.Conflict = c break + } else if deferGrant { + break } } if c, granted := fs.posixLocks.TryLock(req.Key, lk); granted { @@ -111,8 +122,9 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq case filer_pb.PosixLockOp_UNLOCK: fs.posixLocks.Unlock(req.Key, lk) case filer_pb.PosixLockOp_GET_LK: + // A query is best-effort: report a known conflict but never defer. if !req.CoolingProbe { - if c, found := fs.coolingConflict(ctx, req.Key, lk); found { + if c, _ := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil { resp.HasConflict = true resp.Conflict = c break @@ -147,13 +159,37 @@ func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockReq return resp, nil } -// coolingConflict asks the key's previous owner, during a ring-change cooling -// window, whether it still holds a lock that blocks lk — so a fresh owner does -// not double-grant before re-assertion rebuilds its local state. Best-effort: if -// the previous owner is unreachable (e.g. it left, which caused the change, so -// its locks are gone) the caller proceeds. The probe is marked cooling_probe so -// the previous owner answers from local state without itself cooling off. -func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posixlock.Range) (*filer_pb.PosixLockRange, bool) { +// posixWarmingUp reports whether this filer is still within posixLockWarmup of +// when it began serving POSIX locks. The zero readyAt (e.g. in tests) is never +// warming up. +func (fs *FilerServer) posixWarmingUp() bool { + readyAt := fs.posixLockReadyAt.Load() + return readyAt != 0 && time.Since(time.Unix(0, readyAt)) < posixLockWarmup +} + +// posixCoolingOrWarmup decides whether a fresh owner can trust its local state +// for key before granting. It returns a known blocking lock (conflict != nil), +// or asks the caller to defer the grant (deferGrant) when the state may be +// incomplete and no conflict can be confirmed. It returns (nil, false) when the +// owner is authoritative and should consult its local table. +// +// - Warm-up: just after a (re)start the owner is still rebuilding from +// re-assertions, so only a locally-visible conflict is trustworthy; a +// would-be grant is deferred. +// - Ring change (cooling window): the previous owner may still hold a lock this +// owner hasn't rebuilt. Ask it (marked cooling_probe so it answers locally +// without recursing), under a short deadline so a slow peer can't stall the +// non-blocking lock path. If it is unreachable — typically because it +// crashed, which caused the change — we cannot confirm, so we defer rather +// than risk a double-grant; re-assertion rebuilds this owner before the +// window ends. +func (fs *FilerServer) posixCoolingOrWarmup(ctx context.Context, key string, lk posixlock.Range) (conflict *filer_pb.PosixLockRange, deferGrant bool) { + if fs.posixWarmingUp() { + if c, found := fs.posixLocks.GetLk(key, lk); found { + return posixRangeToPb(c), false + } + return nil, true + } if fs.filer.Dlm == nil { return nil, false } @@ -161,9 +197,6 @@ func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posix if prior == "" || prior == fs.option.Host { return nil, false } - // Bound the probe with its own short deadline: it runs on the non-blocking - // lock path, so a slow prior owner must not stall the caller. On timeout the - // probe is treated as unreachable (best-effort). probeCtx, cancel := context.WithTimeout(ctx, posixCoolingProbeTimeout) defer cancel() var resp *filer_pb.PosixLockResponse @@ -179,11 +212,13 @@ func (fs *FilerServer) coolingConflict(ctx context.Context, key string, lk posix return e }) if err != nil { - glog.V(2).InfofCtx(ctx, "posix cooling probe %s -> %s: %v", key, prior, err) - return nil, false + // Cannot confirm — defer rather than risk a double-grant (the prior owner + // likely crashed; re-assertion rebuilds this owner before the window ends). + glog.V(2).InfofCtx(ctx, "posix cooling probe %s -> %s: %v (deferring)", key, prior, err) + return nil, true } if resp.GetHasConflict() { - return resp.GetConflict(), true + return resp.GetConflict(), false } return nil, false } diff --git a/weed/server/filer_grpc_server_posix_lock_test.go b/weed/server/filer_grpc_server_posix_lock_test.go index 96423971b..bb05056d7 100644 --- a/weed/server/filer_grpc_server_posix_lock_test.go +++ b/weed/server/filer_grpc_server_posix_lock_test.go @@ -149,3 +149,30 @@ func TestPosixLockForwardsToOwner(t *testing.T) { t.Fatal("sender must forward, not apply locally") } } + +func TestPosixLockWarmupDefersGrants(t *testing.T) { + fs := newPosixTestServer() + fs.posixLockReadyAt.Store(time.Now().UnixNano()) // warming up + + // A would-be grant is deferred (not granted, no conflict) so the client retries. + r := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(0, 99, posixlock.Write, 1, 1, 7, false)) + if r.Granted { + t.Fatal("grant should be deferred during warm-up") + } + if r.HasConflict { + t.Fatalf("deferred grant should report no conflict, got %+v", r.Conflict) + } + + // A lock the owner already knows about is still reported as a conflict. + fs.posixLocks.TryLock("s3.fuse.lock:/x", posixlock.Range{Start: 0, End: 99, Type: posixlock.Write, Sid: 9, Owner: 1, Pid: 5}) + if r := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(50, 60, posixlock.Write, 1, 1, 7, false)); r.Granted || !r.HasConflict { + t.Fatalf("known conflict should be reported during warm-up: %+v", r) + } + + // After warm-up, grants resume. + posixOp(t, fs, filer_pb.PosixLockOp_UNLOCK, pbLock(0, 99, posixlock.Unlock, 9, 1, 5, false)) + fs.posixLockReadyAt.Store(time.Now().Add(-2 * posixLockWarmup).UnixNano()) + if r := posixOp(t, fs, filer_pb.PosixLockOp_TRY_LOCK, pbLock(0, 99, posixlock.Write, 1, 1, 7, false)); !r.Granted { + t.Fatal("grant should succeed after warm-up") + } +} diff --git a/weed/server/filer_server.go b/weed/server/filer_server.go index a6c2e1508..54d9b0a92 100644 --- a/weed/server/filer_server.go +++ b/weed/server/filer_server.go @@ -141,6 +141,12 @@ type FilerServer struct { posixLocks *posixlock.Manager // posixLockSweeperStop stops the lease-reaping sweeper goroutine on Shutdown. posixLockSweeperStop chan struct{} + // posixLockReadyAt is the unix-nanos when this filer began serving POSIX + // locks. For posixLockWarmup after it, the owner defers would-be grants while + // mounts re-assert, so a (re)started owner does not double-grant from empty + // state. Atomic so the handler reads it without locking; 0 means "not warming + // up" (e.g. in tests). + posixLockReadyAt atomic.Int64 } func NewFilerServer(defaultMux, readonlyMux *http.ServeMux, option *FilerOption) (fs *FilerServer, err error) {