Files
seaweedfs/weed/server/filer_grpc_server_posix_lock.go
T
Chris Lu 85ca3cb757 filer: warm-up + fail-closed cooling for POSIX locks on owner (re)start (#9673)
After a (re)start the owner defers would-be grants for posixLockWarmup
while mounts re-assert, trusting only locally-visible conflicts, so it
does not double-grant from empty state; a deferred grant is a retry for
SetLkw and EAGAIN for non-blocking SetLk, never a spurious grant. Cooling
now fail-closes: if the previous owner is unreachable during a ring
change, defer rather than risk a double-grant. readyAt is atomic so the
handler reads it without locking.
2026-05-25 13:14:05 -07:00

249 lines
8.9 KiB
Go

package weed_server
import (
"context"
"fmt"
"time"
"github.com/seaweedfs/seaweedfs/weed/filer/posixlock"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
)
const (
// posixLockSessionTTL is how long a mount's lease survives without a
// keepalive before its locks are reaped; posixLockSweepInterval is how often
// each filer checks. The mount renews well within the TTL.
posixLockSessionTTL = 15 * time.Second
posixLockSweepInterval = 5 * time.Second
// posixCoolingProbeTimeout bounds the dual-read probe to the prior owner so a
// slow peer can't stall a non-blocking lock call during the cooling window.
posixCoolingProbeTimeout = 2 * time.Second
// posixLockWarmup is how long after a (re)start the owner defers would-be
// grants while mounts re-assert their held locks, so it does not double-grant
// a lock its fresh, still-empty state has not yet rebuilt. Must exceed the
// mount keepalive interval and stay below the TTL.
posixLockWarmup = 10 * time.Second
)
// startPosixLockSweeper periodically reaps the locks of leased sessions (mounts)
// that stopped sending keepalives. Sessions that never renew are never reaped, so
// this is inert until mounts run with -posixLock.
func (fs *FilerServer) startPosixLockSweeper() {
fs.posixLockReadyAt.Store(time.Now().UnixNano())
fs.posixLockSweeperStop = make(chan struct{})
go func() {
ticker := time.NewTicker(posixLockSweepInterval)
defer ticker.Stop()
for {
select {
case <-fs.posixLockSweeperStop:
return
case <-ticker.C:
if reaped := fs.posixLocks.ReapExpired(posixLockSessionTTL); len(reaped) > 0 {
glog.V(2).Infof("posix lock: reaped %d expired session(s): %v", len(reaped), reaped)
}
}
}
}()
}
// PosixLock applies one advisory lock operation against the in-memory lock table
// of the inode's owner filer. The owner is resolved from req.Key on the same
// ring the gateway and DLM use; a non-owner filer forwards the request one hop
// (is_moved bounds it), so the owner's table stays the single authority even when
// the caller's ring view is stale. Blocking (SetLkw) is the caller's job — this
// is strictly non-blocking try/release/query.
func (fs *FilerServer) PosixLock(ctx context.Context, req *filer_pb.PosixLockRequest) (*filer_pb.PosixLockResponse, error) {
if req.Key == "" {
return &filer_pb.PosixLockResponse{}, fmt.Errorf("key is required")
}
if req.Lock == nil {
return &filer_pb.PosixLockResponse{}, fmt.Errorf("lock is required")
}
if !req.IsMoved && fs.filer.Dlm != nil {
if owner := fs.filer.Dlm.LockRing.GetPrimary(req.Key); owner != "" && owner != fs.option.Host {
forwarded := &filer_pb.PosixLockRequest{
Key: req.Key,
IsMoved: true,
Op: req.Op,
Lock: req.Lock,
Locks: req.Locks,
}
glog.V(4).InfofCtx(ctx, "PosixLock %s op=%v: forwarding to owner %s", req.Key, req.Op, owner)
var resp *filer_pb.PosixLockResponse
err := pb.WithFilerClient(false, 0, owner, fs.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
var e error
resp, e = client.PosixLock(ctx, forwarded)
return e
})
if err != nil {
return &filer_pb.PosixLockResponse{}, err
}
return resp, nil
}
}
lk := posixlock.Range{
Start: req.Lock.GetStart(),
End: req.Lock.GetEnd(),
Type: req.Lock.GetType(),
Sid: req.Lock.GetSid(),
Owner: req.Lock.GetOwner(),
Pid: req.Lock.GetPid(),
IsFlock: req.Lock.GetIsFlock(),
}
resp := &filer_pb.PosixLockResponse{}
switch req.Op {
case filer_pb.PosixLockOp_TRY_LOCK:
// A fresh owner's state may be incomplete (post-restart warm-up, or a ring
// change whose previous owner can't be reached): report a known conflict,
// else defer the grant so the client retries rather than risk a
// double-grant. A deferred grant becomes EAGAIN for non-blocking SetLk and
// a retry for the blocking SetLkw poll — never a spurious grant.
if !req.CoolingProbe {
if c, deferGrant := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil {
resp.HasConflict = true
resp.Conflict = c
break
} else if deferGrant {
break
}
}
if c, granted := fs.posixLocks.TryLock(req.Key, lk); granted {
resp.Granted = true
} else {
resp.HasConflict = true
resp.Conflict = posixRangeToPb(c)
}
case filer_pb.PosixLockOp_UNLOCK:
fs.posixLocks.Unlock(req.Key, lk)
case filer_pb.PosixLockOp_GET_LK:
// A query is best-effort: report a known conflict but never defer.
if !req.CoolingProbe {
if c, _ := fs.posixCoolingOrWarmup(ctx, req.Key, lk); c != nil {
resp.HasConflict = true
resp.Conflict = c
break
}
}
if c, found := fs.posixLocks.GetLk(req.Key, lk); found {
resp.HasConflict = true
resp.Conflict = posixRangeToPb(c)
}
case filer_pb.PosixLockOp_RELEASE_POSIX_OWNER:
fs.posixLocks.ReleasePosixOwner(req.Key, lk.Sid, lk.Owner)
case filer_pb.PosixLockOp_RELEASE_FLOCK_OWNER:
fs.posixLocks.ReleaseFlockOwner(req.Key, lk.Sid, lk.Owner)
case filer_pb.PosixLockOp_KEEP_ALIVE:
// A re-assertion carries the mount's held locks on this key so the owner
// can rebuild its in-memory state after an ownership change or restart; a
// bare keepalive just renews the lease.
if len(req.Locks) > 0 {
held := make([]posixlock.Range, 0, len(req.Locks))
for _, l := range req.Locks {
held = append(held, posixRangeFromPb(l))
}
if conflicts := fs.posixLocks.Reassert(req.Key, lk.Sid, held); len(conflicts) > 0 {
glog.Warningf("posix reassert %s sid %d: %d lock(s) lost to another session: %+v", req.Key, lk.Sid, len(conflicts), conflicts)
}
} else {
fs.posixLocks.Renew(lk.Sid)
}
default:
return &filer_pb.PosixLockResponse{}, fmt.Errorf("unknown posix lock op %v", req.Op)
}
return resp, nil
}
// posixWarmingUp reports whether this filer is still within posixLockWarmup of
// when it began serving POSIX locks. The zero readyAt (e.g. in tests) is never
// warming up.
func (fs *FilerServer) posixWarmingUp() bool {
readyAt := fs.posixLockReadyAt.Load()
return readyAt != 0 && time.Since(time.Unix(0, readyAt)) < posixLockWarmup
}
// posixCoolingOrWarmup decides whether a fresh owner can trust its local state
// for key before granting. It returns a known blocking lock (conflict != nil),
// or asks the caller to defer the grant (deferGrant) when the state may be
// incomplete and no conflict can be confirmed. It returns (nil, false) when the
// owner is authoritative and should consult its local table.
//
// - Warm-up: just after a (re)start the owner is still rebuilding from
// re-assertions, so only a locally-visible conflict is trustworthy; a
// would-be grant is deferred.
// - Ring change (cooling window): the previous owner may still hold a lock this
// owner hasn't rebuilt. Ask it (marked cooling_probe so it answers locally
// without recursing), under a short deadline so a slow peer can't stall the
// non-blocking lock path. If it is unreachable — typically because it
// crashed, which caused the change — we cannot confirm, so we defer rather
// than risk a double-grant; re-assertion rebuilds this owner before the
// window ends.
func (fs *FilerServer) posixCoolingOrWarmup(ctx context.Context, key string, lk posixlock.Range) (conflict *filer_pb.PosixLockRange, deferGrant bool) {
if fs.posixWarmingUp() {
if c, found := fs.posixLocks.GetLk(key, lk); found {
return posixRangeToPb(c), false
}
return nil, true
}
if fs.filer.Dlm == nil {
return nil, false
}
prior := fs.filer.Dlm.LockRing.PriorOwner(key)
if prior == "" || prior == fs.option.Host {
return nil, false
}
probeCtx, cancel := context.WithTimeout(ctx, posixCoolingProbeTimeout)
defer cancel()
var resp *filer_pb.PosixLockResponse
err := pb.WithFilerClient(false, 0, prior, fs.grpcDialOption, func(client filer_pb.SeaweedFilerClient) error {
var e error
resp, e = client.PosixLock(probeCtx, &filer_pb.PosixLockRequest{
Key: key,
IsMoved: true,
CoolingProbe: true,
Op: filer_pb.PosixLockOp_GET_LK,
Lock: posixRangeToPb(lk),
})
return e
})
if err != nil {
// Cannot confirm — defer rather than risk a double-grant (the prior owner
// likely crashed; re-assertion rebuilds this owner before the window ends).
glog.V(2).InfofCtx(ctx, "posix cooling probe %s -> %s: %v (deferring)", key, prior, err)
return nil, true
}
if resp.GetHasConflict() {
return resp.GetConflict(), false
}
return nil, false
}
func posixRangeFromPb(l *filer_pb.PosixLockRange) posixlock.Range {
return posixlock.Range{
Start: l.GetStart(),
End: l.GetEnd(),
Type: l.GetType(),
Sid: l.GetSid(),
Owner: l.GetOwner(),
Pid: l.GetPid(),
IsFlock: l.GetIsFlock(),
}
}
func posixRangeToPb(r posixlock.Range) *filer_pb.PosixLockRange {
return &filer_pb.PosixLockRange{
Start: r.Start,
End: r.End,
Type: r.Type,
Sid: r.Sid,
Owner: r.Owner,
Pid: r.Pid,
IsFlock: r.IsFlock,
}
}