fix(lbclient): prevent deadlock when all clients are removed (#2287)

LBClient.get() took an RLock and then read cs[0] without checking for an
empty slice. Once every client was removed via RemoveClients, cs is empty,
so get() panicked on cs[0]; because the RUnlock was not deferred it was
never reached, leaking the lock and deadlocking every later AddClient and
RemoveClients call.

Defer the unlock in get() and return nil for an empty client set, with
DoDeadline/DoTimeout reporting the new ErrNoAvailableClients instead of
panicking. AddClient and RemoveClients now also defer their unlock so a
panic in the user-supplied RemoveClients callback can't leak the write
lock.

RemoveClients additionally niled cc.cs[idx] before invoking rc, so a panic
in rc left cc.cs with nil holes that would later crash get(). Compact the
slice first and nil the unused tail only afterwards, so a panic leaves the
client set usable.

Fixes #2270

Signed-off-by: Y.Horie <u5.horie@gmail.com>
This commit is contained in:
Y.Horie
2026-06-13 16:24:36 +09:00
committed by GitHub
parent 8fe6af3619
commit 76aa66015c
2 changed files with 153 additions and 7 deletions
+31 -7
View File
@@ -1,11 +1,16 @@
package fasthttp
import (
"errors"
"sync"
"sync/atomic"
"time"
)
// ErrNoAvailableClients is returned by LBClient methods when no clients are
// available, for example after every client has been removed.
var ErrNoAvailableClients = errors.New("no available clients")
// BalancingClient is the interface for clients, which may be passed
// to LBClient.Clients.
type BalancingClient interface {
@@ -62,13 +67,21 @@ const DefaultLBClientTimeout = time.Second
// DoDeadline calls DoDeadline on the least loaded client.
func (cc *LBClient) DoDeadline(req *Request, resp *Response, deadline time.Time) error {
return cc.get().DoDeadline(req, resp, deadline)
c := cc.get()
if c == nil {
return ErrNoAvailableClients
}
return c.DoDeadline(req, resp, deadline)
}
// DoTimeout calculates deadline and calls DoDeadline on the least loaded client.
func (cc *LBClient) DoTimeout(req *Request, resp *Response, timeout time.Duration) error {
deadline := time.Now().Add(timeout)
return cc.get().DoDeadline(req, resp, deadline)
c := cc.get()
if c == nil {
return ErrNoAvailableClients
}
return c.DoDeadline(req, resp, deadline)
}
// Do calculates timeout using LBClient.Timeout and calls DoTimeout
@@ -100,11 +113,11 @@ func (cc *LBClient) init() {
// returns the new total number of clients.
func (cc *LBClient) AddClient(c BalancingClient) int {
cc.mu.Lock()
defer cc.mu.Unlock()
cc.cs = append(cc.cs, &lbClient{
c: c,
healthCheck: cc.HealthCheck,
})
cc.mu.Unlock()
return len(cc.cs)
}
@@ -113,18 +126,24 @@ func (cc *LBClient) AddClient(c BalancingClient) int {
// Returns the new total number of clients.
func (cc *LBClient) RemoveClients(rc func(BalancingClient) bool) int {
cc.mu.Lock()
// defer so a panic in the user-supplied rc can't leak the lock.
defer cc.mu.Unlock()
n := 0
for idx, cs := range cc.cs {
cc.cs[idx] = nil
for _, cs := range cc.cs {
if rc(cs.c) {
continue
}
cc.cs[n] = cs
n++
}
// Nil out the now-unused tail so removed clients can be garbage collected.
// This is done only after the loop so a panic in rc can't leave cc.cs with
// nil holes that would later crash get().
for i := n; i < len(cc.cs); i++ {
cc.cs[i] = nil
}
cc.cs = cc.cs[:n]
cc.mu.Unlock()
return len(cc.cs)
}
@@ -132,7 +151,13 @@ func (cc *LBClient) get() *lbClient {
cc.once.Do(cc.init)
cc.mu.RLock()
defer cc.mu.RUnlock()
cs := cc.cs
if len(cs) == 0 {
// No clients (e.g. all removed): avoid panicking on cs[0].
return nil
}
minC := cs[0]
minN := minC.PendingRequests()
@@ -146,7 +171,6 @@ func (cc *LBClient) get() *lbClient {
minT = t
}
}
cc.mu.RUnlock()
return minC
}
+122
View File
@@ -0,0 +1,122 @@
package fasthttp
import (
"errors"
"testing"
"time"
)
type mockBalancingClient struct {
err error
pending int
}
func (m *mockBalancingClient) DoDeadline(_ *Request, _ *Response, _ time.Time) error {
return m.err
}
func (m *mockBalancingClient) PendingRequests() int {
return m.pending
}
// TestLBClientRemoveAllClients reproduces the deadlock reported in #2270:
// once every client is removed, cc.cs is empty and get() used to panic on
// cs[0] while holding (and never releasing) the RLock, deadlocking every
// subsequent AddClient/RemoveClients call.
func TestLBClientRemoveAllClients(t *testing.T) {
t.Parallel()
lbc := &LBClient{Clients: []BalancingClient{&mockBalancingClient{}}}
var req Request
var resp Response
deadline := func() time.Time { return time.Now().Add(time.Second) }
// Trigger the lazy init and confirm the configured client serves the request.
if err := lbc.DoDeadline(&req, &resp, deadline()); err != nil {
t.Fatalf("unexpected error: %v", err)
}
// Remove every client so cc.cs becomes empty.
if n := lbc.RemoveClients(func(BalancingClient) bool { return true }); n != 0 {
t.Fatalf("unexpected client count after removal: %d", n)
}
// get() must not panic and must release the RLock, so the request fails
// cleanly with ErrNoAvailableClients instead.
if err := lbc.DoDeadline(&req, &resp, deadline()); !errors.Is(err, ErrNoAvailableClients) {
t.Fatalf("unexpected error: %v. Expecting %v", err, ErrNoAvailableClients)
}
// With the bug the leaked RLock makes AddClient block forever.
done := make(chan struct{})
go func() {
lbc.AddClient(&mockBalancingClient{})
close(done)
}()
select {
case <-done:
case <-time.After(time.Second):
t.Fatal("AddClient deadlocked after all clients were removed")
}
// The freshly added client must be usable again.
if err := lbc.DoDeadline(&req, &resp, deadline()); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
// TestLBClientRemoveClientsCallbackPanic ensures a panic in the user-supplied
// RemoveClients callback does not leak the write lock and deadlock later calls.
func TestLBClientRemoveClientsCallbackPanic(t *testing.T) {
t.Parallel()
lbc := &LBClient{Clients: []BalancingClient{
&mockBalancingClient{},
&mockBalancingClient{},
&mockBalancingClient{},
}}
// Trigger the lazy init so cc.cs is populated and the callback runs.
var req Request
var resp Response
if err := lbc.DoDeadline(&req, &resp, time.Now().Add(time.Second)); err != nil {
t.Fatalf("unexpected error: %v", err)
}
// Panic part way through so the old code would have already niled an
// earlier cc.cs slot, leaving a hole that crashes later get() calls.
calls := 0
func() {
defer func() {
if recover() == nil {
t.Fatal("expected panic from RemoveClients callback")
}
}()
lbc.RemoveClients(func(BalancingClient) bool {
calls++
if calls == 2 {
panic("boom")
}
return false
})
}()
// If the lock leaked, AddClient would block forever.
done := make(chan struct{})
go func() {
lbc.AddClient(&mockBalancingClient{})
close(done)
}()
select {
case <-done:
case <-time.After(time.Second):
t.Fatal("AddClient deadlocked after RemoveClients callback panicked")
}
// cc.cs must not be left with nil holes: a request still has to work
// instead of panicking on a nil client.
if err := lbc.DoDeadline(&req, &resp, time.Now().Add(time.Second)); err != nil {
t.Fatalf("unexpected error after panicking RemoveClients: %v", err)
}
}