security: hot-reload JWT signing keys on SIGHUP (#9826)

* security: reload JWT signing keys on SIGHUP

Signing keys were read once in the server constructors and never
refreshed. After a key rotation (Secret update, divergent reads) the
in-memory key stayed stale and every request kept failing "wrong jwt"
until the affected process was restarted.

Add Guard.UpdateSigningKeys and call it from the master, volume and
filer reload paths and the s3 reload hook, next to the existing
whitelist refresh. Make the global chunk-read JWT cache reloadable via
an atomic swap, and register the master's Reload with grace.OnReload --
it was never wired, so the master ignored SIGHUP entirely.

Mirror the same refresh in the Rust volume server's SIGHUP handler.

* security: swap signing keys behind an atomic pointer

Addresses review feedback on the in-place key swap: SigningKey is a
[]byte, so reassigning the Guard fields while a request handler reads
them is a data race that can tear the multi-word slice header and read
out of bounds.

Hold the four signing-key fields in an immutable signingConfig snapshot
behind atomic.Pointer; UpdateSigningKeys swaps the whole pointer, so a
reader sees either the old keys or the new ones. Reads go through new
SigningKey/ExpiresAfterSec/ReadSigningKey/ReadExpiresAfterSec accessors.

The Rust guard is already safe: every read and the SIGHUP write go
through the shared RwLock<Guard>.

* security: fold whitelist + auth state into the atomic snapshot

Review follow-up. UpdateSigningKeys still wrote isWriteActive while the
request path read it (and the whitelist maps) unsynchronized, so a SIGHUP
under load could expose an inconsistent mix of activation bits and
whitelist contents.

Move all hot-reloadable Guard state -- keys, expirations, whitelist, and
the activation flags -- into a single immutable guardState swapped behind
one atomic.Pointer. The Update* methods take a small mutex to serialize
the read-modify-write; readers stay lock-free. The concurrency test now
also rotates the whitelist and probes IsWhiteListed under -race.

Also read each signing key once per branch in the volume/filer JWT auth
checks, so a reload landing mid-check can't take the allow-fast-path
after auth was enabled or verify against a different key than the branch
saw.
This commit is contained in:
Chris Lu
2026-06-04 22:26:08 -07:00
committed by GitHub
parent 0d72023fac
commit ab7be7867d
16 changed files with 285 additions and 50 deletions
+6
View File
@@ -545,6 +545,12 @@ async fn run(
whitelist.extend(sec.guard_white_list.iter().cloned());
let mut guard = state_reload.guard.write().unwrap();
guard.update_whitelist(&whitelist);
guard.update_signing_keys(
SigningKey(sec.jwt_signing_key),
sec.jwt_signing_expires,
SigningKey(sec.jwt_read_signing_key),
sec.jwt_read_signing_expires,
);
}
// Trigger heartbeat to report new volumes
+57
View File
@@ -172,6 +172,26 @@ impl Guard {
self.is_write_active = !is_empty_whitelist || !self.signing_key.is_empty();
}
/// Rotate the JWT signing material held by this guard.
///
/// Overwrites the write key, the read key and both expirations with the
/// supplied values, then re-derives `is_write_active`: write security is on
/// when any whitelist entry (IP or CIDR) exists or the new write key is
/// non-empty. Called from the SIGHUP reload path so keys can change without a
/// process restart; counterpart of Go's `Guard::UpdateSigningKeys`.
pub fn update_signing_keys(
    &mut self,
    signing_key: SigningKey,
    expires_after_sec: i64,
    read_signing_key: SigningKey,
    read_expires_after_sec: i64,
) {
    // Work out both activation inputs up front; the whitelist itself is not
    // touched by a key rotation.
    let has_write_key = !signing_key.is_empty();
    let has_whitelist = !(self.whitelist_ips.is_empty() && self.whitelist_cidrs.is_empty());

    self.signing_key = signing_key;
    self.expires_after_sec = expires_after_sec;
    self.read_signing_key = read_signing_key;
    self.read_expires_after_sec = read_expires_after_sec;
    self.is_write_active = has_whitelist || has_write_key;
}
/// Check if a remote IP is in the whitelist.
/// Returns true if write security is inactive (no whitelist and no signing key),
/// if the whitelist is empty, or if the IP matches.
@@ -463,6 +483,43 @@ mod tests {
assert!(matches!(err, Err(JwtError::FileIdMismatch { .. })));
}
/// Rotating keys must replace both the write and the read signing key along
/// with their expirations: tokens minted after the rotation verify against the
/// new keys and are rejected by the old ones.
#[test]
fn test_update_signing_keys_rotates() {
    let mut guard = Guard::new(
        &[],
        SigningKey::from_string("old-write"),
        10,
        SigningKey::from_string("old-read"),
        60,
    );
    guard.update_signing_keys(
        SigningKey::from_string("new-write"),
        11,
        SigningKey::from_string("new-read"),
        61,
    );

    // Write key: validates with the rotated key, not the old one.
    let token = gen_jwt(&guard.signing_key, 60, "3,01637037d6").unwrap();
    assert!(decode_jwt(&guard.signing_key, &token).is_ok());
    assert!(decode_jwt(&SigningKey::from_string("new-write"), &token).is_ok());
    assert!(decode_jwt(&SigningKey::from_string("old-write"), &token).is_err());

    // Read key: checking only the expiry would let a rotation that forgets the
    // read key slip through, so pin the key itself as the Go test does.
    let read_token = gen_jwt(&guard.read_signing_key, 60, "3,01637037d6").unwrap();
    assert!(decode_jwt(&guard.read_signing_key, &read_token).is_ok());
    assert!(decode_jwt(&SigningKey::from_string("new-read"), &read_token).is_ok());
    assert!(decode_jwt(&SigningKey::from_string("old-read"), &read_token).is_err());

    assert_eq!(guard.expires_after_sec, 11);
    assert_eq!(guard.read_expires_after_sec, 61);
}
/// `update_signing_keys` has to recompute `is_write_active`: with no
/// whitelist, installing a write key switches it on and clearing the key
/// switches it back off.
#[test]
fn test_update_signing_keys_toggles_write_active() {
    let no_key = || SigningKey(vec![]);

    // Starting point: no whitelist, no keys.
    let mut guard = Guard::new(&[], no_key(), 0, no_key(), 0);
    assert!(guard.check_jwt(None, true).is_ok());

    // Installing a write key activates write security.
    guard.update_signing_keys(SigningKey::from_string("write"), 10, no_key(), 0);
    assert!(guard.is_write_active);

    // Removing it again deactivates write security.
    guard.update_signing_keys(no_key(), 0, no_key(), 0);
    assert!(!guard.is_write_active);
}
#[test]
fn test_extract_host() {
assert_eq!(extract_host("192.168.1.1:8080"), "192.168.1.1");
+1
View File
@@ -355,6 +355,7 @@ func startMaster(masterOption MasterOptions, masterWhiteList []string) {
grace.OnInterrupt(ms.Shutdown)
grace.OnInterrupt(grpcS.Stop)
grace.OnReload(ms.Reload)
grace.OnReload(func() {
if ms.Topo.HashicorpRaft != nil && ms.Topo.HashicorpRaft.State() == hashicorpRaft.Leader {
ms.Topo.HashicorpRaft.LeadershipTransfer()
+15
View File
@@ -395,6 +395,21 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
}
})
}
// Refresh the JWT signing keys on SIGHUP so an operator can rotate them
// without restarting; otherwise filer/volume auth stays stuck on the stale
// key after a rotation.
grace.OnReload(func() {
util.LoadConfiguration("security", false)
v := util.GetViper()
s3ApiServer.filerGuard.UpdateSigningKeys(
v.GetString("jwt.filer_signing.key"),
v.GetInt("jwt.filer_signing.expires_after_seconds"),
v.GetString("jwt.filer_signing.read.key"),
v.GetInt("jwt.filer_signing.read.expires_after_seconds"),
)
util_http.ReloadJwtSigningReadConfig()
})
s3ApiServer.bucketRegistry = NewBucketRegistry(s3ApiServer)
// Update IAM with the final filer client (already handled by SetFilerClient above,
+64 -22
View File
@@ -6,6 +6,8 @@ import (
"net"
"net/http"
"strings"
"sync"
"sync/atomic"
"github.com/seaweedfs/seaweedfs/weed/glog"
)
@@ -39,31 +41,50 @@ Generating JWT:
Referenced:
https://github.com/pkieltyka/jwtauth/blob/master/jwtauth.go
*/
type Guard struct {
whiteListIp map[string]struct{}
whiteListCIDR map[string]*net.IPNet
SigningKey SigningKey
ExpiresAfterSec int
ReadSigningKey SigningKey
ReadExpiresAfterSec int
// guardState is the immutable snapshot of all hot-reloadable Guard state. The
// Update* methods build a new snapshot from the current one and swap it in
// atomically, so request-path readers (WhiteList, IsWhiteListed, the SigningKey
// accessors) always observe a consistent set of keys and whitelist — never a
// torn slice header or a mix of old and new state across a SIGHUP.
//
// A snapshot that has been stored must not be modified afterwards; updates
// copy it, change the copy, and store the copy.
type guardState struct {
// signingKey is the JWT key used on the write path; expiresAfterSec is the
// lifetime passed to the token generator together with it.
signingKey SigningKey
expiresAfterSec int
// readSigningKey / readExpiresAfterSec are the read-path equivalents.
readSigningKey SigningKey
readExpiresAfterSec int
// whiteListIp holds hosts that are matched by exact string lookup.
whiteListIp map[string]struct{}
// whiteListCIDR holds networks; a host matches when any network contains
// its parsed IP.
whiteListCIDR map[string]*net.IPNet
// isWriteActive is derived state: true when the whitelist is non-empty or
// signingKey is non-empty. It is recomputed by every Update* method.
isWriteActive bool
// isEmptyWhiteList is true when both whitelist maps are empty.
isEmptyWhiteList bool
}
// Guard holds the current guardState snapshot together with the mutex that
// serializes updates to it. NewGuard stores the initial snapshot.
type Guard struct {
// state is swapped atomically by the Update* methods. Read it via Load.
state atomic.Pointer[guardState]
// updateMu serializes the read-modify-write inside the Update* methods so
// concurrent reloads don't clobber each other; readers stay lock-free.
updateMu sync.Mutex
}
func NewGuard(whiteList []string, signingKey string, expiresAfterSec int, readSigningKey string, readExpiresAfterSec int) *Guard {
g := &Guard{
SigningKey: SigningKey(signingKey),
ExpiresAfterSec: expiresAfterSec,
ReadSigningKey: SigningKey(readSigningKey),
ReadExpiresAfterSec: readExpiresAfterSec,
}
g := &Guard{}
g.state.Store(&guardState{
signingKey: SigningKey(signingKey),
expiresAfterSec: expiresAfterSec,
readSigningKey: SigningKey(readSigningKey),
readExpiresAfterSec: readExpiresAfterSec,
})
g.UpdateWhiteList(whiteList)
return g
}
// SigningKey returns the write signing key from the current snapshot.
// A Guard that has no snapshot yet (a zero-value Guard not built by NewGuard)
// yields nil, matching what the former exported field read as, instead of
// panicking on a nil snapshot.
func (g *Guard) SigningKey() SigningKey {
	if st := g.state.Load(); st != nil {
		return st.signingKey
	}
	return nil
}

// ExpiresAfterSec returns the write-token lifetime from the current snapshot,
// or 0 when no snapshot has been stored.
func (g *Guard) ExpiresAfterSec() int {
	if st := g.state.Load(); st != nil {
		return st.expiresAfterSec
	}
	return 0
}

// ReadSigningKey returns the read signing key from the current snapshot, or
// nil when no snapshot has been stored.
func (g *Guard) ReadSigningKey() SigningKey {
	if st := g.state.Load(); st != nil {
		return st.readSigningKey
	}
	return nil
}

// ReadExpiresAfterSec returns the read-token lifetime from the current
// snapshot, or 0 when no snapshot has been stored.
func (g *Guard) ReadExpiresAfterSec() int {
	if st := g.state.Load(); st != nil {
		return st.readExpiresAfterSec
	}
	return 0
}
func (g *Guard) WhiteList(f http.HandlerFunc) http.HandlerFunc {
if !g.isWriteActive {
if !g.state.Load().isWriteActive {
//if no security needed, just skip all checking
return f
}
@@ -109,18 +130,19 @@ func (g *Guard) checkWhiteList(w http.ResponseWriter, r *http.Request) error {
// IsWhiteListed returns true if the given host IP is allowed by the guard.
// When no whitelist is configured (security inactive), all hosts are allowed.
func (g *Guard) IsWhiteListed(host string) bool {
if !g.isWriteActive {
st := g.state.Load()
if !st.isWriteActive {
return true
}
if g.isEmptyWhiteList {
if st.isEmptyWhiteList {
return true
}
if _, ok := g.whiteListIp[host]; ok {
if _, ok := st.whiteListIp[host]; ok {
return true
}
remote := net.ParseIP(host)
if remote != nil {
for _, cidrnet := range g.whiteListCIDR {
for _, cidrnet := range st.whiteListCIDR {
if cidrnet.Contains(remote) {
return true
}
@@ -129,6 +151,22 @@ func (g *Guard) IsWhiteListed(host string) bool {
return false
}
// UpdateSigningKeys refreshes the JWT signing keys and their expirations so
// operators can rotate keys (e.g. via SIGHUP) without restarting the process.
// It swaps in a new snapshot carrying the existing whitelist, so a concurrent
// reader sees either the old keys or the new ones, never a torn slice header.
//
// isWriteActive is recomputed from the carried-over whitelist and the new
// write key. Calls are serialized by updateMu so a concurrent UpdateWhiteList
// cannot be lost between the load and the store.
func (g *Guard) UpdateSigningKeys(signingKey string, expiresAfterSec int, readSigningKey string, readExpiresAfterSec int) {
	g.updateMu.Lock()
	defer g.updateMu.Unlock()

	// Copy the current snapshot so the whitelist carries over untouched. A
	// Guard that was not built by NewGuard has no snapshot yet; start from an
	// empty whitelist rather than dereferencing nil.
	next := guardState{isEmptyWhiteList: true}
	if cur := g.state.Load(); cur != nil {
		next = *cur
	}

	next.signingKey = SigningKey(signingKey)
	next.expiresAfterSec = expiresAfterSec
	next.readSigningKey = SigningKey(readSigningKey)
	next.readExpiresAfterSec = readExpiresAfterSec
	next.isWriteActive = !next.isEmptyWhiteList || len(next.signingKey) != 0
	g.state.Store(&next)
}
func (g *Guard) UpdateWhiteList(whiteList []string) {
whiteListIp := make(map[string]struct{})
whiteListCIDR := make(map[string]*net.IPNet)
@@ -144,8 +182,12 @@ func (g *Guard) UpdateWhiteList(whiteList []string) {
whiteListIp[ip] = struct{}{}
}
}
g.isEmptyWhiteList = len(whiteListIp) == 0 && len(whiteListCIDR) == 0
g.isWriteActive = !g.isEmptyWhiteList || len(g.SigningKey) != 0
g.whiteListIp = whiteListIp
g.whiteListCIDR = whiteListCIDR
g.updateMu.Lock()
defer g.updateMu.Unlock()
next := *g.state.Load()
next.isEmptyWhiteList = len(whiteListIp) == 0 && len(whiteListCIDR) == 0
next.isWriteActive = !next.isEmptyWhiteList || len(next.signingKey) != 0
next.whiteListIp = whiteListIp
next.whiteListCIDR = whiteListCIDR
g.state.Store(&next)
}
+81
View File
@@ -0,0 +1,81 @@
package security
import (
"sync"
"testing"
)
// TestUpdateSigningKeysRotates checks the hot-reload contract end to end:
// once UpdateSigningKeys has run, the accessors report the new keys and
// expirations, a token minted with the new write key verifies, and the same
// token is rejected by the old write key.
func TestUpdateSigningKeysRotates(t *testing.T) {
	const fid = "3,01637037d6"

	guard := NewGuard(nil, "old-write", 10, "old-read", 60)
	guard.UpdateSigningKeys("new-write", 11, "new-read", 61)

	writeKey, writeTTL := guard.SigningKey(), guard.ExpiresAfterSec()
	if string(writeKey) != "new-write" || writeTTL != 11 {
		t.Fatalf("write key not refreshed: key=%q exp=%d", writeKey, writeTTL)
	}

	readKey, readTTL := guard.ReadSigningKey(), guard.ReadExpiresAfterSec()
	if string(readKey) != "new-read" || readTTL != 61 {
		t.Fatalf("read key not refreshed: key=%q exp=%d", readKey, readTTL)
	}

	token := GenJwtForVolumeServer(writeKey, writeTTL, fid)

	_, newKeyErr := DecodeJwt(writeKey, token, &SeaweedFileIdClaims{})
	if newKeyErr != nil {
		t.Fatalf("token minted with rotated key should verify: %v", newKeyErr)
	}

	_, oldKeyErr := DecodeJwt(SigningKey("old-write"), token, &SeaweedFileIdClaims{})
	if oldKeyErr == nil {
		t.Fatalf("token minted with rotated key must not verify against the old key")
	}
}
// TestUpdateSigningKeysConcurrent rotates keys and whitelist while readers mint
// and verify tokens and check whitelist membership, so `go test -race` would
// flag any torn read of the signing-key slice header or the whitelist maps.
//
// Each reader loads the key once and both mints and verifies with that same
// value, so its token validates regardless of which rotation is current.
// 10.0.0.1 is part of every whitelist this test installs, so a reader that
// ever sees it rejected has observed an inconsistent snapshot.
func TestUpdateSigningKeysConcurrent(t *testing.T) {
	const (
		readers     = 8
		rounds      = 1000
		allowedHost = "10.0.0.1"
	)

	g := NewGuard([]string{allowedHost}, "k0", 60, "k0", 60)

	var wg sync.WaitGroup
	for w := 0; w < readers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for i := 0; i < rounds; i++ {
				key := g.SigningKey()
				tok := GenJwtForVolumeServer(key, g.ExpiresAfterSec(), "3,01637037d6")
				if _, err := DecodeJwt(key, tok, &SeaweedFileIdClaims{}); err != nil {
					t.Errorf("token should verify against the snapshot it was minted from: %v", err)
					return
				}
				// Previously the result was discarded, so only the race
				// detector could fail here; assert the invariant as well.
				if !g.IsWhiteListed(allowedHost) {
					t.Errorf("%s is whitelisted in every snapshot and must never be rejected", allowedHost)
					return
				}
			}
		}()
	}

	for i := 0; i < rounds; i++ {
		g.UpdateSigningKeys("kw", 60, "kr", 60)
		g.UpdateSigningKeys("kw2", 60, "kr2", 60)
		g.UpdateWhiteList([]string{allowedHost, "192.168.0.0/16"})
	}
	wg.Wait()
}
// TestUpdateSigningKeysTogglesWriteActive covers the isWriteActive recompute
// done by UpdateSigningKeys: a guard with no whitelist and no key starts
// inactive, becomes active once a write key is set, and goes inactive again
// when the key is cleared.
func TestUpdateSigningKeysTogglesWriteActive(t *testing.T) {
	guard := NewGuard(nil, "", 0, "", 0)
	writeActive := func() bool { return guard.state.Load().isWriteActive }

	if writeActive() {
		t.Fatalf("no whitelist and no key should be inactive")
	}

	guard.UpdateSigningKeys("write", 10, "", 0)
	if active := writeActive(); !active {
		t.Fatalf("setting a signing key should activate auth")
	}

	guard.UpdateSigningKeys("", 0, "", 0)
	if active := writeActive(); active {
		t.Fatalf("clearing the signing key should deactivate auth")
	}
}
+15
View File
@@ -23,6 +23,7 @@ import (
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/util"
util_http "github.com/seaweedfs/seaweedfs/weed/util/http"
"github.com/seaweedfs/seaweedfs/weed/filer"
_ "github.com/seaweedfs/seaweedfs/weed/filer/arangodb"
@@ -343,4 +344,18 @@ func (fs *FilerServer) Reload() {
glog.V(0).Infoln("Reload filer server...")
util.LoadConfiguration("security", false)
v := util.GetViper()
fs.filerGuard.UpdateSigningKeys(
v.GetString("jwt.filer_signing.key"),
v.GetInt("jwt.filer_signing.expires_after_seconds"),
v.GetString("jwt.filer_signing.read.key"),
v.GetInt("jwt.filer_signing.read.expires_after_seconds"),
)
fs.volumeGuard.UpdateSigningKeys(
v.GetString("jwt.signing.key"),
v.GetInt("jwt.signing.expires_after_seconds"),
v.GetString("jwt.signing.read.key"),
v.GetInt("jwt.signing.read.expires_after_seconds"),
)
util_http.ReloadJwtSigningReadConfig()
}
+4 -6
View File
@@ -219,16 +219,14 @@ func (fs *FilerServer) maybeCheckJwtAuthorization(r *http.Request, isWrite bool)
var signingKey security.SigningKey
if isWrite {
if len(fs.filerGuard.SigningKey) == 0 {
signingKey = fs.filerGuard.SigningKey()
if len(signingKey) == 0 {
return true
} else {
signingKey = fs.filerGuard.SigningKey
}
} else {
if len(fs.filerGuard.ReadSigningKey) == 0 {
signingKey = fs.filerGuard.ReadSigningKey()
if len(signingKey) == 0 {
return true
} else {
signingKey = fs.filerGuard.ReadSigningKey
}
}
+1 -1
View File
@@ -254,5 +254,5 @@ func (fs *FilerServer) GetOrHeadHandler(w http.ResponseWriter, r *http.Request)
}
func (fs *FilerServer) maybeGetVolumeReadJwtAuthorizationToken(fileId string) string {
return string(security.GenJwtForVolumeServer(fs.volumeGuard.ReadSigningKey, fs.volumeGuard.ReadExpiresAfterSec, fileId))
return string(security.GenJwtForVolumeServer(fs.volumeGuard.ReadSigningKey(), fs.volumeGuard.ReadExpiresAfterSec(), fileId))
}
+1 -1
View File
@@ -155,7 +155,7 @@ func (ms *MasterServer) Assign(ctx context.Context, req *master_pb.AssignRequest
DataCenter: dn.GetDataCenterId(),
},
Count: count,
Auth: string(security.GenJwtForVolumeServer(ms.guard.SigningKey, ms.guard.ExpiresAfterSec, fid)),
Auth: string(security.GenJwtForVolumeServer(ms.guard.SigningKey(), ms.guard.ExpiresAfterSec(), fid)),
Replicas: replicas,
}, nil
}
+1 -1
View File
@@ -183,7 +183,7 @@ func (ms *MasterServer) LookupVolume(ctx context.Context, req *master_pb.LookupV
}
var auth string
if commaSep > 0 { // this is a file id
auth = string(security.GenJwtForVolumeServer(ms.guard.SigningKey, ms.guard.ExpiresAfterSec, result.VolumeOrFileId))
auth = string(security.GenJwtForVolumeServer(ms.guard.SigningKey(), ms.guard.ExpiresAfterSec(), result.VolumeOrFileId))
}
if result.NotFound {
notFoundCount++
+6
View File
@@ -567,4 +567,10 @@ func (ms *MasterServer) Reload() {
ms.guard.UpdateWhiteList(append(ms.option.WhiteList,
util.StringSplit(v.GetString("guard.white_list"), ",")...),
)
ms.guard.UpdateSigningKeys(
v.GetString("jwt.signing.key"),
v.GetInt("jwt.signing.expires_after_seconds"),
v.GetString("jwt.signing.read.key"),
v.GetInt("jwt.signing.read.expires_after_seconds"),
)
}
+2 -2
View File
@@ -213,9 +213,9 @@ func (ms *MasterServer) maybeAddJwtAuthorization(w http.ResponseWriter, fileId s
}
var encodedJwt security.EncodedJwt
if isWrite {
encodedJwt = security.GenJwtForVolumeServer(ms.guard.SigningKey, ms.guard.ExpiresAfterSec, fileId)
encodedJwt = security.GenJwtForVolumeServer(ms.guard.SigningKey(), ms.guard.ExpiresAfterSec(), fileId)
} else {
encodedJwt = security.GenJwtForVolumeServer(ms.guard.ReadSigningKey, ms.guard.ReadExpiresAfterSec, fileId)
encodedJwt = security.GenJwtForVolumeServer(ms.guard.ReadSigningKey(), ms.guard.ReadExpiresAfterSec(), fileId)
}
if encodedJwt == "" {
return
+6
View File
@@ -190,6 +190,12 @@ func (vs *VolumeServer) Reload() {
util.LoadConfiguration("security", false)
v := util.GetViper()
vs.guard.UpdateWhiteList(append(vs.whiteList, util.StringSplit(v.GetString("guard.white_list"), ",")...))
vs.guard.UpdateSigningKeys(
v.GetString("jwt.signing.key"),
v.GetInt("jwt.signing.expires_after_seconds"),
v.GetString("jwt.signing.read.key"),
v.GetInt("jwt.signing.read.expires_after_seconds"),
)
}
// Returns whether a volume server is in maintenance (i.e. read-only) mode.
+4 -6
View File
@@ -330,16 +330,14 @@ func (vs *VolumeServer) maybeCheckJwtAuthorization(r *http.Request, vid, fid str
var signingKey security.SigningKey
if isWrite {
if len(vs.guard.SigningKey) == 0 {
signingKey = vs.guard.SigningKey()
if len(signingKey) == 0 {
return true
} else {
signingKey = vs.guard.SigningKey
}
} else {
if len(vs.guard.ReadSigningKey) == 0 {
signingKey = vs.guard.ReadSigningKey()
if len(signingKey) == 0 {
return true
} else {
signingKey = vs.guard.ReadSigningKey
}
}
+21 -11
View File
@@ -7,6 +7,7 @@ import (
"errors"
"fmt"
"sync"
"sync/atomic"
"github.com/seaweedfs/seaweedfs/weed/util"
"github.com/seaweedfs/seaweedfs/weed/util/mem"
@@ -27,10 +28,14 @@ import (
var ErrNotFound = fmt.Errorf("not found")
var ErrTooManyRequests = fmt.Errorf("too many requests")
// jwtSigningReadConfig bundles the read-signing key and its expiration so the
// pair can be published and replaced as one value through an atomic pointer.
type jwtSigningReadConfig struct {
// key is loaded from the "jwt.signing.read.key" setting.
key security.SigningKey
// expires is loaded from "jwt.signing.read.expires_after_seconds".
expires int
}
var (
jwtSigningReadKey security.SigningKey
jwtSigningReadKeyExpires int
loadJwtConfigOnce sync.Once
jwtSigningReadConfigPtr atomic.Pointer[jwtSigningReadConfig]
loadJwtConfigOnce sync.Once
)
func AppendQueryParameter(rawURL, key, value string) string {
@@ -57,8 +62,17 @@ func AppendQueryParameter(rawURL, key, value string) string {
func loadJwtConfig() {
v := util.GetViper()
jwtSigningReadKey = security.SigningKey(v.GetString("jwt.signing.read.key"))
jwtSigningReadKeyExpires = v.GetInt("jwt.signing.read.expires_after_seconds")
jwtSigningReadConfigPtr.Store(&jwtSigningReadConfig{
key: security.SigningKey(v.GetString("jwt.signing.read.key")),
expires: v.GetInt("jwt.signing.read.expires_after_seconds"),
})
}
// ReloadJwtSigningReadConfig re-reads the volume read-signing key from the
// already-reloaded security config, so operators can rotate it via SIGHUP
// without restarting the process.
//
// It does not reload the configuration itself: it calls loadJwtConfig, which
// reads the current viper values, so callers run util.LoadConfiguration first.
func ReloadJwtSigningReadConfig() {
loadJwtConfig()
}
func Post(url string, values url.Values) ([]byte, error) {
@@ -534,12 +548,8 @@ func RetriedFetchChunkData(ctx context.Context, buffer []byte, urlStrings []stri
loadJwtConfigOnce.Do(loadJwtConfig)
var jwt security.EncodedJwt
if len(jwtSigningReadKey) > 0 {
jwt = security.GenJwtForVolumeServer(
jwtSigningReadKey,
jwtSigningReadKeyExpires,
fileId,
)
if cfg := jwtSigningReadConfigPtr.Load(); cfg != nil && len(cfg.key) > 0 {
jwt = security.GenJwtForVolumeServer(cfg.key, cfg.expires, fileId)
}
// For unencrypted, non-gzipped full chunks, use direct buffer read