mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-13 23:36:45 +03:00
feat(filer.backup): -initialSnapshot re-seeds a reinitialized destination (#9828)
* feat(filer.backup): add -resetCheckpoint to force a fresh sync filer.backup resumes from a per-sink offset persisted in the source filer's KV. There was no first-class way to discard that checkpoint and re-run from the beginning short of guessing a large -timeAgo, which also skips -initialSnapshot. Add -resetCheckpoint: before reading the offset, write 0 for this sink so getOffset returns 0, isFreshSync stays true, and -initialSnapshot re-runs a full walk. Effective only when -timeAgo is 0. The flag is cleared after the first successful reset: runFilerBackup retries doFilerBackup forever on error, so leaving it set would re-zero the checkpoint on every retry and never make forward progress after a transient failure. Later retries resume from the persisted checkpoint instead. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(filer.backup): keep fresh-sync intent when offset read fails after reset After -resetCheckpoint writes offset 0, a transient getOffset read-back error flipped isFreshSync to false, which skipped the -initialSnapshot walk the reset explicitly requested. Track that the reset happened this iteration and, on a getOffset error, preserve isFreshSync=true in that case (the non-reset path keeps treating a read error as "not fresh" to avoid re-walking on transients). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * refactor(filer.backup): skip offset read-back on reset instead of tracking a flag Replace the didReset bool by branching: on -resetCheckpoint, clear the offset and start fresh without reading it back (we just wrote 0, so the state is known); otherwise read the offset as before. This drops the redundant getOffset RPC after a reset and removes the read-back error case entirely, so no separate flag is needed to preserve isFreshSync. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * filer.backup: -initialSnapshot re-seeds on every start; drop -resetCheckpoint -initialSnapshot now walks the live tree whenever -timeAgo is 0, seeds the destination, and overwrites the saved checkpoint, rather than running only on a fresh sync. That re-seeds a reinitialized destination on its own, so the separate -resetCheckpoint flag is gone. The walk runs once per process: the in-memory flag is cleared after the watermark is persisted, so the retry loop resumes from the persisted checkpoint instead of re-walking on every transient error. A process restart re-walks, so remove the flag once the backup is caught up. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Co-authored-by: Chris Lu <chris.lu@gmail.com>
This commit is contained in:
@@ -63,7 +63,7 @@ func init() {
|
||||
filerBackupOptions.retentionDays = cmdFilerBackup.Flag.Int("retentionDays", 0, "incremental backup retention days")
|
||||
filerBackupOptions.disableErrorRetry = cmdFilerBackup.Flag.Bool("disableErrorRetry", false, "disables errors retry, only logs will print")
|
||||
filerBackupOptions.ignore404Error = cmdFilerBackup.Flag.Bool("ignore404Error", true, "ignore 404 errors from filer")
|
||||
filerBackupOptions.initialSnapshot = cmdFilerBackup.Flag.Bool("initialSnapshot", false, "before subscribing to metadata updates, walk the live filer tree under -filerPath and seed the destination. Only runs on a fresh sync (no prior checkpoint and -timeAgo is 0). After the walk, subscription starts from the walk-start timestamp so concurrent changes are still captured.")
|
||||
filerBackupOptions.initialSnapshot = cmdFilerBackup.Flag.Bool("initialSnapshot", false, "before subscribing to metadata updates, walk the live filer tree under -filerPath and seed the destination, then subscribe from the walk-start timestamp so concurrent changes are still captured. Runs on every start when -timeAgo is 0 and overwrites the saved checkpoint, so it also re-seeds a reinitialized destination. Remove it once the backup is caught up, otherwise it re-walks the whole tree on each restart.")
|
||||
}
|
||||
|
||||
var cmdFilerBackup = &Command{
|
||||
@@ -75,12 +75,13 @@ var cmdFilerBackup = &Command{
|
||||
and write to the destination. This is to replace filer.replicate command since additional message queue is not needed.
|
||||
|
||||
If restarted and "-timeAgo" is not set, the synchronization will resume from the previous checkpoints, persisted every minute.
|
||||
A fresh sync will start from the earliest metadata logs. To reset the checkpoints, just set "-timeAgo" to a high value.
|
||||
A fresh sync will start from the earliest metadata logs.
|
||||
|
||||
On a fresh sync the metadata event log only re-materializes files that still exist on the source; entries that were
|
||||
created and later deleted are replayed as a create-then-delete pair and therefore never appear on the destination.
|
||||
Pass "-initialSnapshot" to walk the live filer tree first and seed the destination with the current tree, then
|
||||
subscribe from the walk-start timestamp. The walk only runs when there is no prior checkpoint.
|
||||
subscribe from the walk-start timestamp. This also re-seeds a reinitialized destination: it overwrites the saved
|
||||
checkpoint and re-walks on every start, so remove it once the backup is caught up.
|
||||
|
||||
`,
|
||||
}
|
||||
@@ -135,27 +136,25 @@ func doFilerBackup(grpcDialOption grpc.DialOption, backupOption *FilerBackupOpti
|
||||
|
||||
// get start time for the data sink
|
||||
startFrom := time.Unix(0, 0)
|
||||
isFreshSync := true
|
||||
sinkId := util.HashStringToLong(dataSink.GetName() + dataSink.GetSinkToDirectory())
|
||||
if timeAgo.Milliseconds() == 0 {
|
||||
lastOffsetTsNs, err := getOffset(grpcDialOption, sourceFiler, BackupKeyPrefix, int32(sinkId))
|
||||
if err != nil {
|
||||
// A KV read failure is ambiguous — a checkpoint may well exist but the
|
||||
// source filer is temporarily unreachable. Don't treat that as a fresh
|
||||
// sync; otherwise runFilerBackup's retry loop would redo the full
|
||||
// -initialSnapshot walk on every transient error.
|
||||
isFreshSync = false
|
||||
glog.V(0).Infof("starting from %v (offset read failed: %v)", startFrom, err)
|
||||
} else if lastOffsetTsNs > 0 {
|
||||
startFrom = time.Unix(0, lastOffsetTsNs)
|
||||
isFreshSync = false
|
||||
glog.V(0).Infof("resuming from %v", startFrom)
|
||||
runSnapshot := *backupOption.initialSnapshot && timeAgo == 0
|
||||
if timeAgo == 0 {
|
||||
if runSnapshot {
|
||||
// snapshot below sets the start point; no checkpoint read needed
|
||||
glog.V(0).Infof("initialSnapshot requested — walking live tree before subscribing")
|
||||
} else {
|
||||
glog.V(0).Infof("starting from %v (no prior checkpoint)", startFrom)
|
||||
lastOffsetTsNs, err := getOffset(grpcDialOption, sourceFiler, BackupKeyPrefix, int32(sinkId))
|
||||
if err != nil {
|
||||
glog.V(0).Infof("starting from %v (offset read failed: %v)", startFrom, err)
|
||||
} else if lastOffsetTsNs > 0 {
|
||||
startFrom = time.Unix(0, lastOffsetTsNs)
|
||||
glog.V(0).Infof("resuming from %v", startFrom)
|
||||
} else {
|
||||
glog.V(0).Infof("starting from %v (no prior checkpoint)", startFrom)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
startFrom = time.Now().Add(-timeAgo)
|
||||
isFreshSync = false
|
||||
glog.V(0).Infof("start time is set to %v", startFrom)
|
||||
}
|
||||
|
||||
@@ -174,13 +173,11 @@ func doFilerBackup(grpcDialOption grpc.DialOption, backupOption *FilerBackupOpti
|
||||
}
|
||||
dataSink.SetSourceFiler(filerSource)
|
||||
|
||||
// When the destination has no prior checkpoint and the user opted in to an
|
||||
// initial snapshot, walk the live filer tree first and seed the destination
|
||||
// with the current entries. This avoids the "only new files appear" pitfall
|
||||
// of replaying the metadata event log: entries created-then-deleted before
|
||||
// the walk leave no trace, so a re-backup after wiping the destination
|
||||
// reflects what is actually live on the source instead of an empty tree.
|
||||
if *backupOption.initialSnapshot && isFreshSync {
|
||||
// Walk and seed the live tree, then subscribe from the walk-start watermark:
|
||||
// replaying the event log alone misses entries created-then-deleted before
|
||||
// the walk. The watermark overwrites any stale checkpoint, re-seeding a wiped
|
||||
// destination.
|
||||
if runSnapshot {
|
||||
snapshotTsNs, err := runInitialSnapshot(sourceFiler.ToGrpcAddress(), filerSource, sourcePath, targetPath, excludePaths, reExcludeFileName, excludeFileNames, excludePathPatterns, dataSink, *backupOption.ignore404Error)
|
||||
if err != nil {
|
||||
return fmt.Errorf("initial snapshot: %w", err)
|
||||
@@ -193,6 +190,8 @@ func doFilerBackup(grpcDialOption grpc.DialOption, backupOption *FilerBackupOpti
|
||||
return fmt.Errorf("persist initial snapshot offset: %w", err)
|
||||
}
|
||||
startFrom = time.Unix(0, snapshotTsNs)
|
||||
// walk once per process; retries resume from the persisted checkpoint
|
||||
*backupOption.initialSnapshot = false
|
||||
glog.V(0).Infof("initialSnapshot done; subscribing from %v", startFrom)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user