Files
seaweedfs/weed/storage/disk_location.go
T
Aleksey e3e02d3364 [CheckDisk]: implement disk health detection (#9560)
* [CheckDisk][GRPC]: implement MVP for disk health detection, added timeout for new grpc connections

* fix(volume): build disk health check on every platform

setDiskStatus only existed behind the statfs build tag, so disk.go failed
to compile on windows, openbsd, solaris, netbsd and plan9. Move the timeout
wrapper and failure tracking into the shared disk.go and have each platform's
fillInDiskStatus return an error, so every platform gets the same protection
from a stuck filesystem.

Also restore the uint64(fs.Bavail) cast: Bavail is int64 on freebsd, so the
unguarded multiply broke the freebsd build.

* fix(volume): keep one outstanding statfs probe per disk

A stuck statfs used to leave isChecking cleared by the timeout path, so the
next check spawned another goroutine while the previous one was still blocked
in the syscall, leaking one goroutine per minute on a hung disk. Clear the
flag only when statfs returns and treat an overlapping check as a failure, so
a hung filesystem keeps a single outstanding probe and still gets reported.

* fix(volume): assume disk available until the first health check

isDiskAvailable defaulted to false, and CollectHeartbeat skips locations that
are not available. A freshly started volume server would therefore omit every
volume from its first heartbeats until the async CheckDiskSpace ran, so the
master could briefly treat all of them as missing.

* fix(volume): label the disk error metric by data directory

The new gauge tagged the series with IdxDirectory while every neighbouring
resource gauge uses Directory, so the error series would not line up with them
in dashboards. Also log the underlying error instead of a generic message.

* test(volume): cover disk health success and repeated-failure paths

* fix(volume): make a healthy disk the zero-value default

Track the disk as isDiskUnavailable instead of isDiskAvailable so the safe
state is the zero value, matching isDiskSpaceLow. CollectHeartbeat only skips a
location once a check has actively marked it unavailable, so any DiskLocation
built without running CheckDiskSpace (tests, future call sites) still reports
its volumes instead of silently dropping them.

* feat(disk): detect degraded disks using IO latency probes

* feat(stats): introduce configurable disk I/O health probe with EWMA-based latency detection

* feat(disk): replace EWMA with sliding window algorithm for disk health detection and added user-friendly options

* feat(disk): improve disk health probing and recovery

* feat(volume): configure disk health checks via volume.toml

* fix(volume): Remove disk IO probe CLI options

---------

Co-authored-by: ptukha <ptukha@tochka.com>
Co-authored-by: Chris Lu <chris.lu@gmail.com>
2026-06-02 09:02:05 -07:00

573 lines
15 KiB
Go

package storage
import (
"fmt"
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/util"
)
const (
UUIDFileName = "vol_dir.uuid"
UUIDFileMod = 0644
)
type DiskLocation struct {
Directory string
DirectoryUuid string
IdxDirectory string
DiskType types.DiskType
Tags []string
MaxVolumeCount int32
OriginalMaxVolumeCount int32
MinFreeSpace util.MinFreeSpace
AvailableSpace atomic.Uint64
volumes map[needle.VolumeId]*Volume
volumesLock sync.RWMutex
// erasure coding
ecVolumes map[needle.VolumeId]*erasure_coding.EcVolume
ecVolumesLock sync.RWMutex
ecShardNotifyHandler func(collection string, vid needle.VolumeId, shardId erasure_coding.ShardId, ecVolume *erasure_coding.EcVolume)
isDiskSpaceLow atomic.Bool
isDiskUnavailable atomic.Bool
closeCh chan struct{}
}
func GenerateDirUuid(dir string) (dirUuidString string, err error) {
glog.V(1).Infof("Getting uuid of volume directory:%s", dir)
fileName := filepath.Join(dir, UUIDFileName)
if !util.FileExists(fileName) {
dirUuidString, err = writeNewUuid(fileName)
} else {
uuidData, readErr := os.ReadFile(fileName)
if readErr != nil {
return "", fmt.Errorf("failed to read uuid from %s : %v", fileName, readErr)
}
if len(uuidData) > 0 {
dirUuidString = string(uuidData)
} else {
dirUuidString, err = writeNewUuid(fileName)
}
}
return dirUuidString, err
}
func writeNewUuid(fileName string) (string, error) {
dirUuid, _ := uuid.NewRandom()
dirUuidString := dirUuid.String()
if err := util.WriteFile(fileName, []byte(dirUuidString), UUIDFileMod); err != nil {
return "", fmt.Errorf("failed to write uuid to %s : %v", fileName, err)
}
return dirUuidString, nil
}
func NewDiskLocation(dir string, maxVolumeCount int32, minFreeSpace util.MinFreeSpace, idxDir string, diskType types.DiskType, tags []string, config stats.DiskIOProbeConfig) *DiskLocation {
glog.V(4).Infof("Added new Disk %s: maxVolumes=%d", dir, maxVolumeCount)
dir = util.ResolvePath(dir)
if idxDir == "" {
idxDir = dir
} else {
idxDir = util.ResolvePath(idxDir)
}
dirUuid, err := GenerateDirUuid(dir)
if err != nil {
glog.Fatalf("cannot generate uuid of dir %s: %v", dir, err)
}
// Defensive copy of tags to prevent external mutation
var copiedTags []string
if len(tags) > 0 {
copiedTags = make([]string, len(tags))
copy(copiedTags, tags)
}
location := &DiskLocation{
Directory: dir,
DirectoryUuid: dirUuid,
IdxDirectory: idxDir,
DiskType: diskType,
Tags: copiedTags,
MaxVolumeCount: maxVolumeCount,
OriginalMaxVolumeCount: maxVolumeCount,
MinFreeSpace: minFreeSpace,
}
location.volumes = make(map[needle.VolumeId]*Volume)
location.ecVolumes = make(map[needle.VolumeId]*erasure_coding.EcVolume)
location.closeCh = make(chan struct{})
go func() {
location.CheckDiskSpace(config)
for {
select {
case <-location.closeCh:
return
case <-time.After(time.Minute):
location.CheckDiskSpace(config)
}
}
}()
return location
}
func volumeIdFromFileName(filename string) (needle.VolumeId, string, error) {
if isValidVolume(filename) {
base := filename[:len(filename)-4]
collection, volumeId, err := parseCollectionVolumeId(base)
return volumeId, collection, err
}
return 0, "", fmt.Errorf("file is not a volume: %s", filename)
}
func parseCollectionVolumeId(base string) (collection string, vid needle.VolumeId, err error) {
i := strings.LastIndex(base, "_")
if i > 0 {
collection, base = base[0:i], base[i+1:]
}
vol, err := needle.NewVolumeId(base)
return collection, vol, err
}
func isValidVolume(basename string) bool {
return strings.HasSuffix(basename, ".idx") || strings.HasSuffix(basename, ".vif")
}
func getValidVolumeName(basename string) string {
if isValidVolume(basename) {
return basename[:len(basename)-4]
}
return ""
}
// hasEcxFile reports whether an .ecx for volumeName exists on this disk.
// Checks IdxDirectory first, then falls back to Directory (the .ecx may
// have been created before -dir.idx was configured).
func (l *DiskLocation) hasEcxFile(volumeName string) bool {
if util.FileExists(filepath.Join(l.IdxDirectory, volumeName+".ecx")) {
return true
}
if l.IdxDirectory != l.Directory {
return util.FileExists(filepath.Join(l.Directory, volumeName+".ecx"))
}
return false
}
func (l *DiskLocation) loadExistingVolume(dirEntry os.DirEntry, needleMapKind NeedleMapKind, skipIfEcVolumesExists bool, ldbTimeout int64, diskId uint32) bool {
basename := dirEntry.Name()
if dirEntry.IsDir() {
return false
}
volumeName := getValidVolumeName(basename)
if volumeName == "" {
return false
}
// parse out collection, volume id (moved up to use in EC validation)
vid, collection, err := volumeIdFromFileName(basename)
if err != nil {
glog.Warningf("get volume id failed, %s, err : %s", volumeName, err)
return false
}
// .vif next to .ecx is EC shard metadata, not a regular volume.
// Without this guard NewVolume below would create a phantom empty .dat.
if strings.HasSuffix(basename, ".vif") && l.hasEcxFile(volumeName) {
glog.V(1).Infof("loadExistingVolume: skipping .vif-only entry for volume %d (collection=%q); .ecx present", vid, collection)
return false
}
// skip if ec volumes exists, but validate EC files first
if skipIfEcVolumesExists {
if l.hasEcxFile(volumeName) {
// Validate EC volume: shard count, size consistency, and expected size vs .dat file
if !l.validateEcVolume(collection, vid) {
glog.Warningf("EC volume %d validation failed, removing incomplete EC files to allow .dat file loading", vid)
l.removeEcVolumeFiles(collection, vid)
// Continue to load .dat file
} else {
// Valid EC volume exists, skip .dat file
return false
}
}
}
// check for incomplete volume
noteFile := l.Directory + "/" + volumeName + ".note"
if util.FileExists(noteFile) {
note, _ := os.ReadFile(noteFile)
glog.Warningf("volume %s was not completed: %s", volumeName, string(note))
removeVolumeFiles(l.Directory+"/"+volumeName, false)
removeVolumeFiles(l.IdxDirectory+"/"+volumeName, false)
return false
}
// avoid loading one volume more than once
l.volumesLock.RLock()
_, found := l.volumes[vid]
l.volumesLock.RUnlock()
if found {
glog.V(1).Infof("loaded volume, %v", vid)
return true
}
// load the volume
v, e := NewVolume(l.Directory, l.IdxDirectory, collection, vid, needleMapKind, nil, nil, 0, needle.GetCurrentVersion(), 0, ldbTimeout)
if e != nil {
glog.V(0).Infof("new volume %s error %s", volumeName, e)
return false
}
v.diskId = diskId // Set the disk ID for existing volumes
l.SetVolume(vid, v)
size, _, _ := v.FileStat()
glog.V(2).Infof("data file %s, replication=%s v=%d size=%d ttl=%s disk_id=%d",
l.Directory+"/"+volumeName+".dat", v.ReplicaPlacement, v.Version(), size, v.Ttl.String(), diskId)
return true
}
func (l *DiskLocation) concurrentLoadingVolumes(needleMapKind NeedleMapKind, concurrency int, ldbTimeout int64, diskId uint32) {
task_queue := make(chan os.DirEntry, 10*concurrency)
go func() {
foundVolumeNames := make(map[string]bool)
if dirEntries, err := os.ReadDir(l.Directory); err == nil {
for _, entry := range dirEntries {
volumeName := getValidVolumeName(entry.Name())
if volumeName == "" {
continue
}
if _, found := foundVolumeNames[volumeName]; !found {
foundVolumeNames[volumeName] = true
task_queue <- entry
}
}
}
close(task_queue)
}()
var wg sync.WaitGroup
for workerNum := 0; workerNum < concurrency; workerNum++ {
wg.Add(1)
go func() {
defer wg.Done()
for fi := range task_queue {
_ = l.loadExistingVolume(fi, needleMapKind, true, ldbTimeout, diskId)
}
}()
}
wg.Wait()
}
func (l *DiskLocation) loadExistingVolumes(needleMapKind NeedleMapKind, ldbTimeout int64) {
l.loadExistingVolumesWithId(needleMapKind, ldbTimeout, 0) // Default disk ID for backward compatibility
}
func (l *DiskLocation) loadExistingVolumesWithId(needleMapKind NeedleMapKind, ldbTimeout int64, diskId uint32) {
workerNum := runtime.NumCPU()
val, ok := os.LookupEnv("GOMAXPROCS")
if ok {
num, err := strconv.Atoi(val)
if err != nil || num < 1 {
num = 10
glog.Warningf("failed to set worker number from GOMAXPROCS , set to default:10")
}
workerNum = num
} else {
if workerNum <= 10 {
workerNum = 10
}
}
l.concurrentLoadingVolumes(needleMapKind, workerNum, ldbTimeout, diskId)
glog.V(2).Infof("Store started on dir: %s with %d volumes max %d (disk ID: %d)", l.Directory, len(l.volumes), l.MaxVolumeCount, diskId)
l.loadAllEcShards(l.ecShardNotifyHandler)
glog.V(2).Infof("Store started on dir: %s with %d ec shards (disk ID: %d)", l.Directory, len(l.ecVolumes), diskId)
}
func (l *DiskLocation) DeleteCollectionFromDiskLocation(collection string) (e error) {
l.volumesLock.Lock()
delVolsMap := l.unmountVolumeByCollection(collection)
l.volumesLock.Unlock()
l.ecVolumesLock.Lock()
delEcVolsMap := l.unmountEcVolumeByCollection(collection)
l.ecVolumesLock.Unlock()
errChain := make(chan error, 2)
var wg sync.WaitGroup
wg.Add(2)
go func() {
for k, v := range delVolsMap {
if err := v.Destroy(false, false); err != nil {
errChain <- err
} else {
l.volumesLock.Lock()
delete(l.volumes, k)
l.volumesLock.Unlock()
}
}
wg.Done()
}()
go func() {
for _, v := range delEcVolsMap {
v.Destroy()
}
wg.Done()
}()
go func() {
wg.Wait()
close(errChain)
}()
errBuilder := strings.Builder{}
for err := range errChain {
errBuilder.WriteString(err.Error())
errBuilder.WriteString("; ")
}
if errBuilder.Len() > 0 {
e = fmt.Errorf("%s", errBuilder.String())
}
return
}
func (l *DiskLocation) deleteVolumeById(vid needle.VolumeId, onlyEmpty bool, keepRemoteData bool) (found bool, e error) {
v, ok := l.volumes[vid]
if !ok {
return
}
e = v.Destroy(onlyEmpty, keepRemoteData)
if e != nil {
return
}
found = true
delete(l.volumes, vid)
return
}
func (l *DiskLocation) LoadVolume(diskId uint32, vid needle.VolumeId, needleMapKind NeedleMapKind) bool {
if fileInfo, found := l.LocateVolume(vid); found {
return l.loadExistingVolume(fileInfo, needleMapKind, false, 0, diskId)
}
return false
}
var ErrVolumeNotFound = fmt.Errorf("volume not found")
func (l *DiskLocation) DeleteVolume(vid needle.VolumeId, onlyEmpty bool, keepRemoteData bool) error {
l.volumesLock.Lock()
defer l.volumesLock.Unlock()
_, ok := l.volumes[vid]
if !ok {
return ErrVolumeNotFound
}
_, err := l.deleteVolumeById(vid, onlyEmpty, keepRemoteData)
return err
}
func (l *DiskLocation) UnloadVolume(vid needle.VolumeId) error {
l.volumesLock.Lock()
defer l.volumesLock.Unlock()
v, ok := l.volumes[vid]
if !ok {
return ErrVolumeNotFound
}
v.Close()
delete(l.volumes, vid)
return nil
}
func (l *DiskLocation) unmountVolumeByCollection(collectionName string) map[needle.VolumeId]*Volume {
deltaVols := make(map[needle.VolumeId]*Volume, 0)
for k, v := range l.volumes {
if v.Collection == collectionName && !v.isCompactionInProgress.Load() {
deltaVols[k] = v
}
}
return deltaVols
}
func (l *DiskLocation) SetVolume(vid needle.VolumeId, volume *Volume) {
l.volumesLock.Lock()
defer l.volumesLock.Unlock()
l.volumes[vid] = volume
volume.location = l
}
func (l *DiskLocation) FindVolume(vid needle.VolumeId) (*Volume, bool) {
l.volumesLock.RLock()
defer l.volumesLock.RUnlock()
v, ok := l.volumes[vid]
return v, ok
}
// Returns all regular volume IDs stored at this location.
func (l *DiskLocation) VolumeIds() []needle.VolumeId {
l.volumesLock.RLock()
defer l.volumesLock.RUnlock()
vids := make([]needle.VolumeId, len(l.volumes))
i := 0
for vid := range l.volumes {
vids[i] = vid
i++
}
slices.Sort(vids)
return vids
}
// Returns all EC volume IDs stored at this location.
func (l *DiskLocation) EcVolumeIds() []needle.VolumeId {
l.ecVolumesLock.RLock()
defer l.ecVolumesLock.RUnlock()
vids := make([]needle.VolumeId, len(l.ecVolumes))
i := 0
for vid := range l.ecVolumes {
vids[i] = vid
i++
}
slices.Sort(vids)
return vids
}
func (l *DiskLocation) VolumesLen() int {
l.volumesLock.RLock()
defer l.volumesLock.RUnlock()
return len(l.volumes)
}
func (l *DiskLocation) LocalVolumesLen() int {
l.volumesLock.RLock()
defer l.volumesLock.RUnlock()
count := 0
for _, v := range l.volumes {
if !v.HasRemoteFile() {
count++
}
}
return count
}
func (l *DiskLocation) SetStopping() {
l.volumesLock.Lock()
for _, v := range l.volumes {
v.SyncToDisk()
}
l.volumesLock.Unlock()
return
}
func (l *DiskLocation) Close() {
l.volumesLock.Lock()
for _, v := range l.volumes {
v.Close()
}
l.volumesLock.Unlock()
l.ecVolumesLock.Lock()
for _, ecVolume := range l.ecVolumes {
ecVolume.Close()
}
l.ecVolumesLock.Unlock()
close(l.closeCh)
return
}
func (l *DiskLocation) LocateVolume(vid needle.VolumeId) (os.DirEntry, bool) {
// println("LocateVolume", vid, "on", l.Directory)
if dirEntries, err := os.ReadDir(l.Directory); err == nil {
for _, entry := range dirEntries {
// println("checking", entry.Name(), "...")
volId, _, err := volumeIdFromFileName(entry.Name())
// println("volId", volId, "err", err)
if vid == volId && err == nil {
return entry, true
}
}
}
return nil, false
}
func (l *DiskLocation) UnUsedSpace(volumeSizeLimit uint64) (unUsedSpace uint64) {
l.volumesLock.RLock()
defer l.volumesLock.RUnlock()
for _, vol := range l.volumes {
if vol.IsReadOnly() {
continue
}
datSize, idxSize, _ := vol.FileStat()
unUsedSpaceVolume := int64(volumeSizeLimit) - int64(datSize+idxSize)
glog.V(4).Infof("Volume stats for %d: volumeSizeLimit=%d, datSize=%d idxSize=%d unused=%d", vol.Id, volumeSizeLimit, datSize, idxSize, unUsedSpaceVolume)
if unUsedSpaceVolume >= 0 {
unUsedSpace += uint64(unUsedSpaceVolume)
}
}
return
}
func (l *DiskLocation) CheckDiskSpace(config stats.DiskIOProbeConfig) {
if dir, e := filepath.Abs(l.Directory); e == nil {
s := stats.NewDiskStatusOnStart(dir, config)
if len(s.Error) != 0 {
l.isDiskUnavailable.Store(true)
stats.VolumeServerDiskErrorGauge.WithLabelValues(l.Directory, "error").Set(1)
glog.V(1).Infof("disk %s is not healthy: %s", dir, s.Error)
} else {
l.isDiskUnavailable.Store(false)
stats.VolumeServerDiskErrorGauge.WithLabelValues(l.Directory, "error").Set(0)
}
available := l.MinFreeSpace.AvailableSpace(s.Free, s.All)
stats.VolumeServerResourceGauge.WithLabelValues(l.Directory, "all").Set(float64(s.All))
stats.VolumeServerResourceGauge.WithLabelValues(l.Directory, "used").Set(float64(s.Used))
stats.VolumeServerResourceGauge.WithLabelValues(l.Directory, "free").Set(float64(s.Free))
stats.VolumeServerResourceGauge.WithLabelValues(l.Directory, "avail").Set(float64(available))
l.AvailableSpace.Store(available)
isLow, desc := l.MinFreeSpace.IsLow(s.Free, s.PercentFree)
if isLow != l.isDiskSpaceLow.Load() {
l.isDiskSpaceLow.Store(isLow)
}
logLevel := glog.Level(4)
if l.isDiskSpaceLow.Load() {
logLevel = glog.Level(0)
}
glog.V(logLevel).Infof("dir %s %s", dir, desc)
}
}