Skip to content

Commit 068325c

Browse files
committed
Monitor free disk percentage, not just absolute space
Signed-off-by: Yacov Manevich <[email protected]>
1 parent d6386c1 commit 068325c

File tree

11 files changed

+101
-23
lines changed

11 files changed

+101
-23
lines changed

config/config.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,14 +1138,17 @@ func getCPUTargeterConfig(v *viper.Viper) (tracker.TargeterConfig, error) {
11381138
}
11391139
}
11401140

1141-
func getDiskSpaceConfig(v *viper.Viper) (requiredAvailableDiskSpace uint64, warningThresholdAvailableDiskSpace uint64, err error) {
1141+
func getDiskSpaceConfig(v *viper.Viper) (requiredAvailableDiskSpace uint64, warningThresholdAvailableDiskSpace uint64, warningThresholdAvailableDiskSpacePercentage uint64, err error) {
11421142
requiredAvailableDiskSpace = v.GetUint64(SystemTrackerRequiredAvailableDiskSpaceKey)
11431143
warningThresholdAvailableDiskSpace = v.GetUint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey)
1144+
warningThresholdAvailableDiskSpacePercentage = v.GetUint64(SystemTrackerWarnThreshAvailDiskSpacePercentageKey)
11441145
switch {
1146+
case warningThresholdAvailableDiskSpacePercentage > 50 || warningThresholdAvailableDiskSpacePercentage == 0:
1147+
return 0, 0, 0, fmt.Errorf("%q (%d) must be in [1, 50]", SystemTrackerWarnThreshAvailDiskSpacePercentageKey, warningThresholdAvailableDiskSpacePercentage)
11451148
case warningThresholdAvailableDiskSpace < requiredAvailableDiskSpace:
1146-
return 0, 0, fmt.Errorf("%q (%d) < %q (%d)", SystemTrackerWarningThresholdAvailableDiskSpaceKey, warningThresholdAvailableDiskSpace, SystemTrackerRequiredAvailableDiskSpaceKey, requiredAvailableDiskSpace)
1149+
return 0, 0, 0, fmt.Errorf("%q (%d) < %q (%d)", SystemTrackerWarningThresholdAvailableDiskSpaceKey, warningThresholdAvailableDiskSpace, SystemTrackerRequiredAvailableDiskSpaceKey, requiredAvailableDiskSpace)
11471150
default:
1148-
return requiredAvailableDiskSpace, warningThresholdAvailableDiskSpace, nil
1151+
return requiredAvailableDiskSpace, warningThresholdAvailableDiskSpace, warningThresholdAvailableDiskSpacePercentage, nil
11491152
}
11501153
}
11511154

@@ -1400,7 +1403,7 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) {
14001403
nodeConfig.SystemTrackerCPUHalflife = v.GetDuration(SystemTrackerCPUHalflifeKey)
14011404
nodeConfig.SystemTrackerDiskHalflife = v.GetDuration(SystemTrackerDiskHalflifeKey)
14021405

1403-
nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, err = getDiskSpaceConfig(v)
1406+
nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpacePercentage, err = getDiskSpaceConfig(v)
14041407
if err != nil {
14051408
return node.Config{}, err
14061409
}

config/flags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ func addNodeFlags(fs *pflag.FlagSet) {
359359
fs.Duration(SystemTrackerDiskHalflifeKey, time.Minute, "Halflife to use for the disk tracker. Larger halflife --> disk usage metrics change more slowly")
360360
fs.Uint64(SystemTrackerRequiredAvailableDiskSpaceKey, units.GiB/2, "Minimum number of available bytes on disk, under which the node will shutdown.")
361361
fs.Uint64(SystemTrackerWarningThresholdAvailableDiskSpaceKey, units.GiB, fmt.Sprintf("Warning threshold for the number of available bytes on disk, under which the node will be considered unhealthy. Must be >= [%s]", SystemTrackerRequiredAvailableDiskSpaceKey))
362+
fs.Uint64(SystemTrackerWarnThreshAvailDiskSpacePercentageKey, 3, "Warning threshold for the percentage (between 1 and 50) of available disk space, under which the node will be considered unhealthy.")
362363

363364
// CPU management
364365
fs.Float64(CPUVdrAllocKey, float64(runtime.NumCPU()), "Maximum number of CPUs to allocate for use by validators. Value should be in range [0, total core count]")

config/keys.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ const (
193193
SystemTrackerDiskHalflifeKey = "system-tracker-disk-halflife"
194194
SystemTrackerRequiredAvailableDiskSpaceKey = "system-tracker-disk-required-available-space"
195195
SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space"
196+
SystemTrackerWarnThreshAvailDiskSpacePercentageKey = "system-tracker-disk-warning-threshold-available-space-percentage"
196197
DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc"
197198
DiskMaxNonVdrUsageKey = "throttler-inbound-disk-max-non-validator-usage"
198199
DiskMaxNonVdrNodeUsageKey = "throttler-inbound-disk-max-non-validator-node-usage"

config/node/config.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,9 @@ type Config struct {
205205

206206
DiskTargeterConfig tracker.TargeterConfig `json:"diskTargeterConfig"`
207207

208-
RequiredAvailableDiskSpace uint64 `json:"requiredAvailableDiskSpace"`
209-
WarningThresholdAvailableDiskSpace uint64 `json:"warningThresholdAvailableDiskSpace"`
208+
RequiredAvailableDiskSpace uint64 `json:"requiredAvailableDiskSpace"`
209+
WarningThresholdAvailableDiskSpace uint64 `json:"warningThresholdAvailableDiskSpace"`
210+
WarningThresholdAvailableDiskSpacePercentage uint64 `json:"warningThresholdAvailableDiskSpacePercentage"`
210211

211212
TraceConfig trace.Config `json:"traceConfig"`
212213

node/node.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1457,6 +1457,7 @@ func (n *Node) initHealthAPI() error {
14571457
// if there is too little disk space remaining, first report unhealthy and then shutdown the node
14581458

14591459
availableDiskBytes := n.resourceTracker.DiskTracker().AvailableDiskBytes()
1460+
availableDiskPercentage := n.resourceTracker.DiskTracker().AvailableDiskPercentage()
14601461

14611462
var err error
14621463
if availableDiskBytes < n.Config.RequiredAvailableDiskSpace {
@@ -1469,8 +1470,13 @@ func (n *Node) initHealthAPI() error {
14691470
err = fmt.Errorf("remaining available disk space (%d) is below the warning threshold of disk space (%d)", availableDiskBytes, n.Config.WarningThresholdAvailableDiskSpace)
14701471
}
14711472

1473+
if availableDiskPercentage < n.Config.WarningThresholdAvailableDiskSpacePercentage {
1474+
err = fmt.Errorf("remaining available disk space percentage (%d%%) is below minimum required available space percentage (%d%%)", availableDiskPercentage, n.Config.WarningThresholdAvailableDiskSpacePercentage)
1475+
}
1476+
14721477
return map[string]interface{}{
1473-
"availableDiskBytes": availableDiskBytes,
1478+
"availableDiskBytes": availableDiskBytes,
1479+
"availableDiskPercentage": availableDiskPercentage,
14741480
}, err
14751481
})
14761482

snow/networking/tracker/resource_tracker.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ type Tracker interface {
3636
type DiskTracker interface {
3737
Tracker
3838
// AvailableDiskBytes reports the available disk space as a number of bytes.
AvailableDiskBytes() uint64
39+
// AvailableDiskPercentage reports the available disk space as a percentage.
AvailableDiskPercentage() uint64
3940
}
4041

4142
// ResourceTracker is an interface for tracking peers' usage of resources
@@ -150,6 +151,16 @@ func (t *diskResourceTracker) AvailableDiskBytes() uint64 {
150151
return bytesAvailable
151152
}
152153

154+
func (t *diskResourceTracker) AvailableDiskPercentage() uint64 {
155+
rt := t.t
156+
rt.lock.Lock()
157+
defer rt.lock.Unlock()
158+
159+
percentageAvailable := rt.resources.AvailableDiskPercentage()
160+
rt.metrics.diskPercentageAvailable.Set(float64(percentageAvailable))
161+
return percentageAvailable
162+
}
163+
153164
func (t *diskResourceTracker) TotalUsage() float64 {
154165
realReadUsage, _ := t.t.resources.DiskUsage()
155166
return realReadUsage
@@ -286,11 +297,12 @@ func (rt *resourceTracker) prune(now time.Time) {
286297
}
287298

288299
// trackerMetrics groups the Prometheus gauges that newCPUTrackerMetrics
// registers with its prometheus.Registerer.
type trackerMetrics struct {
289-
processingTimeMetric prometheus.Gauge
290-
cpuMetric prometheus.Gauge
291-
diskReadsMetric prometheus.Gauge
292-
diskWritesMetric prometheus.Gauge
293-
diskSpaceAvailable prometheus.Gauge
300+
processingTimeMetric prometheus.Gauge
301+
cpuMetric prometheus.Gauge
302+
diskReadsMetric prometheus.Gauge
303+
diskWritesMetric prometheus.Gauge
304+
diskSpaceAvailable prometheus.Gauge
305+
// diskPercentageAvailable is the "disk_available_percentage" gauge; it is
// set each time diskResourceTracker.AvailableDiskPercentage is called.
diskPercentageAvailable prometheus.Gauge
294306
}
295307

296308
func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
@@ -315,13 +327,18 @@ func newCPUTrackerMetrics(reg prometheus.Registerer) (*trackerMetrics, error) {
315327
Name: "disk_available_space",
316328
Help: "Available space remaining (bytes) on the database volume",
317329
}),
330+
diskPercentageAvailable: prometheus.NewGauge(prometheus.GaugeOpts{
331+
Name: "disk_available_percentage",
332+
Help: "Percentage of database volume available",
333+
}),
318334
}
319335
err := errors.Join(
320336
reg.Register(m.processingTimeMetric),
321337
reg.Register(m.cpuMetric),
322338
reg.Register(m.diskReadsMetric),
323339
reg.Register(m.diskWritesMetric),
324340
reg.Register(m.diskSpaceAvailable),
341+
reg.Register(m.diskPercentageAvailable),
325342
)
326343
return m, err
327344
}

utils/resource/no_usage.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ var NoUsage User = noUsage{}
1010

1111
type noUsage struct{}
1212

13+
// AvailableDiskPercentage returns math.MaxUint64.
//
// NOTE(review): a percentage is normally at most 100. Confirm whether
// math.MaxUint64 is an intentional "never below any threshold" sentinel or
// whether 100 was intended here.
func (noUsage) AvailableDiskPercentage() uint64 {
14+
return math.MaxUint64
15+
}
16+
1317
// CPUUsage always returns 0.
func (noUsage) CPUUsage() float64 {
1418
return 0
1519
}
@@ -19,5 +23,5 @@ func (noUsage) DiskUsage() (float64, float64) {
1923
}
2024

2125
func (noUsage) AvailableDiskBytes() uint64 {
22-
return math.MaxUint64
26+
return 100
2327
}

utils/resource/resourcemock/user.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utils/resource/usage.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ type DiskUser interface {
4141

4242
// returns number of bytes available in the db volume
4343
AvailableDiskBytes() uint64
44+
45+
// returns percentage free in the db volume
46+
AvailableDiskPercentage() uint64
4447
}
4548

4649
type User interface {
@@ -82,6 +85,8 @@ type manager struct {
8285

8386
availableDiskBytes uint64
8487

88+
availableDiskPercent uint64
89+
8590
closeOnce sync.Once
8691
onClose chan struct{}
8792
}
@@ -132,6 +137,13 @@ func (m *manager) AvailableDiskBytes() uint64 {
132137
return m.availableDiskBytes
133138
}
134139

140+
func (m *manager) AvailableDiskPercentage() uint64 {
141+
m.usageLock.RLock()
142+
defer m.usageLock.RUnlock()
143+
144+
return m.availableDiskPercent
145+
}
146+
135147
func (m *manager) TrackProcess(pid int) {
136148
p, err := process.NewProcess(int32(pid))
137149
if err != nil {
@@ -174,7 +186,7 @@ func (m *manager) update(diskPath string, frequency, cpuHalflife, diskHalflife t
174186
currentScaledReadUsage := newDiskWeight * currentReadUsage
175187
currentScaledWriteUsage := newDiskWeight * currentWriteUsage
176188

177-
availableBytes, getBytesErr := storage.AvailableBytes(diskPath)
189+
availableBytes, availablePercentage, getBytesErr := storage.AvailableBytes(diskPath)
178190
if getBytesErr != nil {
179191
m.log.Verbo("failed to lookup resource",
180192
zap.String("resource", "system disk"),
@@ -190,6 +202,7 @@ func (m *manager) update(diskPath string, frequency, cpuHalflife, diskHalflife t
190202

191203
if getBytesErr == nil {
192204
m.availableDiskBytes = availableBytes
205+
m.availableDiskPercent = availablePercentage
193206
}
194207

195208
m.usageLock.Unlock()

utils/storage/storage_openbsd.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,23 @@
66

77
package storage
88

9-
import "syscall"
9+
import (
10+
"errors"
11+
"syscall"
12+
)
1013

11-
func AvailableBytes(storagePath string) (uint64, error) {
14+
var errZeroAvailableBytes = errors.New("available blocks is reported as 0")
15+
16+
func AvailableBytes(storagePath string) (uint64, uint64, error) {
1217
var stat syscall.Statfs_t
1318
err := syscall.Statfs(storagePath, &stat)
1419
if err != nil {
15-
return 0, err
20+
return 0, 0, err
21+
}
22+
if stat.Blocks == 0 {
23+
return 0, 0, errZeroAvailableBytes
1624
}
17-
avail := uint64(stat.F_bavail) * uint64(stat.F_bsize)
18-
return avail, nil
25+
avail := stat.F_bavail * uint64(stat.F_bsize)
26+
percentage := stat.F_bavail * 100 / stat.F_blocks
27+
return avail, percentage, nil
1928
}

0 commit comments

Comments
 (0)