Skip to content

Commit

Permalink
Add drive I/O metrics for Prometheus (#955)
Browse files Browse the repository at this point in the history
The following metrics are exported:
* directpv_stats_drive_ready
* directpv_stats_drive_total_read_bytes
* directpv_stats_drive_total_write_bytes
* directpv_stats_drive_read_latency_seconds
* directpv_stats_drive_write_latency_seconds
* directpv_stats_drive_wait_time_seconds

Fixes #839

Signed-off-by: Bala.FA <[email protected]>
  • Loading branch information
balamurugana authored Oct 15, 2024
1 parent 557e925 commit 35aaf55
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 4 deletions.
8 changes: 7 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
DirectPV nodes export Prometheus-compatible metrics data via port `10443`. The metrics data includes:
* directpv_stats_bytes_used
* directpv_stats_bytes_total
and categorized by labels `tenant`, `volumeID` and `node`.
* directpv_stats_drive_ready
* directpv_stats_drive_total_read_bytes
* directpv_stats_drive_total_write_bytes
* directpv_stats_drive_read_latency_seconds
* directpv_stats_drive_write_latency_seconds
* directpv_stats_drive_wait_time_seconds
and are categorized by the labels `drive`, `tenant`, `volumeID` and `node`.

To scrape data in Prometheus, each node must be accessible on port `10443`. A simple example is shown below.

Expand Down
18 changes: 18 additions & 0 deletions pkg/device/sysfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,21 @@ func getHolders(name string) ([]string, error) {
// getDMName reads the device-mapper name attribute of the given block device
// from sysfs. The result (and any error) is whatever readFirstLine yields for
// /sys/class/block/<name>/dm/name.
func getDMName(name string) (string, error) {
	dmNamePath := "/sys/class/block/" + name + "/dm/name"
	return readFirstLine(dmNamePath)
}

// GetStat reads /sys/class/block/<name>/stat and returns its
// whitespace-separated counters, in file order, as unsigned 64-bit integers.
//
// An error is returned when the sysfs file cannot be read or when any token is
// not a valid base-10 unsigned integer; in both cases the returned slice is nil.
func GetStat(name string) (stats []uint64, err error) {
	statLine, err := readFirstLine("/sys/class/block/" + name + "/stat")
	if err != nil {
		return nil, err
	}

	fields := strings.Fields(statLine)
	for i := range fields {
		value, parseErr := strconv.ParseUint(fields[i], 10, 64)
		if parseErr != nil {
			return nil, parseErr
		}
		stats = append(stats, value)
	}

	return stats, nil
}
143 changes: 140 additions & 3 deletions pkg/metrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,51 @@ package metrics

import (
"context"
"fmt"

directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
"github.com/minio/directpv/pkg/client"
"github.com/minio/directpv/pkg/consts"
"github.com/minio/directpv/pkg/device"
"github.com/minio/directpv/pkg/sys"
"github.com/minio/directpv/pkg/types"
"github.com/minio/directpv/pkg/utils"
"github.com/minio/directpv/pkg/xfs"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/klog/v2"
)

// defaultSectorSize is the number of bytes per sector used to convert the
// sector counters of the sysfs block stat file into byte counts. The kernel
// documentation (Documentation/block/stat.txt) describes those counters as
// standard 512-byte sectors, independent of the device block size.
const defaultSectorSize = 512

// driveStats holds the subset of sysfs block stat counters that are exported
// as drive metrics. Values are stored as float64 because that is what the
// Prometheus const-metric API consumes.
type driveStats struct {
// readBytes is the read-sectors counter multiplied by defaultSectorSize.
readBytes float64
// readTicks is the raw read-ticks counter (milliseconds per the kernel
// stat documentation; it is divided by 1000 when exported as seconds).
readTicks float64
// writeBytes is the write-sectors counter multiplied by defaultSectorSize.
writeBytes float64
// writeTicks is the raw write-ticks counter (milliseconds, see readTicks).
writeTicks float64
// timeInQueue is the raw time_in_queue counter (milliseconds, see readTicks).
timeInQueue float64
}

// getDriveStats reads the sysfs block statistics of driveName and converts the
// counters needed for metrics into a driveStats value.
//
// It returns an error when the stat file cannot be read or parsed, when it is
// empty, or when it carries fewer fields than are required to reach the
// time_in_queue counter.
func getDriveStats(driveName string) (*driveStats, error) {
	// Field positions in /sys/class/block/<dev>/stat.
	// Refer https://www.kernel.org/doc/Documentation/block/stat.txt for meaning of each field.
	const (
		readSectorsField  = 2
		readTicksField    = 3
		writeSectorsField = 6
		writeTicksField   = 7
		timeInQueueField  = 10

		// The highest index read below is timeInQueueField, so at least
		// timeInQueueField+1 fields must be present. A smaller bound would
		// let a short stat line cause an index-out-of-range panic.
		minStatFields = timeInQueueField + 1
	)

	stat, err := device.GetStat(driveName)
	switch {
	case err != nil:
		return nil, err
	case len(stat) == 0:
		return nil, fmt.Errorf("unable to read stat from drive %v", driveName)
	case len(stat) < minStatFields:
		return nil, fmt.Errorf("invalid stat format from drive %v", driveName)
	}

	// Convert to float64 before scaling so that a very large sector count
	// cannot wrap around in uint64 arithmetic.
	return &driveStats{
		readBytes:   float64(stat[readSectorsField]) * defaultSectorSize,
		readTicks:   float64(stat[readTicksField]),
		writeBytes:  float64(stat[writeSectorsField]) * defaultSectorSize,
		writeTicks:  float64(stat[writeTicksField]),
		timeInQueue: float64(stat[timeInQueueField]),
	}, nil
}

type metricsCollector struct {
nodeID directpvtypes.NodeID
desc *prometheus.Desc
Expand Down Expand Up @@ -95,21 +129,124 @@ func (c *metricsCollector) publishVolumeStats(ctx context.Context, volume *types
)
}

// publishDriveStats sends the metrics of a single drive to ch.
//
// The backing device is resolved from the drive's FSUUID; when that fails the
// problem is logged, a warning event is recorded on the drive, and nothing is
// published. Otherwise a drive_ready gauge is always sent (1 when the device
// statistics could be read, 0 when not), followed by the I/O metrics when
// statistics are available. Every metric carries a single "drive" label set to
// the drive name.
func (c *metricsCollector) publishDriveStats(drive *types.Drive, ch chan<- prometheus.Metric) {
	fsuuid := drive.Status.FSUUID

	deviceID, err := c.getDeviceByFSUUID(fsuuid)
	if err != nil {
		klog.ErrorS(
			err,
			"unable to find device by FSUUID; "+
				"either device is removed or run command "+
				"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
				" on the host to reload",
			"FSUUID", fsuuid)
		client.Eventf(
			drive, client.EventTypeWarning, client.EventReasonMetrics,
			"unable to find device by FSUUID %v; "+
				"either device is removed or run command "+
				"`sudo udevadm control --reload-rules && sudo udevadm trigger`"+
				" on the host to reload", fsuuid)

		return
	}

	// emit builds one const metric in the "stats" subsystem and sends it.
	emit := func(name, help string, valueType prometheus.ValueType, value float64) {
		ch <- prometheus.MustNewConstMetric(
			prometheus.NewDesc(
				prometheus.BuildFQName(consts.AppName, "stats", name),
				help,
				[]string{"drive"}, nil),
			valueType,
			value, drive.Name,
		)
	}

	driveStat, err := getDriveStats(utils.TrimDevPrefix(deviceID))

	ready := float64(1) // Online
	if err != nil {
		klog.ErrorS(err, "unable to read drive statistics")
		ready = float64(0) // Offline
	}
	emit("drive_ready", "Drive Online/Offline Status", prometheus.GaugeValue, ready)

	// Without statistics only the readiness gauge can be reported.
	if driveStat == nil {
		return
	}

	// Throughput counters.
	emit("drive_total_read_bytes", "Total number of bytes read from the drive",
		prometheus.CounterValue, driveStat.readBytes)
	emit("drive_total_write_bytes", "Total number of bytes written to the drive",
		prometheus.CounterValue, driveStat.writeBytes)

	// Tick counters are divided by 1000 to export them in seconds.
	// NOTE(review): the kernel documents these as cumulative totals, yet they
	// are exported as gauges - confirm this is the intended metric type.
	emit("drive_read_latency_seconds", "Drive Read Latency",
		prometheus.GaugeValue, driveStat.readTicks/1000)
	emit("drive_write_latency_seconds", "Drive Write Latency",
		prometheus.GaugeValue, driveStat.writeTicks/1000)
	emit("drive_wait_time_seconds", "Drive Wait Time",
		prometheus.GaugeValue, driveStat.timeInQueue/1000)
}

// Collect is called by Prometheus registry when collecting metrics.
func (c *metricsCollector) Collect(ch chan<- prometheus.Metric) {
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()

resultCh := client.NewVolumeLister().
// Collecting volume statistics
volumeResultCh := client.NewVolumeLister().
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
List(ctx)
for result := range resultCh {
for result := range volumeResultCh {
if result.Err != nil {
return
break
}

if result.Volume.Status.TargetPath != "" {
c.publishVolumeStats(ctx, &result.Volume, ch)
}
}

// Collecting drive statistics
driveResultCh := client.NewDriveLister().
NodeSelector([]directpvtypes.LabelValue{directpvtypes.ToLabelValue(string(c.nodeID))}).
List(ctx)
for result := range driveResultCh {
if result.Err != nil {
break
}

c.publishDriveStats(&result.Drive, ch)
}
}

0 comments on commit 35aaf55

Please sign in to comment.