Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add systemd performance metrics (CPU/Memory) #3233

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 103 additions & 5 deletions collector/systemd_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ import (
)

const (
// minSystemdVersionCPUUsage is the minimum SystemD version for availability of
// the 'CPUUsageNSec' service property
minSystemdVersionCPUUsage = 220
// minSystemdVersionMemoryCurrent is the minimum SystemD version for availability of
// the 'MemoryCurrent' service property
minSystemdVersionMemoryCurrent = 219
// minSystemdVersionSystemState is the minimum SystemD version for availability of
// the 'SystemState' manager property and the timer property 'LastTriggerUSec'
// https://github.com/prometheus/node_exporter/issues/291
Expand All @@ -52,17 +58,20 @@ var (
systemdUnitExcludeSet = true
return nil
}).String()
oldSystemdUnitExclude = kingpin.Flag("collector.systemd.unit-blacklist", "DEPRECATED: Use collector.systemd.unit-exclude").Hidden().String()
systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus (Strongly discouraged since it requires root. For testing purposes only).").Hidden().Bool()
enableTaskMetrics = kingpin.Flag("collector.systemd.enable-task-metrics", "Enables service unit tasks metrics unit_tasks_current and unit_tasks_max").Bool()
enableRestartsMetrics = kingpin.Flag("collector.systemd.enable-restarts-metrics", "Enables service unit metric service_restart_total").Bool()
enableStartTimeMetrics = kingpin.Flag("collector.systemd.enable-start-time-metrics", "Enables service unit metric unit_start_time_seconds").Bool()
oldSystemdUnitExclude = kingpin.Flag("collector.systemd.unit-blacklist", "DEPRECATED: Use collector.systemd.unit-exclude").Hidden().String()
systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus (Strongly discouraged since it requires root. For testing purposes only).").Hidden().Bool()
enablePerformanceMetrics = kingpin.Flag("collector.systemd.enable-performance-metrics", "Enables service unit performance metric unit_cpu_usage_nseconds and unit_memory_current").Bool()
enableTaskMetrics = kingpin.Flag("collector.systemd.enable-task-metrics", "Enables service unit tasks metrics unit_tasks_current and unit_tasks_max").Bool()
enableRestartsMetrics = kingpin.Flag("collector.systemd.enable-restarts-metrics", "Enables service unit metric service_restart_total").Bool()
enableStartTimeMetrics = kingpin.Flag("collector.systemd.enable-start-time-metrics", "Enables service unit metric unit_start_time_seconds").Bool()

systemdVersionRE = regexp.MustCompile(`[0-9]{3,}(\.[0-9]+)?`)
)

type systemdCollector struct {
unitDesc *prometheus.Desc
unitCPUUsageDesc *prometheus.Desc
unitMemoryCurrentDesc *prometheus.Desc
unitStartTimeDesc *prometheus.Desc
unitTasksCurrentDesc *prometheus.Desc
unitTasksMaxDesc *prometheus.Desc
Expand Down Expand Up @@ -94,6 +103,14 @@ func NewSystemdCollector(logger *slog.Logger) (Collector, error) {
prometheus.BuildFQName(namespace, subsystem, "unit_state"),
"Systemd unit", []string{"name", "state", "type"}, nil,
)
unitCPUUsageDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_cpu_usage_nseconds"),
"Current CPU used per systemd unit in nanosecond", []string{"name"}, nil,
)
unitMemoryCurrentDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_memory_current"),
"Current memory used per systemd unit", []string{"name"}, nil,
)
unitStartTimeDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_start_time_seconds"),
"Start time of the unit since unix epoch in seconds.", []string{"name"}, nil,
Expand Down Expand Up @@ -156,6 +173,8 @@ func NewSystemdCollector(logger *slog.Logger) (Collector, error) {

return &systemdCollector{
unitDesc: unitDesc,
unitCPUUsageDesc: unitCPUUsageDesc,
unitMemoryCurrentDesc: unitMemoryCurrentDesc,
unitStartTimeDesc: unitStartTimeDesc,
unitTasksCurrentDesc: unitTasksCurrentDesc,
unitTasksMaxDesc: unitTasksMaxDesc,
Expand Down Expand Up @@ -187,6 +206,12 @@ func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error {
if systemdVersion < minSystemdVersionSystemState {
c.logger.Debug("Detected systemd version is lower than minimum, some systemd state and timer metrics will not be available", "current", systemdVersion, "minimum", minSystemdVersionSystemState)
}
if *enablePerformanceMetrics && systemdVersion < minSystemdVersionCPUUsage {
c.logger.Debug("Detected systemd version is lower than minimum, services cpu usage metrics will not be available", "current", systemdVersion, "minimum", minSystemdVersionCPUUsage)
}
if *enablePerformanceMetrics && systemdVersion < minSystemdVersionMemoryCurrent {
c.logger.Debug("Detected systemd version is lower than minimum, services memory usage metrics will not be available", "current", systemdVersion, "minimum", minSystemdVersionMemoryCurrent)
}
ch <- prometheus.MustNewConstMetric(
c.systemdVersionDesc,
prometheus.GaugeValue,
Expand Down Expand Up @@ -220,6 +245,28 @@ func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error {
c.logger.Debug("collectUnitStatusMetrics took", "duration_seconds", time.Since(begin).Seconds())
}()

if *enablePerformanceMetrics {
if systemdVersion >= minSystemdVersionCPUUsage {
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectUnitCPUUsageMetrics(conn, ch, units)
c.logger.Debug("collectUnitCPUUsageMetrics took", "duration_seconds", time.Since(begin).Seconds())
}()
}

if systemdVersion >= minSystemdVersionMemoryCurrent {
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectUnitMemoryCurrentMetrics(conn, ch, units)
c.logger.Debug("collectUnitMemoryCurrentMetrics took", "duration_seconds", time.Since(begin).Seconds())
}()
}
}

if *enableStartTimeMetrics {
wg.Add(1)
go func() {
Expand Down Expand Up @@ -342,6 +389,57 @@ func (c *systemdCollector) collectSockets(conn *dbus.Conn, ch chan<- prometheus.
}
}

// collectUnitCPUUsageMetrics sends one unitCPUUsageDesc sample on ch for every
// unit in units whose name ends in ".service".
//
// Units whose ActiveState is not "active" are reported as 0 without querying
// systemd. For active units the value is read from the "CPUUsageNSec" service
// property over conn. A unit is skipped (no sample is sent) when the property
// cannot be read, when its value is not a uint64, or when the value is
// math.MaxUint64.
//
// NOTE(review): CPUUsageNSec looks like a cumulative value, so CounterValue may
// be a better fit than GaugeValue — confirm before changing the exposed type.
func (c *systemdCollector) collectUnitCPUUsageMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
	for _, u := range units {
		if !strings.HasSuffix(u.Name, ".service") {
			continue
		}

		// Inactive services keep the zero value and are still exported.
		var val uint64
		if u.ActiveState == "active" {
			prop, err := conn.GetServicePropertyContext(context.TODO(), u.Name, "CPUUsageNSec")
			if err != nil {
				c.logger.Debug("couldn't get service property CPUUsageNSec", "unit", u.Name, "err", err)
				continue
			}
			// Use the checked form of the assertion: an unexpected variant type
			// must not panic the whole collector.
			v, ok := prop.Value.Value().(uint64)
			if !ok {
				c.logger.Debug("couldn't convert service property CPUUsageNSec to uint64", "unit", u.Name)
				continue
			}
			val = v
		}

		// Don't export the metric if dbus reports MaxUint64; this mirrors the
		// handling of MemoryCurrent (presumably the "no value available"
		// sentinel, e.g. CPU accounting disabled — confirm).
		if val == math.MaxUint64 {
			continue
		}

		ch <- prometheus.MustNewConstMetric(
			c.unitCPUUsageDesc, prometheus.GaugeValue,
			float64(val), u.Name)
	}
}

// collectUnitMemoryCurrentMetrics sends one unitMemoryCurrentDesc sample on ch
// for every unit in units whose name ends in ".service".
//
// Units whose ActiveState is not "active" are reported as 0 without querying
// systemd. For active units the value is read from the "MemoryCurrent" service
// property over conn. A unit is skipped (no sample is sent) when the property
// cannot be read, when its value is not a uint64, or when the value is
// math.MaxUint64.
func (c *systemdCollector) collectUnitMemoryCurrentMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
	for _, u := range units {
		if !strings.HasSuffix(u.Name, ".service") {
			continue
		}

		// Inactive services keep the zero value and are still exported.
		var val uint64
		if u.ActiveState == "active" {
			prop, err := conn.GetServicePropertyContext(context.TODO(), u.Name, "MemoryCurrent")
			if err != nil {
				c.logger.Debug("couldn't get service property MemoryCurrent", "unit", u.Name, "err", err)
				continue
			}
			// Use the checked form of the assertion: an unexpected variant type
			// must not panic the whole collector.
			v, ok := prop.Value.Value().(uint64)
			if !ok {
				c.logger.Debug("couldn't convert service property MemoryCurrent to uint64", "unit", u.Name)
				continue
			}
			val = v
		}

		// Don't export the metric if dbus reports MaxUint64.
		if val == math.MaxUint64 {
			continue
		}

		ch <- prometheus.MustNewConstMetric(
			c.unitMemoryCurrentDesc, prometheus.GaugeValue,
			float64(val), u.Name)
	}
}

func (c *systemdCollector) collectUnitStartTimeMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
var startTimeUsec uint64

Expand Down