From a5e4c9ce96e0f4f8c73defb09d34f96355096da1 Mon Sep 17 00:00:00 2001 From: Jorge Parada Date: Tue, 21 Nov 2023 16:00:11 -0800 Subject: [PATCH] Metrics support enhancements: 1) Keeping amd_gpu_memory_use_percent (memory busy percent) and adding two separate metrics: amd_gpu_memory_used and amd_gpu_memory_total. 2) Adding 'gpu_id' label which is the index of the GPU on the node. 3) Adding 'node_id' label which is the name of the node on which the GPU runs. 4) Adding 'pod' label for the pod name. --- src/collect/cpustat.go | 10 +++++++ src/cpu_data.go | 65 ++++++++++++++++++++++++++++++------------ src/main.go | 16 +++++++++-- 3 files changed, 71 insertions(+), 20 deletions(-) diff --git a/src/collect/cpustat.go b/src/collect/cpustat.go index d3b292a..9f23849 100644 --- a/src/collect/cpustat.go +++ b/src/collect/cpustat.go @@ -59,7 +59,9 @@ type AMDParams struct { GPUSCLK [24]float64 GPUMCLK [24]float64 GPUUsage [24]float64 + GPUMemoryBusyPercent [24]float64 GPUMemoryUsage [24]float64 + GPUMemoryTotal [24]float64 } func Scan() (AMDParams) { @@ -141,8 +143,16 @@ func Scan() (AMDParams) { value64 = 0 value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_busy_percent_get(i)) + stat.GPUMemoryBusyPercent[i] = float64(value64) + value64 = 0 + + value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_usage_get(i)) stat.GPUMemoryUsage[i] = float64(value64) value64 = 0 + + value64 = uint64(goamdsmi.GO_rsmi_dev_gpu_memory_total_get(i)) + stat.GPUMemoryTotal[i] = float64(value64) + value64 = 0 } } diff --git a/src/cpu_data.go b/src/cpu_data.go index 0257b5c..3f0f605 100644 --- a/src/cpu_data.go +++ b/src/cpu_data.go @@ -66,7 +66,9 @@ type amd_data struct { GPUSCLK *prometheus.Desc GPUMCLK *prometheus.Desc GPUUsage *prometheus.Desc + GPUMemoryBusyPercent *prometheus.Desc GPUMemoryUsage *prometheus.Desc + GPUMemoryTotal *prometheus.Desc Data func() (collect.AMDParams) } @@ -141,52 +143,63 @@ func NewCollector(handle func() (collect.AMDParams)) prometheus.Collector { GPUDevId: prometheus.NewDesc( prometheus.BuildFQName("amd", "", 
"gpu_dev_id"), "AMD Params",// The metric's help text. - []string{"gpu_dev_id", "productname"},// The metric's variable label dimensions. + []string{"gpu_dev_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUPowerCap: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_power_cap"), "AMD Params",// The metric's help text. - []string{"gpu_power_cap", "productname"},// The metric's variable label dimensions. + []string{"gpu_power_cap", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUPowerAvg: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_power_avg"), "AMD Params",// The metric's help text. - []string{"gpu_power_avg", "productname"},// The metric's variable label dimensions. + []string{"gpu_power_avg", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUTemperature: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_current_temperature"), "AMD Params",// The metric's help text. - []string{"gpu_current_temperature", "productname"},// The metric's variable label dimensions. + []string{"gpu_current_temperature", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUSCLK: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_SCLK"), "AMD Params",// The metric's help text. - []string{"gpu_SCLK", "productname"},// The metric's variable label dimensions. + []string{"gpu_SCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUMCLK: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_MCLK"), "AMD Params",// The metric's help text. - []string{"gpu_MCLK", "productname"},// The metric's variable label dimensions. 
+ []string{"gpu_MCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), GPUUsage: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_use_percent"), "AMD Params",// The metric's help text. - []string{"gpu_use_percent", "productname"},// The metric's variable label dimensions. + []string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. ), - GPUMemoryUsage: prometheus.NewDesc( + GPUMemoryBusyPercent: prometheus.NewDesc( prometheus.BuildFQName("amd", "", "gpu_memory_use_percent"), "AMD Params",// The metric's help text. - []string{"gpu_memory_use_percent", "productname"},// The metric's variable label dimensions. + []string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. + nil,// The metric's constant label dimensions. + ), + GPUMemoryUsage: prometheus.NewDesc( + prometheus.BuildFQName("amd", "", "gpu_memory_used"), + "AMD Params",// The metric's help text. + []string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. + nil,// The metric's constant label dimensions. + ), + GPUMemoryTotal: prometheus.NewDesc( + prometheus.BuildFQName("amd", "", "gpu_memory_total"), + "AMD Params",// The metric's help text. + []string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. nil,// The metric's constant label dimensions. 
), - Data: handle, //This is the Scan() function handle } @@ -260,7 +273,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUDevId, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUPowerCap{ @@ -268,7 +281,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUPowerCap, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUPowerAvg{ @@ -276,7 +289,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUPowerAvg, - prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUTemperature{ @@ -284,7 +297,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUTemperature, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUSCLK{ @@ -292,7 +305,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUSCLK, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUMCLK{ @@ -300,7 +313,7 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUMCLK, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + 
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUUsage{ @@ -308,7 +321,15 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUUsage, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) + } + + for i,s := range data.GPUMemoryBusyPercent{ + if uint(i) > (data.NumGPUs - 1) { + continue + } + ch <- prometheus.MustNewConstMetric(c.GPUMemoryBusyPercent, + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } for i,s := range data.GPUMemoryUsage{ @@ -316,7 +337,15 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { continue } ch <- prometheus.MustNewConstMetric(c.GPUMemoryUsage, - prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) + } + + for i,s := range data.GPUMemoryTotal{ + if uint(i) > (data.NumGPUs - 1) { + continue + } + ch <- prometheus.MustNewConstMetric(c.GPUMemoryTotal, + prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) } ch <- prometheus.MustNewConstMetric(c.Sockets, diff --git a/src/main.go b/src/main.go index 0865ff7..88d415d 100644 --- a/src/main.go +++ b/src/main.go @@ -53,7 +53,8 @@ import ( ) var gGPUProductNames[24] string - +var gNodeName string +var gPod string /* rocm-smi output sample {"card0": { @@ -98,7 +99,19 @@ func GetGpuProductNames() { } } +func GetNodeName() { + nodename, err := exec.Command("uname", "-n").Output() + if err == nil { + gNodeName = string(nodename) + gPod = string(nodename) + } else { + log.Fatal(err) + } +} + func main() { + // Get Node name + GetNodeName() // Get all GPU product names GetGpuProductNames() @@ -122,4 +135,3 @@ func main() { log.Fatalf("cannot start collector exporter: %s", 
err) } } -