-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Metrics support enhancements: 1) Splitting up amd_gpu_memory_use_perc… #5
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,7 +66,9 @@ type amd_data struct { | |
GPUSCLK *prometheus.Desc | ||
GPUMCLK *prometheus.Desc | ||
GPUUsage *prometheus.Desc | ||
GPUMemoryBusyPercent *prometheus.Desc | ||
GPUMemoryUsage *prometheus.Desc | ||
GPUMemoryTotal *prometheus.Desc | ||
Data func() (collect.AMDParams) | ||
} | ||
|
||
|
@@ -141,52 +143,63 @@ func NewCollector(handle func() (collect.AMDParams)) prometheus.Collector { | |
GPUDevId: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_dev_id"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_dev_id", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_dev_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUPowerCap: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_power_cap"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_power_cap", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_power_cap", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUPowerAvg: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_power_avg"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_power_avg", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_power_avg", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUTemperature: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_current_temperature"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_current_temperature", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_current_temperature", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUSCLK: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_SCLK"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_SCLK", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_SCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUMCLK: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_MCLK"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_MCLK", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_MCLK", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUUsage: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_use_percent"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_use_percent", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUMemoryUsage: prometheus.NewDesc( | ||
GPUMemoryBusyPercent: prometheus.NewDesc( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. By "busy" memory, do you mean "used"? It is not clear what "busy" means for memory. Could we change it to "used"? |
||
prometheus.BuildFQName("amd", "", "gpu_memory_use_percent"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_memory_use_percent", "productname"},// The metric's variable label dimensions. | ||
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUMemoryUsage: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_memory_used"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
GPUMemoryTotal: prometheus.NewDesc( | ||
prometheus.BuildFQName("amd", "", "gpu_memory_total"), | ||
"AMD Params",// The metric's help text. | ||
[]string{"gpu_id", "productname", "node_id", "pod"},// The metric's variable label dimensions. | ||
nil,// The metric's constant label dimensions. | ||
), | ||
|
||
|
||
Data: handle, //This is the Scan() function handle | ||
} | ||
|
@@ -260,63 +273,79 @@ func (c *amd_data) Collect(ch chan<- prometheus.Metric) { | |
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUDevId, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUPowerCap{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUPowerCap, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUPowerAvg{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUPowerAvg, | ||
prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.CounterValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUTemperature{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUTemperature, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUSCLK{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUSCLK, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUMCLK{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUMCLK, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUUsage{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUUsage, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUMemoryBusyPercent{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUMemoryBusyPercent, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUMemoryUsage{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUMemoryUsage, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i]) | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
for i,s := range data.GPUMemoryTotal{ | ||
if uint(i) > (data.NumGPUs - 1) { | ||
continue | ||
} | ||
ch <- prometheus.MustNewConstMetric(c.GPUMemoryTotal, | ||
prometheus.GaugeValue, float64(s), strconv.Itoa(i), gGPUProductNames[i], gNodeName, gPod) | ||
} | ||
|
||
ch <- prometheus.MustNewConstMetric(c.Sockets, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,7 +53,8 @@ import ( | |
) | ||
|
||
var gGPUProductNames[24] string | ||
|
||
var gNodeName string | ||
var gPod string | ||
/* rocm-smi output sample | ||
{"card0": | ||
{ | ||
|
@@ -98,7 +99,19 @@ func GetGpuProductNames() { | |
} | ||
} | ||
|
||
func GetNodeName() { | ||
nodename, err := exec.Command("uname", "-n").Output() | ||
if err == nil { | ||
gNodeName = string(nodename) | ||
gPod = string(nodename) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need |
||
} else { | ||
log.Fatal(err) | ||
} | ||
} | ||
|
||
func main() { | ||
// Get Node name | ||
GetNodeName() | ||
// Get all GPU product names | ||
GetGpuProductNames() | ||
|
||
|
@@ -122,4 +135,3 @@ func main() { | |
log.Fatalf("cannot start collector exporter: %s", err) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The "pod" argument does not work for any of the APIs when we try to query. Please check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Querying with the "node_id" argument also gives "Empty Query result".