Skip to content

Commit 5a27b2e

Browse files
committed
CP-33636: add DCGM GPU metrics collection and transformation
Add complete NVIDIA DCGM GPU metrics support with Prometheus scraping and a transformation pipeline.

Prometheus Scrape Configuration:
- Added DCGM Exporter scrape job with Kubernetes service discovery
- Matches services labeled `app.kubernetes.io/name=dcgm-exporter`
- Collects DCGM_FI_DEV_GPU_UTIL, DCGM_FI_DEV_FB_USED, DCGM_FI_DEV_FB_FREE
- Includes 10 comprehensive Helm unit tests for scrape configuration

Metric Transformation Pipeline:
- Implemented hexagonal architecture for vendor-agnostic GPU metrics:
  - MetricTransformer interface in app/types (port)
  - Catalog transformer with sequential routing (domain service)
  - DCGM transformer for NVIDIA metrics (adapter)

DCGM Transformations:
- DCGM_FI_DEV_GPU_UTIL → container_resources_gpu_usage_percent
- DCGM_FI_DEV_FB_USED + FREE → container_resources_gpu_memory_usage_percent
- UUID label renamed to gpu_uuid with GPU- prefix stripped
- Supports both 'node' and 'Hostname' labels for node identification
- Memory metrics buffered and calculated as percentages in flush phase

Testing:
- 96.4% test coverage for DCGM transformer with table-driven tests
- Edge cases: missing labels, invalid values, incomplete pairs
- 10 Helm unit tests for scrape configuration and metric filtering
- Verified on EKS cluster (g4dn.xlarge with Tesla T4 GPU)

Documentation:
- README.md with transformation tables and processing strategy
- CLAUDE.md with AI development guide and architecture context
- Documented label transformations, node identification, and edge cases

Configuration:
- Added 100.0 to golangci-lint allowFloats for percentage conversion
- Updated Makefile for transform package test target

This provides a foundation for multi-vendor GPU support (AMD, Intel) following established architectural patterns in the codebase.
1 parent 664203a commit 5a27b2e

File tree

21 files changed

+2563
-12
lines changed

21 files changed

+2563
-12
lines changed

.golangci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ linters:
5151
- name: atomic
5252
- name: add-constant
5353
arguments:
54-
- allowFloats: 0.0,0.,1.0,1.,2.0,2.,3.0,3.
54+
- allowFloats: 0.0,0.,1.0,1.,2.0,2.,3.0,3.,100.0
5555
allowInts: 0,1,2,3,10,8,16,32,64,100,128,192,256,512,1024,2048,4096,8192,16384,32768,65536
5656
allowStrs: '""'
5757
maxLitCount: "3"

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ GO_BINARY_DIRS = \
289289
$(NULL)
290290

291291
GO_COMMAND_PACKAGE_DIRS = \
292-
$(foreach parent_dir,$(GO_BINARY_DIRS),$(foreach src_dir,$(wildcard $(parent_dir)/*/),$(patsubst %/,%,$(src_dir)))) \
292+
$(patsubst %/,%,$(filter %/,$(foreach parent_dir,$(GO_BINARY_DIRS),$(wildcard $(parent_dir)/*/)))) \
293293
$(NULL)
294294

295295
GO_BINARIES = \

app/domain/metric_collector.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"github.com/rs/zerolog/log"
3131

3232
config "github.com/cloudzero/cloudzero-agent/app/config/gator"
33+
"github.com/cloudzero/cloudzero-agent/app/domain/transform"
3334
"github.com/cloudzero/cloudzero-agent/app/types"
3435
)
3536

@@ -113,6 +114,9 @@ type MetricCollector struct {
113114
// filter implements metric classification logic to separate cost from observability metrics.
114115
filter *MetricFilter
115116

117+
// transformer handles vendor-specific metric transformation (e.g., DCGM GPU metrics).
118+
transformer types.MetricTransformer
119+
116120
// clock provides time abstraction for testing and consistent timestamping.
117121
clock types.TimeProvider
118122

@@ -144,6 +148,7 @@ func NewMetricCollector(s *config.Settings, clock types.TimeProvider, costStore
144148
costStore: costStore,
145149
observabilityStore: observabilityStore,
146150
filter: filter,
151+
transformer: transform.NewMetricTransformer(),
147152
clock: clock,
148153
cancelFunc: cancel,
149154
}
@@ -192,6 +197,27 @@ func (d *MetricCollector) PutMetrics(ctx context.Context, contentType, encodingT
192197
return nil, fmt.Errorf("unsupported content type: %s", contentType)
193198
}
194199

200+
// Log complete DCGM metrics for debugging GPU transformation
201+
for _, metric := range metrics {
202+
if strings.HasPrefix(metric.MetricName, "DCGM_FI_DEV_") {
203+
log.Ctx(ctx).Info().
204+
Str("metricName", metric.MetricName).
205+
Str("value", metric.Value).
206+
Str("nodeName", metric.NodeName).
207+
Interface("labels", metric.Labels).
208+
Time("timestamp", metric.TimeStamp).
209+
Str("clusterName", metric.ClusterName).
210+
Str("cloudAccountID", metric.CloudAccountID).
211+
Msg("DCGM metric received")
212+
}
213+
}
214+
215+
// Transform vendor-specific metrics (e.g., DCGM GPU metrics) before filtering
216+
metrics, err = d.transformer.Transform(ctx, metrics)
217+
if err != nil {
218+
return stats, fmt.Errorf("failed to transform metrics: %w", err)
219+
}
220+
195221
costMetrics, observabilityMetrics, droppedMetrics := d.filter.Filter(metrics)
196222

197223
metricsReceived.WithLabelValues().Add(float64(len(metrics)))
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Package catalog provides a catalog-based metric transformer that routes
5+
// metrics to registered specialized transformers.
6+
//
7+
// The catalog transformer orchestrates multiple specialized transformers to
8+
// provide automatic routing based on metric characteristics.
9+
package catalog
10+
11+
import (
12+
"context"
13+
14+
"github.com/cloudzero/cloudzero-agent/app/types"
15+
)
16+
17+
// Transformer implements types.MetricTransformer using a catalog of specialized
18+
// transformers.
19+
//
20+
// Each transformer in the catalog processes all metrics sequentially.
21+
// Transformers identify which metrics they can handle and transform those while
22+
// passing through others unchanged.
23+
type Transformer struct {
24+
transformers []types.MetricTransformer
25+
}
26+
27+
// NewTransformer creates a new catalog transformer with the provided
28+
// specialized transformers.
29+
//
30+
// Transformers are applied sequentially - each transformer receives all metrics
31+
// and decides which ones to transform based on implementation-specific logic
32+
// (e.g., metric name patterns).
33+
func NewTransformer(transformers ...types.MetricTransformer) *Transformer {
34+
return &Transformer{
35+
transformers: transformers,
36+
}
37+
}
38+
39+
// Transform processes metrics by routing them sequentially through specialized
40+
// transformers.
41+
//
42+
// Processing flow:
43+
// 1. Pass metrics through first transformer
44+
// 2. Pass results through second transformer
45+
// 3. Continue until all transformers have processed the metrics
46+
//
47+
// This implements the types.MetricTransformer interface.
48+
func (t *Transformer) Transform(ctx context.Context, metrics []types.Metric) ([]types.Metric, error) {
49+
if len(t.transformers) == 0 {
50+
return metrics, nil
51+
}
52+
53+
// Process through each transformer in sequence
54+
result := metrics
55+
var err error
56+
57+
for _, transformer := range t.transformers {
58+
result, err = transformer.Transform(ctx, result)
59+
if err != nil {
60+
return nil, err
61+
}
62+
}
63+
64+
return result, nil
65+
}

0 commit comments

Comments
 (0)