From e932870550fb0d26684b41be1c41cbc7b760a6c4 Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Wed, 8 Oct 2025 18:15:45 -0400
Subject: [PATCH] CP-33636: add DCGM GPU metrics collection and transformation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add NVIDIA DCGM GPU metrics support with Prometheus scraping and a
transformation pipeline.

Prometheus Scrape Configuration:

- Added DCGM Exporter scrape job with Kubernetes service discovery
- Matches services labeled `app.kubernetes.io/name=dcgm-exporter`
- Collects DCGM_FI_DEV_GPU_UTIL, DCGM_FI_DEV_FB_USED, DCGM_FI_DEV_FB_FREE
- Includes 10 Helm unit tests for the scrape configuration

Metric Transformation Pipeline:

- Implemented hexagonal architecture for vendor-agnostic GPU metrics:
  - MetricTransformer interface in app/types (port)
  - Catalog transformer with sequential routing (domain service)
  - DCGM transformer for NVIDIA metrics (adapter)

DCGM Transformations:

- DCGM_FI_DEV_GPU_UTIL → container_resources_gpu_usage_percent
- DCGM_FI_DEV_FB_USED + FB_FREE → container_resources_gpu_memory_usage_percent
- UUID label renamed to gpu_uuid with GPU- prefix stripped
- Supports both 'node' and 'Hostname' labels for node identification
- Memory metrics buffered and calculated as percentages in the flush phase

Testing:

- 96.4% test coverage for the DCGM transformer with table-driven tests
- Edge cases: missing labels, invalid values, incomplete pairs
- 10 Helm unit tests for scrape configuration and metric filtering
- Verified on an EKS cluster (g4dn.xlarge with a Tesla T4 GPU)

Documentation:

- README.md with transformation tables and processing strategy
- CLAUDE.md with AI development guide and architecture context
- Documented label transformations, node identification, and edge cases

Configuration:

- Added 100.0 to the golangci-lint allowFloats list for percentage conversion
- Updated the Makefile for the transform package test target

This provides a foundation for multi-vendor GPU support (AMD, Intel),
following established architectural patterns in the codebase.
--- .golangci.yaml | 2 +- Makefile | 2 +- app/domain/metric_collector.go | 26 + app/domain/transform/catalog/catalog.go | 65 ++ app/domain/transform/dcgm/CLAUDE.md | 581 ++++++++++++++ app/domain/transform/dcgm/README.md | 395 +++++++++ app/domain/transform/dcgm/transformer.go | 340 ++++++++ app/domain/transform/dcgm/transformer_test.go | 757 ++++++++++++++++++ app/domain/transform/transform.go | 25 + app/functions/helmless/default-values.yaml | 5 + app/types/metric_transformer.go | 47 ++ helm/templates/_cm_helpers.tpl | 88 ++ helm/templates/_defaults.tpl | 6 + helm/templates/agent-cm.yaml | 4 + helm/tests/gpu_metrics_test.yaml | 146 ++++ helm/values.schema.json | 12 + helm/values.schema.yaml | 22 + helm/values.yaml | 5 + tests/helm/template/cert-manager.yaml | 13 +- tests/helm/template/federated.yaml | 15 +- tests/helm/template/manifest.yaml | 13 +- 21 files changed, 2557 insertions(+), 12 deletions(-) create mode 100644 app/domain/transform/catalog/catalog.go create mode 100644 app/domain/transform/dcgm/CLAUDE.md create mode 100644 app/domain/transform/dcgm/README.md create mode 100644 app/domain/transform/dcgm/transformer.go create mode 100644 app/domain/transform/dcgm/transformer_test.go create mode 100644 app/domain/transform/transform.go create mode 100644 app/types/metric_transformer.go create mode 100644 helm/tests/gpu_metrics_test.yaml diff --git a/.golangci.yaml b/.golangci.yaml index 20e70c6d..42ea19cf 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -51,7 +51,7 @@ linters: - name: atomic - name: add-constant arguments: - - allowFloats: 0.0,0.,1.0,1.,2.0,2.,3.0,3. + - allowFloats: 0.0,0.,1.0,1.,2.0,2.,3.0,3.,100.0 allowInts: 0,1,2,3,10,8,16,32,64,100,128,192,256,512,1024,2048,4096,8192,16384,32768,65536 allowStrs: '""' maxLitCount: "3" diff --git a/Makefile b/Makefile index 2f08a74c..7b0289e7 100644 --- a/Makefile +++ b/Makefile @@ -289,7 +289,7 @@ GO_BINARY_DIRS = \ $(NULL) GO_COMMAND_PACKAGE_DIRS = \ - $(foreach parent_dir,$(GO_BINARY_DIRS),$(foreach src_dir,$(wildcard $(parent_dir)/*/),$(patsubst %/,%,$(src_dir)))) \ + $(patsubst %/,%,$(filter %/,$(foreach parent_dir,$(GO_BINARY_DIRS),$(wildcard $(parent_dir)/*/)))) \ $(NULL) GO_BINARIES = \ diff --git a/app/domain/metric_collector.go b/app/domain/metric_collector.go index 17e010d7..d85a5548 100644 --- a/app/domain/metric_collector.go +++ b/app/domain/metric_collector.go @@ -30,6 +30,7 @@ import ( "github.com/rs/zerolog/log" config "github.com/cloudzero/cloudzero-agent/app/config/gator" + "github.com/cloudzero/cloudzero-agent/app/domain/transform" "github.com/cloudzero/cloudzero-agent/app/types" ) @@ -113,6 +114,9 @@ type MetricCollector struct { // filter implements metric classification logic to separate cost from observability metrics. filter *MetricFilter + // transformer handles vendor-specific metric transformation (e.g., DCGM GPU metrics). + transformer types.MetricTransformer + // clock provides time abstraction for testing and consistent timestamping. 
clock types.TimeProvider @@ -144,6 +148,7 @@ func NewMetricCollector(s *config.Settings, clock types.TimeProvider, costStore costStore: costStore, observabilityStore: observabilityStore, filter: filter, + transformer: transform.NewMetricTransformer(), clock: clock, cancelFunc: cancel, } @@ -192,6 +197,27 @@ func (d *MetricCollector) PutMetrics(ctx context.Context, contentType, encodingT return nil, fmt.Errorf("unsupported content type: %s", contentType) } + // Log complete DCGM metrics for debugging GPU transformation + for _, metric := range metrics { + if strings.HasPrefix(metric.MetricName, "DCGM_FI_DEV_") { + log.Ctx(ctx).Info(). + Str("metricName", metric.MetricName). + Str("value", metric.Value). + Str("nodeName", metric.NodeName). + Interface("labels", metric.Labels). + Time("timestamp", metric.TimeStamp). + Str("clusterName", metric.ClusterName). + Str("cloudAccountID", metric.CloudAccountID). + Msg("DCGM metric received") + } + } + + // Transform vendor-specific metrics (e.g., DCGM GPU metrics) before filtering + metrics, err = d.transformer.Transform(ctx, metrics) + if err != nil { + return stats, fmt.Errorf("failed to transform metrics: %w", err) + } + costMetrics, observabilityMetrics, droppedMetrics := d.filter.Filter(metrics) metricsReceived.WithLabelValues().Add(float64(len(metrics))) diff --git a/app/domain/transform/catalog/catalog.go b/app/domain/transform/catalog/catalog.go new file mode 100644 index 00000000..cef68c38 --- /dev/null +++ b/app/domain/transform/catalog/catalog.go @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Package catalog provides a catalog-based metric transformer that routes +// metrics to registered specialized transformers. +// +// The catalog transformer orchestrates multiple specialized transformers to +// provide automatic routing based on metric characteristics. +package catalog + +import ( + "context" + + "github.com/cloudzero/cloudzero-agent/app/types" +) + +// Transformer implements types.MetricTransformer using a catalog of specialized +// transformers. +// +// Each transformer in the catalog processes all metrics sequentially. +// Transformers identify which metrics they can handle and transform those while +// passing through others unchanged. +type Transformer struct { + transformers []types.MetricTransformer +} + +// NewTransformer creates a new catalog transformer with the provided +// specialized transformers. +// +// Transformers are applied sequentially - each transformer receives all metrics +// and decides which ones to transform based on implementation-specific logic +// (e.g., metric name patterns). +func NewTransformer(transformers ...types.MetricTransformer) *Transformer { + return &Transformer{ + transformers: transformers, + } +} + +// Transform processes metrics by routing them sequentially through specialized +// transformers. +// +// Processing flow: +// 1. Pass metrics through first transformer +// 2. Pass results through second transformer +// 3. Continue until all transformers have processed the metrics +// +// This implements the types.MetricTransformer interface. 
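+//
+// For illustration, a catalog composed of two transformers applies them in
+// order (hypothetical composition; only the DCGM transformer is registered
+// in this change):
+//
+//	t := catalog.NewTransformer(dcgm.NewTransformer(), rocm.NewTransformer())
+//	out, err := t.Transform(ctx, metrics) // dcgm runs first, then rocm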
+func (t *Transformer) Transform(ctx context.Context, metrics []types.Metric) ([]types.Metric, error) { + if len(t.transformers) == 0 { + return metrics, nil + } + + // Process through each transformer in sequence + result := metrics + var err error + + for _, transformer := range t.transformers { + result, err = transformer.Transform(ctx, result) + if err != nil { + return nil, err + } + } + + return result, nil +} diff --git a/app/domain/transform/dcgm/CLAUDE.md b/app/domain/transform/dcgm/CLAUDE.md new file mode 100644 index 00000000..92abac80 --- /dev/null +++ b/app/domain/transform/dcgm/CLAUDE.md @@ -0,0 +1,581 @@ +# DCGM Transformer - AI Development Guide + +## Quick Reference + +**Purpose**: Transform NVIDIA DCGM exporter metrics into standardized GPU metrics for cost allocation + +**Location**: `app/domain/transform/dcgm/` + +**Key Files**: + +- [transformer.go](transformer.go) - Core transformation logic +- [transformer_test.go](transformer_test.go) - Comprehensive unit tests +- [README.md](README.md) - User-facing documentation + +**Testing**: `GO_TEST_TARGET=./app/domain/transform/dcgm make test` + +## Architecture Context + +### Hexagonal Architecture Position + +````text +app/ +├── types/ # Interfaces (ports) +│ └── metric_transformer.go # MetricTransformer interface +│ +├── domain/ # Business logic (core) +│ ├── metric_collector.go # Invokes transformation pipeline +│ └── transform/ +│ ├── catalog/ # Routes to specialized transformers +│ └── dcgm/ # ← THIS PACKAGE +│ └── transformer.go # Implements MetricTransformer +```text + +### Integration Points + +**Upstream** (calls this package): + +- [app/domain/transform/catalog/catalog.go](../catalog/catalog.go:55) - Routes DCGM metrics here +- [app/domain/metric_collector.go](../../metric_collector.go:216) - Invokes catalog transformer + +**Downstream** (this package calls): + +- [app/types/metric.go](../../../types/metric.go) - Metric data structure +- `github.com/rs/zerolog/log` - Structured logging + +**Configuration**: + +- [helm/templates/\_defaults.tpl](../../../../helm/templates/_defaults.tpl:69-75) - Prometheus scraping +- [helm/templates/\_defaults.tpl](../../../../helm/templates/_defaults.tpl:234-235) - Cost metric filters + +## Implementation Guide + +### Transformation Logic Flow + +```go +// Phase 1: Per-Metric Processing +Transform(ctx, metrics) { + for each metric { + if DCGM_FI_DEV_GPU_UTIL: + → immediate transform → container_resources_gpu_usage_percent + + if DCGM_FI_DEV_FB_USED: + → buffer in memoryBuffer[namespace/pod/container/gpu].used + + if DCGM_FI_DEV_FB_FREE: + → buffer in memoryBuffer[namespace/pod/container/gpu].free + + else: + → pass through unchanged + } +} + +// Phase 2: Batch Completion +flushMemory(ctx) { + for each buffered pair { + if has_both(used, free): + percentage = (used / (used + free)) * 100 + → create container_resources_gpu_memory_usage_percent + else: + → drop incomplete pair (log warning) + } + clear buffer +} +```text + +### Key Data Structures + +**Transformer**: + +```go +type Transformer struct { + memoryBuffer map[string]*memoryPair // Key: "namespace/pod/container/gpu" +} +```text + +**Memory Pair**: + +```go +type memoryPair struct { + used *types.Metric // DCGM_FI_DEV_FB_USED + free *types.Metric // DCGM_FI_DEV_FB_FREE +} +```text + +**Buffer Key Format**: + +```text +"{namespace}/{pod}/{container}/{gpu}" +Example: "default/gpu-pod-123/cuda-app/0" +```text + +### Critical Implementation Details + +**1. 
Why Buffering?** + +Memory metrics arrive as separate USED and FREE metrics. We need both to calculate percentage: + +```go +percentage = (used / (used + free)) * 100 +```text + +Buffering ensures we have complete pairs before calculation. + +**2. Why Flush at End?** + +The buffer accumulates metrics throughout the batch. Flushing at the end ensures: + +- All metrics in the batch are considered +- Pairs are only calculated when complete +- Buffer doesn't grow unbounded across batches + +**3. Required Labels Check** + +Container attribution requires these labels: + +```go +var requiredLabels = []string{"namespace", "pod", "container"} +```text + +Metrics missing any are dropped - we can't attribute cost without knowing which container used the GPU. + +**4. Label Transformations** + +The transformer standardizes DCGM labels for consistency: + +**UUID → gpu_uuid Renaming**: + +```go +// copyLabels renames UUID to gpu_uuid for standardization +if k == "UUID" { + result["gpu_uuid"] = v // Standardized name +} +```text + +**Node Name Aliasing**: + +```go +nodeName := metric.NodeName +if nodeName == "" { + nodeName = metric.Labels["Hostname"] // Fallback to DCGM label +} +```text + +These transformations ensure compatibility with standardized GPU metric conventions. + +## Development Workflow + +### Adding New DCGM Metrics + +**Step 1**: Add constant for DCGM metric name: + +```go +const ( + dcgmGPUTemperature = "DCGM_FI_DEV_GPU_TEMP" // Example new metric +) +```text + +**Step 2**: Add constant for standardized name: + +```go +const ( + standardGPUTemperature = "container_resources_gpu_temperature_celsius" +) +```text + +**Step 3**: Add case to `transformSingle()`: + +```go +case dcgmGPUTemperature: + return transformGPUTemperature(metric), nil +```text + +**Step 4**: Implement transformation function: + +```go +func transformGPUTemperature(metric types.Metric) []types.Metric { + transformed := metric + transformed.MetricName = standardGPUTemperature + transformed.ID = uuid.New() + return []types.Metric{transformed} +} +```text + +**Step 5**: Add test cases to [transformer_test.go](transformer_test.go). + +**Step 6**: Update Helm chart to include metric in filters: + +```yaml +# helm/templates/_defaults.tpl +containerMetrics: + - container_resources_gpu_temperature_celsius +```text + +### Adding Tests + +Follow the table-driven test pattern: + +```go +func TestTransform_NewMetric(t *testing.T) { + tests := []struct { + name string + input []types.Metric + want []types.Metric + wantErr bool + }{ + { + name: "transforms new metric", + input: []types.Metric{{ + MetricName: "DCGM_FI_DEV_NEW_METRIC", + // ... test data + }}, + want: []types.Metric{{ + MetricName: "container_resources_new_metric", + // ... expected output + }}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + transformer := NewTransformer() + got, err := transformer.Transform(context.Background(), tt.input) + + if tt.wantErr { + assert.Error(t, err) + return + } + + assert.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} +```text + +### Debugging Tips + +**1. Enable Debug Logging** + +Set aggregator log level to debug in cluster overrides: + +```yaml +# clusters/brahms-overrides.yaml +aggregator: + logging: + level: debug +```text + +**2. 
Check Metric Flow** + +```bash +# Check if DCGM metrics are being received +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "DCGM_FI_DEV" + +# Check if transformed metrics are produced +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "container_resources_gpu" + +# Check for dropped metrics +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "dropping" +```text + +**3. Add Temporary Logging** + +If debugging specific issues, add temporary logs (remember to remove): + +```go +log.Ctx(ctx).Debug(). + Str("metric", metric.MetricName). + Interface("labels", metric.Labels). + Msg("DEBUG: processing metric") +```text + +**Mark with `// TODO: TEMPORARY` and remove after debugging.** + +## Common Development Tasks + +### Task: Add New GPU Vendor Support + +**Example**: Adding AMD ROCm support + +**Step 1**: Create new package `app/domain/transform/rocm/` + +**Step 2**: Implement `types.MetricTransformer`: + +```go +package rocm + +type Transformer struct { + // ROCm-specific state +} + +func (t *Transformer) Transform(ctx context.Context, metrics []types.Metric) ([]types.Metric, error) { + // Transform ROCm metrics to standard format +} +```text + +**Step 3**: Register in catalog transformer: + +```go +// app/domain/transform/catalog/catalog.go +func NewMetricTransformer() types.MetricTransformer { + return &Transformer{ + transformers: []types.MetricTransformer{ + dcgm.NewTransformer(), + rocm.NewTransformer(), // Add AMD support + }, + } +} +```text + +**Step 4**: Add Prometheus scrape job in Helm chart for ROCm exporter. + +### Task: Change Transformation Logic + +**Example**: Use total memory instead of percentage + +**Current**: + +```go +percentage = (used / (used + free)) * 100 +```text + +**New**: + +```go +totalBytes = used // Just report used bytes directly +```text + +**Changes Required**: + +1. Update `calculateMemoryPercentage()` function +2. Update metric name to reflect bytes vs percentage +3. Update unit tests for new calculation +4. Update Helm chart metric name in filters +5. Update this documentation + +### Task: Fix Memory Leak in Buffer + +**Symptom**: Memory buffer grows unbounded + +**Diagnosis**: + +```go +// Check if buffer is being cleared +log.Ctx(ctx).Debug(). + Int("buffer_size_before_flush", len(t.memoryBuffer)). + Int("buffer_size_after_flush", len(t.memoryBuffer)). + Msg("buffer flush") +```text + +**Fix**: Ensure `flushMemory()` clears buffer at end: + +```go +// Clear buffer after flush +t.memoryBuffer = make(map[string]*memoryPair) +```text + +## Testing Checklist + +Before submitting changes: + +- [ ] Unit tests pass: `GO_TEST_TARGET=./app/domain/transform/dcgm make test` +- [ ] Test coverage >90%: `go test -cover ./app/domain/transform/dcgm` +- [ ] Integration test with DCGM exporter in test cluster +- [ ] Helm chart updated if metric names changed +- [ ] Documentation updated (README.md, CLAUDE.md) +- [ ] No temporary debug logging remains +- [ ] Code follows existing patterns (see similar functions) + +## Performance Guidelines + +### Memory Buffer Size + +**Expected**: 10-100 entries per batch +**Maximum**: ~1000 entries (acceptable) +**Alert**: >10,000 entries (indicates buffer not being cleared) + +**Monitoring**: + +```go +if len(t.memoryBuffer) > 1000 { + log.Ctx(ctx).Warn(). + Int("buffer_size", len(t.memoryBuffer)). 
+ Msg("DCGM memory buffer unusually large") +} +```text + +### Transformation Latency + +**Expected**: <1ms per batch +**Maximum**: <10ms per batch +**Alert**: >100ms per batch + +The transformation should add negligible overhead compared to network and database I/O. + +## Related Packages + +**Must Read First**: + +- [app/domain/transform/README.md](../README.md) - Transformation architecture overview +- [app/domain/transform/catalog/CLAUDE.md](../catalog/CLAUDE.md) - Routing logic + +**Related Transformers** (future): + +- `app/domain/transform/rocm/` - AMD GPU support +- `app/domain/transform/xpu/` - Intel GPU support + +**Consumers**: + +- [app/domain/metric_collector.go](../../metric_collector.go) - Invokes transformation +- [app/domain/metric_filter.go](../../metric_filter.go) - Filters transformed metrics + +## Important Constraints + +### DO NOT Change + +These are part of the public contract and changing them breaks cost allocation: + +1. **Standardized metric names**: + + - `container_resources_gpu_usage_percent` + - `container_resources_gpu_memory_usage_percent` + +2. **Required labels**: `namespace`, `pod`, `container` (needed for cost attribution) + +3. **Percentage range**: 0-100 (CloudZero expects this range) + +### Safe to Change + +These are internal implementation details: + +1. Buffer key format (as long as it's unique per GPU) +2. Logging messages +3. Internal function names +4. Performance optimizations +5. Error messages + +### Requires Coordination + +These require updates in multiple places: + +1. **Adding new metrics**: Update Helm chart filters +2. **Changing label requirements**: Update DCGM Exporter config +3. **Changing calculation logic**: Update documentation and tests + +## Edge Cases to Consider + +### Multi-Instance GPU (MIG) + +NVIDIA MIG partitions a single GPU into multiple instances. Each instance appears as a separate GPU: + +```text +gpu="0" → MIG instance 0 of physical GPU 0 +gpu="1" → MIG instance 1 of physical GPU 0 +```text + +The transformer handles this correctly - each MIG instance is treated as a separate GPU in the buffer key. + +### GPU Time-Slicing + +When Kubernetes time-slices GPUs across containers, DCGM reports per-container metrics: + +```text +namespace="default", pod="pod-a", container="app-1", gpu="0" +namespace="default", pod="pod-b", container="app-2", gpu="0" +```text + +The transformer handles this correctly - each container gets its own buffer key. + +### Missing Metrics + +If DCGM Exporter stops reporting metrics (crash, network issue), incomplete pairs accumulate in buffer: + +**Current behavior**: Dropped with warning on next flush +**Alternative**: Could implement TTL to drop stale pairs + +## Maintenance Notes + +### When to Update This Package + +1. **NVIDIA releases new DCGM metrics**: Add to transformation logic +2. **CloudZero adds GPU cost models**: May need new derived metrics +3. **Performance issues**: Optimize buffer management +4. **New GPU vendors**: Create sibling packages (don't modify this one) + +### When NOT to Update This Package + +1. **Changing metric filters**: Update Helm chart instead +2. **Changing Prometheus scraping**: Update Prometheus config +3. **DCGM Exporter deployment**: Update DCGM Exporter Helm chart +4. 
**Cost calculation changes**: Update CloudZero backend + +### Code Review Checklist + +When reviewing changes to this package: + +- [ ] Unit tests cover new/changed code paths +- [ ] Existing tests still pass +- [ ] No breaking changes to metric names or output format +- [ ] Buffer management remains bounded (cleared after flush) +- [ ] Required labels are validated +- [ ] Error cases have appropriate logging +- [ ] Performance impact is acceptable (<10ms per batch) +- [ ] Documentation updated to reflect changes + +## Quick Command Reference + +```bash +# Run tests +GO_TEST_TARGET=./app/domain/transform/dcgm make test + +# Run tests with coverage +go test -cover ./app/domain/transform/dcgm + +# Run specific test +GO_TEST_FLAGS="-run TestTransform_GPUUtilization" \ + GO_TEST_TARGET=./app/domain/transform/dcgm make test + +# Check for race conditions +GO_TEST_FLAGS="-race" GO_TEST_TARGET=./app/domain/transform/dcgm make test + +# Format code +make format + +# Lint code +make lint + +# Build entire project (includes this package) +make build + +# Deploy to test cluster +CLUSTER_NAME=brahms make helm-install helm-wait + +# Check logs for GPU metrics +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + --tail=100 | grep -E "(DCGM|container_resources_gpu)" +```text + +## Remember + +1. **Read [README.md](README.md) first** - It has the user-facing documentation +2. **Check [app/domain/transform/README.md](../README.md)** - Understand transformation architecture +3. **Follow hexagonal architecture** - Keep business logic in domain layer +4. **Use table-driven tests** - See existing tests for patterns +5. **Document why, not what** - Code shows what, comments explain why +6. **Test with real DCGM metrics** - Unit tests alone aren't sufficient +7. **Update Helm chart when adding metrics** - Filter configuration matters + +## Support + +If you're stuck: + +1. Read the README.md in this directory +2. Check similar transformers (catalog transformer as example) +3. Look at existing test cases for patterns +4. Review the MetricTransformer interface definition +5. Check DCGM Exporter documentation for metric definitions +```` diff --git a/app/domain/transform/dcgm/README.md b/app/domain/transform/dcgm/README.md new file mode 100644 index 00000000..0f6ff143 --- /dev/null +++ b/app/domain/transform/dcgm/README.md @@ -0,0 +1,395 @@ +# DCGM Transformer + +## Overview + +The DCGM transformer converts NVIDIA DCGM (Data Center GPU Manager) exporter metrics into standardized container-level GPU resource metrics for cost allocation analysis. This transformer is part of the metric transformation pipeline and handles GPU-specific metric normalization. + +## Purpose + +NVIDIA DCGM Exporter provides raw GPU telemetry in NVIDIA's native format. For accurate cost allocation, CloudZero needs: + +1. **Standardized metric names** - Consistent naming across GPU vendors (NVIDIA, AMD, Intel) +2. **Percentage-based metrics** - Normalized values for comparison and analysis +3. **Container attribution** - Per-container GPU usage for accurate cost allocation + +This transformer bridges the gap between NVIDIA's native DCGM format and CloudZero's standardized GPU metrics. + +## Architecture + +````text +MetricCollector + └── catalog.Transformer (metric routing) + └── dcgm.Transformer (NVIDIA GPU metrics) +```text + +The DCGM transformer is invoked by the catalog transformer, which routes metrics to specialized transformers based on metric characteristics. 
Future GPU vendors (AMD ROCm, Intel XPU) would be implemented as sibling transformers. + +## Metric Transformations + +### Input Metrics (DCGM Format) + +The transformer processes three DCGM metrics from NVIDIA DCGM Exporter: + +| DCGM Metric | Description | Unit | Labels | +| ---------------------- | --------------------------- | ------------------ | -------------------------------------------------------------------------------------- | +| `DCGM_FI_DEV_GPU_UTIL` | GPU compute utilization | Percentage (0-100) | namespace, pod, container, gpu, Hostname, UUID, modelName, pci_bus_id, device, pod_uid | +| `DCGM_FI_DEV_FB_USED` | GPU framebuffer memory used | MiB | namespace, pod, container, gpu, Hostname, UUID, modelName, pci_bus_id, device, pod_uid | +| `DCGM_FI_DEV_FB_FREE` | GPU framebuffer memory free | MiB | namespace, pod, container, gpu, Hostname, UUID, modelName, pci_bus_id, device, pod_uid | + +### Output Metrics (Standardized Format) + +The transformer produces two standardized container-level GPU metrics: + +| Standardized Metric | Description | Unit | Calculation | +| ---------------------------------------------- | ----------------------- | ------------------ | ---------------------------------------- | +| `container_resources_gpu_usage_percent` | GPU compute utilization | Percentage (0-100) | Pass-through from `DCGM_FI_DEV_GPU_UTIL` | +| `container_resources_gpu_memory_usage_percent` | GPU memory utilization | Percentage (0-100) | `(USED / (USED + FREE)) * 100` | + +### Transformation Rules + +1. **GPU Utilization**: Direct rename from `DCGM_FI_DEV_GPU_UTIL` to `container_resources_gpu_usage_percent` + + - Already in percentage format (0-100) + - No calculation required + - Immediate transformation + +2. **GPU Memory Utilization**: Calculated from `DCGM_FI_DEV_FB_USED` and `DCGM_FI_DEV_FB_FREE` + - Formula: `(used / (used + free)) * 100` + - Requires buffering to ensure paired metrics + - Calculated during flush phase + +## Processing Strategy + +The transformer uses a **buffered processing model** for memory metrics to handle the asynchronous arrival of USED and FREE metrics: + +### Phase 1: Transform (Per Metric) + +```text +For each metric in batch: + ├─ If DCGM_FI_DEV_GPU_UTIL + │ └─ Transform immediately → container_resources_gpu_usage_percent + │ + ├─ If DCGM_FI_DEV_FB_USED + │ └─ Buffer in memoryBuffer (key: namespace/pod/container/gpu) + │ + ├─ If DCGM_FI_DEV_FB_FREE + │ └─ Buffer in memoryBuffer (key: namespace/pod/container/gpu) + │ + └─ If not DCGM metric + └─ Pass through unchanged +```text + +### Phase 2: Flush (End of Batch) + +```text +For each buffered memory pair: + ├─ If both USED and FREE present + │ └─ Calculate percentage → container_resources_gpu_memory_usage_percent + │ + └─ If incomplete pair (missing USED or FREE) + └─ Drop (logged as incomplete pair) +```text + +This two-phase approach ensures: + +- Memory metrics are always calculated from complete USED+FREE pairs +- GPU utilization metrics are transformed immediately (no buffering overhead) +- Non-DCGM metrics pass through without interference + +## Required Labels + +For accurate container attribution, DCGM metrics must include these labels: + +- `namespace` - Kubernetes namespace +- `pod` - Pod name +- `container` - Container name + +Metrics missing any required label are dropped with a warning log. + +## Label Handling + +### DCGM Labels Preserved and Transformed + +The transformer preserves all DCGM-specific labels for operational correlation: + +- `gpu` - GPU index (0, 1, 2, etc.) 
- **preserved as-is** +- `Hostname` - Node hostname where GPU is located (e.g., "ip-10-30-23-129.ec2.internal") - **preserved as-is** +- `UUID` → `gpu_uuid` - NVIDIA GPU UUID (e.g., "GPU-4980eea4-963e-7b82-ecb9-36ee26fdceb8") - **renamed for standardization** +- `modelName` - GPU model name (e.g., "Tesla T4", "NVIDIA A100-SXM4-40GB") - **preserved as-is** +- `pci_bus_id` - PCIe bus identifier (e.g., "00000000:00:1E.0") - **preserved as-is** +- `device` - NVIDIA device name (e.g., "nvidia0") - **preserved as-is** +- `pod_uid` - Kubernetes pod UID (may be empty) - **preserved as-is** + +**Label Renaming**: The `UUID` label from DCGM is renamed to `gpu_uuid` in the output metrics for consistency with standardized GPU metric conventions. + +### Label Aliasing + +For node attribution, the transformer provides fallback logic: + +- Primary: Use `node` label if present +- Fallback: Use `Hostname` label from DCGM + +This ensures node attribution works regardless of label source. + +## Configuration + +### Helm Chart Integration + +The DCGM transformer is automatically enabled when GPU metrics are configured: + +```yaml +# clusters/brahms-overrides.yaml +prometheusConfig: + scrapeJobs: + gpu: + enabled: true + scrapeInterval: 30s +```text + +### Metric Filtering + +Transformed metrics are classified as **cost metrics** in the Helm chart: + +```yaml +# helm/templates/_defaults.tpl +metricFilters: + cost: + name: + exact: + - container_resources_gpu_usage_percent + - container_resources_gpu_memory_usage_percent +```text + +Note: Raw DCGM metrics are **not** included in cost filters - only the transformed percentage-based metrics. + +## Data Flow + +```text +DCGM Exporter (GPU node) + │ + │ DCGM_FI_DEV_GPU_UTIL + │ DCGM_FI_DEV_FB_USED + │ DCGM_FI_DEV_FB_FREE + │ + ▼ +Prometheus (scrapes every 30s) + │ + │ Prometheus Remote Write + │ + ▼ +CloudZero Agent Aggregator + │ + ▼ +MetricCollector.PutMetrics() + │ + ├─ Decode Prometheus format + │ + ├─ Transform (catalog → dcgm) + │ │ + │ ├─ GPU_UTIL → container_resources_gpu_usage_percent + │ │ + │ └─ FB_USED + FB_FREE → container_resources_gpu_memory_usage_percent + │ + ├─ Filter (cost vs observability) + │ │ + │ └─ Transformed metrics → COST + │ + └─ Store in database → Ship to CloudZero +```text + +## Error Handling + +### Incomplete Memory Pairs + +If a memory USED metric arrives without a corresponding FREE metric (or vice versa), the incomplete pair is dropped: + +```go +log.Ctx(ctx).Warn(). + Str("key", key). + Bool("has_used", pair.used != nil). + Bool("has_free", pair.free != nil). + Msg("dropping incomplete DCGM memory metric pair") +```text + +This can occur when: + +- Prometheus scrape fails for one metric but not the other +- DCGM Exporter temporarily stops reporting one metric +- Network issues cause partial metric delivery + +### Missing Required Labels + +Metrics without required container attribution labels are dropped: + +```go +log.Ctx(ctx).Warn(). + Str("metric", metric.MetricName). + Interface("labels", metric.Labels). + Msg("dropping DCGM metric missing required labels") +```text + +### Memory Buffer Management + +The memory buffer is cleared after each flush to prevent unbounded growth: + +```go +// Clear buffer after flush +t.memoryBuffer = make(map[string]*memoryPair) +```text + +This ensures the buffer doesn't accumulate stale metrics across batches. 
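To make the buffer lifecycle concrete, here is a minimal, self-contained sketch of the pair-then-flush pattern. It uses simplified types and a hypothetical buffer key; it is not the transformer's actual code:

```go
package main

import "fmt"

// memoryPair mirrors the transformer's buffered USED/FREE pair, simplified to
// raw float64 values instead of *types.Metric.
type memoryPair struct {
	used, free *float64
}

func main() {
	// Phase 1: buffer samples keyed by namespace/pod/container/gpu.
	used, free := 4096.0, 12288.0
	buffer := map[string]*memoryPair{
		"default/gpu-pod-1/app/0": {used: &used, free: &free},
	}

	// Phase 2: flush. Only complete pairs with a non-zero total produce a
	// percentage; anything else is skipped (the real transformer logs and
	// drops incomplete pairs).
	for key, pair := range buffer {
		if pair.used == nil || pair.free == nil {
			continue
		}
		total := *pair.used + *pair.free
		if total == 0 {
			continue
		}
		fmt.Printf("%s => %.1f%%\n", key, (*pair.used/total)*100.0) // 25.0%
	}

	// Clear the buffer after the flush, matching flushMemory.
	buffer = map[string]*memoryPair{}
	_ = buffer
}
```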
+ +## Testing + +### Unit Tests + +Comprehensive unit tests cover all transformation scenarios: + +```bash +# Run DCGM transformer tests +GO_TEST_TARGET=./app/domain/transform/dcgm make test + +# Run with verbose output +GO_TEST_FLAGS="-v" GO_TEST_TARGET=./app/domain/transform/dcgm make test +```text + +Test coverage includes: + +- GPU utilization pass-through transformation +- Memory percentage calculation from USED+FREE pairs +- Label preservation and aliasing +- Error cases (missing labels, incomplete pairs) +- Buffer management and flush behavior + +### Integration Testing + +End-to-end testing with actual DCGM metrics: + +1. Deploy DCGM Exporter to a GPU-enabled cluster +2. Configure CloudZero Agent with GPU scraping enabled +3. Verify transformed metrics appear in CloudZero platform +4. Validate cost allocation accuracy for GPU workloads + +## Performance Considerations + +### Memory Buffer Size + +The memory buffer is bounded by the number of unique GPU containers in a single batch: + +- **Typical size**: 10-100 entries (10 pods × 1-10 GPUs each) +- **Maximum size**: ~1000 entries (pathological case) +- **Memory overhead**: ~1 KB per entry (metric metadata + pointers) + +The buffer is cleared after each batch, preventing unbounded growth. + +### Transformation Overhead + +- **GPU utilization**: O(1) - simple field rename +- **Memory percentage**: O(1) - buffered calculation during flush +- **Overall complexity**: O(n) where n = number of metrics in batch + +Transformation adds negligible latency (<1ms per batch) compared to network and database I/O. + +## Future Extensions + +### Multi-Vendor Support + +The transformer architecture supports future GPU vendor extensions: + +```text +app/domain/transform/ + ├── catalog/ # Metric routing + ├── dcgm/ # NVIDIA GPUs (current) + ├── rocm/ # AMD GPUs (future) + └── xpu/ # Intel GPUs (future) +```text + +Each vendor-specific transformer would: + +1. Convert vendor metrics to standardized format +2. Handle vendor-specific label schemas +3. Implement vendor-specific calculation logic + +### Additional GPU Metrics + +Future enhancements may include: + +- **GPU temperature** - For thermal-aware cost optimization +- **GPU power consumption** - For energy cost attribution +- **GPU error rates** - For reliability tracking +- **Multi-instance GPU (MIG)** - For GPU partitioning support + +## Troubleshooting + +### No GPU Metrics Appearing + +**Symptom**: No `container_resources_gpu_*` metrics in CloudZero + +**Diagnosis**: + +1. Check DCGM Exporter is running: + + ```bash + kubectl get pods -A | grep dcgm +```` + +2. Check Prometheus is scraping DCGM: + + ```bash + # Port-forward to Prometheus + kubectl -n cloudzero-agent port-forward svc/prometheus 9090:9090 + + # Check targets: http://localhost:9090/targets + # Look for "cloudzero-dcgm-exporter" job + ``` + +3. Check aggregator logs for DCGM metrics: + ```bash + kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "DCGM_FI_DEV" + ``` + +### Metrics Being Dropped + +**Symptom**: DCGM metrics received but not transformed + +**Diagnosis**: Check for missing required labels: + +````bash +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "dropping DCGM metric missing required labels" +```text + +**Resolution**: Ensure DCGM Exporter is configured to include Kubernetes labels (namespace, pod, container). 
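To spot-check the labels Prometheus actually scraped (assuming the port-forward from the previous section and `jq` installed), query a raw DCGM series and inspect its label set:

```bash
# Each result should carry namespace, pod, and container labels; if they are
# absent here, the transformer will drop the metric downstream.
curl -s 'http://localhost:9090/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL' \
  | jq '.data.result[].metric'
```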
+ +### Missing Memory Metrics + +**Symptom**: GPU utilization metrics present but memory metrics missing + +**Diagnosis**: Check for incomplete pairs: + +```bash +kubectl -n cloudzero-agent logs -l app.kubernetes.io/component=aggregator \ + | grep "incomplete DCGM memory metric pair" +```text + +**Resolution**: + +- Check DCGM Exporter health +- Verify Prometheus scrape success rate +- Check for network issues between Prometheus and DCGM Exporter + +## References + +### NVIDIA DCGM + +- [DCGM Documentation](https://docs.nvidia.com/datacenter/dcgm/) +- [DCGM Exporter GitHub](https://github.com/NVIDIA/dcgm-exporter) +- [DCGM Field Identifiers](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html) + +### Related Documentation + +- [app/domain/transform/README.md](../README.md) - Transformation architecture +- [app/domain/transform/catalog/README.md](../catalog/README.md) - Metric routing +- [helm/docs/troubleshooting-guide.md](../../../../helm/docs/troubleshooting-guide.md) - Operational troubleshooting +```` diff --git a/app/domain/transform/dcgm/transformer.go b/app/domain/transform/dcgm/transformer.go new file mode 100644 index 00000000..43fdf008 --- /dev/null +++ b/app/domain/transform/dcgm/transformer.go @@ -0,0 +1,340 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Package dcgm provides NVIDIA DCGM metric transformation for cost allocation. +// +// This package implements types.MetricTransformer following hexagonal architecture principles. +// It transforms NVIDIA DCGM exporter metrics into standardized GPU metrics suitable for +// cost allocation and monitoring. +// +// # Transformation Rules +// +// - DCGM_FI_DEV_GPU_UTIL → container_resources_gpu_usage_percent (pass-through percentage) +// - DCGM_FI_DEV_FB_USED + FB_FREE → container_resources_gpu_memory_usage_percent (calculated percentage) +// +// # Processing Strategy +// +// Memory metrics are buffered during Transform() and calculated during the final flush phase +// to ensure paired USED/FREE metrics are processed together for accurate percentage calculation. +// +// # Architecture +// +// catalog.Transformer (routes to specialized transformers) +// └── dcgm.Transformer (handles NVIDIA DCGM metrics) +// +// Future GPU vendors (Intel XPU, AMD ROCm) would be implemented as peer packages. +package dcgm + +import ( + "context" + "fmt" + "strconv" + "strings" + + "github.com/google/uuid" + "github.com/rs/zerolog/log" + + "github.com/cloudzero/cloudzero-agent/app/types" +) + +// NVIDIA DCGM metric names that we transform. +const ( + dcgmGPUUtilization = "DCGM_FI_DEV_GPU_UTIL" // GPU utilization percentage + dcgmMemoryUsed = "DCGM_FI_DEV_FB_USED" // Framebuffer memory used (MiB) + dcgmMemoryFree = "DCGM_FI_DEV_FB_FREE" // Framebuffer memory free (MiB) +) + +// Standardized container-level GPU metric names. +const ( + standardGPUUsage = "container_resources_gpu_usage_percent" + standardGPUMemoryUsage = "container_resources_gpu_memory_usage_percent" +) + +// Required labels for GPU metric attribution. +var requiredLabels = []string{"namespace", "pod", "container"} + +// Transformer implements types.MetricTransformer for NVIDIA DCGM metrics. +// +// This transformer converts native DCGM exporter metrics into standardized +// container-level GPU resource metrics. 
It handles both immediate +// transformations (GPU utilization) and buffered transformations (memory +// percentage calculation requiring paired USED/FREE metrics). +type Transformer struct { + // memoryBuffer stores memory metrics awaiting paired calculation. + // Key format: "namespace/pod/container/gpu" + memoryBuffer map[string]*memoryPair +} + +// memoryPair tracks USED and FREE memory metrics for percentage calculation. +type memoryPair struct { + used *types.Metric + free *types.Metric +} + +// NewTransformer creates a new DCGM metric transformer. +func NewTransformer() *Transformer { + return &Transformer{ + memoryBuffer: make(map[string]*memoryPair), + } +} + +// Transform converts DCGM metrics to standardized format while passing through +// non-DCGM metrics unchanged. +// +// Processing flow: +// 1. For each metric, check if it's a DCGM metric +// 2. If DCGM GPU utilization, transform immediately +// 3. If DCGM memory (USED/FREE), buffer for later calculation +// 4. If not DCGM, pass through unchanged +// 5. Flush memory buffer to calculate percentages from paired metrics +// +// This implements the types.MetricTransformer interface. +func (t *Transformer) Transform(ctx context.Context, metrics []types.Metric) ([]types.Metric, error) { + if len(metrics) == 0 { + return metrics, nil + } + + // Estimate result capacity (metrics may expand during transformation). + result := make([]types.Metric, 0, len(metrics)) + + // Transform each metric + for _, metric := range metrics { + transformed, err := t.transformSingle(ctx, metric) + if err != nil { + return nil, err + } + result = append(result, transformed...) + } + + // Flush memory buffer to get calculated memory percentage metrics. + flushed, err := t.flushMemory(ctx) + if err != nil { + return nil, err + } + result = append(result, flushed...) + + return result, nil +} + +// transformSingle transforms a single metric. Returns the metric unchanged if +// it's not a DCGM metric. +func (t *Transformer) transformSingle(ctx context.Context, metric types.Metric) ([]types.Metric, error) { + // Check if this is a DCGM metric + if !strings.HasPrefix(metric.MetricName, "DCGM_FI_DEV_") { + // Not a DCGM metric - pass through unchanged + return []types.Metric{metric}, nil + } + + // Validate required labels for cost attribution + if !hasRequiredLabels(metric) { + log.Ctx(ctx).Debug(). + Str("metric", metric.MetricName). + Interface("labels", metric.Labels). + Msg("dropping DCGM metric missing required labels") + return []types.Metric{}, nil + } + + switch metric.MetricName { + case dcgmGPUUtilization: + // GPU utilization is already a percentage - just rename and return + return transformGPUUtilization(metric), nil + + case dcgmMemoryUsed: + // Buffer for later percentage calculation + t.bufferMemoryMetric(metric, true) + return []types.Metric{}, nil + + case dcgmMemoryFree: + // Buffer for later percentage calculation + t.bufferMemoryMetric(metric, false) + return []types.Metric{}, nil + + default: + // Unknown DCGM metric - pass through unchanged + return []types.Metric{metric}, nil + } +} + +// flushMemory calculates and returns GPU memory percentage metrics from buffered USED/FREE pairs. +// After flush, the memory buffer is cleared. +// +// Memory percentage is calculated as: (used / (used + free)) * 100 +// +// Incomplete pairs (missing either USED or FREE) are dropped with debug logging. 
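+//
+// For example, used=4096 MiB and free=12288 MiB yields
+// (4096 / (4096 + 12288)) * 100 = 25.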
+func (t *Transformer) flushMemory(ctx context.Context) ([]types.Metric, error) { + if len(t.memoryBuffer) == 0 { + return []types.Metric{}, nil + } + + result := make([]types.Metric, 0, len(t.memoryBuffer)) + + for key, pair := range t.memoryBuffer { + if pair.used == nil || pair.free == nil { + log.Ctx(ctx).Debug(). + Str("key", key). + Bool("hasUsed", pair.used != nil). + Bool("hasFree", pair.free != nil). + Msg("dropping incomplete memory metric pair") + continue + } + + // Parse string values to floats for calculation + used, err := parseFloat(pair.used.Value) + if err != nil { + log.Ctx(ctx).Debug(). + Str("key", key). + Str("usedValue", pair.used.Value). + Err(err). + Msg("dropping memory metric with invalid used value") + continue + } + + free, err := parseFloat(pair.free.Value) + if err != nil { + log.Ctx(ctx).Debug(). + Str("key", key). + Str("freeValue", pair.free.Value). + Err(err). + Msg("dropping memory metric with invalid free value") + continue + } + + total := used + free + if total == 0 { + log.Ctx(ctx).Debug(). + Str("key", key). + Msg("dropping memory metric with zero total") + continue + } + + percentage := (used / total) * 100.0 + + // Extract node name from field or labels. DCGM uses "Hostname" label, + // other exporters may use "node". + nodeName := pair.used.NodeName + if nodeName == "" { + nodeName = pair.used.Labels["node"] + } + if nodeName == "" { + nodeName = pair.used.Labels["Hostname"] + } + + // Create standardized memory usage metric. Use timestamp and metadata + // from the USED metric. + memoryMetric := types.Metric{ + ID: uuid.New(), + ClusterName: pair.used.ClusterName, + CloudAccountID: pair.used.CloudAccountID, + MetricName: standardGPUMemoryUsage, + NodeName: nodeName, + Value: formatFloat(percentage), + TimeStamp: pair.used.TimeStamp, + CreatedAt: pair.used.CreatedAt, + Labels: transformLabels(pair.used.Labels), + } + + result = append(result, memoryMetric) + } + + // Clear buffer after flush + t.memoryBuffer = make(map[string]*memoryPair) + + return result, nil +} + +// transformGPUUtilization converts DCGM GPU utilization to standardized format. +func transformGPUUtilization(metric types.Metric) []types.Metric { + // Extract node name from field or labels. DCGM uses "Hostname" label, other + // exporters may use "node". + nodeName := metric.NodeName + if nodeName == "" { + nodeName = metric.Labels["node"] + } + if nodeName == "" { + nodeName = metric.Labels["Hostname"] + } + + return []types.Metric{ + { + ID: uuid.New(), + ClusterName: metric.ClusterName, + CloudAccountID: metric.CloudAccountID, + MetricName: standardGPUUsage, + NodeName: nodeName, + Value: metric.Value, + TimeStamp: metric.TimeStamp, + CreatedAt: metric.CreatedAt, + Labels: transformLabels(metric.Labels), + }, + } +} + +// bufferMemoryMetric stores a memory metric for later percentage calculation. +func (t *Transformer) bufferMemoryMetric(metric types.Metric, isUsed bool) { + key := makeMemoryKey(metric) + + pair, exists := t.memoryBuffer[key] + if !exists { + pair = &memoryPair{} + t.memoryBuffer[key] = pair + } + + if isUsed { + pair.used = &metric + } else { + pair.free = &metric + } +} + +// makeMemoryKey creates a unique key for buffering memory metrics. 
Format: +// "namespace/pod/container/gpu" +func makeMemoryKey(metric types.Metric) string { + return fmt.Sprintf("%s/%s/%s/%s", + metric.Labels["namespace"], + metric.Labels["pod"], + metric.Labels["container"], + metric.Labels["gpu"], + ) +} + +// hasRequiredLabels checks if metric has all required labels for cost +// attribution. +func hasRequiredLabels(metric types.Metric) bool { + for _, label := range requiredLabels { + if _, exists := metric.Labels[label]; !exists { + return false + } + } + return true +} + +// transformLabels creates a shallow copy of the labels map with standardization +// transformations. This performs vendor-specific label transformations to +// ensure compatibility with standardized GPU metrics. +func transformLabels(labels map[string]string) map[string]string { + if labels == nil { + return nil + } + + result := make(map[string]string, len(labels)) + for k, v := range labels { + // Rename DCGM's "UUID" label to standardized "gpu_uuid" + // Strip "GPU-" prefix: "GPU-4980eea4-..." becomes "4980eea4-..." + if k == "UUID" { + result["gpu_uuid"] = strings.TrimPrefix(v, "GPU-") + } else { + result[k] = v + } + } + return result +} + +// parseFloat converts a string value to float64. +func parseFloat(value string) (float64, error) { + return strconv.ParseFloat(value, 64) +} + +// formatFloat converts a float64 value to string. +func formatFloat(value float64) string { + return strconv.FormatFloat(value, 'f', -1, 64) +} diff --git a/app/domain/transform/dcgm/transformer_test.go b/app/domain/transform/dcgm/transformer_test.go new file mode 100644 index 00000000..a2d1e25b --- /dev/null +++ b/app/domain/transform/dcgm/transformer_test.go @@ -0,0 +1,757 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +package dcgm + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + "github.com/cloudzero/cloudzero-agent/app/types" +) + +// Test that non-DCGM metrics pass through unchanged. +func TestTransformer_PassThrough(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + timestamp := time.Now() + + tests := []struct { + name string + input []types.Metric + }{ + { + name: "CPU metrics pass through", + input: []types.Metric{ + { + MetricName: "container_cpu_usage_seconds_total", + Value: "1.5", + TimeStamp: timestamp, + }, + }, + }, + { + name: "memory metrics pass through", + input: []types.Metric{ + { + MetricName: "container_memory_working_set_bytes", + Value: "1073741824", + TimeStamp: timestamp, + }, + }, + }, + { + name: "network metrics pass through", + input: []types.Metric{ + { + MetricName: "container_network_receive_bytes_total", + Value: "12345", + TimeStamp: timestamp, + }, + }, + }, + { + name: "empty input returns empty output", + input: []types.Metric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := transformer.Transform(ctx, tt.input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + // Non-DCGM metrics should pass through unchanged + if diff := cmp.Diff(tt.input, got, cmpopts.IgnoreFields(types.Metric{}, "ID")); diff != "" { + t.Errorf("Transform() mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// Test GPU utilization transformation. 
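+// Covers the node-label fallback ("node" and "Hostname"), UUID renaming to
+// gpu_uuid with the GPU- prefix stripped, and dropping of metrics that lack
+// the required namespace/pod/container labels.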
+func TestTransformer_GPUUtilization(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + timestamp := time.Now() + + tests := []struct { + name string + input []types.Metric + expected []types.Metric + }{ + { + name: "transforms DCGM GPU utilization with node label", + input: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "85.5", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-1", + "container": "app", + "gpu": "0", + "node": "gpu-node-1", + }, + }, + }, + expected: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "container_resources_gpu_usage_percent", + NodeName: "gpu-node-1", + Value: "85.5", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-1", + "container": "app", + "gpu": "0", + "node": "gpu-node-1", + }, + }, + }, + }, + { + name: "transforms DCGM GPU utilization with Hostname label", + input: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "92.0", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-2", + "container": "cuda", + "gpu": "1", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + expected: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "container_resources_gpu_usage_percent", + NodeName: "ip-10-30-23-129.ec2.internal", + Value: "92.0", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-2", + "container": "cuda", + "gpu": "1", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + }, + { + name: "renames UUID label to gpu_uuid and strips GPU- prefix", + input: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "75.0", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-3", + "container": "ml-app", + "gpu": "0", + "UUID": "GPU-4980eea4-963e-7b82-ecb9-36ee26fdceb8", + "modelName": "Tesla T4", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + expected: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "container_resources_gpu_usage_percent", + NodeName: "ip-10-30-23-129.ec2.internal", + Value: "75.0", + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-3", + "container": "ml-app", + "gpu": "0", + "gpu_uuid": "4980eea4-963e-7b82-ecb9-36ee26fdceb8", + "modelName": "Tesla T4", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + }, + { + name: "drops GPU utilization missing required labels", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "85.0", + Labels: map[string]string{ + "gpu": "0", + }, + }, + }, + expected: []types.Metric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := transformer.Transform(ctx, tt.input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + opts := []cmp.Option{ + cmpopts.IgnoreFields(types.Metric{}, "ID"), + } + + if diff := cmp.Diff(tt.expected, got, opts...); diff != "" { + t.Errorf("Transform() 
mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// Test GPU memory percentage calculation from paired USED/FREE metrics. +func TestTransformer_GPUMemory(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + timestamp := time.Now() + + tests := []struct { + name string + input []types.Metric + expected []types.Metric + }{ + { + name: "calculates memory percentage from paired USED/FREE metrics", + input: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", // 4GB + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-1", + "container": "app", + "gpu": "0", + "node": "gpu-node-1", + }, + }, + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", // 12GB + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-1", + "container": "app", + "gpu": "0", + "node": "gpu-node-1", + }, + }, + }, + expected: []types.Metric{ + { + ClusterName: "test-cluster", + CloudAccountID: "123456789", + MetricName: "container_resources_gpu_memory_usage_percent", + NodeName: "gpu-node-1", + Value: "25", // 4GB / (4GB + 12GB) * 100 = 25% + TimeStamp: timestamp, + CreatedAt: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod-1", + "container": "app", + "gpu": "0", + "node": "gpu-node-1", + }, + }, + }, + }, + { + name: "handles multiple GPU memory metrics", + input: []types.Metric{ + // GPU 0 + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "8589934592", // 8GB + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "0", + "node": "node-1", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "8589934592", // 8GB + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "0", + "node": "node-1", + }, + }, + // GPU 1 + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", // 4GB + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "1", + "node": "node-1", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", // 12GB + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "1", + "node": "node-1", + }, + }, + }, + expected: []types.Metric{ + { + MetricName: "container_resources_gpu_memory_usage_percent", + NodeName: "node-1", + Value: "50", // GPU 0: 8GB / 16GB = 50% + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "0", + "node": "node-1", + }, + }, + { + MetricName: "container_resources_gpu_memory_usage_percent", + NodeName: "node-1", + Value: "25", // GPU 1: 4GB / 16GB = 25% + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "1", + "node": "node-1", + }, + }, + }, + }, + { + name: "drops incomplete memory pairs (only USED)", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "0", + }, + }, + }, + expected: []types.Metric{}, + }, + { + name: "drops incomplete 
memory pairs (only FREE)", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", + Labels: map[string]string{ + "namespace": "default", + "pod": "pod-1", + "container": "app", + "gpu": "0", + }, + }, + }, + expected: []types.Metric{}, + }, + { + name: "drops memory metrics missing required labels", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", + Labels: map[string]string{ + "gpu": "0", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", + Labels: map[string]string{ + "gpu": "0", + }, + }, + }, + expected: []types.Metric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := transformer.Transform(ctx, tt.input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + opts := []cmp.Option{ + cmpopts.IgnoreFields(types.Metric{}, "ID", "ClusterName", "CloudAccountID", "CreatedAt"), + cmpopts.SortSlices(func(a, b types.Metric) bool { + return a.Labels["gpu"] < b.Labels["gpu"] + }), + } + + if diff := cmp.Diff(tt.expected, got, opts...); diff != "" { + t.Errorf("Transform() mismatch (-want +got):\n%s", diff) + } + }) + } +} + +// Test mixed batches of DCGM and non-DCGM metrics. +func TestTransformer_MixedBatch(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + timestamp := time.Now() + + input := []types.Metric{ + // Non-DCGM metrics (should pass through) + { + MetricName: "container_cpu_usage_seconds_total", + Value: "1.5", + TimeStamp: timestamp, + }, + // DCGM GPU utilization (should transform) + { + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "85.0", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + // More non-DCGM metrics + { + MetricName: "container_memory_working_set_bytes", + Value: "1073741824", + TimeStamp: timestamp, + }, + // DCGM memory USED (should buffer) + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + // DCGM memory FREE (should buffer and calculate) + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + } + + got, err := transformer.Transform(ctx, input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + // Should get back: + // - 2 pass-through metrics (CPU, memory) + // - 1 transformed GPU utilization + // - 1 calculated GPU memory percentage + if len(got) != 4 { + t.Errorf("Transform() returned %d metrics, want 4", len(got)) + } + + // Check that we got the right metric types + metricNames := make(map[string]int) + for _, m := range got { + metricNames[m.MetricName]++ + } + + expectedNames := map[string]int{ + "container_cpu_usage_seconds_total": 1, + "container_memory_working_set_bytes": 1, + "container_resources_gpu_usage_percent": 1, + "container_resources_gpu_memory_usage_percent": 1, + } + + if diff := cmp.Diff(expectedNames, metricNames); diff != "" { + t.Errorf("Metric names mismatch (-want +got):\n%s", diff) + } +} + +// Test unknown DCGM metrics pass through. 
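+// DCGM_FI_DEV_-prefixed metrics without a transformation rule are forwarded
+// unchanged rather than dropped.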
+func TestTransformer_UnknownDCGMMetrics(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + + input := []types.Metric{ + { + MetricName: "DCGM_FI_DEV_UNKNOWN_METRIC", + Value: "123", + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + }, + }, + } + + got, err := transformer.Transform(ctx, input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + // Unknown DCGM metrics should pass through unchanged + if diff := cmp.Diff(input, got, cmpopts.IgnoreFields(types.Metric{}, "ID")); diff != "" { + t.Errorf("Transform() mismatch (-want +got):\n%s", diff) + } +} + +// Test edge cases in memory percentage calculation. +func TestTransformer_GPUMemoryEdgeCases(t *testing.T) { + transformer := NewTransformer() + ctx := context.Background() + timestamp := time.Now() + + tests := []struct { + name string + input []types.Metric + expected []types.Metric + }{ + { + name: "drops memory pair with invalid USED value", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "not-a-number", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + }, + expected: []types.Metric{}, + }, + { + name: "drops memory pair with invalid FREE value", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "invalid", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + }, + expected: []types.Metric{}, + }, + { + name: "drops memory pair with zero total", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "0", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "0", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "node": "gpu-node", + }, + }, + }, + expected: []types.Metric{}, + }, + { + name: "uses Hostname label when node label and NodeName field are missing", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_FB_USED", + Value: "4294967296", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + { + MetricName: "DCGM_FI_DEV_FB_FREE", + Value: "12884901888", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + "container": "app", + "gpu": "0", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + expected: []types.Metric{ + { + MetricName: "container_resources_gpu_memory_usage_percent", + NodeName: "ip-10-30-23-129.ec2.internal", + Value: "25", + TimeStamp: timestamp, + Labels: map[string]string{ + "namespace": "default", + "pod": "gpu-pod", + 
"container": "app", + "gpu": "0", + "Hostname": "ip-10-30-23-129.ec2.internal", + }, + }, + }, + }, + { + name: "handles nil labels map", + input: []types.Metric{ + { + MetricName: "DCGM_FI_DEV_GPU_UTIL", + Value: "85.0", + NodeName: "gpu-node", + TimeStamp: timestamp, + Labels: nil, + }, + }, + expected: []types.Metric{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := transformer.Transform(ctx, tt.input) + if err != nil { + t.Fatalf("Transform() error = %v", err) + } + + opts := []cmp.Option{ + cmpopts.IgnoreFields(types.Metric{}, "ID", "ClusterName", "CloudAccountID", "CreatedAt"), + } + + if diff := cmp.Diff(tt.expected, got, opts...); diff != "" { + t.Errorf("Transform() mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/app/domain/transform/transform.go b/app/domain/transform/transform.go new file mode 100644 index 00000000..7679d3fb --- /dev/null +++ b/app/domain/transform/transform.go @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// Package transform provides metric transformation capabilities for +// standardizing vendor-specific metrics into common formats for cost +// allocation. +package transform + +import ( + "github.com/cloudzero/cloudzero-agent/app/domain/transform/catalog" + "github.com/cloudzero/cloudzero-agent/app/domain/transform/dcgm" + "github.com/cloudzero/cloudzero-agent/app/types" +) + +// NewMetricTransformer creates a new MetricTransformer with all registered +// specialized transformers. +// +// This is the primary entry point for metric transformation, following the +// Scout pattern. Add new specialized transformers here as peer implementations +// (Intel XPU, AMD ROCm, network, etc.). +func NewMetricTransformer() types.MetricTransformer { + return catalog.NewTransformer( + dcgm.NewTransformer(), + ) +} diff --git a/app/functions/helmless/default-values.yaml b/app/functions/helmless/default-values.yaml index b92bccb0..5a01cb52 100644 --- a/app/functions/helmless/default-values.yaml +++ b/app/functions/helmless/default-values.yaml @@ -554,6 +554,11 @@ prometheusConfig: enabled: true # Scrape interval for aggregator job scrapeInterval: 120s + # -- Enables the GPU metrics scrape job (NVIDIA DCGM Exporter auto-discovery). + gpu: + enabled: false + # Scrape interval for GPU metrics job + scrapeInterval: 30s # -- Any items added to this list will be added to the Prometheus scrape configuration. additionalScrapeJobs: [] diff --git a/app/types/metric_transformer.go b/app/types/metric_transformer.go new file mode 100644 index 00000000..b69e909a --- /dev/null +++ b/app/types/metric_transformer.go @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +package types + +import "context" + +// MetricTransformer defines the port for metric transformation operations in the Application Core. +// This interface enables conversion of vendor-specific metrics into standardized formats +// for cost allocation and resource tracking, following the hexagonal architecture pattern where +// transformation logic is abstracted through a port interface. +// +// MetricTransformer implementations provide: +// - Vendor-specific metric detection and classification +// - Transformation of native metrics to standardized formats +// - Support for multiple device types (GPUs, network devices, storage, etc.) 
+// - Pass-through of unrecognized metrics without modification +// +// The transformer operates as part of the metric collection pipeline between decode and +// filter stages: +// +// Prometheus → Decode → Transform → Filter → Store +// +// Example usage: +// +// transformer := transform.NewMetricTransformer() +// transformed, err := transformer.Transform(ctx, metrics) +type MetricTransformer interface { + // Transform processes a slice of metrics, converting vendor-specific metrics + // into standardized formats while passing through all other metrics unchanged. + // + // The transformation is idempotent - metrics that are already in standardized + // format or are not recognized pass through without modification. + // + // Parameters: + // - ctx: Request context for cancellation and tracing + // - metrics: Input metrics from Prometheus remote_write decode + // + // Returns: + // - Transformed metrics with GPU metrics in standardized format + // - Error if transformation fails (context cancellation, invalid data, etc.) + // + // The returned slice may have a different length than the input if transformation + // results in metric expansion (e.g., one input metric generating multiple output + // metrics for different resource attribution). + Transform(ctx context.Context, metrics []Metric) ([]Metric, error) +} diff --git a/helm/templates/_cm_helpers.tpl b/helm/templates/_cm_helpers.tpl index f0c6c311..7a465680 100644 --- a/helm/templates/_cm_helpers.tpl +++ b/helm/templates/_cm_helpers.tpl @@ -163,6 +163,94 @@ remote_write: send: false {{- end -}} +{{/* +NVIDIA DCGM GPU Metrics Scrape Job Configuration Template + +Generates Prometheus scrape job configuration for collecting NVIDIA GPU metrics from +DCGM Exporter. This enables CloudZero cost allocation for NVIDIA GPU workloads in +Kubernetes clusters. + +DCGM metrics collected: +- DCGM_FI_DEV_GPU_UTIL: GPU compute utilization (0-100%) +- DCGM_FI_DEV_FB_USED: GPU memory used per GPU (bytes) +- DCGM_FI_DEV_FB_FREE: GPU memory free per GPU (bytes) + +Scraping features: +- Auto-discovery: Kubernetes service discovery with label selector for DCGM services +- Container attribution: Per-container GPU usage via Kubernetes Pod Resources API +- Label preservation: All DCGM labels (gpu, container, pod, namespace, Hostname, modelName, UUID) forwarded +- Metric filtering: Collects only the 3 DCGM metrics needed, drops unattributed metrics +- Provenance tracking: Adds "provenance=dcgm" label to identify metric source + +Note: This template is specific to NVIDIA DCGM Exporter. Future GPU vendors (AMD, Intel) +will have separate scrape job templates added alongside this one. + +This configuration enables accurate GPU cost allocation by tracking per-container +GPU usage across compute and memory dimensions, supporting multi-GPU containers +and GPU time-slicing scenarios. +*/}} +{{- define "cloudzero-agent.prometheus.scrapeGPU" -}} +# NVIDIA DCGM GPU Metrics Scrape Job +# cloudzero-dcgm-exporter +# +# Automatically discovers and scrapes NVIDIA GPU metrics from DCGM Exporter +# for GPU cost allocation and utilization tracking. +# +# This job is specific to NVIDIA DCGM Exporter. Future GPU vendors (AMD, Intel) +# will have separate scrape jobs added to this configuration. 
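+#
+# For reference, a minimal values snippet that enables this job (illustrative
+# only; `enabled` defaults to false and 30s is the chart's default interval):
+#
+#   prometheusConfig:
+#     scrapeJobs:
+#       gpu:
+#         enabled: true
+#         scrapeInterval: 30s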
+- job_name: cloudzero-dcgm-exporter + scrape_interval: {{ .Values.prometheusConfig.scrapeJobs.gpu.scrapeInterval }} + + # Discover DCGM Exporter services in all namespaces + # Use label selector to filter at the Kubernetes API level for performance + kubernetes_sd_configs: + - role: service + kubeconfig_file: "" + selectors: + - role: service + label: "app.kubernetes.io/name=dcgm-exporter" + + # Relabel configs for label enrichment + relabel_configs: + + # Add provenance label to indicate DCGM as the metric source + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: dcgm-exporter + replacement: dcgm + target_label: provenance + + # Add Kubernetes metadata for cost attribution + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_service + + # Note: __address__ is automatically set by service discovery to <host>:<port>. + # No need to override it - Prometheus will use the port from the Service definition + + # Metric relabel configs for filtering + metric_relabel_configs: + # Collect only the 3 raw DCGM metrics needed + - source_labels: [__name__] + regex: DCGM_FI_DEV_GPU_UTIL|DCGM_FI_DEV_FB_USED|DCGM_FI_DEV_FB_FREE + action: keep + + # Drop metrics without container attribution + # (These are node-level GPU metrics not assigned to containers) + - source_labels: [container] + regex: ^$ + action: drop + + - source_labels: [pod] + regex: ^$ + action: drop + + - source_labels: [namespace] + regex: ^$ + action: drop +{{- end -}} + {{/* Prometheus Self-Monitoring Scrape Job Configuration Template diff --git a/helm/templates/_defaults.tpl b/helm/templates/_defaults.tpl index e8b27195..e4746532 100644 --- a/helm/templates/_defaults.tpl +++ b/helm/templates/_defaults.tpl @@ -64,11 +64,15 @@ kubeMetrics: # Container Runtime Metrics - Essential for Resource Usage Tracking # These metrics capture actual resource consumption by containers, enabling CloudZero # to correlate resource requests/limits with actual usage for cost optimization insights. +# GPU metrics are collected in native DCGM format and transformed by the collector +# to percentage-based metrics for consistent reporting. containerMetrics: - container_cpu_usage_seconds_total - container_memory_working_set_bytes - container_network_receive_bytes_total - container_network_transmit_bytes_total + - container_resources_gpu_usage_percent + - container_resources_gpu_memory_usage_percent # CloudZero Agent Operational Metrics - Essential for Agent Health Monitoring # These metrics track CloudZero Agent performance, resource usage, and operational health, # enabling monitoring, alerting, and troubleshooting of the cost allocation pipeline. @@ -227,6 +231,8 @@ metricFilters: - container_memory_working_set_bytes - container_network_receive_bytes_total - container_network_transmit_bytes_total + - container_resources_gpu_usage_percent + - container_resources_gpu_memory_usage_percent - kube_node_info - kube_node_status_capacity - kube_pod_container_resource_limits diff --git a/helm/templates/agent-cm.yaml b/helm/templates/agent-cm.yaml index 4ff07e3b..192486c4 100644 --- a/helm/templates/agent-cm.yaml +++ b/helm/templates/agent-cm.yaml @@ -40,6 +40,10 @@ data: {{- include "cloudzero-agent.prometheus.scrapePrometheus" . | nindent 6 }} {{- end }} + {{- if .Values.prometheusConfig.scrapeJobs.gpu.enabled }} + {{- include "cloudzero-agent.prometheus.scrapeGPU" . 
| nindent 6 }} + {{- end }}{{/* End GPU scrape job */}} + {{- if .Values.prometheusConfig.scrapeJobs.additionalScrapeJobs -}} {{ toYaml .Values.prometheusConfig.scrapeJobs.additionalScrapeJobs | toString | nindent 6 }} {{- end }}{{/* End additional scrape jobs */}} diff --git a/helm/tests/gpu_metrics_test.yaml b/helm/tests/gpu_metrics_test.yaml new file mode 100644 index 00000000..f5460925 --- /dev/null +++ b/helm/tests/gpu_metrics_test.yaml @@ -0,0 +1,146 @@ +suite: test GPU metrics scrape job configuration +templates: + - agent-cm.yaml +tests: + # Test that GPU scrape job is included when GPU metrics are enabled + + - it: should include DCGM GPU scrape job when GPU metrics enabled + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + prometheusConfig.scrapeJobs.gpu.scrapeInterval: 30s + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "job_name: cloudzero-dcgm-exporter" + + # Test that GPU scrape job is NOT included when GPU metrics are disabled + + - it: should NOT include GPU scrape job when GPU metrics disabled + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: false + asserts: + - notMatchRegex: + path: data["prometheus.yml"] + pattern: "cloudzero-dcgm-exporter" + + # Test that custom scrape interval is applied correctly + + - it: should use custom GPU scrape interval of 45s when specified + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + prometheusConfig.scrapeJobs.gpu.scrapeInterval: 45s + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "job_name: cloudzero-dcgm-exporter" + - matchRegex: + path: data["prometheus.yml"] + pattern: "scrape_interval: 45s" + + # Test that DCGM label selector is present + + - it: should include DCGM label selector for service discovery + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "app.kubernetes.io/name=dcgm-exporter" + + # Test that provenance label is added + + - it: should add provenance=dcgm label to GPU metrics + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "replacement: dcgm" + - matchRegex: + path: data["prometheus.yml"] + pattern: "target_label: provenance" + + # Test that DCGM metrics are collected in native format + + - it: should collect DCGM_FI_DEV_GPU_UTIL in native format + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_GPU_UTIL" + + - it: should collect DCGM_FI_DEV_FB_USED in native format + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_FB_USED" + + - it: should collect DCGM_FI_DEV_FB_FREE in native format + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_FB_FREE" + + # Test that DCGM metrics are in 
remote_write allow list + + - it: should include DCGM metrics in remote_write filter + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_GPU_UTIL" + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_FB_USED" + - matchRegex: + path: data["prometheus.yml"] + pattern: "DCGM_FI_DEV_FB_FREE" + + # Test that unattributed metrics are dropped + + - it: should drop metrics without container attribution + template: agent-cm.yaml + set: + apiKey: "test-key" + existingSecretName: null + prometheusConfig.scrapeJobs.gpu.enabled: true + asserts: + - matchRegex: + path: data["prometheus.yml"] + pattern: "source_labels: \\[container\\]" + - matchRegex: + path: data["prometheus.yml"] + pattern: "action: drop" diff --git a/helm/values.schema.json b/helm/values.schema.json index f89bc510..d96c54f2 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -6440,6 +6440,18 @@ "required": ["enabled"], "type": "object" }, + "gpu": { + "additionalProperties": false, + "properties": { + "enabled": { + "type": "boolean" + }, + "scrapeInterval": { + "$ref": "#/$defs/com.cloudzero.agent.duration" + } + }, + "type": "object" + }, "kubeStateMetrics": { "additionalProperties": false, "properties": { diff --git a/helm/values.schema.yaml b/helm/values.schema.yaml index 5a5bad02..f10f7941 100644 --- a/helm/values.schema.yaml +++ b/helm/values.schema.yaml @@ -909,6 +909,28 @@ properties: description: | Scrape interval for aggregator job. $ref: "#/$defs/com.cloudzero.agent.duration" + gpu: + description: | + GPU metrics scrape job configuration. + + Automatically discovers and scrapes GPU metrics from NVIDIA DCGM + Exporter. Collects GPU compute utilization and memory usage + metrics with per-container attribution. + type: object + additionalProperties: false + properties: + enabled: + description: | + Whether to enable the GPU metrics scrape job. + + When enabled, Prometheus will automatically discover GPU + exporters (currently only NVIDIA DCGM is supported) and + collect metrics. + type: boolean + scrapeInterval: + description: | + Scrape interval for GPU metrics job. + $ref: "#/$defs/com.cloudzero.agent.duration" additionalScrapeJobs: description: | Additional scrape jobs to add to the configuration. diff --git a/helm/values.yaml b/helm/values.yaml index b9989817..be3f9a04 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -554,6 +554,11 @@ prometheusConfig: enabled: true # Scrape interval for aggregator job scrapeInterval: 120s + # -- Enables the GPU metrics scrape job (NVIDIA DCGM Exporter auto-discovery). + gpu: + enabled: false + # Scrape interval for GPU metrics job + scrapeInterval: 30s # -- Any items added to this list will be added to the Prometheus scrape configuration. additionalScrapeJobs: [] diff --git a/tests/helm/template/cert-manager.yaml b/tests/helm/template/cert-manager.yaml index 83dba59b..51e0984b 100644 --- a/tests/helm/template/cert-manager.yaml +++ b/tests/helm/template/cert-manager.yaml @@ -292,7 +292,7 @@ data: # Metrics to keep. 
- source_labels: [__name__] - regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$ + regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent)$ action: keep kubernetes_sd_configs: @@ -335,7 +335,7 @@ data: regex: port-(shipper|collector) metric_relabel_configs: - source_labels: [__name__] - regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_t
arget_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" + regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failure
s_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" action: keep - job_name: static-prometheus scrape_interval: 120s @@ -352,7 +352,7 @@ data: credentials_file: /etc/config/secrets/value write_relabel_configs: - source_labels: [__name__] - regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|pro
metheus_target_sync_length_seconds)$" action: keep metadata_config: send: false @@ -388,6 +388,10 @@ data: match: exact - pattern: "container_network_transmit_bytes_total" match: exact + - pattern: "container_resources_gpu_usage_percent" + match: exact + - pattern: "container_resources_gpu_memory_usage_percent" + match: exact - pattern: "kube_node_info" match: exact - pattern: "kube_node_status_capacity" @@ -1351,6 +1355,9 @@ data: cadvisor: enabled: true scrapeInterval: 60s + gpu: + enabled: false + scrapeInterval: 30s kubeStateMetrics: enabled: true scrapeInterval: 60s diff --git a/tests/helm/template/federated.yaml b/tests/helm/template/federated.yaml index c0229551..a491f6f1 100644 --- a/tests/helm/template/federated.yaml +++ b/tests/helm/template/federated.yaml @@ -297,7 +297,7 @@ data: regex: port-(shipper|collector) metric_relabel_configs: - source_labels: [__name__] - regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_s
d_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" + regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_
remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" action: keep - job_name: static-prometheus scrape_interval: 120s @@ -314,7 +314,7 @@ data: credentials_file: /etc/config/secrets/value write_relabel_configs: - source_labels: [__name__] - regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|pro
metheus_target_sync_length_seconds)$" action: keep metadata_config: send: false @@ -400,7 +400,7 @@ data: # Metrics to keep. - source_labels: [__name__] - regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$ + regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent)$ action: keep kubernetes_sd_configs: @@ -421,7 +421,7 @@ data: credentials_file: /etc/config/secrets/value write_relabel_configs: - source_labels: [__name__] - regex: "^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|pro
metheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + regex: "^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_tot
al|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" action: keep metadata_config: send: false @@ -457,6 +457,10 @@ data: match: exact - pattern: "container_network_transmit_bytes_total" match: exact + - pattern: "container_resources_gpu_usage_percent" + match: exact + - pattern: "container_resources_gpu_memory_usage_percent" + match: exact - pattern: "kube_node_info" match: exact - pattern: "kube_node_status_capacity" @@ -1420,6 +1424,9 @@ data: cadvisor: enabled: true scrapeInterval: 60s + gpu: + enabled: false + scrapeInterval: 30s kubeStateMetrics: enabled: true scrapeInterval: 60s diff --git a/tests/helm/template/manifest.yaml b/tests/helm/template/manifest.yaml index c211cb77..32a19a43 100644 --- a/tests/helm/template/manifest.yaml +++ b/tests/helm/template/manifest.yaml @@ -308,7 +308,7 @@ data: # Metrics to keep. - source_labels: [__name__] - regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$ + regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent)$ action: keep kubernetes_sd_configs: @@ -351,7 +351,7 @@ data: regex: port-(shipper|collector) metric_relabel_configs: - source_labels: [__name__] - regex: 
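The rendered template above gates GPU scraping behind a `gpu` toggle that defaults to `enabled: false`. Assuming the chart surfaces this toggle through its values the same way it surfaces the neighboring `cadvisor` and `kubeStateMetrics` blocks (the exact parent key in values.yaml is an assumption here, not something this hunk confirms), an install-time override would look roughly like the following sketch, applied with something like `helm upgrade <release> <chart> -f gpu-values.yaml`:

    # gpu-values.yaml -- hypothetical override file; the parent key is assumed
    # to mirror the cadvisor/kubeStateMetrics layout in the rendered ConfigMap.
    metrics:
      gpu:
        enabled: true        # enable the DCGM Exporter scrape job
        scrapeInterval: 30s  # default interval shown in the rendered output
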
"^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|fu
nction_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" + regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work
_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" action: keep - job_name: static-prometheus scrape_interval: 120s @@ -368,7 +368,7 @@ data: credentials_file: /etc/config/secrets/value write_relabel_configs: - source_labels: [__name__] - regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|container_resources_gpu_usage_percent|container_resources_gpu_memory_usage_percent|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|pro
metheus_target_sync_length_seconds)$" action: keep metadata_config: send: false @@ -404,6 +404,10 @@ data: match: exact - pattern: "container_network_transmit_bytes_total" match: exact + - pattern: "container_resources_gpu_usage_percent" + match: exact + - pattern: "container_resources_gpu_memory_usage_percent" + match: exact - pattern: "kube_node_info" match: exact - pattern: "kube_node_status_capacity" @@ -1367,6 +1371,9 @@ data: cadvisor: enabled: true scrapeInterval: 60s + gpu: + enabled: false + scrapeInterval: 30s kubeStateMetrics: enabled: true scrapeInterval: 60s