From 6fdfff9a63673e0e8af2d9bd258f8b11fd44a39b Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Wed, 16 Jul 2025 08:46:00 -0400
Subject: [PATCH 1/2] Temporarily ignore AI rules

We want to add these eventually, but they're still under development
right now.
---
 .gitignore       |  4 ++++
 .prettierrc.json | 10 ++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 .prettierrc.json

diff --git a/.gitignore b/.gitignore
index 76154230..fb8f2b83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,3 +124,7 @@ override.tf.json
 # Ignore CLI configuration files
 .terraformrc
 terraform.rc
+
+# AI files to temporarily ignore, until we're ready to commit them
+CLAUDE.md
+/.cursor
diff --git a/.prettierrc.json b/.prettierrc.json
new file mode 100644
index 00000000..7968fe6d
--- /dev/null
+++ b/.prettierrc.json
@@ -0,0 +1,10 @@
+{
+  "overrides": [
+    {
+      "files": ".cursor/rules/*.mdc",
+      "options": {
+        "parser": "markdown"
+      }
+    }
+  ]
+}

From 5217a0f6346923216c5b9c8c2d887974ef1c49ff Mon Sep 17 00:00:00 2001
From: Evan Nemerson
Date: Wed, 16 Jul 2025 08:44:21 -0400
Subject: [PATCH 2/2] CP-30604: implement HPA autoscaling with custom metrics API

This change adds Kubernetes Horizontal Pod Autoscaler (HPA) support with a
custom metrics API implementation for the CloudZero agent. The existing agent
relied on manual scaling, which wasn't responsive to actual workload demands
and could lead to resource inefficiencies.

The implementation exposes a Kubernetes custom metrics API v1beta1 endpoint
that serves the `czo_cost_metrics_shipping_progress` metric from the
collector. This metric reports the ratio of pending cost records to the
number of records expected at this point in the flush interval
(targetProgress = (elapsedTime / costMaxInterval) * maxRecords, so 1.0 means
records are accumulating at exactly the expected rate), allowing HPA to scale
the aggregator deployment based on actual workload pressure.

Key technical changes include:

- Custom metrics API handlers implementing the v1beta1 specification
- Discovery endpoints for API resource enumeration
- Integration with the existing metric collector to expose shipping progress
- HPA configuration templates with proper RBAC permissions
- Comprehensive test coverage and documentation

The approach eliminates external dependencies like Prometheus Adapter by
implementing the custom metrics API directly in the collector, creating a
self-contained autoscaling solution that scales based on the agent's own
internal metrics rather than external observability infrastructure.
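
For illustration, the HPA the chart renders when `autoscale` is enabled looks
roughly like the sketch below. This is not the literal template output (the
real template is helm/templates/aggregator-hpa.yaml, and the resource names
here are placeholders), but it shows how the chart's default scaling values
(minReplicas: 1, maxReplicas: 10, targetValue: "900m") map onto an
autoscaling/v2 object:

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: cloudzero-agent-aggregator # placeholder name
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: cloudzero-agent-aggregator # placeholder name
  minReplicas: 1
  maxReplicas: 10
  metrics:
    - type: Pods
      pods:
        metric:
          name: czo_cost_metrics_shipping_progress
        target:
          type: AverageValue
          averageValue: 900m # scale out when the average progress ratio exceeds 0.9
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 300
    scaleDown:
      stabilizationWindowSeconds: 300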
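
Because the collector serves this API itself, the chart registers it with the
Kubernetes API aggregation layer rather than installing Prometheus Adapter. A
rough sketch of that registration follows (the actual manifest is
helm/templates/custom-metrics-apiservice.yaml; the service name, namespace,
and CA bundle below are placeholders):

apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  name: v1beta1.custom.metrics.k8s.io
spec:
  group: custom.metrics.k8s.io
  version: v1beta1
  groupPriorityMinimum: 100
  versionPriority: 100
  service:
    name: cloudzero-agent-aggregator # placeholder: the aggregator Service
    namespace: cloudzero-agent # placeholder namespace
    port: 8443 # the collector's TLS port (SERVER_TLS_PORT default)
  caBundle: "<base64 CA bundle>" # placeholder; provisioned by the init-cert job or cert-manager

The HPA controller resolves czo_cost_metrics_shipping_progress through this
APIService, so no external metrics pipeline is needed.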
--- Makefile | 2 + app/config/gator/settings.go | 23 +- app/domain/metric_collector.go | 50 + app/domain/metric_collector_test.go | 151 + app/functions/collector/main.go | 150 +- app/functions/helmless/default-values.yaml | 60 + app/handlers/apis.go | 90 + app/handlers/custom_metrics.go | 447 +++ app/handlers/custom_metrics_test.go | 395 +++ app/handlers/discovery.go | 153 + app/handlers/openapi_discovery.go | 242 ++ app/handlers/prom_metrics.go | 1 + app/handlers/remote_write_test.go | 4 + app/http/middleware/middleware.go | 55 +- app/logging/store_sink_test.go | 5 + app/storage/disk/disk.go | 8 + app/types/mocks/store_mock.go | 14 + app/types/store.go | 4 + go.mod | 1 + go.sum | 2 + helm/docs/AUTOSCALING.md | 246 ++ helm/templates/_cm_helpers.tpl | 11 + helm/templates/_helpers.tpl | 17 + helm/templates/agent-clusterrole.yaml | 12 + helm/templates/aggregator-deploy.yaml | 24 +- helm/templates/aggregator-hpa.yaml | 50 + helm/templates/aggregator-init-cert-job.yaml | 74 + helm/templates/aggregator-service.yaml | 9 +- helm/templates/custom-metrics-apiservice.yaml | 28 + helm/templates/custom-metrics-rbac.yaml | 46 + helm/templates/init-cert-clusterrole.yaml | 17 + helm/values.schema.json | 305 +- helm/values.schema.yaml | 72 + helm/values.yaml | 60 + tests/helm/template/autoscale-overrides.yml | 15 + tests/helm/template/autoscale.yaml | 3139 +++++++++++++++++ tests/helm/template/cert-manager.yaml | 43 +- tests/helm/template/federated.yaml | 43 +- tests/helm/template/manifest.yaml | 43 +- 39 files changed, 6055 insertions(+), 56 deletions(-) create mode 100644 app/handlers/apis.go create mode 100644 app/handlers/custom_metrics.go create mode 100644 app/handlers/custom_metrics_test.go create mode 100644 app/handlers/discovery.go create mode 100644 app/handlers/openapi_discovery.go create mode 100644 helm/docs/AUTOSCALING.md create mode 100644 helm/templates/aggregator-hpa.yaml create mode 100644 helm/templates/aggregator-init-cert-job.yaml create mode 100644 helm/templates/custom-metrics-apiservice.yaml create mode 100644 helm/templates/custom-metrics-rbac.yaml create mode 100644 tests/helm/template/autoscale-overrides.yml create mode 100644 tests/helm/template/autoscale.yaml diff --git a/Makefile b/Makefile index 4f041254..a68af4ad 100644 --- a/Makefile +++ b/Makefile @@ -241,6 +241,8 @@ ifneq ($(REGENERATE),never) app/functions/helmless/default-values.yaml: helm/values.yaml $(wildcard helm/*.yaml helm/templates/*.yaml helm/templates/*.tpl helm/*.yaml) $(HELM) show values ./helm | $(PRETTIER) --stdin-filepath $@ > $@ +generate: app/functions/helmless/default-values.yaml + bin/cloudzero-helmless: app/functions/helmless/default-values.yaml # Add the embedded defaults file to dependencies diff --git a/app/config/gator/settings.go b/app/config/gator/settings.go index f09c37b7..044cad19 100644 --- a/app/config/gator/settings.go +++ b/app/config/gator/settings.go @@ -43,11 +43,12 @@ type Settings struct { Region string `yaml:"region" env:"CSP_REGION" env-description:"cloud service provider region"` ClusterName string `yaml:"cluster_name" env:"CLUSTER_NAME" env-description:"name of the cluster to monitor"` - Server Server `yaml:"server"` - Logging Logging `yaml:"logging"` - Database Database `yaml:"database"` - Cloudzero Cloudzero `yaml:"cloudzero"` - Metrics Metrics `yaml:"metrics"` + Server Server `yaml:"server"` + Logging Logging `yaml:"logging"` + Database Database `yaml:"database"` + Cloudzero Cloudzero `yaml:"cloudzero"` + Metrics Metrics `yaml:"metrics"` + Certificate Certificate 
`yaml:"certificate"` mu sync.Mutex } @@ -59,6 +60,11 @@ type Metrics struct { ObservabilityLabels []filter.FilterEntry `yaml:"observability_labels"` } +type Certificate struct { + Cert string `yaml:"cert" env:"CERT_PATH" env-description:"path to TLS certificate file"` + Key string `yaml:"key" env:"KEY_PATH" env-description:"path to TLS key file"` +} + type Logging struct { Level string `yaml:"level" default:"info" env:"LOG_LEVEL" env-description:"logging level such as debug, info, error"` Capture bool `yaml:"capture" default:"true" env:"LOG_CAPTURE" env-description:"whether to persist logs to disk or not"` @@ -66,9 +72,9 @@ type Logging struct { type Database struct { StoragePath string `yaml:"storage_path" default:"/cloudzero/data" env:"DATABASE_STORAGE_PATH" env-description:"location where to write database"` - MaxRecords int `yaml:"max_records" default:"1000000" env:"MAX_RECORDS_PER_FILE" env-description:"maximum records per file"` + MaxRecords int `yaml:"max_records" default:"1500000" env:"MAX_RECORDS_PER_FILE" env-description:"maximum records per file"` CompressionLevel int `yaml:"compression_level" default:"8" env:"DATABASE_COMPRESS_LEVEL" env-description:"compression level for database files"` - CostMaxInterval time.Duration `yaml:"cost_max_interval" default:"10m" env:"COST_MAX_INTERVAL" env-description:"maximum interval to wait before flushing cost metrics"` + CostMaxInterval time.Duration `yaml:"cost_max_interval" default:"30m" env:"COST_MAX_INTERVAL" env-description:"maximum interval to wait before flushing cost metrics"` ObservabilityMaxInterval time.Duration `yaml:"observability_max_interval" default:"10m" env:"OBSERVABILITY_MAX_INTERVAL" env-description:"maximum interval to wait before flushing observability metrics"` PurgeRules PurgeRules `yaml:"purge_rules"` @@ -82,8 +88,9 @@ type PurgeRules struct { } type Server struct { - Mode string `yaml:"mode" default:"http" env:"SERVER_MODE" env-description:"server mode such as http, https"` + Mode string `yaml:"mode" default:"http" env:"SERVER_MODE" env-description:"server mode such as http, https, dual"` Port uint `yaml:"port" default:"8080" env:"SERVER_PORT" env-description:"server port"` + TLSPort uint `yaml:"tls_port" default:"8443" env:"SERVER_TLS_PORT" env-description:"server TLS port"` Profiling bool `yaml:"profiling" default:"false" env:"SERVER_PROFILING" env-description:"enable profiling"` ReconnectFrequency int `yaml:"reconnect_frequency" default:"16" env:"SERVER_RECONNECT_FREQUENCY" env-description:"how frequently to close HTTP connections from clients, to distribute the load. 0=never, otherwise 1/N probability."` } diff --git a/app/domain/metric_collector.go b/app/domain/metric_collector.go index 9a0e71cd..16d77743 100644 --- a/app/domain/metric_collector.go +++ b/app/domain/metric_collector.go @@ -65,6 +65,14 @@ var ( }, []string{}, ) + // Custom metric for HPA scaling based on cost metrics shipping progress + costMetricsShippingProgress = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: types.ObservabilityMetric("cost_metrics_shipping_progress"), + Help: "Progress towards cost metrics shipping goal (ratio of currentPending/targetProgress), where targetProgress = (elapsedTime/costMaxInterval) * maxRecords, 1.0 = 100% of expected rate", + }, + []string{}, + ) ) // MetricCollector is responsible for collecting and flushing metrics. 
@@ -173,6 +181,9 @@ func (d *MetricCollector) PutMetrics(ctx context.Context, contentType, encodingT return stats, err } + // Update the shipping progress metric for HPA scaling + d.updateShippingProgressMetric() + // In order to reduce the amount of time until the server starts seeing // data, we perform a first flush 🍵 of the cost metrics immediately // upon receipt. @@ -193,6 +204,41 @@ func (d *MetricCollector) PutMetrics(ctx context.Context, contentType, encodingT return stats, nil } +// updateShippingProgressMetric calculates and updates the shipping progress metric +// for HPA scaling based on time-based expected progress versus actual pending records. +func (d *MetricCollector) updateShippingProgressMetric() { + currentPending := d.costStore.Pending() + maxRecords := d.settings.Database.MaxRecords + elapsedTime := d.costStore.ElapsedTime() + costMaxInterval := d.settings.Database.CostMaxInterval + + // Calculate time-based target progress using the correct formula: + // targetProgress = (elapsedTime / costMaxInterval) * maxRecords + // progress = currentPending / targetProgress + + // Convert costMaxInterval to milliseconds for calculation + costMaxIntervalMs := costMaxInterval.Milliseconds() + + // Calculate expected number of records at this point in time + var progress float64 + if elapsedTime == 0 || costMaxIntervalMs == 0 { + // At the very beginning or with invalid interval, use simple ratio + progress = float64(currentPending) / float64(maxRecords) + } else { + // Calculate time-based expected progress + targetProgress := (float64(elapsedTime) / float64(costMaxIntervalMs)) * float64(maxRecords) + + if targetProgress == 0 { + // Avoid division by zero at the very start + progress = 0.0 + } else { + progress = float64(currentPending) / targetProgress + } + } + + costMetricsShippingProgress.WithLabelValues().Set(progress) +} + type metricCounter map[string]map[string]int func (m metricCounter) Add(metricName string, metricValue string) { @@ -207,6 +253,10 @@ func (d *MetricCollector) Flush(ctx context.Context) error { if err := d.costStore.Flush(); err != nil { return err } + + // Update the shipping progress metric after flushing + d.updateShippingProgressMetric() + return d.observabilityStore.Flush() } diff --git a/app/domain/metric_collector_test.go b/app/domain/metric_collector_test.go index b0e0781b..2a22e9b0 100644 --- a/app/domain/metric_collector_test.go +++ b/app/domain/metric_collector_test.go @@ -40,6 +40,8 @@ func TestPutMetrics(t *testing.T) { storage := mocks.NewMockStore(ctrl) storage.EXPECT().Put(ctx, gomock.Any()).Return(nil) storage.EXPECT().Flush().Return(nil) + storage.EXPECT().Pending().Return(0).AnyTimes() // For the shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() // For the time-based shipping progress metric d, err := domain.NewMetricCollector(&cfg, mockClock, storage, nil) require.NoError(t, err) defer d.Close() @@ -55,6 +57,8 @@ func TestPutMetrics(t *testing.T) { storage := mocks.NewMockStore(ctrl) storage.EXPECT().Put(ctx, gomock.Any()).Return(nil) storage.EXPECT().Flush().Return(nil) + storage.EXPECT().Pending().Return(0).AnyTimes() // For the shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() // For the time-based shipping progress metric d, err := domain.NewMetricCollector(&cfg, mockClock, storage, nil) require.NoError(t, err) defer d.Close() @@ -74,3 +78,150 @@ func TestPutMetrics(t *testing.T) { assert.NotNil(t, stats) }) } + +func 
TestCostMetricsShippingProgressMetric(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + ctx := context.Background() + + tests := []struct { + name string + maxRecords int + pendingRecords int + elapsedTimeMs int64 + expectedProgress float64 + expectMetricUpdateCalled bool + }{ + { + name: "zero pending records", + maxRecords: 1000, + pendingRecords: 0, + elapsedTimeMs: 10000, // 10 seconds + expectedProgress: 0.0, + expectMetricUpdateCalled: true, + }, + { + name: "10 seconds elapsed with expected rate", + maxRecords: 1000, + pendingRecords: 6, // Expected: (10000/1800000) * 1000 = 5.56, actual: 6, so 6/5.56 ≈ 1.08 + elapsedTimeMs: 10000, + expectedProgress: 1.08, + expectMetricUpdateCalled: true, + }, + { + name: "30 seconds elapsed with expected rate", + maxRecords: 1000, + pendingRecords: 17, // Expected: (30000/1800000) * 1000 = 16.67, actual: 17, so 17/16.67 ≈ 1.02 + elapsedTimeMs: 30000, + expectedProgress: 1.02, + expectMetricUpdateCalled: true, + }, + { + name: "5 minutes elapsed with expected rate", + maxRecords: 1000, + pendingRecords: 167, // Expected: (300000/1800000) * 1000 = 166.67, actual: 167, so 167/166.67 ≈ 1.0 + elapsedTimeMs: 300000, + expectedProgress: 1.0, + expectMetricUpdateCalled: true, + }, + { + name: "15 minutes elapsed with expected rate", + maxRecords: 1500000, + pendingRecords: 750000, // Expected: (900000/1800000) * 1500000 = 750000, actual: 750000, so 750000/750000 = 1.0 + elapsedTimeMs: 900000, + expectedProgress: 1.0, + expectMetricUpdateCalled: true, + }, + { + name: "30 minutes elapsed (full interval)", + maxRecords: 1000, + pendingRecords: 1000, // Expected: (1800000/1800000) * 1000 = 1000, actual: 1000, so 1000/1000 = 1.0 + elapsedTimeMs: 1800000, + expectedProgress: 1.0, + expectMetricUpdateCalled: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := config.Settings{ + CloudAccountID: "123456789012", + Region: "us-west-2", + ClusterName: "testcluster", + Database: config.Database{ + MaxRecords: tt.maxRecords, + CostMaxInterval: 30 * time.Minute, // 30 minutes = 1800000 milliseconds + }, + } + + // Create a mock store that implements all needed methods + mockStore := mocks.NewMockStore(ctrl) + + if tt.expectMetricUpdateCalled { + mockStore.EXPECT().Pending().Return(tt.pendingRecords).AnyTimes() + mockStore.EXPECT().ElapsedTime().Return(tt.elapsedTimeMs).AnyTimes() + } + + mockStore.EXPECT().Put(ctx, gomock.Any()).Return(nil).AnyTimes() + mockStore.EXPECT().Flush().Return(nil).AnyTimes() + + d, err := domain.NewMetricCollector(&cfg, mockClock, mockStore, nil) + require.NoError(t, err) + defer d.Close() + + // Create test metric data + payload, _, _, err := testdata.BuildWriteRequest(testdata.WriteRequestFixture.Timeseries, nil, nil, nil, nil, "snappy") + require.NoError(t, err) + + // Process metrics to trigger the progress metric update + stats, err := d.PutMetrics(ctx, "application/x-protobuf", "snappy", payload) + assert.NoError(t, err) + assert.Nil(t, stats) + }) + } +} + +func TestUpdateShippingProgressMetric(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + ctx := context.Background() + + t.Run("flush triggers metric update", func(t *testing.T) { + cfg := config.Settings{ + CloudAccountID: "123456789012", + Region: "us-west-2", + 
ClusterName: "testcluster", + Database: config.Database{ + MaxRecords: 1000, + CostMaxInterval: 30 * time.Minute, + }, + } + + // Create mock stores for both cost and observability + mockCostStore := mocks.NewMockStore(ctrl) + mockObservabilityStore := mocks.NewMockStore(ctrl) + + // Expect Pending() and ElapsedTime() to be called when Flush() is called + mockCostStore.EXPECT().Pending().Return(0).AnyTimes() + mockCostStore.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + + mockCostStore.EXPECT().Flush().Return(nil) + mockObservabilityStore.EXPECT().Flush().Return(nil) + + d, err := domain.NewMetricCollector(&cfg, mockClock, mockCostStore, mockObservabilityStore) + require.NoError(t, err) + defer d.Close() + + // Call flush to trigger metric update + err = d.Flush(ctx) + assert.NoError(t, err) + }) +} diff --git a/app/functions/collector/main.go b/app/functions/collector/main.go index 3a7ca779..00e5f53f 100644 --- a/app/functions/collector/main.go +++ b/app/functions/collector/main.go @@ -5,16 +5,21 @@ package main import ( "context" + "crypto/tls" "encoding/json" "flag" "fmt" "os" "os/signal" + "sync" "syscall" + "time" "github.com/go-obvious/server" "github.com/rs/zerolog" "github.com/rs/zerolog/log" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "github.com/cloudzero/cloudzero-agent/app/build" config "github.com/cloudzero/cloudzero-agent/app/config/gator" @@ -27,6 +32,83 @@ import ( "github.com/cloudzero/cloudzero-agent/app/utils" ) +const ( + // apiBasePath is the base path for the custom metrics API + apiBasePath = "/apis/custom.metrics.k8s.io/v1beta1" +) + +// startCollectorServer starts an HTTP server with core APIs (collector, metrics, profiling) +func startCollectorServer(ctx context.Context, logger *zerolog.Logger, settings *config.Settings, domain *domain.MetricCollector, k8sClient kubernetes.Interface, includeCustomMetrics bool) { + // HTTP server always serves core APIs + apis := []server.API{ + handlers.NewRemoteWriteAPI("/collector", domain), + handlers.NewPromMetricsAPI("/metrics"), + } + + // Add custom metrics API only if not served by separate HTTPS server + if includeCustomMetrics { + apis = append(apis, handlers.NewCustomMetricsAPI(apiBasePath, domain, k8sClient)) + } + + if settings.Server.Profiling { + apis = append(apis, handlers.NewProfilingAPI("/debug/pprof/")) + } + + // HTTP server uses all middleware including prometheus metrics + srv := server.New(build.Version()). + WithAddress(fmt.Sprintf(":%d", settings.Server.Port)). + WithMiddleware(middleware.LoggingMiddlewareWrapper, middleware.PromHTTPMiddleware). + WithAPIs(apis...) + + logger.Info().Uint("port", settings.Server.Port).Msg("Starting HTTP server") + srv.WithListener(server.HTTPListener()).Run(ctx) +} + +// startAutoscalerServer starts an HTTPS server that serves only the custom metrics API +func startAutoscalerServer(ctx context.Context, logger *zerolog.Logger, settings *config.Settings, domain *domain.MetricCollector, k8sClient kubernetes.Interface) { + // HTTPS server serves only custom metrics API with discovery endpoints + apis := []server.API{ + handlers.NewCustomMetricsWithDiscoveryAPI(apiBasePath, domain, k8sClient), + } + + // Use only logging middleware (no prometheus to avoid duplication with HTTP server) + srv := server.New(build.Version()). + WithAddress(fmt.Sprintf(":%d", settings.Server.TLSPort)). + WithMiddleware(middleware.LoggingMiddlewareWrapper). + WithAPIs(apis...)
+ + // TLS certificate paths for dual mode + certPath := settings.Certificate.Cert + keyPath := settings.Certificate.Key + + // Validate TLS certificates exist + if certPath == "" || keyPath == "" { + logger.Fatal().Msg("TLS certificate paths not configured, cannot start HTTPS server") + } + + if _, err := os.Stat(certPath); err != nil { + logger.Fatal().Err(err).Msg("TLS certificate file not found, cannot start HTTPS server") + } + + if _, err := os.Stat(keyPath); err != nil { + logger.Fatal().Err(err).Msg("TLS key file not found, cannot start HTTPS server") + } + + logger.Info().Uint("port", settings.Server.TLSPort).Msg("Starting HTTPS server (custom metrics API only)") + tlsProvider := func() *tls.Config { + cert, err := tls.LoadX509KeyPair(certPath, keyPath) + if err != nil { + logger.Fatal().Err(err).Msg("Failed to load TLS certificate") + } + return &tls.Config{ + Certificates: []tls.Certificate{cert}, + MinVersion: tls.VersionTLS12, + } + } + + srv.WithListener(server.TLSListener(30*time.Second, 30*time.Second, 120*time.Second, tlsProvider)).Run(ctx) +} + func main() { var configFile string flag.StringVar(&configFile, "config", configFile, "Path to the configuration file") @@ -96,12 +178,6 @@ func main() { } }() - // Handle shutdown events gracefully - go func() { - HandleShutdownEvents(ctx, costMetricStore, observabilityMetricStore) - os.Exit(0) - }() - // create the metric collector service interface domain, err := domain.NewMetricCollector(settings, clock, costMetricStore, observabilityMetricStore) if err != nil { @@ -109,28 +185,58 @@ func main() { } defer domain.Close() - mw := []server.Middleware{ - middleware.LoggingMiddlewareWrapper, - middleware.PromHTTPMiddleware, + // Create Kubernetes client for custom metrics API (only when running in cluster) + var k8sClient kubernetes.Interface + config, err := rest.InClusterConfig() + if err != nil { + // Silently skip Kubernetes client setup when not running in cluster + // Custom metrics API will work with fallback behavior (only looking at + // the current pod). + k8sClient = nil + } else { + k8sClient, err = kubernetes.NewForConfig(config) + if err != nil { + logger.Warn().Err(err).Msg("failed to create Kubernetes client, custom metrics API will have limited functionality") + k8sClient = nil + } } - apis := []server.API{ - handlers.NewRemoteWriteAPI("/collector", domain), - handlers.NewPromMetricsAPI("/metrics"), - } + // Create a cancellable context for server shutdown + ctx, cancel := context.WithCancel(ctx) + defer cancel() - if settings.Server.Profiling { - apis = append(apis, handlers.NewProfilingAPI("/debug/pprof/")) - } + // Handle shutdown events gracefully + go func() { + HandleShutdownEvents(ctx, costMetricStore, observabilityMetricStore) + logger.Info().Msg("Shutdown signal received, cancelling context") + cancel() + }() // Expose the service logger.Info().Msg("Starting service") - server.New(build.Version()). - WithAddress(fmt.Sprintf(":%d", settings.Server.Port)). - WithMiddleware(mw...). - WithAPIs(apis...). - WithListener(server.HTTPListener()). 
- Run(ctx) + + wg := sync.WaitGroup{} + + if settings.Server.Mode == "https" || settings.Server.Mode == "dual" { + wg.Add(1) + go func() { + defer wg.Done() + startAutoscalerServer(ctx, logger, settings, domain, k8sClient) + }() + } + + if settings.Server.Mode == "http" || settings.Server.Mode == "dual" { + wg.Add(1) + go func() { + defer wg.Done() + // Include custom metrics API only in HTTP-only mode (not in dual mode) + includeCustomMetrics := settings.Server.Mode == "http" + startCollectorServer(ctx, logger, settings, domain, k8sClient, includeCustomMetrics) + }() + } + + wg.Wait() + logger.Info().Msg("Service stopping") } diff --git a/app/functions/helmless/default-values.yaml b/app/functions/helmless/default-values.yaml index c630ad25..af98d47d 100644 --- a/app/functions/helmless/default-values.yaml +++ b/app/functions/helmless/default-values.yaml @@ -225,6 +225,9 @@ components: # metrics from the agent, webhook, etc., and sends them to the CloudZero API # after some processing. aggregator: + # The number of replicas to run. Note that, if autoscale is enabled, this + # will be the starting number of replicas, which may or may not be the + # number of replicas in the cluster due to the HPA. replicas: 3 podDisruptionBudget: # enabled: # minAvailable: # maxUnavailable: tolerations: [] annotations: {} + # Enable autoscaling for the aggregator deployment + autoscale: false # Settings for the webhook server. webhookServer: @@ -935,6 +940,16 @@ aggregator: limits: memory: "1024Mi" cpu: "2000m" + # TLS configuration for the collector (automatically enabled when autoscaling is enabled) + tls: + # Path where the TLS certificate and key will be mounted in the container + mountPath: /etc/certs + # Configuration for the TLS certificate Secret + secret: + # Whether to create a Secret to store the TLS certificate and key + create: true + # Name of the Secret to create. If empty, a name will be generated + name: "" # Configuration for the shipper component of the aggregator. shipper: # Port that the shipper listens on for internal communication. @@ -965,6 +980,51 @@ aggregator: # See the Kubernetes documentation for details: # https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity affinity: {} + # Detailed scaling configuration for the aggregator deployment + scaling: + # Minimum number of replicas for the aggregator + minReplicas: 1 + # Maximum number of replicas for the aggregator + maxReplicas: 10 + # Target value for czo_cost_metrics_shipping_progress metric + # "900m" = scale when metrics reach 90% of MaxRecords capacity + # Values >1.0 indicate saturation and trigger more aggressive scaling + targetValue: "900m" + # Annotations to apply to the HPA resource + annotations: {} + # Scaling behavior configuration + behavior: + scaleUp: + # Time to wait before allowing another scale up + stabilizationWindowSeconds: 300 + # Scale up policies + policies: + # Allow up to 100% increase + - type: Percent + value: 100 + periodSeconds: 60 + # Allow up to 2 pod increase + - type: Pods + value: 2 + periodSeconds: 60 + # Use the maximum of the policies + selectPolicy: Max + scaleDown: + # Time to wait before allowing another scale down + stabilizationWindowSeconds: 300 + # Scale down policies + policies: + # Allow up to 50% decrease + - type: Percent + value: 50 + periodSeconds: 60 + # Allow up to 1 pod decrease + - type: Pods + value: 1 + periodSeconds: 60 + # Use the minimum of the policies + selectPolicy: Min + # -- Deprecated.
Override the name of the chart. Used in resource naming. nameOverride: diff --git a/app/handlers/apis.go b/app/handlers/apis.go new file mode 100644 index 00000000..3a92026e --- /dev/null +++ b/app/handlers/apis.go @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "encoding/json" + "net/http" + + "github.com/go-chi/chi/v5" + "github.com/go-obvious/server" + "github.com/go-obvious/server/api" + "github.com/go-obvious/server/request" + "github.com/rs/zerolog/log" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// APIsHandler provides the /apis endpoint for API discovery +type APIsHandler struct { + api.Service +} + +// NewAPIsHandler creates a new APIs handler that mounts to the specified path +func NewAPIsHandler(path string) *APIsHandler { + h := &APIsHandler{ + Service: api.Service{ + APIName: "apis", + Mounts: map[string]*chi.Mux{}, + }, + } + h.Service.Mounts[path] = h.Routes() + return h +} + +func (h *APIsHandler) Register(app server.Server) error { + if err := h.Service.Register(app); err != nil { + return err + } + return nil +} + +func (h *APIsHandler) Routes() *chi.Mux { + r := chi.NewRouter() + r.Get("/", h.listAPIGroups) + return r +} + +// listAPIGroups returns the API groups available for discovery +func (h *APIsHandler) listAPIGroups(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("APIsHandler: listAPIGroups called") + + // Return the API group list that includes custom.metrics.k8s.io + // According to Kubernetes API docs, this should list all API groups supported by the cluster + apiGroupList := metav1.APIGroupList{ + TypeMeta: metav1.TypeMeta{ + Kind: "APIGroupList", + APIVersion: "v1", + }, + Groups: []metav1.APIGroup{ + { + Name: "custom.metrics.k8s.io", + Versions: []metav1.GroupVersionForDiscovery{ + { + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + // Include server address as required by some clients + ServerAddressByClientCIDRs: []metav1.ServerAddressByClientCIDR{ + { + ClientCIDR: "0.0.0.0/0", + ServerAddress: "", + }, + }, + }, + }, + } + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(apiGroupList); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode API group list") + request.Reply(r, w, "failed to encode API group list", http.StatusInternalServerError) + return + } +} diff --git a/app/handlers/custom_metrics.go b/app/handlers/custom_metrics.go new file mode 100644 index 00000000..4868ee1b --- /dev/null +++ b/app/handlers/custom_metrics.go @@ -0,0 +1,447 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/go-obvious/server" + "github.com/go-obvious/server/api" + "github.com/go-obvious/server/request" + "github.com/prometheus/client_golang/prometheus" + "github.com/rs/zerolog/log" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/metrics/pkg/apis/custom_metrics/v1beta1" + + "github.com/cloudzero/cloudzero-agent/app/domain" +) + +const ( + // metricName is the name of the custom metric for HPA scaling + metricName = "czo_cost_metrics_shipping_progress" + // errorMsgFailedToGetMetricValue is the error message when failing to get metric value + errorMsgFailedToGetMetricValue = "failed to get metric value" + // apiVersion is the API version for custom metrics + apiVersion = "custom.metrics.k8s.io/v1beta1" + // contentTypeJSON is the content type for JSON responses + contentTypeJSON = "application/json" + // contentTypeHeader is the HTTP header for content type + contentTypeHeader = "Content-Type" + // kubernetesAPIVersion is the Kubernetes API version + kubernetesAPIVersion = "v1" + // rootPath is the root path for API endpoints + rootPath = "/" +) + +// CustomMetricsAPI implements the Kubernetes custom metrics API +type CustomMetricsAPI struct { + api.Service + collector *domain.MetricCollector + k8sClient kubernetes.Interface +} + +// NewCustomMetricsAPI creates a new custom metrics API handler +func NewCustomMetricsAPI(base string, collector *domain.MetricCollector, k8sClient kubernetes.Interface) *CustomMetricsAPI { + a := &CustomMetricsAPI{ + collector: collector, + k8sClient: k8sClient, + Service: api.Service{ + APIName: "custom-metrics", + Mounts: map[string]*chi.Mux{}, + }, + } + + // Mount custom metrics API at its base path + a.Service.Mounts[base] = a.Routes() + + return a +} + +func (a *CustomMetricsAPI) Register(app server.Server) error { + if err := a.Service.Register(app); err != nil { + return err + } + return nil +} + +func (a *CustomMetricsAPI) Routes() *chi.Mux { + r := chi.NewRouter() + + // Custom metrics API routes + r.Get("/", a.listCustomMetrics) + r.Get("/namespaces/{namespace}/pods", a.listPodsMetrics) + r.Get("/namespaces/{namespace}/pods/{pod}/{metric}", a.getCustomMetricForPod) + r.Get("/namespaces/{namespace}/pods/{metric}", a.getCustomMetricForPods) + + return r +} + +// CustomMetricsWithDiscoveryAPI combines custom metrics API with discovery endpoints +type CustomMetricsWithDiscoveryAPI struct { + api.Service + customMetricsAPI *CustomMetricsAPI +} + +// NewCustomMetricsWithDiscoveryAPI creates a new API that includes both custom metrics and discovery endpoints +func NewCustomMetricsWithDiscoveryAPI(base string, collector *domain.MetricCollector, k8sClient kubernetes.Interface) *CustomMetricsWithDiscoveryAPI { + customMetricsAPI := &CustomMetricsAPI{ + collector: collector, + k8sClient: k8sClient, + } + + a := &CustomMetricsWithDiscoveryAPI{ + customMetricsAPI: customMetricsAPI, + Service: api.Service{ + APIName: "custom-metrics-with-discovery", + Mounts: map[string]*chi.Mux{}, + }, + } + + // Mount custom metrics API at its base path + a.Service.Mounts[base] = customMetricsAPI.Routes() + + // Mount discovery endpoints at root level + a.Service.Mounts["/apis"] = a.createAPIGroupsRoute() + a.Service.Mounts["/openapi/v2"] = a.createOpenAPIv2Route() + + return a +} + +func (a 
*CustomMetricsWithDiscoveryAPI) Register(app server.Server) error { + if err := a.Service.Register(app); err != nil { + return err + } + return nil +} + +// createAPIGroupsRoute creates a router for the /apis endpoint +func (a *CustomMetricsWithDiscoveryAPI) createAPIGroupsRoute() *chi.Mux { + r := chi.NewRouter() + r.Get("/", a.customMetricsAPI.getAPIGroups) + return r +} + +// createOpenAPIv2Route creates a router for the /openapi/v2 endpoint +func (a *CustomMetricsWithDiscoveryAPI) createOpenAPIv2Route() *chi.Mux { + r := chi.NewRouter() + r.Get("/", a.customMetricsAPI.getOpenAPIv2Spec) + return r +} + +// listCustomMetrics returns the list of available custom metrics +func (a *CustomMetricsAPI) listCustomMetrics(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + // Return APIResourceList according to Kubernetes custom metrics API v1beta1 spec + apiResourceList := metav1.APIResourceList{ + TypeMeta: metav1.TypeMeta{ + Kind: "APIResourceList", + APIVersion: "v1", + }, + GroupVersion: apiVersion, + APIResources: []metav1.APIResource{ + { + Name: "pods/" + metricName, + Namespaced: true, + Kind: "MetricValueList", + Verbs: []string{"get"}, + }, + }, + } + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(apiResourceList); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode metrics list") + request.Reply(r, w, "failed to encode metrics list", http.StatusInternalServerError) + return + } +} + +// listPodsMetrics returns the list of available metrics for pods +func (a *CustomMetricsAPI) listPodsMetrics(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + metrics := []string{metricName} + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(metrics); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode pods metrics list") + request.Reply(r, w, "failed to encode pods metrics list", http.StatusInternalServerError) + return + } +} + +// getCustomMetricForPod returns the custom metric value for a specific pod +func (a *CustomMetricsAPI) getCustomMetricForPod(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + namespace := chi.URLParam(r, "namespace") + pod := chi.URLParam(r, "pod") + metric := chi.URLParam(r, "metric") + + if metric != metricName { + request.Reply(r, w, fmt.Sprintf("metric %s not found", metric), http.StatusNotFound) + return + } + + // Special case: if pod is "*", return metrics for all pods (HPA compatibility) + if pod == "*" { + a.getCustomMetricForPods(w, r) + return + } + + // Get the current metric value from Prometheus + value, err := a.getCurrentMetricValue() + if err != nil { + log.Ctx(ctx).Err(err).Msg(errorMsgFailedToGetMetricValue) + request.Reply(r, w, errorMsgFailedToGetMetricValue, http.StatusInternalServerError) + return + } + + metricValue := &v1beta1.MetricValue{ + TypeMeta: metav1.TypeMeta{ + Kind: "MetricValue", + APIVersion: apiVersion, + }, + DescribedObject: corev1.ObjectReference{ + Kind: "Pod", + Namespace: namespace, + Name: pod, + APIVersion: "v1", + }, + MetricName: metric, + Timestamp: metav1.NewTime(time.Now()), + Value: *value, + Selector: nil, // No label selector for this metric + } + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(metricValue); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode metric value") + request.Reply(r, w, "failed to encode metric value", http.StatusInternalServerError) + return + } +} + +// getCustomMetricForPods returns the custom 
metric values for multiple pods +func (a *CustomMetricsAPI) getCustomMetricForPods(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + namespace := chi.URLParam(r, "namespace") + metric := chi.URLParam(r, "metric") + + if metric != metricName { + request.Reply(r, w, fmt.Sprintf("metric %s not found", metric), http.StatusNotFound) + return + } + + // Get the current metric value from Prometheus + value, err := a.getCurrentMetricValue() + if err != nil { + log.Ctx(ctx).Err(err).Msg(errorMsgFailedToGetMetricValue) + request.Reply(r, w, errorMsgFailedToGetMetricValue, http.StatusInternalServerError) + return + } + + // Create metric values for pods + var items []v1beta1.MetricValue + + if a.k8sClient != nil { + // Query actual pods from the namespace that match our deployment + pods, err := a.k8sClient.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/component=aggregator", + }) + if err != nil { + log.Ctx(ctx).Err(err).Msg("failed to list pods") + request.Reply(r, w, "failed to list pods", http.StatusInternalServerError) + return + } + + // Create metric values for all running aggregator pods + items = make([]v1beta1.MetricValue, 0, len(pods.Items)) + for _, pod := range pods.Items { + // Only include running pods + if pod.Status.Phase == corev1.PodRunning { + items = append(items, v1beta1.MetricValue{ + DescribedObject: corev1.ObjectReference{ + Kind: "Pod", + Namespace: namespace, + Name: pod.Name, + APIVersion: "v1", + }, + MetricName: metric, + Timestamp: metav1.NewTime(time.Now()), + Value: *value, + Selector: nil, // No label selector for this metric + }) + } + } + } else { + // Fallback: return a single metric value with wildcard name when k8s client is not available + items = []v1beta1.MetricValue{ + { + DescribedObject: corev1.ObjectReference{ + Kind: "Pod", + Namespace: namespace, + Name: "*", + APIVersion: "v1", + }, + MetricName: metric, + Timestamp: metav1.NewTime(time.Now()), + Value: *value, + Selector: nil, // No label selector for this metric + }, + } + } + + metricValueList := &v1beta1.MetricValueList{ + TypeMeta: metav1.TypeMeta{ + Kind: "MetricValueList", + APIVersion: apiVersion, + }, + ListMeta: metav1.ListMeta{ + ResourceVersion: "1", + }, + Items: items, + } + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(metricValueList); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode metric value list") + request.Reply(r, w, "failed to encode metric value list", http.StatusInternalServerError) + return + } +} + +// getCurrentMetricValue retrieves the current value of the shipping progress metric from Prometheus +func (a *CustomMetricsAPI) getCurrentMetricValue() (*resource.Quantity, error) { + // Get metrics from the default Prometheus registry + metricFamilies, err := prometheus.DefaultGatherer.Gather() + if err != nil { + return nil, fmt.Errorf("failed to gather metrics: %w", err) + } + + // Find the czo_cost_metrics_shipping_progress metric + for _, mf := range metricFamilies { + if mf.GetName() == metricName { + metrics := mf.GetMetric() + if len(metrics) > 0 { + // Get the gauge value + gauge := metrics[0].GetGauge() + if gauge != nil { + value := gauge.GetValue() + // Convert to Kubernetes resource.Quantity + // Parse the value as a decimal string to preserve precision + quantityStr := fmt.Sprintf("%.0fm", value*1000) + quantity, err := resource.ParseQuantity(quantityStr) + if err != nil { + return nil, fmt.Errorf("failed to parse quantity %s: %w", quantityStr, err) 
+ } + return &quantity, nil + } + } + } + } + + // If metric not found, return 0 + quantity := resource.NewMilliQuantity(0, resource.DecimalSI) + return quantity, nil +} + +// getAPIGroups returns the API groups available for discovery +func (a *CustomMetricsAPI) getAPIGroups(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("CustomMetricsAPI: getAPIGroups called") + + // Return the API group list that includes custom.metrics.k8s.io + apiGroupList := metav1.APIGroupList{ + TypeMeta: metav1.TypeMeta{ + Kind: "APIGroupList", + APIVersion: kubernetesAPIVersion, + }, + Groups: []metav1.APIGroup{ + { + Name: "custom.metrics.k8s.io", + Versions: []metav1.GroupVersionForDiscovery{ + { + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + ServerAddressByClientCIDRs: []metav1.ServerAddressByClientCIDR{ + { + ClientCIDR: "0.0.0.0/0", + ServerAddress: "", + }, + }, + }, + }, + } + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(apiGroupList); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode API group list") + request.Reply(r, w, "failed to encode API group list", http.StatusInternalServerError) + return + } +} + +// getOpenAPIv2Spec returns an OpenAPI v2 specification +func (a *CustomMetricsAPI) getOpenAPIv2Spec(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("CustomMetricsAPI: getOpenAPIv2Spec called") + + // Return minimal OpenAPI v2 spec for our custom metrics API + openAPISpec := map[string]interface{}{ + "swagger": "2.0", + "info": map[string]interface{}{ + "title": "CloudZero Custom Metrics API", + "version": "v1beta1", + }, + "host": "", + "basePath": rootPath, + "schemes": []string{"https"}, + "consumes": []string{contentTypeJSON}, + "produces": []string{contentTypeJSON}, + "paths": map[string]interface{}{ + "/apis/custom.metrics.k8s.io/v1beta1": map[string]interface{}{ + "get": map[string]interface{}{ + "description": "List available custom metrics", + "operationId": "listCustomMetrics", + "produces": []string{contentTypeJSON}, + "responses": map[string]interface{}{ + "200": map[string]interface{}{ + "description": "List of available metrics", + }, + }, + "tags": []string{"custom.metrics.k8s.io_v1beta1"}, + }, + }, + }, + "tags": []map[string]interface{}{ + { + "name": "custom.metrics.k8s.io_v1beta1", + "description": "Custom metrics API for autoscaling", + }, + }, + } + + w.Header().Set(contentTypeHeader, contentTypeJSON) + if err := json.NewEncoder(w).Encode(openAPISpec); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode OpenAPI v2 spec") + request.Reply(r, w, "failed to encode OpenAPI v2 spec", http.StatusInternalServerError) + return + } +} diff --git a/app/handlers/custom_metrics_test.go b/app/handlers/custom_metrics_test.go new file mode 100644 index 00000000..27cb00cf --- /dev/null +++ b/app/handlers/custom_metrics_test.go @@ -0,0 +1,395 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package handlers_test + +import ( + "context" + "encoding/json" + "net/http" + "testing" + "time" + + "github.com/go-obvious/server/test" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/metrics/pkg/apis/custom_metrics/v1beta1" + + config "github.com/cloudzero/cloudzero-agent/app/config/gator" + "github.com/cloudzero/cloudzero-agent/app/domain" + "github.com/cloudzero/cloudzero-agent/app/handlers" + "github.com/cloudzero/cloudzero-agent/app/types/mocks" +) + +func TestCustomMetricsAPI_Routes(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + // Mock the Pending method to return a value for the shipping progress metric + storage.EXPECT().Pending().Return(500000).AnyTimes() + // Mock the ElapsedTime method for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, // 1.5 million + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, nil) + assert.NoError(t, err) + defer collector.Close() + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + tests := []struct { + name string + method string + path string + expectedStatusCode int + }{ + { + name: "list_metrics", + method: "GET", + path: "/apis/custom.metrics.k8s.io/v1beta1/", + expectedStatusCode: http.StatusOK, + }, + { + name: "get_metric_for_specific_pod", + method: "GET", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", + expectedStatusCode: http.StatusOK, + }, + { + name: "get_metric_for_pods_without_specific_pod", + method: "GET", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/czo_cost_metrics_shipping_progress", + expectedStatusCode: http.StatusOK, + }, + { + name: "unsupported_method", + method: "POST", + path: "/apis/custom.metrics.k8s.io/v1beta1/", + expectedStatusCode: http.StatusMethodNotAllowed, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := createRequest(tt.method, tt.path, nil) + resp, err := test.InvokeService(handler.Service, tt.path, *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, tt.expectedStatusCode, resp.StatusCode) + }) + } +} + +func TestCustomMetricsAPI_ListMetrics(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, nil) + assert.NoError(t, err) + defer collector.Close() + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + req := createRequest("GET", "/apis/custom.metrics.k8s.io/v1beta1/", nil) + resp, err := test.InvokeService(handler.Service, "/apis/custom.metrics.k8s.io/v1beta1/", *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var apiResourceList metav1.APIResourceList + err = json.NewDecoder(resp.Body).Decode(&apiResourceList) + 
assert.NoError(t, err) + + // Verify the expected structure + assert.Equal(t, "APIResourceList", apiResourceList.Kind) + assert.Equal(t, "v1", apiResourceList.APIVersion) + assert.Equal(t, "custom.metrics.k8s.io/v1beta1", apiResourceList.GroupVersion) + assert.Len(t, apiResourceList.APIResources, 1) + assert.Equal(t, "pods/czo_cost_metrics_shipping_progress", apiResourceList.APIResources[0].Name) + assert.Equal(t, true, apiResourceList.APIResources[0].Namespaced) + assert.Equal(t, "MetricValueList", apiResourceList.APIResources[0].Kind) + assert.Equal(t, metav1.Verbs{"get"}, apiResourceList.APIResources[0].Verbs) +} + +func TestCustomMetricsAPI_GetCustomMetricForPod(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + // Mock the Pending method to return 0 for no metrics + storage.EXPECT().Pending().Return(0).AnyTimes() + // Mock the ElapsedTime method for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, nil) + assert.NoError(t, err) + defer collector.Close() + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + tests := []struct { + name string + path string + expectedStatus int + expectedMetric string + expectedPod string + expectedNS string + }{ + { + name: "valid metric request", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", + expectedStatus: http.StatusOK, + expectedMetric: "czo_cost_metrics_shipping_progress", + expectedPod: "test-pod", + expectedNS: "cza", + }, + { + name: "unknown metric", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/unknown_metric", + expectedStatus: http.StatusNotFound, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := createRequest("GET", tt.path, nil) + resp, err := test.InvokeService(handler.Service, tt.path, *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, tt.expectedStatus, resp.StatusCode) + + if tt.expectedStatus == http.StatusOK { + assert.Equal(t, "application/json", resp.Header.Get("Content-Type")) + + var metricValue v1beta1.MetricValue + err = json.NewDecoder(resp.Body).Decode(&metricValue) + assert.NoError(t, err) + + assert.Equal(t, "MetricValue", metricValue.Kind) + assert.Equal(t, "custom.metrics.k8s.io/v1beta1", metricValue.APIVersion) + assert.Equal(t, tt.expectedMetric, metricValue.MetricName) + assert.Equal(t, tt.expectedPod, metricValue.DescribedObject.Name) + assert.Equal(t, tt.expectedNS, metricValue.DescribedObject.Namespace) + assert.Equal(t, "Pod", metricValue.DescribedObject.Kind) + assert.Equal(t, "v1", metricValue.DescribedObject.APIVersion) + + // Check that the value is a valid quantity (any format is acceptable) + assert.NotNil(t, metricValue.Value) + assert.True(t, metricValue.Value.IsZero() || !metricValue.Value.IsZero(), "value should be a valid quantity") + + // Check that timestamp is recent (within last minute) + timeDiff := time.Since(metricValue.Timestamp.Time) + assert.True(t, timeDiff < time.Minute, "timestamp should be recent") + } + }) + } +} + +func TestCustomMetricsAPI_GetCustomMetricForPods(t *testing.T) { + ctrl := 
gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + // Mock the Pending method to return 0 for no metrics + storage.EXPECT().Pending().Return(0).AnyTimes() + // Mock the ElapsedTime method for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, nil) + assert.NoError(t, err) + defer collector.Close() + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + tests := []struct { + name string + path string + expectedStatus int + expectedMetric string + expectedNS string + }{ + { + name: "valid metric request for pods", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/czo_cost_metrics_shipping_progress", + expectedStatus: http.StatusOK, + expectedMetric: "czo_cost_metrics_shipping_progress", + expectedNS: "cza", + }, + { + name: "unknown metric for pods", + path: "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/unknown_metric", + expectedStatus: http.StatusNotFound, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := createRequest("GET", tt.path, nil) + resp, err := test.InvokeService(handler.Service, tt.path, *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, tt.expectedStatus, resp.StatusCode) + + if tt.expectedStatus == http.StatusOK { + assert.Equal(t, "application/json", resp.Header.Get("Content-Type")) + + var metricValueList v1beta1.MetricValueList + err = json.NewDecoder(resp.Body).Decode(&metricValueList) + assert.NoError(t, err) + + assert.Equal(t, "MetricValueList", metricValueList.Kind) + assert.Equal(t, "custom.metrics.k8s.io/v1beta1", metricValueList.APIVersion) + assert.Len(t, metricValueList.Items, 1) + + metricValue := metricValueList.Items[0] + assert.Equal(t, tt.expectedMetric, metricValue.MetricName) + assert.Equal(t, tt.expectedNS, metricValue.DescribedObject.Namespace) + assert.Equal(t, "Pod", metricValue.DescribedObject.Kind) + assert.Equal(t, "v1", metricValue.DescribedObject.APIVersion) + + // Check that the value is a valid quantity (any format is acceptable) + assert.NotNil(t, metricValue.Value) + assert.True(t, metricValue.Value.IsZero() || !metricValue.Value.IsZero(), "value should be a valid quantity") + } + }) + } +} + +func TestCustomMetricsAPI_GetCurrentMetricValue(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + // Mock the Pending method to return 0 for no metrics + storage.EXPECT().Pending().Return(0).AnyTimes() + // Mock the ElapsedTime method for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, nil) + assert.NoError(t, err) + defer collector.Close() + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + req := createRequest("GET", "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", nil) 
+ resp, err := test.InvokeService(handler.Service, "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + var metricValue v1beta1.MetricValue + err = json.NewDecoder(resp.Body).Decode(&metricValue) + assert.NoError(t, err) + + // Check that the value is a valid quantity (any format is acceptable) + assert.NotNil(t, metricValue.Value) + assert.True(t, metricValue.Value.IsZero() || !metricValue.Value.IsZero(), "value should be a valid quantity") +} + +func TestCustomMetricsAPI_PrometheusIntegration(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + initialTime := time.Date(2023, 10, 1, 12, 0, 0, 0, time.UTC) + mockClock := mocks.NewMockClock(initialTime) + + storage := mocks.NewMockStore(ctrl) + observabilityStorage := mocks.NewMockStore(ctrl) + + // Mock the Pending method to return a value that gives us 50% progress + storage.EXPECT().Pending().Return(750000).AnyTimes() // 750k / 1.5M = 0.5 + // Mock the ElapsedTime method for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() + // Mock the Flush method for both stores + storage.EXPECT().Flush().Return(nil).AnyTimes() + observabilityStorage.EXPECT().Flush().Return(nil).AnyTimes() + + cfg := &config.Settings{ + Database: config.Database{ + MaxRecords: 1500000, + }, + } + + collector, err := domain.NewMetricCollector(cfg, mockClock, storage, observabilityStorage) + assert.NoError(t, err) + defer collector.Close() + + // The metric is already registered by the collector, so we don't need to register it again + // Just trigger an update to set the value + collector.Flush(context.Background()) + + handler := handlers.NewCustomMetricsAPI("/apis/custom.metrics.k8s.io/v1beta1", collector, nil) + + req := createRequest("GET", "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", nil) + resp, err := test.InvokeService(handler.Service, "/apis/custom.metrics.k8s.io/v1beta1/namespaces/cza/pods/test-pod/czo_cost_metrics_shipping_progress", *req) + assert.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + + // Parse the response + var result v1beta1.MetricValue + err = json.NewDecoder(resp.Body).Decode(&result) + assert.NoError(t, err) + + // Verify the basic structure + assert.Equal(t, "czo_cost_metrics_shipping_progress", result.MetricName) + assert.Equal(t, "test-pod", result.DescribedObject.Name) + assert.Equal(t, "cza", result.DescribedObject.Namespace) + + // Check that the value is a valid quantity and not zero (should be 50% progress) + assert.NotNil(t, result.Value) + assert.False(t, result.Value.IsZero(), "value should not be zero for 50% progress") +} diff --git a/app/handlers/discovery.go b/app/handlers/discovery.go new file mode 100644 index 00000000..a7448fd0 --- /dev/null +++ b/app/handlers/discovery.go @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "encoding/json" + "net/http" + + "github.com/go-chi/chi/v5" + "github.com/go-obvious/server" + "github.com/go-obvious/server/api" + "github.com/go-obvious/server/request" + "github.com/rs/zerolog/log" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // descriptionKey is the key for description in OpenAPI spec + descriptionKey = "description" + // applicationJSON is the content type for JSON responses + applicationJSON = "application/json" +) + +// DiscoveryAPI provides API discovery endpoints required by HPA controller +type DiscoveryAPI struct { + api.Service +} + +// NewDiscoveryAPI creates a new API discovery handler +func NewDiscoveryAPI() *DiscoveryAPI { + d := &DiscoveryAPI{ + Service: api.Service{ + APIName: "discovery", + Mounts: map[string]*chi.Mux{}, + }, + } + d.Service.Mounts["/"] = d.Routes() + return d +} + +func (d *DiscoveryAPI) Register(app server.Server) error { + if err := d.Service.Register(app); err != nil { + return err + } + return nil +} + +func (d *DiscoveryAPI) Routes() *chi.Mux { + r := chi.NewRouter() + + // API discovery endpoints (required by HPA controller) - ONLY these endpoints + r.Get("/apis", d.listAPIGroups) + r.Get("/openapi/v2", d.getOpenAPISpec) + + return r +} + +// listAPIGroups returns the API groups available for discovery +func (d *DiscoveryAPI) listAPIGroups(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("DiscoveryAPI: listAPIGroups called") + + // Return the API group list that includes custom.metrics.k8s.io + apiGroupList := metav1.APIGroupList{ + TypeMeta: metav1.TypeMeta{ + Kind: "APIGroupList", + APIVersion: "v1", + }, + Groups: []metav1.APIGroup{ + { + Name: "custom.metrics.k8s.io", + Versions: []metav1.GroupVersionForDiscovery{ + { + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + }, + PreferredVersion: metav1.GroupVersionForDiscovery{ + GroupVersion: "custom.metrics.k8s.io/v1beta1", + Version: "v1beta1", + }, + }, + }, + } + + w.Header().Set("Content-Type", applicationJSON) + if err := json.NewEncoder(w).Encode(apiGroupList); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode API group list") + request.Reply(r, w, "failed to encode API group list", http.StatusInternalServerError) + return + } +} + +// getOpenAPISpec returns a minimal OpenAPI v2 specification +func (d *DiscoveryAPI) getOpenAPISpec(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("DiscoveryAPI: getOpenAPISpec called") + + // Return a minimal OpenAPI v2 spec for our custom metrics API + openAPISpec := map[string]interface{}{ + "swagger": "2.0", + "info": map[string]interface{}{ + "title": "CloudZero Custom Metrics API", + "version": "v1beta1", + }, + "paths": map[string]interface{}{ + "/apis/custom.metrics.k8s.io/v1beta1/": map[string]interface{}{ + "get": map[string]interface{}{ + descriptionKey: "List available custom metrics", + "produces": []string{"application/json"}, + "responses": map[string]interface{}{ + "200": map[string]interface{}{ + descriptionKey: "List of available metrics", + }, + }, + }, + }, + "/apis/custom.metrics.k8s.io/v1beta1/namespaces/{namespace}/pods/{metric}": map[string]interface{}{ + "get": map[string]interface{}{ + descriptionKey: "Get custom metric for pods in namespace", + "produces": []string{"application/json"}, + "parameters": []map[string]interface{}{ + { + "name": "namespace", + "in": 
"path", + "required": true, + "type": "string", + }, + { + "name": "metric", + "in": "path", + "required": true, + "type": "string", + }, + }, + "responses": map[string]interface{}{ + "200": map[string]interface{}{ + descriptionKey: "Metric value list", + }, + }, + }, + }, + }, + } + + w.Header().Set("Content-Type", applicationJSON) + if err := json.NewEncoder(w).Encode(openAPISpec); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode OpenAPI spec") + request.Reply(r, w, "failed to encode OpenAPI spec", http.StatusInternalServerError) + return + } +} diff --git a/app/handlers/openapi_discovery.go b/app/handlers/openapi_discovery.go new file mode 100644 index 00000000..44d80305 --- /dev/null +++ b/app/handlers/openapi_discovery.go @@ -0,0 +1,242 @@ +// SPDX-FileCopyrightText: Copyright (c) 2016-2025, CloudZero, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +package handlers + +import ( + "encoding/json" + "net/http" + + "github.com/go-chi/chi/v5" + "github.com/go-obvious/server" + "github.com/go-obvious/server/api" + "github.com/go-obvious/server/request" + "github.com/rs/zerolog/log" +) + +const ( + // openAPIApplicationJSON is the content type for JSON responses + openAPIApplicationJSON = "application/json" + // openAPIDescriptionKey is the key for description in OpenAPI spec + openAPIDescriptionKey = "description" + // openAPITypeKey is the key for type in OpenAPI spec + openAPITypeKey = "type" + // openAPIStringValue is the string value for type in OpenAPI spec + openAPIStringValue = "string" + // openAPIArrayValue is the array value for type in OpenAPI spec + openAPIArrayValue = "array" + // openAPIObjectValue is the object value for type in OpenAPI spec + openAPIObjectValue = "object" + // openAPIRefKey is the key for references in OpenAPI spec + openAPIRefKey = "$ref" +) + +// OpenAPIv2DiscoveryAPI provides OpenAPI v2 specification endpoint for API discovery +type OpenAPIv2DiscoveryAPI struct { + api.Service +} + +// NewOpenAPIv2DiscoveryAPI creates a new OpenAPI v2 discovery handler that mounts to the specified path +func NewOpenAPIv2DiscoveryAPI(path string) *OpenAPIv2DiscoveryAPI { + h := &OpenAPIv2DiscoveryAPI{ + Service: api.Service{ + APIName: "openapi-v2-discovery", + Mounts: map[string]*chi.Mux{}, + }, + } + h.Service.Mounts[path] = h.Routes() + return h +} + +func (h *OpenAPIv2DiscoveryAPI) Register(app server.Server) error { + if err := h.Service.Register(app); err != nil { + return err + } + return nil +} + +func (h *OpenAPIv2DiscoveryAPI) Routes() *chi.Mux { + r := chi.NewRouter() + r.Get("/", h.getOpenAPIv2Spec) + return r +} + +// getOpenAPIv2Spec returns an OpenAPI v2 specification +func (h *OpenAPIv2DiscoveryAPI) getOpenAPIv2Spec(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("OpenAPIv2DiscoveryAPI: getOpenAPIv2Spec called") + + // Return an aggregated OpenAPI v2 spec for our custom metrics API + openAPISpec := map[string]interface{}{ + "swagger": "2.0", + "info": map[string]interface{}{ + "title": "CloudZero Custom Metrics API", + "version": "v1beta1", + }, + "host": "", + "basePath": "/", + "schemes": []string{"https"}, + "consumes": []string{openAPIApplicationJSON}, + "produces": []string{openAPIApplicationJSON}, + "paths": map[string]interface{}{ + "/apis/custom.metrics.k8s.io/v1beta1": map[string]interface{}{ + "get": map[string]interface{}{ + openAPIDescriptionKey: "List available custom metrics", + "operationId": "listCustomMetrics", + "produces": 
[]string{openAPIApplicationJSON}, + "responses": map[string]interface{}{ + "200": map[string]interface{}{ + openAPIDescriptionKey: "List of available metrics", + "schema": map[string]interface{}{ + openAPITypeKey: openAPIArrayValue, + "items": map[string]interface{}{ + openAPITypeKey: openAPIStringValue, + }, + }, + }, + }, + "tags": []string{"custom.metrics.k8s.io_v1beta1"}, + }, + }, + "/apis/custom.metrics.k8s.io/v1beta1/namespaces/{namespace}/pods/{metric}": map[string]interface{}{ + "get": map[string]interface{}{ + openAPIDescriptionKey: "Get custom metric for pods in namespace", + "operationId": "getCustomMetricForPods", + "produces": []string{openAPIApplicationJSON}, + "parameters": []map[string]interface{}{ + { + "name": "namespace", + "in": "path", + openAPIDescriptionKey: "object name and auth scope, such as for teams and projects", + "required": true, + openAPITypeKey: openAPIStringValue, + }, + { + "name": "metric", + "in": "path", + openAPIDescriptionKey: "the name of the metric", + "required": true, + openAPITypeKey: openAPIStringValue, + }, + }, + "responses": map[string]interface{}{ + "200": map[string]interface{}{ + openAPIDescriptionKey: "Metric value list", + "schema": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.metrics.pkg.apis.custom_metrics.v1beta1.MetricValueList", + }, + }, + }, + "tags": []string{"custom.metrics.k8s.io_v1beta1"}, + }, + }, + }, + "definitions": map[string]interface{}{ + "io.k8s.metrics.pkg.apis.custom_metrics.v1beta1.MetricValue": map[string]interface{}{ + openAPITypeKey: openAPIObjectValue, + "properties": map[string]interface{}{ + "describedObject": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.api.core.v1.ObjectReference", + }, + "metricName": map[string]interface{}{ + openAPITypeKey: openAPIStringValue, + }, + "timestamp": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.Time", + }, + "value": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.apimachinery.pkg.api.resource.Quantity", + }, + }, + }, + "io.k8s.metrics.pkg.apis.custom_metrics.v1beta1.MetricValueList": map[string]interface{}{ + openAPITypeKey: openAPIObjectValue, + "properties": map[string]interface{}{ + "items": map[string]interface{}{ + openAPITypeKey: openAPIArrayValue, + "items": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.metrics.pkg.apis.custom_metrics.v1beta1.MetricValue", + }, + }, + "metadata": map[string]interface{}{ + openAPIRefKey: "#/definitions/io.k8s.apimachinery.pkg.apis.meta.v1.ListMeta", + }, + }, + }, + }, + "tags": []map[string]interface{}{ + { + "name": "custom.metrics.k8s.io_v1beta1", + openAPIDescriptionKey: "Custom metrics API for autoscaling", + }, + }, + } + + // Support both JSON and protobuf as per Kubernetes API docs + acceptHeader := r.Header.Get("Accept") + if acceptHeader == "application/com.github.proto-openapi.spec.v2@v1.0+protobuf" { + w.Header().Set("Content-Type", "application/com.github.proto-openapi.spec.v2@v1.0+protobuf") + // For now, return JSON even for protobuf requests since we don't have protobuf encoding + } else { + w.Header().Set("Content-Type", openAPIApplicationJSON) + } + + if err := json.NewEncoder(w).Encode(openAPISpec); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode OpenAPI v2 spec") + request.Reply(r, w, "failed to encode OpenAPI v2 spec", http.StatusInternalServerError) + return + } +} + +// OpenAPIv3DiscoveryAPI provides OpenAPI v3 specification endpoint +type OpenAPIv3DiscoveryAPI struct { + api.Service 
+} + +// NewOpenAPIv3DiscoveryAPI creates a new OpenAPI v3 discovery handler that mounts to the specified path +func NewOpenAPIv3DiscoveryAPI(path string) *OpenAPIv3DiscoveryAPI { + h := &OpenAPIv3DiscoveryAPI{ + Service: api.Service{ + APIName: "openapi-v3-discovery", + Mounts: map[string]*chi.Mux{}, + }, + } + h.Service.Mounts[path] = h.Routes() + return h +} + +func (h *OpenAPIv3DiscoveryAPI) Register(app server.Server) error { + if err := h.Service.Register(app); err != nil { + return err + } + return nil +} + +func (h *OpenAPIv3DiscoveryAPI) Routes() *chi.Mux { + r := chi.NewRouter() + // This serves OpenAPI v3 discovery at /openapi/v3 + r.Get("/", h.getOpenAPIv3Discovery) + return r +} + +// getOpenAPIv3Discovery returns OpenAPI v3 discovery information +func (h *OpenAPIv3DiscoveryAPI) getOpenAPIv3Discovery(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log.Ctx(ctx).Info().Str("path", r.URL.Path).Msg("OpenAPIv3DiscoveryAPI: getOpenAPIv3Discovery called") + + // Return OpenAPI v3 discovery with available API groups + discovery := map[string]interface{}{ + "paths": map[string]interface{}{ + "apis/custom.metrics.k8s.io/v1beta1": map[string]interface{}{ + "serverRelativeURL": "/openapi/v3/apis/custom.metrics.k8s.io/v1beta1", + }, + }, + } + + w.Header().Set("Content-Type", openAPIApplicationJSON) + if err := json.NewEncoder(w).Encode(discovery); err != nil { + log.Ctx(ctx).Err(err).Msg("failed to encode OpenAPI v3 discovery") + request.Reply(r, w, "failed to encode OpenAPI v3 discovery", http.StatusInternalServerError) + return + } +} diff --git a/app/handlers/prom_metrics.go b/app/handlers/prom_metrics.go index 57e9c3de..e9b74bcf 100644 --- a/app/handlers/prom_metrics.go +++ b/app/handlers/prom_metrics.go @@ -37,5 +37,6 @@ func (a *PromMetricsAPI) Register(app server.Server) error { func (a *PromMetricsAPI) Routes() *chi.Mux { r := chi.NewRouter() r.Get("/", promhttp.Handler().ServeHTTP) + return r } diff --git a/app/handlers/remote_write_test.go b/app/handlers/remote_write_test.go index 279f545a..ee202894 100644 --- a/app/handlers/remote_write_test.go +++ b/app/handlers/remote_write_test.go @@ -57,6 +57,10 @@ func TestRemoteWriteMethods(t *testing.T) { storage.EXPECT().Put(gomock.Any(), gomock.Any()).Return(nil) storage.EXPECT().Flush().Return(nil) + // Add the Pending() expectation for the shipping progress metric + storage.EXPECT().Pending().Return(0).AnyTimes() + // Add the ElapsedTime() expectation for the time-based shipping progress metric + storage.EXPECT().ElapsedTime().Return(int64(10000)).AnyTimes() payload, _, _, err := testdata.BuildWriteRequest(testdata.WriteRequestFixture.Timeseries, nil, nil, nil, nil, "snappy") assert.NoError(t, err) diff --git a/app/http/middleware/middleware.go b/app/http/middleware/middleware.go index a5a6855f..4f9bbf1a 100644 --- a/app/http/middleware/middleware.go +++ b/app/http/middleware/middleware.go @@ -6,10 +6,10 @@ package middleware import ( "net/http" + "sync" "time" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/rs/zerolog" "github.com/rs/zerolog/log" @@ -25,24 +25,53 @@ func (r *statusRecorder) WriteHeader(code int) { r.ResponseWriter.WriteHeader(code) } -// PromHTTPMiddleware instruments HTTP requests with Prometheus metrics. 
-func PromHTTPMiddleware(next http.Handler) http.Handler { - return promhttp.InstrumentHandlerDuration( - promauto.NewHistogramVec( +var ( + httpRequestDuration *prometheus.HistogramVec + httpRequestsTotal *prometheus.CounterVec + metricsOnce sync.Once +) + +// getPrometheusMetrics returns initialized prometheus metrics, creating them only once +func getPrometheusMetrics() (*prometheus.HistogramVec, *prometheus.CounterVec) { + metricsOnce.Do(func() { + httpRequestDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "http_request_duration_seconds", Help: "Duration of HTTP requests in seconds.", }, []string{"code", "method"}, - ), + ) + httpRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "http_requests_total", + Help: "Count of all HTTP requests processed, labeled by route, method and status code.", + }, + []string{"code", "method"}, + ) + // Register metrics with error handling to avoid panics on duplicate registration + if err := prometheus.Register(httpRequestDuration); err != nil { + if _, ok := err.(prometheus.AlreadyRegisteredError); !ok { + // Only panic if it's not an AlreadyRegisteredError + panic(err) + } + } + if err := prometheus.Register(httpRequestsTotal); err != nil { + if _, ok := err.(prometheus.AlreadyRegisteredError); !ok { + // Only panic if it's not an AlreadyRegisteredError + panic(err) + } + } + }) + return httpRequestDuration, httpRequestsTotal +} + +// PromHTTPMiddleware instruments HTTP requests with Prometheus metrics. +func PromHTTPMiddleware(next http.Handler) http.Handler { + duration, counter := getPrometheusMetrics() + return promhttp.InstrumentHandlerDuration( + duration, promhttp.InstrumentHandlerCounter( - promauto.NewCounterVec( - prometheus.CounterOpts{ - Name: "http_requests_total", - Help: "Count of all HTTP requests processed, labeled by route, method and status code.", - }, - []string{"code", "method"}, - ), + counter, next, ), ) diff --git a/app/logging/store_sink_test.go b/app/logging/store_sink_test.go index 5dddd03d..5e4a846a 100644 --- a/app/logging/store_sink_test.go +++ b/app/logging/store_sink_test.go @@ -88,6 +88,11 @@ func (ms *mockStore) GetLastError() error { return ms.lastError } +func (ms *mockStore) ElapsedTime() int64 { + // Return a mock elapsed time in milliseconds + return 1000 +} + // TestUnit_Logging_StoreWriter_Write_Basic tests basic functionality of a single write. func TestUnit_Logging_StoreWriter_Write_Basic(t *testing.T) { store := newMockStore() diff --git a/app/storage/disk/disk.go b/app/storage/disk/disk.go index c1dbd648..6d9a2f23 100644 --- a/app/storage/disk/disk.go +++ b/app/storage/disk/disk.go @@ -275,6 +275,14 @@ func (d *DiskStore) Pending() int { return d.rowCount } +// ElapsedTime returns the duration in milliseconds since the current buffer was started +func (d *DiskStore) ElapsedTime() int64 { + d.mu.Lock() + defer d.mu.Unlock() + + return timestamp.Milli() - d.startTime +} + func (d *DiskStore) GetFiles(paths ...string) ([]string, error) { // set to root path allPaths := []string{d.dirPath} diff --git a/app/types/mocks/store_mock.go b/app/types/mocks/store_mock.go index 493128b7..940c9539 100644 --- a/app/types/mocks/store_mock.go +++ b/app/types/mocks/store_mock.go @@ -58,6 +58,20 @@ func (mr *MockStoreMockRecorder) All(arg0, arg1 any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "All", reflect.TypeOf((*MockStore)(nil).All), arg0, arg1) } +// ElapsedTime mocks base method. 
+func (m *MockStore) ElapsedTime() int64 { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ElapsedTime") + ret0, _ := ret[0].(int64) + return ret0 +} + +// ElapsedTime indicates an expected call of ElapsedTime. +func (mr *MockStoreMockRecorder) ElapsedTime() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ElapsedTime", reflect.TypeOf((*MockStore)(nil).ElapsedTime)) +} + // Find mocks base method. func (m *MockStore) Find(ctx context.Context, filterName, filterExtension string) ([]string, error) { m.ctrl.T.Helper() diff --git a/app/types/store.go b/app/types/store.go index 903f150d..1591e2f7 100644 --- a/app/types/store.go +++ b/app/types/store.go @@ -27,6 +27,10 @@ type WritableStore interface { // Pending returns the number of rows currently buffered and not yet written to disk. // This can be used to monitor when a flush may be needed. Pending() int + + // ElapsedTime returns the duration in milliseconds since the current buffer was started. + // This is used for time-based metric calculations. + ElapsedTime() int64 } // ReadableStore is for performing read operations against the store diff --git a/go.mod b/go.mod index a266fd65..93951e01 100644 --- a/go.mod +++ b/go.mod @@ -98,6 +98,7 @@ require ( gorm.io/driver/sqlite v1.6.0 gorm.io/gorm v1.30.0 helm.sh/helm/v3 v3.18.4 + k8s.io/metrics v0.33.2 sigs.k8s.io/gateway-api v1.3.0 ) diff --git a/go.sum b/go.sum index 8a8b54df..2536cd52 100644 --- a/go.sum +++ b/go.sum @@ -857,6 +857,8 @@ k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUy k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= k8s.io/kubectl v0.33.2 h1:7XKZ6DYCklu5MZQzJe+CkCjoGZwD1wWl7t/FxzhMz7Y= k8s.io/kubectl v0.33.2/go.mod h1:8rC67FB8tVTYraovAGNi/idWIK90z2CHFNMmGJZJ3KI= +k8s.io/metrics v0.33.2 h1:gNCBmtnUMDMCRg9Ly5ehxP3OdKISMsOnh1vzk01iCgE= +k8s.io/metrics v0.33.2/go.mod h1:yxoAosKGRsZisv3BGekC5W6T1J8XSV+PoUEevACRv7c= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= mvdan.cc/gofumpt v0.8.0 h1:nZUCeC2ViFaerTcYKstMmfysj6uhQrA2vJe+2vwGU6k= diff --git a/helm/docs/AUTOSCALING.md b/helm/docs/AUTOSCALING.md new file mode 100644 index 00000000..2befd285 --- /dev/null +++ b/helm/docs/AUTOSCALING.md @@ -0,0 +1,246 @@ +# Autoscaling the CloudZero Agent + +## Introduction + +The CloudZero Agent's autoscaling feature automatically adjusts the number of aggregator pods based on the volume of cost metrics being processed. This ensures optimal resource utilization while maintaining performance on clusters of any size. + +### What is Autoscaling? + +Autoscaling automatically increases or decreases the number of replicas based on observed metrics. Traditional autoscaling relies on CPU or memory usage, but the CloudZero Agent uses a more sophisticated approach based on business logic—specifically, how close the system is to reaching its data shipping capacity. + +### Why Custom Metrics? + +The CloudZero Agent processes cost metrics from Kubernetes clusters and ships them to the CloudZero platform. 
The volume of this data varies significantly based on: + +- **Cluster size**: Larger clusters generate more metrics +- **Resource churn**: Frequent pod creation/deletion increases metric volume +- **Time of day**: Peak business hours often correlate with higher metric volume +- **Deployment patterns**: CI/CD activity and autoscaling events generate metric spikes + +### How It Works + +The aggregator component maintains an in-memory buffer of cost metrics before flushing them to disk and shipping to CloudZero. The custom metric `czo_cost_metrics_shipping_progress` tracks how full this buffer is relative to the configured thresholds. + +When the buffer fills up (indicating high metric volume), the HPA automatically scales up additional aggregator pods to handle the increased load. When the buffer is consistently low (indicating normal or low volume), it scales down to conserve resources. + +This approach provides: + +- **Proactive scaling**: Scale before performance degrades +- **Cost efficiency**: Use only the resources needed for current load +- **High availability**: Prevent data loss during metric volume spikes +- **Predictable performance**: Maintain consistent shipping rates regardless of load + +## Implementation Details + +### Custom Metrics API + +The chart automatically registers the custom metrics API with Kubernetes by creating an `APIService` resource. This allows the HPA controller to discover and query the custom metrics endpoint exposed by the aggregator pods. + +### The `czo_cost_metrics_shipping_progress` Metric + +The autoscaling system is built around a single custom metric that represents how close the system is to its in-memory buffer capacity. This metric is calculated based on two flush triggers that occur during normal operation: + +1. **Record Count**: Flush when `currentPending >= maxRecords` +2. **Time Interval**: Flush when `elapsedTime >= costMaxInterval` + +#### Calculation Formula + +The metric is calculated as: + +$$\frac{\text{Records Pending}}{\frac{\text{Elapsed Time}}{\text{Max Interval}} \times \text{Max Records}}$$ + +This formula normalizes the current buffer state against the expected capacity over time, providing a percentage-based value where: + +- **0.0**: Buffer is empty +- **1.0**: Buffer is at expected capacity for the elapsed time +- **> 1.0**: Buffer is over capacity and needs scaling + +#### Example Values + +Based on the default `maxRecords = 1,500,000` and `costMaxInterval = 30m`: + +- **5 minutes elapsed, 250,000 records**: `progress = 250,000 / (5/30 * 1,500,000) = 1.0` (100% of expected rate) +- **10 minutes elapsed, 300,000 records**: `progress = 300,000 / (10/30 * 1,500,000) = 0.6` (60% of expected rate - could scale down) +- **15 minutes elapsed, 900,000 records**: `progress = 900,000 / (15/30 * 1,500,000) = 1.2` (120% of expected rate - should scale up) +- **30 minutes elapsed, 1,500,000 records**: `progress = 1,500,000 / (30/30 * 1,500,000) = 1.0` (100% at time limit) + +#### Relationship to System Behavior + +The metric directly correlates with the system's disk flushing behavior: + +1. **Normal Operation**: Metrics accumulate in memory buffer +2. **Flush Trigger**: When `currentPending >= maxRecords` or time interval exceeded, system flushes to disk +3. **Post-Flush**: `currentPending` resets to 0, metric drops to 0.0 +4. **Cycle Repeats**: Buffer fills again as new metrics arrive + +#### HPA Integration + +By default, the HPA targets `"900m"` (0.9 or 90% of capacity). 
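+
+As a rough illustration of how the metric and the default target interact, the sketch below recomputes the formula above in Go. It is illustrative only, not the collector's actual implementation, and the names (`shippingProgress`, `pendingRecords`, and so on) are hypothetical:
+
+```go
+package main
+
+import (
+	"fmt"
+	"time"
+)
+
+// shippingProgress mirrors the formula above: pending records divided by the
+// number of records we would expect to have buffered after `elapsed` time,
+// given the configured maxRecords and costMaxInterval.
+func shippingProgress(pendingRecords int, elapsed, maxInterval time.Duration, maxRecords int) float64 {
+	expected := elapsed.Seconds() / maxInterval.Seconds() * float64(maxRecords)
+	if expected <= 0 {
+		return 0
+	}
+	return float64(pendingRecords) / expected
+}
+
+func main() {
+	// 900,000 records after 15 minutes, against the defaults of 1,500,000
+	// records per 30 minutes, gives 1.2 -- above the 0.9 ("900m") target,
+	// so the HPA would scale up.
+	p := shippingProgress(900_000, 15*time.Minute, 30*time.Minute, 1_500_000)
+	fmt.Printf("progress=%.2f scaleUp=%v\n", p, p > 0.9)
+}
+```
+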
+The target value of 90% allows the HPA to scale proactively, before the buffer becomes full, preventing data loss or performance degradation:
+
+- **Below 0.9**: May scale down to as few as `minReplicas`
+- **Above 0.9**: May scale up to as many as `maxReplicas`
+
+## Configuration
+
+### Enabling Autoscaling
+
+To enable HPA for the aggregator component:
+
+```yaml
+components:
+  aggregator:
+    autoscale: true
+```
+
+This automatically creates:
+
+- **HPA Resource**: Configured to scale based on `czo_cost_metrics_shipping_progress`
+- **APIService Registration**: Registers `v1beta1.custom.metrics.k8s.io` with Kubernetes
+- **RBAC Permissions**: Allows the HPA controller to access custom metrics
+
+### Scaling Parameters
+
+Detailed configuration is available in the (non-API-stable) `aggregator.scaling` section:
+
+```yaml
+aggregator:
+  scaling:
+    minReplicas: 1
+    maxReplicas: 10
+    targetValue: "900m" # Target 90% of capacity
+    behavior:
+      scaleUp:
+        stabilizationWindowSeconds: 300
+        policies:
+          - type: Percent
+            value: 100
+            periodSeconds: 60
+          - type: Pods
+            value: 2
+            periodSeconds: 60
+        selectPolicy: Max
+      scaleDown:
+        stabilizationWindowSeconds: 300
+        policies:
+          - type: Percent
+            value: 50
+            periodSeconds: 60
+          - type: Pods
+            value: 1
+            periodSeconds: 60
+        selectPolicy: Min
+```
+
+### Target Value Format
+
+The `targetValue` supports both percentage and resource quantity formats:
+
+- **Percentage**: `"90%"` (90% of the maximum records threshold)
+- **Resource Quantity**: `"900m"` (0.9 as a decimal)
+
+Both formats are equivalent and represent the same scaling threshold.
+
+## Troubleshooting
+
+### HPA Shows "Unknown" Metrics
+
+**Symptoms**: HPA status shows metrics as "unknown" or "0"
+
+**Causes**:
+
+- Collector pods not running
+- Custom metrics API not accessible
+- Network policies blocking API server access to the collector pods
+
+**Solutions**:
+
+1. Verify collector pods are running:
+
+   ```bash
+   kubectl get pods -l app.kubernetes.io/name=cloudzero-collector
+   ```
+
+2. Test custom metrics API directly:
+
+   ```bash
+   kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/"
+   ```
+
+3. Check collector logs for API errors:
+
+   ```bash
+   kubectl logs -l app.kubernetes.io/name=cloudzero-collector
+   ```
+
+### HPA Not Scaling
+
+**Symptoms**: Metric values are high but HPA doesn't scale
+
+**Causes**:
+
+- Target value too high
+- `maxReplicas` already reached
+- Resource constraints
+
+**Solutions**:
+
+1. Check HPA configuration:
+
+   ```bash
+   kubectl describe hpa cloudzero-aggregator
+   ```
+
+2. Verify resource quotas:
+
+   ```bash
+   kubectl describe resourcequota
+   ```
+
+3. Review HPA events:
+
+   ```bash
+   kubectl get events --field-selector involvedObject.name=cloudzero-aggregator
+   ```
+
+### Metric Values Seem Wrong
+
+**Symptoms**: Metric values don't match expected shipping progress
+
+**Causes**:
+
+- Configuration mismatch between aggregator and collector
+- Metric calculation errors
+- Time synchronization issues
+
+**Solutions**:
+
+1. Compare aggregator configuration:
+
+   ```bash
+   kubectl get configmap cloudzero-aggregator-config -o yaml
+   ```
+
+2.
Check Prometheus metrics directly: + ```bash + kubectl port-forward svc/cloudzero-collector 8080:8080 + curl localhost:8080/metrics | grep czo_cost_metrics_shipping_progress + ``` + +## Security + +### RBAC Requirements + +The HPA requires specific permissions to access custom metrics: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: hpa-custom-metrics-reader +rules: + - apiGroups: ["custom.metrics.k8s.io"] + resources: ["*"] + verbs: ["get", "list"] +``` + +### Network Policies + +Ensure network policies allow HPA to communicate with collector pods on the custom metrics API port. diff --git a/helm/templates/_cm_helpers.tpl b/helm/templates/_cm_helpers.tpl index 1abd9792..6ab2e0aa 100644 --- a/helm/templates/_cm_helpers.tpl +++ b/helm/templates/_cm_helpers.tpl @@ -55,9 +55,20 @@ metrics: {{- include "cloudzero-agent.generateMetricFilters" (dict "name" "cost_labels" "filters" (include "cloudzero-agent.defaults" . | fromYaml).metricFilters.cost.labels) | nindent 2 }} {{- include "cloudzero-agent.generateMetricFilters" (dict "name" "observability" "filters" (include "cloudzero-agent.defaults" . | fromYaml).metricFilters.observability.name) | nindent 2 }} {{- include "cloudzero-agent.generateMetricFilters" (dict "name" "observability_labels" "filters" (include "cloudzero-agent.defaults" . | fromYaml).metricFilters.observability.labels) | nindent 2 }} +{{- if .Values.components.aggregator.autoscale }} +certificate: + key: {{ .Values.aggregator.collector.tls.mountPath }}/tls.key + cert: {{ .Values.aggregator.collector.tls.mountPath }}/tls.crt +{{- end }} server: + {{- if .Values.components.aggregator.autoscale }} + mode: dual + port: {{ .Values.aggregator.collector.port }} + tls_port: {{ .Values.aggregator.collector.tls.port | default 8443 }} + {{- else }} mode: http port: {{ .Values.aggregator.collector.port }} + {{- end }} profiling: {{ .Values.aggregator.profiling }} reconnect_frequency: {{ .Values.aggregator.reconnectFrequency }} logging: diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index f5bae9dd..c50aa9ee 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -634,6 +634,23 @@ Name for the secret holding TLS certificates {{- .Values.insightsController.tls.secret.name | default (printf "%s-tls" (include "cloudzero-agent.insightsController.server.webhookFullname" .)) }} {{- end }} +{{/* +Name for the secret holding aggregator TLS certificates +*/}} +{{- define "cloudzero-agent.aggregator.tlsSecretName" -}} +{{- .Values.aggregator.collector.tls.secret.name | default (printf "%s-tls" (include "cloudzero-agent.aggregator.name" .)) }} +{{- end }} + +{{/* +CA Bundle for aggregator custom metrics API +*/}} +{{- define "cloudzero-agent.aggregator.caBundle" -}} +{{- $secret := lookup "v1" "Secret" .Release.Namespace (include "cloudzero-agent.aggregator.tlsSecretName" .) 
-}} +{{- if $secret -}} +{{- index $secret.data "tls.crt" -}} +{{- end -}} +{{- end }} + {{/* Volume mount for the API key */}} diff --git a/helm/templates/agent-clusterrole.yaml b/helm/templates/agent-clusterrole.yaml index 31208c45..a6c7aaa8 100644 --- a/helm/templates/agent-clusterrole.yaml +++ b/helm/templates/agent-clusterrole.yaml @@ -79,4 +79,16 @@ rules: - "/metrics" verbs: - get + {{- if and .Values.aggregator.collector.tls.secret.create .Values.components.aggregator.autoscale }} + - apiGroups: + - "" + resources: + - "secrets" + verbs: + - create + - get + - list + - patch + - update + {{- end }} {{- end }} diff --git a/helm/templates/aggregator-deploy.yaml b/helm/templates/aggregator-deploy.yaml index c78f5677..9cf7bc54 100644 --- a/helm/templates/aggregator-deploy.yaml +++ b/helm/templates/aggregator-deploy.yaml @@ -37,10 +37,18 @@ spec: ports: - name: port-collector containerPort: {{ .Values.aggregator.collector.port }} + {{- if .Values.components.aggregator.autoscale }} + - name: port-coll-tls + containerPort: {{ .Values.aggregator.collector.tls.port | default 8443 }} + {{- end }} command: ["/app/cloudzero-collector", "-config", "{{ .Values.aggregator.mountRoot }}/config/config.yml"] env: - name: SERVER_PORT value: "{{ .Values.aggregator.collector.port }}" + {{- if .Values.components.aggregator.autoscale }} + - name: SERVER_TLS_PORT + value: "{{ .Values.aggregator.collector.tls.port | default 8443 }}" + {{- end }} volumeMounts: {{- include "cloudzero-agent.apiKeyVolumeMount" . | nindent 12 }} - name: aggregator-config-volume @@ -48,10 +56,16 @@ spec: readOnly: true - name: aggregator-persistent-storage mountPath: {{ .Values.aggregator.mountRoot }}/data + {{- if .Values.components.aggregator.autoscale }} + - name: aggregator-tls-certs + mountPath: {{ .Values.aggregator.collector.tls.mountPath }} + readOnly: true + {{- end }} readinessProbe: httpGet: path: /healthz port: {{ .Values.aggregator.collector.port }} + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 @@ -59,6 +73,7 @@ spec: httpGet: path: /healthz port: {{ .Values.aggregator.collector.port }} + scheme: HTTP initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 @@ -84,14 +99,14 @@ spec: readinessProbe: httpGet: path: /healthz - port: {{ .Values.aggregator.collector.port }} + port: {{ .Values.aggregator.shipper.port }} initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 livenessProbe: httpGet: path: /healthz - port: {{ .Values.aggregator.collector.port }} + port: {{ .Values.aggregator.shipper.port }} initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 @@ -149,6 +164,11 @@ spec: - name: aggregator-config-volume configMap: name: {{ include "cloudzero-agent.aggregator.name" . }} + {{- if .Values.components.aggregator.autoscale }} + - name: aggregator-tls-certs + secret: + secretName: {{ include "cloudzero-agent.aggregator.tlsSecretName" . }} + {{- end }} - name: aggregator-persistent-storage {{- if .Values.aggregator.database.emptyDir.enabled }} emptyDir: diff --git a/helm/templates/aggregator-hpa.yaml b/helm/templates/aggregator-hpa.yaml new file mode 100644 index 00000000..a8ddbd62 --- /dev/null +++ b/helm/templates/aggregator-hpa.yaml @@ -0,0 +1,50 @@ +{{- if .Values.components.aggregator.autoscale }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . 
}} + namespace: {{ .Release.Namespace }} + {{- include "cloudzero-agent.generateAnnotations" (merge + (deepCopy .Values.defaults.annotations) + .Values.aggregator.scaling.annotations + ) | nindent 2 }} + {{- include "cloudzero-agent.generateLabels" (dict + "globals" . + "labels" (merge (include "cloudzero-agent.aggregator.matchLabels" . | fromYaml) .Values.commonMetaLabels) + "component" "aggregator" + ) | nindent 2 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "cloudzero-agent.aggregator.name" . }} + minReplicas: {{ .Values.aggregator.scaling.minReplicas }} + maxReplicas: {{ .Values.aggregator.scaling.maxReplicas }} + metrics: + - type: Pods + pods: + metric: + name: czo_cost_metrics_shipping_progress + target: + type: AverageValue + averageValue: {{ .Values.aggregator.scaling.targetValue | default "900m" }} + behavior: + scaleUp: + stabilizationWindowSeconds: {{ .Values.aggregator.scaling.behavior.scaleUp.stabilizationWindowSeconds | default 300 }} + policies: + {{- range .Values.aggregator.scaling.behavior.scaleUp.policies }} + - type: {{ .type }} + value: {{ .value }} + periodSeconds: {{ .periodSeconds }} + {{- end }} + selectPolicy: {{ .Values.aggregator.scaling.behavior.scaleUp.selectPolicy | default "Max" }} + scaleDown: + stabilizationWindowSeconds: {{ .Values.aggregator.scaling.behavior.scaleDown.stabilizationWindowSeconds | default 300 }} + policies: + {{- range .Values.aggregator.scaling.behavior.scaleDown.policies }} + - type: {{ .type }} + value: {{ .value }} + periodSeconds: {{ .periodSeconds }} + {{- end }} + selectPolicy: {{ .Values.aggregator.scaling.behavior.scaleDown.selectPolicy | default "Min" }} +{{- end }} diff --git a/helm/templates/aggregator-init-cert-job.yaml b/helm/templates/aggregator-init-cert-job.yaml new file mode 100644 index 00000000..a7edb426 --- /dev/null +++ b/helm/templates/aggregator-init-cert-job.yaml @@ -0,0 +1,74 @@ +{{- if and .Values.aggregator.collector.tls.secret.create .Values.components.aggregator.autoscale }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-init-cert + namespace: {{ .Release.Namespace }} + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 4 }} + {{- include "cloudzero-agent.generateAnnotations" .Values.defaults.annotations | nindent 2 }} +spec: + template: + metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-init-cert + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + serviceAccountName: {{ template "cloudzero-agent.serviceAccountName" . }} + {{- include "cloudzero-agent.server.imagePullSecrets" . | nindent 6 }} + containers: + - name: create-cert + {{- include "cloudzero-agent.generateImage" (dict "defaults" .Values.defaults.image "image" .Values.components.kubectl.image "compat" .Values.initCertJob.image) | nindent 8 }} + command: ["/bin/bash", "-c"] + workingDir: /var/tmp + args: + - | + #!/bin/bash + set -e + + SECRET_NAME="{{ include "cloudzero-agent.aggregator.tlsSecretName" . }}" + SERVICE_NAME="{{ include "cloudzero-agent.aggregator.name" . 
}}" + NAMESPACE="{{ .Release.Namespace }}" + + EXISTING_TLS_CRT=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.crt}' 2>/dev/null || echo "") + EXISTING_TLS_KEY=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.key}' 2>/dev/null || echo "") + + GENERATE_CERTIFICATE=false + + if [[ -n "$EXISTING_TLS_CRT" && -n "$EXISTING_TLS_KEY" ]]; then + # Check if the certificate is valid for our service + SAN=$(echo "$EXISTING_TLS_CRT" | base64 -d | openssl x509 -text -noout | grep DNS | sed 's/.*DNS://' || echo "") + if [[ "$SAN" != "$SERVICE_NAME.$NAMESPACE.svc" ]]; then + echo "The SANs in the certificate do not match the service name." + GENERATE_CERTIFICATE=true + fi + else + echo "TLS Secret is missing or incomplete." + GENERATE_CERTIFICATE=true + fi + + if [[ $GENERATE_CERTIFICATE == "true" ]]; then + echo "Generating new TLS certificate for $SERVICE_NAME.$NAMESPACE.svc" + + # Generate self-signed certificate and private key + openssl req -x509 -newkey rsa:2048 -keyout tls.key -out tls.crt -days 36500 -nodes \ + -subj "/CN=$SERVICE_NAME.$NAMESPACE.svc" \ + -addext "subjectAltName = DNS:$SERVICE_NAME.$NAMESPACE.svc" + + # Base64 encode the certificate and key + TLS_CRT=$(cat tls.crt | base64 | tr -d '\n') + TLS_KEY=$(cat tls.key | base64 | tr -d '\n') + + # Create or update the TLS Secret + kubectl create secret tls $SECRET_NAME \ + --cert=tls.crt \ + --key=tls.key \ + --namespace=$NAMESPACE \ + --dry-run=client -o yaml | kubectl apply -f - + + echo "TLS certificate created/updated successfully" + else + echo "Valid certificate already exists for $SERVICE_NAME.$NAMESPACE.svc" + fi +{{- end }} \ No newline at end of file diff --git a/helm/templates/aggregator-service.yaml b/helm/templates/aggregator-service.yaml index bcf89c84..d76a2a6d 100644 --- a/helm/templates/aggregator-service.yaml +++ b/helm/templates/aggregator-service.yaml @@ -9,7 +9,14 @@ spec: selector: {{- include "cloudzero-agent.aggregator.matchLabels" . | nindent 4 }} ports: - - protocol: TCP + - name: http + protocol: TCP port: 80 targetPort: {{ .Values.aggregator.collector.port }} + {{- if .Values.components.aggregator.autoscale }} + - name: https + protocol: TCP + port: 443 + targetPort: {{ .Values.aggregator.collector.tls.port | default 8443 }} + {{- end }} type: ClusterIP diff --git a/helm/templates/custom-metrics-apiservice.yaml b/helm/templates/custom-metrics-apiservice.yaml new file mode 100644 index 00000000..c5535c79 --- /dev/null +++ b/helm/templates/custom-metrics-apiservice.yaml @@ -0,0 +1,28 @@ +{{- if .Values.components.aggregator.autoscale }} +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io + namespace: {{ .Release.Namespace }} + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 4 }} + {{- include "cloudzero-agent.generateAnnotations" .Values.defaults.annotations | nindent 2 }} +spec: + service: + name: {{ include "cloudzero-agent.aggregator.name" . }} + namespace: {{ .Release.Namespace }} +{{- if .Values.components.aggregator.autoscale }} + port: 443 +{{- else }} + port: 80 +{{- end }} + group: custom.metrics.k8s.io + version: v1beta1 + groupPriorityMinimum: 100 + versionPriority: 100 +{{- if .Values.components.aggregator.autoscale }} + caBundle: {{ include "cloudzero-agent.aggregator.caBundle" . 
}} +{{- else }} + insecureSkipTLSVerify: true +{{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/custom-metrics-rbac.yaml b/helm/templates/custom-metrics-rbac.yaml new file mode 100644 index 00000000..2c64401d --- /dev/null +++ b/helm/templates/custom-metrics-rbac.yaml @@ -0,0 +1,46 @@ +{{- if .Values.components.aggregator.autoscale }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-custom-metrics-reader + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 4 }} + {{- include "cloudzero-agent.generateAnnotations" .Values.defaults.annotations | nindent 2 }} +rules: +- apiGroups: ["custom.metrics.k8s.io"] + resources: ["*"] + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-custom-metrics-reader + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 4 }} + {{- include "cloudzero-agent.generateAnnotations" .Values.defaults.annotations | nindent 2 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "cloudzero-agent.aggregator.name" . }}-custom-metrics-reader +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: system:controller:horizontal-pod-autoscaler +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "cloudzero-agent.aggregator.name" . }}-custom-metrics-reader-hpa + labels: + {{- include "cloudzero-agent.aggregator.labels" . | nindent 4 }} + {{- include "cloudzero-agent.generateAnnotations" .Values.defaults.annotations | nindent 2 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "cloudzero-agent.aggregator.name" . }}-custom-metrics-reader +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system +{{- end }} \ No newline at end of file diff --git a/helm/templates/init-cert-clusterrole.yaml b/helm/templates/init-cert-clusterrole.yaml index 1b33f65e..89c53c37 100644 --- a/helm/templates/init-cert-clusterrole.yaml +++ b/helm/templates/init-cert-clusterrole.yaml @@ -23,10 +23,27 @@ rules: - "secrets" resourceNames: - {{ include "cloudzero-agent.tlsSecretName" . }} + {{- if and .Values.aggregator.collector.tls.secret.create .Values.components.aggregator.autoscale }} + - {{ include "cloudzero-agent.aggregator.tlsSecretName" . 
}} + {{- end }} verbs: - get - list - patch + - create + - update + {{- if and .Values.aggregator.collector.tls.secret.create .Values.components.aggregator.autoscale }} + - apiGroups: + - "" + resources: + - "secrets" + verbs: + - create + - get + - list + - patch + - update + {{- end }} - apiGroups: - "admissionregistration.k8s.io" resources: diff --git a/helm/values.schema.json b/helm/values.schema.json index 8c80ecb9..f1311263 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -233,6 +233,261 @@ }, "type": "object" }, + "io.k8s.api.autoscaling.v2.ContainerResourceMetricSource": { + "additionalProperties": false, + "properties": { + "container": { + "type": "string" + }, + "name": { + "type": "string" + }, + "target": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricTarget" + } + }, + "required": ["name", "target", "container"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.CrossVersionObjectReference": { + "additionalProperties": false, + "properties": { + "apiVersion": { + "type": "string" + }, + "kind": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["kind", "name"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.ExternalMetricSource": { + "additionalProperties": false, + "properties": { + "metric": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricIdentifier" + }, + "target": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricTarget" + } + }, + "required": ["metric", "target"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.HPAScalingPolicy": { + "additionalProperties": false, + "properties": { + "periodSeconds": { + "format": "int32", + "type": "integer" + }, + "type": { + "type": "string" + }, + "value": { + "format": "int32", + "type": "integer" + } + }, + "required": ["type", "value", "periodSeconds"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.HPAScalingRules": { + "additionalProperties": false, + "properties": { + "policies": { + "items": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HPAScalingPolicy" + }, + "type": "array", + "x-kubernetes-list-type": "atomic" + }, + "selectPolicy": { + "type": "string" + }, + "stabilizationWindowSeconds": { + "format": "int32", + "type": "integer" + }, + "tolerance": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.api.resource.Quantity" + } + }, + "type": "object" + }, + "io.k8s.api.autoscaling.v2.HorizontalPodAutoscaler": { + "additionalProperties": false, + "properties": { + "apiVersion": { + "type": "string" + }, + "kind": { + "enum": ["HorizontalPodAutoscaler"], + "type": "string" + }, + "metadata": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta" + }, + "spec": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec" + }, + "status": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerStatus" + } + }, + "type": "object", + "x-kubernetes-group-version-kind": [ + { + "group": "autoscaling", + "kind": "HorizontalPodAutoscaler", + "version": "v2" + } + ] + }, + "io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerBehavior": { + "additionalProperties": false, + "properties": { + "scaleDown": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HPAScalingRules" + }, + "scaleUp": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HPAScalingRules" + } + }, + "type": "object" + }, + "io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec": { + "additionalProperties": false, + "properties": { + "behavior": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerBehavior" + }, + "maxReplicas": { + 
"format": "int32", + "type": "integer" + }, + "metrics": { + "items": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricSpec" + }, + "type": "array", + "x-kubernetes-list-type": "atomic" + }, + "minReplicas": { + "format": "int32", + "type": "integer" + }, + "scaleTargetRef": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.CrossVersionObjectReference" + } + }, + "required": ["scaleTargetRef", "maxReplicas"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.MetricIdentifier": { + "additionalProperties": false, + "properties": { + "name": { + "type": "string" + }, + "selector": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.LabelSelector" + } + }, + "required": ["name"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.MetricSpec": { + "additionalProperties": false, + "properties": { + "containerResource": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.ContainerResourceMetricSource" + }, + "external": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.ExternalMetricSource" + }, + "object": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.ObjectMetricSource" + }, + "pods": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.PodsMetricSource" + }, + "resource": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.ResourceMetricSource" + }, + "type": { + "type": "string" + } + }, + "required": ["type"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.MetricTarget": { + "additionalProperties": false, + "properties": { + "averageUtilization": { + "format": "int32", + "type": "integer" + }, + "averageValue": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.api.resource.Quantity" + }, + "type": { + "type": "string" + }, + "value": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.api.resource.Quantity" + } + }, + "required": ["type"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.ObjectMetricSource": { + "additionalProperties": false, + "properties": { + "describedObject": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.CrossVersionObjectReference" + }, + "metric": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricIdentifier" + }, + "target": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricTarget" + } + }, + "required": ["describedObject", "target", "metric"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.PodsMetricSource": { + "additionalProperties": false, + "properties": { + "metric": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricIdentifier" + }, + "target": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricTarget" + } + }, + "required": ["metric", "target"], + "type": "object" + }, + "io.k8s.api.autoscaling.v2.ResourceMetricSource": { + "additionalProperties": false, + "properties": { + "name": { + "type": "string" + }, + "target": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.MetricTarget" + } + }, + "required": ["name", "target"], + "type": "object" + }, "io.k8s.api.core.v1.AWSElasticBlockStoreVolumeSource": { "additionalProperties": false, "properties": { @@ -3210,6 +3465,30 @@ }, "resources": { "$ref": "#/$defs/io.k8s.api.core.v1.ResourceRequirements" + }, + "tls": { + "additionalProperties": false, + "properties": { + "mountPath": { + "default": "/etc/certs", + "type": "string" + }, + "secret": { + "additionalProperties": false, + "properties": { + "create": { + "default": true, + "type": "boolean" + }, + "name": { + "default": "", + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" } }, "type": "object" @@ -3279,7 +3558,7 @@ "type": "boolean" }, "level": { - "enum": ["debug", "info", "warn", "error"], + "enum": ["trace", "debug", 
"info", "warn", "error"], "type": "string" } }, @@ -3302,6 +3581,27 @@ "minimum": 0, "type": "integer" }, + "scaling": { + "additionalProperties": false, + "properties": { + "annotations": { + "$ref": "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta/properties/annotations" + }, + "behavior": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerBehavior" + }, + "maxReplicas": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec/properties/maxReplicas" + }, + "minReplicas": { + "$ref": "#/$defs/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec/properties/minReplicas" + }, + "targetValue": { + "type": "string" + } + }, + "type": "object" + }, "shipper": { "additionalProperties": false, "properties": { @@ -3376,6 +3676,9 @@ "annotations": { "$ref": "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta/properties/annotations" }, + "autoscale": { + "type": "boolean" + }, "podDisruptionBudget": { "oneOf": [ { diff --git a/helm/values.schema.yaml b/helm/values.schema.yaml index d6f39c24..f8669193 100644 --- a/helm/values.schema.yaml +++ b/helm/values.schema.yaml @@ -305,6 +305,18 @@ properties: type: - integer - "null" + autoscale: + description: | + Whether to enable horizontal pod autoscaling for the aggregator. + When enabled, the aggregator will automatically scale based on the + czo_cost_metrics_shipping_progress metric. Detailed scaling + configuration is found under the aggregator.scaling section. + + This creates: + - HPA resource with custom metrics configuration + - APIService registration for custom.metrics.k8s.io + - RBAC permissions for HPA to access custom metrics + type: boolean tolerations: description: | Tolerations configuration for the aggregator pods. @@ -400,6 +412,7 @@ properties: level: type: string enum: + - trace - debug - info - warn @@ -497,6 +510,37 @@ properties: # Resource requirements for the collector resources: $ref: "#/$defs/io.k8s.api.core.v1.ResourceRequirements" + tls: + description: | + TLS configuration for the collector. When autoscaling is enabled, + the collector runs in dual mode (HTTP + HTTPS) to serve both + regular metrics collection and the custom metrics API for HPA. + When autoscaling is disabled, the collector runs in HTTP-only + mode. + type: object + additionalProperties: false + properties: + mountPath: + description: | + Path where the TLS certificate and key will be mounted in the container + type: string + default: "/etc/certs" + secret: + description: | + Configuration for the TLS certificate Secret + type: object + additionalProperties: false + properties: + create: + description: | + Whether to create a Secret to store the TLS certificate and key + type: boolean + default: true + name: + description: | + Name of the Secret to create. If empty, a name will be generated + type: string + default: "" shipper: type: object additionalProperties: false @@ -515,6 +559,34 @@ properties: # Affinity settings for the aggregator affinity: $ref: "#/$defs/io.k8s.api.core.v1.Affinity" + # Scaling configuration for the aggregator + scaling: + description: | + Detailed scaling configuration for the aggregator deployment. + This section is only used when components.aggregator.autoscale is enabled. 
+ additionalProperties: false + type: object + properties: + minReplicas: + $ref: "#/definitions/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec/properties/minReplicas" + maxReplicas: + $ref: "#/definitions/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerSpec/properties/maxReplicas" + targetValue: + description: | + Target value for the czo_cost_metrics_shipping_progress metric. + "900m" = scale when metrics projected to reach 90% of MaxRecords + capacity. scaling. + type: string + annotations: + description: | + Annotations to be added to the HPA resource. + $ref: "#/$defs/io.k8s.apimachinery.pkg.apis.meta.v1.ObjectMeta/properties/annotations" + behavior: + description: | + HPA scaling behavior configuration. Configures the scaling + behavior of the target in both Up and Down directions. If not set, + the default HPAScalingRules for scale up and scale down are used. + $ref: "#/definitions/io.k8s.api.autoscaling.v2.HorizontalPodAutoscalerBehavior" prometheusConfig: description: | diff --git a/helm/values.yaml b/helm/values.yaml index f7799482..6b04d180 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -225,6 +225,9 @@ components: # metrics from the agent, webhook, etc., and sends them to the CloudZero API # after some processing. aggregator: + # The number of replicas to run. Note that, if autoscale is enabled, this + # will be the starting number of replicas, which may or may not be the + # number of replicas in the cluster due to the HPA. replicas: 3 podDisruptionBudget: # enabled: @@ -232,6 +235,8 @@ components: # maxUnavailable: tolerations: [] annotations: {} + # Enable autoscaling for the aggregator deployment + autoscale: false # Settings for the webhook server. webhookServer: @@ -935,6 +940,16 @@ aggregator: limits: memory: "1024Mi" cpu: "2000m" + # TLS configuration for the collector (automatically enabled when autoscaling is enabled) + tls: + # Path where the TLS certificate and key will be mounted in the container + mountPath: /etc/certs + # Configuration for the TLS certificate Secret + secret: + # Whether to create a Secret to store the TLS certificate and key + create: true + # Name of the Secret to create. If empty, a name will be generated + name: "" # Configuration for the shipper component of the aggregator. shipper: # Port that the shipper listens on for internal communication. 
@@ -965,6 +980,51 @@ aggregator: # See the Kubernetes documentation for details: # https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity affinity: {} + # Detailed scaling configuration for the aggregator deployment + scaling: + # Minimum number of replicas for the aggregator + minReplicas: 1 + # Maximum number of replicas for the aggregator + maxReplicas: 10 + # Target value for czo_cost_metrics_shipping_progress metric + # "900m" = scale when metrics reach 90% of MaxRecords capacity + # Values >1.0 indicate saturation and trigger more aggressive scaling + targetValue: "900m" + # Annotations to apply to the HPA resource + annotations: {} + # Scaling behavior configuration + behavior: + scaleUp: + # Time to wait before allowing another scale up + stabilizationWindowSeconds: 300 + # Scale up policies + policies: + # Allow up to 100% increase + - type: Percent + value: 100 + periodSeconds: 60 + # Allow up to 2 pod increase + - type: Pods + value: 2 + periodSeconds: 60 + # Use the maximum of the policies + selectPolicy: Max + scaleDown: + # Time to wait before allowing another scale down + stabilizationWindowSeconds: 300 + # Scale down policies + policies: + # Allow up to 50% decrease + - type: Percent + value: 50 + periodSeconds: 60 + # Allow up to 1 pod decrease + - type: Pods + value: 1 + periodSeconds: 60 + # Use the minimum of the policies + selectPolicy: Min + # -- Deprecated. Override the name of the chart. Used in resource naming. nameOverride: diff --git a/tests/helm/template/autoscale-overrides.yml b/tests/helm/template/autoscale-overrides.yml new file mode 100644 index 00000000..f10bb67e --- /dev/null +++ b/tests/helm/template/autoscale-overrides.yml @@ -0,0 +1,15 @@ +cloudAccountId: "1234567890" +clusterName: "my-cluster" +region: "us-east-1" +apiKey: "not-a-real-api-key" + +# For testing only, you should never use this property in production. 
+jobConfigID: "DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E" + +components: + aggregator: + autoscale: true + +aggregator: + logging: + level: debug diff --git a/tests/helm/template/autoscale.yaml b/tests/helm/template/autoscale.yaml new file mode 100644 index 00000000..bb7a27e3 --- /dev/null +++ b/tests/helm/template/autoscale.yaml @@ -0,0 +1,3139 @@ +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cloudzero-state-metrics + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: cloudzero-state-metrics + minAvailable: 1 +--- +# Source: cloudzero-agent/templates/agent-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cloudzero-agent-server + namespace: cz-agent +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: server + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/templates/aggregator-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-aggregator + namespace: cz-agent +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/templates/webhook-pdb.yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cz-agent-cloudzero-agent-webhook-server + namespace: cz-agent +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cloudzero-state-metrics + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-server + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/webhook-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-cloudzero-agent-webhook-server-init-cert + namespace: cz-agent +--- +# Source: 
cloudzero-agent/templates/aggregator-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-api-key + namespace: cz-agent +data: + value: + "bm90LWEtcmVhbC1hcGkta2V5" +--- +# Source: cloudzero-agent/templates/webhook-tls-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-webhook-server-tls + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-configuration + namespace: cz-agent + +data: + prometheus.yml: |- + global: + scrape_interval: 60s + + storage: + tsdb: + out_of_order_time_window: 5m + + scrape_configs: + # Kube State Metrics Scrape Job + # static-kube-state-metrics + # + # Kube State Metrics provides the CloudZero Agent with information + # regarding the configuration and state of various Kubernetes objects + # (nodes, pods, etc.), including where they are located in the cluster. + - job_name: static-kube-state-metrics + scrape_interval: 60s + + # Given a Kubernetes resource with a structure like: + # + # apiVersion: v1 + # kind: Service + # metadata: + # name: my-service + # namespace: my-namespace + # labels: + # app: my-app + # environment: production + # + # Kube State Metrics should provide labels such as: + # + # __meta_kubernetes_service_name: my-name + # __meta_kubernetes_namespace: my-namespace + # __meta_kubernetes_service_label_app: my-app + # __meta_kubernetes_service_label_environment: production + # + # We read these into the CloudZero Agent as: + # + # service: my-name + # namespace: my-namespace + # app: my-app + # environment: production + relabel_configs: + + # Relabel __meta_kubernetes_service_label_(.+) labels to $1. + - regex: __meta_kubernetes_service_label_(.+) + action: labelmap + + # Replace __meta_kubernetes_namespace labels with "namespace" + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + + # Replace __meta_kubernetes_service_name labels with "service" + - source_labels: [__meta_kubernetes_service_name] + target_label: service + + # Replace "__meta_kubernetes_pod_node_name" labels to "node" + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: node + # We filter out all but a select few metrics and labels. + metric_relabel_configs: + + # Metric names to keep. + - source_labels: [__name__] + regex: ^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info)$ + action: keep + + # Metric labels to keep. 
+ - regex: ^(board_asset_tag|container|created_by_kind|created_by_name|image|instance|name|namespace|node|node_kubernetes_io_instance_type|pod|product_name|provider_id|resource|unit|uid|_.*|label_.*|app.kubernetes.io/*|k8s.*)$ + action: labelkeep + + static_configs: + - targets: + - cz-agent-cloudzero-state-metrics.cz-agent.svc.cluster.local:8080 + # cAdvisor Scrape Job cloudzero-nodes-cadvisor + # + # This job scrapes metrics about container resource usage (CPU, memory, + # network, etc.). + - job_name: cloudzero-nodes-cadvisor + + scrape_interval: 60s + scheme: https + + # cAdvisor endpoints are protected. In order to access them we need the + # credentials for the ServiceAccount. + authorization: + type: Bearer + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + + # Scrape metrics from cAdvisor. + relabel_configs: + + # Replace the value of __address__ labels with "kubernetes.default.svc:443" + - target_label: __address__ + replacement: kubernetes.default.svc:443 + + # Replace the value of __metrics_path__ in __meta_kubernetes_node_name with + # "/api/v1/nodes/$1/proxy/metrics/cadvisor" + - source_labels: [__meta_kubernetes_node_name] + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + + # Remove "__meta_kubernetes_node_label_" prefix from labels. + - regex: __meta_kubernetes_node_label_(.+) + action: labelmap + + # Replace __meta_kubernetes_node_name labels with "node" + - source_labels: [__meta_kubernetes_node_name] + target_label: node + + # We only want to keep a select few labels. + metric_relabel_configs: + + # Labels to keep. + - action: labelkeep + regex: ^(board_asset_tag|container|created_by_kind|created_by_name|image|instance|name|namespace|node|node_kubernetes_io_instance_type|pod|product_name|provider_id|resource|unit|uid|_.*|label_.*|app.kubernetes.io/*|k8s.*)$ + + # Metrics to keep. + - source_labels: [__name__] + regex: ^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total)$ + action: keep + + kubernetes_sd_configs: + - role: node + kubeconfig_file: "" + - job_name: cloudzero-webhook-job + scheme: https + tls_config: + insecure_skip_verify: true + + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + + relabel_configs: + # Keep __meta_kubernetes_endpoints_name labels. + - source_labels: [__meta_kubernetes_endpoints_name] + action: keep + regex: cz-agent-cloudzero-agent-webhook-server-svc + + metric_relabel_configs: + # Metrics to keep. 
+ - source_labels: [__name__] + regex: "^(go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total)$" + action: keep + - job_name: cloudzero-aggregator-job + scrape_interval: 120s + kubernetes_sd_configs: + - role: endpoints + kubeconfig_file: "" + namespaces: + names: + - cz-agent + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + action: keep + regex: cz-agent-aggregator + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: port-(shipper|collector) + metric_relabel_configs: + - source_labels: [__name__] + regex: "^(container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|go_gc_duration_seconds|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_gc_gogc_percent|go_gc_gomemlimit_bytes|go_goroutines|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_stack_inuse_bytes|go_threads|http_request_duration_seconds_bucket|http_request_duration_seconds_count|http_request_duration_seconds_sum|http_requests_total|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_exemplars_in_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_in_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_remote_storage_string_interner_zero_reference_releases_total|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_
failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds|promhttp_metric_handler_requests_in_flight|promhttp_metric_handler_requests_total|remote_write_db_failures_total|remote_write_failures_total|remote_write_payload_size_bytes|remote_write_records_processed_total|remote_write_response_codes_total|remote_write_timeseries_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|function_execution_seconds|shipper_shutdown_total|shipper_new_files_error_total|shipper_new_files_processing_current|shipper_handle_request_file_count|shipper_handle_request_success_total|shipper_presigned_url_error_total|shipper_replay_request_total|shipper_replay_request_current|shipper_replay_request_file_count|shipper_replay_request_error_total|shipper_replay_request_abandon_files_total|shipper_replay_request_abandon_files_error_total|shipper_disk_total_size_bytes|shipper_current_disk_usage_bytes|shipper_current_disk_usage_percentage|shipper_current_disk_unsent_file|shipper_current_disk_sent_file|shipper_disk_replay_request_current|shipper_disk_cleanup_failure_total|shipper_disk_cleanup_success_total|shipper_disk_cleanup_percentage)$|^(cloudzero_|czo_)" + action: keep + - job_name: static-prometheus + scrape_interval: 120s + static_configs: + - targets: + - localhost:9090 + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"^(go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + action: keep + remote_write: + - url: 'http://cz-agent-aggregator.cz-agent.svc.cluster.local/collector' + authorization: + credentials_file: /etc/config/secrets/value + write_relabel_configs: + - source_labels: [__name__] + regex: 
"^(kube_node_info|kube_node_status_capacity|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_labels|kube_pod_info|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_transmit_bytes_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|remote_write_timeseries_total|remote_write_response_codes_total|remote_write_payload_size_bytes|remote_write_failures_total|remote_write_records_processed_total|remote_write_db_failures_total|http_requests_total|storage_write_failure_total|czo_webhook_types_total|czo_storage_types_total|czo_ingress_types_total|czo_gateway_types_total|go_memstats_alloc_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_objects|go_memstats_last_gc_time_seconds|go_memstats_alloc_bytes|go_memstats_stack_inuse_bytes|go_goroutines|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_bytes|process_start_time_seconds|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_agent_corruptions_total|prometheus_api_remote_read_queries|prometheus_http_requests_total|prometheus_notifications_alertmanagers_discovered|prometheus_notifications_dropped_total|prometheus_remote_storage_bytes_total|prometheus_remote_storage_histograms_failed_total|prometheus_remote_storage_histograms_total|prometheus_remote_storage_metadata_bytes_total|prometheus_remote_storage_metadata_failed_total|prometheus_remote_storage_metadata_retried_total|prometheus_remote_storage_metadata_total|prometheus_remote_storage_samples_dropped_total|prometheus_remote_storage_samples_failed_total|prometheus_remote_storage_samples_in_total|prometheus_remote_storage_samples_total|prometheus_remote_storage_shard_capacity|prometheus_remote_storage_shards|prometheus_remote_storage_shards_desired|prometheus_remote_storage_shards_max|prometheus_remote_storage_shards_min|prometheus_sd_azure_cache_hit_total|prometheus_sd_azure_failures_total|prometheus_sd_discovered_targets|prometheus_sd_dns_lookup_failures_total|prometheus_sd_failed_configs|prometheus_sd_file_read_errors_total|prometheus_sd_file_scan_duration_seconds|prometheus_sd_file_watcher_errors_total|prometheus_sd_http_failures_total|prometheus_sd_kubernetes_events_total|prometheus_sd_kubernetes_http_request_duration_seconds|prometheus_sd_kubernetes_http_request_total|prometheus_sd_kubernetes_workqueue_depth|prometheus_sd_kubernetes_workqueue_items_total|prometheus_sd_kubernetes_workqueue_latency_seconds|prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds|prometheus_sd_kubernetes_workqueue_unfinished_work_seconds|prometheus_sd_kubernetes_workqueue_work_duration_seconds|prometheus_sd_received_updates_total|prometheus_sd_updates_delayed_total|prometheus_sd_updates_total|prometheus_target_scrape_pool_reloads_failed_total|prometheus_target_scrape_pool_reloads_total|prometheus_target_scrape_pool_sync_total|prometheus_target_scrape_pools_failed_total|prometheus_target_scrape_pools_total|prometheus_target_sync_failed_total|prometheus_target_sync_length_seconds)$" + action: keep + metadata_config: + send: 
false +--- +# Source: cloudzero-agent/templates/aggregator-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cz-agent-aggregator + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +data: + config.yml: |- + cloud_account_id: 1234567890 + cluster_name: my-cluster + region: us-east-1 + + metrics: + + cost: + - pattern: "container_cpu_usage_seconds_total" + match: exact + - pattern: "container_memory_working_set_bytes" + match: exact + - pattern: "container_network_receive_bytes_total" + match: exact + - pattern: "container_network_transmit_bytes_total" + match: exact + - pattern: "kube_node_info" + match: exact + - pattern: "kube_node_status_capacity" + match: exact + - pattern: "kube_pod_container_resource_limits" + match: exact + - pattern: "kube_pod_container_resource_requests" + match: exact + - pattern: "kube_pod_labels" + match: exact + - pattern: "kube_pod_info" + match: exact + - pattern: "cloudzero_" + match: prefix + + cost_labels: + - pattern: "board_asset_tag" + match: exact + - pattern: "container" + match: exact + - pattern: "created_by_kind" + match: exact + - pattern: "created_by_name" + match: exact + - pattern: "image" + match: exact + - pattern: "instance" + match: exact + - pattern: "name" + match: exact + - pattern: "namespace" + match: exact + - pattern: "node" + match: exact + - pattern: "node_kubernetes_io_instance_type" + match: exact + - pattern: "pod" + match: exact + - pattern: "product_name" + match: exact + - pattern: "provider_id" + match: exact + - pattern: "resource" + match: exact + - pattern: "resource_type" + match: exact + - pattern: "unit" + match: exact + - pattern: "uid" + match: exact + - pattern: "workload" + match: exact + - pattern: "_" + match: prefix + - pattern: "label_" + match: prefix + - pattern: "app.kubernetes.io/" + match: prefix + - pattern: "k8s." 
+ match: prefix + + observability: + - pattern: "go_gc_duration_seconds" + match: exact + - pattern: "go_gc_duration_seconds_count" + match: exact + - pattern: "go_gc_duration_seconds_sum" + match: exact + - pattern: "go_gc_gogc_percent" + match: exact + - pattern: "go_gc_gomemlimit_bytes" + match: exact + - pattern: "go_goroutines" + match: exact + - pattern: "go_memstats_alloc_bytes" + match: exact + - pattern: "go_memstats_heap_alloc_bytes" + match: exact + - pattern: "go_memstats_heap_idle_bytes" + match: exact + - pattern: "go_memstats_heap_inuse_bytes" + match: exact + - pattern: "go_memstats_heap_objects" + match: exact + - pattern: "go_memstats_last_gc_time_seconds" + match: exact + - pattern: "go_memstats_stack_inuse_bytes" + match: exact + - pattern: "go_threads" + match: exact + - pattern: "http_request_duration_seconds_bucket" + match: exact + - pattern: "http_request_duration_seconds_count" + match: exact + - pattern: "http_request_duration_seconds_sum" + match: exact + - pattern: "http_requests_total" + match: exact + - pattern: "process_cpu_seconds_total" + match: exact + - pattern: "process_max_fds" + match: exact + - pattern: "process_open_fds" + match: exact + - pattern: "process_resident_memory_bytes" + match: exact + - pattern: "process_start_time_seconds" + match: exact + - pattern: "process_virtual_memory_bytes" + match: exact + - pattern: "process_virtual_memory_max_bytes" + match: exact + - pattern: "prometheus_agent_corruptions_total" + match: exact + - pattern: "prometheus_api_remote_read_queries" + match: exact + - pattern: "prometheus_http_requests_total" + match: exact + - pattern: "prometheus_notifications_alertmanagers_discovered" + match: exact + - pattern: "prometheus_notifications_dropped_total" + match: exact + - pattern: "prometheus_remote_storage_bytes_total" + match: exact + - pattern: "prometheus_remote_storage_exemplars_in_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_failed_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_in_total" + match: exact + - pattern: "prometheus_remote_storage_histograms_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_bytes_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_failed_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_retried_total" + match: exact + - pattern: "prometheus_remote_storage_metadata_total" + match: exact + - pattern: "prometheus_remote_storage_samples_dropped_total" + match: exact + - pattern: "prometheus_remote_storage_samples_failed_total" + match: exact + - pattern: "prometheus_remote_storage_samples_in_total" + match: exact + - pattern: "prometheus_remote_storage_samples_total" + match: exact + - pattern: "prometheus_remote_storage_shard_capacity" + match: exact + - pattern: "prometheus_remote_storage_shards" + match: exact + - pattern: "prometheus_remote_storage_shards_desired" + match: exact + - pattern: "prometheus_remote_storage_shards_max" + match: exact + - pattern: "prometheus_remote_storage_shards_min" + match: exact + - pattern: "prometheus_remote_storage_string_interner_zero_reference_releases_total" + match: exact + - pattern: "prometheus_sd_azure_cache_hit_total" + match: exact + - pattern: "prometheus_sd_azure_failures_total" + match: exact + - pattern: "prometheus_sd_discovered_targets" + match: exact + - pattern: "prometheus_sd_dns_lookup_failures_total" + match: exact + - pattern: "prometheus_sd_failed_configs" + match: exact + - pattern: 
"prometheus_sd_file_read_errors_total" + match: exact + - pattern: "prometheus_sd_file_scan_duration_seconds" + match: exact + - pattern: "prometheus_sd_file_watcher_errors_total" + match: exact + - pattern: "prometheus_sd_http_failures_total" + match: exact + - pattern: "prometheus_sd_kubernetes_events_total" + match: exact + - pattern: "prometheus_sd_kubernetes_http_request_duration_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_http_request_total" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_depth" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_items_total" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_latency_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_longest_running_processor_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_unfinished_work_seconds" + match: exact + - pattern: "prometheus_sd_kubernetes_workqueue_work_duration_seconds" + match: exact + - pattern: "prometheus_sd_received_updates_total" + match: exact + - pattern: "prometheus_sd_updates_delayed_total" + match: exact + - pattern: "prometheus_sd_updates_total" + match: exact + - pattern: "prometheus_target_scrape_pool_reloads_failed_total" + match: exact + - pattern: "prometheus_target_scrape_pool_reloads_total" + match: exact + - pattern: "prometheus_target_scrape_pool_sync_total" + match: exact + - pattern: "prometheus_target_scrape_pools_failed_total" + match: exact + - pattern: "prometheus_target_scrape_pools_total" + match: exact + - pattern: "prometheus_target_sync_failed_total" + match: exact + - pattern: "prometheus_target_sync_length_seconds" + match: exact + - pattern: "promhttp_metric_handler_requests_in_flight" + match: exact + - pattern: "promhttp_metric_handler_requests_total" + match: exact + - pattern: "remote_write_db_failures_total" + match: exact + - pattern: "remote_write_failures_total" + match: exact + - pattern: "remote_write_payload_size_bytes" + match: exact + - pattern: "remote_write_records_processed_total" + match: exact + - pattern: "remote_write_response_codes_total" + match: exact + - pattern: "remote_write_timeseries_total" + match: exact + - pattern: "storage_write_failure_total" + match: exact + - pattern: "czo_webhook_types_total" + match: exact + - pattern: "czo_storage_types_total" + match: exact + - pattern: "czo_ingress_types_total" + match: exact + - pattern: "czo_gateway_types_total" + match: exact + - pattern: "function_execution_seconds" + match: exact + - pattern: "shipper_shutdown_total" + match: exact + - pattern: "shipper_new_files_error_total" + match: exact + - pattern: "shipper_new_files_processing_current" + match: exact + - pattern: "shipper_handle_request_file_count" + match: exact + - pattern: "shipper_handle_request_success_total" + match: exact + - pattern: "shipper_presigned_url_error_total" + match: exact + - pattern: "shipper_replay_request_total" + match: exact + - pattern: "shipper_replay_request_current" + match: exact + - pattern: "shipper_replay_request_file_count" + match: exact + - pattern: "shipper_replay_request_error_total" + match: exact + - pattern: "shipper_replay_request_abandon_files_total" + match: exact + - pattern: "shipper_replay_request_abandon_files_error_total" + match: exact + - pattern: "shipper_disk_total_size_bytes" + match: exact + - pattern: "shipper_current_disk_usage_bytes" + match: exact + - pattern: "shipper_current_disk_usage_percentage" + match: exact + - pattern: "shipper_current_disk_unsent_file" + match: 
exact + - pattern: "shipper_current_disk_sent_file" + match: exact + - pattern: "shipper_disk_replay_request_current" + match: exact + - pattern: "shipper_disk_cleanup_failure_total" + match: exact + - pattern: "shipper_disk_cleanup_success_total" + match: exact + - pattern: "shipper_disk_cleanup_percentage" + match: exact + - pattern: "czo_" + match: prefix + + certificate: + key: /etc/certs/tls.key + cert: /etc/certs/tls.crt + server: + mode: dual + port: 8080 + tls_port: 8443 + profiling: false + reconnect_frequency: 16 + logging: + level: "debug" + capture: true + database: + storage_path: /cloudzero/data + max_records: 1.5e+06 + cost_max_interval: 30m + observability_max_interval: 10m + compression_level: 8 + purge_rules: + metrics_older_than: 168h + lazy: true + percent: 20 + available_storage: + cloudzero: + api_key_path: /etc/config/secrets/value + send_interval: 10m + send_timeout: 120s + rotate_interval: 30m + host: api.cloudzero.com + http_max_retries: 10 + http_max_wait: 30s +--- +# Source: cloudzero-agent/templates/helmless-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: cz-agent-helmless-cm + namespace: cz-agent + labels: + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +data: + values.yaml: |- + aggregator: + affinity: {} + cloudzero: + httpMaxRetries: 10 + httpMaxWait: 30s + rotateInterval: 30m + sendInterval: 10m + sendTimeout: 120s + collector: + port: 8080 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + tls: + mountPath: /etc/certs + secret: + create: true + name: "" + database: + compressionLevel: 8 + costMaxInterval: 30m + emptyDir: + enabled: true + sizeLimit: "" + maxRecords: 1500000 + observabilityMaxInterval: 10m + purgeRules: + lazy: true + metricsOlderThan: 168h + percent: 20 + debugContainer: false + image: + digest: null + pullPolicy: null + repository: null + tag: null + logging: + capture: true + level: debug + mountRoot: /cloudzero + name: null + nodeSelector: {} + profiling: false + reconnectFrequency: 16 + scaling: + annotations: {} + behavior: + scaleDown: + policies: + - periodSeconds: 60 + type: Percent + value: 50 + - periodSeconds: 60 + type: Pods + value: 1 + selectPolicy: Min + stabilizationWindowSeconds: 300 + scaleUp: + policies: + - periodSeconds: 60 + type: Percent + value: 100 + - periodSeconds: 60 + type: Pods + value: 2 + selectPolicy: Max + stabilizationWindowSeconds: 300 + maxReplicas: 10 + minReplicas: 1 + targetValue: 900m + shipper: + port: 8081 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + tolerations: [] + apiKey: '***' + cloudAccountId: "1234567890" + clusterName: my-cluster + commonMetaLabels: {} + components: + agent: + image: + repository: ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent + tag: 1.2.3 + podDisruptionBudget: null + aggregator: + annotations: {} + autoscale: true + podDisruptionBudget: null + replicas: 3 + tolerations: [] + kubectl: + image: + repository: docker.io/bitnami/kubectl + tag: 1.33.1 + prometheus: + image: + repository: quay.io/prometheus/prometheus + tag: null + prometheusReloader: + image: + repository: quay.io/prometheus-operator/prometheus-config-reloader + tag: v0.83.0 + webhookServer: + podDisruptionBudget: null + replicas: 3 + configmapReload: + prometheus: + enabled: true + image: + digest: null + 
pullPolicy: null + repository: null + tag: null + resources: {} + defaults: + affinity: {} + annotations: {} + dns: + config: {} + policy: null + federation: + enabled: false + image: + pullPolicy: IfNotPresent + pullSecrets: null + labels: {} + nodeSelector: {} + podDisruptionBudget: + enabled: true + minAvailable: 1 + priorityClassName: null + tolerations: [] + host: api.cloudzero.com + imagePullSecrets: [] + initBackfillJob: + annotations: {} + enabled: true + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: null + tolerations: [] + initCertJob: + annotations: {} + enabled: true + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: {} + rbac: + clusterRoleBindingName: "" + clusterRoleName: "" + create: true + serviceAccountName: "" + tolerations: [] + initScrapeJob: + annotations: null + image: + digest: null + pullPolicy: null + repository: null + tag: null + imagePullSecrets: null + nodeSelector: null + tolerations: null + insightsController: + ConfigMapNameOverride: null + annotations: + enabled: false + patterns: + - .* + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + configurationMountPath: null + enabled: true + labels: + enabled: true + patterns: + - app.kubernetes.io/component + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + podAnnotations: {} + podLabels: {} + resources: {} + server: + affinity: {} + deploymentAnnotations: {} + healthCheck: + enabled: true + failureThreshold: 5 + initialDelaySeconds: 15 + path: /healthz + periodSeconds: 20 + port: 8443 + successThreshold: 1 + timeoutSeconds: 3 + idle_timeout: 120s + image: + pullPolicy: null + repository: null + tag: null + imagePullSecrets: [] + logging: + level: info + name: webhook-server + nodeSelector: {} + podAnnotations: {} + port: 8443 + read_timeout: 10s + reconnectFrequency: 16 + replicaCount: null + send_interval: 1m + send_timeout: 1m + suppressIstioAnnotations: false + tolerations: [] + write_timeout: 10s + service: + port: 443 + tls: + caBundle: "" + crt: "" + enabled: true + issuerSpec: {} + key: "" + mountPath: /etc/certs + secret: + create: true + name: "" + useCertManager: false + volumeMounts: [] + volumes: [] + webhooks: + annotations: {} + caInjection: null + namespaceSelector: {} + path: /validate + timeoutSeconds: 1 + jobConfigID: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + kubeStateMetrics: + affinity: {} + annotations: {} + automountServiceAccountToken: true + autosharding: + enabled: false + collectors: + - certificatesigningrequests + - configmaps + - cronjobs + - daemonsets + - deployments + - endpoints + - horizontalpodautoscalers + - ingresses + - jobs + - leases + - limitranges + - mutatingwebhookconfigurations + - namespaces + - networkpolicies + - nodes + - persistentvolumeclaims + - persistentvolumes + - poddisruptionbudgets + - pods + - replicasets + - replicationcontrollers + - resourcequotas + - secrets + - services + - statefulsets + - storageclasses + - validatingwebhookconfigurations + - volumeattachments + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + containers: [] + customLabels: {} + customResourceState: + config: {} + create: true + enabled: false + key: config.yaml + name: "" 
+ dnsConfig: {} + dnsPolicy: ClusterFirst + enabled: true + env: [] + extraArgs: [] + extraManifests: [] + global: + imagePullSecrets: [] + imageRegistry: "" + hostNetwork: false + image: + pullPolicy: IfNotPresent + registry: registry.k8s.io + repository: kube-state-metrics/kube-state-metrics + tag: v2.15.0 + imagePullSecrets: [] + initContainers: [] + kubeRBACProxy: + containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + enabled: false + extraArgs: [] + image: + pullPolicy: IfNotPresent + registry: quay.io + repository: brancz/kube-rbac-proxy + sha: "" + tag: v0.19.1 + resources: {} + volumeMounts: [] + kubeTargetVersionOverride: "" + kubeconfig: + enabled: false + labels: {} + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + metricAllowlist: [] + metricAnnotationsAllowList: [] + metricDenylist: [] + metricLabelsAllowlist: [] + nameOverride: cloudzero-state-metrics + namespaceOverride: "" + namespaces: "" + namespacesDenylist: "" + networkPolicy: + enabled: false + flavor: kubernetes + nodeSelector: {} + podAnnotations: {} + podDisruptionBudget: + minAvailable: 1 + podLabels: {} + podSecurityPolicy: + additionalVolumes: [] + annotations: {} + enabled: false + prometheus: + monitor: + additionalLabels: {} + annotations: {} + enabled: false + http: + bearerTokenFile: "" + bearerTokenSecret: {} + enableHttp2: false + honorLabels: false + interval: "" + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + scheme: "" + scrapeTimeout: "" + tlsConfig: {} + jobLabel: "" + labelLimit: 0 + labelNameLengthLimit: 0 + labelValueLengthLimit: 0 + metrics: + bearerTokenFile: "" + bearerTokenSecret: {} + enableHttp2: false + honorLabels: false + interval: "" + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + scheme: "" + scrapeTimeout: "" + tlsConfig: {} + namespace: "" + namespaceSelector: [] + podTargetLabels: [] + sampleLimit: 0 + selectorOverride: {} + targetLabels: [] + targetLimit: 0 + scrapeconfig: + additionalLabels: {} + annotations: {} + enableHttp2: false + enabled: false + honorLabels: true + jobName: kube-state-metrics + labelLimit: 0 + labelNameLengthLimit: 0 + labelValueLengthLimit: 0 + metricRelabelings: [] + proxyUrl: "" + relabelings: [] + sampleLimit: 0 + scheme: "" + scrapeInterval: "" + scrapeTimeout: "" + staticConfigLabels: {} + targetLimit: 0 + tlsConfig: {} + prometheusScrape: false + rbac: + create: true + extraRules: [] + useClusterRole: true + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + releaseLabel: false + releaseNamespace: false + replicas: 1 + resources: {} + revisionHistoryLimit: 10 + securityContext: + enabled: true + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + selectorOverride: {} + selfMonitor: + enabled: false + service: + annotations: {} + clusterIP: "" + ipDualStack: + enabled: false + ipFamilies: + - IPv6 + - IPv4 + ipFamilyPolicy: PreferDualStack + loadBalancerIP: "" + loadBalancerSourceRanges: [] + nodePort: 0 + port: 8080 + type: ClusterIP + serviceAccount: + annotations: {} + automountServiceAccountToken: true + create: true + imagePullSecrets: [] + startupProbe: + enabled: false + failureThreshold: 3 + httpGet: + httpHeaders: [] + scheme: http + 
initialDelaySeconds: 0 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + tolerations: [] + topologySpreadConstraints: [] + verticalPodAutoscaler: + controlledResources: [] + enabled: false + maxAllowed: {} + minAllowed: {} + volumeMounts: [] + volumes: [] + prometheusConfig: + configMapAnnotations: {} + configMapNameOverride: "" + configOverride: "" + globalScrapeInterval: 60s + outOfOrderTimeWindow: 5m + scrapeJobs: + additionalScrapeJobs: [] + aggregator: + enabled: true + scrapeInterval: 120s + cadvisor: + enabled: true + scrapeInterval: 60s + kubeStateMetrics: + enabled: true + scrapeInterval: 60s + prometheus: + enabled: true + scrapeInterval: 120s + rbac: + create: true + region: us-east-1 + secretAnnotations: {} + server: + affinity: {} + agentMode: true + args: + - --config.file=/etc/config/prometheus/configmaps/prometheus.yml + - --web.enable-lifecycle + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + automountServiceAccountToken: null + clusterRoleNameOverride: null + deploymentAnnotations: {} + emptyDir: + sizeLimit: 8Gi + env: [] + fullnameOverride: null + image: + digest: null + pullPolicy: null + repository: null + tag: null + livenessProbe: + failureThreshold: 3 + initialDelaySeconds: 30 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 10 + livenessProbeFailureThreshold: null + livenessProbeInitialDelay: null + livenessProbePeriodSeconds: null + livenessProbeSuccessThreshold: null + livenessProbeTimeout: null + logging: + level: null + name: server + nodeSelector: {} + persistentVolume: + accessModes: + - ReadWriteOnce + annotations: {} + enabled: false + existingClaim: "" + labels: {} + mountPath: /data + selector: {} + size: 8Gi + storageClass: "" + subPath: "" + volumeBindingMode: null + volumeName: null + podAnnotations: {} + podLabels: {} + priorityClassName: null + readinessProbe: + failureThreshold: 3 + initialDelaySeconds: 30 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 4 + readinessProbeFailureThreshold: null + readinessProbeInitialDelay: null + readinessProbePeriodSeconds: null + readinessProbeSuccessThreshold: null + readinessProbeTimeout: null + resources: + limits: + memory: 1024Mi + requests: + cpu: 250m + memory: 512Mi + serviceAccount: + name: null + terminationGracePeriodSeconds: 300 + tolerations: [] + topologySpreadConstraints: [] + useExistingClusterRoleName: null + serverConfig: + containerSecretFileName: value + containerSecretFilePath: /etc/config/secrets/ + serviceAccount: + annotations: {} + automountServiceAccountToken: null + create: true + name: "" + validator: + image: + digest: null + pullPolicy: null + pullSecrets: null + repository: null + tag: null + name: env-validator + resources: {} + serviceEndpoints: + kubeStateMetrics: null +--- +# Source: cloudzero-agent/templates/validator-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-validator-configuration + namespace: cz-agent + +data: + validator.yml: |- + versions: + chart_version: 1.1.0-dev + agent_version: + + logging: + level: info + location: ./cloudzero-agent-validator.log + + deployment: + account_id: 1234567890 + cluster_name: my-cluster + region: us-east-1 + + cloudzero: + 
host: https://api.cloudzero.com + credentials_file: /etc/config/secrets/value + disable_telemetry: false + + services: + namespace: cz-agent + insights_service: cz-agent-cloudzero-agent-webhook-server-svc + collector_service: cz-agent-aggregator + + prometheus: + kube_state_metrics_service_endpoint: http://cz-agent-cloudzero-state-metrics.cz-agent.svc.cluster.local:8080 + executable: /bin/prometheus + kube_metrics: + - kube_node_info + - kube_node_status_capacity + - kube_pod_container_resource_limits + - kube_pod_container_resource_requests + - kube_pod_labels + - kube_pod_info + configurations: + - /etc/prometheus/prometheus.yml + - /etc/config/prometheus/configmaps/prometheus.yml + + diagnostics: + stages: + - name: pre-start + enforce: true + checks: + - api_key_valid + - name: post-start + enforce: false + checks: + - k8s_version + - k8s_namespace + - k8s_provider + - kube_state_metrics_reachable + - prometheus_version + - scrape_cfg + - webhook_server_reachable + - name: pre-stop + enforce: false + checks: + - name: config-load + enforce: false + checks: + - api_key_valid + - k8s_version + - k8s_namespace + - k8s_provider + - kube_state_metrics_reachable + - agent_settings +--- +# Source: cloudzero-agent/templates/webhook-cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-webhook-configuration + namespace: cz-agent + +data: + server-config.yaml: |- + cloud_account_id: 1234567890 + region: us-east-1 + cluster_name: my-cluster + destination: 'http://cz-agent-aggregator.cz-agent.svc.cluster.local/collector' + logging: + level: info + remote_write: + send_interval: 1m + max_bytes_per_send: 500000 + send_timeout: 1m + max_retries: 3 + k8s_client: + timeout: 30s + database: + retention_time: 24h + cleanup_interval: 3h + batch_update_size: 500 + api_key_path: /etc/config/secrets/value + certificate: + key: /etc/certs/tls.key + cert: /etc/certs/tls.crt + server: + namespace: cz-agent + domain: cz-agent-cloudzero-agent-webhook-server-svc + port: 8443 + read_timeout: 10s + write_timeout: 10s + idle_timeout: 120s + reconnect_frequency: 16 + filters: + labels: + enabled: true + patterns: + - app.kubernetes.io/component + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false + annotations: + enabled: false + patterns: + - .* + resources: + cronjobs: false + daemonsets: false + deployments: false + jobs: false + namespaces: true + nodes: false + pods: true + statefulsets: false +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cloudzero-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - 
cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - endpoints + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: cloudzero-agent/templates/agent-clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + name: cz-agent-cloudzero-agent-server +rules: + - apiGroups: + - "apps" + resources: + - "deployments" + - "statefulsets" + - "daemonsets" + verbs: + - "get" + - "list" + - apiGroups: + - "batch" + resources: + - "jobs" + - "cronjobs" + verbs: + - "get" + - "list" + - apiGroups: + - "" + resources: + - endpoints + - namespaces + - nodes + - nodes/proxy + - nodes/metrics + - services + - pods + - persistentvolumes + - persistentvolumeclaims + verbs: + - get + - list + - watch + - apiGroups: + - "extensions" + - "networking.k8s.io" + resources: + - ingresses/status + - ingresses + - ingressclasses + verbs: + - get + - list + - watch + - apiGroups: + - "gateway.networking.k8s.io" + resources: + - gatewayclasses + verbs: + - get + - list + - watch + - apiGroups: + - "storage.k8s.io" + resources: + - storageclasses + verbs: + - get + - list + - watch + - apiGroups: + - "discovery.k8s.io" + resources: + - 
endpointslices + verbs: + - get + - list + - watch + - nonResourceURLs: + - "/metrics" + verbs: + - get + - apiGroups: + - "" + resources: + - "secrets" + verbs: + - create + - get + - list + - patch + - update +--- +# Source: cloudzero-agent/templates/custom-metrics-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cz-agent-aggregator-custom-metrics-reader + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +rules: +- apiGroups: ["custom.metrics.k8s.io"] + resources: ["*"] + verbs: ["get", "list"] +--- +# Source: cloudzero-agent/templates/init-cert-clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-webhook-server-init-cert + namespace: cz-agent +rules: + - apiGroups: + - "apps" + resources: + - "deployments" + resourceNames: + - cz-agent-cloudzero-agent-webhook-server + verbs: + - "get" + - "list" + - apiGroups: + - "" + resources: + - "secrets" + resourceNames: + - cz-agent-cloudzero-agent-webhook-server-tls + - cz-agent-aggregator-tls + verbs: + - get + - list + - patch + - create + - update + - apiGroups: + - "" + resources: + - "secrets" + verbs: + - create + - get + - list + - patch + - update + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "validatingwebhookconfigurations" + resourceNames: + - cz-agent-cloudzero-agent-webhook-server-webhook + verbs: + - get + - list + - patch +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + name: cz-agent-cloudzero-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cloudzero-state-metrics +subjects: +- kind: ServiceAccount + name: cz-agent-cloudzero-state-metrics + namespace: cz-agent +--- +# Source: cloudzero-agent/templates/agent-clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-server +subjects: + - kind: ServiceAccount + name: cz-agent-cloudzero-agent-server + namespace: cz-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cloudzero-agent-server +--- +# Source: cloudzero-agent/templates/custom-metrics-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: 
cz-agent-aggregator-custom-metrics-reader + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-aggregator-custom-metrics-reader +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: system:controller:horizontal-pod-autoscaler +--- +# Source: cloudzero-agent/templates/custom-metrics-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cz-agent-aggregator-custom-metrics-reader-hpa + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-aggregator-custom-metrics-reader +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system +--- +# Source: cloudzero-agent/templates/init-cert-clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-webhook-server-init-cert +subjects: + - kind: ServiceAccount + name: cz-agent-cloudzero-agent-webhook-server-init-cert + namespace: cz-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cz-agent-cloudzero-agent-webhook-server-init-cert +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cz-agent-cloudzero-state-metrics + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + annotations: +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/templates/aggregator-service.yaml +apiVersion: v1 +kind: Service +metadata: + namespace: cz-agent + name: cz-agent-aggregator + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + selector: + app.kubernetes.io/component: aggregator + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/instance: cz-agent + ports: + - name: http + protocol: TCP + port: 80 + targetPort: 8080 + - name: https + protocol: TCP + port: 443 + targetPort: 8443 + type: ClusterIP +--- +# Source: 
cloudzero-agent/templates/webhook-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: cz-agent-cloudzero-agent-webhook-server-svc + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "false" + namespace: cz-agent +spec: + type: ClusterIP + ports: + - port: 443 + targetPort: 8443 + name: http + selector: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent +--- +# Source: cloudzero-agent/charts/kubeStateMetrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-cloudzero-state-metrics + namespace: cz-agent + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kubeStateMetrics-5.36.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: cloudzero-state-metrics + app.kubernetes.io/name: cloudzero-state-metrics + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/version: "2.15.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: cz-agent-cloudzero-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + dnsPolicy: ClusterFirst + containers: + - name: cloudzero-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8081 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: cloudzero-agent/templates/agent-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: 
cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + name: cz-agent-cloudzero-agent-server + namespace: cz-agent +spec: + selector: + matchLabels: + app.kubernetes.io/component: server + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent + replicas: 1 + template: + metadata: + + labels: + app.kubernetes.io/component: server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + spec: + + serviceAccountName: cz-agent-cloudzero-agent-server + initContainers: + - name: env-validator-copy + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + command: + - /app/cloudzero-agent-validator + - install + - --destination + - /checks/bin/cloudzero-agent-validator + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: lifecycle-volume + mountPath: /checks/bin/ + - name: validator-config-volume + mountPath: /checks/config/ + - name: env-validator-run + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + command: + - /checks/bin/cloudzero-agent-validator + - diagnose + - pre-start + - -f + - /checks/config/validator.yml + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: lifecycle-volume + mountPath: /checks/bin/ + - name: validator-config-volume + mountPath: /checks/config/ + containers: + - name: cloudzero-agent-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.83.0" + imagePullPolicy: "IfNotPresent" + args: + - --watched-dir=/etc/config + - --reload-url=http://127.0.0.1:9090/-/reload + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: cloudzero-agent-server + + image: "quay.io/prometheus/prometheus:v2.55.1" + imagePullPolicy: "IfNotPresent" + lifecycle: + postStart: + exec: + command: + - /checks/cloudzero-agent-validator + - diagnose + - post-start + - -f + - /checks/app/config/validator.yml + preStop: + exec: + command: + - /checks/cloudzero-agent-validator + - diagnose + - pre-stop + - -f + - /checks/app/config/validator.yml + args: + + - --config.file=/etc/config/prometheus/configmaps/prometheus.yml + - --web.enable-lifecycle + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --enable-feature=agent + - --log.level=info + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + memory: 1024Mi + requests: + cpu: 250m + memory: 512Mi + volumeMounts: + - name: config-volume + mountPath: /etc/config/prometheus/configmaps/ + - name: cloudzero-agent-storage-volume + 
mountPath: /data + subPath: "" + - name: lifecycle-volume + mountPath: /checks/ + - name: validator-config-volume + mountPath: /checks/app/config/ + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + + + + + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: cz-agent-configuration + - name: validator-config-volume + configMap: + name: cz-agent-validator-configuration + - name: lifecycle-volume + emptyDir: {} + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: cloudzero-agent-storage-volume + emptyDir: + sizeLimit: 8Gi +--- +# Source: cloudzero-agent/templates/aggregator-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-aggregator + namespace: cz-agent + + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + selector: + matchLabels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/instance: cz-agent + replicas: 3 + template: + metadata: + annotations: + checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + spec: + serviceAccountName: cz-agent-cloudzero-agent-server + + containers: + - name: cz-agent-aggregator-collector + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + ports: + - name: port-collector + containerPort: 8080 + - name: port-coll-tls + containerPort: 8443 + command: ["/app/cloudzero-collector", "-config", "/cloudzero/config/config.yml"] + env: + - name: SERVER_PORT + value: "8080" + - name: SERVER_TLS_PORT + value: "8443" + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: aggregator-config-volume + mountPath: /cloudzero/config + readOnly: true + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + - name: aggregator-tls-certs + mountPath: /etc/certs + readOnly: true + readinessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + + - name: cz-agent-aggregator-shipper + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + ports: + - name: port-shipper + containerPort: 8081 + command: ["/app/cloudzero-shipper", "-config", "/cloudzero/config/config.yml"] + env: + - name: SERVER_PORT + value: "8081" + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: aggregator-config-volume + mountPath: /cloudzero/config + readOnly: true + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + 
readinessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 30 + periodSeconds: 30 + failureThreshold: 3 + resources: + limits: + cpu: 2000m + memory: 1024Mi + requests: + cpu: 100m + memory: 64Mi + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + + + + + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: cz-agent-configuration + - name: validator-config-volume + configMap: + name: cz-agent-validator-configuration + - name: lifecycle-volume + emptyDir: {} + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: aggregator-config-volume + configMap: + name: cz-agent-aggregator + - name: aggregator-tls-certs + secret: + secretName: cz-agent-aggregator-tls + - name: aggregator-persistent-storage + emptyDir: + {} +--- +# Source: cloudzero-agent/templates/webhook-deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cz-agent-cloudzero-agent-webhook-server + namespace: cz-agent + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + replicas: 3 + selector: + matchLabels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent + template: + metadata: + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + annotations: + checksum/config: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: cz-agent-cloudzero-agent-server + + securityContext: + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + fsGroup: 65534 + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: webhook-server + topologyKey: kubernetes.io/hostname + weight: 100 + + + + + containers: + - name: webhook-server + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-webhook + args: + - -config + - "/etc/cloudzero-agent-insights/server-config.yaml" + ports: + - containerPort: 8443 + resources: + {} + volumeMounts: + - name: insights-server-config + mountPath: /etc/cloudzero-agent-insights + - name: tls-certs + mountPath: /etc/certs + readOnly: true + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + livenessProbe: + httpGet: + scheme: HTTPS + path: /healthz + port: 8443 + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 5 + readinessProbe: + httpGet: + scheme: HTTPS + path: /healthz + port: 8443 + initialDelaySeconds: 15 + periodSeconds: 20 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 5 + volumes: + - name: insights-server-config + configMap: + name: cz-agent-webhook-configuration + - name: tls-certs + secret: + secretName: cz-agent-cloudzero-agent-webhook-server-tls + - name: 
cloudzero-api-key + secret: + secretName: cz-agent-api-key +--- +# Source: cloudzero-agent/templates/aggregator-hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: cz-agent-aggregator + namespace: cz-agent + + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: cz-agent-aggregator + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Pods + pods: + metric: + name: czo_cost_metrics_shipping_progress + target: + type: AverageValue + averageValue: 900m + behavior: + scaleUp: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 100 + periodSeconds: 60 + - type: Pods + value: 2 + periodSeconds: 60 + selectPolicy: Max + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + - type: Pods + value: 1 + periodSeconds: 60 + selectPolicy: Min +--- +# Source: cloudzero-agent/templates/aggregator-init-cert-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-aggregator-init-cert + namespace: cz-agent + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + template: + metadata: + name: cz-agent-aggregator-init-cert + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + spec: + restartPolicy: OnFailure + serviceAccountName: cz-agent-cloudzero-agent-server + + containers: + - name: create-cert + image: "docker.io/bitnami/kubectl:1.33.1" + imagePullPolicy: "IfNotPresent" + command: ["/bin/bash", "-c"] + workingDir: /var/tmp + args: + - | + #!/bin/bash + set -e + + SECRET_NAME="cz-agent-aggregator-tls" + SERVICE_NAME="cz-agent-aggregator" + NAMESPACE="cz-agent" + + EXISTING_TLS_CRT=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.crt}' 2>/dev/null || echo "") + EXISTING_TLS_KEY=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.key}' 2>/dev/null || echo "") + + GENERATE_CERTIFICATE=false + + if [[ -n "$EXISTING_TLS_CRT" && -n "$EXISTING_TLS_KEY" ]]; then + # Check if the certificate is valid for our service + SAN=$(echo "$EXISTING_TLS_CRT" | base64 -d | openssl x509 -text -noout | grep DNS | sed 's/.*DNS://' || echo "") + if [[ "$SAN" != "$SERVICE_NAME.$NAMESPACE.svc" ]]; then + echo "The SANs in the certificate do not match the service name." + GENERATE_CERTIFICATE=true + fi + else + echo "TLS Secret is missing or incomplete." 
+ GENERATE_CERTIFICATE=true + fi + + if [[ $GENERATE_CERTIFICATE == "true" ]]; then + echo "Generating new TLS certificate for $SERVICE_NAME.$NAMESPACE.svc" + + # Generate self-signed certificate and private key + openssl req -x509 -newkey rsa:2048 -keyout tls.key -out tls.crt -days 36500 -nodes \ + -subj "/CN=$SERVICE_NAME.$NAMESPACE.svc" \ + -addext "subjectAltName = DNS:$SERVICE_NAME.$NAMESPACE.svc" + + # Base64 encode the certificate and key + TLS_CRT=$(cat tls.crt | base64 | tr -d '\n') + TLS_KEY=$(cat tls.key | base64 | tr -d '\n') + + # Create or update the TLS Secret + kubectl create secret tls $SECRET_NAME \ + --cert=tls.crt \ + --key=tls.key \ + --namespace=$NAMESPACE \ + --dry-run=client -o yaml | kubectl apply -f - + + echo "TLS certificate created/updated successfully" + else + echo "Valid certificate already exists for $SERVICE_NAME.$NAMESPACE.svc" + fi +--- +# Source: cloudzero-agent/templates/backfill-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-backfill-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-backfill-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + labels: + app.kubernetes.io/component: cz-agent-backfill-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent + + spec: + serviceAccountName: cz-agent-cloudzero-agent-server + restartPolicy: OnFailure + + + + containers: + - name: init-scrape + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-webhook + args: + - -config + - "/etc/cloudzero-agent-insights/server-config.yaml" + - -backfill + resources: + {} + volumeMounts: + - name: insights-server-config + mountPath: /etc/cloudzero-agent-insights + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + volumes: + - name: insights-server-config + configMap: + name: cz-agent-webhook-configuration + - name: tls-certs + secret: + secretName: cz-agent-cloudzero-agent-webhook-server-tls + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key +--- +# Source: cloudzero-agent/templates/config-loader-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-confload-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + annotations: + checksum/values: DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-confload-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + labels: + app.kubernetes.io/component: cz-agent-confload-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent + + spec: + serviceAccountName: cz-agent-cloudzero-agent-server + restartPolicy: OnFailure + + + containers: + - name: run-validator + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + 
imagePullPolicy: "IfNotPresent" + env: + - name: K8S_NAMESPACE + value: cz-agent + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + command: + - /app/cloudzero-cluster-config + - load + - --account + - "1234567890" + - --region + - us-east-1 + - --cluster-name + - my-cluster + - --release-name + - cz-agent + - --chart-version + - 1.1.0-dev + - --agent-version + - "1.2.3" + - --values-file + - /cloudzero/config/values/values.yaml + - --config-validator + - /cloudzero/config/validator/validator.yml + - --config-webhook + - /etc/cloudzero-agent-insights/server-config.yaml + - --config-aggregator + - /cloudzero/config/config.yml + resources: + {} + volumeMounts: + - name: cloudzero-api-key + mountPath: /etc/config/secrets/ + subPath: "" + readOnly: true + - name: config-values + mountPath: /cloudzero/config/values # values.yaml + - name: config-volume + mountPath: /etc/config/prometheus/configmaps/ + - name: config-validator + mountPath: /cloudzero/config/validator # validator.yml + - name: config-webhook + mountPath: /etc/cloudzero-agent-insights # server-config.yaml + - name: config-aggregator + mountPath: /cloudzero/config # config.yaml + - name: aggregator-persistent-storage + mountPath: /cloudzero/data + volumes: + - name: config-values + configMap: + name: cz-agent-helmless-cm + - name: config-volume + configMap: + name: cz-agent-configuration + - name: config-validator + configMap: + name: cz-agent-validator-configuration + - name: config-webhook + configMap: + name: cz-agent-webhook-configuration + - name: config-aggregator + configMap: + name: cz-agent-aggregator + - name: cloudzero-api-key + secret: + secretName: cz-agent-api-key + - name: aggregator-persistent-storage + emptyDir: {} +--- +# Source: cloudzero-agent/templates/helmless-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-helmless-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + + labels: + app.kubernetes.io/component: helmless + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-helmless-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + labels: + app.kubernetes.io/component: helmless + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + + spec: + restartPolicy: OnFailure + + + containers: + - name: helmless + image: "ghcr.io/cloudzero/cloudzero-agent/cloudzero-agent:1.2.3" + imagePullPolicy: "IfNotPresent" + command: + - /app/cloudzero-helmless + args: + - --configured + - /etc/config/values/values.yaml + - --output + - "-" + volumeMounts: + - name: helmless-cm + mountPath: /etc/config/values + readOnly: true + volumes: + - name: helmless-cm + configMap: + name: cz-agent-helmless-cm +--- +# Source: cloudzero-agent/templates/init-cert-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: cz-agent-init-cert-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + namespace: cz-agent + + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + 
helm.sh/chart: cloudzero-agent-1.1.0-dev +spec: + template: + metadata: + name: cz-agent-init-cert-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + labels: + app.kubernetes.io/component: cz-agent-init-cert-DEADBEEF-FEED-FACE-CAFE-FEE10D15EA5E + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/instance: cz-agent + + spec: + + + + serviceAccountName: cz-agent-cloudzero-agent-webhook-server-init-cert + restartPolicy: Never + + + + containers: + - name: init-cert + image: "docker.io/bitnami/kubectl:1.33.1" + imagePullPolicy: "IfNotPresent" + command: ["/bin/bash", "-c"] + workingDir: /var/tmp + args: + - | + #!/bin/bash + set -e + GENERATE_CERTIFICATE=false + + # Check if the caBundle in the ValidatingWebhookConfiguration is the same for all webhooks + caBundles=() + + + wh_caBundle=($(kubectl get validatingwebhookconfiguration cz-agent-cloudzero-agent-webhook-server-webhook -o jsonpath='{.webhooks[0].clientConfig.caBundle}')) + caBundles+=("${wh_caBundle:-missing }") + + CA_BUNDLE=${caBundles[0]} + for caBundle in "${caBundles[@]}"; do + if [[ "$caBundle" == "missing" ]]; then + echo "Empty caBundle found in ValidatingWebhookConfiguration." + GENERATE_CERTIFICATE=true + fi + if [[ "$caBundle" != "$CA_BUNDLE" ]]; then + echo "Mismatch found between ValidatingWebhookConfiguration caBundle values." + GENERATE_CERTIFICATE=true + fi + done + + SECRET_NAME=cz-agent-cloudzero-agent-webhook-server-tls + NAMESPACE=cz-agent + + EXISTING_TLS_CRT=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.crt}') + EXISTING_TLS_KEY=$(kubectl get secret $SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.tls\.key}') + + if [[ -n "$EXISTING_TLS_CRT" ]]; then + # Check if the SANs in the certificate match the service name + SAN=$(echo "$EXISTING_TLS_CRT" | base64 -d | openssl x509 -text -noout | grep DNS | sed 's/.*DNS://') + if [[ "$SAN" != "cz-agent-cloudzero-agent-webhook-server-svc.cz-agent.svc" ]]; then + echo "The SANs in the certificate do not match the service name." + GENERATE_CERTIFICATE=true + fi + # Check that caBundle and tls.crt are the same + if [[ "$CA_BUNDLE" != $EXISTING_TLS_CRT ]]; then + echo "The caBundle in the ValidatingWebhookConfiguration does not match the tls.crt in the TLS Secret." + GENERATE_CERTIFICATE=true + fi + fi + + # Check if the TLS Secret already has certificate information + if [[ -z "$EXISTING_TLS_CRT" ]] || [[ -z "$EXISTING_TLS_KEY" ]] || [[ $GENERATE_CERTIFICATE == "true" ]] ; then + echo "The TLS Secret and/or at least one webhook configuration contains empty certificate information, or the certificate is invalid/expired. Creating a new certificate..." + else + echo "The TLS Secret and all webhook configurations contain non-empty certificate information. Will not create a new certificate and will not patch resources." 
+ exit 0 + fi + + # Generate self-signed certificate and private key + openssl req -x509 -newkey rsa:2048 -keyout tls.key -out tls.crt -days 36500 -nodes -subj "/CN=cz-agent-cloudzero-agent-webhook-server-svc" -addext "subjectAltName = DNS:cz-agent-cloudzero-agent-webhook-server-svc.cz-agent.svc" + + # Base64 encode the certificate + export CA_BUNDLE=$(cat tls.crt | base64 | tr -d '\n') + export TLS_CRT=$(cat tls.crt | base64 | tr -d '\n') + export TLS_KEY=$(cat tls.key | base64 | tr -d '\n') + + # Update the TLS Secret with the certificate and key + kubectl patch secret $SECRET_NAME \ + -p '{"data": {"ca.crt": "'"$TLS_CRT"'", "tls.crt": "'"$TLS_CRT"'", "tls.key": "'"$TLS_KEY"'"}}' + + + # Patch the ValidatingWebhookConfiguration cz-agent-cloudzero-agent-webhook-server-webhook with the caBundle + kubectl patch validatingwebhookconfiguration cz-agent-cloudzero-agent-webhook-server-webhook \ + --type='json' \ + -p="[{'op': 'replace', 'path': '/webhooks/0/clientConfig/caBundle', 'value':'$CA_BUNDLE'}]" + exit 0 +--- +# Source: cloudzero-agent/templates/custom-metrics-apiservice.yaml +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io + namespace: cz-agent + labels: + app.kubernetes.io/component: aggregator + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cz-agent-aggregator + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +spec: + service: + name: cz-agent-aggregator + namespace: cz-agent + port: 443 + group: custom.metrics.k8s.io + version: v1beta1 + groupPriorityMinimum: 100 + versionPriority: 100 + caBundle: +--- +# Source: cloudzero-agent/templates/webhook-validating-config.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: cz-agent-cloudzero-agent-webhook-server-webhook + namespace: cz-agent + labels: + app.kubernetes.io/component: webhook-server + app.kubernetes.io/instance: cz-agent + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: cloudzero-agent + app.kubernetes.io/part-of: cloudzero-agent + app.kubernetes.io/version: v2.55.1 + helm.sh/chart: cloudzero-agent-1.1.0-dev + +webhooks: + - name: cz-agent-cloudzero-agent-webhook-server-webhook.cz-agent.svc + namespaceSelector: + {} + failurePolicy: Ignore + rules: + - operations: [ "CREATE", "UPDATE", "DELETE" ] + apiGroups: ["*"] + apiVersions: ["*"] + resources: + + - deployment + - statefulset + - daemonset + - replicaset + - pod + - namespace + - node + - service + - storageclass + - persistentvolume + - persistentvolumeclaim + - job + - cronjob + - customresourcedefinition + - ingress + - ingressclass + - gateway + - gatewayclass + scope: "*" + clientConfig: + service: + namespace: cz-agent + name: cz-agent-cloudzero-agent-webhook-server-svc + path: /validate + port: 443 + admissionReviewVersions: ["v1"] + sideEffects: None + timeoutSeconds: 1 diff --git a/tests/helm/template/cert-manager.yaml b/tests/helm/template/cert-manager.yaml index 4ced5be7..d85f0640 100644 --- a/tests/helm/template/cert-manager.yaml +++ b/tests/helm/template/cert-manager.yaml @@ -711,6 +711,11 @@ data: requests: cpu: 100m memory: 64Mi + tls: + mountPath: /etc/certs + secret: + create: true + name: "" database: compressionLevel: 8 costMaxInterval: 30m @@ -737,6 +742,32 @@ data: nodeSelector: {} profiling: false reconnectFrequency: 16 + scaling: + annotations: {} + behavior: + scaleDown: + policies: + - 
periodSeconds: 60 + type: Percent + value: 50 + - periodSeconds: 60 + type: Pods + value: 1 + selectPolicy: Min + stabilizationWindowSeconds: 300 + scaleUp: + policies: + - periodSeconds: 60 + type: Percent + value: 100 + - periodSeconds: 60 + type: Pods + value: 2 + selectPolicy: Max + stabilizationWindowSeconds: 300 + maxReplicas: 10 + minReplicas: 1 + targetValue: 900m shipper: port: 8081 resources: @@ -759,6 +790,7 @@ data: podDisruptionBudget: null aggregator: annotations: {} + autoscale: false podDisruptionBudget: null replicas: 3 tolerations: [] @@ -1728,6 +1760,8 @@ rules: - get - list - patch + - create + - update - apiGroups: - "admissionregistration.k8s.io" resources: @@ -1854,7 +1888,8 @@ spec: app.kubernetes.io/name: cz-agent-aggregator app.kubernetes.io/instance: cz-agent ports: - - protocol: TCP + - name: http + protocol: TCP port: 80 targetPort: 8080 type: ClusterIP @@ -2230,6 +2265,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 @@ -2237,6 +2273,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 @@ -2271,14 +2308,14 @@ spec: readinessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 livenessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 diff --git a/tests/helm/template/federated.yaml b/tests/helm/template/federated.yaml index 2654f069..d3a4c28c 100644 --- a/tests/helm/template/federated.yaml +++ b/tests/helm/template/federated.yaml @@ -780,6 +780,11 @@ data: requests: cpu: 100m memory: 64Mi + tls: + mountPath: /etc/certs + secret: + create: true + name: "" database: compressionLevel: 8 costMaxInterval: 30m @@ -806,6 +811,32 @@ data: nodeSelector: {} profiling: false reconnectFrequency: 16 + scaling: + annotations: {} + behavior: + scaleDown: + policies: + - periodSeconds: 60 + type: Percent + value: 50 + - periodSeconds: 60 + type: Pods + value: 1 + selectPolicy: Min + stabilizationWindowSeconds: 300 + scaleUp: + policies: + - periodSeconds: 60 + type: Percent + value: 100 + - periodSeconds: 60 + type: Pods + value: 2 + selectPolicy: Max + stabilizationWindowSeconds: 300 + maxReplicas: 10 + minReplicas: 1 + targetValue: 900m shipper: port: 8081 resources: @@ -828,6 +859,7 @@ data: podDisruptionBudget: null aggregator: annotations: {} + autoscale: false podDisruptionBudget: null replicas: 3 tolerations: [] @@ -1797,6 +1829,8 @@ rules: - get - list - patch + - create + - update - apiGroups: - "admissionregistration.k8s.io" resources: @@ -1923,7 +1957,8 @@ spec: app.kubernetes.io/name: cz-agent-aggregator app.kubernetes.io/instance: cz-agent ports: - - protocol: TCP + - name: http + protocol: TCP port: 80 targetPort: 8080 type: ClusterIP @@ -2448,6 +2483,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 @@ -2455,6 +2491,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 @@ -2489,14 +2526,14 @@ spec: readinessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 livenessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 diff --git a/tests/helm/template/manifest.yaml b/tests/helm/template/manifest.yaml index 
66348d97..d300d3f0 100644 --- a/tests/helm/template/manifest.yaml +++ b/tests/helm/template/manifest.yaml @@ -727,6 +727,11 @@ data: requests: cpu: 100m memory: 64Mi + tls: + mountPath: /etc/certs + secret: + create: true + name: "" database: compressionLevel: 8 costMaxInterval: 30m @@ -753,6 +758,32 @@ data: nodeSelector: {} profiling: false reconnectFrequency: 16 + scaling: + annotations: {} + behavior: + scaleDown: + policies: + - periodSeconds: 60 + type: Percent + value: 50 + - periodSeconds: 60 + type: Pods + value: 1 + selectPolicy: Min + stabilizationWindowSeconds: 300 + scaleUp: + policies: + - periodSeconds: 60 + type: Percent + value: 100 + - periodSeconds: 60 + type: Pods + value: 2 + selectPolicy: Max + stabilizationWindowSeconds: 300 + maxReplicas: 10 + minReplicas: 1 + targetValue: 900m shipper: port: 8081 resources: @@ -775,6 +806,7 @@ data: podDisruptionBudget: null aggregator: annotations: {} + autoscale: false podDisruptionBudget: null replicas: 3 tolerations: [] @@ -1744,6 +1776,8 @@ rules: - get - list - patch + - create + - update - apiGroups: - "admissionregistration.k8s.io" resources: @@ -1870,7 +1904,8 @@ spec: app.kubernetes.io/name: cz-agent-aggregator app.kubernetes.io/instance: cz-agent ports: - - protocol: TCP + - name: http + protocol: TCP port: 80 targetPort: 8080 type: ClusterIP @@ -2246,6 +2281,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 @@ -2253,6 +2289,7 @@ spec: httpGet: path: /healthz port: 8080 + scheme: HTTP initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3 @@ -2287,14 +2324,14 @@ spec: readinessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 10 periodSeconds: 10 failureThreshold: 3 livenessProbe: httpGet: path: /healthz - port: 8080 + port: 8081 initialDelaySeconds: 30 periodSeconds: 30 failureThreshold: 3