From d9a7059b7b86b86719d01bf3d216425da61b1210 Mon Sep 17 00:00:00 2001
From: Matthieu Huin
Date: Thu, 28 Sep 2023 18:25:46 +0200
Subject: [PATCH] Monitoring: Add statsd-exporter sidecar to nodepool and
 zuul-scheduler

Create a nodepool PodMonitor, and extend Zuul's PodMonitor to scrape
statsd metrics as well.

Zuul's statsd mapping configuration is minimal and will likely need to
be completed in the future (low-hanging fruit, but somewhat tedious
work).

Change-Id: Ife0c02796841b60376b60f803c3e7a2bf2ad5478
---
 api/v1/softwarefactory_types.go                |   5 +
 cli/sfconfig/cmd/zuul/zuul.go                  |   9 +-
 cli/sfconfig/cmd/zuul_client/zuul_client.go    |   9 +-
 ...efactory-project.io_softwarefactories.yaml  |   9 ++
 controllers/libs/monitoring/monitoring.go      | 139 +++++++++++++++--
 controllers/logserver_controller.go            |  19 +--
 controllers/nodepool.go                        | 142 ++++++++++++++----
 controllers/softwarefactory_controller.go      |   5 +-
 .../static/nodepool/statsd_mapping.yaml        | 124 +++++++++++++++
 controllers/static/zuul/statsd_mapping.yaml    |  44 ++++++
 controllers/static/zuul/zuul.conf              |   5 +-
 controllers/volumestats_exporter.go            |  56 -------
 controllers/zuul.go                            |  30 +++-
 .../test-monitoring/tasks/main.yaml            |  26 ++++
 14 files changed, 506 insertions(+), 116 deletions(-)
 create mode 100644 controllers/static/nodepool/statsd_mapping.yaml
 create mode 100644 controllers/static/zuul/statsd_mapping.yaml
 delete mode 100644 controllers/volumestats_exporter.go
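For context, the two new optional fields surface on the SoftwareFactory custom
resource. A minimal sketch of how a deployer would set them (the resource name,
namespace, FQDN and relay address below are hypothetical examples, not values
from this patch):

    apiVersion: sf.softwarefactory-project.io/v1
    kind: SoftwareFactory
    metadata:
      name: my-sf              # hypothetical
      namespace: sf            # hypothetical
    spec:
      fqdn: sftests.com        # hypothetical
      nodepool:
        # optional: relay nodepool's statsd metrics to an external collector
        statsdTarget: "statsd.example.com:8125"
      zuul:
        scheduler:
          # optional: relay the scheduler's statsd metrics as well
          statsdTarget: "statsd.example.com:8125"

When the field is unset, metrics are still scraped by Prometheus through the
statsd-exporter sidecar; the relay target is purely additive.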
diff --git a/api/v1/softwarefactory_types.go b/api/v1/softwarefactory_types.go
index d1a4009a..37488606 100644
--- a/api/v1/softwarefactory_types.go
+++ b/api/v1/softwarefactory_types.go
@@ -127,6 +127,8 @@ type ZuulExecutorSpec struct {
 type ZuulSchedulerSpec struct {
 	// Storage-related settings
 	Storage StorageSpec `json:"storage,omitempty"`
+	// The address to forward statsd metrics to (optional), in the form "host:port"
+	StatsdTarget string `json:"statsdTarget,omitempty"`
 }
 
 // TODO: make sure to update the GetConnectionsName when adding new connection type.
@@ -193,9 +195,12 @@ type NodepoolBuilderSpec struct {
 }
 
 type NodepoolSpec struct {
+	// Nodepool-launcher related settings
 	Launcher NodepoolLauncherSpec `json:"launcher,omitempty"`
 	// Nodepool-builder related settings
 	Builder NodepoolBuilderSpec `json:"builder,omitempty"`
+	// The address to forward statsd metrics to (optional), in the form "host:port"
+	StatsdTarget string `json:"statsdTarget,omitempty"`
 }
 
 type ZookeeperSpec struct {
diff --git a/cli/sfconfig/cmd/zuul/zuul.go b/cli/sfconfig/cmd/zuul/zuul.go
index da922e82..55250166 100644
--- a/cli/sfconfig/cmd/zuul/zuul.go
+++ b/cli/sfconfig/cmd/zuul/zuul.go
@@ -72,10 +72,11 @@ administrative actions on a specified tenant.`,
 		buffer := &bytes.Buffer{}
 		errorBuffer := &bytes.Buffer{}
 		request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulSchedulerContainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{
-			Command: zuulAdminArgs,
-			Stdin:   false,
-			Stdout:  true,
-			Stderr:  true,
+			Container: "zuul-scheduler",
+			Command:   zuulAdminArgs,
+			Stdin:     false,
+			Stdout:    true,
+			Stderr:    true,
 		}, scheme.ParameterCodec)
 		exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL())
diff --git a/cli/sfconfig/cmd/zuul_client/zuul_client.go b/cli/sfconfig/cmd/zuul_client/zuul_client.go
index c39aa4c4..93f237a2 100644
--- a/cli/sfconfig/cmd/zuul_client/zuul_client.go
+++ b/cli/sfconfig/cmd/zuul_client/zuul_client.go
@@ -96,10 +96,11 @@ Examples:
 		buf := &bytes.Buffer{}
 		errBuf := &bytes.Buffer{}
 		request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulwebcontainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{
-			Command: zuulClientArgs,
-			Stdin:   false,
-			Stdout:  true,
-			Stderr:  true,
+			Container: "zuul-web",
+			Command:   zuulClientArgs,
+			Stdin:     false,
+			Stdout:    true,
+			Stderr:    true,
 		}, scheme.ParameterCodec)
 		exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL())
diff --git a/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml b/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml
index 834fe4b8..854153d0 100644
--- a/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml
+++ b/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml
@@ -251,6 +251,7 @@ spec:
                     type: object
                 type: object
               launcher:
+                description: Nodepool-launcher related settings
                 properties:
                   logLevel:
                     description: 'Specify the Log Level of the nodepool launcher
@@ -262,6 +263,10 @@ spec:
                     - DEBUG
                     type: string
                 type: object
+              statsdTarget:
+                description: The address to forward statsd metrics to (optional),
+                  in the form "host:port"
+                type: string
             type: object
           storageClassName:
             description: Default storage class to use by Persistent Volume Claims
@@ -456,6 +461,10 @@ spec:
               scheduler:
                 description: Configuration of the scheduler microservice
                 properties:
+                  statsdTarget:
+                    description: The address to forward statsd metrics to (optional),
+                      in the form "host:port"
+                    type: string
                   storage:
                     description: Storage-related settings
                     properties:
diff --git a/controllers/libs/monitoring/monitoring.go b/controllers/libs/monitoring/monitoring.go
index 4481f0af..50d58184 100644
--- a/controllers/libs/monitoring/monitoring.go
+++ b/controllers/libs/monitoring/monitoring.go
@@ -1,15 +1,133 @@
 // Copyright (C) 2023 Red Hat
 // SPDX-License-Identifier: Apache-2.0
 
-// Package monitoring provides various utility functions regarding monitoring for the sf-operator
+/*
+Package monitoring provides various utility functions regarding monitoring for the sf-operator:
+
+* create prometheus monitors and alert rules
+* create nodeexporter sidecar
+* create statsdexporter sidecar
+*/
 package monitoring
 
 import (
+	"math"
+	"strconv"
+
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
+	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
+func GetTruncatedPortName(serviceName string, suffix string) string {
+	// Port name is limited to 15 chars
+	var length = float64(len(serviceName))
+	var maxChars = 15 - float64(len(suffix))
+	var upper = int(math.Min(maxChars, length))
+	var exporterPortName = serviceName[:upper] + suffix
+	return exporterPortName
+}
+
+// Node exporter utilities
+
+const NodeExporterNameSuffix = "-nodeexporter"
+const NodeExporterPortNameSuffix = "-ne"
+const nodeExporterPort = 9100
+
+const NodeExporterImage = "quay.io/prometheus/node-exporter:latest"
+
+// Fun fact: arrays cannot be consts, so we define our args in this function.
+func getNodeExporterArgs(volumeMounts []apiv1.VolumeMount) []string {
+	var excludePaths = "^(/etc/hosts|/etc/hostname|/etc/passwd|/etc/resolv.conf|/run/.containerenv|/run/secrets|/dev|/proc|/sys)($|/)"
+	return []string{
+		"--collector.disable-defaults",
+		"--collector.filesystem",
+		"--collector.filesystem.mount-points-exclude=" + excludePaths,
+	}
+}
+
+func MkNodeExporterSideCarContainer(serviceName string, volumeMounts []apiv1.VolumeMount) apiv1.Container {
+	container := base.MkContainer(serviceName+NodeExporterNameSuffix, NodeExporterImage)
+	container.Args = getNodeExporterArgs(volumeMounts)
+	container.Ports = []apiv1.ContainerPort{
+		base.MkContainerPort(nodeExporterPort, GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix)),
+	}
+	container.VolumeMounts = volumeMounts
+	return container
+}
+
+func MkNodeExporterSideCarService(serviceName string, namespace string) apiv1.Service {
+	var portName = GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix)
+	servicePorts := []int32{nodeExporterPort}
+	neService := base.MkService(serviceName+NodeExporterPortNameSuffix, namespace, serviceName, servicePorts, portName)
+	return neService
+
+}
+
+// Statsd exporter utilities
+
+const statsdExporterNameSuffix = "-statsd"
+const statsdExporterPortNameSuffix = "-se"
+const StatsdExporterPortListen = int32(9125)
+const statsdExporterPortExpose = int32(9102)
+const StatsdExporterConfigFile = "statsd_mapping.yaml"
+const statsdExporterImage = "quay.io/prometheus/statsd-exporter:v0.24.0"
+
+func getStatsdExporterArgs(configPath string, relayAddress *string) []string {
+	args := []string{
+		"--statsd.mapping-config=" + configPath,
+		"--statsd.listen-udp=:" + strconv.Itoa(int(StatsdExporterPortListen)),
+		"--web.listen-address=:" + strconv.Itoa(int(statsdExporterPortExpose)),
+	}
+	if relayAddress != nil {
+		args = append(args, "--statsd.relay.address="+*relayAddress)
+	}
+	return args
+}
+
+func GetStatsdExporterPort(serviceName string) string {
+	return GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"e")
+}
+
+func MkStatsdExporterSideCarContainer(serviceName string, configVolumeName string, relayAddress *string) apiv1.Container {
+	var seListenPortName = GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"l")
+	var seExposePortName = GetStatsdExporterPort(serviceName)
+	var configFile = StatsdExporterConfigFile
+	var configPath = "/tmp/" + configFile
+
+	volumeMounts := []apiv1.VolumeMount{
+		{
+			Name:      configVolumeName,
+			MountPath: configPath,
+			SubPath:   configFile,
+		},
+	}
+	args := getStatsdExporterArgs(configPath, relayAddress)
+	ports := []apiv1.ContainerPort{
+		{
+			Name:          seListenPortName,
+			Protocol:      apiv1.ProtocolUDP,
+			ContainerPort: StatsdExporterPortListen,
+		},
+		{
+			Name:          seExposePortName,
+			Protocol:      apiv1.ProtocolTCP,
+			ContainerPort: statsdExporterPortExpose,
+		},
+	}
+	sidecar := base.MkContainer(serviceName+statsdExporterNameSuffix, statsdExporterImage)
+	sidecar.Args = args
+	sidecar.VolumeMounts = volumeMounts
+	sidecar.Ports = ports
+
+	return sidecar
+}
+
+// Prometheus utilities
+
 // ServiceMonitorLabelSelector - TODO this could be a spec parameter.
 const ServiceMonitorLabelSelector = "sf-monitoring"
 
@@ -34,7 +152,7 @@ func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration str
 }
 
 //lint:ignore U1000 this function will be used in a followup change
-func mkServiceMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor {
+func mkServiceMonitor(name string, ns string, portName string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor {
 	return monitoringv1.ServiceMonitor{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
@@ -47,7 +165,7 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label
 			Endpoints: []monitoringv1.Endpoint{
 				{
 					Interval: monitoringv1.Duration("30s"),
-					Port:     port,
+					Port:     portName,
 					Scheme:   "http",
 				},
 			},
@@ -56,7 +174,12 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label
 	}
 }
 
-func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.PodMonitor {
+func MkPodMonitor(name string, ns string, ports []string, selector metav1.LabelSelector) monitoringv1.PodMonitor {
+	endpoints := []monitoringv1.PodMetricsEndpoint{}
+	for _, port := range ports {
+		endpoints = append(endpoints, monitoringv1.PodMetricsEndpoint{Port: port})
+	}
+
 	return monitoringv1.PodMonitor{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: name,
@@ -66,12 +189,8 @@ func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSele
 			},
 		},
 		Spec: monitoringv1.PodMonitorSpec{
-			Selector: selector,
-			PodMetricsEndpoints: []monitoringv1.PodMetricsEndpoint{
-				{
-					Port: port,
-				},
-			},
+			Selector:            selector,
+			PodMetricsEndpoints: endpoints,
 		},
 	}
 }
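For reference, the sidecar produced by MkStatsdExporterSideCarContainer("zuul",
"statsd-config", nil) would render in a pod spec roughly as follows (a sketch
derived from the helper above, not generated output):

    - name: zuul-statsd
      image: quay.io/prometheus/statsd-exporter:v0.24.0
      args:
        - --statsd.mapping-config=/tmp/statsd_mapping.yaml
        - --statsd.listen-udp=:9125
        - --web.listen-address=:9102
      ports:
        - name: zuul-sel        # statsd ingestion (UDP)
          protocol: UDP
          containerPort: 9125
        - name: zuul-see        # Prometheus scrape endpoint (TCP)
          protocol: TCP
          containerPort: 9102
      volumeMounts:
        - name: statsd-config
          mountPath: /tmp/statsd_mapping.yaml
          subPath: statsd_mapping.yaml

The 15-character limit on port names handled by GetTruncatedPortName is why the
suffixes are kept terse ("-sel" for the listen port, "-see" for the expose port).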
diff --git a/controllers/logserver_controller.go b/controllers/logserver_controller.go
index e37a7cd9..47b3ea5b 100644
--- a/controllers/logserver_controller.go
+++ b/controllers/logserver_controller.go
@@ -28,7 +28,7 @@ import (
 
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/conds"
-	"github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
+	sfmonitoring "github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/utils"
 )
 
@@ -100,8 +100,8 @@ func (r *LogServerController) ensureLogserverPodMonitor() bool {
 			"run": logserverIdent,
 		},
 	}
-	nePort := GetNodeexporterPortName(logserverIdent)
-	desiredLsPodmonitor := monitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, nePort, selector)
+	nePort := sfmonitoring.GetTruncatedPortName(logserverIdent, sfmonitoring.NodeExporterPortNameSuffix)
+	desiredLsPodmonitor := sfmonitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, []string{nePort}, selector)
 	// add annotations so we can handle lifecycle
 	annotations := map[string]string{
 		"version": "1",
@@ -137,7 +137,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
 		"description": "Log server only has at most three days' worth ({{ $value | humanize1024 }}) of free disk available.",
 		"summary":     "Log server running out of disk",
 	}
-	diskFull := monitoring.MkPrometheusAlertRule(
+	diskFull := sfmonitoring.MkPrometheusAlertRule(
 		"OutOfDiskNow",
 		intstr.FromString(
 			"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+
@@ -147,7 +147,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
 		diskFullLabels,
 		diskFullAnnotations,
 	)
-	diskFullIn3days := monitoring.MkPrometheusAlertRule(
+	diskFullIn3days := sfmonitoring.MkPrometheusAlertRule(
 		"OutOfDiskInThreeDays",
 		intstr.FromString(
 			"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+
@@ -158,10 +158,10 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
 		map[string]string{},
 		diskFull3daysAnnotations,
 	)
-	lsDiskRuleGroup := monitoring.MkPrometheusRuleGroup(
+	lsDiskRuleGroup := sfmonitoring.MkPrometheusRuleGroup(
 		"disk.rules",
 		[]monitoringv1.Rule{diskFull, diskFullIn3days})
-	desiredLsPromRule := monitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
+	desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
 	desiredLsPromRule.Spec.Groups = append(desiredLsPromRule.Spec.Groups, lsDiskRuleGroup)
 
 	// add annotations so we can handle lifecycle
@@ -345,7 +345,7 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus {
 		},
 	}
 
-	statsExporter := createNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter)
+	statsExporter := sfmonitoring.MkNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter)
 	dep.Spec.Template.Spec.Containers = append(dep.Spec.Template.Spec.Containers, statsExporter)
 
 	// Increase serial each time you need to enforce a deployment change/pod restart between operator versions
@@ -378,7 +378,8 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus {
 	sshdService := base.MkService(sshdPortName, r.ns, logserverIdent, sshdServicePorts, sshdPortName)
 	r.GetOrCreate(&sshdService)
 
-	r.getOrCreateNodeExporterSideCarService(logserverIdent)
+	nodeExporterSidecarService := sfmonitoring.MkNodeExporterSideCarService(logserverIdent, r.ns)
+	r.GetOrCreate(&nodeExporterSidecarService)
 
 	pvcReadiness := r.reconcileExpandPVC(logserverIdent, r.cr.Spec.Settings.Storage)
 
diff --git a/controllers/nodepool.go b/controllers/nodepool.go
index 8f60f51a..3d617c8d 100644
--- a/controllers/nodepool.go
+++ b/controllers/nodepool.go
@@ -6,11 +6,15 @@ package controllers
 
 import (
 	_ "embed"
+	"strconv"
 
+	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	v1 "github.com/softwarefactory-project/sf-operator/api/v1"
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/conds"
+	"github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring"
 	"github.com/softwarefactory-project/sf-operator/controllers/libs/utils"
+
 	appsv1 "k8s.io/api/apps/v1"
 	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -28,14 +32,21 @@ var dibAnsibleWrapper string
 //go:embed static/nodepool/ssh_config
 var builderSSHConfig string
 
-const launcherIdent = "nodepool-launcher"
+//go:embed static/nodepool/statsd_mapping.yaml
+var nodepoolStatsdMappingConfig string
+
+const nodepoolIdent = "nodepool"
+const LauncherIdent = nodepoolIdent + "-launcher"
+const shortIdent = "np"
 const launcherPortName = "nlwebapp"
 const launcherPort = 8006
 
 const NodepoolProvidersSecretsName = "nodepool-providers-secrets"
-const nodepoolLauncherImage = "quay.io/software-factory/" + launcherIdent + ":9.0.0-3"
+const nodepoolLauncherImage = "quay.io/software-factory/" + LauncherIdent + ":9.0.0-3"
 
-const builderIdent = "nodepool-builder"
-const nodepoolBuilderImage = "quay.io/software-factory/" + builderIdent + ":9.0.0-3"
+const BuilderIdent = nodepoolIdent + "-builder"
+const nodepoolBuilderImage = "quay.io/software-factory/" + BuilderIdent + ":9.0.0-3"
+
+var nodepoolStatsdExporterPortName = monitoring.GetStatsdExporterPort(shortIdent)
 
 var configScriptVolumeMount = apiv1.VolumeMount{
 	Name:      "nodepool-tooling-vol",
@@ -66,17 +77,24 @@ func (r *SFController) commonToolingVolume() apiv1.Volume {
 }
 
 func (r *SFController) getNodepoolConfigEnvs() []apiv1.EnvVar {
+	nodepoolEnvVars := []apiv1.EnvVar{}
 	if r.isConfigRepoSet() {
-		return []apiv1.EnvVar{
+		nodepoolEnvVars = append(nodepoolEnvVars,
 			base.MkEnvVar("CONFIG_REPO_SET", "TRUE"),
 			base.MkEnvVar("CONFIG_REPO_BASE_URL", r.cr.Spec.ConfigLocation.BaseURL),
 			base.MkEnvVar("CONFIG_REPO_NAME", r.cr.Spec.ConfigLocation.Name),
-		}
+		)
 	} else {
-		return []apiv1.EnvVar{
+		nodepoolEnvVars = append(nodepoolEnvVars,
 			base.MkEnvVar("CONFIG_REPO_SET", "FALSE"),
-		}
+		)
 	}
+	nodepoolEnvVars = append(nodepoolEnvVars,
+		base.MkEnvVar("HOME", "/var/lib/nodepool"),
+		base.MkEnvVar("STATSD_HOST", "localhost"),
+		base.MkEnvVar("STATSD_PORT", strconv.Itoa(int(monitoring.StatsdExporterPortListen))),
+	)
+	return nodepoolEnvVars
 }
 
 func mkLoggingTemplate(logLevel v1.LogLevel) (string, error) {
@@ -92,7 +110,44 @@ func mkLoggingTemplate(logLevel v1.LogLevel) (string, error) {
 	return loggingConfig, err
 }
 
-func (r *SFController) DeployNodepoolBuilder() bool {
+func (r *SFController) EnsureNodepoolPodMonitor() bool {
+	selector := metav1.LabelSelector{
+		MatchExpressions: []metav1.LabelSelectorRequirement{
+			{
+				Key:      "run",
+				Operator: metav1.LabelSelectorOpIn,
+				Values:   []string{LauncherIdent, BuilderIdent},
+			},
+			{
+				Key:      "app",
+				Operator: metav1.LabelSelectorOpIn,
+				Values:   []string{"sf"},
+			},
+		},
+	}
+	desiredNodepoolMonitor := monitoring.MkPodMonitor("nodepool-monitor", r.ns, []string{nodepoolStatsdExporterPortName}, selector)
+	// add annotations so we can handle lifecycle
+	annotations := map[string]string{
+		"version": "1",
+	}
+	desiredNodepoolMonitor.ObjectMeta.Annotations = annotations
+	currentNPM := monitoringv1.PodMonitor{}
+	if !r.GetM(desiredNodepoolMonitor.Name, &currentNPM) {
+		r.CreateR(&desiredNodepoolMonitor)
+		return false
+	} else {
+		if !utils.MapEquals(&currentNPM.ObjectMeta.Annotations, &annotations) {
+			r.log.V(1).Info("Nodepool PodMonitor configuration changed, updating...")
+			currentNPM.Spec = desiredNodepoolMonitor.Spec
+			currentNPM.ObjectMeta.Annotations = annotations
+			r.UpdateR(&currentNPM)
+			return false
+		}
+	}
+	return true
+}
+
+func (r *SFController) DeployNodepoolBuilder(statsdExporterVolume apiv1.Volume) bool {
 
 	r.EnsureSSHKeySecret("nodepool-builder-ssh-key")
 
@@ -105,6 +160,12 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 	r.EnsureConfigMap("nodepool-builder-extra-config", builderExtraConfigData)
 
 	var mod int32 = 256 // decimal for 0400 octal
+	// get statsd relay if defined
+	var relayAddress *string
+	if r.cr.Spec.Nodepool.StatsdTarget != "" {
+		relayAddress = &r.cr.Spec.Nodepool.StatsdTarget
+	}
+
 	volumes := []apiv1.Volume{
 		base.MkVolumeSecret("zookeeper-client-tls"),
 		base.MkEmptyDirVolume("nodepool-config"),
@@ -122,6 +183,7 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 		},
 		base.MkVolumeCM("nodepool-builder-extra-config-vol",
 			"nodepool-builder-extra-config-config-map"),
+		statsdExporterVolume,
 	}
 
 	volumeMount := []apiv1.VolumeMount{
@@ -135,7 +197,7 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 			MountPath: "/etc/nodepool",
 		},
 		{
-			Name:      builderIdent,
+			Name:      BuilderIdent,
 			MountPath: "/var/lib/nodepool",
 		},
 		{
@@ -182,7 +244,6 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 	initContainer.Command = []string{"bash", "-c", "mkdir -p ~/dib; /usr/local/bin/generate-config.sh"}
 	initContainer.Env = append(r.getNodepoolConfigEnvs(),
-		base.MkEnvVar("HOME", "/var/lib/nodepool"),
 		base.MkEnvVar("NODEPOOL_CONFIG_FILE", "nodepool-builder.yaml"),
 	)
 	initContainer.VolumeMounts = []apiv1.VolumeMount{
@@ -191,7 +252,7 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 			MountPath: "/etc/nodepool/",
 		},
 		{
-			Name:      builderIdent,
+			Name:      BuilderIdent,
 			MountPath: "/var/lib/nodepool",
 		},
 		configScriptVolumeMount,
@@ -199,7 +260,7 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 
 	replicas := int32(1)
 	nb := r.mkStatefulSet(
-		builderIdent, nodepoolBuilderImage, r.getStorageConfOrDefault(r.cr.Spec.Nodepool.Builder.Storage),
+		BuilderIdent, nodepoolBuilderImage, r.getStorageConfOrDefault(r.cr.Spec.Nodepool.Builder.Storage),
 		replicas, apiv1.ReadWriteOnce)
 
 	nb.Spec.Template.ObjectMeta.Annotations = annotations
@@ -208,11 +269,14 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 	nb.Spec.Template.Spec.Containers[0].Command = []string{"/usr/local/bin/dumb-init", "--",
 		"/usr/local/bin/nodepool-builder", "-f", "-l", "/etc/nodepool-logging/logging.yaml"}
 	nb.Spec.Template.Spec.Containers[0].VolumeMounts = volumeMount
-	nb.Spec.Template.Spec.Containers[0].Env = append(r.getNodepoolConfigEnvs(),
-		base.MkEnvVar("HOME", "/var/lib/nodepool"))
+	nb.Spec.Template.Spec.Containers[0].Env = r.getNodepoolConfigEnvs()
+	// Append statsd exporter sidecar
+	nb.Spec.Template.Spec.Containers = append(nb.Spec.Template.Spec.Containers,
+		monitoring.MkStatsdExporterSideCarContainer(shortIdent, "statsd-config", relayAddress),
+	)
 
 	current := appsv1.StatefulSet{}
-	if r.GetM(builderIdent, &current) {
+	if r.GetM(BuilderIdent, &current) {
 		if !utils.MapEquals(&current.Spec.Template.ObjectMeta.Annotations, &annotations) {
 			r.log.V(1).Info("Nodepool-builder configuration changed, rollout pods ...")
 			current.Spec = nb.DeepCopy().Spec
@@ -226,17 +290,23 @@ func (r *SFController) DeployNodepoolBuilder() bool {
 
 	var isReady = r.IsStatefulSetReady(&current)
 
-	conds.UpdateConditions(&r.cr.Status.Conditions, builderIdent, isReady)
+	conds.UpdateConditions(&r.cr.Status.Conditions, BuilderIdent, isReady)
 
 	return isReady
 }
 
-func (r *SFController) DeployNodepoolLauncher() bool {
+func (r *SFController) DeployNodepoolLauncher(statsdExporterVolume apiv1.Volume) bool {
 
 	r.setNodepoolTooling()
 
 	loggingConfig, _ := mkLoggingTemplate(r.cr.Spec.Nodepool.Launcher.LogLevel)
 
+	// get statsd relay if defined
+	var relayAddress *string
+	if r.cr.Spec.Nodepool.StatsdTarget != "" {
+		relayAddress = &r.cr.Spec.Nodepool.StatsdTarget
+	}
+
 	launcherExtraConfigData := make(map[string]string)
 	launcherExtraConfigData["logging.yaml"] = loggingConfig
 	r.EnsureConfigMap("nodepool-launcher-extra-config", launcherExtraConfigData)
@@ -249,6 +319,7 @@ func (r *SFController) DeployNodepoolLauncher() bool {
 		r.commonToolingVolume(),
 		base.MkVolumeCM("nodepool-launcher-extra-config-vol",
"nodepool-launcher-extra-config-config-map"), + statsdExporterVolume, } volumeMount := []apiv1.VolumeMount{ @@ -325,14 +396,12 @@ func (r *SFController) DeployNodepoolLauncher() bool { container.VolumeMounts = volumeMount container.Command = []string{"/usr/local/bin/dumb-init", "--", "/usr/local/bin/nodepool-launcher", "-f", "-l", "/etc/nodepool-logging/logging.yaml"} - container.Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool")) + container.Env = r.getNodepoolConfigEnvs() initContainer := base.MkContainer("nodepool-launcher-init", BusyboxImage) initContainer.Command = []string{"/usr/local/bin/generate-config.sh"} - initContainer.Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool")) + initContainer.Env = r.getNodepoolConfigEnvs() initContainer.VolumeMounts = []apiv1.VolumeMount{ { Name: "nodepool-config", @@ -347,7 +416,9 @@ func (r *SFController) DeployNodepoolLauncher() bool { nl.Spec.Template.Spec.Volumes = volumes nl.Spec.Template.Spec.InitContainers = []apiv1.Container{initContainer} - nl.Spec.Template.Spec.Containers = []apiv1.Container{container} + nl.Spec.Template.Spec.Containers = []apiv1.Container{ + container, + monitoring.MkStatsdExporterSideCarContainer(shortIdent, "statsd-config", relayAddress)} nl.Spec.Template.ObjectMeta.Annotations = annotations nl.Spec.Template.Spec.Containers[0].ReadinessProbe = base.MkReadinessHTTPProbe("/ready", launcherPort) nl.Spec.Template.Spec.Containers[0].LivenessProbe = base.MkLiveHTTPProbe("/ready", launcherPort) @@ -357,7 +428,7 @@ func (r *SFController) DeployNodepoolLauncher() bool { } current := appsv1.Deployment{} - if r.GetM(launcherIdent, ¤t) { + if r.GetM(LauncherIdent, ¤t) { if !utils.MapEquals(¤t.Spec.Template.ObjectMeta.Annotations, &annotations) { r.log.V(1).Info("Nodepool-launcher configuration changed, rollout pods ...") current.Spec = nl.DeepCopy().Spec @@ -369,14 +440,31 @@ func (r *SFController) DeployNodepoolLauncher() bool { r.CreateR(¤t) } - srv := base.MkService(launcherIdent, r.ns, launcherIdent, []int32{launcherPort}, launcherIdent) + srv := base.MkService(LauncherIdent, r.ns, LauncherIdent, []int32{launcherPort}, LauncherIdent) r.GetOrCreate(&srv) - routeReady := r.ensureHTTPSRoute(r.cr.Name+"-nodepool-launcher", "nodepool", launcherIdent, "/", + routeReady := r.ensureHTTPSRoute(r.cr.Name+"-nodepool-launcher", "nodepool", LauncherIdent, "/", launcherPort, map[string]string{}, r.cr.Spec.FQDN, r.cr.Spec.LetsEncrypt) isDeploymentReady := r.IsDeploymentReady(¤t) - conds.UpdateConditions(&r.cr.Status.Conditions, launcherIdent, isDeploymentReady) + conds.UpdateConditions(&r.cr.Status.Conditions, LauncherIdent, isDeploymentReady) return isDeploymentReady && routeReady } + +func (r *SFController) DeployNodepool() map[string]bool { + + // create statsd exporter config map + r.EnsureConfigMap("np-statsd", map[string]string{ + monitoring.StatsdExporterConfigFile: nodepoolStatsdMappingConfig, + }) + statsdVolume := base.MkVolumeCM("statsd-config", "np-statsd-config-map") + + // Ensure monitoring + r.EnsureNodepoolPodMonitor() + + deployments := make(map[string]bool) + deployments[LauncherIdent] = r.DeployNodepoolLauncher(statsdVolume) + deployments[BuilderIdent] = r.DeployNodepoolBuilder(statsdVolume) + return deployments +} diff --git a/controllers/softwarefactory_controller.go b/controllers/softwarefactory_controller.go index 3404b2b4..5eccf0c7 100644 --- a/controllers/softwarefactory_controller.go +++ b/controllers/softwarefactory_controller.go @@ -185,8 +185,9 @@ 
diff --git a/controllers/softwarefactory_controller.go b/controllers/softwarefactory_controller.go
index 3404b2b4..5eccf0c7 100644
--- a/controllers/softwarefactory_controller.go
+++ b/controllers/softwarefactory_controller.go
@@ -185,8 +185,9 @@ func (r *SFController) Step() sfv1.SoftwareFactoryStatus {
 	services["Logserver"] = r.DeployLogserverResource()
 
 	if services["Zookeeper"] {
-		services["NodePoolLauncher"] = r.DeployNodepoolLauncher()
-		services["NodePoolBuilder"] = r.DeployNodepoolBuilder()
+		nodepool := r.DeployNodepool()
+		services["NodePoolLauncher"] = nodepool[LauncherIdent]
+		services["NodePoolBuilder"] = nodepool[BuilderIdent]
 	}
 
 	if services["Zuul"] {
diff --git a/controllers/static/nodepool/statsd_mapping.yaml b/controllers/static/nodepool/statsd_mapping.yaml
new file mode 100644
index 00000000..a2aeeac9
--- /dev/null
+++ b/controllers/static/nodepool/statsd_mapping.yaml
@@ -0,0 +1,124 @@
+# TODO Parsed manually from nodepool's source code. Look for calls of
+# recordLaunchStats, updateNodeStats, updateProviderLimits and updateTenantLimits.
+
+mappings:
+
+# recordLaunchStats
+  - match: nodepool.launch.provider.*.ready
+    name: nodepool_launch_provider_ready
+    help: launch success counter per provider
+    labels:
+      provider: "$1"
+
+  - match: nodepool.launch.provider.*.error.*
+    name: nodepool_launch_provider_error
+    help: launch error counter per provider and error type
+    labels:
+      provider: "$1"
+      error: "$2"
+
+  - match: nodepool.launch.provider.*.*.ready
+    name: nodepool_launch_provider_az_ready
+    help: launch success counter per provider and az
+    labels:
+      provider: "$1"
+      az: "$2"
+
+  - match: nodepool.launch.requestor.*.ready
+    name: nodepool_launch_requestor_ready
+    help: launch success counter per requestor
+    labels:
+      requestor: "$1"
+
+  - match: nodepool.launch.ready
+    name: nodepool_launch_ready
+    help: launch success counter
+
+  - match: nodepool.launch.provider.*.*.error.*
+    name: nodepool_launch_provider_az_error
+    help: launch error counter per provider, az and error type
+    labels:
+      provider: "$1"
+      az: "$2"
+      error: "$3"
+
+  - match: nodepool.launch.requestor.*.error.*
+    name: nodepool_launch_requestor_error
+    help: launch error counter per requestor and error type
+    labels:
+      requestor: "$1"
+      error: "$2"
+
+  - match: nodepool.launch.error.*
+    name: nodepool_launch_error
+    help: launch error counter per error type
+    labels:
+      error: "$1"
+
+# updateNodeStats
+  - match: nodepool.nodes.*
+    name: nodepool_nodes_state
+    labels:
+      state: "$1"
+
+  - match: nodepool.provider.*.nodes.*
+    name: nodepool_provider_nodes_state
+    labels:
+      provider: "$1"
+      state: "$2"
+
+  - match: nodepool.label.*.nodes.*
+    name: nodepool_label_nodes_state
+    labels:
+      label: "$1"
+      state: "$2"
+
+# updateProviderLimits
+  - match: nodepool.provider.*.max_servers
+    name: nodepool_provider_max_servers
+    labels:
+      provider: "$1"
+
+  - match: nodepool.tenant_limits.*.*
+    name: nodepool_tenant_limits
+    labels:
+      tenant: "$1"
+      resource: "$2"
+
+# nodepool/builder.py
+  - match: nodepool.image_build_requests
+    name: nodepool_image_build_requests
+
+  - match: nodepool.dib_image_build.*.status.duration
+    name: nodepool_dib_image_build_status_duration
+    labels:
+      name: "$1"
+
+  - match: nodepool.dib_image_build.*.status.rc
+    name: nodepool_dib_image_build_status_rc
+    labels:
+      name: "$1"
+
+  - match: nodepool.dib_image_build.*.status.last_build
+    name: nodepool_dib_image_build_status_last_build
+    labels:
+      name: "$1"
+
+  - match: nodepool.dib_image_build.*.*.size
+    name: nodepool_dib_image_build_size
+    labels:
+      name: "$1"
+      image_type: "$2"
+
+  - match: nodepool.image_update.*.*
+    name: nodepool_image_update
+    labels:
+      name: "$1"
+      provider: "$2"
+
+  # Drop all non-matching metrics to avoid spamming prometheus with
+  # eventually unmatched metrics.
+  - match: .
+    match_type: regex
+    action: drop
+    name: "dropped"
\ No newline at end of file
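To make the mapping semantics concrete, an illustrative example (hypothetical
metric values, not part of the patch):

    # A counter increment emitted by the launcher as the statsd datagram
    #   nodepool.launch.provider.default.ready:1|c
    # is exposed on the exporter's /metrics endpoint as
    #   nodepool_launch_provider_ready{provider="default"} 1
    # whereas any metric falling through to the final regex rule is dropped
    # rather than exported as an unlabeled catch-all.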
diff --git a/controllers/static/zuul/statsd_mapping.yaml b/controllers/static/zuul/statsd_mapping.yaml
new file mode 100644
index 00000000..7bad2150
--- /dev/null
+++ b/controllers/static/zuul/statsd_mapping.yaml
@@ -0,0 +1,44 @@
+# TODO This is not nearly complete, maybe an existing mapping of
+# https://zuul-ci.org/docs/zuul/latest/monitoring.html#metrics exists somewhere?
+
+mappings:
+  - match: zuul.nodepool.requests.*
+    name: zuul_nodepool_requests
+    help: Zuul requests and responses from Nodepool
+    labels:
+      state: "$1"
+
+  - match: zuul.nodepool.requests.*.label.*
+    name: zuul_nodepool_requests_state_by_label
+    help: Zuul requests and responses from Nodepool
+    labels:
+      state: "$1"
+      label: "$2"
+
+  - match: zuul.tenant.*.management_events
+    name: zuul_tenant_management_events
+    help: Size of the tenant's management event queue
+    labels:
+      tenant: "$1"
+
+  - match: zuul.tenant.*.pipeline.*.current_changes
+    name: zuul_tenant_pipeline_current_changes
+    help: Number of items currently being processed by this pipeline
+    labels:
+      tenant: "$1"
+      pipeline: "$2"
+
+  - match: zuul.executors.online
+    name: zuul_executors_online
+    help: Number of Zuul executor processes online
+
+  - match: zuul.executors.accepting
+    name: zuul_executors_accepting
+    help: Number of Zuul executor processes accepting new jobs
+
+  # Drop all non-matching metrics to avoid spamming prometheus with
+  # eventually unmatched metrics.
+  - match: .
+    match_type: regex
+    action: drop
+    name: "dropped"
\ No newline at end of file
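Taken together with the zuul.conf change below, the intended metrics path for
the scheduler is (an assumed summary of this patch, not prescriptive):

    # zuul-scheduler --UDP statsd--> localhost:9125 (statsd-exporter sidecar)
    # statsd-exporter --HTTP /metrics on :9102--> Prometheus (via the PodMonitor)
    # and, if spec.zuul.scheduler.statsdTarget is set, the sidecar also relays
    # the raw statsd datagrams to that address (--statsd.relay.address).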
diff --git a/controllers/static/zuul/zuul.conf b/controllers/static/zuul/zuul.conf
index 5c239cca..6f496da8 100644
--- a/controllers/static/zuul/zuul.conf
+++ b/controllers/static/zuul/zuul.conf
@@ -23,4 +23,7 @@ untrusted_ro_paths=/etc/pki
 driver=HS256
 allow_authz_override=true
 issuer_id=zuul-admin
-client_id=zuul-client
\ No newline at end of file
+client_id=zuul-client
+
+[statsd]
+server=localhost
diff --git a/controllers/volumestats_exporter.go b/controllers/volumestats_exporter.go
deleted file mode 100644
index c1517885..00000000
--- a/controllers/volumestats_exporter.go
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (C) 2023 Red Hat
-// SPDX-License-Identifier: Apache-2.0
-//
-// This package contains the node_exporter setup.
-// This is meant as a side-car container for other services
-// that do not expose metrics natively for Prometheus.
-
-package controllers
-
-import (
-	"math"
-
-	"github.com/softwarefactory-project/sf-operator/controllers/libs/base"
-	apiv1 "k8s.io/api/core/v1"
-)
-
-const nameSuffix = "-nodeexporter"
-const portNameSuffix = "-ne"
-const port = 9100
-
-const NodeExporterImage = "quay.io/prometheus/node-exporter:latest"
-
-func GetNodeexporterPortName(serviceName string) string {
-	// Port name is limited to 15 chars
-	var length = float64(len(serviceName))
-	var upper = int(math.Min(12, length))
-	var exporterPortName = serviceName[:upper] + portNameSuffix
-	return exporterPortName
-}
-
-// Fun fact: arrays cannot be consts, so we define our args in this function.
-func getNodeExporterArgs(volumeMounts []apiv1.VolumeMount) []string {
-	var excludePaths = "^(/etc/hosts|/etc/hostname|/etc/passwd|/etc/resolv.conf|/run/.containerenv|/run/secrets|/dev|/proc|/sys)($|/)"
-	return []string{
-		"--collector.disable-defaults",
-		"--collector.filesystem",
-		"--collector.filesystem.mount-points-exclude=" + excludePaths,
-	}
-}
-
-func createNodeExporterSideCarContainer(serviceName string, volumeMounts []apiv1.VolumeMount) apiv1.Container {
-	container := base.MkContainer(serviceName+nameSuffix, NodeExporterImage)
-	container.Args = getNodeExporterArgs(volumeMounts)
-	container.Ports = []apiv1.ContainerPort{
-		base.MkContainerPort(port, GetNodeexporterPortName(serviceName)),
-	}
-	container.VolumeMounts = volumeMounts
-	return container
-}
-
-func (r *SFUtilContext) getOrCreateNodeExporterSideCarService(serviceName string) {
-	var portName = GetNodeexporterPortName(serviceName)
-	servicePorts := []int32{port}
-	neService := base.MkService(serviceName+portNameSuffix, r.ns, serviceName, servicePorts, portName)
-	r.GetOrCreate(&neService)
-}
diff --git a/controllers/zuul.go b/controllers/zuul.go
index 2f4ab68a..e8720810 100644
--- a/controllers/zuul.go
+++ b/controllers/zuul.go
@@ -32,9 +32,14 @@ const zuulExecutorPort = 7900
 const zuulPrometheusPort = 9090
 const zuulPrometheusPortName = "zuul-metrics"
 
+var zuulStatsdExporterPortName = monitoring.GetStatsdExporterPort("zuul")
+
 //go:embed static/zuul/zuul.conf
 var zuulDotconf string
 
+//go:embed static/zuul/statsd_mapping.yaml
+var zuulStatsdMappingConfig string
+
 //go:embed static/zuul/generate-tenant-config.sh
 var zuulGenerateTenantConfig string
 
@@ -120,6 +125,7 @@ func mkZuulVolumes(service string) []apiv1.Volume {
 				},
 			},
 		},
+		base.MkVolumeCM("statsd-config", "zuul-statsd-config-map"),
 	}
 	if !isStatefulset(service) {
 		// statefulset already has a PV for the service-name,
@@ -177,6 +183,7 @@ func (r *SFController) EnsureZuulScheduler(initContainers []apiv1.Container, cfg
 	sections := utils.IniGetSectionNamesByPrefix(cfg, "connection")
 	authSections := utils.IniGetSectionNamesByPrefix(cfg, "auth")
 	sections = append(sections, authSections...)
+	// TODO add statsd section in followup patch
 	sections = append(sections, "scheduler")
 
 	annotations := map[string]string{
@@ -192,9 +199,19 @@ func (r *SFController) EnsureZuulScheduler(initContainers []apiv1.Container, cfg
 			r.cr.Spec.ConfigLocation.Name
 	}
 
+	var relayAddress *string
+	if r.cr.Spec.Zuul.Scheduler.StatsdTarget != "" {
+		relayAddress = &r.cr.Spec.Zuul.Scheduler.StatsdTarget
+	}
+
+	zuulContainers := r.mkZuulContainer("zuul-scheduler")
+	statsdSidecar := monitoring.MkStatsdExporterSideCarContainer("zuul", "statsd-config", relayAddress)
+
+	zuulContainers = append(zuulContainers, statsdSidecar)
+
 	var setAdditionalContainers = func(sts *appsv1.StatefulSet) {
 		sts.Spec.Template.Spec.InitContainers = append(initContainers, r.mkInitSchedulerConfigContainer())
-		sts.Spec.Template.Spec.Containers = r.mkZuulContainer("zuul-scheduler")
+		sts.Spec.Template.Spec.Containers = zuulContainers
 	}
 
 	schedulerToolingData := make(map[string]string)
@@ -356,10 +373,10 @@ func (r *SFController) EnsureZuulPodMonitor() bool {
 			},
 		},
 	}
-	desiredZuulPodMonitor := monitoring.MkPodMonitor("zuul-monitor", r.ns, zuulPrometheusPortName, selector)
+	desiredZuulPodMonitor := monitoring.MkPodMonitor("zuul-monitor", r.ns, []string{zuulPrometheusPortName, zuulStatsdExporterPortName}, selector)
 	// add annotations so we can handle lifecycle
 	annotations := map[string]string{
-		"version": "1",
+		"version": "2",
 	}
 	desiredZuulPodMonitor.ObjectMeta.Annotations = annotations
 	currentZPM := monitoringv1.PodMonitor{}
@@ -506,6 +523,11 @@ func (r *SFController) DeployZuul() bool {
 		return false
 	}
 
+	// create statsd exporter config map
+	r.EnsureConfigMap("zuul-statsd", map[string]string{
+		monitoring.StatsdExporterConfigFile: zuulStatsdMappingConfig,
+	})
+
 	// Update base config to add connections
 	cfgINI := LoadConfigINI(zuulDotconf)
 	for _, conn := range r.cr.Spec.Zuul.GerritConns {
@@ -557,6 +579,8 @@ func (r *SFController) DeployZuul() bool {
 	}
 	cfgINI.Section("auth zuul_client").NewKey("secret", string(cliAuthSecret))
 	cfgINI.Section("auth zuul_client").NewKey("realm", "zuul."+r.cr.Spec.FQDN)
+	// Configure statsd common config
+	cfgINI.Section("statsd").NewKey("port", strconv.Itoa(int(monitoring.StatsdExporterPortListen)))
 
 	r.EnsureZuulConfigSecret(cfgINI)
 	r.EnsureZuulComponentsFrontServices()
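With the port list extended, the zuul-monitor PodMonitor now scrapes both the
native Prometheus endpoint and the statsd-exporter sidecar; its endpoints
section would look roughly like this sketch:

    spec:
      podMetricsEndpoints:
        - port: zuul-metrics    # Zuul's built-in Prometheus exporter, port 9090
        - port: zuul-see        # statsd-exporter scrape endpoint, port 9102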
diff --git a/roles/health-check/test-monitoring/tasks/main.yaml b/roles/health-check/test-monitoring/tasks/main.yaml
index b794c2d1..1f088d07 100644
--- a/roles/health-check/test-monitoring/tasks/main.yaml
+++ b/roles/health-check/test-monitoring/tasks/main.yaml
@@ -29,3 +29,29 @@
     loop:
       - "OutOfDiskNow"
       - "OutOfDiskInThreeDays"
+
+- name: Fetch a basic Zuul metric exported by statsd
+  ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/query?query=zuul_executors_online | jq '.data.result[0].value[1]'
+  register: zeo
+
+# config-update-nodepool-builder should at least trigger one tick for readiness.
+- name: Fetch a basic Nodepool metric exported by statsd
+  ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/query?query=nodepool_launch_ready | jq '.data.result[0].value[1]'
+  register: nlr
+
+# Use a dictionary to trick Ansible into casting metrics to int/float
+- set_fact:
+    metric_yaml:
+      zeo: "{{ zeo.stdout_lines[-1][1:-1] | int }}"
+      nlr: "{{ nlr.stdout_lines[-1][1:-1] | float }}"
+
+- set_fact:
+    metric_dict: "{{ metric_yaml | from_yaml }}"
+
+- fail:
+    msg: Unexpected value for zuul_executors_online
+  when: metric_dict.zeo | int < 1
+
+- fail:
+    msg: Unexpected value for nodepool_launch_ready
+  when: metric_dict.nlr | float < 1
\ No newline at end of file