diff --git a/api/v1/softwarefactory_types.go b/api/v1/softwarefactory_types.go index d1a4009a..37488606 100644 --- a/api/v1/softwarefactory_types.go +++ b/api/v1/softwarefactory_types.go @@ -127,6 +127,8 @@ type ZuulExecutorSpec struct { type ZuulSchedulerSpec struct { // Storage-related settings Storage StorageSpec `json:"storage,omitempty"` + // The address to forward statsd metrics to (optional), in the form "host:port" + StatsdTarget string `json:"statsdTarget,omitempty"` } // TODO: make sure to update the GetConnectionsName when adding new connection type. @@ -193,9 +195,12 @@ type NodepoolBuilderSpec struct { } type NodepoolSpec struct { + // Nodepool-launcher related settings Launcher NodepoolLauncherSpec `json:"launcher,omitempty"` // Nodepool-builder related settings Builder NodepoolBuilderSpec `json:"builder,omitempty"` + // The address to forward statsd metrics to (optional), in the form "host:port" + StatsdTarget string `json:"statsdTarget,omitempty"` } type ZookeeperSpec struct { diff --git a/cli/sfconfig/cmd/zuul/zuul.go b/cli/sfconfig/cmd/zuul/zuul.go index da922e82..55250166 100644 --- a/cli/sfconfig/cmd/zuul/zuul.go +++ b/cli/sfconfig/cmd/zuul/zuul.go @@ -72,10 +72,11 @@ administrative actions on a specified tenant.`, buffer := &bytes.Buffer{} errorBuffer := &bytes.Buffer{} request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulSchedulerContainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{ - Command: zuulAdminArgs, - Stdin: false, - Stdout: true, - Stderr: true, + Container: "zuul-scheduler", + Command: zuulAdminArgs, + Stdin: false, + Stdout: true, + Stderr: true, }, scheme.ParameterCodec) exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL()) diff --git a/cli/sfconfig/cmd/zuul_client/zuul_client.go b/cli/sfconfig/cmd/zuul_client/zuul_client.go index c39aa4c4..93f237a2 100644 --- a/cli/sfconfig/cmd/zuul_client/zuul_client.go +++ b/cli/sfconfig/cmd/zuul_client/zuul_client.go @@ -96,10 +96,11 @@ Examples: buf := &bytes.Buffer{} errBuf := &bytes.Buffer{} request := kubeClientSet.CoreV1().RESTClient().Post().Resource("pods").Namespace(namespace).Name(zuulwebcontainer.Name).SubResource("exec").VersionedParams(&v1.PodExecOptions{ - Command: zuulClientArgs, - Stdin: false, - Stdout: true, - Stderr: true, + Container: "zuul-web", + Command: zuulClientArgs, + Stdin: false, + Stdout: true, + Stderr: true, }, scheme.ParameterCodec) exec, _ := remotecommand.NewSPDYExecutor(kubeConfig, "POST", request.URL()) diff --git a/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml b/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml index 834fe4b8..854153d0 100644 --- a/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml +++ b/config/crd/bases/sf.softwarefactory-project.io_softwarefactories.yaml @@ -251,6 +251,7 @@ spec: type: object type: object launcher: + description: Nodepool-launcher related settings properties: logLevel: description: 'Specify the Log Level of the nodepool launcher @@ -262,6 +263,10 @@ spec: - DEBUG type: string type: object + statsdTarget: + description: The address to forward statsd metrics to (optional), + in the form "host:port" + type: string type: object storageClassName: description: Default storage class to use by Persistent Volume Claims @@ -456,6 +461,10 @@ spec: scheduler: description: Configuration of the scheduler microservice properties: + statsdTarget: + description: The address to forward statsd metrics to (optional), 
+ in the form "host:port" + type: string storage: description: Storage-related settings properties: diff --git a/controllers/libs/monitoring/monitoring.go b/controllers/libs/monitoring/monitoring.go index 4481f0af..50d58184 100644 --- a/controllers/libs/monitoring/monitoring.go +++ b/controllers/libs/monitoring/monitoring.go @@ -1,15 +1,133 @@ // Copyright (C) 2023 Red Hat // SPDX-License-Identifier: Apache-2.0 -// Package monitoring provides various utility functions regarding monitoring for the sf-operator +/* +Package monitoring provides various utility functions regarding monitoring for the sf-operator: + +* create prometheus monitors and alert rules +* create nodeexporter sidecar +* create statsdexporter sidecar +*/ package monitoring import ( + "math" + "strconv" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/softwarefactory-project/sf-operator/controllers/libs/base" + apiv1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) +func GetTruncatedPortName(serviceName string, suffix string) string { + // Port name is limited to 15 chars + var length = float64(len(serviceName)) + var maxChars = 15 - float64(len(suffix)) + var upper = int(math.Min(maxChars, length)) + var exporterPortName = serviceName[:upper] + suffix + return exporterPortName +} + +// Node exporter utilities + +const NodeExporterNameSuffix = "-nodeexporter" +const NodeExporterPortNameSuffix = "-ne" +const nodeExporterPort = 9100 + +const NodeExporterImage = "quay.io/prometheus/node-exporter:latest" + +// Fun fact: arrays cannot be consts, so we define our args in this function. +func getNodeExporterArgs(volumeMounts []apiv1.VolumeMount) []string { + var excludePaths = "^(/etc/hosts|/etc/hostname|/etc/passwd|/etc/resolv.conf|/run/.containerenv|/run/secrets|/dev|/proc|/sys)($|/)" + return []string{ + "--collector.disable-defaults", + "--collector.filesystem", + "--collector.filesystem.mount-points-exclude=" + excludePaths, + } +} + +func MkNodeExporterSideCarContainer(serviceName string, volumeMounts []apiv1.VolumeMount) apiv1.Container { + container := base.MkContainer(serviceName+NodeExporterNameSuffix, NodeExporterImage) + container.Args = getNodeExporterArgs(volumeMounts) + container.Ports = []apiv1.ContainerPort{ + base.MkContainerPort(nodeExporterPort, GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix)), + } + container.VolumeMounts = volumeMounts + return container +} + +func MkNodeExporterSideCarService(serviceName string, namespace string) apiv1.Service { + var portName = GetTruncatedPortName(serviceName, NodeExporterPortNameSuffix) + servicePorts := []int32{nodeExporterPort} + neService := base.MkService(serviceName+NodeExporterPortNameSuffix, namespace, serviceName, servicePorts, portName) + return neService + +} + +// Statsd exporter utilities + +const statsdExporterNameSuffix = "-statsd" +const statsdExporterPortNameSuffix = "-se" +const StatsdExporterPortListen = int32(9125) +const statsdExporterPortExpose = int32(9102) +const StatsdExporterConfigFile = "statsd_mapping.yaml" +const statsdExporterImage = "quay.io/prometheus/statsd-exporter:v0.24.0" + +func getStatsdExporterArgs(configPath string, relayAddress *string) []string { + args := []string{ + "--statsd.mapping-config=" + configPath, + "--statsd.listen-udp=:" + strconv.Itoa(int(StatsdExporterPortListen)), + "--web.listen-address=:" + strconv.Itoa(int(statsdExporterPortExpose)), + } + if relayAddress != nil { + args = append(args, 
"--statsd.relay.address="+*relayAddress) + } + return args +} + +func GetStatsdExporterPort(serviceName string) string { + return GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"e") +} + +func MkStatsdExporterSideCarContainer(serviceName string, configVolumeName string, relayAddress *string) apiv1.Container { + var seListenPortName = GetTruncatedPortName(serviceName, statsdExporterPortNameSuffix+"l") + var seExposePortName = GetStatsdExporterPort(serviceName) + var configFile = StatsdExporterConfigFile + var configPath = "/tmp/" + configFile + // var configVolumeName = serviceName + "-statsd-conf" + + volumeMounts := []apiv1.VolumeMount{ + { + Name: configVolumeName, + MountPath: configPath, + SubPath: configFile, + }, + } + args := getStatsdExporterArgs(configPath, relayAddress) + ports := []apiv1.ContainerPort{ + { + Name: seListenPortName, + Protocol: apiv1.ProtocolUDP, + ContainerPort: StatsdExporterPortListen, + }, + { + Name: seExposePortName, + Protocol: apiv1.ProtocolTCP, + ContainerPort: statsdExporterPortExpose, + }, + } + sidecar := base.MkContainer(serviceName+statsdExporterNameSuffix, statsdExporterImage) + sidecar.Args = args + sidecar.VolumeMounts = volumeMounts + sidecar.Ports = ports + + return sidecar +} + +// Prometheus utilities + // ServiceMonitorLabelSelector - TODO this could be a spec parameter. const ServiceMonitorLabelSelector = "sf-monitoring" @@ -34,7 +152,7 @@ func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration str } //lint:ignore U1000 this function will be used in a followup change -func mkServiceMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor { +func mkServiceMonitor(name string, ns string, portName string, selector metav1.LabelSelector) monitoringv1.ServiceMonitor { return monitoringv1.ServiceMonitor{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -47,7 +165,7 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label Endpoints: []monitoringv1.Endpoint{ { Interval: monitoringv1.Duration("30s"), - Port: port, + Port: portName, Scheme: "http", }, }, @@ -56,7 +174,12 @@ func mkServiceMonitor(name string, ns string, port string, selector metav1.Label } } -func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSelector) monitoringv1.PodMonitor { +func MkPodMonitor(name string, ns string, ports []string, selector metav1.LabelSelector) monitoringv1.PodMonitor { + endpoints := []monitoringv1.PodMetricsEndpoint{} + for _, port := range ports { + endpoints = append(endpoints, monitoringv1.PodMetricsEndpoint{Port: port}) + } + return monitoringv1.PodMonitor{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -66,12 +189,8 @@ func MkPodMonitor(name string, ns string, port string, selector metav1.LabelSele }, }, Spec: monitoringv1.PodMonitorSpec{ - Selector: selector, - PodMetricsEndpoints: []monitoringv1.PodMetricsEndpoint{ - { - Port: port, - }, - }, + Selector: selector, + PodMetricsEndpoints: endpoints, }, } } diff --git a/controllers/logserver_controller.go b/controllers/logserver_controller.go index e37a7cd9..47b3ea5b 100644 --- a/controllers/logserver_controller.go +++ b/controllers/logserver_controller.go @@ -28,7 +28,7 @@ import ( "github.com/softwarefactory-project/sf-operator/controllers/libs/base" "github.com/softwarefactory-project/sf-operator/controllers/libs/conds" - "github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring" + sfmonitoring 
"github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring" "github.com/softwarefactory-project/sf-operator/controllers/libs/utils" ) @@ -100,8 +100,8 @@ func (r *LogServerController) ensureLogserverPodMonitor() bool { "run": logserverIdent, }, } - nePort := GetNodeexporterPortName(logserverIdent) - desiredLsPodmonitor := monitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, nePort, selector) + nePort := sfmonitoring.GetTruncatedPortName(logserverIdent, sfmonitoring.NodeExporterPortNameSuffix) + desiredLsPodmonitor := sfmonitoring.MkPodMonitor(logserverIdent+"-monitor", r.ns, []string{nePort}, selector) // add annotations so we can handle lifecycle annotations := map[string]string{ "version": "1", @@ -137,7 +137,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool { "description": "Log server only has at most three days' worth ({{ $value | humanize1024 }}) of free disk available.", "summary": "Log server running out of disk", } - diskFull := monitoring.MkPrometheusAlertRule( + diskFull := sfmonitoring.MkPrometheusAlertRule( "OutOfDiskNow", intstr.FromString( "(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+ @@ -147,7 +147,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool { diskFullLabels, diskFullAnnotations, ) - diskFullIn3days := monitoring.MkPrometheusAlertRule( + diskFullIn3days := sfmonitoring.MkPrometheusAlertRule( "OutOfDiskInThreeDays", intstr.FromString( "(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} * 100 /"+ @@ -158,10 +158,10 @@ func (r *LogServerController) ensureLogserverPromRule() bool { map[string]string{}, diskFull3daysAnnotations, ) - lsDiskRuleGroup := monitoring.MkPrometheusRuleGroup( + lsDiskRuleGroup := sfmonitoring.MkPrometheusRuleGroup( "disk.rules", []monitoringv1.Rule{diskFull, diskFullIn3days}) - desiredLsPromRule := monitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns) + desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns) desiredLsPromRule.Spec.Groups = append(desiredLsPromRule.Spec.Groups, lsDiskRuleGroup) // add annotations so we can handle lifecycle @@ -345,7 +345,7 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus { }, } - statsExporter := createNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter) + statsExporter := sfmonitoring.MkNodeExporterSideCarContainer(logserverIdent, volumeMountsStatsExporter) dep.Spec.Template.Spec.Containers = append(dep.Spec.Template.Spec.Containers, statsExporter) // Increase serial each time you need to enforce a deployment change/pod restart between operator versions @@ -378,7 +378,8 @@ func (r *LogServerController) DeployLogserver() sfv1.LogServerStatus { sshdService := base.MkService(sshdPortName, r.ns, logserverIdent, sshdServicePorts, sshdPortName) r.GetOrCreate(&sshdService) - r.getOrCreateNodeExporterSideCarService(logserverIdent) + nodeExporterSidecarService := sfmonitoring.MkNodeExporterSideCarService(logserverIdent, r.ns) + r.GetOrCreate(&nodeExporterSidecarService) pvcReadiness := r.reconcileExpandPVC(logserverIdent, r.cr.Spec.Settings.Storage) diff --git a/controllers/nodepool.go b/controllers/nodepool.go index 8f60f51a..3d617c8d 100644 --- a/controllers/nodepool.go +++ b/controllers/nodepool.go @@ -6,11 +6,15 @@ package controllers import ( _ "embed" + "strconv" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" v1 "github.com/softwarefactory-project/sf-operator/api/v1" 
"github.com/softwarefactory-project/sf-operator/controllers/libs/base" "github.com/softwarefactory-project/sf-operator/controllers/libs/conds" + "github.com/softwarefactory-project/sf-operator/controllers/libs/monitoring" "github.com/softwarefactory-project/sf-operator/controllers/libs/utils" + appsv1 "k8s.io/api/apps/v1" apiv1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -28,14 +32,21 @@ var dibAnsibleWrapper string //go:embed static/nodepool/ssh_config var builderSSHConfig string -const launcherIdent = "nodepool-launcher" +//go:embed static/nodepool/statsd_mapping.yaml +var nodepoolStatsdMappingConfig string + +const nodepoolIdent = "nodepool" +const LauncherIdent = nodepoolIdent + "-launcher" +const shortIdent = "np" const launcherPortName = "nlwebapp" const launcherPort = 8006 const NodepoolProvidersSecretsName = "nodepool-providers-secrets" -const nodepoolLauncherImage = "quay.io/software-factory/" + launcherIdent + ":9.0.0-3" +const nodepoolLauncherImage = "quay.io/software-factory/" + LauncherIdent + ":9.0.0-3" -const builderIdent = "nodepool-builder" -const nodepoolBuilderImage = "quay.io/software-factory/" + builderIdent + ":9.0.0-3" +const BuilderIdent = nodepoolIdent + "-builder" +const nodepoolBuilderImage = "quay.io/software-factory/" + BuilderIdent + ":9.0.0-3" + +var nodepoolStatsdExporterPortName = monitoring.GetStatsdExporterPort(shortIdent) var configScriptVolumeMount = apiv1.VolumeMount{ Name: "nodepool-tooling-vol", @@ -66,17 +77,24 @@ func (r *SFController) commonToolingVolume() apiv1.Volume { } func (r *SFController) getNodepoolConfigEnvs() []apiv1.EnvVar { + nodepoolEnvVars := []apiv1.EnvVar{} if r.isConfigRepoSet() { - return []apiv1.EnvVar{ + nodepoolEnvVars = append(nodepoolEnvVars, base.MkEnvVar("CONFIG_REPO_SET", "TRUE"), base.MkEnvVar("CONFIG_REPO_BASE_URL", r.cr.Spec.ConfigLocation.BaseURL), base.MkEnvVar("CONFIG_REPO_NAME", r.cr.Spec.ConfigLocation.Name), - } + ) } else { - return []apiv1.EnvVar{ + nodepoolEnvVars = append(nodepoolEnvVars, base.MkEnvVar("CONFIG_REPO_SET", "FALSE"), - } + ) } + nodepoolEnvVars = append(nodepoolEnvVars, + base.MkEnvVar("HOME", "/var/lib/nodepool"), + base.MkEnvVar("STATSD_HOST", "localhost"), + base.MkEnvVar("STATSD_PORT", strconv.Itoa(int(monitoring.StatsdExporterPortListen))), + ) + return nodepoolEnvVars } func mkLoggingTemplate(logLevel v1.LogLevel) (string, error) { @@ -92,7 +110,44 @@ func mkLoggingTemplate(logLevel v1.LogLevel) (string, error) { return loggingConfig, err } -func (r *SFController) DeployNodepoolBuilder() bool { +func (r *SFController) EnsureNodepoolPodMonitor() bool { + selector := metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "run", + Operator: metav1.LabelSelectorOpIn, + Values: []string{LauncherIdent, BuilderIdent}, + }, + { + Key: "app", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"sf"}, + }, + }, + } + desiredNodepoolMonitor := monitoring.MkPodMonitor("nodepool-monitor", r.ns, []string{nodepoolStatsdExporterPortName}, selector) + // add annotations so we can handle lifecycle + annotations := map[string]string{ + "version": "1", + } + desiredNodepoolMonitor.ObjectMeta.Annotations = annotations + currentNPM := monitoringv1.PodMonitor{} + if !r.GetM(desiredNodepoolMonitor.Name, ¤tNPM) { + r.CreateR(&desiredNodepoolMonitor) + return false + } else { + if !utils.MapEquals(¤tNPM.ObjectMeta.Annotations, &annotations) { + r.log.V(1).Info("Nodepool PodMonitor configuration changed, updating...") + currentNPM.Spec = 
desiredNodepoolMonitor.Spec + currentNPM.ObjectMeta.Annotations = annotations + r.UpdateR(¤tNPM) + return false + } + } + return true +} + +func (r *SFController) DeployNodepoolBuilder(statsdExporterVolume apiv1.Volume) bool { r.EnsureSSHKeySecret("nodepool-builder-ssh-key") @@ -105,6 +160,12 @@ func (r *SFController) DeployNodepoolBuilder() bool { r.EnsureConfigMap("nodepool-builder-extra-config", builderExtraConfigData) var mod int32 = 256 // decimal for 0400 octal + // get statsd relay if defined + var relayAddress *string + if r.cr.Spec.Nodepool.StatsdTarget != "" { + relayAddress = &r.cr.Spec.Nodepool.StatsdTarget + } + volumes := []apiv1.Volume{ base.MkVolumeSecret("zookeeper-client-tls"), base.MkEmptyDirVolume("nodepool-config"), @@ -122,6 +183,7 @@ func (r *SFController) DeployNodepoolBuilder() bool { }, base.MkVolumeCM("nodepool-builder-extra-config-vol", "nodepool-builder-extra-config-config-map"), + statsdExporterVolume, } volumeMount := []apiv1.VolumeMount{ @@ -135,7 +197,7 @@ func (r *SFController) DeployNodepoolBuilder() bool { MountPath: "/etc/nodepool", }, { - Name: builderIdent, + Name: BuilderIdent, MountPath: "/var/lib/nodepool", }, { @@ -182,7 +244,6 @@ func (r *SFController) DeployNodepoolBuilder() bool { initContainer.Command = []string{"bash", "-c", "mkdir -p ~/dib; /usr/local/bin/generate-config.sh"} initContainer.Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool"), base.MkEnvVar("NODEPOOL_CONFIG_FILE", "nodepool-builder.yaml"), ) initContainer.VolumeMounts = []apiv1.VolumeMount{ @@ -191,7 +252,7 @@ func (r *SFController) DeployNodepoolBuilder() bool { MountPath: "/etc/nodepool/", }, { - Name: builderIdent, + Name: BuilderIdent, MountPath: "/var/lib/nodepool", }, configScriptVolumeMount, @@ -199,7 +260,7 @@ func (r *SFController) DeployNodepoolBuilder() bool { replicas := int32(1) nb := r.mkStatefulSet( - builderIdent, nodepoolBuilderImage, r.getStorageConfOrDefault(r.cr.Spec.Nodepool.Builder.Storage), + BuilderIdent, nodepoolBuilderImage, r.getStorageConfOrDefault(r.cr.Spec.Nodepool.Builder.Storage), replicas, apiv1.ReadWriteOnce) nb.Spec.Template.ObjectMeta.Annotations = annotations @@ -208,11 +269,14 @@ func (r *SFController) DeployNodepoolBuilder() bool { nb.Spec.Template.Spec.Containers[0].Command = []string{"/usr/local/bin/dumb-init", "--", "/usr/local/bin/nodepool-builder", "-f", "-l", "/etc/nodepool-logging/logging.yaml"} nb.Spec.Template.Spec.Containers[0].VolumeMounts = volumeMount - nb.Spec.Template.Spec.Containers[0].Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool")) + nb.Spec.Template.Spec.Containers[0].Env = r.getNodepoolConfigEnvs() + // Append statsd exporter sidecar + nb.Spec.Template.Spec.Containers = append(nb.Spec.Template.Spec.Containers, + monitoring.MkStatsdExporterSideCarContainer(shortIdent, "statsd-config", relayAddress), + ) current := appsv1.StatefulSet{} - if r.GetM(builderIdent, ¤t) { + if r.GetM(BuilderIdent, ¤t) { if !utils.MapEquals(¤t.Spec.Template.ObjectMeta.Annotations, &annotations) { r.log.V(1).Info("Nodepool-builder configuration changed, rollout pods ...") current.Spec = nb.DeepCopy().Spec @@ -226,17 +290,23 @@ func (r *SFController) DeployNodepoolBuilder() bool { var isReady = r.IsStatefulSetReady(¤t) - conds.UpdateConditions(&r.cr.Status.Conditions, builderIdent, isReady) + conds.UpdateConditions(&r.cr.Status.Conditions, BuilderIdent, isReady) return isReady } -func (r *SFController) DeployNodepoolLauncher() bool { +func (r *SFController) 
DeployNodepoolLauncher(statsdExporterVolume apiv1.Volume) bool { r.setNodepoolTooling() loggingConfig, _ := mkLoggingTemplate(r.cr.Spec.Nodepool.Launcher.LogLevel) + // get statsd relay if defined + var relayAddress *string + if r.cr.Spec.Nodepool.StatsdTarget != "" { + relayAddress = &r.cr.Spec.Nodepool.StatsdTarget + } + launcherExtraConfigData := make(map[string]string) launcherExtraConfigData["logging.yaml"] = loggingConfig r.EnsureConfigMap("nodepool-launcher-extra-config", launcherExtraConfigData) @@ -249,6 +319,7 @@ func (r *SFController) DeployNodepoolLauncher() bool { r.commonToolingVolume(), base.MkVolumeCM("nodepool-launcher-extra-config-vol", "nodepool-launcher-extra-config-config-map"), + statsdExporterVolume, } volumeMount := []apiv1.VolumeMount{ @@ -325,14 +396,12 @@ func (r *SFController) DeployNodepoolLauncher() bool { container.VolumeMounts = volumeMount container.Command = []string{"/usr/local/bin/dumb-init", "--", "/usr/local/bin/nodepool-launcher", "-f", "-l", "/etc/nodepool-logging/logging.yaml"} - container.Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool")) + container.Env = r.getNodepoolConfigEnvs() initContainer := base.MkContainer("nodepool-launcher-init", BusyboxImage) initContainer.Command = []string{"/usr/local/bin/generate-config.sh"} - initContainer.Env = append(r.getNodepoolConfigEnvs(), - base.MkEnvVar("HOME", "/var/lib/nodepool")) + initContainer.Env = r.getNodepoolConfigEnvs() initContainer.VolumeMounts = []apiv1.VolumeMount{ { Name: "nodepool-config", @@ -347,7 +416,9 @@ func (r *SFController) DeployNodepoolLauncher() bool { nl.Spec.Template.Spec.Volumes = volumes nl.Spec.Template.Spec.InitContainers = []apiv1.Container{initContainer} - nl.Spec.Template.Spec.Containers = []apiv1.Container{container} + nl.Spec.Template.Spec.Containers = []apiv1.Container{ + container, + monitoring.MkStatsdExporterSideCarContainer(shortIdent, "statsd-config", relayAddress)} nl.Spec.Template.ObjectMeta.Annotations = annotations nl.Spec.Template.Spec.Containers[0].ReadinessProbe = base.MkReadinessHTTPProbe("/ready", launcherPort) nl.Spec.Template.Spec.Containers[0].LivenessProbe = base.MkLiveHTTPProbe("/ready", launcherPort) @@ -357,7 +428,7 @@ func (r *SFController) DeployNodepoolLauncher() bool { } current := appsv1.Deployment{} - if r.GetM(launcherIdent, ¤t) { + if r.GetM(LauncherIdent, ¤t) { if !utils.MapEquals(¤t.Spec.Template.ObjectMeta.Annotations, &annotations) { r.log.V(1).Info("Nodepool-launcher configuration changed, rollout pods ...") current.Spec = nl.DeepCopy().Spec @@ -369,14 +440,31 @@ func (r *SFController) DeployNodepoolLauncher() bool { r.CreateR(¤t) } - srv := base.MkService(launcherIdent, r.ns, launcherIdent, []int32{launcherPort}, launcherIdent) + srv := base.MkService(LauncherIdent, r.ns, LauncherIdent, []int32{launcherPort}, LauncherIdent) r.GetOrCreate(&srv) - routeReady := r.ensureHTTPSRoute(r.cr.Name+"-nodepool-launcher", "nodepool", launcherIdent, "/", + routeReady := r.ensureHTTPSRoute(r.cr.Name+"-nodepool-launcher", "nodepool", LauncherIdent, "/", launcherPort, map[string]string{}, r.cr.Spec.FQDN, r.cr.Spec.LetsEncrypt) isDeploymentReady := r.IsDeploymentReady(¤t) - conds.UpdateConditions(&r.cr.Status.Conditions, launcherIdent, isDeploymentReady) + conds.UpdateConditions(&r.cr.Status.Conditions, LauncherIdent, isDeploymentReady) return isDeploymentReady && routeReady } + +func (r *SFController) DeployNodepool() map[string]bool { + + // create statsd exporter config map + r.EnsureConfigMap("np-statsd", 
map[string]string{ + monitoring.StatsdExporterConfigFile: nodepoolStatsdMappingConfig, + }) + statsdVolume := base.MkVolumeCM("statsd-config", "np-statsd-config-map") + + // Ensure monitoring + r.EnsureNodepoolPodMonitor() + + deployments := make(map[string]bool) + deployments[LauncherIdent] = r.DeployNodepoolLauncher(statsdVolume) + deployments[BuilderIdent] = r.DeployNodepoolBuilder(statsdVolume) + return deployments +} diff --git a/controllers/softwarefactory_controller.go b/controllers/softwarefactory_controller.go index 3404b2b4..5eccf0c7 100644 --- a/controllers/softwarefactory_controller.go +++ b/controllers/softwarefactory_controller.go @@ -185,8 +185,9 @@ func (r *SFController) Step() sfv1.SoftwareFactoryStatus { services["Logserver"] = r.DeployLogserverResource() if services["Zookeeper"] { - services["NodePoolLauncher"] = r.DeployNodepoolLauncher() - services["NodePoolBuilder"] = r.DeployNodepoolBuilder() + nodepool := r.DeployNodepool() + services["NodePoolLauncher"] = nodepool[LauncherIdent] + services["NodePoolBuilder"] = nodepool[BuilderIdent] } if services["Zuul"] { diff --git a/controllers/static/nodepool/statsd_mapping.yaml b/controllers/static/nodepool/statsd_mapping.yaml new file mode 100644 index 00000000..a2aeeac9 --- /dev/null +++ b/controllers/static/nodepool/statsd_mapping.yaml @@ -0,0 +1,124 @@ +# TODO Parsed manually from nodepool's source code. Look for calls of +# recordLaunchStats, updateNodeStats, updateProviderLimits and updateTenantLimits. + +mappings: + +# recordLaunchStats + - match: nodepool.launch.provider.*.ready + name: nodepool_launch_provider_ready + help: launch success counter per provider + labels: + provider: "$1" + + - match: nodepool.launch.provider.*.error.* + name: nodepool_launch_provider_error + help: launch error counter per provider and error type + labels: + provider: "$1" + error: "$2" + + - match: nodepool.launch.provider.*.*.ready + name: nodepool_launch_provider_az_ready + help: launch success counter per provider and az + labels: + provider: "$1" + az: "$2" + + - match: nodepool.launch.requestor.*.ready + name: nodepool_launch_requestor_ready + help: launch success counter per requestor + labels: + requestor: "$1" + + - match: nodepool.launch.ready + name: nodepool_launch_ready + help: launch success counter + + - match: nodepool.launch.provider.*.*.error.* + name: nodepool_launch_provider_az_error + help: launch error counter per provider, az and error type + labels: + provider: "$1" + az: "$2" + error: "$3" + + - match: nodepool.launch.requestor.*.error.* + name: nodepool_launch_requestor_error + help: launch error counter per requestor and error type + labels: + requestor: "$1" + error: "$2" + + - match: nodepool.launch.error.* + name: nodepool_launch_error + help: launch error counter per error type + labels: + error: "$1" + +# updateNodeStats + - match: nodepool.nodes.* + name: nodepool_nodes_state + labels: + state: "$1" + + - match: nodepool.provider.*.nodes.* + name: nodepool_provider_nodes_state + labels: + provider: "$1" + state: "$2" + + - match: nodepool.label.*.nodes.* + name: nodepool_label_nodes_state + labels: + label: "$1" + state: "$2" + +# updateProviderLimits + - match: nodepool.provider.*.max_servers + name: nodepool_provider_max_servers + labels: + provider: "$1" + + - match: nodepool.tenant_limits.*.* + name: nodepool_tenant_limits + labels: + tenant: "$1" + resource: "$2" + +# nodepool/builder.py + - match: nodepool.image_build_requests + name: nodepool_image_build_requests + + - match: 
nodepool.dib_image_build.*.status.duration + name: nodepool_dib_image_build_status_duration + labels: + name: "$1" + + - match: nodepool.dib_image_build.*.status.rc + name: nodepool_dib_image_build_status_rc + labels: + name: "$1" + + - match: nodepool.dib_image_build.*.status.last_build + name: nodepool_dib_image_build_status_last_build + labels: + name: "$1" + + - match: nodepool.dib_image_build.*.*.size + name: nodepool_dib_image_build_size + labels: + name: "$1" + image_type: "$2" + + - match: nodepool.image_update.*.* + name: nodepool_image_update + labels: + name: "$1" + provider: "$2" + + # Drop all non-matching metrics to avoid spamming prometheus with + # eventually unmatched metrics. + - match: . + match_type: regex + action: drop + name: "dropped" \ No newline at end of file diff --git a/controllers/static/zuul/statsd_mapping.yaml b/controllers/static/zuul/statsd_mapping.yaml new file mode 100644 index 00000000..7bad2150 --- /dev/null +++ b/controllers/static/zuul/statsd_mapping.yaml @@ -0,0 +1,44 @@ +# TODO This is not nearly complete, maybe an existing mapping of +# https://zuul-ci.org/docs/zuul/latest/monitoring.html#metrics exists somewhere? + +mappings: + - match: zuul.nodepool.requests.* + name: zuul_nodepool_requests + help: Zuul requests and responses from Nodepool + labels: + state: "$1" + + - match: zuul.nodepool.requests.*.label.* + name: zuul_nodepool_requests_state_by_label + help: Zuul requests and responses from Nodepool + labels: + state: "$1" + label: "$2" + + - match: zuul.tenant.*.management_events + name: zuul_tenant_management_events + help: Size of the tenant's management event queue + labels: + tenant: "$1" + + - match: zuul.tenant.*.pipeline.*.current_changes + name: zuul_tenant_pipeline_current_changes + help: Number of items currently being processed by this pipeline + labels: + tenant: "$1" + pipeline: "$2" + + - match: zuul.executors.online + name: zuul_executors_online + help: Number of Zuul executor processes online + + - match: zuul.executors.accepting + name: zuul_executors_accepting + help: Number of Zuul executor processes accepting new jobs + + # Drop all non-matching metrics to avoid spamming prometheus with + # eventually unmatched metrics. + - match: . + match_type: regex + action: drop + name: "dropped" \ No newline at end of file diff --git a/controllers/static/zuul/zuul.conf b/controllers/static/zuul/zuul.conf index 5c239cca..6f496da8 100644 --- a/controllers/static/zuul/zuul.conf +++ b/controllers/static/zuul/zuul.conf @@ -23,4 +23,7 @@ untrusted_ro_paths=/etc/pki driver=HS256 allow_authz_override=true issuer_id=zuul-admin -client_id=zuul-client \ No newline at end of file +client_id=zuul-client + +[statsd] +server=localhost diff --git a/controllers/volumestats_exporter.go b/controllers/volumestats_exporter.go deleted file mode 100644 index c1517885..00000000 --- a/controllers/volumestats_exporter.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (C) 2023 Red Hat -// SPDX-License-Identifier: Apache-2.0 -// -// This package contains the node_exporter setup. -// This is meant as a side-car container for other services -// that do not expose metrics natively for Prometheus.
- -package controllers - -import ( - "math" - - "github.com/softwarefactory-project/sf-operator/controllers/libs/base" - apiv1 "k8s.io/api/core/v1" -) - -const nameSuffix = "-nodeexporter" -const portNameSuffix = "-ne" -const port = 9100 - -const NodeExporterImage = "quay.io/prometheus/node-exporter:latest" - -func GetNodeexporterPortName(serviceName string) string { - // Port name is limited to 15 chars - var length = float64(len(serviceName)) - var upper = int(math.Min(12, length)) - var exporterPortName = serviceName[:upper] + portNameSuffix - return exporterPortName -} - -// Fun fact: arrays cannot be consts, so we define our args in this function. -func getNodeExporterArgs(volumeMounts []apiv1.VolumeMount) []string { - var excludePaths = "^(/etc/hosts|/etc/hostname|/etc/passwd|/etc/resolv.conf|/run/.containerenv|/run/secrets|/dev|/proc|/sys)($|/)" - return []string{ - "--collector.disable-defaults", - "--collector.filesystem", - "--collector.filesystem.mount-points-exclude=" + excludePaths, - } -} - -func createNodeExporterSideCarContainer(serviceName string, volumeMounts []apiv1.VolumeMount) apiv1.Container { - container := base.MkContainer(serviceName+nameSuffix, NodeExporterImage) - container.Args = getNodeExporterArgs(volumeMounts) - container.Ports = []apiv1.ContainerPort{ - base.MkContainerPort(port, GetNodeexporterPortName(serviceName)), - } - container.VolumeMounts = volumeMounts - return container -} - -func (r *SFUtilContext) getOrCreateNodeExporterSideCarService(serviceName string) { - var portName = GetNodeexporterPortName(serviceName) - servicePorts := []int32{port} - neService := base.MkService(serviceName+portNameSuffix, r.ns, serviceName, servicePorts, portName) - r.GetOrCreate(&neService) -} diff --git a/controllers/zuul.go b/controllers/zuul.go index 2f4ab68a..e8720810 100644 --- a/controllers/zuul.go +++ b/controllers/zuul.go @@ -32,9 +32,14 @@ const zuulExecutorPort = 7900 const zuulPrometheusPort = 9090 const zuulPrometheusPortName = "zuul-metrics" +var zuulStatsdExporterPortName = monitoring.GetStatsdExporterPort("zuul") + //go:embed static/zuul/zuul.conf var zuulDotconf string +//go:embed static/zuul/statsd_mapping.yaml +var zuulStatsdMappingConfig string + //go:embed static/zuul/generate-tenant-config.sh var zuulGenerateTenantConfig string @@ -120,6 +125,7 @@ func mkZuulVolumes(service string) []apiv1.Volume { }, }, }, + base.MkVolumeCM("statsd-config", "zuul-statsd-config-map"), } if !isStatefulset(service) { // statefulset already has a PV for the service-name, @@ -177,6 +183,7 @@ func (r *SFController) EnsureZuulScheduler(initContainers []apiv1.Container, cfg sections := utils.IniGetSectionNamesByPrefix(cfg, "connection") authSections := utils.IniGetSectionNamesByPrefix(cfg, "auth") sections = append(sections, authSections...) 
+ // TODO add statsd section in followup patch sections = append(sections, "scheduler") annotations := map[string]string{ @@ -192,9 +199,19 @@ r.cr.Spec.ConfigLocation.Name } + var relayAddress *string + if r.cr.Spec.Zuul.Scheduler.StatsdTarget != "" { + relayAddress = &r.cr.Spec.Zuul.Scheduler.StatsdTarget + } + + zuulContainers := r.mkZuulContainer("zuul-scheduler") + statsdSidecar := monitoring.MkStatsdExporterSideCarContainer("zuul", "statsd-config", relayAddress) + + zuulContainers = append(zuulContainers, statsdSidecar) + var setAdditionalContainers = func(sts *appsv1.StatefulSet) { sts.Spec.Template.Spec.InitContainers = append(initContainers, r.mkInitSchedulerConfigContainer()) - sts.Spec.Template.Spec.Containers = r.mkZuulContainer("zuul-scheduler") + sts.Spec.Template.Spec.Containers = zuulContainers } schedulerToolingData := make(map[string]string) @@ -356,10 +373,10 @@ func (r *SFController) EnsureZuulPodMonitor() bool { }, }, } - desiredZuulPodMonitor := monitoring.MkPodMonitor("zuul-monitor", r.ns, zuulPrometheusPortName, selector) + desiredZuulPodMonitor := monitoring.MkPodMonitor("zuul-monitor", r.ns, []string{zuulPrometheusPortName, zuulStatsdExporterPortName}, selector) // add annotations so we can handle lifecycle annotations := map[string]string{ - "version": "1", + "version": "2", } desiredZuulPodMonitor.ObjectMeta.Annotations = annotations currentZPM := monitoringv1.PodMonitor{} @@ -506,6 +523,11 @@ func (r *SFController) DeployZuul() bool { return false } + // create statsd exporter config map + r.EnsureConfigMap("zuul-statsd", map[string]string{ + monitoring.StatsdExporterConfigFile: zuulStatsdMappingConfig, + }) + // Update base config to add connections cfgINI := LoadConfigINI(zuulDotconf) for _, conn := range r.cr.Spec.Zuul.GerritConns { @@ -557,6 +579,8 @@ func (r *SFController) DeployZuul() bool { } cfgINI.Section("auth zuul_client").NewKey("secret", string(cliAuthSecret)) cfgINI.Section("auth zuul_client").NewKey("realm", "zuul."+r.cr.Spec.FQDN) + // Configure statsd common config + cfgINI.Section("statsd").NewKey("port", strconv.Itoa(int(monitoring.StatsdExporterPortListen))) r.EnsureZuulConfigSecret(cfgINI) r.EnsureZuulComponentsFrontServices() diff --git a/roles/health-check/test-monitoring/tasks/main.yaml b/roles/health-check/test-monitoring/tasks/main.yaml index b794c2d1..1f088d07 100644 --- a/roles/health-check/test-monitoring/tasks/main.yaml +++ b/roles/health-check/test-monitoring/tasks/main.yaml @@ -29,3 +29,29 @@ loop: - "OutOfDiskNow" - "OutOfDiskInThreeDays" + +- name: Fetch a basic Zuul metric exported by statsd + ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/query?query=zuul_executors_online | jq '.data.result[0].value[1]' + register: zeo + + # config-update-nodepool-builder should at least trigger one tick for readiness.
+- name: Fetch a basic Nodepool metric exported by statsd + ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/query?query=nodepool_launch_ready | jq '.data.result[0].value[1]' + register: nlr + +# Use a dictionary to trick Ansible into casting metrics to int/float +- set_fact: + metric_yaml: + zeo: "{{ zeo.stdout_lines[-1][1:-1] | int }}" + nlr: "{{ nlr.stdout_lines[-1][1:-1] | float }}" + +- set_fact: + metric_dict: "{{ metric_yaml | from_yaml }}" + +- fail: + msg: Unexpected value for zuul_executors_online + when: metric_dict.zeo | int < 1 + +- fail: + msg: Unexpected value for nodepool_launch_ready + when: metric_dict.nlr | float < 1 \ No newline at end of file