From bf8c279a074bcdd34bbe9849124062a3a65b6cb0 Mon Sep 17 00:00:00 2001 From: Matthieu Huin Date: Thu, 12 Oct 2023 17:20:52 +0200 Subject: [PATCH] Fix type issue in clouds to statsd mapping func, add test and documentation Change-Id: I7b99d0447f8feef151de91d8caa3df170ce7b400 --- controllers/libs/monitoring/monitoring.go | 9 ++-- doc/deployment/monitoring.md | 31 +++++++------ doc/deployment/nodepool.md | 2 +- .../test-monitoring/files/clouds.yaml | 31 +++++++++++++ .../test-monitoring/tasks/main.yaml | 43 +++++++++++++++++++ 5 files changed, 99 insertions(+), 17 deletions(-) create mode 100644 roles/health-check/test-monitoring/files/clouds.yaml diff --git a/controllers/libs/monitoring/monitoring.go b/controllers/libs/monitoring/monitoring.go index 64e8ae4a..3df0622a 100644 --- a/controllers/libs/monitoring/monitoring.go +++ b/controllers/libs/monitoring/monitoring.go @@ -198,9 +198,9 @@ func MkStatsdMappingsFromCloudsYaml(extraMappings []StatsdMetricMapping, cloudsY if globalMetricsConf, ok := cloudsYaml["metrics"]; ok { gmc := globalMetricsConf.(map[string]interface{}) if globalStatsdConf, ok := gmc["statsd"]; ok { - gsc := globalStatsdConf.(map[string]string) + gsc := globalStatsdConf.(map[string]interface{}) if prefix, ok := gsc["prefix"]; ok { - globalPrefix = prefix + globalPrefix = prefix.(string) } } } @@ -211,8 +211,9 @@ func MkStatsdMappingsFromCloudsYaml(extraMappings []StatsdMetricMapping, cloudsY if metricsConf, ok := cC["metrics"]; ok { mC := metricsConf.(map[string]interface{}) if statsdConf, ok := mC["statsd"]; ok { - sC := statsdConf.(map[string]string) - if prefix, ok := sC["prefix"]; ok { + sC := statsdConf.(map[string]interface{}) + if prefx, ok := sC["prefix"]; ok { + prefix := prefx.(string) var extraMapping = StatsdMetricMapping{ Name: strings.Replace(prefix, ".", "_", -1), ProviderName: cloudName, diff --git a/doc/deployment/monitoring.md b/doc/deployment/monitoring.md index 138f3a4c..624de2cf 100644 --- a/doc/deployment/monitoring.md +++ 
b/doc/deployment/monitoring.md @@ -82,18 +82,25 @@ or configure Prometheus to forward alerts to an external AlertManager instance. The following alerting rules are created automatically at deployment time: -| Alert name | Severity | Service | Prometheus Group Rule | Description | -|---------|------|------|--------|------------------| -| `OutOfDiskNow` | critical | Log server | disk_default.rules | The Log server has less than 10% free storage space left | -| `OutOfDiskInThreeDays` | warning | Log server | disk_default.rules | Assuming a linear trend, the Log server's storage space will fill up in less than three days | -| `ConfigUpdateFailureInPostPipeline` | critical | Zuul | config-repository_default.rules | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services | -| `NotEnoughExecutors` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case some jobs are waiting for an available executor to run on | -| `NotEnoughMergers` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case some merge jobs are waiting for an available merger to run on | -| `NotEnoughTestNodes` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case Nodepool could not fulfill node requests | -| `DIBImageBuildFailure` | warning | nodepool-builder | builder_default.rules | the disk-image-builder service (DIB) failed to build an image | -| `HighOpenStackAPIError5xxRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of API calls on an OpenStack provider return a status code of 5xx (server-side error) over a period of 15 minutes | -| `HighFailedStateRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of nodes on a provider are in failed state over a 
period of one hour | -| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour | +| Alert name | Severity | Description | +|---------|------|------------------| +| `OutOfDiskNow` | critical | The Log server has less than 10% free storage space left | +| `OutOfDiskInThreeDays` | warning | Assuming a linear trend, the Log server's storage space will fill up in less than three days | +| `ConfigUpdateFailureInPostPipeline` | critical | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services | +| `NotEnoughExecutors` | warning | Lack of resources is throttling performance in the last hour; in that case some jobs are waiting for an available executor to run on | +| `NotEnoughMergers` | warning | Lack of resources is throttling performance in the last hour; in that case some merge jobs are waiting for an available merger to run on | +| `NotEnoughTestNodes` | warning | Lack of resources is throttling performance in the last hour; in that case Nodepool could not fulfill node requests | +| `DIBImageBuildFailure` | warning | the disk-image-builder service (DIB) failed to build an image | +| `HighFailedStateRate` | critical | Triggers when more than 5% of nodes on a provider are in failed state over a period of one hour | +| `HighNodeLaunchErrorRate` | critical | Triggers when more than 5% of node launch events end in an error state over a period of one hour | +| `HighOpenStackAPIError5xxRate` | critical | Triggers when more than 5% of API calls on OpenStack return a status code of 5xx (server-side error) over a period of 15 minutes | + +If [statsd metrics prefixes are set](https://docs.openstack.org/openstacksdk/latest/user/guides/stats.html) for clouds defined in Nodepool's `clouds.yaml`, SF-Operator will also create the following alert +for each cloud 
with a set prefix: + +| Alert name | Severity | Description | +|---------|------|------------------| +| `HighOpenStackAPIError5xxRate_<cloud name>` | critical | Triggers when more than 5% of API calls on cloud `<cloud name>` return a status code of 5xx (server-side error) over a period of 15 minutes | Note that these alerts are generic and might not be relevant to your deployment's specificities. For instance, it may be normal to hit the `NotEnoughTestNodes` alert if resource quotas are in place diff --git a/doc/deployment/nodepool.md b/doc/deployment/nodepool.md index 643560ff..87330842 100644 --- a/doc/deployment/nodepool.md +++ b/doc/deployment/nodepool.md @@ -60,7 +60,7 @@ nodepool: 2. Run sfconfig: ```sh -./tools/sfconfig nodepool-providers-secrets --upload +./tools/sfconfig nodepool-providers-secrets --update ``` 3. Wait until your deployment becomes ready again: diff --git a/roles/health-check/test-monitoring/files/clouds.yaml b/roles/health-check/test-monitoring/files/clouds.yaml new file mode 100644 index 00000000..0681b0ec --- /dev/null +++ b/roles/health-check/test-monitoring/files/clouds.yaml @@ -0,0 +1,31 @@ +cache: + expiration: + server: 5 + port: 5 + floating-ip: 5 +metrics: + statsd: + prefix: nodepool.openstack +clouds: + nimbus: + api_timeout: 60 + auth: + username: user1 + password: password1 + auth_url: https://keystone.nimbus/v2.0 + project_name: my-project + image_format: 'raw' + metrics: + statsd: + prefix: nodepool.openstack.nimbus + cumulus: + api_timeout: 60 + auth: + username: user2 + password: password2 + auth_url: https://keystone.cumulus/v2.0 + project_name: my-other-project + image_format: 'raw' + metrics: + statsd: + prefix: nodepool.openstack.cumulus \ No newline at end of file diff --git a/roles/health-check/test-monitoring/tasks/main.yaml b/roles/health-check/test-monitoring/tasks/main.yaml index add351f9..924cb1e0 100644 --- a/roles/health-check/test-monitoring/tasks/main.yaml +++ b/roles/health-check/test-monitoring/tasks/main.yaml @@ -56,3 +56,46 @@ - 
fail: msg: Unexpected value for nodepool_launch_ready when: metric_dict.nlr | float < 1 + +- name: set a fake clouds.yaml configuration + ansible.builtin.copy: + src: clouds.yaml + dest: /tmp/test-clouds.yaml + become: true + +- name: configure clouds.yaml in sfconfig.yaml + ansible.builtin.lineinfile: + path: "{{ zuul.project.src_dir }}/sfconfig.yaml" + regexp: '^ clouds_file:' + line: " clouds_file: /tmp/test-clouds.yaml" + +- name: Upload clouds secrets to nodepool + command: "tools/sfconfig nodepool-providers-secrets --update" + args: + chdir: "{{ zuul.project.src_dir }}" + +- name: Wait for secrets to be updated + ansible.builtin.include_role: + name: "roles/health-check/check-sf-resource-ready" + +- name: Ensure statsd mapping config has custom mappings from clouds.yaml + ansible.builtin.command: kubectl get configmap np-statsd-config-map -o jsonpath='{.data}' + register: statsd_mappings + +- name: Ensure statsd mapping config has custom mappings from clouds.yaml + fail: + msg: "statsd mapping configuration has no reference to clouds config" + when: ("nodepool.openstack.nimbus" not in statsd_mappings.stdout) or + ("nodepool.openstack.cumulus" not in statsd_mappings.stdout) or + ("nodepool.openstack.*.*.*.*" not in statsd_mappings.stdout) + +- name: Fetch OpenStack API alerts + ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "providersAPI_default.rules") | .rules[] | select(.name == "{{ item }}") | .health' + register: logserver_alert + until: "\"ok\" in logserver_alert.stdout" + loop: + - "HighOpenStackAPIError5xxRate" + - "HighOpenStackAPIError5xxRate_nimbus" + - "HighOpenStackAPIError5xxRate_cumulus" + retries: 6 + delay: 10 \ No newline at end of file