From bf8c279a074bcdd34bbe9849124062a3a65b6cb0 Mon Sep 17 00:00:00 2001
From: Matthieu Huin <mhuin@redhat.com>
Date: Thu, 12 Oct 2023 17:20:52 +0200
Subject: [PATCH] Fix type issue in clouds to statsd mapping func, add test and
 documentation

Change-Id: I7b99d0447f8feef151de91d8caa3df170ce7b400
---
 controllers/libs/monitoring/monitoring.go     |  9 ++--
 doc/deployment/monitoring.md                  | 31 +++++++------
 doc/deployment/nodepool.md                    |  2 +-
 .../test-monitoring/files/clouds.yaml         | 31 +++++++++++++
 .../test-monitoring/tasks/main.yaml           | 43 +++++++++++++++++++
 5 files changed, 99 insertions(+), 17 deletions(-)
 create mode 100644 roles/health-check/test-monitoring/files/clouds.yaml

diff --git a/controllers/libs/monitoring/monitoring.go b/controllers/libs/monitoring/monitoring.go
index 64e8ae4a..3df0622a 100644
--- a/controllers/libs/monitoring/monitoring.go
+++ b/controllers/libs/monitoring/monitoring.go
@@ -198,9 +198,9 @@ func MkStatsdMappingsFromCloudsYaml(extraMappings []StatsdMetricMapping, cloudsY
 	if globalMetricsConf, ok := cloudsYaml["metrics"]; ok {
 		gmc := globalMetricsConf.(map[string]interface{})
 		if globalStatsdConf, ok := gmc["statsd"]; ok {
-			gsc := globalStatsdConf.(map[string]string)
+			gsc := globalStatsdConf.(map[string]interface{})
 			if prefix, ok := gsc["prefix"]; ok {
-				globalPrefix = prefix
+				globalPrefix = prefix.(string)
 			}
 		}
 	}
@@ -211,8 +211,9 @@ func MkStatsdMappingsFromCloudsYaml(extraMappings []StatsdMetricMapping, cloudsY
 			if metricsConf, ok := cC["metrics"]; ok {
 				mC := metricsConf.(map[string]interface{})
 				if statsdConf, ok := mC["statsd"]; ok {
-					sC := statsdConf.(map[string]string)
-					if prefix, ok := sC["prefix"]; ok {
+					sC := statsdConf.(map[string]interface{})
+					if prefx, ok := sC["prefix"]; ok {
+						prefix := prefx.(string)
 						var extraMapping = StatsdMetricMapping{
 							Name:         strings.Replace(prefix, ".", "_", -1),
 							ProviderName: cloudName,
diff --git a/doc/deployment/monitoring.md b/doc/deployment/monitoring.md
index 138f3a4c..624de2cf 100644
--- a/doc/deployment/monitoring.md
+++ b/doc/deployment/monitoring.md
@@ -82,18 +82,25 @@ or configure Prometheus to forward alerts to an external AlertManager instance.
 
 The following alerting rules are created automatically at deployment time:
 
-| Alert name | Severity | Service | Prometheus Group Rule | Description |
-|---------|------|------|--------|------------------|
-| `OutOfDiskNow` | critical | Log server | disk_default.rules | The Log server has less than 10% free storage space left |
-| `OutOfDiskInThreeDays` | warning | Log server | disk_default.rules | Assuming a linear trend, the Log server's storage space will fill up in less than three days |
-| `ConfigUpdateFailureInPostPipeline` | critical | Zuul | config-repository_default.rules | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services |
-| `NotEnoughExecutors` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case some jobs are waiting for an available executor to run on |
-| `NotEnoughMergers` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case some merge jobs are waiting for an available merger to run on |
-| `NotEnoughTestNodes` | warning | Zuul | zuul_default.rules | Lack of resources is throttling performance in the last hour; in that case Nodepool could not fulfill node requests |
-| `DIBImageBuildFailure` | warning | nodepool-builder | builder_default.rules | the disk-image-builder service (DIB) failed to build an image |
-| `HighOpenStackAPIError5xxRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of API calls on an OpenStack provider return a status code of 5xx (server-side error) over a period of 15 minutes |
-| `HighFailedStateRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of nodes on a provider are in failed state over a period of one hour |
-| `HighNodeLaunchErrorRate` | critical | nodepool-launcher | launcher_default.rules | Triggers when more than 5% of node launch events end in an error state over a period of one hour |
+| Alert name | Severity  | Description |
+|---------|------|------------------|
+| `OutOfDiskNow` | critical | The Log server has less than 10% free storage space left |
+| `OutOfDiskInThreeDays` | warning | Assuming a linear trend, the Log server's storage space will fill up in less than three days |
+| `ConfigUpdateFailureInPostPipeline` | critical | A `config-update` job failed in the `post` pipeline, meaning a configuration change was not applied properly to the Software Factory deployment's services |
+| `NotEnoughExecutors` | warning | Lack of resources is throttling performance in the last hour; in that case some jobs are waiting for an available executor to run on |
+| `NotEnoughMergers` | warning | Lack of resources is throttling performance in the last hour; in that case some merge jobs are waiting for an available merger to run on |
+| `NotEnoughTestNodes` | warning | Lack of resources is throttling performance in the last hour; in that case Nodepool could not fulfill node requests |
+| `DIBImageBuildFailure` | warning | The disk-image-builder service (DIB) failed to build an image |
+| `HighFailedStateRate` | critical | Triggers when more than 5% of nodes on a provider are in failed state over a period of one hour |
+| `HighNodeLaunchErrorRate` | critical | Triggers when more than 5% of node launch events end in an error state over a period of one hour |
+| `HighOpenStackAPIError5xxRate` | critical | Triggers when more than 5% of API calls on OpenStack return a status code of 5xx (server-side error) over a period of 15 minutes |
+
+If [statsd metric prefixes are set](https://docs.openstack.org/openstacksdk/latest/user/guides/stats.html) for clouds defined in Nodepool's `clouds.yaml`, SF-Operator will also create the following alert
+for each cloud that defines a prefix:
+
+| Alert name | Severity  | Description |
+|---------|------|------------------|
+| `HighOpenStackAPIError5xxRate_<CLOUD NAME>` | critical | Triggers when more than 5% of API calls on cloud `<CLOUD NAME>` return a status code of 5xx (server-side error) over a period of 15 minutes |
 
 Note that these alerts are generic and might not be relevant to your deployment's specificities.
 For instance, it may be normal to hit the `NotEnoughTestNodes` alert if resource quotas are in place
diff --git a/doc/deployment/nodepool.md b/doc/deployment/nodepool.md
index 643560ff..87330842 100644
--- a/doc/deployment/nodepool.md
+++ b/doc/deployment/nodepool.md
@@ -60,7 +60,7 @@ nodepool:
 2. Run sfconfig:
 
 ```sh
-./tools/sfconfig nodepool-providers-secrets --upload
+./tools/sfconfig nodepool-providers-secrets --update
 ```
 
 3. Wait until your deployment becomes ready again:
diff --git a/roles/health-check/test-monitoring/files/clouds.yaml b/roles/health-check/test-monitoring/files/clouds.yaml
new file mode 100644
index 00000000..0681b0ec
--- /dev/null
+++ b/roles/health-check/test-monitoring/files/clouds.yaml
@@ -0,0 +1,31 @@
+cache:
+  expiration:
+    server: 5
+    port: 5
+    floating-ip: 5
+metrics:
+  statsd:
+    prefix: nodepool.openstack
+clouds:
+  nimbus:
+    api_timeout: 60
+    auth:
+      username: user1
+      password: password1
+      auth_url: https://keystone.nimbus/v2.0
+      project_name: my-project
+    image_format: 'raw'
+    metrics:
+      statsd:
+        prefix: nodepool.openstack.nimbus
+  cumulus:
+    api_timeout: 60
+    auth:
+      username: user2
+      password: password2
+      auth_url: https://keystone.cumulus/v2.0
+      project_name: my-other-project
+    image_format: 'raw'
+    metrics:
+      statsd:
+        prefix: nodepool.openstack.cumulus
\ No newline at end of file
diff --git a/roles/health-check/test-monitoring/tasks/main.yaml b/roles/health-check/test-monitoring/tasks/main.yaml
index add351f9..924cb1e0 100644
--- a/roles/health-check/test-monitoring/tasks/main.yaml
+++ b/roles/health-check/test-monitoring/tasks/main.yaml
@@ -56,3 +56,46 @@
 - fail:
     msg: Unexpected value for nodepool_launch_ready
   when: metric_dict.nlr | float < 1
+
+- name: set a fake clouds.yaml configuration
+  ansible.builtin.copy:
+    src: clouds.yaml
+    dest: /tmp/test-clouds.yaml
+  become: true
+
+- name: configure clouds.yaml in sfconfig.yaml
+  ansible.builtin.lineinfile:
+    path: "{{ zuul.project.src_dir }}/sfconfig.yaml"
+    regexp: '^  clouds_file:'
+    line: "  clouds_file: /tmp/test-clouds.yaml"
+
+- name: Upload clouds secrets to nodepool
+  command: "tools/sfconfig nodepool-providers-secrets --update"
+  args:
+    chdir: "{{ zuul.project.src_dir }}"
+
+- name: Wait for secrets to be updated
+  ansible.builtin.include_role:
+    name: "roles/health-check/check-sf-resource-ready"
+
+- name: Ensure statsd mapping config has custom mappings from clouds.yaml
+  ansible.builtin.command: kubectl get configmap np-statsd-config-map -o jsonpath='{.data}'
+  register: statsd_mappings
+  
+- name: Ensure statsd mapping config has custom mappings from clouds.yaml
+  fail:
+    msg: "statsd mapping configuration has no reference to clouds config"
+  when: ("nodepool.openstack.nimbus" not in statsd_mappings.stdout) or
+        ("nodepool.openstack.cumulus" not in statsd_mappings.stdout) or
+        ("nodepool.openstack.*.*.*.*" not in statsd_mappings.stdout)
+
+- name: Fetch OpenStack API alerts
+  ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "providersAPI_default.rules") | .rules[] | select(.name == "{{ item }}") | .health'
+  register: logserver_alert
+  until: "\"ok\" in logserver_alert.stdout"
+  loop:
+    - "HighOpenStackAPIError5xxRate"
+    - "HighOpenStackAPIError5xxRate_nimbus"
+    - "HighOpenStackAPIError5xxRate_cumulus"
+  retries: 6
+  delay: 10
\ No newline at end of file