Skip to content

Commit

Permalink
Merge pull request #416 from vshn/ha-alerts
Browse files Browse the repository at this point in the history
Ha alerts
  • Loading branch information
zugao authored Jul 19, 2024
2 parents fb081d8 + f67057c commit e581877
Show file tree
Hide file tree
Showing 32 changed files with 800 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .githooks/pre-commit
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ defaults_file = os.path.join(current_dir, '../../component/class/defaults.yml')
keys_to_check_yaml = [
'parameters.appcat.services.vshn.keycloak.additionalInputs.registry_password',
'parameters.appcat.services.vshn.keycloak.additionalInputs.registry_username',
'parameters.appcat.services.vshn.emailAlerting.smtpPassword',
'parameters.appcat.services.emailAlerting.smtpPassword',
]

with open(tests_vshn_file, 'r') as f:
Expand Down
19 changes: 9 additions & 10 deletions component/class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -415,22 +415,21 @@ parameters:
services:
# Used for deploying jobs
controlNamespace: "syn-appcat-control"
# Works only for VSHN services for now
emailAlerting:
enabled: false
smtpHost: "smtp.eu.mailgun.org:465"
smtpUsername: [email protected]
smtpPassword: "?{vaultkv:__shared__/__shared__/mailgun/smtp_password}"
smtpFromAddress: [email protected]
secretNamespace: syn-appcat
secretName: mailgun-smtp-credentials
vshn:
enabled: false
externalDatabaseConnectionsEnabled: "false"
e2eTests: false
quotasEnabled: ${appcat:quotasEnabled}
secretNamespace: ${crossplane:namespace}
emailAlerting:
enabled: false
smtpHost: "smtp.eu.mailgun.org:465"
smtpUsername: [email protected]
smtpPassword: "?{vaultkv:__shared__/__shared__/mailgun/smtp_password}"
smtpFromAddress: [email protected]
secretNamespace: syn-appcat
secretName: mailgun-smtp-credentials
stsResizer:
enabled: true
postgres:
billing: true
# bucket_region: 'lpg' || 'ch-gva-2'
Expand Down
91 changes: 87 additions & 4 deletions component/component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,93 @@ local mockOrgInfo = kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', '
},
};

local emailSecret = kube.Secret(params.services.vshn.emailAlerting.secretName) {
local emailSecret = kube.Secret(params.services.emailAlerting.secretName) {
metadata+: {
namespace: params.services.vshn.emailAlerting.secretNamespace,
namespace: params.services.emailAlerting.secretNamespace,
},
stringData: {
password: params.services.vshn.emailAlerting.smtpPassword,
password: params.services.emailAlerting.smtpPassword,
},
};

// Maps a service name to the token used inside its instance namespace names:
// 'postgres' becomes 'postgresql', every other name is returned unchanged.
local filterName(name) =
  if name == 'postgres' then 'postgresql' else name;

// '|'-separated alternation of the mapped names of all services returned by
// common.FilterServiceByBoolean('enabled') (presumably the services whose
// `enabled` flag is true -- confirm in common.libsonnet). The result is
// spliced into the `vshn-(...)-.*` namespace matchers of the alert rules.
local jobRegex = std.foldl(
  function(acc, svc) (
    local mapped = filterName(svc.name);
    // An empty accumulator means nothing has been appended yet, so no separator.
    if acc == '' then mapped else acc + '|' + mapped
  ),
  common.FilterServiceByBoolean('enabled'),
  ''
);

// PrometheusRule 'appcat-backup': a single warning alert for failed backup
// jobs in the instance namespaces of the services listed in jobRegex.
local backupPrometheusRule = {
  // Selects failed jobs whose name contains "backup" and whose namespace
  // matches vshn-<service>-*; the alert fires while the value is above zero.
  local failedBackupJobs =
    'kube_job_failed{job_name=~".*backup.*", namespace=~"vshn-(%s)-.*"} > 0' % jobRegex,

  apiVersion: 'monitoring.coreos.com/v1',
  kind: 'PrometheusRule',
  metadata: {
    name: 'appcat-backup',
    namespace: params.namespace,
  },
  spec: {
    groups: [
      {
        name: 'appcat-backup',
        rules: [
          {
            alert: 'AppCatBackupJobError',
            expr: failedBackupJobs,
            // The condition has to hold for one minute before the alert fires.
            'for': '1m',
            labels: {
              severity: 'warning',
              syn_team: 'schedar',
            },
            annotations: {
              summary: 'AppCat service backup failed.',
              description: 'The backup job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed.',
              runbook_url: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/AppCatBackupJobError.html',
            },
          },
        ],
      },
    ],
  },
};

// PrometheusRule 'appcat-ha': warning alerts for Deployments and StatefulSets
// in service instance namespaces that run more than one replica but have at
// least one replica that is not ready.
local haPrometheusRule = {
  // Label matcher restricting a metric to namespaces named vshn-<service>-*
  // for the services listed in jobRegex.
  local instanceNamespaces = '{namespace=~"vshn-(' + jobRegex + ')-.*"}',

  // Builds the alert expression for one workload kind ('deployment' or
  // 'statefulset'): more than one replica AND (replicas - ready replicas) > 0.
  local notFullyReady(workload) =
    local replicas = 'kube_' + workload + '_status_replicas' + instanceNamespaces;
    local ready = 'kube_' + workload + '_status_replicas_ready' + instanceNamespaces;
    replicas + ' > 1 AND ' + replicas + ' - ' + ready + ' > 0',

  // Labels attached to both alerts.
  local alertLabels = {
    severity: 'warning',
    syn_team: 'schedar',
  },

  apiVersion: 'monitoring.coreos.com/v1',
  kind: 'PrometheusRule',
  metadata: {
    name: 'appcat-ha',
    namespace: params.namespace,
  },
  spec: {
    groups: [
      {
        name: 'appcat-ha',
        rules: [
          {
            alert: 'AppCatHighAvailableDeploymentWarning',
            expr: notFullyReady('deployment'),
            // The condition has to hold for one minute before the alert fires.
            'for': '1m',
            labels: alertLabels,
            annotations: {
              summary: 'AppCat service instance has unavailable pods.',
              description: 'The deployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has less replicas than expected.',
              runbook_url: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableDeploymentWarning.html',
            },
          },
          {
            alert: 'AppCatHighAvailableStatefulsetWarning',
            expr: notFullyReady('statefulset'),
            'for': '1m',
            labels: alertLabels,
            annotations: {
              summary: 'AppCat service instance has unavailable pods.',
              description: 'The statefulset {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has less replicas than expected.',
              runbook_url: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableStatefulsetWarning.html',
            },
          },
        ],
      },
    ],
  },
};

Expand All @@ -201,7 +282,9 @@ local emailSecret = kube.Secret(params.services.vshn.emailAlerting.secretName) {
'10_clusterrole_services_read': readServices,
'10_appcat_namespace': ns,
'10_appcat_legacy_billing_recording_rule': legacyBillingRule,
'10_appcat_backup_monitoring': backupPrometheusRule,
'10_appcat_ha_monitoring': haPrometheusRule,
[if params.billing.vshn.meteringRules then '10_appcat_metering_recording_rule']: meteringRule,
[if params.services.vshn.enabled && params.services.vshn.emailAlerting.enabled then '10_mailgun_secret']: emailSecret,
[if params.services.vshn.enabled && params.services.emailAlerting.enabled then '10_mailgun_secret']: emailSecret,
[if params.billing.enableMockOrgInfo then '10_mock_org_info']: mockOrgInfo,
}
2 changes: 1 addition & 1 deletion component/component/vshn_appcat_services.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ local vshn_appcat_service(name, serviceParams) =
ownerKind: xrd.spec.names.kind,
ownerGroup: xrd.spec.group,
ownerVersion: xrd.spec.versions[0].name,
} + common.EmailAlerting(params.services.vshn.emailAlerting)
} + common.EmailAlerting(params.services.emailAlerting)
+ restoreSA
+ additonalInputs
+ proxyFunction,
Expand Down
2 changes: 1 addition & 1 deletion component/component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ local composition =
isOpenshift: std.toString(isOpenshift),
sliNamespace: params.slos.namespace,
} + std.get(pgParams, 'additionalInputs', default={}, inc_hidden=true)
+ common.EmailAlerting(params.services.vshn.emailAlerting)
+ common.EmailAlerting(params.services.emailAlerting)
+ if pgParams.proxyFunction then {
proxyEndpoint: pgParams.grpcEndpoint,
} else {},
Expand Down
2 changes: 1 addition & 1 deletion component/component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ local composition =
ownerVersion: xrd.spec.versions[0].name,
isOpenshift: std.toString(isOpenshift),
sliNamespace: params.slos.namespace,
} + common.EmailAlerting(params.services.vshn.emailAlerting)
} + common.EmailAlerting(params.services.emailAlerting)
+ if redisParams.proxyFunction then {
proxyEndpoint: redisParams.grpcEndpoint,
} else {},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-backup
namespace: syn-appcat
spec:
groups:
- name: appcat-backup
rules:
- alert: AppCatBackupJobError
annotations:
description: The backup job {{ $labels.job_name }} in namespace {{ $labels.namespace
}} has failed.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/AppCatBackupJobError.html
summary: AppCat service backup failed.
expr: kube_job_failed{job_name=~".*backup.*", namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-ha
namespace: syn-appcat
spec:
groups:
- name: appcat-ha
rules:
- alert: AppCatHighAvailableDeploymentWarning
annotations:
description: The deployment {{ $labels.deployment }} in namespace {{ $labels.namespace
}} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableDeploymentWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_deployment_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
- alert: AppCatHighAvailableStatefulsetWarning
annotations:
description: The statefulset {{ $labels.statefulset }} in namespace {{
$labels.namespace }} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableStatefulsetWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_statefulset_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-backup
namespace: syn-appcat
spec:
groups:
- name: appcat-backup
rules:
- alert: AppCatBackupJobError
annotations:
description: The backup job {{ $labels.job_name }} in namespace {{ $labels.namespace
}} has failed.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/AppCatBackupJobError.html
summary: AppCat service backup failed.
expr: kube_job_failed{job_name=~".*backup.*", namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-ha
namespace: syn-appcat
spec:
groups:
- name: appcat-ha
rules:
- alert: AppCatHighAvailableDeploymentWarning
annotations:
description: The deployment {{ $labels.deployment }} in namespace {{ $labels.namespace
}} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableDeploymentWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_deployment_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
- alert: AppCatHighAvailableStatefulsetWarning
annotations:
description: The statefulset {{ $labels.statefulset }} in namespace {{
$labels.namespace }} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableStatefulsetWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_statefulset_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-backup
namespace: syn-appcat
spec:
groups:
- name: appcat-backup
rules:
- alert: AppCatBackupJobError
annotations:
description: The backup job {{ $labels.job_name }} in namespace {{ $labels.namespace
}} has failed.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/AppCatBackupJobError.html
summary: AppCat service backup failed.
expr: kube_job_failed{job_name=~".*backup.*", namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-ha
namespace: syn-appcat
spec:
groups:
- name: appcat-ha
rules:
- alert: AppCatHighAvailableDeploymentWarning
annotations:
description: The deployment {{ $labels.deployment }} in namespace {{ $labels.namespace
}} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableDeploymentWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_deployment_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_deployment_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
- alert: AppCatHighAvailableStatefulsetWarning
annotations:
description: The statefulset {{ $labels.statefulset }} in namespace {{
$labels.namespace }} has less replicas than expected.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/vshn/AppCatHighAvailableStatefulsetWarning.html
summary: AppCat service instance has unavailable pods.
expr: kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
> 1 AND kube_statefulset_status_replicas{namespace=~"vshn-(postgresql|redis)-.*"}
- kube_statefulset_status_replicas_ready{namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: appcat-backup
namespace: syn-appcat
spec:
groups:
- name: appcat-backup
rules:
- alert: AppCatBackupJobError
annotations:
description: The backup job {{ $labels.job_name }} in namespace {{ $labels.namespace
}} has failed.
runbook_url: https://kb.vshn.ch/app-catalog/how-tos/appcat/AppCatBackupJobError.html
summary: AppCat service backup failed.
expr: kube_job_failed{job_name=~".*backup.*", namespace=~"vshn-(postgresql|redis)-.*"}
> 0
for: 1m
labels:
severity: warning
syn_team: schedar
Loading

0 comments on commit e581877

Please sign in to comment.