From 421ac2ae48ca3c2de6905e581b0051aedda95217 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Mon, 24 Jan 2022 21:25:05 +0100 Subject: [PATCH 1/2] Add `MaxSurge` and `MaxUnavailable` strategy to all loki k8 workloads. This fixes a couple of issues. 1. By default these settings are 25% in k8s, meaning that during a rollout 25% of the pods are allowed to shut down immediately. 2. Due to (1), during the graceful shutdown process, 25% of all the pods access Consul to `unregister()` from the shared key-value store. (2) makes the CAS rate of the underlying KV store high (leading to lots of retries and failures), sometimes failing to unregister and leaving the ring "unhealthy". Also, this PR makes these configs consistent across all k8s workloads. More details: https://github.com/grafana/dskit/issues/117 --- production/ksonnet/loki/boltdb_shipper.libsonnet | 4 +++- production/ksonnet/loki/distributor.libsonnet | 4 +++- production/ksonnet/loki/gateway.libsonnet | 4 +++- production/ksonnet/loki/index-gateway.libsonnet | 4 +++- production/ksonnet/loki/ingester.libsonnet | 4 +++- production/ksonnet/loki/querier.libsonnet | 8 ++++++-- production/ksonnet/loki/query-frontend.libsonnet | 4 +++- production/ksonnet/loki/query-scheduler.libsonnet | 4 +++- production/ksonnet/loki/ruler.libsonnet | 8 ++++++-- 9 files changed, 33 insertions(+), 11 deletions(-) diff --git a/production/ksonnet/loki/boltdb_shipper.libsonnet b/production/ksonnet/loki/boltdb_shipper.libsonnet index b34f566589144..c1cb9f00a7085 100644 --- a/production/ksonnet/loki/boltdb_shipper.libsonnet +++ b/production/ksonnet/loki/boltdb_shipper.libsonnet @@ -70,6 +70,8 @@ k.util.configVolumeMount('loki', '/etc/loki/config') + k.util.configVolumeMount('overrides', '/etc/loki/overrides') + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the
group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, } diff --git a/production/ksonnet/loki/distributor.libsonnet b/production/ksonnet/loki/distributor.libsonnet index 7fd299c737c58..9633381ed9311 100644 --- a/production/ksonnet/loki/distributor.libsonnet +++ b/production/ksonnet/loki/distributor.libsonnet @@ -29,7 +29,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), distributor_service: k.util.serviceFor($.distributor_deployment), diff --git a/production/ksonnet/loki/gateway.libsonnet b/production/ksonnet/loki/gateway.libsonnet index 36ffc51f1f055..e99628e2fe7f8 100644 --- a/production/ksonnet/loki/gateway.libsonnet +++ b/production/ksonnet/loki/gateway.libsonnet @@ -98,7 +98,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; }) + k.util.configVolumeMount('gateway-config', '/etc/nginx') + k.util.secretVolumeMount('gateway-secret', '/etc/nginx/secrets', defaultMode=420) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), gateway_service: k.util.serviceFor($.gateway_deployment), diff --git a/production/ksonnet/loki/index-gateway.libsonnet b/production/ksonnet/loki/index-gateway.libsonnet index 6fafa525743de..b7e74707b6bac 100644 --- a/production/ksonnet/loki/index-gateway.libsonnet +++ b/production/ksonnet/loki/index-gateway.libsonnet @@ -57,7 +57,9 @@ $.config_hash_mixin + k.util.configVolumeMount('loki', '/etc/loki/config') + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - 
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, index_gateway_service: if $._config.use_index_gateway then diff --git a/production/ksonnet/loki/ingester.libsonnet b/production/ksonnet/loki/ingester.libsonnet index 8e72019b5a92c..d3bfc84fd239b 100644 --- a/production/ksonnet/loki/ingester.libsonnet +++ b/production/ksonnet/loki/ingester.libsonnet @@ -69,7 +69,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, ingester_service: diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 1bb5c0f66d904..80350d598c1c0 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -34,7 +34,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, // PVC for queriers when running as statefulsets @@ -57,7 +59,9 @@ local k = import 
'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, querier_service: diff --git a/production/ksonnet/loki/query-frontend.libsonnet b/production/ksonnet/loki/query-frontend.libsonnet index d495ff2ab5929..404de05bce234 100644 --- a/production/ksonnet/loki/query-frontend.libsonnet +++ b/production/ksonnet/loki/query-frontend.libsonnet @@ -35,7 +35,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), local service = k.core.v1.service, diff --git a/production/ksonnet/loki/query-scheduler.libsonnet b/production/ksonnet/loki/query-scheduler.libsonnet index 7b15ee4d583ad..4f71e2f7de527 100644 --- a/production/ksonnet/loki/query-scheduler.libsonnet +++ b/production/ksonnet/loki/query-scheduler.libsonnet @@ -51,7 +51,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, local service = k.core.v1.service, diff --git a/production/ksonnet/loki/ruler.libsonnet b/production/ksonnet/loki/ruler.libsonnet index ebaf6d7fbe79e..5b4958644d801 100644 --- 
a/production/ksonnet/loki/ruler.libsonnet +++ b/production/ksonnet/loki/ruler.libsonnet @@ -44,7 +44,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, ruler_service: if !$._config.ruler_enabled @@ -75,6 +77,8 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, } From c604e2f7c4869469a7f1d9a670cf06b7b3b82cb5 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Tue, 25 Jan 2022 09:55:23 +0100 Subject: [PATCH 2/2] Remove `MaxSurge` and `MaxUnavailable` from statefulset workloads Signed-off-by: Kaviraj --- production/ksonnet/loki/boltdb_shipper.libsonnet | 4 +--- production/ksonnet/loki/index-gateway.libsonnet | 4 +--- production/ksonnet/loki/ingester.libsonnet | 4 +--- production/ksonnet/loki/querier.libsonnet | 4 +--- production/ksonnet/loki/ruler.libsonnet | 4 +--- 5 files changed, 5 insertions(+), 15 deletions(-) diff --git a/production/ksonnet/loki/boltdb_shipper.libsonnet b/production/ksonnet/loki/boltdb_shipper.libsonnet index c1cb9f00a7085..b34f566589144 100644 --- a/production/ksonnet/loki/boltdb_shipper.libsonnet +++ b/production/ksonnet/loki/boltdb_shipper.libsonnet @@ -70,8 +70,6 @@ k.util.configVolumeMount('loki', '/etc/loki/config') + k.util.configVolumeMount('overrides', '/etc/loki/overrides') +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile else {}, } diff --git a/production/ksonnet/loki/index-gateway.libsonnet b/production/ksonnet/loki/index-gateway.libsonnet index b7e74707b6bac..6fafa525743de 100644 --- a/production/ksonnet/loki/index-gateway.libsonnet +++ b/production/ksonnet/loki/index-gateway.libsonnet @@ -57,9 +57,7 @@ $.config_hash_mixin + k.util.configVolumeMount('loki', '/etc/loki/config') + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile else {}, index_gateway_service: if $._config.use_index_gateway then diff --git a/production/ksonnet/loki/ingester.libsonnet b/production/ksonnet/loki/ingester.libsonnet index d3bfc84fd239b..8e72019b5a92c 100644 --- a/production/ksonnet/loki/ingester.libsonnet +++ b/production/ksonnet/loki/ingester.libsonnet @@ -69,9 +69,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - 
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) else {}, ingester_service: diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 80350d598c1c0..848c6c06ef827 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -59,9 +59,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile else {}, querier_service: diff --git a/production/ksonnet/loki/ruler.libsonnet b/production/ksonnet/loki/ruler.libsonnet index 5b4958644d801..85dce3f9e22db 100644 --- a/production/ksonnet/loki/ruler.libsonnet +++ b/production/ksonnet/loki/ruler.libsonnet @@ -77,8 +77,6 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile else {}, }