From 421ac2ae48ca3c2de6905e581b0051aedda95217 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Mon, 24 Jan 2022 21:25:05 +0100 Subject: [PATCH] Add `MaxSurge` and `MaxUnavailable` strategy to all Loki k8s workloads. This fixes a couple of issues. 1. By default these configs are 25% in k8s, meaning during rollout 25% of pods are allowed to shut down immediately. 2. Due to (1), during the graceful shutdown process, 25% of all the pods access Consul to `unregister()` from the shared key-value store. (2) makes the CAS rate of the underlying KV store high (leading to lots of retries and failures), sometimes failing to unregister and leaving the ring "unhealthy". Also, this PR makes these configs consistent across all k8s workloads. More details: https://github.com/grafana/dskit/issues/117 --- production/ksonnet/loki/boltdb_shipper.libsonnet | 4 +++- production/ksonnet/loki/distributor.libsonnet | 4 +++- production/ksonnet/loki/gateway.libsonnet | 4 +++- production/ksonnet/loki/index-gateway.libsonnet | 4 +++- production/ksonnet/loki/ingester.libsonnet | 4 +++- production/ksonnet/loki/querier.libsonnet | 8 ++++++-- production/ksonnet/loki/query-frontend.libsonnet | 4 +++- production/ksonnet/loki/query-scheduler.libsonnet | 4 +++- production/ksonnet/loki/ruler.libsonnet | 8 ++++++-- 9 files changed, 33 insertions(+), 11 deletions(-) diff --git a/production/ksonnet/loki/boltdb_shipper.libsonnet b/production/ksonnet/loki/boltdb_shipper.libsonnet index b34f566589144..c1cb9f00a7085 100644 --- a/production/ksonnet/loki/boltdb_shipper.libsonnet +++ b/production/ksonnet/loki/boltdb_shipper.libsonnet @@ -70,6 +70,8 @@ k.util.configVolumeMount('loki', '/etc/loki/config') + k.util.configVolumeMount('overrides', '/etc/loki/overrides') + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the 
group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, } diff --git a/production/ksonnet/loki/distributor.libsonnet b/production/ksonnet/loki/distributor.libsonnet index 7fd299c737c58..9633381ed9311 100644 --- a/production/ksonnet/loki/distributor.libsonnet +++ b/production/ksonnet/loki/distributor.libsonnet @@ -29,7 +29,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), distributor_service: k.util.serviceFor($.distributor_deployment), diff --git a/production/ksonnet/loki/gateway.libsonnet b/production/ksonnet/loki/gateway.libsonnet index 36ffc51f1f055..e99628e2fe7f8 100644 --- a/production/ksonnet/loki/gateway.libsonnet +++ b/production/ksonnet/loki/gateway.libsonnet @@ -98,7 +98,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; }) + k.util.configVolumeMount('gateway-config', '/etc/nginx') + k.util.secretVolumeMount('gateway-secret', '/etc/nginx/secrets', defaultMode=420) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), gateway_service: k.util.serviceFor($.gateway_deployment), diff --git a/production/ksonnet/loki/index-gateway.libsonnet b/production/ksonnet/loki/index-gateway.libsonnet index 6fafa525743de..b7e74707b6bac 100644 --- a/production/ksonnet/loki/index-gateway.libsonnet +++ b/production/ksonnet/loki/index-gateway.libsonnet @@ -57,7 +57,9 @@ $.config_hash_mixin + k.util.configVolumeMount('loki', '/etc/loki/config') + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - 
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, index_gateway_service: if $._config.use_index_gateway then diff --git a/production/ksonnet/loki/ingester.libsonnet b/production/ksonnet/loki/ingester.libsonnet index 8e72019b5a92c..d3bfc84fd239b 100644 --- a/production/ksonnet/loki/ingester.libsonnet +++ b/production/ksonnet/loki/ingester.libsonnet @@ -69,7 +69,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, ingester_service: diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 1bb5c0f66d904..80350d598c1c0 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -34,7 +34,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, // PVC for queriers when running as statefulsets @@ -57,7 +59,9 @@ local k = import 
'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, querier_service: diff --git a/production/ksonnet/loki/query-frontend.libsonnet b/production/ksonnet/loki/query-frontend.libsonnet index d495ff2ab5929..404de05bce234 100644 --- a/production/ksonnet/loki/query-frontend.libsonnet +++ b/production/ksonnet/loki/query-frontend.libsonnet @@ -35,7 +35,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity, + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), local service = k.core.v1.service, diff --git a/production/ksonnet/loki/query-scheduler.libsonnet b/production/ksonnet/loki/query-scheduler.libsonnet index 7b15ee4d583ad..4f71e2f7de527 100644 --- a/production/ksonnet/loki/query-scheduler.libsonnet +++ b/production/ksonnet/loki/query-scheduler.libsonnet @@ -51,7 +51,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, local service = k.core.v1.service, diff --git a/production/ksonnet/loki/ruler.libsonnet b/production/ksonnet/loki/ruler.libsonnet index ebaf6d7fbe79e..5b4958644d801 100644 --- 
a/production/ksonnet/loki/ruler.libsonnet +++ b/production/ksonnet/loki/ruler.libsonnet @@ -44,7 +44,9 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + k.util.antiAffinity + + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, ruler_service: if !$._config.ruler_enabled @@ -75,6 +77,8 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ) + k.util.antiAffinity + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, }