Use celery control to stop workers more gracefully?
aequitas committed Jan 21, 2025
1 parent 7a1a900 commit 5cb1a9d
Showing 4 changed files with 30 additions and 38 deletions.
24 changes: 12 additions & 12 deletions docker/compose.yaml
@@ -109,7 +109,7 @@ services:
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: app
# hostname: app
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -236,17 +236,17 @@ services:
# SIGTERM is default, but make it explicit
stop_signal: SIGTERM

celery accepts tasks during warm shutdown?????
# celery accepts tasks during warm shutdown?????

Session terminated, killing shell...
worker: Warm shutdown (MainProcess)
[2024-10-29 18:45:01,807: INFO/MainProcess] Task checks.tasks.ipv6.batch_web[506e07c1-a208-4a52-a578-e95cdad9fdf0] received
...killed.
celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=300 --concurrency=500 --queues nassl_worker,batch_nassl
ENABLE_BATCH is set for this server but the database is lacking the required indexes. Consider running `manage.py api_create_db_indexes`.
Batch enabled.
# Session terminated, killing shell...
# worker: Warm shutdown (MainProcess)
# [2024-10-29 18:45:01,807: INFO/MainProcess] Task checks.tasks.ipv6.batch_web[506e07c1-a208-4a52-a578-e95cdad9fdf0] received
# ...killed.
# celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=300 --concurrency=500 --queues nassl_worker,batch_nassl
# ENABLE_BATCH is set for this server but the database is lacking the required indexes. Consider running `manage.py api_create_db_indexes`.
# Batch enabled.

what about the healthcheck during warm shutdown?????
# what about the healthcheck during warm shutdown?????



@@ -273,7 +273,7 @@ what about the healthcheck during warm shutdown?????
resolver-permissive:
condition: service_healthy
# set hostname for Sentry
hostname: worker
# hostname: worker
environment:
- INTERNET_NL_CHECK_SUPPORT_IPV6
- INTERNET_NL_CHECK_SUPPORT_DNSSEC
@@ -356,7 +356,7 @@ what about the healthcheck during warm shutdown?????

command: celery --app internetnl worker --without-gossip --pool=eventlet --time-limit=600 --concurrency=$WORKER_SLOW_CONCURRENCY
--queues slow_db_worker,batch_slow
hostname: worker-slow
# hostname: worker-slow

beat:
image: ${DOCKER_IMAGE_APP:-${DOCKER_REGISTRY:-ghcr.io/internetstandards}/internet.nl:${RELEASE}}
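The comments added to compose.yaml above leave two open questions: whether a worker still accepts tasks during a warm shutdown, and what the container healthcheck reports while one is in progress. A minimal sketch to observe both on a running deployment, assuming the worker service label and healthcheck from this compose file; the celery@$worker destination only resolves because the hostname: overrides are commented out, so the node name falls back to the short container ID:

# pick one worker container and ask its celery node for a warm shutdown
worker=$(docker ps --filter label=com.docker.compose.service=worker --quiet | head -n 1)
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 60 || true
# watch whether "Warm shutdown" is logged and whether tasks are still being received
docker logs --follow --since 1m "$worker"
# in a second shell: healthcheck status while the warm shutdown runs
docker inspect --format '{{.State.Health.Status}}' "$worker"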
6 changes: 4 additions & 2 deletions docker/cron-docker/periodic/15min/restart_nassl_worker
@@ -3,8 +3,10 @@ set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, with a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
docker stop "$worker"
docker start "$worker"
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true

# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
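The --destination "celery@$worker" argument only reaches the worker if its celery node name matches the short container ID that docker ps --quiet returns. That is celery's default node name (celery@<hostname>) combined with Docker's default hostname, the short container ID, and it holds as long as compose sets no hostname: for the service, which is what the compose.yaml change above ensures. A quick sketch, under those assumptions, to confirm the node is addressable before relying on the remote shutdown:

worker=$(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet | head -n 1)
docker exec "$worker" hostname   # should print the same short ID as $worker
docker exec "$worker" celery --app=internetnl inspect ping --destination "celery@$worker" --timeout 10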
32 changes: 10 additions & 22 deletions docker/cron-docker/periodic/15min/restart_slow_worker
@@ -1,24 +1,12 @@
#!/bin/sh

# restart slow worker every day to prevent slow memory leaks
# as the slow worker can run very long tasks (eg: report generation)
# we first start a new container before stopping the previous one

set -e

cd /opt/Internet.nl

SERVICE=worker-slow
REPLICAS=$WORKER_SLOW_REPLICAS
COMPOSE_CMD="docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env"

OLD_CONTAINERS=$($COMPOSE_CMD ps --format "{{ .Name }}"|grep "$SERVICE")

# bring up new containers, wait until healthy
$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$(($REPLICAS*2))" "$SERVICE"

# graceful shutdown and remove old containers
docker rm --force "$OLD_CONTAINERS"

# restore replica number to original
$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$REPLICAS" "$SERVICE"
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, with a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker-slow --quiet); do
# tell celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
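Since the slow worker can run very long tasks (the reason the old scale-up-before-remove approach existed), it can help to see what is still in flight before or during the warm shutdown. A sketch using celery inspect active, under the same node-name assumption as above:

worker=$(docker ps --filter label=com.docker.compose.service=worker-slow --quiet | head -n 1)
# tasks currently executing on this node (e.g. long report generation)
docker exec "$worker" celery --app=internetnl inspect active --destination "celery@$worker" --timeout 10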
6 changes: 4 additions & 2 deletions docker/cron-docker/periodic/15min/restart_worker
@@ -3,8 +3,10 @@ set -e
# stop and start worker one at a time to ensure (batch) tasks are still being picked up
# workers are sent a TERM signal, with a 10 minute grace period before QUIT is sent
for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
docker stop "$worker"
docker start "$worker"
# tell celery worker to shut down gracefully
docker exec "$worker" celery --app=internetnl control shutdown --destination "celery@$worker" --timeout 300 || true
# docker stop "$worker"
# docker start "$worker"
# wait for container to be healthy
timeout 300 sh -c "while docker inspect \"$worker\"| jq --exit-status '.[0].State.Health.Status != \"healthy\"' >/dev/null;do sleep 1;done" || true
done
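The health wait at the end of each script can also be expressed with docker inspect's --format instead of jq; the behaviour is the same (poll every second, give up after 300 seconds). A sketch, not part of this commit:

timeout 300 sh -c "until [ \"\$(docker inspect --format '{{.State.Health.Status}}' $worker)\" = healthy ]; do sleep 1; done" || true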
