From 5bcb69a040739616b321faed6b3fb3199055ec44 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 30 Jan 2024 18:09:53 +0100 Subject: [PATCH] dockerize snapshot upload --- .github/workflows/snapshot-service-image.yml | 57 +++++++++++++++++++ images/snapshot-service/.dockerignore | 1 + images/snapshot-service/Dockerfile | 25 ++++++++ images/snapshot-service/README.md | 38 +++++++++++++ .../snapshot-service/src}/daily_snapshot.rb | 19 ++----- images/snapshot-service/src/run.sh | 19 +++++++ .../snapshot-service/src}/upload_snapshot.sh | 28 ++++++--- tf-managed/modules/daily-snapshot/main.tf | 38 ++++++------- .../modules/daily-snapshot/prep_sources.sh | 6 +- .../daily-snapshot/service/calibnet_cron_job | 5 +- .../daily-snapshot/service/forest-env.tpl | 11 ---- .../modules/daily-snapshot/service/init.sh | 19 +------ .../daily-snapshot/service/mainnet_cron_job | 5 +- tf-managed/modules/daily-snapshot/variable.tf | 10 ---- 14 files changed, 187 insertions(+), 94 deletions(-) create mode 100644 .github/workflows/snapshot-service-image.yml create mode 100644 images/snapshot-service/.dockerignore create mode 100644 images/snapshot-service/Dockerfile create mode 100644 images/snapshot-service/README.md rename {tf-managed/modules/daily-snapshot/service => images/snapshot-service/src}/daily_snapshot.rb (78%) create mode 100644 images/snapshot-service/src/run.sh rename {tf-managed/modules/daily-snapshot/service => images/snapshot-service/src}/upload_snapshot.sh (81%) delete mode 100644 tf-managed/modules/daily-snapshot/service/forest-env.tpl diff --git a/.github/workflows/snapshot-service-image.yml b/.github/workflows/snapshot-service-image.yml new file mode 100644 index 000000000..730c29f07 --- /dev/null +++ b/.github/workflows/snapshot-service-image.yml @@ -0,0 +1,57 @@ +name: Snapshot Service Image + +# Cancel workflow if there is a new change to the branch. 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +on: + push: + branches: [main] + merge_group: + pull_request: + branches: [main] + +jobs: + build-and-push-docker-image: + name: Build images and push to GHCR + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: List cached docker images + run: docker image ls + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Login to Github Packages + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # This step yields the following tags: + # - date+sha, e.g. 2023-01-19-da4692d, + # - latest (default branch only). + - name: Docker Meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/chainsafe/forest-snapshot-service + tags: | + type=raw,value={{date 'YYYY-MM-DD'}}-{{sha}} + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build image and push to GitHub Container Registry + uses: docker/build-push-action@v5 + with: + context: ./images/snapshot-service/ + build-contexts: | + common=./tf-managed/scripts/ + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + push: ${{ github.ref == 'refs/heads/main' }} + + - name: List docker images + run: docker image ls diff --git a/images/snapshot-service/.dockerignore b/images/snapshot-service/.dockerignore new file mode 100644 index 000000000..dd449725e --- /dev/null +++ b/images/snapshot-service/.dockerignore @@ -0,0 +1 @@ +*.md diff --git a/images/snapshot-service/Dockerfile b/images/snapshot-service/Dockerfile new file mode 100644 index 000000000..813973af9 --- /dev/null +++ b/images/snapshot-service/Dockerfile @@ -0,0 +1,25 @@ +# Snapshot service Dockerfile. +# It is meant to produce a single snapshot of the given chain in the Filecoin network and upload it to S3 (preferably Cloudflare R2; other S3-compatible providers should work as well, but have not been tested). 
+FROM docker:24 +LABEL org.opencontainers.image.description="Forest snapshot service generator and uploader for Filecoin" + +RUN apk add --no-cache \ + ruby=3.2.2-r1 \ + ruby-dev=3.2.2-r1 \ + docker=24.0.7-r0 \ + aws-cli=2.13.25-r0 \ + bash=5.2.21-r0 && \ + gem install --no-document \ + docker-api:1.28.0 \ + slack-ruby-client:2.2 \ + activesupport:7.0.8 + +COPY ./src /opt/snapshot-service + +# TODO: Change this once `sync-check` is fully-dockerized as well. +# hadolint ignore=DL3022 +COPY --from=common ruby_common /opt/snapshot-service/ruby_common + +WORKDIR /opt/snapshot-service + +CMD ["bash", "run.sh"] diff --git a/images/snapshot-service/README.md b/images/snapshot-service/README.md new file mode 100644 index 000000000..050e1dd62 --- /dev/null +++ b/images/snapshot-service/README.md @@ -0,0 +1,38 @@ +# Forest snapshot service + +This service serves as a Filecoin snapshot generator and uploader. Supported networks are [calibnet](https://docs.filecoin.io/networks/calibration) and [mainnet](https://docs.filecoin.io/networks/mainnet). All S3-compatible providers should work correctly, though it was tested exclusively on Cloudflare R2. + +## Building the image + +```bash +docker build --build-context common=../../tf-managed/scripts/ -t <image-name>:<tag> . +``` + +## Running the Forest snapshot service + +The container needs additional privileges and access to the docker socket to issue other `docker` commands. + +This command will generate a snapshot for the given network and upload it to an S3 bucket. +```bash +docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file <env-file> --env NETWORK_CHAIN=<chain> ghcr.io/chainsafe/forest-snapshot-service:latest +``` + +## Variables (all required) + +```bash +# Details for the snapshot upload +R2_ACCESS_KEY= +R2_SECRET_KEY= +R2_ENDPOINT= +SNAPSHOT_BUCKET= + +# Details for the Slack notifications +SLACK_API_TOKEN= +SLACK_NOTIFICATION_CHANNEL= + +# Network chain - can be either `mainnet` or `calibnet` +NETWORK_CHAIN= +# Forest tag to use. 
`latest` is the newest stable version. +# See [Forest packages](https://github.com/ChainSafe/forest/pkgs/container/forest) for more. +FOREST_TAG= +``` diff --git a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb b/images/snapshot-service/src/daily_snapshot.rb similarity index 78% rename from tf-managed/modules/daily-snapshot/service/daily_snapshot.rb rename to images/snapshot-service/src/daily_snapshot.rb index 0ad65a2e7..8d3f2d213 100644 --- a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb +++ b/images/snapshot-service/src/daily_snapshot.rb @@ -8,24 +8,16 @@ require 'logger' require 'fileutils' -BASE_FOLDER = get_and_assert_env_variable 'BASE_FOLDER' SLACK_TOKEN = get_and_assert_env_variable 'SLACK_API_TOKEN' -CHANNEL = get_and_assert_env_variable 'SLACK_NOTIF_CHANNEL' - -# Prune logs files(txt) older than 2 weeks -def prune_logs(logs_folder = 'logs') - cutoff_date = Date.today - 14 # set the cutoff date to 14 days ago - - Dir.glob("#{logs_folder}/*.txt").each do |file| - File.delete(file) if File.file?(file) && File.mtime(file).to_date < cutoff_date - end -end +CHANNEL = get_and_assert_env_variable 'SLACK_NOTIFICATION_CHANNEL' CHAIN_NAME = ARGV[0] raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty? 
# Current datetime, to append to the log files DATE = Time.new.strftime '%FT%H:%M:%S' + +FileUtils.mkdir_p 'logs' LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt" LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt" LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt" @@ -46,7 +38,7 @@ def prune_logs(logs_folder = 'logs') upload_cmd = <<~CMD.chomp set -o pipefail && \ - timeout --signal=KILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | \ + timeout -s SIGKILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | \ #{add_timestamps_cmd} CMD @@ -71,6 +63,3 @@ def prune_logs(logs_folder = 'logs') [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| logger.info "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) end - -# Prune logs files(txt) in the logs directory older than 2 weeks -prune_logs diff --git a/images/snapshot-service/src/run.sh b/images/snapshot-service/src/run.sh new file mode 100644 index 000000000..866444bb1 --- /dev/null +++ b/images/snapshot-service/src/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -euo pipefail + +# Assert that all required environment variables are set +: "${R2_ACCESS_KEY:?}" +: "${R2_SECRET_KEY:?}" +: "${R2_ENDPOINT:?}" +: "${SNAPSHOT_BUCKET:?}" +: "${SLACK_API_TOKEN:?}" +: "${SLACK_NOTIFICATION_CHANNEL:?}" +: "${NETWORK_CHAIN:?}" +: "${FOREST_TAG:?}" + +aws configure set default.s3.multipart_chunksize 4GB +aws configure set aws_access_key_id "$R2_ACCESS_KEY" +aws configure set aws_secret_access_key "$R2_SECRET_KEY" + +exec ruby daily_snapshot.rb "$NETWORK_CHAIN" diff --git a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh b/images/snapshot-service/src/upload_snapshot.sh similarity index 81% rename from tf-managed/modules/daily-snapshot/service/upload_snapshot.sh rename to images/snapshot-service/src/upload_snapshot.sh index df4564d02..08fb9f9f1 100755 --- 
a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh +++ b/images/snapshot-service/src/upload_snapshot.sh @@ -83,7 +83,6 @@ else timeout 30m forest-tool snapshot validate --check-links 0 --check-network "$CHAIN_NAME" --check-stateroots 5 forest_db/forest_snapshot_*.forest.car.zst fi - # Kill the metrics writer process kill %1 @@ -95,24 +94,35 @@ CONTAINER_NAME="forest-snapshot-upload-node-$CHAIN_NAME" docker stop "$CONTAINER_NAME" || true docker rm --force "$CONTAINER_NAME" -CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME" -CHAIN_LOGS_DIR="$BASE_FOLDER/logs" +CHAIN_DB_DIR="/opt/forest_db/$CHAIN_NAME" +CHAIN_LOGS_DIR="/opt/logs/$CHAIN_NAME" +mkdir -p "$CHAIN_DB_DIR" +mkdir -p "$CHAIN_LOGS_DIR" -# Delete any existing snapshot files. It may be that the previous run failed -# before deleting those. -rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* +# Clean up volumes from the previous run, if any. +DB_VOLUME="${CHAIN_NAME}_db" +LOG_VOLUME="${CHAIN_NAME}_logs" +docker volume rm "${DB_VOLUME}" || true +docker volume rm "${LOG_VOLUME}" || true -# Run forest and generate a snapshot in forest_db/ +# Run forest and generate a snapshot in the `DB_VOLUME` volume. docker run \ --name "$CONTAINER_NAME" \ --rm \ --user root \ - -v "$CHAIN_DB_DIR:/home/forest/forest_db":z \ - -v "$CHAIN_LOGS_DIR:/home/forest/logs":z \ + -v "${DB_VOLUME}:/home/forest/forest_db" \ + -v "${LOG_VOLUME}:/home/forest/logs" \ --entrypoint /bin/bash \ ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ -c "$COMMANDS" || exit 1 +# Dummy container to copy the snapshot files from the volume to the "host". 
+COPIER=$(docker container create -v "${DB_VOLUME}:/opt" busybox) +docker run --rm -v "${DB_VOLUME}:/opt" busybox /bin/sh -c 'ls /opt/forest_snapshot_*.forest.car.zst' | while read -r file; do + docker cp "$COPIER":"$file" "$CHAIN_DB_DIR" +done +docker rm "$COPIER" + aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://"$SNAPSHOT_BUCKET"/"$CHAIN_NAME"/latest/ || exit 1 # Delete snapshot files diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index c931fb43b..7512c21f9 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -7,7 +7,7 @@ // Ugly hack because 'archive_file' cannot mix files and folders. data "external" "sources_tar" { - program = ["bash", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] + program = ["bash", "${path.module}/prep_sources.sh", path.module] } @@ -30,32 +30,30 @@ data "digitalocean_ssh_keys" "keys" { } } -# Set required environment variables +# Required environment variables for the snapshot service itself. 
locals { - env_content = templatefile("${path.module}/service/forest-env.tpl", { - R2_ACCESS_KEY = var.R2_ACCESS_KEY, - R2_SECRET_KEY = var.R2_SECRET_KEY, - r2_endpoint = var.r2_endpoint, - slack_token = var.slack_token, - slack_channel = var.slack_channel, - snapshot_bucket = var.snapshot_bucket, - snapshot_endpoint = var.snapshot_endpoint, - NEW_RELIC_API_KEY = var.new_relic_api_key, - NEW_RELIC_ACCOUNT_ID = var.new_relic_account_id, - NEW_RELIC_REGION = var.new_relic_region, - BASE_FOLDER = "/root", - forest_tag = var.forest_tag - }) + env_content = <<-EOT + R2_ACCESS_KEY=${var.R2_ACCESS_KEY} + R2_SECRET_KEY=${var.R2_SECRET_KEY} + R2_ENDPOINT=${var.r2_endpoint} + SNAPSHOT_BUCKET=${var.snapshot_bucket} + SLACK_API_TOKEN=${var.slack_token} + SLACK_NOTIFICATION_CHANNEL=${var.slack_channel} + FOREST_TAG=${var.forest_tag} + EOT } locals { init_commands = ["cd /root/", "tar xf sources.tar", - # Set required environment variables "echo '${local.env_content}' >> /root/.forest_env", - "echo '. ~/.forest_env' >> .bashrc", - ". ~/.forest_env", - "nohup sh ./init.sh > init_log.txt &", + <<-EOT + export NEW_RELIC_API_KEY=${var.new_relic_api_key} + export NEW_RELIC_ACCOUNT_ID=${var.new_relic_account_id} + export NEW_RELIC_REGION=${var.new_relic_region} + nohup sh ./init.sh > init_log.txt & + EOT + , # Exiting without a sleep sometimes kills the script :-/ "sleep 60s" ] diff --git a/tf-managed/modules/daily-snapshot/prep_sources.sh b/tf-managed/modules/daily-snapshot/prep_sources.sh index 05d9a16df..1d63836c8 100755 --- a/tf-managed/modules/daily-snapshot/prep_sources.sh +++ b/tf-managed/modules/daily-snapshot/prep_sources.sh @@ -3,12 +3,8 @@ # Enable strict error handling and command tracing set -euxo pipefail -# Copy local source files in a folder together with ruby_common and create a zip archive. - +# Copy local source files in a folder, and create a zip archive. 
cd "$1" -cp --archive "$2"/ruby_common service/ - rm -f sources.tar (cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) -rm -fr service/ruby_common echo "{ \"path\": \"$1/sources.tar\" }" diff --git a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job index 0511fa173..48ece9597 100755 --- a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job +++ b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job @@ -1,6 +1,5 @@ #!/bin/bash # shellcheck source=/dev/null -source ~/.forest_env -cd "$BASE_FOLDER" || exit -flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet >> logs/calibnet_log.txt 2>&1" +cd "$HOME" || exit +flock -n /tmp/calibnet.lock -c "docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file .forest_env -e NETWORK_CHAIN=calibnet ghcr.io/chainsafe/forest-snapshot-service:latest >> calibnet_log.txt 2>&1" diff --git a/tf-managed/modules/daily-snapshot/service/forest-env.tpl b/tf-managed/modules/daily-snapshot/service/forest-env.tpl deleted file mode 100644 index 14f644c7a..000000000 --- a/tf-managed/modules/daily-snapshot/service/forest-env.tpl +++ /dev/null @@ -1,11 +0,0 @@ -export R2_ACCESS_KEY="${R2_ACCESS_KEY}" -export R2_SECRET_KEY="${R2_SECRET_KEY}" -export R2_ENDPOINT="${r2_endpoint}" -export SLACK_API_TOKEN="${slack_token}" -export SLACK_NOTIF_CHANNEL="${slack_channel}" -export SNAPSHOT_BUCKET="${snapshot_bucket}" -export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" -export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" -export NEW_RELIC_REGION="${NEW_RELIC_REGION}" -export BASE_FOLDER="${BASE_FOLDER}" -export FOREST_TAG="${forest_tag}" diff --git a/tf-managed/modules/daily-snapshot/service/init.sh b/tf-managed/modules/daily-snapshot/service/init.sh index 3013827a5..9f17ee885 100755 --- a/tf-managed/modules/daily-snapshot/service/init.sh +++ b/tf-managed/modules/daily-snapshot/service/init.sh 
@@ -10,26 +10,9 @@ export DEBIAN_FRONTEND=noninteractive # Using timeout to ensure the script retries if the APT servers are temporarily unavailable. timeout 10m bash -c 'until apt-get -qqq --yes update && \ - apt-get -qqq --yes install ruby ruby-dev anacron awscli; do sleep 10; \ + apt-get -qqq --yes install anacron ; do sleep 10; \ done' -# Install the gems -gem install docker-api slack-ruby-client -gem install activesupport -v 7.0.8 - -# 1. Configure aws -# 2. Create forest_db directory -# 3. Copy scripts to /etc/cron.hourly - -## Configure aws -aws configure set default.s3.multipart_chunksize 4GB -aws configure set aws_access_key_id "$R2_ACCESS_KEY" -aws configure set aws_secret_access_key "$R2_SECRET_KEY" - -## Create forest data directory -mkdir forest_db logs -chmod 777 forest_db logs - # Run new_relic and fail2ban scripts bash newrelic_fail2ban.sh diff --git a/tf-managed/modules/daily-snapshot/service/mainnet_cron_job b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job index f39a81f76..b24ade470 100755 --- a/tf-managed/modules/daily-snapshot/service/mainnet_cron_job +++ b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job @@ -1,6 +1,5 @@ #!/bin/bash # shellcheck source=/dev/null -source ~/.forest_env -cd "$BASE_FOLDER" || exit -flock -n /tmp/mainnet.lock -c "ruby daily_snapshot.rb mainnet > mainnet_log.txt 2>&1" || exit +cd "$HOME" || exit +flock -n /tmp/mainnet.lock -c "docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file .forest_env -e NETWORK_CHAIN=mainnet ghcr.io/chainsafe/forest-snapshot-service:latest >> mainnet_log.txt 2>&1" diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index a250b8af0..fd12a9f35 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -48,12 +48,6 @@ variable "r2_endpoint" { type = string } -variable "snapshot_endpoint" { - description = "S3 endpoint for the snapshots" 
- type = string - default = "https://fra1.digitaloceanspaces.com/" -} - variable "forest_tag" { description = "Image tag for the Forest container" type = string @@ -110,10 +104,6 @@ variable "new_relic_account_id" { sensitive = true } -variable "common_resources_dir" { - type = string -} - variable "environment" { description = "The environment name" type = string