From 1d280dda6f7b97746436381d88a277727a23b127 Mon Sep 17 00:00:00 2001 From: Hubert Date: Tue, 6 Feb 2024 12:58:17 +0100 Subject: [PATCH] Revert "dockerize snapshot upload (#400)" (#402) --- .github/workflows/hadolint-ci.yml | 2 +- .github/workflows/snapshot-service-image.yml | 57 ------------------- images/snapshot-service/.dockerignore | 1 - images/snapshot-service/Dockerfile | 27 --------- images/snapshot-service/README.md | 38 ------------- images/snapshot-service/src/run.sh | 19 ------- tf-managed/modules/daily-snapshot/main.tf | 38 +++++++------ .../modules/daily-snapshot/prep_sources.sh | 6 +- .../daily-snapshot/service/calibnet_cron_job | 5 +- .../daily-snapshot/service}/daily_snapshot.rb | 19 +++++-- .../daily-snapshot/service/forest-env.tpl | 11 ++++ .../modules/daily-snapshot/service/init.sh | 19 ++++++- .../daily-snapshot/service/mainnet_cron_job | 5 +- .../service}/upload_snapshot.sh | 28 +++------ tf-managed/modules/daily-snapshot/variable.tf | 10 ++++ 15 files changed, 95 insertions(+), 190 deletions(-) delete mode 100644 .github/workflows/snapshot-service-image.yml delete mode 100644 images/snapshot-service/.dockerignore delete mode 100644 images/snapshot-service/Dockerfile delete mode 100644 images/snapshot-service/README.md delete mode 100644 images/snapshot-service/src/run.sh rename {images/snapshot-service/src => tf-managed/modules/daily-snapshot/service}/daily_snapshot.rb (78%) create mode 100644 tf-managed/modules/daily-snapshot/service/forest-env.tpl rename {images/snapshot-service/src => tf-managed/modules/daily-snapshot/service}/upload_snapshot.sh (81%) diff --git a/.github/workflows/hadolint-ci.yml b/.github/workflows/hadolint-ci.yml index 5268e1f8a..4845cb381 100644 --- a/.github/workflows/hadolint-ci.yml +++ b/.github/workflows/hadolint-ci.yml @@ -17,4 +17,4 @@ jobs: dockerfile: "Dockerfile*" recursive: true # don't pin versions in dependencies - ignore: DL3028,DL3008,DL3018 + ignore: DL3028,DL3008 diff --git a/.github/workflows/snapshot-service-image.yml b/.github/workflows/snapshot-service-image.yml deleted file mode 100644 index 730c29f07..000000000 --- a/.github/workflows/snapshot-service-image.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Snapshot Service Image - -# Cancel workflow if there is a new change to the branch. -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -on: - push: - branches: [main] - merge_group: - pull_request: - branches: [main] - -jobs: - build-and-push-docker-image: - name: Build images and push to GHCR - runs-on: ubuntu-latest - timeout-minutes: 30 - steps: - - name: List cached docker images - run: docker image ls - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Login to Github Packages - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # This step yields the following labels: - # - date+sha, e.g. 2023-01-19-da4692d, - # - latest, - - name: Docker Meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ghcr.io/chainsafe/forest-snapshot-service - tags: | - type=raw,value={{date 'YYYY-MM-DD'}}-{{sha}} - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build image and push to GitHub Container Registry - uses: docker/build-push-action@v5 - with: - context: ./images/snapshot-service/ - build-contexts: | - common=./tf-managed/scripts/ - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - push: ${{ github.ref == 'refs/heads/main' }} - - - name: List docker images - run: docker image ls diff --git a/images/snapshot-service/.dockerignore b/images/snapshot-service/.dockerignore deleted file mode 100644 index dd449725e..000000000 --- a/images/snapshot-service/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -*.md diff --git a/images/snapshot-service/Dockerfile b/images/snapshot-service/Dockerfile deleted file mode 100644 index 5da8d17ad..000000000 --- a/images/snapshot-service/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -# Snapshot service Dockerfile. -# It is meant to produce a single snapshot of the given chain in the Filecoin network and upload it to S3 (preferably Cloudflare R2, it should work for other providers as well, but it wasn't tested). -FROM docker:24 -LABEL org.opencontainers.image.description "Forest snapshot service generator and uploader for Filecoin" - -RUN apk add --no-cache \ - ruby \ - ruby-dev \ - docker \ - aws-cli \ - bash && \ - gem install \ - docker-api \ - slack-ruby-client \ - activesupport - -COPY ./src /opt/snapshot-service - -# `common` is defined via the `--build-context` flag in the `docker build` command, e.g., -# `docker build --build-context common=../../tf-managed/scripts/ -t ghcr.io/chainsafe/forest-snapshot-service:latest .` -# TODO: Change this once `sync-check` is fully-dockerized as well. -# hadolint ignore=DL3022 -COPY --from=common ruby_common /opt/snapshot-service/ruby_common - -WORKDIR /opt/snapshot-service - -CMD ["bash", "run.sh"] diff --git a/images/snapshot-service/README.md b/images/snapshot-service/README.md deleted file mode 100644 index 050e1dd62..000000000 --- a/images/snapshot-service/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Forest snapshot service - -This service serves as a Filecoin snapshot generator and uploader. Supported networks are [calibnet](https://docs.filecoin.io/networks/calibration) and [mainnet](https://docs.filecoin.io/networks/mainnet). All S3-compatible providers should work correctly, though it was tested exclusively on Cloudflare R2. - -## Building the image - -```bash -docker build --build-context common=../../tf-managed/scripts/ -t : . -``` - -## Running the Forest snapshot service - -The container needs additional privileges and access to the docker socket to issue other `docker` commands. - -This command will generate a snapshot for the given network and upload it to an S3 bucket. -```bash -docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file --env NETWORK_CHAIN= ghcr.io/chainsafe/forest-snapshot-service:edge -``` - -## Variables (all required) - -```bash -# Details for the snapshot upload -R2_ACCESS_KEY= -R2_SECRET_KEY= -R2_ENDPOINT= -SNAPSHOT_BUCKET= - -# Details for the Slack notifications -SLACK_API_TOKEN= -SLACK_NOTIFICATION_CHANNEL= - -# Network chain - can be either `mainnet` or `calibnet` -NETWORK_CHAIN= -# Forest tag to use. `latest` is the newest stable version. -# See [Forest packages](https://github.com/ChainSafe/forest/pkgs/container/forest) for more. -FOREST_TAG= -``` diff --git a/images/snapshot-service/src/run.sh b/images/snapshot-service/src/run.sh deleted file mode 100644 index 866444bb1..000000000 --- a/images/snapshot-service/src/run.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# Assert that all required environment variables are set -: "${R2_ACCESS_KEY:?}" -: "${R2_SECRET_KEY:?}" -: "${R2_ENDPOINT:?}" -: "${SNAPSHOT_BUCKET:?}" -: "${SLACK_API_TOKEN:?}" -: "${SLACK_NOTIFICATION_CHANNEL:?}" -: "${NETWORK_CHAIN:?}" -: "${FOREST_TAG:?}" - -aws configure set default.s3.multipart_chunksize 4GB -aws configure set aws_access_key_id "$R2_ACCESS_KEY" -aws configure set aws_secret_access_key "$R2_SECRET_KEY" - -ruby daily_snapshot.rb "$NETWORK_CHAIN" diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index 7512c21f9..c931fb43b 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -7,7 +7,7 @@ // Ugly hack because 'archive_file' cannot mix files and folders. data "external" "sources_tar" { - program = ["bash", "${path.module}/prep_sources.sh", path.module] + program = ["bash", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] } @@ -30,30 +30,32 @@ data "digitalocean_ssh_keys" "keys" { } } -# Required environment variables for the snapshot service itself. +# Set required environment variables locals { - env_content = <<-EOT - R2_ACCESS_KEY=${var.R2_ACCESS_KEY} - R2_SECRET_KEY=${var.R2_SECRET_KEY} - R2_ENDPOINT=${var.r2_endpoint} - SNAPSHOT_BUCKET=${var.snapshot_bucket} - SLACK_API_TOKEN=${var.slack_token} - SLACK_NOTIFICATION_CHANNEL=${var.slack_channel} - FOREST_TAG=${var.forest_tag} - EOT + env_content = templatefile("${path.module}/service/forest-env.tpl", { + R2_ACCESS_KEY = var.R2_ACCESS_KEY, + R2_SECRET_KEY = var.R2_SECRET_KEY, + r2_endpoint = var.r2_endpoint, + slack_token = var.slack_token, + slack_channel = var.slack_channel, + snapshot_bucket = var.snapshot_bucket, + snapshot_endpoint = var.snapshot_endpoint, + NEW_RELIC_API_KEY = var.new_relic_api_key, + NEW_RELIC_ACCOUNT_ID = var.new_relic_account_id, + NEW_RELIC_REGION = var.new_relic_region, + BASE_FOLDER = "/root", + forest_tag = var.forest_tag + }) } locals { init_commands = ["cd /root/", "tar xf sources.tar", + # Set required environment variables "echo '${local.env_content}' >> /root/.forest_env", - <<-EOT - export NEW_RELIC_API_KEY=${var.new_relic_api_key} - export NEW_RELIC_ACCOUNT_ID=${var.new_relic_account_id} - export NEW_RELIC_REGION=${var.new_relic_region} - nohup sh ./init.sh > init_log.txt & - EOT - , + "echo '. ~/.forest_env' >> .bashrc", + ". ~/.forest_env", + "nohup sh ./init.sh > init_log.txt &", # Exiting without a sleep sometimes kills the script :-/ "sleep 60s" ] diff --git a/tf-managed/modules/daily-snapshot/prep_sources.sh b/tf-managed/modules/daily-snapshot/prep_sources.sh index 1d63836c8..05d9a16df 100755 --- a/tf-managed/modules/daily-snapshot/prep_sources.sh +++ b/tf-managed/modules/daily-snapshot/prep_sources.sh @@ -3,8 +3,12 @@ # Enable strict error handling and command tracing set -euxo pipefail -# Copy local source files in a folder, and create a zip archive. +# Copy local source files in a folder together with ruby_common and create a zip archive. + cd "$1" +cp --archive "$2"/ruby_common service/ + rm -f sources.tar (cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) +rm -fr service/ruby_common echo "{ \"path\": \"$1/sources.tar\" }" diff --git a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job index 48ece9597..0511fa173 100755 --- a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job +++ b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job @@ -1,5 +1,6 @@ #!/bin/bash # shellcheck source=/dev/null -cd "$HOME" || exit -flock -n /tmp/calibnet.lock -c "docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file .forest_env -e NETWORK_CHAIN=calibnet ghcr.io/chainsafe/forest-snapshot-service:latest >> calibnet_log.txt 2>&1" +source ~/.forest_env +cd "$BASE_FOLDER" || exit +flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet >> logs/calibnet_log.txt 2>&1" diff --git a/images/snapshot-service/src/daily_snapshot.rb b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb similarity index 78% rename from images/snapshot-service/src/daily_snapshot.rb rename to tf-managed/modules/daily-snapshot/service/daily_snapshot.rb index 8d3f2d213..0ad65a2e7 100644 --- a/images/snapshot-service/src/daily_snapshot.rb +++ b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb @@ -8,16 +8,24 @@ require 'logger' require 'fileutils' +BASE_FOLDER = get_and_assert_env_variable 'BASE_FOLDER' SLACK_TOKEN = get_and_assert_env_variable 'SLACK_API_TOKEN' -CHANNEL = get_and_assert_env_variable 'SLACK_NOTIFICATION_CHANNEL' +CHANNEL = get_and_assert_env_variable 'SLACK_NOTIF_CHANNEL' + +# Prune logs files(txt) older than 2 weeks +def prune_logs(logs_folder = 'logs') + cutoff_date = Date.today - 14 # set the cutoff date to 14 days ago + + Dir.glob("#{logs_folder}/*.txt").each do |file| + File.delete(file) if File.file?(file) && File.mtime(file).to_date < cutoff_date + end +end CHAIN_NAME = ARGV[0] raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty? # Current datetime, to append to the log files DATE = Time.new.strftime '%FT%H:%M:%S' - -FileUtils.mkdir_p 'logs' LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt" LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt" LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt" @@ -38,7 +46,7 @@ upload_cmd = <<~CMD.chomp set -o pipefail && \ - timeout -s SIGKILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | \ + timeout --signal=KILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | \ #{add_timestamps_cmd} CMD @@ -63,3 +71,6 @@ [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| logger.info "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) end + +# Prune logs files(txt) in the logs directory older than 2 weeks +prune_logs diff --git a/tf-managed/modules/daily-snapshot/service/forest-env.tpl b/tf-managed/modules/daily-snapshot/service/forest-env.tpl new file mode 100644 index 000000000..14f644c7a --- /dev/null +++ b/tf-managed/modules/daily-snapshot/service/forest-env.tpl @@ -0,0 +1,11 @@ +export R2_ACCESS_KEY="${R2_ACCESS_KEY}" +export R2_SECRET_KEY="${R2_SECRET_KEY}" +export R2_ENDPOINT="${r2_endpoint}" +export SLACK_API_TOKEN="${slack_token}" +export SLACK_NOTIF_CHANNEL="${slack_channel}" +export SNAPSHOT_BUCKET="${snapshot_bucket}" +export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" +export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" +export NEW_RELIC_REGION="${NEW_RELIC_REGION}" +export BASE_FOLDER="${BASE_FOLDER}" +export FOREST_TAG="${forest_tag}" diff --git a/tf-managed/modules/daily-snapshot/service/init.sh b/tf-managed/modules/daily-snapshot/service/init.sh index 9f17ee885..3013827a5 100755 --- a/tf-managed/modules/daily-snapshot/service/init.sh +++ b/tf-managed/modules/daily-snapshot/service/init.sh @@ -10,9 +10,26 @@ export DEBIAN_FRONTEND=noninteractive # Using timeout to ensure the script retries if the APT servers are temporarily unavailable. timeout 10m bash -c 'until apt-get -qqq --yes update && \ - apt-get -qqq --yes install anacron ; do sleep 10; \ + apt-get -qqq --yes install ruby ruby-dev anacron awscli; do sleep 10; \ done' +# Install the gems +gem install docker-api slack-ruby-client +gem install activesupport -v 7.0.8 + +# 1. Configure aws +# 2. Create forest_db directory +# 3. Copy scripts to /etc/cron.hourly + +## Configure aws +aws configure set default.s3.multipart_chunksize 4GB +aws configure set aws_access_key_id "$R2_ACCESS_KEY" +aws configure set aws_secret_access_key "$R2_SECRET_KEY" + +## Create forest data directory +mkdir forest_db logs +chmod 777 forest_db logs + # Run new_relic and fail2ban scripts bash newrelic_fail2ban.sh diff --git a/tf-managed/modules/daily-snapshot/service/mainnet_cron_job b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job index b24ade470..f39a81f76 100755 --- a/tf-managed/modules/daily-snapshot/service/mainnet_cron_job +++ b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job @@ -1,5 +1,6 @@ #!/bin/bash # shellcheck source=/dev/null -cd "$HOME" || exit -flock -n /tmp/mainnet.lock -c "docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm --env-file .forest_env -e NETWORK_CHAIN=mainnet ghcr.io/chainsafe/forest-snapshot-service:latest >> mainnet_log.txt 2>&1" +source ~/.forest_env +cd "$BASE_FOLDER" || exit +flock -n /tmp/mainnet.lock -c "ruby daily_snapshot.rb mainnet > mainnet_log.txt 2>&1" || exit diff --git a/images/snapshot-service/src/upload_snapshot.sh b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh similarity index 81% rename from images/snapshot-service/src/upload_snapshot.sh rename to tf-managed/modules/daily-snapshot/service/upload_snapshot.sh index 08fb9f9f1..df4564d02 100755 --- a/images/snapshot-service/src/upload_snapshot.sh +++ b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh @@ -83,6 +83,7 @@ else timeout 30m forest-tool snapshot validate --check-links 0 --check-network "$CHAIN_NAME" --check-stateroots 5 forest_db/forest_snapshot_*.forest.car.zst fi + # Kill the metrics writer process kill %1 @@ -94,35 +95,24 @@ CONTAINER_NAME="forest-snapshot-upload-node-$CHAIN_NAME" docker stop "$CONTAINER_NAME" || true docker rm --force "$CONTAINER_NAME" -CHAIN_DB_DIR="/opt/forest_db/$CHAIN_NAME" -CHAIN_LOGS_DIR="/opt/logs/$CHAIN_NAME" -mkdir -p "$CHAIN_DB_DIR" -mkdir -p "$CHAIN_LOGS_DIR" +CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME" +CHAIN_LOGS_DIR="$BASE_FOLDER/logs" -# Cleanup volumes from the previous if any. -DB_VOLUME="${CHAIN_NAME}_db" -LOG_VOLUME="${CHAIN_NAME}_logs" -docker volume rm "${DB_VOLUME}" || true -docker volume rm "${LOG_VOLUME}" || true +# Delete any existing snapshot files. It may be that the previous run failed +# before deleting those. +rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* -# Run forest and generate a snapshot in the `DB_VOLUME` volume. +# Run forest and generate a snapshot in forest_db/ docker run \ --name "$CONTAINER_NAME" \ --rm \ --user root \ - -v "${DB_VOLUME}:/home/forest/forest_db" \ - -v "${LOG_VOLUME}:/home/forest/logs" \ + -v "$CHAIN_DB_DIR:/home/forest/forest_db":z \ + -v "$CHAIN_LOGS_DIR:/home/forest/logs":z \ --entrypoint /bin/bash \ ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ -c "$COMMANDS" || exit 1 -# Dummy container to copy the snapshot files from the volume to the "host". -COPIER=$(docker container create -v "${CHAIN_NAME}_db:/opt" busybox) -docker run --rm -v "${DB_VOLUME}:/opt" busybox /bin/sh -c 'ls /opt/forest_snapshot_*.forest.car.zst' | while read -r file; do - docker cp "$COPIER":"$file" "$CHAIN_DB_DIR" -done -docker rm "$COPIER" - aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://"$SNAPSHOT_BUCKET"/"$CHAIN_NAME"/latest/ || exit 1 # Delete snapshot files diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index fd12a9f35..a250b8af0 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -48,6 +48,12 @@ variable "r2_endpoint" { type = string } +variable "snapshot_endpoint" { + description = "S3 endpoint for the snapshots" + type = string + default = "https://fra1.digitaloceanspaces.com/" +} + variable "forest_tag" { description = "Image tag for the Forest container" type = string @@ -104,6 +110,10 @@ variable "new_relic_account_id" { sensitive = true } +variable "common_resources_dir" { + type = string +} + variable "environment" { description = "The environment name" type = string