Skip to content

Commit

Permalink
GTC-1250 Update lambda layer based on any datapump source changes
Browse files Browse the repository at this point in the history
The datapump was using the lambda-layer module of the gfw-lambda-layers
repo. This module only updates a lambda layer if the associated
Dockerfile changes, since it is mainly targeted to creating layers of
existing python libraries, so only the version number in the Dockerfile
matters.

So, changed the terraform to build the layer.zip using the usual
Dockerfile, but using the hash of the entire layer.zip to decide if the
layer.zip should be uploaded to S3 and used to update the lambda. We no
longer use the lambda-layer module.

The layer.zip file includes all the modified times of the files, which
keep changing, even when the file names and contents are the same. I
tried generating a hash using only filenames and contents, but terraform
seems to create its own hash of the layer.zip file as well, so basically
we're always going to update the layer.zip no matter what, which seems
fine.

Added a bunch of explanatory comments, cleaned up some other comments.
  • Loading branch information
danscales committed Feb 20, 2025
1 parent f1325a9 commit cc39e17
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 32 deletions.
10 changes: 0 additions & 10 deletions .github/workflows/terraform_build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,6 @@ jobs:
run: |
curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
chmod +x ./cc-test-reporter
- name: Test with pytest
env:
ENV: test
AWS_ACCESS_KEY_ID: test
AWS_SECRET_ACCESS_KEY: test # pragma: allowlist secret
AWS_DEFAULT_REGION: ${{ secrets.aws_region_dev }}
LOCALSTACK_API_KEY: ${{ secrets.localstack_api_key }}
DUMP_TO_STDOUT: true
run: |
./scripts/test
- name: Deploy production
if: success() && contains(github.ref, 'master')
env:
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/terraform_destroy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ on: [delete]

jobs:
build:
if: contains(github.ref_type, 'branch') && (! contains(github.ref, 'master')) && (! contains(github.ref, 'develop'))
runs-on: ubuntu-22.04
steps:
- name: Destroy state and delete workspace
run: docker compose -f docker/terraform/docker-compose.yml run --entrypoint delete_workspace --rm terraform github.ref
run: echo "ref_type ${{ github.ref_type }}, ref ${{ github.ref }}, ${{ contains(github.ref, 'develop') }}, ${{ ! contains(github.ref, 'develop') }}"
11 changes: 3 additions & 8 deletions src/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# This Dockerfile is used to build the datapump layer for the lambdas.
FROM public.ecr.aws/lambda/python:3.10

ENV WORKDIR /opt
Expand All @@ -8,18 +9,12 @@ RUN mkdir -p /opt/python
# Make the dir and to install all packages into packages/
COPY . $WORKDIR

# installing dependencies to build package
# Installing dependencies to build package. This uses setup.py for the dependency list.
RUN pip install . -t python

# This next line needs to be changed (just increment the number) in order
# to change the hash of the file and get TF to realize it needs to be
# redeployed. Ticket for a better solution:
# https://gfw.atlassian.net/browse/GTC-1250
# change 35

RUN yum install -y zip geos-devel

# Precompile all python packages and remove .py files
# Remove any precompiled files and __pycache__ dirs
RUN find python/ -type f -name '*.pyc' -print0 | xargs -0 rm -rf
RUN find python/ -type d -a -name '__pycache__' -print0 | xargs -0 rm -rf

Expand Down
53 changes: 45 additions & 8 deletions terraform/modules/datapump/data.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,48 @@ data "template_file" "sfn_datapump" {
}
}

module "py310_datapump_021" {
source = "git::https://github.com/wri/gfw-lambda-layers.git//terraform/modules/lambda_layer"
bucket = var.pipelines_bucket
name = "datapump-${terraform.workspace}"
module_version = "0.2.1"
runtime = "python3.10"
layer_path = "${var.lambda_layers_path}/"
}
# Terraform to create and upload layer.zip of the datapump source code
# and dependencies.

locals {
layer_name = substr("python3.10-datapump-${terraform.workspace}_0.2.1", 0, 64)

}

# Build the Docker image and copy ZIP file to local folder
# Always build the zip file so we can do a hash on the entire source.
resource "null_resource" "build" {
triggers = {
curtime = timestamp()
}

provisioner "local-exec" {
command = "${path.module}/scripts/build.sh ${var.lambda_layers_path} ${local.layer_name}"
interpreter = ["bash", "-c"]
}
}

data "external" "layer_sha256" {
program = [ "${path.module}/scripts/hash.sh", "${var.lambda_layers_path}/layer.zip"]
depends_on = [null_resource.build]
}

resource "aws_s3_bucket_object" "py310_datapump_021" {
bucket = var.pipelines_bucket
key = "lambda_layers/${local.layer_name}.zip"
source = "${var.lambda_layers_path}/layer.zip"
# This is what decides if the s3 upload of the layer will happen,
# though terraform seems to do its own hash of the zip file as well.
etag = lookup(data.external.layer_sha256.result, "hash")
}

resource "aws_lambda_layer_version" "py310_datapump_021" {
layer_name = replace(local.layer_name, ".", "")
s3_bucket = aws_s3_bucket_object.py310_datapump_021.bucket
s3_key = aws_s3_bucket_object.py310_datapump_021.key
compatible_runtimes = ["python3.10"]
# This decides if the actual layer will be replaced in the lambda,
# though terraform seems use its own etag of the zip file on S3 as well,
# which means we always update the zip file.
source_code_hash = lookup(data.external.layer_sha256.result, "hash")
}
8 changes: 4 additions & 4 deletions terraform/modules/datapump/lambdas.tf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ resource "aws_lambda_function" "dispatcher" {
publish = true
tags = local.tags
layers = [
module.py310_datapump_021.layer_arn,
aws_lambda_layer_version.py310_datapump_021.arn,
var.numpy_lambda_layer_arn,
var.rasterio_lambda_layer_arn,
var.shapely_lambda_layer_arn
Expand Down Expand Up @@ -39,7 +39,7 @@ resource "aws_lambda_function" "executor" {
timeout = var.lambda_params.timeout
publish = true
tags = local.tags
layers = [module.py310_datapump_021.layer_arn]
layers = [aws_lambda_layer_version.py310_datapump_021.arn]
environment {
variables = {
ENV = var.environment
Expand Down Expand Up @@ -68,7 +68,7 @@ resource "aws_lambda_function" "postprocessor" {
publish = true
tags = local.tags
layers = [
module.py310_datapump_021.layer_arn,
aws_lambda_layer_version.py310_datapump_021.arn,
var.numpy_lambda_layer_arn,
var.rasterio_lambda_layer_arn,
var.shapely_lambda_layer_arn
Expand All @@ -82,4 +82,4 @@ resource "aws_lambda_function" "postprocessor" {
DATAPUMP_TABLE_NAME = aws_dynamodb_table.datapump.name
}
}
}
}
33 changes: 33 additions & 0 deletions terraform/modules/datapump/scripts/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

# This is the same build script as in gfw-lambda-layers/terraform/modules/lambda_layer/scripts/build.sh
# It builds and runs a docker as specified in ${1}/Dockerfile to create a layer.zip.

set -e

LAYER_PATH="${1}"
IMAGE="globalforestwatch/${2}"

echo -n "${LAYER_PATH}" > "${LAYER_PATH}/foo.txt"
date >> "${LAYER_PATH}/foo.txt"
CONTAINER_NAME="container_$(sha1sum ${LAYER_PATH}/foo.txt |cut -c 1-8)"

pushd "${LAYER_PATH}"

echo "BUILD image ${IMAGE}"
docker build --no-cache -t "${IMAGE}" .

echo "CREATE container ${CONTAINER_NAME}"
docker run -itd --name "${CONTAINER_NAME}" "${IMAGE}" /bin/bash

echo "COPY ZIP package to host"
docker cp "${CONTAINER_NAME}":"/opt/layer.zip" layer.zip

echo "STOP container"
docker stop "${CONTAINER_NAME}"
docker wait "${CONTAINER_NAME}"

echo "REMOVE container"
docker rm -f "${CONTAINER_NAME}"

popd
10 changes: 10 additions & 0 deletions terraform/modules/datapump/scripts/hash.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# This does a hash of the zip file, but that includes all the modified times of the
# files, which keep changing, even when the file names and contents are the same. I
# tried generating a hash using only filenames and contents, but terraform seems to
# create its own hash of the layer.zip file as well, so basically we're always going
# to update the layer.zip no matter what, which seems fine.
hash=$(sha256sum $1 | cut -d' ' -f1)

echo '{ "hash": "'"$hash"'" }'

0 comments on commit cc39e17

Please sign in to comment.