diff --git a/.github/workflows/awscleanup.yaml b/.github/workflows/awscleanup.yaml
new file mode 100644
index 00000000..841ba2d7
--- /dev/null
+++ b/.github/workflows/awscleanup.yaml
@@ -0,0 +1,61 @@
+# Finds the VPCs in us-west-1 whose Name tag starts with "ci" and runs
+# scripts/awscleanup.sh once per VPC.
+# NOTE(review): the name says "Daily" but no schedule trigger is defined;
+# the workflow only runs for the test-awsresourcecleanup branch - confirm.
+name: Daily AWS Cleanup Bot
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      - test-awsresourcecleanup
+  push:
+    branches:
+      - test-awsresourcecleanup
+
+jobs:
+  cleanup:
+    runs-on: linux-amd64-cpu4
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up AWS CLI
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Identify resources for deletion
+        id: identify-resources
+        run: |
+          # Find vpcs with names ci*
+          vpcs=$(aws ec2 describe-vpcs \
+            --filters "Name=tag:Name,Values=ci*" \
+            --query "Vpcs[].VpcId" \
+            --output text | tr -d '\r' | tr '\n' ' ')
+          echo "Found VPCs: $vpcs"
+          # Written to GITHUB_ENV so that the next step can read the list.
+          echo "AWS_VPC_IDS=$vpcs" >> "$GITHUB_ENV"
+
+      - name: Clean up VPCs
+        if: env.AWS_VPC_IDS != ''
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # The step shell runs with -e, so a bare failing call would end the
+          # loop early. Remember the failure and keep going instead, so one
+          # bad VPC does not block the cleanup of the others.
+          failed=0
+          for vpcid in $AWS_VPC_IDS; do
+            scripts/awscleanup.sh "$vpcid" || failed=1
+          done
+          exit "$failed"
+
+      - name: Post cleanup
+        run: |
+          echo "Cleanup completed."
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 674cb838..21c41265 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -20,12 +20,15 @@ on:
       - opened
       - synchronize
     branches:
-      - main
-      - release-*
+      # NOTE(review): main-no/release-no look like test-only placeholders - restore main/release-* before merge?
+      - main-no
+      - release-no
+      - test-awsresourcecleanup
   push:
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
+      - test-awsresourcecleanup
   schedule:
     - cron: '31 11 * * 4'
 
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index a186b981..fa3b322e 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -15,14 +15,15 @@
 name: End-to-end Tests
 
 on:
-  workflow_run:
-    workflows: [Go]
+  pull_request:
     types:
-      - completed
+      - opened
+      - synchronize
     branches:
-      - "pull-request/[0-9]+"
-      - main
-      - release-*
+      - test-awsresourcecleanup
+  push:
+    branches:
+      - test-awsresourcecleanup
 
 jobs:
   e2e-test:
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 9e215402..b18e5bc2 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -17,12 +17,15 @@ name: Go
 on:
   push:
     branches:
-      - main
-      - release-*
+      # NOTE(review): main-no/release-no look like test-only placeholders - restore main/release-* before merge?
+      - main-no
+      - release-no
+      - test-awsresourcecleanup
   pull_request:
     branches:
-      - main
-      - release-*
+      - main-no
+      - release-no
+      - test-awsresourcecleanup
 
 jobs:
   build:
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index f57058f8..ed35c894 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -17,14 +17,17 @@ name: Image
 on:
   pull_request:
     branches:
-      - 'main'
-      - 'release-*'
+      # NOTE(review): main-no/release-no look like test-only placeholders - restore main/release-* before merge?
+      - 'main-no'
+      - 'release-no'
+      - test-awsresourcecleanup
   push:
     tags:
       - 'v*.*.*'
     branches:
-      - 'main'
-      - 'release-*'
+      - 'main-no'
+      - 'release-no'
+      - test-awsresourcecleanup
 
 jobs:
   docker:
diff --git a/scripts/awscleanup.sh b/scripts/awscleanup.sh
new file mode 100755
index 00000000..a7a66bb8
--- /dev/null
+++ b/scripts/awscleanup.sh
@@ -0,0 +1,183 @@
+#!/bin/bash
+#
+# Deletes one CI VPC and the resources inside it, but only when the GitHub
+# Actions run recorded in the VPC tags has no job that is still running.
+#
+# Usage: awscleanup.sh <vpc-id>
+# Reads GITHUB_TOKEN from the environment; needs the aws CLI, curl and jq.
+
+if [[ $# -ne 1 ]]; then
+    echo " vpcid required for deletion"
+    exit 1
+fi
+
+export vpcid=$1
+
+# get_tag_value <vpc-id> <tag-key>
+# Prints the value of tag <tag-key> on the VPC (text output of the aws CLI,
+# which is "None" when the tag is absent).
+get_tag_value(){
+    if [[ $# -ne 2 ]]; then
+        echo " vpcid and key required to get tag value"
+        exit 1
+    fi
+    local vpc=$1
+    local key=$2
+    # Look up the VPC passed as argument rather than the global $vpcid.
+    aws ec2 describe-tags --filters "Name=resource-id,Values=$vpc" "Name=key,Values=$key" \
+        --query "Tags[0].Value" --output text
+}
+
+# delete_vpc_resources <vpc-id>
+# Best-effort teardown of everything inside the VPC (instances, security
+# groups, subnets, route tables, internet gateways) followed by the VPC
+# itself, which is attempted up to 3 times. A failing aws call is reported by
+# the CLI and does not stop the remaining steps.
+delete_vpc_resources() {
+    if [[ $# -ne 1 ]]; then
+        echo " vpcid required for deletion"
+        exit 1
+    fi
+    local vpcid=$1
+
+    echo "Start cleanup of resources in VPC: $vpcid"
+
+    # Delete Instance
+    instances=$(aws ec2 describe-instances \
+        --filters "Name=vpc-id,Values=$vpcid" \
+        --query "Reservations[].Instances[].InstanceId" \
+        --output text | tr -d '\r' | tr '\n' ' ')
+    for instance in $instances; do
+        aws ec2 terminate-instances --instance-ids "$instance"
+    done
+    # Termination is asynchronous and the resources below cannot be deleted
+    # while instances still use them, so wait for it to finish. The guard is
+    # required: without ids the waiter would look at every instance.
+    read -r -a instance_ids <<< "$instances"
+    if [[ ${#instance_ids[@]} -gt 0 ]]; then
+        aws ec2 wait instance-terminated --instance-ids "${instance_ids[@]}" ||
+            echo "Timed out waiting for instances to terminate in VPC: $vpcid"
+    fi
+
+    # Detach and Delete Security Groups
+    # Network interfaces are moved to the default group of THIS vpc; an
+    # unfiltered lookup would return the default group of every VPC.
+    default_sg=$(aws ec2 describe-security-groups \
+        --filters "Name=vpc-id,Values=$vpcid" "Name=group-name,Values=default" \
+        --query "SecurityGroups[0].GroupId" \
+        --output text | tr -d '\r')
+    security_groups=$(aws ec2 describe-security-groups \
+        --filters "Name=vpc-id,Values=$vpcid" \
+        --query "SecurityGroups[?GroupName!='default'].GroupId" \
+        --output text | tr -d '\r' | tr '\n' ' ')
+    for sg in $security_groups; do
+        enis=$(aws ec2 describe-network-interfaces \
+            --filters "Name=group-id,Values=$sg" \
+            --query "NetworkInterfaces[].NetworkInterfaceId" \
+            --output text | tr -d '\r' | tr '\n' ' ')
+        for eni in $enis; do
+            aws ec2 modify-network-interface-attribute \
+                --network-interface-id "$eni" \
+                --groups "$default_sg"
+        done
+        aws ec2 delete-security-group --group-id "$sg"
+    done
+
+    # Delete Subnets
+    subnets=$(aws ec2 describe-subnets \
+        --filters "Name=vpc-id,Values=$vpcid" \
+        --query "Subnets[].SubnetId" \
+        --output text | tr -d '\r' | tr '\n' ' ')
+    for subnet in $subnets; do
+        aws ec2 delete-subnet --subnet-id "$subnet"
+    done
+
+    # Delete Route Tables
+    # 1. Make first rt as Main , as we cannot delete vpcs attached with main
+    # 2. replace all rt with first rt
+    # 3. delete rt
+    # 4. Main table(first_rt) will be deleted once vpc deleted
+    first_rt=""
+    route_tables=$(aws ec2 describe-route-tables \
+        --filters "Name=vpc-id,Values=$vpcid" \
+        --query "RouteTables[].RouteTableId" \
+        --output text | tr -d '\r' | tr '\n' ' ')
+    for rt in $route_tables; do
+        associations=$(aws ec2 describe-route-tables \
+            --route-table-ids "$rt" \
+            --query "RouteTables[].Associations[].RouteTableAssociationId" \
+            --output text | tr -d '\r' | tr '\n' ' ')
+        for assoc_id in $associations; do
+            if [ -z "$first_rt" ]; then
+                aws ec2 replace-route-table-association --association-id "$assoc_id" --route-table-id "$rt"
+                first_rt=$rt
+            else
+                aws ec2 replace-route-table-association --association-id "$assoc_id" --route-table-id "$first_rt"
+            fi
+        done
+        # Deleting the main route table is expected to fail; hide that error.
+        aws ec2 delete-route-table --route-table-id "$rt" 2>>/dev/null
+    done
+
+    # Delete Internet Gateway
+    internet_gateways=$(aws ec2 describe-internet-gateways \
+        --filters "Name=attachment.vpc-id,Values=$vpcid" \
+        --query "InternetGateways[].InternetGatewayId" \
+        --output text | tr -d '\r' | tr '\n' ' ')
+    for igw in $internet_gateways; do
+        aws ec2 detach-internet-gateway --internet-gateway-id "$igw" --vpc-id "$vpcid"
+        aws ec2 delete-internet-gateway --internet-gateway-id "$igw"
+    done
+
+    # Delete vpc
+    # try 3 times with 30 seconds interval
+    attempts=0
+    echo "All resource Deleted for VPC: $vpcid , now delete vpc"
+    while [ $attempts -lt 3 ]; do
+        if aws ec2 delete-vpc --vpc-id "$vpcid"; then
+            echo "Successfully deleted VPC: $vpcid"
+            break
+        else
+            attempts=$((attempts + 1))
+            if [ $attempts -lt 3 ]; then
+                echo "Failed to delete VPC: $vpcid. Retrying in 30 seconds..."
+                sleep 30
+            fi
+        fi
+    done
+    if [ $attempts -eq 3 ]; then
+        echo "Failed to delete VPC: $vpcid after 3 attempts. Continue the loop to delete other vpc"
+    fi
+}
+
+github_repository=$(get_tag_value "$vpcid" "GitHubRepository")
+run_id=$(get_tag_value "$vpcid" "GitHubRunId")
+# Without both tags the owning workflow run cannot be looked up; leave the
+# VPC alone instead of querying the GitHub API with a bogus URL.
+if [[ -z "$github_repository" || "$github_repository" == "None" ]] ||
+   [[ -z "$run_id" || "$run_id" == "None" ]]; then
+    echo "VPC $vpcid has no GitHubRepository/GitHubRunId tag, skip it"
+    exit 0
+fi
+response=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
+    "https://api.github.com/repos/${github_repository}/actions/runs/${run_id}/jobs")
+if [[ -z "$response" || "$response" == "null" ]]; then
+    exit 0
+fi
+
+# 1. make sure .jobs exist in response
+# e.g. { "message": "Not Found", "documentation_url": "https://docs.github.com/rest", "status": "404" }
+# 2. check if all jobs completed
+
+if ! echo "$response" | jq -e '.jobs != null' >/dev/null 2>&1; then
+    exit 0
+fi
+
+is_jobs_not_completed=$(echo "$response" | jq -r ".jobs? // [] |
+    map(select(.status != \"completed\")) |
+    length")
+
+if [[ "$is_jobs_not_completed" -eq 0 ]]; then
+    echo "Holodeck e2e Job status is not in running stage , Delete the vpc $vpcid and dependent resources"
+    delete_vpc_resources "$vpcid"
+fi
diff --git a/tests/aws_test.go b/tests/aws_test.go
index 541c5d7d..81cb6df7 100644
--- a/tests/aws_test.go
+++ b/tests/aws_test.go
@@ -56,7 +56,7 @@ var _ = Describe("AWS", func() {
 			Expect(err).ToNot(HaveOccurred())
 
 			// Set unique name for the environment
-			opts.cfg.Name = opts.cfg.Name + "-" + common.GenerateUID()
+			common.SetCfgName(&opts.cfg)
 			// set cache path
 			opts.cachePath = LogArtifactDir
 			// set cache file
diff --git a/tests/common/common.go b/tests/common/common.go
index 254c0dd1..973476c3 100644
--- a/tests/common/common.go
+++ b/tests/common/common.go
@@ -16,7 +16,13 @@
 
 package common
 
-import "math/rand"
+import (
+	"fmt"
+	"math/rand"
+	"os"
+
+	"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
+)
 
 func GenerateUID() string {
 	const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
@@ -28,3 +34,21 @@ func GenerateUID() string {
 
 	return string(b)
 }
+
+// SetCfgName overwrites cfg.Name with "ci<attempt>-<sha>-<uid>": attempt is
+// GITHUB_RUN_ATTEMPT, sha is GITHUB_SHA cut to at most 8 characters and uid
+// comes from GenerateUID. Unset variables contribute an empty string.
+// NOTE(review): the "ci" prefix presumably has to match the `ci*` Name filter
+// of the awscleanup workflow - confirm before changing it.
+func SetCfgName(cfg *v1alpha1.Environment) {
+	sha := os.Getenv("GITHUB_SHA")
+	attempt := os.Getenv("GITHUB_RUN_ATTEMPT")
+	// short sha
+	if len(sha) > 8 {
+		sha = sha[:8]
+	}
+	// uid is unique for each run
+	uid := GenerateUID()
+
+	cfg.Name = fmt.Sprintf("ci%s-%s-%s", attempt, sha, uid)
+}