Skip to content

Commit

Permalink
unused aws instance and vpcs cleanup
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Dec 19, 2024
1 parent 6783298 commit d64df03
Show file tree
Hide file tree
Showing 8 changed files with 257 additions and 21 deletions.
69 changes: 69 additions & 0 deletions .github/workflows/awscleanup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Workflow that looks up CI-created VPCs and hands the stale ones to scripts/awscleanup.sh.
name: Daily AWS Cleanup Bot

# Triggered by pull requests against, and pushes to, the test-awsresourcecleanup branch.
# NOTE(review): the workflow is named "Daily" but no schedule trigger is defined here — confirm whether a cron trigger is intended.
on:
pull_request:
types:
- opened
- synchronize
branches:
- test-awsresourcecleanup
push:
branches:
- test-awsresourcecleanup

jobs:
# Single job; all steps below run on the same runner.
cleanup:
runs-on: linux-amd64-cpu4

steps:
# The checkout is needed because a later step executes scripts/awscleanup.sh from the repository.
- name: Checkout repository
uses: actions/checkout@v4

# Configures AWS credentials from repository secrets; every aws call in this workflow targets us-west-1.
- name: Set up AWS CLI
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-west-1

# Collects the ids of all VPCs whose Name tag starts with "ci" and exposes them
# to the following steps through the `vpcs` environment variable.
- name: Identify resources for deletion
id: identify-resources
run: |
# Find vpcs with names ci*
# The text output is stripped of carriage returns and flattened onto one
# space-separated line before being written as a single GITHUB_ENV entry.
vpcs=$(aws ec2 describe-vpcs \
--filters "Name=tag:Name,Values=ci*" \
--query "Vpcs[].VpcId" \
--output text | tr -d '\r' | tr '\n' ' ')
echo "Found VPCs: $vpcs"
echo "vpcs=$vpcs" >> $GITHUB_ENV
# This step is skipped when the `vpcs` environment variable is empty.
- name: Clean up VPCs
if: env.vpcs != ''
env:
# Token sent as the Bearer credential to the GitHub jobs API in the script below.
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
#######################################
# Print the value of a single tag attached to an AWS resource.
# Arguments: $1 - resource id to look up (a VPC id in this workflow)
#            $2 - tag key
# Outputs:   the text rendering of Tags[0].Value as produced by
#            `aws ec2 describe-tags`
# Returns:   the exit status of the aws call
#######################################
get_tag_value() {
  local resource_id=$1 tag_key=$2
  local -a tag_filters=(
    "Name=resource-id,Values=${resource_id}"
    "Name=key,Values=${tag_key}"
  )
  aws ec2 describe-tags \
    --filters "${tag_filters[@]}" \
    --query "Tags[0].Value" \
    --output text
}
# For every candidate VPC, read the GitHub repository / run id / job name tags,
# ask the GitHub Actions API for that run's jobs, and delete the VPC's
# resources only when the matching job is no longer queued or in progress.
# $vpcs is a space-separated id list, so the unquoted expansion is intentional.
for vpc in $vpcs; do
  github_repository=$(get_tag_value "$vpc" "GitHubRepository")
  run_id=$(get_tag_value "$vpc" "GitHubRunId")
  job_name=$(get_tag_value "$vpc" "GitHubJob")

  # `--output text` prints "None" for a missing tag. Without all three tags the
  # owning job cannot be identified, so leave the VPC alone (an empty job name
  # would otherwise match every job of the run).
  if [[ -z "$github_repository" || "$github_repository" == "None" ||
        -z "$run_id" || "$run_id" == "None" ||
        -z "$job_name" || "$job_name" == "None" ]]; then
    continue
  fi

  response=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
    "https://api.github.com/repos/${github_repository}/actions/runs/${run_id}/jobs")
  if [[ -z "$response" || "$response" == "null" ]]; then
    continue
  fi

  # The job name is handed to jq as data (--arg) and compared as a literal
  # prefix, so characters such as "(", ")" or quotes in a job name can neither
  # break the jq program nor be interpreted as a regular expression.
  # Any jq failure (e.g. a non-JSON response) is treated as "status unknown".
  status=$(jq -r --arg job "$job_name" \
    '[(.jobs? // [])[] | select((.name // "") | startswith($job))] | .[0].status // "null"' \
    <<<"$response" 2>/dev/null) || status="null"

  # Unknown status or a job that is still queued / running: keep the VPC.
  case "$status" in
    "" | null | queued | in_progress) continue ;;
  esac

  echo "Holodeck e2e Job status is not in running stage , Delete the dependent resources"
  scripts/awscleanup.sh "$vpc"
done
# Final step: only prints a completion message to the job log.
- name: Post cleanup
run: |
echo "Cleanup completed."
10 changes: 6 additions & 4 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ on:
- opened
- synchronize
branches:
- main
- release-*
- main-no
- release-no
- test-awsresourcecleanup
push:
branches:
- main
- release-*
- main-no
- release-no
- test-awsresourcecleanup
schedule:
- cron: '31 11 * * 4'

Expand Down
13 changes: 7 additions & 6 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
name: End-to-end Tests

on:
workflow_run:
workflows: [Go]
pull_request:
types:
- completed
- opened
- synchronize
branches:
- "pull-request/[0-9]+"
- main
- release-*
- test-awsresourcecleanup
push:
branches:
- test-awsresourcecleanup

jobs:
e2e-test:
Expand Down
10 changes: 6 additions & 4 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@ name: Go
on:
push:
branches:
- main
- release-*
- main-no
- release-no
- test-awsresourcecleanup
pull_request:
branches:
- main
- release-*
- main-no
- release-no
- test-awsresourcecleanup

jobs:
build:
Expand Down
10 changes: 6 additions & 4 deletions .github/workflows/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@ name: Image
on:
pull_request:
branches:
- 'main'
- 'release-*'
- 'main-no'
- 'release-no'
- test-awsresourcecleanup
push:
tags:
- 'v*.*.*'
branches:
- 'main'
- 'release-*'
- 'main-no'
- 'release-no'
- test-awsresourcecleanup

jobs:
docker:
Expand Down
142 changes: 142 additions & 0 deletions scripts/awscleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/bin/bash
#
# Deletes the resources found inside one VPC and then the VPC itself.
# Usage: awscleanup.sh <vpc-id>

# Exactly one positional argument (the VPC id) is accepted.
[[ $# -eq 1 ]] || {
  echo " vpcid required for deletion"
  exit 1
}

# The id is exported so that child processes see it as well.
vpc=$1
export vpc

echo "Start Deleting VPC: $vpc resource"

# Delete Instance
# terminate-instances only *starts* the shutdown. Network interfaces, security
# groups and subnets stay in use until the instances are really gone, so the
# script must wait for termination before touching the rest of the VPC.
instances=$(aws ec2 describe-instances \
  --filters "Name=vpc-id,Values=$vpc" \
  --query "Reservations[].Instances[].InstanceId" \
  --output text | tr -d '\r' | tr '\n' ' ')
# Split the whitespace-separated id list into an array (empty list -> empty array).
read -r -a instance_ids <<<"$instances"
for instance in "${instance_ids[@]}"; do
  echo "Terminating instance: $instance"
  aws ec2 terminate-instances --instance-ids "$instance"
done
if ((${#instance_ids[@]} > 0)); then
  echo "Waiting for instances to terminate: ${instance_ids[*]}"
  # Best effort: on a waiter timeout keep going, the later steps report their
  # own errors and the VPC deletion is retried at the end of the script.
  aws ec2 wait instance-terminated --instance-ids "${instance_ids[@]}" ||
    echo "Timed out waiting for instance termination in VPC: $vpc" >&2
fi

# Delete Internet Gateway
# Every gateway attached to the VPC is first detached and then deleted.
internet_gateways=$(
  aws ec2 describe-internet-gateways \
    --filters "Name=attachment.vpc-id,Values=$vpc" \
    --query "InternetGateways[].InternetGatewayId" \
    --output text |
    tr -d '\r' | tr '\n' ' '
)
# Turn the whitespace-separated id list into an array for iteration.
read -r -a igw_ids <<<"$internet_gateways"
for gateway_id in "${igw_ids[@]}"; do
  aws ec2 detach-internet-gateway --vpc-id "$vpc" --internet-gateway-id "$gateway_id"
  aws ec2 delete-internet-gateway --internet-gateway-id "$gateway_id"
done

# Delete NAT Gateways
# Only gateways that still exist are considered; already deleted ones remain
# listed by describe-nat-gateways for a while and must not be processed again.
nat_filters=(
  "Name=vpc-id,Values=$vpc"
  "Name=state,Values=pending,available,deleting"
)
nat_gateways=$(aws ec2 describe-nat-gateways \
  --filter "${nat_filters[@]}" \
  --query "NatGateways[].NatGatewayId" \
  --output text | tr -d '\r' | tr '\n' ' ')
# The Elastic IPs of the NAT gateways have to be recorded *before* deletion:
# once a gateway is gone its address is unassociated and can no longer be
# traced back to this VPC.
eips=$(aws ec2 describe-nat-gateways \
  --filter "${nat_filters[@]}" \
  --query "NatGateways[].NatGatewayAddresses[].AllocationId" \
  --output text | tr -d '\r' | tr '\n' ' ')
read -r -a nat_gateway_ids <<<"$nat_gateways"
for ngw in "${nat_gateway_ids[@]}"; do
  aws ec2 delete-nat-gateway --nat-gateway-id "$ngw"
done
if ((${#nat_gateway_ids[@]} > 0)); then
  # Deletion is asynchronous; the addresses cannot be released while the
  # gateways still hold them.
  aws ec2 wait nat-gateway-deleted --nat-gateway-ids "${nat_gateway_ids[@]}" ||
    echo "Timed out waiting for NAT gateway deletion in VPC: $vpc" >&2
fi

# Delete Elastic IPs
# 1. Addresses that belonged to the NAT gateways deleted above.
for eip in $eips; do
  [[ "$eip" == "None" ]] && continue
  aws ec2 release-address --allocation-id "$eip"
done
# 2. Addresses still associated with a network interface inside the VPC.
#    describe-addresses exposes no VPC id, so the lookup goes through the
#    network interfaces; each address is disassociated and then released.
eni_eips=$(aws ec2 describe-network-interfaces \
  --filters "Name=vpc-id,Values=$vpc" \
  --query "NetworkInterfaces[?Association.AllocationId].[Association.AllocationId,Association.AssociationId]" \
  --output text | tr -d '\r')
while read -r alloc_id assoc_id; do
  if [[ -z "$alloc_id" || "$alloc_id" == "None" ]]; then
    continue
  fi
  if [[ -n "$assoc_id" && "$assoc_id" != "None" ]]; then
    aws ec2 disassociate-address --association-id "$assoc_id" </dev/null
  fi
  aws ec2 release-address --allocation-id "$alloc_id" </dev/null
done <<<"$eni_eips"

# Detach and Delete Security Groups
# Network interfaces that still use a group are switched to the VPC's own
# default group so the group can be deleted. The default group is looked up
# once and restricted to this VPC: an unfiltered lookup returns the default
# group of every VPC in the region, which is not a valid --groups value.
default_sg=$(aws ec2 describe-security-groups \
  --filters "Name=vpc-id,Values=$vpc" "Name=group-name,Values=default" \
  --query "SecurityGroups[0].GroupId" \
  --output text | tr -d '\r')
security_groups=$(aws ec2 describe-security-groups \
  --filters "Name=vpc-id,Values=$vpc" \
  --query "SecurityGroups[?GroupName!='default'].GroupId" \
  --output text | tr -d '\r' | tr '\n' ' ')
for sg in $security_groups; do
  enis=$(aws ec2 describe-network-interfaces \
    --filters "Name=group-id,Values=$sg" \
    --query "NetworkInterfaces[].NetworkInterfaceId" \
    --output text | tr -d '\r' | tr '\n' ' ')
  for eni in $enis; do
    if [[ -z "$default_sg" || "$default_sg" == "None" ]]; then
      echo "No default security group found for VPC: $vpc , cannot detach $sg from $eni" >&2
      continue
    fi
    aws ec2 modify-network-interface-attribute \
      --network-interface-id "$eni" \
      --groups "$default_sg"
  done
  aws ec2 delete-security-group --group-id "$sg"
done

# Delete Route Tables
# 1. Make first rt as Main , as we cannot delete vpcs attached with main
# 2. replace all rt with first rt
# 3. delete rt
# 4. Main table(first_rt) will be deleted once vpc deleted
# first_rt stays empty until the first association is processed; from then on
# it holds the id of the route table that association was found on.
first_rt=""
# All route table ids of the VPC, flattened onto one whitespace-separated line.
route_tables=$(aws ec2 describe-route-tables \
--filters Name=vpc-id,Values=$vpc \
--query "RouteTables[].RouteTableId" \
--output text | tr -d '\r' | tr '\n' ' ')
for rt in $route_tables; do
# Association ids currently attached to this route table.
associations=$(aws ec2 describe-route-tables \
--route-table-ids "$rt" \
--query "RouteTables[].Associations[].RouteTableAssociationId" \
--output text | tr -d '\r' | tr '\n' ' ')
# NOTE(review): delete-route-table is issued inside this association loop, so a
# route table without any association is never deleted here, and a table with
# several associations receives one delete call per association — confirm
# this is intended.
for assoc_id in $associations; do
if [ -z "$first_rt" ]; then
# Very first association seen: re-point it at its own table and remember
# that table as first_rt.
aws ec2 replace-route-table-association --association-id $assoc_id --route-table-id $rt
first_rt=$rt
else
# Every later association is moved onto first_rt, then the table it was
# found on is deleted.
# NOTE(review): when the first table has more than one association, $rt is
# still first_rt here, so the delete call targets first_rt itself — verify.
aws ec2 replace-route-table-association --association-id $assoc_id --route-table-id $first_rt
aws ec2 delete-route-table --route-table-id "$rt"
fi
done
done

# Delete Network Interfaces
# Interfaces are removed before the subnets: a subnet that still contains a
# network interface cannot be deleted.
eni_ids=$(aws ec2 describe-network-interfaces \
  --filters "Name=vpc-id,Values=$vpc" \
  --query "NetworkInterfaces[].NetworkInterfaceId" \
  --output text | tr -d '\r' | tr '\n' ' ')
for eni in $eni_ids; do
  aws ec2 delete-network-interface --network-interface-id "$eni"
done

# Delete Subnets
subnets=$(aws ec2 describe-subnets \
  --filters "Name=vpc-id,Values=$vpc" \
  --query "Subnets[].SubnetId" \
  --output text | tr -d '\r' | tr '\n' ' ')
for subnet in $subnets; do
  aws ec2 delete-subnet --subnet-id "$subnet"
done

# Delete Network ACLs
# Only non-default ACLs can be deleted; the default one goes away with the VPC.
# The selection is done with the server-side "default" filter instead of a
# JMESPath comparison: a bare `false` in a JMESPath expression is read as a
# field name rather than a boolean literal, so "IsDefault==false" never matches.
nw_acls=$(aws ec2 describe-network-acls \
  --filters "Name=vpc-id,Values=$vpc" "Name=default,Values=false" \
  --query "NetworkAcls[].NetworkAclId" \
  --output text | tr -d '\r' | tr '\n' ' ')
for acl in $nw_acls; do
  echo "Deleting Network ACL: $acl"
  aws ec2 delete-network-acl --network-acl-id "$acl"
done

# Delete vpc
# At most three delete attempts are made, with a 30 second pause after each
# failed attempt except the last. A final failure is only logged; the script
# does not exit with an error because of it.
attempts=0
max_attempts=3
echo "All resource Deleted for VPC: $vpc , now delete vpc"
until [[ $attempts -ge $max_attempts ]]; do
  echo "Attempting to delete VPC: $vpc (Attempt $((attempts+1)))"
  if aws ec2 delete-vpc --vpc-id "$vpc"; then
    echo "Successfully deleted VPC: $vpc"
    break
  fi
  attempts=$((attempts + 1))
  # No pause after the last attempt.
  [[ $attempts -lt $max_attempts ]] || continue
  echo "Failed to delete VPC: $vpc. Retrying in 30 seconds..."
  sleep 30
done
# attempts only reaches the maximum when every attempt failed.
if [[ $attempts -eq $max_attempts ]]; then
  echo "Failed to delete VPC: $vpc after 3 attempts. Continue the loop to delete other vpc"
fi
3 changes: 1 addition & 2 deletions tests/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"github.com/NVIDIA/holodeck/pkg/jyaml"
"github.com/NVIDIA/holodeck/pkg/provider"
"github.com/NVIDIA/holodeck/pkg/provisioner"
"github.com/NVIDIA/holodeck/tests/common"
)

// Actual test suite
Expand All @@ -56,7 +55,7 @@ var _ = Describe("AWS", func() {
Expect(err).ToNot(HaveOccurred())

// Set unique name for the environment
opts.cfg.Name = opts.cfg.Name + "-" + common.GenerateUID()
common.SetCfgName(opts.cfg)
// set cache path
opts.cachePath = LogArtifactDir
// set cache file
Expand Down
21 changes: 20 additions & 1 deletion tests/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@

package common

import "math/rand"
import (
"fmt"
"math/rand"
"os"

"github.com/NVIDIA/holodeck/api/holodeck/v1alpha1"
)

func GenerateUID() string {
const charset = "abcdefghijklmnopqrstuvwxyz0123456789"
Expand All @@ -28,3 +34,16 @@ func GenerateUID() string {

return string(b)
}

// SetCfgName overwrites cfg.Name with a generated CI name of the form
// "ci<attempt>-<sha>-<uid>", where <attempt> is the GITHUB_RUN_ATTEMPT
// environment variable, <sha> is GITHUB_SHA cut down to at most its first
// eight characters, and <uid> is a fresh value from GenerateUID.
// Environment variables that are not set contribute an empty string.
func SetCfgName(cfg *v1alpha1.Environment) {
	const shortSHALen = 8

	runAttempt := os.Getenv("GITHUB_RUN_ATTEMPT")

	// Keep only the leading characters of the commit hash.
	commit := os.Getenv("GITHUB_SHA")
	if len(commit) > shortSHALen {
		commit = commit[:shortSHALen]
	}

	// GenerateUID yields a new suffix on every call, so each run gets its own name.
	cfg.Name = fmt.Sprintf("ci%s-%s-%s", runAttempt, commit, GenerateUID())
}

0 comments on commit d64df03

Please sign in to comment.