From c0917eb32d75514dacf1d1aaaf15dc3ce8cececf Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Mon, 16 Jun 2025 00:07:43 +0000 Subject: [PATCH] adding terraform EKS reference architecture --- .../4.amazon-eks/{ => eksctl}/README.md | 0 .../{ => eksctl}/eks-g4dn-vpc.yaml | 0 .../4.amazon-eks/{ => eksctl}/eks-g4dn.yaml | 0 .../{ => eksctl}/eks-g5-node-autorepair.yaml | 0 .../{ => eksctl}/eks-p4de-odcr-vpc.yaml | 0 .../{ => eksctl}/eks-p4de-odcr.yaml | 0 .../{ => eksctl}/eks-p5-capacity-block.yaml | 0 .../{ => eksctl}/eks-p5-odcr-vpc.yaml | 0 .../4.amazon-eks/terraform/.gitignore | 34 + .../4.amazon-eks/terraform/DESTROY_PROCESS.md | 175 ++++++ .../4.amazon-eks/terraform/README.md | 470 ++++++++++++++ .../4.amazon-eks/terraform/deploy.sh | 255 ++++++++ .../4.amazon-eks/terraform/destroy.sh | 389 ++++++++++++ .../examples/fsx-lustre-example.yaml | 105 ++++ .../terraform/examples/gpu-workload.yaml | 55 ++ .../examples/karpenter-workloads.yaml | 262 ++++++++ .../examples/node-auto-repair-test.yaml | 197 ++++++ .../examples/s3-mountpoint-example.yaml | 181 ++++++ .../terraform/examples/test-karpenter.sh | 292 +++++++++ .../4.amazon-eks/terraform/main.tf | 389 ++++++++++++ .../terraform/modules/addons/main.tf | 590 ++++++++++++++++++ .../terraform/modules/addons/outputs.tf | 39 ++ .../terraform/modules/addons/variables.tf | 109 ++++ .../terraform/modules/eks/main.tf | 149 +++++ .../terraform/modules/eks/outputs.tf | 49 ++ .../terraform/modules/eks/user_data.sh | 108 ++++ .../terraform/modules/eks/variables.tf | 98 +++ .../terraform/modules/fsx-lustre/main.tf | 114 ++++ .../terraform/modules/fsx-lustre/outputs.tf | 34 + .../terraform/modules/fsx-lustre/variables.tf | 122 ++++ .../terraform/modules/s3-mountpoint/main.tf | 193 ++++++ .../modules/s3-mountpoint/outputs.tf | 21 + .../modules/s3-mountpoint/variables.tf | 61 ++ .../terraform/modules/vpc/main.tf | 147 +++++ .../terraform/modules/vpc/outputs.tf | 69 ++ .../terraform/modules/vpc/variables.tf | 96 +++ .../4.amazon-eks/terraform/outputs.tf | 119 ++++ .../terraform/terraform.tfvars.example | 67 ++ .../4.amazon-eks/terraform/variables.tf | 251 ++++++++ 39 files changed, 5240 insertions(+) rename 1.architectures/4.amazon-eks/{ => eksctl}/README.md (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g4dn-vpc.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g4dn.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g5-node-autorepair.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p4de-odcr-vpc.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p4de-odcr.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p5-capacity-block.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p5-odcr-vpc.yaml (100%) create mode 100644 1.architectures/4.amazon-eks/terraform/.gitignore create mode 100644 1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md create mode 100644 1.architectures/4.amazon-eks/terraform/README.md create mode 100755 1.architectures/4.amazon-eks/terraform/deploy.sh create mode 100755 1.architectures/4.amazon-eks/terraform/destroy.sh create mode 100644 1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml create mode 100644 
1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml create mode 100755 1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh create mode 100644 1.architectures/4.amazon-eks/terraform/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/terraform.tfvars.example create mode 100644 1.architectures/4.amazon-eks/terraform/variables.tf diff --git a/1.architectures/4.amazon-eks/README.md b/1.architectures/4.amazon-eks/eksctl/README.md similarity index 100% rename from 1.architectures/4.amazon-eks/README.md rename to 1.architectures/4.amazon-eks/eksctl/README.md diff --git a/1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g4dn-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g4dn-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-g4dn.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g4dn.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g4dn.yaml diff --git a/1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g5-node-autorepair.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g5-node-autorepair.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p5-capacity-block.yaml similarity index 100% rename from 
1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p5-capacity-block.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p5-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p5-odcr-vpc.yaml diff --git a/1.architectures/4.amazon-eks/terraform/.gitignore b/1.architectures/4.amazon-eks/terraform/.gitignore new file mode 100644 index 000000000..3c75e8011 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/.gitignore @@ -0,0 +1,34 @@ +# Terraform files +*.tfstate +*.tfstate.* +*.tfvars +!*.tfvars.example +.terraform/ +.terraform.lock.hcl +tfplan +tfplan.* + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Logs +*.log + +# Temporary files +*.tmp +*.temp + +# Kubectl config +kubeconfig* + +# Backup files +*.backup +*.bak \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md b/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md new file mode 100644 index 000000000..e648362d5 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md @@ -0,0 +1,175 @@ +# Infrastructure Destruction Process + +This document outlines the safe destruction process implemented in `destroy.sh`. + +## Overview + +The `destroy.sh` script ensures safe cleanup of the EKS infrastructure by following a specific order to prevent orphaned AWS resources and failed Terraform destruction. + +## Destruction Process Flow + +### 1. Pre-Flight Checks +- ✅ Verify Terraform is installed +- ✅ Check kubectl connectivity to cluster +- ✅ Identify cluster name from Terraform state or kubectl context +- ✅ Confirm user intention with safety prompts + +### 2. Kubernetes Resource Cleanup + +#### Example Workloads +```bash +kubectl delete -f examples/gpu-workload.yaml +kubectl delete -f examples/fsx-lustre-example.yaml +kubectl delete -f examples/s3-mountpoint-example.yaml +kubectl delete -f examples/node-auto-repair-test.yaml +``` + +#### LoadBalancer Services +- Identifies all `LoadBalancer` type services across all namespaces +- Deletes each service individually with timeout protection +- Waits for AWS Load Balancers to be fully terminated + +#### Ingress Resources +- Finds all Ingress resources that may create ALBs/NLBs +- Deletes Ingress resources to trigger ALB cleanup +- Includes AWS Load Balancer Controller managed resources + +#### PersistentVolumeClaims +- Locates all PVCs that may create EBS volumes +- Deletes PVCs to release underlying EBS volumes +- Covers FSx, S3 Mountpoint, and standard EBS storage + +#### AWS Load Balancer Controller Resources +- Deletes TargetGroupBinding resources +- Ensures ALB/NLB target groups are cleaned up +- Prevents orphaned target groups + +### 3. Resource Cleanup Verification + +#### Wait Loop (10 minutes maximum) +```bash +# Continuously checks for: +- LoadBalancer services: 0 remaining +- PersistentVolumeClaims: 0 remaining +- Ingress resources: 0 remaining +``` + +#### AWS Resource Verification +```bash +# If AWS CLI available, checks for: +aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')]" +aws ec2 describe-security-groups --filters "Name=group-name,Values=*$CLUSTER_NAME*" +aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')]" +``` + +### 4. 
Terraform Destruction + +#### Safety Confirmation +- Final confirmation prompt before destruction +- Clear warning about data loss +- Option to cancel at any point + +#### Terraform Commands +```bash +# Initialize if needed +terraform init + +# Destroy with auto-approve +terraform destroy -auto-approve +``` + +### 5. Local Cleanup + +#### File Cleanup +```bash +rm -f terraform.tfstate.backup* +rm -f tfplan* +rm -f kubeconfig* +``` + +## Why This Order Matters + +### 1. **Prevent Orphaned Resources** +- Kubernetes-created AWS resources must be deleted first +- Terraform doesn't track LoadBalancers created by services +- PVCs create EBS volumes outside Terraform state + +### 2. **Avoid Terraform Errors** +- Security groups can't be deleted if still attached to resources +- Load balancers must be deleted before their target groups +- VPC can't be deleted with remaining ENIs + +### 3. **Cost Management** +- Prevents billing for orphaned load balancers +- Ensures EBS volumes are properly deleted +- Cleanup of target groups and security groups + +## Error Recovery + +### If Script Fails +```bash +# Manual cleanup commands provided in output +kubectl get svc --all-namespaces --field-selector spec.type=LoadBalancer +kubectl get pvc --all-namespaces +kubectl get ingress --all-namespaces + +# AWS CLI cleanup +aws elbv2 describe-load-balancers +aws ec2 describe-security-groups --filters "Name=group-name,Values=*eks*" +``` + +### Force Destruction +```bash +# Skip Kubernetes cleanup if cluster inaccessible +./destroy.sh --skip-k8s-cleanup + +# Bypass confirmations for automation +./destroy.sh --force +``` + +## Script Features + +### ✅ **Safety First** +- Multiple confirmation prompts +- Comprehensive resource detection +- Graceful error handling + +### ✅ **Comprehensive Cleanup** +- All AWS resource types covered +- Multiple cleanup strategies +- Verification steps + +### ✅ **User-Friendly** +- Colored output for clarity +- Progress indicators +- Detailed error messages + +### ✅ **Flexible Options** +- Skip Kubernetes cleanup +- Force mode for automation +- Help documentation + +## Expected Timeline + +| Phase | Duration | Description | +|-------|----------|-------------| +| Kubernetes Cleanup | 2-5 minutes | Delete services, PVCs, ingresses | +| AWS Resource Deletion | 5-10 minutes | Load balancers, target groups | +| Terraform Destroy | 5-15 minutes | VPC, EKS, FSx, etc. | +| **Total** | **12-30 minutes** | Complete infrastructure removal | + +## Best Practices + +1. **Always use the script** instead of direct `terraform destroy` +2. **Verify cleanup** before proceeding with Terraform destruction +3. **Check AWS console** for any remaining resources after completion +4. **Backup important data** before running destruction +5. **Test in non-production** environments first + +## Script Exit Codes + +- `0`: Successful completion +- `1`: Critical error (missing tools, failed destruction) +- `130`: User cancellation (Ctrl+C) + +This comprehensive approach ensures safe, complete, and cost-effective infrastructure destruction. 
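+
+## Checking for Leftover ENIs
+
+A common reason `terraform destroy` stalls at the VPC is a leftover elastic network interface (ENI) from a load balancer or pod that was not cleaned up. The sketch below is one way to list them before retrying the destroy; it assumes the AWS CLI is configured and that `VPC_ID` has been set to the VPC created by this stack (for example from `terraform output -raw vpc_id`).
+
+```bash
+# List ENIs still present in the VPC; each remaining ENI blocks VPC deletion
+aws ec2 describe-network-interfaces \
+  --filters "Name=vpc-id,Values=$VPC_ID" \
+  --query "NetworkInterfaces[].{Id:NetworkInterfaceId,Status:Status,Description:Description}" \
+  --output table
+```
+
+Deleting the Kubernetes resources listed above (LoadBalancer services, Ingresses, PVCs) normally removes these ENIs; only detach or delete an ENI manually once you are sure nothing managed still owns it.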
\ No newline at end of file
diff --git a/1.architectures/4.amazon-eks/terraform/README.md b/1.architectures/4.amazon-eks/terraform/README.md
new file mode 100644
index 000000000..bfb415f34
--- /dev/null
+++ b/1.architectures/4.amazon-eks/terraform/README.md
@@ -0,0 +1,470 @@
+# EKS Terraform Reference Architecture
+
+This directory contains a comprehensive Terraform configuration for deploying an Amazon EKS cluster with advanced features, including GPU support, FSx for Lustre, and Mountpoint for Amazon S3.
+
+## Architecture Overview
+
+This reference architecture includes:
+
+- **EKS Cluster**: Managed Kubernetes cluster with both default and GPU node groups
+- **GPU Support**: Dedicated GPU node groups with the NVIDIA device plugin and node auto-repair
+- **Node Auto-Repair**: Automatic detection and replacement of unhealthy nodes
+- **Storage Solutions**:
+  - FSx for Lustre for high-performance computing workloads
+  - Mountpoint for S3 for object storage access
+  - EBS and EFS CSI drivers
+- **Networking**: VPC with public/private subnets and VPC endpoints
+- **Security**: IAM roles with least-privilege access
+- **Monitoring**: CloudWatch integration and metrics server
+- **Auto-scaling**: Karpenter for intelligent and fast node provisioning
+
+## Prerequisites
+
+- AWS CLI configured with appropriate permissions
+- Terraform >= 1.0
+- kubectl
+- Helm (for add-ons)
+
+## Required AWS Permissions
+
+Your AWS credentials need permissions for:
+- EKS cluster management
+- EC2 instance and VPC management
+- IAM role and policy management
+- FSx file system management
+- S3 bucket access
+- CloudWatch and logging
+
+## Quick Start
+
+1. Clone this repository and change into the Terraform directory:
+   ```bash
+   git clone <repository-url>
+   cd 1.architectures/4.amazon-eks/terraform
+   ```
+
+2. Copy the example variables file:
+   ```bash
+   cp terraform.tfvars.example terraform.tfvars
+   ```
+
+3. Edit `terraform.tfvars` with your specific values:
+   - Update `cluster_endpoint_public_access_cidrs` with your IP ranges
+   - Set `s3_mountpoint_bucket_name` to your S3 bucket name
+   - Configure FSx S3 import/export paths if needed
+
+4. Initialize Terraform:
+   ```bash
+   terraform init
+   ```
+
+5. Plan the deployment:
+   ```bash
+   terraform plan
+   ```
+
+6. Apply the configuration:
+   ```bash
+   terraform apply
+   ```
+
+7. Configure kubectl:
+   ```bash
+   aws eks --region <region> update-kubeconfig --name <cluster-name>
+   ```
+
+## Module Structure
+
+```
+modules/
+├── vpc/            # VPC, subnets, and networking
+├── eks/            # EKS cluster and managed node groups
+├── fsx-lustre/     # FSx for Lustre file system
+├── s3-mountpoint/  # Mountpoint for S3 integration
+└── addons/         # Kubernetes add-ons and controllers
+```
+
+## Configuration Options
+
+### Node Groups
+
+#### Default Node Group
+- **Instance Types**: Configurable (default: m5.large, m5.xlarge)
+- **Scaling**: Auto-scaling with configurable min/max/desired capacity
+- **AMI**: Amazon Linux 2 EKS-optimized
+
+#### GPU Node Group
+- **Instance Types**: GPU-enabled instances (g4dn.xlarge, g4dn.2xlarge, p3.2xlarge)
+- **AMI**: Amazon Linux 2 EKS GPU-optimized
+- **Taints**: GPU nodes are tainted automatically
+- **Auto-repair**: Enabled with an extended grace period for GPU driver initialization
+
+### Node Auto-Repair
+
+Both node groups are configured with automatic node repair capabilities:
+
+#### Default Node Group Auto-Repair
+- **Health Check Type**: EC2 instance health checks
+- **Grace Period**: 300 seconds (5 minutes)
+- **Monitoring**: Continuously monitors node health via EC2 instance status
+- **Action**: Automatically replaces unhealthy nodes
+
+#### GPU Node Group Auto-Repair
+- **Health Check Type**: EC2 instance health checks
+- **Grace Period**: 600 seconds (10 minutes), extended to allow for GPU driver initialization
+- **Monitoring**: Enhanced monitoring for GPU-specific health issues
+- **Action**: Intelligent replacement that considers GPU resource constraints
+
+#### Auto-Repair Features
+- **Proactive Monitoring**: Detects node issues before they impact workloads
+- **Graceful Replacement**: Ensures workloads are safely rescheduled before node termination
+- **Cost Optimization**: Prevents resource waste from unhealthy nodes
+- **Zero-Touch Operations**: Reduces manual intervention for node maintenance
+
+### Auto-Scaling with Karpenter
+
+The reference architecture uses **Karpenter** instead of Cluster Autoscaler for node provisioning:
+
+#### Karpenter Advantages
+- **Fast Provisioning**: Sub-minute node startup times
+- **Cost Optimization**: Intelligent instance selection and Spot Instance support
+- **Flexible Scheduling**: Pod-driven node selection across diverse instance types
+- **Efficient Packing**: Optimal resource utilization and consolidation
+- **Zero Configuration**: Automatic node discovery and management
+
+#### Karpenter NodePools
+
+**Default NodePool** - for standard workloads:
+```yaml
+# Supports spot and on-demand instances
+capacity-types: ["spot", "on-demand"]
+instance-types: ["m5.*", "m5a.*", "c5.*"]
+consolidation: WhenUnderutilized (30s)
+expiration: 30 minutes
+```
+
+**GPU NodePool** - for GPU workloads:
+```yaml
+# GPU-specific instances with taints
+capacity-types: ["on-demand"]
+instance-types: ["g4dn.*", "g5.*", "p3.*"]
+consolidation: WhenEmpty (30s)
+expiration: 60 minutes
+gpu-taints: nvidia.com/gpu=true:NoSchedule
+```
+
+#### Karpenter vs Cluster Autoscaler
+
+| Feature | Karpenter | Cluster Autoscaler |
+|---------|-----------|--------------------|
+| **Provisioning Speed** | ~45 seconds | 3-5 minutes |
+| **Instance Selection** | Pod-driven | Node group limited |
+| **Spot Support** | Native & seamless | Limited |
+| **Cost Optimization** | Advanced bin-packing | Basic scaling |
+| **Configuration** | Declarative NodePools | ASG management |
+| **Multi-AZ** | Automatic | Manual setup |
+
+### Storage
+
+#### FSx for Lustre
+- 
**Deployment Types**: SCRATCH_1, SCRATCH_2, PERSISTENT_1, PERSISTENT_2 +- **S3 Integration**: Optional import/export paths +- **Performance**: Configurable throughput +- **Kubernetes Integration**: Automatic CSI driver and storage class creation + +#### Mountpoint for S3 +- **CSI Driver**: Automatically deployed and configured +- **IAM Integration**: IRSA (IAM Roles for Service Accounts) +- **Storage Classes**: Pre-configured for immediate use + +### Add-ons + +The following add-ons are included: + +- **Cluster Autoscaler**: Automatic node scaling +- **AWS Load Balancer Controller**: ALB and NLB integration +- **NVIDIA Device Plugin**: GPU resource management +- **Metrics Server**: Resource metrics collection +- **AWS Node Termination Handler**: Graceful spot instance handling +- **EBS CSI Driver**: EBS volume management +- **EFS CSI Driver**: EFS file system support + +## Security Best Practices + +- **Network Security**: Private subnets for worker nodes +- **IAM**: Least privilege access with IRSA +- **Encryption**: EBS volumes and secrets encryption +- **VPC Endpoints**: Reduced internet traffic and improved security +- **Security Groups**: Restrictive ingress rules + +## Monitoring and Logging + +- **CloudWatch**: Container Insights integration +- **VPC Flow Logs**: Network traffic monitoring +- **Node Metrics**: CPU, memory, and disk monitoring +- **Application Logs**: Centralized logging to CloudWatch + +## Example Workloads + +### GPU Workload Example + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-workload +spec: + replicas: 1 + selector: + matchLabels: + app: gpu-workload + template: + metadata: + labels: + app: gpu-workload + spec: + tolerations: + - key: nvidia.com/gpu + operator: Equal + value: "true" + effect: NoSchedule + nodeSelector: + nvidia.com/gpu: "true" + containers: + - name: gpu-container + image: nvidia/cuda:11.0-base + resources: + limits: + nvidia.com/gpu: 1 + command: ["nvidia-smi"] +``` + +### FSx Lustre Usage + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-lustre-sc + resources: + requests: + storage: 100Gi +``` + +### S3 Mountpoint Usage + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: s3-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: s3-mountpoint-sc + resources: + requests: + storage: 1000Gi +``` + +## Karpenter Workload Examples + +### Standard Workload with Spot Preference +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app +spec: + replicas: 5 + template: + spec: + containers: + - name: nginx + image: nginx + resources: + requests: + cpu: 100m + memory: 128Mi + # Prefer spot instances for cost optimization + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] +``` + +### GPU Workload Example +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: gpu-training +spec: + template: + spec: + containers: + - name: ml-training + image: nvidia/cuda:11.8-base + resources: + requests: + nvidia.com/gpu: 1 + cpu: 2000m + memory: 8Gi + nodeSelector: + node-type: gpu + tolerations: + - key: nvidia.com/gpu + effect: NoSchedule +``` + +### Testing Karpenter +```bash +# Deploy test workloads +kubectl apply -f examples/karpenter-workloads.yaml + +# Run comprehensive Karpenter tests +./examples/test-karpenter.sh test + +# Monitor 
Karpenter scaling +kubectl get nodes -w +kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f +``` + +## Cost Optimization + +- **Spot Instances**: Can be enabled for cost savings +- **Single NAT Gateway**: Reduces NAT gateway costs (configurable) +- **VPC Endpoints**: Reduces data transfer costs +- **Auto-scaling**: Right-sizing based on demand + +## Troubleshooting + +### Common Issues + +1. **GPU Nodes Not Ready**: Check NVIDIA driver installation in user data +2. **FSx Mount Issues**: Verify security group rules and Lustre client installation +3. **S3 Mountpoint Errors**: Check IAM permissions and bucket policies +4. **Karpenter Issues**: Check NodePools, EC2NodeClasses, and IAM permissions +5. **Node Auto-Repair Issues**: + - Check EC2 instance health in AWS console + - Verify health check grace periods are appropriate + - Monitor CloudWatch metrics for node health events + +### Debugging Commands + +```bash +# Check node status +kubectl get nodes -o wide + +# Check GPU resources +kubectl describe nodes -l nvidia.com/gpu=true + +# Check storage classes +kubectl get storageclass + +# Check persistent volumes +kubectl get pv,pvc + +# Check add-on status +kubectl get pods -n kube-system + +# Monitor node health and auto-repair +kubectl get nodes --show-labels +kubectl describe node + +# Check node group health in AWS CLI +aws eks describe-nodegroup --cluster-name --nodegroup-name + +# Monitor auto-repair events +kubectl get events --field-selector involvedObject.kind=Node --sort-by='.lastTimestamp' + +# Check Karpenter status and logs +kubectl get nodepool,ec2nodeclass +kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=50 + +# Test Karpenter provisioning +./examples/test-karpenter.sh monitor +``` + +## Cleanup + +### Safe Infrastructure Destruction + +Use the provided destroy script for safe cleanup: + +```bash +./destroy.sh +``` + +The destroy script will: +1. **Clean up Kubernetes resources** that create AWS resources (LoadBalancers, PVCs, Ingresses) +2. **Wait for AWS resources** to be fully deleted +3. **Run terraform destroy** to remove all infrastructure +4. **Clean up local files** (state backups, plans, etc.) + +### Script Options + +```bash +# Interactive cleanup (default) +./destroy.sh + +# Skip Kubernetes cleanup (if cluster is not accessible) +./destroy.sh --skip-k8s-cleanup + +# Force mode (skip confirmations) +./destroy.sh --force + +# Get help +./destroy.sh --help +``` + +### Manual Cleanup (if script fails) + +If the destroy script fails, you can manually clean up: + +```bash +# Delete example workloads +kubectl delete -f examples/ --ignore-not-found=true + +# Delete LoadBalancer services +kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' | while read ns svc; do kubectl delete svc $svc -n $ns; done + +# Delete PersistentVolumeClaims +kubectl delete pvc --all --all-namespaces + +# Delete Ingress resources +kubectl delete ingress --all --all-namespaces + +# Wait for AWS resources to be cleaned up (5-10 minutes) +# Then run terraform destroy +terraform destroy +``` + +**Important**: Always ensure Kubernetes resources are deleted before running `terraform destroy` to prevent orphaned AWS resources. + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make changes and test +4. Submit a pull request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
+ +## Support + +For issues and questions: +- Check the troubleshooting section +- Review AWS EKS documentation +- Open an issue in this repository \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/deploy.sh b/1.architectures/4.amazon-eks/terraform/deploy.sh new file mode 100755 index 000000000..ac4429ad3 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/deploy.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check prerequisites +check_prerequisites() { + print_status "Checking prerequisites..." + + # Check if terraform is installed + if ! command -v terraform &> /dev/null; then + print_error "Terraform is not installed. Please install Terraform first." + exit 1 + fi + + # Check if AWS CLI is installed + if ! command -v aws &> /dev/null; then + print_error "AWS CLI is not installed. Please install AWS CLI first." + exit 1 + fi + + # Check if kubectl is installed + if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed. Please install kubectl first." + exit 1 + fi + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + print_error "AWS credentials not configured. Please run 'aws configure' first." + exit 1 + fi + + print_success "All prerequisites are satisfied." +} + +# Check if terraform.tfvars exists +check_tfvars() { + if [ ! -f "terraform.tfvars" ]; then + print_warning "terraform.tfvars not found. Creating from example..." + cp terraform.tfvars.example terraform.tfvars + print_warning "Please edit terraform.tfvars with your specific values before proceeding." + print_warning "Key values to update:" + echo " - cluster_endpoint_public_access_cidrs (your IP ranges)" + echo " - s3_mountpoint_bucket_name (your S3 bucket name)" + echo " - fsx_s3_import_path and fsx_s3_export_path (if using S3 integration)" + read -p "Press Enter to continue after editing terraform.tfvars..." + fi +} + +# Initialize Terraform +init_terraform() { + print_status "Initializing Terraform..." + terraform init + print_success "Terraform initialized successfully." +} + +# Plan Terraform deployment +plan_terraform() { + print_status "Planning Terraform deployment..." + terraform plan -out=tfplan + print_success "Terraform plan completed successfully." +} + +# Apply Terraform configuration +apply_terraform() { + print_status "Applying Terraform configuration..." + print_warning "This will create AWS resources that may incur costs." + read -p "Do you want to continue? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + terraform apply tfplan + print_success "Terraform apply completed successfully." + else + print_status "Deployment cancelled." + exit 0 + fi +} + +# Configure kubectl +configure_kubectl() { + print_status "Configuring kubectl..." + + # Get cluster name and region from Terraform outputs + CLUSTER_NAME=$(terraform output -raw cluster_name) + REGION=$(terraform output -raw region) + + # Update kubeconfig + aws eks --region $REGION update-kubeconfig --name $CLUSTER_NAME + + # Test connection + if kubectl get nodes &> /dev/null; then + print_success "kubectl configured successfully." 
+ print_status "Cluster nodes:" + kubectl get nodes -o wide + else + print_error "Failed to connect to cluster. Please check your configuration." + exit 1 + fi +} + +# Deploy example workloads +deploy_examples() { + print_status "Do you want to deploy example workloads?" + read -p "Deploy examples? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + print_status "Deploying GPU workload example..." + kubectl apply -f examples/gpu-workload.yaml + + print_status "Deploying FSx Lustre example..." + kubectl apply -f examples/fsx-lustre-example.yaml + + print_status "Deploying S3 Mountpoint example..." + kubectl apply -f examples/s3-mountpoint-example.yaml + + print_success "Example workloads deployed successfully." + + print_status "Checking deployment status..." + kubectl get pods,pvc -o wide + fi +} + +# Display cluster information +show_cluster_info() { + print_status "Cluster Information:" + echo "====================" + + # Terraform outputs + echo "Cluster Name: $(terraform output -raw cluster_name)" + echo "Cluster Endpoint: $(terraform output -raw cluster_endpoint)" + echo "Region: $(terraform output -raw region)" + echo "VPC ID: $(terraform output -raw vpc_id)" + + echo "" + print_status "Useful Commands:" + echo "==================" + echo "View cluster nodes: kubectl get nodes -o wide" + echo "View all pods: kubectl get pods --all-namespaces" + echo "View storage classes: kubectl get storageclass" + echo "View persistent volumes: kubectl get pv,pvc --all-namespaces" + echo "Check GPU nodes: kubectl describe nodes -l nvidia.com/gpu=true" + echo "View cluster info: kubectl cluster-info" + + echo "" + print_status "Monitoring:" + echo "===========" + echo "Check cluster autoscaler: kubectl logs -n kube-system -l app=cluster-autoscaler" + echo "Check load balancer controller: kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller" + echo "Check NVIDIA device plugin: kubectl logs -n kube-system -l name=nvidia-device-plugin-ds" +} + +# Main deployment function +main() { + print_status "Starting EKS Reference Architecture Deployment" + echo "==============================================" + + check_prerequisites + check_tfvars + init_terraform + plan_terraform + apply_terraform + configure_kubectl + deploy_examples + show_cluster_info + + print_success "Deployment completed successfully!" + print_status "Your EKS cluster is ready to use." + echo "" + print_status "To destroy the infrastructure safely, use:" + echo " ./destroy.sh" +} + +# Cleanup function +cleanup() { + print_status "Starting cleanup process..." + print_warning "This will destroy all resources created by Terraform." + print_warning "Make sure to delete any Kubernetes resources (LoadBalancers, PVCs) first!" + + read -p "Are you sure you want to destroy all resources? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + # Delete example workloads first + print_status "Deleting example workloads..." + kubectl delete -f examples/ --ignore-not-found=true || true + + # Wait for cleanup + print_status "Waiting for Kubernetes resources to be cleaned up..." + sleep 30 + + # Destroy Terraform resources + print_status "Destroying Terraform resources..." + terraform destroy -auto-approve + + print_success "Cleanup completed successfully." + else + print_status "Cleanup cancelled." 
+ fi +} + +# Script usage +usage() { + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " deploy - Deploy the EKS cluster (default)" + echo " cleanup - Destroy all resources" + echo " help - Show this help message" + echo "" + echo "Examples:" + echo " $0 deploy # Deploy the cluster" + echo " $0 cleanup # Destroy the cluster" + echo " $0 # Deploy the cluster (default)" +} + +# Parse command line arguments +case "${1:-deploy}" in + deploy) + main + ;; + cleanup) + cleanup + ;; + help|--help|-h) + usage + ;; + *) + print_error "Unknown command: $1" + usage + exit 1 + ;; +esac \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/destroy.sh b/1.architectures/4.amazon-eks/terraform/destroy.sh new file mode 100755 index 000000000..aab52d3bb --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/destroy.sh @@ -0,0 +1,389 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to check if kubectl is configured +check_kubectl() { + print_status "Checking kubectl configuration..." + if ! kubectl cluster-info &> /dev/null; then + print_warning "kubectl is not configured or cluster is not accessible." + print_warning "Some cleanup steps will be skipped." + return 1 + fi + print_success "kubectl is configured and cluster is accessible." + return 0 +} + +# Function to get cluster name from terraform output +get_cluster_name() { + if [ -f "terraform.tfstate" ]; then + CLUSTER_NAME=$(terraform output -raw cluster_name 2>/dev/null || echo "") + if [ -n "$CLUSTER_NAME" ]; then + print_status "Found cluster name from Terraform: $CLUSTER_NAME" + return 0 + fi + fi + + # Try to get from kubectl context + CLUSTER_NAME=$(kubectl config current-context 2>/dev/null | grep -o 'arn:aws:eks:[^:]*:[^:]*:cluster/[^/]*' | cut -d'/' -f2 2>/dev/null || echo "") + if [ -n "$CLUSTER_NAME" ]; then + print_status "Found cluster name from kubectl context: $CLUSTER_NAME" + return 0 + fi + + print_warning "Could not determine cluster name" + return 1 +} + +# Function to delete LoadBalancer services +delete_load_balancers() { + print_status "Checking for LoadBalancer services..." + + LB_SERVICES=$(kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$LB_SERVICES" ]; then + print_warning "Found LoadBalancer services that need to be deleted:" + echo "$LB_SERVICES" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + SERVICE=$(echo "$line" | awk '{print $2}') + print_status "Deleting LoadBalancer service: $NAMESPACE/$SERVICE" + kubectl delete svc "$SERVICE" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete service $NAMESPACE/$SERVICE" + fi + done <<< "$LB_SERVICES" + + print_status "Waiting for LoadBalancers to be fully deleted..." + sleep 30 + else + print_success "No LoadBalancer services found." + fi +} + +# Function to delete Ingress resources +delete_ingresses() { + print_status "Checking for Ingress resources..." 
+ + INGRESSES=$(kubectl get ingress --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$INGRESSES" ]; then + print_warning "Found Ingress resources that need to be deleted:" + echo "$INGRESSES" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + INGRESS=$(echo "$line" | awk '{print $2}') + print_status "Deleting Ingress: $NAMESPACE/$INGRESS" + kubectl delete ingress "$INGRESS" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete ingress $NAMESPACE/$INGRESS" + fi + done <<< "$INGRESSES" + + print_status "Waiting for Ingresses to be fully deleted..." + sleep 30 + else + print_success "No Ingress resources found." + fi +} + +# Function to delete PersistentVolumeClaims +delete_pvcs() { + print_status "Checking for PersistentVolumeClaims..." + + PVCS=$(kubectl get pvc --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$PVCS" ]; then + print_warning "Found PersistentVolumeClaims that need to be deleted:" + echo "$PVCS" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + PVC=$(echo "$line" | awk '{print $2}') + print_status "Deleting PVC: $NAMESPACE/$PVC" + kubectl delete pvc "$PVC" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete PVC $NAMESPACE/$PVC" + fi + done <<< "$PVCS" + + print_status "Waiting for PVCs to be fully deleted..." + sleep 30 + else + print_success "No PersistentVolumeClaims found." + fi +} + +# Function to delete example workloads +delete_example_workloads() { + print_status "Deleting example workloads..." + + if [ -d "examples" ]; then + for example_file in examples/*.yaml; do + if [ -f "$example_file" ]; then + print_status "Deleting resources from $example_file" + kubectl delete -f "$example_file" --ignore-not-found=true --timeout=300s || print_warning "Failed to delete some resources from $example_file" + fi + done + print_success "Example workloads cleanup completed." + else + print_status "No examples directory found." + fi +} + +# Function to delete AWS Load Balancer Controller resources +delete_alb_resources() { + print_status "Checking for AWS Load Balancer Controller managed resources..." + + # Delete TargetGroupBinding resources + TGB_RESOURCES=$(kubectl get targetgroupbindings --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$TGB_RESOURCES" ]; then + print_warning "Found TargetGroupBinding resources:" + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + TGB=$(echo "$line" | awk '{print $2}') + print_status "Deleting TargetGroupBinding: $NAMESPACE/$TGB" + kubectl delete targetgroupbinding "$TGB" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete TargetGroupBinding $NAMESPACE/$TGB" + fi + done <<< "$TGB_RESOURCES" + fi + + print_success "AWS Load Balancer Controller resources cleanup completed." +} + +# Function to wait for all resources to be deleted +wait_for_cleanup() { + print_status "Waiting for all Kubernetes resources to be fully cleaned up..." 
+ + # Wait up to 10 minutes for resources to be deleted + for i in {1..60}; do + LB_COUNT=$(kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.name}{"\n"}{end}' 2>/dev/null | wc -l || echo "0") + PVC_COUNT=$(kubectl get pvc --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0") + INGRESS_COUNT=$(kubectl get ingress --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0") + + if [ "$LB_COUNT" -eq 0 ] && [ "$PVC_COUNT" -eq 0 ] && [ "$INGRESS_COUNT" -eq 0 ]; then + print_success "All Kubernetes resources have been cleaned up." + break + fi + + print_status "Still waiting for cleanup... (${i}/60) - LBs: $LB_COUNT, PVCs: $PVC_COUNT, Ingresses: $INGRESS_COUNT" + sleep 10 + done +} + +# Function to check for remaining AWS resources +check_aws_resources() { + print_status "Checking for remaining AWS resources that might block Terraform destroy..." + + if command -v aws &> /dev/null && [ -n "$CLUSTER_NAME" ]; then + print_status "Checking for remaining Load Balancers..." + aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')].LoadBalancerName" --output table 2>/dev/null || print_warning "Could not check ELBv2 resources" + + print_status "Checking for remaining Security Groups..." + aws ec2 describe-security-groups --filters "Name=group-name,Values=*$CLUSTER_NAME*" --query "SecurityGroups[].GroupName" --output table 2>/dev/null || print_warning "Could not check Security Groups" + + print_status "Checking for remaining Target Groups..." + aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')].TargetGroupName" --output table 2>/dev/null || print_warning "Could not check Target Groups" + else + print_warning "AWS CLI not available or cluster name not found. Skipping AWS resource check." + fi +} + +# Function to run terraform destroy +run_terraform_destroy() { + print_status "Running terraform destroy..." + print_warning "This will destroy all Terraform-managed infrastructure." + print_warning "Make sure you have backed up any important data." + + read -p "Are you sure you want to proceed with terraform destroy? (yes/no): " -r + if [[ $REPLY == "yes" ]]; then + print_status "Proceeding with terraform destroy..." + + # Initialize terraform if needed + if [ ! -d ".terraform" ]; then + print_status "Initializing Terraform..." + terraform init + fi + + # Run destroy with auto-approve + terraform destroy -auto-approve + + if [ $? -eq 0 ]; then + print_success "Terraform destroy completed successfully!" + else + print_error "Terraform destroy failed. Please check the output above." + exit 1 + fi + else + print_status "Terraform destroy cancelled." + exit 0 + fi +} + +# Function to cleanup local files +cleanup_local_files() { + print_status "Cleaning up local files..." + + # Remove terraform state backup files + rm -f terraform.tfstate.backup* 2>/dev/null || true + rm -f tfplan* 2>/dev/null || true + + # Remove kubectl config backups + rm -f kubeconfig* 2>/dev/null || true + + print_success "Local cleanup completed." +} + +# Main function +main() { + print_status "Starting EKS Infrastructure Destruction" + echo "========================================" + + print_warning "This script will:" + echo "1. Delete all Kubernetes resources that create AWS resources" + echo "2. Wait for cleanup to complete" + echo "3. Run terraform destroy to remove all infrastructure" + echo "4. Clean up local files" + echo "" + + read -p "Do you want to continue? 
(y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + print_status "Destruction cancelled." + exit 0 + fi + + # Check prerequisites + if ! command -v terraform &> /dev/null; then + print_error "Terraform is not installed." + exit 1 + fi + + # Get cluster name + get_cluster_name + + # Check kubectl and proceed with Kubernetes cleanup if available + if check_kubectl; then + print_status "Starting Kubernetes resource cleanup..." + + delete_example_workloads + delete_load_balancers + delete_ingresses + delete_alb_resources + delete_pvcs + wait_for_cleanup + + print_success "Kubernetes cleanup completed." + else + print_warning "Skipping Kubernetes cleanup due to connectivity issues." + print_warning "You may need to manually clean up AWS resources if Terraform destroy fails." + fi + + # Check for remaining AWS resources + check_aws_resources + + # Wait a bit more to ensure AWS resources are cleaned up + print_status "Waiting additional 60 seconds for AWS resource cleanup..." + sleep 60 + + # Run terraform destroy + run_terraform_destroy + + # Clean up local files + cleanup_local_files + + print_success "Infrastructure destruction completed!" + print_status "All resources have been destroyed and local files cleaned up." +} + +# Handle script termination +cleanup_on_exit() { + print_warning "Script interrupted. Some resources may not be fully cleaned up." + print_warning "You may need to manually delete remaining AWS resources." +} + +trap cleanup_on_exit EXIT + +# Help function +usage() { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --skip-k8s-cleanup Skip Kubernetes resource cleanup" + echo " --force Skip confirmation prompts" + echo " --help Show this help message" + echo "" + echo "This script safely destroys the EKS infrastructure by:" + echo "1. Cleaning up Kubernetes resources that create AWS resources" + echo "2. Waiting for AWS resources to be fully deleted" + echo "3. Running terraform destroy" + echo "4. Cleaning up local files" +} + +# Parse command line arguments +SKIP_K8S_CLEANUP=false +FORCE=false + +while [[ $# -gt 0 ]]; do + case $1 in + --skip-k8s-cleanup) + SKIP_K8S_CLEANUP=true + shift + ;; + --force) + FORCE=true + shift + ;; + --help|-h) + usage + exit 0 + ;; + *) + print_error "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Override main function for options +if [ "$SKIP_K8S_CLEANUP" = true ]; then + print_warning "Skipping Kubernetes cleanup as requested." + run_terraform_destroy + cleanup_local_files + exit 0 +fi + +if [ "$FORCE" = true ]; then + print_warning "Force mode enabled. Skipping confirmations." 
+ # Override read commands in functions + export REPLY="yes" +fi + +# Run main function +main \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml b/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml new file mode 100644 index 000000000..0b5e47f38 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml @@ -0,0 +1,105 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc + namespace: default +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-lustre-sc + resources: + requests: + storage: 100Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fsx-lustre-test + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: fsx-lustre-test + template: + metadata: + labels: + app: fsx-lustre-test + spec: + containers: + - name: test-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y util-linux + echo "Testing FSx Lustre mount at /mnt/fsx" + df -h /mnt/fsx + echo "Writing test file..." + echo "$(date): Hello from $(hostname)" >> /mnt/fsx/test-$(hostname).txt + echo "Reading test files:" + ls -la /mnt/fsx/ + cat /mnt/fsx/test-*.txt + echo "FSx Lustre test completed. Sleeping..." + sleep 3600 + volumeMounts: + - name: fsx-volume + mountPath: /mnt/fsx + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: fsx-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: fsx-performance-test + namespace: default +spec: + template: + spec: + containers: + - name: performance-test + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y time + echo "Running FSx Lustre performance test..." + + # Write performance test + echo "Testing write performance..." + time dd if=/dev/zero of=/mnt/fsx/test-write-$(date +%s).dat bs=1M count=100 oflag=direct + + # Read performance test + echo "Testing read performance..." 
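+              # iflag=direct bypasses the page cache, so this read measures FSx for Lustre throughput rather than local memory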
+ time dd if=/mnt/fsx/test-write-*.dat of=/dev/null bs=1M iflag=direct + + echo "Performance test completed" + volumeMounts: + - name: fsx-volume + mountPath: /mnt/fsx + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: fsx-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + restartPolicy: Never + backoffLimit: 3 \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml b/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml new file mode 100644 index 000000000..0cc704f04 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-test + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: gpu-test + template: + metadata: + labels: + app: gpu-test + spec: + tolerations: + - key: nvidia.com/gpu + operator: Equal + value: "true" + effect: NoSchedule + nodeSelector: + nvidia.com/gpu: "true" + containers: + - name: gpu-container + image: nvidia/cuda:11.8-base-ubuntu20.04 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + command: + - /bin/bash + - -c + - | + nvidia-smi + echo "GPU test completed successfully" + sleep 3600 + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" +--- +apiVersion: v1 +kind: Service +metadata: + name: gpu-test-service + namespace: default +spec: + selector: + app: gpu-test + ports: + - port: 80 + targetPort: 8080 + type: ClusterIP \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml b/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml new file mode 100644 index 000000000..078b9ebc2 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml @@ -0,0 +1,262 @@ +--- +# Example 1: Standard workload that will be scheduled on Karpenter-managed nodes +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-example-app + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + app: karpenter-example + template: + metadata: + labels: + app: karpenter-example + spec: + containers: + - name: app + image: nginx:latest + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + ports: + - containerPort: 80 + # This workload will prefer spot instances but can use on-demand + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + +--- +# Example 2: GPU workload that requires GPU nodes +apiVersion: batch/v1 +kind: Job +metadata: + name: karpenter-gpu-workload + namespace: default +spec: + template: + spec: + containers: + - name: gpu-job + image: nvidia/cuda:11.8-base-ubuntu20.04 + command: + - /bin/bash + - -c + - | + echo "Starting GPU workload on Karpenter-managed node" + nvidia-smi + echo "GPU information:" + nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv + # Simulate GPU workload + echo "Running CUDA sample..." 
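+              # The sleep below stands in for a real CUDA workload; replace it with your actual training or inference command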
+ sleep 300 + echo "GPU workload completed" + resources: + requests: + nvidia.com/gpu: 1 + cpu: 1000m + memory: 2Gi + limits: + nvidia.com/gpu: 1 + cpu: 2000m + memory: 4Gi + # This will specifically target GPU nodes managed by Karpenter + nodeSelector: + node-type: gpu + nvidia.com/gpu: "true" + tolerations: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + restartPolicy: Never + backoffLimit: 3 + +--- +# Example 3: Burst workload that demonstrates Karpenter's fast scaling +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-burst-workload + namespace: default +spec: + replicas: 1 # Will be scaled up to demonstrate Karpenter + selector: + matchLabels: + app: burst-workload + template: + metadata: + labels: + app: burst-workload + spec: + containers: + - name: cpu-intensive + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Starting CPU-intensive workload on $(hostname)" + echo "Node labels:" + cat /etc/hostname + # Simulate CPU-intensive work + while true; do + echo "Working... $(date)" + # Light CPU work to demonstrate scaling + dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null + sleep 10 + done + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + +--- +# Example 4: Mixed workload that can run on both spot and on-demand +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-mixed-workload + namespace: default +spec: + replicas: 5 + selector: + matchLabels: + app: mixed-workload + template: + metadata: + labels: + app: mixed-workload + spec: + containers: + - name: web-server + image: httpd:2.4 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + ports: + - containerPort: 80 + # Demonstrate Karpenter's intelligent node selection + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + affinity: + # Prefer spot instances for cost optimization + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + +--- +# Example 5: Batch job demonstrating Karpenter's ability to scale from zero +apiVersion: batch/v1 +kind: Job +metadata: + name: karpenter-batch-processing + namespace: default +spec: + parallelism: 10 + completions: 50 + template: + spec: + containers: + - name: batch-processor + image: alpine:latest + command: + - /bin/sh + - -c + - | + echo "Batch job starting on $(hostname)" + echo "Processing item $RANDOM..." 
+ # Simulate batch processing work + sleep $((RANDOM % 60 + 30)) + echo "Batch job completed on $(hostname)" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + restartPolicy: Never + backoffLimit: 3 + +--- +# Service for the example app +apiVersion: v1 +kind: Service +metadata: + name: karpenter-example-service + namespace: default +spec: + selector: + app: karpenter-example + ports: + - port: 80 + targetPort: 80 + type: ClusterIP + +--- +# HPA for demonstrating Karpenter's integration with pod autoscaling +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: karpenter-example-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: karpenter-burst-workload + minReplicas: 1 + maxReplicas: 20 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml b/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml new file mode 100644 index 000000000..2d683120b --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml @@ -0,0 +1,197 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-health-test-script + namespace: default +data: + test-script.sh: | + #!/bin/bash + echo "Node Auto-Repair Test Script" + echo "==============================" + + # Get current node information + NODE_NAME=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | head -1) + echo "Testing with node: $NODE_NAME" + + # Check node health status + echo "Current node status:" + kubectl describe node $NODE_NAME | grep -A 5 "Conditions:" + + # Monitor node health over time + echo "Monitoring node health for 5 minutes..." 
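+    # 30 checks with a 10-second sleep between them cover the 5-minute monitoring window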
+ for i in {1..30}; do + echo "Check $i/30 at $(date)" + kubectl get nodes --no-headers | grep -v "Ready" + sleep 10 + done + + echo "Node health monitoring complete" + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: node-auto-repair-test + namespace: default +spec: + template: + spec: + serviceAccountName: default + containers: + - name: node-health-tester + image: bitnami/kubectl:latest + command: + - /bin/bash + - /scripts/test-script.sh + volumeMounts: + - name: test-script + mountPath: /scripts + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: test-script + configMap: + name: node-health-test-script + defaultMode: 0755 + restartPolicy: Never + backoffLimit: 1 + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-health-monitor + namespace: default + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-health-monitor +rules: +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-health-monitor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-health-monitor +subjects: +- kind: ServiceAccount + name: node-health-monitor + namespace: default + +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-health-monitor + namespace: default + labels: + app: node-health-monitor +spec: + selector: + matchLabels: + app: node-health-monitor + template: + metadata: + labels: + app: node-health-monitor + spec: + serviceAccountName: node-health-monitor + hostNetwork: true + hostPID: true + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: nvidia.com/gpu + effect: NoSchedule + containers: + - name: node-health-monitor + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y procps-ng util-linux + + echo "Node Health Monitor Starting on $(hostname)" + echo "Node: $NODE_NAME" + echo "Namespace: $POD_NAMESPACE" + + while true; do + # Check system health + echo "=== Health Check at $(date) ===" + + # CPU and Memory usage + echo "CPU Usage:" + cat /proc/loadavg + + echo "Memory Usage:" + free -h + + # Disk usage + echo "Disk Usage:" + df -h | grep -E "(/$|/var|/tmp)" + + # Check for any hardware issues + echo "Checking dmesg for errors:" + dmesg | tail -10 | grep -i "error\|fail\|warning" || echo "No recent errors found" + + # GPU health check (if GPU node) + if command -v nvidia-smi &> /dev/null; then + echo "GPU Status:" + nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits || echo "GPU check failed" + fi + + echo "Health check complete" + echo "==========================" + sleep 60 + done + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + securityContext: + privileged: true + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml 
b/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml new file mode 100644 index 000000000..de90229f8 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml @@ -0,0 +1,181 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: s3-mountpoint-pvc + namespace: default +spec: + accessModes: + - ReadWriteMany + storageClassName: s3-mountpoint-sc + resources: + requests: + storage: 1000Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: s3-mountpoint-test + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: s3-mountpoint-test + template: + metadata: + labels: + app: s3-mountpoint-test + spec: + containers: + - name: test-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + echo "Testing S3 Mountpoint at /mnt/s3" + ls -la /mnt/s3/ + echo "Writing test file..." + echo "$(date): Hello from $(hostname)" > /mnt/s3/test-$(hostname)-$(date +%s).txt + echo "Listing files in S3 mount:" + ls -la /mnt/s3/ + echo "Reading back test files:" + cat /mnt/s3/test-*.txt || echo "No test files found yet" + echo "S3 Mountpoint test running. Sleeping..." + sleep 3600 + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: s3-backup-job + namespace: default +spec: + schedule: "0 */6 * * *" # Every 6 hours + jobTemplate: + spec: + template: + spec: + containers: + - name: backup-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + echo "Starting backup job at $(date)" + + # Create backup directory + mkdir -p /mnt/s3/backups/$(date +%Y-%m-%d) + + # Example: backup some application data + echo "Backup completed at $(date)" > /mnt/s3/backups/$(date +%Y-%m-%d)/backup-$(date +%H%M%S).log + + # List backup files + echo "Current backups:" + ls -la /mnt/s3/backups/ + + echo "Backup job completed" + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc + restartPolicy: OnFailure +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: s3-app-config + namespace: default +data: + app.conf: | + # Application configuration + data_path=/mnt/s3/data + log_path=/mnt/s3/logs + backup_path=/mnt/s3/backups +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: s3-data-processor + namespace: default +spec: + serviceName: s3-data-processor + replicas: 1 + selector: + matchLabels: + app: s3-data-processor + template: + metadata: + labels: + app: s3-data-processor + spec: + containers: + - name: data-processor + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + source /etc/app/app.conf + echo "Data processor starting..." + mkdir -p $data_path $log_path $backup_path + + while true; do + timestamp=$(date +%Y%m%d-%H%M%S) + echo "[$timestamp] Processing data..." | tee -a $log_path/processor-$timestamp.log + + # Simulate data processing + echo "Sample data: $(date)" > $data_path/data-$timestamp.dat + + # Archive old data every 10 iterations + if [ $(($(date +%S) % 10)) -eq 0 ]; then + echo "Archiving old data..." 
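+            # Bundle the accumulated .dat files into a timestamped tarball under $backup_path,
+            # then remove them from the staging directory.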
+ tar -czf $backup_path/archive-$timestamp.tar.gz $data_path/*.dat + rm -f $data_path/*.dat + fi + + sleep 30 + done + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + - name: config-volume + mountPath: /etc/app + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc + - name: config-volume + configMap: + name: s3-app-config \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh b/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh new file mode 100755 index 000000000..ff3455add --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if kubectl is available and connected +check_kubectl() { + print_status "Checking kubectl connectivity..." + if ! kubectl cluster-info &> /dev/null; then + print_error "kubectl is not configured or cluster is not accessible." + exit 1 + fi + print_success "kubectl is configured and cluster is accessible." +} + +# Check if Karpenter is installed +check_karpenter() { + print_status "Checking if Karpenter is installed..." + if kubectl get deployment -n karpenter karpenter &> /dev/null; then + print_success "Karpenter is installed and running." + kubectl get pods -n karpenter + else + print_error "Karpenter is not installed. Please deploy the cluster first." + exit 1 + fi +} + +# Check Karpenter NodePools and EC2NodeClasses +check_karpenter_resources() { + print_status "Checking Karpenter NodePools..." + kubectl get nodepool + + print_status "Checking Karpenter EC2NodeClasses..." + kubectl get ec2nodeclass + + print_status "Checking current nodes..." + kubectl get nodes -o wide +} + +# Deploy test workloads +deploy_workloads() { + print_status "Deploying Karpenter test workloads..." + kubectl apply -f karpenter-workloads.yaml + + print_status "Waiting for deployments to be ready..." + kubectl wait --for=condition=available --timeout=300s deployment/karpenter-example-app + kubectl wait --for=condition=available --timeout=300s deployment/karpenter-mixed-workload + + print_success "Test workloads deployed successfully." +} + +# Test Karpenter scaling +test_scaling() { + print_status "Testing Karpenter node provisioning..." + + # Get initial node count + INITIAL_NODES=$(kubectl get nodes --no-headers | wc -l) + print_status "Initial node count: $INITIAL_NODES" + + # Scale up the burst workload to trigger node provisioning + print_status "Scaling up burst workload to trigger node provisioning..." + kubectl scale deployment karpenter-burst-workload --replicas=10 + + # Wait for Karpenter to provision new nodes + print_status "Waiting for Karpenter to provision new nodes (this may take 2-3 minutes)..." + + for i in {1..18}; do # Wait up to 3 minutes + CURRENT_NODES=$(kubectl get nodes --no-headers | wc -l) + if [ $CURRENT_NODES -gt $INITIAL_NODES ]; then + print_success "Karpenter provisioned new nodes! Current count: $CURRENT_NODES" + break + fi + echo "Waiting... 
($i/18) Current nodes: $CURRENT_NODES" + sleep 10 + done + + # Show new nodes + print_status "Current node status:" + kubectl get nodes -o wide + + # Show Karpenter logs + print_status "Recent Karpenter logs:" + kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=20 +} + +# Test GPU workload (if GPU nodes are available) +test_gpu_workload() { + print_status "Testing GPU workload..." + + # Check if GPU NodePool exists + if kubectl get nodepool gpu &> /dev/null; then + print_status "GPU NodePool found. Deploying GPU workload..." + kubectl apply -f - </dev/null | wc -l || echo "0") + + if [ $SPOT_NODES -gt 0 ]; then + print_success "Found $SPOT_NODES spot instance(s)." + kubectl get nodes -l karpenter.sh/capacity-type=spot -o wide + + print_status "Karpenter should handle spot interruptions automatically." + print_status "Check SQS queue for interruption messages: $(kubectl get nodepool default -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "N/A")" + else + print_warning "No spot instances found. Karpenter may not have provisioned spot instances yet." + fi +} + +# Monitor Karpenter metrics +monitor_karpenter() { + print_status "Monitoring Karpenter status..." + + # Show NodePool status + print_status "NodePool status:" + kubectl get nodepool -o wide + + # Show node capacity and usage + print_status "Node resource usage:" + kubectl top nodes 2>/dev/null || print_warning "Metrics server not available" + + # Show pod distribution + print_status "Pod distribution across nodes:" + kubectl get pods -o wide | grep -E "(karpenter|gpu|burst|mixed)" | head -10 + + # Show Karpenter events + print_status "Recent Karpenter events:" + kubectl get events --field-selector involvedObject.kind=Node --sort-by='.lastTimestamp' | tail -10 +} + +# Scale down test +test_scale_down() { + print_status "Testing Karpenter scale-down behavior..." + + # Scale down workloads + print_status "Scaling down workloads..." + kubectl scale deployment karpenter-burst-workload --replicas=1 + kubectl scale deployment karpenter-mixed-workload --replicas=2 + + print_status "Workloads scaled down. Karpenter should consolidate or terminate unused nodes." + print_status "This process may take several minutes. Monitor with: kubectl get nodes -w" + + CURRENT_NODES=$(kubectl get nodes --no-headers | wc -l) + print_status "Current node count: $CURRENT_NODES" + print_status "Karpenter will evaluate nodes for termination based on the consolidation policy." +} + +# Cleanup function +cleanup() { + print_status "Cleaning up test resources..." + kubectl delete -f karpenter-workloads.yaml --ignore-not-found=true + kubectl delete job karpenter-gpu-test --ignore-not-found=true + print_success "Cleanup completed." 
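+    # Also invoked automatically on script exit via the 'trap cleanup EXIT' registered at the bottom of this script.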
+} + +# Show help +show_help() { + echo "Karpenter Test Script" + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " deploy Deploy test workloads" + echo " test Run comprehensive Karpenter tests" + echo " scale Test scaling behavior" + echo " gpu Test GPU workload" + echo " monitor Monitor Karpenter status" + echo " cleanup Remove test workloads" + echo " help Show this help message" + echo "" + echo "Examples:" + echo " $0 test # Run full test suite" + echo " $0 deploy # Just deploy workloads" + echo " $0 monitor # Monitor current status" +} + +# Main function +main() { + case "${1:-test}" in + deploy) + check_kubectl + check_karpenter + deploy_workloads + ;; + test) + check_kubectl + check_karpenter + check_karpenter_resources + deploy_workloads + test_scaling + test_gpu_workload + test_spot_handling + monitor_karpenter + test_scale_down + ;; + scale) + check_kubectl + check_karpenter + test_scaling + ;; + gpu) + check_kubectl + check_karpenter + test_gpu_workload + ;; + monitor) + check_kubectl + check_karpenter + monitor_karpenter + ;; + cleanup) + check_kubectl + cleanup + ;; + help|--help|-h) + show_help + ;; + *) + print_error "Unknown command: $1" + show_help + exit 1 + ;; + esac +} + +# Handle script interruption +trap cleanup EXIT + +# Run main function +main "$@" \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/main.tf b/1.architectures/4.amazon-eks/terraform/main.tf new file mode 100644 index 000000000..2905f4164 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/main.tf @@ -0,0 +1,389 @@ +terraform { + required_version = ">= 1.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.23" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.11" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "~> 1.14" + } + } +} + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + +provider "kubectl" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +locals { + name = var.cluster_name + tags = { + Environment = var.environment + Project = "EKS-Reference-Architecture" + ManagedBy = "Terraform" + } +} + +data "aws_availability_zones" "available" { + filter { + name = "state" + values = ["available"] + } +} + +data "aws_caller_identity" "current" {} + +module "vpc" { + source = "./modules/vpc" + + name = local.name + cidr = var.vpc_cidr + + azs = slice(data.aws_availability_zones.available.names, 0, 3) + private_subnets = var.private_subnets + public_subnets = var.public_subnets + + enable_nat_gateway = true + enable_vpn_gateway = false + enable_dns_hostnames = true + 
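+  # Both DNS settings are required for EKS when the private cluster endpoint is
+  # enabled, so worker nodes can resolve the API server by name and register.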
enable_dns_support = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + "karpenter.sh/discovery" = local.name + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} + +module "eks" { + source = "./modules/eks" + + cluster_name = local.name + cluster_version = var.cluster_version + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + control_plane_subnet_ids = module.vpc.private_subnets + + cluster_endpoint_private_access = true + cluster_endpoint_public_access = true + cluster_endpoint_public_access_cidrs = var.cluster_endpoint_public_access_cidrs + + cluster_encryption_config = [ + { + provider_key_arn = aws_kms_key.eks.arn + resources = ["secrets"] + } + ] + + cluster_addons = { + coredns = { + most_recent = true + } + kube-proxy = { + most_recent = true + } + vpc-cni = { + most_recent = true + } + aws-ebs-csi-driver = { + most_recent = true + } + aws-efs-csi-driver = { + most_recent = true + } + } + + # Karpenter requires at least one managed node group for system pods and Karpenter itself + eks_managed_node_groups = var.enable_karpenter ? { + # Minimal node group for Karpenter and system pods + karpenter = { + name = "${local.name}-karpenter" + + instance_types = ["m5.large"] + capacity_type = "ON_DEMAND" + + min_size = 2 + max_size = 3 + desired_size = 2 + + ami_type = "AL2_x86_64" + + labels = { + Environment = var.environment + NodeGroup = "karpenter" + "karpenter.sh/discovery" = local.name + } + + # Prevent Karpenter from managing these nodes + taints = [ + { + key = "CriticalAddonsOnly" + value = "true" + effect = "NO_SCHEDULE" + } + ] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair + health_check_grace_period = var.default_health_check_grace_period + health_check_type = var.default_health_check_type + + tags = merge(local.tags, { + "karpenter.sh/discovery" = local.name + }) + } + } : { + # Original node groups when Karpenter is disabled + default = { + name = "${local.name}-default" + + instance_types = var.default_instance_types + capacity_type = "ON_DEMAND" + + min_size = var.default_min_size + max_size = var.default_max_size + desired_size = var.default_desired_size + + ami_type = "AL2_x86_64" + + labels = { + Environment = var.environment + NodeGroup = "default" + } + + taints = [] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair + health_check_grace_period = var.default_health_check_grace_period + health_check_type = var.default_health_check_type + + tags = local.tags + } + + gpu = { + name = "${local.name}-gpu" + + instance_types = var.gpu_instance_types + capacity_type = "ON_DEMAND" + + min_size = var.gpu_min_size + max_size = var.gpu_max_size + desired_size = var.gpu_desired_size + + ami_type = "AL2_x86_64_GPU" + + labels = { + Environment = var.environment + NodeGroup = "gpu" + "nvidia.com/gpu" = "true" + } + + taints = [ + { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + ] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair - GPU nodes need longer grace period + health_check_grace_period = var.gpu_health_check_grace_period + health_check_type = var.gpu_health_check_type + + tags = local.tags + } + } + + node_security_group_additional_rules = { + ingress_self_all = { + description = "Node to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + self = true + } + + 
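+    # Allows the control plane (cluster security group) to reach nodes on any port;
+    # add-ons that serve admission webhooks, such as the AWS Load Balancer Controller
+    # installed by the addons module, rely on this connectivity.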
ingress_cluster_all = { + description = "Cluster to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + + egress_all = { + description = "Node all egress" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "egress" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + } + + node_security_group_tags = { + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} + +resource "aws_kms_key" "eks" { + description = "EKS Secret Encryption Key" + deletion_window_in_days = 7 + enable_key_rotation = true + + tags = local.tags +} + +resource "aws_kms_alias" "eks" { + name = "alias/eks-${local.name}" + target_key_id = aws_kms_key.eks.key_id +} + +module "fsx_lustre" { + source = "./modules/fsx-lustre" + + name = "${local.name}-lustre" + subnet_ids = [module.vpc.private_subnets[0]] + security_group_ids = [aws_security_group.fsx_lustre.id] + storage_capacity = var.fsx_storage_capacity + deployment_type = var.fsx_deployment_type + per_unit_storage_throughput = var.fsx_per_unit_storage_throughput + + s3_import_path = var.fsx_s3_import_path + s3_export_path = var.fsx_s3_export_path + + tags = local.tags +} + +resource "aws_security_group" "fsx_lustre" { + name = "${local.name}-fsx-lustre" + description = "Security group for FSx Lustre" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 988 + to_port = 988 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + } + + ingress { + from_port = 1021 + to_port = 1023 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.tags, { + Name = "${local.name}-fsx-lustre" + }) +} + +module "s3_mountpoint" { + source = "./modules/s3-mountpoint" + + cluster_name = module.eks.cluster_name + cluster_oidc_issuer_url = module.eks.cluster_oidc_issuer_url + + s3_bucket_name = var.s3_mountpoint_bucket_name + namespace = var.s3_mountpoint_namespace + + tags = local.tags +} + +module "addons" { + source = "./modules/addons" + + cluster_name = module.eks.cluster_name + cluster_oidc_issuer_url = module.eks.cluster_oidc_issuer_url + cluster_version = var.cluster_version + cluster_endpoint = module.eks.cluster_endpoint + vpc_id = module.vpc.vpc_id + + enable_karpenter = var.enable_karpenter + karpenter_chart_version = var.karpenter_chart_version + karpenter_default_capacity_types = var.karpenter_default_capacity_types + karpenter_default_instance_types = var.karpenter_default_instance_types + karpenter_gpu_capacity_types = var.karpenter_gpu_capacity_types + karpenter_gpu_instance_types = var.karpenter_gpu_instance_types + + enable_aws_load_balancer_controller = var.enable_aws_load_balancer_controller + enable_nvidia_device_plugin = var.enable_nvidia_device_plugin + enable_metrics_server = var.enable_metrics_server + enable_node_health_monitoring = var.enable_node_health_monitoring + enable_sns_alerts = var.enable_sns_alerts + alert_email = var.alert_email + + tags = local.tags +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf new file mode 100644 index 000000000..4994ed7ec --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf @@ -0,0 +1,590 @@ +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} + +# Karpenter +module "karpenter" { + count = var.enable_karpenter ? 
1 : 0 + source = "terraform-aws-modules/eks/aws//modules/karpenter" + version = "~> 19.21" + + cluster_name = var.cluster_name + + irsa_oidc_provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + irsa_namespace_service_accounts = ["karpenter:karpenter"] + + # Since Karpenter is running on EKS Managed Node Group, + # we need to ensure the access entry is not created for the Karpenter node IAM role + # Reference: https://github.com/aws/karpenter/issues/4002 + create_access_entry = false + + tags = var.tags +} + +resource "helm_release" "karpenter" { + count = var.enable_karpenter ? 1 : 0 + + namespace = "karpenter" + create_namespace = true + + name = "karpenter" + repository = "oci://public.ecr.aws/karpenter" + chart = "karpenter" + version = var.karpenter_chart_version + + values = [ + <<-EOT + settings: + clusterName: ${var.cluster_name} + clusterEndpoint: ${var.cluster_endpoint} + interruptionQueue: ${try(module.karpenter[0].queue_name, "")} + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: ${try(module.karpenter[0].iam_role_arn, "")} + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/master + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: karpenter.sh/provisioner-name + operator: DoesNotExist + EOT + ] + + depends_on = [module.karpenter] +} + +# Karpenter EC2NodeClass for default nodes +resource "kubectl_manifest" "karpenter_node_class_default" { + count = var.enable_karpenter ? 1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + metadata: + name: default + spec: + instanceStorePolicy: RAID0 + amiFamily: AL2 + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + instanceProfile: ${try(module.karpenter[0].node_instance_profile_name, "")} + userData: | + #!/bin/bash + /etc/eks/bootstrap.sh ${var.cluster_name} + # Install additional packages + yum update -y + yum install -y amazon-ssm-agent amazon-cloudwatch-agent + systemctl enable amazon-ssm-agent + systemctl start amazon-ssm-agent + systemctl enable amazon-cloudwatch-agent + systemctl start amazon-cloudwatch-agent + tags: + Name: "Karpenter-${var.cluster_name}-default" + Environment: ${var.tags.Environment} + NodeType: "default" + YAML + + depends_on = [helm_release.karpenter] +} + +# Karpenter EC2NodeClass for GPU nodes +resource "kubectl_manifest" "karpenter_node_class_gpu" { + count = var.enable_karpenter && var.enable_nvidia_device_plugin ? 
1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + metadata: + name: gpu + spec: + instanceStorePolicy: RAID0 + amiFamily: AL2 + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + instanceProfile: ${try(module.karpenter[0].node_instance_profile_name, "")} + userData: | + #!/bin/bash + /etc/eks/bootstrap.sh ${var.cluster_name} --container-runtime containerd + # Install NVIDIA drivers and container runtime + yum update -y + yum install -y nvidia-driver-latest-dkms nvidia-container-toolkit + yum install -y amazon-ssm-agent amazon-cloudwatch-agent + + # Configure containerd for GPU support + mkdir -p /etc/containerd + cat > /etc/containerd/config.toml << EOF + version = 2 + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true + EOF + + systemctl restart containerd + systemctl enable amazon-ssm-agent + systemctl start amazon-ssm-agent + systemctl enable amazon-cloudwatch-agent + systemctl start amazon-cloudwatch-agent + tags: + Name: "Karpenter-${var.cluster_name}-gpu" + Environment: ${var.tags.Environment} + NodeType: "gpu" + YAML + + depends_on = [helm_release.karpenter] +} + +# Karpenter NodePool for default workloads +resource "kubectl_manifest" "karpenter_node_pool_default" { + count = var.enable_karpenter ? 1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1beta1 + kind: NodePool + metadata: + name: default + spec: + template: + metadata: + labels: + provisioner: karpenter + node-type: default + spec: + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: default + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ${jsonencode(var.karpenter_default_capacity_types)} + - key: node.kubernetes.io/instance-type + operator: In + values: ${jsonencode(var.karpenter_default_instance_types)} + nodePolicy: + terminationGracePeriod: 30s + limits: + cpu: 1000 + memory: 1000Gi + disruption: + consolidationPolicy: WhenUnderutilized + consolidateAfter: 30s + expireAfter: 30m + YAML + + depends_on = [kubectl_manifest.karpenter_node_class_default] +} + +# Karpenter NodePool for GPU workloads +resource "kubectl_manifest" "karpenter_node_pool_gpu" { + count = var.enable_karpenter && var.enable_nvidia_device_plugin ? 
1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1beta1 + kind: NodePool + metadata: + name: gpu + spec: + template: + metadata: + labels: + provisioner: karpenter + node-type: gpu + nvidia.com/gpu: "true" + spec: + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: gpu + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ${jsonencode(var.karpenter_gpu_capacity_types)} + - key: node.kubernetes.io/instance-type + operator: In + values: ${jsonencode(var.karpenter_gpu_instance_types)} + - key: karpenter.k8s.aws/instance-gpu-count + operator: Gt + values: ["0"] + taints: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + nodePolicy: + terminationGracePeriod: 60s + limits: + cpu: 1000 + memory: 1000Gi + nvidia.com/gpu: 100 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 60m + YAML + + depends_on = [kubectl_manifest.karpenter_node_class_gpu] +} + +# AWS Load Balancer Controller +module "load_balancer_controller_irsa_role" { + count = var.enable_aws_load_balancer_controller ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-load-balancer-controller" + attach_load_balancer_controller_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:aws-load-balancer-controller"] + } + } + + tags = var.tags +} + +resource "helm_release" "aws_load_balancer_controller" { + count = var.enable_aws_load_balancer_controller ? 1 : 0 + + name = "aws-load-balancer-controller" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + namespace = "kube-system" + version = "1.6.2" + + set { + name = "clusterName" + value = var.cluster_name + } + + set { + name = "serviceAccount.create" + value = "true" + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller" + } + + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = try(module.load_balancer_controller_irsa_role[0].iam_role_arn, "") + } + + set { + name = "region" + value = data.aws_region.current.name + } + + set { + name = "vpcId" + value = var.vpc_id + } + + depends_on = [module.load_balancer_controller_irsa_role] +} + +# NVIDIA Device Plugin +resource "helm_release" "nvidia_device_plugin" { + count = var.enable_nvidia_device_plugin ? 
1 : 0 + + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + namespace = "kube-system" + version = "0.14.1" + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key" + value = "nvidia.com/gpu" + } + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator" + value = "In" + } + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]" + value = "true" + } + + set { + name = "tolerations[0].key" + value = "nvidia.com/gpu" + } + + set { + name = "tolerations[0].operator" + value = "Equal" + } + + set { + name = "tolerations[0].value" + value = "true" + } + + set { + name = "tolerations[0].effect" + value = "NoSchedule" + } +} + +# Metrics Server +resource "helm_release" "metrics_server" { + count = var.enable_metrics_server ? 1 : 0 + + name = "metrics-server" + repository = "https://kubernetes-sigs.github.io/metrics-server/" + chart = "metrics-server" + namespace = "kube-system" + version = "3.11.0" + + set { + name = "args[0]" + value = "--cert-dir=/tmp" + } + + set { + name = "args[1]" + value = "--secure-port=4443" + } + + set { + name = "args[2]" + value = "--kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname" + } + + set { + name = "args[3]" + value = "--kubelet-use-node-status-port" + } + + set { + name = "args[4]" + value = "--metric-resolution=15s" + } +} + +# EBS CSI Driver +module "ebs_csi_irsa_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-ebs-csi-driver" + attach_ebs_csi_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + + tags = var.tags +} + +# EFS CSI Driver +module "efs_csi_irsa_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-efs-csi-driver" + attach_efs_csi_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:efs-csi-controller-sa"] + } + } + + tags = var.tags +} + +# Node Termination Handler +resource "helm_release" "aws_node_termination_handler" { + count = var.enable_node_termination_handler ? 1 : 0 + + name = "aws-node-termination-handler" + repository = "https://aws.github.io/eks-charts" + chart = "aws-node-termination-handler" + namespace = "kube-system" + version = "0.21.0" + + set { + name = "enableSpotInterruptionDraining" + value = "true" + } + + set { + name = "enableRebalanceMonitoring" + value = "true" + } + + set { + name = "enableScheduledEventDraining" + value = "true" + } + + set { + name = "enableRebalanceDraining" + value = "true" + } + + set { + name = "nodeSelector.karpenter\\.sh/provisioner-name" + value = "" + } +} + +# CloudWatch Dashboard for Node Health Monitoring +resource "aws_cloudwatch_dashboard" "node_health" { + count = var.enable_node_health_monitoring ? 
1 : 0 + dashboard_name = "${var.cluster_name}-node-health" + + dashboard_body = jsonencode({ + widgets = [ + { + type = "metric" + x = 0 + y = 0 + width = 12 + height = 6 + + properties = { + metrics = [ + ["AWS/EKS", "cluster_node_count", "ClusterName", var.cluster_name], + ["AWS/EKS", "cluster_failed_node_count", "ClusterName", var.cluster_name] + ] + view = "timeSeries" + stacked = false + region = data.aws_region.current.name + title = "EKS Node Count" + period = 300 + } + }, + { + type = "metric" + x = 0 + y = 6 + width = 12 + height = 6 + + properties = { + metrics = [ + ["AWS/EC2", "StatusCheckFailed", { "stat" : "Sum" }], + ["AWS/EC2", "StatusCheckFailed_Instance", { "stat" : "Sum" }], + ["AWS/EC2", "StatusCheckFailed_System", { "stat" : "Sum" }] + ] + view = "timeSeries" + stacked = false + region = data.aws_region.current.name + title = "EC2 Status Check Failures" + period = 300 + } + } + ] + }) + + tags = var.tags +} + +# CloudWatch Alarms for Node Health +resource "aws_cloudwatch_metric_alarm" "node_health_check_failed" { + count = var.enable_node_health_monitoring ? 1 : 0 + + alarm_name = "${var.cluster_name}-node-health-check-failed" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "StatusCheckFailed" + namespace = "AWS/EC2" + period = "300" + statistic = "Maximum" + threshold = "0" + alarm_description = "This metric monitors EC2 instance status check failures for EKS nodes" + alarm_actions = var.enable_sns_alerts ? [aws_sns_topic.node_health_alerts[0].arn] : [] + + dimensions = { + AutoScalingGroupName = "*${var.cluster_name}*" + } + + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "gpu_node_health_check_failed" { + count = var.enable_node_health_monitoring && var.enable_nvidia_device_plugin ? 1 : 0 + + alarm_name = "${var.cluster_name}-gpu-node-health-check-failed" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "StatusCheckFailed" + namespace = "AWS/EC2" + period = "600" # Longer period for GPU nodes + statistic = "Maximum" + threshold = "0" + alarm_description = "This metric monitors EC2 instance status check failures for EKS GPU nodes" + alarm_actions = var.enable_sns_alerts ? [aws_sns_topic.node_health_alerts[0].arn] : [] + + dimensions = { + AutoScalingGroupName = "*${var.cluster_name}*gpu*" + } + + tags = var.tags +} + +# SNS Topic for Node Health Alerts +resource "aws_sns_topic" "node_health_alerts" { + count = var.enable_sns_alerts ? 1 : 0 + name = "${var.cluster_name}-node-health-alerts" + + tags = var.tags +} + +resource "aws_sns_topic_subscription" "node_health_email" { + count = var.enable_sns_alerts && var.alert_email != "" ? 1 : 0 + topic_arn = aws_sns_topic.node_health_alerts[0].arn + protocol = "email" + endpoint = var.alert_email +} + +# Custom CloudWatch Log Group for Node Auto-Repair Events +resource "aws_cloudwatch_log_group" "node_auto_repair" { + count = var.enable_node_health_monitoring ? 
1 : 0 + name = "/aws/eks/${var.cluster_name}/node-auto-repair" + retention_in_days = 30 + + tags = var.tags +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf new file mode 100644 index 000000000..27dfe079f --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf @@ -0,0 +1,39 @@ +output "karpenter_role_arn" { + description = "ARN of the Karpenter IAM role" + value = try(module.karpenter[0].iam_role_arn, "") +} + +output "karpenter_instance_profile_name" { + description = "Name of the Karpenter node instance profile" + value = try(module.karpenter[0].node_instance_profile_name, "") +} + +output "karpenter_queue_name" { + description = "Name of the Karpenter SQS queue" + value = try(module.karpenter[0].queue_name, "") +} + +output "load_balancer_controller_role_arn" { + description = "ARN of the load balancer controller IAM role" + value = try(module.load_balancer_controller_irsa_role[0].iam_role_arn, "") +} + +output "ebs_csi_driver_role_arn" { + description = "ARN of the EBS CSI driver IAM role" + value = module.ebs_csi_irsa_role.iam_role_arn +} + +output "efs_csi_driver_role_arn" { + description = "ARN of the EFS CSI driver IAM role" + value = module.efs_csi_irsa_role.iam_role_arn +} + +output "node_health_dashboard_url" { + description = "URL of the CloudWatch dashboard for node health monitoring" + value = var.enable_node_health_monitoring ? "https://${data.aws_region.current.name}.console.aws.amazon.com/cloudwatch/home?region=${data.aws_region.current.name}#dashboards:name=${aws_cloudwatch_dashboard.node_health[0].dashboard_name}" : "" +} + +output "node_health_sns_topic_arn" { + description = "ARN of the SNS topic for node health alerts" + value = var.enable_sns_alerts ? 
aws_sns_topic.node_health_alerts[0].arn : "" +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf new file mode 100644 index 000000000..d54b2d920 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf @@ -0,0 +1,109 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + type = string +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string +} + +variable "vpc_id" { + description = "ID of the VPC where the cluster is deployed" + type = string + default = "" +} + +variable "enable_karpenter" { + description = "Enable Karpenter for node provisioning" + type = bool + default = true +} + +variable "karpenter_chart_version" { + description = "Version of the Karpenter Helm chart" + type = string + default = "v0.32.1" +} + +variable "cluster_endpoint" { + description = "EKS cluster endpoint" + type = string +} + +variable "karpenter_default_capacity_types" { + description = "Capacity types for Karpenter default node pool" + type = list(string) + default = ["spot", "on-demand"] +} + +variable "karpenter_default_instance_types" { + description = "Instance types for Karpenter default node pool" + type = list(string) + default = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge"] +} + +variable "karpenter_gpu_capacity_types" { + description = "Capacity types for Karpenter GPU node pool" + type = list(string) + default = ["on-demand"] +} + +variable "karpenter_gpu_instance_types" { + description = "Instance types for Karpenter GPU node pool" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge"] +} + +variable "enable_aws_load_balancer_controller" { + description = "Enable AWS Load Balancer Controller" + type = bool + default = true +} + +variable "enable_nvidia_device_plugin" { + description = "Enable NVIDIA device plugin for GPU support" + type = bool + default = true +} + +variable "enable_metrics_server" { + description = "Enable metrics server" + type = bool + default = true +} + +variable "enable_node_termination_handler" { + description = "Enable AWS Node Termination Handler" + type = bool + default = true +} + +variable "enable_node_health_monitoring" { + description = "Enable CloudWatch monitoring for node health" + type = bool + default = true +} + +variable "enable_sns_alerts" { + description = "Enable SNS alerts for node health issues" + type = bool + default = false +} + +variable "alert_email" { + description = "Email address for node health alerts" + type = string + default = "" +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf new file mode 100644 index 000000000..fca6bb0f1 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf @@ -0,0 +1,149 @@ +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.21" + + cluster_name = var.cluster_name + cluster_version = var.cluster_version + + vpc_id = var.vpc_id + subnet_ids = var.subnet_ids + 
+  control_plane_subnet_ids = var.control_plane_subnet_ids
+
+  cluster_endpoint_private_access      = var.cluster_endpoint_private_access
+  cluster_endpoint_public_access       = var.cluster_endpoint_public_access
+  cluster_endpoint_public_access_cidrs = var.cluster_endpoint_public_access_cidrs
+
+  cluster_encryption_config = var.cluster_encryption_config
+
+  cluster_addons = var.cluster_addons
+
+  eks_managed_node_groups = var.eks_managed_node_groups
+
+  node_security_group_additional_rules = var.node_security_group_additional_rules
+
+  manage_aws_auth_configmap = true
+
+  aws_auth_roles = [
+    {
+      rolearn  = aws_iam_role.eks_managed_node_group_role.arn
+      username = "system:node:{{EC2PrivateDNSName}}"
+      groups   = ["system:bootstrappers", "system:nodes"]
+    },
+  ]
+
+  tags = var.tags
+}
+
+resource "aws_iam_role" "eks_managed_node_group_role" {
+  name = "${var.cluster_name}-node-group-role"
+
+  assume_role_policy = jsonencode({
+    Statement = [{
+      Action = "sts:AssumeRole"
+      Effect = "Allow"
+      Principal = {
+        Service = "ec2.amazonaws.com"
+      }
+    }]
+    Version = "2012-10-17"
+  })
+
+  tags = var.tags
+}
+
+resource "aws_iam_role_policy_attachment" "eks_managed_node_group_role_policy" {
+  for_each = toset([
+    "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
+    "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
+    "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
+    "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
+    "arn:aws:iam::aws:policy/AmazonFSxClientFullAccess"
+  ])
+
+  policy_arn = each.value
+  role       = aws_iam_role.eks_managed_node_group_role.name
+}
+
+resource "aws_iam_instance_profile" "eks_managed_node_group_instance_profile" {
+  name = "${var.cluster_name}-node-group-instance-profile"
+  role = aws_iam_role.eks_managed_node_group_role.name
+
+  tags = var.tags
+}
+
+data "aws_ssm_parameter" "eks_ami_release_version" {
+  for_each = var.eks_managed_node_groups
+
+  name = "/aws/service/eks/optimized-ami/${var.cluster_version}/amazon-linux-2${each.value.ami_type == "AL2_x86_64_GPU" ? "-gpu" : ""}/recommended/release_version"
+}
+
+resource "aws_launch_template" "eks_managed_node_group" {
+  for_each = var.eks_managed_node_groups
+
+  name_prefix   = "${var.cluster_name}-${each.key}-"
+  image_id      = data.aws_ami.eks_default[each.key].id
+  instance_type = each.value.instance_types[0]
+
+  vpc_security_group_ids = [module.eks.node_security_group_id]
+
+  user_data = base64encode(templatefile("${path.module}/user_data.sh", {
+    cluster_name        = var.cluster_name
+    endpoint            = module.eks.cluster_endpoint
+    ca_certificate      = module.eks.cluster_certificate_authority_data
+    bootstrap_arguments = each.value.ami_type == "AL2_x86_64_GPU" ?
"--container-runtime containerd --use-max-pods false --b64-cluster-ca ${module.eks.cluster_certificate_authority_data} --apiserver-endpoint ${module.eks.cluster_endpoint}" : "" + })) + + block_device_mappings { + device_name = "/dev/xvda" + ebs { + volume_size = 50 + volume_type = "gp3" + iops = 3000 + throughput = 125 + encrypted = true + delete_on_termination = true + } + } + + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 2 + instance_metadata_tags = "enabled" + } + + tag_specifications { + resource_type = "instance" + tags = merge(var.tags, { + Name = "${var.cluster_name}-${each.key}" + }) + } + + tags = var.tags + + lifecycle { + create_before_destroy = true + } +} + +data "aws_ami" "eks_default" { + for_each = var.eks_managed_node_groups + + most_recent = true + owners = ["602401143452"] + + filter { + name = "name" + values = ["amazon-eks-node-${var.cluster_version}-v*"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf new file mode 100644 index 000000000..6515588ea --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf @@ -0,0 +1,49 @@ +output "cluster_arn" { + description = "The Amazon Resource Name (ARN) of the cluster" + value = module.eks.cluster_arn +} + +output "cluster_certificate_authority_data" { + description = "Base64 encoded certificate data required to communicate with the cluster" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_endpoint" { + description = "Endpoint for your Kubernetes API server" + value = module.eks.cluster_endpoint +} + +output "cluster_id" { + description = "The name/id of the EKS cluster" + value = module.eks.cluster_id +} + +output "cluster_name" { + description = "The name of the EKS cluster" + value = module.eks.cluster_name +} + +output "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + value = module.eks.cluster_oidc_issuer_url +} + +output "cluster_version" { + description = "The Kubernetes version for the EKS cluster" + value = module.eks.cluster_version +} + +output "cluster_security_group_id" { + description = "Security group ID attached to the EKS cluster" + value = module.eks.cluster_security_group_id +} + +output "node_security_group_id" { + description = "ID of the node shared security group" + value = module.eks.node_security_group_id +} + +output "eks_managed_node_groups" { + description = "Map of attribute maps for all EKS managed node groups created" + value = module.eks.eks_managed_node_groups +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh b/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh new file mode 100644 index 000000000..de51ed616 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -o xtrace + +# Install SSM agent +yum install -y amazon-ssm-agent +systemctl enable amazon-ssm-agent +systemctl start amazon-ssm-agent + +# Configure kubelet +echo "net.bridge.bridge-nf-call-ip6tables = 1" >> /etc/sysctl.conf +echo "net.bridge.bridge-nf-call-iptables = 1" >> /etc/sysctl.conf +echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf +sysctl -p + +# Bootstrap the node 
+/etc/eks/bootstrap.sh ${cluster_name} ${bootstrap_arguments} + +# Install additional packages for GPU nodes +if [[ "${bootstrap_arguments}" == *"gpu"* ]]; then + # Install NVIDIA drivers and container runtime + yum install -y nvidia-driver-latest-dkms + yum install -y nvidia-container-toolkit + + # Configure containerd for GPU support + mkdir -p /etc/containerd + cat > /etc/containerd/config.toml << EOF +version = 2 +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true +EOF + + systemctl restart containerd +fi + +# Install FSx Lustre client +amazon-linux-extras install -y lustre2.10 + +# Configure CloudWatch agent +yum install -y amazon-cloudwatch-agent +cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << EOF +{ + "agent": { + "metrics_collection_interval": 60, + "run_as_user": "cwagent" + }, + "metrics": { + "namespace": "EKS/Node", + "metrics_collected": { + "cpu": { + "measurement": [ + "cpu_usage_idle", + "cpu_usage_iowait", + "cpu_usage_user", + "cpu_usage_system" + ], + "metrics_collection_interval": 60 + }, + "disk": { + "measurement": [ + "used_percent" + ], + "metrics_collection_interval": 60, + "resources": [ + "*" + ] + }, + "diskio": { + "measurement": [ + "io_time" + ], + "metrics_collection_interval": 60, + "resources": [ + "*" + ] + }, + "mem": { + "measurement": [ + "mem_used_percent" + ], + "metrics_collection_interval": 60 + }, + "netstat": { + "measurement": [ + "tcp_established", + "tcp_time_wait" + ], + "metrics_collection_interval": 60 + }, + "swap": { + "measurement": [ + "swap_used_percent" + ], + "metrics_collection_interval": 60 + } + } + } +} +EOF + +systemctl enable amazon-cloudwatch-agent +systemctl start amazon-cloudwatch-agent \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf new file mode 100644 index 000000000..9d79e3361 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf @@ -0,0 +1,98 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string +} + +variable "vpc_id" { + description = "ID of the VPC where to create the cluster" + type = string +} + +variable "subnet_ids" { + description = "List of subnet IDs where the EKS cluster will be deployed" + type = list(string) +} + +variable "control_plane_subnet_ids" { + description = "List of subnet IDs where the EKS cluster control plane will be deployed" + type = list(string) +} + +variable "cluster_endpoint_private_access" { + description = "Indicates whether or not the Amazon EKS private API server endpoint is enabled" + type = bool + default = false +} + +variable "cluster_endpoint_public_access" { + description = "Indicates whether or not the Amazon EKS public API server endpoint is enabled" + type = bool + default = true +} + +variable "cluster_endpoint_public_access_cidrs" { + description = "List of CIDR blocks which can access the Amazon EKS public API server endpoint" + type = list(string) + 
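+  # 0.0.0.0/0 leaves the public API endpoint reachable from anywhere; restrict
+  # this to trusted CIDR ranges for anything beyond experimentation.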
default = ["0.0.0.0/0"] +} + +variable "cluster_encryption_config" { + description = "Configuration block with encryption configuration for the cluster" + type = list(object({ + provider_key_arn = string + resources = list(string) + })) + default = [] +} + +variable "cluster_addons" { + description = "Map of cluster addon configurations to enable for the cluster" + type = map(object({ + most_recent = optional(bool) + version = optional(string) + })) + default = {} +} + +variable "eks_managed_node_groups" { + description = "Map of EKS managed node group definitions to create" + type = map(object({ + name = string + instance_types = list(string) + capacity_type = string + min_size = number + max_size = number + desired_size = number + ami_type = string + labels = map(string) + taints = list(object({ + key = string + value = string + effect = string + })) + update_config = object({ + max_unavailable_percentage = number + }) + health_check_grace_period = optional(number) + health_check_type = optional(string) + tags = map(string) + })) + default = {} +} + +variable "node_security_group_additional_rules" { + description = "List of additional security group rules to add to the node security group" + type = any + default = {} +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf new file mode 100644 index 000000000..ca5cfcb94 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf @@ -0,0 +1,114 @@ +resource "aws_fsx_lustre_file_system" "main" { + storage_capacity = var.storage_capacity + subnet_ids = var.subnet_ids + deployment_type = var.deployment_type + per_unit_storage_throughput = var.per_unit_storage_throughput + security_group_ids = var.security_group_ids + + dynamic "log_configuration" { + for_each = var.log_configuration != null ? [var.log_configuration] : [] + content { + destination = log_configuration.value.destination + level = log_configuration.value.level + } + } + + import_path = var.s3_import_path + export_path = var.s3_export_path + + # Auto import and export configuration + auto_import_policy = var.auto_import_policy + + # Data compression + data_compression_type = var.data_compression_type + + # Copy tags to snapshots + copy_tags_to_backups = var.copy_tags_to_backups + + # Weekly maintenance window + weekly_maintenance_start_time = var.weekly_maintenance_start_time + + # Backup configuration for PERSISTENT deployments + dynamic "backup_configuration" { + for_each = var.deployment_type == "PERSISTENT_1" || var.deployment_type == "PERSISTENT_2" ? 
[1] : [] + content { + automatic_backup_retention_days = var.automatic_backup_retention_days + daily_automatic_backup_start_time = var.daily_automatic_backup_start_time + } + } + + tags = merge(var.tags, { + Name = var.name + }) +} + +# Create CSI driver for FSx Lustre +resource "kubernetes_storage_class" "fsx_lustre" { + metadata { + name = "fsx-lustre-sc" + } + + storage_provisioner = "fsx.csi.aws.com" + + parameters = { + subPath = "/" + dnsName = aws_fsx_lustre_file_system.main.dns_name + mountName = aws_fsx_lustre_file_system.main.mount_name + } + + mount_options = [ + "flock" + ] +} + +# Create a persistent volume for FSx Lustre +resource "kubernetes_persistent_volume" "fsx_lustre" { + metadata { + name = "fsx-lustre-pv" + } + + spec { + capacity = { + storage = "${var.storage_capacity}Gi" + } + + access_modes = ["ReadWriteMany"] + + persistent_volume_source { + csi { + driver = "fsx.csi.aws.com" + volume_handle = aws_fsx_lustre_file_system.main.id + + volume_attributes = { + dnsName = aws_fsx_lustre_file_system.main.dns_name + mountName = aws_fsx_lustre_file_system.main.mount_name + } + } + } + + storage_class_name = kubernetes_storage_class.fsx_lustre.metadata[0].name + } +} + +# Example PVC for FSx Lustre +resource "kubernetes_persistent_volume_claim" "fsx_lustre_example" { + count = var.create_example_pvc ? 1 : 0 + + metadata { + name = "fsx-lustre-pvc" + namespace = var.example_namespace + } + + spec { + access_modes = ["ReadWriteMany"] + + resources { + requests = { + storage = "100Gi" + } + } + + storage_class_name = kubernetes_storage_class.fsx_lustre.metadata[0].name + volume_name = kubernetes_persistent_volume.fsx_lustre.metadata[0].name + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf new file mode 100644 index 000000000..3544f7d71 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf @@ -0,0 +1,34 @@ +output "file_system_id" { + description = "Identifier of the file system" + value = aws_fsx_lustre_file_system.main.id +} + +output "file_system_arn" { + description = "Amazon Resource Name of the file system" + value = aws_fsx_lustre_file_system.main.arn +} + +output "dns_name" { + description = "DNS name for the file system" + value = aws_fsx_lustre_file_system.main.dns_name +} + +output "mount_name" { + description = "The value to be used when mounting the filesystem" + value = aws_fsx_lustre_file_system.main.mount_name +} + +output "network_interface_ids" { + description = "Set of Elastic Network Interface identifiers from which the file system is accessible" + value = aws_fsx_lustre_file_system.main.network_interface_ids +} + +output "storage_class_name" { + description = "Name of the Kubernetes storage class" + value = kubernetes_storage_class.fsx_lustre.metadata[0].name +} + +output "persistent_volume_name" { + description = "Name of the Kubernetes persistent volume" + value = kubernetes_persistent_volume.fsx_lustre.metadata[0].name +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf new file mode 100644 index 000000000..396ca89c8 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf @@ -0,0 +1,122 @@ +variable "name" { + description = "Name of the FSx Lustre file system" + type = string +} + +variable "storage_capacity" { + 
description = "Storage capacity (GiB) of the file system" + type = number + validation { + condition = var.storage_capacity >= 1200 && var.storage_capacity % 1200 == 0 + error_message = "Storage capacity must be at least 1200 GiB and in increments of 1200 GiB." + } +} + +variable "subnet_ids" { + description = "List of subnet IDs for the file system" + type = list(string) +} + +variable "security_group_ids" { + description = "List of security group IDs for the file system" + type = list(string) +} + +variable "deployment_type" { + description = "Deployment type for the file system" + type = string + default = "SCRATCH_2" + validation { + condition = contains(["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1", "PERSISTENT_2"], var.deployment_type) + error_message = "Valid values for deployment_type are SCRATCH_1, SCRATCH_2, PERSISTENT_1, or PERSISTENT_2." + } +} + +variable "per_unit_storage_throughput" { + description = "Per unit storage throughput (MB/s/TiB)" + type = number + default = null +} + +variable "s3_import_path" { + description = "S3 URI for importing data" + type = string + default = null +} + +variable "s3_export_path" { + description = "S3 URI for exporting data" + type = string + default = null +} + +variable "auto_import_policy" { + description = "How Amazon FSx keeps your file and directory listings up to date" + type = string + default = "NEW_CHANGED" + validation { + condition = contains(["NONE", "NEW", "NEW_CHANGED", "NEW_CHANGED_DELETED"], var.auto_import_policy) + error_message = "Valid values are NONE, NEW, NEW_CHANGED, or NEW_CHANGED_DELETED." + } +} + +variable "data_compression_type" { + description = "Sets the data compression configuration for the file system" + type = string + default = "NONE" + validation { + condition = contains(["NONE", "LZ4"], var.data_compression_type) + error_message = "Valid values are NONE or LZ4." 
+ } +} + +variable "copy_tags_to_backups" { + description = "A boolean flag indicating whether tags for the file system should be copied to backups" + type = bool + default = false +} + +variable "weekly_maintenance_start_time" { + description = "The preferred start time (in d:HH:MM format) to perform weekly maintenance" + type = string + default = "1:02:00" +} + +variable "automatic_backup_retention_days" { + description = "The number of days to retain automatic backups" + type = number + default = 7 +} + +variable "daily_automatic_backup_start_time" { + description = "The preferred time (in HH:MM format) to take daily automatic backups" + type = string + default = "02:00" +} + +variable "log_configuration" { + description = "The Lustre logging configuration" + type = object({ + destination = string + level = string + }) + default = null +} + +variable "create_example_pvc" { + description = "Whether to create an example PVC" + type = bool + default = false +} + +variable "example_namespace" { + description = "Namespace for example resources" + type = string + default = "default" +} + +variable "tags" { + description = "A map of tags to assign to the resource" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf new file mode 100644 index 000000000..3295e263d --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf @@ -0,0 +1,193 @@ +data "aws_iam_policy_document" "s3_mountpoint_assume_role_policy" { + statement { + actions = ["sts:AssumeRoleWithWebIdentity"] + effect = "Allow" + + condition { + test = "StringEquals" + variable = "${replace(var.cluster_oidc_issuer_url, "https://", "")}:sub" + values = ["system:serviceaccount:${var.namespace}:mountpoint-s3-csi-driver"] + } + + condition { + test = "StringEquals" + variable = "${replace(var.cluster_oidc_issuer_url, "https://", "")}:aud" + values = ["sts.amazonaws.com"] + } + + principals { + identifiers = [var.oidc_provider_arn] + type = "Federated" + } + } +} + +resource "aws_iam_role" "s3_mountpoint" { + assume_role_policy = data.aws_iam_policy_document.s3_mountpoint_assume_role_policy.json + name = "${var.cluster_name}-s3-mountpoint-csi-driver" + tags = var.tags +} + +data "aws_iam_policy_document" "s3_mountpoint" { + statement { + effect = "Allow" + actions = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetObjectVersion", + "s3:DeleteObjectVersion", + "s3:ListBucketVersions" + ] + resources = [ + "arn:aws:s3:::${var.s3_bucket_name}", + "arn:aws:s3:::${var.s3_bucket_name}/*" + ] + } +} + +resource "aws_iam_policy" "s3_mountpoint" { + description = "S3 Mountpoint CSI Driver Policy" + name = "${var.cluster_name}-s3-mountpoint-csi-driver" + policy = data.aws_iam_policy_document.s3_mountpoint.json + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "s3_mountpoint" { + policy_arn = aws_iam_policy.s3_mountpoint.arn + role = aws_iam_role.s3_mountpoint.name +} + +# Create the service account +resource "kubernetes_service_account" "s3_mountpoint" { + metadata { + name = "mountpoint-s3-csi-driver" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = aws_iam_role.s3_mountpoint.arn + } + } +} + +# Deploy the Mountpoint for S3 CSI driver +resource "helm_release" "mountpoint_s3_csi_driver" { + name = "aws-mountpoint-s3-csi-driver" + repository = 
"https://awslabs.github.io/mountpoint-s3-csi-driver" + chart = "aws-mountpoint-s3-csi-driver" + namespace = var.namespace + version = var.csi_driver_version + + set { + name = "node.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = aws_iam_role.s3_mountpoint.arn + } + + set { + name = "controller.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = aws_iam_role.s3_mountpoint.arn + } + + set { + name = "controller.replicaCount" + value = "2" + } + + set { + name = "node.tolerateAllTaints" + value = "true" + } + + depends_on = [kubernetes_service_account.s3_mountpoint] +} + +# Create a storage class for S3 Mountpoint +resource "kubernetes_storage_class" "s3_mountpoint" { + metadata { + name = "s3-mountpoint-sc" + } + + storage_provisioner = "s3.csi.aws.com" + + parameters = { + bucketName = var.s3_bucket_name + region = var.region + } + + volume_binding_mode = "Immediate" +} + +# Example PVC for S3 Mountpoint +resource "kubernetes_persistent_volume_claim" "s3_mountpoint_example" { + count = var.create_example_pvc ? 1 : 0 + + metadata { + name = "s3-mountpoint-pvc" + namespace = var.example_namespace + } + + spec { + access_modes = ["ReadWriteMany"] + + resources { + requests = { + storage = "1000Gi" + } + } + + storage_class_name = kubernetes_storage_class.s3_mountpoint.metadata[0].name + } +} + +# Example deployment using S3 Mountpoint +resource "kubernetes_deployment" "s3_mountpoint_example" { + count = var.create_example_deployment ? 1 : 0 + + metadata { + name = "s3-mountpoint-example" + namespace = var.example_namespace + labels = { + app = "s3-mountpoint-example" + } + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "s3-mountpoint-example" + } + } + + template { + metadata { + labels = { + app = "s3-mountpoint-example" + } + } + + spec { + container { + image = "busybox:latest" + name = "busybox" + + command = ["/bin/sh"] + args = ["-c", "while true; do echo $(date) >> /mnt/s3/test.txt; sleep 30; done"] + + volume_mount { + mount_path = "/mnt/s3" + name = "s3-volume" + } + } + + volume { + name = "s3-volume" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.s3_mountpoint_example[0].metadata[0].name + } + } + } + } + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf new file mode 100644 index 000000000..e75f2e743 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf @@ -0,0 +1,21 @@ +output "role_arn" { + description = "ARN of the IAM role for S3 Mountpoint CSI driver" + value = aws_iam_role.s3_mountpoint.arn +} + +output "service_account_arn" { + description = "ARN of the Kubernetes service account" + value = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${aws_iam_role.s3_mountpoint.name}" +} + +output "service_account_name" { + description = "Name of the Kubernetes service account" + value = kubernetes_service_account.s3_mountpoint.metadata[0].name +} + +output "storage_class_name" { + description = "Name of the S3 Mountpoint storage class" + value = kubernetes_storage_class.s3_mountpoint.metadata[0].name +} + +data "aws_caller_identity" "current" {} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf new file mode 100644 index 000000000..2ed7a235e --- /dev/null +++ 
b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf @@ -0,0 +1,61 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + type = string +} + +variable "oidc_provider_arn" { + description = "The ARN of the OIDC Provider for the EKS cluster" + type = string +} + +variable "s3_bucket_name" { + description = "Name of the S3 bucket to mount" + type = string +} + +variable "namespace" { + description = "Kubernetes namespace for the S3 Mountpoint CSI driver" + type = string + default = "kube-system" +} + +variable "region" { + description = "AWS region" + type = string + default = "us-west-2" +} + +variable "csi_driver_version" { + description = "Version of the Mountpoint S3 CSI driver" + type = string + default = "1.4.0" +} + +variable "create_example_pvc" { + description = "Whether to create an example PVC" + type = bool + default = false +} + +variable "create_example_deployment" { + description = "Whether to create an example deployment" + type = bool + default = false +} + +variable "example_namespace" { + description = "Namespace for example resources" + type = string + default = "default" +} + +variable "tags" { + description = "A map of tags to assign to the resource" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf new file mode 100644 index 000000000..752471940 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf @@ -0,0 +1,147 @@ +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = var.name + cidr = var.cidr + + azs = var.azs + private_subnets = var.private_subnets + public_subnets = var.public_subnets + + enable_nat_gateway = var.enable_nat_gateway + enable_vpn_gateway = var.enable_vpn_gateway + enable_dns_hostnames = var.enable_dns_hostnames + enable_dns_support = var.enable_dns_support + + # Single NAT Gateway for cost optimization (can be changed to one_nat_gateway_per_az = true for HA) + single_nat_gateway = var.single_nat_gateway + one_nat_gateway_per_az = var.one_nat_gateway_per_az + + # VPC Flow Logs + enable_flow_log = var.enable_flow_log + create_flow_log_cloudwatch_iam_role = var.create_flow_log_cloudwatch_iam_role + create_flow_log_cloudwatch_log_group = var.create_flow_log_cloudwatch_log_group + + # Public subnet tags for ELB + public_subnet_tags = merge(var.public_subnet_tags, { + "kubernetes.io/role/elb" = "1" + }) + + # Private subnet tags for internal ELB + private_subnet_tags = merge(var.private_subnet_tags, { + "kubernetes.io/role/internal-elb" = "1" + }) + + tags = var.tags +} + +# VPC Endpoints for cost optimization and security +resource "aws_vpc_endpoint" "s3" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.s3" + + vpc_endpoint_type = "Gateway" + route_table_ids = concat(module.vpc.private_route_table_ids, module.vpc.public_route_table_ids) + + tags = merge(var.tags, { + Name = "${var.name}-s3-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_dkr" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.dkr" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + 
tags = merge(var.tags, { + Name = "${var.name}-ecr-dkr-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_api" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.api" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-ecr-api-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ec2" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ec2" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-ec2-endpoint" + }) +} + +resource "aws_vpc_endpoint" "logs" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.logs" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-logs-endpoint" + }) +} + +resource "aws_vpc_endpoint" "sts" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.sts" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-sts-endpoint" + }) +} + +# Security group for VPC endpoints +resource "aws_security_group" "vpc_endpoints" { + name = "${var.name}-vpc-endpoints" + description = "Security group for VPC endpoints" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = [var.cidr] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(var.tags, { + Name = "${var.name}-vpc-endpoints" + }) +} + +data "aws_region" "current" {} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf new file mode 100644 index 000000000..c65b90dc3 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf @@ -0,0 +1,69 @@ +output "vpc_id" { + description = "ID of the VPC" + value = module.vpc.vpc_id +} + +output "vpc_arn" { + description = "The ARN of the VPC" + value = module.vpc.vpc_arn +} + +output "vpc_cidr_block" { + description = "The CIDR block of the VPC" + value = module.vpc.vpc_cidr_block +} + +output "private_subnets" { + description = "List of IDs of private subnets" + value = module.vpc.private_subnets +} + +output "public_subnets" { + description = "List of IDs of public subnets" + value = module.vpc.public_subnets +} + +output "private_subnet_arns" { + description = "List of ARNs of private subnets" + value = module.vpc.private_subnet_arns +} + +output "public_subnet_arns" { + description = "List of ARNs of public subnets" + value = module.vpc.public_subnet_arns +} + +output "private_subnets_cidr_blocks" { + description = "List of cidr_blocks of private subnets" + value = module.vpc.private_subnets_cidr_blocks +} + +output "public_subnets_cidr_blocks" { + description = "List of cidr_blocks of public subnets" + value = module.vpc.public_subnets_cidr_blocks +} + +output "internet_gateway_id" { + description = "The ID of 
the Internet Gateway" + value = module.vpc.igw_id +} + +output "nat_gateway_ids" { + description = "List of IDs of the NAT Gateways" + value = module.vpc.natgw_ids +} + +output "private_route_table_ids" { + description = "List of IDs of the private route tables" + value = module.vpc.private_route_table_ids +} + +output "public_route_table_ids" { + description = "List of IDs of the public route tables" + value = module.vpc.public_route_table_ids +} + +output "vpc_endpoints_security_group_id" { + description = "ID of the security group for VPC endpoints" + value = aws_security_group.vpc_endpoints.id +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf new file mode 100644 index 000000000..43108a5ad --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf @@ -0,0 +1,96 @@ +variable "name" { + description = "Name to be used on all the resources as identifier" + type = string +} + +variable "cidr" { + description = "The CIDR block for the VPC" + type = string +} + +variable "azs" { + description = "A list of availability zones names or ids in the region" + type = list(string) +} + +variable "private_subnets" { + description = "A list of private subnets inside the VPC" + type = list(string) +} + +variable "public_subnets" { + description = "A list of public subnets inside the VPC" + type = list(string) +} + +variable "enable_nat_gateway" { + description = "Should be true if you want to provision NAT Gateways for each of your private networks" + type = bool + default = true +} + +variable "enable_vpn_gateway" { + description = "Should be true if you want to create a new VPN Gateway resource and attach it to the VPC" + type = bool + default = false +} + +variable "enable_dns_hostnames" { + description = "Should be true to enable DNS hostnames in the VPC" + type = bool + default = true +} + +variable "enable_dns_support" { + description = "Should be true to enable DNS support in the VPC" + type = bool + default = true +} + +variable "single_nat_gateway" { + description = "Should be true to provision a single shared NAT Gateway across all of your private networks" + type = bool + default = true +} + +variable "one_nat_gateway_per_az" { + description = "Should be true if you want only one NAT Gateway per availability zone" + type = bool + default = false +} + +variable "enable_flow_log" { + description = "Whether or not to enable VPC Flow Logs" + type = bool + default = false +} + +variable "create_flow_log_cloudwatch_iam_role" { + description = "Whether to create IAM role for VPC Flow Logs" + type = bool + default = false +} + +variable "create_flow_log_cloudwatch_log_group" { + description = "Whether to create CloudWatch log group for VPC Flow Logs" + type = bool + default = false +} + +variable "public_subnet_tags" { + description = "Additional tags for the public subnets" + type = map(string) + default = {} +} + +variable "private_subnet_tags" { + description = "Additional tags for the private subnets" + type = map(string) + default = {} +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/outputs.tf b/1.architectures/4.amazon-eks/terraform/outputs.tf new file mode 100644 index 000000000..247dcaa66 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/outputs.tf @@ -0,0 +1,119 @@ +output "region" { + 
description = "AWS region" + value = var.region +} + +output "cluster_name" { + description = "EKS cluster name" + value = module.eks.cluster_name +} + +output "cluster_endpoint" { + description = "Endpoint for EKS control plane" + value = module.eks.cluster_endpoint +} + +output "cluster_security_group_id" { + description = "Security group ID attached to the EKS cluster" + value = module.eks.cluster_security_group_id +} + +output "cluster_certificate_authority_data" { + description = "Base64 encoded certificate data required to communicate with the cluster" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + value = module.eks.cluster_oidc_issuer_url +} + +output "cluster_version" { + description = "The Kubernetes version for the EKS cluster" + value = module.eks.cluster_version +} + +output "vpc_id" { + description = "ID of the VPC where the cluster is deployed" + value = module.vpc.vpc_id +} + +output "vpc_cidr_block" { + description = "CIDR block of the VPC" + value = module.vpc.vpc_cidr_block +} + +output "private_subnets" { + description = "List of IDs of private subnets" + value = module.vpc.private_subnets +} + +output "public_subnets" { + description = "List of IDs of public subnets" + value = module.vpc.public_subnets +} + +output "node_security_group_id" { + description = "ID of the node shared security group" + value = module.eks.node_security_group_id +} + +output "eks_managed_node_groups" { + description = "Map of attribute maps for all EKS managed node groups created" + value = module.eks.eks_managed_node_groups +} + +output "fsx_lustre_id" { + description = "FSx Lustre file system ID" + value = module.fsx_lustre.file_system_id +} + +output "fsx_lustre_mount_name" { + description = "FSx Lustre mount name" + value = module.fsx_lustre.mount_name +} + +output "fsx_lustre_dns_name" { + description = "FSx Lustre DNS name" + value = module.fsx_lustre.dns_name +} + +output "s3_mountpoint_service_account_arn" { + description = "ARN of the S3 Mountpoint service account" + value = module.s3_mountpoint.service_account_arn +} + +output "s3_mountpoint_role_arn" { + description = "ARN of the S3 Mountpoint IAM role" + value = module.s3_mountpoint.role_arn +} + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${var.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "node_health_dashboard_url" { + description = "URL of the CloudWatch dashboard for node health monitoring" + value = module.addons.node_health_dashboard_url +} + +output "node_health_sns_topic_arn" { + description = "ARN of the SNS topic for node health alerts" + value = module.addons.node_health_sns_topic_arn +} + +output "karpenter_role_arn" { + description = "ARN of the Karpenter IAM role" + value = module.addons.karpenter_role_arn +} + +output "karpenter_instance_profile_name" { + description = "Name of the Karpenter node instance profile" + value = module.addons.karpenter_instance_profile_name +} + +output "karpenter_queue_name" { + description = "Name of the Karpenter SQS queue for spot instance interruption handling" + value = module.addons.karpenter_queue_name +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example 
b/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example new file mode 100644 index 000000000..d4b3d6cca --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example @@ -0,0 +1,67 @@ +# AWS Configuration +region = "us-west-2" +environment = "dev" + +# EKS Cluster Configuration +cluster_name = "eks-reference" +cluster_version = "1.28" + +# Network Configuration +vpc_cidr = "10.0.0.0/16" +private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] + +# Restrict API server access (replace with your IP ranges) +cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + +# Default Node Group Configuration +default_instance_types = ["m5.large", "m5.xlarge"] +default_min_size = 1 +default_max_size = 10 +default_desired_size = 3 + +# Node Auto Repair Configuration for Default Nodes +default_health_check_grace_period = 300 # 5 minutes +default_health_check_type = "EC2" + +# GPU Node Group Configuration +gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge"] +gpu_min_size = 0 +gpu_max_size = 5 +gpu_desired_size = 1 + +# Node Auto Repair Configuration for GPU Nodes (longer grace period due to GPU driver initialization) +gpu_health_check_grace_period = 600 # 10 minutes +gpu_health_check_type = "EC2" + +# FSx for Lustre Configuration +fsx_storage_capacity = 1200 +fsx_deployment_type = "SCRATCH_2" +fsx_per_unit_storage_throughput = 50 +# fsx_s3_import_path = "s3://your-bucket-name/import-path/" +# fsx_s3_export_path = "s3://your-bucket-name/export-path/" + +# S3 Mountpoint Configuration +s3_mountpoint_bucket_name = "your-s3-bucket-name" +s3_mountpoint_namespace = "kube-system" + +# Karpenter Configuration +enable_karpenter = true +karpenter_chart_version = "v0.32.1" + +# Karpenter Node Pool Configuration +karpenter_default_capacity_types = ["spot", "on-demand"] +karpenter_default_instance_types = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge", "c5.large", "c5.xlarge", "c5.2xlarge"] + +karpenter_gpu_capacity_types = ["on-demand"] +karpenter_gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge"] + +# Add-on Configuration +enable_aws_load_balancer_controller = true +enable_nvidia_device_plugin = true +enable_metrics_server = true + +# Node Health Monitoring Configuration +enable_node_health_monitoring = true +enable_sns_alerts = false +# alert_email = "your-email@example.com" # Uncomment and set your email for alerts \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/variables.tf b/1.architectures/4.amazon-eks/terraform/variables.tf new file mode 100644 index 000000000..fe52d6078 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/variables.tf @@ -0,0 +1,251 @@ +variable "region" { + description = "AWS region" + type = string + default = "us-west-2" +} + +variable "environment" { + description = "Environment name" + type = string + default = "dev" +} + +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string + default = "eks-reference" +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string + default = "1.28" +} + +variable "vpc_cidr" { + description = "CIDR block for VPC" + type = string + default = "10.0.0.0/16" +} + +variable "private_subnets" { + description = "Private subnets for EKS cluster" + type = list(string) + default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +} + +variable 
"public_subnets" { + description = "Public subnets for EKS cluster" + type = list(string) + default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] +} + +variable "cluster_endpoint_public_access_cidrs" { + description = "List of CIDR blocks that can access the Amazon EKS public API server endpoint" + type = list(string) + default = ["0.0.0.0/0"] +} + +# Default Node Group Variables +variable "default_instance_types" { + description = "List of instance types for default node group" + type = list(string) + default = ["m5.large", "m5.xlarge"] +} + +variable "default_min_size" { + description = "Minimum number of nodes in default node group" + type = number + default = 1 +} + +variable "default_max_size" { + description = "Maximum number of nodes in default node group" + type = number + default = 10 +} + +variable "default_desired_size" { + description = "Desired number of nodes in default node group" + type = number + default = 3 +} + +variable "default_health_check_grace_period" { + description = "Grace period for health checks on default node group (seconds)" + type = number + default = 300 +} + +variable "default_health_check_type" { + description = "Health check type for default node group (EC2 or ELB)" + type = string + default = "EC2" + validation { + condition = contains(["EC2", "ELB"], var.default_health_check_type) + error_message = "Health check type must be either EC2 or ELB." + } +} + +# GPU Node Group Variables +variable "gpu_instance_types" { + description = "List of GPU instance types for GPU node group" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "p3.2xlarge"] +} + +variable "gpu_min_size" { + description = "Minimum number of nodes in GPU node group" + type = number + default = 0 +} + +variable "gpu_max_size" { + description = "Maximum number of nodes in GPU node group" + type = number + default = 5 +} + +variable "gpu_desired_size" { + description = "Desired number of nodes in GPU node group" + type = number + default = 1 +} + +variable "gpu_health_check_grace_period" { + description = "Grace period for health checks on GPU node group (seconds) - GPU nodes need longer startup time" + type = number + default = 600 +} + +variable "gpu_health_check_type" { + description = "Health check type for GPU node group (EC2 or ELB)" + type = string + default = "EC2" + validation { + condition = contains(["EC2", "ELB"], var.gpu_health_check_type) + error_message = "Health check type must be either EC2 or ELB." + } +} + +# FSx for Lustre Variables +variable "fsx_storage_capacity" { + description = "Storage capacity for FSx Lustre in GiB" + type = number + default = 1200 +} + +variable "fsx_deployment_type" { + description = "Deployment type for FSx Lustre" + type = string + default = "SCRATCH_2" + validation { + condition = contains(["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1", "PERSISTENT_2"], var.fsx_deployment_type) + error_message = "Valid values for fsx_deployment_type are SCRATCH_1, SCRATCH_2, PERSISTENT_1, or PERSISTENT_2." 
+ } +} + +variable "fsx_per_unit_storage_throughput" { + description = "Per unit storage throughput for FSx Lustre in MB/s/TiB" + type = number + default = 50 +} + +variable "fsx_s3_import_path" { + description = "S3 import path for FSx Lustre" + type = string + default = null +} + +variable "fsx_s3_export_path" { + description = "S3 export path for FSx Lustre" + type = string + default = null +} + +# S3 Mountpoint Variables +variable "s3_mountpoint_bucket_name" { + description = "S3 bucket name for Mountpoint" + type = string + default = "" +} + +variable "s3_mountpoint_namespace" { + description = "Kubernetes namespace for S3 Mountpoint CSI driver" + type = string + default = "kube-system" +} + +# Addon Variables +# Karpenter Configuration +variable "enable_karpenter" { + description = "Enable Karpenter for node provisioning" + type = bool + default = true +} + +variable "karpenter_chart_version" { + description = "Version of the Karpenter Helm chart" + type = string + default = "v0.32.1" +} + +variable "karpenter_default_capacity_types" { + description = "Capacity types for Karpenter default node pool" + type = list(string) + default = ["spot", "on-demand"] +} + +variable "karpenter_default_instance_types" { + description = "Instance types for Karpenter default node pool" + type = list(string) + default = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge", "c5.large", "c5.xlarge", "c5.2xlarge"] +} + +variable "karpenter_gpu_capacity_types" { + description = "Capacity types for Karpenter GPU node pool" + type = list(string) + default = ["on-demand"] +} + +variable "karpenter_gpu_instance_types" { + description = "Instance types for Karpenter GPU node pool" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge", "p3.8xlarge"] +} + +variable "enable_aws_load_balancer_controller" { + description = "Enable AWS Load Balancer Controller" + type = bool + default = true +} + +variable "enable_nvidia_device_plugin" { + description = "Enable NVIDIA device plugin for GPU support" + type = bool + default = true +} + +variable "enable_metrics_server" { + description = "Enable metrics server" + type = bool + default = true +} + +variable "enable_node_health_monitoring" { + description = "Enable CloudWatch monitoring for node health and auto-repair" + type = bool + default = true +} + +variable "enable_sns_alerts" { + description = "Enable SNS alerts for node health issues" + type = bool + default = false +} + +variable "alert_email" { + description = "Email address for node health alerts" + type = string + default = "" +} \ No newline at end of file