From c0917eb32d75514dacf1d1aaaf15dc3ce8cececf Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Mon, 16 Jun 2025 00:07:43 +0000 Subject: [PATCH] adding terraform EKS reference architecture --- .../4.amazon-eks/{ => eksctl}/README.md | 0 .../{ => eksctl}/eks-g4dn-vpc.yaml | 0 .../4.amazon-eks/{ => eksctl}/eks-g4dn.yaml | 0 .../{ => eksctl}/eks-g5-node-autorepair.yaml | 0 .../{ => eksctl}/eks-p4de-odcr-vpc.yaml | 0 .../{ => eksctl}/eks-p4de-odcr.yaml | 0 .../{ => eksctl}/eks-p5-capacity-block.yaml | 0 .../{ => eksctl}/eks-p5-odcr-vpc.yaml | 0 .../4.amazon-eks/terraform/.gitignore | 34 + .../4.amazon-eks/terraform/DESTROY_PROCESS.md | 175 ++++++ .../4.amazon-eks/terraform/README.md | 470 ++++++++++++++ .../4.amazon-eks/terraform/deploy.sh | 255 ++++++++ .../4.amazon-eks/terraform/destroy.sh | 389 ++++++++++++ .../examples/fsx-lustre-example.yaml | 105 ++++ .../terraform/examples/gpu-workload.yaml | 55 ++ .../examples/karpenter-workloads.yaml | 262 ++++++++ .../examples/node-auto-repair-test.yaml | 197 ++++++ .../examples/s3-mountpoint-example.yaml | 181 ++++++ .../terraform/examples/test-karpenter.sh | 292 +++++++++ .../4.amazon-eks/terraform/main.tf | 389 ++++++++++++ .../terraform/modules/addons/main.tf | 590 ++++++++++++++++++ .../terraform/modules/addons/outputs.tf | 39 ++ .../terraform/modules/addons/variables.tf | 109 ++++ .../terraform/modules/eks/main.tf | 149 +++++ .../terraform/modules/eks/outputs.tf | 49 ++ .../terraform/modules/eks/user_data.sh | 108 ++++ .../terraform/modules/eks/variables.tf | 98 +++ .../terraform/modules/fsx-lustre/main.tf | 114 ++++ .../terraform/modules/fsx-lustre/outputs.tf | 34 + .../terraform/modules/fsx-lustre/variables.tf | 122 ++++ .../terraform/modules/s3-mountpoint/main.tf | 193 ++++++ .../modules/s3-mountpoint/outputs.tf | 21 + .../modules/s3-mountpoint/variables.tf | 61 ++ .../terraform/modules/vpc/main.tf | 147 +++++ .../terraform/modules/vpc/outputs.tf | 69 ++ .../terraform/modules/vpc/variables.tf | 96 +++ .../4.amazon-eks/terraform/outputs.tf | 119 ++++ .../terraform/terraform.tfvars.example | 67 ++ .../4.amazon-eks/terraform/variables.tf | 251 ++++++++ 39 files changed, 5240 insertions(+) rename 1.architectures/4.amazon-eks/{ => eksctl}/README.md (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g4dn-vpc.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g4dn.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-g5-node-autorepair.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p4de-odcr-vpc.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p4de-odcr.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p5-capacity-block.yaml (100%) rename 1.architectures/4.amazon-eks/{ => eksctl}/eks-p5-odcr-vpc.yaml (100%) create mode 100644 1.architectures/4.amazon-eks/terraform/.gitignore create mode 100644 1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md create mode 100644 1.architectures/4.amazon-eks/terraform/README.md create mode 100755 1.architectures/4.amazon-eks/terraform/deploy.sh create mode 100755 1.architectures/4.amazon-eks/terraform/destroy.sh create mode 100644 1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml create mode 100644 1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml create mode 100644 
1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml create mode 100755 1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh create mode 100644 1.architectures/4.amazon-eks/terraform/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh create mode 100644 1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf create mode 100644 1.architectures/4.amazon-eks/terraform/outputs.tf create mode 100644 1.architectures/4.amazon-eks/terraform/terraform.tfvars.example create mode 100644 1.architectures/4.amazon-eks/terraform/variables.tf diff --git a/1.architectures/4.amazon-eks/README.md b/1.architectures/4.amazon-eks/eksctl/README.md similarity index 100% rename from 1.architectures/4.amazon-eks/README.md rename to 1.architectures/4.amazon-eks/eksctl/README.md diff --git a/1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g4dn-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g4dn-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-g4dn.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g4dn.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g4dn.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g4dn.yaml diff --git a/1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml b/1.architectures/4.amazon-eks/eksctl/eks-g5-node-autorepair.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-g5-node-autorepair.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-g5-node-autorepair.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr-vpc.yaml diff --git a/1.architectures/4.amazon-eks/eks-p4de-odcr.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p4de-odcr.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p4de-odcr.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p5-capacity-block.yaml similarity index 100% rename from 
1.architectures/4.amazon-eks/eks-p5-capacity-block.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p5-capacity-block.yaml diff --git a/1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml b/1.architectures/4.amazon-eks/eksctl/eks-p5-odcr-vpc.yaml similarity index 100% rename from 1.architectures/4.amazon-eks/eks-p5-odcr-vpc.yaml rename to 1.architectures/4.amazon-eks/eksctl/eks-p5-odcr-vpc.yaml diff --git a/1.architectures/4.amazon-eks/terraform/.gitignore b/1.architectures/4.amazon-eks/terraform/.gitignore new file mode 100644 index 000000000..3c75e8011 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/.gitignore @@ -0,0 +1,34 @@ +# Terraform files +*.tfstate +*.tfstate.* +*.tfvars +!*.tfvars.example +.terraform/ +.terraform.lock.hcl +tfplan +tfplan.* + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Logs +*.log + +# Temporary files +*.tmp +*.temp + +# Kubectl config +kubeconfig* + +# Backup files +*.backup +*.bak \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md b/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md new file mode 100644 index 000000000..e648362d5 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/DESTROY_PROCESS.md @@ -0,0 +1,175 @@ +# Infrastructure Destruction Process + +This document outlines the safe destruction process implemented in `destroy.sh`. + +## Overview + +The `destroy.sh` script ensures safe cleanup of the EKS infrastructure by following a specific order to prevent orphaned AWS resources and failed Terraform destruction. + +## Destruction Process Flow + +### 1. Pre-Flight Checks +- ✅ Verify Terraform is installed +- ✅ Check kubectl connectivity to cluster +- ✅ Identify cluster name from Terraform state or kubectl context +- ✅ Confirm user intention with safety prompts + +### 2. Kubernetes Resource Cleanup + +#### Example Workloads +```bash +kubectl delete -f examples/gpu-workload.yaml +kubectl delete -f examples/fsx-lustre-example.yaml +kubectl delete -f examples/s3-mountpoint-example.yaml +kubectl delete -f examples/node-auto-repair-test.yaml +``` + +#### LoadBalancer Services +- Identifies all `LoadBalancer` type services across all namespaces +- Deletes each service individually with timeout protection +- Waits for AWS Load Balancers to be fully terminated + +#### Ingress Resources +- Finds all Ingress resources that may create ALBs/NLBs +- Deletes Ingress resources to trigger ALB cleanup +- Includes AWS Load Balancer Controller managed resources + +#### PersistentVolumeClaims +- Locates all PVCs that may create EBS volumes +- Deletes PVCs to release underlying EBS volumes +- Covers FSx, S3 Mountpoint, and standard EBS storage + +#### AWS Load Balancer Controller Resources +- Deletes TargetGroupBinding resources +- Ensures ALB/NLB target groups are cleaned up +- Prevents orphaned target groups + +### 3. Resource Cleanup Verification + +#### Wait Loop (10 minutes maximum) +```bash +# Continuously checks for: +- LoadBalancer services: 0 remaining +- PersistentVolumeClaims: 0 remaining +- Ingress resources: 0 remaining +``` + +#### AWS Resource Verification +```bash +# If AWS CLI available, checks for: +aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')]" +aws ec2 describe-security-groups --filters "Name=group-name,Values=*$CLUSTER_NAME*" +aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')]" +``` + +### 4. 
Terraform Destruction + +#### Safety Confirmation +- Final confirmation prompt before destruction +- Clear warning about data loss +- Option to cancel at any point + +#### Terraform Commands +```bash +# Initialize if needed +terraform init + +# Destroy with auto-approve +terraform destroy -auto-approve +``` + +### 5. Local Cleanup + +#### File Cleanup +```bash +rm -f terraform.tfstate.backup* +rm -f tfplan* +rm -f kubeconfig* +``` + +## Why This Order Matters + +### 1. **Prevent Orphaned Resources** +- Kubernetes-created AWS resources must be deleted first +- Terraform doesn't track LoadBalancers created by services +- PVCs create EBS volumes outside Terraform state + +### 2. **Avoid Terraform Errors** +- Security groups can't be deleted if still attached to resources +- Load balancers must be deleted before their target groups +- VPC can't be deleted with remaining ENIs + +### 3. **Cost Management** +- Prevents billing for orphaned load balancers +- Ensures EBS volumes are properly deleted +- Cleanup of target groups and security groups + +## Error Recovery + +### If Script Fails +```bash +# Manual cleanup commands provided in output +kubectl get svc --all-namespaces --field-selector spec.type=LoadBalancer +kubectl get pvc --all-namespaces +kubectl get ingress --all-namespaces + +# AWS CLI cleanup +aws elbv2 describe-load-balancers +aws ec2 describe-security-groups --filters "Name=group-name,Values=*eks*" +``` + +### Force Destruction +```bash +# Skip Kubernetes cleanup if cluster inaccessible +./destroy.sh --skip-k8s-cleanup + +# Bypass confirmations for automation +./destroy.sh --force +``` + +## Script Features + +### ✅ **Safety First** +- Multiple confirmation prompts +- Comprehensive resource detection +- Graceful error handling + +### ✅ **Comprehensive Cleanup** +- All AWS resource types covered +- Multiple cleanup strategies +- Verification steps + +### ✅ **User-Friendly** +- Colored output for clarity +- Progress indicators +- Detailed error messages + +### ✅ **Flexible Options** +- Skip Kubernetes cleanup +- Force mode for automation +- Help documentation + +## Expected Timeline + +| Phase | Duration | Description | +|-------|----------|-------------| +| Kubernetes Cleanup | 2-5 minutes | Delete services, PVCs, ingresses | +| AWS Resource Deletion | 5-10 minutes | Load balancers, target groups | +| Terraform Destroy | 5-15 minutes | VPC, EKS, FSx, etc. | +| **Total** | **12-30 minutes** | Complete infrastructure removal | + +## Best Practices + +1. **Always use the script** instead of direct `terraform destroy` +2. **Verify cleanup** before proceeding with Terraform destruction +3. **Check AWS console** for any remaining resources after completion +4. **Backup important data** before running destruction +5. **Test in non-production** environments first + +## Script Exit Codes + +- `0`: Successful completion +- `1`: Critical error (missing tools, failed destruction) +- `130`: User cancellation (Ctrl+C) + +This comprehensive approach ensures safe, complete, and cost-effective infrastructure destruction. 
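+
+## Checking for Leftover ENIs
+
+A common reason `terraform destroy` stalls at the VPC is a leftover elastic network interface (ENI) from a load balancer or pod that was not cleaned up. The sketch below is one way to list them before retrying the destroy; it assumes the AWS CLI is configured and that `VPC_ID` has been set to the VPC created by this stack (for example from `terraform output -raw vpc_id`).
+
+```bash
+# List ENIs still present in the VPC; each remaining ENI blocks VPC deletion
+aws ec2 describe-network-interfaces \
+  --filters "Name=vpc-id,Values=$VPC_ID" \
+  --query "NetworkInterfaces[].{Id:NetworkInterfaceId,Status:Status,Description:Description}" \
+  --output table
+```
+
+Deleting the Kubernetes resources listed above (LoadBalancer services, Ingresses, PVCs) normally removes these ENIs; only detach or delete an ENI manually once you are sure nothing managed still owns it.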
\ No newline at end of file
diff --git a/1.architectures/4.amazon-eks/terraform/README.md b/1.architectures/4.amazon-eks/terraform/README.md
new file mode 100644
index 000000000..bfb415f34
--- /dev/null
+++ b/1.architectures/4.amazon-eks/terraform/README.md
@@ -0,0 +1,470 @@
+# EKS Terraform Reference Architecture
+
+This directory contains a comprehensive Terraform configuration for deploying an Amazon EKS cluster with advanced features, including GPU support, FSx for Lustre, and Mountpoint for Amazon S3.
+
+## Architecture Overview
+
+This reference architecture includes:
+
+- **EKS Cluster**: Managed Kubernetes cluster with both default and GPU node groups
+- **GPU Support**: Dedicated GPU node groups with the NVIDIA device plugin and node auto-repair
+- **Node Auto-Repair**: Automatic detection and replacement of unhealthy nodes
+- **Storage Solutions**:
+  - FSx for Lustre for high-performance computing workloads
+  - Mountpoint for S3 for object storage access
+  - EBS and EFS CSI drivers
+- **Networking**: VPC with public/private subnets and VPC endpoints
+- **Security**: IAM roles with least-privilege access
+- **Monitoring**: CloudWatch integration and metrics server
+- **Auto-scaling**: Karpenter for intelligent and fast node provisioning
+
+## Prerequisites
+
+- AWS CLI configured with appropriate permissions
+- Terraform >= 1.0
+- kubectl
+- Helm (for add-ons)
+
+## Required AWS Permissions
+
+Your AWS credentials need permissions for:
+- EKS cluster management
+- EC2 instance and VPC management
+- IAM role and policy management
+- FSx file system management
+- S3 bucket access
+- CloudWatch and logging
+
+## Quick Start
+
+1. Clone this repository and change into the Terraform directory:
+   ```bash
+   git clone <repository-url>
+   cd 1.architectures/4.amazon-eks/terraform
+   ```
+
+2. Copy the example variables file:
+   ```bash
+   cp terraform.tfvars.example terraform.tfvars
+   ```
+
+3. Edit `terraform.tfvars` with your specific values:
+   - Update `cluster_endpoint_public_access_cidrs` with your IP ranges
+   - Set `s3_mountpoint_bucket_name` to your S3 bucket name
+   - Configure FSx S3 import/export paths if needed
+
+4. Initialize Terraform:
+   ```bash
+   terraform init
+   ```
+
+5. Plan the deployment:
+   ```bash
+   terraform plan
+   ```
+
+6. Apply the configuration:
+   ```bash
+   terraform apply
+   ```
+
+7. Configure kubectl:
+   ```bash
+   aws eks --region <region> update-kubeconfig --name <cluster-name>
+   ```
+
+## Module Structure
+
+```
+modules/
+├── vpc/            # VPC, subnets, and networking
+├── eks/            # EKS cluster and managed node groups
+├── fsx-lustre/     # FSx for Lustre file system
+├── s3-mountpoint/  # Mountpoint for S3 integration
+└── addons/         # Kubernetes add-ons and controllers
+```
+
+## Configuration Options
+
+### Node Groups
+
+#### Default Node Group
+- **Instance Types**: Configurable (default: m5.large, m5.xlarge)
+- **Scaling**: Auto-scaling with configurable min/max/desired capacity
+- **AMI**: Amazon Linux 2 EKS-optimized
+
+#### GPU Node Group
+- **Instance Types**: GPU-enabled instances (g4dn.xlarge, g4dn.2xlarge, p3.2xlarge)
+- **AMI**: Amazon Linux 2 EKS GPU-optimized
+- **Taints**: GPU nodes are tainted automatically
+- **Auto-repair**: Enabled with an extended grace period for GPU driver initialization
+
+### Node Auto-Repair
+
+Both node groups are configured with automatic node repair capabilities:
+
+#### Default Node Group Auto-Repair
+- **Health Check Type**: EC2 instance health checks
+- **Grace Period**: 300 seconds (5 minutes)
+- **Monitoring**: Continuously monitors node health via EC2 instance status
+- **Action**: Automatically replaces unhealthy nodes
+
+#### GPU Node Group Auto-Repair
+- **Health Check Type**: EC2 instance health checks
+- **Grace Period**: 600 seconds (10 minutes), extended to allow for GPU driver initialization
+- **Monitoring**: Enhanced monitoring for GPU-specific health issues
+- **Action**: Intelligent replacement that considers GPU resource constraints
+
+#### Auto-Repair Features
+- **Proactive Monitoring**: Detects node issues before they impact workloads
+- **Graceful Replacement**: Ensures workloads are safely rescheduled before node termination
+- **Cost Optimization**: Prevents resource waste from unhealthy nodes
+- **Zero-Touch Operations**: Reduces manual intervention for node maintenance
+
+### Auto-Scaling with Karpenter
+
+The reference architecture uses **Karpenter** instead of Cluster Autoscaler for node provisioning:
+
+#### Karpenter Advantages
+- **Fast Provisioning**: Sub-minute node startup times
+- **Cost Optimization**: Intelligent instance selection and Spot Instance support
+- **Flexible Scheduling**: Pod-driven node selection across diverse instance types
+- **Efficient Packing**: Optimal resource utilization and consolidation
+- **Zero Configuration**: Automatic node discovery and management
+
+#### Karpenter NodePools
+
+**Default NodePool** - for standard workloads:
+```yaml
+# Supports spot and on-demand instances
+capacity-types: ["spot", "on-demand"]
+instance-types: ["m5.*", "m5a.*", "c5.*"]
+consolidation: WhenUnderutilized (30s)
+expiration: 30 minutes
+```
+
+**GPU NodePool** - for GPU workloads:
+```yaml
+# GPU-specific instances with taints
+capacity-types: ["on-demand"]
+instance-types: ["g4dn.*", "g5.*", "p3.*"]
+consolidation: WhenEmpty (30s)
+expiration: 60 minutes
+gpu-taints: nvidia.com/gpu=true:NoSchedule
+```
+
+#### Karpenter vs Cluster Autoscaler
+
+| Feature | Karpenter | Cluster Autoscaler |
+|---------|-----------|--------------------|
+| **Provisioning Speed** | ~45 seconds | 3-5 minutes |
+| **Instance Selection** | Pod-driven | Node group limited |
+| **Spot Support** | Native & seamless | Limited |
+| **Cost Optimization** | Advanced bin-packing | Basic scaling |
+| **Configuration** | Declarative NodePools | ASG management |
+| **Multi-AZ** | Automatic | Manual setup |
+
+### Storage
+
+#### FSx for Lustre
+- 
**Deployment Types**: SCRATCH_1, SCRATCH_2, PERSISTENT_1, PERSISTENT_2 +- **S3 Integration**: Optional import/export paths +- **Performance**: Configurable throughput +- **Kubernetes Integration**: Automatic CSI driver and storage class creation + +#### Mountpoint for S3 +- **CSI Driver**: Automatically deployed and configured +- **IAM Integration**: IRSA (IAM Roles for Service Accounts) +- **Storage Classes**: Pre-configured for immediate use + +### Add-ons + +The following add-ons are included: + +- **Cluster Autoscaler**: Automatic node scaling +- **AWS Load Balancer Controller**: ALB and NLB integration +- **NVIDIA Device Plugin**: GPU resource management +- **Metrics Server**: Resource metrics collection +- **AWS Node Termination Handler**: Graceful spot instance handling +- **EBS CSI Driver**: EBS volume management +- **EFS CSI Driver**: EFS file system support + +## Security Best Practices + +- **Network Security**: Private subnets for worker nodes +- **IAM**: Least privilege access with IRSA +- **Encryption**: EBS volumes and secrets encryption +- **VPC Endpoints**: Reduced internet traffic and improved security +- **Security Groups**: Restrictive ingress rules + +## Monitoring and Logging + +- **CloudWatch**: Container Insights integration +- **VPC Flow Logs**: Network traffic monitoring +- **Node Metrics**: CPU, memory, and disk monitoring +- **Application Logs**: Centralized logging to CloudWatch + +## Example Workloads + +### GPU Workload Example + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-workload +spec: + replicas: 1 + selector: + matchLabels: + app: gpu-workload + template: + metadata: + labels: + app: gpu-workload + spec: + tolerations: + - key: nvidia.com/gpu + operator: Equal + value: "true" + effect: NoSchedule + nodeSelector: + nvidia.com/gpu: "true" + containers: + - name: gpu-container + image: nvidia/cuda:11.0-base + resources: + limits: + nvidia.com/gpu: 1 + command: ["nvidia-smi"] +``` + +### FSx Lustre Usage + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-lustre-sc + resources: + requests: + storage: 100Gi +``` + +### S3 Mountpoint Usage + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: s3-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: s3-mountpoint-sc + resources: + requests: + storage: 1000Gi +``` + +## Karpenter Workload Examples + +### Standard Workload with Spot Preference +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: web-app +spec: + replicas: 5 + template: + spec: + containers: + - name: nginx + image: nginx + resources: + requests: + cpu: 100m + memory: 128Mi + # Prefer spot instances for cost optimization + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] +``` + +### GPU Workload Example +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: gpu-training +spec: + template: + spec: + containers: + - name: ml-training + image: nvidia/cuda:11.8-base + resources: + requests: + nvidia.com/gpu: 1 + cpu: 2000m + memory: 8Gi + nodeSelector: + node-type: gpu + tolerations: + - key: nvidia.com/gpu + effect: NoSchedule +``` + +### Testing Karpenter +```bash +# Deploy test workloads +kubectl apply -f examples/karpenter-workloads.yaml + +# Run comprehensive Karpenter tests +./examples/test-karpenter.sh test + +# Monitor 
Karpenter scaling +kubectl get nodes -w +kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f +``` + +## Cost Optimization + +- **Spot Instances**: Can be enabled for cost savings +- **Single NAT Gateway**: Reduces NAT gateway costs (configurable) +- **VPC Endpoints**: Reduces data transfer costs +- **Auto-scaling**: Right-sizing based on demand + +## Troubleshooting + +### Common Issues + +1. **GPU Nodes Not Ready**: Check NVIDIA driver installation in user data +2. **FSx Mount Issues**: Verify security group rules and Lustre client installation +3. **S3 Mountpoint Errors**: Check IAM permissions and bucket policies +4. **Karpenter Issues**: Check NodePools, EC2NodeClasses, and IAM permissions +5. **Node Auto-Repair Issues**: + - Check EC2 instance health in AWS console + - Verify health check grace periods are appropriate + - Monitor CloudWatch metrics for node health events + +### Debugging Commands + +```bash +# Check node status +kubectl get nodes -o wide + +# Check GPU resources +kubectl describe nodes -l nvidia.com/gpu=true + +# Check storage classes +kubectl get storageclass + +# Check persistent volumes +kubectl get pv,pvc + +# Check add-on status +kubectl get pods -n kube-system + +# Monitor node health and auto-repair +kubectl get nodes --show-labels +kubectl describe node + +# Check node group health in AWS CLI +aws eks describe-nodegroup --cluster-name --nodegroup-name + +# Monitor auto-repair events +kubectl get events --field-selector involvedObject.kind=Node --sort-by='.lastTimestamp' + +# Check Karpenter status and logs +kubectl get nodepool,ec2nodeclass +kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=50 + +# Test Karpenter provisioning +./examples/test-karpenter.sh monitor +``` + +## Cleanup + +### Safe Infrastructure Destruction + +Use the provided destroy script for safe cleanup: + +```bash +./destroy.sh +``` + +The destroy script will: +1. **Clean up Kubernetes resources** that create AWS resources (LoadBalancers, PVCs, Ingresses) +2. **Wait for AWS resources** to be fully deleted +3. **Run terraform destroy** to remove all infrastructure +4. **Clean up local files** (state backups, plans, etc.) + +### Script Options + +```bash +# Interactive cleanup (default) +./destroy.sh + +# Skip Kubernetes cleanup (if cluster is not accessible) +./destroy.sh --skip-k8s-cleanup + +# Force mode (skip confirmations) +./destroy.sh --force + +# Get help +./destroy.sh --help +``` + +### Manual Cleanup (if script fails) + +If the destroy script fails, you can manually clean up: + +```bash +# Delete example workloads +kubectl delete -f examples/ --ignore-not-found=true + +# Delete LoadBalancer services +kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' | while read ns svc; do kubectl delete svc $svc -n $ns; done + +# Delete PersistentVolumeClaims +kubectl delete pvc --all --all-namespaces + +# Delete Ingress resources +kubectl delete ingress --all --all-namespaces + +# Wait for AWS resources to be cleaned up (5-10 minutes) +# Then run terraform destroy +terraform destroy +``` + +**Important**: Always ensure Kubernetes resources are deleted before running `terraform destroy` to prevent orphaned AWS resources. + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make changes and test +4. Submit a pull request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
+ +## Support + +For issues and questions: +- Check the troubleshooting section +- Review AWS EKS documentation +- Open an issue in this repository \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/deploy.sh b/1.architectures/4.amazon-eks/terraform/deploy.sh new file mode 100755 index 000000000..ac4429ad3 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/deploy.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check prerequisites +check_prerequisites() { + print_status "Checking prerequisites..." + + # Check if terraform is installed + if ! command -v terraform &> /dev/null; then + print_error "Terraform is not installed. Please install Terraform first." + exit 1 + fi + + # Check if AWS CLI is installed + if ! command -v aws &> /dev/null; then + print_error "AWS CLI is not installed. Please install AWS CLI first." + exit 1 + fi + + # Check if kubectl is installed + if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed. Please install kubectl first." + exit 1 + fi + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + print_error "AWS credentials not configured. Please run 'aws configure' first." + exit 1 + fi + + print_success "All prerequisites are satisfied." +} + +# Check if terraform.tfvars exists +check_tfvars() { + if [ ! -f "terraform.tfvars" ]; then + print_warning "terraform.tfvars not found. Creating from example..." + cp terraform.tfvars.example terraform.tfvars + print_warning "Please edit terraform.tfvars with your specific values before proceeding." + print_warning "Key values to update:" + echo " - cluster_endpoint_public_access_cidrs (your IP ranges)" + echo " - s3_mountpoint_bucket_name (your S3 bucket name)" + echo " - fsx_s3_import_path and fsx_s3_export_path (if using S3 integration)" + read -p "Press Enter to continue after editing terraform.tfvars..." + fi +} + +# Initialize Terraform +init_terraform() { + print_status "Initializing Terraform..." + terraform init + print_success "Terraform initialized successfully." +} + +# Plan Terraform deployment +plan_terraform() { + print_status "Planning Terraform deployment..." + terraform plan -out=tfplan + print_success "Terraform plan completed successfully." +} + +# Apply Terraform configuration +apply_terraform() { + print_status "Applying Terraform configuration..." + print_warning "This will create AWS resources that may incur costs." + read -p "Do you want to continue? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + terraform apply tfplan + print_success "Terraform apply completed successfully." + else + print_status "Deployment cancelled." + exit 0 + fi +} + +# Configure kubectl +configure_kubectl() { + print_status "Configuring kubectl..." + + # Get cluster name and region from Terraform outputs + CLUSTER_NAME=$(terraform output -raw cluster_name) + REGION=$(terraform output -raw region) + + # Update kubeconfig + aws eks --region $REGION update-kubeconfig --name $CLUSTER_NAME + + # Test connection + if kubectl get nodes &> /dev/null; then + print_success "kubectl configured successfully." 
+ print_status "Cluster nodes:" + kubectl get nodes -o wide + else + print_error "Failed to connect to cluster. Please check your configuration." + exit 1 + fi +} + +# Deploy example workloads +deploy_examples() { + print_status "Do you want to deploy example workloads?" + read -p "Deploy examples? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + print_status "Deploying GPU workload example..." + kubectl apply -f examples/gpu-workload.yaml + + print_status "Deploying FSx Lustre example..." + kubectl apply -f examples/fsx-lustre-example.yaml + + print_status "Deploying S3 Mountpoint example..." + kubectl apply -f examples/s3-mountpoint-example.yaml + + print_success "Example workloads deployed successfully." + + print_status "Checking deployment status..." + kubectl get pods,pvc -o wide + fi +} + +# Display cluster information +show_cluster_info() { + print_status "Cluster Information:" + echo "====================" + + # Terraform outputs + echo "Cluster Name: $(terraform output -raw cluster_name)" + echo "Cluster Endpoint: $(terraform output -raw cluster_endpoint)" + echo "Region: $(terraform output -raw region)" + echo "VPC ID: $(terraform output -raw vpc_id)" + + echo "" + print_status "Useful Commands:" + echo "==================" + echo "View cluster nodes: kubectl get nodes -o wide" + echo "View all pods: kubectl get pods --all-namespaces" + echo "View storage classes: kubectl get storageclass" + echo "View persistent volumes: kubectl get pv,pvc --all-namespaces" + echo "Check GPU nodes: kubectl describe nodes -l nvidia.com/gpu=true" + echo "View cluster info: kubectl cluster-info" + + echo "" + print_status "Monitoring:" + echo "===========" + echo "Check cluster autoscaler: kubectl logs -n kube-system -l app=cluster-autoscaler" + echo "Check load balancer controller: kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller" + echo "Check NVIDIA device plugin: kubectl logs -n kube-system -l name=nvidia-device-plugin-ds" +} + +# Main deployment function +main() { + print_status "Starting EKS Reference Architecture Deployment" + echo "==============================================" + + check_prerequisites + check_tfvars + init_terraform + plan_terraform + apply_terraform + configure_kubectl + deploy_examples + show_cluster_info + + print_success "Deployment completed successfully!" + print_status "Your EKS cluster is ready to use." + echo "" + print_status "To destroy the infrastructure safely, use:" + echo " ./destroy.sh" +} + +# Cleanup function +cleanup() { + print_status "Starting cleanup process..." + print_warning "This will destroy all resources created by Terraform." + print_warning "Make sure to delete any Kubernetes resources (LoadBalancers, PVCs) first!" + + read -p "Are you sure you want to destroy all resources? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + # Delete example workloads first + print_status "Deleting example workloads..." + kubectl delete -f examples/ --ignore-not-found=true || true + + # Wait for cleanup + print_status "Waiting for Kubernetes resources to be cleaned up..." + sleep 30 + + # Destroy Terraform resources + print_status "Destroying Terraform resources..." + terraform destroy -auto-approve + + print_success "Cleanup completed successfully." + else + print_status "Cleanup cancelled." 
+ fi +} + +# Script usage +usage() { + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " deploy - Deploy the EKS cluster (default)" + echo " cleanup - Destroy all resources" + echo " help - Show this help message" + echo "" + echo "Examples:" + echo " $0 deploy # Deploy the cluster" + echo " $0 cleanup # Destroy the cluster" + echo " $0 # Deploy the cluster (default)" +} + +# Parse command line arguments +case "${1:-deploy}" in + deploy) + main + ;; + cleanup) + cleanup + ;; + help|--help|-h) + usage + ;; + *) + print_error "Unknown command: $1" + usage + exit 1 + ;; +esac \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/destroy.sh b/1.architectures/4.amazon-eks/terraform/destroy.sh new file mode 100755 index 000000000..aab52d3bb --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/destroy.sh @@ -0,0 +1,389 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to check if kubectl is configured +check_kubectl() { + print_status "Checking kubectl configuration..." + if ! kubectl cluster-info &> /dev/null; then + print_warning "kubectl is not configured or cluster is not accessible." + print_warning "Some cleanup steps will be skipped." + return 1 + fi + print_success "kubectl is configured and cluster is accessible." + return 0 +} + +# Function to get cluster name from terraform output +get_cluster_name() { + if [ -f "terraform.tfstate" ]; then + CLUSTER_NAME=$(terraform output -raw cluster_name 2>/dev/null || echo "") + if [ -n "$CLUSTER_NAME" ]; then + print_status "Found cluster name from Terraform: $CLUSTER_NAME" + return 0 + fi + fi + + # Try to get from kubectl context + CLUSTER_NAME=$(kubectl config current-context 2>/dev/null | grep -o 'arn:aws:eks:[^:]*:[^:]*:cluster/[^/]*' | cut -d'/' -f2 2>/dev/null || echo "") + if [ -n "$CLUSTER_NAME" ]; then + print_status "Found cluster name from kubectl context: $CLUSTER_NAME" + return 0 + fi + + print_warning "Could not determine cluster name" + return 1 +} + +# Function to delete LoadBalancer services +delete_load_balancers() { + print_status "Checking for LoadBalancer services..." + + LB_SERVICES=$(kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$LB_SERVICES" ]; then + print_warning "Found LoadBalancer services that need to be deleted:" + echo "$LB_SERVICES" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + SERVICE=$(echo "$line" | awk '{print $2}') + print_status "Deleting LoadBalancer service: $NAMESPACE/$SERVICE" + kubectl delete svc "$SERVICE" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete service $NAMESPACE/$SERVICE" + fi + done <<< "$LB_SERVICES" + + print_status "Waiting for LoadBalancers to be fully deleted..." + sleep 30 + else + print_success "No LoadBalancer services found." + fi +} + +# Function to delete Ingress resources +delete_ingresses() { + print_status "Checking for Ingress resources..." 
+ + INGRESSES=$(kubectl get ingress --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$INGRESSES" ]; then + print_warning "Found Ingress resources that need to be deleted:" + echo "$INGRESSES" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + INGRESS=$(echo "$line" | awk '{print $2}') + print_status "Deleting Ingress: $NAMESPACE/$INGRESS" + kubectl delete ingress "$INGRESS" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete ingress $NAMESPACE/$INGRESS" + fi + done <<< "$INGRESSES" + + print_status "Waiting for Ingresses to be fully deleted..." + sleep 30 + else + print_success "No Ingress resources found." + fi +} + +# Function to delete PersistentVolumeClaims +delete_pvcs() { + print_status "Checking for PersistentVolumeClaims..." + + PVCS=$(kubectl get pvc --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$PVCS" ]; then + print_warning "Found PersistentVolumeClaims that need to be deleted:" + echo "$PVCS" + + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + PVC=$(echo "$line" | awk '{print $2}') + print_status "Deleting PVC: $NAMESPACE/$PVC" + kubectl delete pvc "$PVC" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete PVC $NAMESPACE/$PVC" + fi + done <<< "$PVCS" + + print_status "Waiting for PVCs to be fully deleted..." + sleep 30 + else + print_success "No PersistentVolumeClaims found." + fi +} + +# Function to delete example workloads +delete_example_workloads() { + print_status "Deleting example workloads..." + + if [ -d "examples" ]; then + for example_file in examples/*.yaml; do + if [ -f "$example_file" ]; then + print_status "Deleting resources from $example_file" + kubectl delete -f "$example_file" --ignore-not-found=true --timeout=300s || print_warning "Failed to delete some resources from $example_file" + fi + done + print_success "Example workloads cleanup completed." + else + print_status "No examples directory found." + fi +} + +# Function to delete AWS Load Balancer Controller resources +delete_alb_resources() { + print_status "Checking for AWS Load Balancer Controller managed resources..." + + # Delete TargetGroupBinding resources + TGB_RESOURCES=$(kubectl get targetgroupbindings --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null || echo "") + + if [ -n "$TGB_RESOURCES" ]; then + print_warning "Found TargetGroupBinding resources:" + while IFS= read -r line; do + if [ -n "$line" ]; then + NAMESPACE=$(echo "$line" | awk '{print $1}') + TGB=$(echo "$line" | awk '{print $2}') + print_status "Deleting TargetGroupBinding: $NAMESPACE/$TGB" + kubectl delete targetgroupbinding "$TGB" -n "$NAMESPACE" --timeout=300s || print_warning "Failed to delete TargetGroupBinding $NAMESPACE/$TGB" + fi + done <<< "$TGB_RESOURCES" + fi + + print_success "AWS Load Balancer Controller resources cleanup completed." +} + +# Function to wait for all resources to be deleted +wait_for_cleanup() { + print_status "Waiting for all Kubernetes resources to be fully cleaned up..." 
+ + # Wait up to 10 minutes for resources to be deleted + for i in {1..60}; do + LB_COUNT=$(kubectl get svc --all-namespaces -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.name}{"\n"}{end}' 2>/dev/null | wc -l || echo "0") + PVC_COUNT=$(kubectl get pvc --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0") + INGRESS_COUNT=$(kubectl get ingress --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0") + + if [ "$LB_COUNT" -eq 0 ] && [ "$PVC_COUNT" -eq 0 ] && [ "$INGRESS_COUNT" -eq 0 ]; then + print_success "All Kubernetes resources have been cleaned up." + break + fi + + print_status "Still waiting for cleanup... (${i}/60) - LBs: $LB_COUNT, PVCs: $PVC_COUNT, Ingresses: $INGRESS_COUNT" + sleep 10 + done +} + +# Function to check for remaining AWS resources +check_aws_resources() { + print_status "Checking for remaining AWS resources that might block Terraform destroy..." + + if command -v aws &> /dev/null && [ -n "$CLUSTER_NAME" ]; then + print_status "Checking for remaining Load Balancers..." + aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')].LoadBalancerName" --output table 2>/dev/null || print_warning "Could not check ELBv2 resources" + + print_status "Checking for remaining Security Groups..." + aws ec2 describe-security-groups --filters "Name=group-name,Values=*$CLUSTER_NAME*" --query "SecurityGroups[].GroupName" --output table 2>/dev/null || print_warning "Could not check Security Groups" + + print_status "Checking for remaining Target Groups..." + aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')].TargetGroupName" --output table 2>/dev/null || print_warning "Could not check Target Groups" + else + print_warning "AWS CLI not available or cluster name not found. Skipping AWS resource check." + fi +} + +# Function to run terraform destroy +run_terraform_destroy() { + print_status "Running terraform destroy..." + print_warning "This will destroy all Terraform-managed infrastructure." + print_warning "Make sure you have backed up any important data." + + read -p "Are you sure you want to proceed with terraform destroy? (yes/no): " -r + if [[ $REPLY == "yes" ]]; then + print_status "Proceeding with terraform destroy..." + + # Initialize terraform if needed + if [ ! -d ".terraform" ]; then + print_status "Initializing Terraform..." + terraform init + fi + + # Run destroy with auto-approve + terraform destroy -auto-approve + + if [ $? -eq 0 ]; then + print_success "Terraform destroy completed successfully!" + else + print_error "Terraform destroy failed. Please check the output above." + exit 1 + fi + else + print_status "Terraform destroy cancelled." + exit 0 + fi +} + +# Function to cleanup local files +cleanup_local_files() { + print_status "Cleaning up local files..." + + # Remove terraform state backup files + rm -f terraform.tfstate.backup* 2>/dev/null || true + rm -f tfplan* 2>/dev/null || true + + # Remove kubectl config backups + rm -f kubeconfig* 2>/dev/null || true + + print_success "Local cleanup completed." +} + +# Main function +main() { + print_status "Starting EKS Infrastructure Destruction" + echo "========================================" + + print_warning "This script will:" + echo "1. Delete all Kubernetes resources that create AWS resources" + echo "2. Wait for cleanup to complete" + echo "3. Run terraform destroy to remove all infrastructure" + echo "4. Clean up local files" + echo "" + + read -p "Do you want to continue? 
(y/N): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + print_status "Destruction cancelled." + exit 0 + fi + + # Check prerequisites + if ! command -v terraform &> /dev/null; then + print_error "Terraform is not installed." + exit 1 + fi + + # Get cluster name + get_cluster_name + + # Check kubectl and proceed with Kubernetes cleanup if available + if check_kubectl; then + print_status "Starting Kubernetes resource cleanup..." + + delete_example_workloads + delete_load_balancers + delete_ingresses + delete_alb_resources + delete_pvcs + wait_for_cleanup + + print_success "Kubernetes cleanup completed." + else + print_warning "Skipping Kubernetes cleanup due to connectivity issues." + print_warning "You may need to manually clean up AWS resources if Terraform destroy fails." + fi + + # Check for remaining AWS resources + check_aws_resources + + # Wait a bit more to ensure AWS resources are cleaned up + print_status "Waiting additional 60 seconds for AWS resource cleanup..." + sleep 60 + + # Run terraform destroy + run_terraform_destroy + + # Clean up local files + cleanup_local_files + + print_success "Infrastructure destruction completed!" + print_status "All resources have been destroyed and local files cleaned up." +} + +# Handle script termination +cleanup_on_exit() { + print_warning "Script interrupted. Some resources may not be fully cleaned up." + print_warning "You may need to manually delete remaining AWS resources." +} + +trap cleanup_on_exit EXIT + +# Help function +usage() { + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --skip-k8s-cleanup Skip Kubernetes resource cleanup" + echo " --force Skip confirmation prompts" + echo " --help Show this help message" + echo "" + echo "This script safely destroys the EKS infrastructure by:" + echo "1. Cleaning up Kubernetes resources that create AWS resources" + echo "2. Waiting for AWS resources to be fully deleted" + echo "3. Running terraform destroy" + echo "4. Cleaning up local files" +} + +# Parse command line arguments +SKIP_K8S_CLEANUP=false +FORCE=false + +while [[ $# -gt 0 ]]; do + case $1 in + --skip-k8s-cleanup) + SKIP_K8S_CLEANUP=true + shift + ;; + --force) + FORCE=true + shift + ;; + --help|-h) + usage + exit 0 + ;; + *) + print_error "Unknown option: $1" + usage + exit 1 + ;; + esac +done + +# Override main function for options +if [ "$SKIP_K8S_CLEANUP" = true ]; then + print_warning "Skipping Kubernetes cleanup as requested." + run_terraform_destroy + cleanup_local_files + exit 0 +fi + +if [ "$FORCE" = true ]; then + print_warning "Force mode enabled. Skipping confirmations." 
+ # Override read commands in functions + export REPLY="yes" +fi + +# Run main function +main \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml b/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml new file mode 100644 index 000000000..0b5e47f38 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/fsx-lustre-example.yaml @@ -0,0 +1,105 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc + namespace: default +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-lustre-sc + resources: + requests: + storage: 100Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fsx-lustre-test + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: fsx-lustre-test + template: + metadata: + labels: + app: fsx-lustre-test + spec: + containers: + - name: test-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y util-linux + echo "Testing FSx Lustre mount at /mnt/fsx" + df -h /mnt/fsx + echo "Writing test file..." + echo "$(date): Hello from $(hostname)" >> /mnt/fsx/test-$(hostname).txt + echo "Reading test files:" + ls -la /mnt/fsx/ + cat /mnt/fsx/test-*.txt + echo "FSx Lustre test completed. Sleeping..." + sleep 3600 + volumeMounts: + - name: fsx-volume + mountPath: /mnt/fsx + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: fsx-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: fsx-performance-test + namespace: default +spec: + template: + spec: + containers: + - name: performance-test + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y time + echo "Running FSx Lustre performance test..." + + # Write performance test + echo "Testing write performance..." + time dd if=/dev/zero of=/mnt/fsx/test-write-$(date +%s).dat bs=1M count=100 oflag=direct + + # Read performance test + echo "Testing read performance..." 
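+              # iflag=direct bypasses the page cache, so this read measures FSx for Lustre throughput rather than local memory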
+ time dd if=/mnt/fsx/test-write-*.dat of=/dev/null bs=1M iflag=direct + + echo "Performance test completed" + volumeMounts: + - name: fsx-volume + mountPath: /mnt/fsx + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: fsx-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + restartPolicy: Never + backoffLimit: 3 \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml b/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml new file mode 100644 index 000000000..0cc704f04 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/gpu-workload.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gpu-test + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: gpu-test + template: + metadata: + labels: + app: gpu-test + spec: + tolerations: + - key: nvidia.com/gpu + operator: Equal + value: "true" + effect: NoSchedule + nodeSelector: + nvidia.com/gpu: "true" + containers: + - name: gpu-container + image: nvidia/cuda:11.8-base-ubuntu20.04 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + command: + - /bin/bash + - -c + - | + nvidia-smi + echo "GPU test completed successfully" + sleep 3600 + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" +--- +apiVersion: v1 +kind: Service +metadata: + name: gpu-test-service + namespace: default +spec: + selector: + app: gpu-test + ports: + - port: 80 + targetPort: 8080 + type: ClusterIP \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml b/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml new file mode 100644 index 000000000..078b9ebc2 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/karpenter-workloads.yaml @@ -0,0 +1,262 @@ +--- +# Example 1: Standard workload that will be scheduled on Karpenter-managed nodes +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-example-app + namespace: default +spec: + replicas: 3 + selector: + matchLabels: + app: karpenter-example + template: + metadata: + labels: + app: karpenter-example + spec: + containers: + - name: app + image: nginx:latest + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + ports: + - containerPort: 80 + # This workload will prefer spot instances but can use on-demand + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + +--- +# Example 2: GPU workload that requires GPU nodes +apiVersion: batch/v1 +kind: Job +metadata: + name: karpenter-gpu-workload + namespace: default +spec: + template: + spec: + containers: + - name: gpu-job + image: nvidia/cuda:11.8-base-ubuntu20.04 + command: + - /bin/bash + - -c + - | + echo "Starting GPU workload on Karpenter-managed node" + nvidia-smi + echo "GPU information:" + nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv + # Simulate GPU workload + echo "Running CUDA sample..." 
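+              # The sleep below stands in for a real CUDA workload; replace it with your actual training or inference command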
+ sleep 300 + echo "GPU workload completed" + resources: + requests: + nvidia.com/gpu: 1 + cpu: 1000m + memory: 2Gi + limits: + nvidia.com/gpu: 1 + cpu: 2000m + memory: 4Gi + # This will specifically target GPU nodes managed by Karpenter + nodeSelector: + node-type: gpu + nvidia.com/gpu: "true" + tolerations: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + restartPolicy: Never + backoffLimit: 3 + +--- +# Example 3: Burst workload that demonstrates Karpenter's fast scaling +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-burst-workload + namespace: default +spec: + replicas: 1 # Will be scaled up to demonstrate Karpenter + selector: + matchLabels: + app: burst-workload + template: + metadata: + labels: + app: burst-workload + spec: + containers: + - name: cpu-intensive + image: busybox:latest + command: + - /bin/sh + - -c + - | + echo "Starting CPU-intensive workload on $(hostname)" + echo "Node labels:" + cat /etc/hostname + # Simulate CPU-intensive work + while true; do + echo "Working... $(date)" + # Light CPU work to demonstrate scaling + dd if=/dev/zero of=/dev/null bs=1M count=100 2>/dev/null + sleep 10 + done + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 1000m + memory: 1Gi + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + +--- +# Example 4: Mixed workload that can run on both spot and on-demand +apiVersion: apps/v1 +kind: Deployment +metadata: + name: karpenter-mixed-workload + namespace: default +spec: + replicas: 5 + selector: + matchLabels: + app: mixed-workload + template: + metadata: + labels: + app: mixed-workload + spec: + containers: + - name: web-server + image: httpd:2.4 + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + ports: + - containerPort: 80 + # Demonstrate Karpenter's intelligent node selection + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + affinity: + # Prefer spot instances for cost optimization + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + +--- +# Example 5: Batch job demonstrating Karpenter's ability to scale from zero +apiVersion: batch/v1 +kind: Job +metadata: + name: karpenter-batch-processing + namespace: default +spec: + parallelism: 10 + completions: 50 + template: + spec: + containers: + - name: batch-processor + image: alpine:latest + command: + - /bin/sh + - -c + - | + echo "Batch job starting on $(hostname)" + echo "Processing item $RANDOM..." 
+ # Simulate batch processing work + sleep $((RANDOM % 60 + 30)) + echo "Batch job completed on $(hostname)" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + nodeSelector: + node-type: default + tolerations: + - key: karpenter.sh/provisioner-name + operator: Exists + effect: NoSchedule + restartPolicy: Never + backoffLimit: 3 + +--- +# Service for the example app +apiVersion: v1 +kind: Service +metadata: + name: karpenter-example-service + namespace: default +spec: + selector: + app: karpenter-example + ports: + - port: 80 + targetPort: 80 + type: ClusterIP + +--- +# HPA for demonstrating Karpenter's integration with pod autoscaling +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: karpenter-example-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: karpenter-burst-workload + minReplicas: 1 + maxReplicas: 20 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml b/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml new file mode 100644 index 000000000..2d683120b --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/node-auto-repair-test.yaml @@ -0,0 +1,197 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: node-health-test-script + namespace: default +data: + test-script.sh: | + #!/bin/bash + echo "Node Auto-Repair Test Script" + echo "==============================" + + # Get current node information + NODE_NAME=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name" | head -1) + echo "Testing with node: $NODE_NAME" + + # Check node health status + echo "Current node status:" + kubectl describe node $NODE_NAME | grep -A 5 "Conditions:" + + # Monitor node health over time + echo "Monitoring node health for 5 minutes..." 
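+    # 30 checks with a 10-second sleep between them cover the 5-minute monitoring window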
+ for i in {1..30}; do + echo "Check $i/30 at $(date)" + kubectl get nodes --no-headers | grep -v "Ready" + sleep 10 + done + + echo "Node health monitoring complete" + +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: node-auto-repair-test + namespace: default +spec: + template: + spec: + serviceAccountName: default + containers: + - name: node-health-tester + image: bitnami/kubectl:latest + command: + - /bin/bash + - /scripts/test-script.sh + volumeMounts: + - name: test-script + mountPath: /scripts + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: test-script + configMap: + name: node-health-test-script + defaultMode: 0755 + restartPolicy: Never + backoffLimit: 1 + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-health-monitor + namespace: default + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-health-monitor +rules: +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-health-monitor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-health-monitor +subjects: +- kind: ServiceAccount + name: node-health-monitor + namespace: default + +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-health-monitor + namespace: default + labels: + app: node-health-monitor +spec: + selector: + matchLabels: + app: node-health-monitor + template: + metadata: + labels: + app: node-health-monitor + spec: + serviceAccountName: node-health-monitor + hostNetwork: true + hostPID: true + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: nvidia.com/gpu + effect: NoSchedule + containers: + - name: node-health-monitor + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + yum update -y + yum install -y procps-ng util-linux + + echo "Node Health Monitor Starting on $(hostname)" + echo "Node: $NODE_NAME" + echo "Namespace: $POD_NAMESPACE" + + while true; do + # Check system health + echo "=== Health Check at $(date) ===" + + # CPU and Memory usage + echo "CPU Usage:" + cat /proc/loadavg + + echo "Memory Usage:" + free -h + + # Disk usage + echo "Disk Usage:" + df -h | grep -E "(/$|/var|/tmp)" + + # Check for any hardware issues + echo "Checking dmesg for errors:" + dmesg | tail -10 | grep -i "error\|fail\|warning" || echo "No recent errors found" + + # GPU health check (if GPU node) + if command -v nvidia-smi &> /dev/null; then + echo "GPU Status:" + nvidia-smi --query-gpu=name,temperature.gpu,utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits || echo "GPU check failed" + fi + + echo "Health check complete" + echo "==========================" + sleep 60 + done + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + securityContext: + privileged: true + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml 
b/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml new file mode 100644 index 000000000..de90229f8 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/s3-mountpoint-example.yaml @@ -0,0 +1,181 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: s3-mountpoint-pvc + namespace: default +spec: + accessModes: + - ReadWriteMany + storageClassName: s3-mountpoint-sc + resources: + requests: + storage: 1000Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: s3-mountpoint-test + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: s3-mountpoint-test + template: + metadata: + labels: + app: s3-mountpoint-test + spec: + containers: + - name: test-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + echo "Testing S3 Mountpoint at /mnt/s3" + ls -la /mnt/s3/ + echo "Writing test file..." + echo "$(date): Hello from $(hostname)" > /mnt/s3/test-$(hostname)-$(date +%s).txt + echo "Listing files in S3 mount:" + ls -la /mnt/s3/ + echo "Reading back test files:" + cat /mnt/s3/test-*.txt || echo "No test files found yet" + echo "S3 Mountpoint test running. Sleeping..." + sleep 3600 + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: s3-backup-job + namespace: default +spec: + schedule: "0 */6 * * *" # Every 6 hours + jobTemplate: + spec: + template: + spec: + containers: + - name: backup-container + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + echo "Starting backup job at $(date)" + + # Create backup directory + mkdir -p /mnt/s3/backups/$(date +%Y-%m-%d) + + # Example: backup some application data + echo "Backup completed at $(date)" > /mnt/s3/backups/$(date +%Y-%m-%d)/backup-$(date +%H%M%S).log + + # List backup files + echo "Current backups:" + ls -la /mnt/s3/backups/ + + echo "Backup job completed" + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc + restartPolicy: OnFailure +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: s3-app-config + namespace: default +data: + app.conf: | + # Application configuration + data_path=/mnt/s3/data + log_path=/mnt/s3/logs + backup_path=/mnt/s3/backups +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: s3-data-processor + namespace: default +spec: + serviceName: s3-data-processor + replicas: 1 + selector: + matchLabels: + app: s3-data-processor + template: + metadata: + labels: + app: s3-data-processor + spec: + containers: + - name: data-processor + image: amazonlinux:latest + command: + - /bin/bash + - -c + - | + source /etc/app/app.conf + echo "Data processor starting..." + mkdir -p $data_path $log_path $backup_path + + while true; do + timestamp=$(date +%Y%m%d-%H%M%S) + echo "[$timestamp] Processing data..." | tee -a $log_path/processor-$timestamp.log + + # Simulate data processing + echo "Sample data: $(date)" > $data_path/data-$timestamp.dat + + # Archive old data every 10 iterations + if [ $(($(date +%S) % 10)) -eq 0 ]; then + echo "Archiving old data..." 
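+            # Bundle the accumulated .dat files into a timestamped tarball under $backup_path,
+            # then remove them from the staging directory.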
+ tar -czf $backup_path/archive-$timestamp.tar.gz $data_path/*.dat + rm -f $data_path/*.dat + fi + + sleep 30 + done + volumeMounts: + - name: s3-volume + mountPath: /mnt/s3 + - name: config-volume + mountPath: /etc/app + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + volumes: + - name: s3-volume + persistentVolumeClaim: + claimName: s3-mountpoint-pvc + - name: config-volume + configMap: + name: s3-app-config \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh b/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh new file mode 100755 index 000000000..ff3455add --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/examples/test-karpenter.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check if kubectl is available and connected +check_kubectl() { + print_status "Checking kubectl connectivity..." + if ! kubectl cluster-info &> /dev/null; then + print_error "kubectl is not configured or cluster is not accessible." + exit 1 + fi + print_success "kubectl is configured and cluster is accessible." +} + +# Check if Karpenter is installed +check_karpenter() { + print_status "Checking if Karpenter is installed..." + if kubectl get deployment -n karpenter karpenter &> /dev/null; then + print_success "Karpenter is installed and running." + kubectl get pods -n karpenter + else + print_error "Karpenter is not installed. Please deploy the cluster first." + exit 1 + fi +} + +# Check Karpenter NodePools and EC2NodeClasses +check_karpenter_resources() { + print_status "Checking Karpenter NodePools..." + kubectl get nodepool + + print_status "Checking Karpenter EC2NodeClasses..." + kubectl get ec2nodeclass + + print_status "Checking current nodes..." + kubectl get nodes -o wide +} + +# Deploy test workloads +deploy_workloads() { + print_status "Deploying Karpenter test workloads..." + kubectl apply -f karpenter-workloads.yaml + + print_status "Waiting for deployments to be ready..." + kubectl wait --for=condition=available --timeout=300s deployment/karpenter-example-app + kubectl wait --for=condition=available --timeout=300s deployment/karpenter-mixed-workload + + print_success "Test workloads deployed successfully." +} + +# Test Karpenter scaling +test_scaling() { + print_status "Testing Karpenter node provisioning..." + + # Get initial node count + INITIAL_NODES=$(kubectl get nodes --no-headers | wc -l) + print_status "Initial node count: $INITIAL_NODES" + + # Scale up the burst workload to trigger node provisioning + print_status "Scaling up burst workload to trigger node provisioning..." + kubectl scale deployment karpenter-burst-workload --replicas=10 + + # Wait for Karpenter to provision new nodes + print_status "Waiting for Karpenter to provision new nodes (this may take 2-3 minutes)..." + + for i in {1..18}; do # Wait up to 3 minutes + CURRENT_NODES=$(kubectl get nodes --no-headers | wc -l) + if [ $CURRENT_NODES -gt $INITIAL_NODES ]; then + print_success "Karpenter provisioned new nodes! Current count: $CURRENT_NODES" + break + fi + echo "Waiting... 
($i/18) Current nodes: $CURRENT_NODES" + sleep 10 + done + + # Show new nodes + print_status "Current node status:" + kubectl get nodes -o wide + + # Show Karpenter logs + print_status "Recent Karpenter logs:" + kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=20 +} + +# Test GPU workload (if GPU nodes are available) +test_gpu_workload() { + print_status "Testing GPU workload..." + + # Check if GPU NodePool exists + if kubectl get nodepool gpu &> /dev/null; then + print_status "GPU NodePool found. Deploying GPU workload..." + kubectl apply -f - </dev/null | wc -l || echo "0") + + if [ $SPOT_NODES -gt 0 ]; then + print_success "Found $SPOT_NODES spot instance(s)." + kubectl get nodes -l karpenter.sh/capacity-type=spot -o wide + + print_status "Karpenter should handle spot interruptions automatically." + print_status "Check SQS queue for interruption messages: $(kubectl get nodepool default -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "N/A")" + else + print_warning "No spot instances found. Karpenter may not have provisioned spot instances yet." + fi +} + +# Monitor Karpenter metrics +monitor_karpenter() { + print_status "Monitoring Karpenter status..." + + # Show NodePool status + print_status "NodePool status:" + kubectl get nodepool -o wide + + # Show node capacity and usage + print_status "Node resource usage:" + kubectl top nodes 2>/dev/null || print_warning "Metrics server not available" + + # Show pod distribution + print_status "Pod distribution across nodes:" + kubectl get pods -o wide | grep -E "(karpenter|gpu|burst|mixed)" | head -10 + + # Show Karpenter events + print_status "Recent Karpenter events:" + kubectl get events --field-selector involvedObject.kind=Node --sort-by='.lastTimestamp' | tail -10 +} + +# Scale down test +test_scale_down() { + print_status "Testing Karpenter scale-down behavior..." + + # Scale down workloads + print_status "Scaling down workloads..." + kubectl scale deployment karpenter-burst-workload --replicas=1 + kubectl scale deployment karpenter-mixed-workload --replicas=2 + + print_status "Workloads scaled down. Karpenter should consolidate or terminate unused nodes." + print_status "This process may take several minutes. Monitor with: kubectl get nodes -w" + + CURRENT_NODES=$(kubectl get nodes --no-headers | wc -l) + print_status "Current node count: $CURRENT_NODES" + print_status "Karpenter will evaluate nodes for termination based on the consolidation policy." +} + +# Cleanup function +cleanup() { + print_status "Cleaning up test resources..." + kubectl delete -f karpenter-workloads.yaml --ignore-not-found=true + kubectl delete job karpenter-gpu-test --ignore-not-found=true + print_success "Cleanup completed." 
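+    # Also invoked automatically on script exit via the 'trap cleanup EXIT' registered at the bottom of this script.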
+} + +# Show help +show_help() { + echo "Karpenter Test Script" + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " deploy Deploy test workloads" + echo " test Run comprehensive Karpenter tests" + echo " scale Test scaling behavior" + echo " gpu Test GPU workload" + echo " monitor Monitor Karpenter status" + echo " cleanup Remove test workloads" + echo " help Show this help message" + echo "" + echo "Examples:" + echo " $0 test # Run full test suite" + echo " $0 deploy # Just deploy workloads" + echo " $0 monitor # Monitor current status" +} + +# Main function +main() { + case "${1:-test}" in + deploy) + check_kubectl + check_karpenter + deploy_workloads + ;; + test) + check_kubectl + check_karpenter + check_karpenter_resources + deploy_workloads + test_scaling + test_gpu_workload + test_spot_handling + monitor_karpenter + test_scale_down + ;; + scale) + check_kubectl + check_karpenter + test_scaling + ;; + gpu) + check_kubectl + check_karpenter + test_gpu_workload + ;; + monitor) + check_kubectl + check_karpenter + monitor_karpenter + ;; + cleanup) + check_kubectl + cleanup + ;; + help|--help|-h) + show_help + ;; + *) + print_error "Unknown command: $1" + show_help + exit 1 + ;; + esac +} + +# Handle script interruption +trap cleanup EXIT + +# Run main function +main "$@" \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/main.tf b/1.architectures/4.amazon-eks/terraform/main.tf new file mode 100644 index 000000000..2905f4164 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/main.tf @@ -0,0 +1,389 @@ +terraform { + required_version = ">= 1.0" + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.23" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.11" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "~> 1.14" + } + } +} + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + +provider "kubectl" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +locals { + name = var.cluster_name + tags = { + Environment = var.environment + Project = "EKS-Reference-Architecture" + ManagedBy = "Terraform" + } +} + +data "aws_availability_zones" "available" { + filter { + name = "state" + values = ["available"] + } +} + +data "aws_caller_identity" "current" {} + +module "vpc" { + source = "./modules/vpc" + + name = local.name + cidr = var.vpc_cidr + + azs = slice(data.aws_availability_zones.available.names, 0, 3) + private_subnets = var.private_subnets + public_subnets = var.public_subnets + + enable_nat_gateway = true + enable_vpn_gateway = false + enable_dns_hostnames = true + 
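+  # Both DNS settings are required for EKS when the private cluster endpoint is
+  # enabled, so worker nodes can resolve the API server by name and register.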
enable_dns_support = true + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + "karpenter.sh/discovery" = local.name + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} + +module "eks" { + source = "./modules/eks" + + cluster_name = local.name + cluster_version = var.cluster_version + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + control_plane_subnet_ids = module.vpc.private_subnets + + cluster_endpoint_private_access = true + cluster_endpoint_public_access = true + cluster_endpoint_public_access_cidrs = var.cluster_endpoint_public_access_cidrs + + cluster_encryption_config = [ + { + provider_key_arn = aws_kms_key.eks.arn + resources = ["secrets"] + } + ] + + cluster_addons = { + coredns = { + most_recent = true + } + kube-proxy = { + most_recent = true + } + vpc-cni = { + most_recent = true + } + aws-ebs-csi-driver = { + most_recent = true + } + aws-efs-csi-driver = { + most_recent = true + } + } + + # Karpenter requires at least one managed node group for system pods and Karpenter itself + eks_managed_node_groups = var.enable_karpenter ? { + # Minimal node group for Karpenter and system pods + karpenter = { + name = "${local.name}-karpenter" + + instance_types = ["m5.large"] + capacity_type = "ON_DEMAND" + + min_size = 2 + max_size = 3 + desired_size = 2 + + ami_type = "AL2_x86_64" + + labels = { + Environment = var.environment + NodeGroup = "karpenter" + "karpenter.sh/discovery" = local.name + } + + # Prevent Karpenter from managing these nodes + taints = [ + { + key = "CriticalAddonsOnly" + value = "true" + effect = "NO_SCHEDULE" + } + ] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair + health_check_grace_period = var.default_health_check_grace_period + health_check_type = var.default_health_check_type + + tags = merge(local.tags, { + "karpenter.sh/discovery" = local.name + }) + } + } : { + # Original node groups when Karpenter is disabled + default = { + name = "${local.name}-default" + + instance_types = var.default_instance_types + capacity_type = "ON_DEMAND" + + min_size = var.default_min_size + max_size = var.default_max_size + desired_size = var.default_desired_size + + ami_type = "AL2_x86_64" + + labels = { + Environment = var.environment + NodeGroup = "default" + } + + taints = [] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair + health_check_grace_period = var.default_health_check_grace_period + health_check_type = var.default_health_check_type + + tags = local.tags + } + + gpu = { + name = "${local.name}-gpu" + + instance_types = var.gpu_instance_types + capacity_type = "ON_DEMAND" + + min_size = var.gpu_min_size + max_size = var.gpu_max_size + desired_size = var.gpu_desired_size + + ami_type = "AL2_x86_64_GPU" + + labels = { + Environment = var.environment + NodeGroup = "gpu" + "nvidia.com/gpu" = "true" + } + + taints = [ + { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + ] + + update_config = { + max_unavailable_percentage = 25 + } + + # Enable node auto repair - GPU nodes need longer grace period + health_check_grace_period = var.gpu_health_check_grace_period + health_check_type = var.gpu_health_check_type + + tags = local.tags + } + } + + node_security_group_additional_rules = { + ingress_self_all = { + description = "Node to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + self = true + } + + 
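+    # Allows the control plane (cluster security group) to reach nodes on any port;
+    # add-ons that serve admission webhooks, such as the AWS Load Balancer Controller
+    # installed by the addons module, rely on this connectivity.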
ingress_cluster_all = { + description = "Cluster to node all ports/protocols" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + + egress_all = { + description = "Node all egress" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "egress" + cidr_blocks = ["0.0.0.0/0"] + ipv6_cidr_blocks = ["::/0"] + } + } + + node_security_group_tags = { + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} + +resource "aws_kms_key" "eks" { + description = "EKS Secret Encryption Key" + deletion_window_in_days = 7 + enable_key_rotation = true + + tags = local.tags +} + +resource "aws_kms_alias" "eks" { + name = "alias/eks-${local.name}" + target_key_id = aws_kms_key.eks.key_id +} + +module "fsx_lustre" { + source = "./modules/fsx-lustre" + + name = "${local.name}-lustre" + subnet_ids = [module.vpc.private_subnets[0]] + security_group_ids = [aws_security_group.fsx_lustre.id] + storage_capacity = var.fsx_storage_capacity + deployment_type = var.fsx_deployment_type + per_unit_storage_throughput = var.fsx_per_unit_storage_throughput + + s3_import_path = var.fsx_s3_import_path + s3_export_path = var.fsx_s3_export_path + + tags = local.tags +} + +resource "aws_security_group" "fsx_lustre" { + name = "${local.name}-fsx-lustre" + description = "Security group for FSx Lustre" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 988 + to_port = 988 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + } + + ingress { + from_port = 1021 + to_port = 1023 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.tags, { + Name = "${local.name}-fsx-lustre" + }) +} + +module "s3_mountpoint" { + source = "./modules/s3-mountpoint" + + cluster_name = module.eks.cluster_name + cluster_oidc_issuer_url = module.eks.cluster_oidc_issuer_url + + s3_bucket_name = var.s3_mountpoint_bucket_name + namespace = var.s3_mountpoint_namespace + + tags = local.tags +} + +module "addons" { + source = "./modules/addons" + + cluster_name = module.eks.cluster_name + cluster_oidc_issuer_url = module.eks.cluster_oidc_issuer_url + cluster_version = var.cluster_version + cluster_endpoint = module.eks.cluster_endpoint + vpc_id = module.vpc.vpc_id + + enable_karpenter = var.enable_karpenter + karpenter_chart_version = var.karpenter_chart_version + karpenter_default_capacity_types = var.karpenter_default_capacity_types + karpenter_default_instance_types = var.karpenter_default_instance_types + karpenter_gpu_capacity_types = var.karpenter_gpu_capacity_types + karpenter_gpu_instance_types = var.karpenter_gpu_instance_types + + enable_aws_load_balancer_controller = var.enable_aws_load_balancer_controller + enable_nvidia_device_plugin = var.enable_nvidia_device_plugin + enable_metrics_server = var.enable_metrics_server + enable_node_health_monitoring = var.enable_node_health_monitoring + enable_sns_alerts = var.enable_sns_alerts + alert_email = var.alert_email + + tags = local.tags +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf new file mode 100644 index 000000000..4994ed7ec --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/main.tf @@ -0,0 +1,590 @@ +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} + +# Karpenter +module "karpenter" { + count = var.enable_karpenter ? 
1 : 0 + source = "terraform-aws-modules/eks/aws//modules/karpenter" + version = "~> 19.21" + + cluster_name = var.cluster_name + + irsa_oidc_provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + irsa_namespace_service_accounts = ["karpenter:karpenter"] + + # Since Karpenter is running on EKS Managed Node Group, + # we need to ensure the access entry is not created for the Karpenter node IAM role + # Reference: https://github.com/aws/karpenter/issues/4002 + create_access_entry = false + + tags = var.tags +} + +resource "helm_release" "karpenter" { + count = var.enable_karpenter ? 1 : 0 + + namespace = "karpenter" + create_namespace = true + + name = "karpenter" + repository = "oci://public.ecr.aws/karpenter" + chart = "karpenter" + version = var.karpenter_chart_version + + values = [ + <<-EOT + settings: + clusterName: ${var.cluster_name} + clusterEndpoint: ${var.cluster_endpoint} + interruptionQueue: ${try(module.karpenter[0].queue_name, "")} + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: ${try(module.karpenter[0].iam_role_arn, "")} + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/master + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: karpenter.sh/provisioner-name + operator: DoesNotExist + EOT + ] + + depends_on = [module.karpenter] +} + +# Karpenter EC2NodeClass for default nodes +resource "kubectl_manifest" "karpenter_node_class_default" { + count = var.enable_karpenter ? 1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + metadata: + name: default + spec: + instanceStorePolicy: RAID0 + amiFamily: AL2 + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + instanceProfile: ${try(module.karpenter[0].node_instance_profile_name, "")} + userData: | + #!/bin/bash + /etc/eks/bootstrap.sh ${var.cluster_name} + # Install additional packages + yum update -y + yum install -y amazon-ssm-agent amazon-cloudwatch-agent + systemctl enable amazon-ssm-agent + systemctl start amazon-ssm-agent + systemctl enable amazon-cloudwatch-agent + systemctl start amazon-cloudwatch-agent + tags: + Name: "Karpenter-${var.cluster_name}-default" + Environment: ${var.tags.Environment} + NodeType: "default" + YAML + + depends_on = [helm_release.karpenter] +} + +# Karpenter EC2NodeClass for GPU nodes +resource "kubectl_manifest" "karpenter_node_class_gpu" { + count = var.enable_karpenter && var.enable_nvidia_device_plugin ? 
1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + metadata: + name: gpu + spec: + instanceStorePolicy: RAID0 + amiFamily: AL2 + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: ${var.cluster_name} + instanceProfile: ${try(module.karpenter[0].node_instance_profile_name, "")} + userData: | + #!/bin/bash + /etc/eks/bootstrap.sh ${var.cluster_name} --container-runtime containerd + # Install NVIDIA drivers and container runtime + yum update -y + yum install -y nvidia-driver-latest-dkms nvidia-container-toolkit + yum install -y amazon-ssm-agent amazon-cloudwatch-agent + + # Configure containerd for GPU support + mkdir -p /etc/containerd + cat > /etc/containerd/config.toml << EOF + version = 2 + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true + EOF + + systemctl restart containerd + systemctl enable amazon-ssm-agent + systemctl start amazon-ssm-agent + systemctl enable amazon-cloudwatch-agent + systemctl start amazon-cloudwatch-agent + tags: + Name: "Karpenter-${var.cluster_name}-gpu" + Environment: ${var.tags.Environment} + NodeType: "gpu" + YAML + + depends_on = [helm_release.karpenter] +} + +# Karpenter NodePool for default workloads +resource "kubectl_manifest" "karpenter_node_pool_default" { + count = var.enable_karpenter ? 1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1beta1 + kind: NodePool + metadata: + name: default + spec: + template: + metadata: + labels: + provisioner: karpenter + node-type: default + spec: + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: default + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ${jsonencode(var.karpenter_default_capacity_types)} + - key: node.kubernetes.io/instance-type + operator: In + values: ${jsonencode(var.karpenter_default_instance_types)} + nodePolicy: + terminationGracePeriod: 30s + limits: + cpu: 1000 + memory: 1000Gi + disruption: + consolidationPolicy: WhenUnderutilized + consolidateAfter: 30s + expireAfter: 30m + YAML + + depends_on = [kubectl_manifest.karpenter_node_class_default] +} + +# Karpenter NodePool for GPU workloads +resource "kubectl_manifest" "karpenter_node_pool_gpu" { + count = var.enable_karpenter && var.enable_nvidia_device_plugin ? 
1 : 0 + + yaml_body = <<-YAML + apiVersion: karpenter.sh/v1beta1 + kind: NodePool + metadata: + name: gpu + spec: + template: + metadata: + labels: + provisioner: karpenter + node-type: gpu + nvidia.com/gpu: "true" + spec: + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: gpu + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ${jsonencode(var.karpenter_gpu_capacity_types)} + - key: node.kubernetes.io/instance-type + operator: In + values: ${jsonencode(var.karpenter_gpu_instance_types)} + - key: karpenter.k8s.aws/instance-gpu-count + operator: Gt + values: ["0"] + taints: + - key: nvidia.com/gpu + value: "true" + effect: NoSchedule + nodePolicy: + terminationGracePeriod: 60s + limits: + cpu: 1000 + memory: 1000Gi + nvidia.com/gpu: 100 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 60m + YAML + + depends_on = [kubectl_manifest.karpenter_node_class_gpu] +} + +# AWS Load Balancer Controller +module "load_balancer_controller_irsa_role" { + count = var.enable_aws_load_balancer_controller ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-load-balancer-controller" + attach_load_balancer_controller_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:aws-load-balancer-controller"] + } + } + + tags = var.tags +} + +resource "helm_release" "aws_load_balancer_controller" { + count = var.enable_aws_load_balancer_controller ? 1 : 0 + + name = "aws-load-balancer-controller" + repository = "https://aws.github.io/eks-charts" + chart = "aws-load-balancer-controller" + namespace = "kube-system" + version = "1.6.2" + + set { + name = "clusterName" + value = var.cluster_name + } + + set { + name = "serviceAccount.create" + value = "true" + } + + set { + name = "serviceAccount.name" + value = "aws-load-balancer-controller" + } + + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = try(module.load_balancer_controller_irsa_role[0].iam_role_arn, "") + } + + set { + name = "region" + value = data.aws_region.current.name + } + + set { + name = "vpcId" + value = var.vpc_id + } + + depends_on = [module.load_balancer_controller_irsa_role] +} + +# NVIDIA Device Plugin +resource "helm_release" "nvidia_device_plugin" { + count = var.enable_nvidia_device_plugin ? 
1 : 0 + + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + namespace = "kube-system" + version = "0.14.1" + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].key" + value = "nvidia.com/gpu" + } + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].operator" + value = "In" + } + + set { + name = "affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0]" + value = "true" + } + + set { + name = "tolerations[0].key" + value = "nvidia.com/gpu" + } + + set { + name = "tolerations[0].operator" + value = "Equal" + } + + set { + name = "tolerations[0].value" + value = "true" + } + + set { + name = "tolerations[0].effect" + value = "NoSchedule" + } +} + +# Metrics Server +resource "helm_release" "metrics_server" { + count = var.enable_metrics_server ? 1 : 0 + + name = "metrics-server" + repository = "https://kubernetes-sigs.github.io/metrics-server/" + chart = "metrics-server" + namespace = "kube-system" + version = "3.11.0" + + set { + name = "args[0]" + value = "--cert-dir=/tmp" + } + + set { + name = "args[1]" + value = "--secure-port=4443" + } + + set { + name = "args[2]" + value = "--kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname" + } + + set { + name = "args[3]" + value = "--kubelet-use-node-status-port" + } + + set { + name = "args[4]" + value = "--metric-resolution=15s" + } +} + +# EBS CSI Driver +module "ebs_csi_irsa_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-ebs-csi-driver" + attach_ebs_csi_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + + tags = var.tags +} + +# EFS CSI Driver +module "efs_csi_irsa_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.0" + + role_name = "${var.cluster_name}-efs-csi-driver" + attach_efs_csi_policy = true + + oidc_providers = { + ex = { + provider_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${replace(var.cluster_oidc_issuer_url, "https://", "")}" + namespace_service_accounts = ["kube-system:efs-csi-controller-sa"] + } + } + + tags = var.tags +} + +# Node Termination Handler +resource "helm_release" "aws_node_termination_handler" { + count = var.enable_node_termination_handler ? 1 : 0 + + name = "aws-node-termination-handler" + repository = "https://aws.github.io/eks-charts" + chart = "aws-node-termination-handler" + namespace = "kube-system" + version = "0.21.0" + + set { + name = "enableSpotInterruptionDraining" + value = "true" + } + + set { + name = "enableRebalanceMonitoring" + value = "true" + } + + set { + name = "enableScheduledEventDraining" + value = "true" + } + + set { + name = "enableRebalanceDraining" + value = "true" + } + + set { + name = "nodeSelector.karpenter\\.sh/provisioner-name" + value = "" + } +} + +# CloudWatch Dashboard for Node Health Monitoring +resource "aws_cloudwatch_dashboard" "node_health" { + count = var.enable_node_health_monitoring ? 
1 : 0 + dashboard_name = "${var.cluster_name}-node-health" + + dashboard_body = jsonencode({ + widgets = [ + { + type = "metric" + x = 0 + y = 0 + width = 12 + height = 6 + + properties = { + metrics = [ + ["AWS/EKS", "cluster_node_count", "ClusterName", var.cluster_name], + ["AWS/EKS", "cluster_failed_node_count", "ClusterName", var.cluster_name] + ] + view = "timeSeries" + stacked = false + region = data.aws_region.current.name + title = "EKS Node Count" + period = 300 + } + }, + { + type = "metric" + x = 0 + y = 6 + width = 12 + height = 6 + + properties = { + metrics = [ + ["AWS/EC2", "StatusCheckFailed", { "stat" : "Sum" }], + ["AWS/EC2", "StatusCheckFailed_Instance", { "stat" : "Sum" }], + ["AWS/EC2", "StatusCheckFailed_System", { "stat" : "Sum" }] + ] + view = "timeSeries" + stacked = false + region = data.aws_region.current.name + title = "EC2 Status Check Failures" + period = 300 + } + } + ] + }) + + tags = var.tags +} + +# CloudWatch Alarms for Node Health +resource "aws_cloudwatch_metric_alarm" "node_health_check_failed" { + count = var.enable_node_health_monitoring ? 1 : 0 + + alarm_name = "${var.cluster_name}-node-health-check-failed" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "StatusCheckFailed" + namespace = "AWS/EC2" + period = "300" + statistic = "Maximum" + threshold = "0" + alarm_description = "This metric monitors EC2 instance status check failures for EKS nodes" + alarm_actions = var.enable_sns_alerts ? [aws_sns_topic.node_health_alerts[0].arn] : [] + + dimensions = { + AutoScalingGroupName = "*${var.cluster_name}*" + } + + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "gpu_node_health_check_failed" { + count = var.enable_node_health_monitoring && var.enable_nvidia_device_plugin ? 1 : 0 + + alarm_name = "${var.cluster_name}-gpu-node-health-check-failed" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = "2" + metric_name = "StatusCheckFailed" + namespace = "AWS/EC2" + period = "600" # Longer period for GPU nodes + statistic = "Maximum" + threshold = "0" + alarm_description = "This metric monitors EC2 instance status check failures for EKS GPU nodes" + alarm_actions = var.enable_sns_alerts ? [aws_sns_topic.node_health_alerts[0].arn] : [] + + dimensions = { + AutoScalingGroupName = "*${var.cluster_name}*gpu*" + } + + tags = var.tags +} + +# SNS Topic for Node Health Alerts +resource "aws_sns_topic" "node_health_alerts" { + count = var.enable_sns_alerts ? 1 : 0 + name = "${var.cluster_name}-node-health-alerts" + + tags = var.tags +} + +resource "aws_sns_topic_subscription" "node_health_email" { + count = var.enable_sns_alerts && var.alert_email != "" ? 1 : 0 + topic_arn = aws_sns_topic.node_health_alerts[0].arn + protocol = "email" + endpoint = var.alert_email +} + +# Custom CloudWatch Log Group for Node Auto-Repair Events +resource "aws_cloudwatch_log_group" "node_auto_repair" { + count = var.enable_node_health_monitoring ? 
1 : 0 + name = "/aws/eks/${var.cluster_name}/node-auto-repair" + retention_in_days = 30 + + tags = var.tags +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf new file mode 100644 index 000000000..27dfe079f --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/outputs.tf @@ -0,0 +1,39 @@ +output "karpenter_role_arn" { + description = "ARN of the Karpenter IAM role" + value = try(module.karpenter[0].iam_role_arn, "") +} + +output "karpenter_instance_profile_name" { + description = "Name of the Karpenter node instance profile" + value = try(module.karpenter[0].node_instance_profile_name, "") +} + +output "karpenter_queue_name" { + description = "Name of the Karpenter SQS queue" + value = try(module.karpenter[0].queue_name, "") +} + +output "load_balancer_controller_role_arn" { + description = "ARN of the load balancer controller IAM role" + value = try(module.load_balancer_controller_irsa_role[0].iam_role_arn, "") +} + +output "ebs_csi_driver_role_arn" { + description = "ARN of the EBS CSI driver IAM role" + value = module.ebs_csi_irsa_role.iam_role_arn +} + +output "efs_csi_driver_role_arn" { + description = "ARN of the EFS CSI driver IAM role" + value = module.efs_csi_irsa_role.iam_role_arn +} + +output "node_health_dashboard_url" { + description = "URL of the CloudWatch dashboard for node health monitoring" + value = var.enable_node_health_monitoring ? "https://${data.aws_region.current.name}.console.aws.amazon.com/cloudwatch/home?region=${data.aws_region.current.name}#dashboards:name=${aws_cloudwatch_dashboard.node_health[0].dashboard_name}" : "" +} + +output "node_health_sns_topic_arn" { + description = "ARN of the SNS topic for node health alerts" + value = var.enable_sns_alerts ? 
aws_sns_topic.node_health_alerts[0].arn : "" +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf new file mode 100644 index 000000000..d54b2d920 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/addons/variables.tf @@ -0,0 +1,109 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + type = string +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string +} + +variable "vpc_id" { + description = "ID of the VPC where the cluster is deployed" + type = string + default = "" +} + +variable "enable_karpenter" { + description = "Enable Karpenter for node provisioning" + type = bool + default = true +} + +variable "karpenter_chart_version" { + description = "Version of the Karpenter Helm chart" + type = string + default = "v0.32.1" +} + +variable "cluster_endpoint" { + description = "EKS cluster endpoint" + type = string +} + +variable "karpenter_default_capacity_types" { + description = "Capacity types for Karpenter default node pool" + type = list(string) + default = ["spot", "on-demand"] +} + +variable "karpenter_default_instance_types" { + description = "Instance types for Karpenter default node pool" + type = list(string) + default = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge"] +} + +variable "karpenter_gpu_capacity_types" { + description = "Capacity types for Karpenter GPU node pool" + type = list(string) + default = ["on-demand"] +} + +variable "karpenter_gpu_instance_types" { + description = "Instance types for Karpenter GPU node pool" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge"] +} + +variable "enable_aws_load_balancer_controller" { + description = "Enable AWS Load Balancer Controller" + type = bool + default = true +} + +variable "enable_nvidia_device_plugin" { + description = "Enable NVIDIA device plugin for GPU support" + type = bool + default = true +} + +variable "enable_metrics_server" { + description = "Enable metrics server" + type = bool + default = true +} + +variable "enable_node_termination_handler" { + description = "Enable AWS Node Termination Handler" + type = bool + default = true +} + +variable "enable_node_health_monitoring" { + description = "Enable CloudWatch monitoring for node health" + type = bool + default = true +} + +variable "enable_sns_alerts" { + description = "Enable SNS alerts for node health issues" + type = bool + default = false +} + +variable "alert_email" { + description = "Email address for node health alerts" + type = string + default = "" +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf new file mode 100644 index 000000000..fca6bb0f1 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/main.tf @@ -0,0 +1,149 @@ +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.21" + + cluster_name = var.cluster_name + cluster_version = var.cluster_version + + vpc_id = var.vpc_id + subnet_ids = var.subnet_ids + 
+  control_plane_subnet_ids = var.control_plane_subnet_ids
+
+  cluster_endpoint_private_access      = var.cluster_endpoint_private_access
+  cluster_endpoint_public_access       = var.cluster_endpoint_public_access
+  cluster_endpoint_public_access_cidrs = var.cluster_endpoint_public_access_cidrs
+
+  cluster_encryption_config = var.cluster_encryption_config
+
+  cluster_addons = var.cluster_addons
+
+  eks_managed_node_groups = var.eks_managed_node_groups
+
+  node_security_group_additional_rules = var.node_security_group_additional_rules
+
+  manage_aws_auth_configmap = true
+
+  aws_auth_roles = [
+    {
+      rolearn  = aws_iam_role.eks_managed_node_group_role.arn
+      username = "system:node:{{EC2PrivateDNSName}}"
+      groups   = ["system:bootstrappers", "system:nodes"]
+    },
+  ]
+
+  tags = var.tags
+}
+
+resource "aws_iam_role" "eks_managed_node_group_role" {
+  name = "${var.cluster_name}-node-group-role"
+
+  assume_role_policy = jsonencode({
+    Statement = [{
+      Action = "sts:AssumeRole"
+      Effect = "Allow"
+      Principal = {
+        Service = "ec2.amazonaws.com"
+      }
+    }]
+    Version = "2012-10-17"
+  })
+
+  tags = var.tags
+}
+
+resource "aws_iam_role_policy_attachment" "eks_managed_node_group_role_policy" {
+  for_each = toset([
+    "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy",
+    "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy",
+    "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly",
+    "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
+    "arn:aws:iam::aws:policy/AmazonFSxClientFullAccess"
+  ])
+
+  policy_arn = each.value
+  role       = aws_iam_role.eks_managed_node_group_role.name
+}
+
+resource "aws_iam_instance_profile" "eks_managed_node_group_instance_profile" {
+  name = "${var.cluster_name}-node-group-instance-profile"
+  role = aws_iam_role.eks_managed_node_group_role.name
+
+  tags = var.tags
+}
+
+data "aws_ssm_parameter" "eks_ami_release_version" {
+  for_each = var.eks_managed_node_groups
+
+  name = "/aws/service/eks/optimized-ami/${var.cluster_version}/amazon-linux-2${each.value.ami_type == "AL2_x86_64_GPU" ? "-gpu" : ""}/recommended/release_version"
+}
+
+resource "aws_launch_template" "eks_managed_node_group" {
+  for_each = var.eks_managed_node_groups
+
+  name_prefix   = "${var.cluster_name}-${each.key}-"
+  image_id      = data.aws_ami.eks_default[each.key].id
+  instance_type = each.value.instance_types[0]
+
+  vpc_security_group_ids = [module.eks.node_security_group_id]
+
+  user_data = base64encode(templatefile("${path.module}/user_data.sh", {
+    cluster_name        = var.cluster_name
+    endpoint            = module.eks.cluster_endpoint
+    ca_certificate      = module.eks.cluster_certificate_authority_data
+    bootstrap_arguments = each.value.ami_type == "AL2_x86_64_GPU" ?
"--container-runtime containerd --use-max-pods false --b64-cluster-ca ${module.eks.cluster_certificate_authority_data} --apiserver-endpoint ${module.eks.cluster_endpoint}" : "" + })) + + block_device_mappings { + device_name = "/dev/xvda" + ebs { + volume_size = 50 + volume_type = "gp3" + iops = 3000 + throughput = 125 + encrypted = true + delete_on_termination = true + } + } + + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 2 + instance_metadata_tags = "enabled" + } + + tag_specifications { + resource_type = "instance" + tags = merge(var.tags, { + Name = "${var.cluster_name}-${each.key}" + }) + } + + tags = var.tags + + lifecycle { + create_before_destroy = true + } +} + +data "aws_ami" "eks_default" { + for_each = var.eks_managed_node_groups + + most_recent = true + owners = ["602401143452"] + + filter { + name = "name" + values = ["amazon-eks-node-${var.cluster_version}-v*"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf new file mode 100644 index 000000000..6515588ea --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/outputs.tf @@ -0,0 +1,49 @@ +output "cluster_arn" { + description = "The Amazon Resource Name (ARN) of the cluster" + value = module.eks.cluster_arn +} + +output "cluster_certificate_authority_data" { + description = "Base64 encoded certificate data required to communicate with the cluster" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_endpoint" { + description = "Endpoint for your Kubernetes API server" + value = module.eks.cluster_endpoint +} + +output "cluster_id" { + description = "The name/id of the EKS cluster" + value = module.eks.cluster_id +} + +output "cluster_name" { + description = "The name of the EKS cluster" + value = module.eks.cluster_name +} + +output "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + value = module.eks.cluster_oidc_issuer_url +} + +output "cluster_version" { + description = "The Kubernetes version for the EKS cluster" + value = module.eks.cluster_version +} + +output "cluster_security_group_id" { + description = "Security group ID attached to the EKS cluster" + value = module.eks.cluster_security_group_id +} + +output "node_security_group_id" { + description = "ID of the node shared security group" + value = module.eks.node_security_group_id +} + +output "eks_managed_node_groups" { + description = "Map of attribute maps for all EKS managed node groups created" + value = module.eks.eks_managed_node_groups +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh b/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh new file mode 100644 index 000000000..de51ed616 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/user_data.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -o xtrace + +# Install SSM agent +yum install -y amazon-ssm-agent +systemctl enable amazon-ssm-agent +systemctl start amazon-ssm-agent + +# Configure kubelet +echo "net.bridge.bridge-nf-call-ip6tables = 1" >> /etc/sysctl.conf +echo "net.bridge.bridge-nf-call-iptables = 1" >> /etc/sysctl.conf +echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf +sysctl -p + +# Bootstrap the node 
+/etc/eks/bootstrap.sh ${cluster_name} ${bootstrap_arguments} + +# Install additional packages for GPU nodes +if [[ "${bootstrap_arguments}" == *"gpu"* ]]; then + # Install NVIDIA drivers and container runtime + yum install -y nvidia-driver-latest-dkms + yum install -y nvidia-container-toolkit + + # Configure containerd for GPU support + mkdir -p /etc/containerd + cat > /etc/containerd/config.toml << EOF +version = 2 +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true +EOF + + systemctl restart containerd +fi + +# Install FSx Lustre client +amazon-linux-extras install -y lustre2.10 + +# Configure CloudWatch agent +yum install -y amazon-cloudwatch-agent +cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << EOF +{ + "agent": { + "metrics_collection_interval": 60, + "run_as_user": "cwagent" + }, + "metrics": { + "namespace": "EKS/Node", + "metrics_collected": { + "cpu": { + "measurement": [ + "cpu_usage_idle", + "cpu_usage_iowait", + "cpu_usage_user", + "cpu_usage_system" + ], + "metrics_collection_interval": 60 + }, + "disk": { + "measurement": [ + "used_percent" + ], + "metrics_collection_interval": 60, + "resources": [ + "*" + ] + }, + "diskio": { + "measurement": [ + "io_time" + ], + "metrics_collection_interval": 60, + "resources": [ + "*" + ] + }, + "mem": { + "measurement": [ + "mem_used_percent" + ], + "metrics_collection_interval": 60 + }, + "netstat": { + "measurement": [ + "tcp_established", + "tcp_time_wait" + ], + "metrics_collection_interval": 60 + }, + "swap": { + "measurement": [ + "swap_used_percent" + ], + "metrics_collection_interval": 60 + } + } + } +} +EOF + +systemctl enable amazon-cloudwatch-agent +systemctl start amazon-cloudwatch-agent \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf new file mode 100644 index 000000000..9d79e3361 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/eks/variables.tf @@ -0,0 +1,98 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string +} + +variable "vpc_id" { + description = "ID of the VPC where to create the cluster" + type = string +} + +variable "subnet_ids" { + description = "List of subnet IDs where the EKS cluster will be deployed" + type = list(string) +} + +variable "control_plane_subnet_ids" { + description = "List of subnet IDs where the EKS cluster control plane will be deployed" + type = list(string) +} + +variable "cluster_endpoint_private_access" { + description = "Indicates whether or not the Amazon EKS private API server endpoint is enabled" + type = bool + default = false +} + +variable "cluster_endpoint_public_access" { + description = "Indicates whether or not the Amazon EKS public API server endpoint is enabled" + type = bool + default = true +} + +variable "cluster_endpoint_public_access_cidrs" { + description = "List of CIDR blocks which can access the Amazon EKS public API server endpoint" + type = list(string) + 
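+  # 0.0.0.0/0 leaves the public API endpoint reachable from anywhere; restrict
+  # this to trusted CIDR ranges for anything beyond experimentation.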
default = ["0.0.0.0/0"] +} + +variable "cluster_encryption_config" { + description = "Configuration block with encryption configuration for the cluster" + type = list(object({ + provider_key_arn = string + resources = list(string) + })) + default = [] +} + +variable "cluster_addons" { + description = "Map of cluster addon configurations to enable for the cluster" + type = map(object({ + most_recent = optional(bool) + version = optional(string) + })) + default = {} +} + +variable "eks_managed_node_groups" { + description = "Map of EKS managed node group definitions to create" + type = map(object({ + name = string + instance_types = list(string) + capacity_type = string + min_size = number + max_size = number + desired_size = number + ami_type = string + labels = map(string) + taints = list(object({ + key = string + value = string + effect = string + })) + update_config = object({ + max_unavailable_percentage = number + }) + health_check_grace_period = optional(number) + health_check_type = optional(string) + tags = map(string) + })) + default = {} +} + +variable "node_security_group_additional_rules" { + description = "List of additional security group rules to add to the node security group" + type = any + default = {} +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf new file mode 100644 index 000000000..ca5cfcb94 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/main.tf @@ -0,0 +1,114 @@ +resource "aws_fsx_lustre_file_system" "main" { + storage_capacity = var.storage_capacity + subnet_ids = var.subnet_ids + deployment_type = var.deployment_type + per_unit_storage_throughput = var.per_unit_storage_throughput + security_group_ids = var.security_group_ids + + dynamic "log_configuration" { + for_each = var.log_configuration != null ? [var.log_configuration] : [] + content { + destination = log_configuration.value.destination + level = log_configuration.value.level + } + } + + import_path = var.s3_import_path + export_path = var.s3_export_path + + # Auto import and export configuration + auto_import_policy = var.auto_import_policy + + # Data compression + data_compression_type = var.data_compression_type + + # Copy tags to snapshots + copy_tags_to_backups = var.copy_tags_to_backups + + # Weekly maintenance window + weekly_maintenance_start_time = var.weekly_maintenance_start_time + + # Backup configuration for PERSISTENT deployments + dynamic "backup_configuration" { + for_each = var.deployment_type == "PERSISTENT_1" || var.deployment_type == "PERSISTENT_2" ? 
[1] : [] + content { + automatic_backup_retention_days = var.automatic_backup_retention_days + daily_automatic_backup_start_time = var.daily_automatic_backup_start_time + } + } + + tags = merge(var.tags, { + Name = var.name + }) +} + +# Create CSI driver for FSx Lustre +resource "kubernetes_storage_class" "fsx_lustre" { + metadata { + name = "fsx-lustre-sc" + } + + storage_provisioner = "fsx.csi.aws.com" + + parameters = { + subPath = "/" + dnsName = aws_fsx_lustre_file_system.main.dns_name + mountName = aws_fsx_lustre_file_system.main.mount_name + } + + mount_options = [ + "flock" + ] +} + +# Create a persistent volume for FSx Lustre +resource "kubernetes_persistent_volume" "fsx_lustre" { + metadata { + name = "fsx-lustre-pv" + } + + spec { + capacity = { + storage = "${var.storage_capacity}Gi" + } + + access_modes = ["ReadWriteMany"] + + persistent_volume_source { + csi { + driver = "fsx.csi.aws.com" + volume_handle = aws_fsx_lustre_file_system.main.id + + volume_attributes = { + dnsName = aws_fsx_lustre_file_system.main.dns_name + mountName = aws_fsx_lustre_file_system.main.mount_name + } + } + } + + storage_class_name = kubernetes_storage_class.fsx_lustre.metadata[0].name + } +} + +# Example PVC for FSx Lustre +resource "kubernetes_persistent_volume_claim" "fsx_lustre_example" { + count = var.create_example_pvc ? 1 : 0 + + metadata { + name = "fsx-lustre-pvc" + namespace = var.example_namespace + } + + spec { + access_modes = ["ReadWriteMany"] + + resources { + requests = { + storage = "100Gi" + } + } + + storage_class_name = kubernetes_storage_class.fsx_lustre.metadata[0].name + volume_name = kubernetes_persistent_volume.fsx_lustre.metadata[0].name + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf new file mode 100644 index 000000000..3544f7d71 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/outputs.tf @@ -0,0 +1,34 @@ +output "file_system_id" { + description = "Identifier of the file system" + value = aws_fsx_lustre_file_system.main.id +} + +output "file_system_arn" { + description = "Amazon Resource Name of the file system" + value = aws_fsx_lustre_file_system.main.arn +} + +output "dns_name" { + description = "DNS name for the file system" + value = aws_fsx_lustre_file_system.main.dns_name +} + +output "mount_name" { + description = "The value to be used when mounting the filesystem" + value = aws_fsx_lustre_file_system.main.mount_name +} + +output "network_interface_ids" { + description = "Set of Elastic Network Interface identifiers from which the file system is accessible" + value = aws_fsx_lustre_file_system.main.network_interface_ids +} + +output "storage_class_name" { + description = "Name of the Kubernetes storage class" + value = kubernetes_storage_class.fsx_lustre.metadata[0].name +} + +output "persistent_volume_name" { + description = "Name of the Kubernetes persistent volume" + value = kubernetes_persistent_volume.fsx_lustre.metadata[0].name +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf new file mode 100644 index 000000000..396ca89c8 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/fsx-lustre/variables.tf @@ -0,0 +1,122 @@ +variable "name" { + description = "Name of the FSx Lustre file system" + type = string +} + +variable "storage_capacity" { + 
description = "Storage capacity (GiB) of the file system" + type = number + validation { + condition = var.storage_capacity >= 1200 && var.storage_capacity % 1200 == 0 + error_message = "Storage capacity must be at least 1200 GiB and in increments of 1200 GiB." + } +} + +variable "subnet_ids" { + description = "List of subnet IDs for the file system" + type = list(string) +} + +variable "security_group_ids" { + description = "List of security group IDs for the file system" + type = list(string) +} + +variable "deployment_type" { + description = "Deployment type for the file system" + type = string + default = "SCRATCH_2" + validation { + condition = contains(["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1", "PERSISTENT_2"], var.deployment_type) + error_message = "Valid values for deployment_type are SCRATCH_1, SCRATCH_2, PERSISTENT_1, or PERSISTENT_2." + } +} + +variable "per_unit_storage_throughput" { + description = "Per unit storage throughput (MB/s/TiB)" + type = number + default = null +} + +variable "s3_import_path" { + description = "S3 URI for importing data" + type = string + default = null +} + +variable "s3_export_path" { + description = "S3 URI for exporting data" + type = string + default = null +} + +variable "auto_import_policy" { + description = "How Amazon FSx keeps your file and directory listings up to date" + type = string + default = "NEW_CHANGED" + validation { + condition = contains(["NONE", "NEW", "NEW_CHANGED", "NEW_CHANGED_DELETED"], var.auto_import_policy) + error_message = "Valid values are NONE, NEW, NEW_CHANGED, or NEW_CHANGED_DELETED." + } +} + +variable "data_compression_type" { + description = "Sets the data compression configuration for the file system" + type = string + default = "NONE" + validation { + condition = contains(["NONE", "LZ4"], var.data_compression_type) + error_message = "Valid values are NONE or LZ4." 
+ } +} + +variable "copy_tags_to_backups" { + description = "A boolean flag indicating whether tags for the file system should be copied to backups" + type = bool + default = false +} + +variable "weekly_maintenance_start_time" { + description = "The preferred start time (in d:HH:MM format) to perform weekly maintenance" + type = string + default = "1:02:00" +} + +variable "automatic_backup_retention_days" { + description = "The number of days to retain automatic backups" + type = number + default = 7 +} + +variable "daily_automatic_backup_start_time" { + description = "The preferred time (in HH:MM format) to take daily automatic backups" + type = string + default = "02:00" +} + +variable "log_configuration" { + description = "The Lustre logging configuration" + type = object({ + destination = string + level = string + }) + default = null +} + +variable "create_example_pvc" { + description = "Whether to create an example PVC" + type = bool + default = false +} + +variable "example_namespace" { + description = "Namespace for example resources" + type = string + default = "default" +} + +variable "tags" { + description = "A map of tags to assign to the resource" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf new file mode 100644 index 000000000..3295e263d --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/main.tf @@ -0,0 +1,193 @@ +data "aws_iam_policy_document" "s3_mountpoint_assume_role_policy" { + statement { + actions = ["sts:AssumeRoleWithWebIdentity"] + effect = "Allow" + + condition { + test = "StringEquals" + variable = "${replace(var.cluster_oidc_issuer_url, "https://", "")}:sub" + values = ["system:serviceaccount:${var.namespace}:mountpoint-s3-csi-driver"] + } + + condition { + test = "StringEquals" + variable = "${replace(var.cluster_oidc_issuer_url, "https://", "")}:aud" + values = ["sts.amazonaws.com"] + } + + principals { + identifiers = [var.oidc_provider_arn] + type = "Federated" + } + } +} + +resource "aws_iam_role" "s3_mountpoint" { + assume_role_policy = data.aws_iam_policy_document.s3_mountpoint_assume_role_policy.json + name = "${var.cluster_name}-s3-mountpoint-csi-driver" + tags = var.tags +} + +data "aws_iam_policy_document" "s3_mountpoint" { + statement { + effect = "Allow" + actions = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetObjectVersion", + "s3:DeleteObjectVersion", + "s3:ListBucketVersions" + ] + resources = [ + "arn:aws:s3:::${var.s3_bucket_name}", + "arn:aws:s3:::${var.s3_bucket_name}/*" + ] + } +} + +resource "aws_iam_policy" "s3_mountpoint" { + description = "S3 Mountpoint CSI Driver Policy" + name = "${var.cluster_name}-s3-mountpoint-csi-driver" + policy = data.aws_iam_policy_document.s3_mountpoint.json + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "s3_mountpoint" { + policy_arn = aws_iam_policy.s3_mountpoint.arn + role = aws_iam_role.s3_mountpoint.name +} + +# Create the service account +resource "kubernetes_service_account" "s3_mountpoint" { + metadata { + name = "mountpoint-s3-csi-driver" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = aws_iam_role.s3_mountpoint.arn + } + } +} + +# Deploy the Mountpoint for S3 CSI driver +resource "helm_release" "mountpoint_s3_csi_driver" { + name = "aws-mountpoint-s3-csi-driver" + repository = 
"https://awslabs.github.io/mountpoint-s3-csi-driver" + chart = "aws-mountpoint-s3-csi-driver" + namespace = var.namespace + version = var.csi_driver_version + + set { + name = "node.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = aws_iam_role.s3_mountpoint.arn + } + + set { + name = "controller.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = aws_iam_role.s3_mountpoint.arn + } + + set { + name = "controller.replicaCount" + value = "2" + } + + set { + name = "node.tolerateAllTaints" + value = "true" + } + + depends_on = [kubernetes_service_account.s3_mountpoint] +} + +# Create a storage class for S3 Mountpoint +resource "kubernetes_storage_class" "s3_mountpoint" { + metadata { + name = "s3-mountpoint-sc" + } + + storage_provisioner = "s3.csi.aws.com" + + parameters = { + bucketName = var.s3_bucket_name + region = var.region + } + + volume_binding_mode = "Immediate" +} + +# Example PVC for S3 Mountpoint +resource "kubernetes_persistent_volume_claim" "s3_mountpoint_example" { + count = var.create_example_pvc ? 1 : 0 + + metadata { + name = "s3-mountpoint-pvc" + namespace = var.example_namespace + } + + spec { + access_modes = ["ReadWriteMany"] + + resources { + requests = { + storage = "1000Gi" + } + } + + storage_class_name = kubernetes_storage_class.s3_mountpoint.metadata[0].name + } +} + +# Example deployment using S3 Mountpoint +resource "kubernetes_deployment" "s3_mountpoint_example" { + count = var.create_example_deployment ? 1 : 0 + + metadata { + name = "s3-mountpoint-example" + namespace = var.example_namespace + labels = { + app = "s3-mountpoint-example" + } + } + + spec { + replicas = 1 + + selector { + match_labels = { + app = "s3-mountpoint-example" + } + } + + template { + metadata { + labels = { + app = "s3-mountpoint-example" + } + } + + spec { + container { + image = "busybox:latest" + name = "busybox" + + command = ["/bin/sh"] + args = ["-c", "while true; do echo $(date) >> /mnt/s3/test.txt; sleep 30; done"] + + volume_mount { + mount_path = "/mnt/s3" + name = "s3-volume" + } + } + + volume { + name = "s3-volume" + persistent_volume_claim { + claim_name = kubernetes_persistent_volume_claim.s3_mountpoint_example[0].metadata[0].name + } + } + } + } + } +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf new file mode 100644 index 000000000..e75f2e743 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/outputs.tf @@ -0,0 +1,21 @@ +output "role_arn" { + description = "ARN of the IAM role for S3 Mountpoint CSI driver" + value = aws_iam_role.s3_mountpoint.arn +} + +output "service_account_arn" { + description = "ARN of the Kubernetes service account" + value = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${aws_iam_role.s3_mountpoint.name}" +} + +output "service_account_name" { + description = "Name of the Kubernetes service account" + value = kubernetes_service_account.s3_mountpoint.metadata[0].name +} + +output "storage_class_name" { + description = "Name of the S3 Mountpoint storage class" + value = kubernetes_storage_class.s3_mountpoint.metadata[0].name +} + +data "aws_caller_identity" "current" {} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf new file mode 100644 index 000000000..2ed7a235e --- /dev/null +++ 
b/1.architectures/4.amazon-eks/terraform/modules/s3-mountpoint/variables.tf @@ -0,0 +1,61 @@ +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + type = string +} + +variable "oidc_provider_arn" { + description = "The ARN of the OIDC Provider for the EKS cluster" + type = string +} + +variable "s3_bucket_name" { + description = "Name of the S3 bucket to mount" + type = string +} + +variable "namespace" { + description = "Kubernetes namespace for the S3 Mountpoint CSI driver" + type = string + default = "kube-system" +} + +variable "region" { + description = "AWS region" + type = string + default = "us-west-2" +} + +variable "csi_driver_version" { + description = "Version of the Mountpoint S3 CSI driver" + type = string + default = "1.4.0" +} + +variable "create_example_pvc" { + description = "Whether to create an example PVC" + type = bool + default = false +} + +variable "create_example_deployment" { + description = "Whether to create an example deployment" + type = bool + default = false +} + +variable "example_namespace" { + description = "Namespace for example resources" + type = string + default = "default" +} + +variable "tags" { + description = "A map of tags to assign to the resource" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf new file mode 100644 index 000000000..752471940 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/main.tf @@ -0,0 +1,147 @@ +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = var.name + cidr = var.cidr + + azs = var.azs + private_subnets = var.private_subnets + public_subnets = var.public_subnets + + enable_nat_gateway = var.enable_nat_gateway + enable_vpn_gateway = var.enable_vpn_gateway + enable_dns_hostnames = var.enable_dns_hostnames + enable_dns_support = var.enable_dns_support + + # Single NAT Gateway for cost optimization (can be changed to one_nat_gateway_per_az = true for HA) + single_nat_gateway = var.single_nat_gateway + one_nat_gateway_per_az = var.one_nat_gateway_per_az + + # VPC Flow Logs + enable_flow_log = var.enable_flow_log + create_flow_log_cloudwatch_iam_role = var.create_flow_log_cloudwatch_iam_role + create_flow_log_cloudwatch_log_group = var.create_flow_log_cloudwatch_log_group + + # Public subnet tags for ELB + public_subnet_tags = merge(var.public_subnet_tags, { + "kubernetes.io/role/elb" = "1" + }) + + # Private subnet tags for internal ELB + private_subnet_tags = merge(var.private_subnet_tags, { + "kubernetes.io/role/internal-elb" = "1" + }) + + tags = var.tags +} + +# VPC Endpoints for cost optimization and security +resource "aws_vpc_endpoint" "s3" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.s3" + + vpc_endpoint_type = "Gateway" + route_table_ids = concat(module.vpc.private_route_table_ids, module.vpc.public_route_table_ids) + + tags = merge(var.tags, { + Name = "${var.name}-s3-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_dkr" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.dkr" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + 
tags = merge(var.tags, { + Name = "${var.name}-ecr-dkr-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ecr_api" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ecr.api" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-ecr-api-endpoint" + }) +} + +resource "aws_vpc_endpoint" "ec2" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.ec2" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-ec2-endpoint" + }) +} + +resource "aws_vpc_endpoint" "logs" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.logs" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-logs-endpoint" + }) +} + +resource "aws_vpc_endpoint" "sts" { + vpc_id = module.vpc.vpc_id + service_name = "com.amazonaws.${data.aws_region.current.name}.sts" + vpc_endpoint_type = "Interface" + subnet_ids = module.vpc.private_subnets + security_group_ids = [aws_security_group.vpc_endpoints.id] + + private_dns_enabled = true + + tags = merge(var.tags, { + Name = "${var.name}-sts-endpoint" + }) +} + +# Security group for VPC endpoints +resource "aws_security_group" "vpc_endpoints" { + name = "${var.name}-vpc-endpoints" + description = "Security group for VPC endpoints" + vpc_id = module.vpc.vpc_id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = [var.cidr] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(var.tags, { + Name = "${var.name}-vpc-endpoints" + }) +} + +data "aws_region" "current" {} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf new file mode 100644 index 000000000..c65b90dc3 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/outputs.tf @@ -0,0 +1,69 @@ +output "vpc_id" { + description = "ID of the VPC" + value = module.vpc.vpc_id +} + +output "vpc_arn" { + description = "The ARN of the VPC" + value = module.vpc.vpc_arn +} + +output "vpc_cidr_block" { + description = "The CIDR block of the VPC" + value = module.vpc.vpc_cidr_block +} + +output "private_subnets" { + description = "List of IDs of private subnets" + value = module.vpc.private_subnets +} + +output "public_subnets" { + description = "List of IDs of public subnets" + value = module.vpc.public_subnets +} + +output "private_subnet_arns" { + description = "List of ARNs of private subnets" + value = module.vpc.private_subnet_arns +} + +output "public_subnet_arns" { + description = "List of ARNs of public subnets" + value = module.vpc.public_subnet_arns +} + +output "private_subnets_cidr_blocks" { + description = "List of cidr_blocks of private subnets" + value = module.vpc.private_subnets_cidr_blocks +} + +output "public_subnets_cidr_blocks" { + description = "List of cidr_blocks of public subnets" + value = module.vpc.public_subnets_cidr_blocks +} + +output "internet_gateway_id" { + description = "The ID of 
the Internet Gateway" + value = module.vpc.igw_id +} + +output "nat_gateway_ids" { + description = "List of IDs of the NAT Gateways" + value = module.vpc.natgw_ids +} + +output "private_route_table_ids" { + description = "List of IDs of the private route tables" + value = module.vpc.private_route_table_ids +} + +output "public_route_table_ids" { + description = "List of IDs of the public route tables" + value = module.vpc.public_route_table_ids +} + +output "vpc_endpoints_security_group_id" { + description = "ID of the security group for VPC endpoints" + value = aws_security_group.vpc_endpoints.id +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf b/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf new file mode 100644 index 000000000..43108a5ad --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/modules/vpc/variables.tf @@ -0,0 +1,96 @@ +variable "name" { + description = "Name to be used on all the resources as identifier" + type = string +} + +variable "cidr" { + description = "The CIDR block for the VPC" + type = string +} + +variable "azs" { + description = "A list of availability zones names or ids in the region" + type = list(string) +} + +variable "private_subnets" { + description = "A list of private subnets inside the VPC" + type = list(string) +} + +variable "public_subnets" { + description = "A list of public subnets inside the VPC" + type = list(string) +} + +variable "enable_nat_gateway" { + description = "Should be true if you want to provision NAT Gateways for each of your private networks" + type = bool + default = true +} + +variable "enable_vpn_gateway" { + description = "Should be true if you want to create a new VPN Gateway resource and attach it to the VPC" + type = bool + default = false +} + +variable "enable_dns_hostnames" { + description = "Should be true to enable DNS hostnames in the VPC" + type = bool + default = true +} + +variable "enable_dns_support" { + description = "Should be true to enable DNS support in the VPC" + type = bool + default = true +} + +variable "single_nat_gateway" { + description = "Should be true to provision a single shared NAT Gateway across all of your private networks" + type = bool + default = true +} + +variable "one_nat_gateway_per_az" { + description = "Should be true if you want only one NAT Gateway per availability zone" + type = bool + default = false +} + +variable "enable_flow_log" { + description = "Whether or not to enable VPC Flow Logs" + type = bool + default = false +} + +variable "create_flow_log_cloudwatch_iam_role" { + description = "Whether to create IAM role for VPC Flow Logs" + type = bool + default = false +} + +variable "create_flow_log_cloudwatch_log_group" { + description = "Whether to create CloudWatch log group for VPC Flow Logs" + type = bool + default = false +} + +variable "public_subnet_tags" { + description = "Additional tags for the public subnets" + type = map(string) + default = {} +} + +variable "private_subnet_tags" { + description = "Additional tags for the private subnets" + type = map(string) + default = {} +} + +variable "tags" { + description = "A map of tags to add to all resources" + type = map(string) + default = {} +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/outputs.tf b/1.architectures/4.amazon-eks/terraform/outputs.tf new file mode 100644 index 000000000..247dcaa66 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/outputs.tf @@ -0,0 +1,119 @@ +output "region" { + 
description = "AWS region" + value = var.region +} + +output "cluster_name" { + description = "EKS cluster name" + value = module.eks.cluster_name +} + +output "cluster_endpoint" { + description = "Endpoint for EKS control plane" + value = module.eks.cluster_endpoint +} + +output "cluster_security_group_id" { + description = "Security group ID attached to the EKS cluster" + value = module.eks.cluster_security_group_id +} + +output "cluster_certificate_authority_data" { + description = "Base64 encoded certificate data required to communicate with the cluster" + value = module.eks.cluster_certificate_authority_data +} + +output "cluster_oidc_issuer_url" { + description = "The URL on the EKS cluster for the OpenID Connect identity provider" + value = module.eks.cluster_oidc_issuer_url +} + +output "cluster_version" { + description = "The Kubernetes version for the EKS cluster" + value = module.eks.cluster_version +} + +output "vpc_id" { + description = "ID of the VPC where the cluster is deployed" + value = module.vpc.vpc_id +} + +output "vpc_cidr_block" { + description = "CIDR block of the VPC" + value = module.vpc.vpc_cidr_block +} + +output "private_subnets" { + description = "List of IDs of private subnets" + value = module.vpc.private_subnets +} + +output "public_subnets" { + description = "List of IDs of public subnets" + value = module.vpc.public_subnets +} + +output "node_security_group_id" { + description = "ID of the node shared security group" + value = module.eks.node_security_group_id +} + +output "eks_managed_node_groups" { + description = "Map of attribute maps for all EKS managed node groups created" + value = module.eks.eks_managed_node_groups +} + +output "fsx_lustre_id" { + description = "FSx Lustre file system ID" + value = module.fsx_lustre.file_system_id +} + +output "fsx_lustre_mount_name" { + description = "FSx Lustre mount name" + value = module.fsx_lustre.mount_name +} + +output "fsx_lustre_dns_name" { + description = "FSx Lustre DNS name" + value = module.fsx_lustre.dns_name +} + +output "s3_mountpoint_service_account_arn" { + description = "ARN of the S3 Mountpoint service account" + value = module.s3_mountpoint.service_account_arn +} + +output "s3_mountpoint_role_arn" { + description = "ARN of the S3 Mountpoint IAM role" + value = module.s3_mountpoint.role_arn +} + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${var.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + +output "node_health_dashboard_url" { + description = "URL of the CloudWatch dashboard for node health monitoring" + value = module.addons.node_health_dashboard_url +} + +output "node_health_sns_topic_arn" { + description = "ARN of the SNS topic for node health alerts" + value = module.addons.node_health_sns_topic_arn +} + +output "karpenter_role_arn" { + description = "ARN of the Karpenter IAM role" + value = module.addons.karpenter_role_arn +} + +output "karpenter_instance_profile_name" { + description = "Name of the Karpenter node instance profile" + value = module.addons.karpenter_instance_profile_name +} + +output "karpenter_queue_name" { + description = "Name of the Karpenter SQS queue for spot instance interruption handling" + value = module.addons.karpenter_queue_name +} \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example 
b/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example new file mode 100644 index 000000000..d4b3d6cca --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/terraform.tfvars.example @@ -0,0 +1,67 @@ +# AWS Configuration +region = "us-west-2" +environment = "dev" + +# EKS Cluster Configuration +cluster_name = "eks-reference" +cluster_version = "1.28" + +# Network Configuration +vpc_cidr = "10.0.0.0/16" +private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] + +# Restrict API server access (replace with your IP ranges) +cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"] + +# Default Node Group Configuration +default_instance_types = ["m5.large", "m5.xlarge"] +default_min_size = 1 +default_max_size = 10 +default_desired_size = 3 + +# Node Auto Repair Configuration for Default Nodes +default_health_check_grace_period = 300 # 5 minutes +default_health_check_type = "EC2" + +# GPU Node Group Configuration +gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge"] +gpu_min_size = 0 +gpu_max_size = 5 +gpu_desired_size = 1 + +# Node Auto Repair Configuration for GPU Nodes (longer grace period due to GPU driver initialization) +gpu_health_check_grace_period = 600 # 10 minutes +gpu_health_check_type = "EC2" + +# FSx for Lustre Configuration +fsx_storage_capacity = 1200 +fsx_deployment_type = "SCRATCH_2" +fsx_per_unit_storage_throughput = 50 +# fsx_s3_import_path = "s3://your-bucket-name/import-path/" +# fsx_s3_export_path = "s3://your-bucket-name/export-path/" + +# S3 Mountpoint Configuration +s3_mountpoint_bucket_name = "your-s3-bucket-name" +s3_mountpoint_namespace = "kube-system" + +# Karpenter Configuration +enable_karpenter = true +karpenter_chart_version = "v0.32.1" + +# Karpenter Node Pool Configuration +karpenter_default_capacity_types = ["spot", "on-demand"] +karpenter_default_instance_types = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge", "c5.large", "c5.xlarge", "c5.2xlarge"] + +karpenter_gpu_capacity_types = ["on-demand"] +karpenter_gpu_instance_types = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge"] + +# Add-on Configuration +enable_aws_load_balancer_controller = true +enable_nvidia_device_plugin = true +enable_metrics_server = true + +# Node Health Monitoring Configuration +enable_node_health_monitoring = true +enable_sns_alerts = false +# alert_email = "your-email@example.com" # Uncomment and set your email for alerts \ No newline at end of file diff --git a/1.architectures/4.amazon-eks/terraform/variables.tf b/1.architectures/4.amazon-eks/terraform/variables.tf new file mode 100644 index 000000000..fe52d6078 --- /dev/null +++ b/1.architectures/4.amazon-eks/terraform/variables.tf @@ -0,0 +1,251 @@ +variable "region" { + description = "AWS region" + type = string + default = "us-west-2" +} + +variable "environment" { + description = "Environment name" + type = string + default = "dev" +} + +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string + default = "eks-reference" +} + +variable "cluster_version" { + description = "Kubernetes version to use for the EKS cluster" + type = string + default = "1.28" +} + +variable "vpc_cidr" { + description = "CIDR block for VPC" + type = string + default = "10.0.0.0/16" +} + +variable "private_subnets" { + description = "Private subnets for EKS cluster" + type = list(string) + default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +} + +variable 
"public_subnets" { + description = "Public subnets for EKS cluster" + type = list(string) + default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] +} + +variable "cluster_endpoint_public_access_cidrs" { + description = "List of CIDR blocks that can access the Amazon EKS public API server endpoint" + type = list(string) + default = ["0.0.0.0/0"] +} + +# Default Node Group Variables +variable "default_instance_types" { + description = "List of instance types for default node group" + type = list(string) + default = ["m5.large", "m5.xlarge"] +} + +variable "default_min_size" { + description = "Minimum number of nodes in default node group" + type = number + default = 1 +} + +variable "default_max_size" { + description = "Maximum number of nodes in default node group" + type = number + default = 10 +} + +variable "default_desired_size" { + description = "Desired number of nodes in default node group" + type = number + default = 3 +} + +variable "default_health_check_grace_period" { + description = "Grace period for health checks on default node group (seconds)" + type = number + default = 300 +} + +variable "default_health_check_type" { + description = "Health check type for default node group (EC2 or ELB)" + type = string + default = "EC2" + validation { + condition = contains(["EC2", "ELB"], var.default_health_check_type) + error_message = "Health check type must be either EC2 or ELB." + } +} + +# GPU Node Group Variables +variable "gpu_instance_types" { + description = "List of GPU instance types for GPU node group" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "p3.2xlarge"] +} + +variable "gpu_min_size" { + description = "Minimum number of nodes in GPU node group" + type = number + default = 0 +} + +variable "gpu_max_size" { + description = "Maximum number of nodes in GPU node group" + type = number + default = 5 +} + +variable "gpu_desired_size" { + description = "Desired number of nodes in GPU node group" + type = number + default = 1 +} + +variable "gpu_health_check_grace_period" { + description = "Grace period for health checks on GPU node group (seconds) - GPU nodes need longer startup time" + type = number + default = 600 +} + +variable "gpu_health_check_type" { + description = "Health check type for GPU node group (EC2 or ELB)" + type = string + default = "EC2" + validation { + condition = contains(["EC2", "ELB"], var.gpu_health_check_type) + error_message = "Health check type must be either EC2 or ELB." + } +} + +# FSx for Lustre Variables +variable "fsx_storage_capacity" { + description = "Storage capacity for FSx Lustre in GiB" + type = number + default = 1200 +} + +variable "fsx_deployment_type" { + description = "Deployment type for FSx Lustre" + type = string + default = "SCRATCH_2" + validation { + condition = contains(["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1", "PERSISTENT_2"], var.fsx_deployment_type) + error_message = "Valid values for fsx_deployment_type are SCRATCH_1, SCRATCH_2, PERSISTENT_1, or PERSISTENT_2." 
+ } +} + +variable "fsx_per_unit_storage_throughput" { + description = "Per unit storage throughput for FSx Lustre in MB/s/TiB" + type = number + default = 50 +} + +variable "fsx_s3_import_path" { + description = "S3 import path for FSx Lustre" + type = string + default = null +} + +variable "fsx_s3_export_path" { + description = "S3 export path for FSx Lustre" + type = string + default = null +} + +# S3 Mountpoint Variables +variable "s3_mountpoint_bucket_name" { + description = "S3 bucket name for Mountpoint" + type = string + default = "" +} + +variable "s3_mountpoint_namespace" { + description = "Kubernetes namespace for S3 Mountpoint CSI driver" + type = string + default = "kube-system" +} + +# Addon Variables +# Karpenter Configuration +variable "enable_karpenter" { + description = "Enable Karpenter for node provisioning" + type = bool + default = true +} + +variable "karpenter_chart_version" { + description = "Version of the Karpenter Helm chart" + type = string + default = "v0.32.1" +} + +variable "karpenter_default_capacity_types" { + description = "Capacity types for Karpenter default node pool" + type = list(string) + default = ["spot", "on-demand"] +} + +variable "karpenter_default_instance_types" { + description = "Instance types for Karpenter default node pool" + type = list(string) + default = ["m5.large", "m5.xlarge", "m5.2xlarge", "m5a.large", "m5a.xlarge", "m5a.2xlarge", "c5.large", "c5.xlarge", "c5.2xlarge"] +} + +variable "karpenter_gpu_capacity_types" { + description = "Capacity types for Karpenter GPU node pool" + type = list(string) + default = ["on-demand"] +} + +variable "karpenter_gpu_instance_types" { + description = "Instance types for Karpenter GPU node pool" + type = list(string) + default = ["g4dn.xlarge", "g4dn.2xlarge", "g4dn.4xlarge", "g5.xlarge", "g5.2xlarge", "p3.2xlarge", "p3.8xlarge"] +} + +variable "enable_aws_load_balancer_controller" { + description = "Enable AWS Load Balancer Controller" + type = bool + default = true +} + +variable "enable_nvidia_device_plugin" { + description = "Enable NVIDIA device plugin for GPU support" + type = bool + default = true +} + +variable "enable_metrics_server" { + description = "Enable metrics server" + type = bool + default = true +} + +variable "enable_node_health_monitoring" { + description = "Enable CloudWatch monitoring for node health and auto-repair" + type = bool + default = true +} + +variable "enable_sns_alerts" { + description = "Enable SNS alerts for node health issues" + type = bool + default = false +} + +variable "alert_email" { + description = "Email address for node health alerts" + type = string + default = "" +} \ No newline at end of file