29 changes: 29 additions & 0 deletions manifests/modules/fundamentals/storage/fsxl/.workshop/cleanup.sh
@@ -0,0 +1,29 @@
#!/bin/bash

set -e

logmessage "Deleting assets-images folder..."

# Delete local directory of image files
rm -rf ~/environment/assets-images/

# Determine whether the FSx for Lustre CSI driver Helm release is installed
check=$(helm list -n kube-system | grep aws-fsx-csi-driver || true)

logmessage "Scaling down assets deployment..."

kubectl scale -n assets --replicas=0 deployment/assets

if [ -n "$check" ]; then
  logmessage "Deleting FSx for Lustre CSI driver addon..."

  helm uninstall aws-fsx-csi-driver -n kube-system
fi

logmessage "Deleting PV and PVC that were created..."

# Delete PVC
kubectl delete pvc fsx-claim -n assets --ignore-not-found=true

# Delete PV
kubectl delete pv fsx-pv --ignore-not-found=true

@@ -0,0 +1,156 @@
# Attach AmazonFSxFullAccess managed policy
resource "aws_iam_role_policy_attachment" "fsx_full_access" {
role = "eks-workshop-ide-role"
policy_arn = "arn:aws:iam::aws:policy/AmazonFSxFullAccess"
}

# Add after the policy attachment
resource "time_sleep" "wait_for_policy_propagation" {
depends_on = [aws_iam_role_policy_attachment.fsx_full_access]
create_duration = "5s" # reduce to minimum amount possible
}

# Add Service_Linked_Role inline policy
resource "aws_iam_role_policy" "service_linked_role" {
name = "Service_Linked_Role"
role = "eks-workshop-ide-role"

policy = <<EOF
{
"Version": "2012-10-17",
"Statement": {
"Effect": "Allow",
"Action": [
"iam:CreateServiceLinkedRole",
"iam:AttachRolePolicy",
"iam:PutRolePolicy"
],
"Resource": "arn:aws:iam::*:role/aws-service-role/s3.data-source.lustre.fsx.amazonaws.com/*"
}
}
EOF
}

data "aws_caller_identity" "current" {}
data "aws_partition" "current" {}

data "aws_vpc" "selected_fsx" {
tags = {
created-by = "eks-workshop-v2"
env = var.eks_cluster_id
}
}

data "aws_subnets" "private_fsx" {
tags = {
created-by = "eks-workshop-v2"
env = var.eks_cluster_id
}

filter {
name = "tag:Name"
values = ["*Public*"]
}
}

resource "aws_security_group" "fsx_lustre" {
name = "${var.eks_cluster_id}-fsx-lustre"
description = "FSx for Lustre security group to allow access on required ports"
vpc_id = data.aws_vpc.selected_fsx.id

ingress {
description = "Allow inbound traffic for Lustre"
from_port = 988
to_port = 988
protocol = "tcp"
cidr_blocks = [data.aws_vpc.selected_fsx.cidr_block]
}

ingress {
description = "Allow inbound traffic for Lustre (UDP)"
from_port = 988
to_port = 988
protocol = "udp"
cidr_blocks = [data.aws_vpc.selected_fsx.cidr_block]
}

ingress {
description = "Allow inbound traffic for Lustre data"
from_port = 1018
to_port = 1023
protocol = "tcp"
cidr_blocks = [data.aws_vpc.selected_fsx.cidr_block]
}

egress {
description = "Allow all egress"
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}

tags = merge(
var.tags,
{
Name = "${var.eks_cluster_id}-fsxlustresecuritygroup"
}
)
}

# Create S3 bucket to initially put images in
resource "aws_s3_bucket" "s3_data" {
  bucket_prefix = "${var.addon_context.eks_cluster_id}-s3-data"
  force_destroy = true
}

resource "aws_fsx_lustre_file_system" "fsx_lustre" {
depends_on = [time_sleep.wait_for_policy_propagation]

storage_capacity = 1200
subnet_ids = [data.aws_subnets.private_fsx.ids[1]]
security_group_ids = [aws_security_group.fsx_lustre.id]

# Additional recommended settings
file_system_type_version = "2.12"
deployment_type = "SCRATCH_2"
storage_type = "SSD"
}

resource "aws_iam_role_policy" "eks_workshop_ide_s3_put_access" {
name = "eks-workshop-ide-s3-put-access"
role = "eks-workshop-ide-role"

policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "s3:PutObject",
"Resource": "${aws_s3_bucket.s3_data.arn}/*"
}
]
}
EOF
}

# Create FSx CSI Driver IAM Role and associated policy
module "fsx_lustre_csi_driver_irsa" {
  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
  version = "5.39.1"

  role_name_prefix   = "${var.addon_context.eks_cluster_id}-fsx-lustre-csi-"
  policy_name_prefix = "${var.addon_context.eks_cluster_id}-fsx-lustre-csi-"

  # IAM policy to attach to driver
  attach_fsx_lustre_csi_policy = true

  oidc_providers = {
    ex = {
      provider_arn               = var.addon_context.eks_oidc_provider_arn
      namespace_service_accounts = ["kube-system:fsx-csi-controller-sa"]
    }
  }
}
@@ -0,0 +1,10 @@
output "environment_variables" {
description = "Environment variables to be added to the IDE shell"
value = {
BUCKET_NAME = aws_s3_bucket.s3_data.id
EKS_CLUSTER_NAME = var.eks_cluster_id
FSX_ID = aws_fsx_lustre_file_system.fsx_lustre.id
FSX_DNS_NAME = aws_fsx_lustre_file_system.fsx_lustre.dns_name
FSX_MOUNT_NAME = aws_fsx_lustre_file_system.fsx_lustre.mount_name
}
}
@@ -0,0 +1,35 @@
# tflint-ignore: terraform_unused_declarations
variable "eks_cluster_id" {
  description = "EKS cluster name"
  type        = string
}

# tflint-ignore: terraform_unused_declarations
variable "eks_cluster_version" {
  description = "EKS cluster version"
  type        = string
}

# tflint-ignore: terraform_unused_declarations
variable "cluster_security_group_id" {
  description = "EKS cluster security group ID"
  type        = any
}

# tflint-ignore: terraform_unused_declarations
variable "addon_context" {
  description = "Addon context that can be passed directly to blueprints addon modules"
  type        = any
}

# tflint-ignore: terraform_unused_declarations
variable "tags" {
  description = "Tags to apply to AWS resources"
  type        = any
}

# tflint-ignore: terraform_unused_declarations
variable "resources_precreated" {
  description = "Have expensive resources been created already"
  type        = bool
}
@@ -0,0 +1,17 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: assets
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: assets
          volumeMounts:
            - name: fsx-lustre
              mountPath: /fsx-lustre
      volumes:
        - name: fsx-lustre
          persistentVolumeClaim:
            claimName: fsx-claim
@@ -0,0 +1,32 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: fsx-pv
spec:
  capacity:
    storage: 1200Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  mountOptions:
    - flock
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: fsx.csi.aws.com
    volumeHandle: $FSX_ID
    volumeAttributes:
      dnsname: $FSX_DNS_NAME
      mountname: $FSX_MOUNT_NAME
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: fsx-claim
  namespace: assets
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: "" # required for static provisioning
  resources:
    requests:
      storage: 1200Gi
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../../../../base-application/assets
  - fsxpvclaim.yaml
patches:
  - path: deployment.yaml
@@ -0,0 +1,101 @@
---
title: Ephemeral Container Storage
sidebar_position: 10
---

In this section, we'll explore how to handle storage in Kubernetes using a simple image-hosting example. We'll start with an existing Deployment from our sample store application and modify it to serve as an image host. The assets microservice runs a web server on EKS and makes an excellent example for working with Deployments, since they enable **horizontal scaling** and **declarative state management** of Pods.

The assets component serves static product images from a container. These images are bundled into the container image during the build process. However, this approach has a limitation: when new images are added to one container, they don't automatically appear in the others. To address this, we'll implement a solution using [Amazon FSx for Lustre](https://aws.amazon.com/fsx/lustre/) combined with [Amazon S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) and a Kubernetes [Persistent Volume](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to create a shared storage environment. This will allow multiple web server containers to serve assets while scaling to meet demand.
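
The shape of that solution is sketched below: a `ReadWriteMany` PersistentVolumeClaim backed by the FSx for Lustre file system is mounted into every replica of the assets Deployment, so all Pods read and write the same files. The claim name (`fsx-claim`) and mount path (`/fsx-lustre`) match the manifests we'll apply later in this module, while the labels and selector shown here are only illustrative; treat this as a sketch rather than the final manifest.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: assets
  namespace: assets
spec:
  replicas: 2 # every replica mounts the same shared volume
  selector:
    matchLabels:
      app: assets # illustrative selector
  template:
    metadata:
      labels:
        app: assets
    spec:
      containers:
        - name: assets
          image: public.ecr.aws/aws-containers/retail-store-sample-assets:0.4.0
          volumeMounts:
            - name: fsx-lustre
              mountPath: /fsx-lustre # shared FSx for Lustre mount
      volumes:
        - name: fsx-lustre
          persistentVolumeClaim:
            claimName: fsx-claim # ReadWriteMany claim backed by FSx for Lustre
```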

Let's examine the current Deployment's volume configuration:

```bash
$ kubectl describe deployment -n assets
Name:                   assets
Namespace:              assets
[...]
  Containers:
   assets:
    Image:        public.ecr.aws/aws-containers/retail-store-sample-assets:0.4.0
    Port:         8080/TCP
    Host Port:    0/TCP
    Limits:
      memory:  128Mi
    Requests:
      cpu:     128m
      memory:  128Mi
    Liveness:     http-get http://:8080/health.html delay=0s timeout=1s period=3s #success=1 #failure=3
    Environment Variables from:
      assets      ConfigMap  Optional: false
    Environment:  <none>
    Mounts:
      /tmp from tmp-volume (rw)
  Volumes:
   tmp-volume:
    Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
    Medium:     Memory
    SizeLimit:  <unset>
[...]
```

Looking at the [`Volumes`](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir-configuration-example) section, we can see that the Deployment currently uses an [`emptyDir` volume type](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) that exists only for the Pod's lifetime.

![Assets with emptyDir](./assets/assets-emptydir.webp)

An `emptyDir` volume is created when a Pod is assigned to a node and persists only while that Pod runs on that node. As its name suggests, the volume starts empty. All containers within the Pod can read and write files in the `emptyDir` volume, even when it's mounted at different paths, but **when a Pod is removed from a node for any reason, the data in the `emptyDir` is deleted permanently.** This makes `emptyDir` unsuitable for sharing data between multiple Pods in the same Deployment when that data needs to persist.
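
To make that concrete, here's a minimal sketch of a Pod using an `emptyDir` volume, similar to the `tmp-volume` shown in the Deployment output above (the Pod name and image are illustrative, not part of the workshop):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: emptydir-example # illustrative name
spec:
  containers:
    - name: app
      image: public.ecr.aws/nginx/nginx:latest # illustrative image
      volumeMounts:
        - name: scratch
          mountPath: /tmp
  volumes:
    - name: scratch
      emptyDir:
        medium: Memory # optional: back the volume with tmpfs, as the assets Deployment does
        sizeLimit: 128Mi
```

Because the volume's lifecycle is tied to the Pod, deleting or rescheduling the Pod discards everything written to it.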

The container comes with some initial product images, which are copied during the build process to `/usr/share/nginx/html/assets`. We can verify this by running:

```bash
$ kubectl exec --stdin deployment/assets \
  -n assets -- bash -c "ls /usr/share/nginx/html/assets/"
chrono_classic.jpg
gentleman.jpg
pocket_watch.jpg
smart_1.jpg
smart_2.jpg
wood_watch.jpg
```

To demonstrate the limitations of `emptyDir` storage, let's scale up the `assets` Deployment to multiple replicas:

```bash
$ kubectl scale -n assets --replicas=2 deployment/assets
deployment.apps/assets scaled

$ kubectl rollout status -n assets deployment/assets --timeout=60s
deployment "assets" successfully rolled out
```

Now, let's add a new product image called `divewatch.jpg` to the `/usr/share/nginx/html/assets` directory of the first Pod and verify it exists:

```bash
$ POD_NAME=$(kubectl -n assets get pods -o jsonpath='{.items[0].metadata.name}')
$ kubectl exec --stdin $POD_NAME \
  -n assets -- bash -c 'touch /usr/share/nginx/html/assets/divewatch.jpg'
$ kubectl exec --stdin $POD_NAME \
  -n assets -- bash -c 'ls /usr/share/nginx/html/assets'
chrono_classic.jpg
divewatch.jpg <-----------
gentleman.jpg
pocket_watch.jpg
smart_1.jpg
smart_2.jpg
wood_watch.jpg
```

Let's check if the new product image `divewatch.jpg` appears in the second Pod:

```bash
$ POD_NAME=$(kubectl -n assets get pods -o jsonpath='{.items[1].metadata.name}')
$ kubectl exec --stdin $POD_NAME \
  -n assets -- bash -c 'ls /usr/share/nginx/html/assets'
chrono_classic.jpg
gentleman.jpg
pocket_watch.jpg
smart_1.jpg
smart_2.jpg
wood_watch.jpg
```

As we can see, `divewatch.jpg` doesn't exist in the second Pod. This demonstrates why we need a shared filesystem that persists across multiple Pods when scaling horizontally, allowing file updates without requiring redeployment.