From ab2e1d7aeab1fcae4b7a1441980731bb14d71c72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Mon, 18 Dec 2023 17:19:45 +0200 Subject: [PATCH] Add support for raid10 This removes the wait block for raid resync for two reasons: 1) raid0 does not have redundancy and therefore no initial resync[1] 2) with raid10 the resync time for 4x 1.9TB disks takes from tens of minutes to multiple hours, depending on sysctl params `dev.raid.speed_limit_min` and `dev.raid.speed_limit_max` and the speed of the disks. Initial resync for raid10 is not strictly needed[1] Filesystem creation: by default `mkfs.xfs` attempts to TRIM the drive. This is also something that can take tens of minutes or hours, depending on the size of the drives. TRIM can be skipped, as instances are delivered with disks fully trimmed[2]. [1] https://raid.wiki.kernel.org/index.php/Initial_Array_Creation [2] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#InstanceStoreTrimSupport --- doc/USER_GUIDE.md | 6 +++++- files/bin/setup-local-disks | 39 ++++++++++++++++++++++++------------- files/bootstrap.sh | 2 +- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/doc/USER_GUIDE.md b/doc/USER_GUIDE.md index a546ab034..69f897835 100644 --- a/doc/USER_GUIDE.md +++ b/doc/USER_GUIDE.md @@ -377,7 +377,7 @@ For more information about image credential provider plugins, refer to the [Kube Some instance types launch with ephemeral NVMe instance storage (i3, i4i, c5d, c6id, etc). There are two main ways of utilizing this storage within Kubernetes: a single RAID-0 array for use by kubelet and containerd or mounting the individual disks for pod usage. -The EKS Optimized AMI includes a utility script to configure ephemeral storage. The script can be invoked by passing the `--local-disks ` flag to the `/etc/eks/bootstrap.sh` script or the script can be invoked directly at `/bin/setup-local-disks`. All disks are formatted with an XFS file system. 
+The EKS Optimized AMI includes a utility script to configure ephemeral storage. The script can be invoked by passing the `--local-disks <raid0 | raid10 | mount>` flag to the `/etc/eks/bootstrap.sh` script or the script can be invoked directly at `/bin/setup-local-disks`. All disks are formatted with an XFS file system. Below are details on the two disk setup options: @@ -385,6 +385,10 @@ Below are details on the two disk setup options: A RAID-0 array is setup that includes all ephemeral NVMe instance storage disks. The containerd and kubelet state directories (`/var/lib/containerd` and `/var/lib/kubelet`) will then use the ephemeral storage for more and faster node ephemeral-storage. The node's ephemeral storage can be shared among pods that request ephemeral storage and container images that are downloaded to the node. +### RAID-10 for Kubelet and Containerd (raid10) + +A RAID-10 array is set up that includes all ephemeral NVMe instance storage disks, providing redundancy against the failure of up to 50% of the disks. A minimum of four disks is required. The containerd and kubelet state directories (`/var/lib/containerd` and `/var/lib/kubelet`) will then use the ephemeral storage for more and faster node ephemeral-storage. The node's ephemeral storage can be shared among pods that request ephemeral storage and container images that are downloaded to the node. + ### Mount for Persistent Volumes (mount) Another way of utilizing the ephemeral disks is to format and mount the individual disks. Mounting individual disks allows the [local-static-provisioner](https://github.com/kubernetes-sigs/sig-storage-local-static-provisioner) DaemonSet to create Persistent Volume Claims that pods can utilize. 
diff --git a/files/bin/setup-local-disks b/files/bin/setup-local-disks index 9cdb18dae..ea064b15c 100644 --- a/files/bin/setup-local-disks +++ b/files/bin/setup-local-disks @@ -10,18 +10,25 @@ err_report() { trap 'err_report $LINENO' ERR print_help() { - echo "usage: $0 " + echo "usage: $0 " echo "Sets up Amazon EC2 Instance Store NVMe disks" echo "" echo "-d, --dir directory to mount the filesystem(s) (default: /mnt/k8s-disks/)" echo "-h, --help print this help" } -# Sets up a RAID-0 of NVMe instance storage disks, moves -# the contents of /var/lib/kubelet and /var/lib/containerd +# Sets up a RAID-0 or RAID-10 of NVMe instance storage disks, +# moves the contents of /var/lib/kubelet and /var/lib/containerd # to the new mounted RAID, and bind mounts the kubelet and # containerd state directories. -maybe_raid0() { +# +# Do not wait for initial resync: raid0 has no redundancy so there +# is no initial resync. Raid10 does not strictly need a resync, +# while the time taken for a 4x 1.9TB disk raid10 would be in the range of +# 20 minutes to 20 days, depending on dev.raid.speed_limit_min and +# dev.raid.speed_limit_max sysctl parameters. +maybe_raid() { + local raid_level="$1" local md_name="kubernetes" local md_device="/dev/md/${md_name}" local md_config="/.aws/mdadm.conf" @@ -31,14 +38,10 @@ maybe_raid0() { if [[ ! -s "${md_config}" ]]; then mdadm --create --force --verbose \ "${md_device}" \ - --level=0 \ + --level="${raid_level}" \ --name="${md_name}" \ --raid-devices="${#EPHEMERAL_DISKS[@]}" \ "${EPHEMERAL_DISKS[@]}" - while [ -n "$(mdadm --detail "${md_device}" | grep -ioE 'State :.*resyncing')" ]; do - echo "Raid is resyncing..." - sleep 1 - done mdadm --detail --scan > "${md_config}" fi @@ -54,7 +57,7 @@ maybe_raid0() { ## for the log stripe unit, but the max log stripe unit is 256k. ## So instead, we use 32k (8 blocks) to avoid a warning of breaching the max. ## mkfs.xfs defaults to 32k after logging the warning since the default log buffer size is 32k. 
- mkfs.xfs -l su=8b "${md_device}" + mkfs.xfs -K -l su=8b "${md_device}" fi ## Create the mount directory @@ -188,8 +191,8 @@ set -- "${POSITIONAL[@]}" # restore positional parameters DISK_SETUP="$1" set -u -if [[ "${DISK_SETUP}" != "raid0" && "${DISK_SETUP}" != "mount" ]]; then - echo "Valid disk setup options are: raid0 or mount" +if [[ "${DISK_SETUP}" != "raid0" && "${DISK_SETUP}" != "raid10" && "${DISK_SETUP}" != "mount" ]]; then + echo "Valid disk setup options are: raid0, raid10 or mount" exit 1 fi @@ -208,11 +211,21 @@ fi ## Get devices of NVMe instance storage ephemeral disks EPHEMERAL_DISKS=($(realpath "${disks[@]}" | sort -u)) +## Also bail early if there are not enough disks for raid10 +if [[ "${DISK_SETUP}" == "raid10" && "${#EPHEMERAL_DISKS[@]}" -lt 4 ]]; then + echo "raid10 requires at least 4 disks, but only ${#EPHEMERAL_DISKS[@]} found, skipping disk setup" + exit 0 +fi + case "${DISK_SETUP}" in "raid0") - maybe_raid0 + maybe_raid 0 echo "Successfully setup RAID-0 consisting of ${EPHEMERAL_DISKS[@]}" ;; + "raid10") + maybe_raid 10 + echo "Successfully setup RAID-10 consisting of ${EPHEMERAL_DISKS[@]}" + ;; "mount") maybe_mount echo "Successfully setup disk mounts consisting of ${EPHEMERAL_DISKS[@]}" diff --git a/files/bootstrap.sh b/files/bootstrap.sh index 42567a495..8a50894d2 100755 --- a/files/bootstrap.sh +++ b/files/bootstrap.sh @@ -32,7 +32,7 @@ function print_help { echo "--enable-local-outpost Enable support for worker nodes to communicate with the local control plane when running on a disconnected Outpost. (true or false)" echo "--ip-family Specify ip family of the cluster" echo "--kubelet-extra-args Extra arguments to add to the kubelet. Useful for adding labels or taints." 
- echo "--local-disks Setup instance storage NVMe disks in raid0 or mount the individual disks for use by pods [mount | raid0]" + echo "--local-disks Setup instance storage NVMe disks in raid0 or mount the individual disks for use by pods " echo "--mount-bpf-fs Mount a bpffs at /sys/fs/bpf (default: true)" echo "--pause-container-account The AWS account (number) to pull the pause container from" echo "--pause-container-version The tag of the pause container"