From 5406073f3ded08729126e06d746635751f376f01 Mon Sep 17 00:00:00 2001
From: Jitesh Gupta
Date: Tue, 13 May 2025 17:54:25 -0700
Subject: [PATCH 1/2] hugging face MI300 arc scale set

Add gha-runner-scale-set Helm values for the huggingface MI300 runners,
one file requesting 1 AMD GPU per runner pod and one requesting 2.

Each pod runs a privileged "dind" sidecar (dockerd on
/var/run/docker.sock) and a "runner" container. Before starting run.sh
the runner writes the docker --device flags for every /dev/dri/renderD*
node it can see to /etc/podinfo/gha-render-devices and waits for docker
to answer. Render nodes are enumerated with a shell glob instead of
parsing `ls -la` columns, which depends on the ls output layout.

---
 .../linux-mi300-1gpu-ossci-hf.yaml            | 133 ++++++++++++++++++
 .../linux-mi300-2gpu-ossci-hf.yaml            | 133 ++++++++++++++++++
 2 files changed, 266 insertions(+)
 create mode 100644 config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
 create mode 100644 config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml

diff --git a/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml b/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
new file mode 100644
index 0000000..ae6562b
--- /dev/null
+++ b/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
@@ -0,0 +1,133 @@
+# Cluster: Digital Ocean
+# Deployment command:
+# sudo helm upgrade --install "amd-mi300-ci-1gpu" --namespace "arc-hf-gpu-1" oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set -f config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
+
+## minRunners is the min number of idle runners. The target number of runners created will be
+## calculated as a sum of minRunners and the number of jobs assigned to the scale set.
+maxRunners: 6
+minRunners: 1
+controllerServiceAccount:
+  name: arc-gha-rs-controller
+  namespace: arc
+template:
+  spec:
+    securityContext:
+      supplementalGroups:
+        - 110
+    initContainers:
+      - name: init-dind-externals
+        image: ghcr.io/saienduri/ghascale-rocm-dev:main
+        imagePullPolicy: Always
+        command:
+          ["cp", "-r", "/home/runner/externals/.", "/home/runner/tmpDir/"]
+        volumeMounts:
+          - name: dind-externals
+            mountPath: /home/runner/tmpDir
+      - name: dind
+        image: ghcr.io/saienduri/dind:main
+        restartPolicy: Always
+        command: ["sh", "-c"]
+        args:
+          - |
+            dockerd --host=unix:///var/run/docker.sock --group=${DOCKER_GROUP_GID} --data-root=/home/runner/docker-data &
+            until docker info >/dev/null 2>&1; do sleep 5; done
+            tail -f /dev/null
+        env:
+          - name: DOCKER_GROUP_GID
+            value: "123"
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+        securityContext:
+          privileged: true
+        volumeMounts:
+          - name: docker-data
+            mountPath: /home/runner/docker-data
+            subPathExpr: $(POD_NAME)-docker
+          - name: work
+            mountPath: /home/runner/_work
+          - name: dind-sock
+            mountPath: /var/run
+          - name: dind-externals
+            mountPath: /home/runner/externals
+          - name: podinfo
+            mountPath: /etc/podinfo
+          - name: ci-data
+            mountPath: /mnt
+            readOnly: false
+        lifecycle:
+          preStop:
+            exec:
+              command: ["rm", "-rf", "/home/runner/docker-data"]
+    containers:
+      - name: runner
+        image: ghcr.io/saienduri/ghascale-rocm-dev:main
+        imagePullPolicy: Always
+        command:
+          - /bin/sh
+          - -c
+          - |
+            devices=$(for node in /dev/dri/renderD*; do [ -e "${node}" ] || continue; echo "${node##*/}"; done)
+            GHA_RENDER_DEVICES="--group-add 110"
+            for device in $devices; do
+              GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
+            done
+            echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
+            # Wait for Docker to be ready before starting runner
+            echo "Waiting for docker..."
+            until docker info >/dev/null 2>&1; do sleep 5; done
+            /home/runner/run.sh
+        resources:
+          requests:
+            cpu: 10000m
+            memory: 200000Mi
+            ephemeral-storage: "200G"
+            amd.com/gpu: 1
+          limits:
+            ephemeral-storage: "200G"
+            amd.com/gpu: 1
+        env:
+          - name: DOCKER_HOST
+            value: unix:///var/run/docker.sock
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+        volumeMounts:
+          - name: work
+            mountPath: /home/runner/_work
+          - name: dind-sock
+            mountPath: /var/run
+          - name: dind-externals
+            mountPath: /home/runner/externals
+          - name: podinfo
+            mountPath: /etc/podinfo
+          - name: docker-data
+            mountPath: /home/runner/docker-data
+            subPathExpr: $(POD_NAME)-docker
+          - name: ci-data
+            mountPath: /mnt
+            readOnly: false
+    volumes:
+      - name: work
+        emptyDir: {}
+      - name: dind-sock
+        emptyDir: {}
+      - name: dind-externals
+        emptyDir: {}
+      - name: docker-data
+        emptyDir: {}
+      - name: podinfo
+        emptyDir: {}
+      - name: ci-data
+        hostPath:
+          path: /data/huggingface-ci
+          type: Directory
+githubConfigUrl: https://github.com/huggingface
+githubConfigSecret: arc-rs-hf-secret
+runnerGroup: amd-mi300-runners
diff --git a/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml b/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
new file mode 100644
index 0000000..91138a9
--- /dev/null
+++ b/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
@@ -0,0 +1,133 @@
+# Cluster: Digital Ocean
+# Deployment command:
+# sudo helm upgrade --install "amd-mi300-ci-2gpu" --namespace "arc-hf-gpu-2" oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set -f config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
+
+## minRunners is the min number of idle runners. The target number of runners created will be
+## calculated as a sum of minRunners and the number of jobs assigned to the scale set.
+maxRunners: 6
+minRunners: 1
+controllerServiceAccount:
+  name: arc-gha-rs-controller
+  namespace: arc
+template:
+  spec:
+    securityContext:
+      supplementalGroups:
+        - 110
+    initContainers:
+      - name: init-dind-externals
+        image: ghcr.io/saienduri/ghascale-rocm-dev:main
+        imagePullPolicy: Always
+        command:
+          ["cp", "-r", "/home/runner/externals/.", "/home/runner/tmpDir/"]
+        volumeMounts:
+          - name: dind-externals
+            mountPath: /home/runner/tmpDir
+      - name: dind
+        image: ghcr.io/saienduri/dind:main
+        restartPolicy: Always
+        command: ["sh", "-c"]
+        args:
+          - |
+            dockerd --host=unix:///var/run/docker.sock --group=${DOCKER_GROUP_GID} --data-root=/home/runner/docker-data &
+            until docker info >/dev/null 2>&1; do sleep 5; done
+            tail -f /dev/null
+        env:
+          - name: DOCKER_GROUP_GID
+            value: "123"
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+        securityContext:
+          privileged: true
+        volumeMounts:
+          - name: docker-data
+            mountPath: /home/runner/docker-data
+            subPathExpr: $(POD_NAME)-docker
+          - name: work
+            mountPath: /home/runner/_work
+          - name: dind-sock
+            mountPath: /var/run
+          - name: dind-externals
+            mountPath: /home/runner/externals
+          - name: podinfo
+            mountPath: /etc/podinfo
+          - name: ci-data
+            mountPath: /mnt
+            readOnly: false
+        lifecycle:
+          preStop:
+            exec:
+              command: ["rm", "-rf", "/home/runner/docker-data"]
+    containers:
+      - name: runner
+        image: ghcr.io/saienduri/ghascale-rocm-dev:main
+        imagePullPolicy: Always
+        command:
+          - /bin/sh
+          - -c
+          - |
+            devices=$(for node in /dev/dri/renderD*; do [ -e "${node}" ] || continue; echo "${node##*/}"; done)
+            GHA_RENDER_DEVICES="--group-add 110"
+            for device in $devices; do
+              GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
+            done
+            echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
+            # Wait for Docker to be ready before starting runner
+            echo "Waiting for docker..."
+            until docker info >/dev/null 2>&1; do sleep 5; done
+            /home/runner/run.sh
+        resources:
+          requests:
+            cpu: 20000m
+            memory: 400000Mi
+            ephemeral-storage: "400G"
+            amd.com/gpu: 2
+          limits:
+            ephemeral-storage: "400G"
+            amd.com/gpu: 2
+        env:
+          - name: DOCKER_HOST
+            value: unix:///var/run/docker.sock
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+        volumeMounts:
+          - name: work
+            mountPath: /home/runner/_work
+          - name: dind-sock
+            mountPath: /var/run
+          - name: dind-externals
+            mountPath: /home/runner/externals
+          - name: podinfo
+            mountPath: /etc/podinfo
+          - name: docker-data
+            mountPath: /home/runner/docker-data
+            subPathExpr: $(POD_NAME)-docker
+          - name: ci-data
+            mountPath: /mnt
+            readOnly: false
+    volumes:
+      - name: work
+        emptyDir: {}
+      - name: dind-sock
+        emptyDir: {}
+      - name: dind-externals
+        emptyDir: {}
+      - name: docker-data
+        emptyDir: {}
+      - name: podinfo
+        emptyDir: {}
+      - name: ci-data
+        hostPath:
+          path: /data/huggingface-ci
+          type: Directory
+githubConfigUrl: https://github.com/huggingface
+githubConfigSecret: arc-rs-hf-secret
+runnerGroup: amd-mi300-runners

From f547a3d7bcfd01c776368a04dc16493a8908c630 Mon Sep 17 00:00:00 2001
From: Jitesh Gupta
Date: Sun, 18 May 2025 21:48:49 -0700
Subject: [PATCH 2/2] Adding logic to set gpu isolation env variables

For every render node found under /dev/dri, take the numeric suffix of
its name, compute (suffix % 128) / 8 and append the result to a
comma-separated ROCR_VISIBLE_DEVICES list. The list is written to
/etc/podinfo/gha-gpu-isolation-settings together with a fixed
HIP_VISIBLE_DEVICES value (0 for the 1-GPU set, 0,1 for the 2-GPU set).

---
 .../huggingface/linux-mi300-1gpu-ossci-hf.yaml     | 12 ++++++++++++
 .../huggingface/linux-mi300-2gpu-ossci-hf.yaml     | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml b/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
index ae6562b..2fc5424 100644
--- a/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
+++ b/config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
@@ -70,10 +70,22 @@ template:
           - |
             devices=$(for node in /dev/dri/renderD*; do [ -e "${node}" ] || continue; echo "${node##*/}"; done)
             GHA_RENDER_DEVICES="--group-add 110"
+            ROCR_VISIBLE_DEVICES=""
             for device in $devices; do
               GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
+              # Also set the GPU isolation environment variables. Ref: https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html
+              GPU_XCD=$(echo "${device}" | tr -d -c 0-9)
+              PHYSICAL_GPU_NUMBER=$(( GPU_XCD % 128 / 8))
+              if [ -z "${ROCR_VISIBLE_DEVICES}" ]; then
+                ROCR_VISIBLE_DEVICES="${PHYSICAL_GPU_NUMBER}"
+              else
+                ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES},${PHYSICAL_GPU_NUMBER}"
+              fi
             done
             echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
+            echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}" > /etc/podinfo/gha-gpu-isolation-settings
+            echo "HIP_VISIBLE_DEVICES=0" >> /etc/podinfo/gha-gpu-isolation-settings
+
             # Wait for Docker to be ready before starting runner
             echo "Waiting for docker..."
             until docker info >/dev/null 2>&1; do sleep 5; done
diff --git a/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml b/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
index 91138a9..0dd0adb 100644
--- a/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
+++ b/config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
@@ -70,10 +70,22 @@ template:
           - |
             devices=$(for node in /dev/dri/renderD*; do [ -e "${node}" ] || continue; echo "${node##*/}"; done)
             GHA_RENDER_DEVICES="--group-add 110"
+            ROCR_VISIBLE_DEVICES=""
             for device in $devices; do
               GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
+              # Also set the GPU isolation environment variables. Ref: https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html
+              GPU_XCD=$(echo "${device}" | tr -d -c 0-9)
+              PHYSICAL_GPU_NUMBER=$(( GPU_XCD % 128 / 8))
+              if [ -z "${ROCR_VISIBLE_DEVICES}" ]; then
+                ROCR_VISIBLE_DEVICES="${PHYSICAL_GPU_NUMBER}"
+              else
+                ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES},${PHYSICAL_GPU_NUMBER}"
+              fi
             done
             echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
+            echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}" > /etc/podinfo/gha-gpu-isolation-settings
+            echo "HIP_VISIBLE_DEVICES=0,1" >> /etc/podinfo/gha-gpu-isolation-settings
+
             # Wait for Docker to be ready before starting runner
             echo "Waiting for docker..."
             until docker info >/dev/null 2>&1; do sleep 5; done