Skip to content

HuggingFace MI300 CI runner scale set #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions config-files/huggingface/linux-mi300-1gpu-ossci-hf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Helm values for a gha-runner-scale-set (Actions Runner Controller) release
# that provides single-GPU AMD MI300 runners to the huggingface organization.
#
# Cluster: Digital Ocean
# Deployment command:
#   sudo helm upgrade --install "amd-mi300-ci-1gpu" --namespace "arc-hf-gpu-1" oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set -f <path-to-this-file>

# minRunners is the minimum number of idle runners. The target number of
# runners created is the sum of minRunners and the number of jobs assigned to
# the scale set; maxRunners is the upper bound for the scale set.
maxRunners: 6
minRunners: 1

controllerServiceAccount:
  name: arc-gha-rs-controller
  namespace: arc

template:
  spec:
    securityContext:
      # Same GID the runner script passes to job containers via
      # `--group-add 110`; presumably the node's render group - confirm.
      supplementalGroups:
        - 110
    initContainers:
      # Copies the runner "externals" out of the runner image into a shared
      # emptyDir, which is later mounted at /home/runner/externals.
      - name: init-dind-externals
        image: ghcr.io/saienduri/ghascale-rocm-dev:main
        imagePullPolicy: Always
        command:
          ["cp", "-r", "/home/runner/externals/.", "/home/runner/tmpDir/"]
        volumeMounts:
          - name: dind-externals
            mountPath: /home/runner/tmpDir
      # Docker-in-Docker daemon. `restartPolicy: Always` on an init container
      # makes it a Kubernetes sidecar that keeps running beside the runner.
      - name: dind
        image: ghcr.io/saienduri/dind:main
        restartPolicy: Always
        command: ["sh", "-c"]
        args:
          - |
            dockerd --host=unix:///var/run/docker.sock --group=${DOCKER_GROUP_GID} --data-root=/home/runner/docker-data &
            until docker info >/dev/null 2>&1; do sleep 5; done
            tail -f /dev/null
        env:
          # Group that owns the Docker socket (dockerd --group).
          - name: DOCKER_GROUP_GID
            value: "123"
          # Expanded by subPathExpr in the docker-data mount below.
          - name: POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
        securityContext:
          privileged: true
        volumeMounts:
          - name: docker-data
            mountPath: /home/runner/docker-data
            subPathExpr: $(POD_NAME)-docker
          - name: work
            mountPath: /home/runner/_work
          - name: dind-sock
            mountPath: /var/run
          - name: dind-externals
            mountPath: /home/runner/externals
          - name: podinfo
            mountPath: /etc/podinfo
          - name: ci-data
            mountPath: /mnt
            readOnly: false
        lifecycle:
          # Remove the Docker data root when the container is stopped.
          preStop:
            exec:
              command: ["rm", "-rf", "/home/runner/docker-data"]
    containers:
      - name: runner
        image: ghcr.io/saienduri/ghascale-rocm-dev:main
        imagePullPolicy: Always
        command:
          - /bin/sh
          - -c
          - |
            # Build the docker flags and GPU isolation settings for the render
            # nodes visible in this pod, and publish them under /etc/podinfo.
            GHA_RENDER_DEVICES="--group-add 110"
            ROCR_VISIBLE_DEVICES=""
            # Glob the render nodes directly instead of parsing `ls -la`
            # columns, whose layout depends on the ls implementation and
            # time format.
            for node in /dev/dri/renderD*; do
              # An unmatched glob stays literal; skip it when no node exists.
              [ -e "${node}" ] || continue
              device="${node##*/}"
              GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
              # Also Set the GPU isolation environment variables. Ref: https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html
              # Keep only the digits of the node name (renderD128 -> 128).
              GPU_XCD=$(echo "${device}" | tr -d -c 0-9)
              # NOTE(review): assumes render nodes are numbered from 128 with
              # 8 nodes per physical GPU - confirm for the node's partition mode.
              PHYSICAL_GPU_NUMBER=$(( GPU_XCD % 128 / 8))
              if [ -z "${ROCR_VISIBLE_DEVICES}" ]; then
                ROCR_VISIBLE_DEVICES="${PHYSICAL_GPU_NUMBER}"
              else
                ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES},${PHYSICAL_GPU_NUMBER}"
              fi
            done
            echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
            echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}" > /etc/podinfo/gha-gpu-isolation-settings
            echo "HIP_VISIBLE_DEVICES=0" >> /etc/podinfo/gha-gpu-isolation-settings

            # Wait for Docker to be ready before starting runner
            echo "Waiting for docker..."
            until docker info >/dev/null 2>&1; do sleep 5; done
            /home/runner/run.sh
        resources:
          requests:
            cpu: 10000m
            memory: 200000Mi
            ephemeral-storage: "200G"
            amd.com/gpu: 1
          limits:
            ephemeral-storage: "200G"
            amd.com/gpu: 1
        env:
          # Point the docker CLI at the socket served by the dind sidecar.
          - name: DOCKER_HOST
            value: unix:///var/run/docker.sock
          # Expanded by subPathExpr in the docker-data mount below.
          - name: POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: NODE_NAME
            valueFrom:
              fieldRef:
                fieldPath: spec.nodeName
        volumeMounts:
          - name: work
            mountPath: /home/runner/_work
          - name: dind-sock
            mountPath: /var/run
          - name: dind-externals
            mountPath: /home/runner/externals
          - name: podinfo
            mountPath: /etc/podinfo
          - name: docker-data
            mountPath: /home/runner/docker-data
            subPathExpr: $(POD_NAME)-docker
          - name: ci-data
            mountPath: /mnt
            readOnly: false
    volumes:
      - name: work
        emptyDir: {}
      - name: dind-sock
        emptyDir: {}
      - name: dind-externals
        emptyDir: {}
      - name: docker-data
        emptyDir: {}
      - name: podinfo
        emptyDir: {}
      # Host directory mounted at /mnt in both containers; `type: Directory`
      # requires the path to already exist on the node.
      - name: ci-data
        hostPath:
          path: /data/huggingface-ci
          type: Directory

githubConfigUrl: https://github.com/huggingface
githubConfigSecret: arc-rs-hf-secret
runnerGroup: amd-mi300-runners
145 changes: 145 additions & 0 deletions config-files/huggingface/linux-mi300-2gpu-ossci-hf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Helm values for a gha-runner-scale-set (Actions Runner Controller) release
# that provides dual-GPU AMD MI300 runners to the huggingface organization.
#
# Cluster: Digital Ocean
# Deployment command:
#   sudo helm upgrade --install "amd-mi300-ci-2gpu" --namespace "arc-hf-gpu-2" oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set -f <path-to-this-file>

# minRunners is the minimum number of idle runners. The target number of
# runners created is the sum of minRunners and the number of jobs assigned to
# the scale set; maxRunners is the upper bound for the scale set.
maxRunners: 6
minRunners: 1

controllerServiceAccount:
  name: arc-gha-rs-controller
  namespace: arc

template:
  spec:
    securityContext:
      # Same GID the runner script passes to job containers via
      # `--group-add 110`; presumably the node's render group - confirm.
      supplementalGroups:
        - 110
    initContainers:
      # Copies the runner "externals" out of the runner image into a shared
      # emptyDir, which is later mounted at /home/runner/externals.
      - name: init-dind-externals
        image: ghcr.io/saienduri/ghascale-rocm-dev:main
        imagePullPolicy: Always
        command:
          ["cp", "-r", "/home/runner/externals/.", "/home/runner/tmpDir/"]
        volumeMounts:
          - name: dind-externals
            mountPath: /home/runner/tmpDir
      # Docker-in-Docker daemon. `restartPolicy: Always` on an init container
      # makes it a Kubernetes sidecar that keeps running beside the runner.
      - name: dind
        image: ghcr.io/saienduri/dind:main
        restartPolicy: Always
        command: ["sh", "-c"]
        args:
          - |
            dockerd --host=unix:///var/run/docker.sock --group=${DOCKER_GROUP_GID} --data-root=/home/runner/docker-data &
            until docker info >/dev/null 2>&1; do sleep 5; done
            tail -f /dev/null
        env:
          # Group that owns the Docker socket (dockerd --group).
          - name: DOCKER_GROUP_GID
            value: "123"
          # Expanded by subPathExpr in the docker-data mount below.
          - name: POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
        securityContext:
          privileged: true
        volumeMounts:
          - name: docker-data
            mountPath: /home/runner/docker-data
            subPathExpr: $(POD_NAME)-docker
          - name: work
            mountPath: /home/runner/_work
          - name: dind-sock
            mountPath: /var/run
          - name: dind-externals
            mountPath: /home/runner/externals
          - name: podinfo
            mountPath: /etc/podinfo
          - name: ci-data
            mountPath: /mnt
            readOnly: false
        lifecycle:
          # Remove the Docker data root when the container is stopped.
          preStop:
            exec:
              command: ["rm", "-rf", "/home/runner/docker-data"]
    containers:
      - name: runner
        image: ghcr.io/saienduri/ghascale-rocm-dev:main
        imagePullPolicy: Always
        command:
          - /bin/sh
          - -c
          - |
            # Build the docker flags and GPU isolation settings for the render
            # nodes visible in this pod, and publish them under /etc/podinfo.
            GHA_RENDER_DEVICES="--group-add 110"
            ROCR_VISIBLE_DEVICES=""
            # Glob the render nodes directly instead of parsing `ls -la`
            # columns, whose layout depends on the ls implementation and
            # time format.
            for node in /dev/dri/renderD*; do
              # An unmatched glob stays literal; skip it when no node exists.
              [ -e "${node}" ] || continue
              device="${node##*/}"
              GHA_RENDER_DEVICES="${GHA_RENDER_DEVICES} --device /dev/dri/${device}"
              # Also Set the GPU isolation environment variables. Ref: https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html
              # Keep only the digits of the node name (renderD128 -> 128).
              GPU_XCD=$(echo "${device}" | tr -d -c 0-9)
              # NOTE(review): assumes render nodes are numbered from 128 with
              # 8 nodes per physical GPU - confirm for the node's partition mode.
              PHYSICAL_GPU_NUMBER=$(( GPU_XCD % 128 / 8))
              if [ -z "${ROCR_VISIBLE_DEVICES}" ]; then
                ROCR_VISIBLE_DEVICES="${PHYSICAL_GPU_NUMBER}"
              else
                ROCR_VISIBLE_DEVICES="${ROCR_VISIBLE_DEVICES},${PHYSICAL_GPU_NUMBER}"
              fi
            done
            echo "${GHA_RENDER_DEVICES}" > /etc/podinfo/gha-render-devices
            echo "ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES}" > /etc/podinfo/gha-gpu-isolation-settings
            echo "HIP_VISIBLE_DEVICES=0,1" >> /etc/podinfo/gha-gpu-isolation-settings

            # Wait for Docker to be ready before starting runner
            echo "Waiting for docker..."
            until docker info >/dev/null 2>&1; do sleep 5; done
            /home/runner/run.sh
        resources:
          requests:
            cpu: 20000m
            memory: 400000Mi
            ephemeral-storage: "400G"
            amd.com/gpu: 2
          limits:
            ephemeral-storage: "400G"
            amd.com/gpu: 2
        env:
          # Point the docker CLI at the socket served by the dind sidecar.
          - name: DOCKER_HOST
            value: unix:///var/run/docker.sock
          # Expanded by subPathExpr in the docker-data mount below.
          - name: POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: NODE_NAME
            valueFrom:
              fieldRef:
                fieldPath: spec.nodeName
        volumeMounts:
          - name: work
            mountPath: /home/runner/_work
          - name: dind-sock
            mountPath: /var/run
          - name: dind-externals
            mountPath: /home/runner/externals
          - name: podinfo
            mountPath: /etc/podinfo
          - name: docker-data
            mountPath: /home/runner/docker-data
            subPathExpr: $(POD_NAME)-docker
          - name: ci-data
            mountPath: /mnt
            readOnly: false
    volumes:
      - name: work
        emptyDir: {}
      - name: dind-sock
        emptyDir: {}
      - name: dind-externals
        emptyDir: {}
      - name: docker-data
        emptyDir: {}
      - name: podinfo
        emptyDir: {}
      # Host directory mounted at /mnt in both containers; `type: Directory`
      # requires the path to already exist on the node.
      - name: ci-data
        hostPath:
          path: /data/huggingface-ci
          type: Directory

githubConfigUrl: https://github.com/huggingface
githubConfigSecret: arc-rs-hf-secret
runnerGroup: amd-mi300-runners