Commit 6a40247

revert: "chore: improve ray.sub generalization across clusters" (#1505)
Signed-off-by: Terry Kong <[email protected]>
Parent commit: 6a035bc

ray.sub

Lines changed: 39 additions & 12 deletions
@@ -24,6 +24,25 @@
 
 set -eoux pipefail
 
+########################################################
+# Function to detect if the SLURM cluster uses GRES
+########################################################
+maybe_gres_arg() {
+    # Check whether any nodes in the partition have GRES configured.
+    # Assumes a homogeneous allocation (not a heterogeneous job).
+    if sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep -q "gpu:"; then
+        # Quick assert that the GRES GPU count matches GPUS_PER_NODE; it is probably a user error if GPUS_PER_NODE is not set to claim all GPUs on a cluster that supports --gres=gpu:8 or gpu:a100:8.
+        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}') ]]; then
+            echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:"), meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
+            exit 1
+        fi
+        echo "--gres=gpu:${GPUS_PER_NODE}"
+        return
+    fi
+
+    # No GRES support detected
+    echo ""
+}
 
 ########################################################
 # User defined variables
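
For context (an illustrative example outside this commit; the partition name is a placeholder): on a GRES-enabled partition, sinfo's "%G" field typically reports values such as gpu:8 or gpu:a100:8, so the awk -F: '{print $NF}' step pulls out the trailing GPU count that maybe_gres_arg compares against GPUS_PER_NODE:

    # Hypothetical output on a GRES-enabled partition named "batch"
    $ sinfo -p batch -h -o "%G"
    gpu:a100:8
    $ sinfo -p batch -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}'
    8
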
@@ -95,7 +114,15 @@ mkdir -p $LOG_DIR
 # Number of GPUs per worker node
 GPUS_PER_NODE=${GPUS_PER_NODE:-8}
 
-COMMON_SRUN_ARGS=""
+# Detect GRES support and set GRES_ARG
+GRES_ARG=$(maybe_gres_arg)
+if [[ -n "$GRES_ARG" ]]; then
+    echo "[INFO] GRES support detected. Using: $GRES_ARG"
+else
+    echo "[INFO] No GRES support detected. Running without --gres flag."
+fi
+
+COMMON_SRUN_ARGS="$GRES_ARG"
 COMMON_SRUN_ARGS+=" --no-container-mount-home"
 COMMON_SRUN_ARGS+=" --mpi=pmix"
 COMMON_SRUN_ARGS+=" --container-mounts=$MOUNTS"
@@ -104,8 +131,8 @@ COMMON_SRUN_ARGS+=" --container-workdir=$SLURM_SUBMIT_DIR"
 # TODO: delete these (just for debugging)
 COMMON_SRUN_ARGS+=" -p $SLURM_JOB_PARTITION"
 COMMON_SRUN_ARGS+=" -A $SLURM_JOB_ACCOUNT"
-# Claim all the CPU/memory/GPUs on the node
-COMMON_SRUN_ARGS+=" --exclusive"
+# Number of CPUs per worker node
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((GPUS_PER_NODE * 16))}
 
 num_retries=3
 
@@ -275,7 +302,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!
 
 NUM_ACTORS=$((GPUS_PER_NODE * SLURM_JOB_NUM_NODES))
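
As a usage sketch (an assumed invocation, not shown in this diff; the account, partition, node count, and command are placeholders): GPUS_PER_NODE, CPUS_PER_WORKER, and COMMAND are read from the environment, with CPUS_PER_WORKER defaulting to GPUS_PER_NODE * 16, so a submission could look roughly like:

    # Hypothetical sbatch invocation of ray.sub
    GPUS_PER_NODE=8 CPUS_PER_WORKER=128 COMMAND="python examples/train.py" \
        sbatch -A my_account -p batch -N 2 ray.sub
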
@@ -374,13 +401,13 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --nodes=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
 SRUN_PIDS["ray-worker-$i"]=$!
 sleep 3
 done
 
 # Then we wait here for the file to be created by the head node container
-while check_srun_processes && ! srun --overlap --nodes=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
+while check_srun_processes && ! srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
 echo "[INFO][$(date)] Waiting for head node container to start..."
 sleep 2
 done
@@ -389,7 +416,7 @@ done
 # Before we launch a job on this cluster we need to make sure that the bringup is complete
 # We do so by querying the number of worker_units in the ray cluster and asserting = NUM_ACTORS
 extract_worker_units() {
-    status_output=$(srun --overlap --container-name=ray-head --nodes=1 -w "$head_node" ray status)
+    status_output=$(srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status)
     if echo "$status_output" | grep -q "worker_units"; then
         worker_units=$(echo "$status_output" | grep "worker_units" | awk -F'[/. ]' '{print $4}')
         echo $worker_units
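
For reference (a hypothetical "ray status" snippet; the exact layout depends on the Ray version): the Usage section reports the custom worker_units resource on a line such as

     0.0/16.0 worker_units

and awk -F'[/. ]' '{print $4}' splits that line on '/', '.', and spaces, so the fourth field is the total worker_units (16 here), which the script then compares against NUM_ACTORS.
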
@@ -419,7 +446,7 @@ echo "All workers connected!"
 # This driver process is responsible for launching a job on the Ray cluster
 CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
 if [[ -n "$COMMAND" ]]; then
-    srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
+    srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
 else
     echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
     cat <<EOF >$SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
@@ -430,9 +457,9 @@ WORKER_NUM=\${1:-}
 if [[ -z "\$WORKER_NUM" ]]; then
     # Empty means we are on the head node
     if [[ -n "\${COMMAND:-}" ]]; then
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
     else
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
     fi
 else
     # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
@@ -443,9 +470,9 @@ else
     fi
     nodes_array=($nodes)
     if [[ -n "\${COMMAND:-}" ]]; then
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
     else
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
     fi
 fi
 EOF
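
Usage note (inferred from the generated attach script; the job id is a placeholder): the script writes ${SLURM_JOB_ID}-attach.sh into the submit directory, so attaching later looks roughly like:

    # Run from the Slurm login node
    bash 123456-attach.sh        # shell into the ray-head container
    bash 123456-attach.sh 1      # shell into the ray-worker-1 container
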
