 
 set -eoux pipefail
 
+#########################################################
+# Function to detect if SLURM cluster uses GRES
+#########################################################
+maybe_gres_arg() {
+    # Check if any nodes in the partition have GRES configured
+    # Assumes a homogeneous allocation (not a heterogeneous job)
+    if sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep -q "gpu:"; then
+        # Quick sanity check that GPUS_PER_NODE matches the GRES count. It is probably a user error if GPUS_PER_NODE does not fully claim the GPUs the nodes advertise (e.g. --gres=gpu:8 or gpu:a100:8).
+        if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}') ]]; then
+            echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:") meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
+            exit 1
+        fi
+        echo "--gres=gpu:${GPUS_PER_NODE}"
+        return
+    fi
+
+    # No GRES support detected
+    echo ""
+}
 
 #########################################################
 # User defined variables
@@ -95,7 +114,15 @@ mkdir -p $LOG_DIR
 # Number of GPUs per worker node
 GPUS_PER_NODE=${GPUS_PER_NODE:-8}
 
-COMMON_SRUN_ARGS=""
+# Detect GRES support and set GRES_ARG
+GRES_ARG=$(maybe_gres_arg)
+if [[ -n "$GRES_ARG" ]]; then
+    echo "[INFO] GRES support detected. Using: $GRES_ARG"
+else
+    echo "[INFO] No GRES support detected. Running without --gres flag."
+fi
+
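+# GRES_ARG is "--gres=gpu:<GPUS_PER_NODE>" on GRES-enabled clusters and empty otherwise, so the assignment below adds nothing when the cluster has no GRES.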
+COMMON_SRUN_ARGS="$GRES_ARG"
 COMMON_SRUN_ARGS+=" --no-container-mount-home"
 COMMON_SRUN_ARGS+=" --mpi=pmix"
 COMMON_SRUN_ARGS+=" --container-mounts=$MOUNTS"
@@ -104,8 +131,8 @@ COMMON_SRUN_ARGS+=" --container-workdir=$SLURM_SUBMIT_DIR"
 # TODO: delete these (just for debugging)
 COMMON_SRUN_ARGS+=" -p $SLURM_JOB_PARTITION"
 COMMON_SRUN_ARGS+=" -A $SLURM_JOB_ACCOUNT"
-# Claim all the CPU/memory/GPUs on the node
-COMMON_SRUN_ARGS+=" --exclusive"
+# Number of CPUs per worker node
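+# Defaults to 16 CPUs per GPU if CPUS_PER_WORKER is not set in the environment.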
+CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((GPUS_PER_NODE * 16))}
 
 num_retries=3
 
@@ -275,7 +302,7 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
 SRUN_PIDS["ray-head"]=$!
 
 NUM_ACTORS=$((GPUS_PER_NODE * SLURM_JOB_NUM_NODES))
@@ -374,13 +401,13 @@ touch $LOG_DIR/ENDED
 exit 1
 EOF
 )
-    srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --nodes=1 -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
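+    # --exact together with --ntasks=1/--cpus-per-task limits this step to the resources it requests (rather than the whole node), so other srun steps can share the allocation.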
+    srun $COMMON_SRUN_ARGS --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$CPUS_PER_WORKER -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
     SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done
 
 # Then we wait here for the file to be created by the head node container
-while check_srun_processes && ! srun --overlap --nodes=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
+while check_srun_processes && ! srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STARTED_RAY_HEAD; do
     echo "[INFO][$(date)] Waiting for head node container to start..."
     sleep 2
 done
 # Before we launch a job on this cluster we need to make sure that the bringup is complete
 # We do so by querying the number of worker_units in the ray cluster and asserting = NUM_ACTORS
 extract_worker_units() {
-    status_output=$(srun --overlap --container-name=ray-head --nodes=1 -w "$head_node" ray status)
+    status_output=$(srun --overlap --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" ray status)
     if echo "$status_output" | grep -q "worker_units"; then
         worker_units=$(echo "$status_output" | grep "worker_units" | awk -F'[/. ]' '{print $4}')
         echo $worker_units
@@ -419,7 +446,7 @@ echo "All workers connected!"
 # This driver process is responsible for launching a job on the Ray cluster
 CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
 if [[ -n "$COMMAND" ]]; then
-    srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
+    srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
 else
     echo "[INFO]: Ray Cluster is idled, run this on the slurm head node to get a shell to the head node:"
     cat << EOF >$SLURM_SUBMIT_DIR/${SLURM_JOB_ID}-attach.sh
@@ -430,9 +457,9 @@ WORKER_NUM=\${1:-}
 if [[ -z "\$WORKER_NUM" ]]; then
     # Empty means we are on the head node
     if [[ -n "\${COMMAND:-}" ]]; then
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
     else
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" --jobid $SLURM_JOB_ID --pty bash
     fi
 else
     # Worker numbers 1 through N-1 correspond to ray-worker-1 through ray-worker-(N-1)
     fi
     nodes_array=($nodes)
     if [[ -n "\${COMMAND:-}" ]]; then
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID bash -c "\$COMMAND"
     else
-        srun --no-container-mount-home -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
+        srun --no-container-mount-home $GRES_ARG -A $SLURM_JOB_ACCOUNT -p $SLURM_JOB_PARTITION --overlap --container-name=ray-worker-\$WORKER_NUM --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "\${nodes_array[\$WORKER_NUM]}" --jobid $SLURM_JOB_ID --pty bash
     fi
 fi
 EOF