
Commit 1f8c6c0: Update ray template
Signed-off-by: Hemil Desai <[email protected]>
Parent: f104fe6

3 files changed: 264 additions, 33 deletions


nemo_run/run/ray/templates/ray.sub.j2

Lines changed: 88 additions & 11 deletions
@@ -28,17 +28,36 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
 
 # Ports for the head node
-PORT=${PORT:-6379}
+PORT=${PORT:-54514}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
 DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
+RAY_DEBUGGER_ARGS=
+if [ "${RAY_DEBUG:-}" = "legacy" ]; then
+    RAY_DEBUGGER_ARGS="--ray-debugger-external"
+fi
+
+# After ray>=2.47, this feature is enabled by default, which creates uv venvs for any py_executable starting with `uv run`.
+# There are severe contention and performance issues with this enabled, since our dependencies are so large and occasionally
+# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
+export RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
+
+# Setting ulimit is recommended by the Ray best practices page
+# @ https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html
+# It's session-based and won't affect the system outside the script.
+# Ensure that the soft limit isn't above the hard limit.
+if [[ $(ulimit -Hn) == "unlimited" ]] || [[ 65535 -lt $(ulimit -Hn) ]]; then
+    ulimit -Sn 65535
+elif [[ $(ulimit -Hn) != "unlimited" ]] && [[ $(ulimit -Hn) -lt 65535 ]]; then
+    echo "[WARNING]: Cannot increase ulimit on file descriptors to 65535 according to the Ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase it, otherwise Ray may crash unexpectedly."
+fi
 
 # On our clusters, the largest port range on an idle worker appeared between 52369-64607
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
 
 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
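Note on the new defaults above: the head's GCS port (PORT=54514) now sits immediately above the widened worker port range (54001-54513), so the two cannot overlap. A hypothetical sanity check, not part of the template, that mirrors those defaults:

# Hypothetical check (illustration only): the head GCS port must stay outside the worker port range.
PORT=${PORT:-54514}
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
if (( PORT >= MIN_WORKER_PORT && PORT <= MAX_WORKER_PORT )); then
    echo "[ERROR] Head port $PORT collides with worker port range ${MIN_WORKER_PORT}-${MAX_WORKER_PORT}" >&2
    exit 1
fi

The debugger toggle added above would presumably be driven by exporting RAY_DEBUG=legacy in the job's environment before the rendered script runs; any other value leaves RAY_DEBUGGER_ARGS empty.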
@@ -82,13 +101,66 @@ gpus_per_node=8
 
 num_retries={{ num_retries }}
 
+# Track backgrounded srun client PIDs for head and workers
+declare -A SRUN_PIDS
+
+# Verify all backgrounded srun client processes are still alive; exit fast if any died
+check_srun_processes() {
+    for name in "${!SRUN_PIDS[@]}"; do
+        pid="${SRUN_PIDS[$name]}"
+        # Check if the process is still running
+        if ! kill -0 "$pid" 2>/dev/null; then
+            echo "[ERROR] Background srun '$name' died (pid=$pid). This could be a failure at startup or an issue with the node preventing the srun from starting. Attempting to exit." >&2
+            # Signal sidecars inside containers to terminate ASAP
+            touch "$LOG_DIR/ENDED"
+            exit 1
+        fi
+    done
+}
+
 # Getting the node names and IP addresses in the SLURM allocation
 nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 nodes_array=($nodes)
 ip_addresses_array=()
 
 for node in $nodes; do
-    ip_address=$(host $node | awk '/has address/ { print $4 }')
+    # Try multiple methods to get the IP address - ENHANCED VERSION v2.0
+    echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
+    ip_address=""
+
+    # Method 1: Try the host command
+    echo "[DEBUG] Method 1: host command"
+    ip_address=$(host $node 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
+    echo "[DEBUG] host result: '$ip_address'"
+
+    # Method 2: If host fails, try getent
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 2: getent hosts"
+        ip_address=$(getent hosts $node 2>/dev/null | awk '{ print $1 }' | head -1 || true)
+        echo "[DEBUG] getent result: '$ip_address'"
+    fi
+
+    # Method 3: If getent fails, try nslookup
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 3: nslookup"
+        ip_address=$(nslookup $node 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
+        echo "[DEBUG] nslookup result: '$ip_address'"
+    fi
+
+    # Method 4: If all DNS methods fail, try ping to extract the IP
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 4: ping"
+        ip_address=$(ping -c 1 $node 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
+        echo "[DEBUG] ping result: '$ip_address'"
+    fi
+
+    # If there is still no IP, use the hostname itself (it might already be an IP or otherwise resolvable)
+    if [[ -z "$ip_address" ]]; then
+        echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
+        ip_address=$node
+    fi
+
+    echo "[INFO] Node: $node -> IP: $ip_address"
     # Add the IP address to the array
     ip_addresses_array+=("$ip_address")
 done
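The liveness check added above relies on kill -0, which sends no signal and only reports whether the PID still exists. A minimal standalone sketch of the same pattern (hypothetical stand-in process, not part of the template):

#!/bin/bash
# Sketch of the SRUN_PIDS / kill -0 pattern: a plain background 'sleep' stands in
# for the backgrounded srun clients that the template tracks.
declare -A SRUN_PIDS
sleep 5 &                     # stand-in for: srun ... bash -x -c "$head_cmd" &
SRUN_PIDS["example-head"]=$!

while true; do
    for name in "${!SRUN_PIDS[@]}"; do
        if ! kill -0 "${SRUN_PIDS[$name]}" 2>/dev/null; then
            echo "[ERROR] Background process '$name' exited" >&2
            exit 1
        fi
    done
    sleep 2
done

Since kill -0 only proves the PID exists, the template still waits on the STARTED marker file and the worker_units count for positive readiness; the new check just surfaces a dead srun early instead of letting the wait loops spin.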
@@ -184,12 +256,13 @@ ray start --head \
   --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
   --dashboard-port=${DASHBOARD_PORT} \
   \
-  --node-manager-port=${NODE_MANAGER_PORT} \
-  --object-manager-port=${OBJECT_MANAGER_PORT} \
-  --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
-  --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
-  --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
-  --metrics-export-port=${METRICS_EXPORT_PORT} \
+  --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
+  --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
+  --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
+  --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
+  --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
+  --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
+  $RAY_DEBUGGER_ARGS \
   \
   --block
 EOFINNER
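The head now offsets its node-manager, object-manager, agent, and metrics ports by one relative to the shared defaults, presumably so they cannot clash with a worker later co-scheduled on the same node with the unshifted values (see the OVERLAP_HEAD_AND_WORKER_ARG handling further down). The offset is plain bash arithmetic expansion; for illustration only, using the METRICS_EXPORT_PORT default visible above:

# Illustration only (not part of the template): the $(( ... + 1 )) offset applied to the head's ports.
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
echo "worker metrics-export port: ${METRICS_EXPORT_PORT}"
echo "head metrics-export port:   $((${METRICS_EXPORT_PORT} + 1))"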
@@ -207,6 +280,7 @@ exit 1
 EOF
 )
 srun {{ common_srun_args }} --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/{{ ray_log_prefix }}head.log bash -x -c "$head_cmd" &
+SRUN_PIDS["ray-head"]=$!
 
 # Wait for the head node container to start and for Ray to be ready
 elapsed_time=0
@@ -217,6 +291,7 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
         exit 1
     fi
     echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
+    check_srun_processes
     sleep 2
     elapsed_time=$((elapsed_time + 2))
 done
@@ -261,7 +336,6 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
 
 cat <<EOFINNER | tee /launch-worker.sh
-sleep 5
 ray start --address "$ip_head" \
   --disable-usage-stats \
   --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -274,6 +348,7 @@ ray start --address "$ip_head" \
   --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
   --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
   --metrics-export-port=${METRICS_EXPORT_PORT} \
+  $RAY_DEBUGGER_ARGS \
   \
   --block
 EOFINNER
@@ -293,6 +368,7 @@ EOF
         OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
     fi
     srun {{ common_srun_args }} ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/{{ ray_log_prefix }}worker-$i.log bash -x -c "$worker_cmd" &
+    SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done

@@ -316,9 +392,10 @@ extract_worker_units() {
 while true; do
     worker_units=$(extract_worker_units)
     echo "[INFO] Number of actors online: $worker_units/$NUM_ACTORS"
-    if [ "$worker_units" -eq "$NUM_ACTORS" ]; then
+    if [[ "$worker_units" -eq "$NUM_ACTORS" ]]; then
         break
     fi
+    check_srun_processes
     sleep 2
 done

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 88 additions & 11 deletions
@@ -30,17 +30,36 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
 METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}
 
 # Ports for the head node
-PORT=${PORT:-6379}
+PORT=${PORT:-54514}
 RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
 #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ??
 DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
 DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
+RAY_DEBUGGER_ARGS=
+if [ "${RAY_DEBUG:-}" = "legacy" ]; then
+    RAY_DEBUGGER_ARGS="--ray-debugger-external"
+fi
+
+# After ray>=2.47, this feature is enabled by default, which creates uv venvs for any py_executable starting with `uv run`.
+# There are severe contention and performance issues with this enabled, since our dependencies are so large and occasionally
+# need to be compiled, so NeMo RL has an implementation in nemo_rl/utils/venv.py that does it once per node as opposed to once per task.
+export RAY_ENABLE_UV_RUN_RUNTIME_ENV=0
+
+# Setting ulimit is recommended by the Ray best practices page
+# @ https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html
+# It's session-based and won't affect the system outside the script.
+# Ensure that the soft limit isn't above the hard limit.
+if [[ $(ulimit -Hn) == "unlimited" ]] || [[ 65535 -lt $(ulimit -Hn) ]]; then
+    ulimit -Sn 65535
+elif [[ $(ulimit -Hn) != "unlimited" ]] && [[ $(ulimit -Hn) -lt 65535 ]]; then
+    echo "[WARNING]: Cannot increase ulimit on file descriptors to 65535 according to the Ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase it, otherwise Ray may crash unexpectedly."
+fi
 
 # On our clusters, the largest port range on an idle worker appeared between 52369-64607
 # (not including the other ports set by this script). So this range is chosen to be
 # somewhere in the middle
 MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
-MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
+MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
 
 # Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
 RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -84,13 +103,66 @@ gpus_per_node=8
 
 num_retries=1
 
+# Track backgrounded srun client PIDs for head and workers
+declare -A SRUN_PIDS
+
+# Verify all backgrounded srun client processes are still alive; exit fast if any died
+check_srun_processes() {
+    for name in "${!SRUN_PIDS[@]}"; do
+        pid="${SRUN_PIDS[$name]}"
+        # Check if the process is still running
+        if ! kill -0 "$pid" 2>/dev/null; then
+            echo "[ERROR] Background srun '$name' died (pid=$pid). This could be a failure at startup or an issue with the node preventing the srun from starting. Attempting to exit." >&2
+            # Signal sidecars inside containers to terminate ASAP
+            touch "$LOG_DIR/ENDED"
+            exit 1
+        fi
+    done
+}
+
 # Getting the node names and IP addresses in the SLURM allocation
 nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
 nodes_array=($nodes)
 ip_addresses_array=()
 
 for node in $nodes; do
-    ip_address=$(host $node | awk '/has address/ { print $4 }')
+    # Try multiple methods to get the IP address - ENHANCED VERSION v2.0
+    echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
+    ip_address=""
+
+    # Method 1: Try the host command
+    echo "[DEBUG] Method 1: host command"
+    ip_address=$(host $node 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
+    echo "[DEBUG] host result: '$ip_address'"
+
+    # Method 2: If host fails, try getent
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 2: getent hosts"
+        ip_address=$(getent hosts $node 2>/dev/null | awk '{ print $1 }' | head -1 || true)
+        echo "[DEBUG] getent result: '$ip_address'"
+    fi
+
+    # Method 3: If getent fails, try nslookup
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 3: nslookup"
+        ip_address=$(nslookup $node 2>/dev/null | awk '/^Address: / { print $2 }' | head -1 || true)
+        echo "[DEBUG] nslookup result: '$ip_address'"
+    fi
+
+    # Method 4: If all DNS methods fail, try ping to extract the IP
+    if [[ -z "$ip_address" ]]; then
+        echo "[DEBUG] Method 4: ping"
+        ip_address=$(ping -c 1 $node 2>/dev/null | grep "PING" | sed 's/.*(\([^)]*\)).*/\1/' || true)
+        echo "[DEBUG] ping result: '$ip_address'"
+    fi
+
+    # If there is still no IP, use the hostname itself (it might already be an IP or otherwise resolvable)
+    if [[ -z "$ip_address" ]]; then
+        echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
+        ip_address=$node
+    fi
+
+    echo "[INFO] Node: $node -> IP: $ip_address"
     # Add the IP address to the array
     ip_addresses_array+=("$ip_address")
 done
@@ -178,12 +250,13 @@ ray start --head \
   --ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
   --dashboard-port=${DASHBOARD_PORT} \
   \
-  --node-manager-port=${NODE_MANAGER_PORT} \
-  --object-manager-port=${OBJECT_MANAGER_PORT} \
-  --runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
-  --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
-  --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
-  --metrics-export-port=${METRICS_EXPORT_PORT} \
+  --node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
+  --object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
+  --runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
+  --dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
+  --dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
+  --metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
+  $RAY_DEBUGGER_ARGS \
   \
   --block
 EOFINNER
@@ -201,6 +274,7 @@ exit 1
 EOF
 )
 srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
+SRUN_PIDS["ray-head"]=$!
 
 # Wait for the head node container to start and for Ray to be ready
 elapsed_time=0
@@ -211,6 +285,7 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
         exit 1
     fi
     echo "[INFO][$(date)] Waiting for Ray head node container to start and be ready... ($elapsed_time/$RAY_HEAD_START_TIMEOUT seconds)"
+    check_srun_processes
     sleep 2
     elapsed_time=$((elapsed_time + 2))
 done
@@ -251,7 +326,6 @@ monitor-sidecar &
 sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
 
 cat <<EOFINNER | tee /launch-worker.sh
-sleep 5
 ray start --address "$ip_head" \
   --disable-usage-stats \
   --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
@@ -264,6 +338,7 @@ ray start --address "$ip_head" \
   --dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
   --dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
   --metrics-export-port=${METRICS_EXPORT_PORT} \
+  $RAY_DEBUGGER_ARGS \
   \
   --block
 EOFINNER
@@ -283,6 +358,7 @@ EOF
         OVERLAP_HEAD_AND_WORKER_ARG="--overlap"
     fi
     srun --container-image=nvcr.io/nvidia/pytorch:24.01-py3 --no-container-mount-home --mpi=pmix -A=test_account -p=gpu --gres=gpu:8 --container-mounts /tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster:/tmp/test_jobs/test-ray-cluster,/tmp/test_jobs/test-ray-cluster/logs:/tmp/test_jobs/test-ray-cluster/logs --container-workdir=/workspace ${OVERLAP_HEAD_AND_WORKER_ARG:-} --container-name=ray-worker-$i --exact --nodes=1 --ntasks=1 --cpus-per-task=$((16 * gpus_per_node)) -w "$node_i" -o $LOG_DIR/ray-worker-$i.log bash -x -c "$worker_cmd" &
+    SRUN_PIDS["ray-worker-$i"]=$!
     sleep 3
 done

@@ -306,9 +382,10 @@ extract_worker_units() {
 while true; do
     worker_units=$(extract_worker_units)
     echo "[INFO] Number of actors online: $worker_units/$NUM_ACTORS"
-    if [ "$worker_units" -eq "$NUM_ACTORS" ]; then
+    if [[ "$worker_units" -eq "$NUM_ACTORS" ]]; then
         break
     fi
+    check_srun_processes
     sleep 2
 done
