You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
echo"[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
54
+
fi
36
55
37
56
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
38
57
# (not including the other ports set by this script). So this range is chosen to be
39
58
# somewhere in the middle
40
59
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
41
-
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
60
+
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
42
61
43
62
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
44
63
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -82,13 +101,66 @@ gpus_per_node=8
82
101
83
102
num_retries={{ num_retries }}
84
103
104
+
# Track backgrounded srun client PIDs for head and workers.
# Keys are human-readable labels for each srun, values are the PIDs of the
# backgrounded srun client processes.
declare -A SRUN_PIDS

#######################################
# Verify all backgrounded srun client processes are still alive; exit fast
# if any of them died.
# Globals:
#   SRUN_PIDS (read) - associative array: label -> PID of a backgrounded srun
#   LOG_DIR   (read) - directory in which the ENDED marker file is created
# Arguments:
#   None
# Outputs:
#   Writes an [ERROR] line to stderr for the first dead process found.
# Returns:
#   0 when every tracked PID still accepts signal 0. Otherwise touches
#   "$LOG_DIR/ENDED" and terminates the calling shell with status 1.
#######################################
check_srun_processes() {
  local name pid
  for name in "${!SRUN_PIDS[@]}"; do
    pid="${SRUN_PIDS[$name]}"
    # Signal 0 delivers nothing; it only reports whether the PID can be signalled.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[ERROR] Background srun '$name' died (pid=$pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
      # Signal sidecars inside containers to terminate ASAP
      touch "$LOG_DIR/ENDED"
      exit 1
    fi
  done
}
120
+
85
121
# Getting the node names and IP addresses in the SLURM allocation
86
122
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
87
123
nodes_array=($nodes)
88
124
ip_addresses_array=()
89
125
90
126
fornodein$nodes;do
91
-
ip_address=$(getent hosts "$node"| awk '{print $1}'| head -n1)
127
+
# Try multiple methods to get IP address - ENHANCED VERSION v2.0
128
+
echo"[DEBUG] Resolving hostname: $node using enhanced resolution methods"
echo"[WARNING]: Cannot increase ulimit on file descriptors to 65535 according ray recommendation: https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html. Speak to cluster admins to increase, otherwise ray may crash unexpectedly."
56
+
fi
38
57
39
58
# On our clusters, the largest port range on an idle worker appeared between 52369-64607
40
59
# (not including the other ports set by this script). So this range is chosen to be
41
60
# somewhere in the middle
42
61
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
43
-
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
62
+
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54513}
44
63
45
64
# Ray temp directory (inside container). Used by --temp-dir and log sync sidecar
46
65
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}
@@ -84,13 +103,66 @@ gpus_per_node=8
84
103
85
104
num_retries=1
86
105
106
+
# Track backgrounded srun client PIDs for head and workers.
# Keys are human-readable labels for each srun, values are the PIDs of the
# backgrounded srun client processes.
declare -A SRUN_PIDS

#######################################
# Verify all backgrounded srun client processes are still alive; exit fast
# if any of them died.
# Globals:
#   SRUN_PIDS (read) - associative array: label -> PID of a backgrounded srun
#   LOG_DIR   (read) - directory in which the ENDED marker file is created
# Arguments:
#   None
# Outputs:
#   Writes an [ERROR] line to stderr for the first dead process found.
# Returns:
#   0 when every tracked PID still accepts signal 0. Otherwise touches
#   "$LOG_DIR/ENDED" and terminates the calling shell with status 1.
#######################################
check_srun_processes() {
  local name pid
  for name in "${!SRUN_PIDS[@]}"; do
    pid="${SRUN_PIDS[$name]}"
    # Signal 0 delivers nothing; it only reports whether the PID can be signalled.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo "[ERROR] Background srun '$name' died (pid=$pid). Could be a failure in startup or an issue with the node preventing the srun to start. Attempting to exit." >&2
      # Signal sidecars inside containers to terminate ASAP
      touch "$LOG_DIR/ENDED"
      exit 1
    fi
  done
}
122
+
87
123
# Getting the node names and IP addresses in the SLURM allocation
88
124
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
89
125
nodes_array=($nodes)
90
126
ip_addresses_array=()
91
127
92
128
fornodein$nodes;do
93
-
ip_address=$(getent hosts "$node"| awk '{print $1}'| head -n1)
129
+
# Try multiple methods to get IP address - ENHANCED VERSION v2.0
130
+
echo"[DEBUG] Resolving hostname: $node using enhanced resolution methods"
0 commit comments