Commit 02bee78

add gpu checker
1 parent 440fac2 commit 02bee78

File tree

4 files changed: +606 −74 lines changed

.github/workflows/e2e_tests.yaml

Lines changed: 87 additions & 0 deletions

@@ -70,12 +70,99 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator
 
+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait for GPU operator pods to be ready (with timeout)
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300 # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+
+          while [ $SECONDS -lt $END ]; do
+            # Get total number of pods in the namespace
+            TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l)
+
+            # Count pods that are either running and ready or completed successfully
+            # Exclude pods that are still initializing
+            READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l)
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              break
+            fi
+
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator
+            sleep 10
+          done
+
+          if [ $SECONDS -ge $END ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:'
+
+          # Check if GPU operator has labeled nodes
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu")
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Check if GPUs are actually allocatable
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l)
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
+
+          # Create ConfigMap to disable mTLS
+          echo "Creating ConfigMap to disable mTLS..."
+          cat <<EOF | kubectl apply -f -
+          apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: codeflare-operator-config
+            namespace: ray-system
+          data:
+            config.yaml: |
+              kuberay:
+                mTLSEnabled: false
+                rayDashboardOAuthEnabled: false
+                ingressDomain: "kind"
+              appwrapper:
+                enabled: true
+          EOF
+
           echo Deploying CodeFlare operator
           make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
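
Note: the readiness loop above treats a pod as ready as soon as kubectl reports it as Running or Completed, and the final check only asserts that at least one node advertises allocatable nvidia.com/gpu capacity. For debugging a KinD cluster locally, the same allocatable-GPU check can be reproduced with the Kubernetes Python client. The sketch below is illustrative only and is not part of this commit; it assumes a local kubeconfig pointing at the KinD cluster, and it sums per-node GPU counts rather than counting GPU-bearing nodes, which is equivalent for the zero/non-zero test the workflow performs.

# Illustrative sketch (not part of this commit): mirror the workflow's
# "GPUs allocatable" check with the Kubernetes Python client.
from kubernetes import client, config


def allocatable_gpus(resource_name: str = "nvidia.com/gpu") -> int:
    """Sum the allocatable GPU count reported by every node in the cluster."""
    config.load_kube_config()  # assumes a kubeconfig pointing at the KinD cluster
    total = 0
    for node in client.CoreV1Api().list_node().items:
        # status.allocatable is a dict of resource name -> quantity string,
        # e.g. {"cpu": "8", "nvidia.com/gpu": "1"}.
        total += int(node.status.allocatable.get(resource_name, "0"))
    return total


if __name__ == "__main__":
    count = allocatable_gpus()
    if count == 0:
        raise SystemExit("No allocatable GPUs found; check the gpu-operator pods.")
    print(f"Successfully found {count} allocatable GPU(s) in the cluster.")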

codeflare-kuberay.code-workspace

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+{
+  "folders": [
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-sdk"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/kuberay"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-operator"
+    }
+  ]
+}

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 137 additions & 49 deletions

@@ -11,6 +11,7 @@
 import logging
 import time
 import os
+import subprocess
 
 from support import *
 
@@ -22,40 +23,34 @@
 @pytest.mark.kind
 class TestRayLocalInteractiveOauth:
     def setup_method(self):
-        logger.info("Setting up test environment...")
         initialize_kubernetes_client(self)
-        logger.info("Kubernetes client initialized")
+        logger.info("Kubernetes client initalized")
 
     def teardown_method(self):
-        logger.info("Cleaning up test environment...")
         delete_namespace(self)
         delete_kueue_resources(self)
-        logger.info("Cleanup completed")
 
     def test_local_interactives(self):
-        logger.info("Starting test_local_interactives...")
         self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
         self.run_local_interactives()
-        logger.info("test_local_interactives completed")
 
     @pytest.mark.nvidia_gpu
     def test_local_interactives_nvidia_gpu(self):
-        logger.info("Starting test_local_interactives_nvidia_gpu...")
        self.setup_method()
         create_namespace(self)
         create_kueue_resources(self)
         self.run_local_interactives(number_of_gpus=1)
-        logger.info("test_local_interactives_nvidia_gpu completed")
 
     def run_local_interactives(
         self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
     ):
         cluster_name = "test-ray-cluster-li"
         logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
 
-        logger.info("Creating cluster configuration...")
+        ray.shutdown()
+
         cluster = Cluster(
             ClusterConfiguration(
                 name=cluster_name,
@@ -66,87 +61,185 @@ def run_local_interactives(
                 head_memory_requests=2,
                 head_memory_limits=2,
                 worker_cpu_requests="500m",
-                worker_cpu_limits=1,
+                worker_cpu_limits="500m",
                 worker_memory_requests=1,
                 worker_memory_limits=4,
                 worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
                 write_to_file=True,
                 verify_tls=False,
             )
         )
-        logger.info("Cluster configuration created")
 
-        logger.info("Starting cluster deployment...")
         cluster.up()
         logger.info("Cluster deployment initiated")
 
-        logger.info("Waiting for cluster to be ready...")
         cluster.wait_ready()
+        cluster.status()
         logger.info("Cluster is ready")
 
-        logger.info("Generating TLS certificates...")
-        generate_cert.generate_tls_cert(cluster_name, self.namespace)
-        logger.info("TLS certificates generated")
+        logger.info("Waiting for head and worker pods to be fully ready...")
+        TIMEOUT = 300 # 5 minutes timeout
+        END = time.time() + TIMEOUT
 
-        logger.info("Exporting environment variables...")
-        generate_cert.export_env(cluster_name, self.namespace)
-        logger.info("Environment variables exported")
+        head_pod_name = None
+        worker_pod_name = None
 
-        client_url = cluster.local_client_url()
-        logger.info(f"Ray client URL: {client_url}")
+        while time.time() < END:
+            # Dynamically find pod names using substrings
+            if not head_pod_name:
+                head_pod_name = kubectl_get_pod_name_by_substring(
+                    self.namespace, cluster_name, "head"
+                )
+                if head_pod_name:
+                    logger.info(f"Discovered head pod by substring: {head_pod_name}")
+                else:
+                    logger.info(
+                        f"Head pod not yet found by searching for '{cluster_name}' and 'head' in pod names. Retrying..."
+                    )
 
-        logger.info("Checking cluster status...")
-        status = cluster.status()
-        logger.info(f"Cluster status: {status}")
+            if not worker_pod_name:
+                worker_pod_name = kubectl_get_pod_name_by_substring(
+                    self.namespace, cluster_name, "worker"
+                )
+                if worker_pod_name:
+                    logger.info(
+                        f"Discovered worker pod by substring: {worker_pod_name}"
+                    )
+                else:
+                    logger.info(
+                        f"Worker pod not yet found by searching for '{cluster_name}' and 'worker' in pod names. Retrying..."
+                    )
 
-        logger.info("Checking cluster dashboard URI...")
-        dashboard_uri = cluster.cluster_dashboard_uri()
-        logger.info(f"Dashboard URI: {dashboard_uri}")
+            head_status = "NotFound"
+            worker_status = "NotFound"
 
-        logger.info("Checking cluster URI...")
-        cluster_uri = cluster.cluster_uri()
-        logger.info(f"Cluster URI: {cluster_uri}")
+            if head_pod_name:
+                head_status = kubectl_get_pod_status(self.namespace, head_pod_name)
+            if worker_pod_name:
+                worker_status = kubectl_get_pod_status(self.namespace, worker_pod_name)
 
-        logger.info("Shutting down any existing Ray connections...")
-        ray.shutdown()
-        logger.info("Ray shutdown completed")
+            logger.info(f"Head pod ({head_pod_name or 'N/A'}) status: {head_status}")
+            logger.info(
+                f"Worker pod ({worker_pod_name or 'N/A'}) status: {worker_status}"
+            )
+
+            if (
+                head_pod_name
+                and worker_pod_name
+                and "Running" in head_status
+                and "Running" in worker_status
+            ):
+                head_ready = kubectl_get_pod_ready(self.namespace, head_pod_name)
+                worker_ready = kubectl_get_pod_ready(self.namespace, worker_pod_name)
+
+                if head_ready and worker_ready:
+                    logger.info("All discovered pods and containers are ready!")
+                    break
+                else:
+                    logger.info(
+                        "Discovered pods are running but containers are not all ready yet..."
+                    )
+                    if not head_ready and head_pod_name:
+                        head_container_status = kubectl_get_pod_container_status(
+                            self.namespace, head_pod_name
+                        )
+                        logger.info(
+                            f"Head pod ({head_pod_name}) container status: {head_container_status}"
+                        )
+                    if not worker_ready and worker_pod_name:
+                        worker_container_status = kubectl_get_pod_container_status(
+                            self.namespace, worker_pod_name
+                        )
+                        logger.info(
+                            f"Worker pod ({worker_pod_name}) container status: {worker_container_status}"
+                        )
+            elif (head_pod_name and "Error" in head_status) or (
+                worker_pod_name and "Error" in worker_status
+            ):
+                logger.error(
+                    "Error getting pod status for one or more pods, retrying..."
+                )
+            else:
+                logger.info(
+                    f"Waiting for pods to be discovered and running... Current status - Head ({head_pod_name or 'N/A'}): {head_status}, Worker ({worker_pod_name or 'N/A'}): {worker_status}"
+                )
+
+            time.sleep(10)
+
+        if time.time() >= END:
+            logger.error("Timeout waiting for pods to be ready or discovered")
+            if not head_pod_name or not worker_pod_name:
+                logger.error(
+                    "Could not discover head and/or worker pods by name substring. Listing all pods in namespace for debugging:"
+                )
+                try:
+                    all_pods_result = subprocess.run(
+                        ["kubectl", "get", "pods", "-n", self.namespace, "-o", "wide"],
+                        capture_output=True,
+                        text=True,
+                        check=False,
+                    )
+                    logger.error(
+                        f"Pods in namespace '{self.namespace}':\n{all_pods_result.stdout}"
+                    )
+                    if all_pods_result.stderr:
+                        logger.error(f"Error listing pods: {all_pods_result.stderr}")
+                except Exception as e_pods:
+                    logger.error(f"Exception while trying to list all pods: {e_pods}")
+
+            if head_pod_name:
+                logger.error(
+                    f"Final head pod ({head_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, head_pod_name)}"
+                )
+            else:
+                logger.error(
+                    f"Final head pod status: Not Discovered by searching for '{cluster_name}' and 'head' in pod names."
+                )
+
+            if worker_pod_name:
+                logger.error(
+                    f"Final worker pod ({worker_pod_name}) status: {kubectl_get_pod_container_status(self.namespace, worker_pod_name)}"
+                )
+            else:
+                logger.error(
+                    f"Final worker pod status: Not Discovered by searching for '{cluster_name}' and 'worker' in pod names."
+                )
+            raise TimeoutError(
+                "Pods did not become ready (or were not discovered by name substring) within the timeout period"
+            )
+
+        generate_cert.generate_tls_cert(cluster_name, self.namespace)
+        generate_cert.export_env(cluster_name, self.namespace)
+
+        client_url = cluster.local_client_url()
+        cluster.status()
 
         logger.info("Initializing Ray connection...")
         try:
-            ray.init(address=client_url, logging_level="DEBUG")
+            ray.init(address=client_url, logging_level="INFO")
             logger.info("Ray initialization successful")
         except Exception as e:
             logger.error(f"Ray initialization failed: {str(e)}")
             logger.error(f"Error type: {type(e)}")
             raise
 
-        logger.info("Defining Ray remote functions...")
-
         @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation_part(num_iterations):
-            logger.info(
-                f"Starting heavy_calculation_part with {num_iterations} iterations"
-            )
             result = 0.0
             for i in range(num_iterations):
                 for j in range(num_iterations):
                     for k in range(num_iterations):
                         result += math.sin(i) * math.cos(j) * math.tan(k)
-            logger.info("heavy_calculation_part completed")
             return result
 
         @ray.remote(num_gpus=number_of_gpus / 2)
         def heavy_calculation(num_iterations):
-            logger.info(f"Starting heavy_calculation with {num_iterations} iterations")
             results = ray.get(
                 [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
             )
-            logger.info("heavy_calculation completed")
             return sum(results)
 
-        logger.info("Submitting calculation task...")
         ref = heavy_calculation.remote(3000)
-        logger.info("Task submitted, waiting for result...")
 
         try:
             result = ray.get(ref)
@@ -161,10 +254,5 @@ def heavy_calculation(num_iterations):
             ray.cancel(ref)
             logger.info("Task cancelled")
 
-        logger.info("Shutting down Ray...")
         ray.shutdown()
-        logger.info("Ray shutdown completed")
-
-        logger.info("Tearing down cluster...")
         cluster.down()
-        logger.info("Cluster teardown completed")
