@@ -141,6 +141,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
141
141
142
142
name = self.config.name
143
143
namespace = self.config.namespace
144
+ head_cpus = self.config.head_cpus
145
+ head_memory = self.config.head_memory
146
+ head_gpus = self.config.head_gpus
144
147
min_cpu = self.config.min_cpus
145
148
max_cpu = self.config.max_cpus
146
149
min_memory = self.config.min_memory
@@ -158,6 +161,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
158
161
return generate_appwrapper(
159
162
name=name,
160
163
namespace=namespace,
164
+ head_cpus=head_cpus,
165
+ head_memory=head_memory,
166
+ head_gpus=head_gpus,
161
167
min_cpu=min_cpu,
162
168
max_cpu=max_cpu,
163
169
min_memory=min_memory,
@@ -290,7 +296,7 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
290
296
else:
291
297
return False
292
298
293
- def wait_ready(self, timeout: Optional[int] = None):
299
+ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
294
300
"""
295
301
Waits for requested cluster to be ready, up to an optional timeout (s).
296
302
Checks every five seconds.
@@ -300,19 +306,32 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
300
306
dashboard_ready = False
301
307
status = None
302
308
time = 0
303
- while not ready or not dashboard_ready :
309
+ while not ready:
304
310
status, ready = self.status(print_to_console=False)
305
- dashboard_ready = self.is_dashboard_ready()
306
311
if status == CodeFlareClusterStatus.UNKNOWN:
307
312
print(
308
313
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
309
314
)
310
- if not ready or not dashboard_ready:
315
+ if not ready:
316
+ if timeout and time >= timeout:
317
+ raise TimeoutError(
318
+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
319
+ )
320
+ sleep(5)
321
+ time += 5
322
+ print("Requested cluster is up and running!")
323
+
324
+ while dashboard_check and not dashboard_ready:
325
+ dashboard_ready = self.is_dashboard_ready()
326
+ if not dashboard_ready:
311
327
if timeout and time >= timeout:
312
- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
328
+ raise TimeoutError(
329
+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
330
+ )
313
331
sleep(5)
314
332
time += 5
315
- print("Requested cluster and dashboard are up and running!")
333
+ if dashboard_ready:
334
+ print("Dashboard is ready!")
316
335
317
336
def details(self, print_to_console: bool = True) -> RayCluster:
318
337
cluster = _copy_to_ray(self)
@@ -640,6 +659,15 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
640
659
worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for
641
660
namespace=rc["metadata"]["namespace"],
642
661
dashboard=ray_route,
662
+ head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
663
+ "resources"
664
+ ]["limits"]["cpu"],
665
+ head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
666
+ "resources"
667
+ ]["limits"]["memory"],
668
+ head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
669
+ "resources"
670
+ ]["limits"]["nvidia.com/gpu"],
643
671
)
644
672
645
673
@@ -670,6 +698,9 @@ <h1 class="title">Module <code>codeflare_sdk.cluster.cluster</code></h1>
670
698
worker_gpu=cluster.config.num_gpus,
671
699
namespace=cluster.config.namespace,
672
700
dashboard=cluster.cluster_dashboard_uri(),
701
+ head_cpus=cluster.config.head_cpus,
702
+ head_mem=cluster.config.head_memory,
703
+ head_gpu=cluster.config.head_gpus,
673
704
)
674
705
if ray.status == CodeFlareClusterStatus.READY:
675
706
ray.status = RayClusterStatus.READY
@@ -879,6 +910,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
879
910
880
911
name = self.config.name
881
912
namespace = self.config.namespace
913
+ head_cpus = self.config.head_cpus
914
+ head_memory = self.config.head_memory
915
+ head_gpus = self.config.head_gpus
882
916
min_cpu = self.config.min_cpus
883
917
max_cpu = self.config.max_cpus
884
918
min_memory = self.config.min_memory
@@ -896,6 +930,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
896
930
return generate_appwrapper(
897
931
name=name,
898
932
namespace=namespace,
933
+ head_cpus=head_cpus,
934
+ head_memory=head_memory,
935
+ head_gpus=head_gpus,
899
936
min_cpu=min_cpu,
900
937
max_cpu=max_cpu,
901
938
min_memory=min_memory,
@@ -1028,7 +1065,7 @@ <h2 class="section-title" id="header-classes">Classes</h2>
1028
1065
else:
1029
1066
return False
1030
1067
1031
- def wait_ready(self, timeout: Optional[int] = None):
1068
+ def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
1032
1069
"""
1033
1070
Waits for requested cluster to be ready, up to an optional timeout (s).
1034
1071
Checks every five seconds.
@@ -1038,19 +1075,32 @@ <h2 class="section-title" id="header-classes">Classes</h2>
1038
1075
dashboard_ready = False
1039
1076
status = None
1040
1077
time = 0
1041
- while not ready or not dashboard_ready :
1078
+ while not ready:
1042
1079
status, ready = self.status(print_to_console=False)
1043
- dashboard_ready = self.is_dashboard_ready()
1044
1080
if status == CodeFlareClusterStatus.UNKNOWN:
1045
1081
print(
1046
1082
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
1047
1083
)
1048
- if not ready or not dashboard_ready:
1084
+ if not ready:
1085
+ if timeout and time >= timeout:
1086
+ raise TimeoutError(
1087
+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
1088
+ )
1089
+ sleep(5)
1090
+ time += 5
1091
+ print("Requested cluster is up and running!")
1092
+
1093
+ while dashboard_check and not dashboard_ready:
1094
+ dashboard_ready = self.is_dashboard_ready()
1095
+ if not dashboard_ready:
1049
1096
if timeout and time >= timeout:
1050
- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
1097
+ raise TimeoutError(
1098
+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
1099
+ )
1051
1100
sleep(5)
1052
1101
time += 5
1053
- print("Requested cluster and dashboard are up and running!")
1102
+ if dashboard_ready:
1103
+ print("Dashboard is ready!")
1054
1104
1055
1105
def details(self, print_to_console: bool = True) -> RayCluster:
1056
1106
cluster = _copy_to_ray(self)
@@ -1267,6 +1317,9 @@ <h3>Methods</h3>
1267
1317
1268
1318
name = self.config.name
1269
1319
namespace = self.config.namespace
1320
+ head_cpus = self.config.head_cpus
1321
+ head_memory = self.config.head_memory
1322
+ head_gpus = self.config.head_gpus
1270
1323
min_cpu = self.config.min_cpus
1271
1324
max_cpu = self.config.max_cpus
1272
1325
min_memory = self.config.min_memory
@@ -1284,6 +1337,9 @@ <h3>Methods</h3>
1284
1337
return generate_appwrapper(
1285
1338
name=name,
1286
1339
namespace=namespace,
1340
+ head_cpus=head_cpus,
1341
+ head_memory=head_memory,
1342
+ head_gpus=head_gpus,
1287
1343
min_cpu=min_cpu,
1288
1344
max_cpu=max_cpu,
1289
1345
min_memory=min_memory,
@@ -1653,7 +1709,7 @@ <h3>Methods</h3>
1653
1709
</details>
1654
1710
</dd>
1655
1711
<dt id="codeflare_sdk.cluster.cluster.Cluster.wait_ready"><code class="name flex">
1656
- <span>def <span class="ident">wait_ready</span></span>(<span>self, timeout: Optional[int] = None)</span>
1712
+ <span>def <span class="ident">wait_ready</span></span>(<span>self, timeout: Optional[int] = None, dashboard_check: bool = True )</span>
1657
1713
</code></dt>
1658
1714
<dd>
1659
1715
<div class="desc"><p>Waits for requested cluster to be ready, up to an optional timeout (s).
@@ -1662,7 +1718,7 @@ <h3>Methods</h3>
1662
1718
<summary>
1663
1719
<span>Expand source code</span>
1664
1720
</summary>
1665
- <pre><code class="python">def wait_ready(self, timeout: Optional[int] = None):
1721
+ <pre><code class="python">def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True ):
1666
1722
"""
1667
1723
Waits for requested cluster to be ready, up to an optional timeout (s).
1668
1724
Checks every five seconds.
@@ -1672,19 +1728,32 @@ <h3>Methods</h3>
1672
1728
dashboard_ready = False
1673
1729
status = None
1674
1730
time = 0
1675
- while not ready or not dashboard_ready :
1731
+ while not ready:
1676
1732
status, ready = self.status(print_to_console=False)
1677
- dashboard_ready = self.is_dashboard_ready()
1678
1733
if status == CodeFlareClusterStatus.UNKNOWN:
1679
1734
print(
1680
1735
"WARNING: Current cluster status is unknown, have you run cluster.up yet?"
1681
1736
)
1682
- if not ready or not dashboard_ready:
1737
+ if not ready:
1738
+ if timeout and time >= timeout:
1739
+ raise TimeoutError(
1740
+ f"wait() timed out after waiting {timeout}s for cluster to be ready"
1741
+ )
1742
+ sleep(5)
1743
+ time += 5
1744
+ print("Requested cluster is up and running!")
1745
+
1746
+ while dashboard_check and not dashboard_ready:
1747
+ dashboard_ready = self.is_dashboard_ready()
1748
+ if not dashboard_ready:
1683
1749
if timeout and time >= timeout:
1684
- raise TimeoutError(f"wait() timed out after waiting {timeout}s")
1750
+ raise TimeoutError(
1751
+ f"wait() timed out after waiting {timeout}s for dashboard to be ready"
1752
+ )
1685
1753
sleep(5)
1686
1754
time += 5
1687
- print("Requested cluster and dashboard are up and running!")</code></pre>
1755
+ if dashboard_ready:
1756
+ print("Dashboard is ready!")</code></pre>
1688
1757
</details>
1689
1758
</dd>
1690
1759
</dl>
0 commit comments