Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Batch] Support over 64 CPU machine types #14643

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions batch/batch/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,14 @@ def __init__(self, task_manager: aiotools.BackgroundTaskManager):
self.internet_interface = INTERNET_INTERFACE

async def reserve(self):
for subnet_index in range(N_SLOTS + N_JVM_CONTAINERS):
N_PUBLIC_INTERFACES = min(255, N_SLOTS + N_JVM_CONTAINERS)
for subnet_index in range(N_PUBLIC_INTERFACES):
public = NetworkNamespace(subnet_index, private=False, internet_interface=self.internet_interface)
await public.init()
self.public_networks.put_nowait(public)

for subnet_index in range(N_SLOTS):
N_PRIVATE_INTERFACES = min(255, N_SLOTS)
for subnet_index in range(N_PRIVATE_INTERFACES):
private = NetworkNamespace(subnet_index, private=True, internet_interface=self.internet_interface)

await private.init()
Expand Down
13 changes: 13 additions & 0 deletions batch/test/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1531,6 +1531,19 @@ async def test_gpu_accesibility_g2(client: BatchClient):
pass


@skip_in_azure
async def test_over_64_cpus(client: BatchClient):
    # Validates that a job can run on a machine with a high CPU count.
    # The part of the 'n1-highmem-96' machine type that matters here is its
    # CPU count (96), which is above 64.
    batch = create_batch(client)
    job = batch.create_job(
        DOCKER_ROOT_IMAGE,
        ['true'],
        resources={'machine_type': 'n1-highmem-96', 'preemptible': False},
    )
    batch.submit()

    status = job.wait()
    # Both assertions attach the job status and batch debug info on failure.
    assert status['state'] == 'Success', str((status, batch.debug_info()))
    # The reported worker name must contain 'job-private'.
    assert 'job-private' in status['status']['worker'], str((status, batch.debug_info()))


def test_job_private_instance_preemptible(client: BatchClient):
b = create_batch(client)
resources = {'machine_type': smallest_machine_type()}
Expand Down