diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index e41d64436..763010c35 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=nvidia/cuda:12.3.0-devel-ubuntu22.04 +ARG BASE_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04 ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com ARG SRC_MANIFEST_FILE=manifest.yaml @@ -59,6 +59,16 @@ apt_packages=( wget # llvm.sh lsb-release software-properties-common + # OFED for RoCE and InfiniBand support + rdma-core + libibverbs1 + libibverbs-dev + librdmacm1 + librdmacm-dev + libibumad3 + libibumad-dev + ibverbs-utils + ibverbs-providers ) if [[ $(dpkg --print-architecture) == arm64 ]]; then # h5py: The newest release of of h5py (3.11.0) does not include ARM wheels and causes pip to build h5py. @@ -155,13 +165,6 @@ RUN install-cudnn.sh ADD install-nccl.sh /usr/local/bin RUN install-nccl.sh -############################################################################### -## RoCE and InfiniteBand support -############################################################################### - -ADD install-ofed.sh /usr/local/bin -RUN install-ofed.sh - ############################################################################## ## Amazon EFA support (need to run it inside container separately) ##############################################################################