From b0dfe321fb31ca217d442b3f4183cdf5d27d81ef Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Wed, 25 Dec 2024 09:51:44 -0800 Subject: [PATCH 1/9] Duplicates of Peter's changes for CI --- Dockerfile | 50 +++++++++++++++++++++++--------------------- requirements-cve.txt | 2 +- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5a10d8d4ed..8545645adb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,14 +5,14 @@ # https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml # We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and # training loss curves from NeMo. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3 FROM rust:1.82.0 as rust-env RUN rustup set profile minimal && \ - rustup install 1.82.0 && \ - rustup target add x86_64-unknown-linux-gnu && \ - rustup default 1.82.0 + rustup install 1.82.0 && \ + rustup target add x86_64-unknown-linux-gnu && \ + rustup default 1.82.0 FROM ${BASE_IMAGE} AS bionemo2-base @@ -73,11 +73,6 @@ RUN rm -rf /build # Addressing Security Scan Vulnerabilities RUN rm -rf /opt/pytorch/pytorch/third_party/onnx -RUN apt-get update && \ - apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ - rm -rf /var/lib/apt/lists/* -RUN apt purge -y libslurm37 libpmi2-0 && \ - apt autoremove -y # Use UV to install python packages from the workspace. This just installs packages into the system's python @@ -92,7 +87,7 @@ ENV UV_LINK_MODE=copy \ # Install the bionemo-geomtric requirements ahead of copying over the rest of the repo, so that we can cache their # installation. These involve building some torch extensions, so they can take a while to install. 
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \ - uv pip install --no-build-isolation -r /requirements-pyg.txt + uv pip install --break-system-packages --no-build-isolation -r /requirements-pyg.txt WORKDIR /workspace/bionemo2 @@ -111,17 +106,27 @@ ENV RUSTUP_HOME="/usr/local/rustup" RUN --mount=type=bind,source=./.git,target=./.git \ --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ - <=1.16.0 +onnx>=1.17.0 setuptools>=70.0.0 aiohttp>=3.9.4 jupyterlab>=3.6.8 From 772a8b3c796d1a04a566431617ae10983f3088e0 Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Wed, 25 Dec 2024 15:43:30 -0800 Subject: [PATCH 2/9] ARM build changes --- Dockerfile.arm | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/Dockerfile.arm b/Dockerfile.arm index c293ebb8c2..89f452c96d 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -1,5 +1,5 @@ # Base image with apex and transformer engine, but without NeMo or Megatron-LM. -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3 FROM rust:1.82.0 as rust-env @@ -117,7 +117,7 @@ ENV UV_LINK_MODE=copy \ # installation. These involve building some torch extensions, so they can take a while to install. 
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \ --mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \ - uv pip install --no-build-isolation -r /requirements-pyg.txt + uv pip install --no-build-isolation --break-system-packages -r /requirements-pyg.txt ENV WORKDIR=/workspace/bionemo2 WORKDIR ${WORKDIR} @@ -176,17 +176,26 @@ WORKDIR /workspace/bionemo2 RUN --mount=type=bind,source=./.git,target=./.git \ --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ - < Date: Sat, 28 Dec 2024 19:14:07 -0800 Subject: [PATCH 3/9] ARM build fixes --- Dockerfile.arm | 74 ++++++++++++++++++++---------- arm_build/decord_ffmpeg6_fix.patch | 73 +++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 25 deletions(-) create mode 100644 arm_build/decord_ffmpeg6_fix.patch diff --git a/Dockerfile.arm b/Dockerfile.arm index 89f452c96d..c65f05f715 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -93,17 +93,17 @@ RUN rm -rf /build # Addressing Security Scan Vulnerabilities RUN rm -rf /opt/pytorch/pytorch/third_party/onnx -RUN apt-get update && \ - apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ - rm -rf /var/lib/apt/lists/* -RUN apt purge -y libslurm37 libpmi2-0 && \ +# RUN apt-get update && \ +# apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ +# rm -rf /var/lib/apt/lists/* +RUN apt purge -y libpmi2-0 && \ apt autoremove -y -RUN source /usr/local/nvm/nvm.sh && \ - NODE_VER=$(nvm current) && \ - nvm deactivate && \ - nvm uninstall $NODE_VER && \ - sed -i "/NVM/d" /root/.bashrc && \ - sed -i "/nvm.sh/d" /etc/bash.bashrc +# RUN source /usr/local/nvm/nvm.sh && \ +# NODE_VER=$(nvm current) && \ +# nvm deactivate && \ +# nvm uninstall $NODE_VER && \ +# sed -i "/NVM/d" /root/.bashrc && \ +# sed -i "/nvm.sh/d" /etc/bash.bashrc # Use UV to install python packages from the workspace. 
This just installs packages into the system's python # environment, and does not use the current uv.lock file. @@ -133,19 +133,43 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}" ENV RUSTUP_HOME="/usr/local/rustup" -# Build decord +# # Build decord +# This needs a specific version of ffmpeg: +# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version +# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers +# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1) +# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared +# libavutil 56. 70.100 / 56. 70.100 +# libavcodec 58.134.100 / 58.134.100 +# libavformat 58. 76.100 / 58. 76.100 +# libavdevice 58. 13.100 / 58. 
13.100 +# libavfilter 7.110.100 / 7.110.100 +# libswscale 5. 9.100 / 5. 9.100 +# libswresample 3. 9.100 / 3. 9.100 +# libpostproc 55. 9.100 / 55. 9.100 +# +# Issue link: https://github.com/dmlc/decord/issues/257 +# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325 + +# Consider this: +# sudo apt install libnvidia-decode-550 +# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/ +# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release + RUN apt-get update && \ apt-get install -y build-essential python3-dev python3-setuptools make cmake && \ - apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \ - git clone --recursive https://github.com/dmlc/decord && \ - cd decord && \ - mkdir build && cd build && \ - cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \ - make && \ - cd ../python && \ - pip install . && \ - cd ${WORKDIR} && \ - rm -rf decord + apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev +# && cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/ +RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \ + git clone --recursive https://github.com/dmlc/decord && \ + cd decord && git apply /decord_ffmpeg6_fix.patch && \ + mkdir build && cd build && \ + cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \ + make && \ + cd ../python && \ + pip install . 
&& \ + cd ${WORKDIR} && \ + rm -rf decord RUN pip install --upgrade pip setuptools RUN pip install setuptools_scm py-cpuinfo @@ -183,14 +207,14 @@ RUN --mount=type=bind,source=./.git,target=./.git \ pip install --use-deprecated=legacy-resolver --no-build-isolation --break-system-packages \ tensorstore==0.1.45 -RUN sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA && mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info /usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info - -RUN pip show tensorstore - +# For some reason, we do not need to do the tensorstore version package hack on arm64 +# RUN sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA && mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info /usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info RUN --mount=type=bind,source=./.git,target=./.git \ --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ +# Remove the mamba-ssm requirement from NeMo's requirements_nlp.txt as this causes issues. + sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \ uv pip install --no-build-isolation --break-system-packages \ ./3rdparty/* \ ./sub-packages/bionemo-* \ diff --git a/arm_build/decord_ffmpeg6_fix.patch b/arm_build/decord_ffmpeg6_fix.patch new file mode 100644 index 0000000000..cac6892280 --- /dev/null +++ b/arm_build/decord_ffmpeg6_fix.patch @@ -0,0 +1,73 @@ +# This is a patch file for decord https://github.com/dmlc/decord +# needed to build decord against ffmpeg6, taken from +# https://github.com/dmlc/decord/issues/186#issuecomment-1171882325 +# This needs to be removed once decord natively supports latest ffmpeg versions. 
+diff --git a/src/video/ffmpeg/ffmpeg_common.h b/src/video/ffmpeg/ffmpeg_common.h +index b0b973f..f0f7316 100644 +--- a/src/video/ffmpeg/ffmpeg_common.h ++++ b/src/video/ffmpeg/ffmpeg_common.h +@@ -21,6 +21,7 @@ + extern "C" { + #endif + #include ++#include + #include + #include + #include +diff --git a/src/video/nvcodec/cuda_threaded_decoder.cc b/src/video/nvcodec/cuda_threaded_decoder.cc +index 62bc7ee..957a90d 100644 +--- a/src/video/nvcodec/cuda_threaded_decoder.cc ++++ b/src/video/nvcodec/cuda_threaded_decoder.cc +@@ -17,7 +17,7 @@ namespace decord { + namespace cuda { + using namespace runtime; + +-CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat) ++CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat) + : device_id_(device_id), stream_({device_id, false}), device_{}, ctx_{}, parser_{}, decoder_{}, + pkt_queue_{}, frame_queue_{}, + run_(false), frame_count_(0), draining_(false), +@@ -70,7 +70,7 @@ CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, + } + } + +-void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat) { ++void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat) { + const char* bsf_name = nullptr; + if (AV_CODEC_ID_H264 == codecpar->codec_id) { + // H.264 +diff --git a/src/video/nvcodec/cuda_threaded_decoder.h b/src/video/nvcodec/cuda_threaded_decoder.h +index d7e6fcd..61958a1 100644 +--- a/src/video/nvcodec/cuda_threaded_decoder.h ++++ b/src/video/nvcodec/cuda_threaded_decoder.h +@@ -46,7 +46,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface { + using FrameOrderQueuePtr = std::unique_ptr; + + public: +- CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat); ++ CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat); + void 
SetCodecContext(AVCodecContext *dec_ctx, int width = -1, int height = -1, int rotation = 0); + bool Initialized() const; + void Start(); +@@ -70,7 +70,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface { + void LaunchThreadImpl(); + void RecordInternalError(std::string message); + void CheckErrorStatus(); +- void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat); ++ void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat); + + int device_id_; + CUStream stream_; +diff --git a/src/video/video_reader.cc b/src/video/video_reader.cc +index af4858d..99c9635 100644 +--- a/src/video/video_reader.cc ++++ b/src/video/video_reader.cc +@@ -145,7 +145,7 @@ VideoReader::~VideoReader(){ + + void VideoReader::SetVideoStream(int stream_nb) { + if (!fmt_ctx_) return; +- AVCodec *dec; ++ const AVCodec *dec; + int st_nb = av_find_best_stream(fmt_ctx_.get(), AVMEDIA_TYPE_VIDEO, stream_nb, -1, &dec, 0); + // LOG(INFO) << "find best stream: " << st_nb; + CHECK_GE(st_nb, 0) << "ERROR cannot find video stream with wanted index: " << stream_nb; From aef2816fb0c8325cbca4ce061d0e94a70cbec001 Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Sat, 28 Dec 2024 22:10:43 -0800 Subject: [PATCH 4/9] Remove bazel cache to fix CVEs --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8545645adb..742487f3cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -248,6 +248,7 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup # RUN rm -rf /usr/local/cargo /usr/local/rustup +RUN rm -rf /root/.cache/bazel RUN chmod 777 -R /workspace/bionemo2/ # Transformer engine attention defaults From 555022ff062f574f03ed81edd255afc561a4405a Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Fri, 3 Jan 2025 13:43:10 -0800 Subject: [PATCH 5/9] more arm fixes --- Dockerfile.arm | 6 ++- ci/scripts/get_system_arch.sh | 16 ++++++ ci/scripts/setup_ngc_cli.sh | 93 +++++++++++++++++++++++++++++++++++ 3 files changed, 
113 insertions(+), 2 deletions(-) create mode 100755 ci/scripts/get_system_arch.sh create mode 100755 ci/scripts/setup_ngc_cli.sh diff --git a/Dockerfile.arm b/Dockerfile.arm index c65f05f715..5e19aa2dd9 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -219,7 +219,8 @@ RUN --mount=type=bind,source=./.git,target=./.git \ ./3rdparty/* \ ./sub-packages/bionemo-* \ -r /requirements-cve.txt \ - -r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target + -r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target \ + && rm -rf /root/.cache/* # In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the # base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that @@ -345,9 +346,10 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup # RUN rm -rf /usr/local/cargo /usr/local/rustup -RUN rm -rf /root/.cache/bazel RUN chmod 777 -R /workspace/bionemo2/ +# TODO fix /usr/local/lib/python3.12/dist-packages/faiss/loader.py + # Transformer engine attention defaults # We have to declare this again because the devcontainer splits from the release image's base. # FIXME the following results in unstable training curves even if faster. 
diff --git a/ci/scripts/get_system_arch.sh b/ci/scripts/get_system_arch.sh new file mode 100755 index 0000000000..50fd9d125d --- /dev/null +++ b/ci/scripts/get_system_arch.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +ARCH=$(uname -m) + +SYSTEM_ARCH="" + +if [ "${ARCH}" = "aarch64" ] || [ "${ARCH}" = "arm64" ]; then + SYSTEM_ARCH="arm64" +elif [ "${ARCH}" = "x86_64" ]; then + SYSTEM_ARCH="amd64" +else + echo "Unsupported architecture: ${ARCH}" + exit 1 +fi + +echo "${SYSTEM_ARCH}" diff --git a/ci/scripts/setup_ngc_cli.sh b/ci/scripts/setup_ngc_cli.sh new file mode 100755 index 0000000000..0e91534963 --- /dev/null +++ b/ci/scripts/setup_ngc_cli.sh @@ -0,0 +1,93 @@ +#!/bin/sh + +# Exit immediately if a command exits with a non-zero status +set -e +# Usage documentation: +# This script installs and configures the NVIDIA NGC CLI tool. +# +# Arguments: +# --ngc-api-key : Your NGC API key (mandatory if login is enabled). +# --ngc-org : Your NGC organization (mandatory if login is enabled). +# --ngc-team : Your NGC team (mandatory if login is enabled). +# --installation-folder : Directory where the NGC CLI will be installed (mandatory). +# --no-ngc-login : Flag to bypass NGC login configuration. 
+# +# Example usage: +# ./setup_ngc_cli.sh --ngc-api-key YOUR_API_KEY --ngc-org YOUR_ORG --ngc-team YOUR_TEAM --installation-folder /path/to/install +# ./setup_ngc_cli.sh --installation-folder /path/to/install --no-ngc-login + +# Default value for NGC login +NGC_LOGIN="True" + +# Parse input arguments +while [ $# -gt 0 ]; do + KEY="$1" + case $KEY in + --ngc-api-key) + NGC_KEY="$2" + shift 2;; + --ngc-org) + NGC_ORG="$2" + shift 2;; + --ngc-team) + NGC_TEAM="$2" + shift 2;; + --installation-folder) + INSTALLATION_DIR="$2" + shift 2;; + --no-ngc-login) + NGC_LOGIN="False" + shift;; + --*=|-*) + echo "Error: Unsupported keyword ${KEY}" >&2 + exit 1 ;; + *) + echo "Error: Unsupported positional argument ${KEY}" >&2 + exit 1 ;; + esac +done + +# Validate required arguments +if [ -z "${INSTALLATION_DIR}" ]; then + echo "Error: --installation-folder must be set" >&2 + exit 1 +fi + +if [ "$NGC_LOGIN" = "True" ] && { [ -z "${NGC_KEY}" ] || [ -z "${NGC_ORG}" ] || [ -z "${NGC_TEAM}" ]; }; then + echo "Error: The arguments --ngc-api-key, --ngc-org, and --ngc-team must be specified. To bypass NGC login, use the --no-ngc-login flag." 
>&2 + exit 1 +fi + +# Create installation directory and move into it +mkdir -p "${INSTALLATION_DIR}" +cd "${INSTALLATION_DIR}" + +# Select appropriate version of NGC CLI +FILE_TO_DOWNLOAD="ngccli_linux.zip" +if SYS_ARCH=$(./ci/scripts/get_system_arch.sh); then + echo "System architecture: ${SYS_ARCH}" + if [ "${SYS_ARCH}" = "arm64" ]; then + FILE_TO_DOWNLOAD="ngccli_arm64.zip" + fi +else + echo "Error determining system architecture" + exit 1 +fi + +# Install required packages +apt update > /dev/null && apt install -y wget unzip > /dev/null + +# Download and extract the NGC CLI +wget --content-disposition https://ngc.nvidia.com/downloads/${FILE_TO_DOWNLOAD} -O ngccli.zip +unzip -q -o ngccli.zip +chmod u+x ngc-cli/ngc + +# Verify integrity of the extracted files against the ngc-cli.md5 checksum file +find ngc-cli/ -type f -exec md5sum {} + | LC_ALL=C sort | md5sum -c ngc-cli.md5 + +./ngc-cli/ngc --version + +# Configure NGC login if needed +if [ "$NGC_LOGIN" = "True" ]; then + printf "%s\n" "${NGC_KEY}" json "${NGC_ORG}" "${NGC_TEAM}" no-ace | ./ngc-cli/ngc config set +fi From 5610dc8b3faef1c20f66285dd7bf233db37a8729 Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Fri, 3 Jan 2025 14:32:24 -0800 Subject: [PATCH 6/9] add nvidia-smi to test script for debugging --- ci/scripts/run_pytest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/run_pytest.sh b/ci/scripts/run_pytest.sh index 6a63bf0606..400c582583 100755 --- a/ci/scripts/run_pytest.sh +++ b/ci/scripts/run_pytest.sh @@ -26,7 +26,7 @@ source "$(dirname "$0")/utils.sh" if ! 
set_bionemo_home; then exit 1 fi - +nvidia-smi python -m coverage erase error=false From 4e6154085673f0f19b796321beccd6baf5a5a330 Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Mon, 6 Jan 2025 12:21:13 -0800 Subject: [PATCH 7/9] add uname -a to understand kernel issues --- ci/scripts/run_pytest.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/run_pytest.sh b/ci/scripts/run_pytest.sh index 400c582583..51ba9336f2 100755 --- a/ci/scripts/run_pytest.sh +++ b/ci/scripts/run_pytest.sh @@ -26,6 +26,7 @@ source "$(dirname "$0")/utils.sh" if ! set_bionemo_home; then exit 1 fi +uname -a nvidia-smi python -m coverage erase From 65b8ab20380993ef3bef9bcd43975a69e34610c8 Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Mon, 6 Jan 2025 13:49:34 -0800 Subject: [PATCH 8/9] triton update for arm --- Dockerfile.arm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.arm b/Dockerfile.arm index 5e19aa2dd9..696ca11105 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -58,8 +58,8 @@ RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-di RUN git clone https://github.com/llvm/llvm-project.git && \ pip install ninja && \ cd llvm-project && \ - git fetch origin 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \ - git checkout 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \ + git fetch origin 49af6502c6dcb4a7f7520178bd14df396f78240c && \ + git checkout 49af6502c6dcb4a7f7520178bd14df396f78240c && \ mkdir build && cd build && \ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" && \ ninja && \ @@ -69,8 +69,8 @@ RUN git clone https://github.com/llvm/llvm-project.git && \ git clone https://github.com/triton-lang/triton.git && \ pip install cmake wheel pybind11 && \ cd triton && \ - git fetch origin 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \ - git checkout 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \ + git fetch origin release/3.1.x && \ + git checkout 
release/3.1.x && \ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install python/ && \ cd ${WORKDIR} && \ From 5d214123e2d50a573ff7d4b48251e02c778b5b2e Mon Sep 17 00:00:00 2001 From: Timur Rvachov Date: Wed, 8 Jan 2025 08:13:07 -0800 Subject: [PATCH 9/9] more dockerfile fixes --- Dockerfile.arm | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/Dockerfile.arm b/Dockerfile.arm index 696ca11105..9a28f94b18 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -55,23 +55,27 @@ RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-di git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.0.post2 # Build LLVM and triton +# It's important to select a specific version of LLVM as per triton's README instructions, and +# also important to constrain the build targets to the systems we care about or else there will +# be many strange unlinked symbol issues. Here we assume this dockerfile is built on an aarch64 +# target (host), and build for NVIDIA GPUs (NVPTX). 
RUN git clone https://github.com/llvm/llvm-project.git && \ pip install ninja && \ cd llvm-project && \ - git fetch origin 49af6502c6dcb4a7f7520178bd14df396f78240c && \ - git checkout 49af6502c6dcb4a7f7520178bd14df396f78240c && \ + git fetch origin 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \ + git checkout 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \ mkdir build && cd build && \ - cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" && \ - ninja && \ - export LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build && \ + cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX" ../llvm && \ + ninja +ENV LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build - cd ${WORKDIR} && \ +RUN cd ${WORKDIR} && \ git clone https://github.com/triton-lang/triton.git && \ pip install cmake wheel pybind11 && \ cd triton && \ git fetch origin release/3.1.x && \ git checkout release/3.1.x && \ - LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install python/ && \ + LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install --verbose python/ && \ cd ${WORKDIR} && \ rm -rf llvm-project && \ @@ -320,6 +324,12 @@ for sub in ./3rdparty/* ./sub-packages/bionemo-*; do uv pip install --no-deps --no-build-isolation --break-system-packages --editable $sub done EOF +# This is needed because faiss is not compatible with ARM at all. +# Bionemo doesn't use faiss, but megatron core does. +# We do not use this codepath at all, therefore we just make is_sve_supported return False +# to circumvent python import issues +RUN sed -i '42i\ # Bionemo hack to fix ARM issues with faiss\n return False' /usr/local/lib/python3.12/dist-packages/faiss/loader.py + # Since the entire repo is owned by root, swithcing username for development breaks things. 
ARG USERNAME=bionemo RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/