Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duplicates of Peter's changes for CI #556

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 27 additions & 24 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml
# We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and
# training loss curves from NeMo.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3

FROM rust:1.82.0 as rust-env

RUN rustup set profile minimal && \
rustup install 1.82.0 && \
rustup target add x86_64-unknown-linux-gnu && \
rustup default 1.82.0
rustup install 1.82.0 && \
rustup target add x86_64-unknown-linux-gnu && \
rustup default 1.82.0

FROM ${BASE_IMAGE} AS bionemo2-base

Expand Down Expand Up @@ -73,11 +73,6 @@ RUN rm -rf /build

# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
RUN apt-get update && \
apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
rm -rf /var/lib/apt/lists/*
RUN apt purge -y libslurm37 libpmi2-0 && \
apt autoremove -y


# Use UV to install python packages from the workspace. This just installs packages into the system's python
Expand All @@ -92,7 +87,7 @@ ENV UV_LINK_MODE=copy \
# Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
uv pip install --no-build-isolation -r /requirements-pyg.txt
uv pip install --break-system-packages --no-build-isolation -r /requirements-pyg.txt

WORKDIR /workspace/bionemo2

Expand All @@ -111,17 +106,27 @@ ENV RUSTUP_HOME="/usr/local/rustup"
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
<<EOF
set -eo pipefail
uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \
uv pip install maturin --no-build-isolation --break-system-packages
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
pip install --use-deprecated=legacy-resolver --no-build-isolation --break-system-packages \
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How to pull correct wheel? @pstjohn

tensorstore==0.1.45

RUN sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA && mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info /usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info

RUN pip show tensorstore


RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
uv pip install --no-build-isolation --break-system-packages \
./3rdparty/* \
./sub-packages/bionemo-* \
-r /requirements-cve.txt \
-r /requirements-test.txt
rm -rf ./3rdparty
rm -rf /tmp/*
rm -rf ./sub-packages/bionemo-noodles/target
EOF
-r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target


# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
# base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
Expand Down Expand Up @@ -179,7 +184,7 @@ ENV RUSTUP_HOME="/usr/local/rustup"
RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked <<EOF
set -eo pipefail
uv pip install -r /workspace/bionemo2/requirements-dev.txt
uv pip install -r /workspace/bionemo2/requirements-dev.txt --break-system-packages
rm -rf /tmp/*
EOF

Expand Down Expand Up @@ -209,15 +214,12 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

RUN uv pip uninstall maturin
RUN uv pip install maturin --no-build-isolation

RUN <<EOF
set -eo pipefail
find . -name __pycache__ -type d -print | xargs rm -rf
uv pip install --no-build-isolation --editable ./internal/infra-bionemo
uv pip install --break-system-packages --no-build-isolation --editable ./internal/infra-bionemo
for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
uv pip install --no-deps --no-build-isolation --editable $sub
uv pip install --break-system-packages --no-deps --no-build-isolation --editable $sub
done
EOF

Expand Down Expand Up @@ -246,6 +248,7 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup


# RUN rm -rf /usr/local/cargo /usr/local/rustup
RUN rm -rf /root/.cache/bazel
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RUN chmod 777 -R /workspace/bionemo2/

# Transformer engine attention defaults
Expand Down
101 changes: 67 additions & 34 deletions Dockerfile.arm
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3

FROM rust:1.82.0 as rust-env

Expand Down Expand Up @@ -93,17 +93,17 @@ RUN rm -rf /build

# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
RUN apt-get update && \
apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
rm -rf /var/lib/apt/lists/*
RUN apt purge -y libslurm37 libpmi2-0 && \
# RUN apt-get update && \
# apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
# rm -rf /var/lib/apt/lists/*
RUN apt purge -y libpmi2-0 && \
apt autoremove -y
RUN source /usr/local/nvm/nvm.sh && \
NODE_VER=$(nvm current) && \
nvm deactivate && \
nvm uninstall $NODE_VER && \
sed -i "/NVM/d" /root/.bashrc && \
sed -i "/nvm.sh/d" /etc/bash.bashrc
# RUN source /usr/local/nvm/nvm.sh && \
# NODE_VER=$(nvm current) && \
# nvm deactivate && \
# nvm uninstall $NODE_VER && \
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@trvachov remove comments

# sed -i "/NVM/d" /root/.bashrc && \
# sed -i "/nvm.sh/d" /etc/bash.bashrc

# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file.
Expand All @@ -117,7 +117,7 @@ ENV UV_LINK_MODE=copy \
# installation. These involve building some torch extensions, so they can take a while to install.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \
uv pip install --no-build-isolation -r /requirements-pyg.txt
uv pip install --no-build-isolation --break-system-packages -r /requirements-pyg.txt

ENV WORKDIR=/workspace/bionemo2
WORKDIR ${WORKDIR}
Expand All @@ -133,19 +133,43 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

# Build decord
# # Build decord
# This needs a specific version of ffmpeg:
# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version
# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
# libavutil 56. 70.100 / 56. 70.100
# libavcodec 58.134.100 / 58.134.100
# libavformat 58. 76.100 / 58. 76.100
# libavdevice 58. 13.100 / 58. 13.100
# libavfilter 7.110.100 / 7.110.100
# libswscale 5. 9.100 / 5. 9.100
# libswresample 3. 9.100 / 3. 9.100
# libpostproc 55. 9.100 / 55. 9.100
#
# Issue link: https://github.com/dmlc/decord/issues/257
# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325

# Consider this:
# sudo apt install libnvidia-decode-550
# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/
# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release

RUN apt-get update && \
apt-get install -y build-essential python3-dev python3-setuptools make cmake && \
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \
git clone --recursive https://github.com/dmlc/decord && \
cd decord && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
make && \
cd ../python && \
pip install . && \
cd ${WORKDIR} && \
rm -rf decord
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
# && cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/
RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \
git clone --recursive https://github.com/dmlc/decord && \
cd decord && git apply /decord_ffmpeg6_fix.patch && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
make && \
cd ../python && \
pip install . && \
cd ${WORKDIR} && \
rm -rf decord

RUN pip install --upgrade pip setuptools
RUN pip install setuptools_scm py-cpuinfo
Expand Down Expand Up @@ -176,17 +200,26 @@ WORKDIR /workspace/bionemo2
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
<<EOF
set -eo pipefail
uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \
uv pip install maturin --no-build-isolation --break-system-packages
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
pip install --use-deprecated=legacy-resolver --no-build-isolation --break-system-packages \
tensorstore==0.1.45

# For some reason, we do not need to do the tensorstore version package hack on arm64
# RUN sed -i 's/^Version: 0\.0\.0$/Version: 0.1.45/' /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info/METADATA && mv /usr/local/lib/python3.12/dist-packages/tensorstore-0.0.0.dist-info /usr/local/lib/python3.12/dist-packages/tensorstore-0.1.45.dist-info

RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
# Delete the mamba-ssm requirement line from NeMo's NLP requirements, as installing it causes issues.
sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \
uv pip install --no-build-isolation --break-system-packages \
./3rdparty/* \
./sub-packages/bionemo-* \
-r /requirements-cve.txt \
-r /requirements-test.txt
rm -rf ./3rdparty
rm -rf /tmp/*
rm -rf ./sub-packages/bionemo-noodles/target
EOF
-r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target

# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
# base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
Expand Down Expand Up @@ -244,7 +277,7 @@ ENV RUSTUP_HOME="/usr/local/rustup"
RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked <<EOF
set -eo pipefail
uv pip install -r /workspace/bionemo2/requirements-dev.txt
uv pip install --break-system-packages -r /workspace/bionemo2/requirements-dev.txt
rm -rf /tmp/*
EOF

Expand Down Expand Up @@ -276,14 +309,14 @@ ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"

RUN uv pip uninstall maturin
RUN uv pip install maturin --no-build-isolation
RUN uv pip install maturin --break-system-packages --no-build-isolation # why are we doing this twice?

RUN <<EOF
set -eo pipefail
find . -name __pycache__ -type d -print | xargs rm -rf
uv pip install --no-build-isolation --editable ./internal/infra-bionemo
for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
uv pip install --no-deps --no-build-isolation --editable $sub
uv pip install --no-deps --no-build-isolation --break-system-packages --editable $sub
done
EOF
# Since the entire repo is owned by root, switching username for development breaks things.
Expand Down
73 changes: 73 additions & 0 deletions arm_build/decord_ffmpeg6_fix.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# This is a patch file for decord https://github.com/dmlc/decord
# needed to build decord against ffmpeg6, taken from
# https://github.com/dmlc/decord/issues/186#issuecomment-1171882325
# This needs to be removed once decord natively supports latest ffmpeg versions.
diff --git a/src/video/ffmpeg/ffmpeg_common.h b/src/video/ffmpeg/ffmpeg_common.h
index b0b973f..f0f7316 100644
--- a/src/video/ffmpeg/ffmpeg_common.h
+++ b/src/video/ffmpeg/ffmpeg_common.h
@@ -21,6 +21,7 @@
extern "C" {
#endif
#include <libavcodec/avcodec.h>
+#include <libavcodec/bsf.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavfilter/avfilter.h>
diff --git a/src/video/nvcodec/cuda_threaded_decoder.cc b/src/video/nvcodec/cuda_threaded_decoder.cc
index 62bc7ee..957a90d 100644
--- a/src/video/nvcodec/cuda_threaded_decoder.cc
+++ b/src/video/nvcodec/cuda_threaded_decoder.cc
@@ -17,7 +17,7 @@ namespace decord {
namespace cuda {
using namespace runtime;

-CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat)
+CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat)
: device_id_(device_id), stream_({device_id, false}), device_{}, ctx_{}, parser_{}, decoder_{},
pkt_queue_{}, frame_queue_{},
run_(false), frame_count_(0), draining_(false),
@@ -70,7 +70,7 @@ CUThreadedDecoder::CUThreadedDecoder(int device_id, AVCodecParameters *codecpar,
}
}

-void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat) {
+void CUThreadedDecoder::InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat) {
const char* bsf_name = nullptr;
if (AV_CODEC_ID_H264 == codecpar->codec_id) {
// H.264
diff --git a/src/video/nvcodec/cuda_threaded_decoder.h b/src/video/nvcodec/cuda_threaded_decoder.h
index d7e6fcd..61958a1 100644
--- a/src/video/nvcodec/cuda_threaded_decoder.h
+++ b/src/video/nvcodec/cuda_threaded_decoder.h
@@ -46,7 +46,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
using FrameOrderQueuePtr = std::unique_ptr<FrameOrderQueue>;

public:
- CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, AVInputFormat *iformat);
+ CUThreadedDecoder(int device_id, AVCodecParameters *codecpar, const AVInputFormat *iformat);
void SetCodecContext(AVCodecContext *dec_ctx, int width = -1, int height = -1, int rotation = 0);
bool Initialized() const;
void Start();
@@ -70,7 +70,7 @@ class CUThreadedDecoder final : public ThreadedDecoderInterface {
void LaunchThreadImpl();
void RecordInternalError(std::string message);
void CheckErrorStatus();
- void InitBitStreamFilter(AVCodecParameters *codecpar, AVInputFormat *iformat);
+ void InitBitStreamFilter(AVCodecParameters *codecpar, const AVInputFormat *iformat);

int device_id_;
CUStream stream_;
diff --git a/src/video/video_reader.cc b/src/video/video_reader.cc
index af4858d..99c9635 100644
--- a/src/video/video_reader.cc
+++ b/src/video/video_reader.cc
@@ -145,7 +145,7 @@ VideoReader::~VideoReader(){

void VideoReader::SetVideoStream(int stream_nb) {
if (!fmt_ctx_) return;
- AVCodec *dec;
+ const AVCodec *dec;
int st_nb = av_find_best_stream(fmt_ctx_.get(), AVMEDIA_TYPE_VIDEO, stream_nb, -1, &dec, 0);
// LOG(INFO) << "find best stream: " << st_nb;
CHECK_GE(st_nb, 0) << "ERROR cannot find video stream with wanted index: " << stream_nb;
2 changes: 1 addition & 1 deletion requirements-cve.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
onnx>=1.16.0
onnx>=1.17.0
setuptools>=70.0.0
aiohttp>=3.9.4
jupyterlab>=3.6.8
Expand Down
Loading