Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

build: Push requirements #415

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
80 changes: 34 additions & 46 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG NEMO_TAG=ko3n1g/build/move-to-req # On: main
ARG MCORE_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
ARG ALIGNER_COMMIT=main
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
Expand All @@ -34,8 +34,6 @@ git checkout -f $ALIGNER_COMMIT
# case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
git pull --rebase || true

pip install --no-cache-dir --no-deps -e .
EOF

FROM ${BASE_IMAGE} as final
Expand All @@ -44,31 +42,31 @@ WORKDIR /opt
# needed in case git complains that it can't detect a valid email, this email is fake but works
RUN git config --global user.email "[email protected]"
# install latest apex
ARG APEX_TAG
RUN pip uninstall -y apex && \
git clone https://github.com/NVIDIA/apex && \
cd apex && \
if [ ! -z $APEX_TAG ]; then \
git fetch origin $APEX_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
# ARG APEX_TAG
# RUN pip uninstall -y apex && \
# git clone https://github.com/NVIDIA/apex && \
# cd apex && \
# if [ ! -z $APEX_TAG ]; then \
# git fetch origin $APEX_TAG && \
# git checkout FETCH_HEAD; \
# fi && \
# pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Git LFS
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
apt-get install git-lfs && \
git lfs install && \
apt-get clean

# TRTLLM
ARG TRTLLM_VERSION
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
cd TensorRT-LLM && \
git checkout ${TRTLLM_VERSION} && \
. docker/common/install_tensorrt.sh && \
python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
pip install -e .
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
# # TRTLLM
# ARG TRTLLM_VERSION
# RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
# cd TensorRT-LLM && \
# git checkout ${TRTLLM_VERSION} && \
# . docker/common/install_tensorrt.sh && \
# python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
# pip install -e .
# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# install TransformerEngine
ARG MAX_JOBS
Expand All @@ -77,47 +75,37 @@ RUN pip uninstall -y transformer-engine && \
git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
if [ ! -z $TE_TAG ]; then \
git fetch origin $TE_TAG && \
git checkout FETCH_HEAD; \
git fetch origin $TE_TAG && \
git checkout FETCH_HEAD; \
fi && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

# place any util pkgs here
ARG PYTRITON_VERSION
RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
RUN pip install --upgrade-strategy only-if-needed jsonlines
git submodule init && git submodule update

# NeMo
ARG NEMO_TAG
RUN git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git pull && \
if [ ! -z $NEMO_TAG ]; then \
git fetch origin $NEMO_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip uninstall -y nemo_toolkit sacrebleu && \
pip install -e ".[nlp]" && \
cd nemo/collections/nlp/data/language_modeling/megatron && make
git fetch origin $NEMO_TAG && \
git checkout FETCH_HEAD; \
fi

# MLM
ARG MLM_TAG
ARG MCORE_TAG
RUN pip uninstall -y megatron-core && \
git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
git pull && \
if [ ! -z $MLM_TAG ]; then \
git fetch origin $MLM_TAG && \
git checkout FETCH_HEAD; \
fi && \
pip install -e .
if [ ! -z $MCORE_TAG ]; then \
git fetch origin $MCORE_TAG && \
git checkout FETCH_HEAD; \
fi

COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
ARG PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN cd /opt/NeMo-Aligner && \
pip install --no-deps -e .
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

Expand Down
13 changes: 4 additions & 9 deletions nemo_aligner/utils/trt_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ def append_and_repad_list(list_of_items, item_to_append, pad_id):


class GPTGenerateTRTLLM:
# If a tokenizer does not have a pad_id, we use a large negative number and replace
# with self.eos_id after generation.
# Use a reserved negative number since there is variation between tokenizers if
# they (1) have a pad_id (2) don't have a pad_id or (3) have None as the pad_id.
# This pad_id is replaced with eos_id after generation.
DEFAULT_PAD_ID = -42

def __init__(
Expand All @@ -72,12 +73,6 @@ def __init__(
"You are trying to use NeMo-Aligner's TensorRT-LLM acceleration for LLM generation. Please build the dockerfile to enable this feature: https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile"
)

# If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
# - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
# be a token that the model is guaranteed to never generate)
assert (
tokenizer.pad_id != tokenizer.eos_id
), f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} when using TRT-LLM. This is to make sure all code goes into the same path and include the eos_id when the response lengths are computed"
assert max_input_len > 0
assert max_generation_length > 0
assert (
Expand All @@ -104,7 +99,7 @@ def __init__(
rng_generator.manual_seed(seed)
self.rng_generator = rng_generator

self.pad_id = tokenizer.pad_id if tokenizer.pad_id is not None else GPTGenerateTRTLLM.DEFAULT_PAD_ID
self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
self.eos_id = tokenizer.eos_id
end_strings = list(end_strings)

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
profile = "black" # black-compatible
line_length = 119 # should match black parameters
ignore_whitespace = true # ignore whitespace for compatibility with the initial style
py_version = 38 # python 3.8 as a target version
py_version = 310 # python 3.10 as a target version
requires-python = ">=3.10"
known_first_party = ["nemo", "nemo_aligner"] # FIRSTPARTY section
known_third_party = ["examples"] # THIRDPARTY section
sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
Expand Down
25 changes: 21 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import subprocess
from distutils import cmd as distutils_cmd
from distutils import log as distutils_log

import re
import setuptools

spec = importlib.util.spec_from_file_location("package_info", "nemo_aligner/package_info.py")
Expand Down Expand Up @@ -62,13 +62,30 @@
# Dependency Loading #
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #

def replace_env_vars(text):
    """Expand shell-style ``${VAR}`` and ``${VAR:-default}`` placeholders in ``text``.

    Resolution rules:
      * ``${VAR:-default}`` -> value of ``VAR``, or ``default`` when ``VAR`` is
        unset *or empty* (matching POSIX ``:-`` parameter expansion).
      * ``${VAR}`` -> value of ``VAR`` when it is set; otherwise the placeholder
        is left untouched so the unresolved reference stays visible downstream.

    Args:
        text: The string to expand.

    Returns:
        ``text`` with every recognised placeholder substituted.
    """
    # Group 1 is the variable name, group 3 the optional default after ":-".
    pattern = re.compile(r"\$\{(\w+)(:-([^}]*))?\}")

    def replace_var(match):
        var_name = match.group(1)
        default_value = match.group(3)  # None when the placeholder has no ":-" part

        if default_value is not None:
            # An empty value must also trigger the default, otherwise a pin like
            # "pkg==${VERSION:-1.0}" would collapse to the invalid "pkg==".
            return os.environ.get(var_name) or default_value

        # No default given: keep the original "${VAR}" text when VAR is unset.
        return os.environ.get(var_name, match.group(0))

    return pattern.sub(replace_var, text)


def req_file(filename, folder="requirements"):
    """Read a requirements file and return its entries as a list of strings.

    Blank lines, full-line comments and trailing inline comments are dropped,
    then ``${VAR}`` / ``${VAR:-default}`` placeholders are expanded through
    ``replace_env_vars``.

    Args:
        filename: Name of the requirements file.
        folder: Directory that contains ``filename``.

    Returns:
        One requirement string per remaining line, in file order.
    """
    # Same rule pip applies to requirements files: "#" starts a comment only at
    # the beginning of a line or after whitespace, so URL fragments such as
    # "#egg=name" are preserved. Inline comments must be removed here because
    # setuptools does not accept them inside install_requires entries.
    comment = re.compile(r"(^|\s+)#.*$")

    requirements = []
    with open(os.path.join(folder, filename), encoding="utf-8") as f:
        for raw_line in f:
            line = comment.sub("", raw_line.strip()).strip()
            if line:
                requirements.append(replace_env_vars(line))
    return requirements


install_requires = req_file("requirements.txt", folder="setup")
Expand Down
7 changes: 4 additions & 3 deletions setup/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
datasets>=3.0.1
jsonlines
megatron_core>=0.8
nemo_toolkit[nlp]
nvidia-pytriton
nemo_toolkit[nlp] @ git+https://github.com/NVIDIA/NeMo.git@${NEMO_TAG}#egg=nemo_toolkit[nlp]
nvidia-pytriton #==${PYTRITON_VERSION:-0.5.10}
protobuf==${PROTOBUF_VERSION:-4.24.4}
Loading