Skip to content

Commit

Permalink
ADLR/megatron-lm!2137 - ci: Enable dev container for new features
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Sep 23, 2024
1 parent 0fd4617 commit 697ea61
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ variables:

# CI wide variables
CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
UNIT_TEST_TIMEOUT: 15
Expand Down
4 changes: 4 additions & 0 deletions .gitlab/stages/01.tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ build_image:
FILE: Dockerfile.ci
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
TAG: mcore-docker-node-large
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
TAG: mcore-docker-node-large
- IMAGE: CI_NEMO_IMAGE
FILE: Dockerfile.ci
BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
Expand Down
1 change: 1 addition & 0 deletions .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ jet-generate:
--h100-cluster $H100_CLUSTER \
--container-tag ${CI_PIPELINE_ID} \
--container-image ${CI_MCORE_IMAGE} \
--container-image-dev ${CI_MCORE_DEV_IMAGE} \
--output-path "jet-trigger-job.yaml" \
${RELEASE_ARGS[@]}
artifacts:
Expand Down
62 changes: 62 additions & 0 deletions Dockerfile.ci.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# syntax=docker/dockerfile:1.3-labs

# Base image shared by every stage below. No default is provided, so it has to
# be supplied at build time (--build-arg FROM_IMAGE_NAME=...).
ARG FROM_IMAGE_NAME

# Builder stage: produce a causal-conv1d wheel in /opt from the upstream git
# tag. The tag must match the wheel filename copied into the `main` stage
# (causal_conv1d-1.2.2.post1-...).
# CAUSAL_CONV1D_FORCE_BUILD=TRUE presumably forces a local source build instead
# of fetching a prebuilt wheel -- confirm against the upstream setup.py.
FROM $FROM_IMAGE_NAME AS build_causal_conv1d
WORKDIR /opt
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1

# Builder stage: produce a grouped_gemm wheel in /opt from the upstream git tag.
# The tag must match the wheel filename copied into the `main` stage
# (grouped_gemm-1.1.2-...).
FROM $FROM_IMAGE_NAME AS build_grouped_gemm
WORKDIR /opt
RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2

# Builder stage: produce a mamba_ssm wheel in /opt from the upstream git tag.
# The tag must match the wheel filename copied into the `main` stage
# (mamba_ssm-2.0.3-...).
# MAMBA_FORCE_BUILD=TRUE presumably forces a local source build instead of
# fetching a prebuilt wheel -- confirm against the upstream setup.py.
FROM $FROM_IMAGE_NAME AS build_mamba_ssm
WORKDIR /opt
RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3

# Main image stage: OS-level tooling shared by all CI jobs.
FROM $FROM_IMAGE_NAME AS main
ENV DEBIAN_FRONTEND=noninteractive

# One layer that:
#   - installs gettext and python3-venv,
#   - drops the apt caches and package lists in the same layer so they do not
#     persist in the image,
#   - creates the /opt/jet virtualenv (populated later by the `jet` stage),
#   - downloads the pinned yq v4.44.1 binary and makes it executable.
RUN apt-get update && \
    apt-get install -y --no-install-recommends gettext python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    python -m venv /opt/jet && \
    wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
    chmod a+x /usr/local/bin/yq

# Pull the wheels produced by the three builder stages into the current
# working directory of this stage.
COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./
COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./
COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./

# Install the Python test/runtime dependencies together with the locally built
# wheels in a single pip invocation, then delete the wheel files from the
# working directory.
# NOTE(review): the wheels were added by the COPY layers above, so removing
# them here does not reduce the final image size.
RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \
        einops \
        flask-restful \
        nltk \
        pytest \
        pytest-cov \
        pytest_mock \
        pytest-random-order \
        sentencepiece \
        wrapt \
        zarr \
        wandb \
        triton==2.1.0 \
        causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \
        mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \
        grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \
        tensorstore==0.1.45 \
    && rm *.whl

# Since megatron does not have any dependencies (and isn't a dependency to any
# other package), we can install it separately to make everything a bit quicker.
# The source tree is copied last so that changes to it do not invalidate the
# dependency layers above.
COPY . /opt/megatron-lm
# --no-cache-dir matches the dependency install above and keeps pip's download
# cache out of the image layer.
RUN pip install --no-cache-dir /opt/megatron-lm
# Put the copied source tree on the import path in addition to the installed package.
ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"

##### For NVIDIANS only #####
# Optional stage on top of `main` that installs the JET client and API.
# The package index options are read from the BuildKit secret JET_INDEX_URLS,
# so they never end up in an image layer.
FROM main AS jet
# Not referenced by any instruction in this stage; presumably varied per build
# to force the install step below to re-run -- confirm with the CI build job.
ARG CACHEBUST=0
# $JET_INDEX_URLS is deliberately left unquoted so that the secret's contents
# are split into separate pip arguments.
# jet-client goes into the system Python, jet-api into the /opt/jet virtualenv.
RUN --mount=type=secret,id=JET_INDEX_URLS \
    JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
    pip install jet-client --upgrade $JET_INDEX_URLS && \
    /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS
# Expose the virtualenv's executables, after everything already on PATH.
ENV PATH="$PATH:/opt/jet/bin"
###
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import pathlib
from typing import Optional

import click
import yaml

from tests.functional_tests.python_test_utils.jet import common

BASE_PATH = pathlib.Path(__file__).parent.resolve()


@click.command()
@click.option("--scope", required=True, type=str, help="Test scope")
@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
@click.option("--container-image", required=True, type=str, help="Container tag to use")
@click.option("--container-image", required=True, type=str, help="LTS Container tag to use")
@click.option("--container-image-dev", required=True, type=str, help="Dev Container tag to use")
@click.option("--container-tag", required=True, type=str, help="Container tag to use")
@click.option(
"--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
Expand All @@ -28,6 +32,7 @@ def main(
h100_cluster: str,
output_path: str,
container_image: str,
container_image_dev: str,
container_tag: str,
run_name: Optional[str] = None,
wandb_experiment: Optional[str] = None,
Expand Down Expand Up @@ -55,6 +60,25 @@ def main(
f"--cluster {cluster}",
]

# Locate this test case's model_config.yaml relative to this script.
# BASE_PATH is already a pathlib.Path, so the "/" chain needs no extra wrapper.
model_config_path = (
    BASE_PATH
    / ".."
    / ".."
    / "test_cases"
    / test_case.spec.model
    / test_case.spec.test_case
    / "model_config.yaml"
)

with open(model_config_path) as stream:
    try:
        test_case_dict = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        # Reset explicitly: otherwise the name is either unbound (NameError
        # below) or still holds the config of a previously processed test case.
        test_case_dict = None

# safe_load returns None for an empty document, and a non-mapping document
# cannot carry the flag; treat both like a config without EXPERIMENTAL.
if not isinstance(test_case_dict, dict):
    test_case_dict = {}

# Test cases flagged EXPERIMENTAL (truthy) get the dev container image.
if test_case_dict.get('EXPERIMENTAL'):
    script.append(f"--container-image {container_image_dev}")

if run_name is not None and wandb_experiment is not None:
script.append(f"--run-name {run_name}")
script.append(f"--wandb-experiment {wandb_experiment}")
Expand Down

0 comments on commit 697ea61

Please sign in to comment.