From 697ea615896640e91c0bb807297cab2625228de6 Mon Sep 17 00:00:00 2001
From: Oliver Koenig
Date: Mon, 23 Sep 2024 14:27:37 -0700
Subject: [PATCH] ADLR/megatron-lm!2137 - ci: Enable dev container for new features

---
Review notes (this section is not part of the commit message):
 * generate_jet_trigger_job.py: an empty or unparsable model_config.yaml no
   longer leaves test_case_dict unassigned or None; it falls back to {} so
   the EXPERIMENTAL check below it always has a mapping to inspect.
 * The help text of --container-image / --container-image-dev now says
   "image" instead of "tag".
 * The last hunk header and the diffstat were adjusted for the added lines.
   The abbreviated blob ids on the "index" lines were NOT regenerated.
 * Indentation and blank context lines were restored by hand; verify them
   against the target tree before applying.

 .gitlab-ci.yml                                |  1 +
 .gitlab/stages/01.tests.yml                   |  4 ++
 .gitlab/stages/02.functional-tests.yml        |  1 +
 Dockerfile.ci.dev                             | 62 +++++++++++++++++++
 .../jet/generate_jet_trigger_job.py           | 30 ++++++++-
 5 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile.ci.dev

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fb222e080b..52ae2a886e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -89,6 +89,7 @@ variables:
 
   # CI wide variables
   CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
+  CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
   CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
   LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
   UNIT_TEST_TIMEOUT: 15
diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml
index 2fe5ddafae..68c1afcc6d 100644
--- a/.gitlab/stages/01.tests.yml
+++ b/.gitlab/stages/01.tests.yml
@@ -21,6 +21,10 @@ build_image:
         FILE: Dockerfile.ci
         BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
         TAG: mcore-docker-node-large
+      - IMAGE: CI_MCORE_DEV_IMAGE
+        FILE: Dockerfile.ci.dev
+        BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
+        TAG: mcore-docker-node-large
       - IMAGE: CI_NEMO_IMAGE
         FILE: Dockerfile.ci
         BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
index 3ac0bcc0c5..531527b8b4 100644
--- a/.gitlab/stages/02.functional-tests.yml
+++ b/.gitlab/stages/02.functional-tests.yml
@@ -55,6 +55,7 @@ jet-generate:
         --h100-cluster $H100_CLUSTER \
         --container-tag ${CI_PIPELINE_ID} \
         --container-image ${CI_MCORE_IMAGE} \
+        --container-image-dev ${CI_MCORE_DEV_IMAGE} \
         --output-path "jet-trigger-job.yaml" \
         ${RELEASE_ARGS[@]}
   artifacts:
diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev
new file mode 100644
index 0000000000..fa13c48fd4
--- /dev/null
+++ b/Dockerfile.ci.dev
@@ -0,0 +1,62 @@
+# syntax=docker/dockerfile:1.3-labs
+
+ARG FROM_IMAGE_NAME
+FROM $FROM_IMAGE_NAME as build_causal_conv1d
+WORKDIR /opt
+RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/Dao-AILab/causal-conv1d.git@v1.2.2.post1
+
+FROM $FROM_IMAGE_NAME as build_grouped_gemm
+WORKDIR /opt
+RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
+
+FROM $FROM_IMAGE_NAME as build_mamba_ssm
+WORKDIR /opt
+RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3
+
+FROM $FROM_IMAGE_NAME as main
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gettext python3-venv && \
+    apt-get clean && \
+    python -m venv /opt/jet && \
+    wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
+    chmod a+x /usr/local/bin/yq
+
+COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./
+COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./
+COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./
+
+RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \
+einops \
+flask-restful \
+nltk \
+pytest \
+pytest-cov \
+pytest_mock \
+pytest-random-order \
+sentencepiece \
+wrapt \
+zarr \
+wandb \
+triton==2.1.0 \
+causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \
+mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \
+grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \
+tensorstore==0.1.45 && \
+rm *.whl
+
+# Since megatron does not have any dependencies (and isn't a dependency to any other package), we can install it separately to make everything a bit quicker
+COPY . /opt/megatron-lm
+RUN pip install /opt/megatron-lm
+ENV PYTHONPATH="/opt/megatron-lm:$PYTHONPATH"
+
+##### For NVIDIANS only #####
+FROM main as jet
+ARG CACHEBUST=0
+RUN --mount=type=secret,id=JET_INDEX_URLS \
+    JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS) && \
+    pip install jet-client --upgrade $JET_INDEX_URLS && \
+    /opt/jet/bin/pip install jet-api --upgrade $JET_INDEX_URLS
+ENV PATH="$PATH:/opt/jet/bin"
+###
\ No newline at end of file
diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py
index 42030257c5..beeb31860d 100644
--- a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py
+++ b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py
@@ -1,3 +1,4 @@
+import pathlib
 from typing import Optional
 
 import click
@@ -5,13 +6,16 @@
 from tests.functional_tests.python_test_utils.jet import common
 
+BASE_PATH = pathlib.Path(__file__).parent.resolve()
+
 
 @click.command()
 @click.option("--scope", required=True, type=str, help="Test scope")
 @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
 @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
 @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
-@click.option("--container-image", required=True, type=str, help="Container tag to use")
+@click.option("--container-image", required=True, type=str, help="LTS Container image to use")
+@click.option("--container-image-dev", required=True, type=str, help="Dev Container image to use")
 @click.option("--container-tag", required=True, type=str, help="Container tag to use")
 @click.option(
     "--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
 )
@@ -28,6 +32,7 @@ def main(
     h100_cluster: str,
     output_path: str,
     container_image: str,
+    container_image_dev: str,
     container_tag: str,
     run_name: Optional[str] = None,
     wandb_experiment: Optional[str] = None,
@@ -55,6 +60,29 @@ def main(
             f"--cluster {cluster}",
         ]
 
+        with open(
+            pathlib.Path(
+                BASE_PATH
+                / ".."
+                / ".."
+                / "test_cases"
+                / test_case.spec.model
+                / test_case.spec.test_case
+                / "model_config.yaml"
+            )
+        ) as stream:
+            try:
+                # An empty YAML document loads as None; treat it as "no settings".
+                test_case_dict = yaml.safe_load(stream) or {}
+            except yaml.YAMLError as exc:
+                print(exc)
+                # Keep going after a parse error, but bind the name for the check below.
+                test_case_dict = {}
+
+        # Test cases whose config sets a truthy EXPERIMENTAL key get the dev image.
+        if 'EXPERIMENTAL' in test_case_dict and test_case_dict['EXPERIMENTAL']:
+            script.append(f"--container-image {container_image_dev}")
+
         if run_name is not None and wandb_experiment is not None:
             script.append(f"--run-name {run_name}")
             script.append(f"--wandb-experiment {wandb_experiment}")