Skip to content

Commit 4876ee1

Browse files
committed
Merge branch 'ko3n1g/chore/bump-pyt' into 'main'
chore: Bump Pytorch container See merge request ADLR/megatron-lm!2017
2 parents 772faca + 831d64d commit 4876ee1

File tree

197 files changed

+5140
-79
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

197 files changed

+5140
-79
lines changed

.gitlab/stages/01.tests.yml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ build_image:
2323
TAG: mcore-docker-node-large
2424
- IMAGE: CI_MCORE_DEV_IMAGE
2525
FILE: Dockerfile.ci.dev
26-
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
26+
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3
2727
TAG: mcore-docker-node-large
2828
- IMAGE: CI_NEMO_IMAGE
2929
FILE: Dockerfile.ci
@@ -92,10 +92,12 @@ unit_tests:
9292
matrix:
9393
- TAG: latest
9494
IMAGE: ${CI_MCORE_IMAGE}
95-
# - TAG: latest
96-
# IMAGE: ${CI_MCORE_DEV_IMAGE}
95+
- TAG: latest
96+
IMAGE: ${CI_MCORE_DEV_IMAGE}
9797
- TAG: core_r0.9.0
9898
IMAGE: ${CI_MCORE_IMAGE}
99+
- TAG: core_r0.9.0
100+
IMAGE: ${CI_MCORE_DEV_IMAGE}
99101
tags: [8xL40S]
100102
variables:
101103
GIT_STRATEGY: clone
@@ -109,6 +111,9 @@ unit_tests:
109111
fi
110112
script:
111113
- |
114+
export NVTE_FLASH_ATTN=0
115+
export NVTE_FUSED_ATTN=0
116+
112117
cd /opt/megatron-lm
113118
if [[ $UNIT_TEST_REPEAT -eq 0 ]]; then
114119
exit 0
@@ -118,12 +123,9 @@ unit_tests:
118123
SEED=$((RANDOM % 9000 + 1000));
119124
ARGS=()
120125
if [[ $TAG != latest ]]; then
121-
ARGS+=(-m "not internal")
126+
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
122127
else
123-
ARGS+=(-m "not flaky")
124-
fi
125-
if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then
126-
ARGS+=(-m "experimental")
128+
ARGS+=(-m "not flaky and not flaky_in_dev")
127129
fi
128130
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
129131
done

.gitlab/stages/02.functional-tests.yml

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,25 +51,56 @@ jet-generate:
5151
export PYTHONPATH=$(pwd)
5252
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
5353
--scope $FUNCTIONAL_TEST_SCOPE \
54+
--environment dev \
5455
--a100-cluster $A100_CLUSTER \
5556
--h100-cluster $H100_CLUSTER \
57+
--container-image ${CI_MCORE_IMAGE} \
5658
--container-tag ${CI_PIPELINE_ID} \
59+
--output-path "jet-trigger-job-dev.yaml" \
60+
${RELEASE_ARGS[@]}
61+
62+
- |
63+
export PYTHONPATH=$(pwd)
64+
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
65+
--scope $FUNCTIONAL_TEST_SCOPE \
66+
--environment lts \
67+
--a100-cluster $A100_CLUSTER \
68+
--h100-cluster $H100_CLUSTER \
5769
--container-image ${CI_MCORE_IMAGE} \
58-
--container-image-dev ${CI_MCORE_DEV_IMAGE} \
59-
--output-path "jet-trigger-job.yaml" \
70+
--container-tag ${CI_PIPELINE_ID} \
71+
--output-path "jet-trigger-job-lts.yaml" \
6072
${RELEASE_ARGS[@]}
6173
artifacts:
6274
paths:
63-
- jet-trigger-job.yaml
75+
- jet-trigger-job-lts.yaml
76+
- jet-trigger-job-dev.yaml
6477
- tests/functional_tests/local_recipes
6578

66-
jet-trigger:
79+
jet-trigger-lts:
80+
stage: functional_tests
81+
needs: [jet-generate]
82+
extends: [.jet_common]
83+
trigger:
84+
include:
85+
- artifact: jet-trigger-job-lts.yaml
86+
job: jet-generate
87+
strategy: depend
88+
variables:
89+
RO_API_TOKEN: $PAT
90+
CONTAINER_TAG: $CI_PIPELINE_ID
91+
CI_MCORE_IMAGE: $CI_MCORE_IMAGE
92+
GITLAB_ENDPOINT: $GITLAB_ENDPOINT
93+
PARENT_PIPELINE_ID: $CI_PIPELINE_ID
94+
inherit:
95+
variables: true
96+
97+
jet-trigger-dev:
6798
stage: functional_tests
6899
needs: [jet-generate]
69100
extends: [.jet_common]
70101
trigger:
71102
include:
72-
- artifact: jet-trigger-job.yaml
103+
- artifact: jet-trigger-job-dev.yaml
73104
job: jet-generate
74105
strategy: depend
75106
variables:
@@ -81,10 +112,10 @@ jet-trigger:
81112
inherit:
82113
variables: true
83114

84-
jet-results-notify:
115+
jet-results-notify-lts:
85116
extends: [.jet_common]
86117
image: ${GITLAB_ENDPOINT}:5005/dl/jet/api:latest
87-
needs: [jet-trigger]
118+
needs: [jet-trigger-lts, jet-trigger-dev]
88119
tags:
89120
- mcore-docker-node-small
90121
before_script:
@@ -96,7 +127,7 @@ jet-results-notify:
96127
- export GITLAB_ENDPOINT
97128
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
98129
- export DATE=$(date +"%Y-%m-%d")
99-
- bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID}
130+
- bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID} lts
100131
artifacts:
101132
when: always
102133
paths:
@@ -106,3 +137,27 @@ jet-results-notify:
106137
when: always
107138
- when: never
108139

140+
jet-results-notify-dev:
141+
extends: [.jet_common]
142+
image: ${GITLAB_ENDPOINT}:5005/dl/jet/api:latest
143+
needs: [jet-trigger-lts, jet-trigger-dev]
144+
tags:
145+
- mcore-docker-node-small
146+
before_script:
147+
- jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $VAULT_JWT_TOKEN
148+
script:
149+
- env
150+
- export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
151+
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
152+
- export GITLAB_ENDPOINT
153+
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
154+
- export DATE=$(date +"%Y-%m-%d")
155+
- bash tests/functional_tests/shell_test_utils/notify.sh ${CI_PIPELINE_ID} dev
156+
artifacts:
157+
when: always
158+
paths:
159+
- scripts
160+
rules:
161+
- if: $CI_PIPELINE_SOURCE == "schedule" && $FUNCTIONAL_TEST == "yes"
162+
when: always
163+
- when: never

Dockerfile.ci.dev

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ RUN pip3 wheel -v git+https://github.com/fanshiqing/grouped_gemm@v1.1.2
1111

1212
FROM $FROM_IMAGE_NAME as build_mamba_ssm
1313
WORKDIR /opt
14-
RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.0.3
14+
RUN MAMBA_FORCE_BUILD=TRUE pip3 wheel -v git+https://github.com/state-spaces/mamba.git@v2.2.0
1515

1616
FROM $FROM_IMAGE_NAME as main
1717
ENV DEBIAN_FRONTEND=noninteractive
@@ -23,9 +23,9 @@ RUN apt-get update && \
2323
wget https://github.com/mikefarah/yq/releases/download/v4.44.1/yq_linux_amd64 -O /usr/local/bin/yq && \
2424
chmod a+x /usr/local/bin/yq
2525

26-
COPY --from=build_causal_conv1d /opt/causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl ./
27-
COPY --from=build_grouped_gemm /opt/grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl ./
28-
COPY --from=build_mamba_ssm /opt/mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl ./
26+
COPY --from=build_causal_conv1d /opt/causal_conv1d-*.whl ./
27+
COPY --from=build_grouped_gemm /opt/grouped_gemm-*.whl ./
28+
COPY --from=build_mamba_ssm /opt/mamba_ssm-*.whl ./
2929

3030
RUN pip3 install --no-cache-dir --upgrade-strategy only-if-needed -v \
3131
einops \
@@ -40,10 +40,9 @@ tiktoken \
4040
wrapt \
4141
zarr \
4242
wandb \
43-
triton==2.1.0 \
44-
causal_conv1d-1.2.2.post1-cp310-cp310-linux_x86_64.whl \
45-
mamba_ssm-2.0.3-cp310-cp310-linux_x86_64.whl \
46-
grouped_gemm-1.1.2-cp310-cp310-linux_x86_64.whl \
43+
causal_conv1d-*.whl \
44+
mamba_ssm-*.whl \
45+
grouped_gemm-*.whl \
4746
tensorstore==0.1.45 && \
4847
rm *.whl
4948

0 commit comments

Comments
 (0)