Skip to content

Commit 32e427d

Browse files
committed
Merge branch 'ko3n1g/ci/fix-stage' into 'main'
ci: Allow dry-run of publish See merge request ADLR/megatron-lm!2262
2 parents 345b102 + 2501d52 commit 32e427d

File tree

3 files changed

+217
-67
lines changed

3 files changed

+217
-67
lines changed

.gitlab-ci.yml

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ workflow:
1919
FUNCTIONAL_TEST_SCOPE: mr
2020
FUNCTIONAL_TEST_CLUSTER_A100: ""
2121
FUNCTIONAL_TEST_CLUSTER_H100: ""
22+
PUBLISH: "no"
2223
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
2324
variables:
2425
UNIT_TEST_REPEAT: 5
@@ -27,6 +28,7 @@ workflow:
2728
FUNCTIONAL_TEST_SCOPE: nightly
2829
FUNCTIONAL_TEST_CLUSTER_A100: ""
2930
FUNCTIONAL_TEST_CLUSTER_H100: ""
31+
PUBLISH: "no"
3032
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
3133
variables:
3234
UNIT_TEST_REPEAT: 5
@@ -35,29 +37,36 @@ workflow:
3537
FUNCTIONAL_TEST_SCOPE: weekly
3638
FUNCTIONAL_TEST_CLUSTER_A100: ""
3739
FUNCTIONAL_TEST_CLUSTER_H100: ""
40+
PUBLISH: "no"
3841
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
3942
variables:
4043
FUNCTIONAL_TEST: "no"
44+
PUBLISH: "no"
4145
- when: never
4246
auto_cancel:
4347
on_new_commit: interruptible
4448

4549
stages:
4650
- test
4751
- functional_tests
48-
- convergence_tests
4952
- publish
5053

5154
default:
5255
interruptible: true
5356

5457
variables:
55-
UNIT_TEST_TIMEOUT:
56-
value: "15"
57-
description: Timeout (minutes) for Unit tests (all repeats)
58+
UNIT_TEST:
59+
value: "yes"
60+
options:
61+
- "yes"
62+
- "no"
63+
description: To run the unit test suite
5864
UNIT_TEST_REPEAT:
5965
value: "1"
6066
description: "Number of repetitions"
67+
UNIT_TEST_TIMEOUT:
68+
value: "15"
69+
description: Timeout (minutes) for Unit tests (all repeats)
6170
FUNCTIONAL_TEST:
6271
value: "yes"
6372
options:

.gitlab/stages/01.test.yml

Lines changed: 151 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.test_rules:
22
rules:
3-
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
3+
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
44
allow_failure: true
55
when: on_success
66
- when: on_success
@@ -46,7 +46,7 @@ test:build_image:
4646
4747
ADDITIONAL_PARAMS=()
4848
49-
if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
49+
if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" ]]; then
5050
ADDITIONAL_PARAMS+=("--pull")
5151
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main")
5252
fi
@@ -118,10 +118,10 @@ test:build_image:
118118
paths:
119119
- coverage
120120
rules:
121-
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
121+
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true" && $UNIT_TEST_REPEAT != '0'
122122
allow_failure: true
123123
when: on_success
124-
- if: $UNIT_TEST_REPEAT != '0'
124+
- if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
125125
when: on_success
126126

127127
test:pyt(LTS)_mcore(latest):
@@ -135,6 +135,8 @@ test:pyt(LTS)_mcore(0.9.0):
135135
variables:
136136
TAG: core_r0.9.0
137137
IMAGE: ${CI_MCORE_LTS_IMAGE}
138+
UNIT_TEST_REPEAT: 1
139+
UNIT_TEST_TIMEOUT: 15
138140

139141
test:pyt(DEV)_mcore(latest):
140142
extends: [.unit_tests]
@@ -147,8 +149,10 @@ test:pyt(DEV)_mcore(0.9.0):
147149
variables:
148150
TAG: core_r0.9.0
149151
IMAGE: ${CI_MCORE_DEV_IMAGE}
152+
UNIT_TEST_REPEAT: 1
153+
UNIT_TEST_TIMEOUT: 15
150154

151-
test:notify:
155+
test:notify_unit_tests:
152156
extends: [.test_rules]
153157
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
154158
needs:
@@ -229,4 +233,145 @@ test:secret_detection:
229233
echo "Atleast one vulnerability has been found"
230234
cat gl-secret-detection-report.json | jq '.'
231235
exit 1
232-
fi
236+
fi
237+
238+
# Builds the package with the CPython 3.10 and 3.11 interpreters of the
# manylinux_2_28 image, then runs `auditwheel repair` on the resulting wheels.
# While PUBLISH_DRYRUN is "yes", the PATCH line of
# megatron/core/package_info.py is first overwritten with a random number.
test:pypi_build_wheel:
  extends:
    - .test_rules
  tags:
    - mcore-docker-node-small
  image:
    name: quay.io/pypa/manylinux_2_28_x86_64
    # The image entrypoint is overridden with an empty one.
    entrypoint: [""]
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    # Dry-run only: replace the PATCH line with a random value in 1000-9999.
    # NOTE(review): presumably so repeated dry-run uploads get distinct
    # version numbers -- confirm.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        sed -i "/^PATCH/c\PATCH = $((RANDOM % 9000 + 1000))" megatron/core/package_info.py
      fi
    # One build per interpreter.
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    - auditwheel repair dist/*.whl
  artifacts:
    paths:
      # The (possibly patched) version file is handed to later jobs together
      # with the wheels.
      - megatron/core/package_info.py
      - wheelhouse/
259+
260+
# Installs the cp310 wheel from test:pypi_build_wheel and checks that the
# installed package reports the same version as the source tree did before it
# was removed.
test:pypi_test_wheel:
  extends:
    - .test_rules
  needs:
    - test:pypi_build_wheel
  tags:
    - mcore-docker-node-small
  image: nvcr.io/nvidia/pytorch:24.01-py3
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    # Version as reported by the sources in the working directory.
    - EXPECTED_RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Delete the local package directory before installing the wheel, so the
    # second version lookup cannot be served from the working directory.
    - rm -rf megatron
    - pip install wheelhouse/*cp310*.whl

    # Version as reported after the wheel has been installed.
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Log both values, then fail the job if they differ.
    - >-
      echo "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
    - test "$EXPECTED_RELEASE_NUMBER" == "$RELEASE_NUMBER"
  artifacts:
    paths:
      - wheelhouse/
279+
280+
# Uploads the wheels passed on by test:pypi_test_wheel with twine: to the
# "testpypi" repository while PUBLISH_DRYRUN is "yes", to "pypi" otherwise.
#
# Reads the CI variables TWINE_TEST_USERNAME / TWINE_TEST_PASSWORD (dry run)
# or TWINE_PROD_USERNAME / TWINE_PROD_PASSWORD (real publish).
test:pypi_push_wheel:
  extends: [.test_rules]
  image: python:3.10
  tags: [mcore-docker-node-small]
  needs: [test:pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    # Export the credentials under the names twine itself reads from the
    # environment (TWINE_USERNAME / TWINE_PASSWORD). This keeps the password
    # off the command line, where it would be visible in the process list.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        REPOSITORY=testpypi
        export TWINE_USERNAME="$TWINE_TEST_USERNAME"
        export TWINE_PASSWORD="$TWINE_TEST_PASSWORD"
      else
        REPOSITORY=pypi
        export TWINE_USERNAME="$TWINE_PROD_USERNAME"
        export TWINE_PASSWORD="$TWINE_PROD_PASSWORD"
      fi
    - pip install twine
    # --non-interactive makes a missing credential fail the job instead of
    # waiting on a prompt. The glob is deliberately left unquoted.
    - twine upload --non-interactive --repository "$REPOSITORY" wheelhouse/*
300+
301+
# Creates a GitHub release for NVIDIA/Megatron-LM through the REST API.
# The release name is "NVIDIA Megatron Core <version>", the body is the
# matching "## ..." section of CHANGELOG.md, and both tag and target are
# $CI_COMMIT_BRANCH. While PUBLISH_DRYRUN is "yes" the request is only printed.
#
# Reads the CI variable GH_TOKEN for authentication. The script uses bash
# features (`[[ ]]`, arrays).
test:gh_release:
  extends: [.test_rules]
  tags: [mcore-docker-node-small]
  image: nvcr.io/nvidia/pytorch:24.01-py3
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    - NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
    # Collect the lines between the "## $NAME" heading and the next "## "
    # heading, then drop empty lines.
    - CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
    - CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
    # Let jq build the JSON so names and changelog text are escaped correctly.
    - |
      PAYLOAD=$(jq -nc \
        --arg CI_COMMIT_BRANCH "$CI_COMMIT_BRANCH" \
        --arg NAME "$NAME" \
        --arg BODY "$CHANGELOG" \
        '{
          "tag_name": $CI_COMMIT_BRANCH,
          "target_commitish": $CI_COMMIT_BRANCH,
          "name": $NAME,
          "body": $BODY,
          "draft": false,
          "prerelease": false,
          "generate_release_notes": false
        }'
      )
    # The arguments are kept in an array and never flattened into a string:
    # echo + eval would strip the quotes, so the header values and the JSON
    # payload would be re-split and re-interpreted by the shell.
    - |
      CURL_ARGS=(
        -L
        -X POST
        -H "Accept: application/vnd.github+json"
        -H "X-GitHub-Api-Version: 2022-11-28"
        https://api.github.com/repos/NVIDIA/Megatron-LM/releases
        -d "$PAYLOAD"
      )

      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        # Show the request in re-usable shell quoting; the Authorization
        # header is left out so the token never reaches the job log.
        printf '%q ' curl "${CURL_ARGS[@]}"
        printf '\n'
        echo "(Authorization header omitted in dry run)"
      else
        # --fail turns an HTTP error response into a non-zero exit code, so a
        # rejected release fails the job instead of passing silently.
        curl --fail -H "Authorization: Bearer $GH_TOKEN" "${CURL_ARGS[@]}"
      fi
342+
343+
# Posts a "Megatron-Core released" message, linking to the GitHub release tag
# core_r<version>, to the webhook in MCORE_NOTIFICATION_HOOK_MAIN once
# test:pypi_push_wheel and test:gh_release have finished. While PUBLISH_DRYRUN
# is "yes" the request is only printed.
#
# The script uses bash features (`[[ ]]`, arrays).
test:notify_release:
  needs: [test:pypi_push_wheel, test:gh_release]
  extends: [.test_rules]
  image: nvcr.io/nvidia/pytorch:24.01-py3
  tags: [mcore-docker-node-small]
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - VERSION=$(python -c "from megatron import core; print(core.__version__)")
    - URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_r$VERSION"
    # The single-quoted JSON is interrupted around $URL and $VERSION so the
    # shell substitutes them into the message text.
    - |
      MESSAGE='{
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron-Core released <'$URL'|core_r'$VERSION'> 🚀"
            }
          }
        ]
      }'
    - echo "$MESSAGE"
    # The arguments are kept in an array and never flattened into a string:
    # echo + eval would strip the quotes around the multi-line JSON payload,
    # which the shell would then split into many separate arguments.
    - |
      CURL_ARGS=(
        -X POST
        -H "Content-type: application/json"
        --data "$MESSAGE"
      )

      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        # Print the request with a placeholder instead of the webhook URL, so
        # the URL does not end up in the job log.
        printf '%q ' curl "${CURL_ARGS[@]}"
        printf '%s\n' '"$MCORE_NOTIFICATION_HOOK_MAIN"'
      else
        curl "${CURL_ARGS[@]}" "${MCORE_NOTIFICATION_HOOK_MAIN}"
      fi

0 commit comments

Comments
 (0)