Commit 7e93e37

fix: adds missing support for mcore dist opt and adds test for moe

Signed-off-by: Terry Kong <[email protected]>
moe test is all2all
Signed-off-by: Terry Kong <[email protected]>
other params
Signed-off-by: Terry Kong <[email protected]>
fix peft mixtral
Signed-off-by: Terry Kong <[email protected]>
1 parent 852d847 commit 7e93e37

21 files changed (+182, -46 lines)

.github/workflows/cicd-main.yml

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ jobs:
- kd-llama3
- sft-llama3
- rm-llama3
+ - dpo-mixtral-ep
+ - dpo-mixtral-peft-tp-sp
with:
RUNNER: self-hosted-azure
# Fairly aggresive timeout that all functional tests should try to adhere to

Dockerfile

Lines changed: 7 additions & 5 deletions
@@ -11,10 +11,10 @@
# if you get errors building TE or Apex, decrease this to 4
ARG MAX_JOBS=8
# Git refs for dependencies
- ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
- ARG PYTRITON_VERSION=0.5.10
- ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main
- ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main
+ ARG TE_TAG=2215fa5c7557b66034068816020f9f611019e457
+ ARG PYTRITON_VERSION=0.5.12
+ ARG NEMO_TAG=2b87d313bc04f456311b2fb99acafbfdaeb5e1e1 # On: r2.0.0
+ ARG MLM_TAG=1b869f019af2c7aabf9c4deffe6eb64ebef88608 # On: core_r0.9.0
ARG ALIGNER_COMMIT=main
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4

@@ -130,16 +130,18 @@ git fetch -a
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+ # ba8edbd2063f3349c40c9c73e5bae46abbe65f94: fix: regular torch optims (e.g., sgd) no longer error with closure spec NeMo#11189
for pr_and_commit in \
"10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
"10652 60e677423667c029dd05875da72bf0719774f844" \
"10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+ "11189 ba8edbd2063f3349c40c9c73e5bae46abbe65f94" \
; do
pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
git fetch origin $head_pr_commit:PR-${pr}
# cherry-picks all commits between main and the top of the PR
- git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+ git cherry-pick -m 1 --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
# Tag cherry-picks to help
git tag cherry-pick-PR-${pr}
done
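For reference, a minimal shell sketch of the cherry-pick pattern used in that loop; the PR number and SHA below are placeholders, not refs from this repo. The relevant change is the added -m 1, which tells cherry-pick to diff any merge commit in the range against its first (mainline) parent; without it, cherry-picking a merge commit in the range fails.

# Hypothetical PR number and commit SHA, for illustration only.
pr=12345
head_pr_commit=0000000000000000000000000000000000000000
# Fetch the PR head into a local branch named PR-<number>.
git fetch origin ${head_pr_commit}:PR-${pr}
# Pick every commit between main and the PR tip. --allow-empty keeps going
# when a pick produces an empty commit; -m 1 selects the first parent as the
# mainline so merge commits in the range can be picked.
git cherry-pick -m 1 --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
# Tag the result to record which PR was cherry-picked.
git tag cherry-pick-PR-${pr}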

examples/nlp/gpt/conf/gpt_dpo.yaml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# dpo specific args
dpo:

@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
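Read together, the two hunks give the trainer section the following shape (a hedged sketch: indentation and the placement of surrounding keys are assumed from the context lines above, not copied from the full file). The new trainer-level gradient_clip_val: 0.0 is the value the Megatron Core distributed optimizer reads, while the algorithm-level gradient_clip_val: 1.0 stays in place until that optimizer becomes the default; the same two-line pattern is applied to the kto, ppo, rs, sft, spin, and rm configs below.

trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  gradient_clip_val: 0.0  # No need to change. Megatron Core optimizer uses this value

  # dpo specific args
  dpo:
    # ... other dpo args elided ...
    limit_val_batches: 1.0
    # TODO: delete once Megatron Core optimizer becomes default
    gradient_clip_val: 1.0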

examples/nlp/gpt/conf/gpt_kto.yaml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# kto specific args
kto:

@@ -17,6 +18,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these

examples/nlp/gpt/conf/gpt_ppo_actor.yaml

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
# How many steps we train warmup the critic for (without training the policy)

@@ -21,6 +22,7 @@ trainer:
max_steps: -1 # max PPO steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# PPO args to generate the data for training

examples/nlp/gpt/conf/gpt_ppo_critic.yaml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

ppo:
port: 5556

@@ -15,6 +16,7 @@ trainer:

# used to set the learning rate scheduler
max_steps: 10000
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# a PyTriton parameter to specify

examples/nlp/gpt/conf/gpt_rs_actor.yaml

Lines changed: 3 additions & 1 deletion
@@ -7,12 +7,14 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

rs:
max_epochs: 1
max_steps: -1 # max rs steps (-1 to go through the whole train set)
val_check_interval: 10
save_interval: ${.val_check_interval}
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# pick up from the model

@@ -177,4 +179,4 @@ model:
# define fields from the base model's config that should be ignored when merging with this config.
overwrite_base_config:
data:
- data_prefix: True
+ data_prefix: True

examples/nlp/gpt/conf/gpt_sft.yaml

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@ trainer:
devices: 1
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

sft:
max_epochs: 1

@@ -15,6 +16,7 @@ trainer:
limit_train_batches: 1.0

limit_val_batches: 1.0
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# can be used to register any custom metrics that require token-by-token generation

examples/nlp/gpt/conf/gpt_spin.yaml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16-mixed
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# spin specific args
spin:

@@ -18,6 +19,7 @@ trainer:

# how many GBS we loop over
limit_val_batches: 1.0
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these

examples/nlp/gpt/conf/training_rm.yaml

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ trainer:
devices: 8
accelerator: gpu
precision: bf16
+ gradient_clip_val: 0.0 # No need to change. Megatron Core optimizer uses this value

# rm specific args
rm:

@@ -20,6 +21,7 @@ trainer:
# set to float for a percentage
# of the validation dataset
limit_val_batches: 1.0
+ # TODO: delete once Megatron Core optimizer becomes default
gradient_clip_val: 1.0

# do not change these
