From 8e06e039f44b50d9022e2d67de91c7754fc6d8a4 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 7 Nov 2024 17:08:39 +0000
Subject: [PATCH] test cleanup

Signed-off-by: Terry Kong
---
 Dockerfile                                 |  4 ++-
 tests/functional/dpo.sh                    |  0
 tests/functional/ppo.sh                    |  0
 tests/functional/rm.sh                     |  0
 tests/functional/test_cases/dpo-llama3     |  7 +++--
 tests/functional/test_cases/dpo-mixtral-ep | 19 ++++++++++--
 .../test_cases/dpo-mixtral-peft-tp-sp      | 31 +++++++++++++++++++
 tests/functional/test_cases/dpo-mixtral-sp | 13 --------
 .../test_cases/ppo-llama3-pp2-reshard      |  3 +-
 tests/functional/test_cases/rm-llama3      | 18 ++++++++++-
 10 files changed, 75 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 tests/functional/dpo.sh
 mode change 100644 => 100755 tests/functional/ppo.sh
 mode change 100644 => 100755 tests/functional/rm.sh
 create mode 100755 tests/functional/test_cases/dpo-mixtral-peft-tp-sp
 delete mode 100755 tests/functional/test_cases/dpo-mixtral-sp
 mode change 100644 => 100755 tests/functional/test_cases/rm-llama3

diff --git a/Dockerfile b/Dockerfile
index e2e3b1670..3e752c72f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -131,17 +131,19 @@ git fetch -a
 # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
 # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
 # ba8edbd2063f3349c40c9c73e5bae46abbe65f94: fix: regular torch optims (e.g., sgd) no longer error with closure spec NeMo#11189
+# 35a7f718237cf011215db9e92273ed7236d0e8b1: Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true NeMo#10920
 for pr_and_commit in \
   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
   "10652 60e677423667c029dd05875da72bf0719774f844" \
   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
   "11189 ba8edbd2063f3349c40c9c73e5bae46abbe65f94" \
+  "10920 53cf6527571b29379188c8bb0dba8e507db3cca1" \
   ; do
   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
   git fetch origin $head_pr_commit:PR-${pr}
   # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+  git cherry-pick -m 1 --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
   # Tag cherry-picks to help
   git tag cherry-pick-PR-${pr}
 done
diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/ppo.sh b/tests/functional/ppo.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/rm.sh b/tests/functional/rm.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/test_cases/dpo-llama3 b/tests/functional/test_cases/dpo-llama3
index 8e40e94c8..f841ab8b0 100755
--- a/tests/functional/test_cases/dpo-llama3
+++ b/tests/functional/test_cases/dpo-llama3
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,4 +20,6 @@ cd $SCRIPT_DIR
 set -eoux pipefail
 
 PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-bash ../dpo.sh
+bash ../dpo.sh \
+    ++model.optim.name=mcore_distributed_optim \
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-ep b/tests/functional/test_cases/dpo-mixtral-ep
index 752a1cb12..79f6ffd1d 100755
--- a/tests/functional/test_cases/dpo-mixtral-ep
+++ b/tests/functional/test_cases/dpo-mixtral-ep
@@ -1,12 +1,27 @@
 #!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd $SCRIPT_DIR
 
 set -eoux pipefail
 
-PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
 bash ../dpo.sh \
     ++model.optim.name=mcore_distributed_optim \
     ++model.expert_model_parallel_size=2 \
-    2>&1 | tee $(basename $0 .sh).log
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-peft-tp-sp b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp
new file mode 100755
index 000000000..5350d42a3
--- /dev/null
+++ b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
+set -eoux pipefail
+
+PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+bash ../dpo.sh \
+    ++model.optim.name=mcore_distributed_optim \
+    ++model.tensor_model_parallel_size=2 \
+    ++model.expert_model_parallel_size=1 \
+    ++model.sequence_parallel=True \
+    ++model.tp_comm_overlap_disable_qkv=True \
+    model.data.pad_length_to_multiple_of=2 \
+    model.peft.peft_scheme=lora \
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-sp b/tests/functional/test_cases/dpo-mixtral-sp
deleted file mode 100755
index 389b1c32b..000000000
--- a/tests/functional/test_cases/dpo-mixtral-sp
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $SCRIPT_DIR
-
-set -eoux pipefail
-
-PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
-bash ../dpo.sh \
-    ++model.optim.name=mcore_distributed_optim \
-    ++model.tensor_model_parallel_size=2 \
-    ++model.expert_model_parallel_size=1 \
-    ++model.sequence_parallel=True \
-    2>&1 | tee $(basename $0 .sh).log
diff --git a/tests/functional/test_cases/ppo-llama3-pp2-reshard b/tests/functional/test_cases/ppo-llama3-pp2-reshard
index 9169b10da..880e3ebce 100755
--- a/tests/functional/test_cases/ppo-llama3-pp2-reshard
+++ b/tests/functional/test_cases/ppo-llama3-pp2-reshard
@@ -25,4 +25,5 @@ GBS=2 \
 RESHARD=True \
 RM_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/llama3--nlayers4-hidden64-ffn224-dummy_rm-megatron_gpt.nemo \
 ACTOR_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-  bash ../ppo.sh
+  bash ../ppo.sh \
+  2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/rm-llama3 b/tests/functional/test_cases/rm-llama3
old mode 100644
new mode 100755
index 05caba634..830b2b111
--- a/tests/functional/test_cases/rm-llama3
+++ b/tests/functional/test_cases/rm-llama3
@@ -1,8 +1,24 @@
 #!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd $SCRIPT_DIR
 
 set -eoux pipefail
 
 PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-bash ../rm.sh
\ No newline at end of file
+bash ../rm.sh \
+    2>&1 | tee $(basename $0).log
\ No newline at end of file
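
Usage note (editorial; not part of the patch): a minimal sketch of how one of the updated functional test cases would be invoked, assuming a checkout of the repository and that ALIGNER_CI_DIR points at a directory whose checkpoints/ subdirectory holds the tiny test .nemo files referenced above. The export path below is illustrative only.

    # Assumption: ALIGNER_CI_DIR holds the CI checkpoint assets these scripts expect.
    export ALIGNER_CI_DIR=/path/to/aligner-ci-assets
    cd tests/functional/test_cases
    # Each test case cds into its own directory, drives the matching ../dpo.sh,
    # ../ppo.sh, or ../rm.sh wrapper, and tees its output to <test-case-name>.log.
    ./dpo-mixtral-peft-tp-sp

The scripts are marked executable (mode 100755) in this patch, so they can be run directly as above or via bash.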