From 8e06e039f44b50d9022e2d67de91c7754fc6d8a4 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 7 Nov 2024 17:08:39 +0000
Subject: [PATCH] test cleanup

Signed-off-by: Terry Kong
---
 Dockerfile                                 |  4 ++-
 tests/functional/dpo.sh                    |  0
 tests/functional/ppo.sh                    |  0
 tests/functional/rm.sh                     |  0
 tests/functional/test_cases/dpo-llama3     |  7 +++--
 tests/functional/test_cases/dpo-mixtral-ep | 19 ++++++++++--
 .../test_cases/dpo-mixtral-peft-tp-sp      | 31 +++++++++++++++++++
 tests/functional/test_cases/dpo-mixtral-sp | 13 --------
 .../test_cases/ppo-llama3-pp2-reshard      |  3 +-
 tests/functional/test_cases/rm-llama3      | 18 ++++++++++-
 10 files changed, 75 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 tests/functional/dpo.sh
 mode change 100644 => 100755 tests/functional/ppo.sh
 mode change 100644 => 100755 tests/functional/rm.sh
 create mode 100755 tests/functional/test_cases/dpo-mixtral-peft-tp-sp
 delete mode 100755 tests/functional/test_cases/dpo-mixtral-sp
 mode change 100644 => 100755 tests/functional/test_cases/rm-llama3

diff --git a/Dockerfile b/Dockerfile
index e2e3b1670..3e752c72f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -131,17 +131,19 @@ git fetch -a
 # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
 # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
 # ba8edbd2063f3349c40c9c73e5bae46abbe65f94: fix: regular torch optims (e.g., sgd) no longer error with closure spec NeMo#11189
+# 35a7f718237cf011215db9e92273ed7236d0e8b1: Fix for crash with LoRA + tp_overlap_comm=false + sequence_parallel=true NeMo#10920
 for pr_and_commit in \
   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
   "10652 60e677423667c029dd05875da72bf0719774f844" \
   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
   "11189 ba8edbd2063f3349c40c9c73e5bae46abbe65f94" \
+  "10920 53cf6527571b29379188c8bb0dba8e507db3cca1" \
   ; do
   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
   git fetch origin $head_pr_commit:PR-${pr}
   # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+  git cherry-pick -m 1 --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
   # Tag cherry-picks to help
   git tag cherry-pick-PR-${pr}
 done
diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/ppo.sh b/tests/functional/ppo.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/rm.sh b/tests/functional/rm.sh
old mode 100644
new mode 100755
diff --git a/tests/functional/test_cases/dpo-llama3 b/tests/functional/test_cases/dpo-llama3
index 8e40e94c8..f841ab8b0 100755
--- a/tests/functional/test_cases/dpo-llama3
+++ b/tests/functional/test_cases/dpo-llama3
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,4 +20,6 @@ cd $SCRIPT_DIR
 set -eoux pipefail
 
 PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-bash ../dpo.sh
+bash ../dpo.sh \
+    ++model.optim.name=mcore_distributed_optim \
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-ep b/tests/functional/test_cases/dpo-mixtral-ep
index 752a1cb12..79f6ffd1d 100755
--- a/tests/functional/test_cases/dpo-mixtral-ep
+++ b/tests/functional/test_cases/dpo-mixtral-ep
@@ -1,12 +1,27 @@
 #!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd $SCRIPT_DIR
 
 set -eoux pipefail
 
-PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
 bash ../dpo.sh \
     ++model.optim.name=mcore_distributed_optim \
     ++model.expert_model_parallel_size=2 \
-    2>&1 | tee $(basename $0 .sh).log
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-peft-tp-sp b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp
new file mode 100755
index 000000000..5350d42a3
--- /dev/null
+++ b/tests/functional/test_cases/dpo-mixtral-peft-tp-sp
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
+set -eoux pipefail
+
+PRETRAINED_CHECKPOINT_NEMO_FILE=$ALIGNER_CI_DIR/checkpoints/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
+bash ../dpo.sh \
+    ++model.optim.name=mcore_distributed_optim \
+    ++model.tensor_model_parallel_size=2 \
+    ++model.expert_model_parallel_size=1 \
+    ++model.sequence_parallel=True \
+    ++model.tp_comm_overlap_disable_qkv=True \
+    model.data.pad_length_to_multiple_of=2 \
+    model.peft.peft_scheme=lora \
+    2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/dpo-mixtral-sp b/tests/functional/test_cases/dpo-mixtral-sp
deleted file mode 100755
index 389b1c32b..000000000
--- a/tests/functional/test_cases/dpo-mixtral-sp
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-cd $SCRIPT_DIR
-
-set -eoux pipefail
-
-PRETRAINED_CHECKPOINT_NEMO_FILE=/home/terryk/saved_experiments/tiny-mixtral-nlayers2-hidden128-ffn448-nhead4-qgroup2.nemo \
-bash ../dpo.sh \
-    ++model.optim.name=mcore_distributed_optim \
-    ++model.tensor_model_parallel_size=2 \
-    ++model.expert_model_parallel_size=1 \
-    ++model.sequence_parallel=True \
-    2>&1 | tee $(basename $0 .sh).log
diff --git a/tests/functional/test_cases/ppo-llama3-pp2-reshard b/tests/functional/test_cases/ppo-llama3-pp2-reshard
index 9169b10da..880e3ebce 100755
--- a/tests/functional/test_cases/ppo-llama3-pp2-reshard
+++ b/tests/functional/test_cases/ppo-llama3-pp2-reshard
@@ -25,4 +25,5 @@ GBS=2 \
 RESHARD=True \
 RM_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/llama3--nlayers4-hidden64-ffn224-dummy_rm-megatron_gpt.nemo \
 ACTOR_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-  bash ../ppo.sh
+  bash ../ppo.sh \
+  2>&1 | tee $(basename $0).log
diff --git a/tests/functional/test_cases/rm-llama3 b/tests/functional/test_cases/rm-llama3
old mode 100644
new mode 100755
index 05caba634..830b2b111
--- a/tests/functional/test_cases/rm-llama3
+++ b/tests/functional/test_cases/rm-llama3
@@ -1,8 +1,24 @@
 #!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 cd $SCRIPT_DIR
 
 set -eoux pipefail
 
 PRETRAINED_CHECKPOINT_NEMO_FILE=${ALIGNER_CI_DIR}/checkpoints/tiny-llama3-results-nlayers2-hidden128-ffn448-nhead4-qgroup2-megatron_gpt.nemo \
-bash ../rm.sh
\ No newline at end of file
+bash ../rm.sh \
+    2>&1 | tee $(basename $0).log
\ No newline at end of file
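
Usage note (editorial; not part of the patch): a minimal sketch of how one of the updated functional test cases would be invoked, assuming a checkout of the repository and that ALIGNER_CI_DIR points at a directory whose checkpoints/ subdirectory holds the tiny test .nemo files referenced above. The export path below is illustrative only.

    # Assumption: ALIGNER_CI_DIR holds the CI checkpoint assets these scripts expect.
    export ALIGNER_CI_DIR=/path/to/aligner-ci-assets
    cd tests/functional/test_cases
    # Each test case cds into its own directory, drives the matching ../dpo.sh,
    # ../ppo.sh, or ../rm.sh wrapper, and tees its output to <test-case-name>.log.
    ./dpo-mixtral-peft-tp-sp

The scripts are marked executable (mode 100755) in this patch, so they can be run directly as above or via bash.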