Merge branch 'ko3n1g/ci/flaky-unit-tests' into 'main'

tests: Verify flaky tests

See merge request ADLR/megatron-lm!2271
NVIDIA · Oct 28, 2024 · f1f0392 · f1f0392
2 parents 8ba37c0 + a616d45
commit f1f0392
Show file tree

Hide file tree

Showing 16 changed files with 18 additions and 29 deletions.
diff --git a/.gitlab/stages/01.test.yml b/.gitlab/stages/01.test.yml
@@ -100,7 +100,7 @@ test:build_image:
       - BUCKET: tests/unit_tests/transformer/
       - BUCKET: other
   script:
-    - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
+    - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e IMAGE -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
     - |
       CMD=$(cat <<"RUN_TEST_EOF"
       set -euxo pipefail
@@ -111,23 +111,31 @@ test:build_image:
 
       for i in $(seq $UNIT_TEST_REPEAT); do
         SEED=$((RANDOM % 9000 + 1000));
-        ARGS=()
+        MARKER=()
         if [[ $TAG != latest ]]; then
-          ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
+          MARKER+=("not internal")
+        fi
+        if [[ "$IMAGE" == *dev* ]]; then
+          MARKER+=("not flaky_in_dev")
         else
-          ARGS+=(-m "not flaky and not flaky_in_dev")
+          MARKER+=("not flaky")
         fi
+        MARKER_ARG=$(printf "%s" "${MARKER[0]}")
+        for element in "${MARKER[@]:1}"; do
+          MARKER_ARG+=" and $element"
+        done
 
         if [[ $BUCKET == other ]]; then
           BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " "\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " "))
-          ARGS+=(${BUCKETS[@]})
-          BUCKET=(tests/unit_tests)
+          IGNORE_ARGS=(${BUCKETS[@]})
+          BUCKET=tests/unit_tests
         else
-          BUCKET=(${BUCKET})
+          IGNORE_ARGS=()
+          BUCKET=${BUCKET}
         fi
 
         if [[ -d $BUCKET ]]; then
-          timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" ${BUCKET[@]}
+          timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${IGNORE_ARGS[@]}" -m "${MARKER_ARG}" $BUCKET
         fi
       done
       RUN_TEST_EOF

diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py
@@ -89,7 +89,6 @@ class _LocalClientError(Exception):
 setattr(exceptions, "ClientError", _LocalClientError)
 
 
-@pytest.mark.flaky
 def test_bin_reader():
     with tempfile.TemporaryDirectory() as temp_dir:
         # set the default nltk data path

diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py
@@ -26,7 +26,6 @@ def sample_N(dataset, N, randomize):
     return samples
 
 
-@pytest.mark.flaky
 def test_mock_gpt_dataset():
     if torch.distributed.is_available():
         Utils.initialize_distributed()

diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py
@@ -183,7 +183,6 @@ def gpt2_merge(odir):
     return path
 
 
-@pytest.mark.flaky
 def test_preprocess_data_gpt():
     with tempfile.TemporaryDirectory() as temp_dir:
 
@@ -215,6 +214,7 @@ def bert_vocab(odir):
 
 
 @pytest.mark.flaky
+@pytest.mark.flaky_in_dev
 def test_preprocess_data_bert():
     with tempfile.TemporaryDirectory() as temp_dir:
 

diff --git a/tests/unit_tests/dist_checkpointing/models/test_mamba.py b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
@@ -74,7 +74,6 @@ class TestMambaReconfiguration:
             # (False, (1, 1, 4), (8, 1, 1), True),
         ],
     )
-    @pytest.mark.flaky
     def test_parallel_reconfiguration_e2e(
         self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl
     ):

diff --git a/tests/unit_tests/dist_checkpointing/models/test_retro_model.py b/tests/unit_tests/dist_checkpointing/models/test_retro_model.py
@@ -63,7 +63,6 @@ def teardown_method(self, method):
     @pytest.mark.parametrize('src_spec_type', ['te', 'local'])
     @pytest.mark.parametrize('dst_spec_type', ['te', 'local'])
     @pytest.mark.parametrize('model_type', ['retro'])
-    @pytest.mark.flaky_in_dev
     def test_sharded_state_dict_save_load(
         self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type
     ):

diff --git a/tests/unit_tests/dist_checkpointing/test_async_save.py b/tests/unit_tests/dist_checkpointing/test_async_save.py
@@ -71,7 +71,6 @@ def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt):
 
     @pytest.mark.parametrize('async_save', [False, True])
     @pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn])
-    @pytest.mark.flaky
     def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn):
         Utils.initialize_model_parallel(2, 4)
         sharded_state_dict = {

diff --git a/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py b/tests/unit_tests/dist_checkpointing/test_flattened_resharding.py
@@ -33,7 +33,6 @@ def teardown_method(self, method):
         ('src_tp_pp', 'dest_tp_pp'),
         [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
     )
-    @pytest.mark.flaky
     def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
         Utils.initialize_model_parallel(*src_tp_pp)
         with TempNamedDir(

diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py
@@ -51,7 +51,6 @@ def get_ten(dtype: str = 'fp8'):
             (False, (2, 4), (2, 4), None),
         ],
     )
-    @pytest.mark.flaky
     def test_fp8_save_load(
         self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo
     ):

diff --git a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
@@ -282,6 +282,7 @@ def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):
 
     @pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
     @pytest.mark.flaky
+    @pytest.mark.flaky_in_dev
     def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
         Utils.initialize_model_parallel(2, 1)
 

diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
@@ -29,7 +29,6 @@ def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.flaky
     def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0
@@ -118,7 +117,6 @@ def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
 
 class TestLegacySaveAndLoad:
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.flaky
     def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0

diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py
@@ -178,7 +178,6 @@ def teardown_method(self, method):
             # ((2, 1), 2, 2),
         ],
     )
-    @pytest.mark.flaky
     def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn):
         src_world_size = tp_pp[0] * tp_pp[1] * src_dp
         dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp
@@ -256,7 +255,6 @@ def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl,
         ('src_tp_pp', 'dest_tp_pp', 'use_glu'),
         [((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)],
     )
-    @pytest.mark.flaky
     def test_finetune_doesnt_load_optimizer(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu
     ):
@@ -329,7 +327,6 @@ def test_finetune_doesnt_load_optimizer(
                 assert not diffs[0] and not diffs[1] and diffs[2]
                 assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict))
 
-    @pytest.mark.flaky
     def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         tp = 4
@@ -398,7 +395,6 @@ def teardown_method(self, method):
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (4, 2)), ((8, 1), (1, 2))]
     )
-    @pytest.mark.flaky
     def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         Utils.initialize_model_parallel(*src_tp_pp)
@@ -465,7 +461,6 @@ def teardown_method(self, method):
         ('src_tp_pp', 'dest_tp_pp'),
         [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
     )
-    @pytest.mark.flaky
     def test_optimizer_resharding(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16
     ):
@@ -517,7 +512,6 @@ def test_optimizer_resharding(
             ((2, 1, 2), (1, 1, 8)),
         ],
     )
-    @pytest.mark.flaky
     def test_chained_optimizer_resharding(
         self,
         tmp_path_dist_ckpt,

diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
@@ -58,7 +58,6 @@ def get_model_and_buffers(
 @pytest.mark.parametrize("use_distributed_optimizer", [False, True])
 @pytest.mark.parametrize("bias", [False, True])
 @pytest.mark.parametrize("shared_embedding", [False, True])
-@pytest.mark.flaky
 def test_bucket_sizes(
     bucket_size: Optional[int], use_distributed_optimizer: bool, bias: bool, shared_embedding: bool
 ):

diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
@@ -70,7 +70,6 @@ def test_capacity_forward_backward(self, tp_size, ep_size):
     @pytest.mark.internal
     @pytest.mark.timeout(120)
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
-    @pytest.mark.flaky
     def test_capacity_padding_forward_backward(self, tp_size, ep_size):
         container = MoEModelTestContainer(
             tp_size=tp_size,

diff --git a/tests/unit_tests/transformer/moe/test_upcycling.py b/tests/unit_tests/transformer/moe/test_upcycling.py
@@ -128,7 +128,6 @@ def teardown_method(self, method):
         destroy_num_microbatches_calculator()
 
     @pytest.mark.internal
-    @pytest.mark.flaky  # TODO: Fix the test
     @pytest.mark.parametrize(
         ('tp_pp_ep', 'enable_vp', 'enable_grouped_gemm'), [((1, 1, 2), (False), (False))]
     )

diff --git a/tests/unit_tests/transformer/test_retro_attention.py b/tests/unit_tests/transformer/test_retro_attention.py
@@ -81,7 +81,6 @@ def setup_method(self, method):
     def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
-    @pytest.mark.flaky_in_dev
     def test_constructor(self):
 
         config = self.get_config()
@@ -193,7 +192,6 @@ def run_gpu_forward(self, recompute_granularity, use_transformer_engine):
             config.hidden_size,
         )
 
-    @pytest.mark.flaky_in_dev
     def test_gpu_forward(self):
         for recompute_granularity in (None, 'selective'):
             for use_transformer_engine in (True, False):