diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml
index dc59e026ac..ed80e96fee 100644
--- a/.gitlab/stages/01.tests.yml
+++ b/.gitlab/stages/01.tests.yml
@@ -119,6 +119,8 @@ unit_tests:
       ARGS=()
       if [[ $TAG != latest ]]; then
         ARGS+=(-m "not internal")
+      else
+        ARGS+=(-m "not flaky")
       fi
       if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then
         ARGS+=(-m "experimental")
diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py
index 854936cdb3..b8b6ec5dd7 100644
--- a/tests/unit_tests/data/test_bin_reader.py
+++ b/tests/unit_tests/data/test_bin_reader.py
@@ -89,7 +89,7 @@ class _LocalClientError(Exception):
 setattr(exceptions, "ClientError", _LocalClientError)
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_bin_reader():
     with tempfile.TemporaryDirectory() as temp_dir:
         # set the default nltk data path
diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py
index 953845f1c9..817ea227f1 100644
--- a/tests/unit_tests/data/test_gpt_dataset.py
+++ b/tests/unit_tests/data/test_gpt_dataset.py
@@ -26,7 +26,7 @@ def sample_N(dataset, N, randomize):
     return samples
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_mock_gpt_dataset():
     if torch.distributed.is_available():
         Utils.initialize_distributed()
diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py
index 0b460f51a9..4eca14e588 100644
--- a/tests/unit_tests/data/test_preprocess_data.py
+++ b/tests/unit_tests/data/test_preprocess_data.py
@@ -183,7 +183,7 @@ def gpt2_merge(odir):
     return path
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_preprocess_data_gpt():
     with tempfile.TemporaryDirectory() as temp_dir:
 
@@ -214,7 +214,7 @@ def bert_vocab(odir):
     return path
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_preprocess_data_bert():
     with tempfile.TemporaryDirectory() as temp_dir:
 
diff --git a/tests/unit_tests/dist_checkpointing/models/test_mamba.py b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
index 175db4580a..6bdcd9b827 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_mamba.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
@@ -74,7 +74,7 @@ class TestMambaReconfiguration:
             # (False, (1, 1, 4), (8, 1, 1), True),
         ],
     )
-    @pytest.mark.skip(reason="Flaky test; needs to be debugged")
+    @pytest.mark.flaky
     def test_parallel_reconfiguration_e2e(
         self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl
     ):
diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py
index 1238d09f76..d2dcb367c7 100644
--- a/tests/unit_tests/dist_checkpointing/test_fp8.py
+++ b/tests/unit_tests/dist_checkpointing/test_fp8.py
@@ -51,7 +51,7 @@ def get_ten(dtype: str = 'fp8'):
             (False, (2, 4), (2, 4), None),
         ],
     )
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_fp8_save_load(
         self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo
     ):
diff --git a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
index 1e7001477e..623e37d6b8 100644
--- a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
+++ b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
@@ -280,8 +280,8 @@ def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):
         assert loaded_state_dict.keys() == state_dict.keys()
 
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
     @pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
+    @pytest.mark.flaky
     def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
         Utils.initialize_model_parallel(2, 1)
 
diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
index d5d5cdce8f..346751e264 100644
--- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
+++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
@@ -29,7 +29,7 @@ def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0
@@ -118,7 +118,7 @@ def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
 
 class TestLegacySaveAndLoad:
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0
diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py
index d82a8be95a..19d1ee9e85 100644
--- a/tests/unit_tests/dist_checkpointing/test_optimizer.py
+++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py
@@ -178,7 +178,7 @@ def teardown_method(self, method):
             # ((2, 1), 2, 2),
         ],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn):
         src_world_size = tp_pp[0] * tp_pp[1] * src_dp
         dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp
@@ -256,7 +256,7 @@ def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl,
         ('src_tp_pp', 'dest_tp_pp', 'use_glu'),
         [((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_finetune_doesnt_load_optimizer(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu
     ):
@@ -329,7 +329,7 @@ def test_finetune_doesnt_load_optimizer(
         assert not diffs[0] and not diffs[1] and diffs[2]
         assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict))
 
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         tp = 4
@@ -398,7 +398,7 @@ def teardown_method(self, method):
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (4, 2)), ((8, 1), (1, 2))]
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         Utils.initialize_model_parallel(*src_tp_pp)
@@ -465,7 +465,7 @@ def teardown_method(self, method):
         ('src_tp_pp', 'dest_tp_pp'),
         [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_optimizer_resharding(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16
     ):
diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
index 60427d18b5..9174665eed 100644
--- a/tests/unit_tests/distributed/test_param_and_grad_buffer.py
+++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
@@ -58,7 +58,7 @@ def get_model_and_buffers(
 @pytest.mark.parametrize("use_distributed_optimizer", [False, True])
 @pytest.mark.parametrize("bias", [False, True])
 @pytest.mark.parametrize("shared_embedding", [False, True])
-@pytest.mark.skip(reason="Flaky test")
+@pytest.mark.flaky
 def test_bucket_sizes(
     bucket_size: Optional[int], use_distributed_optimizer: bool, bias: bool, shared_embedding: bool
 ):
diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
index 858f5fee50..ad829881d0 100644
--- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
@@ -70,7 +70,7 @@ def test_capacity_forward_backward(self, tp_size, ep_size):
     @pytest.mark.internal
     @pytest.mark.timeout(120)
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_capacity_padding_forward_backward(self, tp_size, ep_size):
         container = MoEModelTestContainer(
             tp_size=tp_size,
diff --git a/tests/unit_tests/transformer/moe/test_upcycling.py b/tests/unit_tests/transformer/moe/test_upcycling.py
index 2057715684..b5a98c3713 100644
--- a/tests/unit_tests/transformer/moe/test_upcycling.py
+++ b/tests/unit_tests/transformer/moe/test_upcycling.py
@@ -128,7 +128,7 @@ def teardown_method(self, method):
         destroy_num_microbatches_calculator()
 
     @pytest.mark.internal
-    @pytest.mark.skipif(True, reason="The test is flaky")  # TODO: Fix the test
+    @pytest.mark.flaky  # TODO: Fix the test
     @pytest.mark.parametrize(
         ('tp_pp_ep', 'enable_vp', 'enable_grouped_gemm'), [((1, 1, 2), (False), (False))]
     )
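Note: the hunks above only swap the `skip`/`skipif` decorators for `@pytest.mark.flaky` and make the `latest` CI image pass `-m "not flaky"`; they do not show where the `flaky` marker itself is defined. Assuming it is a plain custom marker (rather than one supplied by a plugin such as pytest-rerunfailures), it would typically be registered so that pytest does not warn about an unknown mark and so that `-m "not flaky"` can deselect the marked tests. A minimal sketch of such a registration in a hypothetical `conftest.py`:

```python
# conftest.py -- hypothetical sketch, not part of this diff.
# Registers the custom "flaky" marker so that @pytest.mark.flaky is recognized
# and `pytest -m "not flaky"` deselects the marked tests instead of warning.
def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "flaky: test is known to be flaky and is deselected on the latest image",
    )
```

With the marker registered, the `latest` unit-test job effectively runs `pytest -m "not flaky" ...`, so the previously skipped tests stay in the suite and are only excluded from that job rather than being unconditionally skipped everywhere.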