diff --git a/.gitlab/stages/01.tests.yml b/.gitlab/stages/01.tests.yml
index dc59e026ac..ed80e96fee 100644
--- a/.gitlab/stages/01.tests.yml
+++ b/.gitlab/stages/01.tests.yml
@@ -119,6 +119,8 @@ unit_tests:
       ARGS=()
       if [[ $TAG != latest ]]; then
         ARGS+=(-m "not internal")
+      else
+        ARGS+=(-m "not flaky")
       fi
       if [[ $IMAGE == ${CI_MCORE_DEV_IMAGE} ]]; then
         ARGS+=(-m "experimental")
diff --git a/tests/unit_tests/data/test_bin_reader.py b/tests/unit_tests/data/test_bin_reader.py
index 854936cdb3..b8b6ec5dd7 100644
--- a/tests/unit_tests/data/test_bin_reader.py
+++ b/tests/unit_tests/data/test_bin_reader.py
@@ -89,7 +89,7 @@ class _LocalClientError(Exception):
 setattr(exceptions, "ClientError", _LocalClientError)
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_bin_reader():
     with tempfile.TemporaryDirectory() as temp_dir:
         # set the default nltk data path
diff --git a/tests/unit_tests/data/test_gpt_dataset.py b/tests/unit_tests/data/test_gpt_dataset.py
index 953845f1c9..817ea227f1 100644
--- a/tests/unit_tests/data/test_gpt_dataset.py
+++ b/tests/unit_tests/data/test_gpt_dataset.py
@@ -26,7 +26,7 @@ def sample_N(dataset, N, randomize):
     return samples
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_mock_gpt_dataset():
     if torch.distributed.is_available():
         Utils.initialize_distributed()
diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py
index 0b460f51a9..4eca14e588 100644
--- a/tests/unit_tests/data/test_preprocess_data.py
+++ b/tests/unit_tests/data/test_preprocess_data.py
@@ -183,7 +183,7 @@ def gpt2_merge(odir):
     return path
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_preprocess_data_gpt():
     with tempfile.TemporaryDirectory() as temp_dir:
 
@@ -214,7 +214,7 @@ def bert_vocab(odir):
     return path
 
 
-@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+@pytest.mark.flaky
 def test_preprocess_data_bert():
     with tempfile.TemporaryDirectory() as temp_dir:
 
diff --git a/tests/unit_tests/dist_checkpointing/models/test_mamba.py b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
index 175db4580a..6bdcd9b827 100644
--- a/tests/unit_tests/dist_checkpointing/models/test_mamba.py
+++ b/tests/unit_tests/dist_checkpointing/models/test_mamba.py
@@ -74,7 +74,7 @@ class TestMambaReconfiguration:
             # (False, (1, 1, 4), (8, 1, 1), True),
         ],
     )
-    @pytest.mark.skip(reason="Flaky test; needs to be debugged")
+    @pytest.mark.flaky
     def test_parallel_reconfiguration_e2e(
         self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl
     ):
diff --git a/tests/unit_tests/dist_checkpointing/test_fp8.py b/tests/unit_tests/dist_checkpointing/test_fp8.py
index 1238d09f76..d2dcb367c7 100644
--- a/tests/unit_tests/dist_checkpointing/test_fp8.py
+++ b/tests/unit_tests/dist_checkpointing/test_fp8.py
@@ -51,7 +51,7 @@ def get_ten(dtype: str = 'fp8'):
             (False, (2, 4), (2, 4), None),
         ],
     )
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_fp8_save_load(
         self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo
     ):
diff --git a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
index 1e7001477e..623e37d6b8 100644
--- a/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
+++ b/tests/unit_tests/dist_checkpointing/test_fully_parallel.py
@@ -280,8 +280,8 @@ def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):
         assert loaded_state_dict.keys() == state_dict.keys()
 
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
     @pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
+    @pytest.mark.flaky
     def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
         Utils.initialize_model_parallel(2, 1)
 
diff --git a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
index d5d5cdce8f..346751e264 100644
--- a/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
+++ b/tests/unit_tests/dist_checkpointing/test_nonpersistent.py
@@ -29,7 +29,7 @@ def teardown_method(self, method):
         Utils.destroy_model_parallel()
 
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0
@@ -118,7 +118,7 @@ def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
 
 class TestLegacySaveAndLoad:
     @pytest.mark.parametrize(('tp,pp'), [(2, 4)])
-    @pytest.mark.skip(reason="Flaky test")
+    @pytest.mark.flaky
     def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp):
         Utils.initialize_model_parallel(tp, pp)
         num_floating_point_operations_so_far = 0
diff --git a/tests/unit_tests/dist_checkpointing/test_optimizer.py b/tests/unit_tests/dist_checkpointing/test_optimizer.py
index d82a8be95a..19d1ee9e85 100644
--- a/tests/unit_tests/dist_checkpointing/test_optimizer.py
+++ b/tests/unit_tests/dist_checkpointing/test_optimizer.py
@@ -178,7 +178,7 @@ def teardown_method(self, method):
             # ((2, 1), 2, 2),
         ],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn):
         src_world_size = tp_pp[0] * tp_pp[1] * src_dp
         dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp
@@ -256,7 +256,7 @@ def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl,
         ('src_tp_pp', 'dest_tp_pp', 'use_glu'),
         [((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_finetune_doesnt_load_optimizer(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu
     ):
@@ -329,7 +329,7 @@ def test_finetune_doesnt_load_optimizer(
         assert not diffs[0] and not diffs[1] and diffs[2]
         assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict))
 
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         tp = 4
@@ -398,7 +398,7 @@ def teardown_method(self, method):
     @pytest.mark.parametrize(
         ('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (4, 2)), ((8, 1), (1, 2))]
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
         # sync=True to make sure other ranks wait for rank 0 to finish creating directory.
         Utils.initialize_model_parallel(*src_tp_pp)
@@ -465,7 +465,7 @@ def teardown_method(self, method):
         ('src_tp_pp', 'dest_tp_pp'),
         [((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
     )
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_optimizer_resharding(
         self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16
     ):
diff --git a/tests/unit_tests/distributed/test_param_and_grad_buffer.py b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
index 60427d18b5..9174665eed 100644
--- a/tests/unit_tests/distributed/test_param_and_grad_buffer.py
+++ b/tests/unit_tests/distributed/test_param_and_grad_buffer.py
@@ -58,7 +58,7 @@ def get_model_and_buffers(
 @pytest.mark.parametrize("use_distributed_optimizer", [False, True])
 @pytest.mark.parametrize("bias", [False, True])
 @pytest.mark.parametrize("shared_embedding", [False, True])
-@pytest.mark.skip(reason="Flaky test")
+@pytest.mark.flaky
 def test_bucket_sizes(
     bucket_size: Optional[int], use_distributed_optimizer: bool, bias: bool, shared_embedding: bool
 ):
diff --git a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
index 858f5fee50..ad829881d0 100644
--- a/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
@@ -70,7 +70,7 @@ def test_capacity_forward_backward(self, tp_size, ep_size):
     @pytest.mark.internal
     @pytest.mark.timeout(120)
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
-    @pytest.mark.skip(reason="Tests are flaky and need to be debugged")
+    @pytest.mark.flaky
     def test_capacity_padding_forward_backward(self, tp_size, ep_size):
         container = MoEModelTestContainer(
             tp_size=tp_size,
diff --git a/tests/unit_tests/transformer/moe/test_upcycling.py b/tests/unit_tests/transformer/moe/test_upcycling.py
index 2057715684..b5a98c3713 100644
--- a/tests/unit_tests/transformer/moe/test_upcycling.py
+++ b/tests/unit_tests/transformer/moe/test_upcycling.py
@@ -128,7 +128,7 @@ def teardown_method(self, method):
         destroy_num_microbatches_calculator()
 
     @pytest.mark.internal
-    @pytest.mark.skipif(True, reason="The test is flaky")  # TODO: Fix the test
+    @pytest.mark.flaky  # TODO: Fix the test
     @pytest.mark.parametrize(
         ('tp_pp_ep', 'enable_vp', 'enable_grouped_gemm'), [((1, 1, 2), (False), (False))]
     )
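Note: the hunks above only swap the `skip`/`skipif` decorators for `@pytest.mark.flaky` and make the `latest` CI image pass `-m "not flaky"`; they do not show where the `flaky` marker itself is defined. Assuming it is a plain custom marker (rather than one supplied by a plugin such as pytest-rerunfailures), it would typically be registered so that pytest does not warn about an unknown mark and so that `-m "not flaky"` can deselect the marked tests. A minimal sketch of such a registration in a hypothetical `conftest.py`:

```python
# conftest.py -- hypothetical sketch, not part of this diff.
# Registers the custom "flaky" marker so that @pytest.mark.flaky is recognized
# and `pytest -m "not flaky"` deselects the marked tests instead of warning.
def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "flaky: test is known to be flaky and is deselected on the latest image",
    )
```

With the marker registered, the `latest` unit-test job effectively runs `pytest -m "not flaky" ...`, so the previously skipped tests stay in the suite and are only excluded from that job rather than being unconditionally skipped everywhere.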