Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/flaky-unit-tests' into 'main'
Browse files Browse the repository at this point in the history
tests: Verify flaky tests

See merge request ADLR/megatron-lm!2271
  • Loading branch information
ko3n1g committed Oct 28, 2024
2 parents 8ba37c0 + a616d45 commit f1f0392
Show file tree
Hide file tree
Showing 16 changed files with 18 additions and 29 deletions.
24 changes: 16 additions & 8 deletions .gitlab/stages/01.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ test:build_image:
  - BUCKET: tests/unit_tests/transformer/
  - BUCKET: other
  script:
- - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
+ - docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e BUCKET -e TAG -e IMAGE -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
  - |
  CMD=$(cat <<"RUN_TEST_EOF"
  set -euxo pipefail
Expand All @@ -111,23 +111,31 @@ test:build_image:
  for i in $(seq $UNIT_TEST_REPEAT); do
  SEED=$((RANDOM % 9000 + 1000));
- ARGS=()
+ MARKER=()
  if [[ $TAG != latest ]]; then
- ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
+ MARKER+=("not internal")
+ fi
+ if [[ "$IMAGE" == *dev* ]]; then
+ MARKER+=("not flaky_in_dev")
  else
- ARGS+=(-m "not flaky and not flaky_in_dev")
+ MARKER+=("not flaky")
  fi
+ MARKER_ARG=$(printf "%s" "${MARKER[0]}")
+ for element in "${MARKER[@]:1}"; do
+ MARKER_ARG+=" and $element"
+ done
  if [[ $BUCKET == other ]]; then
  BUCKETS=($(cat /opt/megatron-lm/.gitlab/stages/01.test.yml | yq '.".unit_tests".parallel.matrix | del(.[] | select(.BUCKET == "other")) | .[].BUCKET' | tr " " "\n" | sed 's/[^ ]*/--ignore &/g' | tr "\n" " "))
- ARGS+=(${BUCKETS[@]})
- BUCKET=(tests/unit_tests)
+ IGNORE_ARGS=(${BUCKETS[@]})
+ BUCKET=tests/unit_tests
  else
- BUCKET=(${BUCKET})
+ IGNORE_ARGS=()
+ BUCKET=${BUCKET}
  fi
  if [[ -d $BUCKET ]]; then
- timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" ${BUCKET[@]}
+ timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${IGNORE_ARGS[@]}" -m "${MARKER_ARG}" $BUCKET
  fi
  done
  RUN_TEST_EOF
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/data/test_bin_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ class _LocalClientError(Exception):
setattr(exceptions, "ClientError", _LocalClientError)


- @pytest.mark.flaky
def test_bin_reader():
with tempfile.TemporaryDirectory() as temp_dir:
# set the default nltk data path
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/data/test_gpt_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def sample_N(dataset, N, randomize):
return samples


- @pytest.mark.flaky
def test_mock_gpt_dataset():
if torch.distributed.is_available():
Utils.initialize_distributed()
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/data/test_preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def gpt2_merge(odir):
return path


- @pytest.mark.flaky
def test_preprocess_data_gpt():
with tempfile.TemporaryDirectory() as temp_dir:

Expand Down Expand Up @@ -215,6 +214,7 @@ def bert_vocab(odir):


@pytest.mark.flaky
+ @pytest.mark.flaky_in_dev
def test_preprocess_data_bert():
with tempfile.TemporaryDirectory() as temp_dir:

Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/dist_checkpointing/models/test_mamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ class TestMambaReconfiguration:
# (False, (1, 1, 4), (8, 1, 1), True),
],
)
- @pytest.mark.flaky
def test_parallel_reconfiguration_e2e(
self, tmp_path_dist_ckpt, src_tp_pp_exp, dest_tp_pp_exp, use_glu, use_fpsl
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def teardown_method(self, method):
@pytest.mark.parametrize('src_spec_type', ['te', 'local'])
@pytest.mark.parametrize('dst_spec_type', ['te', 'local'])
@pytest.mark.parametrize('model_type', ['retro'])
- @pytest.mark.flaky_in_dev
def test_sharded_state_dict_save_load(
self, tmp_path_dist_ckpt, src_spec_type, dst_spec_type, model_type
):
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/dist_checkpointing/test_async_save.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def test_async_is_equivalent_to_sync(self, tmp_path_dist_ckpt):

@pytest.mark.parametrize('async_save', [False, True])
@pytest.mark.parametrize('worker_fn', [write_data_os_err_mock_fn])
- @pytest.mark.flaky
def test_errors_are_reported(self, tmp_path_dist_ckpt, async_save, worker_fn):
Utils.initialize_model_parallel(2, 4)
sharded_state_dict = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def teardown_method(self, method):
('src_tp_pp', 'dest_tp_pp'),
[((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
)
- @pytest.mark.flaky
def test_partition_change_save_load(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
Utils.initialize_model_parallel(*src_tp_pp)
with TempNamedDir(
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/dist_checkpointing/test_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def get_ten(dtype: str = 'fp8'):
(False, (2, 4), (2, 4), None),
],
)
- @pytest.mark.flaky
def test_fp8_save_load(
self, tmp_path_dist_ckpt, use_fpsl, src_tp_pp, dest_tp_pp, load_exchange_algo
):
Expand Down
1 change: 1 addition & 0 deletions tests/unit_tests/dist_checkpointing/test_fully_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):

@pytest.mark.parametrize('state_dict_device', ['cpu', 'cuda'])
@pytest.mark.flaky
+ @pytest.mark.flaky_in_dev
def test_memory_usage(self, state_dict_device, tmp_path_dist_ckpt):
Utils.initialize_model_parallel(2, 1)

Expand Down
2 changes: 0 additions & 2 deletions tests/unit_tests/dist_checkpointing/test_nonpersistent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
- @pytest.mark.flaky
def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
Utils.initialize_model_parallel(tp, pp)
num_floating_point_operations_so_far = 0
Expand Down Expand Up @@ -118,7 +117,6 @@ def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):

class TestLegacySaveAndLoad:
@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
- @pytest.mark.flaky
def test_basic_save_load_scenario(self, tmp_path_dist_ckpt, tp, pp):
Utils.initialize_model_parallel(tp, pp)
num_floating_point_operations_so_far = 0
Expand Down
6 changes: 0 additions & 6 deletions tests/unit_tests/dist_checkpointing/test_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ def teardown_method(self, method):
# ((2, 1), 2, 2),
],
)
- @pytest.mark.flaky
def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl, initialize_fn):
src_world_size = tp_pp[0] * tp_pp[1] * src_dp
dest_world_size = tp_pp[0] * tp_pp[1] * dest_dp
Expand Down Expand Up @@ -256,7 +255,6 @@ def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl,
('src_tp_pp', 'dest_tp_pp', 'use_glu'),
[((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)],
)
- @pytest.mark.flaky
def test_finetune_doesnt_load_optimizer(
self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu
):
Expand Down Expand Up @@ -329,7 +327,6 @@ def test_finetune_doesnt_load_optimizer(
assert not diffs[0] and not diffs[1] and diffs[2]
assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict))

- @pytest.mark.flaky
def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt):
# sync=True to make sure other ranks wait for rank 0 to finish creating directory.
tp = 4
Expand Down Expand Up @@ -398,7 +395,6 @@ def teardown_method(self, method):
@pytest.mark.parametrize(
('src_tp_pp', 'dest_tp_pp'), [((2, 4), (2, 4)), ((2, 4), (4, 2)), ((8, 1), (1, 2))]
)
- @pytest.mark.flaky
def test_fp32_optimizer_resharding(self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp):
# sync=True to make sure other ranks wait for rank 0 to finish creating directory.
Utils.initialize_model_parallel(*src_tp_pp)
Expand Down Expand Up @@ -465,7 +461,6 @@ def teardown_method(self, method):
('src_tp_pp', 'dest_tp_pp'),
[((2, 4), (2, 4)), ((2, 4), (2, 2)), ((2, 4), (4, 2)), ((8, 1), (1, 2))],
)
- @pytest.mark.flaky
def test_optimizer_resharding(
self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_dist_opt, bf16
):
Expand Down Expand Up @@ -517,7 +512,6 @@ def test_optimizer_resharding(
((2, 1, 2), (1, 1, 8)),
],
)
- @pytest.mark.flaky
def test_chained_optimizer_resharding(
self,
tmp_path_dist_ckpt,
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/distributed/test_param_and_grad_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def get_model_and_buffers(
@pytest.mark.parametrize("use_distributed_optimizer", [False, True])
@pytest.mark.parametrize("bias", [False, True])
@pytest.mark.parametrize("shared_embedding", [False, True])
- @pytest.mark.flaky
def test_bucket_sizes(
bucket_size: Optional[int], use_distributed_optimizer: bool, bias: bool, shared_embedding: bool
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ def test_capacity_forward_backward(self, tp_size, ep_size):
@pytest.mark.internal
@pytest.mark.timeout(120)
@pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2), (1, 1)])
- @pytest.mark.flaky
def test_capacity_padding_forward_backward(self, tp_size, ep_size):
container = MoEModelTestContainer(
tp_size=tp_size,
Expand Down
1 change: 0 additions & 1 deletion tests/unit_tests/transformer/moe/test_upcycling.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ def teardown_method(self, method):
destroy_num_microbatches_calculator()

@pytest.mark.internal
- @pytest.mark.flaky # TODO: Fix the test
@pytest.mark.parametrize(
('tp_pp_ep', 'enable_vp', 'enable_grouped_gemm'), [((1, 1, 2), (False), (False))]
)
Expand Down
2 changes: 0 additions & 2 deletions tests/unit_tests/transformer/test_retro_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def setup_method(self, method):
def teardown_method(self, method):
Utils.destroy_model_parallel()

- @pytest.mark.flaky_in_dev
def test_constructor(self):

config = self.get_config()
Expand Down Expand Up @@ -193,7 +192,6 @@ def run_gpu_forward(self, recompute_granularity, use_transformer_engine):
config.hidden_size,
)

- @pytest.mark.flaky_in_dev
def test_gpu_forward(self):
for recompute_granularity in (None, 'selective'):
for use_transformer_engine in (True, False):
Expand Down

0 comments on commit f1f0392

Please sign in to comment.