From be823c6830db4ed373188d4eb605484decd8c647 Mon Sep 17 00:00:00 2001 From: ruisizhang123 Date: Thu, 20 Nov 2025 15:23:28 -0800 Subject: [PATCH] add CI to guard compiler optimization passes --- .../integration_test_8gpu_simple_fsdp.yaml | 6 +- tests/integration_tests/run_tests.py | 17 +- .../tests/compiler_pass_integration_tests.py | 261 ++++++++++++++++++ ...tests.py => frontend_integration_tests.py} | 26 -- 4 files changed, 280 insertions(+), 30 deletions(-) create mode 100755 torchtitan/experiments/simple_fsdp/tests/compiler_pass_integration_tests.py rename torchtitan/experiments/simple_fsdp/tests/{integration_tests.py => frontend_integration_tests.py} (92%) diff --git a/.github/workflows/integration_test_8gpu_simple_fsdp.yaml b/.github/workflows/integration_test_8gpu_simple_fsdp.yaml index 9a1a0a2866..d0e642a7e4 100644 --- a/.github/workflows/integration_test_8gpu_simple_fsdp.yaml +++ b/.github/workflows/integration_test_8gpu_simple_fsdp.yaml @@ -50,7 +50,11 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8 + # Run front-end integration tests of SimpleFSDP + python -m torchtitan.experiments.simple_fsdp.tests.frontend_integration_tests artifacts-to-be-uploaded --ngpu 8 + + # Run backend pass integration tests of SimpleFSDP + python -m torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests artifacts-to-be-uploaded --ngpu 8 --comm_mode local_tensor # Run the numerics unit tests of SimpleFSDP torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py index c233904165..9803a70bac 100644 --- a/tests/integration_tests/run_tests.py +++ b/tests/integration_tests/run_tests.py @@ -29,7 +29,9 @@ def _run_cmd(cmd): return 
subprocess.run([cmd], text=True, shell=True) -def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): +def run_single_test( + test_flavor: OverrideDefinitions, full_path: str, output_dir: str, comm_mode: str +): # run_test supports sequence of tests. test_name = test_flavor.test_name dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}" @@ -37,7 +39,8 @@ def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir all_ranks = ",".join(map(str, range(test_flavor.ngpu))) for idx, override_arg in enumerate(test_flavor.override_args): - cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh" + cmd = f"CONFIG_FILE={full_path} COMM_MODE={comm_mode} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh" + # dump compile trace for debugging purpose cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd @@ -102,7 +105,9 @@ def run_tests(args, test_list: list[OverrideDefinitions]): f" because --ngpu arg is {args.ngpu}" ) else: - run_single_test(test_flavor, args.config_path, args.output_dir) + run_single_test( + test_flavor, args.config_path, args.output_dir, args.comm_mode + ) def main(): @@ -110,6 +115,12 @@ def main(): parser.add_argument( "output_dir", help="Directory to dump results generated by tests" ) + parser.add_argument( + "--comm_mode", + default="default", + choices=["default", "fake_backend", "local_tensor"], + help="Communication mode to validate tests", + ) parser.add_argument( "--gpu_arch_type", default="cuda", diff --git a/torchtitan/experiments/simple_fsdp/tests/compiler_pass_integration_tests.py b/torchtitan/experiments/simple_fsdp/tests/compiler_pass_integration_tests.py new file mode 100755 index 0000000000..f603e1e605 --- /dev/null +++ b/torchtitan/experiments/simple_fsdp/tests/compiler_pass_integration_tests.py @@ -0,0 +1,261 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved.
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from tests.integration_tests import OverrideDefinitions +from tests.integration_tests.run_tests import run_tests + + +def build_simple_fsdp_test_list() -> list[OverrideDefinitions]: + """ + Return the SimpleFSDP compiler-pass test flavors as a list of + OverrideDefinitions, each of which generates one variation of the + integration tests on top of the same root config file. + """ + integration_tests_flavors = [ + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.llama3", + "--model.flavor 8B", + "--compile.enable", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes auto_bucketing", + ], + ], + "1D+autobucketing", + "1d_autobucketing", + ngpu=8, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.llama3", + "--model.flavor 8B", + "--compile.enable", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes transformer_block_bucketing", + ], + ], + "1D+transformer_block_bucketing", + "1d_transformer_block_bucketing", + ngpu=8, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.llama3", + "--model.flavor 8B", + "--parallelism.tensor_parallel_degree 2", + "--compile.enable", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes auto_bucketing", + ], + ], + "2D+autobucketing", + "2d_autobucketing", + ngpu=8, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.llama3", + "--model.flavor 8B", + "--parallelism.tensor_parallel_degree 2", + "--compile.enable", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes transformer_block_bucketing", + ], + ], + 
"2D+transformer_block_bucketing", + "2d_transformer_block_bucketing", + ngpu=8, + ), + # TODO(ruisizhang123): add back after passes + PP is supported + # OverrideDefinitions( + # [ + # [ + # "--model.name simple_fsdp.llama3", + # "--model.flavor 8B", + # "--parallelism.tensor_parallel_degree 2", + # "--parallelism.pipeline_parallel_degree 2", + # "--compile.enable", + # "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + # "--compile.backend aot_eager", + # "--compile.graph_passes auto_bucketing", + # ], + # ], + # "3D+autobucketing", + # "3d_autobucketing", + # ngpu=8, + # ), + # OverrideDefinitions( + # [ + # [ + # "--model.name simple_fsdp.llama3", + # "--model.flavor 8B", + # "--parallelism.tensor_parallel_degree 2", + # "--parallelism.pipeline_parallel_degree 2", + # "--compile.enable", + # "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + # "--compile.backend aot_eager", + # "--compile.graph_passes transformer_block_bucketing", + # ], + # ], + # "3D+transformer_block_bucketing", + # "3d_transformer_block_bucketing", + # ngpu=8, + # ), + # OverrideDefinitions( + # [ + # [ + # "--model.name simple_fsdp.llama3", + # "--model.flavor 8B", + # "--parallelism.tensor_parallel_degree 2", + # "--parallelism.context_parallel_degree 2", + # "--compile.enable", + # "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + # "--compile.backend aot_eager", + # "--compile.graph_passes auto_bucketing", + # ], + # ], + # "FSDP+TP+CP+autobucketing", + # "fsdp+tp+cp_autobucketing", + # ngpu=8, + # ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.llama3", + "--model.flavor 8B", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.context_parallel_degree 2", + "--compile.enable", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes transformer_block_bucketing", + ], + ], + 
"FSDP+TP+CP+transformer_block_bucketing", + "fsdp+tp+cp_transformer_block_bucketing", + ngpu=8, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.deepseek_v3", + "--model.flavor 16B", + "--parallelism.data_parallel_shard_degree 4", + "--parallelism.expert_parallel_degree 2", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes auto_bucketing", + ], + ], + "FSDP+EP+autobucketing", + "fsdp+ep_autobucketing", + ngpu=4, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.deepseek_v3", + "--model.flavor 16B", + "--parallelism.data_parallel_shard_degree 4", + "--parallelism.expert_parallel_degree 2", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes transformer_block_bucketing", + ], + ], + "FSDP+EP+transformer_block_bucketing", + "fsdp+ep_transformer_block_bucketing", + ngpu=4, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.deepseek_v3", + "--model.flavor 16B", + "--parallelism.data_parallel_shard_degree 2", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.expert_parallel_degree 4", + "--parallelism.expert_tensor_parallel_degree 1", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes auto_bucketing", + ], + ], + "FSDP+TP+EP+autobucketing", + "fsdp+tp+ep_autobucketing", + ngpu=4, + ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp.deepseek_v3", + "--model.flavor 16B", + "--parallelism.data_parallel_shard_degree 2", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.expert_parallel_degree 4", + "--parallelism.expert_tensor_parallel_degree 1", + "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", + "--compile.backend aot_eager", + "--compile.graph_passes transformer_block_bucketing", + ], + ], + 
"FSDP+TP+EP+transformer_block_bucketing", + "fsdp+tp+ep_transformer_block_bucketing", + ngpu=4, + ), + ] + return integration_tests_flavors + + +_TEST_SUITES_FUNCTION = { + "simple_fsdp": build_simple_fsdp_test_list, +} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("output_dir") + parser.add_argument( + "--comm_mode", + default="default", + choices=["default", "fake_backend", "local_tensor"], + help="Communication mode to validate tests", + ) + parser.add_argument( + "--config_path", + default="./tests/integration_tests/base_config.toml", + help="Base config path for integration tests. This is the config that will be used as a base for all tests.", + ) + parser.add_argument( + "--test_name", + default="all", + help="test to run, acceptable values: `test_name` in `build_simple_fsdp_test_list` (default: all)", + ) + parser.add_argument("--ngpu", default=8, type=int) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + if os.listdir(args.output_dir): + raise RuntimeError("Please provide an empty output directory.") + + test_list = _TEST_SUITES_FUNCTION["simple_fsdp"]() + run_tests(args, test_list) + + +if __name__ == "__main__": + main() diff --git a/torchtitan/experiments/simple_fsdp/tests/integration_tests.py b/torchtitan/experiments/simple_fsdp/tests/frontend_integration_tests.py similarity index 92% rename from torchtitan/experiments/simple_fsdp/tests/integration_tests.py rename to torchtitan/experiments/simple_fsdp/tests/frontend_integration_tests.py index c3cee7b52f..b6dd3dead3 100755 --- a/torchtitan/experiments/simple_fsdp/tests/integration_tests.py +++ b/torchtitan/experiments/simple_fsdp/tests/frontend_integration_tests.py @@ -29,32 +29,6 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]: "1D", "1d", ), - OverrideDefinitions( - [ - [ - "--model.name simple_fsdp.llama3", - "--compile.enable", - "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", -
"--compile.backend aot_eager", - "--compile.graph_passes auto_bucketing", - ], - ], - "1D+autobucketing", - "1d_autobucketing", - ), - OverrideDefinitions( - [ - [ - "--model.name simple_fsdp.llama3", - "--compile.enable", - "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config", - "--compile.backend aot_eager", - "--compile.graph_passes transformer_block_bucketing", - ], - ], - "1D+transformer_block_bucketing", - "1d_transformer_block_bucketing", - ), OverrideDefinitions( [ [