diff --git a/CMakeLists.txt b/CMakeLists.txt index ccda3d89fb5..b4a6136ae70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1111,6 +1111,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST) target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}") target_include_directories(${TEST_NAME} SYSTEM PRIVATE + ${NVFUSER_ROOT}/third_party/benchmark/include ${NVFUSER_ROOT}/third_party/googletest/googletest/include ${NVFUSER_ROOT}/third_party/googletest/googlemock/include ) @@ -1123,6 +1124,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK) dynamic_type GTest::gtest GTest::gmock + benchmark::benchmark flatbuffers ${TORCH_LIBRARIES} ) diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp index 4d962ffa922..52f286a5e56 100644 --- a/tests/cpp/multidevice.cpp +++ b/tests/cpp/multidevice.cpp @@ -7,7 +7,14 @@ // clang-format on #include #include -#include + +#include +#include +#include +#include + +#include +#include #ifdef NVFUSER_DISTRIBUTED #include @@ -33,7 +40,7 @@ void MultiDeviceTestEnvironment::TearDown() { Communicator::getInstance().cleanup(); } -MultiDeviceTest::MultiDeviceTest() { +MultiDeviceFixture::MultiDeviceFixture() { // Enable logging in c10d so debug messages can be printed out via // `TORCH_DISTRIBUTED_DEBUG`. c10d::setDebugLevelFromEnvironment(); @@ -42,6 +49,9 @@ MultiDeviceTest::MultiDeviceTest() { tensor_options_ = at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr; +} + +MultiDeviceTest::MultiDeviceTest() { disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr; } @@ -55,8 +65,16 @@ MultiDeviceTest::~MultiDeviceTest() { } } +void MultiDeviceBenchmark::TearDown(benchmark::State& state) { + // Unlike testing::Test, a benchmark::Fixture is destructed after `main` + // exits, not after each benchmark. 
Therefore, we have to put the barrier in + // TearDown instead of the destructor. + if (communicator_->is_available()) { + communicator_->barrier(); + } +} + void MultiDeviceTest::SetUp() { - // Set the same random seed for all processes. NVFuserTest::SetUp(); if (!disable_skip && !communicator_->is_available()) { @@ -64,7 +82,7 @@ void MultiDeviceTest::SetUp() { } } -at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) { +at::Tensor MultiDeviceFixture::shardTensor(at::Tensor tensor, TensorView* tv) { if (!isSharded(tv)) { return tensor; } @@ -75,7 +93,7 @@ at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) { tv->getDeviceMesh()); } -at::Tensor MultiDeviceTest::shardTensor( +at::Tensor MultiDeviceFixture::shardTensor( at::Tensor tensor, const int64_t axis, const DeviceMesh& mesh) { @@ -162,8 +180,27 @@ void MultiDeviceTest::validate( } // namespace nvfuser +namespace { +bool wantsBenchmarks(int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + std::string_view a(argv[i]); + if (a.starts_with("--benchmark")) + return true; + } + return false; +} +} // namespace + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); testing::AddGlobalTestEnvironment(new nvfuser::MultiDeviceTestEnvironment()); + + if (wantsBenchmarks(argc, argv)) { + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; + } + return RUN_ALL_TESTS(); } diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h index fa043ef3f6d..3702c35ff16 100644 --- a/tests/cpp/multidevice.h +++ b/tests/cpp/multidevice.h @@ -7,6 +7,9 @@ // clang-format on #pragma once +#include +#include + #include #include #include @@ -22,11 +25,11 @@ class MultiDeviceTestEnvironment : public testing::Environment { void TearDown() override; }; -class MultiDeviceTest : public NVFuserTest { +// Fixture class containing the logic for multi-device testing. 
+// Not derived from NVFuserTest or testing::Test so benchmarks can reuse it. +class MultiDeviceFixture { protected: - MultiDeviceTest(); - ~MultiDeviceTest(); - void SetUp() override; + MultiDeviceFixture(); // Returns a shard of the tensor according to the sharding annotation in tv // for the deviceId. If tensor is not sharded returns the original tensor. @@ -40,18 +43,33 @@ class MultiDeviceTest : public NVFuserTest { int64_t axis, const DeviceMesh& mesh); + Communicator* communicator_; + c10::TensorOptions tensor_options_; + bool debug_print; +}; + +// Test class that inherits from both NVFuserTest and MultiDeviceFixture. +class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture { + protected: + MultiDeviceTest(); + ~MultiDeviceTest(); + void SetUp() override; + // Validate the outputs of a fusion against expected outputs. static void validate( const std::vector& expected_outputs, const KernelArgumentHolder& outputs, const std::vector& atols); - Communicator* communicator_; - c10::TensorOptions tensor_options_; - bool debug_print; bool disable_skip; }; +class MultiDeviceBenchmark : public benchmark::Fixture, + public MultiDeviceFixture { + protected: + void TearDown(benchmark::State& state) override; +}; + // This macro is supposed to be used in a test case of a MultiDeviceTest or its // `SetUp` method, which have access to GTEST_SKIP and communicator_. 
It's not // made a function because that function wouldn't be able to skip the test by diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index ff4c98936df..0e37c254090 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include #include #include @@ -1283,4 +1284,39 @@ TEST_F(MultiDeviceTest, MultipleIncompatibleReshapes) { EXPECT_FALSE(runtime->isSegmented()); } } + +BENCHMARK_DEFINE_F(MultiDeviceBenchmark, Reduction)(benchmark::State& state) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + auto mesh = DeviceMesh::createForNumDevices(communicator_->size()); + + TensorView* in = makeContigTensor(2); + TensorView* out = sum(in, {0}); + + fusion->addInput(in); + fusion->addOutput(out); + + in->setDeviceMesh(mesh); + in->axis(0)->parallelize(ParallelType::DIDx); + + auto unsharded_in_tensor = + at::randn({mesh.size(), state.range(0)}, tensor_options_); + auto in_tensor = shardTensor(unsharded_in_tensor, in); + + FusionExecutorCache executor_cache(std::move(fusion)); + + for (auto _ : state) { + executor_cache.runFusionWithInputs({in_tensor}); + } +} + +// Fixing `Iterations` makes all processes run the benchmark for the same number +// of iterations. Without it, Google Benchmark adaptively determines the +// iteration count per process, which can differ across processes and cause +// collective operations (e.g. allreduce) to hang indefinitely. +BENCHMARK_REGISTER_F(MultiDeviceBenchmark, Reduction) + ->Arg(4) + ->Arg(8) + ->Iterations(10); + } // namespace nvfuser