diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccda3d89fb5..b4a6136ae70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1111,6 +1111,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
   target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)
   target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
   target_include_directories(${TEST_NAME} SYSTEM PRIVATE
+    ${NVFUSER_ROOT}/third_party/benchmark/include
     ${NVFUSER_ROOT}/third_party/googletest/googletest/include
     ${NVFUSER_ROOT}/third_party/googletest/googlemock/include
   )
@@ -1123,6 +1124,7 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
     dynamic_type
     GTest::gtest
     GTest::gmock
+    benchmark::benchmark
     flatbuffers
     ${TORCH_LIBRARIES}
   )
diff --git a/tests/cpp/multidevice.cpp b/tests/cpp/multidevice.cpp
index 4d962ffa922..52f286a5e56 100644
--- a/tests/cpp/multidevice.cpp
+++ b/tests/cpp/multidevice.cpp
@@ -7,7 +7,14 @@
 // clang-format on
 #include <sys/types.h>
 #include <unistd.h>
-#include <mutex>
+
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
 
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/debug.h>
@@ -33,7 +40,7 @@ void MultiDeviceTestEnvironment::TearDown() {
   Communicator::getInstance().cleanup();
 }
 
-MultiDeviceTest::MultiDeviceTest() {
+MultiDeviceFixture::MultiDeviceFixture() {
   // Enable logging in c10d so debug messages can be printed out via
   // `TORCH_DISTRIBUTED_DEBUG`.
   c10d::setDebugLevelFromEnvironment();
@@ -42,6 +49,9 @@ MultiDeviceTest::MultiDeviceTest() {
   tensor_options_ =
       at::TensorOptions().dtype(at::kFloat).device(communicator_->device());
   debug_print = getNvFuserEnv("MULTIDEVICE_DEBUG_PRINT") != nullptr;
+}
+
+MultiDeviceTest::MultiDeviceTest() {
   disable_skip = getNvFuserEnv("MULTIDEVICE_DISABLE_SKIP") != nullptr;
 }
 
@@ -55,8 +65,16 @@ MultiDeviceTest::~MultiDeviceTest() {
   }
 }
 
+void MultiDeviceBenchmark::TearDown(benchmark::State& state) {
+  // Unlike a testing::Test, a benchmark::Fixture outlives every benchmark (it
+  // is destroyed after `main`), so the barrier is here, not in the destructor.
+  if (!communicator_->is_available()) {
+    return;
+  }
+  communicator_->barrier();
+}
+
 void MultiDeviceTest::SetUp() {
-  // Set the same random seed for all processes.
   NVFuserTest::SetUp();
 
   if (!disable_skip && !communicator_->is_available()) {
@@ -64,7 +82,7 @@ void MultiDeviceTest::SetUp() {
   }
 }
 
-at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
+at::Tensor MultiDeviceFixture::shardTensor(at::Tensor tensor, TensorView* tv) {
   if (!isSharded(tv)) {
     return tensor;
   }
@@ -75,7 +93,7 @@ at::Tensor MultiDeviceTest::shardTensor(at::Tensor tensor, TensorView* tv) {
       tv->getDeviceMesh());
 }
 
-at::Tensor MultiDeviceTest::shardTensor(
+at::Tensor MultiDeviceFixture::shardTensor(
     at::Tensor tensor,
     const int64_t axis,
     const DeviceMesh& mesh) {
@@ -162,8 +180,27 @@ void MultiDeviceTest::validate(
 
 } // namespace nvfuser
 
+namespace {
+// True if any argument after argv[0] begins with "--benchmark".
+bool wantsBenchmarks(int argc, char** argv) {
+  bool found = false;
+  for (int idx = 1; idx < argc && !found; ++idx) {
+    found = std::string_view(argv[idx]).starts_with("--benchmark");
+  }
+  return found;
+}
+} // namespace
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::AddGlobalTestEnvironment(new nvfuser::MultiDeviceTestEnvironment());
+  if (wantsBenchmarks(argc, argv)) {
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+    // RUN_ALL_TESTS() is skipped, so do the environment's TearDown by hand.
+    nvfuser::Communicator::getInstance().cleanup();
+    return 0;
+  }
   return RUN_ALL_TESTS();
 }
diff --git a/tests/cpp/multidevice.h b/tests/cpp/multidevice.h
index fa043ef3f6d..3702c35ff16 100644
--- a/tests/cpp/multidevice.h
+++ b/tests/cpp/multidevice.h
@@ -7,6 +7,9 @@
 // clang-format on
 #pragma once
 
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
+
 #include <multidevice/communication.h>
 #include <multidevice/communicator.h>
 #include <multidevice/execution_utils.h>
@@ -22,11 +25,11 @@ class MultiDeviceTestEnvironment : public testing::Environment {
   void TearDown() override;
 };
 
-class MultiDeviceTest : public NVFuserTest {
+// Fixture class containing the logic for multi-device testing.
+// Does not inherit from NVFuserTest or testing::Test.
+class MultiDeviceFixture {
  protected:
-  MultiDeviceTest();
-  ~MultiDeviceTest();
-  void SetUp() override;
+  MultiDeviceFixture();
 
   // Returns a shard of the tensor according to the sharding annotation in tv
   // for the deviceId. If tensor is not sharded returns the original tensor.
@@ -40,18 +43,33 @@ class MultiDeviceTest : public NVFuserTest {
       int64_t axis,
       const DeviceMesh& mesh);
 
+  Communicator* communicator_;
+  c10::TensorOptions tensor_options_;
+  bool debug_print;
+};
+
+// Test class that inherits from both NVFuserTest and MultiDeviceFixture.
+class MultiDeviceTest : public NVFuserTest, public MultiDeviceFixture {
+ protected:
+  MultiDeviceTest();
+  ~MultiDeviceTest();
+  void SetUp() override;
+
   // Validate the outputs of a fusion against expected outputs.
   static void validate(
       const std::vector<at::Tensor>& expected_outputs,
       const KernelArgumentHolder& outputs,
       const std::vector<double>& atols);
 
-  Communicator* communicator_;
-  c10::TensorOptions tensor_options_;
-  bool debug_print;
   bool disable_skip;
 };
 
+class MultiDeviceBenchmark : public benchmark::Fixture,
+                             public MultiDeviceFixture {
+ protected:
+  void TearDown(benchmark::State& state) override;  // communicator barrier
+};
+
 // This macro is supposed to be used in a test case of a MultiDeviceTest or its
 // `SetUp` method, which have access to GTEST_SKIP and communicator_. It's not
 // made a function because that function wouldn't be able to skip the test by
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index ff4c98936df..0e37c254090 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include <benchmark/benchmark.h>
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
@@ -1283,4 +1284,39 @@ TEST_F(MultiDeviceTest, MultipleIncompatibleReshapes) {
     EXPECT_FALSE(runtime->isSegmented());
   }
 }
+
+BENCHMARK_DEFINE_F(MultiDeviceBenchmark, Reduction)(benchmark::State& state) {
+  // Fusion under test: `out = sum(in, {0})` over a 2D contiguous input.
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+  TensorView* in = makeContigTensor(2);
+  TensorView* out = sum(in, {0});
+  fusion->addInput(in);
+  fusion->addOutput(out);
+
+  // Axis 0 of `in`, the reduced axis, is parallelized with DIDx over a mesh
+  // created for `communicator_->size()` devices.
+  auto mesh = DeviceMesh::createForNumDevices(communicator_->size());
+  in->setDeviceMesh(mesh);
+  in->axis(0)->parallelize(ParallelType::DIDx);
+
+  // Random [mesh.size(), state.range(0)] input, then its shard for `in`.
+  auto full_input = at::randn({mesh.size(), state.range(0)}, tensor_options_);
+  auto local_input = shardTensor(full_input, in);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  for (auto _ : state) {
+    executor_cache.runFusionWithInputs({local_input});
+  }
+}
+
+// Each `Arg` value is read by the benchmark as `state.range(0)`. `Iterations`
+// fixes the iteration count so that all processes run the same number of
+// iterations. Without it, Google Benchmark chooses the count adaptively per
+// process, the counts can differ, and collectives (like allreduce) hang.
+BENCHMARK_REGISTER_F(MultiDeviceBenchmark, Reduction)
+    ->Arg(4)
+    ->Arg(8)
+    ->Iterations(10);
+
 } // namespace nvfuser