Commit fd5af5d: "WIP"
1 parent 7b88601

8 files changed (+116, -40 lines)

CMakeLists.txt
Lines changed: 2 additions & 2 deletions

@@ -1173,17 +1173,17 @@ if(BUILD_TEST)
   list(APPEND MULTIDEVICE_TEST_SRCS
     ${NVFUSER_ROOT}/tests/cpp/multidevice.cpp
     ${NVFUSER_ROOT}/tests/cpp/multidevice_transformer.cpp
-    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_overlap.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communications.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communicator.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir.cpp
+    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_overlap.cpp
+    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_ipc.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_sharding.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_stream_parallel_type.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_transformer.cpp
-    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_ipc.cpp
   )
   add_test_without_main(test_multidevice "${MULTIDEVICE_TEST_SRCS}" "")
   list(APPEND TEST_BINARIES test_multidevice)

csrc/host_ir/evaluator.cpp
Lines changed: 1 addition & 1 deletion

@@ -777,7 +777,7 @@ void HostIrEvaluator::handle(ShardByStream* shard) {
   IterDomain* stream_id = *i;

   auto in_tensor = getKnownConcreteValue(shard->in()).as<at::Tensor>();
-  int64_t stream_index =
+  auto stream_index =
       expr_evaluator_.evaluate(shard->stream_index()).as<int64_t>();
   at::Tensor out_tensor =
       in_tensor

csrc/host_ir/host_ir.cpp
Lines changed: 1 addition & 1 deletion

@@ -271,7 +271,7 @@ Wait::Wait(IrBuilderPasskey passkey, Expr* expr)
   NVF_ERROR(
       (expr->isOneOf<Communication, P2PCommunication, EndCoalescing>()),
       expr,
-      "must be a Communication, a P2PCommunication, or a EndCoalescing");
+      " must be a Communication, a P2PCommunication, or a EndCoalescing");
 }

 NVFUSER_DEFINE_CLONE_AND_CREATE(Wait)

csrc/host_ir/lowering.cpp
Lines changed: 64 additions & 28 deletions

@@ -20,18 +20,22 @@ namespace nvfuser {
 namespace {

 struct LoopInfo {
-  hir::ForLoop* loop;
+  hir::ForLoop* loop = nullptr;

   // The Scope that owns `loop`. It's one level outer than `loop`'s body scope.
-  Scope* parent_scope;
+  Scope* parent_scope = nullptr;

   // The iterator that points to `loop`. This way, we can insert instructions,
   // e.g. Allocate, right before the loop.
   Scope::Iterator parent_insertion_point;
 };

 std::ostream& operator<<(std::ostream& os, const LoopInfo& loop_info) {
-  os << loop_info.loop->toInlineString();
+  if (loop_info.loop == nullptr) {
+    os << "<null>";
+  } else {
+    os << loop_info.loop->toInlineString();
+  }
   return os;
 }

@@ -131,7 +135,7 @@ Expr* cloneWithNewOperands(
   int64_t out_replaced = std::ranges::count_if(new_outs, maybe_replace);

   if (in_replaced == 0 && out_replaced == 0) {
-    return 0;
+    return e;
   }

   if (out_replaced > 0) {
@@ -151,6 +155,14 @@ void lowerSegment(
     hir::HostIrContainer& hic,
     LoopNest& loop_nest,
     IrCloner& ir_cloner) {
+  Scope& innermost_scope = loop_nest.innermostScope();
+  // FIXME: cleanup. innermost can return an empty LoopInfo when the nest is
+  // empty.
+  LoopInfo innermost;
+  if (!loop_nest.empty()) {
+    innermost = loop_nest.innermost();
+  }
+
   switch (group.schedulerType()) {
     case SchedulerType::Communication: {
       auto device_id = Communicator::getInstance().deviceId();
@@ -162,24 +174,50 @@ void lowerSegment(
       // without cloning the value again.
       Expr* e = ir_cloner.clone(group.exprs().front());

-      for (auto* c : convertSingleOpToCommunication(e, device_id)) {
+      // FIXME: should this be associated with the scope?
+      std::unordered_map<Val*, Val*> replacement_map;
+      for (Expr* c : convertSingleOpToCommunication(e, device_id)) {
         NVF_ERROR(
             c->isA<Communication>(),
             "Exprs in a Communication group should be Communication: ",
             c);
-        // Allocate the recv buffers of communications
         auto* communication = c->as<Communication>();
-        TensorView* tv = communication->out();
-        if (tv->getDeviceMesh().has(device_id)) {
-          auto* allocate =
-              IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
-          // TODO: allocation may have to go to the top level. See how
-          // SchedulerType::ExprEval handles allocations.
-          loop_nest.innermostScope().push_back(allocate);
+        TensorView* in = communication->in();
+        TensorView* out = communication->out();
+        if (getShardedIterDomain(in, ParallelType::Stream, DomainType::kLoop) !=
+                nullptr &&
+            getShardedIterDomain(
+                in, ParallelType::Stream, DomainType::kAllocation) == nullptr) {
+          auto [i, inserted] = replacement_map.try_emplace(
+              in, hir::shardByStream(in, innermost.loop->index()));
+          if (inserted) {
+            innermost_scope.push_back(i->second->definition());
+          }
         }
-        loop_nest.innermostScope().push_back(communication);
-        auto wait = IrBuilder::create<hir::Wait>(communication);
-        loop_nest.innermostScope().push_back(wait);
+
+        // Allocate the recv buffers of communications
+        auto* allocate =
+            IrBuilder::create<kir::Allocate>(out, MemoryType::Global);
+        if (getShardedIterDomain(
+                out, ParallelType::Stream, DomainType::kLoop) != nullptr &&
+            getShardedIterDomain(
+                out, ParallelType::Stream, DomainType::kAllocation) ==
+                nullptr) {
+          innermost.parent_scope->insert(
+              innermost.parent_insertion_point, allocate);
+          auto [i, inserted] = replacement_map.try_emplace(
+              out, hir::shardByStream(out, innermost.loop->index()));
+          NVF_ERROR(inserted);
+          innermost_scope.push_back(i->second->definition());
+        } else {
+          innermost_scope.push_back(allocate);
+        }
+
+        Expr* new_c = cloneWithNewOperands(c, replacement_map);
+        innermost_scope.push_back(new_c);
+
+        auto* wait = IrBuilder::create<hir::Wait>(new_c);
+        innermost_scope.push_back(wait);
       }
       break;
     }
@@ -211,14 +249,11 @@ void lowerSegment(
       // TensorViews.
       if (loop_nest.empty()) {
        for (Expr* e : exprs) {
-          loop_nest.innermostScope().push_back(e);
+          innermost_scope.push_back(e);
        }
        break;
       }

-      auto [for_loop, parent_scope, parent_insertion_point] =
-          loop_nest.innermost();
-
       std::unordered_map<Val*, Val*> replacement_map;
       for (Expr* e : exprs) {
         for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
@@ -228,9 +263,9 @@ void lowerSegment(
                   in, ParallelType::Stream, DomainType::kAllocation) ==
               nullptr) {
             auto [i, inserted] = replacement_map.try_emplace(
-                in, hir::shardByStream(in, for_loop->index()));
+                in, hir::shardByStream(in, innermost.loop->index()));
             if (inserted) {
-              for_loop->body().push_back(i->second->definition());
+              innermost_scope.push_back(i->second->definition());
             }
           }
         }
@@ -241,21 +276,22 @@ void lowerSegment(
                 nullptr) {
           auto* allocate =
               IrBuilder::create<kir::Allocate>(out, MemoryType::Global);
-          parent_scope->insert(parent_insertion_point, allocate);
+          innermost.parent_scope->insert(
+              innermost.parent_insertion_point, allocate);
           // Loop is stream parallelized but allocation is not. Therefore,
           // `out` should be allocated outside the loop.
           //
           // I use try_emplace here so shardByStream is called only when `out`
           // is missing.
           auto [i, inserted] = replacement_map.try_emplace(
-              out, hir::shardByStream(out, for_loop->index()));
+              out, hir::shardByStream(out, innermost.loop->index()));
           NVF_ERROR(inserted);
-          for_loop->body().push_back(i->second->definition());
+          innermost_scope.push_back(i->second->definition());
         }
       }

       Expr* new_e = cloneWithNewOperands(e, replacement_map);
-      for_loop->body().push_back(new_e);
+      innermost_scope.push_back(new_e);
     }
     break;
   }
@@ -280,7 +316,7 @@ void lowerSegment(
         auto* tv = out->as<TensorView>();
         auto* allocate =
             IrBuilder::create<kir::Allocate>(tv, MemoryType::Global);
-        loop_nest.innermostScope().push_back(allocate);
+        innermost_scope.push_back(allocate);
       }

       // Add the LaunchKernel instruction.
@@ -296,7 +332,7 @@ void lowerSegment(
           ins,
           outs,
           cache_id);
-      loop_nest.innermostScope().push_back(launch_kernel);
+      innermost_scope.push_back(launch_kernel);
     }
   } // switch
 } // lowerSegment
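The lowering change above threads a replacement map through the segment: a tensor that is stream-parallelized in its loop domain but not in its allocation domain gets a per-iteration view via hir::shardByStream, its allocation is hoisted in front of the loop, and each expression is then re-created against that map with cloneWithNewOperands (which, after the fix, returns the original expression untouched when nothing was remapped). The following standalone sketch illustrates only that map-and-clone pattern; Tensor, Expr, and the two helper functions below are illustrative stand-ins, not nvFuser's API.

// Standalone sketch of the replacement-map pattern used in lowerSegment().
// Everything here (Tensor, Expr, shardByStream, cloneWithNewOperands) is an
// illustrative stand-in, not nvFuser code.
#include <deque>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct Tensor {
  std::string name;
};

struct Expr {
  std::string op;
  std::vector<Tensor*> ins;
  std::vector<Tensor*> outs;
};

// Stand-in for hir::shardByStream: creates a per-iteration view of `t`.
// A deque keeps pointers stable as views are appended.
Tensor* shardByStream(Tensor* t, int loop_index, std::deque<Tensor>& views) {
  views.push_back({t->name + "[i" + std::to_string(loop_index) + "]"});
  return &views.back();
}

// Stand-in for cloneWithNewOperands: if no operand is remapped, the original
// expression is returned unchanged (mirroring the `return e` fix above);
// otherwise a copy with substituted operands is returned.
Expr cloneWithNewOperands(
    const Expr& e,
    const std::unordered_map<Tensor*, Tensor*>& replacement_map) {
  Expr copy = e;
  bool replaced = false;
  for (std::vector<Tensor*>* operands : {&copy.ins, &copy.outs}) {
    for (Tensor*& t : *operands) {
      if (auto it = replacement_map.find(t); it != replacement_map.end()) {
        t = it->second;
        replaced = true;
      }
    }
  }
  return replaced ? copy : e;
}

int main() {
  std::deque<Tensor> views;
  Tensor in{"in"};
  Tensor out{"out"};
  Expr linear{"linear", {&in}, {&out}};

  std::unordered_map<Tensor*, Tensor*> replacement_map;
  // try_emplace inserts only when the key is absent, so each tensor gets at
  // most one sharded stand-in per loop nest.
  auto [it, inserted] =
      replacement_map.try_emplace(&in, shardByStream(&in, 0, views));
  std::cout << "sharded " << it->first->name << " -> " << it->second->name
            << " (inserted: " << std::boolalpha << inserted << ")\n";

  Expr lowered = cloneWithNewOperands(linear, replacement_map);
  // Prints: linear(in[i0]) -> out
  std::cout << lowered.op << "(" << lowered.ins[0]->name << ") -> "
            << lowered.outs[0]->name << "\n";
}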

csrc/multidevice/communicator.h
Lines changed: 3 additions & 3 deletions

@@ -62,12 +62,12 @@ class NVF_API Communicator {
   }

   // returns the number of processes in the communicator
-  auto size() const {
+  int64_t size() const {
     return size_;
   }

   // returns the local number of processes in the communicator (within the node)
-  auto local_size() const {
+  int64_t local_size() const {
     return local_size_;
   }

@@ -89,7+89,7 @@
       const std::string& prefix = "");

   // returns the device associated with the current process
-  auto device() const {
+  at::Device device() const {
     return at::Device("cuda:" + std::to_string(local_rank_));
   }

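The header change swaps deduced `auto` return types for the concrete types they resolve to, so the public interface is readable in the header itself. A toy sketch of the difference, not the real Communicator:

// Toy illustration only. With `auto`, the return type is deduced from a
// private member and is invisible to readers of the header; spelling out
// int64_t documents the contract where callers look for it.
#include <cstdint>

class ToyCommunicator {
 public:
  // Before: auto size() const { return size_; }  // type hidden behind size_
  int64_t size() const {  // After: the returned type is part of the interface
    return size_;
  }

 private:
  int64_t size_ = 1;
};

int main() {
  ToyCommunicator c;
  return static_cast<int>(c.size()) - 1;  // returns 0
}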

csrc/runtime/fusion_kernel_runtime.cpp
Lines changed: 2 additions & 2 deletions

@@ -7,6 +7,8 @@
 // clang-format on
 #include <runtime/fusion_kernel_runtime.h>

+#include <c10/cuda/CUDAGuard.h>
+
 #include <fusion.h>
 #include <fusion_profiler.h>
 #include <fusion_segmenter.h>
@@ -25,8 +27,6 @@
 #include <serde/fusion_cache_generated.h>
 #include <type.h>

-#include <c10/cuda/CUDAGuard.h>
-
 namespace nvfuser {

 namespace {

tests/cpp/test_multidevice_stream_parallel_type.cpp
Lines changed: 0 additions & 3 deletions

@@ -5,8 +5,6 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
-#include <iterator>
-
 #include <cuda_profiler_api.h>

 #include <fusion.h>
@@ -24,7 +22,6 @@
 namespace nvfuser {

 using testing::ElementsAre;
-using testing::SizeIs;

 using MultiDeviceStreamParallelTypeTest = MultiDeviceTest;


tests/python/multidevice/test_overlap.py
Lines changed: 43 additions & 0 deletions

@@ -10,6 +10,49 @@
 from nvfuser_direct import DataType, FusionDefinition, CommunicatorBackend, TensorView


+@pytest.mark.mpi
+def test_row_parallel_linear_forward(multidevice_direct_test):
+    # This is a port of CollectiveBasedOverlapTest.RowParallelLinear_Forward.
+    h, s, t = 2, 3, 6
+    d = multidevice_direct_test.size
+    if (h * 4) % d != 0:
+        pytest.skip(
+            f"Row-parallel linear requires {h * 4} to be divisible by world size {d}."
+        )
+    assert t % s == 0
+
+    mesh = nvfuser.multidevice.DeviceMesh(range(d))
+
+    with FusionDefinition() as fd:
+        inp = fd.define_tensor(
+            shape=[-1, h * 4], contiguity=True, dtype=DataType.BFloat16
+        )
+        weight = fd.define_tensor(
+            shape=[h, h * 4], contiguity=True, dtype=DataType.BFloat16
+        )
+        out = fd.ops.linear(inp, weight)
+        fd.add_output(out)
+
+        for tv in (inp, weight):
+            tv.set_device_mesh(mesh)
+
+        inp.split(0, s, inner_split=False)
+        inp.axis(0).parallelize(nvfuser.ParallelType.stream)
+        inp.split(2, d, inner_split=False)
+        inp.axis(2).parallelize(nvfuser.ParallelType.mesh_x)
+        weight.split(1, d, inner_split=False)
+        weight.axis(1).parallelize(nvfuser.ParallelType.mesh_x)
+
+    inp_ref = torch.randint(-2, 3, (t, h * 4), dtype=torch.int32).to(torch.bfloat16)
+    weight_ref = torch.randint(-2, 3, (h, h * 4), dtype=torch.int32).to(torch.bfloat16)
+    out_ref = torch.nn.functional.linear(inp_ref, weight_ref)
+
+    inp = (multidevice_direct_test.shard_tensor(inp_ref, -1, mesh),)
+    weight = (multidevice_direct_test.shard_tensor(weight_ref, -1, mesh),)
+    (out,) = fd.execute([inp, weight], _enable_options=["host_ir_lowering"])
+    torch.testing.assert_close(out.cpu(), out_ref)
+
+
 @pytest.mark.mpi
 @pytest.mark.parametrize("backend_type", [CommunicatorBackend.nccl])
 @pytest.mark.parametrize("s", [1, 8])
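The new test exercises a row-parallel linear layer: each device holds a slice of the inner (reduction) dimension of both the input and the weight, computes a partial linear locally, and the per-device partial results must then be summed across devices; that reduction is the communication the lowering change above places inside the stream loop. The single-process sketch below shows only why summing per-device partial products reproduces the unsharded result; the sizes are hypothetical and no nvFuser or distributed code is involved.

// Single-process sketch of row-parallel linear: out = inp * weight^T with the
// inner dimension k split across d "devices". Each device computes a partial
// product over its k-slice; summing the d partials equals the full result.
#include <cassert>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// out[t][h] = sum over k in [k_begin, k_end) of inp[t][k] * weight[h][k].
Matrix partialLinear(
    const Matrix& inp, const Matrix& weight, int k_begin, int k_end) {
  Matrix out(inp.size(), std::vector<float>(weight.size(), 0.0f));
  for (size_t t = 0; t < inp.size(); ++t) {
    for (size_t h = 0; h < weight.size(); ++h) {
      for (int k = k_begin; k < k_end; ++k) {
        out[t][h] += inp[t][k] * weight[h][k];
      }
    }
  }
  return out;
}

int main() {
  const int t = 6, k = 8, h = 2, d = 4;  // hypothetical sizes; k % d == 0
  Matrix inp(t, std::vector<float>(k));
  Matrix weight(h, std::vector<float>(k));
  // Small integer values so the float sums are exact.
  for (int i = 0; i < t; ++i) {
    for (int j = 0; j < k; ++j) {
      inp[i][j] = static_cast<float>((i + j) % 3 - 1);
    }
  }
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < k; ++j) {
      weight[i][j] = static_cast<float>((i * j) % 3 - 1);
    }
  }

  // Unsharded reference.
  Matrix ref = partialLinear(inp, weight, 0, k);

  // "Allreduce": sum the d per-device partial results.
  Matrix sum(t, std::vector<float>(h, 0.0f));
  for (int rank = 0; rank < d; ++rank) {
    Matrix part =
        partialLinear(inp, weight, rank * (k / d), (rank + 1) * (k / d));
    for (int i = 0; i < t; ++i) {
      for (int j = 0; j < h; ++j) {
        sum[i][j] += part[i][j];
      }
    }
  }

  for (int i = 0; i < t; ++i) {
    for (int j = 0; j < h; ++j) {
      assert(sum[i][j] == ref[i][j]);
    }
  }
  return 0;
}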
