Skip to content

Commit 02e0055

Browse files
Fix how we decide whether to create ShardByStream for inputs (#5562)
## Summary
- Add a helper to fetch the requested TensorView domain, and let `haveDifferentShardings` take explicit producer and consumer `DomainType` arguments.
- Update all callers, including the resharding passes and the stream unit test, to pass the desired domain types.
- Fix a bug in host IR lowering: it should inspect an input's allocation (not loop) domain when deciding whether to insert `shardByStream`.

## Testing
- Not run (not requested).

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 8b08d20 commit 02e0055

File tree

4 files changed

+108
-43
lines changed

4 files changed

+108
-43
lines changed

csrc/host_ir/lowering.cpp

Lines changed: 62 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -84,29 +84,45 @@ std::ostream& operator<<(std::ostream& os, const LoopNest& loop_nest) {
8484
return os;
8585
}
8686

87+
int numParallelIterDomains(const TensorView* tv) {
88+
return std::ranges::count_if(
89+
tv->getLoopDomain(), [](IterDomain* id) { return id->isParallelized(); });
90+
}
91+
92+
template <typename R>
93+
TensorView* findMostParallelTensorView(const R& range) {
94+
TensorView* reference = nullptr;
95+
int max_parallel_count = -1;
96+
for (TensorView* tv : range) {
97+
auto parallel_count = numParallelIterDomains(tv);
98+
if (parallel_count > max_parallel_count) {
99+
max_parallel_count = parallel_count;
100+
reference = tv;
101+
}
102+
}
103+
return reference;
104+
}
105+
87106
// Finds the TensorView in the group whose loop domain has the most parallel
88107
// types and returns its loop domain.
89-
const std::vector<IterDomain*>& findReferenceLoopDomain(
108+
const std::vector<IterDomain*>& findMostParallelLoopDomain(
90109
const SegmentedGroup& group) {
91-
TensorView* reference_tv = nullptr;
110+
TensorView* reference = nullptr;
92111
int max_parallel_count = -1;
93-
for (auto* expr : group.exprs()) {
94-
for (auto* tv : ir_utils::filterByType<TensorView>(expr->outputs())) {
95-
auto loop_domain = tv->getLoopDomain();
96-
int parallel_count = 0;
97-
for (auto* id : loop_domain) {
98-
if (id->isParallelized()) {
99-
parallel_count++;
100-
}
101-
}
102-
if (parallel_count > max_parallel_count) {
103-
max_parallel_count = parallel_count;
104-
reference_tv = tv;
105-
}
112+
for (Expr* expr : group.exprs()) {
113+
TensorView* tv = findMostParallelTensorView(
114+
ir_utils::filterByType<TensorView>(expr->outputs()));
115+
if (tv == nullptr) {
116+
continue;
117+
}
118+
auto parallel_count = numParallelIterDomains(tv);
119+
if (parallel_count > max_parallel_count) {
120+
max_parallel_count = parallel_count;
121+
reference = tv;
106122
}
107123
}
108-
NVF_ERROR(reference_tv != nullptr);
109-
return reference_tv->getLoopDomain();
124+
NVF_ERROR(reference != nullptr, "Can't find any TensorView in ", &group);
125+
return reference->getLoopDomain();
110126
}
111127

112128
// Returns a new Expr with the inputs and outputs replaced by the replacement
@@ -217,11 +233,34 @@ void lowerSegment(
217233
std::unordered_map<Val*, Val*> replacement_map;
218234
for (Expr* e : exprs) {
219235
for (auto* in : ir_utils::filterByType<TensorView>(e->inputs())) {
220-
if (getShardedIterDomain(
221-
in, ParallelType::Stream, DomainType::kLoop) != nullptr &&
222-
getShardedIterDomain(
223-
in, ParallelType::Stream, DomainType::kAllocation) ==
224-
nullptr) {
236+
// A loop domain should go with an Expr rather than each individual
237+
// output TensorView. Before this is fixed, pick the most parallel
238+
// output TensorView as a proxy.
239+
TensorView* out = findMostParallelTensorView(
240+
ir_utils::filterByType<TensorView>(e->outputs()));
241+
if (out == nullptr) {
242+
continue;
243+
}
244+
// Check whether in's **allocation** and out's loop are sharded on
245+
// ParallelType::Stream consistently. If not, insert a ShardByStream.
246+
//
247+
// Consider the following example:
248+
// ```
249+
// in: [m, k] w: [k, n] # logical/allocation
250+
// |
251+
// | matmul
252+
// v
253+
// out: [m, n] logical
254+
// / \.
255+
// s m/s loop
256+
// ```
257+
// `in` needs to be sharded by stream regardless of its loop domain.
258+
if (haveDifferentShardings(
259+
in,
260+
DomainType::kAllocation,
261+
out,
262+
DomainType::kLoop,
263+
{ParallelType::Stream})) {
225264
auto [i, inserted] = replacement_map.try_emplace(
226265
in, hir::shardByStream(in, for_loop->index()));
227266
if (inserted) {
@@ -345,7 +384,7 @@ std::unique_ptr<hir::HostIrContainer> lowerSegmentedFusionToHostIr(
345384
for (SegmentedGroup* group :
346385
prepareRuntimeOrder(segmented_fusion).group_run_order) {
347386
const std::vector<IterDomain*>& curr_ref_loop =
348-
findReferenceLoopDomain(*group);
387+
findMostParallelLoopDomain(*group);
349388
const int64_t inline_position =
350389
computeInlinePosition(prev_ref_loop, curr_ref_loop, id_model);
351390
while (loop_nest.size() > inline_position) {

csrc/multidevice/utils.cpp

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,26 @@ std::ostream& operator<<(std::ostream& os, DomainType domain_type) {
4343
std::unreachable();
4444
}
4545

46+
namespace {
47+
48+
const std::vector<IterDomain*>& getDomainOf(
49+
const TensorView* tv,
50+
DomainType domain_type) {
51+
switch (domain_type) {
52+
case DomainType::kRoot:
53+
return tv->getMaybeRootDomain();
54+
case DomainType::kLogical:
55+
return tv->getLogicalDomain();
56+
case DomainType::kLoop:
57+
return tv->getLoopDomain();
58+
case DomainType::kAllocation:
59+
return tv->getMaybeAllocationDomain();
60+
}
61+
std::unreachable();
62+
}
63+
64+
} // namespace
65+
4666
bool isSharded(const TensorView* tv) {
4767
bool is_sharded = false;
4868
for (IterDomain* id : tv->getLoopDomain()) {
@@ -214,20 +234,7 @@ IterDomain* getShardedIterDomain(
214234
const TensorView* tv,
215235
const ParallelType parallel_type,
216236
const DomainType domain_type) {
217-
const std::vector<IterDomain*>& domain =
218-
[&]() -> const std::vector<IterDomain*>& {
219-
switch (domain_type) {
220-
case DomainType::kRoot:
221-
return tv->getMaybeRootDomain();
222-
case DomainType::kLogical:
223-
return tv->getLogicalDomain();
224-
case DomainType::kLoop:
225-
return tv->getLoopDomain();
226-
case DomainType::kAllocation:
227-
return tv->getMaybeAllocationDomain();
228-
}
229-
std::unreachable();
230-
}();
237+
const auto& domain = getDomainOf(tv, domain_type);
231238

232239
for (IterDomain* id : domain | TensorDomain::kNoReductions) {
233240
if (id->getParallelType() == parallel_type) {
@@ -318,7 +325,9 @@ std::unordered_set<IterDomain*> getInputsInTargetDomain(
318325

319326
bool haveDifferentShardings(
320327
const TensorView* producer,
328+
DomainType producer_domain_type,
321329
const TensorView* consumer,
330+
DomainType consumer_domain_type,
322331
const std::unordered_set<ParallelType>& parallel_types) {
323332
// cpu scalars are not parallelized
324333
if (producer->isCpuScalar() || consumer->isCpuScalar()) {
@@ -342,6 +351,9 @@ bool haveDifferentShardings(
342351
return true;
343352
}
344353

354+
const auto& producer_domain = getDomainOf(producer, producer_domain_type);
355+
const auto& consumer_domain = getDomainOf(consumer, consumer_domain_type);
356+
345357
// Special handling of SelectOp for a quick fix
346358
// TODO: work on a proper implementation
347359
if (consumer->definition()->isA<SelectOp>()) {
@@ -373,8 +385,8 @@ bool haveDifferentShardings(
373385
.mapBroadcast(false)
374386
.mapConsumerToProducer();
375387
return !std::all_of(
376-
consumer->getLoopDomain().begin(),
377-
consumer->getLoopDomain().end(),
388+
consumer_domain.begin(),
389+
consumer_domain.end(),
378390
[&c2p, &parallel_types](IterDomain* c_id) {
379391
auto p_id = c2p.at(c_id);
380392
auto p_id_pt = p_id->getParallelType();
@@ -455,9 +467,9 @@ bool haveDifferentShardings(
455467
// optimization, we create indices only for those that parallel_types depend
456468
// on.
457469
std::unordered_map<ParallelType, IterDomain*> p_parallel_type_to_id =
458-
mapDeviceAndStreamParallelTypeToId(producer->getLoopDomain());
470+
mapDeviceAndStreamParallelTypeToId(producer_domain);
459471
std::unordered_map<ParallelType, IterDomain*> c_parallel_type_to_id =
460-
mapDeviceAndStreamParallelTypeToId(consumer->getLoopDomain());
472+
mapDeviceAndStreamParallelTypeToId(consumer_domain);
461473
for (const auto parallel_type : parallel_types) {
462474
if (IterDomain* p_loop_id =
463475
getOrDefault(p_parallel_type_to_id, parallel_type)) {
@@ -548,6 +560,14 @@ bool haveDifferentShardings(
548560
return false;
549561
}
550562

563+
bool haveDifferentShardings(
564+
const TensorView* producer,
565+
const TensorView* consumer,
566+
const std::unordered_set<ParallelType>& parallel_types) {
567+
return haveDifferentShardings(
568+
producer, DomainType::kLoop, consumer, DomainType::kLoop, parallel_types);
569+
}
570+
551571
bool isResharding(const Expr* expr) {
552572
FUSER_PERF_SCOPE("isResharding");
553573

csrc/multidevice/utils.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ NVF_API bool isResharding(const Expr* expr);
4949

5050
// Returns whether two tensors have different shardings. Expect a
5151
// producer/consumer relationship between the arguments.
52+
bool haveDifferentShardings(
53+
const TensorView* producer,
54+
DomainType producer_domain_type,
55+
const TensorView* consumer,
56+
DomainType consumer_domain_type,
57+
const std::unordered_set<ParallelType>& parallel_types);
58+
59+
// Same as the above but checks loop domains for both producer and consumer.
5260
bool haveDifferentShardings(
5361
const TensorView* producer,
5462
const TensorView* consumer,

tests/cpp/test_stream.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,6 @@ TEST_F(StreamTest, Matmul) {
7878
fusion->addInput(w);
7979
fusion->addOutput(out);
8080

81-
w->outer_split(1, c);
82-
w->axis(1)->parallelize(ParallelType::Stream);
8381
out->outer_split(1, c);
8482
out->axis(1)->parallelize(ParallelType::Stream);
8583
}

0 commit comments

Comments
 (0)