Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Set allocation as loop in fusion segmentor #3880

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions csrc/fusion_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1207,6 +1207,7 @@ TensorView* castIntermediateValueInCompleteFusion(
//! Final fix-up pass run once segmentation is complete. The three steps are
//! order-dependent: unused vals must be removed before boundary edges are
//! rewritten, and sharded boundary TVs are adjusted last.
void SegmentedFusion::finalize() {
  // Drop expressions/values that no segment references anymore.
  impl_.cleanUnused();
  // Lower the FP precision of segment-boundary inputs/outputs where
  // applicable (see castInputOutputToLowerPrecision below).
  castInputOutputToLowerPrecision(edges());
  // Pin the allocation domain of sharded boundary TensorViews to their loop
  // domain (see setAllocationAsLoopForShardedTvs).
  setAllocationAsLoopForShardedTvs();
}

//! Lower FP precision of inputs and outputs specified by the given
Expand Down Expand Up @@ -1435,6 +1436,22 @@ void SegmentedFusion::revertInputOutputPrecisionChanges(
}
}

//! For every segment, sets the allocation domain of each sharded TensorView
//! appearing as a group input or output to that TV's loop domain, so the
//! device-parallel (sharded) axis is reflected in how the tensor is
//! allocated across segment boundaries.
void SegmentedFusion::setAllocationAsLoopForShardedTvs() {
  // Take the vals by const reference: group->inputs()/outputs() yield a
  // std::vector<Val*>, and the original lambda copied it on every call.
  const auto set_allocation_as_loop = [](const std::vector<Val*>& vals) {
    for (auto* tv : ir_utils::filterByType<TensorView>(vals)) {
      if (isSharded(tv)) {
        // `true` here is the contiguity argument passed alongside the new
        // domain — NOTE(review): presumably marks all axes contiguous;
        // confirm against setAllocationDomain's contract.
        tv->setAllocationDomain(tv->getLoopDomain(), true);
      }
    }
  };

  for (auto* group : groups()) {
    set_allocation_as_loop(group->inputs());
    set_allocation_as_loop(group->outputs());
  }
}

//! An utility class to compute and maintain the "producers of"
//! relationship in a segmented graph. Space heavy and should
//! avoid use on very large graphs.
Expand Down
3 changes: 3 additions & 0 deletions csrc/fusion_segmenter.h
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,9 @@ class SegmentedFusion {
//! Deserialize SegmentedFusion using flatbuffers
void deserialize(const serde::SegmentedFusion* buffer);

//! Set allocation domain as loop domain for sharded tensors
void setAllocationAsLoopForShardedTvs();

private:
void validateDAG() const;
void validateDisjoint() const;
Expand Down
35 changes: 23 additions & 12 deletions csrc/multidevice/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,20 +88,31 @@ std::pair<std::vector<IterDomain*>, std::vector<IterDomain*>> getShardingChanges
}

bool isSharded(const TensorView* tv) {
bool is_sharded = false;
for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
if (!alloc_id->isDeviceDim()) {
continue;
}
// First check allocation domain if available, or the logical domain.
auto num_sharded_axes = std::count_if(
tv->getMaybeAllocationDomain().begin(),
tv->getMaybeAllocationDomain().end(),
[](IterDomain* id) { return id->isDeviceDim(); });

// Only one axis can be sharded on DIDx.
NVF_ERROR(
!is_sharded,
"Multiple IterDomains parallelized on DIDx in TensorView ",
tv);
is_sharded = true;
if (num_sharded_axes == 1) {
return true;
}
return is_sharded;

// Check if only the loop domain is sharded.
// It is possible if the allocation domain has not been set yet.
if (num_sharded_axes == 0) {
num_sharded_axes = std::count_if(
tv->getLoopDomain().begin(),
tv->getLoopDomain().end(),
[](IterDomain* id) { return id->isDeviceDim(); });
}

NVF_ERROR(
num_sharded_axes <= 1,
"Multiple IterDomains parallelized on DIDx in TensorView ",
tv);

return num_sharded_axes == 1;
}

namespace {
Expand Down
2 changes: 1 addition & 1 deletion csrc/preseg_passes/pre_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace nvfuser::preseg_passes {
OptimizationPass<PropagateShardingsPass>::runPass(fusion);
OptimizationPass<InsertReshardingsPass>::runPass(fusion);
OptimizationPass<ReorderShardedAxisPass>::runPass(fusion);
OptimizationPass<MakeReshardingContiguousPass>::runPass(fusion);
// OptimizationPass<MakeReshardingContiguousPass>::runPass(fusion);

// Replace TensorViews with zero extent. Outputs and inputs may still be empty
OptimizationPass<RemoveEmptyPass>::runPass(fusion);
Expand Down
Loading