Commit ef2bddf

Add plan dialect changes
1 parent b232f2b commit ef2bddf

20 files changed: +691, −372 lines

mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/Options.h

Lines changed: 4 additions & 0 deletions
@@ -34,6 +34,9 @@ namespace mlirtrt::compiler {
 /// DebugOptions are options that are common to different compiler API
 /// interfaces.
 struct DebugOptions {
+  /// Dump textual pipeline passes
+  bool dumpTextualPipeline = false;
+
   /// A directory path where the IR will be dumped during compilation
   /// using the `mlir-print-ir-tree-dir` mechanism.
   std::string dumpIRPath = "";
@@ -49,6 +52,7 @@ struct DebugOptions {
   mlir::SmallVector<std::string> llvmDebugTypes = {};

   void addToOptions(mlir::OptionsContext &context) {
+    context.addOption("dump-textual-pipeline", dumpTextualPipeline);
     context.addOption("mlir-print-ir-tree-dir", dumpIRPath, llvm::cl::init(""));
     context.addOption("debug", enableLLVMDebugFlag);
     context.addList<std::string>("debug-only", llvmDebugTypes,
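
The new flag is consumed later in `StableHloToExecutable.cpp`, where the textual pipeline is printed only when the option is set. A minimal sketch of that guard, assuming an already populated `mlir::PassManager`; the helper name and call site below are illustrative, not part of the commit:

```c++
#include "mlir-tensorrt/Compiler/Options.h"
#include "mlir/Pass/PassManager.h"
#include "llvm/Support/Debug.h"

// Print the pass pipeline only when `dump-textual-pipeline` was requested.
// Mirrors the guard added in StableHloToExecutable.cpp further down.
static void maybeDumpPipeline(const mlirtrt::compiler::DebugOptions &debugOpts,
                              mlir::PassManager &pm) {
  if (debugOpts.dumpTextualPipeline)
    pm.printAsTextualPipeline(llvm::dbgs());
}
```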

mlir-tensorrt/compiler/include/mlir-tensorrt/Compiler/StableHloToExecutable.h

Lines changed: 3 additions & 0 deletions
@@ -128,6 +128,9 @@ struct StableHLOToExecutableOptions : public mlir::OptionsContext {
   /// Whether to disallow host tensors in TensorRT clusters.
   bool disallowHostTensorsInTensorRTClusters = false;

+  /// Whether to use non-DPS style calling convention.
+  bool useNonDPSCallConv = false;
+
   /// Entrypoint function name.
   std::string entrypoint = "main";

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/IR/PlanOps.td

Lines changed: 117 additions & 23 deletions
@@ -131,10 +131,57 @@ def Plan_InlineGroupOp : Plan_GroupOpBase<"inline_group", [
 }

 //===----------------------------------------------------------------------===//
-// InlineClosedGroupOp
+// Plan_InlineClosedGroupBase
 //===----------------------------------------------------------------------===//

-def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
+class Plan_InlineClosedGroupBase<string mnemonic, list<Trait> traits = []> :
+      Plan_GroupOpBase<mnemonic, traits> {
+
+  code baseInlineClosedExtraClassDeclaration = baseExtraClassDeclaration # [{
+    // Common methods for both DPS and non-DPS versions
+    bool argHasTensorType(unsigned inputIdx) {
+      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
+      return isa<RankedTensorType>(getInputs()[inputIdx].getType());
+    }
+
+    BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
+      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
+      return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
+    }
+
+    /// Populate the `input_attrs` from an array of BoundsAttrs.
+    void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
+      setInputAttrsAttr(::mlir::ArrayAttr::get(
+        getOperation()->getContext(),
+        ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
+      ));
+    }
+
+    void getSuccessorRegionsBase(RegionBranchPoint point,
+                                 SmallVectorImpl<RegionSuccessor> &regions) {
+      // If the predecessor is the InlineClosedGroupOp, branch into the body.
+      if (point.isParent()) {
+        regions.push_back(RegionSuccessor(&getBody(), getBody().getArguments()));
+        return;
+      }
+
+      // Otherwise, the region branches back to the parent operation.
+      regions.push_back(RegionSuccessor(getResults()));
+    }
+
+    OperandRange getEntrySuccessorOperandsBase(RegionBranchPoint point) {
+      return getOperands();
+    }
+  }];
+
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
+}
+
+//===----------------------------------------------------------------------===//
+// Plan_InlineClosedGroupOp
+//===----------------------------------------------------------------------===//
+
+def Plan_InlineClosedGroupOp : Plan_InlineClosedGroupBase<"inline_closed_group", [
     IsolatedFromAbove,
     AttrSizedOperandSegments,
     DestinationStyleOpInterface,
@@ -226,24 +273,12 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
                    CArg<"ArrayRef<BoundsAttr>", "{}">:$res_attrs)>
   ];

-  let extraClassDeclaration = baseExtraClassDeclaration # [{
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration # [{

     MutableOperandRange getDpsInitsMutable() {
       return getOutsMutable();
     }

-    /// Returns true if the `i-th` input argument has a tensor type.
-    bool argHasTensorType(unsigned inputIdx) {
-      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
-      return isa<RankedTensorType>(getInputs()[inputIdx].getType());
-    }
-
-    /// Returns the i-th input argument's bounds attribute.
-    BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
-      assert(inputIdx < getInputs().size() && "input index out-of-bounds");
-      return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
-    }
-
     ArrayRef<BlockArgument> getRegionOutArgs() {
       return getBody().getArguments().take_back(getOuts().size());
     }
@@ -255,16 +290,75 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
         ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
       ));
     }
+  }];
+}

-    /// Populate the `input_attrs` from an array of BoundsAttrs.
-    void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
-      setInputAttrsAttr(::mlir::ArrayAttr::get(
-        getOperation()->getContext(),
-        ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
-      ));
-    }
+//===----------------------------------------------------------------------===//
+// InlineClosedGroupNonDPSOp
+//===----------------------------------------------------------------------===//
+
+def Plan_InlineClosedGroupNonDPSOp : Plan_InlineClosedGroupBase<"inline_closed_group_non_dps", [
+    IsolatedFromAbove,
+    SingleBlockImplicitTerminator<"plan::YieldOp">,
+    DeclareOpInterfaceMethods<RegionBranchOpInterface,
+                              ["getEntrySuccessorOperands"]>,
+    DeclareOpInterfaceMethods<OpAsmOpInterface,
+                              ["getAsmBlockArgumentNames"]>
+  ]> {
+  let description = [{
+    The `plan.inline_closed_group_non_dps` operation is a variant of the
+    `plan.inline_closed_group` operation that does not use destination-passing
+    style (DPS). It is isolated from above and explicitly captures input
+    operands, but unlike its DPS counterpart, it does not capture destination
+    operands. This operation takes input operands and their corresponding
+    bounds attributes, and produces results. The `input_attrs` hold bounds
+    attribute information for the input operands. The absence of bounds
+    information is allowed (`none` bounds).
+
+    The `target` attribute specifies the execution target for the group.
+
+    #### Example
+
+    Consider the following simple program containing operations with
+    dynamically shaped operands:
+
+    ```mlir
+    %0 = ... : tensor<?xf32> // A dynamically shaped operand
+    %1 = ... : index // A dynamic calculation of %0's extent
+
+    %2 = plan.inline_closed_group_non_dps target(#plan.cluster_target<tensorrt>)
+        inputs(%0, %1 : tensor<?xf32>, index)
+        in_attrs [#plan.bounds<shape, , >, #plan.bounds<none>] -> tensor<?xf32> {
+      %3 = plan.with_shape %0 (%1) : (tensor<?xf32>, index) -> tensor<?xf32>
+      %4 = stablehlo.exponential %3 : tensor<?xf32>
+      yield %4 : tensor<?xf32>
+    }
+    ```

   }];
+  let arguments = (ins Variadic<AnyTypeOf<[AnyRankedTensor, AnySignlessIntegerOrIndex]>>:$inputs,
+                       BoundsAttrArray:$input_attrs,
+                       AnyAttr:$target);
+
+  let results = (outs Variadic<AnyTypeOf<[AnyRankedTensor]>>:$results);
+
+  let assemblyFormat = [{
+    `target` `(` $target `)` `\n`
+    `inputs` `(` ( $inputs^ `:` type($inputs) `)` ) : ( `)` )? `\n`
+    `in_attrs` $input_attrs `\n`
+    attr-dict-with-keyword `->` type($results)
+    $body
+  }];
+
+  let hasVerifier = 1;
+
+  let skipDefaultBuilders = 1;
+
+  let builders = [
+    OpBuilder<(ins "TypeRange":$results,
+                   "Attribute":$target,
+                   "ValueRange":$inputs,
+                   CArg<"ArrayRef<BoundsAttr>", "{}">:$input_attrs)>,
+  ];
+
+  let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
 }

 //===----------------------------------------------------------------------===//
@@ -276,7 +370,7 @@ def Plan_YieldOp : Plan_Op<"yield", [
     Terminator,
    ReturnLike,
     ParentOneOf<["plan::InlineGroupOp",
-                 "plan::InlineClosedGroupOp"]>]> {
+                 "plan::InlineClosedGroupOp", "plan::InlineClosedGroupNonDPSOp"]>]> {

   let arguments = (ins Variadic<AnyType>:$results);
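
Because `skipDefaultBuilders` is set and only the custom builder above is declared, C++ clients construct the non-DPS op through that builder. A minimal sketch under assumptions: the header path, the `mlir::plan` namespace qualification, and the `createNonDPSGroup` helper are illustrative, while the parameter list follows the TableGen builder declared above.

```c++
#include "mlir-tensorrt/Dialect/Plan/IR/PlanOps.h" // assumed header location
#include "mlir/IR/Builders.h"

// Create a plan.inline_closed_group_non_dps op: results are produced by the
// op itself, so no destination (`outs`) operands are threaded through.
static mlir::plan::InlineClosedGroupNonDPSOp
createNonDPSGroup(mlir::OpBuilder &b, mlir::Location loc,
                  mlir::TypeRange resultTypes, mlir::Attribute target,
                  mlir::ValueRange inputs,
                  llvm::ArrayRef<mlir::plan::BoundsAttr> inputBounds) {
  // Maps onto the TableGen builder:
  // (TypeRange results, Attribute target, ValueRange inputs,
  //  ArrayRef<BoundsAttr> input_attrs).
  return b.create<mlir::plan::InlineClosedGroupNonDPSOp>(
      loc, resultTypes, target, inputs, inputBounds);
}
```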

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.h

Lines changed: 2 additions & 1 deletion
@@ -69,7 +69,8 @@ executorOneShotModuleBufferize(ModuleOp targetOp,
                                const ExecutorBufferizationOptions &options);

 /// Build a pipeline (targeting ModuleOp) for bufferization.
-void buildPlanBufferizationPipeline(OpPassManager &pm);
+void buildPlanBufferizationPipeline(
+    OpPassManager &pm, const plan::PlanAllocTensorsPassOptions &options);

 /// Build a post-bufferization pipeline that performs optimizations on memrefs.
 void buildPlanBufferOptimizationPipeline(OpPassManager &pm);
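
A short sketch of a caller adapting to the new signature. The wrapper name and the `mlir::plan` namespace qualification are assumptions for illustration; the option struct and pipeline-builder names come from the headers above.

```c++
#include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Populate a module-level pass manager with the plan bufferization pipeline,
// forwarding the calling-convention choice down to plan-alloc-tensors.
void addPlanBufferization(mlir::OpPassManager &pm, bool useNonDPSCallConv) {
  mlir::plan::PlanAllocTensorsPassOptions allocTensorOpts{};
  allocTensorOpts.useNonDPSCallConv = useNonDPSCallConv;
  mlir::plan::buildPlanBufferizationPipeline(pm, allocTensorOpts);
  mlir::plan::buildPlanBufferOptimizationPipeline(pm);
}
```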

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td

Lines changed: 14 additions & 1 deletion
@@ -248,6 +248,9 @@ def StablehloClusteringPass : Pass<"stablehlo-clustering", "::mlir::ModuleOp"> {
     Option<"entrypoint", "entrypoint", "std::string", "\"\"",
            "the name of the entrypoint function; if empty then the clustering runs"
            " on all functions">,
+    Option<"useNonDPSCallConv",
+           "use-non-dps-call-conv", "bool", "false",
+           "allow tensorrt based output allocations using output allocator">,
     Option<"disallowHostTensorsInTensorRTClusters",
            "disallow-host-tensors-in-tensorrt-clusters", "bool", "false",
            "don't cluster host tensors in TensorRT clusters">,
@@ -332,7 +335,10 @@ def CreateClosedRegionsPass : Pass<"plan-create-closed-regions", "::mlir::Module
     Option<"testPreWalkOrder", "test-pre-walk-order", "bool", "false",
            "(used only in testing) specifies to outline regions by walking in "
            " pre-order; used for verifying results are not sensitive "
-           "to traversal order">
+           "to traversal order">,
+    Option<"useNonDPSCallConv", "use-non-dps-call-conv", "bool",
+           /*default=*/"false",
+           "Allow TensorRT-based output allocations using output allocator">
   ];

   let dependentDialects = [
@@ -428,6 +434,13 @@ def PlanAllocTensorsPass : Pass<"plan-alloc-tensors",
     "::mlir::bufferization::BufferizationDialect",
     "::mlir::plan::PlanDialect"
   ];
+
+  let options = [
+    Option<"useNonDPSCallConv", "use-non-dps-call-conv", "bool",
+           /*default=*/"false",
+           "Allow TensorRT-based output allocations using output allocator">
+  ];
+
 }

 //===----------------------------------------------------------------------===//
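
Because these are ordinary TableGen pass options, the flag can also be set from a textual pipeline string using standard MLIR pipeline syntax. A sketch under assumptions: the helper name is illustrative, the Plan passes are registered with the pass registry, and `plan-alloc-tensors` anchors on the module; the pass and option names are taken from the definitions above.

```c++
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Support/LogicalResult.h"

// Append plan-alloc-tensors to an existing module pass manager, enabling the
// non-DPS calling convention through the `use-non-dps-call-conv` option.
mlir::LogicalResult configurePlanAllocTensors(mlir::PassManager &pm) {
  return mlir::parsePassPipeline(
      "plan-alloc-tensors{use-non-dps-call-conv=true}", pm);
}
```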

mlir-tensorrt/compiler/lib/Compiler/StableHloToExecutable.cpp

Lines changed: 20 additions & 6 deletions
@@ -222,6 +222,10 @@ StableHLOToExecutableOptions::StableHLOToExecutableOptions(
             disallowHostTensorsInTensorRTClusters, llvm::cl::init(false),
             llvm::cl::desc("Don't allow TensorRt clusters to contain host tensor "
                            "calculations (but they can still be inputs)"));
+  addOption(
+      "use-non-dps-call-conv", useNonDPSCallConv, llvm::cl::init(false),
+      llvm::cl::desc(
+          "allow tensorrt based output allocations using output allocator"));
   addOption("executor-index-bitwidth", executorIndexBitwidth,
             llvm::cl::init(64));
   addOption("device-compute-capability", deviceComputeCapability,
@@ -303,6 +307,7 @@ void StableHloToExecutableTask::buildStablehloClusteringPipeline(
   plan::StablehloClusteringPassOptions clusteringOpts{};
   clusteringOpts.disallowHostTensorsInTensorRTClusters =
       opts.disallowHostTensorsInTensorRTClusters;
+  clusteringOpts.useNonDPSCallConv = opts.useNonDPSCallConv;
   clusteringOpts.entrypoint = opts.entrypoint;
   plan::buildPlanSegmentationPipeline(pm, clusteringOpts);

@@ -336,7 +341,9 @@ void StableHloToExecutableTask::buildPostClusteringPipeline(

   // Perform bufferization.
   pm.addPass(createMemRefCastEliminationPass());
-  pm.addPass(plan::createPlanAllocTensorsPass());
+  plan::PlanAllocTensorsPassOptions allocTensorsOpts{};
+  allocTensorsOpts.useNonDPSCallConv = opts.useNonDPSCallConv;
+  pm.addPass(plan::createPlanAllocTensorsPass(allocTensorsOpts));
   pm.addPass(plan::createPlanBufferizePass());
   pm.addPass(createMemRefCastEliminationPass());
   pm.addPass(createCanonicalizerPass());
@@ -485,13 +492,14 @@ StableHloToExecutableTask::compileStableHLOToExecutable(
     runner = pm.get();
   }

-  runner->printAsTextualPipeline(llvm::dbgs());
+  if (options.debugOptions.dumpTextualPipeline)
+    runner->printAsTextualPipeline(llvm::dbgs());

   // Setup pass manager
-  // if (failed(runner->run(module)))
-  //   return getInternalErrorStatus(
-  //       "failed to run compilation on module with symbol name: {0}",
-  //       module.getName() ? *module.getName() : "no-symbol-name");
+  if (failed(runner->run(module)))
+    return getInternalErrorStatus(
+        "failed to run compilation on module with symbol name: {0}",
+        module.getName() ? *module.getName() : "no-symbol-name");

   // Translate to Runtime Executable
   FailureOr<std::unique_ptr<runtime::ExecutableStorage>> exeStorage =
@@ -524,6 +532,11 @@ struct ClusteringPipelineCliOpts
       *this, "device-compute-capability",
       llvm::cl::desc("target device compute capability (SM version)"),
       llvm::cl::init(60)};
+  Option<bool> useNonDPSCallConv{
+      *this, "use-non-dps-call-conv",
+      llvm::cl::desc(
+          "allow tensorrt based output allocations using output allocator"),
+      llvm::cl::init(false)};
   Option<int64_t> deviceMaxSharedMemoryPerBlockKb{
       *this, "device-max-smem-per-block",
       llvm::cl::desc("max shared memory per block (in kilobytes)"),
@@ -551,6 +564,7 @@ static StableHLOToExecutableOptions populateStablehloClusteringPipelineOpts(
   opts.deviceComputeCapability = cliOpts.deviceComputeCapability;
   opts.deviceMaxSharedMemoryPerBlockKb =
       cliOpts.deviceMaxSharedMemoryPerBlockKb;
+  opts.useNonDPSCallConv = cliOpts.useNonDPSCallConv;
   opts.shouldInferDeviceOptionsFromHost = cliOpts.inferDeviceOptionsFromHost;
   opts.entrypoint = cliOpts.entrypoint;
   return opts;

mlir-tensorrt/compiler/lib/Conversion/TensorRTRuntimeToExecutor/TensorRTRuntimeToExecutor.cpp

Lines changed: 6 additions & 4 deletions
@@ -263,7 +263,7 @@ struct ConvertEnqueueAllocToCall
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);

     // Function name for the enqueue alloc operation
-    std::string funcName = "_trtrt_alloc_enqueue";
+    std::string funcName = "_trtrt_enqueue_alloc";

     // Create new operands for the call op
     SmallVector<Value> newOperands = {adaptor.getExecutionContext(),
@@ -394,7 +394,7 @@ struct ConvertEnqueueAllocToCall
           ArrayRef<OpFoldResult>{this->createIndexConstant(b, 0),
                                  rewriter.getI64IntegerAttr(offset++)});

-      Value rankValue = b.create<executor::LoadOp>(
+      [[maybe_unused]] Value rankValue = b.create<executor::LoadOp>(
           b.getI64Type(), outputDescriptors, rankOffset);
       Value intPtr = b.create<executor::LoadOp>(
           b.getI64Type(), outputDescriptors, devicePtrOffset);
@@ -429,8 +429,10 @@ struct ConvertEnqueueAllocToCall
       resultRange.append(shapes.begin(), shapes.end());
       resultRange.append(strides.begin(), strides.end());

-      Value result = b.create<executor::CreateTableOp>(executor::TableType::get(
-          b.getContext(), llvm::to_vector(TypeRange(resultRange))));
+      Value result = b.create<executor::CreateTableOp>(
+          executor::TableType::get(b.getContext(),
+                                   llvm::to_vector(TypeRange(resultRange))),
+          resultRange);

       results.push_back(result);
     }
