Add plan dialect changes
jhalakpatel committed Oct 14, 2024
1 parent b232f2b commit 8a9ae8c
Showing 21 changed files with 693 additions and 381 deletions.
@@ -34,6 +34,9 @@ namespace mlirtrt::compiler {
/// DebugOptions are options that are common to different compiler API
/// interfaces.
struct DebugOptions {
/// Dump the pass pipeline in textual form before running it.
bool dumpTextualPipeline = false;

/// A directory path where the IR will be dumped during compilation
/// using the `mlir-print-ir-tree-dir` mechanism.
std::string dumpIRPath = "";
@@ -49,6 +52,7 @@ struct DebugOptions {
mlir::SmallVector<std::string> llvmDebugTypes = {};

void addToOptions(mlir::OptionsContext &context) {
context.addOption("dump-textual-pipeline", dumpTextualPipeline);
context.addOption("mlir-print-ir-tree-dir", dumpIRPath, llvm::cl::init(""));
context.addOption("debug", enableLLVMDebugFlag);
context.addList<std::string>("debug-only", llvmDebugTypes,
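The flag itself only registers a boolean on the options context; the compiler consults it right before running the pipeline (see the `StableHloToExecutable.cpp` hunk further down). A minimal sketch of that pattern, with the options struct reduced to a plain bool so the snippet is self-contained; everything outside the two named MLIR/LLVM calls is an assumption:

```cpp
#include "mlir/Pass/PassManager.h"
#include "llvm/Support/Debug.h"

// Mirrors the guarded dump added in compileStableHLOToExecutable(): the
// textual pipeline goes to the LLVM debug stream only when requested.
// `dumpTextualPipeline` stands in for DebugOptions::dumpTextualPipeline.
void maybeDumpPipeline(mlir::PassManager &pm, bool dumpTextualPipeline) {
  if (dumpTextualPipeline)
    pm.printAsTextualPipeline(llvm::dbgs());
}
```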
@@ -128,6 +128,9 @@ struct StableHLOToExecutableOptions : public mlir::OptionsContext {
/// Whether to disallow host tensors in TensorRT clusters.
bool disallowHostTensorsInTensorRTClusters = false;

/// Whether to use non-DPS style calling convention.
bool useNonDPSCallConv = false;

/// Entrypoint function name.
std::string entrypoint = "main";

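Like the other flags registered on this options context, `use-non-dps-call-conv` can be set through the options string or directly on the struct before compilation; the value is later copied into the clustering and alloc-tensors pass options (see `StableHloToExecutable.cpp` below). A hypothetical sketch, assuming a fully constructed `StableHLOToExecutableOptions` named `opts` (its header and constructor arguments are not shown in this diff):

```cpp
// Sketch only: `opts` is assumed to be a valid StableHLOToExecutableOptions;
// the include path for its header is omitted here.
void enableNonDPSCallConv(mlirtrt::compiler::StableHLOToExecutableOptions &opts) {
  opts.useNonDPSCallConv = true;                // new in this commit
  opts.debugOptions.dumpTextualPipeline = true; // optional: dump the pipeline
}
```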
140 changes: 117 additions & 23 deletions mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/IR/PlanOps.td
@@ -131,10 +131,57 @@ def Plan_InlineGroupOp : Plan_GroupOpBase<"inline_group", [
}

//===----------------------------------------------------------------------===//
// InlineClosedGroupOp
// Plan_InlineClosedGroupBase
//===----------------------------------------------------------------------===//

def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
class Plan_InlineClosedGroupBase<string mnemonic, list<Trait> traits = []> :
Plan_GroupOpBase<mnemonic, traits> {

code baseInlineClosedExtraClassDeclaration = baseExtraClassDeclaration # [{
// Common methods shared by the DPS and non-DPS variants.

/// Returns true if the `i-th` input argument has a tensor type.
bool argHasTensorType(unsigned inputIdx) {
assert(inputIdx < getInputs().size() && "input index out-of-bounds");
return isa<RankedTensorType>(getInputs()[inputIdx].getType());
}

/// Returns the i-th input argument's bounds attribute.
BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
assert(inputIdx < getInputs().size() && "input index out-of-bounds");
return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
}

/// Populate the `input_attrs` from an array of BoundsAttrs.
void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
setInputAttrsAttr(::mlir::ArrayAttr::get(
getOperation()->getContext(),
ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
));
}

void getSuccessorRegionsBase(RegionBranchPoint point,
SmallVectorImpl<RegionSuccessor> &regions) {
// If the predecessor is the parent group operation, branch into the body.
if (point.isParent()) {
regions.push_back(RegionSuccessor(&getBody(), getBody().getArguments()));
return;
}

// Otherwise, the region branches back to the parent operation.
regions.push_back(RegionSuccessor(getResults()));
}

OperandRange getEntrySuccessorOperandsBase(RegionBranchPoint point) {
return getOperands();
}
}];

let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
}
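Because both closed-group variants now inherit these helpers, pattern and verifier code can query input bounds uniformly. A small illustrative sketch; the surrounding pass and the `op` value are assumptions, not part of this commit:

```cpp
// Sketch: walk the explicit captures of a closed group op and inspect the
// bounds attribute attached to each tensor input. The same code works for
// plan::InlineClosedGroupOp and plan::InlineClosedGroupNonDPSOp.
template <typename ClosedGroupOp>
void inspectInputBounds(ClosedGroupOp op) {
  for (unsigned i = 0, e = op.getInputs().size(); i < e; ++i) {
    // Scalar (index/integer) inputs are allowed but carry no shape bounds.
    if (!op.argHasTensorType(i))
      continue;
    auto bounds = op.getInputBoundsAttr(i); // a plan::BoundsAttr
    (void)bounds; // ... use `bounds` to drive shape specialization, etc. ...
  }
}
```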

//===----------------------------------------------------------------------===//
// Plan_InlineClosedGroupOp
//===----------------------------------------------------------------------===//

def Plan_InlineClosedGroupOp : Plan_InlineClosedGroupBase<"inline_closed_group", [
IsolatedFromAbove,
AttrSizedOperandSegments,
DestinationStyleOpInterface,
@@ -226,24 +273,12 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
CArg<"ArrayRef<BoundsAttr>", "{}">:$res_attrs)>
];

let extraClassDeclaration = baseExtraClassDeclaration # [{
let extraClassDeclaration = baseInlineClosedExtraClassDeclaration # [{

MutableOperandRange getDpsInitsMutable() {
return getOutsMutable();
}

/// Returns true if the `i-th` input argument has a tensor type.
bool argHasTensorType(unsigned inputIdx) {
assert(inputIdx < getInputs().size() && "input index out-of-bounds");
return isa<RankedTensorType>(getInputs()[inputIdx].getType());
}

/// Returns the i-th input argument's bounds attribute.
BoundsAttr getInputBoundsAttr(unsigned inputIdx) {
assert(inputIdx < getInputs().size() && "input index out-of-bounds");
return cast<BoundsAttr>(getInputAttrs()[inputIdx]);
}

ArrayRef<BlockArgument> getRegionOutArgs() {
return getBody().getArguments().take_back(getOuts().size());
}
@@ -255,16 +290,75 @@ def Plan_InlineClosedGroupOp : Plan_GroupOpBase<"inline_closed_group", [
ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
));
}
}];
}

/// Populate the `input_attrs` from an array of BoundsAttrs.
void setInputAttrsAttr(ArrayRef<BoundsAttr> boundsAttrs) {
setInputAttrsAttr(::mlir::ArrayAttr::get(
getOperation()->getContext(),
ArrayRef<Attribute>(boundsAttrs.begin(), boundsAttrs.end())
));
}
//===----------------------------------------------------------------------===//
// InlineClosedGroupNonDPSOp
//===----------------------------------------------------------------------===//

def Plan_InlineClosedGroupNonDPSOp : Plan_InlineClosedGroupBase<"inline_closed_group_non_dps", [
IsolatedFromAbove,
SingleBlockImplicitTerminator<"plan::YieldOp">,
DeclareOpInterfaceMethods<RegionBranchOpInterface,
["getEntrySuccessorOperands"]>,
DeclareOpInterfaceMethods<OpAsmOpInterface,
["getAsmBlockArgumentNames"]>
]> {
let description = [{
The `plan.inline_closed_group_non_dps` operation is a variant of the
`plan.inline_closed_group` operation that does not use destination-passing style
(DPS). It is isolated from above and explicitly captures input operands,
but unlike its DPS counterpart, it does not capture destination operands.
The operation takes input operands together with their corresponding bounds
attributes and produces results directly. The `input_attrs` array holds one
bounds attribute per input operand; absent bounds information is expressed
with a `none` bounds attribute.

The `target` attribute specifies the execution target for the group.

#### Example

Consider the following simple program containing operations with dynamically shaped operands:

```mlir
%0 = ... : tensor<?xf32> // A dynamically shaped operand
%1 = ... : index // A dynamic calculation of %0's extent

%2 = plan.inline_closed_group_non_dps target(#plan.cluster_target<tensorrt>)
inputs(%0, %1 : tensor<?xf32>, index)
in_attrs [#plan.bounds<shape, , >, #plan.bounds<none>] -> tensor<?xf32> {
%3 = plan.with_shape %0 (%1) : (tensor<?xf32>, index) -> tensor<?xf32>
%4 = stablehlo.exponential %3 : tensor<?xf32>
yield %4 : tensor<?xf32>
}
```

}];
let arguments = (ins Variadic<AnyTypeOf<[AnyRankedTensor, AnySignlessIntegerOrIndex]>>:$inputs,
BoundsAttrArray:$input_attrs,
AnyAttr:$target);

let results = (outs Variadic<AnyTypeOf<[AnyRankedTensor]>>:$results);

let assemblyFormat = [{
`target` `(` $target `)` `\n`
`inputs` `(` ( $inputs^ `:` type($inputs) `)` ) : ( `)` ) ? `\n`
`in_attrs` $input_attrs `\n`
attr-dict-with-keyword `->` type($results)
$body
}];

let hasVerifier = 1;

let skipDefaultBuilders = 1;

let builders = [
OpBuilder<(ins "TypeRange":$results,
"Attribute":$target,
"ValueRange":$inputs,
CArg<"ArrayRef<BoundsAttr>", "{}">:$input_attrs)>,
];

let extraClassDeclaration = baseInlineClosedExtraClassDeclaration;
}
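The custom builder declared above makes it straightforward to materialize the op during clustering or outlining. A hypothetical construction sketch; the builder `b`, location, result types, target attribute, captured values, and bounds attributes are all assumed to come from the surrounding pass:

```cpp
// Sketch: create a non-DPS closed group using the builder declared above,
// then populate and terminate its body region.
auto group = b.create<plan::InlineClosedGroupNonDPSOp>(
    loc, /*results=*/resultTypes, /*target=*/targetAttr,
    /*inputs=*/capturedValues, /*input_attrs=*/inputBounds);

// The single-block body is terminated with plan::YieldOp (see the ParentOneOf
// update below); yielded values become the op's results.
```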

//===----------------------------------------------------------------------===//
Expand All @@ -276,7 +370,7 @@ def Plan_YieldOp : Plan_Op<"yield", [
Terminator,
ReturnLike,
ParentOneOf<["plan::InlineGroupOp",
"plan::InlineClosedGroupOp"]>]> {
"plan::InlineClosedGroupOp", "plan::InlineClosedGroupNonDPSOp"]>]> {

let arguments = (ins Variadic<AnyType>:$results);

@@ -69,7 +69,8 @@ executorOneShotModuleBufferize(ModuleOp targetOp,
const ExecutorBufferizationOptions &options);

/// Build a pipeline (targeting ModuleOp) for bufferization.
void buildPlanBufferizationPipeline(OpPassManager &pm);
void buildPlanBufferizationPipeline(
OpPassManager &pm, const plan::PlanAllocTensorsPassOptions &options);

/// Build a post-bufferization pipeline that performs optimizations on memrefs.
void buildPlanBufferOptimizationPipeline(OpPassManager &pm);
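Callers of the bufferization pipeline now supply the alloc-tensors options explicitly. A minimal hypothetical call site, assuming `pm` is an `OpPassManager` anchored on `builtin.module` and that includes are handled elsewhere:

```cpp
// Sketch: thread the non-DPS flag through the updated bufferization pipeline.
void addBufferizationPasses(mlir::OpPassManager &pm) {
  plan::PlanAllocTensorsPassOptions allocTensorsOpts{};
  allocTensorsOpts.useNonDPSCallConv = true; // example value
  buildPlanBufferizationPipeline(pm, allocTensorsOpts);
}
```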
@@ -248,6 +248,9 @@ def StablehloClusteringPass : Pass<"stablehlo-clustering", "::mlir::ModuleOp"> {
Option<"entrypoint", "entrypoint", "std::string", "\"\"",
"the name of the entrypoint function; if empty then the clustering runs"
" on all functions">,
Option<"useNonDPSCallConv",
"use-non-dps-call-conv", "bool", "false",
"allow tensorrt based output allocations using output allocator">,
Option<"disallowHostTensorsInTensorRTClusters",
"disallow-host-tensors-in-tensorrt-clusters", "bool", "false",
"don't cluster host tensors in TensorRT clusters">,
@@ -332,7 +335,10 @@ def CreateClosedRegionsPass : Pass<"plan-create-closed-regions", "::mlir::Module
Option<"testPreWalkOrder", "test-pre-walk-order", "bool", "false",
"(used only in testing) specifies to outline regions by walking in "
" pre-order; used for verifying results are not sensitive "
"to traversal order">
"to traversal order">,
Option<"useNonDPSCallConv", "use-non-dps-call-conv", "bool",
/*default=*/"false",
"Allow TensorRT-based output allocations using output allocator">
];

let dependentDialects = [
@@ -428,6 +434,13 @@ def PlanAllocTensorsPass : Pass<"plan-alloc-tensors",
"::mlir::bufferization::BufferizationDialect",
"::mlir::plan::PlanDialect"
];

let options = [
Option<"useNonDPSCallConv", "use-non-dps-call-conv", "bool",
/*default=*/"false",
"Allow TensorRT-based output allocations using output allocator">
];

}

//===----------------------------------------------------------------------===//
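For reference, the new option key can also be exercised from a textual pipeline. A sketch using `mlir::parsePassPipeline`; the pass and option names are taken from the `Passes.td` entries above, `pm` is assumed to be anchored on `builtin.module`, and chaining exactly these three passes is illustrative only:

```cpp
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

mlir::LogicalResult buildExamplePipeline(mlir::PassManager &pm) {
  // Each of these passes now accepts `use-non-dps-call-conv` as a bool option.
  return mlir::parsePassPipeline(
      "stablehlo-clustering{use-non-dps-call-conv=true},"
      "plan-create-closed-regions{use-non-dps-call-conv=true},"
      "plan-alloc-tensors{use-non-dps-call-conv=true}",
      pm);
}
```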
26 changes: 20 additions & 6 deletions mlir-tensorrt/compiler/lib/Compiler/StableHloToExecutable.cpp
@@ -222,6 +222,10 @@ StableHLOToExecutableOptions::StableHLOToExecutableOptions(
disallowHostTensorsInTensorRTClusters, llvm::cl::init(false),
llvm::cl::desc("Don't allow TensorRt clusters to contain host tensor "
"calculations (but they can still be inputs)"));
addOption(
"use-non-dps-call-conv", useNonDPSCallConv, llvm::cl::init(false),
llvm::cl::desc(
"allow tensorrt based output allocations using output allocator"));
addOption("executor-index-bitwidth", executorIndexBitwidth,
llvm::cl::init(64));
addOption("device-compute-capability", deviceComputeCapability,
@@ -303,6 +307,7 @@ void StableHloToExecutableTask::buildStablehloClusteringPipeline(
plan::StablehloClusteringPassOptions clusteringOpts{};
clusteringOpts.disallowHostTensorsInTensorRTClusters =
opts.disallowHostTensorsInTensorRTClusters;
clusteringOpts.useNonDPSCallConv = opts.useNonDPSCallConv;
clusteringOpts.entrypoint = opts.entrypoint;
plan::buildPlanSegmentationPipeline(pm, clusteringOpts);

@@ -336,7 +341,9 @@ void StableHloToExecutableTask::buildPostClusteringPipeline(

// Perform bufferization.
pm.addPass(createMemRefCastEliminationPass());
pm.addPass(plan::createPlanAllocTensorsPass());
plan::PlanAllocTensorsPassOptions allocTensorsOpts{};
allocTensorsOpts.useNonDPSCallConv = opts.useNonDPSCallConv;
pm.addPass(plan::createPlanAllocTensorsPass(allocTensorsOpts));
pm.addPass(plan::createPlanBufferizePass());
pm.addPass(createMemRefCastEliminationPass());
pm.addPass(createCanonicalizerPass());
@@ -485,13 +492,14 @@ StableHloToExecutableTask::compileStableHLOToExecutable(
runner = pm.get();
}

runner->printAsTextualPipeline(llvm::dbgs());
if (options.debugOptions.dumpTextualPipeline)
runner->printAsTextualPipeline(llvm::dbgs());

// Setup pass manager
// if (failed(runner->run(module)))
// return getInternalErrorStatus(
// "failed to run compilation on module with symbol name: {0}",
// module.getName() ? *module.getName() : "no-symbol-name");
if (failed(runner->run(module)))
return getInternalErrorStatus(
"failed to run compilation on module with symbol name: {0}",
module.getName() ? *module.getName() : "no-symbol-name");

// Translate to Runtime Executable
FailureOr<std::unique_ptr<runtime::ExecutableStorage>> exeStorage =
@@ -524,6 +532,11 @@ struct ClusteringPipelineCliOpts
*this, "device-compute-capability",
llvm::cl::desc("target device compute capability (SM version)"),
llvm::cl::init(60)};
Option<bool> useNonDPSCallConv{
*this, "use-non-dps-call-conv",
llvm::cl::desc(
"allow tensorrt based output allocations using output allocator"),
llvm::cl::init(false)};
Option<int64_t> deviceMaxSharedMemoryPerBlockKb{
*this, "device-max-smem-per-block",
llvm::cl::desc("max shared memory per block (in kilobytes)"),
@@ -551,6 +564,7 @@ static StableHLOToExecutableOptions populateStablehloClusteringPipelineOpts(
opts.deviceComputeCapability = cliOpts.deviceComputeCapability;
opts.deviceMaxSharedMemoryPerBlockKb =
cliOpts.deviceMaxSharedMemoryPerBlockKb;
opts.useNonDPSCallConv = cliOpts.useNonDPSCallConv;
opts.shouldInferDeviceOptionsFromHost = cliOpts.inferDeviceOptionsFromHost;
opts.entrypoint = cliOpts.entrypoint;
return opts;
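Putting the pieces together, the CLI options populate the top-level compilation options, which in turn seed the clustering pipeline. A hypothetical sketch of that flow; the registration hook, exact parameter order, and includes are assumptions rather than part of this commit:

```cpp
// Sketch: CLI options -> StableHLOToExecutableOptions -> clustering pipeline.
void buildFromCliOpts(mlir::OpPassManager &pm,
                      const ClusteringPipelineCliOpts &cliOpts) {
  StableHLOToExecutableOptions opts =
      populateStablehloClusteringPipelineOpts(cliOpts);
  // opts.useNonDPSCallConv now mirrors the use-non-dps-call-conv option.
  StableHloToExecutableTask::buildStablehloClusteringPipeline(pm, opts);
}
```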
@@ -263,7 +263,7 @@ struct ConvertEnqueueAllocToCall
ImplicitLocOpBuilder b(op.getLoc(), rewriter);

// Function name for the enqueue alloc operation
std::string funcName = "_trtrt_alloc_enqueue";
std::string funcName = "_trtrt_enqueue_alloc";

// Create new operands for the call op
SmallVector<Value> newOperands = {adaptor.getExecutionContext(),
@@ -394,7 +394,7 @@ struct ConvertEnqueueAllocToCall
ArrayRef<OpFoldResult>{this->createIndexConstant(b, 0),
rewriter.getI64IntegerAttr(offset++)});

Value rankValue = b.create<executor::LoadOp>(
[[maybe_unused]] Value rankValue = b.create<executor::LoadOp>(
b.getI64Type(), outputDescriptors, rankOffset);
Value intPtr = b.create<executor::LoadOp>(
b.getI64Type(), outputDescriptors, devicePtrOffset);
@@ -429,8 +429,10 @@ struct ConvertEnqueueAllocToCall
resultRange.append(shapes.begin(), shapes.end());
resultRange.append(strides.begin(), strides.end());

Value result = b.create<executor::CreateTableOp>(executor::TableType::get(
b.getContext(), llvm::to_vector(TypeRange(resultRange))));
Value result = b.create<executor::CreateTableOp>(
executor::TableType::get(b.getContext(),
llvm::to_vector(TypeRange(resultRange))),
resultRange);

results.push_back(result);
}