From 4c213b78120ec58a6590a1fa685b718bfbc520f9 Mon Sep 17 00:00:00 2001
From: Alexandre Eichenberger
Date: Thu, 12 Oct 2023 12:34:48 -0400
Subject: [PATCH] Add a framework for NNPA op placement heuristics (#2541)

Signed-off-by: Alexandre Eichenberger
---
 .../NNPA/Compiler/NNPACompilerOptions.cpp     |  17 +-
 .../NNPA/Compiler/NNPACompilerOptions.hpp     |   9 +-
 .../NNPA/Compiler/NNPACompilerUtils.cpp       |   2 +-
 .../Conversion/ONNXToZHigh/CMakeLists.txt     |   1 +
 .../ONNXToZHigh/DevicePlacement.cpp           |  70 ++-
 .../ONNXToZHigh/DevicePlacementHeuristic.cpp  | 496 ++++++++++++++++++
 .../ONNXToZHigh/DevicePlacementHeuristic.hpp  |  87 +++
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.cpp | 364 +++++++------
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.hpp |  12 +-
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.inc | 303 +++++++----
 src/Accelerators/NNPA/NNPAAccelerator.cpp     |   2 +-
 src/Accelerators/NNPA/Pass/NNPAPasses.hpp     |   4 +-
 src/Conversion/ONNXToKrnl/Math/Reduction.cpp  |  11 +-
 .../device_placement_pass_perf_model.mlir     |  12 +-
 utils/make-report.py                          |  15 +-
 15 files changed, 1085 insertions(+), 320 deletions(-)
 create mode 100644 src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
 create mode 100644 src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp

diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
index dc15408f07..b704f954f0 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
@@ -55,10 +55,17 @@ llvm::cl::opt<std::string> nnpaSaveDevicePlacementFile{
     llvm::cl::desc("Save device placement configuration to a JSON file."),
     llvm::cl::init(""), llvm::cl::cat(OnnxMlirOptions)};
 
-llvm::cl::opt<bool> nnpaEnableZHighPerfModel("enable-zhigh-perf-model",
-    llvm::cl::desc("Enabling performance cost model to estimate if ONNX "
-                   "operations will be faster on the NNPA or the CPU. Works "
-                   "best with static shapes. Default is false."),
-    llvm::cl::init(false), llvm::cl::cat(OnnxMlirOptions));
+llvm::cl::opt<NNPAPlacementHeuristic> nnpaPlacementHeuristic{
+    "nnpa-placement-heuristic",
+    llvm::cl::desc(
+        "[Optional] Choose NNPA-related heuristic to place operations "
+        "on NNPA device:"),
+    llvm::cl::values(
+        clEnumVal(QualifyingOps, "Place all qualifying ops on NNPA (default)"),
+        clEnumVal(FasterOps, "Place qualifying ops that are faster on NNPA"),
+        clEnumVal(FasterOpsWSU, "FasterOps with stick/unstick cost"),
+        clEnumVal(MuchFasterOpsWSU,
+            "Much/Significantly FasterOps with stick/unstick cost")),
+    llvm::cl::init(QualifyingOps), llvm::cl::cat(OnnxMlirOptions)};
 
 } // namespace onnx_mlir
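Note: the removed boolean flag -enable-zhigh-perf-model is subsumed by the new -nnpa-placement-heuristic enum. FasterOps consults the same per-op performance model that the boolean used to enable, and the default QualifyingOps preserves the prior default behavior (flag unset): every qualifying op is placed on the NNPA.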
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
index 0598e81c87..a3e7f7a09f 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
@@ -41,11 +41,18 @@ typedef enum {
   EmitZHighIR,
 } NNPAEmissionTargetType;
 
+typedef enum {
+  QualifyingOps,    /* Any ops that qualify for NNPA will go on NNPA. */
+  FasterOps,        /* Only qualifying ops that are faster on NNPA. */
+  FasterOpsWSU,     /* FasterOps with Stick and Unstick (WSU) cost. */
+  MuchFasterOpsWSU, /* FasterOpsWSU only if significantly faster. */
+} NNPAPlacementHeuristic;
+
 extern llvm::cl::OptionCategory OnnxMlirOptions;
 extern llvm::cl::opt<NNPAEmissionTargetType> nnpaEmissionTarget;
 extern llvm::cl::opt<bool> nnpaClipToDLFloatRange;
 extern llvm::cl::opt<bool> nnpaEnableZHighToOnnx;
-extern llvm::cl::opt<bool> nnpaEnableZHighPerfModel;
+extern llvm::cl::opt<NNPAPlacementHeuristic> nnpaPlacementHeuristic;
 extern llvm::cl::opt<bool> profileZHighIR;
 extern llvm::cl::opt<std::string> nnpaLoadDevicePlacementFile;
 extern llvm::cl::opt<std::string> nnpaSaveDevicePlacementFile;
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
index 143c34cfb1..b078640e82 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
@@ -152,7 +152,7 @@ void addPassesNNPA(mlir::OwningOpRef<mlir::ModuleOp> &module,
   if (emissionTarget >= EmitONNXIR) {
     addONNXToMLIRPasses(pm, /*target CPU*/ maccel.empty());
     pm.addPass(onnx_mlir::createDevicePlacementPass(nnpaLoadDevicePlacementFile,
-        nnpaSaveDevicePlacementFile, nnpaEnableZHighPerfModel));
+        nnpaSaveDevicePlacementFile, nnpaPlacementHeuristic));
   }
 
   if (emissionTarget >= EmitMLIR) {
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
index 63f1f049b7..de58e1277e 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
@@ -58,6 +58,7 @@ add_onnx_mlir_library(OMZHighToONNX
 
 add_onnx_mlir_library(OMDevicePlacement
   DevicePlacement.cpp
+  DevicePlacementHeuristic.cpp
  PerfModel.cpp
 
   DEPENDS
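The device-placement pass below also exposes per-pass boolean switches (use-much-faster-wsu, use-faster-wsu, use-faster) that select the same heuristics when the pass runs standalone, with the strongest heuristic winning when several are set; see initPlacementHeuristic below. With a tool that accepts MLIR pass options, an invocation could look like onnx-mlir-opt --device-placement=use-faster-wsu=true model.mlir; the exact option spelling depends on the pass-option plumbing and is shown here for illustration only.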
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
index ffb3ba39b9..0aa4d0c913 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
@@ -34,9 +34,10 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
 
+#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
-#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/RewriteONNXForZHigh.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 #include "src/Pass/Passes.hpp"
@@ -60,12 +61,14 @@ struct DevicePlacementPass
 
   DevicePlacementPass() = default;
   DevicePlacementPass(const DevicePlacementPass &pass)
-      : PassWrapper<DevicePlacementPass, OperationPass<ModuleOp>>() {}
+      : PassWrapper<DevicePlacementPass, OperationPass<ModuleOp>>() {
+    this->placementHeuristic = QualifyingOps;
+  }
   DevicePlacementPass(std::string loadConfigFile, std::string saveConfigFile,
-      bool useZHighPerfModel) {
+      NNPAPlacementHeuristic placementHeuristic) {
     this->loadConfigFile = loadConfigFile;
     this->saveConfigFile = saveConfigFile;
-    this->useZHighPerfModel = useZHighPerfModel;
+    this->placementHeuristic = placementHeuristic;
   }
 
   StringRef getArgument() const override { return "device-placement"; }
@@ -82,9 +85,28 @@ struct DevicePlacementPass
     llvm::cl::desc("Path to load a device configuration file in JSON format"),
     llvm::cl::init("")};
 
-  Option<bool> useZHighPerfModel{*this, "use-zhigh-perf-model",
-      llvm::cl::desc("Enable ZHigh cost model for ops on NNPA vs CPU"),
+  // Placement heuristic switches (policy driven by placementHeuristic).
+  NNPAPlacementHeuristic placementHeuristic;
+  // Option useXXX listed in decreasing order of priority, if multiple are
+  // selected.
+  Option<bool> useMuchFasterWithStickOps{*this, "use-much-faster-wsu",
+      llvm::cl::desc("Enable MuchFasterOpsWSU NNPAPlacementHeuristic"),
+      llvm::cl::init(false)};
+  Option<bool> useFasterWithStickOps{*this, "use-faster-wsu",
+      llvm::cl::desc("Enable FasterOpsWSU NNPAPlacementHeuristic"),
       llvm::cl::init(false)};
+  Option<bool> useFasterOps{*this, "use-faster",
+      llvm::cl::desc("Enable FasterOps NNPAPlacementHeuristic"),
+      llvm::cl::init(false)};
+  // Method to override the placement heuristic using the useXXX flags.
+  void initPlacementHeuristic() {
+    if (useMuchFasterWithStickOps)
+      placementHeuristic = MuchFasterOpsWSU;
+    else if (useFasterWithStickOps)
+      placementHeuristic = FasterOpsWSU;
+    else if (useFasterOps)
+      placementHeuristic = FasterOps;
+  }
 
   void runOnOperation() final;
 
@@ -189,26 +211,18 @@ void DevicePlacementPass::runOnOperation() {
   OpSetType cpuOps = llvm::set_intersection(
       legalizedOps1, llvm::set_intersection(legalizedOps2, legalizedOps3));
 
-  // Now annotate accelerator operations in the IR with `device` attribute,
-  // according to the compiler decision.
-  for (Operation *op : ops) {
-    // Set device if it is empty or unavailable.
-    StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
-    if (device && !device.getValue().empty())
-      continue;
-    // Op that is legal (should remain on the CPU) as determined by compiler
-    // analysis.
-    if (cpuOps.contains(op))
-      continue;
-    // Now we have an operation that can work on the NNPA, check if its
-    // beneficial
-    if (useZHighPerfModel && !isOpFasterOnNNPA(op, &dimAnalysis)) {
-      op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, CPU_DEVICE));
-      continue;
-    }
-    // Compiler determined that we want this op on the NNPA, mark as such.
-    op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, NNPA_DEVICE));
-  }
+  initPlacementHeuristic();
+  if (placementHeuristic == QualifyingOps)
+    PlaceAllLegalOpsOnNNPA(context, ops, cpuOps);
+  else if (placementHeuristic == FasterOps)
+    PlaceBeneficialOpsOnNNPA(context, ops, &dimAnalysis, cpuOps);
+  else if (placementHeuristic == FasterOpsWSU)
+    PlaceBeneficialOpsOnNNPAWithStickUnstick(
+        context, module, ops, &dimAnalysis, cpuOps);
+  else if (placementHeuristic == MuchFasterOpsWSU)
+    PlaceBeneficialOpsOnNNPAWithStickUnstick(context, module, ops, &dimAnalysis,
+        cpuOps, /*min factor*/ 3.0, /*significant CPU Factor*/ 2.0,
+        /*significant NNPA Factor*/ 8.0);
 
   // Create a JSON configuration file if required.
   if (!saveConfigFile.empty())
@@ -306,9 +320,9 @@ std::unique_ptr<mlir::Pass> createDevicePlacementPass() {
 
 std::unique_ptr<mlir::Pass> createDevicePlacementPass(
     std::string loadConfigFile, std::string saveConfigFile,
-    bool useZHighPerfModel) {
+    NNPAPlacementHeuristic placementHeuristic) {
   return std::make_unique<DevicePlacementPass>(
       loadConfigFile, saveConfigFile, placementHeuristic);
 }
 
 } // namespace onnx_mlir
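For callers assembling their own pipeline, the heuristic is now picked at pass-construction time. A minimal sketch, mirroring the addPassesNNPA call above (the header paths and pass-manager setup are assumptions, not part of this patch):

  #include "mlir/Pass/PassManager.h"

  #include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
  #include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"

  // Sketch: schedule device placement with the stick/unstick-aware heuristic
  // and save the decisions to a JSON file for later replay via
  // load-config-file.
  static void scheduleDevicePlacement(mlir::PassManager &pm) {
    pm.addPass(onnx_mlir::createDevicePlacementPass(
        /*loadConfigFile=*/"", /*saveConfigFile=*/"placement.json",
        onnx_mlir::FasterOpsWSU));
  }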
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
new file mode 100644
index 0000000000..dc3beb3ebe
--- /dev/null
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
@@ -0,0 +1,496 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------- DevicePlacementHeuristic.cpp - Place ops using model -------===//
+//
+// Copyright 2023 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file contains heuristics to place operations on CPU or NNPA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/Debug.h"
+
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+#include "src/Dialect/ONNX/ONNXOps/OpHelper.hpp"
+
+#include
+#include
+
+#define DEBUG_TYPE "device-placement-heuristic"
+#define DEBUG 2
+
+using namespace mlir;
+using namespace onnx_mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Support to classify ops.
+
+bool isMappedToDevice(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && !device.getValue().empty();
+}
+
+bool isMappedToCPU(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && device.getValue().equals_insensitive(CPU_DEVICE);
+}
+
+bool isMappedToNNPA(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && device.getValue().equals_insensitive(NNPA_DEVICE);
+}
+
+// Determine if op is NNPA friendly. An op is unfriendly when it's not an ONNX
+// op of interest, or when it is already mapped to the CPU device.
+bool isNNPAFriendlyOp(Operation *op) {
+  if (op->getDialect()->getNamespace() != ONNXDialect::getDialectNamespace())
+    return false;
+  // These ops are NNPA unfriendly. Constants are friendly.
+  if (isa(op))
+    return false;
+  // If `device` is already set to CPU, it is NNPA unfriendly.
+  if (isMappedToCPU(op))
+    return false;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for op assignment.
+
+// Return true with a debug message reporting reason for success on NNPA.
+inline bool fasterOnNNPA(Operation *op, bool significant = false) {
+  LLVM_DEBUG({
+    if (significant)
+      llvm::dbgs() << "  Significantly faster ";
+    else
+      llvm::dbgs() << "  Faster ";
+    llvm::dbgs() << "on NNPA model for op:";
+    op->dump();
+  });
+  return true;
+}
+
+// Return false with a debug message reporting reason for failure on NNPA.
+inline bool fasterOnCPU(Operation *op, bool significant = false) {
+  LLVM_DEBUG({
+    if (significant)
+      llvm::dbgs() << "  Significantly faster ";
+    else
+      llvm::dbgs() << "  Faster ";
+    llvm::dbgs() << "on CPU model for op:";
+    op->dump();
+  });
+  return false;
+}
+
+inline void assignToNNPA(Operation *op, MLIRContext *context) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Assign to NNPA:";
+    op->dump();
+  });
+  op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, NNPA_DEVICE));
+}
+
+inline void assignToCPU(Operation *op, MLIRContext *context) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Assign to CPU:";
+    op->dump();
+  });
+  op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, CPU_DEVICE));
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for the simple cost model analysis, based solely on one
+// operation.
+
+// Simply determine if the operation is faster on CPU or NNPA.
+bool isOpFasterOnNNPA(Operation *op, const DimAnalysis *dimAnalysis) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "\nTest cost-benefit of CPU/NNPA for op\n";
+    op->dump();
+  });
+  // Estimate time.
+  double cpuTime, nnpaTime;
+  if (!estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime)) {
+    // No performance model for this operation, assume faster on NNPA.
+    cpuTime = 1;
+    nnpaTime = 0;
+  }
+  if (nnpaTime < cpuTime)
+    return fasterOnNNPA(op);
+  return fasterOnCPU(op);
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for the cost/benefit analysis of an operation that takes
+// stick/unstick into account.
+
+struct DevicePlacementWithStickUnstickCost {
+  DevicePlacementWithStickUnstickCost() = delete;
+  DevicePlacementWithStickUnstickCost(MLIRContext *context, ModuleOp module,
+      const DimAnalysis *dimAnalysis, const OpSetType &cpuOps)
+      : context(context), dimAnalysis(dimAnalysis), cpuOps(cpuOps) {
+    characterizeOps(module);
+  }
+
+  // Data.
+  MLIRContext *context;
+  const DimAnalysis *dimAnalysis;
+  // All ops that must execute on CPU, aka are not eligible to run on NNPA.
+  // Ops in this set can be marked as device=CPU.
+  const OpSetType &cpuOps;
+  // All ops that may execute on NNPA. Ops in this set can be marked as
+  // device=CPU or NNPA.
+  OpSetType nnpaCandidateOps;
+  // All ops that run on CPU but do not require stick/unstick at runtime. Ops
+  // in this set can be marked as device=CPU.
+  OpSetType nnpaNeutralOps;
+
+  void characterizeOps(ModuleOp module) {
+    nnpaCandidateOps.clear();
+    nnpaNeutralOps.clear();
+    module.walk([&](Operation *op) -> WalkResult {
+      // Skip ops that are NNPA unfriendly, such as ops already assigned to
+      // the CPU.
+      if (!isNNPAFriendlyOp(op))
+        return WalkResult::advance();
+      // Ops that cannot/may not go on NNPA but can operate on NNPA data "for
+      // free" are included here in the NNPA neutral ops. We assume here (not
+      // really true) that transpose and reshape can carry the stickified data.
+      if (isa<ONNXTransposeOp, ONNXReshapeOp>(op)) {
+        nnpaNeutralOps.insert(op);
+        return WalkResult::advance();
+      }
+      // Skip ops that the compiler determined are not suitable for NNPA.
+      if (cpuOps.contains(op))
+        return WalkResult::advance();
+      // Remaining ops can be mapped to NNPA.
+      nnpaCandidateOps.insert(op);
+      return WalkResult::advance();
+    });
+#if DEBUG >= 2
+    LLVM_DEBUG({
+      llvm::dbgs() << "\nCPU Ops:\n";
+      for (auto op : cpuOps) {
+        if (isa(op))
+          continue;
+        llvm::dbgs() << "cpu ";
+        op->dump();
+      }
+      llvm::dbgs() << "\nNNPA Neutral Ops:\n";
+      for (auto op : nnpaNeutralOps) {
+        if (isa(op))
+          continue;
+        llvm::dbgs() << "neutral ";
+        op->dump();
+      }
+      llvm::dbgs() << "\nNNPA Candidate Ops:\n";
+      for (auto op : nnpaCandidateOps) {
+        llvm::dbgs() << "candidate ";
+        op->dump();
+      }
+    });
+#endif
+  }
+
+  void classifyValueUsage(Value value, Operation *opToSkip, int64_t &cpuOpCount,
+      int64_t &nnpaOpCount, int64_t &nnpaCandidateOpCount,
+      int64_t &nnpaNeutralOpCount) {
+    cpuOpCount = nnpaOpCount = nnpaCandidateOpCount = nnpaNeutralOpCount = 0;
+
+    std::string msg = "";
+    for (Operation *userOp : value.getUsers()) {
+      // Skip op if requested.
+      if (userOp == opToSkip) {
+        LLVM_DEBUG(msg = " Skipped op.");
+        // Test ops that are already mapped.
+      } else if (isMappedToCPU(userOp))
+        cpuOpCount++;
+      else if (isMappedToNNPA(userOp))
+        nnpaOpCount++;
+      // Not mapped; now test ops that are candidates to execute on NNPA.
+      else if (nnpaCandidateOps.contains(userOp))
+        nnpaCandidateOpCount++;
+      // Not a candidate; now test ops that are neutral to NNPA.
+      else if (nnpaNeutralOps.contains(userOp))
+        nnpaNeutralOpCount++;
+      // None of the above; will be on CPU.
+      else
+        cpuOpCount++;
+    }
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Use pattern for value from "
+                   << value.getDefiningOp()->getName() << ": used by CPU "
+                   << cpuOpCount << ", NNPA " << nnpaOpCount
+                   << ", NNPA candidates " << nnpaCandidateOpCount
+                   << ", neutral " << nnpaNeutralOpCount << "." << msg << "\n";
+    });
+  }
+
+  // Cost/benefit analysis of moving this op X to the NNPA, with respect to
+  // the ops that are using the results of op X. Positive costs are additional
+  // costs of having op X on the NNPA; negative costs are benefits of having
+  // op X on the NNPA.
+  double costBenefitIncurredForResults(Operation *opX) {
+    assert(!isMappedToDevice(opX) && "cannot evaluate an already mapped op");
+    double totalCostBenefit = 0;
+    LLVM_DEBUG(llvm::dbgs() << "  Look at cost benefit for results:\n");
+    for (Value resVal : opX->getResults()) {
+      // Look at all the users of resVal and classify them.
+      int64_t cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount;
+      classifyValueUsage(resVal, /*skip op*/ nullptr, cpuOpCount, nnpaOpCount,
+          nnpaCandidateOpCount, nnpaNeutralOpCount);
+      /*
+       Case study:
+       1) Op X remains on CPU     |  2) Op X migrates to NNPA:
+               X.CPU              |          X.NNPA
+              /  |  \             |         /  |  \
+             / stick? stick       |  unstick unstick? \
+            /    |      \         |       /     |      \
+          CPU Candidate NNPA      |     CPU Candidate  NNPA
+               on NNPA            |          on CPU
+
+       placing X on NNPA:         |
+       cost:                      |  +1 unstick if has CPU users
+       benefit:                   |  -1 stick if has NNPA users
+
+       TODO: If we migrate X to NNPA, we could attribute some benefits for
+       having users that are NNPA.
+      */
+      double costOfUnstickOp = estimateTimeForUnstickOp(resVal);
+      double costOfStickOp = estimateTimeForStickOp(resVal);
+      if (cpuOpCount > 0) {
+        // Moving this op to NNPA will cost one unstick, as there are one or
+        // more ops that must execute on CPU.
+        LLVM_DEBUG(
+            llvm::dbgs() << "    +1 unstick: " << costOfUnstickOp << "\n");
+        totalCostBenefit += costOfUnstickOp;
+      }
+      if (nnpaOpCount > 0) {
+        // Moving this op to NNPA will remove the need to stick this result.
+        LLVM_DEBUG(
+            llvm::dbgs() << "    -1 stick: " << -costOfStickOp << "\n");
+        totalCostBenefit -= costOfStickOp;
+      }
+    }
+    return totalCostBenefit;
+  }
+
+  // Cost/benefit analysis of moving this op X to the NNPA, with respect to
+  // the ops that define the inputs of op X. Positive costs are additional
+  // costs of having op X on the NNPA; negative costs are benefits of having
+  // op X on the NNPA.
+  double costBenefitIncurredForInputs(Operation *opX) {
+    assert(!isMappedToDevice(opX) && "cannot evaluate an already mapped op");
+    double totalCostBenefit = 0;
+    LLVM_DEBUG(llvm::dbgs() << "  Look at cost benefit for inputs:\n");
+    OpSetType visitedDefiningOps;
+    for (Value inputVal : opX->getOperands()) {
+      // Investigate the operation that defines inputVal (which is used by
+      // opX).
+      Operation *definingOp = inputVal.getDefiningOp();
+      if (!definingOp)
+        continue;
+      // If we have AddOp(%3, %3), we should visit the cost associated with
+      // the %3 input only once.
+      if (visitedDefiningOps.contains(definingOp)) {
+        LLVM_DEBUG(llvm::dbgs() << "    has multiple uses of same input\n");
+        continue;
+      }
+      visitedDefiningOps.insert(definingOp);
+
+      // Classify all other users of this input value.
+      int64_t cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount;
+      classifyValueUsage(inputVal, /*skip op X that we are analyzing*/ opX,
+          cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount);
+      /*
+       Case study:
+       3) Op X remains on CPU             |  4) Op X remains on CPU
+             def.CPU ----.                |      def.NNPA -----.
+            /  |  \       \               |     /    |     \     \
+           / stick? stick  \              | unstick unstick? \    unstick
+          /    |      \     \             |    /     |        \     \
+        CPU Candidate NNPA  X.CPU         |  CPU Candidate   NNPA   X.CPU
+            on NNPA                       |       on CPU
+
+       5) Op X migrates to NNPA           |  6) Op X migrates to NNPA
+             def.CPU ----.                |      def.NNPA -----.
+            /  |  \       \               |     /    |     \     \
+           / stick? stick  stick          | unstick unstick? \     \
+          /    |      \      \            |    /     |        \      \
+        CPU Candidate NNPA  X.NNPA        |  CPU Candidate   NNPA   X.NNPA
+            on NNPA                       |       on CPU
+
+       placing X on NNPA:                 |
+       cost: +1 stick if first NNPA       |
+       benefit:                           |  -1 stick
+      */
+      double costOfStickOp = estimateTimeForStickOp(inputVal);
+      if (isMappedToCPU(definingOp) ||
+          !(nnpaCandidateOps.contains(definingOp) ||
+              nnpaNeutralOps.contains(definingOp))) {
+        // Case 5.
+        if (nnpaOpCount == 0) {
+          LLVM_DEBUG(llvm::dbgs() << "    def-op on cpu (case 5), +1 stick "
+                                  << costOfStickOp << ".\n");
+          totalCostBenefit += costOfStickOp;
+        }
+      }
+      if (isMappedToNNPA(definingOp)) {
+        // Case 6.
+        LLVM_DEBUG(llvm::dbgs() << "    def-op on NNPA (case 6), -1 stick "
+                                << -costOfStickOp << ".\n");
+        totalCostBenefit -= costOfStickOp;
+      }
+    }
+    return totalCostBenefit;
+  }
+
+  bool significantlyFaster(double fast, double slow, double factor) {
+    // At least factor x faster.
+    return factor * fast <= slow;
+  }
+
+  // Determine if op is faster on the NNPA or not. To be deemed faster than
+  // the CPU, the NNPA (including overheads) has to be at least minFactor
+  // times faster than the CPU. Significant is set when the op is at least
+  // significantNNPAFactor / significantCPUFactor times faster on that device.
+  bool isOpFasterOnNNPA(Operation *op, double minFactor,
+      double significantCPUFactor, double significantNNPAFactor,
+      bool &significant) {
+    LLVM_DEBUG({
+      llvm::dbgs()
+          << "\nTest cost-benefit with stick/unstick of CPU/NNPA for op\n";
+      op->dump();
+    });
+    // Estimate time.
+    double cpuTime, nnpaTime, nnpaTimeWithOverheads;
+    if (estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime)) {
+      // Has a performance model; account for stick/unstick.
+      double useCostBenefit = costBenefitIncurredForResults(op);
+      double inputCostBenefit = costBenefitIncurredForInputs(op);
+      nnpaTimeWithOverheads = nnpaTime + useCostBenefit + inputCostBenefit;
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  New estimated nnpa time with stick/unstick: "
+                 << nnpaTimeWithOverheads << " vs cpu " << cpuTime << ".\n");
+    } else {
+      // No performance model for this operation; assume faster on NNPA.
+      cpuTime = 10;
+      nnpaTime = nnpaTimeWithOverheads = 1;
+      LLVM_DEBUG(llvm::dbgs() << "  no time estimate, assume NNPA better.\n");
+    }
+    if (nnpaTimeWithOverheads * minFactor <= cpuTime) {
+      // For significant, don't take overheads into account, as they may
+      // change depending on the mapping.
+      significant =
+          significantlyFaster(nnpaTime, cpuTime, significantNNPAFactor);
+      return fasterOnNNPA(op, significant);
+    }
+    // For significant, don't take overheads into account, as they may change
+    // depending on the mapping.
+    significant = significantlyFaster(cpuTime, nnpaTime, significantCPUFactor);
+    return fasterOnCPU(op, significant);
+  }
+
+}; // DevicePlacementWithStickUnstickCost
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Exported heuristics for device placement.
+
+namespace onnx_mlir {
+
+void PlaceAllLegalOpsOnNNPA(MLIRContext *context,
+    const SmallVector &ops, const OpSetType &cpuOps) {
+  for (Operation *op : ops) {
+    if (isMappedToDevice(op))
+      continue;
+    // Op that cannot go on NNPA.
+    if (cpuOps.contains(op))
+      continue;
+    // Compiler determined that we want this op on the NNPA, mark as such.
+    assignToNNPA(op, context);
+  }
+}
+
+void PlaceBeneficialOpsOnNNPA(MLIRContext *context,
+    const SmallVector &ops, const DimAnalysis *dimAnalysis,
+    const OpSetType &cpuOps) {
+  for (Operation *op : ops) {
+    if (isMappedToDevice(op))
+      continue;
+    // Op that cannot go on NNPA.
+    if (cpuOps.contains(op))
+      continue;
+    // Now we have an operation that can work on the NNPA; check if it's
+    // beneficial.
+    if (!isOpFasterOnNNPA(op, dimAnalysis)) {
+      assignToCPU(op, context);
+      continue;
+    }
+    // Compiler determined that we want this op on the NNPA, mark as such.
+    assignToNNPA(op, context);
+  }
+}
+
+void PlaceBeneficialOpsOnNNPAWithStickUnstick(MLIRContext *context,
+    ModuleOp module, const SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps, double minFactor,
+    double significantCPUFactor, double significantNNPAFactor) {
+  // Init model.
+  DevicePlacementWithStickUnstickCost model(
+      context, module, dimAnalysis, cpuOps);
+  int64_t ub = 5;
+  int64_t i = 0;
+  while (i < ub) {
+    int64_t modified = 0;
+    bool first = (i == 0);
+    bool last = (i == ub - 1);
+    LLVM_DEBUG(llvm::dbgs() << "\n\n\nPlacement Iteration " << i << "\n\n");
+    for (Operation *op : ops) {
+      if (isMappedToDevice(op))
+        continue;
+      // Op that cannot go on NNPA.
+      if (cpuOps.contains(op))
+        continue;
+      // Now we have an operation that can work on the NNPA; check if it's
+      // beneficial.
+      bool significant;
+      if (!model.isOpFasterOnNNPA(op, minFactor, significantCPUFactor,
+              significantNNPAFactor, significant)) {
+        if (last || significant) {
+          modified++;
+          assignToCPU(op, context);
+        }
+        continue;
+      }
+      // Compiler determined that we want this op on the NNPA, mark as such.
+      if (!first || significant) {
+        modified++;
+        assignToNNPA(op, context);
+      }
+    }
+    if (last) {
+      break;
+    } else if (first) {
+      LLVM_DEBUG(llvm::dbgs() << "\nFirst, go on.\n");
+      ++i;
+    } else if (modified) {
+      LLVM_DEBUG(llvm::dbgs() << "\nHad " << modified << " changes, go on.\n");
+      ++i;
+    } else {
+      LLVM_DEBUG(llvm::dbgs() << "\nHad no changes, skip to last iter.\n");
+      i = ub - 1;
+    }
+  }
+}
+
+} // namespace onnx_mlir
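To make the decision arithmetic in isOpFasterOnNNPA above concrete, here is a small self-contained sketch with made-up times in arbitrary units (illustrative numbers only, not measured data):

  #include <cassert>

  // Worked example of the FasterOpsWSU / MuchFasterOpsWSU decision rule:
  // assign to the NNPA when nnpaTimeWithOverheads * minFactor <= cpuTime,
  // with significance tested on the raw (overhead-free) times.
  int main() {
    double cpuTime = 10.0, nnpaTime = 2.0;
    double resultsCostBenefit = +1.5; // one unstick added for a CPU user
    double inputsCostBenefit = -0.5;  // one stick saved from an NNPA producer
    double nnpaTimeWithOverheads =
        nnpaTime + resultsCostBenefit + inputsCostBenefit; // 3.0
    // FasterOpsWSU defaults (minFactor = 1.1): the op goes to the NNPA.
    assert(nnpaTimeWithOverheads * 1.1 <= cpuTime);
    // MuchFasterOpsWSU (minFactor = 3.0): still qualifies as faster...
    assert(nnpaTimeWithOverheads * 3.0 <= cpuTime);
    // ...but is not "significant" (significantNNPAFactor = 8.0, raw times),
    // so it is not seeded in the first iteration; it is mapped in a later
    // iteration, after its significantly faster neighbors have been placed.
    assert(!(8.0 * nnpaTime <= cpuTime));
    return 0;
  }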
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp
new file mode 100644
index 0000000000..d03647fcd7
--- /dev/null
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp
@@ -0,0 +1,87 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------- DevicePlacementHeuristic.hpp - Place ops using model -------===//
+//
+// Copyright 2023 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file contains heuristics to place operations on CPU or NNPA.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "mlir/IR/BuiltinOps.h"
+
+#include "src/Dialect/ONNX/ONNXDimAnalysis.hpp"
+
+namespace onnx_mlir {
+
+using OpSetType = mlir::DenseSet<mlir::Operation *>;
+
+/**
+ * Place all ops that qualify for NNPA execution on the NNPA.
+ *
+ * @param context Context of the model.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param cpuOps Set of ops that must execute on CPU.
+ */
+void PlaceAllLegalOpsOnNNPA(mlir::MLIRContext *context,
+    const llvm::SmallVector &ops,
+    const OpSetType &cpuOps);
+
+/**
+ * Place ops that qualify for NNPA execution on the NNPA when they are
+ * estimated to run faster on the NNPA.
+ *
+ * @param context Context of the model.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param dimAnalysis Pointer to the dimension analysis tool for
+ * disambiguating dynamic shape dimensions.
+ * @param cpuOps Set of ops that must execute on CPU.
+ */
+void PlaceBeneficialOpsOnNNPA(mlir::MLIRContext *context,
+    const llvm::SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps);
+
+/**
+ * Place ops that qualify for NNPA execution on the NNPA when they are
+ * estimated to run faster on the NNPA, including the cost of the stick and
+ * unstick operations necessary for NNPA execution. The algorithm first seeds
+ * the CPU/NNPA with the operations that are significantly faster on that
+ * device. It then aims to add operations to the NNPA when the new operations
+ * are faster even after including the additional stick/unstick (if any)
+ * required by these less significantly faster NNPA operations. The three
+ * factors below modify the sensitivity at which ops are assigned to the NNPA.
+ *
+ * @param context Context of the model.
+ * @param module Module containing the ops; used to characterize all ops.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param dimAnalysis Pointer to the dimension analysis tool for
+ * disambiguating dynamic shape dimensions.
+ * @param cpuOps Set of ops that must execute on CPU.
+ * @param minFactor The NNPA (including stick/unstick) has to be at least
+ * minFactor times faster than the CPU for an op to be assigned to the NNPA.
+ * @param significantCPUFactor The CPU has to be at least significantCPUFactor
+ * times faster than the NNPA to seed/force computations on the CPU.
+ * @param significantNNPAFactor The NNPA has to be at least
+ * significantNNPAFactor times faster than the CPU to seed/force computations
+ * on the NNPA.
+ *
+ * @note The significantCPUFactor can be smaller than significantNNPAFactor:
+ * when an op is clearly not a good fit for the NNPA, we may as well seed the
+ * computation on the CPU. The significantNNPAFactor may be set much higher
+ * when we only want to send the really beneficial ops to the NNPA. By
+ * combining a high significantNNPAFactor with a large minFactor, the
+ * heuristic will put only ops that are really beneficial on the NNPA.
+ */
+void PlaceBeneficialOpsOnNNPAWithStickUnstick(mlir::MLIRContext *context,
+    mlir::ModuleOp module, const llvm::SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps,
+    double minFactor = 1.1, double significantCPUFactor = 2.0,
+    double significantNNPAFactor = 3.0);
+
+} // namespace onnx_mlir
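The heuristics above consume the performance model through the small API declared in PerfModel.hpp (further below). A hypothetical caller, mirroring the simple single-op heuristic (pickNNPA is not part of the patch):

  // Returns true when the model predicts the op to be faster on the NNPA;
  // ops without a model are optimistically assumed to be faster on the NNPA.
  static bool pickNNPA(
      mlir::Operation *op, const onnx_mlir::DimAnalysis *dimAnalysis) {
    double cpuTime, nnpaTime;
    if (!onnx_mlir::estimateTimeForOpWithModel(
            op, dimAnalysis, cpuTime, nnpaTime))
      return true; // No model: assume NNPA is faster.
    return nnpaTime < cpuTime;
  }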
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
index 7e29bf62d0..61157e1a96 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
@@ -36,25 +36,7 @@ using PERF_MODEL4 = std::function<double(double, double, double, double)>;
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc"
 
 //===----------------------------------------------------------------------===//
-// Support functions for reporting.
-
-// Return true with a debug message reporting reason for success on NNPA.
-inline bool fasterOnNNPA(Operation *op, std::string msg) {
-  LLVM_DEBUG({
-    llvm::dbgs() << "  Faster on NNPA: " << msg << " For op:";
-    op->dump();
-  });
-  return true;
-}
-
-// Return false with a debug message reporting reason for failure on NNPA.
-inline bool fasterOnCPU(Operation *op, std::string msg) {
-  LLVM_DEBUG({
-    llvm::dbgs() << "  Faster on CPU: " << msg << " For op:";
-    op->dump();
-  });
-  return false;
-}
+// Support functions
 
 // Summarize higher dims (leaving ub..rank-1 untouched). If none, return size
 // of 1. Otherwise, returns the cumulative multiplication of each of the static
@@ -76,10 +58,8 @@ inline int64_t summarizeHigherDims(
 //===----------------------------------------------------------------------===//
 // Support for unary/binary elementwise with possibly unknown dimensions.
 
-bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
-    const DimAnalysis *dimAnalysis, PERF_MODEL3 modelForCPU,
-    PERF_MODEL3 modelForNNPA) {
-
+void processDim(Value oper, int64_t &e4, int64_t &e3, int64_t &e2, int64_t &e1,
+    std::string &msg) {
   // At this time, use only 1 of the two operands.
   ShapedType operType = oper.getType().dyn_cast_or_null<ShapedType>();
   assert(operType && operType.hasRank() && "expected shaped type with rank");
@@ -88,12 +68,12 @@ bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
   llvm::ArrayRef<int64_t> shape = operType.getShape();
   // Gather all 4th...nth ranked shape together. If all dynamic; assume size
   // of 1.
-  std::string msg = "";
+  LLVM_DEBUG(msg = "");
   bool hasDynamicE4;
-  int64_t e4 = summarizeHigherDims(shape, operRank - 3, hasDynamicE4);
-  int64_t e3 = operRank >= 3 ? shape[operRank - 3] : 1;
-  int64_t e2 = operRank >= 2 ? shape[operRank - 2] : 1;
-  int64_t e1 = operRank >= 1 ? shape[operRank - 1] : 1;
+  e4 = summarizeHigherDims(shape, operRank - 3, hasDynamicE4);
+  e3 = operRank >= 3 ? shape[operRank - 3] : 1;
+  e2 = operRank >= 2 ? shape[operRank - 2] : 1;
+  e1 = operRank >= 1 ? shape[operRank - 1] : 1;
   // Handle dynamic shapes, eventually it would be good to have ranges given by
   // the user.
   if (hasDynamicE4) {
@@ -111,20 +91,31 @@ bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
     e1 = 64; // Assume full.
     LLVM_DEBUG(msg += " E1=64: dyn, assume full tile.");
   }
-  double nnpaEstimatedTime = modelForNNPA(e4 * e3, e2, e1);
-  double cpuEstimatedTime = modelForCPU(e4 * e3, e2, e1);
-  LLVM_DEBUG(fprintf(stderr, "  Estimated times: nnpa %f, cpu %f\n",
-      nnpaEstimatedTime, cpuEstimatedTime));
-  if (nnpaEstimatedTime < cpuEstimatedTime)
-    return fasterOnNNPA(op, "Model estimates faster time on NNPA." + msg);
-  return fasterOnCPU(op, "Model estimates faster time on CPU." + msg);
+}
+
+void estimateTimeForElementwiseOp(Operation *op, Value oper,
+    const DimAnalysis *dimAnalysis, PERF_MODEL3 modelForCPU,
+    PERF_MODEL3 modelForNNPA, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime) {
+
+  // Process dim (collapse and handle dynamic sizes).
+  int64_t e4, e3, e2, e1;
+  std::string msg;
+  processDim(oper, e4, e3, e2, e1, msg);
+
+  cpuEstimatedTime = modelForCPU(e4 * e3, e2, e1);
+  nnpaEstimatedTime = modelForNNPA(e4 * e3, e2, e1);
+  LLVM_DEBUG(llvm::dbgs() << "  Estimated times for op " << op->getName()
                          << ": nnpa " << nnpaEstimatedTime << ", cpu "
                          << cpuEstimatedTime << "." << msg.c_str() << "\n");
+}
 
 //===----------------------------------------------------------------------===//
 // Support for matmul with possibly unknown dimensions.
 
-bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
-    bool bTransposed, const DimAnalysis *dimAnalysis) {
+void estimateTimeForMatMulOp(Operation *op, Value a, Value b, bool aTransposed,
+    bool bTransposed, const DimAnalysis *dimAnalysis, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime) {
   // Scanning A.
   ShapedType aType = a.getType().dyn_cast_or_null<ShapedType>();
   assert(aType && aType.hasRank() && "expected shaped type with A rank");
@@ -152,7 +143,8 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
   int64_t N = aN, M = aM, K = bK;
   // Rules common to matmul with/without broadcast.
   // Ideally we would have ranges to estimate cost when dynamic.
-  std::string msg = "";
+  std::string msg;
+  LLVM_DEBUG(msg = "");
   // Assume the broadcast B dim of the matmul will be small.
   if (aBDynamic) {
     LLVM_DEBUG(msg += " B+ for input A: assume size 1 for dynamic dims.");
@@ -216,14 +208,15 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
       hasBroadcast /* no perf measurement yet for broadcast case*/) {
     // For no broadcast, pick the largest B dimension.
     int64_t B = std::max(aB, bB);
-    double nnpaEstimatedTime = estimatedTimeForNNPA_MatMul_3ds(B, N, M, K);
-    double cpuEstimatedTime = estimatedTimeForCPU_MatMul_3ds(B, N, M, K);
-    LLVM_DEBUG(fprintf(stderr,
-        "  Times for matmul: nnpa %f, cpu %f with dim (%i, %i, %i, %d)\n",
-        nnpaEstimatedTime, cpuEstimatedTime, (int)B, (int)N, (int)M, (int)K));
-    if (nnpaEstimatedTime < cpuEstimatedTime)
-      return fasterOnNNPA(op, "Model estimates faster time on NNPA." + msg);
-    return fasterOnCPU(op, "Model estimates faster time on CPU." + msg);
+    nnpaEstimatedTime = estimatedTimeForNNPA_MatMul_3ds(B, N, M, K);
+    cpuEstimatedTime = estimatedTimeForCPU_MatMul_3ds(B, N, M, K);
+    LLVM_DEBUG(llvm::dbgs()
+               << "  Estimated times for op " << op->getName() << " with dim ("
+               << B << ", " << N << ", " << M << ", " << K << "): nnpa "
+               << nnpaEstimatedTime << ", cpu " << cpuEstimatedTime << "."
+               << msg.c_str() << "\n");
+
+    return;
   }
   llvm_unreachable("should not get here");
 }
@@ -232,189 +225,240 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
 
 //===----------------------------------------------------------------------===//
 // Processing for each op: binary elementwise.
 template <typename OP_TYPE>
-bool checkIfOpFasterOnNNPA(OP_TYPE op, const DimAnalysis *dimAnalysis) {
+void estimateTimeForOp(OP_TYPE op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
   llvm_unreachable("should have a model for all defined ops");
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXAddOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Add_3ds, estimatedTimeForNNPA_Add_3ds);
+void estimateTimeForOp(ONNXAddOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Add_3ds, estimatedTimeForNNPA_Add_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXDivOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Div_3ds, estimatedTimeForNNPA_Div_3ds);
+void estimateTimeForOp(ONNXDivOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Div_3ds, estimatedTimeForNNPA_Div_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMaxOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Max_3ds, estimatedTimeForNNPA_Max_3ds);
+void estimateTimeForOp(ONNXMaxOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Max_3ds, estimatedTimeForNNPA_Max_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMinOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Min_3ds, estimatedTimeForNNPA_Min_3ds);
+void estimateTimeForOp(ONNXMinOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Min_3ds, estimatedTimeForNNPA_Min_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMulOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Mul_3ds, estimatedTimeForNNPA_Mul_3ds);
+void estimateTimeForOp(ONNXMulOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Mul_3ds, estimatedTimeForNNPA_Mul_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXPowOp op, const DimAnalysis *dimAnalysis) {
+void estimateTimeForOp(ONNXPowOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
   int64_t exponentValue;
   if (hasIntegerPowerExponent(&op, exponentValue)) {
     if (exponentValue == 2)
-      return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-          dimAnalysis, estimatedTimeForCPU_Pow_2_3ds,
-          estimatedTimeForNNPA_Pow_2_3ds);
+      estimateTimeForElementwiseOp(op.getOperation(),
op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow2_3ds, + estimatedTimeForNNPA_Pow2_3ds, cpuEstimatedTime, nnpaEstimatedTime); if (exponentValue == 3) - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_3_3ds, - estimatedTimeForNNPA_Pow_3_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow3_3ds, + estimatedTimeForNNPA_Pow3_3ds, cpuEstimatedTime, nnpaEstimatedTime); if (exponentValue == 4) - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_4_3ds, - estimatedTimeForNNPA_Pow_4_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow4_3ds, + estimatedTimeForNNPA_Pow4_3ds, cpuEstimatedTime, nnpaEstimatedTime); } // For other power exponent, just use pow of 8. - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_8_3ds, - estimatedTimeForNNPA_Pow_8_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis, + estimatedTimeForCPU_Pow8_3ds, estimatedTimeForNNPA_Pow8_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSubOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Sub_3ds, estimatedTimeForNNPA_Sub_3ds); +void estimateTimeForOp(ONNXSubOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis, + estimatedTimeForCPU_Sub_3ds, estimatedTimeForNNPA_Sub_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } //===----------------------------------------------------------------------===// // Processing for each op: unary elementwise. 
template <> -bool checkIfOpFasterOnNNPA( - ONNXExpOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Exp_3ds, estimatedTimeForNNPA_Exp_3ds); +void estimateTimeForOp(ONNXExpOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Exp_3ds, estimatedTimeForNNPA_Exp_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXLogOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Log_3ds, estimatedTimeForNNPA_Log_3ds); +void estimateTimeForOp(ONNXLogOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Log_3ds, estimatedTimeForNNPA_Log_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXReluOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Relu_3ds, estimatedTimeForNNPA_Relu_3ds); +void estimateTimeForOp(ONNXReluOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Relu_3ds, estimatedTimeForNNPA_Relu_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSigmoidOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Sigmoid_3ds, - estimatedTimeForNNPA_Sigmoid_3ds); +void estimateTimeForOp(ONNXSigmoidOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Sigmoid_3ds, estimatedTimeForNNPA_Sigmoid_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSoftmaxOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Softmax_3ds, - estimatedTimeForNNPA_Softmax_3ds); +void estimateTimeForOp(ONNXSoftmaxOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Softmax_3ds, estimatedTimeForNNPA_Softmax_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXTanhOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Tanh_3ds, estimatedTimeForNNPA_Tanh_3ds); +void estimateTimeForOp(ONNXTanhOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Tanh_3ds, estimatedTimeForNNPA_Tanh_3ds, + cpuEstimatedTime, nnpaEstimatedTime); +} + +//===----------------------------------------------------------------------===// +// Processing for each op: ReduceMean. 
+ +template <> +void estimateTimeForOp(ONNXReduceMeanV13Op op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_ReduceMean_4d, estimatedTimeForNNPA_ReduceMean_4d, + cpuEstimatedTime, nnpaEstimatedTime); } //===----------------------------------------------------------------------===// // Processing for each op: MatMul. template <> -bool checkIfOpFasterOnNNPA( - ONNXMatMulOp op, const DimAnalysis *dimAnalysis) { - return isMatMulOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - op.getOperand(1), false /*a transposed*/, false /*b transposed*/, - dimAnalysis); +void estimateTimeForOp(ONNXMatMulOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForMatMulOp(op.getOperation(), op.getOperand(0), op.getOperand(1), + false /*a transposed*/, false /*b transposed*/, dimAnalysis, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXGemmOp op, const DimAnalysis *dimAnalysis) { - return isMatMulOpFasterOnNNPA(op.getOperation(), op.getA(), op.getB(), - op.getTransA(), op.getTransB(), dimAnalysis); +void estimateTimeForOp(ONNXGemmOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForMatMulOp(op.getOperation(), op.getA(), op.getB(), + op.getTransA(), op.getTransB(), dimAnalysis, cpuEstimatedTime, + nnpaEstimatedTime); } } // namespace +namespace onnx_mlir { + //===----------------------------------------------------------------------===// -// Function to perform evaluation. +// Estimate time for ops that have a model. + +double estimateTimeForStickOp(Value oper) { + // Process dim (collapse and handle dynamic sizes). + int64_t e4, e3, e2, e1; + std::string msg; + processDim(oper, e4, e3, e2, e1, msg); + return estimatedTimeForNNPA_Stick_3ds(e4 * e3, e2, e1); +} -namespace onnx_mlir { +double estimateTimeForUnstickOp(Value oper) { + // Process dim (collapse and handle dynamic sizes). + int64_t e4, e3, e2, e1; + std::string msg; + processDim(oper, e4, e3, e2, e1, msg); + return estimatedTimeForNNPA_Unstick_3ds(e4 * e3, e2, e1); +} -bool isOpFasterOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis) { - LLVM_DEBUG({ - llvm::dbgs() << "Test cost-benefit of CPU/NNPA for op "; - op->dump(); - }); - // Binary elementwise NNPA candidate ops. 
+bool estimateTimeForOpWithModel(Operation *op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  bool opHasModel = true;
   if (auto addOp = dyn_cast<ONNXAddOp>(op))
-    return checkIfOpFasterOnNNPA(addOp, dimAnalysis);
-  if (auto divOp = dyn_cast<ONNXDivOp>(op))
-    return checkIfOpFasterOnNNPA(divOp, dimAnalysis);
-  if (auto maxOp = dyn_cast<ONNXMaxOp>(op))
-    return checkIfOpFasterOnNNPA(maxOp, dimAnalysis);
-  if (auto minOp = dyn_cast<ONNXMinOp>(op))
-    return checkIfOpFasterOnNNPA(minOp, dimAnalysis);
-  if (auto mulOp = dyn_cast<ONNXMulOp>(op))
-    return checkIfOpFasterOnNNPA(mulOp, dimAnalysis);
-  if (auto powOp = dyn_cast<ONNXPowOp>(op))
-    return checkIfOpFasterOnNNPA(powOp, dimAnalysis);
-  if (auto subOp = dyn_cast<ONNXSubOp>(op))
-    return checkIfOpFasterOnNNPA(subOp, dimAnalysis);
+    estimateTimeForOp(addOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto divOp = dyn_cast<ONNXDivOp>(op))
+    estimateTimeForOp(divOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto maxOp = dyn_cast<ONNXMaxOp>(op))
+    estimateTimeForOp(maxOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto minOp = dyn_cast<ONNXMinOp>(op))
+    estimateTimeForOp(minOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto mulOp = dyn_cast<ONNXMulOp>(op))
+    estimateTimeForOp(mulOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto powOp = dyn_cast<ONNXPowOp>(op))
+    estimateTimeForOp(powOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto subOp = dyn_cast<ONNXSubOp>(op))
+    estimateTimeForOp(subOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
   // Unary elementwise NNPA candidate ops.
-  if (auto expOp = dyn_cast<ONNXExpOp>(op))
-    return checkIfOpFasterOnNNPA(expOp, dimAnalysis);
-  if (auto logOp = dyn_cast<ONNXLogOp>(op))
-    return checkIfOpFasterOnNNPA(logOp, dimAnalysis);
-  if (auto reluOp = dyn_cast<ONNXReluOp>(op))
-    return checkIfOpFasterOnNNPA(reluOp, dimAnalysis);
-  if (auto sigmoidOp = dyn_cast<ONNXSigmoidOp>(op))
-    return checkIfOpFasterOnNNPA(sigmoidOp, dimAnalysis);
-  if (auto softmaxOp = dyn_cast<ONNXSoftmaxOp>(op))
-    return checkIfOpFasterOnNNPA(softmaxOp, dimAnalysis);
-  if (auto tanhOp = dyn_cast<ONNXTanhOp>(op))
-    return checkIfOpFasterOnNNPA(tanhOp, dimAnalysis);
+  else if (auto expOp = dyn_cast<ONNXExpOp>(op))
+    estimateTimeForOp(expOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto logOp = dyn_cast<ONNXLogOp>(op))
+    estimateTimeForOp(logOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto reluOp = dyn_cast<ONNXReluOp>(op))
+    estimateTimeForOp(reluOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto sigmoidOp = dyn_cast<ONNXSigmoidOp>(op))
+    estimateTimeForOp(
+        sigmoidOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto softmaxOp = dyn_cast<ONNXSoftmaxOp>(op))
+    estimateTimeForOp(
+        softmaxOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto tanhOp = dyn_cast<ONNXTanhOp>(op))
+    estimateTimeForOp(tanhOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  // Reduce.
+  else if (auto reduceMeanOp = dyn_cast<ONNXReduceMeanV13Op>(op))
+    estimateTimeForOp(
+        reduceMeanOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
   // Matmul.
-  if (auto matMulOp = dyn_cast<ONNXMatMulOp>(op))
-    return checkIfOpFasterOnNNPA(matMulOp, dimAnalysis);
-  if (auto gemmOp = dyn_cast<ONNXGemmOp>(op))
-    return checkIfOpFasterOnNNPA(gemmOp, dimAnalysis);
-
-  // Unknown, issue a warning and assume its faster on NNPA
-  return fasterOnNNPA(op, "Candidate for NNPA without model; please add.");
+  else if (auto matMulOp = dyn_cast<ONNXMatMulOp>(op))
+    estimateTimeForOp(
+        matMulOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op))
+    estimateTimeForOp(gemmOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else
+    opHasModel = false;
+
+  return opHasModel;
 }
 
 } // namespace onnx_mlir
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
index 8ec4e89786..1b00220ee7 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
@@ -21,6 +21,14 @@
 
 namespace onnx_mlir {
 
-bool isOpFasterOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis);
+// When an op has a model, define the CPU and NNPA estimated times and return
+// true. When an op does not have a model, just return false.
+bool estimateTimeForOpWithModel(mlir::Operation *op,
+    const DimAnalysis *dimAnalysis, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime);
 
-}
+// Estimate the time for a stick/unstick op given the shape of oper.
+double estimateTimeForStickOp(mlir::Value oper);
+double estimateTimeForUnstickOp(mlir::Value oper);
+
+} // namespace onnx_mlir
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
index fe89463d1b..069e06af92 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
@@ -10,207 +10,292 @@
 
 inline static double ms_ceiling(double n, double m) { return ceil(n / m) * m; }
 
-// Operation Add_3ds: estimated times with cross over at complexity = 58292.
+// Operation Add_3ds: estimated times.
 double estimatedTimeForCPU_Add_3ds(double e3, double e2, double e1) {
   double complexity = e3 * e2 * e1;
-  // Regression for CPU with r2 = 0.9998796798458582
-  return 1.3014297439117151e-10 * complexity + 2.0196878739035375e-07;
+  // Regression for CPU with r2 = 0.9989711028792525
+  return 3.9686846353007493e-07 + 1.1794164898251022e-10 * complexity;
 }
 
+// Operation Add_3ds: estimated times.
 double estimatedTimeForNNPA_Add_3ds(double e3, double e2, double e1) {
-  double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
-  // Regression for NNPA with r2 = 0.9946449269392905
-  return 1.0267638124762705e-10 * complexity + 1.8030585573544791e-06;
+  double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0);
+  double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
+  // Regression for NNPA with r2 = 0.9994266956239162
+  return 2.128070603450555e-06 + 3.884079345448728e-11 * complexity +
+         3.8840793454487276e-11 * complexity2;
 }
 
-// Operation Div_3ds: estimated times with cross over at complexity = 2483.
+// Operation Div_3ds: estimated times.
 double estimatedTimeForCPU_Div_3ds(double e3, double e2, double e1) {
   double complexity = e3 * e2 * e1;
-  // Regression for CPU with r2 = 0.9999989217222092
-  return 1.4517483410281062e-09 * complexity + 4.819629870926124e-07;
+  // Regression for CPU with r2 = 0.9999973603706902
+  return 6.024413187183776e-07 + 1.444210277092263e-09 * complexity;
 }
 
+// Operation Div_3ds: estimated times.
double estimatedTimeForNNPA_Div_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.993964642126504 - return 1.0448506395503133e-10 * complexity + 3.8276566890878624e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9991809091918582 + return 4.268037890056841e-06 + 3.958236721770986e-11 * complexity + + 3.9582367217709856e-11 * complexity2; } -// Operation Exp_3ds: estimated times with cross over at complexity = -1037. +// Operation Exp_3ds: estimated times. double estimatedTimeForCPU_Exp_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9953451710472131 - return 3.728416286592805e-09 * complexity + 6.0588917718826046e-06; + // Regression for CPU with r2 = 0.9964027710112378 + return -5.114482496042976e-06 + 4.191612771812482e-09 * complexity; } +// Operation Exp_3ds: estimated times. double estimatedTimeForNNPA_Exp_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9891659822712001 - return 8.495396362536427e-11 * complexity + 2.2771053257892673e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9996489747804022 + return 2.940912571153368e-06 + 3.030831435560512e-11 * complexity + + 3.030831435560511e-11 * complexity2; } -// Operation Log_3ds: estimated times with cross over at complexity = 214. +// Operation Log_3ds: estimated times. double estimatedTimeForCPU_Log_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999978790893227 - return 8.437935291413468e-09 * complexity + -5.198279741842633e-07; + // Regression for CPU with r2 = 0.98951908796714 + return -1.1673535780041665e-05 + 5.568744038404678e-09 * complexity; } +// Operation Log_3ds: estimated times. double estimatedTimeForNNPA_Log_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9860021150687228 - return 9.34925048506708e-11 * complexity + 1.2737044319477467e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994743517297515 + return 1.9298869463234537e-06 + 3.5198842979463965e-11 * complexity + + 3.519884297946396e-11 * complexity2; } -// Operation MatMul_3ds: estimated times with cross over at complexity = 30745. +// Operation MatMul_3ds: estimated times. double estimatedTimeForCPU_MatMul_3ds(double B, double N, double M, double K) { double complexity = B * (N * M * K); - // Regression for CPU with r2 = 0.9977563719224963 - return 8.516416082324592e-11 * complexity + 2.813872063426722e-07; + // Regression for CPU with r2 = 0.9993063132437037 + return 1.2274778896376187e-06 + 8.277833300031912e-11 * complexity; } +// Operation MatMul_3ds: estimated times. 
double estimatedTimeForNNPA_MatMul_3ds(double B, double N, double M, double K) { double complexity = B * ms_ceiling(N, 2.0) * ms_ceiling(M, 64.0) * ms_ceiling(K, 64.0); - // Regression for NNPA with r2 = 0.6995070738236333 - return 6.1299740116770674e-12 * complexity + 2.7113063170731707e-06; + double complexity2 = + B * ms_ceiling(N, 32.0) * ms_ceiling(M, 64.0) * ms_ceiling(K, 64.0); + // Regression for NNPA with r2 = 0.7300356886725241 + return 3.2376205400192875e-06 + 1.2476855786580124e-12 * complexity + + 1.2476855786580124e-12 * complexity2; } -// Operation Max_3ds: estimated times with cross over at complexity = 28928. +// Operation Max_3ds: estimated times. double estimatedTimeForCPU_Max_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.999833820537654 - return 1.2859513298352582e-10 * complexity + 2.1445698817257752e-07; + // Regression for CPU with r2 = 0.999934744283212 + return 3.2005877071463053e-07 + 1.1769966151617187e-10 * complexity; } +// Operation Max_3ds: estimated times. double estimatedTimeForNNPA_Max_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9951863938046909 - return 1.0420243623790107e-10 * complexity + 9.20104282137138e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.999738981118584 + return 1.4606129913618409e-06 + 3.931840035694782e-11 * complexity + + 3.93184003569478e-11 * complexity2; } -// Operation Min_3ds: estimated times with cross over at complexity = 99463. +// Operation Min_3ds: estimated times. double estimatedTimeForCPU_Min_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998504038595571 - return 1.28567886153589e-10 * complexity + 2.0245445263399963e-07; + // Regression for CPU with r2 = 0.9999348493035218 + return 3.5344354604038237e-07 + 1.175917606999914e-10 * complexity; } +// Operation Min_3ds: estimated times. double estimatedTimeForNNPA_Min_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9946372532381683 - return 1.0339073648992919e-10 * complexity + 2.70667430925335e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994174686041679 + return 3.092302220284167e-06 + 3.897841215495711e-11 * complexity + + 3.8978412154957106e-11 * complexity2; } -// Operation Mul_3ds: estimated times with cross over at complexity = 55288. +// Operation Mul_3ds: estimated times. double estimatedTimeForCPU_Mul_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998538384140817 - return 1.2961833783614705e-10 * complexity + 2.2021597368460279e-07; + // Regression for CPU with r2 = 0.9999212032519668 + return 3.506815960365203e-07 + 1.1755692215565626e-10 * complexity; } +// Operation Mul_3ds: estimated times. 
double estimatedTimeForNNPA_Mul_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9944216212877205 - return 1.0281908200330892e-10 * complexity + 1.7019104732413162e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994566182095322 + return 2.0629649376500924e-06 + 3.899817129736847e-11 * complexity + + 3.8998171297368456e-11 * complexity2; } -// Operation Pow_2_3ds: estimated times with cross over at complexity = 56111. -double estimatedTimeForCPU_Pow_2_3ds(double e3, double e2, double e1) { +// Operation Pow2_3ds: estimated times. +double estimatedTimeForCPU_Pow2_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998705421933038 - return 1.2964568661871788e-10 * complexity + 2.0823276086876973e-07; + // Regression for CPU with r2 = 0.9999320368494156 + return 4.0421584022966975e-07 + 1.1715722909330777e-10 * complexity; } -double estimatedTimeForNNPA_Pow_2_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9945018708254405 - return 1.0317090324802998e-10 * complexity + 1.693775697889311e-06; +// Operation Pow2_3ds: estimated times. +double estimatedTimeForNNPA_Pow2_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994873079624843 + return 2.0966320666458707e-06 + 3.885726395091773e-11 * complexity + + 3.8857263950917726e-11 * complexity2; } -// Operation Pow_3_3ds: estimated times with cross over at complexity = 21706. -double estimatedTimeForCPU_Pow_3_3ds(double e3, double e2, double e1) { +// Operation Pow3_3ds: estimated times. +double estimatedTimeForCPU_Pow3_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9997984857078572 - return 3.7494455439876194e-10 * complexity + -8.306238888444397e-07; + // Regression for CPU with r2 = 0.9998245341999067 + return 1.643157439725954e-06 + 2.5576590285464804e-10 * complexity; } -double estimatedTimeForNNPA_Pow_3_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9411544973356172 - return 2.085058530670984e-10 * complexity + 2.7822286559924233e-06; +// Operation Pow3_3ds: estimated times. +double estimatedTimeForNNPA_Pow3_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.3286847571457816 + return 5.54645111424955e-06 + 1.594942039674421e-10 * complexity + + 1.5949420396744207e-10 * complexity2; } -// Operation Pow_4_3ds: estimated times with cross over at complexity = 41065. -double estimatedTimeForCPU_Pow_4_3ds(double e3, double e2, double e1) { +// Operation Pow4_3ds: estimated times. 
+double estimatedTimeForCPU_Pow4_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998522920538492 - return 2.5405902189076053e-10 * complexity + 1.143001005006861e-07; + // Regression for CPU with r2 = 0.9999197459422748 + return 9.47474072842895e-07 + 2.4163095833040496e-10 * complexity; } -double estimatedTimeForNNPA_Pow_4_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9978757256268103 - return 1.811269802457481e-10 * complexity + 3.109286345912229e-06; +// Operation Pow4_3ds: estimated times. +double estimatedTimeForNNPA_Pow4_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994611814325557 + return 3.901310093115667e-06 + 7.801645620198453e-11 * complexity + + 7.801645620198452e-11 * complexity2; } -// Operation Pow_8_3ds: estimated times with cross over at complexity = 38894. -double estimatedTimeForCPU_Pow_8_3ds(double e3, double e2, double e1) { +// Operation Pow8_3ds: estimated times. +double estimatedTimeForCPU_Pow8_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998492647347293 - return 3.784270893083021e-10 * complexity + 3.846063745069773e-08; + // Regression for CPU with r2 = 0.9997872605697207 + return 8.784826358183354e-07 + 3.649713012323953e-10 * complexity; } -double estimatedTimeForNNPA_Pow_8_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9984689810839804 - return 2.5809839682053116e-10 * complexity + 4.718642808461304e-06; +// Operation Pow8_3ds: estimated times. +double estimatedTimeForNNPA_Pow8_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9995041673834962 + return 6.134676383478624e-06 + 1.1626159876565245e-10 * complexity + + 1.1626159876565244e-10 * complexity2; } -// Operation Relu_3ds: estimated times with cross over at complexity = 12461. +// Operation ReduceMean_4d: estimated times. +double estimatedTimeForCPU_ReduceMean_4d(double e3, double e2, double e1) { + double complexity = e3 * e2 * e1; + // Regression for CPU with r2 = 0.9581972945355149 + return -1.561907564731308e-07 + 1.2218609072525066e-10 * complexity; +} +// Operation ReduceMean_4d: estimated times. +double estimatedTimeForNNPA_ReduceMean_4d(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.21677591777344662 + return 1.1936219405338953e-05 + 1.0676642952683933e-11 * complexity + + 1.0676642952683933e-11 * complexity2; +} + +// Operation Relu_3ds: estimated times. double estimatedTimeForCPU_Relu_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999336782083352 - return 1.287932373847373e-10 * complexity + 2.1978022850964433e-07; + // Regression for CPU with r2 = 0.9997916471943519 + return 4.020992561015175e-07 + 1.1775068214689546e-10 * complexity; } +// Operation Relu_3ds: estimated times. 
double estimatedTimeForNNPA_Relu_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9932963892258988 - return 8.174802287383875e-11 * complexity + 8.060373976318752e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9996967196634997 + return 1.2244816818061312e-06 + 2.8344213155279377e-11 * complexity + + 2.8344213155279377e-11 * complexity2; } -// Operation Sigmoid_3ds: estimated times with cross over at complexity = 605. +// Operation Sigmoid_3ds: estimated times. double estimatedTimeForCPU_Sigmoid_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999988107319201 - return 5.386219204160899e-09 * complexity + 5.634215435092746e-07; + // Regression for CPU with r2 = 0.9999935862933553 + return 2.4666188614796535e-07 + 5.3819773454779955e-09 * complexity; } +// Operation Sigmoid_3ds: estimated times. double estimatedTimeForNNPA_Sigmoid_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9723030037974931 - return 1.0653822523387642e-10 * complexity + 3.7620795110656856e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9997032611893206 + return 4.552265280283248e-06 + 4.268025052443249e-11 * complexity + + 4.2680250524432486e-11 * complexity2; } -// Operation Softmax_3ds: estimated times with cross over at complexity = 3256. +// Operation Softmax_3ds: estimated times. double estimatedTimeForCPU_Softmax_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9997073553561724 - return 6.319129534911603e-09 * complexity + 2.8581895675517657e-06; + // Regression for CPU with r2 = 0.9998169150056859 + return 3.850778825086575e-06 + 6.476546494036936e-09 * complexity; } +// Operation Softmax_3ds: estimated times. double estimatedTimeForNNPA_Softmax_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.7561317069552476 - return 1.5370950144430126e-09 * complexity + 1.8432846457914885e-05; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.6458689171873927 + return 3.823709210789688e-05 + 7.316577699975697e-10 * complexity + + 7.316577699975696e-10 * complexity2; } -// Operation Sub_3ds: estimated times with cross over at complexity = 27788. +// Operation Stick_3ds: estimated times. +double estimatedTimeForNNPA_Stick_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9838446914891756 + return -1.1787349678206611e-07 + 9.738975985428137e-11 * complexity + + 9.738975985428137e-11 * complexity2; +} + +// Operation Sub_3ds: estimated times. 
double estimatedTimeForCPU_Sub_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998718450512123 - return 1.2958900359490862e-10 * complexity + 2.0747306461438247e-07; + // Regression for CPU with r2 = 0.9989967088496832 + return 4.6884880875538195e-07 + 1.178543625471088e-10 * complexity; } +// Operation Sub_3ds: estimated times. double estimatedTimeForNNPA_Sub_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9949061007697938 - return 1.0456801418978144e-10 * complexity + 9.027743246689109e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9997163549514938 + return 1.3893829024566132e-06 + 3.958841159751218e-11 * complexity + + 3.95884115975122e-11 * complexity2; } -// Operation Tanh_3ds: estimated times with cross over at complexity = 474. +// Operation Tanh_3ds: estimated times. double estimatedTimeForCPU_Tanh_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.999998963010189 - return 5.609614699943962e-09 * complexity + 1.8748266930146244e-07; + // Regression for CPU with r2 = 0.9899146645413962 + return 4.591865418171123e-06 + 1.5243041278914726e-09 * complexity; } +// Operation Tanh_3ds: estimated times. double estimatedTimeForNNPA_Tanh_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9897555127053886 - return 8.058830271076489e-11 * complexity + 2.8126039207664257e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9992996189301544 + return 3.1652117218733632e-06 + 2.7515670717117405e-11 * complexity + + 2.7515670717117402e-11 * complexity2; } + +// Operation Unstick_3ds: estimated times. 
+double estimatedTimeForNNPA_Unstick_3ds(double e3, double e2, double e1) {
+  double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0);
+  double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
+  // Regression for NNPA with r2 = 0.9804634203145826
+  return -3.9924342483643434e-07 + 1.0477727134169295e-10 * complexity +
+         1.0477727134169292e-10 * complexity2;
+}
\ No newline at end of file
diff --git a/src/Accelerators/NNPA/NNPAAccelerator.cpp b/src/Accelerators/NNPA/NNPAAccelerator.cpp
index 6634f3f71d..848acdf643 100644
--- a/src/Accelerators/NNPA/NNPAAccelerator.cpp
+++ b/src/Accelerators/NNPA/NNPAAccelerator.cpp
@@ -74,7 +74,7 @@ void NNPAAccelerator::registerPasses(int optLevel) const {
   LLVM_DEBUG(llvm::dbgs() << "Registering passes for NNPA accelerator\n");
   mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
     return onnx_mlir::createDevicePlacementPass(nnpaLoadDevicePlacementFile,
-        nnpaSaveDevicePlacementFile, nnpaEnableZHighPerfModel);
+        nnpaSaveDevicePlacementFile, nnpaPlacementHeuristic);
   });

   mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
diff --git a/src/Accelerators/NNPA/Pass/NNPAPasses.hpp b/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
index 4691bb9a66..e5f8783390 100644
--- a/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
+++ b/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
@@ -17,13 +17,15 @@

 #include "mlir/Pass/Pass.h"

+#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
+
 namespace onnx_mlir {

 // Add pass for device placement.
 std::unique_ptr<mlir::Pass> createDevicePlacementPass();
 std::unique_ptr<mlir::Pass> createDevicePlacementPass(
     std::string loadConfigFile, std::string saveConfigFile,
-    bool useZHighPerfModel);
+    NNPAPlacementHeuristic placementHeuristic);

 /// Add pass for lowering ONNX ops to ZHigh ops.
 std::unique_ptr<mlir::Pass> createONNXToZHighPass();
diff --git a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
index 25ac9a9043..7c8d461d5e 100644
--- a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
@@ -702,10 +702,13 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
       genScalarReduction(rewriter, create, op, elementOutType, input, alloc,
           inRank, outRank, dynamicAxes, maskVal, outInDimMap, divisorForMean,
           enableParallel);
-      onnxToKrnlSimdReport(op, /*successful*/ false, /*vl*/ 0,
-          estimatedSimdLoopTripCount,
-          (parallelSimd ? "no simd because no supported for parallel scheme"
-                        : "unsupported"));
+      std::string msg;
+      if (parallelSimd)
+        msg = "no simd because not supported for parallel scheme";
+      else
+        msg = "unsupported";
+      onnxToKrnlSimdReport(
+          op, /*successful*/ false, /*vl*/ 0, estimatedSimdLoopTripCount, msg);
     }
     rewriter.replaceOp(op, alloc);
     return success();
diff --git a/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir b/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
index 52a5e22ffc..5d1868b275 100644
--- a/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
@@ -1,4 +1,4 @@
-// RUN: onnx-mlir-opt --device-placement=use-zhigh-perf-model=true --mcpu=z16 --maccel=NNPA --split-input-file %s | FileCheck %s
+// RUN: onnx-mlir-opt --device-placement=use-faster=true --mcpu=z16 --maccel=NNPA --split-input-file %s | FileCheck %s

 // -----
 // Shape is such that this op is nearly guaranteed to be faster on CPU.
@@ -16,16 +16,16 @@ func.func @add_cpu(%arg0: tensor<1024x32x1xf32>) -> tensor<1024x32x1xf32> attrib // ----- - // Shape is such that this op is nearly guaranteed to be faster on NNPA; so no device="cpu" here. -func.func @add_nnpa(%arg0: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { - %0 = "onnx.Add"(%arg0, %arg0) : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> + +func.func @matmul_nnpa(%arg0: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { + %0 = "onnx.MatMul"(%arg0, %arg0) : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> return %0 : tensor<1024x1024x1024xf32> // mlir2FileCheck.py -// CHECK-LABEL: func.func @add_nnpa +// CHECK-LABEL: func.func @matmul_nnpa // CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { -// CHECK: [[VAR_0_:%.+]] = "onnx.Add"([[PARAM_0_]], [[PARAM_0_]]) {device = "nnpa"} : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> +// CHECK: [[VAR_0_:%.+]] = "onnx.MatMul"([[PARAM_0_]], [[PARAM_0_]]) {device = "nnpa"} : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> // CHECK: return [[VAR_0_]] : tensor<1024x1024x1024xf32> // CHECK: } } diff --git a/utils/make-report.py b/utils/make-report.py index dc5ad9325a..e13c46a6a0 100755 --- a/utils/make-report.py +++ b/utils/make-report.py @@ -539,7 +539,8 @@ def make_report(stat_message): def main(argv): - global report_level, focus_on_op_with_pattern, supported_only, time_unit, verbose + global report_level, focus_on_op_with_pattern, supported_only, time_unit + global verbose global sorting_preference compile_file_name = "" @@ -557,7 +558,8 @@ def main(argv): "help", "level=", "runtime=", - "stats=" "sort=", + "stats=", + "sort=", "supported", "unit=", "verbose", @@ -641,13 +643,22 @@ def main(argv): if compile_file_name and runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF", warmup_num) parse_file_for_stat(compile_file_name, make_stats) + print( + 'Report using runtime file "' + + runtime_file_name + + '" and compile file "' + + compile_file_name + + '"' + ) make_report(make_legend) elif compile_file_name: parse_file_for_stat(compile_file_name, make_stats) + print('Report using compile file "' + compile_file_name + '"') make_report(make_legend) elif runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF", warmup_num) parse_file_for_stat(runtime_file_name, "PERF") + print('Report using runtime file "' + runtime_file_name + '"') make_report(make_legend) else: print_usage("Command requires an input file name (compile/runtime or both).\n")
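
For reviewers, a minimal sketch (not part of the patch) of how the new
PerfModel.hpp entry points compose into a stick/unstick-aware placement
decision in the spirit of FasterOpsWSU. The helper name wantOpOnNNPA, the
significanceFactor knob, and the per-operand/per-result stick/unstick
charging are illustrative assumptions; the authoritative policy lives in
DevicePlacementHeuristic.cpp, which this sketch does not reproduce.

#include "mlir/IR/Operation.h"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"

namespace onnx_mlir {

// Hypothetical FasterOpsWSU-style test: place `op` on the NNPA only when its
// estimated NNPA time, plus the cost of sticking inputs and unsticking
// outputs, beats the estimated CPU time by `significanceFactor` (1.0 for a
// FasterOpsWSU-like policy; larger for a MuchFasterOpsWSU-like one).
static bool wantOpOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis,
    double significanceFactor) {
  double cpuTime = 0.0, nnpaTime = 0.0;
  if (!estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime))
    return true; // No model: fall back to QualifyingOps behavior.
  // Charge one stick per operand and one unstick per result. A real
  // heuristic would skip values that stay in stickified form between
  // adjacent NNPA ops, which is precisely what the WSU variants reason about.
  for (mlir::Value operand : op->getOperands())
    nnpaTime += estimateTimeForStickOp(operand);
  for (mlir::Value result : op->getResults())
    nnpaTime += estimateTimeForUnstickOp(result);
  return significanceFactor * nnpaTime < cpuTime;
}

} // namespace onnx_mlir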