From 4c213b78120ec58a6590a1fa685b718bfbc520f9 Mon Sep 17 00:00:00 2001
From: Alexandre Eichenberger
Date: Thu, 12 Oct 2023 12:34:48 -0400
Subject: [PATCH] Add a framework for NNPA op placement heuristics (#2541)

Signed-off-by: Alexandre Eichenberger
---
 .../NNPA/Compiler/NNPACompilerOptions.cpp     |  17 +-
 .../NNPA/Compiler/NNPACompilerOptions.hpp     |   9 +-
 .../NNPA/Compiler/NNPACompilerUtils.cpp       |   2 +-
 .../Conversion/ONNXToZHigh/CMakeLists.txt     |   1 +
 .../ONNXToZHigh/DevicePlacement.cpp           |  70 ++-
 .../ONNXToZHigh/DevicePlacementHeuristic.cpp  | 496 ++++++++++++++++++
 .../ONNXToZHigh/DevicePlacementHeuristic.hpp  |  87 +++
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.cpp | 364 +++++++------
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.hpp |  12 +-
 .../NNPA/Conversion/ONNXToZHigh/PerfModel.inc | 303 +++++++----
 src/Accelerators/NNPA/NNPAAccelerator.cpp     |   2 +-
 src/Accelerators/NNPA/Pass/NNPAPasses.hpp     |   4 +-
 src/Conversion/ONNXToKrnl/Math/Reduction.cpp  |  11 +-
 .../device_placement_pass_perf_model.mlir     |  12 +-
 utils/make-report.py                          |  15 +-
 15 files changed, 1085 insertions(+), 320 deletions(-)
 create mode 100644 src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
 create mode 100644 src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp

diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
index dc15408f07..b704f954f0 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
@@ -55,10 +55,17 @@ llvm::cl::opt<std::string> nnpaSaveDevicePlacementFile{
     llvm::cl::desc("Save device placement configuration to a JSON file."),
     llvm::cl::init(""), llvm::cl::cat(OnnxMlirOptions)};
 
-llvm::cl::opt<bool> nnpaEnableZHighPerfModel("enable-zhigh-perf-model",
-    llvm::cl::desc("Enabling performance cost model to estimate if ONNX "
-                   "operations will be faster on the NNPA or the CPU. Works "
-                   "best with static shapes. Default is false."),
-    llvm::cl::init(false), llvm::cl::cat(OnnxMlirOptions));
+llvm::cl::opt<NNPAPlacementHeuristic> nnpaPlacementHeuristic{
+    "nnpa-placement-heuristic",
+    llvm::cl::desc(
+        "[Optional] Choose NNPA-related heuristic to place operations "
+        "on NNPA device:"),
+    llvm::cl::values(
+        clEnumVal(QualifyingOps, "Place all qualifying ops on NNPA (default)"),
+        clEnumVal(FasterOps, "Place qualifying ops that are faster on NNPA"),
+        clEnumVal(FasterOpsWSU, "FasterOps with stick/unstick cost"),
+        clEnumVal(MuchFasterOpsWSU,
+            "Much/Significantly FasterOps with stick/unstick cost")),
+    llvm::cl::init(QualifyingOps), llvm::cl::cat(OnnxMlirOptions)};
 
 } // namespace onnx_mlir
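Note: the removed boolean flag -enable-zhigh-perf-model is subsumed by the new -nnpa-placement-heuristic enum. FasterOps consults the same per-op performance model that the boolean used to enable, and the default QualifyingOps preserves the prior default behavior (flag unset): every qualifying op is placed on the NNPA.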
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
index 0598e81c87..a3e7f7a09f 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
@@ -41,11 +41,18 @@ typedef enum {
   EmitZHighIR,
 } NNPAEmissionTargetType;
 
+typedef enum {
+  QualifyingOps,    /* Any ops that qualify for NNPA will go on NNPA. */
+  FasterOps,        /* Only qualifying ops that are faster on NNPA. */
+  FasterOpsWSU,     /* FasterOps with Stick and Unstick (WSU) cost. */
+  MuchFasterOpsWSU, /* FasterOpsWSU only if significantly faster. */
+} NNPAPlacementHeuristic;
+
 extern llvm::cl::OptionCategory OnnxMlirOptions;
 extern llvm::cl::opt<NNPAEmissionTargetType> nnpaEmissionTarget;
 extern llvm::cl::opt<bool> nnpaClipToDLFloatRange;
 extern llvm::cl::opt<bool> nnpaEnableZHighToOnnx;
-extern llvm::cl::opt<bool> nnpaEnableZHighPerfModel;
+extern llvm::cl::opt<NNPAPlacementHeuristic> nnpaPlacementHeuristic;
 extern llvm::cl::opt<bool> profileZHighIR;
 extern llvm::cl::opt<std::string> nnpaLoadDevicePlacementFile;
 extern llvm::cl::opt<std::string> nnpaSaveDevicePlacementFile;
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
index 143c34cfb1..b078640e82 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
@@ -152,7 +152,7 @@ void addPassesNNPA(mlir::OwningOpRef<mlir::ModuleOp> &module,
   if (emissionTarget >= EmitONNXIR) {
     addONNXToMLIRPasses(pm, /*target CPU*/ maccel.empty());
     pm.addPass(onnx_mlir::createDevicePlacementPass(nnpaLoadDevicePlacementFile,
-        nnpaSaveDevicePlacementFile, nnpaEnableZHighPerfModel));
+        nnpaSaveDevicePlacementFile, nnpaPlacementHeuristic));
   }
 
   if (emissionTarget >= EmitMLIR) {
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
index 63f1f049b7..de58e1277e 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/CMakeLists.txt
@@ -58,6 +58,7 @@ add_onnx_mlir_library(OMZHighToONNX
 
 add_onnx_mlir_library(OMDevicePlacement
   DevicePlacement.cpp
+  DevicePlacementHeuristic.cpp
  PerfModel.cpp
 
   DEPENDS
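The device-placement pass below also exposes per-pass boolean switches (use-much-faster-wsu, use-faster-wsu, use-faster) that select the same heuristics when the pass runs standalone, with the strongest heuristic winning when several are set; see initPlacementHeuristic below. With a tool that accepts MLIR pass options, an invocation could look like onnx-mlir-opt --device-placement=use-faster-wsu=true model.mlir; the exact option spelling depends on the pass-option plumbing and is shown here for illustration only.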
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
index ffb3ba39b9..0aa4d0c913 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
@@ -34,9 +34,10 @@
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
 
+#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
-#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/RewriteONNXForZHigh.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 #include "src/Pass/Passes.hpp"
@@ -60,12 +61,14 @@ struct DevicePlacementPass
 
   DevicePlacementPass() = default;
   DevicePlacementPass(const DevicePlacementPass &pass)
-      : PassWrapper<DevicePlacementPass, OperationPass<ModuleOp>>() {}
+      : PassWrapper<DevicePlacementPass, OperationPass<ModuleOp>>() {
+    this->placementHeuristic = QualifyingOps;
+  }
   DevicePlacementPass(std::string loadConfigFile, std::string saveConfigFile,
-      bool useZHighPerfModel) {
+      NNPAPlacementHeuristic placementHeuristic) {
     this->loadConfigFile = loadConfigFile;
     this->saveConfigFile = saveConfigFile;
-    this->useZHighPerfModel = useZHighPerfModel;
+    this->placementHeuristic = placementHeuristic;
   }
 
   StringRef getArgument() const override { return "device-placement"; }
@@ -82,9 +85,28 @@ struct DevicePlacementPass
     llvm::cl::desc("Path to load a device configuration file in JSON format"),
     llvm::cl::init("")};
 
-  Option<bool> useZHighPerfModel{*this, "use-zhigh-perf-model",
-      llvm::cl::desc("Enable ZHigh cost model for ops on NNPA vs CPU"),
+  // Placement heuristic switches (policy driven by placementHeuristic).
+  NNPAPlacementHeuristic placementHeuristic;
+  // Option useXXX listed in decreasing order of priority, if multiple are
+  // selected.
+  Option<bool> useMuchFasterWithStickOps{*this, "use-much-faster-wsu",
+      llvm::cl::desc("Enable MuchFasterOpsWSU NNPAPlacementHeuristic"),
+      llvm::cl::init(false)};
+  Option<bool> useFasterWithStickOps{*this, "use-faster-wsu",
+      llvm::cl::desc("Enable FasterOpsWSU NNPAPlacementHeuristic"),
       llvm::cl::init(false)};
+  Option<bool> useFasterOps{*this, "use-faster",
+      llvm::cl::desc("Enable FasterOps NNPAPlacementHeuristic"),
+      llvm::cl::init(false)};
+  // Method to override the placement heuristic using the useXXX flags.
+  void initPlacementHeuristic() {
+    if (useMuchFasterWithStickOps)
+      placementHeuristic = MuchFasterOpsWSU;
+    else if (useFasterWithStickOps)
+      placementHeuristic = FasterOpsWSU;
+    else if (useFasterOps)
+      placementHeuristic = FasterOps;
+  }
 
   void runOnOperation() final;
 
@@ -189,26 +211,18 @@ void DevicePlacementPass::runOnOperation() {
   OpSetType cpuOps = llvm::set_intersection(
       legalizedOps1, llvm::set_intersection(legalizedOps2, legalizedOps3));
 
-  // Now annotate accelerator operations in the IR with `device` attribute,
-  // according to the compiler decision.
-  for (Operation *op : ops) {
-    // Set device if it is empty or unavailable.
-    StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
-    if (device && !device.getValue().empty())
-      continue;
-    // Op that is legal (should remain on the CPU) as determined by compiler
-    // analysis.
-    if (cpuOps.contains(op))
-      continue;
-    // Now we have an operation that can work on the NNPA, check if its
-    // beneficial
-    if (useZHighPerfModel && !isOpFasterOnNNPA(op, &dimAnalysis)) {
-      op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, CPU_DEVICE));
-      continue;
-    }
-    // Compiler determined that we want this op on the NNPA, mark as such.
-    op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, NNPA_DEVICE));
-  }
+  initPlacementHeuristic();
+  if (placementHeuristic == QualifyingOps)
+    PlaceAllLegalOpsOnNNPA(context, ops, cpuOps);
+  else if (placementHeuristic == FasterOps)
+    PlaceBeneficialOpsOnNNPA(context, ops, &dimAnalysis, cpuOps);
+  else if (placementHeuristic == FasterOpsWSU)
+    PlaceBeneficialOpsOnNNPAWithStickUnstick(
+        context, module, ops, &dimAnalysis, cpuOps);
+  else if (placementHeuristic == MuchFasterOpsWSU)
+    PlaceBeneficialOpsOnNNPAWithStickUnstick(context, module, ops, &dimAnalysis,
+        cpuOps, /*min factor*/ 3.0, /*significant CPU Factor*/ 2.0,
+        /*significant NNPA Factor*/ 8.0);
 
   // Create a JSON configuration file if required.
   if (!saveConfigFile.empty())
@@ -306,9 +320,9 @@ std::unique_ptr<mlir::Pass> createDevicePlacementPass() {
 
 std::unique_ptr<mlir::Pass> createDevicePlacementPass(
     std::string loadConfigFile, std::string saveConfigFile,
-    bool useZHighPerfModel) {
+    NNPAPlacementHeuristic placementHeuristic) {
   return std::make_unique<DevicePlacementPass>(
       loadConfigFile, saveConfigFile, placementHeuristic);
 }
 
 } // namespace onnx_mlir
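For callers assembling their own pipeline, the heuristic is now picked at pass-construction time. A minimal sketch, mirroring the addPassesNNPA call above (the header paths and pass-manager setup are assumptions, not part of this patch):

  #include "mlir/Pass/PassManager.h"

  #include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
  #include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"

  // Sketch: schedule device placement with the stick/unstick-aware heuristic
  // and save the decisions to a JSON file for later replay via
  // load-config-file.
  static void scheduleDevicePlacement(mlir::PassManager &pm) {
    pm.addPass(onnx_mlir::createDevicePlacementPass(
        /*loadConfigFile=*/"", /*saveConfigFile=*/"placement.json",
        onnx_mlir::FasterOpsWSU));
  }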
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
new file mode 100644
index 0000000000..dc3beb3ebe
--- /dev/null
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.cpp
@@ -0,0 +1,496 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------- DevicePlacementHeuristic.cpp - Place ops using model -------===//
+//
+// Copyright 2023 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file contains heuristics to place operations on CPU or NNPA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/Debug.h"
+
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
+#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+#include "src/Dialect/ONNX/ONNXOps/OpHelper.hpp"
+
+#include
+#include
+
+#define DEBUG_TYPE "device-placement-heuristic"
+#define DEBUG 2
+
+using namespace mlir;
+using namespace onnx_mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Support to classify ops.
+
+bool isMappedToDevice(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && !device.getValue().empty();
+}
+
+bool isMappedToCPU(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && device.getValue().equals_insensitive(CPU_DEVICE);
+}
+
+bool isMappedToNNPA(Operation *op) {
+  StringAttr device = op->getAttrOfType<StringAttr>(DEVICE_ATTRIBUTE);
+  return device && device.getValue().equals_insensitive(NNPA_DEVICE);
+}
+
+// Determine if op is NNPA friendly. An op is unfriendly when it's not an ONNX
+// op of interest, or when it is already mapped to the CPU device.
+bool isNNPAFriendlyOp(Operation *op) {
+  if (op->getDialect()->getNamespace() != ONNXDialect::getDialectNamespace())
+    return false;
+  // These ops are NNPA unfriendly. Constants are friendly.
+  if (isa(op))
+    return false;
+  // If `device` is already set to CPU, it is NNPA unfriendly.
+  if (isMappedToCPU(op))
+    return false;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for op assignment.
+
+// Return true with a debug message reporting reason for success on NNPA.
+inline bool fasterOnNNPA(Operation *op, bool significant = false) {
+  LLVM_DEBUG({
+    if (significant)
+      llvm::dbgs() << "  Significantly faster ";
+    else
+      llvm::dbgs() << "  Faster ";
+    llvm::dbgs() << "on NNPA model for op:";
+    op->dump();
+  });
+  return true;
+}
+
+// Return false with a debug message reporting reason for failure on NNPA.
+inline bool fasterOnCPU(Operation *op, bool significant = false) {
+  LLVM_DEBUG({
+    if (significant)
+      llvm::dbgs() << "  Significantly faster ";
+    else
+      llvm::dbgs() << "  Faster ";
+    llvm::dbgs() << "on CPU model for op:";
+    op->dump();
+  });
+  return false;
+}
+
+inline void assignToNNPA(Operation *op, MLIRContext *context) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Assign to NNPA:";
+    op->dump();
+  });
+  op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, NNPA_DEVICE));
+}
+
+inline void assignToCPU(Operation *op, MLIRContext *context) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Assign to CPU:";
+    op->dump();
+  });
+  op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, CPU_DEVICE));
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for the simple cost model analysis, based solely on one
+// operation.
+
+// Simply determine if the operation is faster on CPU or NNPA.
+bool isOpFasterOnNNPA(Operation *op, const DimAnalysis *dimAnalysis) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "\nTest cost-benefit of CPU/NNPA for op\n";
+    op->dump();
+  });
+  // Estimate time.
+  double cpuTime, nnpaTime;
+  if (!estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime)) {
+    // No performance model for this operation, assume faster on NNPA.
+    cpuTime = 1;
+    nnpaTime = 0;
+  }
+  if (nnpaTime < cpuTime)
+    return fasterOnNNPA(op);
+  return fasterOnCPU(op);
+}
+
+//===----------------------------------------------------------------------===//
+// Support functions for the cost/benefit analysis of an operation that takes
+// stick/unstick into account.
+
+struct DevicePlacementWithStickUnstickCost {
+  DevicePlacementWithStickUnstickCost() = delete;
+  DevicePlacementWithStickUnstickCost(MLIRContext *context, ModuleOp module,
+      const DimAnalysis *dimAnalysis, const OpSetType &cpuOps)
+      : context(context), dimAnalysis(dimAnalysis), cpuOps(cpuOps) {
+    characterizeOps(module);
+  }
+
+  // Data.
+  MLIRContext *context;
+  const DimAnalysis *dimAnalysis;
+  // All ops that must execute on CPU, aka are not eligible to run on NNPA.
+  // Ops in this set can be marked as device=CPU.
+  const OpSetType &cpuOps;
+  // All ops that may execute on NNPA. Ops in this set can be marked as
+  // device=CPU or NNPA.
+  OpSetType nnpaCandidateOps;
+  // All ops that run on CPU but do not require stick/unstick at runtime. Ops
+  // in this set can be marked as device=CPU.
+  OpSetType nnpaNeutralOps;
+
+  void characterizeOps(ModuleOp module) {
+    nnpaCandidateOps.clear();
+    nnpaNeutralOps.clear();
+    module.walk([&](Operation *op) -> WalkResult {
+      // Skip ops that are NNPA unfriendly, such as ops already assigned to
+      // the CPU.
+      if (!isNNPAFriendlyOp(op))
+        return WalkResult::advance();
+      // Ops that cannot/may not go on NNPA but can operate on NNPA data "for
+      // free" are included here in the NNPA neutral ops. We assume here (not
+      // really true) that transpose and reshape can carry the stickified data.
+      if (isa<ONNXTransposeOp, ONNXReshapeOp>(op)) {
+        nnpaNeutralOps.insert(op);
+        return WalkResult::advance();
+      }
+      // Skip ops that the compiler determined are not suitable for NNPA.
+      if (cpuOps.contains(op))
+        return WalkResult::advance();
+      // Remaining ops can be mapped to NNPA.
+      nnpaCandidateOps.insert(op);
+      return WalkResult::advance();
+    });
+#if DEBUG >= 2
+    LLVM_DEBUG({
+      llvm::dbgs() << "\nCPU Ops:\n";
+      for (auto op : cpuOps) {
+        if (isa(op))
+          continue;
+        llvm::dbgs() << "cpu ";
+        op->dump();
+      }
+      llvm::dbgs() << "\nNNPA Neutral Ops:\n";
+      for (auto op : nnpaNeutralOps) {
+        if (isa(op))
+          continue;
+        llvm::dbgs() << "neutral ";
+        op->dump();
+      }
+      llvm::dbgs() << "\nNNPA Candidate Ops:\n";
+      for (auto op : nnpaCandidateOps) {
+        llvm::dbgs() << "candidate ";
+        op->dump();
+      }
+    });
+#endif
+  }
+
+  void classifyValueUsage(Value value, Operation *opToSkip, int64_t &cpuOpCount,
+      int64_t &nnpaOpCount, int64_t &nnpaCandidateOpCount,
+      int64_t &nnpaNeutralOpCount) {
+    cpuOpCount = nnpaOpCount = nnpaCandidateOpCount = nnpaNeutralOpCount = 0;
+
+    std::string msg = "";
+    for (Operation *userOp : value.getUsers()) {
+      // Skip op if requested.
+      if (userOp == opToSkip) {
+        LLVM_DEBUG(msg = " Skipped op.");
+        // Test ops that are already mapped.
+      } else if (isMappedToCPU(userOp))
+        cpuOpCount++;
+      else if (isMappedToNNPA(userOp))
+        nnpaOpCount++;
+      // Not mapped; now test ops that are candidates to execute on NNPA.
+      else if (nnpaCandidateOps.contains(userOp))
+        nnpaCandidateOpCount++;
+      // Not a candidate; now test ops that are neutral to NNPA.
+      else if (nnpaNeutralOps.contains(userOp))
+        nnpaNeutralOpCount++;
+      // None of the above; will be on CPU.
+      else
+        cpuOpCount++;
+    }
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Use pattern for value from "
+                   << value.getDefiningOp()->getName() << ": used by CPU "
+                   << cpuOpCount << ", NNPA " << nnpaOpCount
+                   << ", NNPA candidates " << nnpaCandidateOpCount
+                   << ", neutral " << nnpaNeutralOpCount << "." << msg << "\n";
+    });
+  }
+
+  // Cost/benefit analysis of moving this op X to the NNPA, with respect to
+  // the ops that are using the results of op X. Positive costs are additional
+  // costs of having op X on the NNPA; negative costs are benefits of having
+  // op X on the NNPA.
+  double costBenefitIncurredForResults(Operation *opX) {
+    assert(!isMappedToDevice(opX) && "cannot evaluate an already mapped op");
+    double totalCostBenefit = 0;
+    LLVM_DEBUG(llvm::dbgs() << "  Look at cost benefit for results:\n");
+    for (Value resVal : opX->getResults()) {
+      // Look at all the users of resVal and classify them.
+      int64_t cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount;
+      classifyValueUsage(resVal, /*skip op*/ nullptr, cpuOpCount, nnpaOpCount,
+          nnpaCandidateOpCount, nnpaNeutralOpCount);
+      /*
+       Case study:
+       1) Op X remains on CPU     |  2) Op X migrates to NNPA:
+               X.CPU              |          X.NNPA
+              /  |  \             |         /  |  \
+             / stick? stick       |  unstick unstick? \
+            /    |      \         |       /     |      \
+          CPU Candidate NNPA      |     CPU Candidate  NNPA
+               on NNPA            |          on CPU
+
+       placing X on NNPA:         |
+       cost:                      |  +1 unstick if has CPU users
+       benefit:                   |  -1 stick if has NNPA users
+
+       TODO: If we migrate X to NNPA, we could attribute some benefits for
+       having users that are NNPA.
+      */
+      double costOfUnstickOp = estimateTimeForUnstickOp(resVal);
+      double costOfStickOp = estimateTimeForStickOp(resVal);
+      if (cpuOpCount > 0) {
+        // Moving this op to NNPA will cost one unstick, as there are one or
+        // more ops that must execute on CPU.
+        LLVM_DEBUG(
+            llvm::dbgs() << "    +1 unstick: " << costOfUnstickOp << "\n");
+        totalCostBenefit += costOfUnstickOp;
+      }
+      if (nnpaOpCount > 0) {
+        // Moving this op to NNPA will remove the need to stick this result.
+        LLVM_DEBUG(
+            llvm::dbgs() << "    -1 stick: " << -costOfStickOp << "\n");
+        totalCostBenefit -= costOfStickOp;
+      }
+    }
+    return totalCostBenefit;
+  }
+
+  // Cost/benefit analysis of moving this op X to the NNPA, with respect to
+  // the ops that define the inputs of op X. Positive costs are additional
+  // costs of having op X on the NNPA; negative costs are benefits of having
+  // op X on the NNPA.
+  double costBenefitIncurredForInputs(Operation *opX) {
+    assert(!isMappedToDevice(opX) && "cannot evaluate an already mapped op");
+    double totalCostBenefit = 0;
+    LLVM_DEBUG(llvm::dbgs() << "  Look at cost benefit for inputs:\n");
+    OpSetType visitedDefiningOps;
+    for (Value inputVal : opX->getOperands()) {
+      // Investigate the operation that defines inputVal (which is used by
+      // opX).
+      Operation *definingOp = inputVal.getDefiningOp();
+      if (!definingOp)
+        continue;
+      // If we have AddOp(%3, %3), we should visit the cost associated with
+      // the %3 input only once.
+      if (visitedDefiningOps.contains(definingOp)) {
+        LLVM_DEBUG(llvm::dbgs() << "    has multiple uses of same input\n");
+        continue;
+      }
+      visitedDefiningOps.insert(definingOp);
+
+      // Classify all other users of this input value.
+      int64_t cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount;
+      classifyValueUsage(inputVal, /*skip op X that we are analyzing*/ opX,
+          cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount);
+      /*
+       Case study:
+       3) Op X remains on CPU             |  4) Op X remains on CPU
+             def.CPU ----.                |      def.NNPA -----.
+            /  |  \       \               |     /    |     \     \
+           / stick? stick  \              | unstick unstick? \    unstick
+          /    |      \     \             |    /     |        \     \
+        CPU Candidate NNPA  X.CPU         |  CPU Candidate   NNPA   X.CPU
+            on NNPA                       |       on CPU
+
+       5) Op X migrates to NNPA           |  6) Op X migrates to NNPA
+             def.CPU ----.                |      def.NNPA -----.
+            /  |  \       \               |     /    |     \     \
+           / stick? stick  stick          | unstick unstick? \     \
+          /    |      \      \            |    /     |        \      \
+        CPU Candidate NNPA  X.NNPA        |  CPU Candidate   NNPA   X.NNPA
+            on NNPA                       |       on CPU
+
+       placing X on NNPA:                 |
+       cost: +1 stick if first NNPA       |
+       benefit:                           |  -1 stick
+      */
+      double costOfStickOp = estimateTimeForStickOp(inputVal);
+      if (isMappedToCPU(definingOp) ||
+          !(nnpaCandidateOps.contains(definingOp) ||
+              nnpaNeutralOps.contains(definingOp))) {
+        // Case 5.
+        if (nnpaOpCount == 0) {
+          LLVM_DEBUG(llvm::dbgs() << "    def-op on cpu (case 5), +1 stick "
+                                  << costOfStickOp << ".\n");
+          totalCostBenefit += costOfStickOp;
+        }
+      }
+      if (isMappedToNNPA(definingOp)) {
+        // Case 6.
+        LLVM_DEBUG(llvm::dbgs() << "    def-op on NNPA (case 6), -1 stick "
+                                << -costOfStickOp << ".\n");
+        totalCostBenefit -= costOfStickOp;
+      }
+    }
+    return totalCostBenefit;
+  }
+
+  bool significantlyFaster(double fast, double slow, double factor) {
+    // At least factor x faster.
+    return factor * fast <= slow;
+  }
+
+  // Determine if op is faster on the NNPA or not. To be deemed faster than
+  // the CPU, the NNPA (including overheads) has to be at least minFactor
+  // times faster than the CPU. Significant is set when the op is at least
+  // significantNNPAFactor / significantCPUFactor times faster on that device.
+  bool isOpFasterOnNNPA(Operation *op, double minFactor,
+      double significantCPUFactor, double significantNNPAFactor,
+      bool &significant) {
+    LLVM_DEBUG({
+      llvm::dbgs()
+          << "\nTest cost-benefit with stick/unstick of CPU/NNPA for op\n";
+      op->dump();
+    });
+    // Estimate time.
+    double cpuTime, nnpaTime, nnpaTimeWithOverheads;
+    if (estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime)) {
+      // Has a performance model; account for stick/unstick.
+      double useCostBenefit = costBenefitIncurredForResults(op);
+      double inputCostBenefit = costBenefitIncurredForInputs(op);
+      nnpaTimeWithOverheads = nnpaTime + useCostBenefit + inputCostBenefit;
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  New estimated nnpa time with stick/unstick: "
+                 << nnpaTimeWithOverheads << " vs cpu " << cpuTime << ".\n");
+    } else {
+      // No performance model for this operation; assume faster on NNPA.
+      cpuTime = 10;
+      nnpaTime = nnpaTimeWithOverheads = 1;
+      LLVM_DEBUG(llvm::dbgs() << "  no time estimate, assume NNPA better.\n");
+    }
+    if (nnpaTimeWithOverheads * minFactor <= cpuTime) {
+      // For significant, don't take overheads into account, as they may
+      // change depending on the mapping.
+      significant =
+          significantlyFaster(nnpaTime, cpuTime, significantNNPAFactor);
+      return fasterOnNNPA(op, significant);
+    }
+    // For significant, don't take overheads into account, as they may change
+    // depending on the mapping.
+    significant = significantlyFaster(cpuTime, nnpaTime, significantCPUFactor);
+    return fasterOnCPU(op, significant);
+  }
+
+}; // DevicePlacementWithStickUnstickCost
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Exported heuristics for device placement.
+
+namespace onnx_mlir {
+
+void PlaceAllLegalOpsOnNNPA(MLIRContext *context,
+    const SmallVector &ops, const OpSetType &cpuOps) {
+  for (Operation *op : ops) {
+    if (isMappedToDevice(op))
+      continue;
+    // Op that cannot go on NNPA.
+    if (cpuOps.contains(op))
+      continue;
+    // Compiler determined that we want this op on the NNPA, mark as such.
+    assignToNNPA(op, context);
+  }
+}
+
+void PlaceBeneficialOpsOnNNPA(MLIRContext *context,
+    const SmallVector &ops, const DimAnalysis *dimAnalysis,
+    const OpSetType &cpuOps) {
+  for (Operation *op : ops) {
+    if (isMappedToDevice(op))
+      continue;
+    // Op that cannot go on NNPA.
+    if (cpuOps.contains(op))
+      continue;
+    // Now we have an operation that can work on the NNPA; check if it's
+    // beneficial.
+    if (!isOpFasterOnNNPA(op, dimAnalysis)) {
+      assignToCPU(op, context);
+      continue;
+    }
+    // Compiler determined that we want this op on the NNPA, mark as such.
+    assignToNNPA(op, context);
+  }
+}
+
+void PlaceBeneficialOpsOnNNPAWithStickUnstick(MLIRContext *context,
+    ModuleOp module, const SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps, double minFactor,
+    double significantCPUFactor, double significantNNPAFactor) {
+  // Init model.
+  DevicePlacementWithStickUnstickCost model(
+      context, module, dimAnalysis, cpuOps);
+  int64_t ub = 5;
+  int64_t i = 0;
+  while (i < ub) {
+    int64_t modified = 0;
+    bool first = (i == 0);
+    bool last = (i == ub - 1);
+    LLVM_DEBUG(llvm::dbgs() << "\n\n\nPlacement Iteration " << i << "\n\n");
+    for (Operation *op : ops) {
+      if (isMappedToDevice(op))
+        continue;
+      // Op that cannot go on NNPA.
+      if (cpuOps.contains(op))
+        continue;
+      // Now we have an operation that can work on the NNPA; check if it's
+      // beneficial.
+      bool significant;
+      if (!model.isOpFasterOnNNPA(op, minFactor, significantCPUFactor,
+              significantNNPAFactor, significant)) {
+        if (last || significant) {
+          modified++;
+          assignToCPU(op, context);
+        }
+        continue;
+      }
+      // Compiler determined that we want this op on the NNPA, mark as such.
+      if (!first || significant) {
+        modified++;
+        assignToNNPA(op, context);
+      }
+    }
+    if (last) {
+      break;
+    } else if (first) {
+      LLVM_DEBUG(llvm::dbgs() << "\nFirst, go on.\n");
+      ++i;
+    } else if (modified) {
+      LLVM_DEBUG(llvm::dbgs() << "\nHad " << modified << " changes, go on.\n");
+      ++i;
+    } else {
+      LLVM_DEBUG(llvm::dbgs() << "\nHad no changes, skip to last iter.\n");
+      i = ub - 1;
+    }
+  }
+}
+
+} // namespace onnx_mlir
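To make the decision arithmetic in isOpFasterOnNNPA above concrete, here is a small self-contained sketch with made-up times in arbitrary units (illustrative numbers only, not measured data):

  #include <cassert>

  // Worked example of the FasterOpsWSU / MuchFasterOpsWSU decision rule:
  // assign to the NNPA when nnpaTimeWithOverheads * minFactor <= cpuTime,
  // with significance tested on the raw (overhead-free) times.
  int main() {
    double cpuTime = 10.0, nnpaTime = 2.0;
    double resultsCostBenefit = +1.5; // one unstick added for a CPU user
    double inputsCostBenefit = -0.5;  // one stick saved from an NNPA producer
    double nnpaTimeWithOverheads =
        nnpaTime + resultsCostBenefit + inputsCostBenefit; // 3.0
    // FasterOpsWSU defaults (minFactor = 1.1): the op goes to the NNPA.
    assert(nnpaTimeWithOverheads * 1.1 <= cpuTime);
    // MuchFasterOpsWSU (minFactor = 3.0): still qualifies as faster...
    assert(nnpaTimeWithOverheads * 3.0 <= cpuTime);
    // ...but is not "significant" (significantNNPAFactor = 8.0, raw times),
    // so it is not seeded in the first iteration; it is mapped in a later
    // iteration, after its significantly faster neighbors have been placed.
    assert(!(8.0 * nnpaTime <= cpuTime));
    return 0;
  }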
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp
new file mode 100644
index 0000000000..d03647fcd7
--- /dev/null
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacementHeuristic.hpp
@@ -0,0 +1,87 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//===-------- DevicePlacementHeuristic.hpp - Place ops using model -------===//
+//
+// Copyright 2023 The IBM Research Authors.
+//
+// =============================================================================
+//
+// This file contains heuristics to place operations on CPU or NNPA.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "mlir/IR/BuiltinOps.h"
+
+#include "src/Dialect/ONNX/ONNXDimAnalysis.hpp"
+
+namespace onnx_mlir {
+
+using OpSetType = mlir::DenseSet<mlir::Operation *>;
+
+/**
+ * Place all ops that qualify for NNPA execution on the NNPA.
+ *
+ * @param context Context of the model.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param cpuOps Set of ops that must execute on CPU.
+ */
+void PlaceAllLegalOpsOnNNPA(mlir::MLIRContext *context,
+    const llvm::SmallVector &ops,
+    const OpSetType &cpuOps);
+
+/**
+ * Place ops that qualify for NNPA execution on the NNPA when they are
+ * estimated to run faster on the NNPA.
+ *
+ * @param context Context of the model.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param dimAnalysis Pointer to the dimension analysis tool for
+ * disambiguating dynamic shape dimensions.
+ * @param cpuOps Set of ops that must execute on CPU.
+ */
+void PlaceBeneficialOpsOnNNPA(mlir::MLIRContext *context,
+    const llvm::SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps);
+
+/**
+ * Place ops that qualify for NNPA execution on the NNPA when they are
+ * estimated to run faster on the NNPA, including the cost of the stick and
+ * unstick operations necessary for NNPA execution. The algorithm first seeds
+ * the CPU/NNPA with the operations that are significantly faster on that
+ * device. It then aims to add operations to the NNPA when the new operations
+ * are faster even after including the additional stick/unstick (if any)
+ * required by these less significantly faster NNPA operations. The three
+ * factors below modify the sensitivity at which ops are assigned to the NNPA.
+ *
+ * @param context Context of the model.
+ * @param module Module containing the ops; used to characterize all ops.
+ * @param ops ONNX ops that should be considered in the device assignment.
+ * @param dimAnalysis Pointer to the dimension analysis tool for
+ * disambiguating dynamic shape dimensions.
+ * @param cpuOps Set of ops that must execute on CPU.
+ * @param minFactor The NNPA (including stick/unstick) has to be at least
+ * minFactor times faster than the CPU for an op to be assigned to the NNPA.
+ * @param significantCPUFactor The CPU has to be at least significantCPUFactor
+ * times faster than the NNPA to seed/force computations on the CPU.
+ * @param significantNNPAFactor The NNPA has to be at least
+ * significantNNPAFactor times faster than the CPU to seed/force computations
+ * on the NNPA.
+ *
+ * @note The significantCPUFactor can be smaller than significantNNPAFactor:
+ * when an op is clearly not a good fit for the NNPA, we may as well seed the
+ * computation on the CPU. The significantNNPAFactor may be set much higher
+ * when we only want to send the really beneficial ops to the NNPA. By
+ * combining a high significantNNPAFactor with a large minFactor, the
+ * heuristic will put only ops that are really beneficial on the NNPA.
+ */
+void PlaceBeneficialOpsOnNNPAWithStickUnstick(mlir::MLIRContext *context,
+    mlir::ModuleOp module, const llvm::SmallVector &ops,
+    const DimAnalysis *dimAnalysis, const OpSetType &cpuOps,
+    double minFactor = 1.1, double significantCPUFactor = 2.0,
+    double significantNNPAFactor = 3.0);
+
+} // namespace onnx_mlir
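The heuristics above consume the performance model through the small API declared in PerfModel.hpp (further below). A hypothetical caller, mirroring the simple single-op heuristic (pickNNPA is not part of the patch):

  // Returns true when the model predicts the op to be faster on the NNPA;
  // ops without a model are optimistically assumed to be faster on the NNPA.
  static bool pickNNPA(
      mlir::Operation *op, const onnx_mlir::DimAnalysis *dimAnalysis) {
    double cpuTime, nnpaTime;
    if (!onnx_mlir::estimateTimeForOpWithModel(
            op, dimAnalysis, cpuTime, nnpaTime))
      return true; // No model: assume NNPA is faster.
    return nnpaTime < cpuTime;
  }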
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
index 7e29bf62d0..61157e1a96 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.cpp
@@ -36,25 +36,7 @@ using PERF_MODEL4 = std::function<double(double, double, double, double)>;
 #include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc"
 
 //===----------------------------------------------------------------------===//
-// Support functions for reporting.
-
-// Return true with a debug message reporting reason for success on NNPA.
-inline bool fasterOnNNPA(Operation *op, std::string msg) {
-  LLVM_DEBUG({
-    llvm::dbgs() << "  Faster on NNPA: " << msg << " For op:";
-    op->dump();
-  });
-  return true;
-}
-
-// Return false with a debug message reporting reason for failure on NNPA.
-inline bool fasterOnCPU(Operation *op, std::string msg) {
-  LLVM_DEBUG({
-    llvm::dbgs() << "  Faster on CPU: " << msg << " For op:";
-    op->dump();
-  });
-  return false;
-}
+// Support functions
 
 // Summarize higher dims (leaving ub..rank-1 untouched). If none, return size
 // of 1. Otherwise, returns the cumulative multiplication of each of the static
@@ -76,10 +58,8 @@ inline int64_t summarizeHigherDims(
 //===----------------------------------------------------------------------===//
 // Support for unary/binary elementwise with possibly unknown dimensions.
 
-bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
-    const DimAnalysis *dimAnalysis, PERF_MODEL3 modelForCPU,
-    PERF_MODEL3 modelForNNPA) {
-
+void processDim(Value oper, int64_t &e4, int64_t &e3, int64_t &e2, int64_t &e1,
+    std::string &msg) {
   // At this time, use only 1 of the two operands.
   ShapedType operType = oper.getType().dyn_cast_or_null<ShapedType>();
   assert(operType && operType.hasRank() && "expected shaped type with rank");
@@ -88,12 +68,12 @@ bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
   llvm::ArrayRef<int64_t> shape = operType.getShape();
   // Gather all 4th...nth ranked shape together. If all dynamic; assume size
   // of 1.
-  std::string msg = "";
+  LLVM_DEBUG(msg = "");
   bool hasDynamicE4;
-  int64_t e4 = summarizeHigherDims(shape, operRank - 3, hasDynamicE4);
-  int64_t e3 = operRank >= 3 ? shape[operRank - 3] : 1;
-  int64_t e2 = operRank >= 2 ? shape[operRank - 2] : 1;
-  int64_t e1 = operRank >= 1 ? shape[operRank - 1] : 1;
+  e4 = summarizeHigherDims(shape, operRank - 3, hasDynamicE4);
+  e3 = operRank >= 3 ? shape[operRank - 3] : 1;
+  e2 = operRank >= 2 ? shape[operRank - 2] : 1;
+  e1 = operRank >= 1 ? shape[operRank - 1] : 1;
   // Handle dynamic shapes, eventually it would be good to have ranges given by
   // the user.
   if (hasDynamicE4) {
@@ -111,20 +91,31 @@ bool isElementwiseOpFasterOnNNPA(Operation *op, Value oper,
     e1 = 64; // Assume full.
     LLVM_DEBUG(msg += " E1=64: dyn, assume full tile.");
   }
-  double nnpaEstimatedTime = modelForNNPA(e4 * e3, e2, e1);
-  double cpuEstimatedTime = modelForCPU(e4 * e3, e2, e1);
-  LLVM_DEBUG(fprintf(stderr, "  Estimated times: nnpa %f, cpu %f\n",
-      nnpaEstimatedTime, cpuEstimatedTime));
-  if (nnpaEstimatedTime < cpuEstimatedTime)
-    return fasterOnNNPA(op, "Model estimates faster time on NNPA." + msg);
-  return fasterOnCPU(op, "Model estimates faster time on CPU." + msg);
+}
+
+void estimateTimeForElementwiseOp(Operation *op, Value oper,
+    const DimAnalysis *dimAnalysis, PERF_MODEL3 modelForCPU,
+    PERF_MODEL3 modelForNNPA, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime) {
+
+  // Process dim (collapse and handle dynamic sizes).
+  int64_t e4, e3, e2, e1;
+  std::string msg;
+  processDim(oper, e4, e3, e2, e1, msg);
+
+  cpuEstimatedTime = modelForCPU(e4 * e3, e2, e1);
+  nnpaEstimatedTime = modelForNNPA(e4 * e3, e2, e1);
+  LLVM_DEBUG(llvm::dbgs() << "  Estimated times for op " << op->getName()
                          << ": nnpa " << nnpaEstimatedTime << ", cpu "
                          << cpuEstimatedTime << "." << msg.c_str() << "\n");
+}
 
 //===----------------------------------------------------------------------===//
 // Support for matmul with possibly unknown dimensions.
 
-bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
-    bool bTransposed, const DimAnalysis *dimAnalysis) {
+void estimateTimeForMatMulOp(Operation *op, Value a, Value b, bool aTransposed,
+    bool bTransposed, const DimAnalysis *dimAnalysis, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime) {
   // Scanning A.
   ShapedType aType = a.getType().dyn_cast_or_null<ShapedType>();
   assert(aType && aType.hasRank() && "expected shaped type with A rank");
@@ -152,7 +143,8 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
   int64_t N = aN, M = aM, K = bK;
   // Rules common to matmul with/without broadcast.
   // Ideally we would have ranges to estimate cost when dynamic.
-  std::string msg = "";
+  std::string msg;
+  LLVM_DEBUG(msg = "");
   // Assume the broadcast B dim of the matmul will be small.
   if (aBDynamic) {
     LLVM_DEBUG(msg += " B+ for input A: assume size 1 for dynamic dims.");
@@ -216,14 +208,15 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
       hasBroadcast /* no perf measurement yet for broadcast case*/) {
     // For no broadcast, pick the largest B dimension.
     int64_t B = std::max(aB, bB);
-    double nnpaEstimatedTime = estimatedTimeForNNPA_MatMul_3ds(B, N, M, K);
-    double cpuEstimatedTime = estimatedTimeForCPU_MatMul_3ds(B, N, M, K);
-    LLVM_DEBUG(fprintf(stderr,
-        "  Times for matmul: nnpa %f, cpu %f with dim (%i, %i, %i, %d)\n",
-        nnpaEstimatedTime, cpuEstimatedTime, (int)B, (int)N, (int)M, (int)K));
-    if (nnpaEstimatedTime < cpuEstimatedTime)
-      return fasterOnNNPA(op, "Model estimates faster time on NNPA." + msg);
-    return fasterOnCPU(op, "Model estimates faster time on CPU." + msg);
+    nnpaEstimatedTime = estimatedTimeForNNPA_MatMul_3ds(B, N, M, K);
+    cpuEstimatedTime = estimatedTimeForCPU_MatMul_3ds(B, N, M, K);
+    LLVM_DEBUG(llvm::dbgs()
+               << "  Estimated times for op " << op->getName() << " with dim ("
+               << B << ", " << N << ", " << M << ", " << K << "): nnpa "
+               << nnpaEstimatedTime << ", cpu " << cpuEstimatedTime << "."
+               << msg.c_str() << "\n");
+
+    return;
   }
   llvm_unreachable("should not get here");
 }
@@ -232,189 +225,240 @@ bool isMatMulOpFasterOnNNPA(Operation *op, Value a, Value b, bool aTransposed,
 
 //===----------------------------------------------------------------------===//
 // Processing for each op: binary elementwise.
 template <typename OP_TYPE>
-bool checkIfOpFasterOnNNPA(OP_TYPE op, const DimAnalysis *dimAnalysis) {
+void estimateTimeForOp(OP_TYPE op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
   llvm_unreachable("should have a model for all defined ops");
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXAddOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Add_3ds, estimatedTimeForNNPA_Add_3ds);
+void estimateTimeForOp(ONNXAddOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Add_3ds, estimatedTimeForNNPA_Add_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXDivOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Div_3ds, estimatedTimeForNNPA_Div_3ds);
+void estimateTimeForOp(ONNXDivOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Div_3ds, estimatedTimeForNNPA_Div_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMaxOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Max_3ds, estimatedTimeForNNPA_Max_3ds);
+void estimateTimeForOp(ONNXMaxOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Max_3ds, estimatedTimeForNNPA_Max_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMinOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Min_3ds, estimatedTimeForNNPA_Min_3ds);
+void estimateTimeForOp(ONNXMinOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Min_3ds, estimatedTimeForNNPA_Min_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXMulOp op, const DimAnalysis *dimAnalysis) {
-  return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-      dimAnalysis, estimatedTimeForCPU_Mul_3ds, estimatedTimeForNNPA_Mul_3ds);
+void estimateTimeForOp(ONNXMulOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis,
+      estimatedTimeForCPU_Mul_3ds, estimatedTimeForNNPA_Mul_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
 }
 
 template <>
-bool checkIfOpFasterOnNNPA(
-    ONNXPowOp op, const DimAnalysis *dimAnalysis) {
+void estimateTimeForOp(ONNXPowOp op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
   int64_t exponentValue;
   if (hasIntegerPowerExponent(&op, exponentValue)) {
     if (exponentValue == 2)
-      return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0),
-          dimAnalysis, estimatedTimeForCPU_Pow_2_3ds,
-          estimatedTimeForNNPA_Pow_2_3ds);
+      estimateTimeForElementwiseOp(op.getOperation(),
op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow2_3ds, + estimatedTimeForNNPA_Pow2_3ds, cpuEstimatedTime, nnpaEstimatedTime); if (exponentValue == 3) - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_3_3ds, - estimatedTimeForNNPA_Pow_3_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow3_3ds, + estimatedTimeForNNPA_Pow3_3ds, cpuEstimatedTime, nnpaEstimatedTime); if (exponentValue == 4) - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_4_3ds, - estimatedTimeForNNPA_Pow_4_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), + dimAnalysis, estimatedTimeForCPU_Pow4_3ds, + estimatedTimeForNNPA_Pow4_3ds, cpuEstimatedTime, nnpaEstimatedTime); } // For other power exponent, just use pow of 8. - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Pow_8_3ds, - estimatedTimeForNNPA_Pow_8_3ds); + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis, + estimatedTimeForCPU_Pow8_3ds, estimatedTimeForNNPA_Pow8_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSubOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - dimAnalysis, estimatedTimeForCPU_Sub_3ds, estimatedTimeForNNPA_Sub_3ds); +void estimateTimeForOp(ONNXSubOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(0), dimAnalysis, + estimatedTimeForCPU_Sub_3ds, estimatedTimeForNNPA_Sub_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } //===----------------------------------------------------------------------===// // Processing for each op: unary elementwise. 
template <> -bool checkIfOpFasterOnNNPA( - ONNXExpOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Exp_3ds, estimatedTimeForNNPA_Exp_3ds); +void estimateTimeForOp(ONNXExpOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Exp_3ds, estimatedTimeForNNPA_Exp_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXLogOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Log_3ds, estimatedTimeForNNPA_Log_3ds); +void estimateTimeForOp(ONNXLogOp op, const DimAnalysis *dimAnalysis, + double &cpuEstimatedTime, double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Log_3ds, estimatedTimeForNNPA_Log_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXReluOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Relu_3ds, estimatedTimeForNNPA_Relu_3ds); +void estimateTimeForOp(ONNXReluOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Relu_3ds, estimatedTimeForNNPA_Relu_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSigmoidOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Sigmoid_3ds, - estimatedTimeForNNPA_Sigmoid_3ds); +void estimateTimeForOp(ONNXSigmoidOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Sigmoid_3ds, estimatedTimeForNNPA_Sigmoid_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXSoftmaxOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Softmax_3ds, - estimatedTimeForNNPA_Softmax_3ds); +void estimateTimeForOp(ONNXSoftmaxOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Softmax_3ds, estimatedTimeForNNPA_Softmax_3ds, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXTanhOp op, const DimAnalysis *dimAnalysis) { - return isElementwiseOpFasterOnNNPA(op.getOperation(), op.getOperand(), - dimAnalysis, estimatedTimeForCPU_Tanh_3ds, estimatedTimeForNNPA_Tanh_3ds); +void estimateTimeForOp(ONNXTanhOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_Tanh_3ds, estimatedTimeForNNPA_Tanh_3ds, + cpuEstimatedTime, nnpaEstimatedTime); +} + +//===----------------------------------------------------------------------===// +// Processing for each op: ReduceMean. 
+ +template <> +void estimateTimeForOp(ONNXReduceMeanV13Op op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis, + estimatedTimeForCPU_ReduceMean_4d, estimatedTimeForNNPA_ReduceMean_4d, + cpuEstimatedTime, nnpaEstimatedTime); } //===----------------------------------------------------------------------===// // Processing for each op: MatMul. template <> -bool checkIfOpFasterOnNNPA( - ONNXMatMulOp op, const DimAnalysis *dimAnalysis) { - return isMatMulOpFasterOnNNPA(op.getOperation(), op.getOperand(0), - op.getOperand(1), false /*a transposed*/, false /*b transposed*/, - dimAnalysis); +void estimateTimeForOp(ONNXMatMulOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForMatMulOp(op.getOperation(), op.getOperand(0), op.getOperand(1), + false /*a transposed*/, false /*b transposed*/, dimAnalysis, + cpuEstimatedTime, nnpaEstimatedTime); } template <> -bool checkIfOpFasterOnNNPA( - ONNXGemmOp op, const DimAnalysis *dimAnalysis) { - return isMatMulOpFasterOnNNPA(op.getOperation(), op.getA(), op.getB(), - op.getTransA(), op.getTransB(), dimAnalysis); +void estimateTimeForOp(ONNXGemmOp op, + const DimAnalysis *dimAnalysis, double &cpuEstimatedTime, + double &nnpaEstimatedTime) { + estimateTimeForMatMulOp(op.getOperation(), op.getA(), op.getB(), + op.getTransA(), op.getTransB(), dimAnalysis, cpuEstimatedTime, + nnpaEstimatedTime); } } // namespace +namespace onnx_mlir { + //===----------------------------------------------------------------------===// -// Function to perform evaluation. +// Estimate time for ops that have a model. + +double estimateTimeForStickOp(Value oper) { + // Process dim (collapse and handle dynamic sizes). + int64_t e4, e3, e2, e1; + std::string msg; + processDim(oper, e4, e3, e2, e1, msg); + return estimatedTimeForNNPA_Stick_3ds(e4 * e3, e2, e1); +} -namespace onnx_mlir { +double estimateTimeForUnstickOp(Value oper) { + // Process dim (collapse and handle dynamic sizes). + int64_t e4, e3, e2, e1; + std::string msg; + processDim(oper, e4, e3, e2, e1, msg); + return estimatedTimeForNNPA_Unstick_3ds(e4 * e3, e2, e1); +} -bool isOpFasterOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis) { - LLVM_DEBUG({ - llvm::dbgs() << "Test cost-benefit of CPU/NNPA for op "; - op->dump(); - }); - // Binary elementwise NNPA candidate ops. 
+bool estimateTimeForOpWithModel(Operation *op, const DimAnalysis *dimAnalysis,
+    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
+  bool opHasModel = true;
   if (auto addOp = dyn_cast<ONNXAddOp>(op))
-    return checkIfOpFasterOnNNPA(addOp, dimAnalysis);
-  if (auto divOp = dyn_cast<ONNXDivOp>(op))
-    return checkIfOpFasterOnNNPA(divOp, dimAnalysis);
-  if (auto maxOp = dyn_cast<ONNXMaxOp>(op))
-    return checkIfOpFasterOnNNPA(maxOp, dimAnalysis);
-  if (auto minOp = dyn_cast<ONNXMinOp>(op))
-    return checkIfOpFasterOnNNPA(minOp, dimAnalysis);
-  if (auto mulOp = dyn_cast<ONNXMulOp>(op))
-    return checkIfOpFasterOnNNPA(mulOp, dimAnalysis);
-  if (auto powOp = dyn_cast<ONNXPowOp>(op))
-    return checkIfOpFasterOnNNPA(powOp, dimAnalysis);
-  if (auto subOp = dyn_cast<ONNXSubOp>(op))
-    return checkIfOpFasterOnNNPA(subOp, dimAnalysis);
+    estimateTimeForOp(addOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto divOp = dyn_cast<ONNXDivOp>(op))
+    estimateTimeForOp(divOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto maxOp = dyn_cast<ONNXMaxOp>(op))
+    estimateTimeForOp(maxOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto minOp = dyn_cast<ONNXMinOp>(op))
+    estimateTimeForOp(minOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto mulOp = dyn_cast<ONNXMulOp>(op))
+    estimateTimeForOp(mulOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto powOp = dyn_cast<ONNXPowOp>(op))
+    estimateTimeForOp(powOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto subOp = dyn_cast<ONNXSubOp>(op))
+    estimateTimeForOp(subOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
   // Unary elementwise NNPA candidate ops.
-  if (auto expOp = dyn_cast<ONNXExpOp>(op))
-    return checkIfOpFasterOnNNPA(expOp, dimAnalysis);
-  if (auto logOp = dyn_cast<ONNXLogOp>(op))
-    return checkIfOpFasterOnNNPA(logOp, dimAnalysis);
-  if (auto reluOp = dyn_cast<ONNXReluOp>(op))
-    return checkIfOpFasterOnNNPA(reluOp, dimAnalysis);
-  if (auto sigmoidOp = dyn_cast<ONNXSigmoidOp>(op))
-    return checkIfOpFasterOnNNPA(sigmoidOp, dimAnalysis);
-  if (auto softmaxOp = dyn_cast<ONNXSoftmaxOp>(op))
-    return checkIfOpFasterOnNNPA(softmaxOp, dimAnalysis);
-  if (auto tanhOp = dyn_cast<ONNXTanhOp>(op))
-    return checkIfOpFasterOnNNPA(tanhOp, dimAnalysis);
+  else if (auto expOp = dyn_cast<ONNXExpOp>(op))
+    estimateTimeForOp(expOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto logOp = dyn_cast<ONNXLogOp>(op))
+    estimateTimeForOp(logOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto reluOp = dyn_cast<ONNXReluOp>(op))
+    estimateTimeForOp(reluOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto sigmoidOp = dyn_cast<ONNXSigmoidOp>(op))
+    estimateTimeForOp(
+        sigmoidOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto softmaxOp = dyn_cast<ONNXSoftmaxOp>(op))
+    estimateTimeForOp(
+        softmaxOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto tanhOp = dyn_cast<ONNXTanhOp>(op))
+    estimateTimeForOp(tanhOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  // Reduce.
+  else if (auto reduceMeanOp = dyn_cast<ONNXReduceMeanV13Op>(op))
+    estimateTimeForOp(
+        reduceMeanOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
   // Matmul.
-  if (auto matMulOp = dyn_cast<ONNXMatMulOp>(op))
-    return checkIfOpFasterOnNNPA(matMulOp, dimAnalysis);
-  if (auto gemmOp = dyn_cast<ONNXGemmOp>(op))
-    return checkIfOpFasterOnNNPA(gemmOp, dimAnalysis);
-
-  // Unknown, issue a warning and assume its faster on NNPA
-  return fasterOnNNPA(op, "Candidate for NNPA without model; please add.");
+  else if (auto matMulOp = dyn_cast<ONNXMatMulOp>(op))
+    estimateTimeForOp(
+        matMulOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op))
+    estimateTimeForOp(gemmOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else
+    opHasModel = false;
+
+  return opHasModel;
 }
 
 } // namespace onnx_mlir
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
index 8ec4e89786..1b00220ee7 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp
@@ -21,6 +21,14 @@
 
 namespace onnx_mlir {
 
-bool isOpFasterOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis);
+// When an op has a model, define the CPU and NNPA estimated times and return
+// true. When an op does not have a model, just return false.
+bool estimateTimeForOpWithModel(mlir::Operation *op,
+    const DimAnalysis *dimAnalysis, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime);
 
-}
+// Estimate the time for a stick/unstick op given the shape of oper.
+double estimateTimeForStickOp(mlir::Value oper);
+double estimateTimeForUnstickOp(mlir::Value oper);
+
+} // namespace onnx_mlir
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
index fe89463d1b..069e06af92 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.inc
@@ -10,207 +10,292 @@
 
 inline static double ms_ceiling(double n, double m) { return ceil(n / m) * m; }
 
-// Operation Add_3ds: estimated times with cross over at complexity = 58292.
+// Operation Add_3ds: estimated times.
 double estimatedTimeForCPU_Add_3ds(double e3, double e2, double e1) {
   double complexity = e3 * e2 * e1;
-  // Regression for CPU with r2 = 0.9998796798458582
-  return 1.3014297439117151e-10 * complexity + 2.0196878739035375e-07;
+  // Regression for CPU with r2 = 0.9989711028792525
+  return 3.9686846353007493e-07 + 1.1794164898251022e-10 * complexity;
 }
 
+// Operation Add_3ds: estimated times.
 double estimatedTimeForNNPA_Add_3ds(double e3, double e2, double e1) {
-  double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
-  // Regression for NNPA with r2 = 0.9946449269392905
-  return 1.0267638124762705e-10 * complexity + 1.8030585573544791e-06;
+  double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0);
+  double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
+  // Regression for NNPA with r2 = 0.9994266956239162
+  return 2.128070603450555e-06 + 3.884079345448728e-11 * complexity +
+         3.8840793454487276e-11 * complexity2;
 }
 
-// Operation Div_3ds: estimated times with cross over at complexity = 2483.
+// Operation Div_3ds: estimated times.
 double estimatedTimeForCPU_Div_3ds(double e3, double e2, double e1) {
   double complexity = e3 * e2 * e1;
-  // Regression for CPU with r2 = 0.9999989217222092
-  return 1.4517483410281062e-09 * complexity + 4.819629870926124e-07;
+  // Regression for CPU with r2 = 0.9999973603706902
+  return 6.024413187183776e-07 + 1.444210277092263e-09 * complexity;
 }
 
+// Operation Div_3ds: estimated times.
double estimatedTimeForNNPA_Div_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.993964642126504 - return 1.0448506395503133e-10 * complexity + 3.8276566890878624e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9991809091918582 + return 4.268037890056841e-06 + 3.958236721770986e-11 * complexity + + 3.9582367217709856e-11 * complexity2; } -// Operation Exp_3ds: estimated times with cross over at complexity = -1037. +// Operation Exp_3ds: estimated times. double estimatedTimeForCPU_Exp_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9953451710472131 - return 3.728416286592805e-09 * complexity + 6.0588917718826046e-06; + // Regression for CPU with r2 = 0.9964027710112378 + return -5.114482496042976e-06 + 4.191612771812482e-09 * complexity; } +// Operation Exp_3ds: estimated times. double estimatedTimeForNNPA_Exp_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9891659822712001 - return 8.495396362536427e-11 * complexity + 2.2771053257892673e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9996489747804022 + return 2.940912571153368e-06 + 3.030831435560512e-11 * complexity + + 3.030831435560511e-11 * complexity2; } -// Operation Log_3ds: estimated times with cross over at complexity = 214. +// Operation Log_3ds: estimated times. double estimatedTimeForCPU_Log_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999978790893227 - return 8.437935291413468e-09 * complexity + -5.198279741842633e-07; + // Regression for CPU with r2 = 0.98951908796714 + return -1.1673535780041665e-05 + 5.568744038404678e-09 * complexity; } +// Operation Log_3ds: estimated times. double estimatedTimeForNNPA_Log_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9860021150687228 - return 9.34925048506708e-11 * complexity + 1.2737044319477467e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994743517297515 + return 1.9298869463234537e-06 + 3.5198842979463965e-11 * complexity + + 3.519884297946396e-11 * complexity2; } -// Operation MatMul_3ds: estimated times with cross over at complexity = 30745. +// Operation MatMul_3ds: estimated times. double estimatedTimeForCPU_MatMul_3ds(double B, double N, double M, double K) { double complexity = B * (N * M * K); - // Regression for CPU with r2 = 0.9977563719224963 - return 8.516416082324592e-11 * complexity + 2.813872063426722e-07; + // Regression for CPU with r2 = 0.9993063132437037 + return 1.2274778896376187e-06 + 8.277833300031912e-11 * complexity; } +// Operation MatMul_3ds: estimated times. 
double estimatedTimeForNNPA_MatMul_3ds(double B, double N, double M, double K) { double complexity = B * ms_ceiling(N, 2.0) * ms_ceiling(M, 64.0) * ms_ceiling(K, 64.0); - // Regression for NNPA with r2 = 0.6995070738236333 - return 6.1299740116770674e-12 * complexity + 2.7113063170731707e-06; + double complexity2 = + B * ms_ceiling(N, 32.0) * ms_ceiling(M, 64.0) * ms_ceiling(K, 64.0); + // Regression for NNPA with r2 = 0.7300356886725241 + return 3.2376205400192875e-06 + 1.2476855786580124e-12 * complexity + + 1.2476855786580124e-12 * complexity2; } -// Operation Max_3ds: estimated times with cross over at complexity = 28928. +// Operation Max_3ds: estimated times. double estimatedTimeForCPU_Max_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.999833820537654 - return 1.2859513298352582e-10 * complexity + 2.1445698817257752e-07; + // Regression for CPU with r2 = 0.999934744283212 + return 3.2005877071463053e-07 + 1.1769966151617187e-10 * complexity; } +// Operation Max_3ds: estimated times. double estimatedTimeForNNPA_Max_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9951863938046909 - return 1.0420243623790107e-10 * complexity + 9.20104282137138e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.999738981118584 + return 1.4606129913618409e-06 + 3.931840035694782e-11 * complexity + + 3.93184003569478e-11 * complexity2; } -// Operation Min_3ds: estimated times with cross over at complexity = 99463. +// Operation Min_3ds: estimated times. double estimatedTimeForCPU_Min_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998504038595571 - return 1.28567886153589e-10 * complexity + 2.0245445263399963e-07; + // Regression for CPU with r2 = 0.9999348493035218 + return 3.5344354604038237e-07 + 1.175917606999914e-10 * complexity; } +// Operation Min_3ds: estimated times. double estimatedTimeForNNPA_Min_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9946372532381683 - return 1.0339073648992919e-10 * complexity + 2.70667430925335e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994174686041679 + return 3.092302220284167e-06 + 3.897841215495711e-11 * complexity + + 3.8978412154957106e-11 * complexity2; } -// Operation Mul_3ds: estimated times with cross over at complexity = 55288. +// Operation Mul_3ds: estimated times. double estimatedTimeForCPU_Mul_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998538384140817 - return 1.2961833783614705e-10 * complexity + 2.2021597368460279e-07; + // Regression for CPU with r2 = 0.9999212032519668 + return 3.506815960365203e-07 + 1.1755692215565626e-10 * complexity; } +// Operation Mul_3ds: estimated times. 
double estimatedTimeForNNPA_Mul_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9944216212877205 - return 1.0281908200330892e-10 * complexity + 1.7019104732413162e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994566182095322 + return 2.0629649376500924e-06 + 3.899817129736847e-11 * complexity + + 3.8998171297368456e-11 * complexity2; } -// Operation Pow_2_3ds: estimated times with cross over at complexity = 56111. -double estimatedTimeForCPU_Pow_2_3ds(double e3, double e2, double e1) { +// Operation Pow2_3ds: estimated times. +double estimatedTimeForCPU_Pow2_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998705421933038 - return 1.2964568661871788e-10 * complexity + 2.0823276086876973e-07; + // Regression for CPU with r2 = 0.9999320368494156 + return 4.0421584022966975e-07 + 1.1715722909330777e-10 * complexity; } -double estimatedTimeForNNPA_Pow_2_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9945018708254405 - return 1.0317090324802998e-10 * complexity + 1.693775697889311e-06; +// Operation Pow2_3ds: estimated times. +double estimatedTimeForNNPA_Pow2_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994873079624843 + return 2.0966320666458707e-06 + 3.885726395091773e-11 * complexity + + 3.8857263950917726e-11 * complexity2; } -// Operation Pow_3_3ds: estimated times with cross over at complexity = 21706. -double estimatedTimeForCPU_Pow_3_3ds(double e3, double e2, double e1) { +// Operation Pow3_3ds: estimated times. +double estimatedTimeForCPU_Pow3_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9997984857078572 - return 3.7494455439876194e-10 * complexity + -8.306238888444397e-07; + // Regression for CPU with r2 = 0.9998245341999067 + return 1.643157439725954e-06 + 2.5576590285464804e-10 * complexity; } -double estimatedTimeForNNPA_Pow_3_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9411544973356172 - return 2.085058530670984e-10 * complexity + 2.7822286559924233e-06; +// Operation Pow3_3ds: estimated times. +double estimatedTimeForNNPA_Pow3_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.3286847571457816 + return 5.54645111424955e-06 + 1.594942039674421e-10 * complexity + + 1.5949420396744207e-10 * complexity2; } -// Operation Pow_4_3ds: estimated times with cross over at complexity = 41065. -double estimatedTimeForCPU_Pow_4_3ds(double e3, double e2, double e1) { +// Operation Pow4_3ds: estimated times. 
+double estimatedTimeForCPU_Pow4_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998522920538492 - return 2.5405902189076053e-10 * complexity + 1.143001005006861e-07; + // Regression for CPU with r2 = 0.9999197459422748 + return 9.47474072842895e-07 + 2.4163095833040496e-10 * complexity; } -double estimatedTimeForNNPA_Pow_4_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9978757256268103 - return 1.811269802457481e-10 * complexity + 3.109286345912229e-06; +// Operation Pow4_3ds: estimated times. +double estimatedTimeForNNPA_Pow4_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9994611814325557 + return 3.901310093115667e-06 + 7.801645620198453e-11 * complexity + + 7.801645620198452e-11 * complexity2; } -// Operation Pow_8_3ds: estimated times with cross over at complexity = 38894. -double estimatedTimeForCPU_Pow_8_3ds(double e3, double e2, double e1) { +// Operation Pow8_3ds: estimated times. +double estimatedTimeForCPU_Pow8_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998492647347293 - return 3.784270893083021e-10 * complexity + 3.846063745069773e-08; + // Regression for CPU with r2 = 0.9997872605697207 + return 8.784826358183354e-07 + 3.649713012323953e-10 * complexity; } -double estimatedTimeForNNPA_Pow_8_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9984689810839804 - return 2.5809839682053116e-10 * complexity + 4.718642808461304e-06; +// Operation Pow8_3ds: estimated times. +double estimatedTimeForNNPA_Pow8_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9995041673834962 + return 6.134676383478624e-06 + 1.1626159876565245e-10 * complexity + + 1.1626159876565244e-10 * complexity2; } -// Operation Relu_3ds: estimated times with cross over at complexity = 12461. +// Operation ReduceMean_4d: estimated times. +double estimatedTimeForCPU_ReduceMean_4d(double e3, double e2, double e1) { + double complexity = e3 * e2 * e1; + // Regression for CPU with r2 = 0.9581972945355149 + return -1.561907564731308e-07 + 1.2218609072525066e-10 * complexity; +} +// Operation ReduceMean_4d: estimated times. +double estimatedTimeForNNPA_ReduceMean_4d(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.21677591777344662 + return 1.1936219405338953e-05 + 1.0676642952683933e-11 * complexity + + 1.0676642952683933e-11 * complexity2; +} + +// Operation Relu_3ds: estimated times. double estimatedTimeForCPU_Relu_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999336782083352 - return 1.287932373847373e-10 * complexity + 2.1978022850964433e-07; + // Regression for CPU with r2 = 0.9997916471943519 + return 4.020992561015175e-07 + 1.1775068214689546e-10 * complexity; } +// Operation Relu_3ds: estimated times. 
double estimatedTimeForNNPA_Relu_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9932963892258988 - return 8.174802287383875e-11 * complexity + 8.060373976318752e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9996967196634997 + return 1.2244816818061312e-06 + 2.8344213155279377e-11 * complexity + + 2.8344213155279377e-11 * complexity2; } -// Operation Sigmoid_3ds: estimated times with cross over at complexity = 605. +// Operation Sigmoid_3ds: estimated times. double estimatedTimeForCPU_Sigmoid_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9999988107319201 - return 5.386219204160899e-09 * complexity + 5.634215435092746e-07; + // Regression for CPU with r2 = 0.9999935862933553 + return 2.4666188614796535e-07 + 5.3819773454779955e-09 * complexity; } +// Operation Sigmoid_3ds: estimated times. double estimatedTimeForNNPA_Sigmoid_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9723030037974931 - return 1.0653822523387642e-10 * complexity + 3.7620795110656856e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9997032611893206 + return 4.552265280283248e-06 + 4.268025052443249e-11 * complexity + + 4.2680250524432486e-11 * complexity2; } -// Operation Softmax_3ds: estimated times with cross over at complexity = 3256. +// Operation Softmax_3ds: estimated times. double estimatedTimeForCPU_Softmax_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9997073553561724 - return 6.319129534911603e-09 * complexity + 2.8581895675517657e-06; + // Regression for CPU with r2 = 0.9998169150056859 + return 3.850778825086575e-06 + 6.476546494036936e-09 * complexity; } +// Operation Softmax_3ds: estimated times. double estimatedTimeForNNPA_Softmax_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.7561317069552476 - return 1.5370950144430126e-09 * complexity + 1.8432846457914885e-05; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.6458689171873927 + return 3.823709210789688e-05 + 7.316577699975697e-10 * complexity + + 7.316577699975696e-10 * complexity2; } -// Operation Sub_3ds: estimated times with cross over at complexity = 27788. +// Operation Stick_3ds: estimated times. +double estimatedTimeForNNPA_Stick_3ds(double e3, double e2, double e1) { + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9838446914891756 + return -1.1787349678206611e-07 + 9.738975985428137e-11 * complexity + + 9.738975985428137e-11 * complexity2; +} + +// Operation Sub_3ds: estimated times. 
double estimatedTimeForCPU_Sub_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.9998718450512123 - return 1.2958900359490862e-10 * complexity + 2.0747306461438247e-07; + // Regression for CPU with r2 = 0.9989967088496832 + return 4.6884880875538195e-07 + 1.178543625471088e-10 * complexity; } +// Operation Sub_3ds: estimated times. double estimatedTimeForNNPA_Sub_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9949061007697938 - return 1.0456801418978144e-10 * complexity + 9.027743246689109e-07; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9997163549514938 + return 1.3893829024566132e-06 + 3.958841159751218e-11 * complexity + + 3.95884115975122e-11 * complexity2; } -// Operation Tanh_3ds: estimated times with cross over at complexity = 474. +// Operation Tanh_3ds: estimated times. double estimatedTimeForCPU_Tanh_3ds(double e3, double e2, double e1) { double complexity = e3 * e2 * e1; - // Regression for CPU with r2 = 0.999998963010189 - return 5.609614699943962e-09 * complexity + 1.8748266930146244e-07; + // Regression for CPU with r2 = 0.9899146645413962 + return 4.591865418171123e-06 + 1.5243041278914726e-09 * complexity; } +// Operation Tanh_3ds: estimated times. double estimatedTimeForNNPA_Tanh_3ds(double e3, double e2, double e1) { - double complexity = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); - // Regression for NNPA with r2 = 0.9897555127053886 - return 8.058830271076489e-11 * complexity + 2.8126039207664257e-06; + double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0); + double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0); + // Regression for NNPA with r2 = 0.9992996189301544 + return 3.1652117218733632e-06 + 2.7515670717117405e-11 * complexity + + 2.7515670717117402e-11 * complexity2; } + +// Operation Unstick_3ds: estimated times. 
+double estimatedTimeForNNPA_Unstick_3ds(double e3, double e2, double e1) {
+  double complexity = e3 * ms_ceiling(e2, 2.0) * ms_ceiling(e1, 64.0);
+  double complexity2 = e3 * ms_ceiling(e2, 32.0) * ms_ceiling(e1, 64.0);
+  // Regression for NNPA with r2 = 0.9804634203145826
+  return -3.9924342483643434e-07 + 1.0477727134169295e-10 * complexity +
+         1.0477727134169292e-10 * complexity2;
+}
\ No newline at end of file
diff --git a/src/Accelerators/NNPA/NNPAAccelerator.cpp b/src/Accelerators/NNPA/NNPAAccelerator.cpp
index 6634f3f71d..848acdf643 100644
--- a/src/Accelerators/NNPA/NNPAAccelerator.cpp
+++ b/src/Accelerators/NNPA/NNPAAccelerator.cpp
@@ -74,7 +74,7 @@ void NNPAAccelerator::registerPasses(int optLevel) const {
   LLVM_DEBUG(llvm::dbgs() << "Registering passes for NNPA accelerator\n");
   mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
     return onnx_mlir::createDevicePlacementPass(nnpaLoadDevicePlacementFile,
-        nnpaSaveDevicePlacementFile, nnpaEnableZHighPerfModel);
+        nnpaSaveDevicePlacementFile, nnpaPlacementHeuristic);
   });

   mlir::registerPass([]() -> std::unique_ptr<mlir::Pass> {
diff --git a/src/Accelerators/NNPA/Pass/NNPAPasses.hpp b/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
index 4691bb9a66..e5f8783390 100644
--- a/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
+++ b/src/Accelerators/NNPA/Pass/NNPAPasses.hpp
@@ -17,13 +17,15 @@

 #include "mlir/Pass/Pass.h"

+#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
+
 namespace onnx_mlir {

 // Add pass for device placement.
 std::unique_ptr<mlir::Pass> createDevicePlacementPass();
 std::unique_ptr<mlir::Pass> createDevicePlacementPass(
     std::string loadConfigFile, std::string saveConfigFile,
-    bool useZHighPerfModel);
+    NNPAPlacementHeuristic placementHeuristic);

 /// Add pass for lowering ONNX ops to ZHigh ops.
 std::unique_ptr<mlir::Pass> createONNXToZHighPass();
diff --git a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
index 25ac9a9043..7c8d461d5e 100644
--- a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
+++ b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
@@ -702,10 +702,13 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
       genScalarReduction(rewriter, create, op, elementOutType, input, alloc,
           inRank, outRank, dynamicAxes, maskVal, outInDimMap, divisorForMean,
           enableParallel);
-      onnxToKrnlSimdReport(op, /*successful*/ false, /*vl*/ 0,
-          estimatedSimdLoopTripCount,
-          (parallelSimd ? "no simd because no supported for parallel scheme"
-                        : "unsupported"));
+      std::string msg;
+      if (parallelSimd)
+        msg = "no simd because not supported for parallel scheme";
+      else
+        msg = "unsupported";
+      onnxToKrnlSimdReport(
+          op, /*successful*/ false, /*vl*/ 0, estimatedSimdLoopTripCount, msg);
     }
     rewriter.replaceOp(op, alloc);
     return success();
diff --git a/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir b/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
index 52a5e22ffc..5d1868b275 100644
--- a/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
+++ b/test/mlir/accelerators/nnpa/conversion/device-placement/device_placement_pass_perf_model.mlir
@@ -1,4 +1,4 @@
-// RUN: onnx-mlir-opt --device-placement=use-zhigh-perf-model=true --mcpu=z16 --maccel=NNPA --split-input-file %s | FileCheck %s
+// RUN: onnx-mlir-opt --device-placement=use-faster=true --mcpu=z16 --maccel=NNPA --split-input-file %s | FileCheck %s

 // -----
 // Shape is such that this op is nearly guaranteed to be faster on CPU.
@@ -16,16 +16,16 @@ func.func @add_cpu(%arg0: tensor<1024x32x1xf32>) -> tensor<1024x32x1xf32> attrib // ----- - // Shape is such that this op is nearly guaranteed to be faster on NNPA; so no device="cpu" here. -func.func @add_nnpa(%arg0: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { - %0 = "onnx.Add"(%arg0, %arg0) : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> + +func.func @matmul_nnpa(%arg0: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { + %0 = "onnx.MatMul"(%arg0, %arg0) : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> return %0 : tensor<1024x1024x1024xf32> // mlir2FileCheck.py -// CHECK-LABEL: func.func @add_nnpa +// CHECK-LABEL: func.func @matmul_nnpa // CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> attributes {input_names = ["x"], output_names = ["output"]} { -// CHECK: [[VAR_0_:%.+]] = "onnx.Add"([[PARAM_0_]], [[PARAM_0_]]) {device = "nnpa"} : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> +// CHECK: [[VAR_0_:%.+]] = "onnx.MatMul"([[PARAM_0_]], [[PARAM_0_]]) {device = "nnpa"} : (tensor<1024x1024x1024xf32>, tensor<1024x1024x1024xf32>) -> tensor<1024x1024x1024xf32> // CHECK: return [[VAR_0_]] : tensor<1024x1024x1024xf32> // CHECK: } } diff --git a/utils/make-report.py b/utils/make-report.py index dc5ad9325a..e13c46a6a0 100755 --- a/utils/make-report.py +++ b/utils/make-report.py @@ -539,7 +539,8 @@ def make_report(stat_message): def main(argv): - global report_level, focus_on_op_with_pattern, supported_only, time_unit, verbose + global report_level, focus_on_op_with_pattern, supported_only, time_unit + global verbose global sorting_preference compile_file_name = "" @@ -557,7 +558,8 @@ def main(argv): "help", "level=", "runtime=", - "stats=" "sort=", + "stats=", + "sort=", "supported", "unit=", "verbose", @@ -641,13 +643,22 @@ def main(argv): if compile_file_name and runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF", warmup_num) parse_file_for_stat(compile_file_name, make_stats) + print( + 'Report using runtime file "' + + runtime_file_name + + '" and compile file "' + + compile_file_name + + '"' + ) make_report(make_legend) elif compile_file_name: parse_file_for_stat(compile_file_name, make_stats) + print('Report using compile file "' + compile_file_name + '"') make_report(make_legend) elif runtime_file_name: parse_file_for_perf(runtime_file_name, "PERF", warmup_num) parse_file_for_stat(runtime_file_name, "PERF") + print('Report using runtime file "' + runtime_file_name + '"') make_report(make_legend) else: print_usage("Command requires an input file name (compile/runtime or both).\n")
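
For reviewers, a minimal sketch (not part of the patch) of how the new
PerfModel.hpp entry points compose into a stick/unstick-aware placement
decision in the spirit of FasterOpsWSU. The helper name wantOpOnNNPA, the
significanceFactor knob, and the per-operand/per-result stick/unstick
charging are illustrative assumptions; the authoritative policy lives in
DevicePlacementHeuristic.cpp, which this sketch does not reproduce.

#include "mlir/IR/Operation.h"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"

namespace onnx_mlir {

// Hypothetical FasterOpsWSU-style test: place `op` on the NNPA only when its
// estimated NNPA time, plus the cost of sticking inputs and unsticking
// outputs, beats the estimated CPU time by `significanceFactor` (1.0 for a
// FasterOpsWSU-like policy; larger for a MuchFasterOpsWSU-like one).
static bool wantOpOnNNPA(mlir::Operation *op, const DimAnalysis *dimAnalysis,
    double significanceFactor) {
  double cpuTime = 0.0, nnpaTime = 0.0;
  if (!estimateTimeForOpWithModel(op, dimAnalysis, cpuTime, nnpaTime))
    return true; // No model: fall back to QualifyingOps behavior.
  // Charge one stick per operand and one unstick per result. A real
  // heuristic would skip values that stay in stickified form between
  // adjacent NNPA ops, which is precisely what the WSU variants reason about.
  for (mlir::Value operand : op->getOperands())
    nnpaTime += estimateTimeForStickOp(operand);
  for (mlir::Value result : op->getResults())
    nnpaTime += estimateTimeForUnstickOp(result);
  return significanceFactor * nnpaTime < cpuTime;
}

} // namespace onnx_mlir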