#include "llvm/Support/Debug.h"

#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/PerfModel.hpp"
+#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
#include "src/Dialect/ONNX/ONNXOps/OpHelper.hpp"

@@ -121,6 +122,7 @@ void estimateTimeForMatMulOp(Operation *op, Value a, Value b, bool aTransposed,
  assert(aType && aType.hasRank() && "expected shaped type with A rank");
  int64_t aRank = aType.getRank();
  llvm::ArrayRef<int64_t> aShape = aType.getShape();
+  // a => matrix A; B => the Batch dims (aka all but the last 2 dims).
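+  // Illustration (assuming summarizeHigherDims multiplies the leading dims):
+  // for aShape = [2, 3, 128, 64], the batch dims [2, 3] collapse to aB = 6.
+  // The same convention is used for bB below.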
  bool aBDynamic;
  int64_t aB = summarizeHigherDims(aShape, aRank - 2, aBDynamic);
  int64_t aNIndex = aTransposed ? aRank - 1 : aRank - 2;
@@ -132,6 +134,7 @@ void estimateTimeForMatMulOp(Operation *op, Value a, Value b, bool aTransposed,
  assert(bType && bType.hasRank() && "expected shaped type with B rank");
  int64_t bRank = bType.getRank();
  llvm::ArrayRef<int64_t> bShape = bType.getShape();
+  // b => matrix B; B => the Batch dims (aka all but the last 2 dims).
  bool bBDynamic;
  int64_t bB = summarizeHigherDims(bShape, bRank - 2, bBDynamic);
  int64_t bMIndex = bTransposed ? bRank - 1 : bRank - 2;
@@ -312,6 +315,15 @@ void estimateTimeForOp<ONNXExpOp>(ONNXExpOp op, const DimAnalysis *dimAnalysis,
      cpuEstimatedTime, nnpaEstimatedTime);
}

+template <>
+void estimateTimeForOp<ONNXGeluOp>(ONNXGeluOp op,
+    const DimAnalysis *dimAnalysis, double &cpuEstimatedTime,
+    double &nnpaEstimatedTime) {
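+  // Gelu is costed like the other unary elementwise ops: the fitted CPU/NNPA
+  // "_3ds" cost functions are evaluated on the collapsed operand shape
+  // (assumed to be generated alongside the other per-op cost models).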
+  estimateTimeForElementwiseOp(op.getOperation(), op.getOperand(), dimAnalysis,
+      estimatedTimeForCPU_Gelu_3ds, estimatedTimeForNNPA_Gelu_3ds,
+      cpuEstimatedTime, nnpaEstimatedTime);
+}
+
template <>
void estimateTimeForOp<ONNXLogOp>(ONNXLogOp op, const DimAnalysis *dimAnalysis,
    double &cpuEstimatedTime, double &nnpaEstimatedTime) {
@@ -401,15 +413,33 @@ double estimateTimeForStickOp(Value oper) {
  int64_t e4, e3, e2, e1;
  std::string msg;
  processDim(oper, e4, e3, e2, e1, msg);
-  return estimatedTimeForNNPA_Stick_3ds(e4 * e3, e2, e1);
+  // Arch 14 (M14): no NNPA support, use the CPU estimate.
+  if (isLessEqualNNPALevel(NNPALevel::M14))
+    return arch14_estimatedTimeForCPU_Stick_3ds(e4 * e3, e2, e1);
+  // Otherwise, return the minimum of the CPU and NNPA estimates.
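+  // Hypothetical example: if the CPU model yields 12.0 and the NNPA model
+  // yields 8.5 (in the model's time units) for this shape, 8.5 is returned.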
+  if (isLessEqualNNPALevel(NNPALevel::M15)) {
+    double cpuTime = arch15_estimatedTimeForCPU_Stick_3ds(e4 * e3, e2, e1);
+    double nnpaTime = arch15_estimatedTimeForNNPA_Stick_3ds(e4 * e3, e2, e1);
+    return cpuTime < nnpaTime ? cpuTime : nnpaTime;
+  }
+  llvm_unreachable("add new NNPA architecture model here");
}

double estimateTimeForUnstickOp(Value oper) {
  // Process dim (collapse and handle dynamic sizes).
  int64_t e4, e3, e2, e1;
  std::string msg;
  processDim(oper, e4, e3, e2, e1, msg);
-  return estimatedTimeForNNPA_Unstick_3ds(e4 * e3, e2, e1);
+  // Arch 14 (M14): no NNPA support, use the CPU estimate.
+  if (isLessEqualNNPALevel(NNPALevel::M14))
+    return arch14_estimatedTimeForCPU_Unstick_3ds(e4 * e3, e2, e1);
+  // Otherwise, return the minimum of the CPU and NNPA estimates.
+  if (isLessEqualNNPALevel(NNPALevel::M15)) {
+    double cpuTime = arch15_estimatedTimeForCPU_Unstick_3ds(e4 * e3, e2, e1);
+    double nnpaTime = arch15_estimatedTimeForNNPA_Unstick_3ds(e4 * e3, e2, e1);
+    return cpuTime < nnpaTime ? cpuTime : nnpaTime;
+  }
+  llvm_unreachable("add new NNPA architecture model here");
}

bool estimateTimeForOpWithModel(Operation *op, const DimAnalysis *dimAnalysis,
@@ -432,6 +462,8 @@ bool estimateTimeForOpWithModel(Operation *op, const DimAnalysis *dimAnalysis,
  // Unary elementwise NNPA candidate ops.
  else if (auto expOp = mlir::dyn_cast<ONNXExpOp>(op))
    estimateTimeForOp(expOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
+  else if (auto geluOp = mlir::dyn_cast<ONNXGeluOp>(op))
+    estimateTimeForOp(geluOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
  else if (auto logOp = mlir::dyn_cast<ONNXLogOp>(op))
    estimateTimeForOp(logOp, dimAnalysis, cpuEstimatedTime, nnpaEstimatedTime);
  else if (auto reluOp = mlir::dyn_cast<ONNXReluOp>(op))