Commit 36aa0ba

Transpose NCHW to NHWC if compiler stick/unstick is enabled and enable model zoo tests on NNPA (#2893)
* Transpose NCHW to NHWC if compiler stick/unstick is enabled

Signed-off-by: Tung D. Le <[email protected]>

* disable createZHighClipToDLFloatPass

Signed-off-by: Tung D. Le <[email protected]>

---------

Signed-off-by: Tung D. Le <[email protected]>
1 parent b754bec commit 36aa0ba

8 files changed, +156 -24 lines changed

src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp

+3-2
@@ -29,7 +29,8 @@ llvm::cl::opt<NNPAEmissionTargetType> nnpaEmissionTarget(
 llvm::cl::opt<bool> nnpaClipToDLFloatRange("nnpa-clip-to-dlfloat-range",
     llvm::cl::desc("Clip CPU tensors to dlfloat range before stickification to "
                    "avoid out-of-range. Only clip Softmax inputs at this "
-                   "moment. Default is true."),
+                   "moment. Default is true. This option will be removed and "
+                   "replaced by --nnpa-saturation in the future."),
     llvm::cl::init(true), llvm::cl::cat(OnnxMlirOptions));

 llvm::cl::opt<bool> nnpaEnableZHighToOnnx("enable-zhigh-to-onnx",

@@ -55,7 +56,7 @@ llvm::cl::opt<bool> nnpaEnableCompilerStickUnstick(
     "enable-compiler-stick-unstick",
     llvm::cl::desc("[Experimental feature] Enable the compiler generate some "
                    "stick/unstick code. Default is true."),
-    llvm::cl::init(true), llvm::cl::cat(OnnxMlirOptions));
+    llvm::cl::init(true), llvm::cl::cat(OnnxMlirCommonOptions));

 llvm::cl::opt<bool> nnpaEnableScalarBcastBinary(
     "nnpa-enable-scalar-bcast-binary",

src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp

+2-1
@@ -103,7 +103,8 @@ void addONNXToZHighPasses(mlir::PassManager &pm) {
   // Clip zhigh.Stick inputs if required. This is to avoid out-of-range of
   // dlfloat. Do constant propagation after clipping to remove ONNX ops used for
   // clipping such as ONNXMax if applicable.
-  if (nnpaClipToDLFloatRange) {
+  // This pass will be removed and replaced by nnpa-saturation in the future.
+  if (!nnpaEnableSaturation && nnpaClipToDLFloatRange) {
     pm.addNestedPass<func::FuncOp>(
         onnx_mlir::zhigh::createZHighClipToDLFloatPass());
     pm.addNestedPass<func::FuncOp>(onnx_mlir::createConstPropONNXToONNXPass());

src/Accelerators/NNPA/Conversion/ZHighToZLow/ZHighToZLow.cpp

+48-13
@@ -507,8 +507,9 @@ struct ZHighToZLowStickOpLowering : public ConversionPattern {
     StringAttr layout = stickOp.getLayoutAttr();
     IntegerAttr saturation = stickOp.getSaturationAttr();

-    IndexExprBuilderForKrnl createKrnlIE(rewriter, loc);
-    ZHighStickOpShapeHelper shapeHelper(op, operands, &createKrnlIE);
+    MultiDialectBuilder<OnnxBuilder, IndexExprBuilderForKrnl> create(
+        rewriter, loc);
+    ZHighStickOpShapeHelper shapeHelper(op, operands, &create.krnlIE);
     shapeHelper.computeShapeAndAssertOnFailure();

     // Convert ZTensor type to MemRefType.

@@ -518,9 +519,17 @@ struct ZHighToZLowStickOpLowering : public ConversionPattern {
     // Allocate a buffer for the result MemRef.
     Value alloc = insertAllocForZMemRef(
         zMemRefType, shapeHelper.getOutputDims(), op, rewriter);
-    // Set pre-transformed layout: if NHWC, we can directly stickify from NCHW.
-    if (isNHWCLayout(layout))
-      layout = getNCHWLayoutAttr(rewriter);
+    if (isNHWCLayout(layout)) {
+      if (nnpaEnableCompilerStickUnstick) {
+        // Compiler-generated stick hasn't supported NCHW yet.
+        // Explicitly transpose NCHW to NHWC.
+        input = create.onnx.toMemref(
+            create.onnx.transposeInt64(input, ArrayRef<int64_t>({0, 2, 3, 1})));
+      } else
+        // Otherwise, we can directly stickify from NCHW.
+        // Set pre-transformed layout to NCHW.
+        layout = getNCHWLayoutAttr(rewriter);
+    }

     // Else, emit a ZLow operation.
     rewriter.create<ZLowStickOp>(loc, input, alloc, layout, saturation);
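For intuition: the transposeInt64 call above emits an ONNX transpose with permutation (0, 2, 3, 1), which reorders an NCHW tensor into NHWC before the compiler-generated stick code runs. A minimal NumPy sketch of that permutation (a standalone illustration, not onnx-mlir code; the tensor values are made up):

import numpy as np

# Toy NCHW tensor: batch=1, channels=3, height=5, width=7
# (same shape as the new lit test below).
x_nchw = np.arange(1 * 3 * 5 * 7, dtype=np.float32).reshape(1, 3, 5, 7)

# Permutation (0, 2, 3, 1): result axis i takes input axis perm[i],
# so (N, C, H, W) becomes (N, H, W, C).
x_nhwc = x_nchw.transpose(0, 2, 3, 1)

assert x_nhwc.shape == (1, 5, 7, 3)
# Element (n, c, h, w) of the NCHW tensor lands at (n, h, w, c) in NHWC.
assert x_nchw[0, 2, 4, 6] == x_nhwc[0, 4, 6, 2]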
@@ -625,24 +634,50 @@ struct ZHighToZLowUnstickOpLowering : public ConversionPattern {
     StringAttr layout =
         getZTensorLayoutAttr(rewriter, op->getOperand(0).getType());

-    IndexExprBuilderForKrnl createKrnlIE(rewriter, loc);
-    ZHighUnstickOpShapeHelper shapeHelper(op, operands, &createKrnlIE);
+    MultiDialectBuilder<OnnxBuilder, IndexExprBuilderForKrnl> create(
+        rewriter, loc);
+    ZHighUnstickOpShapeHelper shapeHelper(op, operands, &create.krnlIE);
     shapeHelper.computeShapeAndAssertOnFailure();

     // Convert ZTensor type to MemRefType.
     ZMemRefType zMemRefType =
         convertZTensorToMemRefType(*op->result_type_begin());

     // Allocate a buffer for the result MemRef.
-    Value alloc = insertAllocForZMemRef(
-        zMemRefType, shapeHelper.getOutputDims(), op, rewriter);
-
-    // Set layout: if NHWC, we can directly unstickify to NCHW.
-    if (isNHWCLayout(layout))
-      layout = getNCHWLayoutAttr(rewriter);
+    Value alloc = nullptr;
+    if (isNHWCLayout(layout)) {
+      if (nnpaEnableCompilerStickUnstick) {
+        // Compiler-generated unstick hasn't supported NCHW yet.
+        // This code allocates a NHWC buffer. It gets dims from the NCHW input.
+        SmallVector<IndexExpr> dimList;
+        dimList.emplace_back(shapeHelper.getOutputDims()[0]);
+        dimList.emplace_back(shapeHelper.getOutputDims()[2]);
+        dimList.emplace_back(shapeHelper.getOutputDims()[3]);
+        dimList.emplace_back(shapeHelper.getOutputDims()[1]);
+        MultiDialectBuilder<MemRefBuilder> create(rewriter, loc);
+        MemRefType resType = zMemRefType.value;
+        ArrayRef<int64_t> shape = resType.getShape();
+        alloc = create.mem.alignedAlloc(
+            MemRefType::get({shape[0], shape[2], shape[3], shape[1]},
+                resType.getElementType()),
+            dimList);
+      } else {
+        // Otherwise, we can directly stickify from NCHW.
+        // Set pre-transformed layout to NCHW.
+        layout = getNCHWLayoutAttr(rewriter);
+      }
+    }
+    if (alloc == nullptr)
+      alloc = insertAllocForZMemRef(
+          zMemRefType, shapeHelper.getOutputDims(), op, rewriter);

     // Emit a ZLow operation.
     rewriter.create<ZLowUnstickOp>(loc, input, alloc, layout);
+    if (isNHWCLayout(layout) && nnpaEnableCompilerStickUnstick)
+      // Compiler-generated unstick hasn't supported NCHW yet.
+      // Explicitly transpose NHWC to NCHW.
+      alloc =
+          create.onnx.transposeInt64(alloc, ArrayRef<int64_t>({0, 3, 1, 2}));
     rewriter.replaceOp(op, alloc);
     return success();
   }
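The unstick path mirrors the stick path: it allocates an NHWC scratch buffer whose dimensions are the NCHW output dims reordered by (0, 2, 3, 1), lets zlow.unstick fill it, then transposes back with permutation (0, 3, 1, 2). A quick NumPy check (illustration only) that (0, 3, 1, 2) is the inverse of (0, 2, 3, 1), so the round trip restores the original layout:

import numpy as np

x_nchw = np.random.rand(2, 3, 5, 7).astype(np.float32)

# NCHW -> NHWC (what the stick path emits) ...
x_nhwc = x_nchw.transpose(0, 2, 3, 1)
# ... and NHWC -> NCHW (what the unstick path emits afterwards).
roundtrip = x_nhwc.transpose(0, 3, 1, 2)

# (0, 3, 1, 2) undoes (0, 2, 3, 1), so the round trip is the identity.
assert np.array_equal(x_nchw, roundtrip)

# The NHWC scratch buffer shape is the NCHW output dims reordered by
# (0, 2, 3, 1), mirroring the dimList built in the lowering above.
assert x_nhwc.shape == tuple(x_nchw.shape[i] for i in (0, 2, 3, 1))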

test/accelerators/NNPA/backend/CMakeLists.txt

+18-6
@@ -362,12 +362,12 @@ set(NNPA_TEST_LIST
   # ==LIM== Input tensor must be less than or equal to 4 dimensions.

   # Model
-  # test_densenet121_cpu # accurary error
-  #test_inception_v1_cpu,zdnn_conv2d
-  #test_resnet50_cpu,zdnn_conv2d
-  #test_shufflenet_cpu,zdnn_matmul_op_ext
-  #test_squeezenet_cpu,zdnn_conv
-  #test_vgg19_cpu,zdnn_conv
+  test_densenet121_cpu,zdnn_conv2d
+  test_inception_v1_cpu,zdnn_conv2d
+  test_resnet50_cpu,zdnn_conv2d
+  test_shufflenet_cpu,zdnn_matmul_op_ext
+  # test_squeezenet_cpu,zdnn_conv # got NaN results
+  test_vgg19_cpu,zdnn_conv
 )
 set(ENV_TEST_CASE_BY_USER "")
 foreach(test_name IN LISTS NNPA_TEST_LIST)

@@ -394,6 +394,9 @@ add_custom_target(check-onnx-backend-nnpa
   COMMAND
     TEST_INSTRUCTION_CHECK=true
     ONNX_HOME=${FILE_GENERATE_DIR}/check-onnx-backend-nnpa
+    # Needed for convolution models to avoid NaN outputs.
+    # Remove this if saturation is enabled by default.
+    TEST_COMPILE_ARGS="--nnpa-saturation=true"
     ${NNPA_TESTS_ENVS} ${BACKEND_TEST_COMMAND} ${BACKEND_TEST_ARGS} ${FILE_GENERATE_DIR}/test.py
   DEPENDS
     ${FILE_GENERATE_DIR}/test.py

@@ -405,6 +408,9 @@ add_custom_target(check-onnx-backend-dynamic-nnpa
     ONNX_HOME=${FILE_GENERATE_DIR}/check-onnx-backend-dynamic-nnpa
     TEST_INSTRUCTION_CHECK=true
    TEST_DYNAMIC=true
+    # Needed for convolution models to avoid NaN outputs.
+    # Remove this if saturation is enabled by default.
+    TEST_COMPILE_ARGS="--nnpa-saturation=true"
     ${NNPA_TESTS_ENVS_DYNAMIC} ${BACKEND_TEST_COMMAND} ${BACKEND_TEST_ARGS} ${FILE_GENERATE_DIR}/test.py
   DEPENDS
     ${FILE_GENERATE_DIR}/test.py

@@ -418,6 +424,9 @@ add_custom_target(check-onnx-backend-constant-nnpa
     # TEST_INSTRUCTION_CHECK=true
     ONNX_HOME=${FILE_GENERATE_DIR}/check-onnx-backend-constant-nnpa
     TEST_CONSTANT=true
+    # Needed for convolution models to avoid NaN outputs.
+    # Remove this if saturation is enabled by default.
+    TEST_COMPILE_ARGS="--nnpa-saturation=true"
     ${NNPA_TESTS_ENVS} ${BACKEND_TEST_COMMAND} ${BACKEND_TEST_ARGS} ${FILE_GENERATE_DIR}/test.py
   DEPENDS
     ${FILE_GENERATE_DIR}/test.py

@@ -427,6 +436,9 @@
 add_custom_target(check-onnx-backend-compilerlib-nnpa
   COMMAND
     TEST_COMPILERLIB=true ONNX_HOME=${CMAKE_CURRENT_BINARY_DIR}
+    # Needed for convolution models to avoid NaN outputs.
+    # Remove this if saturation is enabled by default.
+    TEST_COMPILE_ARGS="--nnpa-saturation=true"
     ${NNPA_TESTS_ENVS} ${BACKEND_TEST_COMMAND} ${BACKEND_TEST_ARGS} ${FILE_GENERATE_DIR}/test.py
   DEPENDS
     ${FILE_GENERATE_DIR}/test.py
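For context on the "NaN outputs" comments: stickification converts f32 values to dlfloat16, and values outside the representable range can otherwise turn into non-numeric results; --nnpa-saturation=true clamps them to the range limits instead. A rough NumPy sketch of the idea, where the dlfloat16 maximum used, (1 - 2^-9) * 2^33 ≈ 8.57e9, is an assumption for illustration only:

import numpy as np

# Assumed approximate dlfloat16 maximum magnitude: (1 - 2**-9) * 2**33.
DLF16_MAX = (1 - 2**-9) * 2**33  # ~8.57e9

def saturate_to_dlfloat_range(x: np.ndarray) -> np.ndarray:
    """Clamp to the dlfloat16-representable range, sketching how saturation
    keeps out-of-range values from becoming Inf/NaN on stickification."""
    return np.clip(x, -DLF16_MAX, DLF16_MAX)

x = np.array([1.0, 1e12, -1e12], dtype=np.float32)
print(saturate_to_dlfloat_range(x))
# [1.0, 8.573157e9, -8.573157e9] -- clamped rather than overflowing.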

test/backend/common.py

+7
@@ -142,6 +142,13 @@ def compile_model(model, emit):
     command_list.append(model_name)
     command_list.append("-o=" + exec_base)

+    # Additional args passed in by TEST_COMPILE_ARGS
+    # Args are separated by ';'
+    additional_args = os.getenv("TEST_COMPILE_ARGS")
+    if additional_args is not None:
+        compile_args = additional_args.split(";")
+        command_list += compile_args
+
     # Call frontend to process model_name.onnx, bit code will be generated.
     dynamic_inputs_dims = determine_dynamic_parameters(name)
     if args.verbose:
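With this hook, any semicolon-separated flags placed in TEST_COMPILE_ARGS by the CMake targets above are appended to the onnx-mlir invocation. A small self-contained demo of the same parsing (the command and flag values are illustrative only):

import os

# Simulate what the CMake targets above export.
os.environ["TEST_COMPILE_ARGS"] = "--nnpa-saturation=true"

command_list = ["onnx-mlir", "model.onnx", "-o=model"]

# Mirrors the new code in compile_model(): split on ';' and append.
additional_args = os.getenv("TEST_COMPILE_ARGS")
if additional_args is not None:
    command_list += additional_args.split(";")

print(command_list)
# ['onnx-mlir', 'model.onnx', '-o=model', '--nnpa-saturation=true']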
New test file (+76)

@@ -0,0 +1,76 @@
+// RUN: onnx-mlir-opt --mcpu=z16 --maccel=NNPA --enable-compiler-stick-unstick=true --shape-inference --convert-onnx-to-krnl --canonicalize %s -split-input-file | FileCheck %s
+
+func.func @should_lower_to_zlow(%arg0: tensor<1x3x5x7xf32>) -> tensor<*xf32> {
+  %0 = "zhigh.Stick"(%arg0) {layout = "NHWC"} : (tensor<1x3x5x7xf32>) -> tensor<*xf16>
+  %1 = "zhigh.Unstick"(%0) : (tensor<*xf16>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+
+// CHECK-DAG: [[MAP_0_:#.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3 floordiv 64, d1, d2 floordiv 32, d2 mod 32, d3 mod 64)>
+// CHECK-LABEL: func.func @should_lower_to_zlow
+// CHECK-SAME: ([[PARAM_0_:%.+]]: memref<1x3x5x7xf32>) -> memref<1x3x5x7xf32> {
+// CHECK-DAG: [[RES_:%.+]] = memref.alloc() {{.*}}: memref<1x5x7x3xf16, #map>
+// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<1x5x7x3xf32>
+// CHECK-DAG: [[LOOP_0_:%.+]]:4 = krnl.define_loops 4
+// CHECK: krnl.iterate([[LOOP_0_]]#0, [[LOOP_0_]]#1, [[LOOP_0_]]#2, [[LOOP_0_]]#3) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to 1, [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to 3, [[LOOP_0_]]#2 -> [[I_2_:%.+]] = 0 to 5, [[LOOP_0_]]#3 -> [[I_3_:%.+]] = 0 to 7){
+// CHECK: [[VAR_2_:%.+]]:4 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[LOOP_0_]]#1, [[LOOP_0_]]#2, [[LOOP_0_]]#3) : (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index, index)
+// CHECK: [[LOAD_PARAM_0_MEM_:%.+]] = krnl.load [[PARAM_0_]]{{.}}[[VAR_2_]]#0, [[VAR_2_]]#1, [[VAR_2_]]#2, [[VAR_2_]]#3] : memref<1x3x5x7xf32>
+// CHECK: krnl.store [[LOAD_PARAM_0_MEM_]], [[RES_1_]]{{.}}[[VAR_2_]]#0, [[VAR_2_]]#2, [[VAR_2_]]#3, [[VAR_2_]]#1] : memref<1x5x7x3xf32>
+// CHECK: }
+// CHECK: "zlow.stick"([[RES_1_]], [[RES_]]) {layout = "NHWC"} : (memref<1x5x7x3xf32>, memref<1x5x7x3xf16, #map>) -> ()
+// CHECK: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<1x5x7x3xf32>
+// CHECK: "zlow.unstick"([[RES_]], [[RES_]]_1) {layout = "NHWC"} : (memref<1x5x7x3xf16, #map>, memref<1x5x7x3xf32>) -> ()
+// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<1x3x5x7xf32>
+// CHECK-DAG: [[LOOP_1_:%.+]]:4 = krnl.define_loops 4
+// CHECK: krnl.iterate([[LOOP_1_]]#0, [[LOOP_1_]]#1, [[LOOP_1_]]#2, [[LOOP_1_]]#3) with ([[LOOP_1_]]#0 -> [[I_4_:%.+]] = 0 to 1, [[LOOP_1_]]#1 -> [[I_5_:%.+]] = 0 to 5, [[LOOP_1_]]#2 -> [[I_6_:%.+]] = 0 to 7, [[LOOP_1_]]#3 -> [[I_7_:%.+]] = 0 to 3){
+// CHECK: [[VAR_2_1_:%.+]]:4 = krnl.get_induction_var_value([[LOOP_1_]]#0, [[LOOP_1_]]#1, [[LOOP_1_]]#2, [[LOOP_1_]]#3) : (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index, index)
+// CHECK: [[LOAD_PARAM_0_MEM_1_:%.+]] = krnl.load [[RES_2_]]{{.}}[[VAR_2_1_]]#0, [[VAR_2_1_]]#1, [[VAR_2_1_]]#2, [[VAR_2_1_]]#3] : memref<1x5x7x3xf32>
+// CHECK: krnl.store [[LOAD_PARAM_0_MEM_1_]], [[RES_3_]]{{.}}[[VAR_2_1_]]#0, [[VAR_2_1_]]#3, [[VAR_2_1_]]#1, [[VAR_2_1_]]#2] : memref<1x3x5x7xf32>
+// CHECK: }
+// CHECK: return [[RES_3_]] : memref<1x3x5x7xf32>
+// CHECK: }
+}

+// -----

+func.func @should_lower_to_zlow_unknown_dims(%arg0: tensor<1x?x?x7xf32>) -> tensor<*xf32> {
+  %0 = "zhigh.Stick"(%arg0) {layout = "NHWC"} : (tensor<1x?x?x7xf32>) -> tensor<*xf16>
+  %1 = "zhigh.Unstick"(%0) : (tensor<*xf16>) -> tensor<*xf32>
+  return %1 : tensor<*xf32>
+
+// CHECK-DAG: [[MAP_0_:#.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3 floordiv 64, d1, d2 floordiv 32, d2 mod 32, d3 mod 64)>
+// CHECK-DAG: [[MAP_1_:#.+]] = affine_map<(d0) -> (d0)>
+// CHECK-DAG: [[MAP_2_:#.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK-LABEL: func.func @should_lower_to_zlow_unknown_dims
+// CHECK-SAME: ([[PARAM_0_:%.+]]: memref<1x?x?x7xf32>) -> memref<1x?x?x7xf32> {
+// CHECK-DAG: [[CST_2_:%.+]] = arith.constant 2 : index
+// CHECK-DAG: [[CST_1_:%.+]] = arith.constant 1 : index
+// CHECK-NOT: separator of consecutive DAGs
+// CHECK-DAG: [[VAR_dim_:%.+]] = memref.dim [[PARAM_0_]], [[CST_1_]] : memref<1x?x?x7xf32>
+// CHECK-DAG: [[VAR_dim_0_:%.+]] = memref.dim [[PARAM_0_]], [[CST_2_]] : memref<1x?x?x7xf32>
+// CHECK-NOT: separator of consecutive DAGs
+// CHECK-DAG: [[RES_:%.+]] = memref.alloc([[VAR_dim_0_]], [[VAR_dim_]]) {{.*}}: memref<1x?x7x?xf16, #map>
+// CHECK-DAG: [[VAR_dim_1_:%.+]] = memref.dim [[PARAM_0_]], [[CST_2_]] : memref<1x?x?x7xf32>
+// CHECK-DAG: [[VAR_dim_2_:%.+]] = memref.dim [[PARAM_0_]], [[CST_1_]] : memref<1x?x?x7xf32>
+// CHECK-NOT: separator of consecutive DAGs
+// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc([[VAR_dim_1_]], [[VAR_dim_2_]]) {{.*}}: memref<1x?x7x?xf32>
+// CHECK-DAG: [[LOOP_0_:%.+]]:4 = krnl.define_loops 4
+// CHECK-DAG: [[VAR_dim_4_:%.+]] = memref.dim [[PARAM_0_]], [[CST_1_]] : memref<1x?x?x7xf32>
+// CHECK-DAG: [[VAR_dim_5_:%.+]] = memref.dim [[PARAM_0_]], [[CST_2_]] : memref<1x?x?x7xf32>
+// CHECK: krnl.iterate([[LOOP_0_]]#0, [[LOOP_0_]]#1, [[LOOP_0_]]#2, [[LOOP_0_]]#3) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to 1, [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_1_]]([[VAR_dim_4_]]), [[LOOP_0_]]#2 -> [[I_2_:%.+]] = 0 to [[MAP_2_]]([[VAR_dim_4_]], [[VAR_dim_5_]]), [[LOOP_0_]]#3 -> [[I_3_:%.+]] = 0 to 7){
+// CHECK: [[VAR_2_:%.+]]:4 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[LOOP_0_]]#1, [[LOOP_0_]]#2, [[LOOP_0_]]#3) : (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index, index)
+// CHECK: [[LOAD_PARAM_0_MEM_:%.+]] = krnl.load [[PARAM_0_]]{{.}}[[VAR_2_]]#0, [[VAR_2_]]#1, [[VAR_2_]]#2, [[VAR_2_]]#3] : memref<1x?x?x7xf32>
+// CHECK: krnl.store [[LOAD_PARAM_0_MEM_]], [[RES_1_]]{{.}}[[VAR_2_]]#0, [[VAR_2_]]#2, [[VAR_2_]]#3, [[VAR_2_]]#1] : memref<1x?x7x?xf32>
+// CHECK: }
+// CHECK: "zlow.stick"([[RES_1_]], [[RES_]]) {layout = "NHWC"} : (memref<1x?x7x?xf32>, memref<1x?x7x?xf16, #map>) -> ()
+// CHECK: [[RES_2_:%.+]] = memref.alloc([[VAR_dim_0_]], [[VAR_dim_]]) {{.*}}: memref<1x?x7x?xf32>
+// CHECK: "zlow.unstick"([[RES_]], [[RES_]]_6) {layout = "NHWC"} : (memref<1x?x7x?xf16, #map>, memref<1x?x7x?xf32>) -> ()
+// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc([[VAR_dim_]], [[VAR_dim_]]_0) {{.*}}: memref<1x?x?x7xf32>
+// CHECK-DAG: [[LOOP_1_:%.+]]:4 = krnl.define_loops 4
+// CHECK: krnl.iterate([[LOOP_1_]]#0, [[LOOP_1_]]#1, [[LOOP_1_]]#2, [[LOOP_1_]]#3) with ([[LOOP_1_]]#0 -> [[I_4_:%.+]] = 0 to 1, [[LOOP_1_]]#1 -> [[I_5_:%.+]] = 0 to [[MAP_1_]]([[VAR_dim_0_]]), [[LOOP_1_]]#2 -> [[I_6_:%.+]] = 0 to 7, [[LOOP_1_]]#3 -> [[I_7_:%.+]] = 0 to [[MAP_2_]]([[VAR_dim_0_]], [[VAR_dim_]])){
+// CHECK: [[VAR_2_1_:%.+]]:4 = krnl.get_induction_var_value([[LOOP_1_]]#0, [[LOOP_1_]]#1, [[LOOP_1_]]#2, [[LOOP_1_]]#3) : (!krnl.loop, !krnl.loop, !krnl.loop, !krnl.loop) -> (index, index, index, index)
+// CHECK: [[LOAD_PARAM_0_MEM_1_:%.+]] = krnl.load [[RES_2_]]{{.}}[[VAR_2_1_]]#0, [[VAR_2_1_]]#1, [[VAR_2_1_]]#2, [[VAR_2_1_]]#3] : memref<1x?x7x?xf32>
+// CHECK: krnl.store [[LOAD_PARAM_0_MEM_1_]], [[RES_3_]]{{.}}[[VAR_2_1_]]#0, [[VAR_2_1_]]#3, [[VAR_2_1_]]#1, [[VAR_2_1_]]#2] : memref<1x?x?x7xf32>
+// CHECK: }
+// CHECK: return [[RES_3_]] : memref<1x?x?x7xf32>
+// CHECK: }
+}

test/mlir/accelerators/nnpa/conversion/zhigh-to-zlow/stick-unstick.mlir

+1-1
@@ -1,4 +1,4 @@
-// RUN: onnx-mlir-opt --mcpu=z16 --maccel=NNPA --shape-inference --convert-onnx-to-krnl --canonicalize %s -split-input-file | FileCheck %s
+// RUN: onnx-mlir-opt --mcpu=z16 --maccel=NNPA --enable-compiler-stick-unstick=false --shape-inference --convert-onnx-to-krnl --canonicalize %s -split-input-file | FileCheck %s

 func.func @should_lower_to_zlow(%arg0: tensor<1x3x5x7xf32>) -> tensor<*xf32> {
   %0 = "zhigh.Stick"(%arg0) {layout = "NHWC"} : (tensor<1x3x5x7xf32>) -> tensor<*xf16>

test/mlir/accelerators/nnpa/conversion/zhigh-to-zlow/test-datalayout.mlir

+1-1
@@ -1,4 +1,4 @@
-// RUN: onnx-mlir-opt --mcpu=z16 --maccel=NNPA --shape-inference --convert-onnx-to-krnl --canonicalize %s -split-input-file | FileCheck %s
+// RUN: onnx-mlir-opt --mcpu=z16 --maccel=NNPA --enable-compiler-stick-unstick=false --shape-inference --convert-onnx-to-krnl --canonicalize %s -split-input-file | FileCheck %s

 func.func @should_lower_to_zlow_1d(%arg0: tensor<7xf32>) -> tensor<*xf16> {
   %0 = "zhigh.Stick"(%arg0) {layout = "1D"} : (tensor<7xf32>) -> tensor<*xf16>
