Decompose Hardswish into simpler ONNX ops (#3107)

kumarappan-cmyk · tungld · web-flow · commit bd070eac8977 · 2025-04-14T11:11:46.000+09:00
* Decompose and lower Hardswish

Signed-off-by: Kumarappan <kumarappan.thiyagarajan@multicorewareinc.com>

* Provide the decomposition as a compile-time option, with Krnl dialect lowering as the default

Signed-off-by: Kumarappan <kumarappan.thiyagarajan@multicorewareinc.com>

---------

Signed-off-by: Kumarappan <kumarappan.thiyagarajan@multicorewareinc.com>
Co-authored-by: Tung D. Le <tung@jp.ibm.com>
diff --git a/docs/SupportedONNXOps-cpu.md b/docs/SupportedONNXOps-cpu.md
@@ -92,7 +92,7 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 22. Limitatio
 | **HammingWindow** |none | | | |
 | **HannWindow** |none | | | |
 | **HardSigmoid** |6 - * | | |
-| **HardSwish** |none | | | |
+| **HardSwish** |14 - * | | | |
 | **Hardmax** |6 - * | | |
 | **Identity** |16 - * |Sequence identity not supported. Does not support int4 and uint4. | |
 | **If** |16 - * |Sequence and Optional outputs are not supported. Does not support int4 and uint4. | |
diff --git a/src/Compiler/CompilerOptions.cpp b/src/Compiler/CompilerOptions.cpp
@@ -48,6 +48,7 @@ bool enableKrnlBufferReuse;                            // common for both
 bool enableSafeCodeGen;                                // common for both
 bool disableMemRefPrefetch;                            // common for both
 uint64_t compilationNumThreads;                        // common for both
+std::vector<std::string> decomposeOpsInONNX;           // common for both
 EmissionTargetType emissionTarget;                     // onnx-mlir only
 bool invokeOnnxVersionConverter;                       // onnx-mlir only
 bool preserveLocations;                                // onnx-mlir only
@@ -264,6 +265,15 @@ static llvm::cl::opt<bool, true> disableMemRefPrefetchOpt(
     llvm::cl::location(disableMemRefPrefetch), llvm::cl::init(false),
     llvm::cl::cat(OnnxMlirCommonOptions));
 
+static llvm::cl::list<std::string, std::vector<std::string>>
+    decomposeOpsInONNXOpt("decompose-op-in-onnx",
+        llvm::cl::desc("Specify ONNX operations to decompose.\n"
+                       "Supported Ops - HardSwish"),
+        llvm::cl::value_desc("ONNX operation to decompose"),
+        llvm::cl::location(decomposeOpsInONNX), // parsed names are stored here
+        llvm::cl::cat(OnnxMlirCommonOptions), llvm::cl::CommaSeparated,
+        llvm::cl::ZeroOrMore); // several names may be given, comma-separated
+
 static llvm::cl::opt<bool, true> disableRecomposeOptionOpt("disable-recompose",
     llvm::cl::desc("Disable recomposition of ONNX operations."),
     llvm::cl::location(disableRecomposeOption), llvm::cl::init(false),
diff --git a/src/Compiler/CompilerOptions.hpp b/src/Compiler/CompilerOptions.hpp
@@ -94,6 +94,7 @@ extern bool enableKrnlBufferReuse;                            // common for both
 extern bool enableSafeCodeGen;                                // common for both
 extern bool disableMemRefPrefetch;                            // common for both
 extern uint64_t compilationNumThreads;                        // common for both
+extern std::vector<std::string> decomposeOpsInONNX;           // common for both
 extern EmissionTargetType emissionTarget;                     // onnx-mlir only
 extern bool invokeOnnxVersionConverter;                       // onnx-mlir only
 extern bool preserveLocations;                                // onnx-mlir only
diff --git a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp
@@ -646,6 +646,51 @@ Value emitScalarOpFor<ONNXHardSigmoidOp>(ConversionPatternRewriter &rewriter,
   return clipHighest;
 }
 
+//===----------------------------------------------------------------------===//
+// Scalar unary ops for lowering ONNXHardSwishOp
+//===----------------------------------------------------------------------===//
+// Only FOp is given a custom emitter (defined below); IOp is mapped to
+// NotSuportedScalarOp.
+template <>
+struct ScalarOp<ONNXHardSwishOp> {
+  using FOp = CustomScalarOp;
+  using IOp = NotSuportedScalarOp;
+};
+
+// Reported op mix: 3 arithmetic + 2 multiply ops. The emitter below creates
+// one add, one min, one max and two muls, so min/max are presumably counted
+// under ArithmeticGop - NOTE(review): confirm against how sibling ops report
+// min/max.
+template <>
+GenOpMix getGenOpMix<ONNXHardSwishOp>(Type t, Operation *op) {
+  return {{GenericOps::ArithmeticGop, 3}, {GenericOps::MulGop, 2}};
+}
+
+// Emits HardSwish(x) = x * max(0, min(1, (x / 6) + 0.5)) for scalarOperands[0].
+template <>
+Value emitScalarOpFor<ONNXHardSwishOp>(ConversionPatternRewriter &rewriter,
+    Location loc, Operation *op, Type elementType,
+    ArrayRef<Value> scalarOperands) {
+  CheckIfCustomScalarOpIsSupported<ONNXHardSwishOp>(elementType);
+  MultiDialectBuilder<MathBuilder> create(rewriter, loc);
+  Value x = scalarOperands[0];
+
+  // Clamp bounds first, then scale (1/6) and shift (0.5), all in elementType.
+  Value lowerBound = create.math.constant(elementType, 0);
+  Value upperBound = create.math.constant(elementType, 1);
+  Value oneSixth = create.math.constant(elementType, 1.0 / 6.0);
+  Value half = create.math.constant(elementType, 0.5);
+
+  // gate = max(0, min(1, x * (1/6) + 0.5)); the upper bound is applied first.
+  Value gate = create.math.mul(x, oneSixth);
+  gate = create.math.add(gate, half);
+  gate = create.math.min(gate, upperBound);
+  gate = create.math.max(gate, lowerBound);
+
+  // The result is the input scaled by its gate.
+  return create.math.mul(x, gate);
+}
+
 //===----------------------------------------------------------------------===//
 // Scalar unary ops for lowering ONNXEluOp
 //===----------------------------------------------------------------------===//
@@ -1714,12 +1759,12 @@ bool OpFusionHelper::checkFusibleOp(Operation *useOp, Operation *defOp,
       mlir::ONNXEluOp, mlir::ONNXErfOp, mlir::ONNXAcosOp, mlir::ONNXAcoshOp,
       mlir::ONNXAsinOp, mlir::ONNXAsinhOp, mlir::ONNXAtanhOp, mlir::ONNXExpOp,
       mlir::ONNXFloorOp, mlir::ONNXGeluOp, mlir::ONNXHardSigmoidOp,
-      mlir::ONNXIsInfOp, mlir::ONNXIsNaNOp, mlir::ONNXLeakyReluOp,
-      mlir::ONNXLogOp, mlir::ONNXNegOp, mlir::ONNXNotOp, mlir::ONNXReciprocalOp,
-      mlir::ONNXReluOp, mlir::ONNXRoundOp, mlir::ONNXSeluOp,
-      mlir::ONNXSigmoidOp, mlir::ONNXSignOp, mlir::ONNXSinOp, mlir::ONNXSinhOp,
-      mlir::ONNXSoftplusOp, mlir::ONNXSoftsignOp, mlir::ONNXSqrtOp,
-      mlir::ONNXTanOp, mlir::ONNXTanhOp,
+      mlir::ONNXHardSwishOp, mlir::ONNXIsInfOp, mlir::ONNXIsNaNOp,
+      mlir::ONNXLeakyReluOp, mlir::ONNXLogOp, mlir::ONNXNegOp, mlir::ONNXNotOp,
+      mlir::ONNXReciprocalOp, mlir::ONNXReluOp, mlir::ONNXRoundOp,
+      mlir::ONNXSeluOp, mlir::ONNXSigmoidOp, mlir::ONNXSignOp, mlir::ONNXSinOp,
+      mlir::ONNXSinhOp, mlir::ONNXSoftplusOp, mlir::ONNXSoftsignOp,
+      mlir::ONNXSqrtOp, mlir::ONNXTanOp, mlir::ONNXTanhOp,
       // Binary Op
       mlir::ONNXEqualOp, mlir::ONNXGreaterOp, mlir::ONNXGreaterOrEqualOp,
       mlir::ONNXLessOp, mlir::ONNXLessOrEqualOp, mlir::ONNXModOp,
@@ -2674,6 +2719,7 @@ void populateLoweringONNXElementwiseOpPattern(RewritePatternSet &patterns,
       ONNXElementwiseBinaryOpLowering<mlir::ONNXGreaterOp>,
       ONNXElementwiseBinaryOpLowering<mlir::ONNXGreaterOrEqualOp>,
       ONNXElementwiseUnaryOpLowering<mlir::ONNXHardSigmoidOp>,
+      ONNXElementwiseUnaryOpLowering<mlir::ONNXHardSwishOp>,
       ONNXElementwiseUnaryOpLowering<mlir::ONNXIsInfOp>,
       ONNXElementwiseUnaryOpLowering<mlir::ONNXIsNaNOp>,
       ONNXElementwiseUnaryOpLowering<mlir::ONNXLeakyReluOp>,
diff --git a/src/Dialect/ONNX/DialectBuilder.cpp b/src/Dialect/ONNX/DialectBuilder.cpp
@@ -116,6 +116,13 @@ Value OnnxBuilder::constantInt64(const ArrayRef<int64_t> intVals) const {
   return constant(denseAttr);
 }
 
+Value OnnxBuilder::constantFloat32(const ArrayRef<float> floatVals) const {
+  // Pack the values into a rank-1 f32 tensor attribute of matching length.
+  const int64_t numElements = static_cast<int64_t>(floatVals.size());
+  auto tensorType = RankedTensorType::get({numElements}, b().getF32Type());
+  return constant(DenseElementsAttr::get(tensorType, floatVals));
+}
+
 Value OnnxBuilder::conv(Type Y, Value X, Value W, Value B, StringRef autoPad,
     ArrayRef<int64_t> dilations, int64_t group, ArrayRef<int64_t> kernelShape,
     ArrayRef<int64_t> pads, ArrayRef<int64_t> strides) const {
diff --git a/src/Dialect/ONNX/DialectBuilder.hpp b/src/Dialect/ONNX/DialectBuilder.hpp
@@ -70,6 +70,7 @@ struct OnnxBuilder : DialectBuilder {
   // ONNXConstantOp
   mlir::Value constant(mlir::Attribute denseAttr) const;
   mlir::Value constantInt64(const mlir::ArrayRef<int64_t> intVals) const;
+  mlir::Value constantFloat32(const mlir::ArrayRef<float> floatVals) const;
 
   // ONNXConvOp
   mlir::Value conv(mlir::Type Y, mlir::Value X, mlir::Value W, mlir::Value B,
diff --git a/src/Dialect/ONNX/Transforms/Decompose.cpp b/src/Dialect/ONNX/Transforms/Decompose.cpp
@@ -26,6 +26,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "src/Compiler/CompilerOptions.hpp"
 #include "llvm/Support/Debug.h"
 
 #include "src/Dialect/ONNX/DialectBuilder.hpp"
@@ -1288,6 +1289,63 @@ class ReplaceCastLikeByCastPattern : public OpRewritePattern<ONNXCastLikeOp> {
   }
 };
 
+// =============================================================================
+// Decompose Hardswish to simpler ONNX ops
+// =============================================================================
+// DecomposeHardSwishPattern replaces ONNXHardSwishOp with its equivalent
+// mathematical decomposition using basic ONNX operations:
+//
+//    HardSwish(x) = x * max(0, min(1, (x / 6) + 0.5))
+//
+// This pattern:
+//  - Multiplies input by `1/6`
+//  - Adds `0.5` to the scaled input
+//  - Clamps the result between `0` and `1` using Min and Max ops
+//  - Multiplies the clamped value with the original input
+// Constants use the input's element type (not a fixed f32), and the final Mul
+// takes the op's declared result type so its users keep a consistent type.
+struct DecomposeHardSwishPattern : public OpRewritePattern<ONNXHardSwishOp> {
+  using OpRewritePattern<ONNXHardSwishOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(
+      ONNXHardSwishOp hardswishOp, PatternRewriter &rewriter) const final {
+    Location loc = hardswishOp.getLoc();
+    onnx_mlir::MultiDialectBuilder<onnx_mlir::OnnxBuilder> create(
+        rewriter, loc);
+    Value input = hardswishOp.getOperand();
+    Type inputType = input.getType();
+    auto shapedType = mlir::dyn_cast<ShapedType>(inputType);
+    if (!shapedType || !mlir::isa<FloatType>(shapedType.getElementType()))
+      return rewriter.notifyMatchFailure(hardswishOp, "expected float tensor");
+
+    // Each constant is tensor<1 x elementType>; f32 inputs get the same IR.
+    Type elementType = shapedType.getElementType();
+    auto constType = RankedTensorType::get({1}, elementType);
+    auto makeConst = [&](double value) -> Value {
+      Attribute scalar = rewriter.getFloatAttr(elementType, value);
+      return create.onnx.constant(
+          DenseElementsAttr::get(constType, ArrayRef<Attribute>(scalar)));
+    };
+    Value alphaConst = makeConst(1.0 / 6.0);
+    Value betaConst = makeConst(0.5);
+    Value minConst = makeConst(1.0);
+    Value maxConst = makeConst(0.0);
+
+    // gate = max(0, min(1, input * alpha + beta)), typed like the input.
+    Value gate = rewriter.create<ONNXMulOp>(loc, inputType, input, alphaConst);
+    gate = rewriter.create<ONNXAddOp>(loc, inputType, gate, betaConst);
+    gate = rewriter.create<ONNXMinOp>(
+        loc, inputType, ValueRange({gate, minConst}));
+    gate = rewriter.create<ONNXMaxOp>(
+        loc, inputType, ValueRange({gate, maxConst}));
+
+    // Use the op's result type here: it may differ from the operand type.
+    rewriter.replaceOpWithNewOp<ONNXMulOp>(
+        hardswishOp, hardswishOp.getResult().getType(), input, gate);
+    return success();
+  }
+};
+
 struct DecomposeONNXToONNXPass
     : public PassWrapper<DecomposeONNXToONNXPass, OperationPass<func::FuncOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(DecomposeONNXToONNXPass)
@@ -1364,6 +1422,13 @@ void DecomposeONNXToONNXPass::runOnOperation() {
   target.addIllegalOp<ONNXUpsampleOp>();
   target.addIllegalOp<ONNXUpsampleV7Op>();
 
+  if (!onnx_mlir::decomposeOpsInONNX.empty()) {
+    for (const auto &op : onnx_mlir::decomposeOpsInONNX) {
+      if (op == "HardSwish") {
+        target.addIllegalOp<ONNXHardSwishOp>();
+      }
+    }
+  }
   target.addDynamicallyLegalOp<ONNXEinsumOp>([](ONNXEinsumOp op) {
     return !onnx_mlir::DecomposeEinsumPattern::isDecomposable(op);
   });
@@ -1439,6 +1504,14 @@ void onnx_mlir::getDecomposeONNXToONNXPatterns(
   patterns.insert<SoftmaxCrossEntropyPattern>(context);
   patterns.insert<SumToAddPattern>(context);
 
+  if (!onnx_mlir::decomposeOpsInONNX.empty()) {
+    for (const auto &op : onnx_mlir::decomposeOpsInONNX) {
+      if (op == "HardSwish") {
+        patterns.insert<DecomposeHardSwishPattern>(context);
+      }
+    }
+  }
+
   // TODO: consider whether to include SoftmaxPattern here
 }
 
diff --git a/test/mlir/conversion/onnx_to_krnl/Math/Elementwise_with_canonicalize.mlir b/test/mlir/conversion/onnx_to_krnl/Math/Elementwise_with_canonicalize.mlir
@@ -1247,6 +1247,40 @@ func.func private @test_hardsigmoid(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
 
 // -----
 
+func.func private @test_hardswish(%arg0: tensor<?x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.HardSwish"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32>
+  "func.return"(%0) : (tensor<*xf32>) -> ()
+  // Expects scalar f32 mulf/addf/minnumf/maxnumf/mulf inside a 2-D krnl.iterate loop nest.
+  // mlir2FileCheck.py
+  // CHECK-DAG:   [[MAP_0_:#.+]] = affine_map<(d0) -> (d0)>
+  // CHECK-LABEL:  func.func private @test_hardswish
+  // CHECK-SAME:   ([[PARAM_0_:%.+]]: memref<?x10xf32>) -> memref<?x10xf32> {
+  // CHECK-DAG:       [[CST_HALF_:%.+]] = arith.constant 5.000000e-01 : f32
+  // CHECK-DAG:       [[CST_ONE_SIXTH_:%.+]] = arith.constant 0.166666672 : f32
+  // CHECK-DAG:       [[CST_ONE_:%.+]] = arith.constant 1.000000e+00 : f32
+  // CHECK-DAG:       [[CST_ZERO_:%.+]] = arith.constant 0.000000e+00 : f32
+  // CHECK-DAG:       [[CST_IDX0_:%.+]] = arith.constant 0 : index
+  // CHECK:           [[VAR_DIM_:%.+]] = memref.dim [[PARAM_0_]], [[CST_IDX0_]] : memref<?x10xf32>
+  // CHECK-DAG:       [[RES_:%.+]] = memref.alloc([[VAR_DIM_]]) {{.*}}: memref<?x10xf32>
+  // CHECK-DAG:       [[LOOPS_:%.+]]:2 = krnl.define_loops 2
+  // CHECK-DAG:       [[DIM_:%.+]] = memref.dim [[PARAM_0_]], [[CST_IDX0_]] : memref<?x10xf32>
+  // CHECK:           krnl.iterate([[LOOPS_]]#0, [[LOOPS_]]#1) with ([[LOOPS_]]#0 -> [[I0_:%.+]] = 0 to [[MAP_0_]]([[DIM_]]), [[LOOPS_]]#1 -> [[I1_:%.+]] = 0 to 10){
+  // CHECK:             [[IVS_:%.+]]:2 = krnl.get_induction_var_value([[LOOPS_]]#0, [[LOOPS_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index)
+  // CHECK:             [[LOAD_:%.+]] = krnl.load [[PARAM_0_]]{{.}}[[IVS_]]#0, [[IVS_]]#1] : memref<?x10xf32>
+  // CHECK:             [[SCALE_:%.+]] = arith.mulf [[LOAD_]], [[CST_ONE_SIXTH_]] : f32
+  // CHECK:             [[SHIFTED_:%.+]] = arith.addf [[SCALE_]], [[CST_HALF_]] : f32
+  // CHECK:             [[CLAMPED1_:%.+]] = arith.minnumf [[SHIFTED_]], [[CST_ONE_]] : f32
+  // CHECK:             [[CLAMPED2_:%.+]] = arith.maxnumf [[CLAMPED1_]], [[CST_ZERO_]] : f32
+  // CHECK:             [[MUL_FINAL_:%.+]] = arith.mulf [[LOAD_]], [[CLAMPED2_]] : f32
+  // CHECK:             krnl.store [[MUL_FINAL_]], [[RES_]]{{.}}[[IVS_]]#0, [[IVS_]]#1] : memref<?x10xf32>
+  // CHECK:           }
+  // CHECK:           return [[RES_]] : memref<?x10xf32>
+  // CHECK:         }
+
+}
+
+// -----
+
 func.func private @test_reciprocal(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
   %0 = "onnx.Reciprocal"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32>
   "func.return"(%0) : (tensor<*xf32>) -> ()
diff --git a/test/mlir/conversion/onnx_to_krnl/Math/Elementwise_with_canonicalize_O3.mlir b/test/mlir/conversion/onnx_to_krnl/Math/Elementwise_with_canonicalize_O3.mlir
@@ -1860,6 +1860,55 @@ func.func private @test_hardsigmoid(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
 // -----
 
 
+func.func private @test_hardswish(%arg0: tensor<?x10xf32>) -> tensor<*xf32> {
+  %0 = "onnx.HardSwish"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32>
+  "func.return"(%0) : (tensor<*xf32>) -> ()
+// Expects input and output reshaped to memref<?xf32> and processed in blocks of vector<32xf32>.
+// mlir2FileCheck.py
+// CHECK-DAG:   [[MAP_0_:#.+]] = affine_map<()[s0] -> (s0 * 40 + 128)>
+// CHECK-DAG:   [[MAP_1_:#.+]] = affine_map<()[s0] -> (s0 * 10)>
+// CHECK-DAG:   [[MAP_2_:#.+]] = affine_map<()[s0, s1, s2] -> (s2)>
+// CHECK-LABEL:  func.func private @test_hardswish
+// CHECK-SAME:   ([[PARAM_0_:%.+]]: memref<?x10xf32>) -> memref<?x10xf32> {
+// CHECK-DAG:       [[VAR_cst_:%.+]] = arith.constant dense<5.000000e-01> : vector<32xf32>
+// CHECK-DAG:       [[VAR_cst_0_:%.+]] = arith.constant dense<0.166666672> : vector<32xf32>
+// CHECK-DAG:       [[VAR_cst_1_:%.+]] = arith.constant dense<1.000000e+00> : vector<32xf32>
+// CHECK-DAG:       [[VAR_cst_2_:%.+]] = arith.constant dense<0.000000e+00> : vector<32xf32>
+// CHECK-DAG:       [[CST_0_:%.+]] = arith.constant 0 : index
+// CHECK:           [[VAR_dim_:%.+]] = memref.dim [[PARAM_0_]], [[CST_0_]] : memref<?x10xf32>
+// CHECK:           [[VAR_0_:%.+]] = affine.apply [[MAP_0_]](){{.}}[[VAR_dim_]]{{.}}
+// CHECK:           [[RES_:%.+]] = memref.alloc([[VAR_0_]]) {{.*}}: memref<?xi8>
+// CHECK-DAG:       [[VAR_view_:%.+]] = memref.view [[RES_]]{{.}}[[CST_0_]]{{.}}{{.}}[[VAR_dim_]]{{.}} : memref<?xi8> to memref<?x10xf32>
+// CHECK-DAG:       [[VAR_dim_2_:%.+]] = memref.dim [[PARAM_0_]], [[CST_0_]] : memref<?x10xf32>
+// CHECK-NOT: separator of consecutive DAGs
+// CHECK-DAG:       [[VAR_1_:%.+]] = affine.apply [[MAP_1_]](){{.}}[[VAR_dim_2_]]{{.}}
+// CHECK-DAG:       [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<1xindex>
+// CHECK:           affine.store [[VAR_1_]], [[RES_1_]][0] : memref<1xindex>
+// CHECK-DAG:       [[VAR_reshape_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_1_]]) : (memref<?x10xf32>, memref<1xindex>) -> memref<?xf32>
+// CHECK-DAG:       [[VAR_2_:%.+]] = affine.apply [[MAP_1_]](){{.}}[[VAR_dim_]]{{.}}
+// CHECK-DAG:       [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<1xindex>
+// CHECK:           affine.store [[VAR_2_]], [[RES_2_]][0] : memref<1xindex>
+// CHECK:           [[VAR_reshape_5_:%.+]] = memref.reshape [[VAR_view_]]([[RES_2_]]) : (memref<?x10xf32>, memref<1xindex>) -> memref<?xf32>
+// CHECK:           krnl.iterate() with (){
+// CHECK:             [[LOOP_0_:%.+]] = krnl.define_loops 1
+// CHECK:             [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop)
+// CHECK:             krnl.iterate([[BLOCK_TILE__0_]]) with ([[LOOP_0_]] -> [[I_0_:%.+]] = 0 to [[MAP_2_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_3, [[VAR_2_]]{{.}}){
+// CHECK:               [[IV_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__0_]]) : (!krnl.loop) -> index
+// CHECK:               [[VLOAD_:%.+]] = vector.load [[VAR_reshape_]]{{\[}}[[IV_]]] : memref<?xf32>, vector<32xf32>
+// CHECK:               [[MUL_1_:%.+]] = arith.mulf [[VLOAD_]], [[VAR_cst_0_]] : vector<32xf32>
+// CHECK:               [[ADD_:%.+]] = arith.addf [[MUL_1_]], [[VAR_cst_]] : vector<32xf32>
+// CHECK:               [[MIN_:%.+]] = arith.minnumf [[ADD_]], [[VAR_cst_1_]] : vector<32xf32>
+// CHECK:               [[MAX_:%.+]] = arith.maxnumf [[MIN_]], [[VAR_cst_2_]] : vector<32xf32>
+// CHECK:               [[MUL_2_:%.+]] = arith.mulf [[VLOAD_]], [[MAX_]] : vector<32xf32>
+// CHECK:               vector.store [[MUL_2_]], [[VAR_reshape_5_]]{{\[}}[[IV_]]] : memref<?xf32>, vector<32xf32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           return [[VAR_view_]] : memref<?x10xf32>
+// CHECK:         }
+}
+
+// -----
+
 func.func private @test_reciprocal(%arg0 : tensor<?x10xf32>) -> tensor<*xf32> {
   %0 = "onnx.Reciprocal"(%arg0) : (tensor<?x10xf32>) -> tensor<*xf32>
   "func.return"(%0) : (tensor<*xf32>) -> ()
diff --git a/test/mlir/onnx/onnx_decompose_hardswish.mlir b/test/mlir/onnx/onnx_decompose_hardswish.mlir
@@ -0,0 +1,20 @@
+// RUN: onnx-mlir-opt --decompose-onnx --decompose-op-in-onnx HardSwish %s | FileCheck %s
+func.func @test_hardswish(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+  %0 = "onnx.HardSwish"(%arg0) {onnx_node_name = "/hardswish/HardSwish"} :
+       (tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+  onnx.Return %0 : tensor<?x?x?xf32>
+  // NOTE(review): the negative pattern below only scans the signature line; the strict next-line chain is what pins the body.
+  // CHECK-LABEL:       func @test_hardswish
+  // CHECK-NOT: "onnx.HardSwish"
+  // CHECK-SAME:     (%[[ARG0:.*]]: {{.*}})
+  // CHECK-NEXT: %[[C1:.*]] = onnx.Constant dense<0.166666672> : tensor<1xf32>
+  // CHECK-NEXT: %[[C2:.*]] = onnx.Constant dense<5.000000e-01> : tensor<1xf32>
+  // CHECK-NEXT: %[[C3:.*]] = onnx.Constant dense<1.000000e+00> : tensor<1xf32>
+  // CHECK-NEXT: %[[C4:.*]] = onnx.Constant dense<0.000000e+00> : tensor<1xf32>
+  // CHECK-NEXT: %[[MUL1:.*]] = "onnx.Mul"(%[[ARG0]], %[[C1]]) : (tensor<?x?x?xf32>, tensor<1xf32>) -> tensor<?x?x?xf32>
+  // CHECK-NEXT: %[[ADD:.*]] = "onnx.Add"(%[[MUL1]], %[[C2]]) : (tensor<?x?x?xf32>, tensor<1xf32>) -> tensor<?x?x?xf32>
+  // CHECK-NEXT: %[[MIN:.*]] = "onnx.Min"(%[[ADD]], %[[C3]]) : (tensor<?x?x?xf32>, tensor<1xf32>) -> tensor<?x?x?xf32>
+  // CHECK-NEXT: %[[MAX:.*]] = "onnx.Max"(%[[MIN]], %[[C4]]) : (tensor<?x?x?xf32>, tensor<1xf32>) -> tensor<?x?x?xf32>
+  // CHECK-NEXT: %[[MUL2:.*]] = "onnx.Mul"(%[[ARG0]], %[[MAX]]) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+  // CHECK-NEXT: onnx.Return %[[MUL2]] : tensor<?x?x?xf32>
+}