added support for no-zero-point quantization (#2938)

AlexandreEichenberger · tungld · web-flow · commit a6ebca0297ca · 2024-09-17T10:04:28.000-04:00
Signed-off-by: Alexandre Eichenberger &lt;alexe@us.ibm.com&gt;
Co-authored-by: Tung D. Le &lt;tung@jp.ibm.com&gt;
diff --git a/src/Compiler/CompilerOptions.cpp b/src/Compiler/CompilerOptions.cpp
@@ -42,6 +42,7 @@ bool enableONNXHybridPass;                             // common for both
 std::vector<std::string> functionsToDecompose;         // common for both
 std::string opsForCall;                                // common for both
 bool disableKrnlOpFusion;                              // common for both
+bool disableQuantZeroPoint;                            // common for both
 bool enableKrnlBufferReuse;                            // common for both
 bool disableMemRefPrefetch;                            // common for both
 EmissionTargetType emissionTarget;                     // onnx-mlir only
@@ -195,7 +196,7 @@ static llvm::cl::list<std::string, std::vector<std::string>>
         llvm::cl::cat(OnnxMlirCommonOptions));
 
 static llvm::cl::opt<bool, true> enableONNXHybridPassOpt("onnx-hybrid-pass",
-    llvm::cl::desc("Enable ONNX hybrid pass (default=true)\n"
+    llvm::cl::desc("Enable ONNX hybrid pass (default=true).\n"
                    "Set to 'false' if you want to disable ONNX hybrid pass."),
     llvm::cl::location(enableONNXHybridPass), llvm::cl::init(true),
     llvm::cl::cat(OnnxMlirCommonOptions));
@@ -208,11 +209,20 @@ static llvm::cl::list<std::string, std::vector<std::string>>
 
 static llvm::cl::opt<bool, true> disableKrnlOpFusionOpt(
     "disable-krnl-op-fusion",
-    llvm::cl::desc("disable op fusion in onnx-to-krnl pass (default=false)\n"
+    llvm::cl::desc("Disable op fusion in onnx-to-krnl pass (default=false).\n"
                    "Set to 'true' if you want to disable fusion."),
     llvm::cl::location(disableKrnlOpFusion), llvm::cl::init(false),
     llvm::cl::cat(OnnxMlirCommonOptions));
 
+static llvm::cl::opt<bool, true> disable_quantization_zero_point(
+    "disable-quantization-zero-point",
+    llvm::cl::desc(
+        "Disable the use of zero-point in quantization (default=false).\n"
+        "Set to 'true' if you want to disable the use of zero-point\n"
+        "in dyn/static quantization/dequantization."),
+    llvm::cl::location(disableQuantZeroPoint), llvm::cl::init(false),
+    llvm::cl::cat(OnnxMlirCommonOptions));
+
 static llvm::cl::opt<bool, true> enableKrnlBufferReuseOpt(
     "enable-krnl-buffer-reuse",
     llvm::cl::desc("enable buffer reuse within an op in onnx-to-krnl pass"
@@ -223,7 +233,7 @@ static llvm::cl::opt<bool, true> enableKrnlBufferReuseOpt(
 
 static llvm::cl::opt<bool, true> disableMemRefPrefetchOpt(
     "disable-memref-prefetch",
-    llvm::cl::desc("disable generation of memref.prefetch (default=false)\n"
+    llvm::cl::desc("Disable generation of memref.prefetch (default=false).\n"
                    "Set to 'true' if you want to disable prefetch."),
     llvm::cl::location(disableMemRefPrefetch), llvm::cl::init(false),
     llvm::cl::cat(OnnxMlirCommonOptions));
@@ -1145,7 +1155,6 @@ std::string getLibraryPath() {
 // as lrodataScript.
 std::string getToolPath(
     const std::string &tool, bool flag /*false by default*/) {
-
   if (!flag) {
     std::string execDir = llvm::sys::path::parent_path(getExecPath()).str();
     llvm::SmallString<8> toolPath(execDir);
diff --git a/src/Compiler/CompilerOptions.hpp b/src/Compiler/CompilerOptions.hpp
@@ -87,6 +87,7 @@ extern bool enableONNXHybridPass;                             // common for both
 extern std::vector<std::string> functionsToDecompose;         // common for both
 extern std::string opsForCall;                                // common for both
 extern bool disableKrnlOpFusion;                              // common for both
+extern bool disableQuantZeroPoint;                            // common for both
 extern bool enableKrnlBufferReuse;                            // common for both
 extern bool disableMemRefPrefetch;                            // common for both
 extern EmissionTargetType emissionTarget;                     // onnx-mlir only
diff --git a/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp b/src/Conversion/ONNXToKrnl/Math/Elementwise.cpp
@@ -1358,9 +1358,15 @@ Value emitScalarOpFor<ONNXDequantizeLinearOp>(
   Value scaleFloat = scalarOperands[1];
   Value zeroPointInt = scalarOperands[2];
 
-  Value zeroPointFloat = create.math.cast(elementType, zeroPointInt);
   Value xFloat = create.math.cast(elementType, XInt);
-  Value sub = create.math.sub(xFloat, zeroPointFloat);
+
+  Value sub;
+  if (!disableQuantZeroPoint && !isNoneValue(zeroPointInt)) {
+    Value zeroPointFloat = create.math.cast(elementType, zeroPointInt);
+    sub = create.math.sub(xFloat, zeroPointFloat);
+  } else {
+    sub = xFloat;
+  }
   Value res = create.math.mul(sub, scaleFloat);
   return res;
 }
diff --git a/src/Conversion/ONNXToKrnl/Quantization/DynamicQuantizeLinear.cpp b/src/Conversion/ONNXToKrnl/Quantization/DynamicQuantizeLinear.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/Compiler/CompilerOptions.hpp"
 #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
 #include "src/Conversion/ONNXToKrnl/Quantization/QuantizeHelper.hpp"
 #include "src/Dialect/Krnl/DialectBuilder.hpp"
@@ -29,7 +30,7 @@ void emitDynamicQuantizationLinearScalarParameters(
     ConversionPatternRewriter &rewriter, Location loc, Operation *op,
     MemRefType inputType, MemRefType quantizedType, Value input, Value qMin,
     Value qMax, Value &scale, Value &zeroPoint, Value &quantizedZeroPoint,
-    bool enableSIMD, bool enableParallel) {
+    bool wantZeroPoint, bool enableSIMD, bool enableParallel) {
   MultiDialectBuilder<KrnlBuilder, MathBuilder> create(rewriter, loc);
 
   // Types
@@ -62,11 +63,15 @@ void emitDynamicQuantizationLinearScalarParameters(
   scale = create.math.div(xDiff, boundDiff);
 
   // Compute y_zero_point.
-  Value interZeroPoint = create.math.sub(qMin, create.math.div(xMin, scale));
-  // Saturate zero point.
-  Value saturateZeroPoint = create.math.clip(interZeroPoint, qMin, qMax);
-  // Round zero point.
-  zeroPoint = create.math.round(saturateZeroPoint);
+  if (wantZeroPoint) {
+    Value interZeroPoint = create.math.sub(qMin, create.math.div(xMin, scale));
+    // Saturate zero point.
+    Value saturateZeroPoint = create.math.clip(interZeroPoint, qMin, qMax);
+    // Round zero point.
+    zeroPoint = create.math.round(saturateZeroPoint);
+  } else {
+    zeroPoint = zero;
+  }
   quantizedZeroPoint = create.math.cast(quantizedElementType, zeroPoint);
 }
 
@@ -122,15 +127,17 @@ struct ONNXDynamicQuantizeLinearOpLowering
     Value qMin = create.math.constant(elementType, 0.0);
     Value scale, zeroPoint, zeroPointInt;
 
+    bool wantZeroPoint = !disableQuantZeroPoint;
     emitDynamicQuantizationLinearScalarParameters(rewriter, loc, op,
         xMemRefType, yMemRefType, X, qMin, qMax, scale, zeroPoint, zeroPointInt,
-        enableSIMD, enableParallel);
+        wantZeroPoint, enableSIMD, enableParallel);
     create.krnl.store(scale, YScale);
     create.krnl.store(zeroPointInt, YZeroPoint);
 
     emitQuantizationLinearScalarParameters(rewriter, loc, op, xMemRefType,
         yMemRefType, Y, shapeHelper.getOutputDims(0), X, qMin, qMax, scale,
-        zeroPoint, enableSIMD, enableParallel);
+        zeroPoint, wantZeroPoint /*wanted one, so we have a zero point*/,
+        enableSIMD, enableParallel);
 
     rewriter.replaceOp(op, {Y, YScale, YZeroPoint});
     onnxToKrnlSimdReport(op);
diff --git a/src/Conversion/ONNXToKrnl/Quantization/QuantizeHelper.hpp b/src/Conversion/ONNXToKrnl/Quantization/QuantizeHelper.hpp
@@ -23,7 +23,8 @@ void emitQuantizationLinearScalarParameters(
     mlir::Operation *op, mlir::MemRefType inputType,
     mlir::MemRefType quantizedType, mlir::Value alloc, DimsExpr &allocDims,
     mlir::Value input, mlir::Value qMin, mlir::Value qMax, mlir::Value scale,
-    mlir::Value zeroPoint, bool enableSIMD, bool enableParallel);
+    mlir::Value zeroPoint, bool hasZeroPoint, bool enableSIMD,
+    bool enableParallel);
 
 // Scan the input to compute scale, zeroPoint, and quantizedZeroPoint given qMin
 // and qMax.
@@ -32,5 +33,6 @@ void emitDynamicQuantizationLinearScalarParameters(
     mlir::Operation *op, mlir::MemRefType inputType,
     mlir::MemRefType quantizedType, mlir::Value input, mlir::Value qMin,
     mlir::Value qMax, mlir::Value &scale, mlir::Value &zeroPoint,
-    mlir::Value &quantizedZeroPoint, bool enableSIMD, bool enableParallel);
+    mlir::Value &quantizedZeroPoint, bool wantZeroPoint, bool enableSIMD,
+    bool enableParallel);
 } // namespace onnx_mlir
diff --git a/src/Conversion/ONNXToKrnl/Quantization/QuantizeLinear.cpp b/src/Conversion/ONNXToKrnl/Quantization/QuantizeLinear.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/Compiler/CompilerOptions.hpp"
 #include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
 #include "src/Dialect/Krnl/DialectBuilder.hpp"
 #include "src/Dialect/ONNX/DialectBuilder.hpp"
@@ -26,7 +27,8 @@ namespace onnx_mlir {
 void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
     Location loc, Operation *op, MemRefType inputType, MemRefType quantizedType,
     Value alloc, DimsExpr &allocDims, Value input, Value qMin, Value qMax,
-    Value scale, Value zeroPoint, bool enableSIMD, bool enableParallel) {
+    Value scale, Value zeroPoint, bool hasZeroPoint, bool enableSIMD,
+    bool enableParallel) {
   MultiDialectBuilder<KrnlBuilder, MemRefBuilder, MathBuilder> create(
       rewriter, loc);
 
@@ -77,7 +79,11 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
         // Round
         Value roundX = create.math.round(scaleX);
         // Adjust
-        Value adjustX = create.math.add(roundX, zeroPoint);
+        Value adjustX;
+        if (hasZeroPoint)
+          adjustX = create.math.add(roundX, zeroPoint);
+        else
+          adjustX = roundX;
         // Saturate
         Value saturateX = create.math.clip(adjustX, qMin, qMax);
         Value res = create.math.cast(quantizedElementType, saturateX);
@@ -160,15 +166,21 @@ struct ONNXQuantizeLinearOpLowering
 
     // Load y_zero_point.
     Value zeroPoint;
+    bool hasZeroPoint = false;
     if (!isNoneValue(YZeroPoint)) {
       zeroPoint = create.krnl.load(adaptor.getYZeroPoint());
       zeroPoint = create.math.cast(elementType, zeroPoint);
-    } else
-      zeroPoint = create.math.constant(elementType, 0.0);
-
+      hasZeroPoint = true;
+    }
+    if (disableQuantZeroPoint) {
+      // TODO: should we expect to disable hasZeroPoint forcefully, or generate
+      // an error if we had a zero point? Right now, just forcefully assert we
+      // have no zero point, i.e. ignore one even if we had a zero point.
+      hasZeroPoint = false;
+    }
     emitQuantizationLinearScalarParameters(rewriter, loc, op, xMemRefType,
         yMemRefType, Y, shapeHelper.getOutputDims(0), X, qMin, qMax, scale,
-        zeroPoint, enableSIMD, enableParallel);
+        zeroPoint, hasZeroPoint, enableSIMD, enableParallel);
 
     rewriter.replaceOp(op, {Y});
     onnxToKrnlSimdReport(op);
diff --git a/test/mlir/conversion/onnx_to_krnl/Quantization/DequantizeLinear_with_canonicalize.mlir b/test/mlir/conversion/onnx_to_krnl/Quantization/DequantizeLinear_with_canonicalize.mlir
@@ -3,6 +3,8 @@
 // Adding canonicalize is important here as this is the only way to check the values of the map,
 // which are otherwise before the function, and thus are hard to test.
 
+// -----
+
 func.func @test_dequantizelinear_i8(%arg0: tensor<4xi8>, %arg1: tensor<f32>, %arg2: tensor<i8>) -> tensor<4xf32> {
   %0 = "onnx.DequantizeLinear"(%arg0, %arg1, %arg2) {axis = 1 : si64} : (tensor<4xi8>, tensor<f32>, tensor<i8>) -> tensor<4xf32>
   return %0 : tensor<4xf32>
@@ -29,10 +31,12 @@ func.func @test_dequantizelinear_i8(%arg0: tensor<4xi8>, %arg1: tensor<f32>, %ar
 
 // -----
 
+
 func.func @test_dequantizelinear_ui8(%arg0: tensor<4xui8>, %arg1: tensor<f32>, %arg2: tensor<ui8>) -> tensor<4xf32> {
   %0 = "onnx.DequantizeLinear"(%arg0, %arg1, %arg2) {axis = 1 : si64} : (tensor<4xui8>, tensor<f32>, tensor<ui8>) -> tensor<4xf32>
   return %0 : tensor<4xf32>
 
+// mlir2FileCheck.py
 // CHECK-LABEL:  func.func @test_dequantizelinear_ui8
 // CHECK-SAME:   ([[PARAM_0_:%.+]]: memref<4xui8>, [[PARAM_1_:%.+]]: memref<f32>, [[PARAM_2_:%.+]]: memref<ui8>) -> memref<4xf32> {
 // CHECK-DAG:       [[RES_:%.+]] = memref.alloc() {{.*}}: memref<4xf32>
@@ -42,11 +46,11 @@ func.func @test_dequantizelinear_ui8(%arg0: tensor<4xui8>, %arg1: tensor<f32>, %
 // CHECK-DAG:         [[LOAD_PARAM_0_MEM_:%.+]] = krnl.load [[PARAM_0_]]{{.}}[[VAR_1_]]{{.}} : memref<4xui8>
 // CHECK-DAG:         [[LOAD_PARAM_1_MEM_:%.+]] = krnl.load [[PARAM_1_]][] : memref<f32>
 // CHECK-DAG:         [[LOAD_PARAM_2_MEM_:%.+]] = krnl.load [[PARAM_2_]][] : memref<ui8>
-// CHECK:             [[VAR_5_:%.+]] = builtin.unrealized_conversion_cast [[LOAD_PARAM_2_MEM_]] : ui8 to i8
+// CHECK:             [[VAR_5_:%.+]] = builtin.unrealized_conversion_cast [[LOAD_PARAM_0_MEM_]] : ui8 to i8
 // CHECK-DAG:         [[VAR_6_:%.+]] = arith.uitofp [[VAR_5_]] : i8 to f32
-// CHECK-DAG:         [[VAR_7_:%.+]] = builtin.unrealized_conversion_cast [[LOAD_PARAM_0_MEM_]] : ui8 to i8
+// CHECK-DAG:         [[VAR_7_:%.+]] = builtin.unrealized_conversion_cast [[LOAD_PARAM_2_MEM_]] : ui8 to i8
 // CHECK:             [[VAR_8_:%.+]] = arith.uitofp [[VAR_7_]] : i8 to f32
-// CHECK:             [[VAR_9_:%.+]] = arith.subf [[VAR_8_]], [[VAR_6_]] : f32
+// CHECK:             [[VAR_9_:%.+]] = arith.subf [[VAR_6_]], [[VAR_8_]] : f32
 // CHECK:             [[VAR_10_:%.+]] = arith.mulf [[VAR_9_]], [[LOAD_PARAM_1_MEM_]] : f32
 // CHECK:             krnl.store [[VAR_10_]], [[RES_]]{{.}}[[VAR_1_]]{{.}} : memref<4xf32>
 // CHECK:           }
diff --git a/test/mlir/conversion/onnx_to_krnl/Quantization/QuantizationWithoutZeroPoint.mlir b/test/mlir/conversion/onnx_to_krnl/Quantization/QuantizationWithoutZeroPoint.mlir