
Commit 3a62034

attempt with temp buffers inside main loop
Signed-off-by: Alexandre Eichenberger <[email protected]>
Parent: 9dfe268


src/Conversion/ONNXToKrnl/Quantization/QuantizeLinear.cpp

Lines changed: 38 additions & 52 deletions
@@ -57,12 +57,6 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
       {GenericOps::FloorGop, 2},
       {GenericOps::EstimatedVectorRegisterPressure,
           8 /* Little parallelism in code. */}};
-  // Because quantization transforms, for example, a 4 bytes input type of
-  // float into 1 byte output type of char, and since most of the computations
-  // are in float, we need to provide the float type below to let the function
-  // see that most generic operations are supported for floats. But at the
-  // same time, we need a minimum total unrolling of 16 so as to generate a
-  // single vector of uint8.
   totVL = computeSuitableUnrollFactor(inputType /* use unquantized type*/,
       innermostLoopCollapse, mix, canOverCompute, simdLoopStaticTripCount,
       simdOnly);
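
Note on the deleted sizing comment above: the arithmetic still holds and is easy to check. A minimal sketch, assuming 128-bit vector registers (the width of the z16 vector facility); the constant names are illustrative, not from the commit:

    constexpr int vectorBits = 128;
    constexpr int floatLanes = vectorBits / 32; //  4 floats per register
    constexpr int uint8Lanes = vectorBits / 8;  // 16 uint8 values per register
    // One full uint8 vector consumes four float vectors' worth of elements,
    // hence the minimum total unroll of 16 cited in the deleted comment.
    static_assert(uint8Lanes / floatLanes == 4, "16 floats fill one uint8 vector");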
@@ -77,18 +71,19 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
   DimsExpr outputAF;
   outputAF.emplace_back(zero);
 
-#if 0
-  // Insert / extract are slow on z16: 169us for 64K vals.
-  MemRefType outputType = llvm::cast<MemRefType>(alloc.getType());
-  totVL = boostVLForMinUnroll(inputType, outputType, totVL);
-  VectorType quantizedVectorType =
-      VectorType::get({totVL}, quantizedElementType);
-  Value qDummy = create.vec.loadIE(quantizedVectorType, flatAlloc, {zero});
+#if 1
+  // Allocate temporary input and output buffers.
+  MemRefType inputBufferType =
+      MemRefType::get({totVL}, inputType.getElementType());
+  Value inputBuffer = create.mem.alignedAlloc(inputBufferType);
+  MemRefType outputBufferType = MemRefType::get({totVL}, quantizedElementType);
+  VectorType outputVectorType = VectorType::get({totVL}, quantizedElementType);
+  Value outputBuffer = create.mem.alignedAlloc(outputBufferType);
 
   create.krnl.simdIterateIE(simdLb, simdUb, totVL, simdOnly, enableParallel,
       {flatInput}, {inputAF}, {flatAlloc}, {outputAF},
       {[&](const KrnlBuilder &kb, ArrayRef<Value> inputVals, int64_t VL) {
-        MultiDialectBuilder<VectorBuilder, MathBuilder> create(kb);
+        MultiDialectBuilder<KrnlBuilder, MathBuilder, VectorBuilder> create(kb);
         Value x = inputVals[0];
         // Scale
         Value scaleX = create.math.div(x, scale);
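
For reference, the lambda body that begins here computes the standard ONNX QuantizeLinear formula, lane by lane:

    y = saturate(round(x / scale) + zeroPoint)

where saturate clips to [qMin, qMax], the representable range of the quantized element type, and a final cast narrows to that type.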
@@ -102,21 +97,22 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
           adjustX = roundX;
         // Saturate: use max into a min.
         Value saturateX = create.math.clip(adjustX, qMin, qMax);
-        Value res;
-        if (VL == 1) {
-          res = create.math.cast(quantizedElementType, saturateX);
-        } else {
-          res = qDummy; //
-          for (int64_t v = 0; v < VL; ++v) {
-            Value element = create.vec.extractElement(saturateX, v);
-            Value resElement = create.math.cast(quantizedElementType, element);
-            res = create.vec.insertElement(res, resElement, v);
-          }
+        if (VL == 1)
+          return create.math.cast(quantizedElementType, saturateX);
+        // Has VL values; first save all VL of them into the input buffer.
+        create.vec.storeIE(saturateX, inputBuffer, {zero});
+        // Now cast each value in turn, as scalars.
+        for (int64_t v = 0; v < VL; ++v) {
+          IndexExpr vv = LitIE(v);
+          Value scalarSaturateX = create.krnl.loadIE(inputBuffer, {vv});
+          Value scalarRes =
+              create.math.cast(quantizedElementType, scalarSaturateX);
+          create.krnl.storeIE(scalarRes, outputBuffer, {vv});
         }
-        return res;
+        // Reload the output buffer as one vector.
+        return create.vec.loadIE(outputVectorType, outputBuffer, {zero});
       }});
-
-#elif 1
+#else
   // faster than original loop on z16, takes 124us for 64k vals
   // Allocate output buffers.
   MemRefType flatBufferType = llvm::cast<MemRefType>(flatInput.getType());
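
To make the new #if 1 scheme concrete, here is its per-iteration runtime effect written as scalar C++ (a sketch only; the pointer names mirror the inputBuffer/outputBuffer values above, and the vector load/store become plain loops):

    #include <cstdint>

    // One SIMD iteration of VL lanes: spill the saturated float vector to a
    // temp buffer, cast lane by lane as scalars, then reload the result as a
    // single uint8 vector.
    void castOneVector(const float *saturateX, float *inputBuffer,
        uint8_t *outputBuffer, int64_t VL) {
      for (int64_t v = 0; v < VL; ++v) // vec.storeIE: vector store to buffer
        inputBuffer[v] = saturateX[v];
      for (int64_t v = 0; v < VL; ++v) // scalar load, cast, scalar store
        outputBuffer[v] = static_cast<uint8_t>(inputBuffer[v]);
      // vec.loadIE then reloads outputBuffer as one quantized vector to return.
    }

The design choice is to trade the vector insert/extract element operations (measured at 169us for 64K values on z16) for two small, cache-resident buffer round trips.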
@@ -141,40 +137,30 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
           adjustX = roundX;
         // Saturate: use max into a min.
         Value saturateX = create.math.clip(adjustX, qMin, qMax);
+        // Old approach.
+        // return create.math.cast(quantizedElementType, saturateX);
         return saturateX;
       }});
-  create.krnl.forLoopIE(simdLb, simdUb, 1, /*parallel*/ false,
+
+  // A second loop that performs the scalar float-to-int casts works better
+  // than the compiler's attempt to generate SIMD conversion code. This might
+  // not hold for all data types, but it is definitely noticeable with uint8.
+  //
+  // Todo: we might save the vector to a buffer on the fly (avoiding the
+  // second loop below), then reload each value as a scalar and save it back
+  // as a scalar (thus avoiding the insert/extract SIMD operations, which
+  // also do not perform well). The problem is that the current SIMD scheme
+  // expects a return value, either SIMD in SIMD mode or scalar in scalar
+  // mode. Thus that alternative scheme is not easy to pull off here.
+  create.krnl.forLoopIE(simdLb, simdUb, 1, enableParallel,
       [&](KrnlBuilder &kb, ValueRange loopInd) {
         MultiDialectBuilder<KrnlBuilder, MemRefBuilder, MathBuilder> create(kb);
         Value buffVal = create.krnl.loadIE(flatBuffer, {zero}, {loopInd[0]});
         Value res = create.math.cast(quantizedElementType, buffVal);
         create.krnl.storeIE(res, flatAlloc, {zero}, {loopInd[0]});
       });
-#else
-  // original, slow on z16 where it takes 158us
-  MemRefType outputType = llvm::cast<MemRefType>(alloc.getType());
-  totVL = boostVLForMinUnroll(inputType, outputType, totVL);
-  create.krnl.simdIterateIE(simdLb, simdUb, totVL, simdOnly, enableParallel,
-      {flatInput}, {inputAF}, {flatAlloc}, {outputAF},
-      {[&](const KrnlBuilder &kb, ArrayRef<Value> inputVals, int64_t VL) {
-        MultiDialectBuilder<MathBuilder> create(kb);
-        Value x = inputVals[0];
-        // Scale
-        Value scaleX = create.math.div(x, scale);
-        // Round
-        Value roundX = create.math.round(scaleX);
-        // Adjust
-        Value adjustX;
-        if (hasZeroPoint)
-          adjustX = create.math.add(roundX, zeroPoint);
-        else
-          adjustX = roundX;
-        // Saturate: use max into a min.
-        Value saturateX = create.math.clip(adjustX, qMin, qMax);
-        Value res = create.math.cast(quantizedElementType, saturateX);
-        return res;
-      }});
 #endif
+
   if (totVL > 1)
     onnxToKrnlSimdReport(op, /*successful*/ true, totVL,
         simdLoopStaticTripCount, "quantizationLinear whole tensor");
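
For comparison, the two-loop variant retained under #else (124us for 64K values on z16) is roughly the following in scalar C++ terms. All names are illustrative, std::round stands in for whatever rounding create.math.round emits, and the zero-point adjustment is shown unconditionally:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Loop 1 does the SIMD-friendly math into a float staging buffer; loop 2
    // does scalar float->uint8 casts, which measured faster on z16 than the
    // compiler's auto-vectorized conversion code.
    void quantizeTwoLoops(const float *x, float *buffer, uint8_t *out,
        int64_t n, float scale, float zeroPoint, float qMin, float qMax) {
      for (int64_t i = 0; i < n; ++i)
        buffer[i] = std::clamp(std::round(x[i] / scale) + zeroPoint, qMin, qMax);
      for (int64_t i = 0; i < n; ++i)
        out[i] = static_cast<uint8_t>(buffer[i]);
    }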
