Fix gemm-tensor-of-ptr performance regression (#4209)

whitneywhtsang · web-flow · commit e4f83c9998a3 · 2025-05-14T21:30:16.000-04:00
Verified that all of the inductor tests below pass:

```
python test/inductor/test_select_algorithm.py TestSelectAlgorithm.test_addmm_fp16
python test/inductor/test_select_algorithm.py TestSelectAlgorithm.test_convolution1
python test/inductor/test_max_autotune.py TestPrologueFusion.test_multiple_inputs_sizes2
python test/inductor/test_max_autotune.py TestPrologueFusion.test_upcast_sizes2
```

Benchmark CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15033893513/job/42251970440

Fixes #4206.

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/Triton/Intel/RemoveMasks/loop-canonical-masks.mlir b/test/Triton/Intel/RemoveMasks/loop-canonical-masks.mlir
@@ -110,7 +110,6 @@ module {
   // CHECK: }
 
   tt.func public @test_kernel2(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-    %c7_i32 = arith.constant 7 : i32
     %c8_i32 = arith.constant 8 : i32
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32>
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x256xf16>
@@ -165,7 +164,7 @@ module {
     %33 = arith.addi %31, %32 : tensor<64x256xi32>
     %34 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x256x!tt.ptr<f16>>
     %35 = tt.addptr %34, %33 : tensor<64x256x!tt.ptr<f16>>, tensor<64x256xi32>
-    %36:3 = scf.for %arg3 = %c0_i32 to %c7_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %35) -> (tensor<128x256xf32>, tensor<128x64x!tt.ptr<f16>>, tensor<64x256x!tt.ptr<f16>>)  : i32 {
+    %36:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %27, %arg6 = %35) -> (tensor<128x256xf32>, tensor<128x64x!tt.ptr<f16>>, tensor<64x256x!tt.ptr<f16>>)  : i32 {
       %51 = arith.muli %arg3, %c64_i32 : i32
       %52 = arith.subi %c512_i32, %51 : i32
       %53 = tt.splat %52 : i32 -> tensor<1x64xi32>
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
@@ -119,7 +119,7 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
       int64_t N =
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
       unsigned END = maskInfo.END;
-      bool cond = UB <= ((N + END - 1) / END) - 1;
+      bool cond = UB == ((N - END) / END) + 1;
       return builder.create<arith::ConstantIntOp>(forOp.getLoc(), cond,
                                                   builder.getI1Type());
     }
@@ -156,7 +156,8 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
       int64_t UB = cast<arith::ConstantIntOp>(defOp).value();
       int64_t N =
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
-      return UB == ((N + maskInfo.END - 1) / maskInfo.END) - 1;
+      unsigned END = maskInfo.END;
+      return UB == ((N - END) / END) + 1;
     }
 
     if (!isa<arith::DivSIOp>(defOp))