[AffineParallelOpUnparallelize] Simplify nested SCF IndexSwitch that has a modulo expression argument (#8401)

jiahanxie353 · web-flow · commit f2f202c2daa7 · 2025-04-12T10:57:12.000-04:00
diff --git a/lib/Dialect/Calyx/Transforms/AffinePloopUnparallelize.cpp b/lib/Dialect/Calyx/Transforms/AffinePloopUnparallelize.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Pass/PassManager.h"
@@ -75,6 +76,9 @@ class AffinePloopUnparallelize
 
     int64_t factor = factorAttr.getInt();
 
+    SmallVector<scf::IndexSwitchOp> simplifiableIndexSwitchOps =
+        collectSimplifiableIndexSwitchOps(affineParallelOp, factor);
+
     auto outerLoop = rewriter.create<affine::AffineForOp>(
         loc, lowerBound, rewriter.getDimIdentityMap(), upperBound,
         rewriter.getDimIdentityMap(), step * factor);
@@ -132,8 +136,47 @@ class AffinePloopUnparallelize
     rewriter.setInsertionPointToEnd(destBlock);
     rewriter.create<affine::AffineYieldOp>(loc);
 
+    for (auto indexSwitchOp : simplifiableIndexSwitchOps) {
+      indexSwitchOp.setOperand(innerParallel.getIVs().front());
+    }
+
     return success();
   }
+
+private:
+  // Collect all simplifiable `scf.index_switch` ops in `affineParallelOp`. An
+  // `scf.index_switch` op is simplifiable only if its argument is exactly
+  // `iv mod factor`, where `iv` is `affineParallelOp`'s first loop IV.
+  SmallVector<scf::IndexSwitchOp>
+  collectSimplifiableIndexSwitchOps(affine::AffineParallelOp affineParallelOp,
+                                    int64_t factor) const {
+    SmallVector<scf::IndexSwitchOp> result;
+    affineParallelOp->walk([&](scf::IndexSwitchOp indexSwitchOp) {
+      auto affineApplyOp = dyn_cast_or_null<affine::AffineApplyOp>(
+          indexSwitchOp.getArg().getDefiningOp());
+      if (!affineApplyOp || affineApplyOp->getNumOperands() != 1 ||
+          affineApplyOp->getNumResults() != 1)
+        return WalkResult::advance();
+
+      auto affineMap = affineApplyOp.getAffineMap();
+      auto binExpr = dyn_cast<AffineBinaryOpExpr>(affineMap.getResult(0));
+      if (!binExpr || binExpr.getKind() != AffineExprKind::Mod)
+        return WalkResult::advance();
+
+      // The LHS must be the bare operand: `(iv + 1) mod f` is not `iv mod f`.
+      if (affineApplyOp.getOperand(0) != affineParallelOp.getIVs().front() ||
+          !isa<AffineDimExpr, AffineSymbolExpr>(binExpr.getLHS()))
+        return WalkResult::advance();
+
+      auto constRhs = dyn_cast<AffineConstantExpr>(binExpr.getRHS());
+      if (!constRhs || factor != constRhs.getValue())
+        return WalkResult::advance();
+
+      result.push_back(indexSwitchOp);
+      return WalkResult::advance();
+    });
+    return result;
+  }
 };
 
 namespace {
diff --git a/test/Dialect/Calyx/affine-ploop-unparallelize.mlir b/test/Dialect/Calyx/affine-ploop-unparallelize.mlir
@@ -55,3 +55,121 @@ module {
     return
   }
 }
+
+// -----
+
+// Test simplification of `scf.index_switch` ops inside nested `affine.parallel`s
+
+// CHECK-LABEL:   func.func @main(
+// CHECK-SAME:                    %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>,
+// CHECK-SAME:                    %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>,
+// CHECK-SAME:                    %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>,
+// CHECK-SAME:                    %[[VAL_3:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>,
+// CHECK-SAME:                    %[[VAL_4:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>,
+// CHECK-SAME:                    %[[VAL_5:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: memref<4x6xf32>) {
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           affine.for %[[VAL_7:.*]] = 0 to 8 step 2 {
+// CHECK:             affine.parallel (%[[VAL_8:.*]]) = (0) to (2) {
+// CHECK:               affine.for %[[VAL_9:.*]] = 0 to 18 step 3 {
+// CHECK:                 affine.parallel (%[[VAL_10:.*]]) = (0) to (3) {
+// CHECK:                   scf.index_switch %[[VAL_8]]
+// CHECK:                   case 0 {
+// CHECK:                     scf.index_switch %[[VAL_10]]
+// CHECK:                     case 0 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_0]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     case 1 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_1]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     case 2 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_2]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     default {
+// CHECK:                     }
+// CHECK:                     scf.yield
+// CHECK:                   }
+// CHECK:                   case 1 {
+// CHECK:                     scf.index_switch %[[VAL_10]]
+// CHECK:                     case 0 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_3]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     case 1 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_4]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     case 2 {
+// CHECK:                       affine.store %[[VAL_6]], %[[VAL_5]][(%[[VAL_7]] + %[[VAL_8]]) floordiv 2, (%[[VAL_9]] + %[[VAL_10]]) floordiv 3] : memref<4x6xf32>
+// CHECK:                       scf.yield
+// CHECK:                     }
+// CHECK:                     default {
+// CHECK:                     }
+// CHECK:                     scf.yield
+// CHECK:                   }
+// CHECK:                   default {
+// CHECK:                   }
+// CHECK:                 }
+// CHECK:               } {unparallelized}
+// CHECK:             }
+// CHECK:           } {unparallelized}
+// CHECK:           return
+// CHECK:         }
+
+#map = affine_map<(d0) -> (d0 mod 2)>
+#map1 = affine_map<(d0) -> (d0 mod 3)>
+module {
+  func.func @main(%arg0: memref<4x6xf32>, %arg1: memref<4x6xf32>, %arg2: memref<4x6xf32>, %arg3: memref<4x6xf32>, %arg4: memref<4x6xf32>, %arg5: memref<4x6xf32>) {
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.parallel (%arg6) = (0) to (8) {
+      affine.parallel (%arg7) = (0) to (18) {
+        %0 = affine.apply #map(%arg6)
+        scf.index_switch %0
+        case 0 {
+          %1 = affine.apply #map1(%arg7)
+          scf.index_switch %1
+          case 0 {
+            affine.store %cst, %arg0[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          case 1 {
+            affine.store %cst, %arg1[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          case 2 {
+            affine.store %cst, %arg2[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          default {
+          }
+          scf.yield
+        }
+        case 1 {
+          %1 = affine.apply #map1(%arg7)
+          scf.index_switch %1
+          case 0 {
+            affine.store %cst, %arg3[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          case 1 {
+            affine.store %cst, %arg4[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          case 2 {
+            affine.store %cst, %arg5[%arg6 floordiv 2, %arg7 floordiv 3] : memref<4x6xf32>
+            scf.yield
+          }
+          default {
+          }
+          scf.yield
+        }
+        default {
+        }
+      } {unparallelize.factor=3}
+    } {unparallelize.factor=2}
+    return
+  }
+}
+