
Commit 7edf97c

[AMD] Introduce tilesPerWarp and scale preshuffling
1 parent a259f0a commit 7edf97c

File tree: 16 files changed, +298 -190 lines

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 6 additions & 4 deletions

@@ -282,10 +282,12 @@ LinearLayout getTmemLoadLayoutSplitLongM(int M, int N, RankedTensorType oldType,
                                          int numWarps);
 
 // Create LinearLayout for scale in scaled mfma.
-LinearLayout chooseScaledMfmaScaleLayout(
-    MLIRContext *ctx, int dotOperandIdx,
-    const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
-    ArrayRef<int64_t> dotOperandShape, unsigned mfmaMDim);
+LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
+                                         ArrayRef<int64_t> dotOperandShape,
+                                         unsigned mfmaMDim,
+                                         ArrayRef<unsigned> tilesPerWarp,
+                                         ArrayRef<unsigned> warpsPerCTA,
+                                         bool preshuffleScales);
 
 // Create a LinearLayout similar to mfmaLayout, but changing each thread to hold
 // 8 elements. This layout is useful for emitting the widest 128-bit global
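
For reference, a minimal call-site sketch for the reworked signature (not part of this commit): the shape and layout values are illustrative, and the mlir::triton::gpu namespace is assumed from the surrounding header.

#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

using namespace mlir;
using namespace mlir::triton;
using namespace mlir::triton::gpu;

// Builds the scale layout for a hypothetical A-operand scale (aScale) of an
// M = 128, K = 1024 scaled dot; mxfp scales cover 32 K-elements each, so the
// scale tensor shape is [128, 32].
LinearLayout buildAScaleLayout(MLIRContext *ctx) {
  SmallVector<int64_t> scaleShape{128, 32};
  SmallVector<unsigned> tilesPerWarp{1, 1}; // one MFMA tile per warp per dim
  SmallVector<unsigned> warpsPerCTA{4, 1};  // 4 warps along M
  return chooseScaledMfmaScaleLayout(ctx, /*dotOperandIdx=*/0, scaleShape,
                                     /*mfmaMDim=*/16, tilesPerWarp, warpsPerCTA,
                                     /*preshuffleScales=*/false);
}

Compared to the old interface, callers now pass tilesPerWarp and warpsPerCTA directly instead of a precomputed dotOperandWarpBasis, and opt into the preshuffled register order with the trailing flag.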

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 0 deletions

@@ -1016,6 +1016,7 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
     "unsigned": $versionMajor,
     "unsigned": $versionMinor,
     ArrayRefParameter<"unsigned">:$warpsPerCTA,
+    ArrayRefParameter<"unsigned">:$tilesPerWarp,
     "unsigned":$MDim,
     "unsigned":$NDim,
     "bool":$isTransposed,

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 2 additions & 0 deletions

@@ -38,6 +38,8 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_ASYNC_COPY_OVERLAP",
     "TRITON_HIP_ENABLE_F16_ASYNC_PINGPONG",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
+    "TRITON_HIP_PRESHUFFLE_SCALES",
+    "TRITON_HIP_BYPASS_LDS_FOR_SCALES",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
     "TRITON_HIP_ASYNC_FAST_SWIZZLE",
     "TRITON_LLVM_DEBUG_ONLY",

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 13 additions & 7 deletions

@@ -1287,6 +1287,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
   unsigned versionMajor = 0;
   unsigned versionMinor = 0;
   SmallVector<unsigned> warpsPerCTA;
+  SmallVector<unsigned> tilesPerWarp;
   SmallVector<unsigned> instrShape;
   bool isTransposed;
   std::optional<SmallVector<unsigned>> CTAsPerCGA;
@@ -1306,6 +1307,11 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
       if (parseIntArrayAttr(parser, attr, warpsPerCTA, "warpsPerCTA").failed())
         return {};
     }
+    if (attr.getName() == "tilesPerWarp") {
+      if (parseIntArrayAttr(parser, attr, tilesPerWarp, "tilesPerWarp")
+              .failed())
+        return {};
+    }
     if (attr.getName() == "instrShape") {
       if (parseIntArrayAttr(parser, attr, instrShape, "instrShape").failed())
         return {};
@@ -1339,27 +1345,27 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
 
   return parser.getChecked<AMDMfmaEncodingAttr>(
       parser.getContext(), versionMajor, versionMinor, warpsPerCTA,
-      instrShape[0], instrShape[1], isTransposed, *CTALayout);
+      tilesPerWarp, instrShape[0], instrShape[1], isTransposed, *CTALayout);
 }
 
 void AMDMfmaEncodingAttr::print(AsmPrinter &printer) const {
   printer << "<{"
           << "versionMajor = " << getVersionMajor()                      //
           << ", versionMinor = " << getVersionMinor()                    //
           << ", warpsPerCTA = [" << getWarpsPerCTA() << "]"              //
+          << ", tilesPerWarp = [" << getTilesPerWarp() << "]"            //
           << ", instrShape = [" << ArrayRef{getMDim(), getNDim()} << "]" //
           << ", isTransposed = " << getIsTransposed();
   maybePrintCTALayout(getContext(), printer, getCTALayout(),
                       /*rank=*/getRank());
   printer << "}>";
 }
 
-LogicalResult
-AMDMfmaEncodingAttr::verify(function_ref<mlir::InFlightDiagnostic()> emitError,
-                            unsigned versionMajor, unsigned versionMinor,
-                            llvm::ArrayRef<unsigned int> warpsPerCTA,
-                            unsigned mDim, unsigned nDim, bool isTransposed,
-                            mlir::triton::gpu::CTALayoutAttr) {
+LogicalResult AMDMfmaEncodingAttr::verify(
+    function_ref<mlir::InFlightDiagnostic()> emitError, unsigned versionMajor,
+    unsigned versionMinor, llvm::ArrayRef<unsigned int> warpsPerCTA,
+    llvm::ArrayRef<unsigned int> tilesPerWarp, unsigned mDim, unsigned nDim,
+    bool isTransposed, mlir::triton::gpu::CTALayoutAttr) {
   if (!(versionMajor >= 0 && versionMajor <= 4)) {
     return emitError() << "major version must be in the [0, 4] range";
   }
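
With the print method extended above, the attribute's textual form now carries a tilesPerWarp field. A round-trip sketch follows; the #ttg.amd_mfma spelling and the example values are assumptions for illustration, not taken from this commit.

#include "mlir/AsmParser/AsmParser.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

using namespace mlir;

// Parses a textual AMD MFMA encoding that includes the new tilesPerWarp field.
Attribute parseExampleMfmaEncoding(MLIRContext *ctx) {
  ctx->getOrLoadDialect<triton::gpu::TritonGPUDialect>();
  const char *text = "#ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, "
                     "warpsPerCTA = [2, 2], tilesPerWarp = [1, 1], "
                     "instrShape = [16, 16], isTransposed = true}>";
  return parseAttribute(text, ctx);
}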

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 80 additions & 77 deletions

@@ -462,6 +462,11 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
          {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, /*gap*/ {4, 0}, {8, 0}}}},
         {outDimNames[order[0]], outDimNames[order[1]]});
   }
+
+  auto tilesPerWarp = getTilesPerWarp();
+  tileLayout *=
+      mlir::triton::identityStandardND(kRegister, tilesPerWarp, order);
+
   if (hasBatchDim) {
     assert(order[2] == 0);
     // Extend the base vector with one value to accommodate for the batch
@@ -637,31 +642,6 @@ LinearLayout chooseDotDsReadB64TrLayout(DotOperandEncodingAttr dotMfmaLayout,
 
 LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
                                    ArrayRef<int64_t> shape) {
-
-  // Current linear layout conversion for dot operand is only necessary to
-  // enable LDS bypass for operand B in the MFMA dot path. To achieve
-  // performance gains from bypassing LDS, the following conditions must be met:
-  //
-  // 1) opIdx == 1: Currently, only the B tensor (e.g. weights in moe-like
-  //    kernels) bypasses LDS. This constraint is not strict and support for
-  //    bypassing operand A (e.g. Q tensor in flash attention) will be added in
-  //    the future.
-  //
-  // 2) B tensor must be column major: This is required to support vectorized
-  //    global load instructions, as MFMA instructions expect threads to hold B
-  //    operand elements along the K dimension.
-  //
-  // 3) kWidth == 8: Ensures maximum global load vectorization for fp16
-  //    operations.
-  //    TODO: Generalize conversion to handle maximum kWidth for other types
-  //    (i.e. fp8).
-  //
-  // 4) warpsPerCTA[mDim] == 1: This guarantees that every B tensor element is
-  //    held by exactly one thread, maintaining the same number of global loads
-  //    as in a blocked layout.
-  //
-  // Other use of Linear layout is a support of rare corner cases,
-  // for example one instruction tile is larger than tensor
   auto mfmaLayout = llvm::cast<AMDMfmaEncodingAttr>(dotMfmaLayout.getParent());
 
   auto rank = shape.size();
@@ -672,6 +652,8 @@ LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
   auto kDim = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
   int32_t kSize = shape[kDim];
   auto warpsPerCTA = mfmaLayout.getWarpsPerCTA();
+  auto tilesPerWarp = mfmaLayout.getTilesPerWarp();
+  auto tilePerWarpNonK = tilesPerWarp[kDim];
 
   MLIRContext *ctx = dotMfmaLayout.getContext();
   SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
@@ -725,6 +707,11 @@ LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
   for (int32_t elem = kTileSize; elem < kSize; elem *= 2)
     registerBase.emplace_back(std::vector<int32_t>{elem, 0});
 
+  // Add repeats of registers along non-K dimension to register base vectors
+  for (int32_t elem = mfmaLayout.getMDim();
+       elem < tilePerWarpNonK * mfmaLayout.getMDim(); elem *= 2)
+    registerBase.emplace_back(std::vector<int32_t>{0, elem});
+
   // Base vectors above are defined in a fixed order [non-k-dim, k-dim].
   // To assign them to actual matrix dimensions `order` array is used.
   // For operand A: non-k-dim -> dim0, k-dim -> dim1
@@ -745,7 +732,9 @@ LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
   LinearLayout ctaLayout = tileLayout.transposeOuts(outDimNames) *
                            warpLayout.transposeOuts(outDimNames);
 
-  return combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+  auto finalLayout =
+      combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+  return finalLayout;
 }
 
 LinearLayout
@@ -1446,10 +1435,12 @@ LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
   return chooseDotDsReadB64TrLayout(dot, shape, elemBitWidth);
 }
 
-LinearLayout chooseScaledMfmaScaleLayout(
-    MLIRContext *ctx, int dotOperandIdx,
-    const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
-    ArrayRef<int64_t> dotOperandShape, unsigned mfmaMDim) {
+LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
+                                         ArrayRef<int64_t> dotOperandShape,
+                                         unsigned mfmaMDim,
+                                         ArrayRef<unsigned> tilesPerWarp,
+                                         ArrayRef<unsigned> warpsPerCTA,
+                                         bool preshuffleScales) {
   using basisT = std::vector<std::vector<int32_t>>;
   unsigned rank = dotOperandShape.size();
   auto order = mlir::triton::gpu::getMatrixOrder(rank, /*rowMajor=*/true);
@@ -1458,31 +1449,16 @@ LinearLayout chooseScaledMfmaScaleLayout(
   StringAttr kLane = StringAttr::get(ctx, "lane");
   StringAttr kWarp = StringAttr::get(ctx, "warp");
   StringAttr kBlock = StringAttr::get(ctx, "block");
-  // Init register layout. Will be adjusted later
-  auto regs = mlir::triton::identityStandardND(kRegister, {1, 1}, order);
-  LinearLayout lanes = LinearLayout::empty();
+  auto kDim = dotOperandIdx == 0 ? rank - 1 : rank - 2;
+  auto tilePerWarpNonK = tilesPerWarp[kDim];
+
   // In scaled dot, the shapes of operands(without batch dimension) are,
   // respectively:
   // - A: [M, K]
   // - B: [K, N]
   // - aScale: [M, K / 32]
   // - bScale: [N, K / 32]
   //
-  // To correctly feed A/B and its scale into instruction, we need to
-  // distribute aScale/bScale among warps in the same way as A/B. But bScale
-  // is not transposed like B. So we need to transpose the warp layout of
-  // bScale.
-  //
-  // The tricky part is, our desired outputs are [dim0, dim1], but
-  // at this position, the layouts are transposed to [dim1, dim0]. So
-  // instead of reverse bScale's layout, we need to reverse aScale's. There
-  // will be a transpose in the end to correct everything.
-  basisT warps = dotOperandWarpBasis;
-  if (dotOperandIdx == 0) {
-    for (auto &basis : warps) {
-      std::reverse(basis.begin(), basis.end());
-    }
-  }
   // In general, for both 32x32 and 16x16 scaled mfma, and no matter what
   // data type the A/B operand is, each lane takes 32 elements from A/B
   // alone K dim, and 1 or 2 elements from scale accordingly. The number of
@@ -1492,43 +1468,70 @@ LinearLayout chooseScaledMfmaScaleLayout(
   // For mxfp4, these 32 elements are consecutive, so only 1 scale element
   // is required. But for mxfp6/mxfp8, there are 2 16-consecutive elements
   // blocks, so 2 scale elements are required.
+  int32_t kSize = dotOperandShape[1];
+
+  std::vector<std::vector<int32_t>> registerBase;
+  std::vector<std::vector<int32_t>> laneBase;
+
+  auto kTileSize = mfmaMDim == 32 ? 2 : 4;
+
+  if (preshuffleScales) {
+    auto sizePerThreadPerTile = 1;
+    auto numKTiles = kSize / kTileSize;
+    for (int32_t elem = 1;
+         elem < sizePerThreadPerTile * numKTiles * tilePerWarpNonK; elem *= 2)
+      registerBase.emplace_back(std::vector<int32_t>{elem, 0});
+  } else {
+    for (int32_t elem = kTileSize; elem < kSize; elem *= 2)
+      registerBase.emplace_back(std::vector<int32_t>{elem, 0});
+
+    for (int32_t elem = mfmaMDim; elem < tilePerWarpNonK * mfmaMDim; elem *= 2)
+      registerBase.emplace_back(std::vector<int32_t>{0, elem});
+  }
   if (mfmaMDim == 32) {
+    if (preshuffleScales) {
+      assert(false && "Preshuffling scales not yet implemented for mDim == 32");
+    }
     // For ROCDL::mfma_scale_f32_32x32x64_f8f6f4 with fp4 input, each lane
     // takes 32 consecutive elements from A alone K dimension. The first
     // 32 lanes collectively handle A[0:32][0:32], and the other 32 lanes
     // collectively handle A[0:32][32:64]. Each lane take 1 scale element
    // accordingly. Similar to B and bScale.
-    lanes = LinearLayout(
-        {{kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {1, 0}}},
-         {kWarp, warps},
-         {kBlock, {}}},
-        {standardOutDims[order[0]], standardOutDims[order[1]]});
+    laneBase = {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {1, 0}};
   } else {
     assert(mfmaMDim == 16);
-    // For ROCDL::mfma_scale_f32_16x16x128_f8f6f4 with fp4 input, each lane
-    // takes 32 consecutive elements from A alone K dimension. The first
-    // 16 lanes collectively handle A[0:16][0:32], and another 16 lanes
-    // collectively handle A[0:16][32:64] and so on. Each lane take 1 scale
-    // element accordingly. Similar to B and bScale.
-    lanes =
-        LinearLayout({{kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}}},
-                      {kWarp, warps},
-                      {kBlock, {}}},
-                     {standardOutDims[order[0]], standardOutDims[order[1]]});
-  }
-  LinearLayout newLL = regs * lanes;
-
-  // Adjust register-level layout to fill the shape, at this level, both
-  // aScale and bScale should align with A operand.
-  SmallVector<int, 2> repOrder = {1, 0};
-  for (auto d : repOrder) {
-    auto outDim = standardOutDims[d];
-    auto dimSize = newLL.getOutDimSize(outDim);
-    newLL *= LinearLayout::identity1D(dotOperandShape[d] / dimSize, kRegister,
-                                      outDim);
-  }
-  newLL = newLL.transposeOuts(standardOutDims);
-  return newLL;
+    if (preshuffleScales) {
+      laneBase = {{4, 0}, {0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}};
+    } else {
+      // For ROCDL::mfma_scale_f32_16x16x128_f8f6f4 with fp4 input, each lane
+      // takes 32 consecutive elements from A alone K dimension. The first
+      // 16 lanes collectively handle A[0:16][0:32], and another 16 lanes
+      // collectively handle A[0:16][32:64] and so on. Each lane take 1 scale
+      // element accordingly. Similar to B and bScale.
+      laneBase = {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}, {2, 0}};
+    }
+  }
+
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+  LinearLayout tileLayout({{kRegister, registerBase}, {kLane, laneBase}},
+                          {outDimNames[order[0]], outDimNames[order[1]]});
+
+  SmallVector<unsigned> warpsPerCTANew{warpsPerCTA[0], warpsPerCTA[1]};
+  SmallVector<unsigned> warpOrder{1, 0};
+
+  if (dotOperandIdx == 1) {
+    std::swap(warpsPerCTANew[0], warpsPerCTANew[1]);
+    std::swap(warpOrder[0], warpOrder[1]);
+  }
+
+  LinearLayout warpLayout =
+      identityStandardND(kWarp, warpsPerCTANew, warpOrder);
+  LinearLayout ctaLayout = tileLayout.transposeOuts(outDimNames) *
+                           warpLayout.transposeOuts(outDimNames);
+
+  auto ctaLay = CTALayoutAttr::get(/*context=*/ctx, /*CTAsPerCGA=*/{1, 1},
+                                   /*CTASplitNum=*/{1, 1}, /*CTAOrder=*/{1, 0});
+  return combineCtaCgaWithShape(ctaLayout, ctaLay, dotOperandShape);
 }
 
 std::optional<LinearLayout>
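
To make the new register-basis construction in chooseScaledMfmaScaleLayout concrete, here is a standalone sketch (plain C++, no MLIR dependencies) that mirrors the non-preshuffled loops above for one illustrative configuration: mfmaMDim = 16 (so kTileSize = 4), kSize = 32 scale elements along K, and tilePerWarpNonK = 2.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int32_t mfmaMDim = 16;       // 16x16 scaled MFMA
  const int32_t kTileSize = 4;       // scale elements along K per MFMA tile (2 for 32x32, 4 for 16x16)
  const int32_t kSize = 32;          // K extent of the scale operand
  const int32_t tilePerWarpNonK = 2; // tilesPerWarp along the non-K dimension

  std::vector<std::vector<int32_t>> registerBase;

  // Repeats along K: one basis vector per power-of-two step past a single k-tile.
  for (int32_t elem = kTileSize; elem < kSize; elem *= 2)
    registerBase.push_back({elem, 0});
  // Repeats along non-K introduced by tilesPerWarp.
  for (int32_t elem = mfmaMDim; elem < tilePerWarpNonK * mfmaMDim; elem *= 2)
    registerBase.push_back({0, elem});

  // Prints {4, 0} {8, 0} {16, 0} {0, 16}: three K repeats plus one non-K repeat.
  for (const auto &b : registerBase)
    std::cout << "{" << b[0] << ", " << b[1] << "} ";
  std::cout << "\n";
}

For the same configuration, the preshuffled branch instead emits the single contiguous run {1, 0} {2, 0} {4, 0} {8, 0} (its loop bound is numKTiles * tilePerWarpNonK = 16 elements per thread), i.e. each thread's scale elements become consecutive along the fastest dimension.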

python/triton/knobs.py

Lines changed: 1 addition & 0 deletions

@@ -442,6 +442,7 @@ class amd_knobs(base_knobs):
     # We use strs so that we can have a default value based on other runtime info
     use_block_pingpong: env_opt_bool = env_opt_bool("TRITON_HIP_USE_BLOCK_PINGPONG")
     use_in_thread_transpose: env_opt_bool = env_opt_bool("TRITON_HIP_USE_IN_THREAD_TRANSPOSE")
+    preshuffle_scales: env_opt_bool = env_opt_bool("TRITON_HIP_PRESHUFFLE_SCALES")
 
     global_prefetch: env_int = env_int("TRITON_HIP_GLOBAL_PREFETCH")
     local_prefetch: env_int = env_int("TRITON_HIP_LOCAL_PREFETCH")

third_party/amd/backend/compiler.py

Lines changed: 2 additions & 1 deletion

@@ -212,6 +212,7 @@ def make_ttir(mod, metadata, options):
 def make_ttgir(mod, metadata, options):
     pm = ir.pass_manager(mod.context)
     pm.enable_debug()
+    preshuffle_scales = knobs.amd.preshuffle_scales
     passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
                                        options.num_ctas)
     pm.run(mod)
@@ -220,7 +221,7 @@ def make_ttgir(mod, metadata, options):
     passes.ttgpuir.add_coalesce(pm)
     passes.ttgpuir.add_remove_layout_conversions(pm)
     passes.ttgpuir.add_optimize_thread_locality(pm)
-    amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+    amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack, preshuffle_scales)
     passes.ttgpuir.add_remove_layout_conversions(pm)
     amd.passes.ttgpuir.add_optimize_epilogue(pm)
     passes.ttgpuir.add_optimize_dot_operands(pm, True)

third_party/amd/include/TritonAMDGPUTransforms/Passes.h

Lines changed: 3 additions & 4 deletions

@@ -13,10 +13,9 @@ createTritonAMDGPUStreamPipelinePass(int numStages = 2, int globalPrefetch = 0,
                                      int localPrefetch = 0,
                                      bool useAsyncCopy = false);
 
-std::unique_ptr<Pass>
-createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(),
-                                       int matrixInstructionSize = 0,
-                                       int kpack = 1);
+std::unique_ptr<Pass> createTritonAMDGPUAccelerateMatmulPass(
+    std::string archGenName = std::string(), int matrixInstructionSize = 0,
+    int kpack = 1, bool preshuffleScales = false);
 
 std::unique_ptr<Pass> createTritonAMDGPUCanonicalizeLoopsPass();

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 5 additions & 1 deletion

@@ -52,7 +52,11 @@ def TritonAMDGPUAccelerateMatmul : Pass<"tritonamdgpu-accelerate-matmul", "mlir:
            "enforce matrix instruction MN size">,
     Option<"kPack", "kPack",
            "int32_t", /*default*/"1",
-           "KWidth / kBase">
+           "KWidth / kBase">,
+    Option<"preshuffleScales", "preshuffle-scales",
+           "bool", /*default*/"false",
+           "preshuffle scaledDot scales">
+
   ];
 }
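
The new option flows from the TRITON_HIP_PRESHUFFLE_SCALES knob through compiler.py's add_accelerate_matmul binding into this pass. A C++ sketch of enabling it programmatically follows; the pass-manager boilerplate and the gfx950 arch string are illustrative, and the factory function is assumed to live in the mlir namespace as declared in Passes.h above.

#include "TritonAMDGPUTransforms/Passes.h"
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Schedules accelerate-matmul with scale preshuffling turned on.
void addAccelerateMatmulWithPreshuffling(PassManager &pm) {
  pm.addPass(createTritonAMDGPUAccelerateMatmulPass(
      /*archGenName=*/"gfx950", /*matrixInstructionSize=*/0, /*kpack=*/1,
      /*preshuffleScales=*/true));
}

On the command line the same switch is exposed as the preshuffle-scales option of tritonamdgpu-accelerate-matmul, defaulting to false.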
