Skip to content

Commit 67b292c

Browse files
committed
[AMD] Addressed comments of the PR
1 parent 1b2a86b commit 67b292c

File tree

4 files changed

+121
-53
lines changed

4 files changed

+121
-53
lines changed

third_party/amd/lib/TritonAMDGPUDialectToLLVM/ExtractSliceOpToLLVM.cpp

Lines changed: 47 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include "../TritonAMDGPUToLLVM/Utility.h"
12
#include "Dialect/TritonAMDGPU/IR/Dialect.h"
23
#include "TritonAMDGPUToLLVM/GCNAsmFormat.h"
34
#include "mlir/Conversion/LLVMCommon/Pattern.h"
@@ -49,6 +50,7 @@ using namespace mlir::triton;
4950
// clang-format on
5051

5152
namespace {
53+
5254
struct ExtractSliceOpConversion
5355
: public ConvertOpToLLVMPattern<amdgpu::ExtractSliceOp> {
5456
explicit ExtractSliceOpConversion(LLVMTypeConverter &typeConverter,
@@ -60,61 +62,61 @@ struct ExtractSliceOpConversion
6062
ConversionPatternRewriter &rewriter) const {
6163
Location loc = op->getLoc();
6264
auto srcTy = cast<RankedTensorType>(op.getSource().getType());
63-
auto srcLayout = srcTy.getEncoding();
65+
auto dstTy = cast<RankedTensorType>(op.getType());
6466
auto srcShape = srcTy.getShape();
65-
auto resultTy = cast<RankedTensorType>(op.getType());
66-
auto vals = unpackLLElements(loc, adaptor.getSource(), rewriter);
67-
auto elemsPerThread = triton::gpu::getElemsPerThread(srcTy);
68-
auto contigPerThread = triton::gpu::getContigPerThread(srcTy);
69-
auto totalContigPerThread = product<unsigned>(contigPerThread);
70-
auto order = triton::gpu::getOrder(srcTy);
67+
auto dstShape = dstTy.getShape();
7168

72-
// Calculate valid total number of workers in each dimension
69+
auto vals = unpackLLElements(loc, adaptor.getSource(), rewriter);
7370
auto shapePerCTATile = triton::gpu::getShapePerCTATile(srcTy);
74-
shapePerCTATile[0] =
75-
std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
76-
shapePerCTATile[1] =
77-
std::min(static_cast<unsigned>(srcShape[1]), shapePerCTATile[1]);
78-
79-
// Rank == 2 checked in the verifier
80-
SmallVector<int64_t, 2> sizes;
81-
for (auto i = 0; i < 2; ++i) {
82-
sizes.push_back(resultTy.getDimSize(i));
83-
}
71+
auto srcCTAShape = LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
72+
srcShape, shapePerCTATile, std::divides<unsigned>());
73+
auto dstCTAShape = LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
74+
dstShape, shapePerCTATile, std::divides<unsigned>());
8475

76+
auto numCTATiles = std::accumulate(dstCTAShape.begin(), dstCTAShape.end(),
77+
1, std::multiplies<>());
8578
auto offsets = op.getStaticOffsets();
79+
auto firstTileCoordinate =
80+
LLVM::AMD::multiDimElementwise<int64_t, unsigned>(
81+
offsets, shapePerCTATile, std::divides<unsigned>());
8682

87-
// Calculate offsets and sizes in terms of CTA units.
88-
std::array<int64_t, 2> CTAOffsets{offsets[0] / shapePerCTATile[0],
89-
offsets[1] / shapePerCTATile[1]};
90-
std::array<int64_t, 2> CTASizes{sizes[0] / shapePerCTATile[0],
91-
sizes[1] / shapePerCTATile[1]};
92-
std::array<int64_t, 2> CTAPerShape{srcShape[0] / shapePerCTATile[0],
93-
srcShape[1] / shapePerCTATile[1]};
94-
95-
// The diagram above illustrates the graphical representation of the
96-
// skipElems, tensorStride, and lastIdx variables.
97-
auto skipElems = CTAOffsets[order[1]] * (elemsPerThread[order[0]] *
98-
contigPerThread[order[1]]) +
99-
CTAOffsets[order[0]] * totalContigPerThread;
100-
auto tensorStride =
101-
(CTAPerShape[order[0]] - CTASizes[order[0]]) * totalContigPerThread;
102-
auto lastIdx =
103-
(CTAOffsets[order[1]] + CTASizes[order[1]] - 1) *
104-
elemsPerThread[order[0]] * contigPerThread[order[1]] +
105-
(CTAOffsets[order[0]] + CTASizes[order[0]]) * totalContigPerThread;
106-
107-
assert(lastIdx <= vals.size());
83+
Attribute srcEncoding = srcTy.getEncoding();
84+
Attribute dstEncoding = dstTy.getEncoding();
85+
auto linearLayoutSrc = triton::gpu::toLinearLayout(srcShape, srcEncoding);
86+
auto linearLayoutDst = triton::gpu::toLinearLayout(dstShape, dstEncoding);
10887

88+
auto srcCTAOrder =
89+
LLVM::AMD::getCTATileOrder(srcTy.getContext(), linearLayoutSrc);
90+
auto dstCTAOrder =
91+
LLVM::AMD::getCTATileOrder(srcTy.getContext(), linearLayoutDst);
92+
93+
unsigned elemsPerThreadPerCTA =
94+
triton::gpu::getTotalElemsPerThread(srcTy) /
95+
std::accumulate(srcCTAShape.begin(), srcCTAShape.end(), 1,
96+
std::multiplies<>());
97+
98+
// 1. Process CTA tiles in the destination tensor according to the
99+
// destination's linear layout order of CTA tiles.
100+
// 2. For each tile position in the destination tensor, compute its
101+
// corresponding position in the source tensor.
102+
// 3. Copy the values from the source tile to the destination slice.
109103
SmallVector<Value> resultVals;
110-
for (int i = skipElems; i < lastIdx; i += tensorStride) {
111-
for (int j = 0; j < totalContigPerThread * CTASizes[order[0]]; ++j, ++i) {
112-
assert(i < lastIdx);
113-
resultVals.push_back(vals[i]);
104+
for (size_t i = 0; i < numCTATiles; i++) {
105+
auto coordInDstTensor =
106+
mlir::LLVM::delinearize(i, dstCTAShape, dstCTAOrder);
107+
auto coordInSrcTensor =
108+
LLVM::AMD::multiDimElementwise<unsigned, unsigned>(
109+
coordInDstTensor, firstTileCoordinate, std::plus<unsigned>());
110+
auto linearIdxInSrcTensor =
111+
mlir::LLVM::linearize(coordInSrcTensor, srcCTAShape, srcCTAOrder);
112+
113+
for (size_t j = 0; j < elemsPerThreadPerCTA; j++) {
114+
resultVals.push_back(
115+
vals[linearIdxInSrcTensor * elemsPerThreadPerCTA + j]);
114116
}
115117
}
116118
Value ret = packLLElements(loc, this->getTypeConverter(), resultVals,
117-
rewriter, resultTy);
119+
rewriter, dstTy);
118120

119121
rewriter.replaceOp(op, ret);
120122
return success();
@@ -124,11 +126,7 @@ struct ExtractSliceOpConversion
124126
matchAndRewrite(amdgpu::ExtractSliceOp op, OpAdaptor adaptor,
125127
ConversionPatternRewriter &rewriter) const override {
126128
auto srcTy = op.getSource().getType();
127-
if (isa<BlockedEncodingAttr, AMDMfmaEncodingAttr>(
128-
op.getSource().getType().getEncoding())) {
129-
return processLayout(op, adaptor, rewriter);
130-
}
131-
return failure();
129+
return processLayout(op, adaptor, rewriter);
132130
}
133131
};
134132
} // namespace

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,4 +755,43 @@ void addLocalLoadNoAliasScope(AliasAnalysisOpInterface llLoadOp) {
755755
llLoadOp.setAliasScopes(aliasScopes);
756756
}
757757

758+
SmallVector<unsigned> getCTATileOrder(MLIRContext *ctx,
759+
const LinearLayout &layout) {
760+
auto llEnc = triton::gpu::LinearEncodingAttr::get(ctx, layout);
761+
auto regDim = StringAttr::get(ctx, "register");
762+
auto &bases = layout.getBases().find(regDim)->second;
763+
764+
// Compute number of CTA tiles in a layout.
765+
unsigned totalElems = layout.getTotalOutDimSize();
766+
auto ctaShape = llEnc.getShapePerCTATile();
767+
unsigned elemsPerCTA =
768+
std::accumulate(ctaShape.begin(), ctaShape.end(), 1, std::multiplies<>());
769+
assert((totalElems % elemsPerCTA) == 0 &&
770+
"Total elements must be divisible by elemsPerCTA");
771+
unsigned numCTAs = totalElems / elemsPerCTA;
772+
773+
// To determine the CTA tile order, start by identifying the register basis
774+
// vector that corresponds to the first element of the second CTA tile. The
775+
// nonzero index in the logical tensor it maps to indicates the most minor
776+
// dimension. Then, for each subsequent basis register (first element of
777+
// some CTA tile), extract the next nonzero index to build the full dimension
778+
// order.
779+
unsigned totalPerThread =
780+
product(llEnc.basesPerDim(regDim, /*skipBroadcast=*/false)) / numCTAs;
781+
unsigned startIndex = static_cast<unsigned>(std::log2(totalPerThread));
782+
783+
llvm::SmallSetVector<unsigned, 8> order;
784+
for (unsigned i = startIndex; i < bases.size(); ++i) {
785+
auto it = std::find_if(bases[i].begin(), bases[i].end(),
786+
[](unsigned v) { return v != 0; });
787+
if (it != bases[i].end())
788+
order.insert(std::distance(bases[i].begin(), it));
789+
}
790+
791+
// Append any dims missing from our default order.
792+
for (unsigned dim : llEnc.getOrder())
793+
order.insert(dim);
794+
795+
return SmallVector<unsigned>(order.begin(), order.end());
796+
}
758797
} // namespace mlir::LLVM::AMD

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,23 @@ void addLocalLoadNoAliasScope(AliasAnalysisOpInterface llLoadOp);
137137
// Attaches the "AsyncCopies" alias scope to llLoadDirectToLdsOp
138138
void addAsyncCopyAliasScope(AliasAnalysisOpInterface llLoadDirectToLdsOp);
139139

140+
// Determine the order in which CTA tiles are laid out across the tensor.
141+
SmallVector<unsigned> getCTATileOrder(MLIRContext *ctx,
142+
const LinearLayout &layout);
143+
144+
template <typename T, typename U, typename BinaryOp>
145+
std::vector<unsigned> multiDimElementwise(const ArrayRef<T> &lhs,
146+
const ArrayRef<U> &rhs, BinaryOp op) {
147+
assert(lhs.size() == rhs.size() && "Input dimensions must match");
148+
std::vector<unsigned> result;
149+
result.reserve(lhs.size());
150+
for (size_t i = 0, n = lhs.size(); i < n; ++i) {
151+
unsigned a = static_cast<unsigned>(lhs[i]);
152+
unsigned b = static_cast<unsigned>(rhs[i]);
153+
result.push_back(op(a, b));
154+
}
155+
return result;
156+
}
140157
} // namespace mlir::LLVM::AMD
141158

142159
#endif // TRITON_THIRD_PARTY_AMD_LIB_TRITONAMDGPUTOLLVM_UTILITY_H_

third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,6 +1518,12 @@ LogicalResult Pingponger::transformFP4(OpBuilder &builder, Location loc) {
15181518
builder.setInsertionPointAfter(dotSOps[0]);
15191519
if (sliceDotScaled(builder, loc, dotSOps[0], 4).failed())
15201520
return failure();
1521+
1522+
if (genAsyncCopySlices(builder).failed()) {
1523+
LDBG("failed to slice global-to-local async copies");
1524+
return failure();
1525+
}
1526+
15211527
updateOpInsertion(dotSliceOps[0]);
15221528

15231529
appendOp(builder.create<ROCDL::SchedBarrier>(loc, 0));
@@ -1681,10 +1687,6 @@ void Pingponger::getDotPingponged() {
16811687
return;
16821688
}
16831689

1684-
if (llvm::failed(genAsyncCopySlices(builder))) {
1685-
LDBG("failed to slice global-to-local async copies");
1686-
}
1687-
16881690
auto updateSignature = updateForOpSignature(builder);
16891691
if (llvm::failed(updateSignature)) {
16901692
LDBG("failed to update forOp signature");
@@ -1695,6 +1697,18 @@ void Pingponger::getDotPingponged() {
16951697
LDBG("failed to update forOp signature");
16961698
}
16971699
}
1700+
1701+
forOp->walk([](ttg::AsyncCommitGroupOp groupOp) {
1702+
auto users = groupOp.getResult().getUsers();
1703+
if (users.empty()) {
1704+
SmallVector<Operation *> toDeleteVec;
1705+
for (auto token : groupOp.getInputTokens()) {
1706+
toDeleteVec.push_back(token.getDefiningOp());
1707+
}
1708+
groupOp->erase();
1709+
llvm::for_each(toDeleteVec, [](Operation *op) { op->erase(); });
1710+
}
1711+
});
16981712
addAsymmetricSyncToLoop(builder, loc);
16991713
return;
17001714
}

0 commit comments

Comments (0)