[AMD] Added Refine-Reschedule hint for rescheduling #791

Status: Open. Wants to merge 49 commits into base branch refine-ops-pass.
Commits (49, showing changes from all commits):
c0d4c27
[AMD] moved membar analysis to its dedicated pass
ravil-mobile Feb 3, 2025
e6813be
[AMD] Added a skeleton for the `RefineOpsPass`
ravil-mobile Jan 20, 2025
0be1454
[AMD] Added a prototype of convertDot to RefineOps pass
ravil-mobile Jan 23, 2025
ee2a1d5
[AMD] Added a proto of the ConcatOp to the AMDGPU dialect
ravil-mobile Jan 24, 2025
b5e4c94
[AMD] Added ConcatOpToLLVM rewrite pattern
ravil-mobile Jan 24, 2025
be7cf7f
[AMD] Added tt.LoadOp and tt.LocalStore refinement
ravil-mobile Jan 27, 2025
a9ffb03
[AMD] restructured `refineOps.cpp` and added `reschedule` pass
ravil-mobile Feb 3, 2025
368565e
[AMD] added `refine-ops` option as a sched.hint variant
ravil-mobile Feb 4, 2025
75b0d30
[AMD] Added a proto of the dependency-graph
ravil-mobile Feb 5, 2025
5ed7104
[AMD] refactored dependency-graph builders
ravil-mobile Feb 6, 2025
3113263
[AMD] moved ttg.local_load rewrite in refineOps to its own place
ravil-mobile Feb 7, 2025
b1598e1
[AMD] Implemented machine model for the rescheduling pass
ravil-mobile Feb 10, 2025
5d0c8b3
Adding support for dot-tiling
guacamoleo Feb 11, 2025
b97c62a
Adding DotTileAttr to extract_slices of local_loads
guacamoleo Feb 12, 2025
dc03b4d
Fixing dot-tiling for different shapes and orders.
guacamoleo Feb 12, 2025
c6375e0
Reformat from clang-format.
guacamoleo Feb 13, 2025
84a29d2
Addressing review improvements.
guacamoleo Feb 17, 2025
e2f023c
Applied clang-format to DotTiling.h and RefineOps.cpp
ravil-mobile Feb 19, 2025
d497e95
[AMD] Fixed lit-tests
ravil-mobile Feb 19, 2025
d0e61ad
[AMD] Improved re-scheduling
ravil-mobile Feb 17, 2025
016e44d
[AMD] adjusted calculations coming from calcDotTileShape
ravil-mobile Feb 19, 2025
6a067c0
[AMD] improved canonicalizer for extract and concat ops
ravil-mobile Feb 25, 2025
109ae4c
[AMD] bug-fix in reschedule-ops pass: barriers for local_alloc ops
ravil-mobile Feb 25, 2025
e0979b1
[AMD] Changed machine model op selector in resched. pass
ravil-mobile Feb 26, 2025
9083b7d
DotTiling to handle invalid configurations more robustly.
guacamoleo Feb 28, 2025
64e996b
addressing review comments
guacamoleo Feb 28, 2025
2a06f1f
adding unverified/incorrect support for ttg.extract_slice(ttg.slice)
guacamoleo Mar 3, 2025
07b4ef5
Preparing extract_slice support for checkin
guacamoleo Mar 3, 2025
63eb1cd
Adding tests and removing patch.
guacamoleo Mar 4, 2025
80c1872
Adding empty rewrite function
guacamoleo Mar 5, 2025
186daf6
refined ReduceOp working
guacamoleo Mar 6, 2025
5662f5c
preparing refine reduce for review
guacamoleo Mar 7, 2025
5c7b83d
addressing issues and running clang-format
guacamoleo Mar 10, 2025
f1b18b9
re-running clang-format on files outside this commit
guacamoleo Mar 10, 2025
b9e0079
Merging 1d and 2d into combined function.
guacamoleo Mar 5, 2025
994330d
[AMD] fixed calculations in `fastPathComputeOffsets`
ravil-mobile Mar 12, 2025
9f5b93e
[AMD] Added changes required after rebasing
ravil-mobile Mar 17, 2025
58b3355
[AMD] Added bugfixes: canonicalizer, extract_slice
ravil-mobile Mar 28, 2025
6d97bf6
[AMD] Fixed/Extended ConcatOp lowering to LLVM
ravil-mobile Mar 31, 2025
d89549a
[AMD] Added elementwise broadcast ops refinement
ravil-mobile Apr 1, 2025
7b58b4d
[AMD] Fixed MFMA to Linear Layout conversion for 1D tensors
ravil-mobile Apr 3, 2025
e857d6c
[AMD] Adapted the code due to upstream changes
ravil-mobile Apr 17, 2025
a9b093a
Refine pattern matching rework (#783)
binarman Apr 29, 2025
387125d
[AMD] Removed redundant headers from `RefineOps.cpp`
ravil-mobile Apr 30, 2025
dd0a348
[AMD] Inlined refine-ops functions
ravil-mobile Apr 30, 2025
fb859b6
Merge pull request #786 from ROCm/ravil/refine-ops-inline
ravil-mobile Apr 30, 2025
064ff46
[AMD] Added `local_alloc` refinement
ravil-mobile May 2, 2025
51a62ff
Merge pull request #787 from ROCm/ravil/local-alloc-refine
ravil-mobile May 5, 2025
14573e7
[AMD] Added Refine-Reschedule hint for rescheduling
ravil-mobile May 5, 2025
3 changes: 3 additions & 0 deletions bin/RegisterTritonDialects.h
@@ -63,6 +63,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::registerOptimizeAMDLDSUsage();

// TritonAMDGPUTransforms passes
+ mlir::registerTritonAMDGPUMembarAnalysis();
mlir::registerTritonAMDGPUAccelerateMatmul();
mlir::registerTritonAMDGPUOptimizeEpilogue();
mlir::registerTritonAMDGPUHoistLayoutConversions();
@@ -80,6 +81,8 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {

// NVWS passes
mlir::registerNVWSTransformsPasses();
+ mlir::registerTritonAMDGPURefineOps();
+ mlir::registerTritonAMDGPURescheduleOps();

registry.insert<
mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
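With these registrations, the refine and reschedule passes become available to triton-opt, which is how the lit tests in this PR drive them; for instance (kernel.mlir is a placeholder input file, and the flag spellings are taken from the RUN lines below): triton-opt kernel.mlir --allocate-shared-memory --triton-amdgpu-membar-analysis -triton-amdgpu-refine-ops='arch=gfx942' -canonicalize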
10 changes: 0 additions & 10 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -394,16 +394,6 @@ class SharedMemoryObject {
return offsets[dim];
}

- // TODO(Keren): deprecate the method once AMD backend has cleaned up
- Value getBaseBeforeSlice(int dim, Location loc,
-                          RewriterBase &rewriter) const {
-   auto b = TritonLLVMOpBuilder(loc, rewriter);
-   Value cSwizzleOffset = getCSwizzleOffset(dim);
-   Value offset = b.sub(b.i32_val(0), cSwizzleOffset);
-   Type type = base.getType();
-   return b.gep(type, baseElemType, base, offset);
- }

private:
static SmallVector<unsigned>
getOrderForShape(ArrayRef<int64_t> shape, ArrayRef<unsigned> layoutOrder) {
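This deletion resolves the TODO above: judging by this PR, the AMD backend no longer has call sites for getBaseBeforeSlice, so the method can go.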
17 changes: 16 additions & 1 deletion lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -457,7 +457,22 @@ AMDMfmaEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
identityStandardND(S("warp"), getWarpsPerCTA(), order);
LinearLayout ctaLayout = tileLayout * warpLayout;

-  return combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
+  auto combinedLayout =
+      combineCtaCgaWithShape(ctaLayout, getCTALayout(), shape);
+
+  auto bases = combinedLayout.getBases();
+  std::vector<std::vector<int>> newRegBases;
+  for (const auto &basis : bases[S("register")]) {
+    if (llvm::any_of(basis, [](int b) { return b != 0; })) {
+      newRegBases.push_back(basis);
+    }
+  }
+  bases[S("register")] = newRegBases;
+
+  auto result = LinearLayout(std::move(bases),
+                             llvm::to_vector(combinedLayout.getOutDimNames()));
+
+  return result;
}

LinearLayout chooseDotDsReadB64TrLayout(DotOperandEncodingAttr dotMfmaLayout,
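The added loop filters out register basis vectors that are all zero; under a linear layout such a basis makes two register indices alias the same tensor element (a broadcast), and these showed up for 1D tensors in the MFMA conversion (cf. commit 7b58b4d). Below is a minimal standalone sketch of just the filtering step; the helper name pruneZeroBases is ours, std::any_of stands in for llvm::any_of, and a plain vector-of-vectors stands in for LinearLayout::getBases():

#include <algorithm>
#include <cassert>
#include <vector>

// Keep only basis vectors that have at least one nonzero component;
// an all-zero register basis encodes a broadcast register.
std::vector<std::vector<int>>
pruneZeroBases(const std::vector<std::vector<int>> &bases) {
  std::vector<std::vector<int>> kept;
  for (const auto &basis : bases)
    if (std::any_of(basis.begin(), basis.end(), [](int b) { return b != 0; }))
      kept.push_back(basis);
  return kept;
}

int main() {
  // One broadcast basis ({0, 0}) among two real ones.
  auto kept = pruneZeroBases({{1, 0}, {0, 0}, {0, 2}});
  assert(kept.size() == 2);
  return 0;
}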
4 changes: 2 additions & 2 deletions test/Conversion/amd/async_ops_to_llvm.mlir
@@ -1,5 +1,5 @@
- // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck %s --check-prefix=GFX950
- // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --verify-diagnostics | FileCheck %s
+ // RUN: triton-opt %s -split-input-file --allocate-shared-memory --triton-amdgpu-membar-analysis --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck %s --check-prefix=GFX950
+ // RUN: triton-opt %s -split-input-file --allocate-shared-memory --triton-amdgpu-membar-analysis --convert-triton-amdgpu-to-llvm=arch=gfx942 --verify-diagnostics | FileCheck %s

#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
37 changes: 37 additions & 0 deletions test/TritonGPU/amd/amd-extractslice-op.mlir
@@ -12,3 +12,40 @@ module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32,
tt.return
}
}

#blocked3 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func @extract_slice_slice_1(%arg0: tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> {tt.divisibility = 16 : i32}) {
// CHECK: llvm.func @extract_slice_slice_1
// CHECK-COUNT-8: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
// CHECK: %8 = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32)>
// CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32)>
%1 = amdgpu.extract_slice %arg0 [128] : tensor<256xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> to tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>>
tt.return
}
}

#blocked4 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func @extract_slice_slice_0(%arg0: tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked4}>> {tt.divisibility = 16 : i32}) {
// CHECK: llvm.func @extract_slice_slice_0
// CHECK-COUNT-8: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
// CHECK: %8 = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32)>
// CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32)>
%0 = amdgpu.extract_slice %arg0 [0] : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked4}>> to tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked4}>>
tt.return
}
}

#blocked5 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
tt.func @extract_slice_slice_2() {
%0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked5}>>
%1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>>
// CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32)>
%2 = amdgpu.extract_slice %0 [0] : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked5}>> to tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked5}>>
// CHECK-COUNT-4: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32)>
%3 = amdgpu.extract_slice %1 [0] : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked5}>> to tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked5}>>
tt.return
}
}
127 changes: 127 additions & 0 deletions test/TritonGPU/amd/ops-refinement/elementwise.mlir
@@ -0,0 +1,127 @@
// RUN: triton-opt %s -split-input-file -triton-amdgpu-refine-ops='arch=gfx942' | FileCheck %s

// CHECK-LABEL: @exp_kernel
// CHECK-DAG: [[VALUE_1:%.*]] = amdgpu.extract_slice {{.*}} [0, 0]
// CHECK-DAG: [[VALUE_2:%.*]] = math.exp2 [[VALUE_1]]
// CHECK-DAG: [[VALUE_3:%.*]] = amdgpu.extract_slice {{.*}} [0, 16]
// CHECK-DAG: [[VALUE_4:%.*]] = math.exp2 [[VALUE_3]]
// CHECK-DAG: [[VALUE_5:%.*]] = amdgpu.extract_slice {{.*}} [64, 0]
// CHECK-DAG: [[VALUE_6:%.*]] = math.exp2 [[VALUE_5]]
// CHECK-DAG: [[VALUE_7:%.*]] = amdgpu.extract_slice {{.*}} [64, 16]
// CHECK-DAG: [[VALUE_8:%.*]] = math.exp2 [[VALUE_7]]
// CHECK-DAG: [[VALUE_9:%.*]] = amdgpu.concat [[VALUE_2]], [[VALUE_4]], [[VALUE_6]], [[VALUE_8]]
// CHECK-DAG: tt.return [[VALUE_9]]
#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @exp_kernel(%arg0: tensor<128x32xf32, #blocked>) -> tensor<128x32xf32, #blocked> attributes {noinline = false} {
amdgpu.refine_reschedule_ops_hint
%0 = math.exp2 %arg0 : tensor<128x32xf32, #blocked>
tt.return %0 : tensor<128x32xf32, #blocked>
}
}
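
For reference, the offsets in the CHECK lines follow from the #blocked layout: one CTA tile spans (sizePerThread × threadsPerWarp × warpsPerCTA) = (2·8·4, 2·8·1) = 64×16 elements, so the 128×32 operand is refined into 2×2 = 4 slices at offsets [0,0], [0,16], [64,0], and [64,16], then reassembled with amdgpu.concat.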

// -----

// CHECK-LABEL: mul_kernel
// CHECK-DAG: [[VALUE_1:%.*]] = amdgpu.extract_slice {{.*}} [0, 0]
// CHECK-DAG: [[VALUE_2:%.*]] = amdgpu.extract_slice {{.*}} [0, 0]
// CHECK-DAG: [[VALUE_3:%.*]] = arith.mulf [[VALUE_1]], [[VALUE_2]]
// CHECK-DAG: [[VALUE_4:%.*]] = amdgpu.extract_slice {{.*}} [0, 16]
// CHECK-DAG: [[VALUE_5:%.*]] = amdgpu.extract_slice {{.*}} [0, 16]
// CHECK-DAG: [[VALUE_6:%.*]] = arith.mulf [[VALUE_4]], [[VALUE_5]]
// CHECK-DAG: [[VALUE_7:%.*]] = amdgpu.extract_slice {{.*}} [64, 0]
// CHECK-DAG: [[VALUE_8:%.*]] = amdgpu.extract_slice {{.*}} [64, 0]
// CHECK-DAG: [[VALUE_9:%.*]] = arith.mulf [[VALUE_7]], [[VALUE_8]]
// CHECK-DAG: [[VALUE_10:%.*]] = amdgpu.extract_slice {{.*}} [64, 16]
// CHECK-DAG: [[VALUE_11:%.*]] = amdgpu.extract_slice {{.*}} [64, 16]
// CHECK-DAG: [[VALUE_12:%.*]] = arith.mulf [[VALUE_10]], [[VALUE_11]]
// CHECK-DAG: [[VALUE_13:%.*]] = amdgpu.concat [[VALUE_3]], [[VALUE_6]], [[VALUE_9]], [[VALUE_12]]
// CHECK-DAG: tt.return [[VALUE_13]]
#blocked = #ttg.blocked<{sizePerThread = [2, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @mul_kernel(%arg0: tensor<128x32xf32, #blocked>, %arg1: tensor<128x32xf32, #blocked>) -> tensor<128x32xf32, #blocked> attributes {noinline = false} {
amdgpu.refine_reschedule_ops_hint
%0 = arith.mulf %arg0, %arg1 : tensor<128x32xf32, #blocked>
tt.return %0 : tensor<128x32xf32, #blocked>
}
}

// -----

// CHECK-LABEL: @multiple_operations_kernel

// CHECK-COUNT-4: amdgpu.extract_slice {{.*}}
// CHECK: [[OP1:%.*]] = amdgpu.concat
// CHECK-COUNT-4: amdgpu.extract_slice [[OP1]]
// CHECK: [[OP2:%.*]] = amdgpu.concat
// CHECK-COUNT-4: amdgpu.extract_slice [[OP2]]
// CHECK: [[OP3:%.*]] = amdgpu.concat
// CHECK: tt.return [[OP3]]
#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @multiple_operations_kernel(%arg0: tensor<128x32xf32, #mma>, %arg1: tensor<128x32xf32, #mma>) -> tensor<128x32xf32, #mma> attributes {noinline = false} {
amdgpu.refine_reschedule_ops_hint
%0 = math.exp2 %arg0 : tensor<128x32xf32, #mma>
%1 = math.exp2 %0 : tensor<128x32xf32, #mma>
%2 = math.exp2 %1 : tensor<128x32xf32, #mma>
tt.return %2 : tensor<128x32xf32, #mma>
}
}

// -----

// CHECK-LABEL: @nested_operations_kernel
// CHECK-COUNT-8: amdgpu.extract_slice
// CHECK: mulf
// CHECK: amdgpu.concat
// CHECK: scf.for
// CHECK-COUNT-4: amdgpu.extract_slice
// CHECK: math.exp2
// CHECK: amdgpu.concat
// CHECK: }
#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @nested_operations_kernel(%arg0: tensor<128x32xf32, #blocked>, %arg1: tensor<128x32xf32, #blocked>) -> tensor<128x32xf32, #blocked> attributes {noinline = false} {
amdgpu.refine_reschedule_ops_hint
%0 = arith.mulf %arg0, %arg1 : tensor<128x32xf32, #blocked>
%c0 = arith.constant 0 : i32
%c1 = arith.constant 1 : i32
%c4 = arith.constant 4 : i32
%1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<128x32xf32, #blocked>) : i32 {
%2 = math.exp2 %0 : tensor<128x32xf32, #blocked>
scf.yield %2 : tensor<128x32xf32, #blocked>
}
tt.return %1 : tensor<128x32xf32, #blocked>
}
}

// -----

// CHECK-LABEL: @peer_operations_kernel
// CHECK: scf.for
// CHECK-COUNT-4: amdgpu.extract_slice
// CHECK: math.exp2
// CHECK: amdgpu.concat
// CHECK: scf.for
// CHECK-NOT: amdgpu.extract_slice
// CHECK: math.exp2
// CHECK-NOT: amdgpu.concat
// CHECK: }
#blocked = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @peer_operations_kernel(%arg0: tensor<128x32xf32, #blocked>) -> tensor<128x32xf32, #blocked> attributes {noinline = false} {
%c0 = arith.constant 0 : i32
%c1 = arith.constant 1 : i32
%c4 = arith.constant 4 : i32
%1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %arg0) -> (tensor<128x32xf32, #blocked>) : i32 {
amdgpu.refine_reschedule_ops_hint
%2 = math.exp2 %arg2 : tensor<128x32xf32, #blocked>
scf.yield %2 : tensor<128x32xf32, #blocked>
}
%3 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %1) -> (tensor<128x32xf32, #blocked>) : i32 {
%4 = math.exp2 %arg4 : tensor<128x32xf32, #blocked>
scf.yield %4 : tensor<128x32xf32, #blocked>
}
tt.return %3 : tensor<128x32xf32, #blocked>
}
}
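
Note that amdgpu.refine_reschedule_ops_hint is scoped: only the first loop, whose body carries the hint, gets refined; the CHECK-NOT lines verify that the exp2 in the second loop is left whole.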
35 changes: 35 additions & 0 deletions test/TritonGPU/amd/ops-refinement/local_alloc.mlir
@@ -0,0 +1,35 @@
// RUN: triton-opt %s -split-input-file -triton-amdgpu-refine-ops='arch=gfx942' -canonicalize | FileCheck %s

#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
#smem = #ttg.shared_memory


// CHECK-LABEL: @local_alloc_refinement
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 16384 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @local_alloc_refinement(%arg0: tensor<64x16xf16, #blocked>) attributes {noinline = false} {

// CHECK: [[OFFSET_12:%.*]] = arith.constant 12 : i32
// CHECK: [[OFFSET_8:%.*]] = arith.constant 8 : i32
// CHECK: [[OFFSET_4:%.*]] = arith.constant 4 : i32
// CHECK: [[OFFSET_0:%.*]] = arith.constant 0 : i32
// CHECK: [[ALLOC:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
// CHECK: [[SUBVIEW_0:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_0]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SLICE_0:%.*]] = amdgpu.extract_slice %arg0 [0, 0] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
// CHECK: ttg.local_store [[SLICE_0]], [[SUBVIEW_0]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SUBVIEW_1:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_4]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SLICE_1:%.*]] = amdgpu.extract_slice %arg0 [0, 4] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
// CHECK: ttg.local_store [[SLICE_1]], [[SUBVIEW_1]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SUBVIEW_2:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_8]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SLICE_2:%.*]] = amdgpu.extract_slice %arg0 [0, 8] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
// CHECK: ttg.local_store [[SLICE_2]], [[SUBVIEW_2]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SUBVIEW_3:%.*]] = ttg.memdesc_subview [[ALLOC]][[[OFFSET_0]], [[OFFSET_0]], [[OFFSET_12]]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: [[SLICE_3:%.*]] = amdgpu.extract_slice %arg0 [0, 12] : tensor<64x16xf16, #blocked> to tensor<64x4xf16, #blocked>
// CHECK: ttg.local_store [[SLICE_3]], [[SUBVIEW_3]] : tensor<64x4xf16, #blocked> -> !ttg.memdesc<64x4xf16, #shared, #smem, mutable, 1x64x16>
// CHECK: amdgpu.instruction_sched_hint {isBufferLoadsAEnabled = false, isBufferLoadsBEnabled = false, numDsReadsA = #amdgpu.InstCounter<0, none>, numDsReadsB = #amdgpu.InstCounter<0, none>, numDsWritesA = #amdgpu.InstCounter<0, none>, numDsWritesB = #amdgpu.InstCounter<0, none>, numGlobalLoadsA = #amdgpu.InstCounter<0, none>, numGlobalLoadsB = #amdgpu.InstCounter<0, none>, numMMAs = #amdgpu.InstCounter<0, none>, variant = #amdgpu.SchedHintVariant<refine_ops>}
// CHECK: ttg.local_dealloc [[ALLOC]] : !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
%0 = ttg.local_alloc %arg0 : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
amdgpu.instruction_sched_hint {isBufferLoadsAEnabled = false, isBufferLoadsBEnabled = false, numDsReadsA = #amdgpu.InstCounter<0, none>, numDsReadsB = #amdgpu.InstCounter<0, none>, numDsWritesA = #amdgpu.InstCounter<0, none>, numDsWritesB = #amdgpu.InstCounter<0, none>, numGlobalLoadsA = #amdgpu.InstCounter<0, none>, numGlobalLoadsB = #amdgpu.InstCounter<0, none>, numMMAs = #amdgpu.InstCounter<0, none>, variant = #amdgpu.SchedHintVariant<refine_ops>}
tt.return
}
}
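
Here the split runs along dim 1: the #blocked tile is (1·64·1) × (1·1·4) = 64×4, so the 64×16 local_alloc is refined into four 64×4 stores, each pairing an amdgpu.extract_slice at column offset 0, 4, 8, or 12 with a ttg.memdesc_subview at the matching offset.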
42 changes: 42 additions & 0 deletions test/TritonGPU/amd/ops-refinement/simple-dot.mlir
@@ -0,0 +1,42 @@
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline='num_stages=2' -cse -canonicalize -triton-amdgpu-refine-ops='arch=gfx942' -canonicalize | FileCheck %s

#mma = #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = true}>
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
// CHECK: @matmul_kernel
tt.func public @matmul_kernel(
%arg0: tensor<256x64x!tt.ptr<f16>, #blocked> {tt.contiguity=16 : i32, tt.divisibility=16: i32, tt.constancy=16: i32},
%arg1: tensor<64x128x!tt.ptr<f16>, #blocked> {tt.contiguity=16 : i32, tt.divisibility=16: i32, tt.constancy=16: i32}) -> tensor<256x128xf32, #mma> attributes {noinline = false} {

%output = arith.constant dense<0.000000e+00> : tensor<256x128xf32, #mma>
%c0_i32 = arith.constant 0 : i32
%c1_i32 = arith.constant 1 : i32
%c64_i32 = arith.constant 64 : i32

%shift0 = arith.constant dense<64> : tensor<256x64xi32, #blocked>
%shift1 = arith.constant dense<64> : tensor<64x128xi32, #blocked>

%0:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c1_i32 iter_args(
%loop_arg0 = %output,
%loop_arg1 = %arg0,
%loop_arg2 = %arg1) -> (
tensor<256x128xf32, #mma>,
tensor<256x64x!tt.ptr<f16>, #blocked>,
tensor<64x128x!tt.ptr<f16>, #blocked>) : i32 {
%1 = tt.load %loop_arg1 : tensor<256x64x!tt.ptr<f16>, #blocked>
%2 = tt.load %loop_arg2 : tensor<64x128x!tt.ptr<f16>, #blocked>
%3 = ttg.convert_layout %1 : tensor<256x64xf16, #blocked> -> tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
%4 = ttg.convert_layout %2 : tensor<64x128xf16, #blocked> -> tensor<64x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
%5 = tt.dot %3, %4, %loop_arg0 : tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x128xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<256x128xf32, #mma>
%6 = tt.addptr %loop_arg1, %shift0 : tensor<256x64x!tt.ptr<f16>, #blocked>, tensor<256x64xi32, #blocked>
%7 = tt.addptr %loop_arg2, %shift1 : tensor<64x128x!tt.ptr<f16>, #blocked>, tensor<64x128xi32, #blocked>
scf.yield %5, %6, %7 : tensor<256x128xf32, #mma>, tensor<256x64x!tt.ptr<f16>, #blocked>, tensor<64x128x!tt.ptr<f16>, #blocked>
}

tt.return %0#0 : tensor<256x128xf32, #mma>
}
}


// TODO: add TT GEMM case to the test
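As written, the test only verifies that @matmul_kernel survives the combined stream-pipeline and refine-ops pipeline; the TODO above tracks adding GEMM-specific CHECK lines.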
4 changes: 4 additions & 0 deletions third_party/amd/backend/compiler.py
@@ -299,6 +299,10 @@ def make_llir(src, metadata, options):
passes.convert.add_index_to_llvmir(pm)

passes.ttgpuir.add_allocate_shared_memory(pm)
+ amd.passes.ttgpuir.add_membar_analysis(pm)
+ amd.passes.ttgpuir.add_refine_amdgpu_ops(pm, options.arch)
+ passes.common.add_canonicalizer(pm)
+ amd.passes.ttgpuir.add_reschedule_amdgpu_ops(pm, options.arch)
## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
## of the value of kernel arg `allow_flush_denorm`.
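Pipeline ordering note: membar analysis, moved out into a dedicated pass in this PR (commit c0d4c27), now runs before refinement so barriers are in place before ops are split, and the canonicalizer between add_refine_amdgpu_ops and add_reschedule_amdgpu_ops presumably folds the redundant extract_slice/concat pairs that refinement introduces (cf. commit 6a067c0).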