[AMD] Add an option to force async copy overlapping

joviliast · web-flow · commit 77c00fa45a67 · 2025-05-21T15:19:07.000+02:00
Set the environment variable `TRITON_HIP_ASYNC_COPY_OVERLAP=1` to enable async copy overlap. The option only takes effect when async copy itself is enabled.
diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
@@ -35,6 +35,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_LOCAL_PREFETCH",
     "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE",
+    "TRITON_HIP_ASYNC_COPY_OVERLAP",
     "TRITON_HIP_ENABLE_F16_ASYNC_PINGPONG",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
@@ -122,10 +122,10 @@ class StreamPipeliner {
 public:
   StreamPipeliner(scf::ForOp _forOp, int _numStages, int _globalPrefetch,
                   int _localPrefetch, bool _useAsyncCopy,
-                  bool _useF16BlockPingpong)
+                  bool _useF16BlockPingpong, bool _useAsyncCopyOverlap)
       : forOp(_forOp), numStages(_numStages), numBuffers(1),
         useAsyncCopy(_useAsyncCopy), useF16BlockPingpong(_useF16BlockPingpong),
-        schedule(numStages),
+        useAsyncCopyOverlap(_useAsyncCopyOverlap), schedule(numStages),
         axisInfoAnalysis(forOp->getParentOfType<ModuleOp>()) {
     int lastStage = numStages - 1;
     stages[SCHED_GLOBAL_LOAD] = 0;
@@ -181,6 +181,9 @@ class StreamPipeliner {
   // Whether or not we are intend to ping-pong.
   bool useF16BlockPingpong;
 
+  // Move AsyncCopy before AsyncWait.
+  bool useAsyncCopyOverlap;
+
   // Stage for each SchedType Op
   int stages[SCHED_SIZE];
   // Cluster for each SchedType Op
@@ -297,6 +300,14 @@ LogicalResult StreamPipeliner::initSchedule(int maxIndirectionLevel) {
     computeCluster = localLoadCluster;
   }
 
+  if (useAsyncCopyOverlap) {
+    globalLoadCluster = 0;
+    localStoreCluster = 1;
+    asyncWaitCluster = 2;
+    localLoadCluster = 3;
+    computeCluster = 3;
+  }
+
   // Make assignments
   std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> clusterVec;
   std::generate(clusterVec.begin(), clusterVec.end(),
@@ -1072,6 +1083,9 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase<PipelinePass> {
     // between MXFP4 and FP16.
     bool useF16BlockPingpong =
         triton::tools::getBoolEnv("TRITON_HIP_ENABLE_F16_ASYNC_PINGPONG");
+    bool useAsyncCopyOverlap =
+        triton::tools::getBoolEnv("TRITON_HIP_ASYNC_COPY_OVERLAP") &
+        useAsyncCopy;
     SmallVector<scf::ForOp> loops;
     getOperation()->walk([&](scf::ForOp forOp) {
       labelLoadOpsForTritonDot(forOp);
@@ -1092,7 +1106,7 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase<PipelinePass> {
       } else {
         StreamPipeliner sp(forOp, tt::getNumStagesOrDefault(forOp, numStages),
                            globalPrefetch, localPrefetch, useAsyncCopy,
-                           useF16BlockPingpong);
+                           useF16BlockPingpong, useAsyncCopyOverlap);
         (void)sp.pipelineLoop();
       }
     }