Commit 4e1199b

Merge OpenAI Triton commit 86e7117 (#4217)
This PR changes the Triton base from 6116bfe to 86e7117 (May 12). Pass rate: 97.77%
2 parents 1f4e845 + 05fdcfd commit 4e1199b

33 files changed: +641, -241 lines

README.md  (-40)

@@ -24,46 +24,6 @@ pip install triton
 
 Binary wheels are available for CPython 3.9-3.13.
 
-# Enabling Blackwell Support
-
-The main branch now features support for NVIDIA Blackwell GPUs using 5th
-generation tensor cores. To enable this, you will need two additional steps:
-
-1. Build a pre-release PyTorch from source with CUDA 12.8
-2. Build triton from the latest source
-
-
-First, to build pytorch you need to have CUDA 12.8 installed locally. If not,
-follow the [instructions for your platform](https://developer.nvidia.com/cuda-downloads)
-```bash
-# Clone and checkout pytorch 2.6 release candidate
-git clone https://github.com/pytorch/pytorch
-cd pytorch
-git checkout v2.6.0-rc9
-git submodule sync
-git submodule update --init --recursive -j 8
-
-# Install build dependencies (assumes you already have a system compiler)
-pip install -r requirements.txt
-pip install mkl-static mkl-include wheel
-
-# Build PyTorch (will take a long time)
-export CUDA_HOME=/usr/local/cuda-12.8
-export CUDA_PATH=$CUDA_HOME
-export TORCH_CUDA_ARCH_LIST=Blackwell
-python setup.py develop
-
-# Optional, package build into a wheel to install on other machines.
-python setup.py bdist_wheel
-ls dist # Wheel should be output in this directory
-```
-
-Note that if you use the domain libraries (`torchvision`, `torchtext`,
-`torchaudio`, etc.) these will need to be built from source as well, otherwise
-their custom PyTorch extensions will not work.
-
-Finally, follow the instructions below to install triton from source.
-
 # Install from source
 
 ```shell

cmake/llvm-hash.txt  (+1, -1)

@@ -1 +1 @@
-092b6e73e651469527662443b592f98f442ece72
+3c709802d31b5bc5ed3af8284b40593ff39b9eec

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h  (+1, -2)

@@ -287,8 +287,7 @@ LinearLayout chooseScaledMfmaScaleLayout(
 // 8 elements. This layout is useful for emitting the widest 128-bit global
 // store instructions. Since it closely resembles mfmaLayout, conversion between
 // the two can be done using transferWithinWarp, without involving LDS
-LinearLayout chooseMfmaLikeStoreLayout(AMDMfmaEncodingAttr mfmaLayout,
-                                       ArrayRef<int64_t> shape);
+std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
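
Note (not part of the diff): `chooseMfmaLikeStoreLayout` now reports unsupported inputs by returning an empty `std::optional` instead of asserting, so callers are expected to check the result. A minimal hypothetical caller sketch — `pickStoreLayout` and `defaultLayout` are invented names, not code from this commit:

```cpp
#include <optional>

#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

using namespace mlir;
using namespace mlir::triton::gpu;

// Hypothetical helper: prefer the mfma-like store layout when the new overload
// accepts the tensor type (transposed [B]F16 mfma32 on CDNA4), otherwise keep
// whatever layout the caller already computed.
LinearLayout pickStoreLayout(RankedTensorType valType,
                             const LinearLayout &defaultLayout) {
  if (std::optional<LinearLayout> ll = chooseMfmaLikeStoreLayout(valType))
    return *ll;
  return defaultLayout;
}
```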

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td  (-5)

@@ -265,11 +265,6 @@ def TTG_MemDescReshapeOp : TTG_Op<"memdesc_reshape", [Pure,
   }];
 
   let arguments = (ins TTG_MemDescType:$src);
-
-  let arguments = (
-    ins TTG_MemDescType:$src
-  );
-
   let results = (outs TTG_MemDescType:$result);
 
   let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))";

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h  (+2, -1)

@@ -20,6 +20,7 @@ static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
 static const char *kAssignedStageAttrName = "ttg.assigned_stage";
+static const char *kAssignedClusterAttrName = "ttg.assigned_cluster";
 
 //===----------------------------------------------------------------------===//
 // Hoisting Utilities

@@ -106,7 +107,7 @@ Value createScalarAlloc(ImplicitLocOpBuilder &rewriter, Type type,
 Value createBarrierAlloc(scf::ForOp forOp, int numBarriers,
                          int arriveCount = 1);
 // Create an allocation that can hold distance number of tensor shapes.
-Value createAlloc(scf::ForOp forOp, RankedTensorType ty, Location loc,
+Value createAlloc(Operation *insertBefore, RankedTensorType ty, Location loc,
                   gpu::SharedEncodingTrait sharedEnc, unsigned distance);
 
 // Determine if the operation is a TMA load.

lib/Conversion/TritonGPUToLLVM/AllocateWarpGroups.cpp  (+6, -6)

@@ -157,22 +157,22 @@ struct AllocateWarpGroups
     }
 
     // Compute the register deficit over the partition warp groups.
-    int registerDeficit = 0;
+    int registerBudget = maxnreg * baseNumWarps * threadsPerWarp;
     for (const WarpGroupInfo &wg : warpGroups) {
       assert(wg.numWarps % 4 == 0);
-      registerDeficit +=
+      registerBudget +=
           (maxnreg - wg.maxRequestedRegs) * wg.numWarps * threadsPerWarp;
     }
-    if (registerDeficit <= 0)
+    if (registerBudget <= 0)
       return;
 
     // Determine the number of extra registers that we can distribute to the
     // default warp group.
-    int leftover =
-        ((baseNumWarps * threadsPerWarp * maxnreg) + registerDeficit) /
-        baseNumWarps / threadsPerWarp;
+    int leftover = registerBudget / (baseNumWarps * threadsPerWarp);
     // Round down to the nearest multiple of 8.
     leftover = leftover / 8 * 8;
+    if (leftover < 24)
+      return; // too few registers
 
     // Generate setmaxnreg in each partition according to its warp group.
     SmallVector<int32_t> maxnregsPerPartition(1 + arr.size());
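
For intuition (this note and the arithmetic below are not part of the diff): with the new formula, the registers left to the default warp group are the total per-thread budget minus what the partitions over-request, divided back across the default group. Plugging in the values from the `steal_from_default` test added below, and assuming 32 threads per warp:

```cpp
// Worked example of the new register redistribution, using the values from the
// steal_from_default test added in this commit: maxnreg = 128, an 8-warp
// default group, and one 8-warp partition requesting 192 registers per thread.
// threadsPerWarp = 32 is an assumption, not stated in the diff.
#include <cstdio>

int main() {
  const int maxnreg = 128, baseNumWarps = 8, threadsPerWarp = 32;
  int registerBudget = maxnreg * baseNumWarps * threadsPerWarp;     // 32768
  // One partition warp group: 8 warps, each requesting 192 registers.
  registerBudget += (maxnreg - 192) * 8 * threadsPerWarp;           // 32768 - 16384
  int leftover = registerBudget / (baseNumWarps * threadsPerWarp);  // 64
  leftover = leftover / 8 * 8;  // already a multiple of 8, and >= 24
  std::printf("default warp group keeps %d registers per thread\n", leftover);
  return 0;  // matches the expected actualRegisters = array<i32: 64, 192>
}
```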

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp  (+2, -2)

@@ -298,8 +298,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
             b.shl(b.lshr(offset, b.i32_val(rshiftVal)), b.i32_val(lshiftVal)),
             offset);
       }
-      auto vecAddr = b.gep(sharedPtrTy, elemTy, smemBase, offset);
-      vecAddr.setInbounds(true);
+      auto vecAddr = b.gep(sharedPtrTy, elemTy, smemBase, offset,
+                           LLVM::GEPNoWrapFlags::inbounds);
       return vecAddr;
     };
 

lib/Conversion/TritonGPUToLLVM/Utility.cpp  (+2, -2)

@@ -398,8 +398,8 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
     smemOffset = b.sub(smemOffset, baseToAllocBaseDist);
   }
   auto ptrTy = smemBase.getType();
-  auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset);
-  vecAddr.setInbounds(true);
+  auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset,
+                       LLVM::GEPNoWrapFlags::inbounds);
   return vecAddr;
 }
 

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp  (+13, -6)

@@ -1537,10 +1537,17 @@ LinearLayout chooseScaledMfmaScaleLayout(
   return newLL;
 }
 
-LinearLayout chooseMfmaLikeStoreLayout(AMDMfmaEncodingAttr mfmaLayout,
-                                       ArrayRef<int64_t> shape) {
-  assert(shape.size() == 2 && mfmaLayout.getMDim() == 32 &&
-         mfmaLayout.getNDim() == 32 && mfmaLayout.getIsTransposed());
+std::optional<LinearLayout>
+chooseMfmaLikeStoreLayout(RankedTensorType valType) {
+  auto mfmaLayout = cast<AMDMfmaEncodingAttr>(valType.getEncoding());
+
+  // Currently support transposed [B]F16 MFMA32x32 on CDNA4
+  bool isMfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
+  Type elemType = valType.getElementType();
+  if (!(valType.getRank() == 2 && (elemType.isF16() || elemType.isBF16()) &&
+        mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
+        isMfma32))
+    return {};
 
   MLIRContext *ctx = mfmaLayout.getContext();
   StringAttr kRegister = S("register");

@@ -1565,8 +1572,8 @@ LinearLayout chooseMfmaLikeStoreLayout(AMDMfmaEncodingAttr mfmaLayout,
       identityStandardND(kWarp, mfmaLayout.getWarpsPerCTA(), order);
   LinearLayout ctaLayout = mfma8Layout.transposeOuts(standardOutDims) *
                            warpLayout.transposeOuts(standardOutDims);
-  mfma8Layout =
-      combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+  mfma8Layout = combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(),
+                                       valType.getShape());
   return mfma8Layout;
 }
 
lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -380,22 +380,22 @@ Value mlir::triton::createBarrierAlloc(scf::ForOp forOp, int numBarriers,
380380
return barrierAlloc;
381381
}
382382

383-
Value mlir::triton::createAlloc(scf::ForOp forOp, RankedTensorType ty,
383+
Value mlir::triton::createAlloc(Operation *insertBefore, RankedTensorType ty,
384384
Location loc,
385385
gpu::SharedEncodingTrait sharedEnc,
386386
unsigned distance) {
387-
OpBuilder builder(forOp);
387+
OpBuilder builder(insertBefore);
388388
Attribute sharedMemorySpace =
389-
ttg::SharedMemorySpaceAttr::get(forOp.getContext());
389+
ttg::SharedMemorySpaceAttr::get(insertBefore->getContext());
390390
SmallVector<int64_t> bufferShape(ty.getShape().begin(), ty.getShape().end());
391391
bufferShape.insert(bufferShape.begin(), distance);
392392
Type memdescType = ttg::MemDescType::get(bufferShape, ty.getElementType(),
393393
sharedEnc, sharedMemorySpace,
394394
/*mutableMemory=*/true);
395395
Value alloc = builder.create<ttg::LocalAllocOp>(loc, memdescType);
396396

397-
builder.setInsertionPointAfter(forOp);
398-
builder.create<ttg::LocalDeallocOp>(forOp.getLoc(), alloc);
397+
builder.setInsertionPointAfter(insertBefore);
398+
builder.create<ttg::LocalDeallocOp>(insertBefore->getLoc(), alloc);
399399
return alloc;
400400
}
401401
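
A usage note (not from the diff): because MLIR op handles convert implicitly to `Operation *`, existing pipeliner call sites that pass the `scf::ForOp` should keep compiling, while the alloc/dealloc pair can now bracket any operation. A hypothetical sketch — `example`, `someOp`, `ty`, `sharedEnc`, and `numStages` are assumed names:

```cpp
#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"

using namespace mlir;

// Hypothetical call sites for the generalized createAlloc.
void example(scf::ForOp forOp, Operation *someOp, RankedTensorType ty,
             triton::gpu::SharedEncodingTrait sharedEnc, unsigned numStages) {
  // Old behavior preserved: an scf::ForOp still works, since op handles
  // convert implicitly to Operation *.
  Value fromLoop =
      triton::createAlloc(forOp, ty, forOp.getLoc(), sharedEnc, numStages);
  // New flexibility: the local_alloc is created before `someOp` and the
  // matching local_dealloc right after it.
  Value fromOp =
      triton::createAlloc(someOp, ty, someOp->getLoc(), sharedEnc, numStages);
  (void)fromLoop;
  (void)fromOp;
}
```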

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp  (+3, -2)

@@ -276,10 +276,11 @@ static std::optional<int> dotCanBeProperlyAsync(ttng::WarpGroupDotOp dotOp,
   }
 
   // If it's a shmem operand, it must either be defined outside the loop, or
-  // come from an MemDescSubview op. Only ConvertLayout and Trans ops are
+  // come from an MemDescSubview op. Only ConvertLayout and view ops are
   // allowed in between.
   Value transitiveOperand = operand;
-  while (isa_and_nonnull<ttg::ConvertLayoutOp, ttg::MemDescTransOp>(
+  while (isa_and_nonnull<ttg::ConvertLayoutOp, ttg::MemDescTransOp,
+                         ttg::MemDescReshapeOp>(
              transitiveOperand.getDefiningOp()) ||
          isa<BlockArgument>(transitiveOperand)) {
     auto blockArg = dyn_cast<BlockArgument>(transitiveOperand);

lib/Dialect/TritonGPU/Transforms/Utility.cpp  (+10, -8)

@@ -1241,10 +1241,10 @@ ttg::LocalAllocOp findShmemAlloc(Value operand) {
   // come from an MemDescSubview op. Only ConvertLayout and Trans ops are
   // allowed in between.
   Value transitiveOperand = operand;
-  while (
-      isa_and_nonnull<ttg::ConvertLayoutOp, tt::TransOp, ttg::MemDescTransOp>(
-          transitiveOperand.getDefiningOp()) ||
-      isa<BlockArgument>(transitiveOperand)) {
+  while (isa_and_nonnull<ttg::ConvertLayoutOp, tt::TransOp, ttg::MemDescTransOp,
+                         ttg::MemDescReshapeOp>(
+             transitiveOperand.getDefiningOp()) ||
+         isa<BlockArgument>(transitiveOperand)) {
     if (auto blockArg = dyn_cast<BlockArgument>(transitiveOperand)) {
       assert(isa<scf::ForOp>(blockArg.getOwner()->getParentOp()) &&
              "Block argument must come from a for loop");

@@ -1409,7 +1409,7 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
     }
 
     // Non-subview/trans ops will be replaced by `val`.
-    if (!isa<ttg::MemDescTransOp, ttg::MemDescSubviewOp>(use.getOwner())) {
+    if (!use.getOwner()->hasTrait<OpTrait::MemDescViewTrait>()) {
       operandsToReplace.push_back(&use);
       continue;
     }

@@ -1427,13 +1427,15 @@
                                      oldType.getMemorySpace(), isMutable);
       newVal = builder.create<ttg::MemDescSubviewOp>(
           subview.getLoc(), newDstType, val, subview.getOffsets());
-      newVal.getDefiningOp()->setAttrs(user->getAttrs());
     } else if (auto trans = dyn_cast<ttg::MemDescTransOp>(user)) {
       newVal = builder.create<ttg::MemDescTransOp>(trans.getLoc(), val,
                                                    trans.getOrder());
-      newVal.getDefiningOp()->setAttrs(user->getAttrs());
+    } else if (auto reshape = dyn_cast<ttg::MemDescReshapeOp>(user)) {
+      newVal = builder.create<ttg::MemDescReshapeOp>(reshape.getLoc(),
+                                                     reshape.getType(), val);
     }
-    assert(newVal);
+    assert(newVal && "unhandled memdesc view");
+    newVal.getDefiningOp()->setAttrs(user->getAttrs());
     replaceUsesAndPropagateType(builder, user, newVal);
     opsToDelete.push_back(use.getOwner());
   }

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp  (+2, -2)

@@ -130,7 +130,7 @@ static PartitionScheme getPartitionScheme(scf::ForOp loop) {
   }
   while (!operandViews.empty()) {
     Operation *op = operandViews.pop_back_val();
-    if (!op->hasOneUse() || !isa<MemDescSubviewOp, MemDescTransOp>(op))
+    if (!op->hasOneUse() || !op->hasTrait<OpTrait::MemDescViewTrait>())
      continue;
     mma.operandViews.push_back(op);
     if (Operation *defOp = op->getOperand(0).getDefiningOp())

@@ -669,7 +669,7 @@ findSharedMemorySinkOps(Value value, SmallVectorImpl<Operation *> &sinkOps) {
   for (Operation *user : value.getUsers()) {
     if (isa<ttng::MMAv5OpInterface, LocalLoadOp>(user)) {
       sinkOps.push_back(user);
-    } else if (isa<MemDescTransOp, MemDescSubviewOp>(user)) {
+    } else if (user->hasTrait<OpTrait::MemDescViewTrait>()) {
       if (failed(findSharedMemorySinkOps(user->getResult(0), sinkOps)))
         return failure();
     } else {

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp  (+16, -4)

@@ -208,11 +208,23 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
       continue;
     }
 
-    if (isa<RankedTensorType>(capture.getType())) {
-      return mlir::emitWarning(capture.getLoc(),
-                               "FIXME: capturing tensor values into warp "
-                               "partitions is not supported");
+    // Explicitly pass tensor captures through shared memory.
+    auto tensorTy = dyn_cast<RankedTensorType>(capture.getType());
+    if (tensorTy) {
+      SharedEncodingTrait sharedEnc = getSharedEncoding(tensorTy);
+      ImplicitLocOpBuilder b(capture.getLoc(), wsOp);
+      auto memdescTy = MemDescType::get(
+          tensorTy.getShape(), tensorTy.getElementType(), sharedEnc,
+          SharedMemorySpaceAttr::get(tensorTy.getContext()));
+      auto alloc = b.create<LocalAllocOp>(memdescTy, capture);
+      for (Region *region : wsOp.getPartitionRegions()) {
+        b.setInsertionPointToStart(&region->front());
+        Value value = b.create<LocalLoadOp>(tensorTy, alloc);
+        replaceAllUsesInRegionWith(capture, value, *region);
+      }
+      capture = alloc;
     }
+
     explicitCaptures.push_back(capture);
   }
 

test/Conversion/allocate_warp_groups.mlir  (+19)

@@ -92,3 +92,22 @@ tt.func @setmaxnreg() {
 }
 
 }
+
+// -----
+
+// CHECK: module attributes {ttg.maxnreg = 128 : i32
+module attributes {"ttg.num-warps" = 8 : i32} {
+
+tt.func @steal_from_default() {
+  // CHECK: actualRegisters = array<i32: 64, 192>
+  ttg.warp_specialize() attributes {requestedRegisters = array<i32: 192>}
+  default {
+    ttg.warp_yield
+  }
+  partition0() num_warps(8) {
+    ttg.warp_return
+  } : () -> ()
+  tt.return
+}
+
+}
