[AMD] Added bufferOps refinement #776

Open · wants to merge 1 commit into base: refine-ops-pass
101 changes: 101 additions & 0 deletions third_party/amd/lib/TritonAMDGPUTransforms/RefineOps.cpp
@@ -616,6 +616,106 @@ struct LoadOpPattern : public RefineRewritePattern<triton::LoadOp> {
}
};

// In contrast to `tt.load`, which operates on a tensor of pointers,
// `ttg.buffer_load` operates on a tensor descriptor and offsets
// given as a tensor of integers. `ttg.buffer_load` also takes optional
// `mask` and `other` tensors; when provided, these must be sliced as
// well. It is difficult to unify the conversion patterns for `tt.load`
// and `ttg.buffer_load`, so we provide a dedicated pattern to refine
// `ttg.buffer_load` ops.
Review comment:
Can you add an explanatory comment regarding how refinement of buffer_load is more complex than that of global_load; it looks like we need to examine the refinement of masks, otherTensor and offsets and bring it all together. This'll make the function more understandable.

Author:
Done

struct AMDGCNBufferLoadOp
: public RefineRewritePattern<triton::amdgpu::BufferLoadOp> {
AMDGCNBufferLoadOp(MLIRContext *context, PatternBenefit benefit = 1)
: RefineRewritePattern(context, benefit) {}

LogicalResult apply(triton::amdgpu::BufferLoadOp op,
PatternRewriter &rewriter) const override {
auto ctx = op->getContext();
auto loc = op.getLoc();

auto origBasePtr = op.getPtr();
auto origElementType =
cast<PointerType>(origBasePtr.getType()).getPointeeType();
auto origOffsets = op.getOffsets();
auto origEncoding =
cast<RankedTensorType>(origOffsets.getType()).getEncoding();
if (!origEncoding)
return failure();

auto origStride = op.getStride();
auto origCache = op.getCache();
auto origMask = op.getMask();
auto origOtherTensor = op.getOther();

rewriter.setInsertionPointAfter(op);

// refineTensor slices a full tensor into refined sub-tiles via
// `extract_slice` and returns the slices together with the refined
// shape and the number of sub-tiles per dimension.
auto refineTensor = [&](mlir::Value tensor) {
auto tensorType = cast<RankedTensorType>(tensor.getType());
auto origShape = tensorType.getShape();
auto elemType = tensorType.getElementType();
auto encoding = dyn_cast<BlockedEncodingAttr>(tensorType.getEncoding());
assert(encoding != nullptr);

RefinedBlock refinedBlock(origShape, elemType, encoding);

AMD::CoordinateMapper coordsMapper(refinedBlock.numPerDims);
SmallVector<Value> slices;
for (size_t linearIdx = 0; linearIdx < refinedBlock.numSubTiles;
++linearIdx) {
auto coords = coordsMapper.map(linearIdx);
SmallVector<int64_t> offset(refinedBlock.numDims, 0);
for (auto [dim, coord] : llvm::enumerate(coords)) {
offset[dim] = coord * refinedBlock.elementsPerWorkGroup[dim];
}

auto slice = rewriter.create<triton::amdgpu::ExtractSliceOp>(
loc, Type{refinedBlock.tensorType}, Value{tensor}, offset);

slices.push_back(slice);
}

return std::tuple(slices, refinedBlock.refinedShape,
refinedBlock.numPerDims);
};

auto [slicedOffsets, refinedShape, numPerDims] = refineTensor(origOffsets);
std::optional<SmallVector<Value>> slicedMasks;
if (origMask) {
slicedMasks = std::get<0>(refineTensor(origMask));
assert(slicedMasks.value().size() == slicedOffsets.size());
}

std::optional<SmallVector<Value>> slicedOtherTensors;
if (origOtherTensor) {
slicedOtherTensors = std::get<0>(refineTensor(origOtherTensor));
assert(slicedOtherTensors.value().size() == slicedOffsets.size());
}

Type refinedTensorType =
RankedTensorType::get(refinedShape, origElementType, origEncoding);

// Create one refined buffer_load per sub-tile, pairing each sliced
// offset with its corresponding mask and `other` slices when present.
SmallVector<Value> refinedOps;
for (size_t i = 0; i < slicedOffsets.size(); ++i) {
Value slicedOffset = slicedOffsets[i];
Value slicedMask = slicedMasks ? slicedMasks.value()[i] : nullptr;
Value slicedOtherTensor =
slicedOtherTensors ? slicedOtherTensors.value()[i] : nullptr;

auto refinedOp = rewriter.create<triton::amdgpu::BufferLoadOp>(
loc, refinedTensorType, origBasePtr, slicedOffset, origStride,
origCache, slicedMask, slicedOtherTensor);
refinedOps.push_back(refinedOp);
}

// Concatenate the per-tile results back into the original tensor type
// and redirect all uses of the original result.
Value origResult = op.getResult();
auto joinedResult = rewriter.create<triton::amdgpu::ConcatOp>(
loc, origResult.getType(), refinedOps);

origResult.replaceAllUsesWith(joinedResult);
return success();
}
};
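
The slicing loop above leans on the existing RefinedBlock and AMD::CoordinateMapper helpers. As a rough standalone sketch of the arithmetic only (assuming a row-major linear-to-multidimensional index mapping, which may not match the real CoordinateMapper), each linear sub-tile index is decomposed into per-dimension coordinates and scaled by the refined tile extent to obtain the element offsets fed to extract_slice. The mapLinearIndex helper below is hypothetical and not part of this PR.

```cpp
// Hypothetical illustration of the sub-tile offset computation; assumes a
// row-major mapping, which may differ from the real AMD::CoordinateMapper.
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<int64_t>
mapLinearIndex(int64_t linearIdx, const std::vector<int64_t> &numPerDims) {
  std::vector<int64_t> coords(numPerDims.size(), 0);
  // Row-major decomposition: the last dimension varies fastest.
  for (int dim = static_cast<int>(numPerDims.size()) - 1; dim >= 0; --dim) {
    coords[dim] = linearIdx % numPerDims[dim];
    linearIdx /= numPerDims[dim];
  }
  return coords;
}

int main() {
  // Example: a 128x128 tensor refined into 2x4 sub-tiles of 64x32 elements.
  const std::vector<int64_t> numPerDims = {2, 4};
  const std::vector<int64_t> elementsPerWorkGroup = {64, 32};
  const int64_t numSubTiles = numPerDims[0] * numPerDims[1];

  for (int64_t linearIdx = 0; linearIdx < numSubTiles; ++linearIdx) {
    const auto coords = mapLinearIndex(linearIdx, numPerDims);
    // Per-dimension element offset of this sub-tile, mirroring the loop in
    // the pattern above: offset[dim] = coord * elementsPerWorkGroup[dim].
    std::vector<int64_t> offset(coords.size());
    for (size_t dim = 0; dim < coords.size(); ++dim)
      offset[dim] = coords[dim] * elementsPerWorkGroup[dim];
    std::printf("subtile %lld -> offset [%lld, %lld]\n",
                static_cast<long long>(linearIdx),
                static_cast<long long>(offset[0]),
                static_cast<long long>(offset[1]));
  }
  return 0;
}
```

With these example parameters, the eight sub-tiles receive offsets (0,0), (0,32), (0,64), (0,96), (64,0), (64,32), (64,64), and (64,96).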

struct LocalStoreOpPattern
: public RefineRewritePattern<triton::gpu::LocalStoreOp> {
LocalStoreOpPattern(MLIRContext *context, PatternBenefit benefit = 1)
@@ -1212,6 +1312,7 @@ struct TritonAMDGPURefineOps
patterns.add<LocalLoadOpPattern>(context, /*benefit=*/1);
patterns.add<DotOpPattern>(context, /*benefit=*/1);
patterns.add<LoadOpPattern>(context, /*benefit=*/1);
patterns.add<AMDGCNBufferLoadOp>(context, /*benefit=*/1);
patterns.add<LocalStoreOpPattern>(context, /*benefit=*/1);
patterns.add<ReduceOpPattern>(context, /*benefit=*/1);
patterns.add<ExpandDimsOpPattern>(context, /*benefit=*/1);