Skip to content

Commit 0745219

Browse files
authored
[AMDGPU] Add target intrinsic for s_buffer_prefetch_data (llvm#107293)
1 parent 4c1a6a2 commit 0745219

16 files changed

+168
-9
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
449449
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
450450
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
451451
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
452+
TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts")
452453

453454
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
454455
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,8 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
2222
__builtin_amdgcn_s_barrier_wait(-1);
2323
*out = *in;
2424
}
25+
26+
// Negative test: the second argument is passed as the runtime value 'off',
// and the diagnostic below requires it to be a constant integer.
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
27+
{
28+
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}
29+
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,3 +281,22 @@ void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned
281281
__builtin_amdgcn_s_prefetch_data(gp, len);
282282
__builtin_amdgcn_s_prefetch_data(cp, 31);
283283
}
284+
285+
// CHECK-LABEL: @test_s_buffer_prefetch_data(
286+
// CHECK-NEXT: entry:
287+
// CHECK-NEXT: [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
288+
// CHECK-NEXT: [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
289+
// CHECK-NEXT: store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
290+
// CHECK-NEXT: store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
291+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
292+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
293+
// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
294+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
295+
// CHECK-NEXT: call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
296+
// CHECK-NEXT: ret void
297+
//
298+
// Codegen test for the buffer prefetch builtin. The first call uses a constant
// second argument (128) with a runtime third argument; the second call uses
// constants for both (0 and 31). Each call lowers to a call of
// @llvm.amdgcn.s.buffer.prefetch.data with the same operands.
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
299+
{
300+
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
301+
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
302+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,6 +1723,15 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
17231723
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
17241724
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;
17251725

1726+
// Buffer prefetch intrinsic. It produces no result and takes three operands:
// the buffer resource, an i32 offset, and an i32 length. The offset must be an
// immediate (ImmArg on operand 1); the length may be a register or an
// immediate. The resource is operand 0 (AMDGPURsrcIntrinsic<0>), and the
// intrinsic is exposed to Clang as __builtin_amdgcn_s_buffer_prefetch_data.
def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic <
1727+
[],
1728+
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
1729+
llvm_i32_ty, // offset (imm)
1730+
llvm_i32_ty], // len (SGPR/imm)
1731+
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>,
1732+
AMDGPURsrcIntrinsic<0>,
1733+
ClangBuiltin<"__builtin_amdgcn_s_buffer_prefetch_data">;
1734+
17261735
} // defset AMDGPUBufferIntrinsics
17271736

17281737
// Uses that do not set the done bit should set IntrWriteMem on the

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,7 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
296296
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
297297
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
298298
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
299+
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
299300

300301
class GISelSop2Pat <
301302
SDPatternOperator node,

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5545,6 +5545,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
55455545
NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
55465546
NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
55475547
NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5548+
NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
55485549
NODE_NAME_CASE(BUFFER_STORE)
55495550
NODE_NAME_CASE(BUFFER_STORE_BYTE)
55505551
NODE_NAME_CASE(BUFFER_STORE_SHORT)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,7 @@ enum NodeType : unsigned {
589589
SBUFFER_LOAD_UBYTE,
590590
SBUFFER_LOAD_SHORT,
591591
SBUFFER_LOAD_USHORT,
592+
SBUFFER_PREFETCH_DATA,
592593
BUFFER_STORE,
593594
BUFFER_STORE_BYTE,
594595
BUFFER_STORE_SHORT,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5237,7 +5237,8 @@ getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
52375237

52385238
InstructionSelector::ComplexRendererFns
52395239
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5240-
std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5240+
std::optional<uint64_t> OffsetVal =
5241+
Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
52415242
if (!OffsetVal)
52425243
return {};
52435244

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6797,6 +6797,18 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
67976797
return true;
67986798
}
67996799

6800+
/// Legalizes the amdgcn_s_buffer_prefetch_data intrinsic by rewriting \p MI in
/// place into a G_AMDGPU_S_BUFFER_PREFETCH instruction.
///
/// \param Helper supplies the MachineIRBuilder and the change observer.
/// \param MI the intrinsic instruction to rewrite.
/// \returns true unconditionally.
bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6801+
MachineInstr &MI) const {
6802+
MachineIRBuilder &B = Helper.MIRBuilder;
6803+
GISelChangeObserver &Observer = Helper.Observer;
6804+
// The in-place mutation is bracketed by changingInstr/changedInstr
// notifications to the observer.
Observer.changingInstr(MI);
6805+
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6806+
MI.removeOperand(0); // Remove intrinsic ID
6807+
// With the intrinsic ID removed, the resource is now operand 0.
// NOTE(review): the helper presumably rewrites that operand to <4 x i32>,
// going by its name -- confirm against its definition.
castBufferRsrcArgToV4I32(MI, B, 0);
6808+
Observer.changedInstr(MI);
6809+
return true;
6810+
}
6811+
68006812
// TODO: Move to selection
68016813
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
68026814
MachineRegisterInfo &MRI,
@@ -7485,6 +7497,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
74857497
case Intrinsic::amdgcn_permlanex16:
74867498
case Intrinsic::amdgcn_permlane64:
74877499
return legalizeLaneOp(Helper, MI, IntrID);
7500+
case Intrinsic::amdgcn_s_buffer_prefetch_data:
7501+
return legalizeSBufferPrefetch(Helper, MI);
74887502
default: {
74897503
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
74907504
AMDGPU::getImageDimIntrinsicInfo(IntrID))

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
227227

228228
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
229229

230+
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const;
231+
230232
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI,
231233
MachineIRBuilder &B) const;
232234
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3101,6 +3101,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
31013101
applyMappingSBufferLoad(B, OpdMapper);
31023102
return;
31033103
}
3104+
case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
3105+
constrainOpWithReadfirstlane(B, MI, 0);
3106+
constrainOpWithReadfirstlane(B, MI, 2);
3107+
return;
31043108
case AMDGPU::G_INTRINSIC:
31053109
case AMDGPU::G_INTRINSIC_CONVERGENT: {
31063110
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
@@ -4464,6 +4468,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
44644468
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
44654469
break;
44664470
}
4471+
case AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH:
4472+
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4473+
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4474+
break;
44674475
case AMDGPU::G_INTRINSIC:
44684476
case AMDGPU::G_INTRINSIC_CONVERGENT: {
44694477
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1210,9 +1210,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12101210
Info.ptrVal = RsrcArg;
12111211
}
12121212

1213-
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1214-
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1215-
Info.flags |= MachineMemOperand::MOVolatile;
1213+
bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1214+
if (!IsSPrefetch) {
1215+
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1216+
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1217+
Info.flags |= MachineMemOperand::MOVolatile;
1218+
}
1219+
12161220
Info.flags |= MachineMemOperand::MODereferenceable;
12171221
if (ME.onlyReadsMemory()) {
12181222
if (RsrcIntr->IsImage) {
@@ -1251,16 +1255,18 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12511255

12521256
Info.flags |= MachineMemOperand::MOStore;
12531257
} else {
1254-
// Atomic or NoReturn Sampler
1258+
// Atomic, NoReturn Sampler or prefetch
12551259
Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
12561260
ISD::INTRINSIC_W_CHAIN;
1257-
Info.flags |= MachineMemOperand::MOLoad |
1258-
MachineMemOperand::MOStore |
1259-
MachineMemOperand::MODereferenceable;
1261+
Info.flags |=
1262+
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1263+
1264+
if (!IsSPrefetch)
1265+
Info.flags |= MachineMemOperand::MOStore;
12601266

12611267
switch (IntrID) {
12621268
default:
1263-
if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1269+
if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
12641270
// Fake memory access type for no return sampler and s_buffer_prefetch_data intrinsics
12651271
Info.memVT = MVT::i32;
12661272
} else {
@@ -9934,6 +9940,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
99349940
return Op.getOperand(0);
99359941
return Op;
99369942
}
9943+
case Intrinsic::amdgcn_s_buffer_prefetch_data: {
9944+
SDValue Ops[] = {
9945+
Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
9946+
Op.getOperand(3), // offset
9947+
Op.getOperand(4), // length
9948+
};
9949+
9950+
MemSDNode *M = cast<MemSDNode>(Op);
9951+
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
9952+
Op->getVTList(), Ops, M->getMemoryVT(),
9953+
M->getMemOperand());
9954+
}
99379955
default: {
99389956
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
99399957
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,14 @@ class isIntType<ValueType SrcVT> {
312312
bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
313313
}
314314

315+
// Type profile for the scalar buffer prefetch node: no results and three
// operands (a v4i32 resource, an i32 offset, and an i32 length).
def SDTSBufferPrefetch : SDTypeProfile<0, 3,
316+
[SDTCisVT<0, v4i32>, // rsrc
317+
SDTCisVT<1, i32>, // offset(imm)
318+
SDTCisVT<2, i32>]>; // length
319+
320+
// SelectionDAG node for AMDGPUISD::SBUFFER_PREFETCH_DATA. It is chained,
// carries a memory operand, and is flagged as both may-load and may-store.
def SIsbuffer_prefetch : SDNode<"AMDGPUISD::SBUFFER_PREFETCH_DATA", SDTSBufferPrefetch,
321+
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
322+
315323
//===----------------------------------------------------------------------===//
316324
// SDNodes PatFrags for loads/stores with a glue input.
317325
// This is for SDNodes and PatFrag for local loads and stores to

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3984,6 +3984,16 @@ def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
39843984
def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
39853985
def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;
39863986

3987+
// Generic (pre-selection) form of the scalar buffer prefetch. It defines no
// outputs; its inputs are the resource, an untyped immediate offset, and the
// length. It is marked may-load and may-store with no other side effects.
class SBufferPrefetchInstruction : AMDGPUGenericInstruction {
3988+
let OutOperandList = (outs);
3989+
let InOperandList = (ins type0:$rsrc, untyped_imm_0:$offset, type1:$len);
3990+
let hasSideEffects = 0;
3991+
let mayLoad = 1;
3992+
let mayStore = 1;
3993+
}
3994+
3995+
// The generic opcode instantiating the class above.
def G_AMDGPU_S_BUFFER_PREFETCH : SBufferPrefetchInstruction;
3996+
39873997
def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
39883998
let OutOperandList = (outs type0:$dst);
39893999
let InOperandList = (ins type0:$src0, type0:$src1);

llvm/lib/Target/AMDGPU/SMInstructions.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1171,6 +1171,17 @@ let SubtargetPredicate = isGFX12Plus in {
11711171
def : GCNPat <
11721172
(int_amdgcn_s_prefetch_data (i64 SReg_64:$sbase), imm:$len),
11731173
(S_PREFETCH_DATA $sbase, 0, (i32 SGPR_NULL), (as_i8timm $len))
1174+
1175+
>;
1176+
1177+
// Register length: the length comes from a 32-bit SGPR and the trailing
// immediate operand of S_BUFFER_PREFETCH_DATA is 0.
def : GCNPat <
1178+
(SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), (i32 SReg_32:$len)),
1179+
(S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, $len, 0)
1180+
>;
1182+
// Immediate length: the register operand is SGPR_NULL and the length is
// encoded in the trailing immediate operand via as_i8timm.
def : GCNPat <
1183+
(SIsbuffer_prefetch v4i32:$sbase, (SMRDBufferImm i32:$offset), imm:$len),
1184+
(S_BUFFER_PREFETCH_DATA SReg_128:$sbase, i32imm:$offset, (i32 SGPR_NULL), (as_i8timm $len))
11741185
>;
11751186
} // End let SubtargetPredicate = isGFX12Plus
11761187

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=GCN %s
4+
5+
declare void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) %rsrc, i32 %offset, i32 %len)
6+
7+
; Immediate offset (128) with the length passed inreg: the length register is
; used directly and the trailing immediate is 0.
define amdgpu_ps void @buffer_prefetch_data_imm_offset_sgpr_len(ptr addrspace(8) inreg %rsrc, i32 inreg %len) {
8+
; GCN-LABEL: buffer_prefetch_data_imm_offset_sgpr_len:
9+
; GCN: ; %bb.0: ; %entry
10+
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
11+
; GCN-NEXT: s_endpgm
12+
entry:
13+
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
14+
ret void
15+
}
16+
17+
; Immediate offset (0) and constant length (31): the register operand is null
; and the length appears as the trailing immediate.
define amdgpu_ps void @buffer_prefetch_data_imm_offset_imm_len(ptr addrspace(8) inreg %rsrc) {
18+
; GCN-LABEL: buffer_prefetch_data_imm_offset_imm_len:
19+
; GCN: ; %bb.0: ; %entry
20+
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x0, null, 31
21+
; GCN-NEXT: s_endpgm
22+
entry:
23+
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 0, i32 31)
24+
ret void
25+
}
26+
27+
; The length is not passed inreg here: it is moved into an SGPR with
; v_readfirstlane_b32 before the prefetch.
define amdgpu_ps void @buffer_prefetch_data_imm_offset_vgpr_len(ptr addrspace(8) inreg %rsrc, i32 %len) {
28+
; GCN-LABEL: buffer_prefetch_data_imm_offset_vgpr_len:
29+
; GCN: ; %bb.0: ; %entry
30+
; GCN-NEXT: v_readfirstlane_b32 s4, v0
31+
; GCN-NEXT: s_buffer_prefetch_data s[0:3], 0x80, s4, 0
32+
; GCN-NEXT: s_endpgm
33+
entry:
34+
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
35+
ret void
36+
}
37+
38+
; The resource is not passed inreg here: each of its four dwords is moved into
; an SGPR with v_readfirstlane_b32 before the prefetch, while the inreg length
; is used directly.
define amdgpu_ps void @buffer_prefetch_data_vgpr_rsrc_imm_offset_sgpr_len(ptr addrspace(8) %rsrc, i32 inreg %len) {
39+
; GCN-LABEL: buffer_prefetch_data_vgpr_rsrc_imm_offset_sgpr_len:
40+
; GCN: ; %bb.0: ; %entry
41+
; GCN-NEXT: v_readfirstlane_b32 s4, v0
42+
; GCN-NEXT: v_readfirstlane_b32 s5, v1
43+
; GCN-NEXT: v_readfirstlane_b32 s6, v2
44+
; GCN-NEXT: v_readfirstlane_b32 s7, v3
45+
; GCN-NEXT: s_buffer_prefetch_data s[4:7], 0x80, s0, 0
46+
; GCN-NEXT: s_endpgm
47+
entry:
48+
tail call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) inreg %rsrc, i32 128, i32 %len)
49+
ret void
50+
}

0 commit comments

Comments
 (0)