Skip to content

Commit a1d7da0

Browse files
authored
[AMDGPU][SILoadStoreOptimizer] Merge constrained sloads (#96162)
Consider constrained multi-dword loads when merging individual loads into a single multi-dword load.
1 parent 0998e3c commit a1d7da0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

61 files changed

+2483
-2216
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

+40-8
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
216216
CombineInfo &Paired, bool Modify = false);
217217
static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218218
const CombineInfo &Paired);
219-
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
219+
unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220220
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221221
const CombineInfo &Paired);
222222
const TargetRegisterClass *
@@ -353,6 +353,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
353353
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354354
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355355
case AMDGPU::S_LOAD_DWORDX2_IMM:
356+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
356357
case AMDGPU::GLOBAL_LOAD_DWORDX2:
357358
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
358359
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +364,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363364
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
364365
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
365366
case AMDGPU::S_LOAD_DWORDX3_IMM:
367+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
366368
case AMDGPU::GLOBAL_LOAD_DWORDX3:
367369
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
368370
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +375,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
373375
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
374376
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
375377
case AMDGPU::S_LOAD_DWORDX4_IMM:
378+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
376379
case AMDGPU::GLOBAL_LOAD_DWORDX4:
377380
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
378381
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +386,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
383386
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
384387
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
385388
case AMDGPU::S_LOAD_DWORDX8_IMM:
389+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
386390
return 8;
387391
case AMDGPU::DS_READ_B32:
388392
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +511,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
507511
case AMDGPU::S_LOAD_DWORDX3_IMM:
508512
case AMDGPU::S_LOAD_DWORDX4_IMM:
509513
case AMDGPU::S_LOAD_DWORDX8_IMM:
514+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
510518
return S_LOAD_IMM;
511519
case AMDGPU::DS_READ_B32:
512520
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +599,10 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
591599
case AMDGPU::S_LOAD_DWORDX3_IMM:
592600
case AMDGPU::S_LOAD_DWORDX4_IMM:
593601
case AMDGPU::S_LOAD_DWORDX8_IMM:
602+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
594606
return AMDGPU::S_LOAD_DWORD_IMM;
595607
case AMDGPU::GLOBAL_LOAD_DWORD:
596608
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +715,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703715
case AMDGPU::S_LOAD_DWORDX3_IMM:
704716
case AMDGPU::S_LOAD_DWORDX4_IMM:
705717
case AMDGPU::S_LOAD_DWORDX8_IMM:
718+
case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719+
case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720+
case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721+
case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
706722
Result.SBase = true;
707723
return Result;
708724
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1228,14 @@ void SILoadStoreOptimizer::copyToDestRegs(
12121228

12131229
// Copy to the old destination registers.
12141230
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1215-
const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1216-
const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1231+
auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232+
auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1233+
1234+
// The constrained sload instructions in S_LOAD_IMM class will have
1235+
// `early-clobber` flag in the dst operand. Remove the flag before using the
1236+
// MOs in copies.
1237+
Dest0->setIsEarlyClobber(false);
1238+
Dest1->setIsEarlyClobber(false);
12171239

12181240
BuildMI(*MBB, InsertBefore, DL, CopyDesc)
12191241
.add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1700,19 +1722,29 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
17001722
case 8:
17011723
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17021724
}
1703-
case S_LOAD_IMM:
1725+
case S_LOAD_IMM: {
1726+
// If XNACK is enabled, use the constrained opcodes when the first load is
1727+
// under-aligned.
1728+
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1729+
bool NeedsConstrainedOpc =
1730+
STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4;
17041731
switch (Width) {
17051732
default:
17061733
return 0;
17071734
case 2:
1708-
return AMDGPU::S_LOAD_DWORDX2_IMM;
1735+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736+
: AMDGPU::S_LOAD_DWORDX2_IMM;
17091737
case 3:
1710-
return AMDGPU::S_LOAD_DWORDX3_IMM;
1738+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739+
: AMDGPU::S_LOAD_DWORDX3_IMM;
17111740
case 4:
1712-
return AMDGPU::S_LOAD_DWORDX4_IMM;
1741+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742+
: AMDGPU::S_LOAD_DWORDX4_IMM;
17131743
case 8:
1714-
return AMDGPU::S_LOAD_DWORDX8_IMM;
1744+
return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745+
: AMDGPU::S_LOAD_DWORDX8_IMM;
17151746
}
1747+
}
17161748
case GLOBAL_LOAD:
17171749
switch (Width) {
17181750
default:

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll

+12-12
Original file line numberDiff line numberDiff line change
@@ -658,17 +658,17 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
658658
;
659659
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
660660
; GFX1013: ; %bb.0:
661-
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
661+
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
662662
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
663663
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
664664
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
665665
; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
666666
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
667667
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
668-
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
669-
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
670-
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
671-
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
668+
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
669+
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
670+
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
671+
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
672672
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
673673
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
674674
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
@@ -681,7 +681,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr,
681681
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
682682
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
683683
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
684-
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7]
684+
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11]
685685
; GFX1013-NEXT: s_waitcnt vmcnt(0)
686686
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
687687
; GFX1013-NEXT: s_endpgm
@@ -769,14 +769,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
769769
;
770770
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
771771
; GFX1013: ; %bb.0:
772-
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
772+
; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
773773
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
774774
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
775775
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
776-
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
777-
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
778-
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
779-
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
776+
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
777+
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
778+
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
779+
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
780780
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
781781
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
782782
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
@@ -789,7 +789,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
789789
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
790790
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
791791
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
792-
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
792+
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
793793
; GFX1013-NEXT: s_waitcnt vmcnt(0)
794794
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
795795
; GFX1013-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

+14-14
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
1919
;
2020
; GFX10-LABEL: dpp_test:
2121
; GFX10: ; %bb.0:
22-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
22+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2323
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
24-
; GFX10-NEXT: v_mov_b32_e32 v0, s2
25-
; GFX10-NEXT: v_mov_b32_e32 v1, s3
24+
; GFX10-NEXT: v_mov_b32_e32 v0, s6
25+
; GFX10-NEXT: v_mov_b32_e32 v1, s7
2626
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
2727
; GFX10-NEXT: v_mov_b32_e32 v1, 0
28-
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
28+
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
2929
; GFX10-NEXT: s_endpgm
3030
;
3131
; GFX11-LABEL: dpp_test:
@@ -176,16 +176,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
176176
;
177177
; GFX10-LABEL: update_dppv2i32_test:
178178
; GFX10: ; %bb.0:
179-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
179+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
180180
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
181181
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
182-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
183-
; GFX10-NEXT: v_mov_b32_e32 v2, s2
184-
; GFX10-NEXT: v_mov_b32_e32 v3, s3
182+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
183+
; GFX10-NEXT: v_mov_b32_e32 v2, s6
184+
; GFX10-NEXT: v_mov_b32_e32 v3, s7
185185
; GFX10-NEXT: s_waitcnt vmcnt(0)
186186
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
187187
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
188-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
188+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
189189
; GFX10-NEXT: s_endpgm
190190
;
191191
; GFX11-LABEL: update_dppv2i32_test:
@@ -232,16 +232,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
232232
;
233233
; GFX10-LABEL: update_dppv2f32_test:
234234
; GFX10: ; %bb.0:
235-
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
235+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
236236
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
237237
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
238-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
239-
; GFX10-NEXT: v_mov_b32_e32 v2, s2
240-
; GFX10-NEXT: v_mov_b32_e32 v3, s3
238+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
239+
; GFX10-NEXT: v_mov_b32_e32 v2, s6
240+
; GFX10-NEXT: v_mov_b32_e32 v3, s7
241241
; GFX10-NEXT: s_waitcnt vmcnt(0)
242242
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
243243
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
244-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
244+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
245245
; GFX10-NEXT: s_endpgm
246246
;
247247
; GFX11-LABEL: update_dppv2f32_test:

0 commit comments

Comments
 (0)