@@ -216,7 +216,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
216
216
CombineInfo &Paired, bool Modify = false );
217
217
static bool widthsFit (const GCNSubtarget &STI, const CombineInfo &CI,
218
218
const CombineInfo &Paired);
219
- static unsigned getNewOpcode (const CombineInfo &CI, const CombineInfo &Paired);
219
+ unsigned getNewOpcode (const CombineInfo &CI, const CombineInfo &Paired);
220
220
static std::pair<unsigned , unsigned > getSubRegIdxs (const CombineInfo &CI,
221
221
const CombineInfo &Paired);
222
222
const TargetRegisterClass *
@@ -353,6 +353,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
353
353
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354
354
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355
355
case AMDGPU::S_LOAD_DWORDX2_IMM:
356
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
356
357
case AMDGPU::GLOBAL_LOAD_DWORDX2:
357
358
case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
358
359
case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +364,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
363
364
case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
364
365
case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
365
366
case AMDGPU::S_LOAD_DWORDX3_IMM:
367
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
366
368
case AMDGPU::GLOBAL_LOAD_DWORDX3:
367
369
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
368
370
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +375,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
373
375
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
374
376
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
375
377
case AMDGPU::S_LOAD_DWORDX4_IMM:
378
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
376
379
case AMDGPU::GLOBAL_LOAD_DWORDX4:
377
380
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
378
381
case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +386,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
383
386
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
384
387
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
385
388
case AMDGPU::S_LOAD_DWORDX8_IMM:
389
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
386
390
return 8 ;
387
391
case AMDGPU::DS_READ_B32:
388
392
case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +511,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
507
511
case AMDGPU::S_LOAD_DWORDX3_IMM:
508
512
case AMDGPU::S_LOAD_DWORDX4_IMM:
509
513
case AMDGPU::S_LOAD_DWORDX8_IMM:
514
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
510
518
return S_LOAD_IMM;
511
519
case AMDGPU::DS_READ_B32:
512
520
case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +599,10 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
591
599
case AMDGPU::S_LOAD_DWORDX3_IMM:
592
600
case AMDGPU::S_LOAD_DWORDX4_IMM:
593
601
case AMDGPU::S_LOAD_DWORDX8_IMM:
602
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
594
606
return AMDGPU::S_LOAD_DWORD_IMM;
595
607
case AMDGPU::GLOBAL_LOAD_DWORD:
596
608
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +715,10 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
703
715
case AMDGPU::S_LOAD_DWORDX3_IMM:
704
716
case AMDGPU::S_LOAD_DWORDX4_IMM:
705
717
case AMDGPU::S_LOAD_DWORDX8_IMM:
718
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719
+ case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721
+ case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
706
722
Result.SBase = true ;
707
723
return Result;
708
724
case AMDGPU::DS_READ_B32:
@@ -1212,8 +1228,14 @@ void SILoadStoreOptimizer::copyToDestRegs(
1212
1228
1213
1229
// Copy to the old destination registers.
1214
1230
const MCInstrDesc &CopyDesc = TII->get (TargetOpcode::COPY);
1215
- const auto *Dest0 = TII->getNamedOperand (*CI.I , OpName);
1216
- const auto *Dest1 = TII->getNamedOperand (*Paired.I , OpName);
1231
+ auto *Dest0 = TII->getNamedOperand (*CI.I , OpName);
1232
+ auto *Dest1 = TII->getNamedOperand (*Paired.I , OpName);
1233
+
1234
+ // The constrained sload instructions in S_LOAD_IMM class will have
1235
+ // `early-clobber` flag in the dst operand. Remove the flag before using the
1236
+ // MOs in copies.
1237
+ Dest0->setIsEarlyClobber (false );
1238
+ Dest1->setIsEarlyClobber (false );
1217
1239
1218
1240
BuildMI (*MBB, InsertBefore, DL, CopyDesc)
1219
1241
.add (*Dest0) // Copy to same destination including flags and sub reg.
@@ -1700,19 +1722,29 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1700
1722
case 8 :
1701
1723
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1702
1724
}
1703
- case S_LOAD_IMM:
1725
+ case S_LOAD_IMM: {
1726
+ // If XNACK is enabled, use the constrained opcodes when the first load is
1727
+ // under-aligned.
1728
+ const MachineMemOperand *MMO = *CI.I ->memoperands_begin ();
1729
+ bool NeedsConstrainedOpc =
1730
+ STM->isXNACKEnabled () && MMO->getAlign ().value () < Width * 4 ;
1704
1731
switch (Width) {
1705
1732
default :
1706
1733
return 0 ;
1707
1734
case 2 :
1708
- return AMDGPU::S_LOAD_DWORDX2_IMM;
1735
+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736
+ : AMDGPU::S_LOAD_DWORDX2_IMM;
1709
1737
case 3 :
1710
- return AMDGPU::S_LOAD_DWORDX3_IMM;
1738
+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739
+ : AMDGPU::S_LOAD_DWORDX3_IMM;
1711
1740
case 4 :
1712
- return AMDGPU::S_LOAD_DWORDX4_IMM;
1741
+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742
+ : AMDGPU::S_LOAD_DWORDX4_IMM;
1713
1743
case 8 :
1714
- return AMDGPU::S_LOAD_DWORDX8_IMM;
1744
+ return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745
+ : AMDGPU::S_LOAD_DWORDX8_IMM;
1715
1746
}
1747
+ }
1716
1748
case GLOBAL_LOAD:
1717
1749
switch (Width) {
1718
1750
default :
0 commit comments