Skip to content

Commit ae059a1

Browse files
authored
[AMDGPU][True16][CodeGen] support v_mov_b16 and v_swap_b16 in true16 format (#102198)
Support v_swap_b16 in true16 format. Update the TableGen pattern and folding for v_mov_b16. --------- Co-authored-by: guochen2 <[email protected]>
1 parent 1a6d60e commit ae059a1

File tree

9 files changed

+192
-73
lines changed

9 files changed

+192
-73
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -1460,7 +1460,15 @@ bool SIFoldOperands::tryFoldFoldableCopy(
14601460
return false;
14611461
}
14621462

1463-
MachineOperand &OpToFold = MI.getOperand(1);
1463+
MachineOperand *OpToFoldPtr;
1464+
if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1465+
// Folding when any src_modifiers are non-zero is unsupported
1466+
if (TII->hasAnyModifiersSet(MI))
1467+
return false;
1468+
OpToFoldPtr = &MI.getOperand(2);
1469+
} else
1470+
OpToFoldPtr = &MI.getOperand(1);
1471+
MachineOperand &OpToFold = *OpToFoldPtr;
14641472
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
14651473

14661474
// FIXME: We could also be folding things like TargetIndexes.

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -3369,6 +3369,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
33693369

33703370
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
33713371
switch (MI.getOpcode()) {
3372+
case AMDGPU::V_MOV_B16_t16_e32:
3373+
case AMDGPU::V_MOV_B16_t16_e64:
33723374
case AMDGPU::V_MOV_B32_e32:
33733375
case AMDGPU::V_MOV_B32_e64:
33743376
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -5639,7 +5641,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
56395641
unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
56405642
const TargetRegisterClass *RC = RI.getRegClass(RCID);
56415643
unsigned Size = RI.getRegSizeInBits(*RC);
5642-
unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5644+
unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5645+
: Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5646+
: AMDGPU::V_MOV_B32_e32;
56435647
if (MO.isReg())
56445648
Opcode = AMDGPU::COPY;
56455649
else if (RI.isSGPRClass(RC))

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

+45-20
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,7 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs(
657657
// although requirements match the pass placement and it reduces code size too.
658658
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
659659
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
660+
MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
660661
MovT.getOpcode() == AMDGPU::COPY);
661662

662663
Register T = MovT.getOperand(0).getReg();
@@ -668,7 +669,12 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
668669
Register X = Xop.getReg();
669670
unsigned Xsub = Xop.getSubReg();
670671

671-
unsigned Size = TII->getOpSize(MovT, 0) / 4;
672+
unsigned Size = TII->getOpSize(MovT, 0);
673+
674+
// We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
675+
// are not allocatable.
676+
if (Size == 2 && X.isVirtual())
677+
return nullptr;
672678

673679
if (!TRI->isVGPR(*MRI, X))
674680
return nullptr;
@@ -684,9 +690,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
684690
KilledT = MovY->killsRegister(T, TRI);
685691

686692
if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
693+
MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
687694
MovY->getOpcode() != AMDGPU::COPY) ||
688-
!MovY->getOperand(1).isReg() ||
689-
MovY->getOperand(1).getReg() != T ||
695+
!MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
690696
MovY->getOperand(1).getSubReg() != Tsub)
691697
continue;
692698

@@ -714,14 +720,15 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
714720
}
715721
if (MovX ||
716722
(I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
723+
I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
717724
I->getOpcode() != AMDGPU::COPY) ||
718725
I->getOperand(0).getReg() != X ||
719726
I->getOperand(0).getSubReg() != Xsub) {
720727
MovX = nullptr;
721728
break;
722729
}
723730

724-
if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
731+
if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
725732
continue;
726733

727734
MovX = &*I;
@@ -730,23 +737,40 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
730737
if (!MovX)
731738
continue;
732739

733-
LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);
740+
LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);
734741

735-
for (unsigned I = 0; I < Size; ++I) {
736-
TargetInstrInfo::RegSubRegPair X1, Y1;
737-
X1 = getSubRegForIndex(X, Xsub, I);
738-
Y1 = getSubRegForIndex(Y, Ysub, I);
739-
MachineBasicBlock &MBB = *MovT.getParent();
742+
MachineBasicBlock &MBB = *MovT.getParent();
743+
SmallVector<MachineInstr *, 4> Swaps;
744+
if (Size == 2) {
740745
auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
741-
TII->get(AMDGPU::V_SWAP_B32))
742-
.addDef(X1.Reg, 0, X1.SubReg)
743-
.addDef(Y1.Reg, 0, Y1.SubReg)
744-
.addReg(Y1.Reg, 0, Y1.SubReg)
745-
.addReg(X1.Reg, 0, X1.SubReg).getInstr();
746-
if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
747-
// Drop implicit EXEC.
748-
MIB->removeOperand(MIB->getNumExplicitOperands());
749-
MIB->copyImplicitOps(*MBB.getParent(), *MovX);
746+
TII->get(AMDGPU::V_SWAP_B16))
747+
.addDef(X)
748+
.addDef(Y)
749+
.addReg(Y)
750+
.addReg(X)
751+
.getInstr();
752+
Swaps.push_back(MIB);
753+
} else {
754+
assert(Size > 0 && Size % 4 == 0);
755+
for (unsigned I = 0; I < Size / 4; ++I) {
756+
TargetInstrInfo::RegSubRegPair X1, Y1;
757+
X1 = getSubRegForIndex(X, Xsub, I);
758+
Y1 = getSubRegForIndex(Y, Ysub, I);
759+
auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
760+
TII->get(AMDGPU::V_SWAP_B32))
761+
.addDef(X1.Reg, 0, X1.SubReg)
762+
.addDef(Y1.Reg, 0, Y1.SubReg)
763+
.addReg(Y1.Reg, 0, Y1.SubReg)
764+
.addReg(X1.Reg, 0, X1.SubReg)
765+
.getInstr();
766+
Swaps.push_back(MIB);
767+
}
768+
}
769+
// Drop implicit EXEC.
770+
if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
771+
for (MachineInstr *Swap : Swaps) {
772+
Swap->removeOperand(Swap->getNumExplicitOperands());
773+
Swap->copyImplicitOps(*MBB.getParent(), *MovX);
750774
}
751775
}
752776
MovX->eraseFromParent();
@@ -833,6 +857,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
833857
}
834858

835859
if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
860+
MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
836861
MI.getOpcode() == AMDGPU::COPY)) {
837862
if (auto *NextMI = matchSwap(MI)) {
838863
Next = NextMI->getIterator();
@@ -1023,7 +1048,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
10231048
MachineFunctionProperties::Property::NoVRegs))
10241049
continue;
10251050

1026-
if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
1051+
if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
10271052
!shouldShrinkTrue16(MI))
10281053
continue;
10291054

llvm/lib/Target/AMDGPU/VOP1Instructions.td

+1-1
Original file line numberDiff line numberDiff line change
@@ -751,7 +751,7 @@ let SubtargetPredicate = isGFX11Plus in {
751751
let IsInvalidSingleUseConsumer = 1;
752752
let IsInvalidSingleUseProducer = 1;
753753
}
754-
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
754+
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
755755
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
756756
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
757757
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;

llvm/test/CodeGen/AMDGPU/bf16.ll

+8-20
Original file line numberDiff line numberDiff line change
@@ -2131,26 +2131,14 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
21312131
; GFX10-NEXT: global_store_short v[2:3], v5, off
21322132
; GFX10-NEXT: s_setpc_b64 s[30:31]
21332133
;
2134-
; GFX11TRUE16-LABEL: test_store_fpimm:
2135-
; GFX11TRUE16: ; %bb.0:
2136-
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2137-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
2138-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
2139-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2140-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
2141-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
2142-
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off
2143-
; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off
2144-
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
2145-
;
2146-
; GFX11FAKE16-LABEL: test_store_fpimm:
2147-
; GFX11FAKE16: ; %bb.0:
2148-
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149-
; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
2150-
; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
2151-
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
2152-
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
2153-
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
2134+
; GFX11-LABEL: test_store_fpimm:
2135+
; GFX11: ; %bb.0:
2136+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2137+
; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
2138+
; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
2139+
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
2140+
; GFX11-NEXT: global_store_b16 v[2:3], v5, off
2141+
; GFX11-NEXT: s_setpc_b64 s[30:31]
21542142
store bfloat 1.0, ptr addrspace(1) %ptr0
21552143
store bfloat 42.0, ptr addrspace(1) %ptr1
21562144
ret void

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

+4-12
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
246246
; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
247247
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
248248
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
249-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
250-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
251-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
249+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
252250
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
253251
; GFX11-SDAG-NEXT: s_nop 0
254252
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -264,9 +262,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
264262
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
265263
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
266264
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
267-
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
268-
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
269-
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
265+
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
270266
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
271267
; GFX11-GISEL-NEXT: s_nop 0
272268
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -390,9 +386,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
390386
; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
391387
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
392388
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
393-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000
394-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
395-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
389+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
396390
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
397391
; GFX11-SDAG-NEXT: s_nop 0
398392
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -408,9 +402,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
408402
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
409403
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
410404
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
411-
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
412-
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
413-
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
405+
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
414406
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
415407
; GFX11-GISEL-NEXT: s_nop 0
416408
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
4+
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
5+
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
6+
7+
define half @swap(half %a, half %b, i32 %i) {
8+
; GFX11-TRUE16-LABEL: swap:
9+
; GFX11-TRUE16: ; %bb.0: ; %entry
10+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
12+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
13+
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
14+
; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop
15+
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
16+
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2
17+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
18+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
19+
; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h
20+
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
21+
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
22+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
23+
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
24+
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
25+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret
26+
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
27+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
28+
;
29+
; GFX11-FAKE16-LABEL: swap:
30+
; GFX11-FAKE16: ; %bb.0: ; %entry
31+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32+
; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
33+
; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop
34+
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
35+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
36+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2
37+
; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0
38+
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
39+
; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
40+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
41+
; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
42+
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
43+
; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret
44+
; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
45+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
46+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
47+
;
48+
; GFX12-TRUE16-LABEL: swap:
49+
; GFX12-TRUE16: ; %bb.0: ; %entry
50+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
51+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
52+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
53+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
54+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
55+
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
56+
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
57+
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
58+
; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop
59+
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
60+
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2
61+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
62+
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
63+
; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h
64+
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
65+
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
66+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
67+
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
68+
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
69+
; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret
70+
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
71+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
72+
;
73+
; GFX12-FAKE16-LABEL: swap:
74+
; GFX12-FAKE16: ; %bb.0: ; %entry
75+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
76+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
77+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
78+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
79+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
80+
; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
81+
; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop
82+
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
83+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
84+
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2
85+
; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0
86+
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
87+
; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
88+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89+
; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
90+
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
91+
; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret
92+
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
93+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
94+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
95+
entry:
96+
br label %loop
97+
98+
loop:
99+
%x = phi half [%a, %entry], [%y, %loop]
100+
%y = phi half [%b, %entry], [%x, %loop]
101+
%i2 = phi i32 [%i, %entry], [%i3, %loop]
102+
103+
%i3 = sub i32 %i2, 1
104+
105+
%cmp = icmp eq i32 %i3, 0
106+
br i1 %cmp, label %ret, label %loop
107+
108+
ret:
109+
ret half %x
110+
}

llvm/test/MC/AMDGPU/gfx11_asm_err.s

-18
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,3 @@ s_load_b96 s[20:22], s[2:3], s0
169169

170170
s_buffer_load_b96 s[20:22], s[4:7], s0
171171
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
172-
173-
v_mov_b16 v0.l, s0.h
174-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
175-
176-
v_mov_b16 v0.l, ttmp0.h
177-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
178-
179-
v_mov_b16 v0.l, a0.h
180-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
181-
182-
v_mov_b16 v0.l, s0.h
183-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
184-
185-
v_mov_b16 v0.l, ttmp0.h
186-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
187-
188-
v_mov_b16 v0.l, a0.h
189-
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
2+
3+
v_mov_b16 v0.l, s0.h
4+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
5+
6+
v_mov_b16 v0.l, ttmp0.h
7+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
8+
9+
v_mov_b16 v0.l, a0.h
10+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

0 commit comments

Comments
 (0)