diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 04b392829f0d7..afb82d096d843 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -60,6 +60,7 @@
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Bitset.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -324,6 +325,10 @@ class Vectorizer {
                              Instruction *ChainElem, Instruction *ChainBegin,
                              const DenseMap<Instruction *, APInt> &ChainOffsets);
 
+  /// Merge two equivalence classes if casts can be inserted into one of them
+  /// so that its instructions match the total bit width of the other's.
+  void insertCastsToMergeClasses(EquivalenceClassMap &EQClasses);
+
   /// Merges the equivalence classes if they have underlying objects that differ
   /// by one level of indirection (i.e., one is a getelementptr and the other is
   /// the base pointer in that getelementptr).
@@ -1310,6 +1315,135 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
   return std::nullopt;
 }
 
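+// Illustrative sketch of the transformation (hypothetical names and values,
+// not taken from the test suite): given two classes over the same underlying
+// object whose keys differ only in element size,
+//   %a = load i32, ptr %p          ; 32-bit element class
+//   %b = load <2 x i16>, ptr %q    ; 16-bit element class, 32 bits in total
+// the second load is rewritten so that both loads land in the 32-bit class
+// and can be vectorized together:
+//   %b.wide = load i32, ptr %q
+//   %b = bitcast i32 %b.wide to <2 x i16>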
+void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
+  if (EQClasses.size() < 2)
+    return;
+
+  auto CopyMetaDataFromTo = [&](Instruction *Src, Instruction *Dst) {
+    SmallVector<std::pair<unsigned, MDNode *>, 4> MD;
+    Src->getAllMetadata(MD);
+    for (const auto [ID, Node] : MD)
+      Dst->setMetadata(ID, Node);
+  };
+
+  // For each class, determine whether all of its instructions are of integer,
+  // FP, or pointer type. This determines the type the instructions should be
+  // cast to when classes are merged.
+  MapVector<EqClassKey, Bitset<3>> ClassAllTy;
+  for (const auto &C : EQClasses) {
+    auto CommonTypeKind = [](Instruction *I) {
+      if (I->getType()->isIntOrIntVectorTy())
+        return 0;
+      if (I->getType()->isFPOrFPVectorTy())
+        return 1;
+      if (I->getType()->isPtrOrPtrVectorTy())
+        return 2;
+      return -1; // Invalid type kind.
+    };
+
+    int FirstTypeKind = CommonTypeKind(EQClasses[C.first][0]);
+    if (FirstTypeKind != -1 && all_of(EQClasses[C.first], [&](Instruction *I) {
+          return CommonTypeKind(I) == FirstTypeKind;
+        }))
+      ClassAllTy[C.first].set(FirstTypeKind);
+  }
+
+  // Loop over all equivalence classes and try to merge them. Keep track of
+  // classes that are merged into others.
+  DenseSet<EqClassKey> ClassesToErase;
+  for (auto EC1 : EQClasses) {
+    for (auto EC2 : EQClasses) {
+      // Skip if EC2 was already merged before, if EC1 precedes EC2 in the
+      // collection (so each pair is considered only once), or if EC1 is the
+      // same class as EC2.
+      if (ClassesToErase.contains(EC2.first) || EC1 <= EC2 ||
+          EC1.first == EC2.first)
+        continue;
+
+      auto [Ptr1, AS1, TySize1, IsLoad1] = EC1.first;
+      auto [Ptr2, AS2, TySize2, IsLoad2] = EC2.first;
+
+      // Attempt to merge EC2 into EC1. Skip if the pointers, address spaces,
+      // or load/store kinds of the two classes differ. Also skip if the
+      // scalar bitwidth of EC1 is smaller than that of EC2, to avoid
+      // reconsidering the same pair of equivalence classes.
+      if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
+        continue;
+
+      // An all-FP class may only absorb another all-FP class, and an
+      // all-pointer class may only be merged into another all-pointer class.
+      if ((ClassAllTy[EC1.first].test(1) && !ClassAllTy[EC2.first].test(1)) ||
+          (!ClassAllTy[EC1.first].test(2) && ClassAllTy[EC2.first].test(2)))
+        continue;
+
+      // Ensure all instructions in EC2 can be bitcast to NewTy.
+      // TODO: NewTyBits is needed because structured bindings cannot be
+      // captured by a lambda until C++20.
+      auto NewTyBits = std::get<2>(EC1.first);
+      if (any_of(EC2.second, [&](Instruction *I) {
+            return DL.getTypeSizeInBits(getLoadStoreType(I)) != NewTyBits;
+          }))
+        continue;
+
+      // Create a new type for the merged equivalence class.
+      auto &Ctx = EC2.second[0]->getContext();
+      Type *NewTy = Type::getIntNTy(Ctx, NewTyBits);
+      if (ClassAllTy[EC1.first].test(1) && ClassAllTy[EC2.first].test(1)) {
+        if (NewTyBits == 16)
+          NewTy = Type::getHalfTy(Ctx);
+        else if (NewTyBits == 32)
+          NewTy = Type::getFloatTy(Ctx);
+        else if (NewTyBits == 64)
+          NewTy = Type::getDoubleTy(Ctx);
+      } else if (ClassAllTy[EC1.first].test(2) &&
+                 ClassAllTy[EC2.first].test(2)) {
+        NewTy = PointerType::get(Ctx, AS2);
+      }
+
+      for (auto *Inst : EC2.second) {
+        Value *Ptr = getLoadStorePointerOperand(Inst);
+        Type *OrigTy = Inst->getType();
+        if (OrigTy == NewTy)
+          continue;
+        if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+          Builder.SetInsertPoint(LI->getIterator());
+          auto *NewLoad = Builder.CreateLoad(NewTy, Ptr);
+          auto *Cast = Builder.CreateBitOrPointerCast(
+              NewLoad, OrigTy, NewLoad->getName() + ".cast");
+          LI->replaceAllUsesWith(Cast);
+          CopyMetaDataFromTo(LI, NewLoad);
+          LI->eraseFromParent();
+          EQClasses[EC1.first].emplace_back(NewLoad);
+        } else {
+          auto *SI = cast<StoreInst>(Inst);
+          Builder.SetInsertPoint(SI->getIterator());
+          auto *Cast = Builder.CreateBitOrPointerCast(
+              SI->getValueOperand(), NewTy,
+              SI->getValueOperand()->getName() + ".cast");
+          auto *NewStore = Builder.CreateStore(
+              Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
+          CopyMetaDataFromTo(SI, NewStore);
+          SI->eraseFromParent();
+          EQClasses[EC1.first].emplace_back(NewStore);
+        }
+      }
+
+      // Sort the instructions in the merged equivalence class by their order
+      // in the basic block, so that they are vectorized in the correct order.
+      std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
+                [](const Instruction *A, const Instruction *B) {
+                  return A && B && A->comesBefore(B);
+                });
+      ClassesToErase.insert(EC2.first);
+    }
+  }
+
+  // Erase the equivalence classes that were merged into others.
+  for (auto Key : ClassesToErase)
+    EQClasses.erase(Key);
+}
+
 void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
   if (EQClasses.size() < 2) // There is nothing to merge.
return; @@ -1495,7 +1629,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin, /*IsLoad=*/LI != nullptr}] .emplace_back(&I); } - + insertCastsToMergeClasses(Ret); mergeEquivalenceClasses(Ret); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 8e16889c72e65..72081c08d22ee 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2547,44 +2547,45 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: udiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: s_lshr_b32 s4, s7, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -2600,6 +2601,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ 
-2607,42 +2609,43 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s0, s3, 0xffff +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: s_and_b32 s0, s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s3, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: s_lshr_b32 s0, s3, 16 +; GFX9-NEXT: s_lshr_b32 s2, s7, 16 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s0, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -2650,7 +2653,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -2659,8 +2661,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -2762,49 +2763,51 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: urem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_lshr_b32 s5, s6, 16 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: s_and_b32 s5, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 +; GFX6-NEXT: s_and_b32 s8, s9, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: s_lshr_b32 s4, s9, 16 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_lshr_b32 s5, s9, 16 +; GFX6-NEXT: s_lshr_b32 s5, s7, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -2815,10 +2818,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 
s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2830,67 +2833,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s4, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s5, s1, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s3, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_lshr_b32 s6, s7, 16 ; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GFX9-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v4, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 ; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3000,62 +3003,64 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: sdiv_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s7, s10 +; GFX6-NEXT: s_sext_i32_i16 s6, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GFX6-NEXT: s_xor_b32 s6, s7, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_or_b32 s8, s6, 1 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_cselect_b32 s6, s8, 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: s_ashr_i32 s6, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_sext_i32_i16 s5, s11 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], 
|v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s11, 16 +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s9, 16 +; GFX6-NEXT: s_ashr_i32 s4, s11, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -3080,13 +3085,13 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: sdiv_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3098,61 +3103,61 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 +; GFX9-NEXT: s_ashr_i32 s5, s6, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 -; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s0, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_ashr_i32 s4, s7, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, 
s4 +; GFX9-NEXT: v_add_u32_e32 v1, s2, v5 ; GFX9-NEXT: s_ashr_i32 s2, s3, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 -; GFX9-NEXT: s_ashr_i32 s0, s1, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s2, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v6 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3270,53 +3275,55 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX6-LABEL: srem_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s0, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sext_i32_i16 s1, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_or_b32 s10, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_cselect_b32 s0, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s6, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NEXT: s_lshr_b32 s8, s8, 16 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: s_sext_i32_i16 s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 @@ -3330,30 +3337,30 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s11, 16 +; GFX6-NEXT: s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_ashr_i32 s5, s9, 16 +; GFX6-NEXT: s_ashr_i32 s5, s7, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s9, 16 -; GFX6-NEXT: s_lshr_b32 s7, s11, 16 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: s_lshr_b32 s6, s7, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: s_lshr_b32 s8, s9, 16 +; GFX6-NEXT: s_or_b32 s9, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: s_cselect_b32 s4, s9, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s7, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3362,13 +3369,13 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; ; GFX9-LABEL: srem_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s2 +; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 +; GFX9-NEXT: s_sext_i32_i16 s9, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -3380,69 +3387,69 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, 
s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 -; GFX9-NEXT: s_ashr_i32 s0, s2, 16 +; GFX9-NEXT: s_ashr_i32 s10, s2, 16 +; GFX9-NEXT: s_ashr_i32 s2, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s2, s10, s0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 -; GFX9-NEXT: s_sext_i32_i16 s8, s1 +; GFX9-NEXT: s_xor_b32 s4, s10, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s6, s7 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: s_xor_b32 s0, s8, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s0, s0, 1 -; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX9-NEXT: s_xor_b32 s2, s8, s6 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 -; GFX9-NEXT: s_ashr_i32 s2, s1, 16 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_ashr_i32 s4, s7, 16 +; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX9-NEXT: s_ashr_i32 s5, s3, 16 +; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v5 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_xor_b32 s0, s2, s3 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 +; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v6 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_sub_u32_e32 
v3, s2, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, ptr addrspace(1) %out @@ -3838,46 +3845,48 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: udiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: s_and_b32 s0, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s1, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_lshr_b32 s4, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s11, 0xffff -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3885,48 +3894,47 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_and_b32 s6, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s0, s3, 0xffff +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s0, s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s3, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v6, v0, s[6:7] +; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4006,52 +4014,54 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: urem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: s_lshr_b32 s6, s10, 16 +; GFX6-NEXT: s_and_b32 s1, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX6-NEXT: s_and_b32 s1, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s1 +; 
GFX6-NEXT: s_lshr_b32 s7, s4, 16 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v4 +; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_and_b32 s6, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: v_mad_f32 v2, -v1, v4, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4059,33 +4069,34 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s2, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_and_b32 s8, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: s_and_b32 s5, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX9-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 ; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc @@ -4094,18 +4105,17 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[6:7] +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4191,46 +4201,47 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: sdiv_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s7, s10 +; GFX6-NEXT: s_sext_i32_i16 s6, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GFX6-NEXT: s_xor_b32 s6, s7, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_or_b32 s8, s6, 1 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[6:7], 
s[6:7], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: s_ashr_i32 s5, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_cselect_b32 s6, s8, 0 +; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v2 +; GFX6-NEXT: s_ashr_i32 s6, s10, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_xor_b32 s4, s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: s_sext_i32_i16 s5, s11 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s9 +; GFX6-NEXT: s_sext_i32_i16 s4, s11 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 @@ -4243,6 +4254,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -4253,13 +4265,13 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: sdiv_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -4271,44 +4283,44 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s8, 0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: s_ashr_i32 s5, s6, 16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 
v4, v4 -; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s0, s0, 0 -; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 -; GFX9-NEXT: s_sext_i32_i16 s0, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v4 +; GFX9-NEXT: s_sext_i32_i16 s2, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s2, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s4, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v1, v2, s[6:7] +; GFX9-NEXT: global_store_short v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -4400,68 +4412,70 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX6-LABEL: srem_v3i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sext_i32_i16 s4, s10 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s8 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_sext_i32_i16 s0, s8 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sext_i32_i16 s1, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX6-NEXT: s_xor_b32 s0, s1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_or_b32 s10, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX6-NEXT: s_cselect_b32 s0, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_ashr_i32 s5, s6, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NEXT: s_lshr_b32 s8, s8, 16 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 -; GFX6-NEXT: s_sext_i32_i16 s4, s11 +; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: s_sext_i32_i16 s5, s7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 @@ -4470,12 +4484,12 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; ; GFX9-LABEL: srem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s8, s2 +; GFX9-NEXT: s_sext_i32_i16 s8, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 +; GFX9-NEXT: s_sext_i32_i16 s9, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -4487,51 +4501,51 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX9-NEXT: s_cselect_b32 s4, s10, 0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 -; GFX9-NEXT: s_ashr_i32 s0, s2, 16 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GFX9-NEXT: s_xor_b32 s2, s10, s0 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_ashr_i32 s10, s2, 16 +; GFX9-NEXT: s_ashr_i32 s2, s6, 16 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: 
s_xor_b32 s4, s10, s2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s2, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 -; GFX9-NEXT: s_sext_i32_i16 s2, s3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v3 +; GFX9-NEXT: s_sext_i32_i16 s4, s7 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX9-NEXT: s_xor_b32 s0, s3, s2 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: s_xor_b32 s2, s5, s4 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: s_or_b32 s6, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s6, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[6:7] +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, ptr addrspace(1) %out @@ -5665,29 +5679,31 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX6-NEXT: s_sub_i32 s1, 0, s0 -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s0 +; GFX6-NEXT: s_mov_b32 s8, s4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: s_mul_i32 s1, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s8, s1 +; GFX6-NEXT: s_sub_i32 s1, s6, s1 ; GFX6-NEXT: s_sub_i32 s3, s1, s0 ; GFX6-NEXT: s_cmp_ge_u32 s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 @@ -5702,10 +5718,10 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s7, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 ; GFX6-NEXT: s_mul_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s9, s0 +; GFX6-NEXT: s_sub_i32 s0, s7, s0 ; GFX6-NEXT: s_sub_i32 s1, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 @@ -5716,19 +5732,19 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_cmp_ge_u32 s0, s2 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -5740,37 +5756,37 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 ; GFX9-NEXT: s_mul_i32 s5, s4, s7 -; GFX9-NEXT: s_sub_i32 s0, s0, s5 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 ; GFX9-NEXT: s_add_i32 s9, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_sub_i32 s5, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cselect_b32 s0, s5, s0 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 ; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s0, s5, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 ; GFX9-NEXT: s_sub_i32 s4, 0, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s8 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8 ; GFX9-NEXT: s_mul_i32 s5, s4, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 ; GFX9-NEXT: s_add_i32 s7, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_sub_i32 s5, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 ; GFX9-NEXT: s_cselect_b32 s4, s7, s4 
-; GFX9-NEXT: s_cselect_b32 s1, s5, s1 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 ; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = udiv <2 x i32> %x, %shl.y @@ -6006,64 +6022,67 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 +; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s6, s6, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, 0, s8 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s8 +; GFX6-NEXT: s_sub_i32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT:
s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s8 +; GFX6-NEXT: s_cmp_ge_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -6075,33 +6094,33 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 ; GFX9-NEXT: s_mul_i32 s4, s4, s7 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s0, s4, s0 -; GFX9-NEXT: s_sub_i32 s4, s0, s7 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s4, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s4, s2, s7 +; GFX9-NEXT: s_cmp_ge_u32 s2, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_sub_i32 s4, 0, s6 ; GFX9-NEXT: s_mul_i32 s4, s4, s8 ; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 ; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s3, s8 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s4, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_sub_i32 s4, s1, s6 -; GFX9-NEXT: s_cmp_ge_u32 s1, s6 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_sub_i32 s4, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = urem <2 x i32> %x, %shl.y @@ -6483,136 +6502,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: s_abs_i32 s6, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_sub_i32 s7, 0, s6 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT:
s_lshl_b32 s6, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s8, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s7 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 -; GFX6-NEXT: s_abs_i32 s7, s0 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_abs_i32 s4, s2 +; GFX6-NEXT: s_xor_b32 s2, s2, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s6 -; GFX6-NEXT: s_sub_i32 s2, s7, s2 -; GFX6-NEXT: s_sub_i32 s7, s2, s6 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: s_ashr_i32 s2, s2, 31 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_mul_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s5, s4, s8 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_cselect_b32 s2, s7, s2 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX6-NEXT: s_abs_i32 s2, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_abs_i32 s8, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: s_xor_b32 s3, s1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_abs_i32 s1, s1 -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_abs_i32 s1, s3 +; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_ashr_i32 s3, s3, 31 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s0, v2 +; GFX6-NEXT: s_xor_b32 s0, s3, s9 +; GFX6-NEXT: s_ashr_i32 s0, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s0, v1 -; GFX6-NEXT: s_mul_i32 s0, s0, s2 -; GFX6-NEXT: s_sub_i32 s0, s1, s0 -; GFX6-NEXT: s_sub_i32 s1, s0, s2 +; GFX6-NEXT: v_readfirstlane_b32 s2, v1 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s1, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 +; GFX6-NEXT: s_cmp_ge_u32 s1, s8 ; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 
+; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX9-NEXT: s_abs_i32 s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 -; GFX9-NEXT: s_abs_i32 s3, s0 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX9-NEXT: s_abs_i32 s5, s2 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 31 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: s_ashr_i32 s2, s2, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s2, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2 -; GFX9-NEXT: s_add_i32 s8, s8, s2 -; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8 -; GFX9-NEXT: s_mul_i32 s8, s2, s6 -; GFX9-NEXT: s_sub_i32 s3, s3, s8 -; GFX9-NEXT: s_add_i32 s9, s2, 1 -; GFX9-NEXT: s_sub_i32 s8, s3, s6 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s2, s9, s2 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_add_i32 s8, s2, 1 -; GFX9-NEXT: s_cmp_ge_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s6, s8, s2 -; GFX9-NEXT: s_abs_i32 s8, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_xor_b32 s5, s6, s0 -; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s8 +; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s5, s8 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s5, s7, s6 +; GFX9-NEXT: s_abs_i32 s6, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_xor_b32 s5, s5, s2 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s0, s5, s0 -; GFX9-NEXT: s_xor_b32 s4, s1, s7 -; GFX9-NEXT: s_abs_i32 s1, s1 +; GFX9-NEXT: s_xor_b32 s4, s3, s4 +; GFX9-NEXT: s_abs_i32 s3, s3 +; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5 -; GFX9-NEXT: s_mul_i32 s6, s5, s8 -; GFX9-NEXT: s_sub_i32 s1, s1, s6 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s7, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: 
s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s1, s8 -; GFX9-NEXT: s_cmp_ge_u32 s1, s8 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 -; GFX9-NEXT: s_cselect_b32 s1, s6, s1 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s1, s8 -; GFX9-NEXT: s_cselect_b32 s1, s6, s5 -; GFX9-NEXT: s_xor_b32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s7, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = sdiv <2 x i32> %x, %shl.y @@ -6935,122 +6956,125 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX6-NEXT: s_abs_i32 s2, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX6-NEXT: s_abs_i32 s6, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_sub_i32 s0, 0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_abs_i32 s6, s0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s7 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_abs_i32 s4, s2 +; GFX6-NEXT: s_ashr_i32 s2, s2, 31 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 -; GFX6-NEXT: s_mul_i32 s7, s7, s2 -; GFX6-NEXT: s_sub_i32 s6, s6, s7 -; GFX6-NEXT: s_sub_i32 s7, s6, s2 -; GFX6-NEXT: s_cmp_ge_u32 s6, s2 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 -; GFX6-NEXT: s_sub_i32 s7, s6, s2 -; GFX6-NEXT: s_cmp_ge_u32 s6, s2 -; GFX6-NEXT: s_cselect_b32 s2, s7, s6 -; GFX6-NEXT: s_abs_i32 s3, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s6, 0, s3 -; GFX6-NEXT: s_abs_i32 s8, s1 -; GFX6-NEXT: s_xor_b32 s2, s2, s0 +; GFX6-NEXT: s_mul_i32 s7, s7, s6 +; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s7, s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s7, s4, s6 +; GFX6-NEXT: s_cmp_ge_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s8, s7, s4 +; GFX6-NEXT: s_abs_i32 s9, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_sub_i32 s4, 0, s9 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_ashr_i32 s1, s3, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s0, s2, s0 -; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_abs_i32
s0, s3 +; GFX6-NEXT: s_xor_b32 s3, s8, s2 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_sub_i32 s2, s3, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_sub_i32 s8, s2, s3 -; GFX6-NEXT: s_cmp_ge_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s2, s8, s2 -; GFX6-NEXT: s_xor_b32 s2, s2, s1 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s9 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s3, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s3, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s3, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 -; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s2 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_abs_i32 s0, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_abs_i32 s6, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s5, s2, 31 +; GFX9-NEXT: s_abs_i32 s2, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 ; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 ; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8 -; GFX9-NEXT: s_mul_i32 s7, s7, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s7 -; GFX9-NEXT: s_sub_i32 s7, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s7, s0 -; GFX9-NEXT: s_sub_i32 s7, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s7, s0 -; GFX9-NEXT: s_abs_i32 s7, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s0, s0, s6 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: s_sub_i32 s5, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s2, s8 +; GFX9-NEXT: s_mul_i32 s7, s7, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, s7 +; GFX9-NEXT: s_sub_i32 s7, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: s_sub_i32 s7, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s7, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s0, s0, s6 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_abs_i32 s1, s1 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_abs_i32 s3, s3 ; 
GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s7 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_sub_i32 s5, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_sub_i32 s5, s1, s7 -; GFX9-NEXT: s_cmp_ge_u32 s1, s7 -; GFX9-NEXT: s_cselect_b32 s1, s5, s1 -; GFX9-NEXT: s_xor_b32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX9-NEXT: s_mul_i32 s5, s5, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s5 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_sub_i32 s5, s3, s4 +; GFX9-NEXT: s_cmp_ge_u32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y %r = srem <2 x i32> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index 384beae07ce2e..1ec4da9d35605 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -11,7 +11,6 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 - ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg0, addrspace 6) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s64) from %ir.arg0, align 16, addrspace 6) ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[REG_SEQUENCE]], 8, 0 :: (dereferenceable invariant load (s64) from %ir.arg0 + 8, basealign 16, addrspace 6) ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 @@ -19,14 +18,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[COPY3]], %subreg.sub2, killed [[COPY2]], %subreg.sub3 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0,
implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) @@ -35,14 +34,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE1]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, 
addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) @@ -51,14 +50,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE1]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: 
(volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY10]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) @@ -68,14 +67,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE1]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, 
addrspace 8) ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY12]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) @@ -84,14 +83,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[REG_SEQUENCE1]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, 
addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) @@ -100,14 +99,14 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE1]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[REG_SEQUENCE1]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; 
GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE1]], [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) @@ -117,21 +116,21 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE1]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[REG_SEQUENCE1]], [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[REG_SEQUENCE1]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) @@ -147,19 +146,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[REG_SEQUENCE1]], [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN 
[[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[REG_SEQUENCE1]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) @@ -175,18 +174,18 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), 
align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY40]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY42:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY41]], [[REG_SEQUENCE1]], [[COPY42]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY43]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) @@ -202,19 +201,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY48:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY48]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN-NEXT: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY49]], [[REG_SEQUENCE1]], [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: 
[[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY50]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY51]], [[REG_SEQUENCE1]], [[COPY52]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY53:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY53]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) @@ -230,18 +229,18 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE2]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: 
BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[REG_SEQUENCE1]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[REG_SEQUENCE1]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) @@ -257,19 +256,19 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM 
&"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[REG_SEQUENCE1]], [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE3]], [[REG_SEQUENCE1]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[REG_SEQUENCE1]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 
304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 7208eaeff8eb1..a5bd004db9ea7 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll index d63a36c4b2958..eb802df7ebc15 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll @@ -48,8 +48,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] + ; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %10 ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %result = or disjoint i32 %a, %b ret i32 %result @@ -64,10 +64,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] - ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] + ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec + ; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %12 + ; CHECK-NEXT: $vgpr1 = COPY %13 ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %result = or disjoint <2 x i32> %a, %b ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 7b6a363c42708..c4b51b5380ce5 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -218,13 +218,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) 
%out, <4 x half> %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4f16: @@ -234,13 +234,13 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff -; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s3, 0x7fff7fff +; VI-NEXT: s_and_b32 s1, s2, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 6bcb086944c91..baf9b0abf7b0c 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff +; VI-NEXT: s_and_b32 s1, s2, 0x7fffffff +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b4b9c2d3e0135..ff08ff2fd3511 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -2063,133 +2063,135 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { ; SI-LABEL: s_copysign_v3f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_lshr_b32 s0, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: v_cvt_f32_f16_e32 
v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 +; SI-NEXT: s_lshr_b32 s5, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v5 +; SI-NEXT: v_bfi_b32 v1, s0, v1, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 -; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_copysign_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s0, s0, 16 ; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s0, s4, 4 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: v_bfi_b32 v3, s6, v0, v1 -; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_short v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_short v[0:1], v3 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_v3f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 
0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 -; GFX9-NEXT: global_store_short v0, v2, s[6:7] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_copysign_v3f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s6, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s7 ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4 -; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5] +; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[0:1] offset:4 +; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_copysign_v3f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s6, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s7 +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2 +; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_store_b16 v3, v2, s[4:5] offset:4 -; GFX11-FAKE16-NEXT: global_store_b32 v3, v0, s[4:5] +; GFX11-FAKE16-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 +; GFX11-FAKE16-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) store <3 x half> %out, ptr addrspace(1) %arg_out @@ -2199,23 +2201,25 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { ; SI-LABEL: s_copysign_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s8, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: s_lshr_b32 s0, s2, 16 -; SI-NEXT: s_lshr_b32 s9, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_lshr_b32 s0, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; SI-NEXT: s_lshr_b32 s1, s2, 16 +; SI-NEXT: s_lshr_b32 s10, s9, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s9 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 @@ -2234,89 +2238,89 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; ; VI-LABEL: s_copysign_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_movk_i32 s6, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s3, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s1, 16 -; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_lshr_b32 s0, s5, 16 +; VI-NEXT: 
s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_bfi_b32 v2, s6, v2, v3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_bfi_b32 v3, s6, v3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s0, s0, 16 -; VI-NEXT: v_bfi_b32 v0, s6, v0, v2 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_bfi_b32 v2, s6, v2, v3 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_bfi_b32 v2, s6, v2, v4 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_bfi_b32 v4, s6, v4, v5 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_lshr_b32 s3, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_lshr_b32 s4, s7, 16 +; GFX11-NEXT: s_lshr_b32 s6, s6, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: 
v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-NEXT: s_lshr_b32 s6, s1, 16 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5] +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index fab45c9dc3bc3..d83a75e8fa110 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -472,50 +472,52 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) { ; SI-LABEL: s_test_copysign_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s8, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_bfi_b32 v0, s8, v0, v2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_bfi_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_brev_b32 s6, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_bfi_b32 v1, s6, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_bfi_b32 v0, s6, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_bfi_b32 v3, s6, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_bfi_b32 v2, s6, v2, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 
s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 33910947e6fac..04df0bc525ddc 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -938,16 +938,18 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX6-FASTFMA-LABEL: s_fdiv_v2f32: ; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-FASTFMA-NEXT: s_mov_b32 s2, -1 +; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-FASTFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 ; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0 +; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s11 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -956,13 +958,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v2, s2 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 ; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s11, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v1, v0, s9, v1 +; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-FASTFMA-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -972,20 +974,21 @@ define amdgpu_kernel void 
@s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-FASTFMA-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s10, v2 -; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v0, s8, v2 +; GFX6-FASTFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-FASTFMA-NEXT: s_endpgm ; ; GFX6-SLOWFMA-LABEL: s_fdiv_v2f32: ; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-SLOWFMA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -995,14 +998,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s2 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 +; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 +; GFX6-SLOWFMA-NEXT: s_mov_b32 s2, -1 ; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s3, v0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v1, v1, s5, v0 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX6-SLOWFMA-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1012,22 +1014,24 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s2, v4 -; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v0, s4, v4 +; GFX6-SLOWFMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-SLOWFMA-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_v2f32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], s11, s11, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: v_div_scale_f32 v2, s[0:1], s9, s9, v1 ; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s11 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s9, v0, s9 +; GFX7-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s3, v0, s3 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -1036,13 +1040,13 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX7-NEXT: v_fma_f32 v0, -v2, v4, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], s10, s10, v2 +; GFX7-NEXT: v_div_scale_f32 v3, s[0:1], s8, s8, v2 ; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_fixup_f32 v1, v0, s11, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 +; GFX7-NEXT: v_div_fixup_f32 v1, v0, s9, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 ; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 @@ -1052,19 +1056,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX7-NEXT: v_fma_f32 v0, -v3, v5, v0 ; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX7-NEXT: v_div_fmas_f32 v0, v0, v4, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v0, s10, v2 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: v_div_fixup_f32 v0, v0, s8, v2 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s1, v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_div_scale_f32 v1, s[6:7], s5, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v5, -v1, v3, 1.0 @@ -1074,13 +1079,12 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX8-NEXT: v_fma_f32 v1, -v1, v5, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s2, s2, v4 +; GFX8-NEXT: v_div_scale_f32 v2, s[6:7], s4, s4, v4 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, s0, v3, s0 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 ; GFX8-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-NEXT: v_div_fixup_f32 v1, v1, s3, v0 +; GFX8-NEXT: v_div_fixup_f32 v1, v1, s5, v0 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; 
GFX8-NEXT: v_fma_f32 v0, -v2, v5, 1.0 ; GFX8-NEXT: v_fma_f32 v0, v0, v5, v5 @@ -1090,19 +1094,20 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_div_fixup_f32 v0, v0, s2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_div_fixup_f32 v0, v0, s4, v4 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s6, s3, s3, s1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 +; GFX10-NEXT: v_div_scale_f32 v0, s4, s7, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s7, s3 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -1112,12 +1117,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_scale_f32 v2, s6, s2, s2, s0 +; GFX10-NEXT: v_div_scale_f32 v2, s4, s6, s6, s2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_div_fixup_f32 v1, v0, s3, s1 -; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s2, s0 +; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s3 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s6, s2 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -1128,19 +1132,18 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, s0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s1, s3, s1 +; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s3 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s3, s5, s3 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1151,11 +1154,11 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_scale_f32 v2, null, s2, s2, s0 +; GFX11-NEXT: v_div_scale_f32 v2, null, s4, s4, s2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: v_div_fixup_f32 v1, v0, s3, s1 -; GFX11-NEXT: v_div_scale_f32 v0, 
vcc_lo, s0, s2, s0 +; GFX11-NEXT: v_div_fixup_f32 v1, v0, s5, s3 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s4, s2 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -1167,8 +1170,8 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, < ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, s0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32: @@ -1193,58 +1196,60 @@ entry: define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_ulp25_v2f32: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s2 -; GFX67-NEXT: v_rcp_f32_e32 v1, s3 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX67-NEXT: v_rcp_f32_e32 v0, s8 +; GFX67-NEXT: v_rcp_f32_e32 v1, s9 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v1 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_ulp25_v2f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s2 -; GFX8-NEXT: v_rcp_f32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s6 +; GFX8-NEXT: v_rcp_f32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_v2f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s2 -; GFX10-NEXT: v_rcp_f32_e32 v1, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, s6 +; GFX10-NEXT: v_rcp_f32_e32 v1, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_v2f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s2 -; GFX11-NEXT: v_rcp_f32_e32 v1, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s6 +; GFX11-NEXT: v_rcp_f32_e32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, s0, v0 :: v_dual_mul_f32 v1, s1, v1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_ulp25_v2f32: @@ -1269,58 +1274,60 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_fast_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: v_rcp_f32_e32 v2, s2 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s9 +; GFX67-NEXT: v_rcp_f32_e32 v2, s8 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_rcp_f32_e32 v2, s2 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s7 +; GFX8-NEXT: v_rcp_f32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_fast_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_rcp_f32_e32 v2, s2 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_rcp_f32_e32 v2, s6 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_fast_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 -; GFX11-NEXT: 
v_rcp_f32_e32 v2, s2 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v2, s6 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_fast_math: @@ -1345,58 +1352,60 @@ entry: define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) #0 { ; GFX67-LABEL: s_fdiv_v2f32_arcp_math: ; GFX67: ; %bb.0: ; %entry -; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX67-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX67-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX67-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX67-NEXT: s_mov_b32 s7, 0xf000 ; GFX67-NEXT: s_mov_b32 s6, -1 ; GFX67-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-NEXT: v_rcp_f32_e32 v0, s3 -; GFX67-NEXT: v_rcp_f32_e32 v2, s2 -; GFX67-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX67-NEXT: v_mul_f32_e32 v0, s0, v2 +; GFX67-NEXT: v_rcp_f32_e32 v0, s9 +; GFX67-NEXT: v_rcp_f32_e32 v2, s8 +; GFX67-NEXT: s_mov_b32 s4, s0 +; GFX67-NEXT: s_mov_b32 s5, s1 +; GFX67-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX67-NEXT: v_mul_f32_e32 v0, s2, v2 ; GFX67-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX67-NEXT: s_endpgm ; ; GFX8-LABEL: s_fdiv_v2f32_arcp_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_rcp_f32_e32 v2, s2 -; GFX8-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_rcp_f32_e32 v2, s7 +; GFX8-NEXT: v_rcp_f32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_f32_e32 v3, s3, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_v2f32_arcp_math: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_rcp_f32_e32 v2, s2 -; GFX10-NEXT: v_mul_f32_e32 v1, s1, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, s0, v2 -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[6:7] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_rcp_f32_e32 v2, s6 +; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, s2, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_v2f32_arcp_math: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 -; GFX11-NEXT: v_rcp_f32_e32 v2, s2 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 +; GFX11-NEXT: v_rcp_f32_e32 v2, s6 ; 
GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v1, s1, v0 :: v_dual_mul_f32 v0, s0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-NEXT: v_dual_mul_f32 v1, s3, v0 :: v_dual_mul_f32 v0, s2, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; EG-LABEL: s_fdiv_v2f32_arcp_math: diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index e9fd6119d0c36..6fb21b5c31b5b 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -135,11 +135,11 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_rndne_f32_e32 v1, s3 -; VI-NEXT: v_rndne_f32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_rndne_f32_e32 v3, s3 +; VI-NEXT: v_rndne_f32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fnearbyint_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index eca8c2837b0fc..64a223877e056 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -516,13 +516,13 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 -; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v3, s1 -; CIVI-NEXT: v_mov_b32_e32 v0, s2 -; CIVI-NEXT: v_mov_b32_e32 v1, s3 -; CIVI-NEXT: v_mov_b32_e32 v2, s0 -; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_or_b32 s0, s3, 0x80008000 +; CIVI-NEXT: s_or_b32 s1, s2, 0x80008000 +; CIVI-NEXT: v_mov_b32_e32 v2, s1 +; CIVI-NEXT: v_mov_b32_e32 v3, s0 +; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: fneg_fabs_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 1fa9bfa3cfa3f..b93a598cb52ae 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -215,13 +215,13 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: s_bitset1_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_or_b32 s0, s3, 0x80000000 +; VI-NEXT: s_or_b32 s1, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 07a7d8d20c439..35034474ced0e 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -66,13 +66,13 @@ define
amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b32 s0, s3, 0x80000000 +; VI-NEXT: s_xor_b32 s1, s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 43caa4c739fb3..718cc4afe22da 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -179,22 +179,22 @@ entry: define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; SI-NEXT: s_not_b32 s3, s5 -; SI-NEXT: s_lshr_b32 s1, s1, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_not_b32 s1, s4 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; SI-NEXT: s_lshr_b32 s0, s0, 1 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_not_b32 s1, s7 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1 +; SI-NEXT: s_lshr_b32 s0, s3, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_not_b32 s1, s6 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 +; SI-NEXT: s_lshr_b32 s0, s2, 1 ; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 @@ -202,47 +202,43 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; VI-LABEL: fshl_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s7, s7 -; VI-NEXT: s_lshr_b32 s3, s1, 1 -; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_not_b32 s1, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; VI-NEXT: s_lshr_b32 s0, s0, 1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: s_not_b32 s1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 s0, s3, 1 +; VI-NEXT: v_alignbit_b32 v2, s3, v2, 1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_alignbit_b32 v3, s0, v2, v3 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_not_b32 s1, s6 +; VI-NEXT: v_alignbit_b32 v2, s2, v2, 1 +; VI-NEXT: 
s_lshr_b32 s0, s2, 1 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_alignbit_b32 v2, s0, v2, v4 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 1 -; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1 -; GFX9-NEXT: s_not_b32 s1, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_not_b32 s1, s15 +; GFX9-NEXT: s_lshr_b32 s0, s11, 1 +; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_not_b32 s1, s8 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_not_b32 s1, s14 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s10, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32: @@ -265,40 +261,34 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: fshl_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_not_b32 s2, s7 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_not_b32 s3, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3 +; GFX10-NEXT: v_alignbit_b32 v0, s11, s13, 1 +; GFX10-NEXT: v_alignbit_b32 v3, s10, s12, 1 +; GFX10-NEXT: s_lshr_b32 s0, s11, 1 +; GFX10-NEXT: s_not_b32 s1, s15 +; GFX10-NEXT: s_lshr_b32 s2, s10, 1 +; GFX10-NEXT: s_not_b32 s3, s14 +; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s2, v3, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1 -; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_not_b32 s2, s7 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 -; GFX11-NEXT: s_not_b32 s3, s6 -; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2 -; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v0, s3, s5, 1 +; GFX11-NEXT: v_alignbit_b32 v3, s2, s4, 1 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_not_b32 s4, s7 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_not_b32 s5, s6 +; GFX11-NEXT: 
v_alignbit_b32 v1, s3, v0, s4 +; GFX11-NEXT: v_alignbit_b32 v0, s2, v3, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -309,43 +299,45 @@ entry: define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshl_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, 23 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, 23 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, 25 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 23 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 25 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_v2i32_imm: @@ -365,25 +357,25 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshl_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 23 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 25 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_v2i32_imm: 
; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 23 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s4, 25 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 4a79096442c96..7705fa7b5423e 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -221,51 +221,47 @@ entry: define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { ; SI-LABEL: fshr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, v2 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 
+; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_alignbit_b32 v1, s11, v0, v1 +; GFX9-NEXT: v_alignbit_b32 v0, s10, v3, v4 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: s_endpgm ; @@ -285,79 +281,64 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; GFX10-LABEL: fshr_v2i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s7 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: v_mov_b32_e32 v2, s14 +; GFX10-NEXT: v_alignbit_b32 v1, s11, s13, v0 +; GFX10-NEXT: v_alignbit_b32 v0, s10, s12, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: fshr_v2i32: ; GFX11-TRUE16: ; %bb.0: ; %entry -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l +; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h +; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fshr_v2i32: ; GFX11-FAKE16: ; %bb.0: ; %entry -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0 +; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2 +; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-TRUE16-LABEL: fshr_v2i32: ; GFX12-TRUE16: ; %bb.0: ; %entry -; GFX12-TRUE16-NEXT: s_clause 0x2 -; GFX12-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s6 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s1, s3, v0.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s0, s2, v0.h -; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, s3, s5, v0.l +; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, s2, s4, v0.h +; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: fshr_v2i32: ; GFX12-FAKE16: ; %bb.0: ; %entry -; GFX12-FAKE16-NEXT: s_clause 0x2 -; GFX12-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s1, s3, v0 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s0, s2, v2 -; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[4:5] +; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s3, s5, v0 +; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, s2, s4, v2 +; GFX12-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX12-FAKE16-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) @@ -368,43 +349,45 @@ entry: define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { ; SI-LABEL: fshr_v2i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_v2i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_alignbit_b32 v3, s3, v2, 9 +; VI-NEXT: v_alignbit_b32 v2, s2, v4, 7 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_v2i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; 
GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 -; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, 9 +; GFX9-NEXT: v_alignbit_b32 v0, s2, v3, 7 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_v2i32_imm: @@ -424,37 +407,37 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 ; GFX10-LABEL: fshr_v2i32_imm: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 v1, s3, s7, 9 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s6, 7 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_v2i32_imm: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s5, 9 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s4, 7 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fshr_v2i32_imm: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_alignbit_b32 v1, s1, s3, 9 -; GFX12-NEXT: v_alignbit_b32 v0, s0, s2, 7 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_alignbit_b32 v1, s3, s5, 9 +; GFX12-NEXT: v_alignbit_b32 v0, s2, s4, 7 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index f767511370eee..772f7495c7708 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -98,16 +98,16 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_add_u32 s4, s0, 4 -; CIVI-NEXT: s_addc_u32 s5, s1, 0 -; CIVI-NEXT: v_mov_b32_e32 v2, s4 -; CIVI-NEXT: v_mov_b32_e32 v4, s3 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v3, s5 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 -; CIVI-NEXT: v_mov_b32_e32 v5, s2 -; CIVI-NEXT: flat_store_short v[2:3], v4 -; CIVI-NEXT: flat_store_dword v[0:1], v5 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_add_u32 s0, s0, 4 +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_addc_u32 s1, s1, 0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; 
CIVI-NEXT: v_mov_b32_e32 v2, s3 +; CIVI-NEXT: flat_store_short v[0:1], v2 ; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3f16_arg: @@ -135,8 +135,8 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: v_mov_b32_e32 v3, s3 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CIVI-NEXT: s_endpgm @@ -144,9 +144,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -348,37 +348,21 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 } define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; CI-LABEL: extload_v3f16_to_v3f32_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v4, s1 -; CI-NEXT: v_mov_b32_e32 v3, s0 -; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; CI-NEXT: s_endpgm -; -; VI-LABEL: extload_v3f16_to_v3f32_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v3f16_to_v3f32_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; CIVI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v3, s0 +; CIVI-NEXT: v_mov_b32_e32 v4, s1 +; CIVI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: @@ -386,9 +370,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -404,14 
+388,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: s_lshr_b32 s5, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; @@ -424,12 +408,12 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -440,10 +424,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshr_b32 s4, s3, 16 ; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ -724,61 +708,33 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 } define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { -; CI-LABEL: extload_v4f16_to_v4f64_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_lshr_b32 s5, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 -; CI-NEXT: s_add_u32 s2, s0, 16 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; CI-NEXT: v_mov_b32_e32 v9, s3 -; CI-NEXT: v_mov_b32_e32 v8, s2 -; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; CI-NEXT: s_endpgm -; -; VI-LABEL: extload_v4f16_to_v4f64_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; VI-NEXT: 
s_add_u32 s2, s0, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; VI-NEXT: v_mov_b32_e32 v9, s3 -; VI-NEXT: v_mov_b32_e32 v8, s2 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; VI-NEXT: s_endpgm +; CIVI-LABEL: extload_v4f16_to_v4f64_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_lshr_b32 s5, s3, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; CIVI-NEXT: s_lshr_b32 s4, s2, 16 +; CIVI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CIVI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CIVI-NEXT: s_add_u32 s2, s0, 16 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CIVI-NEXT: s_addc_u32 s3, s1, 0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CIVI-NEXT: v_mov_b32_e32 v9, s3 +; CIVI-NEXT: v_mov_b32_e32 v8, s2 +; CIVI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CIVI-NEXT: s_nop 0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CIVI-NEXT: s_endpgm ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 4b9da7b49e997..78fa39b31196e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -289,20 +289,20 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s5, 4 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -418,20 +418,20 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s5, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], 
s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s5, 4 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -443,19 +443,19 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s6, 3 -; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 -; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 -; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s4, 3 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; GCN-NEXT: s_and_b32 s1, s1, 0x1010101 +; GCN-NEXT: s_and_b32 s0, s0, 0x1010101 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 837c18fe7af0a..2d84f728de489 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1555,10 +1555,10 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; SI-NEXT: s_lshl_b32 s0, s8, 4 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; SI-NEXT: s_and_b32 s9, s1, 0x50005 -; SI-NEXT: s_and_b32 s8, s0, 0x50005 -; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: s_and_b32 s1, s1, 0x50005 +; SI-NEXT: s_and_b32 s0, s0, 0x50005 +; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 9df995b5a7066..9f3959c39d1af 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -734,8 +734,8 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -797,8 +797,8 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; 
VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1000,16 +1000,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i16_arg: @@ -1328,8 +1328,8 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2393,8 +2393,8 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 978f223aafb94..7140ba5cf2ad9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -339,53 +339,53 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3fb8a000 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; 
VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 70c3787bac9a1..491d5a8e077e2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -341,53 +341,53 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x40549000 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: 
v_mov_b32_e32 v1, s4 +; VI-SDAG-NEXT: v_sub_f32_e32 v1, s3, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 +; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 -; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 -; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v1, v5, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-SDAG-NEXT: v_exp_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: s_and_b32 s0, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v8, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v8, s2, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x40549000, v8 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 +; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v8, v8, v9 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v8 +; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 +; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 +; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v7 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v5, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v6 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, v4 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v6 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v2f32: diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 15bcab9f774e4..fd0f8e777eebd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -225,26 +225,26 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_add_f32_e32 v2, s3, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v2, s2, v2 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v0, s2, v0 -; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; VI-SDAG-NEXT: s_cselect_b32 s3, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, s3 -; VI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-SDAG-NEXT: s_cselect_b32 s2, 0xffffffc0, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v3, v4, s0 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: s_cselect_b32 s0, 0xffffffc0, 0 +; VI-SDAG-NEXT: v_ldexp_f32 v2, v2, s0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index d757df83b32ba..538a5c91991c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_log_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41b17218 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3f317000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: 
v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3f317000, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3805fdf4, v7 +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3f317000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v5 -; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3805fdf4, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3f317000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3805fdf4, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index ae1318da453c5..6ca3020106070 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -406,51 +406,51 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-LABEL: s_log10_v2f32: ; VI-SDAG: ; %bb.0: ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x411a209b +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x800000 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x411a209b ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v2 ; VI-SDAG-NEXT: 
s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s4 -; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 -; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 -; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v3 -; VI-SDAG-NEXT: v_sub_f32_e32 v5, v3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3e9a2000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s3, v0 +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_and_b32_e32 v6, 0xfffff000, v5 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v7, v5, v6 +; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3e9a2000, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x369a84fb, v7 +; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; VI-SDAG-NEXT: v_add_f32_e32 v7, v9, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v6 +; VI-SDAG-NEXT: v_ldexp_f32 v3, s2, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 +; VI-SDAG-NEXT: v_log_f32_e32 v7, v3 +; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v4 +; VI-SDAG-NEXT: v_and_b32_e32 v4, 0xfffff000, v7 +; VI-SDAG-NEXT: v_sub_f32_e32 v5, v7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3e9a2000, v5 ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x369a84fb, v5 -; VI-SDAG-NEXT: s_cselect_b32 s4, 32, 0 +; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x369a84fb, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, v8, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v5, v6, v5 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s4 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v5, v7, v5 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v5 -; VI-SDAG-NEXT: v_log_f32_e32 v5, v1 -; VI-SDAG-NEXT: s_mov_b32 s3, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v5 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v5, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x369a84fb, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v6, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v7|, s3 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 
c1ac74e5094b0..9e5e9e8f7b6b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -285,16 +285,16 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-SDAG-NEXT: v_ldexp_f32 v3, s3, v3 ; VI-SDAG-NEXT: s_cselect_b32 s3, 32, 0 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 +; VI-SDAG-NEXT: v_ldexp_f32 v0, s2, v0 ; VI-SDAG-NEXT: v_log_f32_e32 v3, v3 -; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1 -; VI-SDAG-NEXT: v_log_f32_e32 v4, v1 -; VI-SDAG-NEXT: v_sub_f32_e32 v1, v3, v2 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_sub_f32_e32 v0, v4, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 -; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: v_log_f32_e32 v5, v0 +; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_sub_f32_e32 v3, v3, v2 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, v5, v4 +; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_log2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index d2f4f54cefe78..489a279b67a47 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -938,105 +938,105 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_ashr_i32 s6, s0, 16 -; CI-NEXT: s_ashr_i32 s7, s1, 16 -; CI-NEXT: s_sext_i32_i16 s0, s0 -; CI-NEXT: s_sext_i32_i16 s1, s1 -; CI-NEXT: s_ashr_i32 s8, s2, 16 -; CI-NEXT: s_ashr_i32 s9, s3, 16 -; CI-NEXT: s_sext_i32_i16 s2, s2 -; CI-NEXT: s_sext_i32_i16 s3, s3 -; CI-NEXT: s_min_i32 s7, s7, s9 -; CI-NEXT: s_min_i32 s1, s1, s3 -; CI-NEXT: s_min_i32 s3, s6, s8 -; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: s_lshl_b32 s7, s7, 16 -; CI-NEXT: s_and_b32 s1, s1, 0xffff -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s0, s0, 0xffff -; CI-NEXT: s_or_b32 s1, s1, s7 -; CI-NEXT: s_or_b32 s0, s0, s3 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_ashr_i32 s0, s2, 16 +; CI-NEXT: s_ashr_i32 s1, s3, 16 +; CI-NEXT: s_sext_i32_i16 s2, s2 +; CI-NEXT: s_sext_i32_i16 s3, s3 +; CI-NEXT: s_ashr_i32 s6, s4, 16 +; CI-NEXT: s_ashr_i32 s7, s5, 16 +; CI-NEXT: s_sext_i32_i16 s4, s4 +; CI-NEXT: s_sext_i32_i16 s5, s5 +; CI-NEXT: s_min_i32 s1, s1, s7 +; CI-NEXT: s_min_i32 s3, s3, s5 +; CI-NEXT: s_min_i32 s0, s0, s6 +; CI-NEXT: s_min_i32 s2, s2, s4 +; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: s_and_b32 s3, s3, 0xffff +; CI-NEXT: s_lshl_b32 s0, s0, 16 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s1, s3, s1 +; CI-NEXT: s_or_b32 s0, s2, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_sle_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; VI-NEXT: s_load_dwordx2 
s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s6, s1, 16 -; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s8, s3, 16 -; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_ashr_i32 s7, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s9, s2, 16 -; VI-NEXT: s_sext_i32_i16 s2, s2 -; VI-NEXT: s_min_i32 s6, s6, s8 -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s7, s7, s9 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: s_lshl_b32 s2, s6, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_lshl_b32 s2, s7, 16 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_ashr_i32 s0, s3, 16 +; VI-NEXT: s_ashr_i32 s1, s2, 16 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_ashr_i32 s6, s5, 16 +; VI-NEXT: s_ashr_i32 s7, s4, 16 +; VI-NEXT: s_sext_i32_i16 s5, s5 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_min_i32 s1, s1, s7 +; VI-NEXT: s_min_i32 s0, s0, s6 +; VI-NEXT: s_min_i32 s2, s2, s4 +; VI-NEXT: s_min_i32 s3, s3, s5 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s3, s3, 0xffff +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_and_b32 s2, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s3, s0 +; VI-NEXT: s_or_b32 s1, s2, s1 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_sle_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 -; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_pk_min_i16 v1, s3, v0 +; GFX9-NEXT: v_pk_min_i16 v0, s2, v3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_sle_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 -; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: v_pk_min_i16 v1, s3, s5 +; GFX10-NEXT: v_pk_min_i16 v0, s2, s4 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_sle_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 
v1, s1, s3 -; GFX11-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_pk_min_i16 v1, s3, s5 +; GFX11-NEXT: v_pk_min_i16 v0, s2, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1427,77 +1427,77 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; CI-LABEL: s_test_imin_slt_v2i32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 -; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_min_i32 s1, s1, s3 -; CI-NEXT: s_min_i32 s0, s0, s2 -; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_min_i32 s0, s3, s5 +; CI-NEXT: s_min_i32 s1, s2, s4 +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_test_imin_slt_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_min_i32 s1, s1, s3 -; VI-NEXT: s_min_i32 s0, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_min_i32 s0, s3, s5 +; VI-NEXT: s_min_i32 s1, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_imin_slt_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_min_i32 s1, s1, s3 -; GFX9-NEXT: s_min_i32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_min_i32 s3, s3, s5 +; GFX9-NEXT: s_min_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_imin_slt_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_min_i32 s0, s0, s2 -; GFX10-NEXT: s_min_i32 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_min_i32 s2, s2, s4 +; GFX10-NEXT: s_min_i32 s3, s3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_imin_slt_v2i32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s0, s0, s2 -; GFX11-NEXT: s_min_i32 s1, s1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_min_i32 s2, s2, s4 +; GFX11-NEXT: s_min_i32 s3, s3, s5 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index ecd1abce67571..e9fb86b97f193 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -2071,8 +2071,8 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-LABEL: fadd_fadd_fsub: ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, s3 ; GFX900-NEXT: v_add_f32_e32 v0, s1, v0 @@ -2080,14 +2080,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX900-NEXT: v_add_f32_e32 v3, s2, v0 ; GFX900-NEXT: v_sub_f32_e32 v0, s0, v1 ; GFX900-NEXT: v_subrev_f32_e32 v1, s3, v3 -; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-NEXT: s_endpgm ; ; PACKED-SDAG-LABEL: fadd_fadd_fsub: ; PACKED-SDAG: ; %bb.0: ; %bb ; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; PACKED-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; PACKED-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; PACKED-SDAG-NEXT: v_add_f32_e32 v0, s1, v0 @@ -2095,7 +2095,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, v0 ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1] -; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] +; PACKED-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; PACKED-SDAG-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 0a746b0a3f572..79ec56aed4582 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -94,62 +94,64 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotl_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_i32 s3, 32, s3 -; SI-NEXT: s_sub_i32 s2, 32, s2 -; SI-NEXT: 
v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_sub_i32 s0, 32, s9 +; SI-NEXT: s_sub_i32 s1, 32, s8 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotl_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, 32, s2 -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_sub_i32 s1, 32, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v2 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotl_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: s_sub_i32 s2, 32, s2 -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_sub_i32 s4, 32, s7 +; GFX10-NEXT: s_sub_i32 s5, 32, s6 +; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s4 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s5 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 -; GFX11-NEXT: s_sub_i32 s2, 32, s2 -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sub_i32 s4, 32, s7 +; GFX11-NEXT: s_sub_i32 s5, 32, s6 +; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s4 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s5 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %0 = shl <2 x i32> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index d6e361d6e297e..2cc186b85b5d9 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -83,54 +83,56 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i ; ; SI-LABEL: rotr_v2i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; SI-NEXT: s_load_dwordx2 
s[4:5], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_alignbit_b32 v1, s3, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; GFX8-LABEL: rotr_v2i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0 -; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v3, s3, s3, v2 +; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v4 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotr_v2i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: v_alignbit_b32 v1, s3, s3, s7 +; GFX10-NEXT: v_alignbit_b32 v0, s2, s2, s6 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_v2i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3 -; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: v_alignbit_b32 v1, s3, s3, s5 +; GFX11-NEXT: v_alignbit_b32 v0, s2, s2, s4 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm entry: %tmp0 = sub <2 x i32> , %y diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index fe47663b11028..5ac01ffbe6d02 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -331,39 +331,39 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_sub_i32 s6, 0, s3 -; VI-NEXT: s_sub_i32 s7, 0, s2 -; VI-NEXT: s_sub_i32 s5, 0, s5 -; VI-NEXT: s_sub_i32 s4, 0, s4 -; VI-NEXT: s_ashr_i32 s8, s2, 16 -; VI-NEXT: s_ashr_i32 s9, s3, 16 +; 
VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_sub_i32 s5, 0, s2 +; VI-NEXT: s_sub_i32 s1, 0, s1 +; VI-NEXT: s_sub_i32 s0, 0, s0 +; VI-NEXT: s_ashr_i32 s6, s2, 16 +; VI-NEXT: s_ashr_i32 s7, s3, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 -; VI-NEXT: s_sext_i32_i16 s7, s7 -; VI-NEXT: s_sext_i32_i16 s6, s6 -; VI-NEXT: s_sext_i32_i16 s4, s4 ; VI-NEXT: s_sext_i32_i16 s5, s5 -; VI-NEXT: s_max_i32 s3, s3, s6 -; VI-NEXT: s_max_i32 s2, s2, s7 -; VI-NEXT: s_max_i32 s5, s9, s5 -; VI-NEXT: s_max_i32 s4, s8, s4 +; VI-NEXT: s_sext_i32_i16 s4, s4 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 +; VI-NEXT: s_max_i32 s3, s3, s4 +; VI-NEXT: s_max_i32 s2, s2, s5 +; VI-NEXT: s_max_i32 s1, s7, s1 +; VI-NEXT: s_max_i32 s0, s6, s0 ; VI-NEXT: s_add_i32 s2, s2, 2 ; VI-NEXT: s_add_i32 s3, s3, 2 -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_or_b32 s2, s4, s2 -; VI-NEXT: s_add_i32 s3, s3, 0x20000 -; VI-NEXT: s_add_i32 s2, s2, 0x20000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_or_b32 s1, s1, s3 +; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_add_i32 s1, s1, 0x20000 +; VI-NEXT: s_add_i32 s0, s0, 0x20000 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_abs_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index a56346f3bb45b..a70d495f459b5 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -164,98 +164,102 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s6, 0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v0 -; GFX6-NEXT: s_mul_i32 s6, s6, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s6, s0, s2 -; GFX6-NEXT: s_cmp_ge_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s0, s6, s0 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: 
v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_mul_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s4, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s4, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s4, v1 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: s_mul_i32 s2, s2, s3 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: s_sub_i32 s2, s1, s3 -; GFX6-NEXT: s_cmp_ge_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s9 +; GFX6-NEXT: s_sub_i32 s0, s3, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s9 +; GFX6-NEXT: s_cmp_ge_u32 s0, s9 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_sub_i32 s6, 0, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s0, 0, s6 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_readfirstlane_b32 s6, v0 -; GFX8-NEXT: s_mul_i32 s6, s6, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s6 -; GFX8-NEXT: s_sub_i32 s6, s0, s2 -; GFX8-NEXT: s_cmp_ge_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s6, s0, s2 -; GFX8-NEXT: s_cmp_ge_u32 s0, s2 -; GFX8-NEXT: s_cselect_b32 s0, s6, s0 -; GFX8-NEXT: s_sub_i32 s2, 0, s3 -; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: s_mul_i32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s2, s2, s4 +; GFX8-NEXT: s_sub_i32 s4, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s4, s2, s6 +; GFX8-NEXT: s_cmp_ge_u32 s2, s6 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_sub_i32 s4, 0, s7 +; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_mul_hi_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, 
s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: s_mul_i32 s0, s0, s3 -; GFX8-NEXT: s_sub_i32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s3 -; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 +; GFX8-NEXT: s_mul_i32 s0, s0, s7 +; GFX8-NEXT: s_sub_i32 s0, s3, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s7 +; GFX8-NEXT: s_cmp_ge_u32 s0, s7 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: s_sub_i32 s1, s0, s3 -; GFX8-NEXT: s_cmp_ge_u32 s0, s3 +; GFX8-NEXT: s_sub_i32 s1, s0, s7 +; GFX8-NEXT: s_cmp_ge_u32 s0, s7 ; GFX8-NEXT: s_cselect_b32 s0, s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %result0 = udiv <2 x i32> %x, %y store <2 x i32> %result0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 97738a7944741..d1500b290b154 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -258,8 +258,8 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %cast = uitofp <2 x i32> %in to <2 x double> diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll index ff22c0bd747e4..6440183b28a87 100644 --- a/llvm/test/CodeGen/NVPTX/dag-cse.ll +++ b/llvm/test/CodeGen/NVPTX/dag-cse.ll @@ -9,8 +9,8 @@ ; Verify that loads with different memory types are not subject to CSE ; once they are promoted to the same type. 
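; For context, a minimal sketch of the pattern this test guards. The function
; name @sketch, the parameter form, and the exact element types (<2 x i8> for
; the 16-bit access, i32 for the 32-bit one) are illustrative guesses inferred
; from the check lines below, not the test's actual body:
;
;   define void @sketch(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
;     %v = load <2 x i8>, ptr addrspace(1) %a   ; 16-bit access
;     store <2 x i8> %v, ptr addrspace(1) %b
;     %w = load i32, ptr addrspace(1) %a        ; 32-bit access, same address
;     store i32 %w, ptr addrspace(1) %c
;     ret void
;   }
;
; Even after both loads are promoted to plain integer loads, they read
; different numbers of bytes and so must remain distinct accesses.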
; -; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a]; -; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]}; +; CHECK: ld.global.u16 %[[B:rs[0-9]+]], [a]; +; CHECK: st.global.u16 [b], %[[B]]; ; ; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a]; ; CHECK: st.global.u32 [c], %[[C]]; diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index 318e55c748f7f..407e2fc7da794 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -1,10 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s -; CHECK-LABEL: @merge_v2i32_v2i32( -; CHECK: load <4 x i32> -; CHECK: store <4 x i32> zeroinitializer +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" define amdgpu_kernel void @merge_v2i32_v2i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { +; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 +; CHECK-NEXT: ret void +; entry: %a.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %a, i64 1 %b.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %b, i64 1 @@ -18,10 +26,16 @@ entry: ret void } -; CHECK-LABEL: @merge_v1i32_v1i32( -; CHECK: load <2 x i32> -; CHECK: store <2 x i32> zeroinitializer define amdgpu_kernel void @merge_v1i32_v1i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { +; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> +; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 +; CHECK-NEXT: ret void +; entry: %a.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %a, i64 1 %b.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %b, i64 1 @@ -35,12 +49,18 @@ entry: ret void } -; CHECK-LABEL: @no_merge_v3i32_v3i32( -; CHECK: load <3 x i32> -; CHECK: load <3 x i32> -; CHECK: store <3 x i32> zeroinitializer -; CHECK: store <3 x i32> zeroinitializer define amdgpu_kernel void @no_merge_v3i32_v3i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture 
readonly %b) #0 { +; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1 +; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1 +; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4 +; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4 +; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4 +; CHECK-NEXT: ret void +; entry: %a.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a, i64 1 %b.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b, i64 1 @@ -54,10 +74,16 @@ entry: ret void } -; CHECK-LABEL: @merge_v2i16_v2i16( -; CHECK: load <4 x i16> -; CHECK: store <4 x i16> zeroinitializer define amdgpu_kernel void @merge_v2i16_v2i16(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 { +; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4 +; CHECK-NEXT: ret void +; entry: %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a, i64 1 %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b, i64 1 @@ -71,15 +97,27 @@ entry: ret void } -; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs( -; CHECK-OOB-RELAXED: load <4 x i16> -; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer -; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs( -; CHECK-OOB-STRICT: load <2 x i16> -; CHECK-OOB-STRICT: load <2 x i16> -; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer -; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 { +; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs( +; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { +; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]] +; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4 +; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4 +; CHECK-OOB-RELAXED-NEXT: ret void +; +; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs( +; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] { +; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]] +; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1 +; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) 
[[B]], i32 1 +; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4 +; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4 +; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4 +; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4 +; CHECK-OOB-STRICT-NEXT: ret void +; entry: %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1 %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %b, i32 1 @@ -93,19 +131,217 @@ entry: ret void } -; Ideally this would be merged -; CHECK-LABEL: @merge_load_i32_v2i16( -; CHECK: load i32, -; CHECK: load <2 x i16> define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 { +; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16( +; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[A]], align 4, !invariant.load [[META0:![0-9]+]], !nontemporal [[META1:![0-9]+]] +; CHECK-NEXT: [[LD_01:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP1]] to <2 x i16> +; CHECK-NEXT: ret void +; entry: %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1 - %ld.0 = load i32, ptr addrspace(1) %a - %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1 + %ld.0 = load i32, ptr addrspace(1) %a, align 4, !nontemporal !0, !invariant.load !1 + %ld.1 = load <2 x i16>, ptr addrspace(1) %a.1, align 4, !nontemporal !0, !invariant.load !1 ret void } attributes #0 = { nounwind } attributes #1 = { nounwind readnone } + +!0 = !{!"nontemporal"} +!1 = !{!"invariant.load"} + + +define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { +; CHECK-OOB-RELAXED-LABEL: define void @merge_i32_2i16_float_4i8( +; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-OOB-RELAXED-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-OOB-RELAXED-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-OOB-RELAXED-NEXT: [[LOAD33:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-OOB-RELAXED-NEXT: [[TMP3:%.*]] = bitcast i32 [[LOAD33]] to float +; CHECK-OOB-RELAXED-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP2]] to <2 x i16> +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[TMP4]] to <4 x i8> +; CHECK-OOB-RELAXED-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x i16> [[DOTCAST]] to i32 +; CHECK-OOB-RELAXED-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD12]], i32 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[DOTCAST_CAST]], i32 1 +; CHECK-OOB-RELAXED-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-OOB-RELAXED-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2 +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST1_CAST:%.*]] = bitcast <4 x i8> [[DOTCAST1]] to i32 +; 
CHECK-OOB-RELAXED-NEXT: [[TMP7:%.*]] = bitcast float [[TMP3]] to i32 +; CHECK-OOB-RELAXED-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[DOTCAST1_CAST]], i32 1 +; CHECK-OOB-RELAXED-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(2) [[STORE_GEP3]], align 4 +; CHECK-OOB-RELAXED-NEXT: ret void +; +; CHECK-OOB-STRICT-LABEL: define void @merge_i32_2i16_float_4i8( +; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { +; CHECK-OOB-STRICT-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-OOB-STRICT-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-OOB-STRICT-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-OOB-STRICT-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-OOB-STRICT-NEXT: [[LOAD33:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-OOB-STRICT-NEXT: [[TMP3:%.*]] = bitcast i32 [[LOAD33]] to float +; CHECK-OOB-STRICT-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-OOB-STRICT-NEXT: [[DOTCAST:%.*]] = bitcast i32 [[TMP2]] to <2 x i16> +; CHECK-OOB-STRICT-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[TMP4]] to <4 x i8> +; CHECK-OOB-STRICT-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-OOB-STRICT-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x i16> [[DOTCAST]] to i32 +; CHECK-OOB-STRICT-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD12]], i32 0 +; CHECK-OOB-STRICT-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[DOTCAST_CAST]], i32 1 +; CHECK-OOB-STRICT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-OOB-STRICT-NEXT: [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2 +; CHECK-OOB-STRICT-NEXT: [[DOTCAST1_CAST:%.*]] = bitcast <4 x i8> [[DOTCAST1]] to i32 +; CHECK-OOB-STRICT-NEXT: [[TMP7:%.*]] = bitcast float [[TMP3]] to i32 +; CHECK-OOB-STRICT-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-OOB-STRICT-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[DOTCAST1_CAST]], i32 1 +; CHECK-OOB-STRICT-NEXT: store <2 x i32> [[TMP9]], ptr addrspace(2) [[STORE_GEP3]], align 4 +; CHECK-OOB-STRICT-NEXT: ret void +; + %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0 + %load1 = load i32, ptr addrspace(1) %gep1, align 4 + %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1 + %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4 + %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2 + %load3 = load float, ptr addrspace(1) %gep3, align 4 + %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3 + %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4 + %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0 + store i32 %load1, ptr addrspace(2) %store.gep1, align 4 + %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1 + store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4 + %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2 + store float %load3, ptr addrspace(2) %store.gep3, align 4 + %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3 + store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4 + ret void +} + +define void @merge_fp_v2half_type(ptr addrspace(1) %ptr1, ptr addrspace(2) 
%ptr2) { +; CHECK-OOB-RELAXED-LABEL: define void @merge_fp_v2half_type( +; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] { +; CHECK-OOB-RELAXED-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-OOB-RELAXED-NEXT: [[LOAD11:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST:%.*]] = bitcast float [[TMP2]] to <2 x half> +; CHECK-OOB-RELAXED-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-OOB-RELAXED-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x half> [[DOTCAST]] to i32 +; CHECK-OOB-RELAXED-NEXT: [[TMP3:%.*]] = bitcast float [[LOAD11]] to i32 +; CHECK-OOB-RELAXED-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-OOB-RELAXED-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[DOTCAST_CAST]], i32 1 +; CHECK-OOB-RELAXED-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-OOB-RELAXED-NEXT: ret void +; +; CHECK-OOB-STRICT-LABEL: define void @merge_fp_v2half_type( +; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) { +; CHECK-OOB-STRICT-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-OOB-STRICT-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-OOB-STRICT-NEXT: [[LOAD11:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-OOB-STRICT-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-OOB-STRICT-NEXT: [[DOTCAST:%.*]] = bitcast float [[TMP2]] to <2 x half> +; CHECK-OOB-STRICT-NEXT: [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0 +; CHECK-OOB-STRICT-NEXT: [[DOTCAST_CAST:%.*]] = bitcast <2 x half> [[DOTCAST]] to i32 +; CHECK-OOB-STRICT-NEXT: [[TMP3:%.*]] = bitcast float [[LOAD11]] to i32 +; CHECK-OOB-STRICT-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-OOB-STRICT-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[DOTCAST_CAST]], i32 1 +; CHECK-OOB-STRICT-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(2) [[STORE_GEP1]], align 4 +; CHECK-OOB-STRICT-NEXT: ret void +; + %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0 + %load1 = load float, ptr addrspace(1) %gep1, align 4 + %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr1, i64 1 + %load2 = load <2 x half>, ptr addrspace(1) %gep2, align 4 + %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0 + store float %load1, ptr addrspace(2) %store.gep1, align 4 + %store.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(2) %ptr2, i64 1 + store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4 + ret void +} + +define void @merge_v2half_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) { +; CHECK-OOB-RELAXED-LABEL: define void @merge_v2half_bfloat_type( +; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] { +; CHECK-OOB-RELAXED-NEXT: [[GEP1:%.*]] = getelementptr inbounds bfloat, ptr addrspace(1) [[PTR1]], i64 0 +; CHECK-OOB-RELAXED-NEXT: [[LOAD1:%.*]] = load bfloat, ptr addrspace(1) [[GEP1]], align 4 +; CHECK-OOB-RELAXED-NEXT: [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1 +; 
+define void @merge_v2half_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-OOB-RELAXED-LABEL: define void @merge_v2half_bfloat_type(
+; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
+; CHECK-OOB-RELAXED-NEXT:    [[GEP1:%.*]] = getelementptr inbounds bfloat, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-OOB-RELAXED-NEXT:    [[LOAD1:%.*]] = load bfloat, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-OOB-RELAXED-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-OOB-RELAXED-NEXT:    store bfloat [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-OOB-RELAXED-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-OOB-RELAXED-NEXT:    ret void
+;
+; CHECK-OOB-STRICT-LABEL: define void @merge_v2half_bfloat_type(
+; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-OOB-STRICT-NEXT:    [[GEP1:%.*]] = getelementptr inbounds bfloat, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-OOB-STRICT-NEXT:    [[LOAD1:%.*]] = load bfloat, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-OOB-STRICT-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-OOB-STRICT-NEXT:    store bfloat [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-OOB-STRICT-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-OOB-STRICT-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds bfloat, ptr addrspace(1) %ptr1, i64 0
+  %load1 = load bfloat, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr1, i64 1
+  %load2 = load <2 x half>, ptr addrspace(1) %gep2, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store bfloat %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(2) %ptr2, i64 1
+  store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4
+  ret void
+}
+
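+; The loaded values are pointers into different address spaces, so the
+; classes must not be merged; both pairs are left as scalar accesses.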
+define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-OOB-RELAXED-LABEL: define void @no_merge_mixed_ptr_addrspaces(
+; CHECK-OOB-RELAXED-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) #[[ATTR1]] {
+; CHECK-OOB-RELAXED-NEXT:    [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-OOB-RELAXED-NEXT:    [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-OOB-RELAXED-NEXT:    [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-OOB-RELAXED-NEXT:    store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-OOB-RELAXED-NEXT:    store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-OOB-RELAXED-NEXT:    ret void
+;
+; CHECK-OOB-STRICT-LABEL: define void @no_merge_mixed_ptr_addrspaces(
+; CHECK-OOB-STRICT-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-OOB-STRICT-NEXT:    [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-OOB-STRICT-NEXT:    [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-OOB-STRICT-NEXT:    [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-OOB-STRICT-NEXT:    store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-OOB-STRICT-NEXT:    store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-OOB-STRICT-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %ptr1, i64 0
+  %load1 = load ptr addrspace(1), ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) %ptr1, i64 1
+  %load2 = load ptr addrspace(2), ptr addrspace(1) %gep2, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store ptr addrspace(1) %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) %ptr2, i64 1
+  store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4
+  ret void
+}
+;.
+; CHECK-OOB-RELAXED: [[META0]] = !{!"invariant.load"}
+; CHECK-OOB-RELAXED: [[META1]] = !{!"nontemporal"}
+;.
+; CHECK-OOB-STRICT: [[META0]] = !{!"invariant.load"}
+; CHECK-OOB-STRICT: [[META1]] = !{!"nontemporal"}
+;.