From 7a786f239eb79a41c0abc974f8a94e540c1df920 Mon Sep 17 00:00:00 2001
From: jofrn
Date: Fri, 31 Jan 2025 13:12:56 -0500
Subject: [PATCH] [SelectionDAG][X86] Remove unused elements from atomic
 vector.

After splitting a vector atomic load, all of its elements are created
even when only some of them are used. The two components of the split
load are found by looking at EXTRACT_ELEMENT of the upper and the lower
half. This change extends EltsFromConsecutiveLoads to understand
AtomicSDNode so that the unused elements can be removed.

commit-id:b83937a8
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   4 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  20 ++-
 .../SelectionDAGAddressAnalysis.cpp           |  30 ++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   6 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  43 +++--
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 167 ++----------------
 6 files changed, 83 insertions(+), 187 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index ba11ddbb5b731..d3cd81c146280 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1843,7 +1843,7 @@ class SelectionDAG {
   /// chain to the token factor. This ensures that the new memory node will have
   /// the same relative memory dependency position as the old load. Returns the
   /// new merged load chain.
-  SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
+  SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
@@ -2281,7 +2281,7 @@ class SelectionDAG {
   /// merged. Check that both are nonvolatile and if LD is loading
   /// 'Bytes' bytes from a location that is 'Dist' units away from the
   /// location that the 'Base' load is loading from.
-  bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
+  bool areNonVolatileConsecutiveLoads(MemSDNode *LD, MemSDNode *Base,
                                       unsigned Bytes, int Dist) const;
 
   /// Infer alignment of a load / store address. Return std::nullopt if it
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2a68903c34cef..8e77a542ab029 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12218,7 +12218,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
   return TokenFactor;
 }
 
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
                                                    SDValue NewMemOp) {
   assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
   SDValue OldChain = SDValue(OldLoad, 1);
@@ -12911,17 +12911,21 @@ std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
       getBuildVector(NewOvVT, dl, OvScalars));
 }
 
-bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
-                                                  LoadSDNode *Base,
+bool SelectionDAG::areNonVolatileConsecutiveLoads(MemSDNode *LD,
+                                                  MemSDNode *Base,
                                                   unsigned Bytes,
                                                   int Dist) const {
   if (LD->isVolatile() || Base->isVolatile())
     return false;
-  // TODO: probably too restrictive for atomics, revisit
-  if (!LD->isSimple())
-    return false;
-  if (LD->isIndexed() || Base->isIndexed())
-    return false;
+  if (auto Ld = dyn_cast<LoadSDNode>(LD)) {
+    if (!Ld->isSimple())
+      return false;
+    if (Ld->isIndexed())
+      return false;
+  }
+  if (auto Ld = dyn_cast<LoadSDNode>(Base))
+    if (Ld->isIndexed())
+      return false;
   if (LD->getChain() != Base->getChain())
     return false;
   EVT VT = LD->getMemoryVT();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index f2ab88851b780..c29cb424c7a4c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -195,8 +195,8 @@ bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
 }
 
 /// Parses tree in Ptr for base, index, offset addresses.
-static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
-                                   const SelectionDAG &DAG) {
+template <typename T>
+static BaseIndexOffset matchSDNode(const T *N, const SelectionDAG &DAG) {
   SDValue Ptr = N->getBasePtr();
 
   // (((B + I*M) + c)) + c ...
@@ -206,16 +206,18 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
   bool IsIndexSignExt = false;
 
   // pre-inc/pre-dec ops are components of EA.
-  if (N->getAddressingMode() == ISD::PRE_INC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset += C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
-  } else if (N->getAddressingMode() == ISD::PRE_DEC) {
-    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
-      Offset -= C->getSExtValue();
-    else // If unknown, give up now.
-      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  if constexpr (std::is_same_v<T, LSBaseSDNode>) {
+    if (N->getAddressingMode() == ISD::PRE_INC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset += C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+      if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+        Offset -= C->getSExtValue();
+      else // If unknown, give up now.
+        return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+    }
   }
 
   // Consume constant adds & ors with appropriate masking.
@@ -300,8 +302,10 @@ static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
 
 BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
                                        const SelectionDAG &DAG) {
+  if (const auto *AN = dyn_cast<AtomicSDNode>(N))
+    return matchSDNode(AN, DAG);
   if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
-    return matchLSNode(LS0, DAG);
+    return matchSDNode(LS0, DAG);
   if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
     if (LN->hasOffset())
       return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 97ce20b973204..3e7e0008f7c9b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5172,7 +5172,11 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
     L = DAG.getPtrExtOrTrunc(L, dl, VT);
 
   setValue(&I, L);
-  DAG.setRoot(OutChain);
+
+  if (VT.isVector())
+    DAG.setRoot(InChain);
+  else
+    DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 749fa34e791af..76c235499a1b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 }
 
 // Recurse to find a LoadSDNode source and the accumulated ByteOffest.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
-  if (ISD::isNON_EXTLoad(Elt.getNode())) {
-    auto *BaseLd = cast<LoadSDNode>(Elt);
-    if (!BaseLd->isSimple())
-      return false;
+static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
+  if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
     Ld = BaseLd;
     ByteOffset = 0;
     return true;
-  }
+  } else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
+    if (ISD::isNON_EXTLoad(Elt.getNode())) {
+      if (!BaseLd->isSimple())
+        return false;
+      Ld = BaseLd;
+      ByteOffset = 0;
+      return true;
+    }
 
   switch (Elt.getOpcode()) {
   case ISD::BITCAST:
@@ -7230,6 +7234,20 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
       }
     }
     break;
+  case ISD::EXTRACT_ELEMENT:
+    if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+      SDValue Src = Elt.getOperand(0);
+      unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+      unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+      if (2 * DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+          findEltLoadSrc(Src, Ld, ByteOffset)) {
+        uint64_t Idx = IdxC->getZExtValue();
+        if (Idx == 1) // Get the upper half.
+          ByteOffset += SrcSizeInBits / (8 * 2);
+        return true;
+      }
+    }
+    break;
   }
 
   return false;
@@ -7254,7 +7272,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt UndefMask = APInt::getZero(NumElems);
 
-  SmallVector<LoadSDNode *, 8> Loads(NumElems, nullptr);
+  SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
@@ -7304,7 +7322,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   EVT EltBaseVT = EltBase.getValueType();
   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
          "Register/Memory size mismatch");
-  LoadSDNode *LDBase = Loads[FirstLoadedElt];
+  MemSDNode *LDBase = Loads[FirstLoadedElt];
   assert(LDBase && "Did not find base load for merging consecutive loads");
   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7318,8 +7336,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   // Check to see if the element's load is consecutive to the base load
   // or offset from a previous (already checked) load.
-  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
-    LoadSDNode *Ld = Loads[EltIdx];
+  auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
+    MemSDNode *Ld = Loads[EltIdx];
     int64_t ByteOffset = ByteOffsets[EltIdx];
     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
@@ -7347,7 +7365,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
     }
   }
 
-  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(LDBase->isSimple() &&
            "Cannot merge volatile or atomic loads.");
@@ -9452,8 +9470,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   {
     SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
     if (SDValue LD =
-            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
+            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
       return LD;
+    }
   }
 
   // If this is a splat of pairs of 32-bit elements, we can use a narrower
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 42b0955824293..08d0405345f57 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -205,63 +205,19 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
 }
 
 define <2 x half> @atomic_vec2_half(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec2_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec2_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
   %ret = load atomic <2 x half>, ptr %x acquire, align 4
   ret <2 x half> %ret
 }
 
 define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
-; CHECK3-LABEL: atomic_vec2_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movl (%rdi), %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec2_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movl (%rdi), %eax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec2_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    retq
   %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
   ret <2 x bfloat> %ret
 }
@@ -439,110 +395,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
 }
 
 define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_half:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movl %eax, %ecx
-; CHECK3-NEXT:    shrl $16, %ecx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    shrq $48, %rax
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_half:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm0
-; CHECK0-NEXT:    movq %rax, %rcx
-; CHECK0-NEXT:    shrq $32, %rcx
-; CHECK0-NEXT:    movw %cx, %dx
-; CHECK0-NEXT:    ## implicit-def: $ecx
-; CHECK0-NEXT:    movw %dx, %cx
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %ecx, %xmm1
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %cx
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm3
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm3
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_half:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x half>, ptr %x acquire, align 8
   ret <4 x half> %ret
 }
 
 define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
-; CHECK3-LABEL: atomic_vec4_bfloat:
-; CHECK3:       ## %bb.0:
-; CHECK3-NEXT:    movq (%rdi), %rax
-; CHECK3-NEXT:    movq %rax, %rcx
-; CHECK3-NEXT:    movq %rax, %rdx
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK3-NEXT:    ## kill: def $eax killed $eax killed $rax
-; CHECK3-NEXT:    shrl $16, %eax
-; CHECK3-NEXT:    shrq $32, %rcx
-; CHECK3-NEXT:    shrq $48, %rdx
-; CHECK3-NEXT:    pinsrw $0, %edx, %xmm1
-; CHECK3-NEXT:    pinsrw $0, %ecx, %xmm2
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK3-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK3-NEXT:    retq
-;
-; CHECK0-LABEL: atomic_vec4_bfloat:
-; CHECK0:       ## %bb.0:
-; CHECK0-NEXT:    movq (%rdi), %rax
-; CHECK0-NEXT:    movl %eax, %ecx
-; CHECK0-NEXT:    shrl $16, %ecx
-; CHECK0-NEXT:    ## kill: def $cx killed $cx killed $ecx
-; CHECK0-NEXT:    movw %ax, %dx
-; CHECK0-NEXT:    movq %rax, %rsi
-; CHECK0-NEXT:    shrq $32, %rsi
-; CHECK0-NEXT:    ## kill: def $si killed $si killed $rsi
-; CHECK0-NEXT:    shrq $48, %rax
-; CHECK0-NEXT:    movw %ax, %di
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %di, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %si, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm1
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm1
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %dx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm0
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
-; CHECK0-NEXT:    ## implicit-def: $eax
-; CHECK0-NEXT:    movw %cx, %ax
-; CHECK0-NEXT:    ## implicit-def: $xmm2
-; CHECK0-NEXT:    pinsrw $0, %eax, %xmm2
-; CHECK0-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK0-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK0-NEXT:    retq
+; CHECK-LABEL: atomic_vec4_bfloat:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    retq
   %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
   ret <4 x bfloat> %ret
 }
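
--
For illustration only (this note sits after the final hunk, so it is not part
of the applied diff): a reduced example of the pattern this patch folds,
mirroring the atomic_vec2_half test above; the function name is hypothetical.
Roughly, type legalization splits the i32 atomic load into two halves read
through EXTRACT_ELEMENT, and the new ISD::EXTRACT_ELEMENT case in
findEltLoadSrc maps index 0 to byte offset 0 and index 1 to byte offset
32 / (8 * 2) = 2 of the same MemSDNode, so EltsFromConsecutiveLoads can emit
the single 4-byte load (movss) checked above instead of the scalarized
pinsrw/punpcklwd sequence that was removed.

  define <2 x half> @repro(ptr %x) {
    %ret = load atomic <2 x half>, ptr %x acquire, align 4
    ret <2 x half> %ret
  }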