Skip to content

Commit 2faa227

Browse files
committed
[SelectionDAG][X86] Split via Concat <n x T> vector types for atomic load
Vector types that aren't widened are 'split' via CONCAT_VECTORS so that a single ATOMIC_LOAD is issued for the entire vector at once. This change uses the load vectorization infrastructure in SelectionDAG to group the vectors. This enables SelectionDAG to translate atomic loads of vectors with bfloat and half element types. commit-id:3a045357
1 parent 5bc5d32 commit 2faa227

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h

+1
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
960960
void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
961961
void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
962962
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
963+
void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD);
963964
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
964965
void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
965966
void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo,

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
11721172
SplitVecRes_STEP_VECTOR(N, Lo, Hi);
11731173
break;
11741174
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
1175+
case ISD::ATOMIC_LOAD:
1176+
SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
1177+
break;
11751178
case ISD::LOAD:
11761179
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
11771180
break;
@@ -1421,6 +1424,35 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
14211424
SetSplitVector(SDValue(N, ResNo), Lo, Hi);
14221425
}
14231426

1427+
/// Legalize a vector ATOMIC_LOAD whose result type has to be split.
///
/// The load is not actually broken into Lo/Hi halves. Instead a single
/// ATOMIC_LOAD is emitted for the whole vector, typed as a vector of integers
/// with the same lane width as the original elements. Each lane is then
/// extracted, bitcast back to a one-element vector of the original element
/// type, and the pieces are glued together again with CONCAT_VECTORS.
///
/// Both results of \p LD are replaced here: result 0 with the rebuilt vector
/// and result 1 with the chain produced by the new atomic load. No Lo/Hi
/// values are produced by this routine.
///
/// \param LD the atomic load node being legalized.
void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD) {
  SDLoc dl(LD);
  LLVMContext &Ctx = *DAG.getContext();

  EVT MemoryVT = LD->getMemoryVT();
  // NOTE(review): for a scalable vector this is only the minimum lane count;
  // the per-lane rebuild below presumably expects fixed-length vectors --
  // confirm that scalable types never reach this path.
  unsigned NumElts = MemoryVT.getVectorMinNumElements();

  // Use an integer lane of exactly the element width so that the load covers
  // the same number of bits as the original vector and the per-lane bitcast
  // below is size-preserving for any element type, not just 16-bit ones.
  EVT IntEltVT = EVT::getIntegerVT(Ctx, MemoryVT.getScalarSizeInBits());
  EVT IntMemoryVT = EVT::getVectorVT(Ctx, IntEltVT, NumElts);
  EVT ElemVT = EVT::getVectorVT(Ctx, MemoryVT.getVectorElementType(), 1);

  // Create a single atomic to load all the elements at once.
  SDValue Atomic =
      DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, IntMemoryVT, IntMemoryVT,
                        LD->getChain(), LD->getBasePtr(), LD->getMemOperand());

  // Instead of splitting, put all the elements back into a vector.
  SmallVector<SDValue, 4> Ops;
  Ops.reserve(NumElts);
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, IntEltVT, Atomic,
                              DAG.getVectorIdxConstant(i, dl));
    // Reinterpret the integer lane as a <1 x EltTy> vector so that the pieces
    // can be concatenated into the original vector type.
    Ops.push_back(DAG.getBitcast(ElemVT, Elt));
  }
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MemoryVT, Ops);

  ReplaceValueWith(SDValue(LD, 0), Concat);
  // Chain users of the old load must depend on the *output* chain of the new
  // atomic load. Forwarding the input chain would let them bypass the load
  // and lose their ordering relative to it.
  ReplaceValueWith(SDValue(LD, 1), Atomic.getValue(1));
}
1455+
14241456
void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
14251457
MachinePointerInfo &MPI, SDValue &Ptr,
14261458
uint64_t *ScaledOffset) {

llvm/test/CodeGen/X86/atomic-load-store.ll

+171
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,76 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
204204
ret <2 x float> %ret
205205
}
206206

207+
define <2 x half> @atomic_vec2_half(ptr %x) {
208+
; CHECK3-LABEL: atomic_vec2_half:
209+
; CHECK3: ## %bb.0:
210+
; CHECK3-NEXT: movl (%rdi), %eax
211+
; CHECK3-NEXT: movd %eax, %xmm1
212+
; CHECK3-NEXT: shrl $16, %eax
213+
; CHECK3-NEXT: pinsrw $0, %eax, %xmm2
214+
; CHECK3-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
215+
; CHECK3-NEXT: pand %xmm0, %xmm1
216+
; CHECK3-NEXT: pslld $16, %xmm2
217+
; CHECK3-NEXT: pandn %xmm2, %xmm0
218+
; CHECK3-NEXT: por %xmm1, %xmm0
219+
; CHECK3-NEXT: retq
220+
;
221+
; CHECK0-LABEL: atomic_vec2_half:
222+
; CHECK0: ## %bb.0:
223+
; CHECK0-NEXT: movl (%rdi), %eax
224+
; CHECK0-NEXT: movl %eax, %ecx
225+
; CHECK0-NEXT: shrl $16, %ecx
226+
; CHECK0-NEXT: movw %cx, %dx
227+
; CHECK0-NEXT: ## implicit-def: $ecx
228+
; CHECK0-NEXT: movw %dx, %cx
229+
; CHECK0-NEXT: ## implicit-def: $xmm2
230+
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm2
231+
; CHECK0-NEXT: movd %eax, %xmm0
232+
; CHECK0-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
233+
; CHECK0-NEXT: pand %xmm1, %xmm0
234+
; CHECK0-NEXT: pslld $16, %xmm2
235+
; CHECK0-NEXT: pandn %xmm2, %xmm1
236+
; CHECK0-NEXT: por %xmm1, %xmm0
237+
; CHECK0-NEXT: retq
238+
%ret = load atomic <2 x half>, ptr %x acquire, align 4
239+
ret <2 x half> %ret
240+
}
241+
242+
define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
243+
; CHECK3-LABEL: atomic_vec2_bfloat:
244+
; CHECK3: ## %bb.0:
245+
; CHECK3-NEXT: movl (%rdi), %eax
246+
; CHECK3-NEXT: movd %eax, %xmm1
247+
; CHECK3-NEXT: shrl $16, %eax
248+
; CHECK3-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
249+
; CHECK3-NEXT: pand %xmm0, %xmm1
250+
; CHECK3-NEXT: pinsrw $0, %eax, %xmm2
251+
; CHECK3-NEXT: pslld $16, %xmm2
252+
; CHECK3-NEXT: pandn %xmm2, %xmm0
253+
; CHECK3-NEXT: por %xmm1, %xmm0
254+
; CHECK3-NEXT: retq
255+
;
256+
; CHECK0-LABEL: atomic_vec2_bfloat:
257+
; CHECK0: ## %bb.0:
258+
; CHECK0-NEXT: movl (%rdi), %eax
259+
; CHECK0-NEXT: movl %eax, %ecx
260+
; CHECK0-NEXT: shrl $16, %ecx
261+
; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
262+
; CHECK0-NEXT: movd %eax, %xmm0
263+
; CHECK0-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
264+
; CHECK0-NEXT: pand %xmm1, %xmm0
265+
; CHECK0-NEXT: ## implicit-def: $eax
266+
; CHECK0-NEXT: movw %cx, %ax
267+
; CHECK0-NEXT: ## implicit-def: $xmm2
268+
; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
269+
; CHECK0-NEXT: pslld $16, %xmm2
270+
; CHECK0-NEXT: pandn %xmm2, %xmm1
271+
; CHECK0-NEXT: por %xmm1, %xmm0
272+
; CHECK0-NEXT: retq
273+
%ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
274+
ret <2 x bfloat> %ret
275+
}
276+
207277
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
208278
; CHECK3-LABEL: atomic_vec1_ptr:
209279
; CHECK3: ## %bb.0:
@@ -376,6 +446,107 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
376446
ret <4 x i16> %ret
377447
}
378448

449+
define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
450+
; CHECK3-LABEL: atomic_vec4_half:
451+
; CHECK3: ## %bb.0:
452+
; CHECK3-NEXT: movq (%rdi), %rax
453+
; CHECK3-NEXT: movl %eax, %ecx
454+
; CHECK3-NEXT: shrl $16, %ecx
455+
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm1
456+
; CHECK3-NEXT: movq %rax, %rcx
457+
; CHECK3-NEXT: shrq $32, %rcx
458+
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
459+
; CHECK3-NEXT: movq %rax, %xmm0
460+
; CHECK3-NEXT: shrq $48, %rax
461+
; CHECK3-NEXT: pinsrw $0, %eax, %xmm3
462+
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
463+
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
464+
; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
465+
; CHECK3-NEXT: retq
466+
;
467+
; CHECK0-LABEL: atomic_vec4_half:
468+
; CHECK0: ## %bb.0:
469+
; CHECK0-NEXT: movq (%rdi), %rax
470+
; CHECK0-NEXT: movl %eax, %ecx
471+
; CHECK0-NEXT: shrl $16, %ecx
472+
; CHECK0-NEXT: movw %cx, %dx
473+
; CHECK0-NEXT: ## implicit-def: $ecx
474+
; CHECK0-NEXT: movw %dx, %cx
475+
; CHECK0-NEXT: ## implicit-def: $xmm2
476+
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm2
477+
; CHECK0-NEXT: movq %rax, %rcx
478+
; CHECK0-NEXT: shrq $32, %rcx
479+
; CHECK0-NEXT: movw %cx, %dx
480+
; CHECK0-NEXT: ## implicit-def: $ecx
481+
; CHECK0-NEXT: movw %dx, %cx
482+
; CHECK0-NEXT: ## implicit-def: $xmm1
483+
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
484+
; CHECK0-NEXT: movq %rax, %rcx
485+
; CHECK0-NEXT: shrq $48, %rcx
486+
; CHECK0-NEXT: movw %cx, %dx
487+
; CHECK0-NEXT: ## implicit-def: $ecx
488+
; CHECK0-NEXT: movw %dx, %cx
489+
; CHECK0-NEXT: ## implicit-def: $xmm3
490+
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm3
491+
; CHECK0-NEXT: movq %rax, %xmm0
492+
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
493+
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
494+
; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
495+
; CHECK0-NEXT: retq
496+
%ret = load atomic <4 x half>, ptr %x acquire, align 8
497+
ret <4 x half> %ret
498+
}
499+
500+
define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
501+
; CHECK3-LABEL: atomic_vec4_bfloat:
502+
; CHECK3: ## %bb.0:
503+
; CHECK3-NEXT: movq (%rdi), %rax
504+
; CHECK3-NEXT: movq %rax, %xmm0
505+
; CHECK3-NEXT: movl %eax, %ecx
506+
; CHECK3-NEXT: shrl $16, %ecx
507+
; CHECK3-NEXT: movq %rax, %rdx
508+
; CHECK3-NEXT: shrq $32, %rdx
509+
; CHECK3-NEXT: shrq $48, %rax
510+
; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
511+
; CHECK3-NEXT: pinsrw $0, %edx, %xmm2
512+
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
513+
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm1
514+
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
515+
; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
516+
; CHECK3-NEXT: retq
517+
;
518+
; CHECK0-LABEL: atomic_vec4_bfloat:
519+
; CHECK0: ## %bb.0:
520+
; CHECK0-NEXT: movq (%rdi), %rax
521+
; CHECK0-NEXT: movq %rax, %xmm0
522+
; CHECK0-NEXT: movl %eax, %ecx
523+
; CHECK0-NEXT: shrl $16, %ecx
524+
; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
525+
; CHECK0-NEXT: movq %rax, %rdx
526+
; CHECK0-NEXT: shrq $32, %rdx
527+
; CHECK0-NEXT: ## kill: def $dx killed $dx killed $rdx
528+
; CHECK0-NEXT: shrq $48, %rax
529+
; CHECK0-NEXT: movw %ax, %si
530+
; CHECK0-NEXT: ## implicit-def: $eax
531+
; CHECK0-NEXT: movw %si, %ax
532+
; CHECK0-NEXT: ## implicit-def: $xmm2
533+
; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
534+
; CHECK0-NEXT: ## implicit-def: $eax
535+
; CHECK0-NEXT: movw %dx, %ax
536+
; CHECK0-NEXT: ## implicit-def: $xmm1
537+
; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
538+
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
539+
; CHECK0-NEXT: ## implicit-def: $eax
540+
; CHECK0-NEXT: movw %cx, %ax
541+
; CHECK0-NEXT: ## implicit-def: $xmm2
542+
; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
543+
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
544+
; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
545+
; CHECK0-NEXT: retq
546+
%ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
547+
ret <4 x bfloat> %ret
548+
}
549+
379550
define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind {
380551
; CHECK-LABEL: atomic_vec4_float_align:
381552
; CHECK: ## %bb.0:

0 commit comments

Comments
 (0)