Skip to content

Commit fc42ee7

Browse files
committed
[SelectionDAG][X86] Widen <2 x T> vector types for atomic load
Vector types of 2 elements must be widened. This change does this for vector types of atomic load in SelectionDAG so that it can translate aligned vectors of size greater than 1. It also adds Pats to remove an extra MOV. commit-id:2894ccd1
1 parent 3fd1c8a commit fc42ee7

File tree

5 files changed

+177
-23
lines changed

5 files changed

+177
-23
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h

+1
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
10621062
SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
10631063
SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
10641064
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
1065+
SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N);
10651066
SDValue WidenVecRes_LOAD(SDNode* N);
10661067
SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
10671068
SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

+87-21
Original file line numberDiff line numberDiff line change
@@ -4592,6 +4592,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
45924592
break;
45934593
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
45944594
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
4595+
case ISD::ATOMIC_LOAD:
4596+
Res = WidenVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
4597+
break;
45954598
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
45964599
case ISD::STEP_VECTOR:
45974600
case ISD::SPLAT_VECTOR:
@@ -5982,6 +5985,89 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
59825985
N->getOperand(1), N->getOperand(2));
59835986
}
59845987

5988+
static SDValue loadElement(SDValue LdOp, EVT FirstVT, EVT WidenVT,
5989+
TypeSize LdWidth, TypeSize FirstVTWidth, SDLoc dl,
5990+
SelectionDAG &DAG) {
5991+
assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth));
5992+
TypeSize WidenWidth = WidenVT.getSizeInBits();
5993+
if (!FirstVT.isVector()) {
5994+
unsigned NumElts =
5995+
WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
5996+
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), FirstVT, NumElts);
5997+
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
5998+
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
5999+
} else if (FirstVT == WidenVT)
6000+
return LdOp;
6001+
else {
6002+
// TODO: We don't currently have any tests that exercise this code path.
6003+
assert(WidenWidth.getFixedValue() % FirstVTWidth.getFixedValue() == 0);
6004+
unsigned NumConcat =
6005+
WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
6006+
SmallVector<SDValue, 16> ConcatOps(NumConcat);
6007+
SDValue UndefVal = DAG.getUNDEF(FirstVT);
6008+
ConcatOps[0] = LdOp;
6009+
for (unsigned i = 1; i != NumConcat; ++i)
6010+
ConcatOps[i] = UndefVal;
6011+
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
6012+
}
6013+
}
6014+
6015+
static std::optional<EVT> findMemType(SelectionDAG &DAG,
6016+
const TargetLowering &TLI, unsigned Width,
6017+
EVT WidenVT, unsigned Align,
6018+
unsigned WidenEx);
6019+
6020+
SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *LD) {
6021+
EVT WidenVT =
6022+
TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
6023+
EVT LdVT = LD->getMemoryVT();
6024+
SDLoc dl(LD);
6025+
assert(LdVT.isVector() && WidenVT.isVector());
6026+
assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
6027+
assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
6028+
6029+
// Load information
6030+
SDValue Chain = LD->getChain();
6031+
SDValue BasePtr = LD->getBasePtr();
6032+
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
6033+
AAMDNodes AAInfo = LD->getAAInfo();
6034+
6035+
TypeSize LdWidth = LdVT.getSizeInBits();
6036+
TypeSize WidenWidth = WidenVT.getSizeInBits();
6037+
TypeSize WidthDiff = WidenWidth - LdWidth;
6038+
// Allow wider loads if they are sufficiently aligned to avoid memory faults
6039+
// and if the original load is simple.
6040+
unsigned LdAlign =
6041+
(!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlign().value();
6042+
6043+
// Find the vector type that can load from.
6044+
std::optional<EVT> FirstVT =
6045+
findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, LdAlign,
6046+
WidthDiff.getKnownMinValue());
6047+
6048+
if (!FirstVT)
6049+
return SDValue();
6050+
6051+
SmallVector<EVT, 8> MemVTs;
6052+
TypeSize FirstVTWidth = FirstVT->getSizeInBits();
6053+
6054+
SDValue LdOp = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, *FirstVT, *FirstVT,
6055+
Chain, BasePtr, LD->getMemOperand());
6056+
6057+
// Load the element with one instruction.
6058+
SDValue Result = loadElement(LdOp, *FirstVT, WidenVT, LdWidth, FirstVTWidth,
6059+
dl, DAG);
6060+
6061+
if (Result) {
6062+
// Modified the chain - switch anything that used the old chain to use
6063+
// the new one.
6064+
ReplaceValueWith(SDValue(LD, 1), LdOp.getValue(1));
6065+
return Result;
6066+
}
6067+
6068+
report_fatal_error("Unable to widen atomic vector load");
6069+
}
6070+
59856071
SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
59866072
LoadSDNode *LD = cast<LoadSDNode>(N);
59876073
ISD::LoadExtType ExtType = LD->getExtensionType();
@@ -7865,27 +7951,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
78657951

78667952
// Check if we can load the element with one instruction.
78677953
if (MemVTs.empty()) {
7868-
assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth));
7869-
if (!FirstVT->isVector()) {
7870-
unsigned NumElts =
7871-
WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
7872-
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), *FirstVT, NumElts);
7873-
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
7874-
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
7875-
}
7876-
if (FirstVT == WidenVT)
7877-
return LdOp;
7878-
7879-
// TODO: We don't currently have any tests that exercise this code path.
7880-
assert(WidenWidth.getFixedValue() % FirstVTWidth.getFixedValue() == 0);
7881-
unsigned NumConcat =
7882-
WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
7883-
SmallVector<SDValue, 16> ConcatOps(NumConcat);
7884-
SDValue UndefVal = DAG.getUNDEF(*FirstVT);
7885-
ConcatOps[0] = LdOp;
7886-
for (unsigned i = 1; i != NumConcat; ++i)
7887-
ConcatOps[i] = UndefVal;
7888-
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
7954+
return loadElement(LdOp, *FirstVT, WidenVT, LdWidth, FirstVTWidth, dl, DAG);
78897955
}
78907956

78917957
// Load vector by using multiple loads from largest vector to scalar.

llvm/lib/Target/X86/X86InstrCompiler.td

+7
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,13 @@ def : Pat<(i16 (atomic_load_16 addr:$src)), (MOV16rm addr:$src)>;
12001200
def : Pat<(i32 (atomic_load_32 addr:$src)), (MOV32rm addr:$src)>;
12011201
def : Pat<(i64 (atomic_load_64 addr:$src)), (MOV64rm addr:$src)>;
12021202

1203+
def : Pat<(v4i32 (scalar_to_vector (i32 (anyext (i16 (atomic_load_16 addr:$src)))))),
1204+
(MOVDI2PDIrm addr:$src)>; // load atomic <2 x i8>
1205+
def : Pat<(v4i32 (scalar_to_vector (i32 (atomic_load_32 addr:$src)))),
1206+
(MOVDI2PDIrm addr:$src)>; // load atomic <2 x i16>
1207+
def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))),
1208+
(MOV64toPQIrm addr:$src)>; // load atomic <2 x i32,float>
1209+
12031210
// Floating point loads/stores.
12041211
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
12051212
(MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>;

llvm/test/CodeGen/X86/atomic-load-store.ll

+81
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,64 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
146146
ret <1 x i64> %ret
147147
}
148148

149+
define <2 x i8> @atomic_vec2_i8(ptr %x) {
150+
; CHECK3-LABEL: atomic_vec2_i8:
151+
; CHECK3: ## %bb.0:
152+
; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
153+
; CHECK3-NEXT: retq
154+
;
155+
; CHECK0-LABEL: atomic_vec2_i8:
156+
; CHECK0: ## %bb.0:
157+
; CHECK0-NEXT: movw (%rdi), %cx
158+
; CHECK0-NEXT: ## implicit-def: $eax
159+
; CHECK0-NEXT: movw %cx, %ax
160+
; CHECK0-NEXT: movd %eax, %xmm0
161+
; CHECK0-NEXT: retq
162+
%ret = load atomic <2 x i8>, ptr %x acquire, align 4
163+
ret <2 x i8> %ret
164+
}
165+
166+
define <2 x i16> @atomic_vec2_i16(ptr %x) {
167+
; CHECK3-LABEL: atomic_vec2_i16:
168+
; CHECK3: ## %bb.0:
169+
; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
170+
; CHECK3-NEXT: retq
171+
;
172+
; CHECK0-LABEL: atomic_vec2_i16:
173+
; CHECK0: ## %bb.0:
174+
; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
175+
; CHECK0-NEXT: retq
176+
%ret = load atomic <2 x i16>, ptr %x acquire, align 4
177+
ret <2 x i16> %ret
178+
}
179+
180+
define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) {
181+
; CHECK-LABEL: atomic_vec2_ptr270:
182+
; CHECK: ## %bb.0:
183+
; CHECK-NEXT: movq (%rdi), %xmm0
184+
; CHECK-NEXT: retq
185+
%ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8
186+
ret <2 x ptr addrspace(270)> %ret
187+
}
188+
189+
define <2 x i32> @atomic_vec2_i32_align(ptr %x) {
190+
; CHECK-LABEL: atomic_vec2_i32_align:
191+
; CHECK: ## %bb.0:
192+
; CHECK-NEXT: movq (%rdi), %xmm0
193+
; CHECK-NEXT: retq
194+
%ret = load atomic <2 x i32>, ptr %x acquire, align 8
195+
ret <2 x i32> %ret
196+
}
197+
198+
define <2 x float> @atomic_vec2_float_align(ptr %x) {
199+
; CHECK-LABEL: atomic_vec2_float_align:
200+
; CHECK: ## %bb.0:
201+
; CHECK-NEXT: movq (%rdi), %xmm0
202+
; CHECK-NEXT: retq
203+
%ret = load atomic <2 x float>, ptr %x acquire, align 8
204+
ret <2 x float> %ret
205+
}
206+
149207
define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind {
150208
; CHECK3-LABEL: atomic_vec1_ptr:
151209
; CHECK3: ## %bb.0:
@@ -295,6 +353,29 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind {
295353
ret <2 x i32> %ret
296354
}
297355

356+
define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind {
357+
; CHECK3-LABEL: atomic_vec4_i8:
358+
; CHECK3: ## %bb.0:
359+
; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
360+
; CHECK3-NEXT: retq
361+
;
362+
; CHECK0-LABEL: atomic_vec4_i8:
363+
; CHECK0: ## %bb.0:
364+
; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
365+
; CHECK0-NEXT: retq
366+
%ret = load atomic <4 x i8>, ptr %x acquire, align 4
367+
ret <4 x i8> %ret
368+
}
369+
370+
define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
371+
; CHECK-LABEL: atomic_vec4_i16:
372+
; CHECK: ## %bb.0:
373+
; CHECK-NEXT: movq (%rdi), %xmm0
374+
; CHECK-NEXT: retq
375+
%ret = load atomic <4 x i16>, ptr %x acquire, align 8
376+
ret <4 x i16> %ret
377+
}
378+
298379
define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind {
299380
; CHECK-LABEL: atomic_vec4_float_align:
300381
; CHECK: ## %bb.0:

llvm/test/CodeGen/X86/atomic-unordered.ll

+1-2
Original file line numberDiff line numberDiff line change
@@ -2275,8 +2275,7 @@ define i64 @load_i16_anyext_i64(ptr %ptr) {
22752275
;
22762276
; CHECK-O3-LABEL: load_i16_anyext_i64:
22772277
; CHECK-O3: # %bb.0:
2278-
; CHECK-O3-NEXT: movzwl (%rdi), %eax
2279-
; CHECK-O3-NEXT: vmovd %eax, %xmm0
2278+
; CHECK-O3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
22802279
; CHECK-O3-NEXT: vmovq %xmm0, %rax
22812280
; CHECK-O3-NEXT: retq
22822281
%v = load atomic i16, ptr %ptr unordered, align 8

0 commit comments

Comments
 (0)