Skip to content

Commit 0e3399b

Browse files
committed
[X86] Add avx512 scatter intrinsics that use a vXi1 mask instead of a scalar integer.
We're trying to have the vXi1 types in IR as much as possible. This prevents the need for bitcasts when the producer of the mask was already a vXi1 value, such as an icmp. The bitcasts can be subject to code motion and interfere with basic-block-at-a-time isel in bad ways.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351275 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 844041a commit 0e3399b

File tree

4 files changed

+257
-95
lines changed

4 files changed

+257
-95
lines changed

include/llvm/IR/IntrinsicsX86.td

+118 -1
Original file line number | Diff line number | Diff line change
@@ -3569,6 +3569,7 @@ let TargetPrefix = "x86" in {
35693569

35703570
// Gather and Scatter ops
35713571
let TargetPrefix = "x86" in {
3572+
// NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
35723573
def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
35733574
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
35743575
llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
@@ -3701,6 +3702,7 @@ let TargetPrefix = "x86" in {
37013702
[IntrReadMem, IntrArgMemOnly]>;
37023703

37033704
// scatter
3705+
// NOTE: These are deprecated in favor of the versions that take a vXi1 mask.
37043706
def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
37053707
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
37063708
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
@@ -3861,7 +3863,7 @@ let TargetPrefix = "x86" in {
38613863
llvm_i32_ty, llvm_i32_ty], [IntrArgMemOnly]>;
38623864
}
38633865

3864-
// AVX512 gather intrinsics that use vXi1 masks.
3866+
// AVX512 gather/scatter intrinsics that use vXi1 masks.
38653867
let TargetPrefix = "x86" in {
38663868
def int_x86_avx512_mask_gather_dpd_512 :
38673869
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
@@ -3977,6 +3979,121 @@ let TargetPrefix = "x86" in {
39773979
Intrinsic<[llvm_v8i32_ty],
39783980
[llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty],
39793981
[IntrReadMem, IntrArgMemOnly]>;
3982+
3983+
def int_x86_avx512_mask_scatter_dpd_512 :
3984+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
3985+
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
3986+
[IntrArgMemOnly]>;
3987+
def int_x86_avx512_mask_scatter_dps_512 :
3988+
Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
3989+
llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
3990+
[IntrArgMemOnly]>;
3991+
def int_x86_avx512_mask_scatter_qpd_512 :
3992+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
3993+
llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
3994+
[IntrArgMemOnly]>;
3995+
def int_x86_avx512_mask_scatter_qps_512 :
3996+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
3997+
llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
3998+
[IntrArgMemOnly]>;
3999+
4000+
4001+
def int_x86_avx512_mask_scatter_dpq_512 :
4002+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,
4003+
llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
4004+
[IntrArgMemOnly]>;
4005+
def int_x86_avx512_mask_scatter_dpi_512 :
4006+
Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty,
4007+
llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
4008+
[IntrArgMemOnly]>;
4009+
def int_x86_avx512_mask_scatter_qpq_512 :
4010+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,llvm_v8i64_ty, llvm_v8i64_ty,
4011+
llvm_i32_ty],
4012+
[IntrArgMemOnly]>;
4013+
def int_x86_avx512_mask_scatter_qpi_512 :
4014+
Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty,
4015+
llvm_i32_ty],
4016+
[IntrArgMemOnly]>;
4017+
4018+
def int_x86_avx512_mask_scatterdiv2_df :
4019+
Intrinsic<[],
4020+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
4021+
[IntrArgMemOnly]>;
4022+
4023+
def int_x86_avx512_mask_scatterdiv2_di :
4024+
Intrinsic<[],
4025+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
4026+
[IntrArgMemOnly]>;
4027+
4028+
def int_x86_avx512_mask_scatterdiv4_df :
4029+
Intrinsic<[],
4030+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
4031+
[IntrArgMemOnly]>;
4032+
4033+
def int_x86_avx512_mask_scatterdiv4_di :
4034+
Intrinsic<[],
4035+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
4036+
[IntrArgMemOnly]>;
4037+
4038+
def int_x86_avx512_mask_scatterdiv4_sf :
4039+
Intrinsic<[],
4040+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty],
4041+
[IntrArgMemOnly]>;
4042+
4043+
def int_x86_avx512_mask_scatterdiv4_si :
4044+
Intrinsic<[],
4045+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty],
4046+
[IntrArgMemOnly]>;
4047+
4048+
def int_x86_avx512_mask_scatterdiv8_sf :
4049+
Intrinsic<[],
4050+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty],
4051+
[IntrArgMemOnly]>;
4052+
4053+
def int_x86_avx512_mask_scatterdiv8_si :
4054+
Intrinsic<[],
4055+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty],
4056+
[IntrArgMemOnly]>;
4057+
4058+
def int_x86_avx512_mask_scattersiv2_df :
4059+
Intrinsic<[],
4060+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty],
4061+
[IntrArgMemOnly]>;
4062+
4063+
def int_x86_avx512_mask_scattersiv2_di :
4064+
Intrinsic<[],
4065+
[llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty],
4066+
[IntrArgMemOnly]>;
4067+
4068+
def int_x86_avx512_mask_scattersiv4_df :
4069+
Intrinsic<[],
4070+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty],
4071+
[IntrArgMemOnly]>;
4072+
4073+
def int_x86_avx512_mask_scattersiv4_di :
4074+
Intrinsic<[],
4075+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty],
4076+
[IntrArgMemOnly]>;
4077+
4078+
def int_x86_avx512_mask_scattersiv4_sf :
4079+
Intrinsic<[],
4080+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty],
4081+
[IntrArgMemOnly]>;
4082+
4083+
def int_x86_avx512_mask_scattersiv4_si :
4084+
Intrinsic<[],
4085+
[llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
4086+
[IntrArgMemOnly]>;
4087+
4088+
def int_x86_avx512_mask_scattersiv8_sf :
4089+
Intrinsic<[],
4090+
[llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty],
4091+
[IntrArgMemOnly]>;
4092+
4093+
def int_x86_avx512_mask_scattersiv8_si :
4094+
Intrinsic<[],
4095+
[llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
4096+
[IntrArgMemOnly]>;
39804097
}
39814098

39824099
// AVX-512 conflict detection instruction

lib/Target/X86/X86ISelLowering.cpp

+6 -2
Original file line number | Diff line number | Diff line change
@@ -22361,9 +22361,13 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
2236122361
Src.getSimpleValueType().getVectorNumElements());
2236222362
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
2236322363

22364-
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22364+
// We support two versions of the scatter intrinsics. One with scalar mask and
22365+
// one with vXi1 mask. Convert scalar to vXi1 if necessary.
22366+
if (Mask.getValueType() != MaskVT)
22367+
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
22368+
2236522369
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
22366-
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
22370+
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain};
2236722371
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
2236822372
return SDValue(Res, 1);
2236922373
}

lib/Target/X86/X86IntrinsicsInfo.h

+25
Original file line number | Diff line number | Diff line change
@@ -249,6 +249,31 @@ static const IntrinsicData IntrinsicsWithChain[] = {
249249
X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
250250
X86ISD::VTRUNCUS, 0),
251251

252+
X86_INTRINSIC_DATA(avx512_mask_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
253+
X86_INTRINSIC_DATA(avx512_mask_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
254+
X86_INTRINSIC_DATA(avx512_mask_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
255+
X86_INTRINSIC_DATA(avx512_mask_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
256+
X86_INTRINSIC_DATA(avx512_mask_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
257+
X86_INTRINSIC_DATA(avx512_mask_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
258+
X86_INTRINSIC_DATA(avx512_mask_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
259+
X86_INTRINSIC_DATA(avx512_mask_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
260+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
261+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
262+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
263+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
264+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
265+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
266+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
267+
X86_INTRINSIC_DATA(avx512_mask_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
268+
X86_INTRINSIC_DATA(avx512_mask_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
269+
X86_INTRINSIC_DATA(avx512_mask_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
270+
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
271+
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
272+
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
273+
X86_INTRINSIC_DATA(avx512_mask_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
274+
X86_INTRINSIC_DATA(avx512_mask_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
275+
X86_INTRINSIC_DATA(avx512_mask_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
276+
252277
X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
253278
X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
254279
X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),

0 commit comments

Comments
 (0)