diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 78ed00c13041cf..0699e89560abd1 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18151,9 +18151,9 @@ bool GenTree::canBeContained() const { return false; } - else if (OperIsHWIntrinsic() && !isContainableHWIntrinsic()) + else if (OperIsHWIntrinsic()) { - return isEmbeddedMaskingCompatibleHWIntrinsic(); + return isContainableHWIntrinsic(); } return true; @@ -20334,10 +20334,11 @@ bool GenTree::isCommutativeHWIntrinsic() const bool GenTree::isContainableHWIntrinsic() const { - assert(OperIs(GT_HWINTRINSIC)); + const GenTreeHWIntrinsic* node = AsHWIntrinsic(); + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); #ifdef TARGET_XARCH - switch (AsHWIntrinsic()->GetHWIntrinsicId()) + switch (intrinsic) { case NI_X86Base_LoadAlignedVector128: case NI_X86Base_LoadScalarVector128: @@ -20353,7 +20354,7 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX512_ConvertToVector256Int32: case NI_AVX512_ConvertToVector256UInt32: { - if (varTypeIsFloating(AsHWIntrinsic()->GetSimdBaseType())) + if (varTypeIsFloating(node->GetSimdBaseType())) { return false; } @@ -20428,24 +20429,24 @@ bool GenTree::isContainableHWIntrinsic() const default: { - return false; + return isEmbeddedMaskingCompatible(); } } #elif defined(TARGET_ARM64) - return (AsHWIntrinsic()->GetHWIntrinsicId() == NI_Sve_ConditionalSelect); + return (intrinsic == NI_Sve_ConditionalSelect) || isEmbeddedMaskingCompatible(); #else return false; #endif // TARGET_XARCH } -bool GenTree::isRMWHWIntrinsic(Compiler* comp) +bool GenTree::isRMWHWIntrinsic(Compiler* comp) const { assert(OperIs(GT_HWINTRINSIC)); assert(comp != nullptr); #if defined(TARGET_XARCH) - GenTreeHWIntrinsic* hwintrinsic = AsHWIntrinsic(); - NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); + const GenTreeHWIntrinsic* hwintrinsic = AsHWIntrinsic(); + NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); if (!comp->canUseVexEncoding()) { @@ -20638,19 +20639,21 @@ bool GenTree::isEmbeddedBroadcastCompatibleHWIntrinsic(Compiler* comp) const #endif // TARGET_XARCH //------------------------------------------------------------------------ -// isEmbeddedMaskingCompatibleHWIntrinsic : Checks if the intrinsic is compatible +// isEmbeddedMaskingCompatible: Checks if the node is a hwintrinsic compatible // with the EVEX embedded masking form for its intended lowering instruction. 
 //
 // Return Value:
-//    true if the intrinsic node lowering instruction has a EVEX embedded masking support
+//    true if the node's lowering instruction has EVEX embedded masking support
 //
-bool GenTree::isEmbeddedMaskingCompatibleHWIntrinsic() const
+bool GenTree::isEmbeddedMaskingCompatible() const
 {
     if (OperIsHWIntrinsic())
     {
-        NamedIntrinsic intrinsicId = AsHWIntrinsic()->GetHWIntrinsicId();
+        const GenTreeHWIntrinsic* node      = AsHWIntrinsic();
+        NamedIntrinsic            intrinsic = node->GetHWIntrinsicId();
+
 #if defined(TARGET_XARCH)
-        var_types simdBaseType = AsHWIntrinsic()->GetSimdBaseType();
+        var_types simdBaseType = node->GetSimdBaseType();
 
         if (simdBaseType == TYP_UNKNOWN)
         {
@@ -20658,23 +20661,220 @@ bool GenTree::isEmbeddedMaskingCompatibleHWIntrinsic() const
             return false;
         }
 
-        if (AsHWIntrinsic()->OperIsMemoryLoadOrStore())
+        HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
+
+        if (!HWIntrinsicInfo::genIsTableDrivenHWIntrinsic(intrinsic, category))
+        {
+            // TODO-AVX512-CQ: Codegen is currently limited to only handling embedded
+            // masking for table driven intrinsics. This can be relaxed once that is fixed.
+            return false;
+        }
+
+        if (node->OperIsMemoryLoadOrStore())
         {
             // Direct loads and stores cannot be embedded masking compatible
             // as they may suppress faults that should otherwise be raised
             return false;
         }
 
-        instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, simdBaseType, nullptr);
+        instruction ins = HWIntrinsicInfo::lookupIns(intrinsic, simdBaseType, nullptr);
         return CodeGenInterface::instIsEmbeddedMaskingCompatible(ins);
 #elif defined(TARGET_ARM64)
-        return HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrinsicId) ||
-               HWIntrinsicInfo::IsOptionalEmbeddedMaskedOperation(intrinsicId);
+        return HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrinsic) ||
+               HWIntrinsicInfo::IsOptionalEmbeddedMaskedOperation(intrinsic);
 #endif
     }
 
     return false;
 }
 
+#if defined(TARGET_XARCH)
+//------------------------------------------------------------------------
+// isEmbeddedMaskingCompatible: Checks if the node is a hwintrinsic compatible
+//    with the EVEX embedded masking form for its intended lowering instruction.
+//
+// Arguments:
+//    comp               - The compiler
+//    tgtMaskSize        - The mask size to check compatibility against
+//    tgtSimdBaseJitType - [out] The simd base jit type the node should switch to for
+//                         compatibility, or CORINFO_TYPE_UNDEF if no change is needed
+//
+// Return Value:
+//    true if the node's lowering instruction has EVEX embedded masking support that is
+//    compatible with tgtMaskSize
+//
+bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, CorInfoType& tgtSimdBaseJitType) const
+{
+    if (!isEmbeddedMaskingCompatible())
+    {
+        return false;
+    }
+
+    if (comp->opts.MinOpts())
+    {
+        return false;
+    }
+
+    if (!comp->canUseEmbeddedMasking())
+    {
+        return false;
+    }
+
+    if (isRMWHWIntrinsic(comp))
+    {
+        // TODO-AVX512-CQ: Ensure we can support embedded operations on RMW intrinsics
+        return false;
+    }
+
+    const GenTreeHWIntrinsic* node            = AsHWIntrinsic();
+    NamedIntrinsic            intrinsic       = node->GetHWIntrinsicId();
+    CorInfoType               simdBaseJitType = node->GetSimdBaseJitType();
+    var_types                 simdBaseType    = node->GetSimdBaseType();
+    var_types                 simdType        = node->TypeGet();
+
+    instruction ins             = HWIntrinsicInfo::lookupIns(intrinsic, simdBaseType, comp);
+    unsigned    maskBaseSize    = CodeGenInterface::instKMaskBaseSize(ins);
+    unsigned    tgtMaskBaseSize = tgtMaskSize / (genTypeSize(simdType) / 16);
+
+    tgtSimdBaseJitType = CORINFO_TYPE_UNDEF;
+
+    if (maskBaseSize != tgtMaskBaseSize)
+    {
+        // Some intrinsics are effectively bitwise operations and so we
+        // can freely update them to match the size of the actual mask
+
+        bool supportsMaskBaseSize4Or8 = false;
+
+        switch (ins)
+        {
+            case INS_andpd:
+            case INS_andps:
+            case INS_andnpd:
+            case INS_andnps:
+            case INS_orpd:
+            case INS_orps:
+            case INS_pandd:
+            case INS_pandnd:
+            case INS_pord:
+            case INS_pxord:
+            case INS_vpandq:
+            case INS_vpandnq:
+            case INS_vporq:
+            case INS_vpxorq:
+            case INS_vshuff32x4:
+            case INS_vshuff64x2:
+            case INS_vshufi32x4:
+            case INS_vshufi64x2:
+            case INS_xorpd:
+            case INS_xorps:
+            {
+                // These intrinsics support embedded broadcast and have masking support for 4 or 8
+                assert((maskBaseSize == 4) || (maskBaseSize == 8));
+
+                if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, node->Op(2)))
+                {
+                    // We cannot change the base type if we've already contained a broadcast
+                    supportsMaskBaseSize4Or8 = true;
+                }
+                break;
+            }
+
+            case INS_vpternlogd:
+            case INS_vpternlogq:
+            {
+                // These intrinsics support embedded broadcast and have masking support for 4 or 8
+                assert((maskBaseSize == 4) || (maskBaseSize == 8));
+
+                if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, node->Op(3)))
+                {
+                    // We cannot change the base type if we've already contained a broadcast
+                    supportsMaskBaseSize4Or8 = true;
+                }
+                break;
+            }
+
+            case INS_vbroadcastf32x4:
+            case INS_vbroadcastf32x8:
+            case INS_vbroadcastf64x2:
+            case INS_vbroadcastf64x4:
+            case INS_vbroadcasti32x4:
+            case INS_vbroadcasti32x8:
+            case INS_vbroadcasti64x2:
+            case INS_vbroadcasti64x4:
+            case INS_vextractf32x4:
+            case INS_vextractf32x8:
+            case INS_vextractf64x2:
+            case INS_vextractf64x4:
+            case INS_vextracti32x4:
+            case INS_vextracti32x8:
+            case INS_vextracti64x2:
+            case INS_vextracti64x4:
+            case INS_vinsertf32x4:
+            case INS_vinsertf32x8:
+            case INS_vinsertf64x2:
+            case INS_vinsertf64x4:
+            case INS_vinserti32x4:
+            case INS_vinserti32x8:
+            case INS_vinserti64x2:
+            case INS_vinserti64x4:
+            {
+                // These intrinsics don't support embedded broadcast and have masking support for 4 or 8
+                assert((maskBaseSize == 4) || (maskBaseSize == 8));
+                supportsMaskBaseSize4Or8 = true;
+                break;
+            }
+
+            default:
+            {
+                break;
+            }
+        }
+
+        if (supportsMaskBaseSize4Or8)
+        {
+            if (tgtMaskBaseSize == 8)
+            {
+                if (varTypeIsFloating(simdBaseType))
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_DOUBLE;
+                }
+                else if (varTypeIsSigned(simdBaseType))
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_LONG;
+                }
+                else
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_ULONG;
+                }
+            }
+            else if (tgtMaskBaseSize == 4)
+            {
+                if (varTypeIsFloating(simdBaseType))
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_FLOAT;
+                }
+                else if (varTypeIsSigned(simdBaseType))
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_INT;
+                }
+                else
+                {
+                    tgtSimdBaseJitType = CORINFO_TYPE_UINT;
+                }
+            }
+        }
+    }
+
+    if (tgtSimdBaseJitType != CORINFO_TYPE_UNDEF)
+    {
+        // Requery the instruction and mask base size using the updated base type
+        simdBaseJitType = tgtSimdBaseJitType;
+        simdBaseType    = JitType2PreciseVarType(simdBaseJitType);
+
+        ins          = HWIntrinsicInfo::lookupIns(intrinsic, simdBaseType, comp);
+        maskBaseSize = CodeGenInterface::instKMaskBaseSize(ins);
+    }
+
+    unsigned maskSize = maskBaseSize * (genTypeSize(simdType) / 16);
+    assert(maskSize != 0);
+
+    return maskSize == tgtMaskSize;
+}
+#endif // TARGET_XARCH
+
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types      type,
                                                        NamedIntrinsic hwIntrinsicID,
                                                        CorInfoType    simdBaseJitType,
@@ -30872,47 +31072,9 @@ var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp(
     var_types lookupType = type;
 
 #if defined(TARGET_XARCH)
-    if (reverseCond)
-    {
-        oper = ReverseRelop(oper);
-    }
-
-    switch (oper)
+    if ((simdSize == 64) || comp->canUseEvexEncoding())
     {
-        case GT_EQ:
-        {
-            if (simdSize == 64)
-            {
-                lookupType = TYP_MASK;
-            }
-            break;
-        }
-
-        case GT_GE:
-        case GT_LE:
-        case GT_NE:
-        {
-            if ((simdSize == 64) || (varTypeIsIntegral(simdBaseType) && comp->canUseEvexEncoding()))
-            {
-                lookupType = TYP_MASK;
-            }
-            break;
-        }
-
-        case GT_GT:
-        case GT_LT:
-        {
-            if ((simdSize == 64) || (varTypeIsUnsigned(simdBaseType) && comp->canUseEvexEncoding()))
-            {
-                lookupType = TYP_MASK;
-            }
-            break;
-        }
-
-        default:
-        {
-            unreached();
-        }
+        lookupType = TYP_MASK;
     }
 #endif // TARGET_XARCH
 
@@ -33037,6 +33199,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
             case GT_DIV:
             {
+                assert(!varTypeIsMask(retType));
+
                 if (varTypeIsFloating(simdBaseType))
                 {
                     // Handle `x / NaN == NaN` and `NaN / x == NaN`
@@ -33195,6 +33359,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
             case GT_MUL:
            {
+                assert(!varTypeIsMask(retType));
+
                 if (!varTypeIsFloating(simdBaseType))
                 {
                     // Handle `x * 0 == 0` and `0 * x == 0`
@@ -33339,6 +33505,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
             case GT_SUB:
             {
+                assert(!varTypeIsMask(retType));
+
                 if (varTypeIsFloating(simdBaseType))
                 {
                     // Handle `x - NaN == NaN` and `NaN - x == NaN`
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 921a34fa057fbb..b4e4b86261ad71 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -1483,12 +1483,13 @@ struct GenTree
 #ifdef FEATURE_HW_INTRINSICS
     bool isCommutativeHWIntrinsic() const;
     bool isContainableHWIntrinsic() const;
-    bool isRMWHWIntrinsic(Compiler* comp);
+    bool isRMWHWIntrinsic(Compiler* comp) const;
 #if defined(TARGET_XARCH)
     bool isEvexCompatibleHWIntrinsic(Compiler* comp) const;
     bool isEmbeddedBroadcastCompatibleHWIntrinsic(Compiler* comp) const;
+    bool isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, CorInfoType& tgtSimdBaseJitType) const;
 #endif // TARGET_XARCH
-    bool isEmbeddedMaskingCompatibleHWIntrinsic() const;
+    bool isEmbeddedMaskingCompatible() const;
 #else
     bool isCommutativeHWIntrinsic() const
     {
@@ -1500,7 +1501,7 @@ struct GenTree
         return false;
     }
 
-    bool isRMWHWIntrinsic(Compiler* comp)
+    bool isRMWHWIntrinsic(Compiler* comp) const
    {
         return false;
     }
@@ -1517,7 +1518,7 @@ struct GenTree
     }
 #endif // TARGET_XARCH
 
-    bool
isEmbeddedMaskingCompatibleHWIntrinsic() const + bool isEmbeddedMaskingCompatible() const { return false; } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 39223ca652ecf1..22c0ca2ed1b189 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -429,17 +429,17 @@ HARDWARE_INTRINSIC(X86Base, AndNot, HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, 
CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(X86Base, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(X86Base, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(X86Base, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) @@ -464,7 +464,7 @@ HARDWARE_INTRINSIC(X86Base, CompareScalarUnorderedGreaterThanOrEqual, HARDWARE_INTRINSIC(X86Base, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(X86Base, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(X86Base, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(X86Base, ConvertScalarToVector128Double, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2sd, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(X86Base, ConvertScalarToVector128Int32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(X86Base, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2ss}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) @@ -570,12 +570,12 @@ HARDWARE_INTRINSIC(SSE42, Abs, HARDWARE_INTRINSIC(SSE42, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) 
HARDWARE_INTRINSIC(SSE42, AlignRight, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(SSE42, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(SSE42, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(SSE42, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE42, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE42, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE42, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int32, 16, 1, 
{INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad) @@ -645,25 +645,25 @@ HARDWARE_INTRINSIC(AVX, AddSubtract, HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf32x4, INS_vbroadcastf32x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, Compare, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) 
-HARDWARE_INTRINSIC(AVX, CompareGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareNotEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareNotGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareNotGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareNotLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, 
CompareLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareNotEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareNotGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareNotGreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareNotLessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) @@ -729,13 +729,13 @@ HARDWARE_INTRINSIC(AVX2, AndNot, 
HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, BitFieldExtract, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt) +HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x4, INS_vbroadcasti32x4, INS_vbroadcasti32x4, INS_vbroadcasti32x4, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NormalizeSmallTypeToInt) -HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) -HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad) @@ -843,7 +843,7 @@ HARDWARE_INTRINSIC(AVX512, AlignRight64, HARDWARE_INTRINSIC(AVX512, And, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512, AndNot, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_vpandnq, INS_vpandnq, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt) HARDWARE_INTRINSIC(AVX512, Average, 64, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX512, BlendVariable, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, BlendVariable, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, BroadcastPairScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512, BroadcastPairScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512, BroadcastPairScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x2, INS_vbroadcasti32x2, INS_invalid, INS_invalid, INS_vbroadcastf32x2, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -852,19 +852,19 @@ HARDWARE_INTRINSIC(AVX512, BroadcastVector128ToVector512, HARDWARE_INTRINSIC(AVX512, BroadcastVector256ToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcasti32x8, INS_vbroadcasti32x8, INS_vbroadcasti64x4, INS_vbroadcasti64x4, INS_vbroadcastf32x8, INS_vbroadcastf64x4}, HW_Category_MemoryLoad, HW_Flag_NoFlag) 
HARDWARE_INTRINSIC(AVX512, Classify, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512, ClassifyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId|HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX512, Compare, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, Compare, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompareGreaterThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompareGreaterThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompareLessThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompareLessThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompareNotEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareNotLessThan, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqual, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareOrdered, 64, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) -HARDWARE_INTRINSIC(AVX512, CompareUnordered, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareNotGreaterThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareNotLessThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareNotLessThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareOrdered, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(AVX512, CompareUnordered, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, Compress, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_InvalidNodeId) HARDWARE_INTRINSIC(AVX512, CompressStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVX512, ConvertScalarToVector128Double, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2sd32, INS_vcvtusi2sd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index ae753ee165cd30..c7d880d553d9aa 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2371,7 +2371,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); - if ((simdSize == 64) || (varTypeIsShort(simdBaseType) && canUseEvexEncoding())) + if ((simdSize == 64) || canUseEvexEncoding()) { intrinsic = NI_AVX512_MoveMask; } @@ -4936,6 +4936,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_SSE42_BlendVariable: + case NI_AVX_BlendVariable: + case NI_AVX2_BlendVariable: case NI_AVX512_BlendVariable: { assert(sig->numArgs == 3); @@ -4944,9 +4947,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op3 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op3, simdBaseJitType, 
simdSize); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, NI_AVX512_BlendVariableMask, simdBaseJitType, - simdSize); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_BlendVariableMask; + op3 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op3, simdBaseJitType, simdSize); + } + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); break; } @@ -4963,26 +4969,29 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { intrinsic = NI_AVX512_ClassifyScalarMask; } + retType = TYP_MASK; op2 = impPopStack().val; op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, intrinsic, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } case NI_AVX_Compare: - case NI_AVX_CompareScalar: case NI_AVX512_Compare: { - assert(sig->numArgs == 3); - - if (intrinsic == NI_AVX512_Compare) + if ((simdSize == 64) || canUseEvexEncoding()) { intrinsic = NI_AVX512_CompareMask; retType = TYP_MASK; } + FALLTHROUGH; + } + + case NI_AVX_CompareScalar: + { + assert(sig->numArgs == 3); int immLowerBound = 0; int immUpperBound = HWIntrinsicInfo::lookupImmUpperBound(intrinsic); @@ -5013,168 +5022,240 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); } - - if (retType == TYP_MASK) - { - retType = getSIMDTypeForSize(simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); - } break; } + case NI_X86Base_CompareEqual: + case NI_SSE42_CompareEqual: + case NI_AVX_CompareEqual: + case NI_AVX2_CompareEqual: case NI_AVX512_CompareEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareEqualMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareGreaterThan: + case NI_SSE42_CompareGreaterThan: + case NI_AVX_CompareGreaterThan: + case NI_AVX2_CompareGreaterThan: case NI_AVX512_CompareGreaterThan: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareGreaterThanMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareGreaterThanMask, simdBaseJitType, - simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareGreaterThanOrEqual: + case NI_AVX_CompareGreaterThanOrEqual: case NI_AVX512_CompareGreaterThanOrEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareGreaterThanOrEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareGreaterThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = 
gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareLessThan: + case NI_SSE42_CompareLessThan: + case NI_AVX_CompareLessThan: + case NI_AVX2_CompareLessThan: case NI_AVX512_CompareLessThan: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareLessThanMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareLessThanMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareLessThanOrEqual: + case NI_AVX_CompareLessThanOrEqual: case NI_AVX512_CompareLessThanOrEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareLessThanOrEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareLessThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareNotEqual: + case NI_AVX_CompareNotEqual: case NI_AVX512_CompareNotEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareNotEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareNotEqualMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareNotGreaterThan: + case NI_AVX_CompareNotGreaterThan: case NI_AVX512_CompareNotGreaterThan: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareNotGreaterThanMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareNotGreaterThanMask, simdBaseJitType, - simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareNotGreaterThanOrEqual: + case NI_AVX_CompareNotGreaterThanOrEqual: case NI_AVX512_CompareNotGreaterThanOrEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareNotGreaterThanOrEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareNotGreaterThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareNotLessThan: + case NI_AVX_CompareNotLessThan: case NI_AVX512_CompareNotLessThan: { 
assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareNotLessThanMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareNotLessThanMask, simdBaseJitType, - simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareNotLessThanOrEqual: + case NI_AVX_CompareNotLessThanOrEqual: case NI_AVX512_CompareNotLessThanOrEqual: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareNotLessThanOrEqualMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareNotLessThanOrEqualMask, - simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareOrdered: + case NI_AVX_CompareOrdered: case NI_AVX512_CompareOrdered: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareOrderedMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareOrderedMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } + case NI_X86Base_CompareUnordered: + case NI_AVX_CompareUnordered: case NI_AVX512_CompareUnordered: { assert(sig->numArgs == 2); + if ((simdSize == 64) || canUseEvexEncoding()) + { + intrinsic = NI_AVX512_CompareUnorderedMask; + retType = TYP_MASK; + } + op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = - gtNewSimdHWIntrinsicNode(TYP_MASK, op1, op2, NI_AVX512_CompareUnorderedMask, simdBaseJitType, simdSize); - retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize); break; } @@ -5374,7 +5455,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } } - if (isMinMaxIntrinsic) + if (retType == TYP_MASK) + { + retType = getSIMDTypeForSize(simdSize); + assert(retType == getSIMDTypeForSize(getSIMDTypeSizeInBytes(sig->retTypeSigClass))); + retNode = gtNewSimdCvtMaskToVectorNode(retType, retNode, simdBaseJitType, simdSize); + } + else if (isMinMaxIntrinsic) { assert(sig->numArgs == 2); assert(retNode == nullptr); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 8fe43d05484567..ad3d9e24080861 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -140,15 +140,20 @@ bool Lowering::CheckImmedAndMakeContained(GenTree* parentNode, GenTree* childNod // computation changing values? // // Arguments: -// node - The node. -// endExclusive - The exclusive end of the range to check invariance for. +// node - The node. +// endExclusive - The exclusive end of the range to check invariance for. +// comp - The compiler +// scratchSideEffects - A SideEffectSet used for interference checks. 
// // Returns: // True if 'node' can be evaluated at any point between its current // location and 'endExclusive' without giving a different result; otherwise // false. // -bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive) const +bool Lowering::IsInvariantInRange(GenTree* node, + GenTree* endExclusive, + Compiler* comp, + SideEffectSet& scratchSideEffects) { assert((node != nullptr) && (endExclusive != nullptr)); @@ -164,14 +169,14 @@ bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive) const return false; } - m_scratchSideEffects.Clear(); - m_scratchSideEffects.AddNode(comp, node); + scratchSideEffects.Clear(); + scratchSideEffects.AddNode(comp, node); for (GenTree* cur = node->gtNext; cur != endExclusive; cur = cur->gtNext) { assert((cur != nullptr) && "Expected first node to precede end node"); const bool strict = true; - if (m_scratchSideEffects.InterferesWith(comp, cur, strict)) + if (scratchSideEffects.InterferesWith(comp, cur, strict)) { return false; } @@ -180,6 +185,25 @@ bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive) const return true; } +//------------------------------------------------------------------------ +// IsInvariantInRange: Check if a node is invariant in the specified range. In +// other words, can 'node' be moved to right before 'endExclusive' without its +// computation changing values? +// +// Arguments: +// node - The node. +// endExclusive - The exclusive end of the range to check invariance for. +// +// Returns: +// True if 'node' can be evaluated at any point between its current +// location and 'endExclusive' without giving a different result; otherwise +// false. +// +bool Lowering::IsInvariantInRange(GenTree* node, GenTree* endExclusive) const +{ + return IsInvariantInRange(node, endExclusive, comp, m_scratchSideEffects); +} + //------------------------------------------------------------------------ // IsInvariantInRange: Check if a node is invariant in the specified range, // ignoring conflicts with one particular node. diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 49d74503e0593b..fa5dfbabc1dca0 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -21,6 +21,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX class Lowering final : public Phase { + friend class Rationalizer; + public: inline Lowering(Compiler* compiler, LinearScanInterface* lsra) : Phase(compiler, PHASE_LOWERING) @@ -551,12 +553,16 @@ class Lowering final : public Phase // Checks and makes 'childNode' contained in the 'parentNode' bool CheckImmedAndMakeContained(GenTree* parentNode, GenTree* childNode); - bool IsInvariantInRange(GenTree* node, GenTree* endExclusive) const; - bool IsInvariantInRange(GenTree* node, GenTree* endExclusive, GenTree* ignoreNode) const; - bool IsRangeInvariantInRange(GenTree* rangeStart, - GenTree* rangeEnd, - GenTree* endExclusive, - GenTree* ignoreNode) const; + static bool IsInvariantInRange(GenTree* node, + GenTree* endExclusive, + Compiler* comp, + SideEffectSet& scratchSideEffects); + bool IsInvariantInRange(GenTree* node, GenTree* endExclusive) const; + bool IsInvariantInRange(GenTree* node, GenTree* endExclusive, GenTree* ignoreNode) const; + bool IsRangeInvariantInRange(GenTree* rangeStart, + GenTree* rangeEnd, + GenTree* endExclusive, + GenTree* ignoreNode) const; // Check if marking an operand of a node as reg-optional is safe. 
bool IsSafeToMarkRegOptional(GenTree* parentNode, GenTree* node) const; diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 7a3d1eeb57e12d..e39905d8bd7ac7 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -4026,7 +4026,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { const GenTreeHWIntrinsic* embOp = op2->AsHWIntrinsic(); - if (IsInvariantInRange(op2, node) && op2->isEmbeddedMaskingCompatibleHWIntrinsic()) + if (IsInvariantInRange(op2, node) && op2->isEmbeddedMaskingCompatible()) { bool contain = false; uint32_t maskSize = genTypeSize(node->GetSimdBaseType()); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index e9b5d08e4c36b2..a097d5d5b01acc 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -10514,222 +10514,18 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) MakeSrcContained(node, op1); } - if (op2->isEmbeddedMaskingCompatibleHWIntrinsic()) + if (IsInvariantInRange(op2, node)) { - bool isEmbeddedMask = !comp->opts.MinOpts() && comp->canUseEmbeddedMasking(); + unsigned tgtMaskSize = simdSize / genTypeSize(simdBaseType); + CorInfoType tgtSimdBaseJitType = CORINFO_TYPE_UNDEF; - if (op2->isRMWHWIntrinsic(comp)) + if (op2->isEmbeddedMaskingCompatible(comp, tgtMaskSize, tgtSimdBaseJitType)) { - // TODO-AVX512-CQ: Ensure we can support embedded operations on RMW intrinsics - isEmbeddedMask = false; - } - - GenTreeHWIntrinsic* op2Intrinsic = op2->AsHWIntrinsic(); - NamedIntrinsic op2IntrinsicId = NI_Illegal; - HWIntrinsicCategory category = HW_Category_Special; - - if (isEmbeddedMask) - { - // TODO-AVX512-CQ: Codegen is currently limited to only handling embedded - // masking for table driven intrinsics. This can be relaxed once that is fixed. 
- - op2IntrinsicId = op2Intrinsic->GetHWIntrinsicId(); - category = HWIntrinsicInfo::lookupCategory(op2IntrinsicId); - isEmbeddedMask = - HWIntrinsicInfo::genIsTableDrivenHWIntrinsic(op2IntrinsicId, category); - - size_t numArgs = node->GetOperandCount(); - - if (numArgs == 1) - { - if (op2Intrinsic->OperIsMemoryLoad()) - { - isEmbeddedMask = false; - } - } - else if (numArgs == 2) - { - if (category == HW_Category_MemoryStore) - { - isEmbeddedMask = false; - } - } - } - - if (isEmbeddedMask) - { - var_types op2SimdBaseType = op2Intrinsic->GetSimdBaseType(); - - instruction ins = - HWIntrinsicInfo::lookupIns(op2IntrinsicId, op2SimdBaseType, comp); - - unsigned expectedMaskBaseSize = CodeGenInterface::instKMaskBaseSize(ins); - - // It's safe to use the return and base type of the BlendVariableMask node - // since anything which lowered to it will have validated compatibility itself - unsigned actualMaskSize = - genTypeSize(node->TypeGet()) / genTypeSize(simdBaseType); - unsigned actualMaskBaseSize = - actualMaskSize / (genTypeSize(node->TypeGet()) / 16); - - CorInfoType op2AdjustedSimdBaseJitType = CORINFO_TYPE_UNDEF; - - if (actualMaskBaseSize != expectedMaskBaseSize) + if (tgtSimdBaseJitType != CORINFO_TYPE_UNDEF) { - // Some intrinsics are effectively bitwise operations and so we - // can freely update them to match the size of the actual mask - - bool supportsMaskBaseSize4Or8 = false; - - switch (ins) - { - case INS_andpd: - case INS_andps: - case INS_andnpd: - case INS_andnps: - case INS_orpd: - case INS_orps: - case INS_pandd: - case INS_pandnd: - case INS_pord: - case INS_pxord: - case INS_vpandq: - case INS_vpandnq: - case INS_vporq: - case INS_vpxorq: - case INS_vshuff32x4: - case INS_vshuff64x2: - case INS_vshufi32x4: - case INS_vshufi64x2: - case INS_xorpd: - case INS_xorps: - { - // These intrinsics support embedded broadcast and have masking - // support for 4 or 8 - assert((expectedMaskBaseSize == 4) || (expectedMaskBaseSize == 8)); - - if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, - op2Intrinsic->Op(2))) - { - // We cannot change the base type if we've already contained a - // broadcast - supportsMaskBaseSize4Or8 = true; - } - break; - } - - case INS_vpternlogd: - case INS_vpternlogq: - { - // These intrinsics support embedded broadcast and have masking - // support for 4 or 8 - assert((expectedMaskBaseSize == 4) || (expectedMaskBaseSize == 8)); - - if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, - op2Intrinsic->Op(3))) - { - // We cannot change the base type if we've already contained a - // broadcast - supportsMaskBaseSize4Or8 = true; - } - break; - } - - case INS_vbroadcastf32x4: - case INS_vbroadcastf32x8: - case INS_vbroadcastf64x2: - case INS_vbroadcastf64x4: - case INS_vbroadcasti32x4: - case INS_vbroadcasti32x8: - case INS_vbroadcasti64x2: - case INS_vbroadcasti64x4: - case INS_vextractf32x4: - case INS_vextractf32x8: - case INS_vextractf64x2: - case INS_vextractf64x4: - case INS_vextracti32x4: - case INS_vextracti32x8: - case INS_vextracti64x2: - case INS_vextracti64x4: - case INS_vinsertf32x4: - case INS_vinsertf32x8: - case INS_vinsertf64x2: - case INS_vinsertf64x4: - case INS_vinserti32x4: - case INS_vinserti32x8: - case INS_vinserti64x2: - case INS_vinserti64x4: - { - // These intrinsics don't support embedded broadcast and have - // masking support for 4 or 8 - assert((expectedMaskBaseSize == 4) || (expectedMaskBaseSize == 8)); - supportsMaskBaseSize4Or8 = true; - break; - } - - default: - { - break; - } - } - - if (supportsMaskBaseSize4Or8) - { - if 
(actualMaskBaseSize == 8) - { - if (varTypeIsFloating(op2SimdBaseType)) - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_DOUBLE; - } - else if (varTypeIsSigned(op2SimdBaseType)) - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_LONG; - } - else - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_ULONG; - } - } - else if (actualMaskBaseSize == 4) - { - if (varTypeIsFloating(op2SimdBaseType)) - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_FLOAT; - } - else if (varTypeIsSigned(op2SimdBaseType)) - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_INT; - } - else - { - op2AdjustedSimdBaseJitType = CORINFO_TYPE_UINT; - } - } - } + op2->AsHWIntrinsic()->SetSimdBaseJitType(tgtSimdBaseJitType); } - if (op2AdjustedSimdBaseJitType != CORINFO_TYPE_UNDEF) - { - ins = HWIntrinsicInfo::lookupIns(op2IntrinsicId, op2SimdBaseType, comp); - expectedMaskBaseSize = CodeGenInterface::instKMaskBaseSize(ins); - } - - unsigned expectedMaskSize = - expectedMaskBaseSize * (genTypeSize(op2->TypeGet()) / 16); - assert(expectedMaskSize != 0); - - if (actualMaskSize != expectedMaskSize) - { - isEmbeddedMask = false; - } - else if (op2AdjustedSimdBaseJitType != CORINFO_TYPE_UNDEF) - { - op2Intrinsic->SetSimdBaseJitType(op2AdjustedSimdBaseJitType); - } - } - - if (isEmbeddedMask && IsInvariantInRange(op2, node)) - { MakeSrcContained(node, op2); op2->MakeEmbMaskOp(); break; diff --git a/src/coreclr/jit/rationalize.cpp b/src/coreclr/jit/rationalize.cpp index 7115c01cbe963a..9e95b0982190c6 100644 --- a/src/coreclr/jit/rationalize.cpp +++ b/src/coreclr/jit/rationalize.cpp @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. #include "jitpch.h" +#include "lower.h" + #ifdef _MSC_VER #pragma hdrstop #endif @@ -21,7 +23,6 @@ // Return Value: // None. // - void Rationalizer::RewriteNodeAsCall(GenTree** use, CORINFO_SIG_INFO* sig, ArrayStack& parents, @@ -262,8 +263,8 @@ void Rationalizer::RewriteNodeAsCall(GenTree** use, // RewriteIntrinsicAsUserCall : Rewrite an intrinsic operator as a GT_CALL to the original method. // // Arguments: -// ppTree - A pointer-to-a-pointer for the intrinsic node -// fgWalkData - A pointer to tree walk data providing the context +// use - A pointer-to-a-pointer for the intrinsic node +// parents - A pointer to tree walk data providing the context // // Return Value: // None. @@ -272,7 +273,6 @@ void Rationalizer::RewriteNodeAsCall(GenTree** use, // The ones that are not being rewritten here must be handled in Codegen. // Conceptually, the lower is the right place to do the rewrite. Keeping it in rationalization is // mainly for throughput issue. - void Rationalizer::RewriteIntrinsicAsUserCall(GenTree** use, ArrayStack& parents) { GenTreeIntrinsic* intrinsic = (*use)->AsIntrinsic(); @@ -310,12 +310,15 @@ void Rationalizer::RewriteIntrinsicAsUserCall(GenTree** use, ArrayStackAsHWIntrinsic(); + + // Intrinsics should have already been rewritten back into user calls. 
+ assert(!node->IsUserCall()); + + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + + switch (intrinsic) + { +#if defined(TARGET_XARCH) + case NI_AVX512_BlendVariableMask: + { + RewriteHWIntrinsicBlendv(use, parents); + break; + } + + case NI_AVX512_ConvertMaskToVector: + case NI_AVX512_MoveMask: + { + RewriteHWIntrinsicMaskOp(use, parents); + break; + } +#endif // TARGET_XARCH + + default: + { + break; + } + } +} + +#if defined(TARGET_XARCH) +//---------------------------------------------------------------------------------------------- +// RewriteHWIntrinsicBlendv: Rewrites a hwintrinsic blendv operation +// +// Arguments: +// use - A pointer to the hwintrinsic node +// parents - A reference to tree walk data providing the context +// +void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStack& parents) +{ + GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic(); + + // We normalize all comparisons to be of TYP_MASK on import. However, if we + // get to rationalization and we cannot take advantage of embedded masking + // then we want to rewrite things to just directly produce TYP_SIMD instead. + + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + var_types retType = node->TypeGet(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + if (simdSize == 64) + { + return; + } + + GenTree* op2 = node->Op(2); + + // We're in the post-order visit and are traversing in execution order, so + // everything between op2 and node will have already been rewritten to LIR + // form and doing the IsInvariantInRange check is safe. This allows us to + // catch cases where something is embedded masking compatible but where we + // could never actually contain it and so we want to rewrite it to the non-mask + // variant + SideEffectSet scratchSideEffects; + + if (Lowering::IsInvariantInRange(op2, node, comp, scratchSideEffects)) + { + unsigned tgtMaskSize = simdSize / genTypeSize(simdBaseType); + CorInfoType tgtSimdBaseJitType = CORINFO_TYPE_UNDEF; + + if (op2->isEmbeddedMaskingCompatible(comp, tgtMaskSize, tgtSimdBaseJitType)) + { + // We are going to utilize the embedded mask, so we don't need to rewrite. However, + // we want to fixup the simdBaseJitType here since it simplifies lowering and allows + // both embedded broadcast and the mask to be live simultaneously. + + if (tgtSimdBaseJitType != CORINFO_TYPE_UNDEF) + { + op2->AsHWIntrinsic()->SetSimdBaseJitType(tgtSimdBaseJitType); + } + return; + } + } + + GenTree*& op3 = node->Op(3); + + if (!ShouldRewriteToNonMaskHWIntrinsic(op3)) + { + return; + } + + parents.Push(op3); + RewriteHWIntrinsicToNonMask(&op3, parents); + (void)parents.Pop(); + + if (simdSize == 32) + { + if (varTypeIsIntegral(simdBaseType)) + { + intrinsic = NI_AVX2_BlendVariable; + } + else + { + intrinsic = NI_AVX_BlendVariable; + } + } + else + { + intrinsic = NI_SSE42_BlendVariable; + } + + if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic) && varTypeIsSmall(simdBaseType)) + { + node->SetSimdBaseJitType(varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UINT : CORINFO_TYPE_INT); + } + node->ChangeHWIntrinsicId(intrinsic); +} + +//---------------------------------------------------------------------------------------------- +// RewriteHWIntrinsicMaskOp: Rewrites a hwintrinsic mask operation +// +// Arguments: +// use - A pointer to the hwintrinsic node +// parents - A reference to tree walk data providing the context +// +void Rationalizer::RewriteHWIntrinsicMaskOp(GenTree** use, Compiler::GenTreeStack& parents) +{ + GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic(); + + // We normalize all comparisons to be of TYP_MASK on import. However, if we + // get to rationalization and we're just converting that back to TYP_SIMD, + // then we want to rewrite things to just directly produce TYP_SIMD instead. + + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + var_types retType = node->TypeGet(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + if (simdSize == 64) + { + // we must always use the evex encoding + return; + } + + if ((intrinsic == NI_AVX512_MoveMask) && varTypeIsShort(simdBaseType)) + { + // we need to keep the evex form as it's more efficient + return; + } + + GenTree*& op1 = node->Op(1); + + if (!ShouldRewriteToNonMaskHWIntrinsic(op1)) + { + return; + } + + parents.Push(op1); + RewriteHWIntrinsicToNonMask(&op1, parents); + (void)parents.Pop(); + + if (intrinsic == NI_AVX512_ConvertMaskToVector) + { + if (parents.Height() > 1) + { + parents.Top(1)->ReplaceOperand(use, op1); + } + else + { + *use = op1; + } + BlockRange().Remove(node); + + // Adjust the parent stack + assert(parents.Top() == node); + (void)parents.Pop(); + parents.Push(op1); + } + else + { + assert(intrinsic == NI_AVX512_MoveMask); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = (simdSize == 32) ? NI_AVX2_MoveMask : NI_X86Base_MoveMask; + break; + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + simdBaseJitType = CORINFO_TYPE_FLOAT; + intrinsic = (simdSize == 32) ? NI_AVX_MoveMask : NI_X86Base_MoveMask; + break; + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + simdBaseJitType = CORINFO_TYPE_DOUBLE; + intrinsic = (simdSize == 32) ? 
NI_AVX_MoveMask : NI_X86Base_MoveMask; + break; + } + + default: + { + unreached(); + } + } + + node->SetSimdBaseJitType(simdBaseJitType); + node->ChangeHWIntrinsicId(intrinsic); + } +} + +//---------------------------------------------------------------------------------------------- +// RewriteHWIntrinsicToNonMask: Rewrites a hwintrinsic to its non-mask form +// +// Arguments: +// use - A pointer to the hwintrinsic node +// parents - A reference to tree walk data providing the context +// +void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeStack& parents) +{ + GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic(); + + assert(node->TypeIs(TYP_MASK)); + assert(ShouldRewriteToNonMaskHWIntrinsic(node)); + + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + + switch (intrinsic) + { + case NI_AVX512_AndMask: + { + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_AND); + break; + } + + case NI_AVX512_AndNotMask: + { + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_AND_NOT); + break; + } + + case NI_AVX512_NotMask: + { + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_NOT); + break; + } + + case NI_AVX512_OrMask: + { + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_OR); + break; + } + + case NI_AVX512_XorMask: + { + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_XOR); + break; + } + + case NI_AVX512_XnorMask: + { + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + GenTree* op1 = + comp->gtNewSimdBinOpNode(GT_XOR, simdType, node->Op(1), node->Op(2), simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, op1); + node->Op(1) = op1; + + GenTree* op2 = comp->gtNewAllBitsSetConNode(simdType); + BlockRange().InsertBefore(node, op2); + node->Op(2) = op2; + + RewriteHWIntrinsicBitwiseOpToNonMask(use, parents, GT_XOR); + break; + } + + case NI_AVX512_CompareMask: + case NI_AVX512_CompareEqualMask: + case NI_AVX512_CompareGreaterThanMask: + case NI_AVX512_CompareGreaterThanOrEqualMask: + case NI_AVX512_CompareLessThanMask: + case NI_AVX512_CompareLessThanOrEqualMask: + case NI_AVX512_CompareNotEqualMask: + case NI_AVX512_CompareNotGreaterThanMask: + case NI_AVX512_CompareNotGreaterThanOrEqualMask: + case NI_AVX512_CompareNotLessThanMask: + case NI_AVX512_CompareNotLessThanOrEqualMask: + case NI_AVX512_CompareOrderedMask: + case NI_AVX512_CompareUnorderedMask: + { + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + + switch (intrinsic) + { + case NI_AVX512_CompareMask: + { + intrinsic = NI_AVX_Compare; + break; + } + + case NI_AVX512_CompareEqualMask: + { + if (simdSize == 32) + { + if (varTypeIsIntegral(simdBaseType)) + { + intrinsic = NI_AVX2_CompareEqual; + } + else + { + intrinsic = NI_AVX_CompareEqual; + } + } + else if (varTypeIsLong(simdBaseType)) + { + intrinsic = NI_SSE42_CompareEqual; + } + else + { + intrinsic = NI_X86Base_CompareEqual; + } + break; + } + + case NI_AVX512_CompareGreaterThanMask: + { + if (simdSize == 32) + { + if (varTypeIsIntegral(simdBaseType)) + { + intrinsic = NI_AVX2_CompareGreaterThan; + } + else + { + intrinsic = NI_AVX_CompareGreaterThan; + } + } + else if (varTypeIsLong(simdBaseType)) + { + intrinsic = NI_SSE42_CompareGreaterThan; + } + else + { + intrinsic = NI_X86Base_CompareGreaterThan; + } + break; + } + + case NI_AVX512_CompareGreaterThanOrEqualMask: + { + if (simdSize == 32) 
+ { + intrinsic = NI_AVX_CompareGreaterThanOrEqual; + } + else + { + intrinsic = NI_X86Base_CompareGreaterThanOrEqual; + } + break; + } + + case NI_AVX512_CompareLessThanMask: + { + if (simdSize == 32) + { + if (varTypeIsIntegral(simdBaseType)) + { + intrinsic = NI_AVX2_CompareLessThan; + } + else + { + intrinsic = NI_AVX_CompareLessThan; + } + } + else if (varTypeIsLong(simdBaseType)) + { + intrinsic = NI_SSE42_CompareLessThan; + } + else + { + intrinsic = NI_X86Base_CompareLessThan; + } + break; + } + + case NI_AVX512_CompareLessThanOrEqualMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareLessThanOrEqual; + } + else + { + intrinsic = NI_X86Base_CompareLessThanOrEqual; + } + break; + } + + case NI_AVX512_CompareNotEqualMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareNotEqual; + } + else + { + intrinsic = NI_X86Base_CompareNotEqual; + } + break; + } + + case NI_AVX512_CompareNotGreaterThanMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareNotGreaterThan; + } + else + { + intrinsic = NI_X86Base_CompareNotGreaterThan; + } + break; + } + + case NI_AVX512_CompareNotGreaterThanOrEqualMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareNotGreaterThanOrEqual; + } + else + { + intrinsic = NI_X86Base_CompareNotGreaterThanOrEqual; + } + break; + } + + case NI_AVX512_CompareNotLessThanMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareNotLessThan; + } + else + { + intrinsic = NI_X86Base_CompareNotLessThan; + } + break; + } + + case NI_AVX512_CompareNotLessThanOrEqualMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareNotLessThanOrEqual; + } + else + { + intrinsic = NI_X86Base_CompareNotLessThanOrEqual; + } + break; + } + + case NI_AVX512_CompareOrderedMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareOrdered; + } + else + { + intrinsic = NI_X86Base_CompareOrdered; + } + break; + } + + case NI_AVX512_CompareUnorderedMask: + { + if (simdSize == 32) + { + intrinsic = NI_AVX_CompareUnordered; + } + else + { + intrinsic = NI_X86Base_CompareUnordered; + } + break; + } + + default: + { + unreached(); + } + } + + node->gtType = simdType; + node->ChangeHWIntrinsicId(intrinsic); + + break; + } + + case NI_AVX512_ConvertVectorToMask: + { + GenTree* op1 = node->Op(1); + + if (parents.Height() > 1) + { + parents.Top(1)->ReplaceOperand(use, op1); + } + else + { + *use = op1; + } + BlockRange().Remove(node); + + // Adjust the parent stack + assert(parents.Top() == node); + (void)parents.Pop(); + parents.Push(op1); + + break; + } + + default: + { + unreached(); + } + } +} + +//---------------------------------------------------------------------------------------------- +// RewriteHWIntrinsicBitwiseOpToNonMask: Rewrites hwintrinsic bitwise operation to its non-mask form +// +// Arguments: +// use - A pointer to the hwintrinsic node +// parents - A reference to tree walk data providing the context +// oper - The operation represented by the hwintrinsic +// +void Rationalizer::RewriteHWIntrinsicBitwiseOpToNonMask(GenTree** use, Compiler::GenTreeStack& parents, genTreeOps oper) +{ + GenTreeHWIntrinsic* node = (*use)->AsHWIntrinsic(); + assert((node->GetOperandCount() == 1) || (node->GetOperandCount() == 2)); + + assert(node->TypeIs(TYP_MASK)); + assert(oper != GT_NONE); + + NamedIntrinsic intrinsic = NI_Illegal; + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + const bool isScalar = false; + + GenTree*& op1 = 
node->Op(1); + + parents.Push(op1); + RewriteHWIntrinsicToNonMask(&op1, parents); + (void)parents.Pop(); + + if (node->GetOperandCount() == 1) + { + assert(oper == GT_NOT); + + GenTree* op2 = comp->gtNewAllBitsSetConNode(simdType); + BlockRange().InsertBefore(node, op2); + + intrinsic = + GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, GT_XOR, op1, op2, simdBaseType, simdSize, isScalar); + + node->gtType = simdType; + node->ResetHWIntrinsicId(intrinsic, comp, op1, op2); + } + else + { + GenTree*& op2 = node->Op(2); + + parents.Push(op2); + RewriteHWIntrinsicToNonMask(&op2, parents); + (void)parents.Pop(); + + intrinsic = + GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(comp, oper, op1, op2, simdBaseType, simdSize, isScalar); + + node->gtType = simdType; + node->ChangeHWIntrinsicId(intrinsic); + } +} + +//---------------------------------------------------------------------------------------------- +// ShouldRewriteToNonMaskHWIntrinsic: Determines if a node is a hwintrinsic that should be rewritten +// to its non-mask form +// +// Arguments: +// node - The node to check +// +// Returns: +// true if node is a hardware intrinsic node and should be converted to its non-mask form; otherwise false +// +bool Rationalizer::ShouldRewriteToNonMaskHWIntrinsic(GenTree* node) +{ + assert(node->TypeIs(TYP_MASK)); + + if (!node->OperIsHWIntrinsic()) + { + // Nothing to optimize if we don't have a hwintrinsic + return false; + } + + GenTreeHWIntrinsic* hwNode = node->AsHWIntrinsic(); + NamedIntrinsic intrinsic = hwNode->GetHWIntrinsicId(); + + if (hwNode->GetSimdSize() == 64) + { + // TYP_SIMD64 comparisons always produce a TYP_MASK + return false; + } + + switch (intrinsic) + { + case NI_AVX512_AndMask: + case NI_AVX512_AndNotMask: + case NI_AVX512_OrMask: + case NI_AVX512_XorMask: + case NI_AVX512_XnorMask: + { + // binary bitwise operations should be optimized if both inputs can + assert(hwNode->GetOperandCount() == 2); + return ShouldRewriteToNonMaskHWIntrinsic(hwNode->Op(1)) && ShouldRewriteToNonMaskHWIntrinsic(hwNode->Op(2)); + } + + case NI_AVX512_NotMask: + { + // unary bitwise operations should be optimized if the input can + assert(hwNode->GetOperandCount() == 1); + return ShouldRewriteToNonMaskHWIntrinsic(hwNode->Op(1)); + } + + case NI_AVX512_CompareMask: + case NI_AVX512_CompareEqualMask: + case NI_AVX512_CompareGreaterThanMask: + case NI_AVX512_CompareGreaterThanOrEqualMask: + case NI_AVX512_CompareLessThanMask: + case NI_AVX512_CompareLessThanOrEqualMask: + case NI_AVX512_CompareNotEqualMask: + case NI_AVX512_CompareNotGreaterThanMask: + case NI_AVX512_CompareNotGreaterThanOrEqualMask: + case NI_AVX512_CompareNotLessThanMask: + case NI_AVX512_CompareNotLessThanOrEqualMask: + case NI_AVX512_CompareOrderedMask: + case NI_AVX512_CompareUnorderedMask: + { + assert((hwNode->GetOperandCount() == 2) || (hwNode->GetOperandCount() == 3)); + var_types simdBaseType = hwNode->GetSimdBaseType(); + + if (varTypeIsFloating(simdBaseType)) + { + // floating-point comparisons can always be optimized + return true; + } + + if (intrinsic == NI_AVX512_CompareEqualMask) + { + // equals comparisons can always be optimized + return true; + } + + if (varTypeIsUnsigned(simdBaseType)) + { + // unsigned integer relational comparisons cannot be optimized + return false; + } + + if (intrinsic == NI_AVX512_CompareGreaterThanMask) + { + // signed integer greater-than comparisons can always be optimized + return true; + } + + if (intrinsic == NI_AVX512_CompareLessThanMask) + { + // signed integer less-than comparisons can 
always be optimized + return true; + } + break; + } + + case NI_AVX512_ConvertVectorToMask: + { + return true; + } + + default: + { + break; + } + } + + // Other cases cannot be optimized + return false; +} +#endif // TARGET_XARCH +#endif // FEATURE_HW_INTRINSICS + #ifdef TARGET_ARM64 // RewriteSubLshDiv: Possibly rewrite a SubLshDiv node into a Mod. // @@ -784,8 +1518,7 @@ Compiler::fgWalkResult Rationalizer::RewriteNode(GenTree** useEdge, Compiler::Ge #if defined(FEATURE_HW_INTRINSICS) case GT_HWINTRINSIC: - // Intrinsics should have already been rewritten back into user calls. - assert(!node->AsHWIntrinsic()->IsUserCall()); + RewriteHWIntrinsic(useEdge, parentStack); break; #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/rationalize.h b/src/coreclr/jit/rationalize.h index 923e10665cb002..b09bb04da5f1f6 100644 --- a/src/coreclr/jit/rationalize.h +++ b/src/coreclr/jit/rationalize.h @@ -49,7 +49,20 @@ class Rationalizer final : public Phase void RewriteIntrinsicAsUserCall(GenTree** use, Compiler::GenTreeStack& parents); #if defined(FEATURE_HW_INTRINSICS) + // pre-order rewriting void RewriteHWIntrinsicAsUserCall(GenTree** use, Compiler::GenTreeStack& parents); + + // post-order rewriting + void RewriteHWIntrinsic(GenTree** use, Compiler::GenTreeStack& parents); + +#if defined(TARGET_XARCH) + void RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStack& parents); + void RewriteHWIntrinsicMaskOp(GenTree** use, Compiler::GenTreeStack& parents); + void RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeStack& parents); + void RewriteHWIntrinsicBitwiseOpToNonMask(GenTree** use, Compiler::GenTreeStack& parents, genTreeOps oper); + + bool ShouldRewriteToNonMaskHWIntrinsic(GenTree* node); +#endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS #ifdef TARGET_ARM64
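The hwintrinsicxarch.cpp hunks at the top of this section make every platform-specific Compare* import take the same shape: when the vector is 64 bytes wide or EVEX encoding is available, the importer swaps in the corresponding NI_AVX512_Compare*Mask id and produces a TYP_MASK node, and one shared block at the end of impSpecialIntrinsic converts the mask back to a vector via gtNewSimdCvtMaskToVectorNode. Below is a minimal sketch of that selection outside the JIT; useMaskForm and the plain canUseEvexEncoding flag are stand-ins invented for this illustration only.

    // Illustration only: models the import-time choice applied in each Compare* case.
    #include <cstdio>
    #include <initializer_list>

    // Stand-in for Compiler::canUseEvexEncoding(); a plain flag here.
    static bool canUseEvexEncoding = true;

    // True when the comparison should import as the NI_AVX512_Compare*Mask form
    // producing TYP_MASK; 64-byte vectors have no non-mask form, and smaller
    // vectors use the mask form whenever EVEX is available.
    static bool useMaskForm(unsigned simdSize)
    {
        return (simdSize == 64) || canUseEvexEncoding;
    }

    int main()
    {
        for (unsigned simdSize : {16u, 32u, 64u})
        {
            std::printf("simdSize=%u -> %s\n", simdSize,
                        useMaskForm(simdSize) ? "NI_AVX512_Compare*Mask (TYP_MASK)"
                                              : "platform Compare* (TYP_SIMD)");
        }
        return 0;
    }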
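The large block removed from lowerxarch.cpp above folded its mask-size fix-up into the new isEmbeddedMaskingCompatible(comp, tgtMaskSize, tgtSimdBaseJitType) query, which reports a replacement base type through its out parameter when the operation is effectively bitwise and can be retyped to match the k-mask granularity. The compile-only sketch below mirrors the removed mapping (8-byte lanes go to DOUBLE/LONG/ULONG, 4-byte lanes to FLOAT/INT/UINT); BaseJitType and remapBaseType are hypothetical stand-ins for CorInfoType and the inline logic, not JIT names.

    // Illustration only: stand-ins for CorInfoType and the removed remapping table.
    #include <cassert>

    enum class BaseJitType { Undef, Float, Double, Int, UInt, Long, ULong };

    // Maps the actual k-mask element size in bytes (4 or 8) plus the operand's
    // current base-type traits onto the base type that matches that granularity.
    static BaseJitType remapBaseType(unsigned maskBaseSize, bool isFloating, bool isSigned)
    {
        assert((maskBaseSize == 4) || (maskBaseSize == 8));

        if (maskBaseSize == 8)
        {
            return isFloating ? BaseJitType::Double
                              : (isSigned ? BaseJitType::Long : BaseJitType::ULong);
        }

        return isFloating ? BaseJitType::Float
                          : (isSigned ? BaseJitType::Int : BaseJitType::UInt);
    }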
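ShouldRewriteToNonMaskHWIntrinsic, added in rationalize.cpp above, walks mask-producing trees and only allows the rewrite when every leaf has a pre-AVX512 vector form. For comparison leaves its rules are: floating-point comparisons and integer equality always qualify, unsigned integer relational comparisons never do, and of the signed relational comparisons only greater-than and less-than qualify. A compile-only restatement of that predicate follows; CmpKind and canRewriteCompareToNonMask are illustration-only names, not the NamedIntrinsic ids themselves.

    // Illustration only: hypothetical stand-ins for the NI_AVX512_Compare*Mask ids.
    enum class CmpKind
    {
        Equal, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual,
        NotEqual, Ordered, Unordered
    };

    // Restates the per-comparison eligibility test from ShouldRewriteToNonMaskHWIntrinsic.
    static bool canRewriteCompareToNonMask(CmpKind kind, bool isFloating, bool isUnsigned)
    {
        if (isFloating || (kind == CmpKind::Equal))
        {
            // Floating-point comparisons and integer equality always qualify.
            return true;
        }

        if (isUnsigned)
        {
            // Unsigned integer relational comparisons only exist in the mask form.
            return false;
        }

        // Of the signed relational comparisons, only greater-than and less-than qualify.
        return (kind == CmpKind::GreaterThan) || (kind == CmpKind::LessThan);
    }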
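When the blend cannot use embedded masking, RewriteHWIntrinsicBlendv above rewrites the mask operand to its non-mask form and then picks a pre-AVX512 variable blend: 32-byte integer lanes use NI_AVX2_BlendVariable, 32-byte float/double lanes use NI_AVX_BlendVariable, and 16-byte vectors use NI_SSE42_BlendVariable (with small integer base types normalized to int/uint first). A compile-only sketch of that selection; BlendForm and selectNonMaskBlend are names invented for this example.

    // Illustration only: names below exist solely for this sketch.
    enum class BlendForm { Sse42, Avx, Avx2 };

    // Picks the non-mask variable-blend intrinsic family for a given vector width.
    static BlendForm selectNonMaskBlend(unsigned simdSize, bool baseTypeIsIntegral)
    {
        if (simdSize == 32)
        {
            // 32-byte variable blends on integer lanes need AVX2; float/double use AVX.
            return baseTypeIsIntegral ? BlendForm::Avx2 : BlendForm::Avx;
        }

        // 16-byte vectors fall back to the SSE4.2 variable blend.
        return BlendForm::Sse42;
    }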