Skip to content

Commit 9283581

Browse files
khushal1996Ruihan-YinBruceForstallMichalPetryka
authored
Use Avx10.2 Instructions in Floating Point Conversions (#111775)
* Add support for AVX10.2. Add AVX10.2 API surface and template tests. Lower Avx10.2 nodes accordingly. * Add support and template tests for AVX10v2_V512 * Add new coredistools.dll build from latest llvm repo * Limit JIT unit suite within the subsets which are stable in SDE. * Rename API as per latest API proposal discussions * Fix sample tests in handwritten project * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Limit JIT unit suite within the subsets which are stable in SDE. * Allow a prefix of 0x00 for AVX10.2 instructions. * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Limit JIT unit suite within the subsets which are stable in SDE. * Remove developer comments from files * Enable all template tests and enable ymm embedded rounding * Make emitter independent of ISA and based on insOpts for ymm embedded rounding * Enable ymm embedded rounding based on architecture * Revert "Make emitter independent of ISA and based on insOpts for ymm embedded rounding" This reverts commit 493572f. * Separate Avx10.2 unit testing framework from APX framework * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Revert "Add new coredistools.dll build from latest llvm repo" This reverts commit 61719f8. * Fix formatting * Use new keyword for class V512 to hide Avx10v1.V512 and correct CI errors * Remove MinMax APIs from lowering for numargs=2 * Add docstrings for APIs * Revert changes for SDE execution of tests * Add appropriate comments from reviews * Apply suggestions from code review Co-authored-by: Bruce Forstall <[email protected]> * Add emitter tests for XMM9/16 to make sure special handling does not interfere.
* Format code * Handle sizePrefix = 0 case when decoding evex instruction * Add assert in appropriate places * Club similar instructions together in perf calculation in emitxarch * Run formatting * Add assembly prints for debug assembly capturing for Avx10.2 * Use correct size when running emitter tests * Add appropriate comments and make review changes * Use AVX10.2 instructions in conversions * Run formatting * Update comment Co-authored-by: Michał Petryka <[email protected]> * Optimize the need for compIsa checks and edit comments * Run formatting --------- Co-authored-by: Ruihan-Yin <[email protected]> Co-authored-by: Bruce Forstall <[email protected]> Co-authored-by: Michał Petryka <[email protected]>
1 parent b61c07c commit 9283581

File tree

6 files changed

+83
-19
lines changed

6 files changed

+83
-19
lines changed

src/coreclr/jit/codegenxarch.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7432,7 +7432,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
74327432
noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
74337433

74347434
// We shouldn't be seeing uint64 here as it should have been converted
7435-
// into a helper call by either front-end or lowering phase, unless we have AVX512F
7435+
// into a helper call by either front-end or lowering phase, unless we have AVX512F/AVX10.x
74367436
// accelerated conversions.
74377437
assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) ||
74387438
compiler->canUseEvexEncodingDebugOnly());

src/coreclr/jit/emitxarch.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13197,6 +13197,14 @@ void emitter::emitDispIns(
1319713197
case INS_vcvttsd2usi64:
1319813198
case INS_vcvttss2usi32:
1319913199
case INS_vcvttss2usi64:
13200+
case INS_vcvttsd2sis32:
13201+
case INS_vcvttsd2sis64:
13202+
case INS_vcvttss2sis32:
13203+
case INS_vcvttss2sis64:
13204+
case INS_vcvttsd2usis32:
13205+
case INS_vcvttsd2usis64:
13206+
case INS_vcvttss2usis32:
13207+
case INS_vcvttss2usis64:
1320013208
{
1320113209
assert(!id->idIsEvexAaaContextSet());
1320213210
printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));

src/coreclr/jit/gentree.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21689,7 +21689,39 @@ GenTree* Compiler::gtNewSimdCvtNode(var_types type,
2168921689
GenTree* fixupVal;
2169021690
bool isV512Supported = false;
2169121691

21692-
if (compIsEvexOpportunisticallySupported(isV512Supported))
21692+
if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
21693+
{
21694+
NamedIntrinsic cvtIntrinsic = NI_Illegal;
21695+
switch (simdTargetBaseType)
21696+
{
21697+
case TYP_INT:
21698+
cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorInt32WithTruncationSaturation
21699+
: NI_AVX10v2_ConvertToVectorInt32WithTruncationSaturation;
21700+
break;
21701+
21702+
case TYP_UINT:
21703+
cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorUInt32WithTruncationSaturation
21704+
: NI_AVX10v2_ConvertToVectorUInt32WithTruncationSaturation;
21705+
break;
21706+
21707+
case TYP_LONG:
21708+
cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorInt64WithTruncationSaturation
21709+
: NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;
21710+
break;
21711+
21712+
case TYP_ULONG:
21713+
cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorUInt64WithTruncationSaturation
21714+
: NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation;
21715+
break;
21716+
21717+
default:
21718+
{
21719+
unreached();
21720+
}
21721+
}
21722+
return gtNewSimdHWIntrinsicNode(type, op1, cvtIntrinsic, simdSourceBaseJitType, simdSize);
21723+
}
21724+
else if (compIsEvexOpportunisticallySupported(isV512Supported))
2169321725
{
2169421726
/*Generate the control table for VFIXUPIMMSD/SS
2169521727
- For conversion to unsigned

src/coreclr/jit/instr.cpp

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2420,6 +2420,11 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
24202420
//
24212421
instruction CodeGen::ins_FloatConv(var_types to, var_types from)
24222422
{
2423+
// AVX: Supports following conversions
2424+
// srcType = int16/int64 castToType = float
2425+
// AVX512: Supports following conversions
2426+
// srcType = ulong castToType = double/float
2427+
bool isAvx10v2 = false;
24232428
switch (from)
24242429
{
24252430
case TYP_INT:
@@ -2471,36 +2476,52 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from)
24712476
break;
24722477

24732478
case TYP_FLOAT:
2479+
if (to == TYP_FLOAT)
2480+
{
2481+
return ins_Move_Extend(TYP_FLOAT, false);
2482+
}
2483+
else if (to == TYP_DOUBLE)
2484+
{
2485+
return INS_cvtss2sd;
2486+
}
2487+
isAvx10v2 = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2);
2488+
24742489
switch (to)
24752490
{
24762491
case TYP_INT:
2477-
return INS_cvttss2si32;
2492+
return isAvx10v2 ? INS_vcvttss2sis32 : INS_cvttss2si32;
24782493
case TYP_LONG:
2479-
return INS_cvttss2si64;
2480-
case TYP_DOUBLE:
2481-
return INS_cvtss2sd;
2494+
return isAvx10v2 ? INS_vcvttss2sis64 : INS_cvttss2si64;
24822495
case TYP_ULONG:
2483-
return INS_vcvttss2usi64;
2496+
return isAvx10v2 ? INS_vcvttss2usis64 : INS_vcvttss2usi64;
24842497
case TYP_UINT:
2485-
return INS_vcvttss2usi32;
2498+
return isAvx10v2 ? INS_vcvttss2usis32 : INS_vcvttss2usi32;
24862499
default:
24872500
unreached();
24882501
}
24892502
break;
24902503

24912504
case TYP_DOUBLE:
2505+
if (to == TYP_FLOAT)
2506+
{
2507+
return INS_cvtsd2ss;
2508+
}
2509+
else if (to == TYP_DOUBLE)
2510+
{
2511+
return ins_Move_Extend(TYP_DOUBLE, false);
2512+
}
2513+
isAvx10v2 = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2);
2514+
24922515
switch (to)
24932516
{
24942517
case TYP_INT:
2495-
return INS_cvttsd2si32;
2518+
return isAvx10v2 ? INS_vcvttsd2sis32 : INS_cvttsd2si32;
24962519
case TYP_LONG:
2497-
return INS_cvttsd2si64;
2498-
case TYP_FLOAT:
2499-
return INS_cvtsd2ss;
2520+
return isAvx10v2 ? INS_vcvttsd2sis64 : INS_cvttsd2si64;
25002521
case TYP_ULONG:
2501-
return INS_vcvttsd2usi64;
2522+
return isAvx10v2 ? INS_vcvttsd2usis64 : INS_vcvttsd2usi64;
25022523
case TYP_UINT:
2503-
return INS_vcvttsd2usi32;
2524+
return isAvx10v2 ? INS_vcvttsd2usis32 : INS_vcvttsd2usi32;
25042525
default:
25052526
unreached();
25062527
}

src/coreclr/jit/lowerxarch.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -851,7 +851,9 @@ GenTree* Lowering::LowerCast(GenTree* tree)
851851

852852
#if defined(TARGET_AMD64)
853853
// Handle saturation logic for X64
854-
if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType))
854+
// Let InstructionSet_AVX10v2 pass through since it can handle the saturation
855+
if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType) &&
856+
!comp->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
855857
{
856858
// We should have filtered out float -> long conversion and
857859
// converted it to float -> double -> long conversion.
@@ -868,10 +870,8 @@ GenTree* Lowering::LowerCast(GenTree* tree)
868870
bool isV512Supported = false;
869871
/*The code below is to introduce saturating conversions on X86/X64.
870872
The C# equivalence of the code is given below -->
871-
872873
// Replace QNaN and SNaN with Zero
873874
op1 = Avx512F.Fixup(op1, op1, Vector128.Create<long>(0x88), 0);
874-
875875
// Convert from double to long, replacing any values that were greater than or equal to MaxValue
876876
with MaxValue
877877
// Values that were less than or equal to MinValue will already be MinValue

src/coreclr/jit/morph.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,9 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
305305
// dstType = int for SSE41
306306
// For pre-SSE41, every src type is converted to TYP_DOUBLE
307307
// and goes through helpers.
308-
&& (tree->gtOverflow() || (dstType == TYP_LONG) ||
309-
!(canUseEvexEncoding() || (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41))))
308+
&&
309+
(tree->gtOverflow() || (dstType == TYP_LONG && !compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) ||
310+
!(canUseEvexEncoding() || (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41))))
310311
#elif defined(TARGET_ARM)
311312
// Arm: src = float, dst = int64/uint64 or overflow conversion.
312313
&& (tree->gtOverflow() || varTypeIsLong(dstType))
@@ -340,6 +341,8 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
340341
#else
341342
#if defined(TARGET_AMD64)
342343
// Following nodes are handled when lowering the nodes
344+
// float -> ulong/uint/int/long for AVX10.2
345+
// double -> ulong/uint/int/long for AVX10.2
343346
// float -> ulong/uint/int for AVX512F
344347
// double -> ulong/uint/long/int for AVX512F
345348
// float -> int for SSE41

0 commit comments

Comments
 (0)