Use Avx10.2 Instructions in Floating Point Conversions (#111775)

khushal1996 · Ruihan-Yin · BruceForstall · web-flow · commit 9283581f4143 · 2025-02-18T15:04:31.000-08:00
* Add support for AVX10.2. Add AVX10.2 API surface and template tests. Lower Avx10.2 nodes accordingly. * Add support and template tests for AVX10v2_V512 * Add new coredistools.dll build from latest llvm repo * Limit JIT unit suite within the subsets which are stable in SDE. * Rename API as per latest API proposal discussions * fix sample tests in handwritten project * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Limit JIT unit suite within the subsets which are stable in SDE. * Allow a prefix of 0x00 for AVX10.2 instructions. * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Limit JIT unit suite within the subsets which are stable in SDE. * remove developer comments from files * Enable all template tests and enable ymm embedded rounding * Make emitter independent of ISA and based on insOpts for ymm embedded rounding * Enable ymm embedded rounding based on architecture * Revert "Make emitter independent of ISA and based on insOpts for ymm embedded rounding" This reverts commit 493572f. * Separate Avx10.2 unit testing framework from APX framework * Revert "Limit JIT unit suite within the subsets which are stable in SDE." This reverts commit 067e31e. * Revert "Add new coredistools.dll build from latest llvm repo" This reverts commit 61719f8. * Fix formatting * Use new keyword for class V512 to hide Avx10v1.V512 and correct CI errors * Remove MinMax APIs from lowering for numargs=2 * Add docstrings for APIs * revert changes for sde execution of tests * Add appropriate comments from reviews * Apply suggestions from code review Co-authored-by: Bruce Forstall <brucefo@microsoft.com> * Add emitter tests for XMM9/16 to make sure special handling does not interfere.
* Format code * Handle sizePrefix = 0 case when decoding EVEX instruction * Add assert in appropriate places * Club similar instructions together in perf calculation in emitxarch * Run formatting * Add assembly prints for debug assembly capturing for Avx10.2 * Use correct size when running emitter tests * Add appropriate comments and make review changes * Use AVX10.2 instructions in conversions * Run formatting * Update comment Co-authored-by: Michał Petryka <35800402+MichalPetryka@users.noreply.github.com> * Optimize the need for compIsa checks and edit comments * Run formatting --------- Co-authored-by: Ruihan-Yin <ruihan.yin@intel.com> Co-authored-by: Bruce Forstall <brucefo@microsoft.com> Co-authored-by: Michał Petryka <35800402+MichalPetryka@users.noreply.github.com>
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
@@ -7432,7 +7432,7 @@ void CodeGen::genFloatToIntCast(GenTree* treeNode)
     noway_assert((dstSize == EA_ATTR(genTypeSize(TYP_INT))) || (dstSize == EA_ATTR(genTypeSize(TYP_LONG))));
 
     // We shouldn't be seeing uint64 here as it should have been converted
-    // into a helper call by either front-end or lowering phase, unless we have AVX512F
+    // into a helper call by either front-end or lowering phase, unless we have AVX512F/AVX10.x
     // accelerated conversions.
     assert(!varTypeIsUnsigned(dstType) || (dstSize != EA_ATTR(genTypeSize(TYP_LONG))) ||
            compiler->canUseEvexEncodingDebugOnly());
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
@@ -13197,6 +13197,14 @@ void emitter::emitDispIns(
                 case INS_vcvttsd2usi64:
                 case INS_vcvttss2usi32:
                 case INS_vcvttss2usi64:
+                case INS_vcvttsd2sis32:
+                case INS_vcvttsd2sis64:
+                case INS_vcvttss2sis32:
+                case INS_vcvttss2sis64:
+                case INS_vcvttsd2usis32:
+                case INS_vcvttsd2usis64:
+                case INS_vcvttss2usis32:
+                case INS_vcvttss2usis64:
                 {
                     assert(!id->idIsEvexAaaContextSet());
                     printf("%s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
@@ -21689,7 +21689,39 @@ GenTree* Compiler::gtNewSimdCvtNode(var_types   type,
     GenTree* fixupVal;
     bool     isV512Supported = false;
 
-    if (compIsEvexOpportunisticallySupported(isV512Supported))
+    if (compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
+    {
+        NamedIntrinsic cvtIntrinsic = NI_Illegal;
+        switch (simdTargetBaseType)
+        {
+            case TYP_INT:
+                cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorInt32WithTruncationSaturation
+                                                : NI_AVX10v2_ConvertToVectorInt32WithTruncationSaturation;
+                break;
+
+            case TYP_UINT:
+                cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorUInt32WithTruncationSaturation
+                                                : NI_AVX10v2_ConvertToVectorUInt32WithTruncationSaturation;
+                break;
+
+            case TYP_LONG:
+                cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorInt64WithTruncationSaturation
+                                                : NI_AVX10v2_ConvertToVectorInt64WithTruncationSaturation;
+                break;
+
+            case TYP_ULONG:
+                cvtIntrinsic = (simdSize == 64) ? NI_AVX10v2_V512_ConvertToVectorUInt64WithTruncationSaturation
+                                                : NI_AVX10v2_ConvertToVectorUInt64WithTruncationSaturation;
+                break;
+
+            default:
+            {
+                unreached();
+            }
+        }
+        return gtNewSimdHWIntrinsicNode(type, op1, cvtIntrinsic, simdSourceBaseJitType, simdSize);
+    }
+    else if (compIsEvexOpportunisticallySupported(isV512Supported))
     {
         /*Generate the control table for VFIXUPIMMSD/SS
         - For conversion to unsigned
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
@@ -2420,6 +2420,11 @@ instruction CodeGen::ins_MathOp(genTreeOps oper, var_types type)
 //
 instruction CodeGen::ins_FloatConv(var_types to, var_types from)
 {
+    // AVX: Supports following conversions
+    //   srcType = int16/int64                     castToType = float
+    // AVX512: Supports following conversions
+    //   srcType = ulong                           castToType = double/float
+    bool isAvx10v2 = false;
     switch (from)
     {
         case TYP_INT:
@@ -2471,36 +2476,52 @@ instruction CodeGen::ins_FloatConv(var_types to, var_types from)
             break;
 
         case TYP_FLOAT:
+            if (to == TYP_FLOAT)
+            {
+                return ins_Move_Extend(TYP_FLOAT, false);
+            }
+            else if (to == TYP_DOUBLE)
+            {
+                return INS_cvtss2sd;
+            }
+            isAvx10v2 = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2);
+
             switch (to)
             {
                 case TYP_INT:
-                    return INS_cvttss2si32;
+                    return isAvx10v2 ? INS_vcvttss2sis32 : INS_cvttss2si32;
                 case TYP_LONG:
-                    return INS_cvttss2si64;
-                case TYP_DOUBLE:
-                    return INS_cvtss2sd;
+                    return isAvx10v2 ? INS_vcvttss2sis64 : INS_cvttss2si64;
                 case TYP_ULONG:
-                    return INS_vcvttss2usi64;
+                    return isAvx10v2 ? INS_vcvttss2usis64 : INS_vcvttss2usi64;
                 case TYP_UINT:
-                    return INS_vcvttss2usi32;
+                    return isAvx10v2 ? INS_vcvttss2usis32 : INS_vcvttss2usi32;
                 default:
                     unreached();
             }
             break;
 
         case TYP_DOUBLE:
+            if (to == TYP_FLOAT)
+            {
+                return INS_cvtsd2ss;
+            }
+            else if (to == TYP_DOUBLE)
+            {
+                return ins_Move_Extend(TYP_DOUBLE, false);
+            }
+            isAvx10v2 = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX10v2);
+
             switch (to)
             {
                 case TYP_INT:
-                    return INS_cvttsd2si32;
+                    return isAvx10v2 ? INS_vcvttsd2sis32 : INS_cvttsd2si32;
                 case TYP_LONG:
-                    return INS_cvttsd2si64;
-                case TYP_FLOAT:
-                    return INS_cvtsd2ss;
+                    return isAvx10v2 ? INS_vcvttsd2sis64 : INS_cvttsd2si64;
                 case TYP_ULONG:
-                    return INS_vcvttsd2usi64;
+                    return isAvx10v2 ? INS_vcvttsd2usis64 : INS_vcvttsd2usi64;
                 case TYP_UINT:
-                    return INS_vcvttsd2usi32;
+                    return isAvx10v2 ? INS_vcvttsd2usis32 : INS_vcvttsd2usi32;
                 default:
                     unreached();
             }
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
@@ -851,7 +851,9 @@ GenTree* Lowering::LowerCast(GenTree* tree)
 
 #if defined(TARGET_AMD64)
     // Handle saturation logic for X64
-    if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType))
+    // Let InstructionSet_AVX10v2 pass through since it can handle the saturation
+    if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType) &&
+        !comp->compOpportunisticallyDependsOn(InstructionSet_AVX10v2))
     {
         // We should have filtered out float -> long conversion and
         // converted it to float -> double -> long conversion.
@@ -868,10 +870,8 @@ GenTree* Lowering::LowerCast(GenTree* tree)
         bool isV512Supported = false;
         /*The code below is to introduce saturating conversions on X86/X64.
         The C# equivalence of the code is given below -->
-
                 // Replace QNaN and SNaN with Zero
                 op1 = Avx512F.Fixup(op1, op1, Vector128.Create<long>(0x88), 0);
-
                 // Convert from double to long, replacing any values that were greater than or equal to MaxValue
         with MaxValue
                 // Values that were less than or equal to MinValue will already be MinValue
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
@@ -305,8 +305,9 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
             //       dstType = int for SSE41
             // For pre-SSE41, the all src is converted to TYP_DOUBLE
             // and goes through helpers.
-            && (tree->gtOverflow() || (dstType == TYP_LONG) ||
-                !(canUseEvexEncoding() || (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41))))
+            &&
+            (tree->gtOverflow() || (dstType == TYP_LONG && !compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) ||
+             !(canUseEvexEncoding() || (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41))))
 #elif defined(TARGET_ARM)
             // Arm: src = float, dst = int64/uint64 or overflow conversion.
             && (tree->gtOverflow() || varTypeIsLong(dstType))
@@ -340,6 +341,8 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree)
 #else
 #if defined(TARGET_AMD64)
                 // Following nodes are handled when lowering the nodes
+                //     float  -> ulong/uint/int/long for AVX10.2
+                //     double -> ulong/uint/int/long for AVX10.2
                 //     float  -> ulong/uint/int for AVX512F
                 //     double -> ulong/uint/long/int for AVX512F
                 //     float  -> int for SSE41