
Sve: Preliminary support for agnostic VL for JIT scenarios #115948


Draft · kunalspathak wants to merge 131 commits into main

Commits (changes shown from 96 of 131 commits)
d22af4f
Capture g_sve_length and compVectorTLength
kunalspathak Mar 19, 2025
41a1d05
Add InstructionSet_Vector
kunalspathak Mar 19, 2025
c7d8ede
Add CORINFO_HFA_ELEM_VECTOR_VL
kunalspathak Mar 19, 2025
926eb69
Update the type of TYP_SIMD
kunalspathak Mar 19, 2025
2b39810
Passing Vector<T> to args and returns
kunalspathak Mar 19, 2025
cf9ea60
Rename TYP_SIMD -> TYP_SIMDVL
kunalspathak Mar 19, 2025
21f364b
Fix code to save/restore upper registers of VL
kunalspathak Mar 19, 2025
7a513ed
misc changes
kunalspathak Mar 20, 2025
b1c9833
Bring TYP_SIMD32 and TYP_SIMD64 for Arm64
kunalspathak Mar 20, 2025
4f92c23
Eliminate TYP_SIMDVL
kunalspathak Mar 21, 2025
6e63a3c
basic scenario of calling args/returning args
kunalspathak Mar 21, 2025
1eb159f
returning Vectors
kunalspathak Mar 22, 2025
df7203f
fix a bug
kunalspathak Mar 22, 2025
734aba5
standalone fix to generate sve mov instead of NEON mov
kunalspathak Mar 22, 2025
a71b8de
standalone fix to generate ldr/str when emit_RR is called
kunalspathak Mar 24, 2025
2e8cfd5
Support Vector.Create
kunalspathak Mar 24, 2025
1d74f82
Do not do sve_mov for scalar variant
kunalspathak Mar 25, 2025
699d2e1
Support Vector.As
kunalspathak Mar 25, 2025
7f8ff24
Support Vector.Abs
kunalspathak Mar 25, 2025
3d19d51
Support Vector.Add
kunalspathak Mar 25, 2025
70c09f9
Introduce VariableVectorLength env variable
kunalspathak Mar 25, 2025
53df3d7
Support Vector.AndNot
kunalspathak Mar 25, 2025
b1d4ce9
Support Vector.As*
kunalspathak Mar 26, 2025
29564cb
Support Vector.BitwiseAnd/BitwiseOr
kunalspathak Mar 26, 2025
45ab7b9
Support Vector.ConvertTo*
kunalspathak Mar 26, 2025
3837693
Add CreateFalseMaskAll intrinsic
kunalspathak Mar 27, 2025
ca1675c
Temporary fix for scratch register size calculation. Need to revisit
kunalspathak Mar 28, 2025
7774e07
Fix to squash in 9542e9cd047
kunalspathak Mar 28, 2025
c170a7e
Support Vector.Equals*, GreaterThan*, LessThan*
kunalspathak Mar 28, 2025
15f0384
Support Vector.Max/MaxNative
kunalspathak Mar 28, 2025
84d7bf3
Support Vector.Min/MinNative
kunalspathak Mar 28, 2025
2dff8b8
Support Vector.MinNumber/MaxNumber
kunalspathak Mar 28, 2025
58c872c
Support Vector.IsPositive/IsNegative/IsPositiveInfinity
kunalspathak Mar 29, 2025
d6d197d
Support Vector.get_Zero/One/AllBitsSet
kunalspathak Mar 29, 2025
ad47578
Support Vector.get_Indices/Sve.Index
kunalspathak Mar 29, 2025
fafee9a
Support Vector.Multiply
kunalspathak Mar 29, 2025
b475834
Support Vector.Subtract
kunalspathak Mar 29, 2025
37a78d7
Support Vector.Divide
kunalspathak Mar 29, 2025
e9eeca6
Support Vector.op_Xor
kunalspathak Mar 29, 2025
8e90959
Support Vector.op_OnesComplement/op_UnaryNegation/op_UnaryPlus
kunalspathak Mar 31, 2025
e00d016
Support Vector.MultiplyAddEstimate
kunalspathak Mar 31, 2025
f14f792
Support Vector.IsZero/IsNaN
kunalspathak Mar 31, 2025
e976b40
Support Vector.Floor
kunalspathak Mar 31, 2025
cb68fb9
Support Vector.FusedMultiplyAdd
kunalspathak Mar 31, 2025
fe633ed
Support Vector.Ceiling
kunalspathak Mar 31, 2025
2285a07
Support Vector.Round
kunalspathak Mar 31, 2025
9bdb3b9
Support Vector.LoadVector*
kunalspathak Mar 31, 2025
5c6392c
Support Vector.Store*
kunalspathak Mar 31, 2025
bf9991c
Support Vector.WidenLower/WidenUpper
kunalspathak Mar 31, 2025
a04d52b
Support Vector.Truncate
kunalspathak Mar 31, 2025
8376fc1
Support Vector.ConditionalSelect
kunalspathak Mar 31, 2025
1cebe09
Support Vector.Create/Add Sve_DuplicateScalarToVector
kunalspathak Apr 1, 2025
c626047
Support Vector.CreateSequence/Fix Sve_Index
kunalspathak Apr 1, 2025
62a2d9f
Support Vector.LeftShift/Add Sve_ShiftLeftLogicalImm
kunalspathak Apr 2, 2025
cd17e41
Support Vector.ShiftRightLogical/RightShift Add Sve.ShiftRight*Imm
kunalspathak Apr 3, 2025
f9567fd
Support Vector.ToScalar
kunalspathak Apr 3, 2025
9145170
Support Vector.Sum
kunalspathak Apr 3, 2025
4a76f71
build errors fix
kunalspathak Apr 4, 2025
a102b6f
Make GetScalableHWIntrinsicId() to all platforms to avoid #ifdef in c…
kunalspathak Apr 4, 2025
eead7d7
For unroll strategy, continue using 16B size
kunalspathak Apr 7, 2025
6d139ee
Fix some errors for Vector_opEquality
kunalspathak Apr 7, 2025
715a2c0
Disable optimizations for unroll/memcopy, etc.
kunalspathak Apr 8, 2025
b5d4460
Add comments in runtime where correct VectorT size should be reflected
kunalspathak Apr 8, 2025
15bb8a4
Fix bug for Vector.ConvertToDouble
kunalspathak Apr 8, 2025
9e99f27
Add jit-ee GetTargetVectorLength()
kunalspathak Apr 8, 2025
a9367ad
Use MinVectorLengthForSve()
kunalspathak Apr 10, 2025
9d9b20b
Fix correct type in LSRA
kunalspathak Apr 11, 2025
8d8ba75
Introduce for now FakeVectorLength environment variable
kunalspathak Apr 12, 2025
41c7629
Convert all checks to use varTypeIsSIMDVL()
kunalspathak Apr 12, 2025
6e6cc12
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak May 16, 2025
9cc2794
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak May 16, 2025
c03bb1c
wip
kunalspathak May 20, 2025
8afd32a
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak May 20, 2025
df8c7ab
gen.bat update
kunalspathak May 20, 2025
8ee5339
Refactor to UseSveFor*()
kunalspathak May 21, 2025
abd6e21
build failure
kunalspathak May 21, 2025
c212d25
more build failure fix
kunalspathak May 21, 2025
7b11beb
more build failure
kunalspathak May 22, 2025
5dcd5e9
Handle vector length in methodtablebuilder
kunalspathak May 22, 2025
c6c6671
simplify the logic of UseSveForVectorT
kunalspathak May 23, 2025
a4d5a9b
minor cleanup
kunalspathak May 23, 2025
e5f308f
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak May 25, 2025
c2e5c23
jit format
kunalspathak May 25, 2025
decd987
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak May 27, 2025
be418ae
resolve merge conflict
kunalspathak May 27, 2025
1a33102
Do some tracking of simdType
kunalspathak May 28, 2025
a5889f6
Remove constraint of vector being only 16 bytes
kunalspathak May 28, 2025
f97a198
TEMP: Enable SVE for 16B as well
kunalspathak May 28, 2025
897f474
fix bugs for using TYP_SIMD16 for SVE
kunalspathak May 28, 2025
63a31fb
fix bug for str/ldr using reserved register
kunalspathak May 29, 2025
05cfde4
Support to generate SVE for 16B too - use isScalable
kunalspathak Jun 10, 2025
a8020fa
Handle Multiply and MultiplyByScalar
kunalspathak Jun 11, 2025
f6e82cf
REVERT: Enable SVE for VectorT (for testing)
kunalspathak Jun 11, 2025
784bc9d
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 11, 2025
5df58bf
merge conflict errors
kunalspathak Jun 11, 2025
1e97247
fix build errors after merge
kunalspathak Jun 11, 2025
355856d
fix linux build error
kunalspathak Jun 11, 2025
dd0d483
fix the Xor for float/double
kunalspathak Jun 11, 2025
182ba12
fix the typo for equality operator
kunalspathak Jun 12, 2025
a3e364d
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 12, 2025
7841140
another build error fix
kunalspathak Jun 12, 2025
10f2530
Fix the spilling of predicate registers
kunalspathak Jun 12, 2025
47b106f
Make sure to check if retNode is HWIntrinsic
kunalspathak Jun 12, 2025
b4ca14a
Add missing break
kunalspathak Jun 12, 2025
98f0e25
fix a typo for mapping zeroextend intrinsic
kunalspathak Jun 13, 2025
53f4c81
handle Vector.Equal() and similar APIs that return Vector<T> instead …
kunalspathak Jun 13, 2025
429e32d
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 14, 2025
7c26e21
fix merge conflict
kunalspathak Jun 14, 2025
ee9a4eb
fix the bad merge
kunalspathak Jun 14, 2025
dbbd311
add missing break
kunalspathak Jun 15, 2025
635148c
jit format
kunalspathak Jun 15, 2025
9e70dd0
Disable Vector's WidenUpper and WidenLower intrinsic
kunalspathak Jun 16, 2025
f482e65
Do not generate SVE if not supported
kunalspathak Jun 16, 2025
05b4d06
Changes from #116726
kunalspathak Jun 16, 2025
6a73a98
Handle cases for shift amount as Vector<T>
kunalspathak Jun 18, 2025
d523ee3
Fix Vector.ConditionalSelect
kunalspathak Jun 19, 2025
98fa9f3
Fix Multiple Vector<T> * T case
kunalspathak Jun 19, 2025
9074461
Add entry for VectorMath test in ISA
kunalspathak Jun 19, 2025
bcb7bee
Fix CreateSequence for float/double
kunalspathak Jun 19, 2025
f151c64
MUL with DuplicateScalarToVector
kunalspathak Jun 19, 2025
838ce58
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 19, 2025
39374e3
fix merge conflict errors
kunalspathak Jun 19, 2025
324d241
Fix the value numbering
kunalspathak Jun 20, 2025
fc24657
disable Sve when it is not available
kunalspathak Jun 20, 2025
a997047
jit format
kunalspathak Jun 20, 2025
f10bb0b
fix the cmpOpNode return to TYP_MASK
kunalspathak Jun 24, 2025
303d7ce
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 24, 2025
49a536a
fix merge conflict errors
kunalspathak Jun 24, 2025
8368b81
Merge remote-tracking branch 'origin/main' into variable-vl-3
kunalspathak Jun 24, 2025
61ed25f
fix merge conflicts
kunalspathak Jun 24, 2025
7f88033
fix parameter ordering because of bad merge conflict resolution
kunalspathak Jun 25, 2025
2 changes: 2 additions & 0 deletions src/coreclr/inc/clrconfigvalues.h
@@ -285,6 +285,8 @@ CONFIG_DWORD_INFO(INTERNAL_GCUseGlobalAllocationContext, W("GCUseGlobalAllocatio
///
CONFIG_DWORD_INFO(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_JitDebuggable, W("JitDebuggable"), 0, "If set, suppress JIT optimizations that make debugging code difficult")
CONFIG_DWORD_INFO(INTERNAL_UseSveForVectorT, W("UseSveForVectorT"), 1, "Prefer SVE instructions for VectorT")

#if !defined(DEBUG) && !defined(_DEBUG)
#define INTERNAL_JitEnableNoWayAssert_Default 0
#else
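The new UseSveForVectorT knob above follows the standard CLR config pattern. As a hedged illustration of how it might be consumed on the VM side (the helper below is hypothetical, though CLRConfig::GetConfigValue is the usual accessor for CONFIG_DWORD_INFO entries):

#include "clrconfig.h"

// Sketch only: query the UseSveForVectorT switch (defined above with default 1).
// This helper is an assumption for illustration, not code from this PR.
static bool ShouldUseSveForVectorT()
{
    return CLRConfig::GetConfigValue(CLRConfig::INTERNAL_UseSveForVectorT) != 0;
}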
2 changes: 2 additions & 0 deletions src/coreclr/inc/corhdr.h
@@ -1754,6 +1754,8 @@ typedef enum CorInfoHFAElemType : unsigned {
CORINFO_HFA_ELEM_DOUBLE,
CORINFO_HFA_ELEM_VECTOR64,
CORINFO_HFA_ELEM_VECTOR128,
CORINFO_HFA_ELEM_VECTOR256,
CORINFO_HFA_ELEM_VECTOR512,
Member comment on lines +1757 to +1758:

These seem unnecessary and/or inappropriate to add.

It feels like there should be a single CORINFO_HFA_ELEM_VECTOR instead, to indicate explicitly that this is a variable-length vector, as the SVE ABI may have differing conventions/considerations and we want to identify it as such.
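A minimal sketch of the shape this suggestion implies (the single enumerator and its comment are assumptions, not part of this PR):

typedef enum CorInfoHFAElemType : unsigned {
    CORINFO_HFA_ELEM_NONE,
    CORINFO_HFA_ELEM_FLOAT,
    CORINFO_HFA_ELEM_DOUBLE,
    CORINFO_HFA_ELEM_VECTOR64,
    CORINFO_HFA_ELEM_VECTOR128,
    // Hypothetical: one enumerator for variable-length (SVE) vectors, so the
    // SVE ABI's differing conventions can be identified explicitly.
    CORINFO_HFA_ELEM_VECTOR,
} CorInfoHFAElemType;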

} CorInfoHFAElemType;

//
41 changes: 23 additions & 18 deletions src/coreclr/inc/corinfoinstructionset.h
@@ -25,24 +25,25 @@ enum CORINFO_InstructionSet
InstructionSet_Sha1=7,
InstructionSet_Sha256=8,
InstructionSet_Atomics=9,
InstructionSet_Vector64=10,
InstructionSet_Vector128=11,
InstructionSet_Dczva=12,
InstructionSet_Rcpc=13,
InstructionSet_VectorT128=14,
InstructionSet_Rcpc2=15,
InstructionSet_Sve=16,
InstructionSet_Sve2=17,
InstructionSet_ArmBase_Arm64=18,
InstructionSet_AdvSimd_Arm64=19,
InstructionSet_Aes_Arm64=20,
InstructionSet_Crc32_Arm64=21,
InstructionSet_Dp_Arm64=22,
InstructionSet_Rdm_Arm64=23,
InstructionSet_Sha1_Arm64=24,
InstructionSet_Sha256_Arm64=25,
InstructionSet_Sve_Arm64=26,
InstructionSet_Sve2_Arm64=27,
InstructionSet_Vector=10,
Member comment:

Do we need this? Can't it just be InstructionSet_Sve instead?

Member (Author) reply:

We do map them to SVE intrinsics in impSpecialIntrinsic. The problem is that when we import VectorT.Add, for example, we do a lookup and need an intrinsic that tracks that we are operating on a size-agnostic entity, which is different from Vector128.Add. Today we would just map VectorT -> Vector128, and the information about whether we are operating on a size-agnostic or a fixed 16B vector is lost while creating the GenTree nodes. I am planning to map the VectorT methods to their own corresponding VectorT intrinsics and, in impSpecialIntrinsic, map those to the SVE equivalents.
Lots of methods map one-to-one with an SVE equivalent, but a few, like GetElement, MultiplyByScalar, and ToScalar, need more than one API operation to achieve the result. So after the importer, we should technically not see any NI_Vector* intrinsic nodes.
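A rough sketch of the importer-time mapping described here (the helper and the NI_Vector_* names follow the existing NI_* convention but are assumptions for illustration):

// Sketch only: map size-agnostic Vector<T> intrinsics to SVE equivalents during
// import, so no NI_Vector_* node survives past the importer.
static NamedIntrinsic MapVectorToSveIntrinsic(NamedIntrinsic id)
{
    switch (id)
    {
        case NI_Vector_Add:
            return NI_Sve_Add;      // direct one-to-one mapping
        case NI_Vector_Subtract:
            return NI_Sve_Subtract; // direct one-to-one mapping
        default:
            // GetElement, MultiplyByScalar, ToScalar, etc. need more than one
            // SVE operation and would be expanded separately.
            return NI_Illegal;
    }
}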

InstructionSet_Vector64=11,
InstructionSet_Vector128=12,
InstructionSet_Dczva=13,
InstructionSet_Rcpc=14,
InstructionSet_VectorT128=15,
InstructionSet_Rcpc2=16,
InstructionSet_Sve=17,
InstructionSet_Sve2=18,
InstructionSet_ArmBase_Arm64=19,
InstructionSet_AdvSimd_Arm64=20,
InstructionSet_Aes_Arm64=21,
InstructionSet_Crc32_Arm64=22,
InstructionSet_Dp_Arm64=23,
InstructionSet_Rdm_Arm64=24,
InstructionSet_Sha1_Arm64=25,
InstructionSet_Sha256_Arm64=26,
InstructionSet_Sve_Arm64=27,
InstructionSet_Sve2_Arm64=28,
#endif // TARGET_ARM64
#ifdef TARGET_RISCV64
InstructionSet_RiscV64Base=1,
@@ -379,6 +380,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_Sve);
if (resultflags.HasInstructionSet(InstructionSet_Sve2) && !resultflags.HasInstructionSet(InstructionSet_Sve))
resultflags.RemoveInstructionSet(InstructionSet_Sve2);
if (resultflags.HasInstructionSet(InstructionSet_Vector) && !resultflags.HasInstructionSet(InstructionSet_Sve))
resultflags.RemoveInstructionSet(InstructionSet_Vector);
#endif // TARGET_ARM64
#ifdef TARGET_RISCV64
if (resultflags.HasInstructionSet(InstructionSet_Zbb) && !resultflags.HasInstructionSet(InstructionSet_RiscV64Base))
@@ -627,6 +630,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "Sha256_Arm64";
case InstructionSet_Atomics :
return "Atomics";
case InstructionSet_Vector :
return "Vector";
case InstructionSet_Vector64 :
return "Vector64";
case InstructionSet_Vector128 :
2 changes: 2 additions & 0 deletions src/coreclr/inc/corjit.h
@@ -438,6 +438,8 @@ class ICorJitInfo : public ICorDynamicInfo
//
virtual uint32_t getExpectedTargetArchitecture() = 0;

virtual uint32_t getTargetVectorLength() = 0;

// Fetches extended flags for a particular compilation instance. Returns
// the number of bytes written to the provided buffer.
virtual uint32_t getJitFlags(
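The VM side has to answer getTargetVectorLength() somehow; one plausible implementation on Linux/arm64 queries the kernel's SVE state. A sketch under that assumption (not code from this PR):

#include <stdint.h>
#include <sys/prctl.h>

// Sketch only: report the target vector length in bytes, falling back to the
// fixed 16-byte NEON width when SVE is unavailable.
static uint32_t GetTargetVectorLengthInBytes()
{
    int ret = prctl(PR_SVE_GET_VL);
    if (ret < 0)
    {
        return 16; // no SVE support: Vector<T> stays 128-bit
    }
    return (uint32_t)(ret & PR_SVE_VL_LEN_MASK); // low bits hold VL in bytes
}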
2 changes: 2 additions & 0 deletions src/coreclr/inc/icorjitinfoimpl_generated.h
@@ -744,6 +744,8 @@ uint16_t getRelocTypeHint(

uint32_t getExpectedTargetArchitecture() override;

uint32_t getTargetVectorLength() override;

uint32_t getJitFlags(
CORJIT_FLAGS* flags,
uint32_t sizeInBytes) override;
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
@@ -37,11 +37,11 @@

#include <minipal/guid.h>

constexpr GUID JITEEVersionIdentifier = { /* 7a77e6d9-7280-439d-bb9d-9887b4516a86 */
0x7a77e6d9,
0x7280,
0x439d,
{0xbb, 0x9d, 0x98, 0x87, 0xb4, 0x51, 0x6a, 0x86}
constexpr GUID JITEEVersionIdentifier = { /* 49287d16-74bd-42e9-9d47-132d7a5f67eb */
0x49287d16,
0x74bd,
0x42e9,
{0x9d, 0x47, 0x13, 0x2d, 0x7a, 0x5f, 0x67, 0xeb}
};

#endif // JIT_EE_VERSIONING_GUID_H
1 change: 1 addition & 0 deletions src/coreclr/jit/ICorJitInfo_names_generated.h
@@ -180,6 +180,7 @@ DEF_CLR_API(recordCallSite)
DEF_CLR_API(recordRelocation)
DEF_CLR_API(getRelocTypeHint)
DEF_CLR_API(getExpectedTargetArchitecture)
DEF_CLR_API(getTargetVectorLength)
DEF_CLR_API(getJitFlags)
DEF_CLR_API(getSpecialCopyHelper)

8 changes: 8 additions & 0 deletions src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp
@@ -1743,6 +1743,14 @@ uint32_t WrapICorJitInfo::getExpectedTargetArchitecture()
return temp;
}

uint32_t WrapICorJitInfo::getTargetVectorLength()
{
API_ENTER(getTargetVectorLength);
uint32_t temp = wrapHnd->getTargetVectorLength();
API_LEAVE(getTargetVectorLength);
return temp;
}

uint32_t WrapICorJitInfo::getJitFlags(
CORJIT_FLAGS* flags,
uint32_t sizeInBytes)
10 changes: 9 additions & 1 deletion src/coreclr/jit/abi.cpp
@@ -123,7 +123,15 @@ var_types ABIPassingSegment::GetRegisterType() const
#ifdef FEATURE_SIMD
case 16:
return TYP_SIMD16;
#endif
#ifdef TARGET_ARM64
case 32:
assert(Compiler::SizeMatchesVectorTLength(Size));
return TYP_SIMD32;
case 64:
assert(Compiler::SizeMatchesVectorTLength(Size));
return TYP_SIMD64;
#endif // TARGET_ARM64
#endif // FEATURE_SIMD
default:
assert(!"Unexpected size for floating point register");
return TYP_UNDEF;
7 changes: 4 additions & 3 deletions src/coreclr/jit/assertionprop.cpp
@@ -284,6 +284,8 @@ bool IntegralRange::Contains(int64_t value) const
// Example: IntCns = 42 gives [0..127] with a non -precise range, [42,42] with a precise range.
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::ByteMax};
#elif defined(TARGET_ARM64)
case NI_Vector_op_Equality:
case NI_Vector_op_Inequality:
case NI_Vector64_op_Equality:
case NI_Vector64_op_Inequality:
case NI_Vector128_op_Equality:
@@ -2983,8 +2985,7 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G
conValTree = vecCon;
break;
}

#if defined(TARGET_XARCH)
#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
case TYP_SIMD32:
{
simd32_t value = vnStore->ConstantValue<simd32_t>(vnCns);
@@ -3008,7 +3009,7 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G
}
break;

#endif // TARGET_XARCH
#endif // TARGET_XARCH || TARGET_ARM64
#endif // FEATURE_SIMD

#if defined(FEATURE_MASKED_HW_INTRINSICS)
147 changes: 138 additions & 9 deletions src/coreclr/jit/codegenarm64.cpp
@@ -2280,6 +2280,9 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
{
// We ignore any differences between SIMD12 and SIMD16 here if we can broadcast the value
// via mvni/movi.
// Also, even if UseSveForVectorT == true, we will continue loading values into V* registers
// instead of Z* registers, because their sizes are the same when VL == 16.

const bool is8 = tree->TypeIs(TYP_SIMD8);
if (vecCon->IsAllBitsSet())
{
Expand All @@ -2298,12 +2301,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
emit->emitIns_R_I(INS_movi, attr, targetReg, val.i32[0], is8 ? INS_OPTS_2S : INS_OPTS_4S);
}
else if (ElementsAreSame(val.i16, is8 ? 4 : 8) &&
emitter::emitIns_valid_imm_for_movi(val.i16[0], EA_2BYTE))
{
emit->emitIns_R_I(INS_movi, attr, targetReg, val.i16[0], is8 ? INS_OPTS_4H : INS_OPTS_8H);
}
else if (ElementsAreSame(val.i8, is8 ? 8 : 16) &&
emitter::emitIns_valid_imm_for_movi(val.i8[0], EA_1BYTE))
{
emit->emitIns_R_I(INS_movi, attr, targetReg, val.i8[0], is8 ? INS_OPTS_8B : INS_OPTS_16B);
}
@@ -2329,6 +2332,92 @@
}
break;
}
case TYP_SIMD32:
{
// Use scalable registers
if (vecCon->IsAllBitsSet())
{
// Use Scalable_B because for Ones, it doesn't matter.
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, -1, INS_OPTS_SCALABLE_B);
}
else if (vecCon->IsZero())
{
// Use Scalable_B because for Zero, it doesn't matter.
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, 0, INS_OPTS_SCALABLE_B);
}
else
{
simd32_t val = vecCon->gtSimd32Val;
if (ElementsAreSame(val.i8, 32))
{
emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i8[0], INS_OPTS_SCALABLE_B);
}
else if (ElementsAreSame(val.i16, 16))
{
emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i16[0], INS_OPTS_SCALABLE_H);
}
else if (ElementsAreSame(val.i32, 8))
{
emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i32[0], INS_OPTS_SCALABLE_S);
}
else
{
// Get a temp integer register to compute long address.
regNumber addrReg = internalRegisters.GetSingle(tree);
CORINFO_FIELD_HANDLE hnd;
hnd = emit->emitSimdConst(&vecCon->gtSimdVal, emitTypeSize(tree->TypeGet()));
emit->emitIns_R_C(INS_sve_ldr, attr, targetReg, addrReg, hnd, 0);
// emit->emitIns_R_C(INS_adr, EA_8BYTE, addrReg, REG_NA, hnd, 0);
// emit->emitIns_R_R_R_I(INS_sve_ld1b, EA_SCALABLE, targetReg, REG_P1, addrReg, 0,
// INS_OPTS_SCALABLE_B);
}
}
break;
}
case TYP_SIMD64:
{
// Use scalable registers
if (vecCon->IsAllBitsSet())
{
// Use Scalable_B because for Ones, it doesn't matter.
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, -1, INS_OPTS_SCALABLE_B);
}
else if (vecCon->IsZero())
{
// Use Scalable_B because for Zero, it doesn't matter.
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, 0, INS_OPTS_SCALABLE_B);
}
else
{
simd64_t val = vecCon->gtSimd64Val;
if (ElementsAreSame(val.i32, 16) && emitter::isValidSimm_MultipleOf<8, 256>(val.i32[0]))
{
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i32[0], INS_OPTS_SCALABLE_S,
INS_SCALABLE_OPTS_IMM_BITMASK);
}
else if (ElementsAreSame(val.i16, 32) && emitter::isValidSimm_MultipleOf<8, 256>(val.i16[0]))
{
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i16[0], INS_OPTS_SCALABLE_H,
INS_SCALABLE_OPTS_IMM_BITMASK);
}
else if (ElementsAreSame(val.i8, 64) && emitter::isValidSimm<8>(val.i8[0]))
{
emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i8[0], INS_OPTS_SCALABLE_B,
INS_SCALABLE_OPTS_IMM_BITMASK);
}
else
{
// Get a temp integer register to compute long address.
regNumber addrReg = internalRegisters.GetSingle(tree);
CORINFO_FIELD_HANDLE hnd;
simd64_t constValue;
memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t));
hnd = emit->emitSimdConst(&vecCon->gtSimdVal, emitTypeSize(tree->TypeGet()));
emit->emitIns_R_C(INS_sve_ldr, attr, targetReg, addrReg, hnd, 0);
}
}
break;
}

default:
{
@@ -2955,7 +3044,18 @@
}
}
emitAttr attr = emitActualTypeSize(targetType);
GetEmitter()->emitIns_Mov(INS_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired);
bool isScalable = (attr == EA_SCALABLE) || (Compiler::UseSveForType(targetType));

if (isScalable)
{
// TODO-VL: Should we check the baseType, or does it not matter because this is just a reg->reg move?
GetEmitter()->emitIns_Mov(INS_sve_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired,
INS_OPTS_SCALABLE_Q);
}
else
{
GetEmitter()->emitIns_Mov(INS_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired);
}
}

/***********************************************************************************************
@@ -5247,14 +5347,28 @@

GenTreeLclVar* lclNode = op1->AsLclVar();
LclVarDsc* varDsc = compiler->lvaGetDesc(lclNode);
assert(emitTypeSize(varDsc->GetRegisterType(lclNode)) == 16);

regNumber tgtReg = node->GetRegNum();
assert(tgtReg != REG_NA);
unsigned varSize = emitTypeSize(varDsc->GetRegisterType(lclNode));
assert((varSize == 16) || (Compiler::SizeMatchesVectorTLength(varSize)));

regNumber op1Reg = genConsumeReg(op1);
assert(op1Reg != REG_NA);

regNumber tgtReg = node->GetRegNum();
#ifdef TARGET_ARM64
// TODO-VL: Write a helper to do this check for LclVars*, GenTree*, etc.
if (Compiler::UseStrictSveForType(op1->TypeGet()))
{
// Until we have a custom ABI for SVE, we just store the entire contents of the Z*
// registers on the stack. If we don't, we will need multiple free registers to save
// the contents of everything but the lower 8 bytes.
assert(tgtReg == REG_NA);

GetEmitter()->emitIns_S_R(INS_sve_str, EA_SCALABLE, op1Reg, lclNode->GetLclNum(), 0);
return;
}
#endif // TARGET_ARM64
assert(tgtReg != REG_NA);

GetEmitter()->emitIns_R_R_I_I(INS_mov, EA_8BYTE, tgtReg, op1Reg, 0, 1);

if ((node->gtFlags & GTF_SPILL) != 0)
@@ -5303,10 +5417,12 @@

GenTreeLclVar* lclNode = op1->AsLclVar();
LclVarDsc* varDsc = compiler->lvaGetDesc(lclNode);
assert(emitTypeSize(varDsc->GetRegisterType(lclNode)) == 16);

unsigned varSize = emitTypeSize(varDsc->GetRegisterType(lclNode));
assert((varSize == 16) || (Compiler::SizeMatchesVectorTLength(varSize)));

regNumber srcReg = node->GetRegNum();
assert(srcReg != REG_NA);
assert((srcReg != REG_NA) || (Compiler::UseStrictSveForType(node->TypeGet())));

regNumber lclVarReg = genConsumeReg(lclNode);
assert(lclVarReg != REG_NA);
@@ -5318,6 +5434,19 @@
// The localVar must have a stack home.
assert(varDsc->lvOnFrame);

#ifdef TARGET_ARM64
// TODO-VL: Write a helper to do this check for LclVars*, GenTree*, etc.
if (Compiler::UseStrictSveForType(op1->TypeGet()))
{
// Until we have a custom ABI for SVE, the entire contents of the Z* registers are
// stored on the stack, so just reload them from there. Otherwise we would need
// multiple free registers to restore everything but the lower 8 bytes.

GetEmitter()->emitIns_R_S(INS_sve_ldr, EA_SCALABLE, lclVarReg, varNum, 0);
return;
}
#endif // TARGET_ARM64

// We will load this from the upper 8 bytes of this localVar's home.
int offset = 8;
