uxlfoundation · david-cortes-intel · May 26, 2025 · May 26, 2025 · May 27, 2025 · May 27, 2025
@@ -227,12 +227,7 @@ public:
                 /* Sum input array elements in case of non-normalized data */
                 for (DAAL_INT i = 0; i < nRows; i++)
                 {
-                    PRAGMA_FORCE_SIMD
-                    PRAGMA_VECTOR_ALWAYS
-                    for (DAAL_INT j = 0; j < _nFeatures; j++)
-                    {
-                        sumsPtr[j] += dataBlock[i * _nFeatures + j];
-                    }
+                    daal::internal::MathInst<algorithmFPType, cpu>::vAdd(_nFeatures, sumsPtr, dataBlock + i * _nFeatures, sumsPtr);
                 }
             }
         }
@@ -405,10 +400,7 @@ services::Status updateDenseCrossProductAndSums(bool isNormalized, size_t nFeatu
             {
                 return services::Status(services::ErrorMemoryAllocationFailed);
             }
-            for (size_t i = 0; i < nFeatures; i++)
-            {
-                sums[i] = resultSums[i];
-            }
+            daal::services::internal::daal_memcpy_s(sums, nFeatures * sizeof(algorithmFPType), resultSums, nFeatures * sizeof(algorithmFPType));
             for (size_t i = 0; i < nFeatures; i++)
             {
                 PRAGMA_FORCE_SIMD
@@ -421,10 +413,8 @@ services::Status updateDenseCrossProductAndSums(bool isNormalized, size_t nFeatu
         }
         else
         {
-            for (size_t i = 0; i < nFeatures * nFeatures; i++)
-            {
-                crossProduct[i] = resultCrossProduct[i];
-            }
+            daal::services::internal::daal_memcpy_s(crossProduct, nFeatures * nFeatures * sizeof(algorithmFPType), resultCrossProduct,
+                                                    nFeatures * nFeatures * sizeof(algorithmFPType));
         }
     }
     else
@@ -538,10 +528,7 @@ void mergeCrossProductAndSums(size_t nFeatures, const algorithmFPType * partialC
         nObservations[0] += partialNObservations[0];
 
         /* Merge sums */
-        for (size_t i = 0; i < nFeatures; i++)
-        {
-            sums[i] += partialSums[i];
-        }
+        daal::internal::MathInst<algorithmFPType, cpu>::vAdd(nFeatures, sums, partialSums, sums);
     }
 }
 
@@ -578,10 +565,7 @@ services::Status finalizeCovariance(size_t nFeatures, algorithmFPType nObservati
         DAAL_CHECK_MALLOC(diagInvSqrtsArray.get());
 
         algorithmFPType * diagInvSqrts = diagInvSqrtsArray.get();
-        for (size_t i = 0; i < nFeatures; i++)
-        {
-            diagInvSqrts[i] = 1.0 / daal::internal::MathInst<algorithmFPType, cpu>::sSqrt(crossProduct[i * nFeatures + i]);
-        }
+        daal::internal::MathInst<algorithmFPType, cpu>::vInvSqrtI(nFeatures, crossProduct, nFeatures + 1, diagInvSqrts, 1);
 
         for (size_t i = 0; i < nFeatures; i++)
         {

@@ -60,6 +60,8 @@ struct Math
 
     static fpType sCdfNormInv(fpType in) { return _impl<fpType, cpu>::sCdfNormInv(in); }
 
+    static void vAdd(SizeType n, const fpType * a, const fpType * b, fpType * y) { _impl<fpType, cpu>::vAdd(n, a, b, y); }
+
     static void vPowx(SizeType n, const fpType * in, fpType in1, fpType * out) { _impl<fpType, cpu>::vPowx(n, in, in1, out); }
 
     static void vPowxAsLnExp(SizeType n, const fpType * in, fpType in1, fpType * out)
@@ -86,6 +88,11 @@ struct Math
 
     static void vSqrt(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vSqrt(n, in, out); }
 
+    static void vInvSqrtI(SizeType n, const fpType * a, const SizeType inca, fpType * b, const SizeType incb)
+    {
+        _impl<fpType, cpu>::vInvSqrtI(n, a, inca, b, incb);
+    }
+
     static void vLog(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vLog(n, in, out); }
 
     static void vLog1p(SizeType n, const fpType * in, fpType * out) { _impl<fpType, cpu>::vLog1p(n, in, out); }

@@ -101,6 +101,8 @@ struct MklMath<double, cpu>
         return r;
     }
 
+    static void vAdd(SizeType n, const double * a, const double * b, double * y) { __DAAL_MKLFN_CALL_MATH(vdAdd, (n, a, b, y)); }
+
     static void vPowx(SizeType n, const double * in, double in1, double * out)
     {
         __DAAL_MKLFN_CALL_MATH(vmdPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
@@ -138,6 +140,11 @@ struct MklMath<double, cpu>
         __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
     }
 
+    static void vInvSqrtI(SizeType n, const double * a, const SizeType inca, double * b, const SizeType incb)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdInvSqrtI, (n, a, inca, b, incb, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
     static void vLog(SizeType n, const double * in, double * out)
     {
         __DAAL_MKLFN_CALL_MATH(vmdLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
@@ -213,6 +220,8 @@ struct MklMath<float, cpu>
         return r;
     }
 
+    static void vAdd(SizeType n, const float * a, const float * b, float * y) { __DAAL_MKLFN_CALL_MATH(vsAdd, (n, a, b, y)); }
+
     static void vPowx(SizeType n, const float * in, float in1, float * out)
     {
         __DAAL_MKLFN_CALL_MATH(vmsPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
@@ -250,6 +259,11 @@ struct MklMath<float, cpu>
         __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
     }
 
+    static void vInvSqrtI(SizeType n, const float * a, const SizeType inca, float * b, const SizeType incb)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsInvSqrtI, (n, a, inca, b, incb, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
+
     static void vLog(SizeType n, const float * in, float * out)
     {
         __DAAL_MKLFN_CALL_MATH(vmsLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));

@@ -70,6 +70,12 @@ struct RefMath<double, cpu>
     // Not implemented
     static double sCdfNormInv(double in) { return std::numeric_limits<double>::quiet_NaN(); }
 
+    static void vAdd(SizeType n, const double * a, const double * b, double * y)
+    {
+#pragma omp simd
+        for (SizeType i = 0; i < n; ++i) y[i] = a[i] + b[i];
+    }
+
     static void vPowx(SizeType n, const double * in, double in1, double * out)
     {
 #pragma omp simd
@@ -117,6 +123,12 @@ struct RefMath<double, cpu>
         for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]);
     }
 
+    static void vInvSqrtI(SizeType n, const double * a, const SizeType inca, double * b, const SizeType incb)
+    {
+#pragma omp simd
+        for (SizeType i = 0; i < n; ++i) b[i * incb] = 1.0 / sqrt(a[i * inca]);
+    }
+
     static void vLog(SizeType n, const double * in, double * out)
     {
 #pragma omp simd
@@ -167,6 +179,12 @@ struct RefMath<float, cpu>
     // Not implemented
     static float sCdfNormInv(float in) { return std::numeric_limits<float>::quiet_NaN(); }
 
+    static void vAdd(SizeType n, const float * a, const float * b, float * y)
+    {
+#pragma omp simd
+        for (SizeType i = 0; i < n; ++i) y[i] = a[i] + b[i];
+    }
+
     static void vPowx(SizeType n, const float * in, float in1, float * out)
     {
 #pragma omp simd
@@ -214,6 +232,12 @@ struct RefMath<float, cpu>
         for (SizeType i = 0; i < n; ++i) out[i] = sqrt(in[i]);
     }
 
+    static void vInvSqrtI(SizeType n, const float * a, const SizeType inca, float * b, const SizeType incb)
+    {
+#pragma omp simd
+        for (SizeType i = 0; i < n; ++i) b[i * incb] = 1.0f / sqrtf(a[i * inca]);
+    }
+
     static void vLog(SizeType n, const float * in, float * out)
     {
 #pragma omp simd