SixLabors · JimBobSquarePants · Aug 14, 2024 · Aug 15, 2024 · Aug 15, 2024 · Aug 15, 2024
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -1097,4 +1097,83 @@ public static nuint Vector512Count<TVector>(this Span<float> span)
     public static nuint Vector512Count<TVector>(int length)
         where TVector : struct
         => (uint)length / (uint)Vector512<TVector>.Count;
+
+    /// <summary>
+    /// Normalizes the values in a given <see cref="Span{T}"/> by dividing each of them, in place,
+    /// by <paramref name="sum"/>.
+    /// </summary>
+    /// <param name="span">The sequence of <see cref="float"/> values to normalize.</param>
+    /// <param name="sum">The sum of the values in <paramref name="span"/>, used as the divisor.</param>
+    /// <remarks>
+    /// No check is made that <paramref name="sum"/> is non-zero.
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static void Normalize(Span<float> span, float sum)
+    {
+        int length = span.Length;
+        ref float current = ref MemoryMarshal.GetReference(span);
+
+        // One past the final element. Every accelerated path below leaves fewer than four values
+        // unprocessed, and the scalar loop at the bottom finishes whatever is left up to here.
+        ref float end = ref Unsafe.Add(ref current, length);
+
+        if (Vector512.IsHardwareAccelerated)
+        {
+            Vector512<float> divisor512 = Vector512.Create(sum);
+            ref float end512 = ref Unsafe.Add(ref current, length & ~15);
+
+            // Whole groups of 16 values.
+            while (Unsafe.IsAddressLessThan(ref current, ref end512))
+            {
+                ref Vector512<float> lanes = ref Unsafe.As<float, Vector512<float>>(ref current);
+                lanes /= divisor512;
+                current = ref Unsafe.Add(ref current, (nuint)16);
+            }
+
+            // Fewer than 16 values remain; take one group of 8 if it fits.
+            if ((length & 15) >= 8)
+            {
+                ref Vector256<float> lanes = ref Unsafe.As<float, Vector256<float>>(ref current);
+                lanes /= divisor512.GetLower();
+                current = ref Unsafe.Add(ref current, (nuint)8);
+            }
+
+            // Then one group of 4 if it fits.
+            if ((length & 7) >= 4)
+            {
+                ref Vector128<float> lanes = ref Unsafe.As<float, Vector128<float>>(ref current);
+                lanes /= divisor512.GetLower().GetLower();
+                current = ref Unsafe.Add(ref current, (nuint)4);
+            }
+        }
+        else if (Vector256.IsHardwareAccelerated)
+        {
+            Vector256<float> divisor256 = Vector256.Create(sum);
+            ref float end256 = ref Unsafe.Add(ref current, length & ~7);
+
+            // Whole groups of 8 values.
+            while (Unsafe.IsAddressLessThan(ref current, ref end256))
+            {
+                ref Vector256<float> lanes = ref Unsafe.As<float, Vector256<float>>(ref current);
+                lanes /= divisor256;
+                current = ref Unsafe.Add(ref current, (nuint)8);
+            }
+
+            // Fewer than 8 values remain; take one group of 4 if it fits.
+            if ((length & 7) >= 4)
+            {
+                ref Vector128<float> lanes = ref Unsafe.As<float, Vector128<float>>(ref current);
+                lanes /= divisor256.GetLower();
+                current = ref Unsafe.Add(ref current, (nuint)4);
+            }
+        }
+
+        // Scalar tail: the (length & 3) leftovers of an accelerated path, or the whole span
+        // when neither vector width is hardware accelerated.
+        while (Unsafe.IsAddressLessThan(ref current, ref end))
+        {
+            current /= sum;
+            current = ref Unsafe.Add(ref current, (nuint)1);
+        }
+    }
 }
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -245,6 +245,44 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128
         return default;
     }
 
+    /// <summary>
+    /// Computes the element-wise multiply-add <c>(a * b) + c</c> of three vectors: each element of the
+    /// result is the product of the corresponding elements of <paramref name="a"/> and
+    /// <paramref name="b"/> plus the corresponding element of <paramref name="c"/>.
+    /// When the CPU supports the FMA (Fused Multiply-Add) instruction set the operation is issued as a
+    /// single fused operation; otherwise the multiplication and the addition are performed separately.
+    /// </summary>
+    /// <param name="a">The vector holding the first factor of each product.</param>
+    /// <param name="b">The vector holding the second factor of each product.</param>
+    /// <param name="c">The vector holding the values that are added to the products of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector128{Single}"/> in which every element is the product of the corresponding elements
+    /// of <paramref name="a"/> and <paramref name="b"/> plus the corresponding element of <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// The fused path calls
+    /// <see cref="Fma.MultiplyAdd(Vector128{float}, Vector128{float}, Vector128{float})"/>. Fused and
+    /// non-fused evaluation handle floating-point rounding differently, so the two paths can produce
+    /// slightly different results for the same inputs.
+    /// <para>
+    /// Treat the result as an estimate. Where numerical accuracy is critical, do not rely on the value
+    /// being identical on hardware with and without FMA support; the difference is minor but it is
+    /// not guaranteed to be zero.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<float> MultiplyAddEstimate(Vector128<float> a, Vector128<float> b, Vector128<float> c)
+    {
+        if (!Fma.IsSupported)
+        {
+            // No FMA available: evaluate the product and the sum as separate operations.
+            return (a * b) + c;
+        }
+
+        return Fma.MultiplyAdd(a, b, c);
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -110,6 +110,44 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
         return Vector256.ConvertToInt32(val_2p23_f32 | sign);
     }
 
+    /// <summary>
+    /// Computes the element-wise multiply-add <c>(a * b) + c</c> of three vectors: each element of the
+    /// result is the product of the corresponding elements of <paramref name="a"/> and
+    /// <paramref name="b"/> plus the corresponding element of <paramref name="c"/>.
+    /// When the CPU supports the FMA (Fused Multiply-Add) instruction set the operation is issued as a
+    /// single fused operation; otherwise the multiplication and the addition are performed separately.
+    /// </summary>
+    /// <param name="a">The vector holding the first factor of each product.</param>
+    /// <param name="b">The vector holding the second factor of each product.</param>
+    /// <param name="c">The vector holding the values that are added to the products of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector256{Single}"/> in which every element is the product of the corresponding elements
+    /// of <paramref name="a"/> and <paramref name="b"/> plus the corresponding element of <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// The fused path calls
+    /// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>. Fused and
+    /// non-fused evaluation handle floating-point rounding differently, so the two paths can produce
+    /// slightly different results for the same inputs.
+    /// <para>
+    /// Treat the result as an estimate. Where numerical accuracy is critical, do not rely on the value
+    /// being identical on hardware with and without FMA support; the difference is minor but it is
+    /// not guaranteed to be zero.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplyAddEstimate(Vector256<float> a, Vector256<float> b, Vector256<float> c)
+    {
+        if (!Fma.IsSupported)
+        {
+            // No FMA available: evaluate the product and the sum as separate operations.
+            return (a * b) + c;
+        }
+
+        return Fma.MultiplyAdd(a, b, c);
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@@ -110,6 +110,39 @@ public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
         return Vector512.ConvertToInt32(val_2p23_f32 | sign);
     }
 
+    /// <summary>
+    /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
+    /// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
+    /// corresponding element in <paramref name="c"/>.
+    /// This overload always evaluates the product and the sum as two separate operations; it does not
+    /// issue explicit FMA (Fused Multiply-Add) intrinsics.
+    /// </summary>
+    /// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
+    /// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
+    /// <paramref name="a"/> and <paramref name="b"/>.</param>
+    /// <returns>
+    /// A <see cref="Vector512{Single}"/> where each element is the result of multiplying the corresponding elements
+    /// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
+    /// </returns>
+    /// <remarks>
+    /// FMA is deliberately not used here: going through
+    /// <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/> would require extracting
+    /// the upper and lower halves of every operand and recombining the results, which takes many more
+    /// instructions than a plain multiplication followed by an addition.
+    /// <para>
+    /// Because the multiplication and the addition are separate operations, the result might differ slightly
+    /// from that of a fused operation, particularly in cases where numerical accuracy
+    /// is critical.
+    /// </para>
+    /// </remarks>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector512<float> MultiplyAddEstimate(Vector512<float> a, Vector512<float> b, Vector512<float> c)
+
+        // Don't actually use FMA as it requires many more instructions to extract the
+        // upper and lower parts of the vector and then recombine them.
+        => (a * b) + c;
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }