Open
Description
Background and motivation
Intel x86/x64 provides MUL/IMUL instructions that compute the low and high bits of a multiplication in a single instruction.
This would be very useful for `Math.BigMul` implementations (these currently use intrinsics on ARM64 and MULX on x64, although the generated code quality (CQ) is poor at the moment).
This would also speed up `System.Decimal` calculations significantly. They currently can't use `Math.BigMul` because it is slower than the existing hand-tuned code, which composes big multiplications from smaller 32x32 multiplications.
API Proposal
namespace System.Runtime.Intrinsics.X86
{
partial class X86Base
{
partial class X64
{
/// <summary>
/// Proposed intrinsic: multiplies two unsigned 64-bit integers and returns both halves of the product.
/// </summary>
/// <param name="left">The first unsigned 64-bit operand.</param>
/// <param name="right">The second unsigned 64-bit operand.</param>
/// <returns>A tuple holding the low part (<c>Lower</c>) and the high part (<c>Upper</c>) of the product.</returns>
/// <remarks>NOTE(review): presumably backed by the x64 MUL instruction mentioned in the motivation; confirm.</remarks>
internal static (ulong Lower, ulong Upper) Multiply(ulong left, ulong right);
/// <summary>
/// Proposed intrinsic: multiplies two signed 64-bit integers and returns both halves of the product.
/// </summary>
/// <param name="left">The first signed 64-bit operand.</param>
/// <param name="right">The second signed 64-bit operand.</param>
/// <returns>
/// A tuple holding the low part (<c>Lower</c>, typed as unsigned) and the high part
/// (<c>Upper</c>, typed as signed) of the product.
/// </returns>
/// <remarks>NOTE(review): presumably backed by the x64 IMUL instruction mentioned in the motivation; confirm.</remarks>
internal static (ulong Lower, long Upper) Multiply(long left, long right);
}
}
}
Related DivMod API: #27292
API Usage
In `decimal.DecCalc.VarDecFromR8`, this would allow:
// Excerpt: scales the mantissa `mant` by a power of ten and stores the product in `result`.
// `mant`, `power`, `result` and the lookup tables are defined outside this excerpt.
// Add -power factors of 10, -power <= (29 - 15) = 14.
power = -power;
if (X86.X86Base.X64.IsSupported || Arm.ArmBase.Arm64.IsSupported)
{
// Path taken when either hardware check is true: a single 64x64 multiply.
// Math.BigMul returns the upper 64 bits of the product and writes the lower 64 bits to low64.
ulong low64;
ulong hi64 = Math.BigMul(mant, s_ulongPowers10[power], out low64);
// The upper half is stored into result.High as a uint below, so a value that
// does not fit in 32 bits is reported as an overflow instead of being truncated.
if (hi64 > uint.MaxValue)
Number.ThrowOverflowException(TypeCode.Decimal);
result.High = (uint)hi64;
result.Low64 = low64;
}
else if (power < 10)
{
// For power < 10 the multiplier is read from the uint table, so the product can be
// formed from two calls to UInt32x32To64 (defined elsewhere; by its name a
// 32x32 -> 64-bit multiply), one per 32-bit half of mant.
uint pow10 = s_powers10[power];
ulong low64 = UInt32x32To64((uint)mant, pow10);
ulong hi64 = UInt32x32To64((uint)(mant >> 32), pow10);
result.Low = (uint)low64;
// Fold the upper 32 bits of the low partial product into the high partial product.
hi64 += low64 >> 32;
result.Mid = (uint)hi64;
hi64 >>= 32;
result.High = (uint)hi64;
}
else
{
// Remaining case: helper defined elsewhere that receives both 64-bit operands and
// writes into result by reference (by its name a 64x64 -> 128-bit multiply).
UInt64x64To128(mant, s_ulongPowers10[power], ref result);
}
Another example, in `decimal.DecCalc.VarDecMul`, where this would only be faster on x64 (because on ARM64 `BigMul` is actually two expensive instructions):
// Excerpt: `tmp`, `tmp2`, `d1`, `d2`, `bufProd` and `hiProd` are defined before this point.
// Highest 32 bits is non-zero. Calculate 5 more partial products.
if (X86.X86Base.X64.IsSupported)
{
// x64 path: two BigMul calls replace the four 32x32 cross products of the fallback below.
// mid64 starts from the running value left in tmp by the code preceding this excerpt.
ulong mid64 = tmp;
// BigMul returns the upper 64 bits of the product (kept in tmp) and writes the lower 64 bits to tmp2.
tmp = Math.BigMul(d1.High, d2.Low64, out tmp2);
// The left operand is read before the compound assignment executes, so this compares the
// value before the add with the value after it; the sum wrapped exactly when old > new.
if (mid64 > (mid64 += tmp2)) // add with carry detection
tmp++;
// Second cross product: accumulate its upper half into tmp, its lower half into mid64.
tmp += Math.BigMul(d2.High, d1.Low64, out tmp2);
if (mid64 > (mid64 += tmp2)) // add with carry detection
tmp++;
bufProd.Mid64 = mid64;
}
else
{
// Fallback path built from 32x32 multiplies (UInt32x32To64 is defined elsewhere).
// tmp3 counts the carries out of the 64-bit additions into tmp.
tmp2 = UInt32x32To64(d1.Low, d2.High);
tmp += tmp2; // this could generate carry
uint tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;
tmp2 = UInt32x32To64(d1.High, d2.Low);
tmp += tmp2; // this could generate carry
bufProd.U2 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
// Seed the next column: the carry count becomes the upper 32 bits and the
// upper 32 bits of tmp become the lower 32 bits.
tmp2 = ((ulong)tmp3 << 32) | (tmp >> 32);
tmp = UInt32x32To64(d1.Mid, d2.High);
tmp += tmp2; // this could generate carry
tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;
tmp2 = UInt32x32To64(d1.High, d2.Mid);
tmp += tmp2; // this could generate carry
bufProd.U3 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
// Same recombination as above; tmp now carries into the final partial product.
tmp = ((ulong)tmp3 << 32) | (tmp >> 32);
}
// Both paths leave in tmp the amount to be added to the highest partial product.
bufProd.High64 = UInt32x32To64(d1.High, d2.High) + tmp;
// NOTE(review): presumably the index of the highest 32-bit word of bufProd in use; confirm against the full method.
hiProd = 5;