8358032: Use crypto pmull for CRC32(C) on Ampere CPU and improve for short inputs #25609

Open: wants to merge 5 commits into master
Changes from 3 commits
4 changes: 4 additions & 0 deletions src/hotspot/cpu/aarch64/globals_aarch64.hpp
@@ -89,6 +89,10 @@ define_pd_global(intx, InlineSmallCode, 1000);
"Use CRC32 instructions for CRC32 computation") \
product(bool, UseCryptoPmullForCRC32, false, \
"Use Crypto PMULL instructions for CRC32 computation") \
product(intx, CryptoPmullForCRC32LowLimit, 256, DIAGNOSTIC, \
"Minimum size in bytes when Crypto PMULL will be used." \
"Value must be a multiple of 128.") \
range(256, max_jint) \
Contributor:
Is it sane to have negative values? If not, use uintx... or maybe even just uint?
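
A minimal sketch of that suggestion, assuming the flag keeps the same product()/range() macro shape used elsewhere in globals_aarch64.hpp; the choice of uint here (rather than uintx) is hypothetical:

```cpp
// Hypothetical unsigned variant of the new flag: same default and range
// as the proposed intx version, but negative values are ruled out by type.
product(uint, CryptoPmullForCRC32LowLimit, 256, DIAGNOSTIC,           \
        "Minimum size in bytes when Crypto PMULL will be used. "      \
        "Value must be a multiple of 128.")                           \
        range(256, max_jint)                                          \
```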

product(bool, UseSIMDForMemoryOps, false, \
"Use SIMD instructions in generated memory move code") \
product(bool, UseSIMDForArrayEquals, true, \
21 changes: 10 additions & 11 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -4332,7 +4332,7 @@ void MacroAssembler::kernel_crc32_using_crypto_pmull(Register crc, Register buf,
Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);

subs(tmp0, len, 384);
subs(tmp0, len, CryptoPmullForCRC32LowLimit);
Contributor:
Would it make sense to have another alignment sanity check here? It would both make sure nobody later breaks your assumption and help the reader see the 128-byte alignment immediately.

Member Author:
I think the alignment does not affect the correctness here, but it should be >= 256. So I added the corresponding assertion above.
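
For illustration, the guard being described might look roughly like this (a sketch only; the exact wording and placement of the assertion in the patch may differ):

```cpp
// Sanity check that the configured limit is at least 256 bytes, so the
// PMULL path is never entered with less data than it assumes.
assert(CryptoPmullForCRC32LowLimit >= 256,
       "CryptoPmullForCRC32LowLimit must be at least 256");
```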

mvnw(crc, crc);
br(Assembler::GE, CRC_by128_pre);
BIND(CRC_less128);
@@ -4346,13 +4346,13 @@ void MacroAssembler::kernel_crc32_using_crypto_pmull(Register crc, Register buf,
b(L_exit);

BIND(CRC_by32_loop);
ldp(tmp0, tmp1, Address(buf));
ldp(tmp0, tmp1, Address(post(buf, 16)));
subs(len, len, 32);
crc32x(crc, crc, tmp0);
ldp(tmp2, tmp3, Address(buf, 16));
ldr(tmp2, Address(post(buf, 8)));
crc32x(crc, crc, tmp1);
add(buf, buf, 32);
ldr(tmp3, Address(post(buf, 8)));
crc32x(crc, crc, tmp2);
subs(len, len, 32);
Contributor:
What is the point of these changes?

Contributor:
To be more precise: converting these adjustments to post-increment operations isn't obviously an improvement on AArch64 generally. How does it help?

Member Author:
According to perf, post-increment ops help to reduce the access to TLB on Ampere1 in this case.
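
For readers comparing the two styles, a rough sketch of the difference using the MacroAssembler calls from this file (the AArch64 instructions in the comments use illustrative register names, not the actual temp-register bindings):

```cpp
// Old style: plain base/offset addressing plus one explicit pointer bump.
ldp(tmp0, tmp1, Address(buf));            // ldp  x0, x1, [buf]
ldp(tmp2, tmp3, Address(buf, 16));        // ldp  x2, x3, [buf, #16]
add(buf, buf, 32);                        // add  buf, buf, #32

// New style: each load advances buf as a side effect (post-increment
// addressing), so no separate add is needed for the 32-byte step.
ldp(tmp0, tmp1, Address(post(buf, 16)));  // ldp  x0, x1, [buf], #16
ldr(tmp2, Address(post(buf, 8)));         // ldr  x2, [buf], #8
ldr(tmp3, Address(post(buf, 8)));         // ldr  x3, [buf], #8
```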

Contributor:
> According to perf, post-increment ops help to reduce the access to TLB on Ampere1 in this case.

Hmm, but it's code in a rather odd style in shared code. And from what I see, the intrinsic is only 22% of the runtime (for 128 bytes) anyway, and you're making the code larger. I certainly don't want to see this sort of thing proliferating in the intrinsics.

In general, it's up to CPU designers to make simple, straightforward code work well.

How important is this?

Contributor:
On the other hand this code already exists in CRC32C, so it's simply unifying the two routines. OK, I won't object.

Member Author:
> you're making the code larger.

I don't think this makes the code larger.

> How important is this?

As I mentioned in problem 1, this causes a regression (~-14%) on Ampere1 when handling 64 bytes. No obvious effects in other cases though.

> so it's simply unifying the two routines.

Yes.

crc32x(crc, crc, tmp3);
br(Assembler::GE, CRC_by32_loop);
cmn(len, (u1)32);
@@ -4697,7 +4697,7 @@ void MacroAssembler::kernel_crc32c_using_crypto_pmull(Register crc, Register buf
Label CRC_by4_loop, CRC_by1_loop, CRC_less128, CRC_by128_pre, CRC_by32_loop, CRC_less32, L_exit;
assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2);

subs(tmp0, len, 384);
subs(tmp0, len, CryptoPmullForCRC32LowLimit);
br(Assembler::GE, CRC_by128_pre);
BIND(CRC_less128);
subs(len, len, 32);
@@ -4710,14 +4710,13 @@ void MacroAssembler::kernel_crc32c_using_crypto_pmull(Register crc, Register buf
b(L_exit);

BIND(CRC_by32_loop);
ldp(tmp0, tmp1, Address(buf));
ldp(tmp0, tmp1, Address(post(buf, 16)));
subs(len, len, 32);
crc32cx(crc, crc, tmp0);
ldr(tmp2, Address(buf, 16));
ldr(tmp2, Address(post(buf, 8)));
crc32cx(crc, crc, tmp1);
ldr(tmp3, Address(buf, 24));
ldr(tmp3, Address(post(buf, 8)));
crc32cx(crc, crc, tmp2);
add(buf, buf, 32);
subs(len, len, 32);
crc32cx(crc, crc, tmp3);
br(Assembler::GE, CRC_by32_loop);
cmn(len, (u1)32);
11 changes: 11 additions & 0 deletions src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -120,6 +120,11 @@ void VM_Version::initialize() {
ContendedPaddingWidth = dcache_line;
}

if (!(is_aligned(CryptoPmullForCRC32LowLimit, 128))) {
warning("CryptoPmullForCRC32LowLimit must be a multiple of 128");
CryptoPmullForCRC32LowLimit = align_down(CryptoPmullForCRC32LowLimit, 128);
}
Comment on lines +123 to +126
Contributor:
Can you describe somewhere why it has to be a multiple of 128? Imagine someone comes across this later, and wonders if that is just some strange implementation limitation or something more fundamental, or something very subtle.

Member Author:
There are 4 kinds of loops, labeled CRC_by128_loop, CRC_by32_loop, CRC_by4_loop and CRC_by1_loop. If the flag is 266 (which is 128x2+10), then for 265 bytes of input the first 256 bytes are handled by CRC_by32_loop, while for 266 bytes of input the corresponding 256 bytes are handled by CRC_by128_loop, and I think this causes an inconsistency. If CRC_by32_loop handles 256 bytes better than CRC_by128_loop on a platform, it should be used for 266 bytes as well.
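
A standalone sketch of that boundary effect (the limit value 266 and the byte accounting are illustrative, not code from the patch):

```cpp
#include <cstdio>
#include <initializer_list>

int main() {
  const int limit = 266;  // hypothetical setting that is not a multiple of 128
  for (int len : {265, 266}) {
    // len >= limit takes the CRC_by128 path, otherwise the CRC_by32 path.
    bool by128 = (len >= limit);
    int bulk = by128 ? (len / 128) * 128   // bytes covered by CRC_by128_loop
                     : (len / 32) * 32;    // bytes covered by CRC_by32_loop
    std::printf("len=%d -> %s handles %d bytes, smaller loops do the rest\n",
                len, by128 ? "CRC_by128_loop" : "CRC_by32_loop", bulk);
  }
  return 0;
}
```

Both inputs end up with the same 256 bytes of bulk work, but one extra input byte flips which loop processes them, which is the inconsistency described above.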


if (os::supports_map_sync()) {
// if dcpop is available publish data cache line flush size via
// generic field, otherwise let if default to zero thereby
@@ -148,6 +153,9 @@ void VM_Version::initialize() {
if (_cpu == CPU_AMPERE && ((_model == CPU_MODEL_AMPERE_1) ||
(_model == CPU_MODEL_AMPERE_1A) ||
(_model == CPU_MODEL_AMPERE_1B))) {
if (FLAG_IS_DEFAULT(UseCryptoPmullForCRC32)) {
FLAG_SET_DEFAULT(UseCryptoPmullForCRC32, true);
}
if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) {
FLAG_SET_DEFAULT(UseSIMDForMemoryOps, true);
}
@@ -265,6 +273,9 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(UseCryptoPmullForCRC32)) {
FLAG_SET_DEFAULT(UseCryptoPmullForCRC32, true);
}
if (FLAG_IS_DEFAULT(CryptoPmullForCRC32LowLimit)) {
FLAG_SET_DEFAULT(CryptoPmullForCRC32LowLimit, 384);
}
if (FLAG_IS_DEFAULT(CodeEntryAlignment)) {
FLAG_SET_DEFAULT(CodeEntryAlignment, 32);
}