Fix missing vzeroupper in gcm_ghash_vpclmulqdq_avx2() for len=16
[Adapted by Brian Smith to *ring*'s prior modifications.]

gcm_ghash_vpclmulqdq_avx2() executed vzeroupper only when len >= 32, since it was supposed to use only xmm registers for len == 16. But it was actually writing to two ymm registers unconditionally, for $BSWAP_MASK and $GFPOLY. As a result, later code using legacy SSE instructions could be slowed down, in the (probably rare) case where gcm_ghash_vpclmulqdq_avx2() was called with len=16 *and* wasn't followed by a function that does vzeroupper, e.g. aes_gcm_enc_update_vaes_avx2().

(The Windows xmm register restore epilogue of gcm_ghash_vpclmulqdq_avx2() itself uses legacy SSE instructions, so it was probably slowed down by this, but that's just 8 instructions.)

Fix this by updating gcm_ghash_vpclmulqdq_avx2() to correctly use only xmm registers when len=16. This brings it more closely in line with the similar code in aes-gcm-avx10-x86_64.pl, which already handles this correctly.

Also, make both functions execute vzeroupper unconditionally, so that it won't be missed again. It costs only about 1 cycle on the CPUs this code runs on, and no longer seems worth executing conditionally.

Change-Id: I975cd5b526e5cdae1a567f4085c2484552bf6bea
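For context, a minimal sketch in C intrinsics of the hazard this fix addresses (not the perlasm in question; the function name and buffers here are hypothetical). Any write to a ymm register dirties its upper 128 bits, so a routine that touches ymm state must issue vzeroupper before legacy SSE code runs, or some CPUs pay an AVX-to-SSE transition penalty. Compile with -mavx on gcc/clang.

    #include <immintrin.h>
    #include <stdint.h>

    /* Hypothetical kernel illustrating the bug pattern: even if the
     * routine only means to use the xmm (low 128-bit) halves, a
     * 256-bit load writes a full ymm register and dirties its upper
     * half, just as the unconditional $BSWAP_MASK/$GFPOLY loads did. */
    static void ghash_like_kernel(uint8_t out[32], const uint8_t in[32]) {
        __m256i v = _mm256_loadu_si256((const __m256i *)in);
        _mm256_storeu_si256((__m256i *)out, v);

        /* Emit vzeroupper unconditionally, mirroring the fix: it costs
         * about 1 cycle and removes the length-dependent condition that
         * let the missing-vzeroupper case slip through. Without this,
         * any following legacy SSE code (such as a Windows xmm-restore
         * epilogue) can be slowed by the dirty ymm upper halves. */
        _mm256_zeroupper();
    }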