aes: Oxidize aes_nohw [un]compact_{block,word}.

briansmith · May 21, 2024 · e17d947 · e17d947
1 parent 640fa8f
commit e17d947
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 183 deletions.
diff --git a/build.rs b/build.rs
@@ -869,9 +869,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "aes_hw_ctr32_encrypt_blocks",
         "aes_hw_encrypt",
         "aes_hw_set_encrypt_key",
-        "aes_nohw_batch_get",
-        "aes_nohw_compact_block",
-        "aes_nohw_uncompact_block",
         "aes_nohw_sub_bytes",
         "aes_nohw_to_batch",
         "aes_nohw_mix_columns",

diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c
@@ -155,158 +155,6 @@ typedef struct {
   aes_word_t w[8];
 } AES_NOHW_BATCH;
 
-// aes_nohw_delta_swap returns |a| with bits |a & mask| and
-// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
-static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
-                                             aes_word_t shift) {
-  // See
-  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
-  aes_word_t b = (a ^ (a >> shift)) & mask;
-  return a ^ b ^ (b << shift);
-}
-
-// In the 32-bit and 64-bit implementations, a block spans multiple words.
-// |aes_nohw_compact_block| must permute bits across different words. First we
-// implement |aes_nohw_compact_word| which performs a smaller version of the
-// transformation which stays within a single word.
-//
-// These transformations are generalizations of the output of
-// http://programming.sirrida.de/calcperm.php on smaller inputs.
-#if defined(OPENSSL_64_BIT)
-static inline uint64_t aes_nohw_compact_word(uint64_t a) {
-#if defined(RING_BIG_ENDIAN)
-  a = CRYPTO_bswap8(a);
-#endif
-  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
-  // quartets of those chunks:
-  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
-  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
-  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
-  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
-  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
-  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
-  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
-  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
-  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
-  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
-  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
-  return a;
-}
-
-static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
-  // Reverse the steps of |aes_nohw_uncompact_word|.
-  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
-  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
-  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
-#if defined(RING_BIG_ENDIAN)
-  a = CRYPTO_bswap8(a);
-#endif
-  return a;
-}
-#else   // !OPENSSL_64_BIT
-static inline uint32_t aes_nohw_compact_word(uint32_t a) {
-#if defined(RING_BIG_ENDIAN)
-  a = CRYPTO_bswap4(a);
-#endif
-  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
-  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
-  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
-  // Note:  0x00cc = 0b0000_0000_1100_1100
-  //   0x00cc << 6 = 0b0011_0011_0000_0000
-  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
-  // Now we swap groups of four bits (still numbering by pairs):
-  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
-  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
-  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
-  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
-  return a;
-}
-
-static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
-  // Reverse the steps of |aes_nohw_uncompact_word|.
-  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
-  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
-#if defined(RING_BIG_ENDIAN)
-  a = CRYPTO_bswap4(a);
-#endif
-  return a;
-}
-
-static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
-                                                uint8_t a2, uint8_t a3) {
-  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
-         ((uint32_t)a3 << 24);
-}
-
-static inline uint8_t lo(uint32_t a) {
-  return (uint8_t)a;
-}
-
-#endif  // OPENSSL_64_BIT
-
-void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
-                                          const uint8_t in[16]) {
-  OPENSSL_memcpy(out, in, 16);
-#if defined(OPENSSL_64_BIT)
-  uint64_t a0 = aes_nohw_compact_word(out[0]);
-  uint64_t a1 = aes_nohw_compact_word(out[1]);
-  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
-  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
-#else
-  uint32_t a0 = aes_nohw_compact_word(out[0]);
-  uint32_t a1 = aes_nohw_compact_word(out[1]);
-  uint32_t a2 = aes_nohw_compact_word(out[2]);
-  uint32_t a3 = aes_nohw_compact_word(out[3]);
-  // Note clang, when building for ARM Thumb2, will sometimes miscompile
-  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
-  // without optimizations. This bug was introduced in
-  // https://reviews.llvm.org/rL340261 and fixed in
-  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
-  out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
-  out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
-  out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
-  out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
-#endif
-}
-
-void aes_nohw_uncompact_block(
-    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
-#if defined(OPENSSL_64_BIT)
-  uint64_t a0 = in[0];
-  uint64_t a1 = in[1];
-  uint64_t b0 =
-      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
-  uint64_t b1 =
-      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
-  OPENSSL_memcpy(out, &b0, 8);
-  OPENSSL_memcpy(out + 8, &b1, 8);
-#else
-  uint32_t a0 = in[0];
-  uint32_t a1 = in[1];
-  uint32_t a2 = in[2];
-  uint32_t a3 = in[3];
-  // Note clang, when building for ARM Thumb2, will sometimes miscompile
-  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
-  // without optimizations. This bug was introduced in
-  // https://reviews.llvm.org/rL340261 and fixed in
-  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
-  uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
-  uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
-  uint32_t b2 =
-      aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
-  uint32_t b3 =
-      aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
-  b0 = aes_nohw_uncompact_word(b0);
-  b1 = aes_nohw_uncompact_word(b1);
-  b2 = aes_nohw_uncompact_word(b2);
-  b3 = aes_nohw_uncompact_word(b3);
-  OPENSSL_memcpy(out, &b0, 4);
-  OPENSSL_memcpy(out + 4, &b1, 4);
-  OPENSSL_memcpy(out + 8, &b2, 4);
-  OPENSSL_memcpy(out + 12, &b3, 4);
-#endif
-}
-
 // AES round steps.
 
 void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {

diff --git a/crypto/internal.h b/crypto/internal.h
@@ -378,18 +378,6 @@ static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) {
 static inline uint32_t CRYPTO_bswap4(uint32_t x) {
   return __builtin_bswap32(x);
 }
-
-static inline uint64_t CRYPTO_bswap8(uint64_t x) {
-  return __builtin_bswap64(x);
-}
-#elif defined(_MSC_VER)
-#pragma warning(push, 3)
-#include <stdlib.h>
-#pragma warning(pop)
-#pragma intrinsic(_byteswap_ulong)
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  return _byteswap_ulong(x);
-}
 #endif
 
 #if !defined(RING_CORE_NOSTDLIBINC)

diff --git a/src/aead/aes/aes_nohw.rs b/src/aead/aes/aes_nohw.rs
@@ -19,7 +19,7 @@ use crate::{
     polyfill::{self, usize_from_u32, ArraySplitMap as _},
 };
 use cfg_if::cfg_if;
-use core::{array, mem::MaybeUninit, ops::RangeFrom};
+use core::{array, ops::RangeFrom};
 
 type Word = constant_time::Word;
 const WORD_SIZE: usize = core::mem::size_of::<Word>();
@@ -39,26 +39,161 @@ fn shift_right<const I: u32>(a: Word) -> Word {
     a >> (I * BATCH_SIZE_U32)
 }
 
-fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] {
-    prefixed_extern! {
-        fn aes_nohw_compact_block(out: *mut [Word; BLOCK_WORDS], input: &[u8; 16]);
-    }
-    let mut block = MaybeUninit::uninit();
-    unsafe {
-        aes_nohw_compact_block(block.as_mut_ptr(), input);
-        block.assume_init()
-    }
+// delta_swap returns |a| with bits |a & MASK| and
+// |a & (MASK << SHIFT)| swapped. |MASK| and |MASK << SHIFT| must not overlap.
+#[inline(always)]
+fn delta_swap<const MASK: Word, const SHIFT: u8>(a: Word) -> Word {
+    // See
+    // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
+    let b = (a ^ (a >> SHIFT)) & MASK;
+    a ^ b ^ (b << SHIFT)
 }
 
-fn uncompact_block(input: &[Word; BLOCK_WORDS], out: &mut [u8; BLOCK_LEN]) {
-    prefixed_extern! {
-        fn aes_nohw_uncompact_block(out: *mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]);
+// In the 32-bit and 64-bit implementations, a block spans multiple words.
+// |compact_block| must permute bits across different words. First we
+// implement |compact_word|, which performs a smaller version of the
+// transformation that stays within a single word.
+//
+// These transformations are generalizations of the output of
+// http://programming.sirrida.de/calcperm.php on smaller inputs.
+#[inline(always)]
+fn compact_word(a: Word) -> Word {
+    let a = Word::from_le(a);
+    cfg_if! {
+        if #[cfg(target_pointer_width = "64")] {
+            // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
+            // quartets of those chunks:
+            //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
+            //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
+            let a = delta_swap::<0x00f000f000f000f0, 4>(a);
+            // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
+            //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
+            //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
+            let a = delta_swap::<0x0000ff000000ff00, 8>(a);
+            // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
+            //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
+            //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
+            delta_swap::<0x00000000ffff0000, 16>(a)
+        } else if #[cfg(target_pointer_width = "32")] {
+            // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
+            //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
+            //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
+            // Note:  0x00cc = 0b0000_0000_1100_1100
+            //   0x00cc << 6 = 0b0011_0011_0000_0000
+            let a = delta_swap::<0x00cc00cc, 6>(a);
+            // Now we swap groups of four bits (still numbering by pairs):
+            //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
+            //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
+            // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
+            delta_swap::<0x0000f0f0, 12>(a)
+        } else {
+            unimplemented!()
+        }
     }
-    unsafe {
-        aes_nohw_uncompact_block(out, input);
+}
+
+#[inline(always)]
+fn uncompact_word(a: Word) -> Word {
+    #[cfg(target_pointer_width = "64")]
+    let r = {
+        // Reverse the steps of |compact_word|.
+        let a = delta_swap::<0x00000000ffff0000, 16>(a);
+        let a = delta_swap::<0x0000ff000000ff00, 8>(a);
+        delta_swap::<0x00f000f000f000f0, 4>(a)
+    };
+
+    #[cfg(target_pointer_width = "32")]
+    let r = {
+        let a = delta_swap::<0x0000f0f0, 12>(a);
+        delta_swap::<0x00cc00cc, 6>(a)
+    };
+
+    Word::to_le(r)
+}
+
+fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] {
+    let out: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(*input) };
+    let a0 = compact_word(out[0]);
+    let a1 = compact_word(out[1]);
+
+    #[cfg(target_pointer_width = "64")]
+    let r = [
+        (a0 & 0x00000000ffffffff) | (a1 << 32),
+        (a1 & 0xffffffff00000000) | (a0 >> 32),
+    ];
+
+    #[cfg(target_pointer_width = "32")]
+    let r = {
+        let a2 = compact_word(out[2]);
+        let a3 = compact_word(out[3]);
+        // Note clang, when building for ARM Thumb2, will sometimes miscompile
+        // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+        // without optimizations. This bug was introduced in
+        // https://reviews.llvm.org/rL340261 and fixed in
+        // https://reviews.llvm.org/rL351310. The following is written to avoid this.
+        [
+            Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]),
+            Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]),
+            Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]),
+            Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]),
+        ]
+    };
+
+    r
+}
+
+fn uncompact_block(out: &mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]) {
+    let a0 = input[0];
+    let a1 = input[1];
+
+    #[cfg(target_pointer_width = "64")]
+    let [b0, b1] = {
+        [
+            (a0 & 0x00000000ffffffff) | (a1 << 32),
+            (a1 & 0xffffffff00000000) | (a0 >> 32),
+        ]
+    };
+
+    #[cfg(target_pointer_width = "32")]
+    let [b0, b1, b2, b3] = {
+        let a2 = input[2];
+        let a3 = input[3];
+
+        // Note clang, when building for ARM Thumb2, will sometimes miscompile
+        // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
+        // without optimizations. This bug was introduced in
+        // https://reviews.llvm.org/rL340261 and fixed in
+        // https://reviews.llvm.org/rL351310. The following is written to avoid this.
+        let b0 = Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]);
+        let b1 = Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]);
+        let b2 = Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]);
+        let b3 = Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]);
+        [b0, b1, b2, b3]
+    };
+
+    let b0 = uncompact_word(b0);
+    let b1 = uncompact_word(b1);
+
+    #[cfg(target_pointer_width = "32")]
+    let (b2, b3) = (uncompact_word(b2), uncompact_word(b3));
+
+    let (out, _) = polyfill::slice::as_chunks_mut(out);
+    out[0] = Word::to_ne_bytes(b0);
+    out[1] = Word::to_ne_bytes(b1);
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        out[2] = Word::to_ne_bytes(b2);
+        out[3] = Word::to_ne_bytes(b3);
     }
 }
 
+#[cfg(target_pointer_width = "32")]
+#[inline(always)]
+fn lo(w: Word) -> u8 {
+    w as u8
+}
+
 // aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
 // |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
 // |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
@@ -172,7 +307,7 @@ impl Batch {
         copy.transpose();
         out.iter_mut().enumerate().for_each(|(i, out)| {
             let block = copy.get(i);
-            uncompact_block(&block, out);
+            uncompact_block(out, &block);
         });
     }