Skip to content

Commit

Permalink
aes: Oxidize aes_nohw [un]compact_{block,word}.
Browse files Browse the repository at this point in the history
  • Loading branch information
briansmith committed May 21, 2024
1 parent 640fa8f commit e17d947
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 183 deletions.
3 changes: 0 additions & 3 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -869,9 +869,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"aes_hw_ctr32_encrypt_blocks",
"aes_hw_encrypt",
"aes_hw_set_encrypt_key",
"aes_nohw_batch_get",
"aes_nohw_compact_block",
"aes_nohw_uncompact_block",
"aes_nohw_sub_bytes",
"aes_nohw_to_batch",
"aes_nohw_mix_columns",
Expand Down
152 changes: 0 additions & 152 deletions crypto/fipsmodule/aes/aes_nohw.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,158 +155,6 @@ typedef struct {
aes_word_t w[8];
} AES_NOHW_BATCH;

// aes_nohw_delta_swap returns |a| with the bits selected by |mask| exchanged
// with the bits selected by |mask << shift|. All other bits of |a| are returned
// unchanged. |mask| and |mask << shift| must not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
aes_word_t shift) {
// See
// https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
//
// At each |mask| position, |b| is the XOR of the two bits to be exchanged, so
// it is set exactly where the pair differs.
aes_word_t b = (a ^ (a >> shift)) & mask;
// Flipping both bits of every differing pair performs the swap.
return a ^ b ^ (b << shift);
}

// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
// aes_nohw_compact_word (64-bit) permutes the sixteen 4-bit chunks of |a| with
// three delta swaps so that the even-numbered chunks end up in the low 32 bits
// and the odd-numbered chunks in the high 32 bits, each group in ascending
// order.
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
// The permutation is applied to the byte-swapped word on big-endian targets.
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap8(a);
#endif
// Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
// the middle two chunks of each quartet:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
// Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
// Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
// 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
return a;
}

// aes_nohw_uncompact_word (64-bit) is the inverse of |aes_nohw_compact_word|.
static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
// Reverse the steps of |aes_nohw_compact_word|. Each delta swap is its own
// inverse, so the same swaps are applied in the opposite order.
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
// Undo the byte swap that |aes_nohw_compact_word| performs first on
// big-endian targets.
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap8(a);
#endif
return a;
}
#else // !OPENSSL_64_BIT
// aes_nohw_compact_word (32-bit) permutes the sixteen 2-bit pairs of |a| with
// two delta swaps so that pair number 4*i + j moves to position 4*j + i, i.e.
// the pairs are transposed when viewed as a 4x4 grid.
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
// The permutation is applied to the byte-swapped word on big-endian targets.
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap4(a);
#endif
// Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
// Note: 0x00cc = 0b0000_0000_1100_1100
// 0x00cc << 6 = 0b0011_0011_0000_0000
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
// Now we swap groups of four bits (still numbering by pairs):
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
// 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
// Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
return a;
}

// aes_nohw_uncompact_word (32-bit) is the inverse of |aes_nohw_compact_word|.
static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
// Reverse the steps of |aes_nohw_compact_word|. Each delta swap is its own
// inverse, so the same swaps are applied in the opposite order.
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
// Undo the byte swap that |aes_nohw_compact_word| performs first on
// big-endian targets.
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap4(a);
#endif
return a;
}

// aes_nohw_word_from_bytes assembles a 32-bit word from four bytes, with |a0|
// as the least significant byte and |a3| as the most significant byte.
static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
uint8_t a2, uint8_t a3) {
return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
((uint32_t)a3 << 24);
}

// lo returns the least significant byte of |a|; the upper 24 bits are
// discarded by the cast.
static inline uint8_t lo(uint32_t a) {
return (uint8_t)a;
}

#endif // OPENSSL_64_BIT

// aes_nohw_compact_block converts the 16-byte block |in| into its compact
// word representation in |out|. The bytes are copied into |out| as-is, each
// word is passed through |aes_nohw_compact_word|, and the results are then
// regrouped across words:
//  - 64-bit: |out[0]| receives the low 32 bits of both compacted words and
//    |out[1]| receives the high 32 bits of both.
//  - 32-bit: |out[i]| receives byte |i| of each of the four compacted words.
void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const uint8_t in[16]) {
OPENSSL_memcpy(out, in, 16);
#if defined(OPENSSL_64_BIT)
uint64_t a0 = aes_nohw_compact_word(out[0]);
uint64_t a1 = aes_nohw_compact_word(out[1]);
out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
uint32_t a0 = aes_nohw_compact_word(out[0]);
uint32_t a1 = aes_nohw_compact_word(out[1]);
uint32_t a2 = aes_nohw_compact_word(out[2]);
uint32_t a3 = aes_nohw_compact_word(out[3]);
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
#endif
}

// aes_nohw_uncompact_block is the inverse of |aes_nohw_compact_block|: it
// converts the compact words in |in| back into the 16-byte block |out|. The
// cross-word regrouping is undone first (it is its own inverse in both the
// 64-bit and 32-bit layouts), then each word is passed through
// |aes_nohw_uncompact_word| and copied to |out| in order.
void aes_nohw_uncompact_block(
uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_64_BIT)
uint64_t a0 = in[0];
uint64_t a1 = in[1];
// |b0| gathers the low 32 bits of both words, |b1| the high 32 bits.
uint64_t b0 =
aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
uint64_t b1 =
aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
OPENSSL_memcpy(out, &b0, 8);
OPENSSL_memcpy(out + 8, &b1, 8);
#else
uint32_t a0 = in[0];
uint32_t a1 = in[1];
uint32_t a2 = in[2];
uint32_t a3 = in[3];
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
//
// |bN| gathers byte N of each of the four input words.
uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
uint32_t b2 =
aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
uint32_t b3 =
aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
b0 = aes_nohw_uncompact_word(b0);
b1 = aes_nohw_uncompact_word(b1);
b2 = aes_nohw_uncompact_word(b2);
b3 = aes_nohw_uncompact_word(b3);
OPENSSL_memcpy(out, &b0, 4);
OPENSSL_memcpy(out + 4, &b1, 4);
OPENSSL_memcpy(out + 8, &b2, 4);
OPENSSL_memcpy(out + 12, &b3, 4);
#endif
}

// AES round steps.

void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
Expand Down
12 changes: 0 additions & 12 deletions crypto/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,18 +378,6 @@ static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) {
// CRYPTO_bswap4 returns |x| with the order of its four bytes reversed, using
// the compiler's |__builtin_bswap32| builtin.
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
return __builtin_bswap32(x);
}

// CRYPTO_bswap8 returns |x| with the order of its eight bytes reversed, using
// the compiler's |__builtin_bswap64| builtin.
static inline uint64_t CRYPTO_bswap8(uint64_t x) {
return __builtin_bswap64(x);
}
#elif defined(_MSC_VER)
#pragma warning(push, 3)
#include <stdlib.h>
#pragma warning(pop)
#pragma intrinsic(_byteswap_ulong)
// CRYPTO_bswap4 (MSVC variant) returns |x| with the order of its four bytes
// reversed, using the |_byteswap_ulong| intrinsic.
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
return _byteswap_ulong(x);
}
#endif

#if !defined(RING_CORE_NOSTDLIBINC)
Expand Down
167 changes: 151 additions & 16 deletions src/aead/aes/aes_nohw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use crate::{
polyfill::{self, usize_from_u32, ArraySplitMap as _},
};
use cfg_if::cfg_if;
use core::{array, mem::MaybeUninit, ops::RangeFrom};
use core::{array, ops::RangeFrom};

type Word = constant_time::Word;
const WORD_SIZE: usize = core::mem::size_of::<Word>();
Expand All @@ -39,26 +39,161 @@ fn shift_right<const I: u32>(a: Word) -> Word {
a >> (I * BATCH_SIZE_U32)
}

fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] {
prefixed_extern! {
fn aes_nohw_compact_block(out: *mut [Word; BLOCK_WORDS], input: &[u8; 16]);
}
let mut block = MaybeUninit::uninit();
unsafe {
aes_nohw_compact_block(block.as_mut_ptr(), input);
block.assume_init()
}
// Exchanges the bits of `a` selected by `MASK` with the bits selected by
// `MASK << SHIFT` and returns the result; every other bit is left untouched.
// `MASK` and `MASK << SHIFT` must not overlap.
//
// Background on the technique:
// https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
#[inline(always)]
fn delta_swap<const MASK: Word, const SHIFT: u8>(a: Word) -> Word {
    // At each `MASK` position, `delta` is set exactly when the two bits to be
    // exchanged differ.
    let delta = ((a >> SHIFT) ^ a) & MASK;
    // Flipping both bits of every differing pair performs the swap.
    (a ^ delta) ^ (delta << SHIFT)
}

fn uncompact_block(input: &[Word; BLOCK_WORDS], out: &mut [u8; BLOCK_LEN]) {
prefixed_extern! {
fn aes_nohw_uncompact_block(out: *mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]);
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#[inline(always)]
fn compact_word(a: Word) -> Word {
let a = Word::from_le(a);
cfg_if! {
if #[cfg(target_pointer_width = "64")] {
// Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
// quartets of those chunks:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
let a = delta_swap::<0x00f000f000f000f0, 4>(a);
// Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
let a = delta_swap::<0x0000ff000000ff00, 8>(a);
// Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
// 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
delta_swap::<0x00000000ffff0000, 16>(a)
} else if #[cfg(target_pointer_width = "32")] {
// Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
// Note: 0x00cc = 0b0000_0000_1100_1100
// 0x00cc << 6 = 0b0011_0011_0000_0000
let a = delta_swap::<0x00cc00cc, 6>(a);
// Now we swap groups of four bits (still numbering by pairs):
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
// 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
// Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
delta_swap::<0x0000f0f0, 12>(a)
} else {
unimplemented!()
}
}
unsafe {
aes_nohw_uncompact_block(out, input);
}

// Inverse of `compact_word`: undoes its delta swaps in the opposite order
// (each delta swap is its own inverse) and then converts the result from
// little-endian back to the native byte order.
#[inline(always)]
fn uncompact_word(a: Word) -> Word {
    #[cfg(target_pointer_width = "64")]
    let restored = {
        let sixteens = delta_swap::<0x0000_0000_ffff_0000, 16>(a);
        let eights = delta_swap::<0x0000_ff00_0000_ff00, 8>(sixteens);
        delta_swap::<0x00f0_00f0_00f0_00f0, 4>(eights)
    };

    #[cfg(target_pointer_width = "32")]
    let restored = {
        let fours = delta_swap::<0x0000_f0f0, 12>(a);
        delta_swap::<0x00cc_00cc, 6>(fours)
    };

    restored.to_le()
}

fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] {
let out: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(*input) };
let a0 = compact_word(out[0]);
let a1 = compact_word(out[1]);

#[cfg(target_pointer_width = "64")]
let r = [
(a0 & 0x00000000ffffffff) | (a1 << 32),
(a1 & 0xffffffff00000000) | (a0 >> 32),
];

#[cfg(target_pointer_width = "32")]
let r = {
let a2 = compact_word(out[2]);
let a3 = compact_word(out[3]);
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
[
Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]),
Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]),
Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]),
Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]),
]
};

r
}

// Inverse of `compact_block`: turns the compact words in `input` back into
// the 16-byte block `out`.
//
// The cross-word regrouping is undone first (it is its own inverse in both
// layouts), then every word goes through `uncompact_word` and is stored to
// `out` in native byte order.
fn uncompact_block(out: &mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]) {
    #[cfg(target_pointer_width = "64")]
    let regrouped = {
        let (w0, w1) = (input[0], input[1]);
        // First the low 32 bits of both words, then the high 32 bits.
        [
            (w0 & 0x0000_0000_ffff_ffff) | (w1 << 32),
            (w1 & 0xffff_ffff_0000_0000) | (w0 >> 32),
        ]
    };

    #[cfg(target_pointer_width = "32")]
    let regrouped = {
        let (w0, w1, w2, w3) = (input[0], input[1], input[2], input[3]);
        // Note clang, when building for ARM Thumb2, will sometimes miscompile
        // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
        // without optimizations. This bug was introduced in
        // https://reviews.llvm.org/rL340261 and fixed in
        // https://reviews.llvm.org/rL351310. The following is written to avoid this.
        [
            Word::from_le_bytes([lo(w0), lo(w1), lo(w2), lo(w3)]),
            Word::from_le_bytes([lo(w0 >> 8), lo(w1 >> 8), lo(w2 >> 8), lo(w3 >> 8)]),
            Word::from_le_bytes([lo(w0 >> 16), lo(w1 >> 16), lo(w2 >> 16), lo(w3 >> 16)]),
            Word::from_le_bytes([lo(w0 >> 24), lo(w1 >> 24), lo(w2 >> 24), lo(w3 >> 24)]),
        ]
    };

    // View `out` as word-sized byte chunks and fill them in order.
    let (chunks, _) = polyfill::slice::as_chunks_mut(out);
    for (i, &word) in regrouped.iter().enumerate() {
        chunks[i] = Word::to_ne_bytes(uncompact_word(word));
    }
}

// Returns the least significant byte of `w`; all higher bits are dropped.
#[cfg(target_pointer_width = "32")]
#[inline(always)]
fn lo(w: Word) -> u8 {
    let [low, ..] = w.to_le_bytes();
    low
}

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
Expand Down Expand Up @@ -172,7 +307,7 @@ impl Batch {
copy.transpose();
out.iter_mut().enumerate().for_each(|(i, out)| {
let block = copy.get(i);
uncompact_block(&block, out);
uncompact_block(out, &block);
});
}

Expand Down

0 comments on commit e17d947

Please sign in to comment.