diff --git a/Cargo.toml b/Cargo.toml index 0898ddd1af..7160e0362c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,6 @@ include = [ "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", - "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", @@ -106,7 +105,6 @@ include = [ "crypto/cipher_extra/asm/chacha20_poly1305_armv8.pl", "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl", "examples/**/*.rs", - "include/ring-core/aes.h", "include/ring-core/arm_arch.h", "include/ring-core/asm_base.h", "include/ring-core/base.h", diff --git a/build.rs b/build.rs index f1186f216c..2c473a3061 100644 --- a/build.rs +++ b/build.rs @@ -53,7 +53,6 @@ const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), - (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), @@ -869,10 +868,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "aes_hw_ctr32_encrypt_blocks", "aes_hw_encrypt", "aes_hw_set_encrypt_key", - "aes_nohw_sub_bytes", - "aes_nohw_to_batch", - "aes_nohw_mix_columns", - "aes_nohw_shift_rows", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index d9606319de..0000000000 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,391 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit or 64-bit, operating on two-, four-, and eight-block -// batches, respectively. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". 
Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) -// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. - -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} - -OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. -// -// This implementation uses three representations for AES blocks. First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. 
For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... -// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// AES round steps. - -void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. - aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. 
- aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. 
- aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. -#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. -static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
- aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} diff --git a/include/ring-core/aes.h b/include/ring-core/aes.h deleted file mode 100644 index 5b5130dad7..0000000000 --- a/include/ring-core/aes.h +++ /dev/null @@ -1,68 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_AES_H -#define OPENSSL_HEADER_AES_H - -#include - -// Raw AES functions. - - -// AES_MAXNR is the maximum number of AES rounds. -#define AES_MAXNR 14 - -// aes_key_st should be an opaque type, but EVP requires that the size be -// known. -struct aes_key_st { - uint32_t rd_key[4 * (AES_MAXNR + 1)]; - unsigned rounds; -}; -typedef struct aes_key_st AES_KEY; - -#endif // OPENSSL_HEADER_AES_H diff --git a/src/aead/aes.rs b/src/aead/aes.rs index 9227191624..e3b09fd250 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -353,7 +353,6 @@ impl Key { } } -// Keep this in sync with AES_KEY in aes.h. #[repr(C)] #[derive(Clone)] pub(super) struct AES_KEY { @@ -361,7 +360,6 @@ pub(super) struct AES_KEY { pub rounds: c::uint, } -// Keep this in sync with `AES_MAXNR` in aes.h. const MAX_ROUNDS: usize = 14; pub const AES_128_KEY_LEN: usize = 128 / 8; diff --git a/src/aead/aes/aes_nohw.rs b/src/aead/aes/aes_nohw.rs index 05b17808f8..42be4f3a90 100644 --- a/src/aead/aes/aes_nohw.rs +++ b/src/aead/aes/aes_nohw.rs @@ -29,6 +29,40 @@ const BATCH_SIZE_U32: u32 = BATCH_SIZE as u32; const BLOCK_WORDS: usize = 16 / WORD_SIZE; +cfg_if! { + if #[cfg(target_pointer_width = "64")] { + const ROW0_MASK: Word = 0x000f000f000f000f; + const ROW1_MASK: Word = 0x00f000f000f000f0; + const ROW2_MASK: Word = 0x0f000f000f000f00; + const ROW3_MASK: Word = 0xf000f000f000f000; + } else if #[cfg(target_pointer_width = "32")] { + const ROW0_MASK: Word = 0x03030303; + const ROW1_MASK: Word = 0x0c0c0c0c; + const ROW2_MASK: Word = 0x30303030; + const ROW3_MASK: Word = 0xc0c0c0c0; + } +} + +#[inline(always)] +fn and(a: Word, b: Word) -> Word { + a & b +} + +#[inline(always)] +fn or(a: Word, b: Word) -> Word { + a | b +} + +#[inline(always)] +fn xor(a: Word, b: Word) -> Word { + a ^ b +} + +#[inline(always)] +fn not(a: Word) -> Word { + !a +} + #[inline(always)] fn shift_left(a: Word) -> Word { a << (I * BATCH_SIZE_U32) @@ -271,30 +305,227 @@ impl Batch { self.w[i + (j * STRIDE)] }) } +} +// AES round steps. +impl Batch { fn sub_bytes(&mut self) { - prefixed_extern! { - fn aes_nohw_sub_bytes(batch: &mut Batch); - } - unsafe { aes_nohw_sub_bytes(self) }; + // See https://eprint.iacr.org/2009/191.pdf, Appendix C. + let x0 = self.w[7]; + let x1 = self.w[6]; + let x2 = self.w[5]; + let x3 = self.w[4]; + let x4 = self.w[3]; + let x5 = self.w[2]; + let x6 = self.w[1]; + let x7 = self.w[0]; + + // Figure 2, the top linear transformation. 
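+        // x0..x7 are the eight bit planes of the batch loaded in reverse
+        // order (x0 = self.w[7], the most significant plane). The y* and t*
+        // values below are shared XOR combinations of the inputs that feed
+        // the AND gates of the non-linear middle section.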
+ let y14 = xor(x3, x5); + let y13 = xor(x0, x6); + let y9 = xor(x0, x3); + let y8 = xor(x0, x5); + let t0 = xor(x1, x2); + let y1 = xor(t0, x7); + let y4 = xor(y1, x3); + let y12 = xor(y13, y14); + let y2 = xor(y1, x0); + let y5 = xor(y1, x6); + let y3 = xor(y5, y8); + let t1 = xor(x4, y12); + let y15 = xor(t1, x5); + let y20 = xor(t1, x1); + let y6 = xor(y15, x7); + let y10 = xor(y15, t0); + let y11 = xor(y20, y9); + let y7 = xor(x7, y11); + let y17 = xor(y10, y11); + let y19 = xor(y10, y8); + let y16 = xor(t0, y11); + let y21 = xor(y13, y16); + let y18 = xor(x0, y16); + + // Figure 3, the middle non-linear section. + let t2 = and(y12, y15); + let t3 = and(y3, y6); + let t4 = xor(t3, t2); + let t5 = and(y4, x7); + let t6 = xor(t5, t2); + let t7 = and(y13, y16); + let t8 = and(y5, y1); + let t9 = xor(t8, t7); + let t10 = and(y2, y7); + let t11 = xor(t10, t7); + let t12 = and(y9, y11); + let t13 = and(y14, y17); + let t14 = xor(t13, t12); + let t15 = and(y8, y10); + let t16 = xor(t15, t12); + let t17 = xor(t4, t14); + let t18 = xor(t6, t16); + let t19 = xor(t9, t14); + let t20 = xor(t11, t16); + let t21 = xor(t17, y20); + let t22 = xor(t18, y19); + let t23 = xor(t19, y21); + let t24 = xor(t20, y18); + let t25 = xor(t21, t22); + let t26 = and(t21, t23); + let t27 = xor(t24, t26); + let t28 = and(t25, t27); + let t29 = xor(t28, t22); + let t30 = xor(t23, t24); + let t31 = xor(t22, t26); + let t32 = and(t31, t30); + let t33 = xor(t32, t24); + let t34 = xor(t23, t33); + let t35 = xor(t27, t33); + let t36 = and(t24, t35); + let t37 = xor(t36, t34); + let t38 = xor(t27, t36); + let t39 = and(t29, t38); + let t40 = xor(t25, t39); + let t41 = xor(t40, t37); + let t42 = xor(t29, t33); + let t43 = xor(t29, t40); + let t44 = xor(t33, t37); + let t45 = xor(t42, t41); + let z0 = and(t44, y15); + let z1 = and(t37, y6); + let z2 = and(t33, x7); + let z3 = and(t43, y16); + let z4 = and(t40, y1); + let z5 = and(t29, y7); + let z6 = and(t42, y11); + let z7 = and(t45, y17); + let z8 = and(t41, y10); + let z9 = and(t44, y12); + let z10 = and(t37, y3); + let z11 = and(t33, y4); + let z12 = and(t43, y13); + let z13 = and(t40, y5); + let z14 = and(t29, y2); + let z15 = and(t42, y9); + let z16 = and(t45, y14); + let z17 = and(t41, y8); + + // Figure 4, bottom linear transformation. 
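+        // This final linear layer combines the shared products z0..z17 into
+        // the eight output bit planes. The not() calls fold the S-box's
+        // affine constant 0x63 into the result: the inverted outputs land in
+        // bit planes 0, 1, 5 and 6, exactly the bits set in 0x63.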
+ let t46 = xor(z15, z16); + let t47 = xor(z10, z11); + let t48 = xor(z5, z13); + let t49 = xor(z9, z10); + let t50 = xor(z2, z12); + let t51 = xor(z2, z5); + let t52 = xor(z7, z8); + let t53 = xor(z0, z3); + let t54 = xor(z6, z7); + let t55 = xor(z16, z17); + let t56 = xor(z12, t48); + let t57 = xor(t50, t53); + let t58 = xor(z4, t46); + let t59 = xor(z3, t54); + let t60 = xor(t46, t57); + let t61 = xor(z14, t57); + let t62 = xor(t52, t58); + let t63 = xor(t49, t58); + let t64 = xor(z4, t59); + let t65 = xor(t61, t62); + let t66 = xor(z1, t63); + let s0 = xor(t59, t63); + let s6 = xor(t56, not(t62)); + let s7 = xor(t48, not(t60)); + let t67 = xor(t64, t65); + let s3 = xor(t53, t66); + let s4 = xor(t51, t66); + let s5 = xor(t47, t65); + let s1 = xor(t64, not(s3)); + let s2 = xor(t55, not(t67)); + + self.w[0] = s7; + self.w[1] = s6; + self.w[2] = s5; + self.w[3] = s4; + self.w[4] = s3; + self.w[5] = s2; + self.w[6] = s1; + self.w[7] = s0; } fn add_round_key(&mut self, key: &Batch) { constant_time::xor_assign_at_start(&mut self.w, &key.w) } + #[inline(always)] + fn rotate_cols_right( + v: Word, + ) -> Word { + or( + shift_right::(v), + shift_left::(v), + ) + } +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. +// TODO(MSRV feature(generic_const_exprs)): Replace this. +macro_rules! rotate_cols_right { + ( Self::rotate_cols_right::<$N:literal>($v:expr) ) => { + Self::rotate_cols_right::<{ $N * 4 }, { 16 - ($N * 4) }>($v) + }; +} + +impl Batch { fn shift_rows(&mut self) { - prefixed_extern! { - fn aes_nohw_shift_rows(batch: &mut Batch); - } - unsafe { aes_nohw_shift_rows(self) }; + self.w.iter_mut().for_each(|w| { + let row0 = and(*w, ROW0_MASK); + let row1 = and(*w, ROW1_MASK); + let row2 = and(*w, ROW2_MASK); + let row3 = and(*w, ROW3_MASK); + let row1 = rotate_cols_right!(Self::rotate_cols_right::<1>(row1)); + let row2 = rotate_cols_right!(Self::rotate_cols_right::<2>(row2)); + let row3 = rotate_cols_right!(Self::rotate_cols_right::<3>(row3)); + *w = or(or(row0, row1), or(row2, row3)); + }); } fn mix_columns(&mut self) { - prefixed_extern! { - fn aes_nohw_mix_columns(batch: &mut Batch); - } - unsafe { aes_nohw_mix_columns(self) }; + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
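+        // MixColumns on each column c is 2*c ^ 3*rot(c) ^ rot2(c) ^ rot3(c)
+        // with multiplication in GF(2^8), where rot is rotate_rows_down and
+        // rot2 is rotate_rows_twice. This factors as
+        //     2*(c ^ rot(c)) ^ rot(c) ^ rot2(c ^ rot(c)),
+        // so below r_i = rotate_rows_down(a_i) and a_i_r_i = a_i ^ r_i. In
+        // bitsliced form, multiplying by two moves each bit plane up by one
+        // and feeds plane 7 back into planes 0, 1, 3 and 4 (since
+        // x^8 = x^4 + x^3 + x + 1), which is why a7_r7 also appears in
+        // w[0], w[1], w[3] and w[4].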
+ let a0 = self.w[0]; + let a1 = self.w[1]; + let a2 = self.w[2]; + let a3 = self.w[3]; + let a4 = self.w[4]; + let a5 = self.w[5]; + let a6 = self.w[6]; + let a7 = self.w[7]; + + let r0 = rotate_rows_down(a0); + let a0_r0 = xor(a0, r0); + let r1 = rotate_rows_down(a1); + let a1_r1 = xor(a1, r1); + let r2 = rotate_rows_down(a2); + let a2_r2 = xor(a2, r2); + let r3 = rotate_rows_down(a3); + let a3_r3 = xor(a3, r3); + let r4 = rotate_rows_down(a4); + let a4_r4 = xor(a4, r4); + let r5 = rotate_rows_down(a5); + let a5_r5 = xor(a5, r5); + let r6 = rotate_rows_down(a6); + let a6_r6 = xor(a6, r6); + let r7 = rotate_rows_down(a7); + let a7_r7 = xor(a7, r7); + + self.w[0] = xor(xor(a7_r7, r0), rotate_rows_twice(a0_r0)); + self.w[1] = xor(xor(a0_r0, a7_r7), xor(r1, rotate_rows_twice(a1_r1))); + self.w[2] = xor(xor(a1_r1, r2), rotate_rows_twice(a2_r2)); + self.w[3] = xor(xor(a2_r2, a7_r7), xor(r3, rotate_rows_twice(a3_r3))); + self.w[4] = xor(xor(a3_r3, a7_r7), xor(r4, rotate_rows_twice(a4_r4))); + self.w[5] = xor(xor(a4_r4, r5), rotate_rows_twice(a5_r5)); + self.w[6] = xor(xor(a5_r5, r6), rotate_rows_twice(a6_r6)); + self.w[7] = xor(xor(a6_r6, r7), rotate_rows_twice(a7_r7)); } // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|. @@ -361,6 +592,21 @@ fn rotate_rows_down(v: Word) -> Word { } } +// rotate_rows_twice returns |v| with the rows in each column rotated +// by two. +#[inline(always)] +fn rotate_rows_twice(v: Word) -> Word { + #[cfg(target_pointer_width = "64")] + { + ((v >> 8) & 0x00ff00ff00ff00ff) | ((v << 8) & 0xff00ff00ff00ff00) + } + + #[cfg(target_pointer_width = "32")] + { + ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0) + } +} + // Key schedule. // An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
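For reference, the following is a minimal, non-constant-time sketch (not part of this change; the *_ref names are purely illustrative) of the two round steps above, operating on a single block stored as 16 bytes in AES column-major order, i.e. state[4 * col + row]. It may help when checking what the bitsliced formulas compute.

// Multiply by x in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1.
fn xtime(b: u8) -> u8 {
    (b << 1) ^ (if b & 0x80 != 0 { 0x1b } else { 0 })
}

// ShiftRows: row `row` is rotated left (cyclically) by `row` positions.
fn shift_rows_ref(state: &[u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for row in 0..4 {
        for col in 0..4 {
            out[4 * col + row] = state[4 * ((col + row) % 4) + row];
        }
    }
    out
}

// MixColumns: per column, b[row] = 2*a[row] ^ 3*a[row+1] ^ a[row+2] ^ a[row+3]
// (indices mod 4), with multiplication in GF(2^8).
fn mix_columns_ref(state: &[u8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for col in 0..4 {
        let a = &state[4 * col..4 * col + 4];
        for row in 0..4 {
            let (a0, a1, a2, a3) =
                (a[row], a[(row + 1) % 4], a[(row + 2) % 4], a[(row + 3) % 4]);
            out[4 * col + row] = xtime(a0) ^ xtime(a1) ^ a1 ^ a2 ^ a3;
        }
    }
    out
}

// Example: mixing the column [0xdb, 0x13, 0x53, 0x45] yields
// [0x8e, 0x4d, 0xa1, 0xbc].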