diff --git a/Cargo.toml b/Cargo.toml
index c8ac2a3978..0585b60927 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,8 +69,8 @@ include = [
     "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
     "crypto/fipsmodule/ec/ecp_nistz.c",
     "crypto/fipsmodule/ec/ecp_nistz.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/ecp_nistz.inl",
+    "crypto/fipsmodule/ec/gfp.h",
    "crypto/fipsmodule/ec/gfp_p256.c",
    "crypto/fipsmodule/ec/gfp_p384.c",
    "crypto/fipsmodule/ec/p256.c",
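A note on the renaming machinery the rest of this patch relies on: ecp_nistz.inl (below) is compiled once per curve, and RENAME_FUNC pastes the curve's bit size into each function name. Because BITS passes through a function-like macro before any ## pasting, it is fully expanded first, so point_add(nistz, BITS) becomes nistz384_point_add when BITS is 384. A minimal, self-contained sketch of the expansion (the main/puts scaffolding is illustrative only, not part of the patch):

    #include <stdio.h>

    /* As in the patch: `bits` is expanded before pasting because it first
     * passes through a macro whose replacement list does not apply ## to it. */
    #define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func
    #define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add)

    #define BITS 384

    /* Expands to: static void nistz384_point_add(void) { ... } */
    static void point_add(nistz, BITS)(void) {
      puts("nistz384_point_add");
    }

    int main(void) {
      point_add(nistz, BITS)();  /* calls nistz384_point_add() */
      return 0;
    }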
diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz.inl
similarity index 50%
rename from crypto/fipsmodule/ec/ecp_nistz384.inl
rename to crypto/fipsmodule/ec/ecp_nistz.inl
index ae28f97ae5..a9b2211a1f 100644
--- a/crypto/fipsmodule/ec/ecp_nistz384.inl
+++ b/crypto/fipsmodule/ec/ecp_nistz.inl
@@ -20,20 +20,24 @@
  * Shay Gueron and Vlad Krasnov
  * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
  * http://eprint.iacr.org/2013/816 */
-
 #include "ecp_nistz.h"
+#include "gfp.h"
 
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #endif
 
+#define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add)
+#define point_double(prefix, bits) RENAME_FUNC(prefix, bits, point_double)
+#define point_mul(prefix, bits) RENAME_FUNC(prefix, bits, point_mul)
+
 /* Point double: r = 2*a */
-static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
-  BN_ULONG S[P384_LIMBS];
-  BN_ULONG M[P384_LIMBS];
-  BN_ULONG Zsqr[P384_LIMBS];
-  BN_ULONG tmp0[P384_LIMBS];
+static void point_double(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a) {
+  BN_ULONG S[FE_LIMBS];
+  BN_ULONG M[FE_LIMBS];
+  BN_ULONG Zsqr[FE_LIMBS];
+  BN_ULONG tmp0[FE_LIMBS];
 
   const BN_ULONG *in_x = a->X;
   const BN_ULONG *in_y = a->Y;
@@ -74,20 +78,20 @@ static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
 }
 
 /* Point addition: r = a+b */
-static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
-                               const P384_POINT *b) {
-  BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
-  BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
-  BN_ULONG Z1sqr[P384_LIMBS];
-  BN_ULONG Z2sqr[P384_LIMBS];
-  BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
-  BN_ULONG Hsqr[P384_LIMBS];
-  BN_ULONG Rsqr[P384_LIMBS];
-  BN_ULONG Hcub[P384_LIMBS];
-
-  BN_ULONG res_x[P384_LIMBS];
-  BN_ULONG res_y[P384_LIMBS];
-  BN_ULONG res_z[P384_LIMBS];
+static void point_add(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a,
+                                   const NIST_POINT *b) {
+  BN_ULONG U2[FE_LIMBS], S2[FE_LIMBS];
+  BN_ULONG U1[FE_LIMBS], S1[FE_LIMBS];
+  BN_ULONG Z1sqr[FE_LIMBS];
+  BN_ULONG Z2sqr[FE_LIMBS];
+  BN_ULONG H[FE_LIMBS], R[FE_LIMBS];
+  BN_ULONG Hsqr[FE_LIMBS];
+  BN_ULONG Rsqr[FE_LIMBS];
+  BN_ULONG Hcub[FE_LIMBS];
+
+  BN_ULONG res_x[FE_LIMBS];
+  BN_ULONG res_y[FE_LIMBS];
+  BN_ULONG res_z[FE_LIMBS];
 
   const BN_ULONG *in1_x = a->X;
   const BN_ULONG *in1_y = a->Y;
@@ -117,11 +121,11 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
   if (is_exceptional) {
     if (is_equal(S1, S2)) {
-      nistz384_point_double(r, a);
+      point_double(nistz, BITS)(r, a);
     } else {
-      limbs_zero(r->X, P384_LIMBS);
-      limbs_zero(r->Y, P384_LIMBS);
-      limbs_zero(r->Z, P384_LIMBS);
+      limbs_zero(r->X, FE_LIMBS);
+      limbs_zero(r->Y, FE_LIMBS);
+      limbs_zero(r->Z, FE_LIMBS);
     }
     return;
   }
@@ -152,147 +156,136 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   copy_conditional(res_y, in1_y, in2infty);
   copy_conditional(res_z, in1_z, in2infty);
 
-  limbs_copy(r->X, res_x, P384_LIMBS);
-  limbs_copy(r->Y, res_y, P384_LIMBS);
-  limbs_copy(r->Z, res_z, P384_LIMBS);
+  limbs_copy(r->X, res_x, FE_LIMBS);
+  limbs_copy(r->Y, res_y, FE_LIMBS);
+  limbs_copy(r->Z, res_z, FE_LIMBS);
 }
 
-static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue,
-                               const P384_POINT table[16]) {
+static void add_precomputed_w(NIST_POINT *r, crypto_word_t wvalue,
+                              const NIST_POINT table[TBL_SZ]) {
   crypto_word_t recoded_is_negative;
   crypto_word_t recoded;
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
 
-  alignas(64) P384_POINT h;
-  p384_point_select_w5(&h, table, recoded);
+  alignas(64) NIST_POINT h;
+  NIST_POINT_select_w(&h, table, recoded);
 
-  alignas(64) BN_ULONG tmp[P384_LIMBS];
-  p384_elem_neg(tmp, h.Y);
+  alignas(64) BN_ULONG tmp[FE_LIMBS];
+  elem_neg(tmp, h.Y);
   copy_conditional(h.Y, tmp, recoded_is_negative);
 
-  nistz384_point_add(r, r, &h);
+  point_add(nistz, BITS)(r, r, &h);
 }
 
 /* r = p * p_scalar */
-static void nistz384_point_mul(P384_POINT *r,
-                               const BN_ULONG p_scalar[P384_LIMBS],
-                               const Limb p_x[P384_LIMBS],
-                               const Limb p_y[P384_LIMBS]) {
-  static const size_t kWindowSize = 5;
-  static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
-
-  uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
+static void point_mul(nistz, BITS)(NIST_POINT *r, const BN_ULONG p_scalar[FE_LIMBS],
+                                   const BN_ULONG p_x[FE_LIMBS],
+                                   const BN_ULONG p_y[FE_LIMBS]) {
+  uint8_t p_str[(FE_LIMBS * sizeof(Limb)) + 1];
   little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
-                                  p_scalar, P384_LIMBS);
+                                  p_scalar, FE_LIMBS);
 
-  /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
+  /* A |NIST_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
    * add no more than 63 bytes of overhead. Thus, |table| should require
    * ~2367 ((144 * 16) + 63) bytes of stack space. */
-  alignas(64) P384_POINT table[16];
+  alignas(64) NIST_POINT table[TBL_SZ];
 
   /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
    * not stored. All other values are actually stored with an offset of -1 in
    * table. */
-  P384_POINT *row = table;
-
-  limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
-  limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
-  limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
-
-  nistz384_point_double(&row[2 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[4 - 1], &row[2 - 1]);
-  nistz384_point_double(&row[6 - 1], &row[3 - 1]);
-  nistz384_point_double(&row[8 - 1], &row[4 - 1]);
-  nistz384_point_double(&row[12 - 1], &row[6 - 1]);
-  nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[14 - 1], &row[7 - 1]);
-  nistz384_point_double(&row[10 - 1], &row[5 - 1]);
-  nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[16 - 1], &row[8 - 1]);
-
-  static const size_t START_INDEX = 384 - 4;
+  NIST_POINT *row = table;
+
+  limbs_copy(row[0].X, p_x, FE_LIMBS);
+  limbs_copy(row[0].Y, p_y, FE_LIMBS);
+  limbs_copy(row[0].Z, ONE, FE_LIMBS);
+
+  point_double(nistz, BITS)(&row[1], &row[0]);
+
+  for (int i = 2; i < TBL_SZ; i += 2) {
+    point_add(nistz, BITS)(&row[i], &row[i - 1], &row[0]);
+    point_double(nistz, BITS)(&row[i + 1], &row[i / 2]);
+  }
+
+  static const size_t ROUND_SIZE = (BITS + W_BITS - 1) / W_BITS * W_BITS;
+  size_t START_INDEX = ROUND_SIZE == BITS + 1 ? ROUND_SIZE - W_BITS : ROUND_SIZE;
   size_t index = START_INDEX;
 
   BN_ULONG recoded_is_negative;
   crypto_word_t recoded;
 
   crypto_word_t wvalue = p_str[(index - 1) / 8];
-  wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
+  wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
 
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
   dev_assert_secret(!recoded_is_negative);
 
-  p384_point_select_w5(r, table, recoded);
+  NIST_POINT_select_w(r, table, recoded);
 
-  while (index >= kWindowSize) {
+  while (index >= W_BITS) {
     if (index != START_INDEX) {
       size_t off = (index - 1) / 8;
 
       wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      add_precomputed_w5(r, wvalue, table);
+      wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
+      add_precomputed_w(r, wvalue, table);
     }
 
-    index -= kWindowSize;
+    index -= W_BITS;
 
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
+    for (int i = 0; i < W_BITS; i++) {
+      point_double(nistz, BITS)(r, r);
+    }
   }
 
   /* Final window */
   wvalue = p_str[0];
-  wvalue = (wvalue << 1) & kMask;
-  add_precomputed_w5(r, wvalue, table);
+  wvalue = (wvalue << 1) & W_MASK;
+  add_precomputed_w(r, wvalue, table);
 }
 
-void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS])
+void point_double(p, BITS)(Limb r[3][FE_LIMBS], const Limb a[3][FE_LIMBS])
 {
-  P384_POINT t;
-  limbs_copy(t.X, a[0], P384_LIMBS);
-  limbs_copy(t.Y, a[1], P384_LIMBS);
-  limbs_copy(t.Z, a[2], P384_LIMBS);
-  nistz384_point_double(&t, &t);
-  limbs_copy(r[0], t.X, P384_LIMBS);
-  limbs_copy(r[1], t.Y, P384_LIMBS);
-  limbs_copy(r[2], t.Z, P384_LIMBS);
+  NIST_POINT t;
+  limbs_copy(t.X, a[0], FE_LIMBS);
+  limbs_copy(t.Y, a[1], FE_LIMBS);
+  limbs_copy(t.Z, a[2], FE_LIMBS);
+  point_double(nistz, BITS)(&t, &t);
+  limbs_copy(r[0], t.X, FE_LIMBS);
+  limbs_copy(r[1], t.Y, FE_LIMBS);
+  limbs_copy(r[2], t.Z, FE_LIMBS);
 }
 
-void p384_point_add(Limb r[3][P384_LIMBS],
-                    const Limb a[3][P384_LIMBS],
-                    const Limb b[3][P384_LIMBS])
+void point_add(p, BITS)(Limb r[3][FE_LIMBS],
+                        const Limb a[3][FE_LIMBS],
+                        const Limb b[3][FE_LIMBS])
 {
-  P384_POINT t1;
-  limbs_copy(t1.X, a[0], P384_LIMBS);
-  limbs_copy(t1.Y, a[1], P384_LIMBS);
-  limbs_copy(t1.Z, a[2], P384_LIMBS);
+  NIST_POINT t1;
+  limbs_copy(t1.X, a[0], FE_LIMBS);
+  limbs_copy(t1.Y, a[1], FE_LIMBS);
+  limbs_copy(t1.Z, a[2], FE_LIMBS);
 
-  P384_POINT t2;
-  limbs_copy(t2.X, b[0], P384_LIMBS);
-  limbs_copy(t2.Y, b[1], P384_LIMBS);
-  limbs_copy(t2.Z, b[2], P384_LIMBS);
+  NIST_POINT t2;
+  limbs_copy(t2.X, b[0], FE_LIMBS);
+  limbs_copy(t2.Y, b[1], FE_LIMBS);
+  limbs_copy(t2.Z, b[2], FE_LIMBS);
 
-  nistz384_point_add(&t1, &t1, &t2);
+  point_add(nistz, BITS)(&t1, &t1, &t2);
 
-  limbs_copy(r[0], t1.X, P384_LIMBS);
-  limbs_copy(r[1], t1.Y, P384_LIMBS);
-  limbs_copy(r[2], t1.Z, P384_LIMBS);
+  limbs_copy(r[0], t1.X, FE_LIMBS);
+  limbs_copy(r[1], t1.Y, FE_LIMBS);
+  limbs_copy(r[2], t1.Z, FE_LIMBS);
 }
 
-void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS],
-                    const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) {
-  alignas(64) P384_POINT acc;
-  nistz384_point_mul(&acc, p_scalar, p_x, p_y);
-  limbs_copy(r[0], acc.X, P384_LIMBS);
-  limbs_copy(r[1], acc.Y, P384_LIMBS);
-  limbs_copy(r[2], acc.Z, P384_LIMBS);
+void point_mul(p, BITS)(Limb r[3][FE_LIMBS],
+                        const BN_ULONG p_scalar[FE_LIMBS],
+                        const Limb p_x[FE_LIMBS],
+                        const Limb p_y[FE_LIMBS])
+{
+  alignas(64) NIST_POINT acc;
+  point_mul(nistz, BITS)(&acc, p_scalar, p_x, p_y);
+  limbs_copy(r[0], acc.X, FE_LIMBS);
+  limbs_copy(r[1], acc.Y, FE_LIMBS);
+  limbs_copy(r[2], acc.Z, FE_LIMBS);
 }
 
 #if defined(__GNUC__) || defined(__clang__)
diff --git a/crypto/fipsmodule/ec/ecp_nistz384.h b/crypto/fipsmodule/ec/ecp_nistz384.h
deleted file mode 100644
index ca87e60721..0000000000
--- a/crypto/fipsmodule/ec/ecp_nistz384.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2014, Intel Corporation.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H
-#define OPENSSL_HEADER_EC_ECP_NISTZ384_H
-
-#include "../../limbs/limbs.h"
-
-#define P384_LIMBS (384u / LIMB_BITS)
-
-typedef struct {
-  Limb X[P384_LIMBS];
-  Limb Y[P384_LIMBS];
-  Limb Z[P384_LIMBS];
-} P384_POINT;
-
-typedef struct {
-  Limb X[P384_LIMBS];
-  Limb Y[P384_LIMBS];
-} P384_POINT_AFFINE;
-
-
-#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H
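The new gfp.h (below) derives the precomputation-table size and the Booth window mask from W_BITS instead of hard-coding the old 16-entry table and 6-bit mask. For W_BITS = 5 the derived values match the old constants exactly; a small compile-time check (the _Static_assert lines are illustrative, not part of the patch):

    /* w-bit windowed Booth recoding: 2^(w-1) stored points (the point at
     * infinity is implicit), and each digit reads w+1 overlapping bits. */
    #define W_BITS 5
    #define TBL_SZ (1 << (W_BITS - 1))
    #define W_MASK ((1 << (W_BITS + 1)) - 1)

    _Static_assert(TBL_SZ == 16, "the old P-384 code stored 16 points");
    _Static_assert(W_MASK == 0x3f, "the old kMask read 5 + 1 = 6 bits");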
diff --git a/crypto/fipsmodule/ec/gfp.h b/crypto/fipsmodule/ec/gfp.h
new file mode 100644
index 0000000000..172ccc787d
--- /dev/null
+++ b/crypto/fipsmodule/ec/gfp.h
@@ -0,0 +1,193 @@
+/* Copyright 2016-2024 Brian Smith.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func
+
+typedef struct {
+  Limb X[FE_LIMBS];
+  Limb Y[FE_LIMBS];
+  Limb Z[FE_LIMBS];
+} NIST_POINT;
+
+typedef struct {
+  Limb X[FE_LIMBS];
+  Limb Y[FE_LIMBS];
+} NIST_POINT_AFFINE;
+
+#define TBL_SZ (1 << (W_BITS - 1))
+#define W_MASK ((1 << (W_BITS + 1)) - 1)
+
+static inline Limb is_equal(const Elem a, const Elem b) {
+  return LIMBS_equal(a, b, FE_LIMBS);
+}
+
+static inline Limb is_zero(const BN_ULONG a[FE_LIMBS]) {
+  return LIMBS_are_zero(a, FE_LIMBS);
+}
+
+static inline void copy_conditional(Elem r, const Elem a,
+                                    const Limb condition) {
+  for (size_t i = 0; i < FE_LIMBS; ++i) {
+    r[i] = constant_time_select_w(condition, a[i], r[i]);
+  }
+}
+
+static inline void elem_add(Elem r, const Elem a, const Elem b) {
+  LIMBS_add_mod(r, a, b, Q, FE_LIMBS);
+}
+
+static inline void elem_sub(Elem r, const Elem a, const Elem b) {
+  LIMBS_sub_mod(r, a, b, Q, FE_LIMBS);
+}
+
+static void elem_div_by_2(Elem r, const Elem a) {
+  /* Consider the case where `a` is even. Then we can shift `a` right one bit
+   * and the result will still be valid because we didn't lose any bits and so
+   * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
+   *
+   * The remainder of this comment is considering the case where `a` is odd.
+   *
+   * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
+   * because the lowest bit is lost during the shift. For example, consider:
+   *
+   * ```python
+   * q = 2**384 - 2**128 - 2**96 + 2**32 - 1
+   * a = 2**383
+   * two_a = a * 2 % q
+   * assert two_a == 0x100000000ffffffffffffffff00000001
+   * ```
+   *
+   * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
+   * we divide `two_a` by two (mod q), we need to get the value `2**383`, which
+   * we obviously can't get with just a right shift.
+   *
+   * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
+   * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
+   * keep track of an extra most significant bit. We can avoid that by instead
+   * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
+   * significant bit of `a`. `q + 1` is even, which means it can be shifted
+   * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
+   * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
+   * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
+   * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
+   * bit of `a`, which is 1. Thus:
+   *
+   * sum  =  ((q + 1) >> 1) + (a >> 1)
+   * sum  =  (q + 1)/2 + (a >> 1)       (substituting (q + 1)/2)
+   *     <=  (q + 1)/2 + (q - 2 - 1)/2  (substituting a <= q - 2)
+   *     <=  (q + 1)/2 + (q - 3)/2      (simplifying)
+   *     <=  (q + 1 + q - 3)/2          (factoring out the common divisor)
+   *     <=  (2q - 2)/2                 (simplifying)
+   *     <=  q - 1                      (simplifying)
+   *
+   * Thus, no reduction of the sum mod `q` is necessary. */
+
+  Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
+
+  /* r = a >> 1. */
+  Limb carry = a[FE_LIMBS - 1] & 1;
+  r[FE_LIMBS - 1] = a[FE_LIMBS - 1] >> 1;
+  for (size_t i = 1; i < FE_LIMBS; ++i) {
+    Limb new_carry = a[FE_LIMBS - i - 1];
+    r[FE_LIMBS - i - 1] =
+        (a[FE_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
+    carry = new_carry;
+  }
+
+  Elem adjusted;
+  BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, FE_LIMBS);
+  dev_assert_secret(carry2 == 0);
+  (void)carry2;
+  copy_conditional(r, adjusted, is_odd);
+}
+
+static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
+  /* XXX: Not (clearly) constant-time; inefficient.*/
+  bn_mul_mont(r, a, b, Q, Q_N0, FE_LIMBS);
+}
+
+static inline void elem_mul_by_2(Elem r, const Elem a) {
+  LIMBS_shl_mod(r, a, Q, FE_LIMBS);
+}
+
+static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
+  /* XXX: inefficient. TODO: Replace with an integrated shift + add. */
+  Elem doubled;
+  elem_add(doubled, a, a);
+  elem_add(r, doubled, a);
+}
+
+static inline void elem_sqr_mont(Elem r, const Elem a) {
+  /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
+  elem_mul_mont(r, a, a);
+}
+
+static void elem_neg(Elem r, const Elem a) {
+  Limb is_zero = LIMBS_are_zero(a, FE_LIMBS);
+  Carry borrow = limbs_sub(r, Q, a, FE_LIMBS);
+  dev_assert_secret(borrow == 0);
+  (void)borrow;
+  for (size_t i = 0; i < FE_LIMBS; ++i) {
+    r[i] = constant_time_select_w(is_zero, 0, r[i]);
+  }
+}
+
+static void NIST_POINT_select_w(NIST_POINT *out,
+                                const NIST_POINT table[TBL_SZ], size_t index) {
+  Elem x; limbs_zero(x, FE_LIMBS);
+  Elem y; limbs_zero(y, FE_LIMBS);
+  Elem z; limbs_zero(z, FE_LIMBS);
+
+  // TODO: Rewrite in terms of |limbs_select|.
+  for (size_t i = 0; i < TBL_SZ; ++i) {
+    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
+    for (size_t j = 0; j < FE_LIMBS; ++j) {
+      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
+      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
+      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
+    }
+  }
+
+  limbs_copy(out->X, x, FE_LIMBS);
+  limbs_copy(out->Y, y, FE_LIMBS);
+  limbs_copy(out->Z, z, FE_LIMBS);
+}
+
+#define bits_elem_neg(prefix, bits) RENAME_FUNC(prefix, bits, elem_neg)
+#define bits_elem_sub(prefix, bits) RENAME_FUNC(prefix, bits, elem_sub)
+#define bits_elem_div_by_2(prefix, bits) RENAME_FUNC(prefix, bits, elem_div_by_2)
+#define bits_elem_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, elem_mul_mont)
+#define bits_scalar_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, scalar_mul_mont)
+
+void bits_elem_neg(p, BITS)(Elem r, const Elem a) {
+  elem_neg(r, a);
+}
+
+void bits_elem_sub(p, BITS)(Elem r, const Elem a, const Elem b) {
+  elem_sub(r, a, b);
+}
+
+void bits_elem_div_by_2(p, BITS)(Elem r, const Elem a) {
+  elem_div_by_2(r, a);
+}
+
+void bits_elem_mul_mont(p, BITS)(Elem r, const Elem a, const Elem b) {
+  elem_mul_mont(r, a, b);
+}
+
+void bits_scalar_mul_mont(p, BITS)(ScalarMont r, const ScalarMont a,
+                                   const ScalarMont b) {
+  /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
+  bn_mul_mont(r, a, b, N, N_N0, FE_LIMBS);
+}
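The long comment in elem_div_by_2 above argues that, for odd a, halving mod q is (a >> 1) + ((q + 1) >> 1) with no final reduction. The identity is easy to check exhaustively with a single-limb stand-in; a toy verification (q here is a small odd prime chosen for the test, not a curve constant):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint32_t q = 65521; /* small odd prime standing in for the field */
      for (uint32_t a = 0; a < q; ++a) {
        /* Even: plain shift. Odd: shift plus (q + 1)/2, as in elem_div_by_2. */
        uint32_t half = (a >> 1) + ((a & 1) ? (q + 1) >> 1 : 0);
        if (half >= q || (2 * half) % q != a) {
          printf("mismatch at a = %u\n", a);
          return 1;
        }
      }
      puts("ok: 2 * half == a (mod q) and half < q for every a");
      return 0;
    }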
diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c
index 90065eaeb0..8e1e53ac1d 100644
--- a/crypto/fipsmodule/ec/gfp_p384.c
+++ b/crypto/fipsmodule/ec/gfp_p384.c
@@ -1,4 +1,5 @@
-/* Copyright 2016 Brian Smith.
+
+/* Copyright 2016-2023 Brian Smith.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -13,23 +14,22 @@
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 
 #include "../../limbs/limbs.h"
-
-#include "ecp_nistz384.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 
 #include "../../limbs/limbs.inl"
 
-/* XXX: Here we assume that the conversion from |Carry| to |Limb| is
- * constant-time, but we haven't verified that assumption. TODO: Fix it so
- * we don't need to make that assumption. */
+#define BITS 384
+
+#define P384_LIMBS (384u / LIMB_BITS)
+#define FE_LIMBS P384_LIMBS
 
-typedef Limb Elem[P384_LIMBS];
-typedef Limb ScalarMont[P384_LIMBS];
-typedef Limb Scalar[P384_LIMBS];
+typedef Limb Elem[FE_LIMBS];
+typedef Limb ScalarMont[FE_LIMBS];
+typedef Limb Scalar[FE_LIMBS];
 
-static const BN_ULONG Q[P384_LIMBS] = {
+static const Elem Q = {
 #if defined(OPENSSL_64_BIT)
   0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff,
   0xffffffffffffffff, 0xffffffffffffffff
@@ -39,7 +39,7 @@ static const BN_ULONG Q[P384_LIMBS] = {
 #endif
 };
 
-static const BN_ULONG N[P384_LIMBS] = {
+static const Elem N = {
 #if defined(OPENSSL_64_BIT)
   0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf,
   0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff
@@ -49,7 +49,7 @@ static const BN_ULONG N[P384_LIMBS] = {
 #endif
 };
 
-static const BN_ULONG ONE[P384_LIMBS] = {
+static const Elem ONE = {
 #if defined(OPENSSL_64_BIT)
   0xffffffff00000001, 0xffffffff, 1, 0, 0
 #else
@@ -83,164 +83,9 @@ static const BN_ULONG N_N0[] = {
 #define INLINE_IF_POSSIBLE inline
 #endif
 
-static inline Limb is_equal(const Elem a, const Elem b) {
-  return LIMBS_equal(a, b, P384_LIMBS);
-}
-
-static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) {
-  return LIMBS_are_zero(a, P384_LIMBS);
-}
-
-static inline void copy_conditional(Elem r, const Elem a,
-                                    const Limb condition) {
-  for (size_t i = 0; i < P384_LIMBS; ++i) {
-    r[i] = constant_time_select_w(condition, a[i], r[i]);
-  }
-}
-
-
-static inline void elem_add(Elem r, const Elem a, const Elem b) {
-  LIMBS_add_mod(r, a, b, Q, P384_LIMBS);
-}
-
-static inline void elem_sub(Elem r, const Elem a, const Elem b) {
-  LIMBS_sub_mod(r, a, b, Q, P384_LIMBS);
-}
-
-static void elem_div_by_2(Elem r, const Elem a) {
-  /* Consider the case where `a` is even. Then we can shift `a` right one bit
-   * and the result will still be valid because we didn't lose any bits and so
-   * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
-   *
-   * The remainder of this comment is considering the case where `a` is odd.
-   *
-   * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
-   * because the lowest bit is lost during the shift. For example, consider:
-   *
-   * ```python
-   * q = 2**384 - 2**128 - 2**96 + 2**32 - 1
-   * a = 2**383
-   * two_a = a * 2 % q
-   * assert two_a == 0x100000000ffffffffffffffff00000001
-   * ```
-   *
-   * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
-   * we divide `two_a` by two (mod q), we need to get the value `2**383`, which
-   * we obviously can't get with just a right shift.
-   *
-   * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
-   * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
-   * keep track of an extra most significant bit. We can avoid that by instead
-   * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
-   * significant bit of `a`. `q + 1` is even, which means it can be shifted
-   * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
-   * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
-   * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
-   * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
-   * bit of `a`, which is 1. Thus:
-   *
-   * sum  =  ((q + 1) >> 1) + (a >> 1)
-   * sum  =  (q + 1)/2 + (a >> 1)       (substituting (q + 1)/2)
-   *     <=  (q + 1)/2 + (q - 2 - 1)/2  (substituting a <= q - 2)
-   *     <=  (q + 1)/2 + (q - 3)/2      (simplifying)
-   *     <=  (q + 1 + q - 3)/2          (factoring out the common divisor)
-   *     <=  (2q - 2)/2                 (simplifying)
-   *     <=  q - 1                      (simplifying)
-   *
-   * Thus, no reduction of the sum mod `q` is necessary. */
-
-  Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
-
-  /* r = a >> 1. */
-  Limb carry = a[P384_LIMBS - 1] & 1;
-  r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1;
-  for (size_t i = 1; i < P384_LIMBS; ++i) {
-    Limb new_carry = a[P384_LIMBS - i - 1];
-    r[P384_LIMBS - i - 1] =
-        (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
-    carry = new_carry;
-  }
-
-  Elem adjusted;
-  BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS);
-  dev_assert_secret(carry2 == 0);
-  (void)carry2;
-  copy_conditional(r, adjusted, is_odd);
-}
-
-static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
-  /* XXX: Not (clearly) constant-time; inefficient.*/
-  bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS);
-}
-
-static inline void elem_mul_by_2(Elem r, const Elem a) {
-  LIMBS_shl_mod(r, a, Q, P384_LIMBS);
-}
-
-static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
-  /* XXX: inefficient. TODO: Replace with an integrated shift + add. */
-  Elem doubled;
-  elem_add(doubled, a, a);
-  elem_add(r, doubled, a);
-}
-
-static inline void elem_sqr_mont(Elem r, const Elem a) {
-  /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
-  elem_mul_mont(r, a, a);
-}
-
-void p384_elem_sub(Elem r, const Elem a, const Elem b) {
-  elem_sub(r, a, b);
-}
-
-void p384_elem_div_by_2(Elem r, const Elem a) {
-  elem_div_by_2(r, a);
-}
-
-void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) {
-  elem_mul_mont(r, a, b);
-}
-
-void p384_elem_neg(Elem r, const Elem a) {
-  Limb is_zero = LIMBS_are_zero(a, P384_LIMBS);
-  Carry borrow = limbs_sub(r, Q, a, P384_LIMBS);
-  dev_assert_secret(borrow == 0);
-  (void)borrow;
-  for (size_t i = 0; i < P384_LIMBS; ++i) {
-    r[i] = constant_time_select_w(is_zero, 0, r[i]);
-  }
-}
-
-
-void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a,
-                          const ScalarMont b) {
-  /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
-  bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS);
-}
-
-
-/* TODO(perf): Optimize this. */
-
-static void p384_point_select_w5(P384_POINT *out,
-                                 const P384_POINT table[16], size_t index) {
-  Elem x; limbs_zero(x, P384_LIMBS);
-  Elem y; limbs_zero(y, P384_LIMBS);
-  Elem z; limbs_zero(z, P384_LIMBS);
-
-  // TODO: Rewrite in terms of |limbs_select|.
-  for (size_t i = 0; i < 16; ++i) {
-    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
-    for (size_t j = 0; j < P384_LIMBS; ++j) {
-      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
-      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
-      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
-    }
-  }
-
-  limbs_copy(out->X, x, P384_LIMBS);
-  limbs_copy(out->Y, y, P384_LIMBS);
-  limbs_copy(out->Z, z, P384_LIMBS);
-}
-
-
-#include "ecp_nistz384.inl"
+/* Window values that are Ok for P384 (look at `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are Ok for P521 (look at `ecp_nistz.h`): 4 */
+#define W_BITS 5
+
+#include "ecp_nistz.inl"
+
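Two details of the generalized point_mul are worth spelling out. First, the ROUND_SIZE/START_INDEX expression reproduces the old hard-coded start: for BITS = 384 and W_BITS = 5, ROUND_SIZE is 385 = BITS + 1, so START_INDEX is 385 - 5 = 380, exactly the old `384 - 4`. Second, each loop iteration reads W_BITS + 1 overlapping scalar bits starting one bit below the window boundary, which is what Booth recoding consumes. The read can be sketched in isolation (window_at and the sample bytes are illustrative, not patch API):

    #include <stdint.h>
    #include <stdio.h>

    #define W_BITS 5
    #define W_MASK ((1 << (W_BITS + 1)) - 1)

    /* Read W_BITS + 1 bits starting at bit (index - 1) of the little-endian
     * byte string, as the main loop of point_mul does; two bytes always
     * suffice for W_BITS <= 7. */
    static uint32_t window_at(const uint8_t *p_str, size_t index) {
      size_t off = (index - 1) / 8;
      uint32_t wvalue = (uint32_t)p_str[off] | ((uint32_t)p_str[off + 1] << 8);
      return (wvalue >> ((index - 1) % 8)) & W_MASK;
    }

    int main(void) {
      const uint8_t p_str[2] = {0x6d, 0x01}; /* scalar 0x016d, little-endian */
      /* Bits 4..9 of 0x016d are 0b010110 = 0x16. */
      printf("window at index 5 = 0x%02x\n", (unsigned)window_at(p_str, 5));
      return 0;
    }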
diff --git a/mk/generate_curves.py b/mk/generate_curves.py
index 92efecadce..fed6d96f36 100644
--- a/mk/generate_curves.py
+++ b/mk/generate_curves.py
@@ -17,7 +17,7 @@ from textwrap import wrap
 
 rs_template = """
-// Copyright 2016-2023 Brian Smith.
+// Copyright 2016-2024 Brian Smith.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
@@ -215,6 +215,7 @@
         a: *const Limb, // [3][COMMON_OPS.num_limbs]
         b: *const Limb, // [3][COMMON_OPS.num_limbs]
     );
+
     fn p%(bits)s_point_mul(
         r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
         p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
@@ -345,19 +346,25 @@ def generate_rs(g, out_dir):
 
 #include "../../limbs/limbs.inl"
 
-typedef Limb Elem[P%(bits)d_LIMBS];
-typedef Limb ScalarMont[P%(bits)d_LIMBS];
-typedef Limb Scalar[P%(bits)d_LIMBS];
+#define BITS %(bits)d
+
+#define P%(bits)d_LIMBS (%(bits)du / LIMB_BITS)
+
+#define FE_LIMBS P%(bits)d_LIMBS
+
+typedef Limb Elem[FE_LIMBS];
+typedef Limb ScalarMont[FE_LIMBS];
+typedef Limb Scalar[FE_LIMBS];
 
-static const BN_ULONG Q[P%(bits)d_LIMBS] = {
+static const Elem Q = {
 %(q)s
 };
 
-static const BN_ULONG N[P%(bits)d_LIMBS] = {
+static const Elem N = {
 %(n)s
 };
 
-static const BN_ULONG ONE[P%(bits)d_LIMBS] = {
+static const Elem ONE = {
 %(q_one)s
 };
 
@@ -373,6 +380,20 @@ def generate_rs(g, out_dir):
 %(n_n0)s
 };
 
+/* XXX: MSVC for x86 warns when it fails to inline these functions it should
+ * probably inline. */
+#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
+#define INLINE_IF_POSSIBLE __forceinline
+#else
+#define INLINE_IF_POSSIBLE inline
+#endif
+
+/* Window values that are Ok for P384 (look at `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are Ok for P521 (look at `ecp_nistz.h`): 4 */
+#define W_BITS %(w_bits)d
+
+#include "ecp_nistz.inl"
+
 """
 
 # Given a number |x|, return a generator of a sequence |a| such that
@@ -437,6 +458,7 @@ def generate_c(g, out_dir):
         "q_plus_1_shr_1": format_big_int(const((q + 1) >> 1), big_int_limbs(q)),
         "n" : format_big_int(const(n), big_int_limbs(q)),
         "n_n0": format_n0(n),
+        "w_bits": g["w_bits"],
     }
 
     out_path = os.path.join(out_dir, "gfp_%s.c" % name)
@@ -476,6 +498,7 @@ def generate(g, out_dir):
     "Gx": 0x6b17d1f2_e12c4247_f8bce6e5_63a440f2_77037d81_2deb33a0_f4a13945_d898c296,
     "Gy": 0x4fe342e2_fe1a7f9b_8ee7eb4a_7c0f9e16_2bce3357_6b315ece_cbb64068_37bf51f5,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p384 = {
@@ -488,6 +511,7 @@
     "Gx": 0xaa87ca22_be8b0537_8eb1c71e_f320ad74_6e1d3b62_8ba79b98_59f741e0_82542a38_5502f25d_bf55296c_3a545e38_72760ab7,
     "Gy": 0x3617de4a_96262c6f_5d9e98bf_9292dc29_f8f41dbd_289a147c_e9da3113_b5f0b8c0_0a60b1ce_1d7e819d_7a431d7c_90ea0e5f,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p521 = {
@@ -500,6 +524,7 @@
     "Gx": 0xc6_858e06b7_0404e9cd_9e3ecb66_2395b442_9c648139_053fb521_f828af60_6b4d3dba_a14b5e77_efe75928_fe1dc127_a2ffa8de_3348b3c1_856a429b_f97e7e31_c2e5bd66,
     "Gy": 0x118_39296a78_9a3bc004_5c8a5fb4_2c7d1bd9_98f54449_579b4468_17afbd17_273e662c_97ee7299_5ef42640_c550b901_3fad0761_353c7086_a272c240_88be9476_9fd16650,
     "cofactor": 1,
+    "w_bits": 4,
 }
 
 import os
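Finally, note what did not change: table lookups stay constant-time. NIST_POINT_select_w (like the p384_point_select_w5 it replaces) scans every table entry and mask-merges the one whose 1-based position matches, so the memory-access pattern is independent of the secret Booth digit. The pattern in miniature, with single-limb "points" and a hand-rolled equality mask standing in for constant_time_eq_w (all names here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t Limb;

    /* All-ones when a == b, all-zeros otherwise, with no branches. */
    static Limb ct_eq(Limb a, Limb b) {
      Limb diff = a ^ b;
      return ((diff | (0 - diff)) >> 63) - 1;
    }

    /* Select table[index - 1]; index 0 selects nothing, mirroring the
     * offset-by-one layout where the point at infinity is never stored. */
    static Limb select_w(const Limb table[4], Limb index) {
      Limb acc = 0;
      for (Limb i = 0; i < 4; ++i) {
        acc |= table[i] & ct_eq(index, i + 1);
      }
      return acc;
    }

    int main(void) {
      const Limb table[4] = {10, 20, 30, 40};
      printf("%llu %llu\n",
             (unsigned long long)select_w(table, 3),   /* 30 */
             (unsigned long long)select_w(table, 0));  /* 0 (infinity) */
      return 0;
    }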