p256: add 32-bit arithmetic and optimizations (RustCrypto#1033)

Co-authored-by: davideciacciolo <[email protected]>
joshka · Mar 13, 2024 · 6ff3bb7 · 6ff3bb7
1 parent 7652c58
commit 6ff3bb7
Show file tree

Hide file tree

Showing 8 changed files with 841 additions and 470 deletions.
diff --git a/p256/src/arithmetic.rs b/p256/src/arithmetic.rs
@@ -8,7 +8,6 @@ pub(crate) mod field;
 #[cfg(feature = "hash2curve")]
 mod hash2curve;
 pub(crate) mod scalar;
-pub(crate) mod util;
 
 use self::{field::FieldElement, scalar::Scalar};
 use crate::NistP256;

diff --git a/p256/src/arithmetic/field.rs b/p256/src/arithmetic/field.rs
@@ -2,17 +2,18 @@
 
 #![allow(clippy::assign_op_pattern, clippy::op_ref)]
 
-use crate::{
-    arithmetic::util::{adc, mac, sbb, u256_to_u64x4, u64x4_to_u256},
-    FieldBytes,
-};
+#[cfg_attr(target_pointer_width = "32", path = "field/field32.rs")]
+#[cfg_attr(target_pointer_width = "64", path = "field/field64.rs")]
+mod field_impl;
+
+use crate::FieldBytes;
 use core::{
     iter::{Product, Sum},
     ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
 };
 use elliptic_curve::ops::Invert;
 use elliptic_curve::{
-    bigint::{ArrayEncoding, U256},
+    bigint::{ArrayEncoding, U256, U512},
     ff::{Field, PrimeField},
     rand_core::RngCore,
     subtle::{Choice, ConditionallySelectable, ConstantTimeEq, ConstantTimeLess, CtOption},
@@ -116,171 +117,28 @@ impl FieldElement {
 
     /// Returns self + rhs mod p
     pub const fn add(&self, rhs: &Self) -> Self {
-        let a = u256_to_u64x4(self.0);
-        let b = u256_to_u64x4(rhs.0);
-
-        // Bit 256 of p is set, so addition can result in five words.
-        let (w0, carry) = adc(a[0], b[0], 0);
-        let (w1, carry) = adc(a[1], b[1], carry);
-        let (w2, carry) = adc(a[2], b[2], carry);
-        let (w3, w4) = adc(a[3], b[3], carry);
-
-        // Attempt to subtract the modulus, to ensure the result is in the field.
-        let modulus = u256_to_u64x4(MODULUS.0);
-        let (result, _) = Self::sub_inner(
-            w0, w1, w2, w3, w4, modulus[0], modulus[1], modulus[2], modulus[3], 0,
-        );
-        result
+        Self(field_impl::add(self.0, rhs.0))
     }
 
-    /// Returns 2*self.
+    /// Returns 2 * self.
     pub const fn double(&self) -> Self {
         self.add(self)
     }
 
     /// Returns self - rhs mod p
     pub const fn sub(&self, rhs: &Self) -> Self {
-        let a = u256_to_u64x4(self.0);
-        let b = u256_to_u64x4(rhs.0);
-        Self::sub_inner(a[0], a[1], a[2], a[3], 0, b[0], b[1], b[2], b[3], 0).0
+        Self(field_impl::sub(self.0, rhs.0))
     }
 
     /// Negate element.
     pub const fn neg(&self) -> Self {
         Self::sub(&Self::ZERO, self)
     }
 
-    fn from_bytes_wide(bytes: [u8; 64]) -> Self {
-        #[allow(clippy::unwrap_used)]
-        FieldElement::montgomery_reduce(
-            u64::from_be_bytes(bytes[0..8].try_into().unwrap()),
-            u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
-            u64::from_be_bytes(bytes[16..24].try_into().unwrap()),
-            u64::from_be_bytes(bytes[24..32].try_into().unwrap()),
-            u64::from_be_bytes(bytes[32..40].try_into().unwrap()),
-            u64::from_be_bytes(bytes[40..48].try_into().unwrap()),
-            u64::from_be_bytes(bytes[48..56].try_into().unwrap()),
-            u64::from_be_bytes(bytes[56..64].try_into().unwrap()),
-        )
-    }
-
-    #[inline]
-    #[allow(clippy::too_many_arguments)]
-    const fn sub_inner(
-        l0: u64,
-        l1: u64,
-        l2: u64,
-        l3: u64,
-        l4: u64,
-        r0: u64,
-        r1: u64,
-        r2: u64,
-        r3: u64,
-        r4: u64,
-    ) -> (Self, u64) {
-        let (w0, borrow) = sbb(l0, r0, 0);
-        let (w1, borrow) = sbb(l1, r1, borrow);
-        let (w2, borrow) = sbb(l2, r2, borrow);
-        let (w3, borrow) = sbb(l3, r3, borrow);
-        let (_, borrow) = sbb(l4, r4, borrow);
-
-        // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
-        // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the
-        // modulus.
-        let modulus = u256_to_u64x4(MODULUS.0);
-        let (w0, carry) = adc(w0, modulus[0] & borrow, 0);
-        let (w1, carry) = adc(w1, modulus[1] & borrow, carry);
-        let (w2, carry) = adc(w2, modulus[2] & borrow, carry);
-        let (w3, _) = adc(w3, modulus[3] & borrow, carry);
-
-        (Self(u64x4_to_u256([w0, w1, w2, w3])), borrow)
-    }
-
-    /// Montgomery Reduction
-    ///
-    /// The general algorithm is:
-    /// ```text
-    /// A <- input (2n b-limbs)
-    /// for i in 0..n {
-    ///     k <- A[i] p' mod b
-    ///     A <- A + k p b^i
-    /// }
-    /// A <- A / b^n
-    /// if A >= p {
-    ///     A <- A - p
-    /// }
-    /// ```
-    ///
-    /// For secp256r1, we have the following simplifications:
-    ///
-    /// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A.
-    ///
-    /// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified
-    ///   to a shift and subtraction:
-    ///   ```text
-    ///       a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i
-    ///   ```
-    ///   However, because `p' = 1`, the first limb of p is multiplied by limb i of the
-    ///   intermediate A and then immediately added to that same limb, so we simply
-    ///   initialize the carry to limb i of the intermediate.
-    ///
-    /// - The third limb of p is zero, so we can ignore any multiplications by it and just
-    ///   add the carry.
-    ///
-    /// References:
-    /// - Handbook of Applied Cryptography, Chapter 14
-    ///   Algorithm 14.32
-    ///   http://cacr.uwaterloo.ca/hac/about/chap14.pdf
-    ///
-    /// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256
-    ///   Algorithm 7) Montgomery Word-by-Word Reduction
-    ///   https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf
-    #[inline]
-    #[allow(clippy::too_many_arguments)]
-    const fn montgomery_reduce(
-        r0: u64,
-        r1: u64,
-        r2: u64,
-        r3: u64,
-        r4: u64,
-        r5: u64,
-        r6: u64,
-        r7: u64,
-    ) -> Self {
-        let modulus = u256_to_u64x4(MODULUS.0);
-
-        let (r1, carry) = mac(r1, r0, modulus[1], r0);
-        let (r2, carry) = adc(r2, 0, carry);
-        let (r3, carry) = mac(r3, r0, modulus[3], carry);
-        let (r4, carry2) = adc(r4, 0, carry);
-
-        let (r2, carry) = mac(r2, r1, modulus[1], r1);
-        let (r3, carry) = adc(r3, 0, carry);
-        let (r4, carry) = mac(r4, r1, modulus[3], carry);
-        let (r5, carry2) = adc(r5, carry2, carry);
-
-        let (r3, carry) = mac(r3, r2, modulus[1], r2);
-        let (r4, carry) = adc(r4, 0, carry);
-        let (r5, carry) = mac(r5, r2, modulus[3], carry);
-        let (r6, carry2) = adc(r6, carry2, carry);
-
-        let (r4, carry) = mac(r4, r3, modulus[1], r3);
-        let (r5, carry) = adc(r5, 0, carry);
-        let (r6, carry) = mac(r6, r3, modulus[3], carry);
-        let (r7, r8) = adc(r7, carry2, carry);
-
-        // Result may be within MODULUS of the correct value
-        let (result, _) = Self::sub_inner(
-            r4, r5, r6, r7, r8, modulus[0], modulus[1], modulus[2], modulus[3], 0,
-        );
-        result
-    }
-
     /// Translate a field element out of the Montgomery domain.
     #[inline]
     pub(crate) const fn to_canonical(self) -> Self {
-        let w = u256_to_u64x4(self.0);
-        FieldElement::montgomery_reduce(w[0], w[1], w[2], w[3], 0, 0, 0, 0)
+        Self(field_impl::to_canonical(self.0))
     }
 
     /// Translate a field element into the Montgomery domain.
@@ -291,31 +149,8 @@ impl FieldElement {
 
     /// Returns self * rhs mod p
     pub const fn multiply(&self, rhs: &Self) -> Self {
-        // Schoolbook multiplication.
-        let a = u256_to_u64x4(self.0);
-        let b = u256_to_u64x4(rhs.0);
-
-        let (w0, carry) = mac(0, a[0], b[0], 0);
-        let (w1, carry) = mac(0, a[0], b[1], carry);
-        let (w2, carry) = mac(0, a[0], b[2], carry);
-        let (w3, w4) = mac(0, a[0], b[3], carry);
-
-        let (w1, carry) = mac(w1, a[1], b[0], 0);
-        let (w2, carry) = mac(w2, a[1], b[1], carry);
-        let (w3, carry) = mac(w3, a[1], b[2], carry);
-        let (w4, w5) = mac(w4, a[1], b[3], carry);
-
-        let (w2, carry) = mac(w2, a[2], b[0], 0);
-        let (w3, carry) = mac(w3, a[2], b[1], carry);
-        let (w4, carry) = mac(w4, a[2], b[2], carry);
-        let (w5, w6) = mac(w5, a[2], b[3], carry);
-
-        let (w3, carry) = mac(w3, a[3], b[0], 0);
-        let (w4, carry) = mac(w4, a[3], b[1], carry);
-        let (w5, carry) = mac(w5, a[3], b[2], carry);
-        let (w6, w7) = mac(w6, a[3], b[3], carry);
-
-        FieldElement::montgomery_reduce(w0, w1, w2, w3, w4, w5, w6, w7)
+        let (lo, hi): (U256, U256) = self.0.split_mul(&rhs.0);
+        Self(field_impl::montgomery_reduce(lo, hi))
     }
 
     /// Returns self * self mod p
@@ -420,7 +255,8 @@ impl Field for FieldElement {
         // negligible bias from the uniform distribution.
         let mut buf = [0; 64];
         rng.fill_bytes(&mut buf);
-        FieldElement::from_bytes_wide(buf)
+        let buf = U512::from_be_slice(&buf);
+        Self(field_impl::from_bytes_wide(buf))
     }
 
     #[must_use]
@@ -666,9 +502,13 @@ impl<'a> Product<&'a FieldElement> for FieldElement {
 
 #[cfg(test)]
 mod tests {
-    use super::{u64x4_to_u256, FieldElement};
+    use super::FieldElement;
     use crate::{test_vectors::field::DBL_TEST_VECTORS, FieldBytes};
     use core::ops::Mul;
+
+    #[cfg(target_pointer_width = "64")]
+    use crate::U256;
+    #[cfg(target_pointer_width = "64")]
     use proptest::{num::u64::ANY, prelude::*};
 
     #[test]
@@ -783,6 +623,7 @@ mod tests {
         assert_eq!(four.sqrt().unwrap(), two);
     }
 
+    #[cfg(target_pointer_width = "64")]
     proptest! {
         /// This checks behaviour well within the field ranges, because it doesn't set the
         /// highest limb.
@@ -795,8 +636,8 @@ mod tests {
             b1 in ANY,
             b2 in ANY,
         ) {
-            let a = FieldElement(u64x4_to_u256([a0, a1, a2, 0]));
-            let b = FieldElement(u64x4_to_u256([b0, b1, b2, 0]));
+            let a = FieldElement(U256::from_words([a0, a1, a2, 0]));
+            let b = FieldElement(U256::from_words([b0, b1, b2, 0]));
             assert_eq!(a.add(&b).sub(&a), b);
         }
     }