From 40018e35cf58dbfac773d3332a35116a87063c67 Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 09:34:43 -0700
Subject: [PATCH 1/7] NFC P-256: Add `twin_mul` method to `PublicScalarOps`.

Allow each curve to provide its own `twin_mul` implementation. For now,
use the same implementation we've been using.
---
 src/ec/suite_b/ecdsa/verification.rs | 16 ++--------------
 src/ec/suite_b/ops.rs                | 18 ++++++++++++++----
 src/ec/suite_b/ops/p256.rs           |  5 ++++-
 src/ec/suite_b/ops/p384.rs           |  4 +++-
 4 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/src/ec/suite_b/ecdsa/verification.rs b/src/ec/suite_b/ecdsa/verification.rs
index 2e9f50127d..157753e5cb 100644
--- a/src/ec/suite_b/ecdsa/verification.rs
+++ b/src/ec/suite_b/ecdsa/verification.rs
@@ -123,7 +123,7 @@ impl EcdsaVerificationAlgorithm {
         // NSA Guide Step 6: "Compute the elliptic curve point
         // R = (xR, yR) = u1*G + u2*Q, using EC scalar multiplication and EC
         // addition. If R is equal to the point at infinity, output INVALID."
-        let product = twin_mul(self.ops.private_key_ops, &u1, &u2, &peer_pub_key);
+        let product = (self.ops.twin_mul)(&u1, &u2, &peer_pub_key);
 
         // Verify that the point we computed is on the curve; see
         // `verify_affine_point_is_on_the_curve_scaled` for details on why. It
@@ -158,7 +158,7 @@ impl EcdsaVerificationAlgorithm {
         }
         if self.ops.elem_less_than(&r, &self.ops.q_minus_n) {
             self.ops
-                .private_key_ops
+                .scalar_ops
                 .common
                 .elem_add(&mut r, &public_key_ops.common.n);
             if sig_r_equals_x(self.ops, &r, &x, &z2) {
@@ -193,18 +193,6 @@ fn split_rs_asn1<'a>(
     })
 }
 
-fn twin_mul(
-    ops: &PrivateKeyOps,
-    g_scalar: &Scalar,
-    p_scalar: &Scalar,
-    p_xy: &(Elem<R>, Elem<R>),
-) -> Point {
-    // XXX: Inefficient. TODO: implement interleaved wNAF multiplication.
-    let scaled_g = ops.point_mul_base(g_scalar);
-    let scaled_p = ops.point_mul(p_scalar, p_xy);
-    ops.common.point_sum(&scaled_g, &scaled_p)
-}
-
 /// Verification of fixed-length (PKCS#11 style) ECDSA signatures using the
 /// P-256 curve and SHA-256.
 ///
diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs
index b42b958efa..83683a0f73 100644
--- a/src/ec/suite_b/ops.rs
+++ b/src/ec/suite_b/ops.rs
@@ -270,10 +270,7 @@ pub struct PublicScalarOps {
     pub scalar_ops: &'static ScalarOps,
     pub public_key_ops: &'static PublicKeyOps,
 
-    // XXX: `PublicScalarOps` shouldn't depend on `PrivateKeyOps`, but it does
-    // temporarily until `twin_mul` is rewritten.
-    pub private_key_ops: &'static PrivateKeyOps,
-
+    pub twin_mul: fn(g_scalar: &Scalar, p_scalar: &Scalar, p_xy: &(Elem<R>, Elem<R>)) -> Point,
     pub q_minus_n: Elem<Unencoded>,
 }
 
@@ -305,6 +302,19 @@ pub struct PrivateScalarOps {
     pub oneRR_mod_n: Scalar<RR>, // 1 * R**2 (mod n). TOOD: Use One<RR>.
 }
 
+// XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF
+// multiplication.
+fn twin_mul_inefficient(
+    ops: &PrivateKeyOps,
+    g_scalar: &Scalar,
+    p_scalar: &Scalar,
+    p_xy: &(Elem<R>, Elem<R>),
+) -> Point {
+    let scaled_g = ops.point_mul_base(g_scalar);
+    let scaled_p = ops.point_mul(p_scalar, p_xy);
+    ops.common.point_sum(&scaled_g, &scaled_p)
+}
+
 // This assumes n < q < 2*n.
 pub fn elem_reduced_to_scalar(ops: &CommonOps, elem: &Elem<Unencoded>) -> Scalar<Unencoded> {
     let num_limbs = ops.num_limbs;
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index b7ea524a1d..c0ccbcdc1f 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -114,7 +114,10 @@ pub static SCALAR_OPS: ScalarOps = ScalarOps {
 pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     scalar_ops: &SCALAR_OPS,
     public_key_ops: &PUBLIC_KEY_OPS,
-    private_key_ops: &PRIVATE_KEY_OPS,
+    twin_mul: |g_scalar, p_scalar, p_xy| {
+        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
+    },
+
     q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),
 };
 
diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs
index 6ef4bc3f9e..f424c520d0 100644
--- a/src/ec/suite_b/ops/p384.rs
+++ b/src/ec/suite_b/ops/p384.rs
@@ -122,7 +122,9 @@ pub static SCALAR_OPS: ScalarOps = ScalarOps {
 pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     scalar_ops: &SCALAR_OPS,
     public_key_ops: &PUBLIC_KEY_OPS,
-    private_key_ops: &PRIVATE_KEY_OPS,
+    twin_mul: |g_scalar, p_scalar, p_xy| {
+        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
+    },
 
     q_minus_n: Elem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"),
 };

From 83ceb38075b5b0864ac33e5e05e12cd0d84d167a Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 10:44:17 -0700
Subject: [PATCH 2/7] Import ecp_nistz256_points_mul_public from BoringSSL.

From BoringSSL commit 8d71d244c0debac4079beeb02b5802fde59b94bd.

Disable it with `#if 0` until it is modified to work.
---
 crypto/fipsmodule/ec/p256-nistz.c | 68 +++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c
index 33add75fcb..abd7739e9e 100644
--- a/crypto/fipsmodule/ec/p256-nistz.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -284,4 +284,72 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
   limbs_copy(r->Z, p.Z, P256_LIMBS);
 }
 
+#if 0
+
+static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
+                                           EC_JACOBIAN *r,
+                                           const EC_SCALAR *g_scalar,
+                                           const EC_JACOBIAN *p_,
+                                           const EC_SCALAR *p_scalar) {
+  assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
+
+  alignas(32) P256_POINT p;
+  uint8_t p_str[33];
+  OPENSSL_memcpy(p_str, g_scalar->words, 32);
+  p_str[32] = 0;
+
+  // First window
+  size_t index = 0;
+  size_t wvalue = calc_first_wvalue(&index, p_str);
+
+  // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
+  // is infinity and |ONE| otherwise. |p| was computed from the table, so it
+  // is infinity iff |wvalue >> 1| is zero.
+  if ((wvalue >> 1) != 0) {
+    OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X,
+                   sizeof(p.X));
+    OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y,
+                   sizeof(p.Y));
+    OPENSSL_memcpy(p.Z, ONE, sizeof(p.Z));
+  } else {
+    OPENSSL_memset(p.X, 0, sizeof(p.X));
+    OPENSSL_memset(p.Y, 0, sizeof(p.Y));
+    OPENSSL_memset(p.Z, 0, sizeof(p.Z));
+  }
+
+  if ((wvalue & 1) == 1) {
+    ecp_nistz256_neg(p.Y, p.Y);
+  }
+
+  for (int i = 1; i < 37; i++) {
+    wvalue = calc_wvalue(&index, p_str);
+    if ((wvalue >> 1) == 0) {
+      continue;
+    }
+
+    alignas(32) P256_POINT_AFFINE t;
+    OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1],
+                   sizeof(t));
+    if ((wvalue & 1) == 1) {
+      ecp_nistz256_neg(t.Y, t.Y);
+    }
+
+    // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are
+    // the same non-infinity point, so it is important that we compute the
+    // |g_scalar| term before the |p_scalar| term.
+    ecp_nistz256_point_add_affine(&p, &p, &t);
+  }
+
+  alignas(32) P256_POINT tmp;
+  ecp_nistz256_windowed_mul(group, &tmp, p_, p_scalar);
+  ecp_nistz256_point_add(&p, &p, &tmp);
+
+  assert(group->field.N.width == P256_LIMBS);
+  OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
+#endif
+
 #endif /* defined(OPENSSL_USE_NISTZ256) */

From 4fa99059cc9720e929d5a5b4a1a3bb2410678f1d Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 10:30:49 -0700
Subject: [PATCH 3/7] P-256 ECDSA verification: Use optimized nistz256
 verification.

Import the optimized nistz256 verification from BoringSSL.
---
 build.rs                          |  1 +
 crypto/fipsmodule/ec/p256-nistz.c | 27 ++++++++++---------------
 src/ec/suite_b/ops/p256.rs        | 33 ++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/build.rs b/build.rs
index 5c328aa35b..a1e1f41d81 100644
--- a/build.rs
+++ b/build.rs
@@ -958,6 +958,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "p256_point_double",
         "p256_point_mul",
         "p256_point_mul_base",
+        "p256_points_mul_public",
         "p256_scalar_mul_mont",
         "p256_scalar_sqr_rep_mont",
         "p256_sqr_mont",
diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c
index abd7739e9e..f0fc61424d 100644
--- a/crypto/fipsmodule/ec/p256-nistz.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -284,18 +284,14 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
   limbs_copy(r->Z, p.Z, P256_LIMBS);
 }
 
-#if 0
-
-static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
-                                           EC_JACOBIAN *r,
-                                           const EC_SCALAR *g_scalar,
-                                           const EC_JACOBIAN *p_,
-                                           const EC_SCALAR *p_scalar) {
-  assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
-
+void p256_points_mul_public(P256_POINT *r,
+                            const Limb g_scalar[P256_LIMBS],
+                            const Limb p_scalar[P256_LIMBS],
+                            const Limb p_x[P256_LIMBS],
+                            const Limb p_y[P256_LIMBS]) {
   alignas(32) P256_POINT p;
   uint8_t p_str[33];
-  OPENSSL_memcpy(p_str, g_scalar->words, 32);
+  OPENSSL_memcpy(p_str, g_scalar, 32);
   p_str[32] = 0;
 
   // First window
@@ -341,15 +337,12 @@ static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
   }
 
   alignas(32) P256_POINT tmp;
-  ecp_nistz256_windowed_mul(group, &tmp, p_, p_scalar);
+  ecp_nistz256_windowed_mul(&tmp, p_scalar, p_x, p_y);
   ecp_nistz256_point_add(&p, &p, &tmp);
 
-  assert(group->field.N.width == P256_LIMBS);
-  OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->X, p.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y, p.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z, p.Z, P256_LIMBS * sizeof(BN_ULONG));
 }
 
-#endif
-
 #endif /* defined(OPENSSL_USE_NISTZ256) */
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index c0ccbcdc1f..566dbfe2be 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -114,6 +114,11 @@ pub static SCALAR_OPS: ScalarOps = ScalarOps {
 pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     scalar_ops: &SCALAR_OPS,
     public_key_ops: &PUBLIC_KEY_OPS,
+
+    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+    twin_mul: twin_mul_nistz256,
+
+    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
     twin_mul: |g_scalar, p_scalar, p_xy| {
         twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
     },
@@ -121,6 +126,33 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),
 };
 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+fn twin_mul_nistz256(
+    g_scalar: &Scalar,
+    p_scalar: &Scalar,
+    (p_x, p_y): &(Elem<R>, Elem<R>),
+) -> Point {
+    prefixed_extern! {
+        fn p256_points_mul_public(r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
+                                  g_scalar: *const Limb, // [COMMON_OPS.num_limbs]
+                                  p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
+                                  p_x: *const Limb,      // [COMMON_OPS.num_limbs]
+                                  p_y: *const Limb,      // [COMMON_OPS.num_limbs]
+        );
+    }
+    let mut r = Point::new_at_infinity();
+    unsafe {
+        p256_points_mul_public(
+            r.xyz.as_mut_ptr(),
+            g_scalar.limbs.as_ptr(),
+            p_scalar.limbs.as_ptr(),
+            p_x.limbs.as_ptr(),
+            p_y.limbs.as_ptr(),
+        );
+    }
+    r
+}
+
 pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps {
     scalar_ops: &SCALAR_OPS,
 
@@ -273,7 +305,6 @@ prefixed_extern! {
         p_x: *const Limb,      // [COMMON_OPS.num_limbs]
         p_y: *const Limb,      // [COMMON_OPS.num_limbs]
     );
-
     fn p256_scalar_mul_mont(
         r: *mut Limb,   // [COMMON_OPS.num_limbs]
         a: *const Limb, // [COMMON_OPS.num_limbs]

From 86f49768e8f01551f326d57db8fd76d404945894 Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 11:12:20 -0700
Subject: [PATCH 4/7] P-256 ECDSA verification: Clarify multiplication.

Move more of the logic for the nistz256 multiplication into Rust.
---
 build.rs                          |  2 +-
 crypto/fipsmodule/ec/p256-nistz.c | 11 ++------
 src/ec/suite_b/ops.rs             | 10 +++++--
 src/ec/suite_b/ops/p256.rs        | 46 ++++++++++++++++++-------------
 4 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/build.rs b/build.rs
index a1e1f41d81..3cbae9d926 100644
--- a/build.rs
+++ b/build.rs
@@ -958,7 +958,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "p256_point_double",
         "p256_point_mul",
         "p256_point_mul_base",
-        "p256_points_mul_public",
+        "p256_point_mul_base_vartime",
         "p256_scalar_mul_mont",
         "p256_scalar_sqr_rep_mont",
         "p256_sqr_mont",
diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c
index f0fc61424d..c40b1085db 100644
--- a/crypto/fipsmodule/ec/p256-nistz.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -284,11 +284,8 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
   limbs_copy(r->Z, p.Z, P256_LIMBS);
 }
 
-void p256_points_mul_public(P256_POINT *r,
-                            const Limb g_scalar[P256_LIMBS],
-                            const Limb p_scalar[P256_LIMBS],
-                            const Limb p_x[P256_LIMBS],
-                            const Limb p_y[P256_LIMBS]) {
+void p256_point_mul_base_vartime(P256_POINT *r,
+                                 const Limb g_scalar[P256_LIMBS]) {
   alignas(32) P256_POINT p;
   uint8_t p_str[33];
   OPENSSL_memcpy(p_str, g_scalar, 32);
@@ -336,10 +333,6 @@ void p256_points_mul_public(P256_POINT *r,
     ecp_nistz256_point_add_affine(&p, &p, &t);
   }
 
-  alignas(32) P256_POINT tmp;
-  ecp_nistz256_windowed_mul(&tmp, p_scalar, p_x, p_y);
-  ecp_nistz256_point_add(&p, &p, &tmp);
-
   OPENSSL_memcpy(r->X, p.X, P256_LIMBS * sizeof(BN_ULONG));
   OPENSSL_memcpy(r->Y, p.Y, P256_LIMBS * sizeof(BN_ULONG));
   OPENSSL_memcpy(r->Z, p.Z, P256_LIMBS * sizeof(BN_ULONG));
diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs
index 83683a0f73..5aa241390e 100644
--- a/src/ec/suite_b/ops.rs
+++ b/src/ec/suite_b/ops.rs
@@ -979,6 +979,7 @@ mod tests {
     fn p256_point_mul_base_test() {
         point_mul_base_tests(
             &p256::PRIVATE_KEY_OPS,
+            |s| p256::PRIVATE_KEY_OPS.point_mul_base(s),
             test_file!("ops/p256_point_mul_base_tests.txt"),
         );
     }
@@ -987,16 +988,21 @@ mod tests {
     fn p384_point_mul_base_test() {
         point_mul_base_tests(
             &p384::PRIVATE_KEY_OPS,
+            |s| p384::PRIVATE_KEY_OPS.point_mul_base(s),
             test_file!("ops/p384_point_mul_base_tests.txt"),
         );
     }
 
-    fn point_mul_base_tests(ops: &PrivateKeyOps, test_file: test::File) {
+    pub(super) fn point_mul_base_tests(
+        ops: &PrivateKeyOps,
+        f: impl Fn(&Scalar) -> Point,
+        test_file: test::File,
+    ) {
         test::run(test_file, |section, test_case| {
             assert_eq!(section, "");
             let g_scalar = consume_scalar(ops.common, test_case, "g_scalar");
             let expected_result = consume_point(ops, test_case, "r");
-            let actual_result = ops.point_mul_base(&g_scalar);
+            let actual_result = f(&g_scalar);
             assert_point_actual_equals_expected(ops, &actual_result, &expected_result);
             Ok(())
         })
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index 566dbfe2be..adbed60936 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -127,30 +127,24 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
 };
 
 #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
-fn twin_mul_nistz256(
-    g_scalar: &Scalar,
-    p_scalar: &Scalar,
-    (p_x, p_y): &(Elem<R>, Elem<R>),
-) -> Point {
+fn twin_mul_nistz256(g_scalar: &Scalar, p_scalar: &Scalar, p_xy: &(Elem<R>, Elem<R>)) -> Point {
+    let scaled_g = point_mul_base_vartime(g_scalar);
+    let scaled_p = PRIVATE_KEY_OPS.point_mul(p_scalar, p_xy);
+    PRIVATE_KEY_OPS.common.point_sum(&scaled_g, &scaled_p)
+}
+
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+fn point_mul_base_vartime(g_scalar: &Scalar) -> Point {
     prefixed_extern! {
-        fn p256_points_mul_public(r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
-                                  g_scalar: *const Limb, // [COMMON_OPS.num_limbs]
-                                  p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
-                                  p_x: *const Limb,      // [COMMON_OPS.num_limbs]
-                                  p_y: *const Limb,      // [COMMON_OPS.num_limbs]
+        fn p256_point_mul_base_vartime(r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
+                                       g_scalar: *const Limb, // [COMMON_OPS.num_limbs]
         );
     }
-    let mut r = Point::new_at_infinity();
+    let mut scaled_g = Point::new_at_infinity();
     unsafe {
-        p256_points_mul_public(
-            r.xyz.as_mut_ptr(),
-            g_scalar.limbs.as_ptr(),
-            p_scalar.limbs.as_ptr(),
-            p_x.limbs.as_ptr(),
-            p_y.limbs.as_ptr(),
-        );
+        p256_point_mul_base_vartime(scaled_g.xyz.as_mut_ptr(), g_scalar.limbs.as_ptr());
     }
-    r
+    scaled_g
 }
 
 pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps {
@@ -316,3 +310,17 @@ prefixed_extern! {
         rep: Limb,
     );
 }
+
+#[cfg(test)]
+mod tests {
+    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
+    #[test]
+    fn p256_point_mul_base_vartime_test() {
+        use super::{super::tests::point_mul_base_tests, *};
+        point_mul_base_tests(
+            &PRIVATE_KEY_OPS,
+            point_mul_base_vartime,
+            test_file!("p256_point_mul_base_tests.txt"),
+        );
+    }
+}

From e6728bfb71fcf88df207e5e0c31411abba95c22c Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 14:25:41 -0700
Subject: [PATCH 5/7] P-256 nistz: Use arrays instead of P256_POINT in boundary
 functions.

Better match the Rust declarations of these functions. Prepare to
support more target platforms and more weird things (like P-521) by
avoiding any kind of alignment assumptions at the language boundary
(or elsewhere).
---
 crypto/fipsmodule/ec/p256-nistz.c |  16 ++---
 crypto/fipsmodule/ec/p256.c       | 112 ++++++++++++++++++++----------
 2 files changed, 85 insertions(+), 43 deletions(-)

diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c
index c40b1085db..aa6344c68b 100644
--- a/crypto/fipsmodule/ec/p256-nistz.c
+++ b/crypto/fipsmodule/ec/p256-nistz.c
@@ -232,18 +232,18 @@ static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
   return booth_recode_w7(wvalue);
 }
 
-void p256_point_mul(P256_POINT *r, const Limb p_scalar[P256_LIMBS],
+void p256_point_mul(Limb r[3][P256_LIMBS], const Limb p_scalar[P256_LIMBS],
                         const Limb p_x[P256_LIMBS],
                         const Limb p_y[P256_LIMBS]) {
   alignas(32) P256_POINT out;
   ecp_nistz256_windowed_mul(&out, p_scalar, p_x, p_y);
 
-  limbs_copy(r->X, out.X, P256_LIMBS);
-  limbs_copy(r->Y, out.Y, P256_LIMBS);
-  limbs_copy(r->Z, out.Z, P256_LIMBS);
+  limbs_copy(r[0], out.X, P256_LIMBS);
+  limbs_copy(r[1], out.Y, P256_LIMBS);
+  limbs_copy(r[2], out.Z, P256_LIMBS);
 }
 
-void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
+void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
   P256_SCALAR_BYTES p_str;
   p256_scalar_bytes_from_limbs(p_str, scalar);
 
@@ -279,9 +279,9 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
     ecp_nistz256_point_add_affine(&p, &p, &t);
   }
 
-  limbs_copy(r->X, p.X, P256_LIMBS);
-  limbs_copy(r->Y, p.Y, P256_LIMBS);
-  limbs_copy(r->Z, p.Z, P256_LIMBS);
+  limbs_copy(r[0], p.X, P256_LIMBS);
+  limbs_copy(r[1], p.Y, P256_LIMBS);
+  limbs_copy(r[2], p.Z, P256_LIMBS);
 }
 
 void p256_point_mul_base_vartime(P256_POINT *r,
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index 8d6152486e..7b3bcfb612 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -96,6 +96,21 @@ static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
   fiat_p256_selectznz(out, !!t, z, nz);
 }
 
+static void fiat_p256_from_words(fiat_p256_felem out,
+                                 const Limb in[32 / sizeof(BN_ULONG)]) {
+  // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on
+  // 64-bit platforms without |uint128_t|, they are different. However, on
+  // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same
+  // layout.
+  OPENSSL_memcpy(out, in, 32);
+}
+
+static void fiat_p256_to_words(Limb out[32 / sizeof(BN_ULONG)], const fiat_p256_felem in) {
+  // See |fiat_p256_from_words|.
+  OPENSSL_memcpy(out, in, 32);
+}
+
+
 // Group operations
 // ----------------
 //
@@ -339,8 +354,8 @@ static crypto_word_t fiat_p256_get_bit(const Limb in[P256_LIMBS], int i) {
 #endif
 }
 
-void p256_point_mul(P256_POINT *r, const Limb scalar[P256_LIMBS],
-                        const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) {
+void p256_point_mul(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS],
+                    const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) {
   debug_assert_nonsecret(r != NULL);
   debug_assert_nonsecret(scalar != NULL);
   debug_assert_nonsecret(p_x != NULL);
@@ -349,9 +364,9 @@ void p256_point_mul(P256_POINT *r, const Limb scalar[P256_LIMBS],
   fiat_p256_felem p_pre_comp[17][3];
   OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
   // Precompute multiples.
-  limbs_copy(&p_pre_comp[1][0][0], p_x, P256_LIMBS);
-  limbs_copy(&p_pre_comp[1][1][0], p_y, P256_LIMBS);
-  limbs_copy(&p_pre_comp[1][2][0], fiat_p256_one, P256_LIMBS);
+  fiat_p256_from_words(p_pre_comp[1][0], p_x);
+  fiat_p256_from_words(p_pre_comp[1][1], p_y);
+  fiat_p256_copy(p_pre_comp[1][2], fiat_p256_one);
 
   for (size_t j = 2; j <= 16; ++j) {
     if (j & 1) {
@@ -407,12 +422,12 @@ void p256_point_mul(P256_POINT *r, const Limb scalar[P256_LIMBS],
     }
   }
 
-  limbs_copy(r->X, nq[0], P256_LIMBS);
-  limbs_copy(r->Y, nq[1], P256_LIMBS);
-  limbs_copy(r->Z, nq[2], P256_LIMBS);
+  fiat_p256_to_words(r[0], nq[0]);
+  fiat_p256_to_words(r[1], nq[1]);
+  fiat_p256_to_words(r[2], nq[2]);
 }
 
-void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
+void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
   // Set nq to the point at infinity.
   fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3];
 
@@ -453,45 +468,72 @@ void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) {
                         tmp[0], tmp[1], tmp[2]);
   }
 
-  limbs_copy(r->X, nq[0], P256_LIMBS);
-  limbs_copy(r->Y, nq[1], P256_LIMBS);
-  limbs_copy(r->Z, nq[2], P256_LIMBS);
+  fiat_p256_to_words(r[0], nq[0]);
+  fiat_p256_to_words(r[1], nq[1]);
+  fiat_p256_to_words(r[2], nq[2]);
 }
 
 void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
-                       const Limb b[P256_LIMBS]) {
-  fiat_p256_mul(r, a, b);
+                   const Limb b[P256_LIMBS]) {
+  fiat_p256_felem a_, b_;
+  fiat_p256_from_words(a_, a);
+  fiat_p256_from_words(b_, b);
+  fiat_p256_mul(a_, a_, b_);
+  fiat_p256_to_words(r, a_);
 }
 
 void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
-  fiat_p256_square(r, a);
+  fiat_p256_felem x;
+  fiat_p256_from_words(x, a);
+  fiat_p256_square(x, x);
+  fiat_p256_to_words(r, x);
 }
 
-void p256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b) {
-  fiat_p256_point_add(r->X, r->Y, r->Z,
-                      a->X, a->Y, a->Z,
-                      0,
-                      b->X, b->Y, b->Z);
+void p256_point_add(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS],
+                    const Limb b[3][P256_LIMBS]) {
+  fiat_p256_felem x1, y1, z1, x2, y2, z2;
+  fiat_p256_from_words(x1, a[0]);
+  fiat_p256_from_words(y1, a[1]);
+  fiat_p256_from_words(z1, a[2]);
+  fiat_p256_from_words(x2, b[0]);
+  fiat_p256_from_words(y2, b[1]);
+  fiat_p256_from_words(z2, b[2]);
+  fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2,
+                      z2);
+  fiat_p256_to_words(r[0], x1);
+  fiat_p256_to_words(r[1], y1);
+  fiat_p256_to_words(r[2], z1);
 }
 
-void p256_point_double(P256_POINT *r, const P256_POINT *a) {
-  fiat_p256_point_double(r->X, r->Y, r->Z,
-                         a->X, a->Y, a->Z);
+void p256_point_double(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS]) {
+  fiat_p256_felem x, y, z;
+  fiat_p256_from_words(x, a[0]);
+  fiat_p256_from_words(y, a[1]);
+  fiat_p256_from_words(z, a[2]);
+  fiat_p256_point_double(x, y, z, x, y, z);
+  fiat_p256_to_words(r[0], x);
+  fiat_p256_to_words(r[1], y);
+  fiat_p256_to_words(r[2], z);
 }
 
 // For testing only.
-void p256_point_add_affine(P256_POINT *r, const P256_POINT *a,
-                               const BN_ULONG b[P256_LIMBS * 2]) {
-  const Limb *b_x = &b[0];
-  const Limb *b_y = &b[P256_LIMBS];
-  fiat_p256_felem b_z = {0};
-  crypto_word_t b_is_inf = constant_time_select_w(
-      LIMBS_are_zero(b_x, P256_LIMBS), LIMBS_are_zero(b_y, P256_LIMBS), 0);
-  fiat_p256_cmovznz(b_z, constant_time_is_zero_w(b_is_inf), b_z, fiat_p256_one);
-  fiat_p256_point_add(r->X, r->Y, r->Z,
-                      a->X, a->Y, a->Z,
-                      1,
-                      b_x, b_y, b_z);
+void p256_point_add_affine(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS],
+                           const Limb b[2][P256_LIMBS]) {
+  fiat_p256_felem x1, y1, z1, x2, y2;
+  fiat_p256_from_words(x1, a[0]);
+  fiat_p256_from_words(y1, a[1]);
+  fiat_p256_from_words(z1, a[2]);
+  fiat_p256_from_words(x2, b[0]);
+  fiat_p256_from_words(y2, b[1]);
+
+  fiat_p256_felem z2 = {0};
+  fiat_p256_cmovznz(z2, fiat_p256_nz(x2) & fiat_p256_nz(y2), z2, fiat_p256_one);
+
+  fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 1 /* mixed */, x2, y2, z2);
+
+  fiat_p256_to_words(r[0], x1);
+  fiat_p256_to_words(r[1], y1);
+  fiat_p256_to_words(r[2], z1);
 }
 
 #endif

From 63aacbe4b8da081812296cee49add70b3ab7491f Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 13:40:54 -0700
Subject: [PATCH 6/7] Import BoringSSL's ec_compute_wNAF.

---
 crypto/fipsmodule/bn/internal.h |   3 +
 crypto/fipsmodule/bn/shift.c    |  67 +++++++++++++++
 crypto/fipsmodule/ec/internal.h |  84 ++++++++++++++++++
 crypto/fipsmodule/ec/p256.c     |  96 +++++++++++++++++++++
 crypto/fipsmodule/ec/wnaf.c     | 148 ++++++++++++++++++++++++++++++++
 5 files changed, 398 insertions(+)
 create mode 100644 crypto/fipsmodule/bn/shift.c
 create mode 100644 crypto/fipsmodule/ec/internal.h
 create mode 100644 crypto/fipsmodule/ec/wnaf.c

diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 3fbb7d7521..20173f080a 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -165,6 +165,9 @@ typedef crypto_word_t BN_ULONG;
 #error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
 #endif
 
+// bn_is_bit_set_words returns one if bit |bit| is set in |a| and zero
+// otherwise.
+int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit);
 
 // |num| must be at least 4, at least on x86.
 //
diff --git a/crypto/fipsmodule/bn/shift.c b/crypto/fipsmodule/bn/shift.c
new file mode 100644
index 0000000000..76bf6219a2
--- /dev/null
+++ b/crypto/fipsmodule/bn/shift.c
@@ -0,0 +1,67 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include "internal.h"
+
+
+int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit) {
+  size_t i = bit / BN_BITS2;
+  size_t j = bit % BN_BITS2;
+  if (i >= num) {
+    return 0;
+  }
+  return (a[i] >> j) & 1;
+}
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
new file mode 100644
index 0000000000..cf7c807256
--- /dev/null
+++ b/crypto/fipsmodule/ec/internal.h
@@ -0,0 +1,84 @@
+/* Originally written by Bodo Moeller for the OpenSSL project.
+ * ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the OpenSSL open source
+ * license provided above.
+ *
+ * The elliptic curve binary polynomial software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#ifndef OPENSSL_HEADER_EC_INTERNAL_H
+#define OPENSSL_HEADER_EC_INTERNAL_H
+
+#include <openssl/base.h>
+
+// ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
+// |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
+// which will be either zero or odd with an absolute value less than  2^w
+// satisfying
+//     scalar = \sum_j out[j]*2^j
+// where at most one of any  w+1  consecutive digits is non-zero
+// with the exception that the most significant digit may be only
+// w-1 zeros away from that next non-zero digit.
+void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
+                     const EC_SCALAR *scalar, size_t bits, int w);
+
+#endif  // OPENSSL_HEADER_EC_INTERNAL_H
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index 7b3bcfb612..dc67a71f07 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -473,6 +473,102 @@ void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
   fiat_p256_to_words(r[2], nq[2]);
 }
 
+#if 0
+
+static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
+                                             EC_JACOBIAN *r,
+                                             const EC_SCALAR *g_scalar,
+                                             const EC_JACOBIAN *p,
+                                             const EC_SCALAR *p_scalar) {
+#define P256_WSIZE_PUBLIC 4
+  // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
+  fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
+  fiat_p256_from_generic(p_pre_comp[0][0], &p->X);
+  fiat_p256_from_generic(p_pre_comp[0][1], &p->Y);
+  fiat_p256_from_generic(p_pre_comp[0][2], &p->Z);
+  fiat_p256_felem p2[3];
+  fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0],
+                         p_pre_comp[0][1], p_pre_comp[0][2]);
+  for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) {
+    fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2],
+                        p_pre_comp[i - 1][0], p_pre_comp[i - 1][1],
+                        p_pre_comp[i - 1][2], 0 /* not mixed */, p2[0], p2[1],
+                        p2[2]);
+  }
+
+  // Set up the coefficients for |p_scalar|.
+  int8_t p_wNAF[257];
+  ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC);
+
+  // Set |ret| to the point at infinity.
+  int skip = 1;  // Save some point operations.
+  fiat_p256_felem ret[3] = {{0}, {0}, {0}};
+  for (int i = 256; i >= 0; i--) {
+    if (!skip) {
+      fiat_p256_point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]);
+    }
+
+    // For the |g_scalar|, we use the precomputed table without the
+    // constant-time lookup.
+    if (i <= 31) {
+      // First, look 32 bits upwards.
+      crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3;
+      bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2;
+      bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1;
+      bits |= fiat_p256_get_bit(g_scalar, i + 32);
+      if (bits != 0) {
+        size_t index = (size_t)(bits - 1);
+        fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
+                            1 /* mixed */, fiat_p256_g_pre_comp[1][index][0],
+                            fiat_p256_g_pre_comp[1][index][1],
+                            fiat_p256_one);
+        skip = 0;
+      }
+
+      // Second, look at the current position.
+      bits = fiat_p256_get_bit(g_scalar, i + 192) << 3;
+      bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2;
+      bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1;
+      bits |= fiat_p256_get_bit(g_scalar, i);
+      if (bits != 0) {
+        size_t index = (size_t)(bits - 1);
+        fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
+                            1 /* mixed */, fiat_p256_g_pre_comp[0][index][0],
+                            fiat_p256_g_pre_comp[0][index][1],
+                            fiat_p256_one);
+        skip = 0;
+      }
+    }
+
+    int digit = p_wNAF[i];
+    if (digit != 0) {
+      assert(digit & 1);
+      size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1);
+      fiat_p256_felem *y = &p_pre_comp[idx][1], tmp;
+      if (digit < 0) {
+        fiat_p256_opp(tmp, p_pre_comp[idx][1]);
+        y = &tmp;
+      }
+      if (!skip) {
+        fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2],
+                            0 /* not mixed */, p_pre_comp[idx][0], *y,
+                            p_pre_comp[idx][2]);
+      } else {
+        fiat_p256_copy(ret[0], p_pre_comp[idx][0]);
+        fiat_p256_copy(ret[1], *y);
+        fiat_p256_copy(ret[2], p_pre_comp[idx][2]);
+        skip = 0;
+      }
+    }
+  }
+
+  fiat_p256_to_generic(&r->X, ret[0]);
+  fiat_p256_to_generic(&r->Y, ret[1]);
+  fiat_p256_to_generic(&r->Z, ret[2]);
+}
+
+#endif
+
 void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
                    const Limb b[P256_LIMBS]) {
   fiat_p256_felem a_, b_;
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
new file mode 100644
index 0000000000..56de6cfec5
--- /dev/null
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -0,0 +1,148 @@
+/* Originally written by Bodo Moeller for the OpenSSL project.
+ * ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the OpenSSL open source
+ * license provided above.
+ *
+ * The elliptic curve binary polynomial software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#include <openssl/ec.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/bn.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+#include <openssl/thread.h>
+
+#include "internal.h"
+#include "../bn/internal.h"
+#include "../../internal.h"
+
+
+// This file implements the wNAF-based interleaving multi-exponentiation method
+// at:
+//   http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
+//   http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf
+
+void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
+                     const EC_SCALAR *scalar, size_t bits, int w) {
+  // 'int8_t' can represent integers with absolute values less than 2^7.
+  assert(0 < w && w <= 7);
+  assert(bits != 0);
+  int bit = 1 << w;         // 2^w, at most 128
+  int next_bit = bit << 1;  // 2^(w+1), at most 256
+  int mask = next_bit - 1;  // at most 255
+
+  int window_val = scalar->words[0] & mask;
+  for (size_t j = 0; j < bits + 1; j++) {
+    assert(0 <= window_val && window_val <= next_bit);
+    int digit = 0;
+    if (window_val & 1) {
+      assert(0 < window_val && window_val < next_bit);
+      if (window_val & bit) {
+        digit = window_val - next_bit;
+        // We know -next_bit < digit < 0 and window_val - digit = next_bit.
+
+        // modified wNAF
+        if (j + w + 1 >= bits) {
+          // special case for generating modified wNAFs:
+          // no new bits will be added into window_val,
+          // so using a positive digit here will decrease
+          // the total length of the representation
+
+          digit = window_val & (mask >> 1);
+          // We know 0 < digit < bit and window_val - digit = bit.
+        }
+      } else {
+        digit = window_val;
+        // We know 0 < digit < bit and window_val - digit = 0.
+      }
+
+      window_val -= digit;
+
+      // Now window_val is 0 or 2^(w+1) in standard wNAF generation.
+      // For modified window NAFs, it may also be 2^w.
+      //
+      // See the comments above for the derivation of each of these bounds.
+      assert(window_val == 0 || window_val == next_bit || window_val == bit);
+      assert(-bit < digit && digit < bit);
+
+      // window_val was odd, so digit is also odd.
+      assert(digit & 1);
+    }
+
+    out[j] = digit;
+
+    // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if
+    // we shift and add at most one copy of |bit|, this will continue to hold
+    // afterwards.
+    window_val >>= 1;
+    window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width,
+                                            j + w + 1);
+    assert(window_val <= next_bit);
+  }
+
+  // bits + 1 entries should be sufficient to consume all bits.
+  assert(window_val == 0);
+}

From 2de84993cbffd9d4eb4e281566fcfcca76c2bc80 Mon Sep 17 00:00:00 2001
From: Brian Smith <brian@briansmith.org>
Date: Tue, 17 Oct 2023 16:03:36 -0700
Subject: [PATCH 7/7] P-256 ECDSA verification: Use BoringSSL's W-NAF-based
 implementation.

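On targets where we don't use nistz256 (i.e. everything other than
aarch64 and x86_64), replace the inefficient generic `twin_mul`
fallback with the Fiat-based wNAF implementation,
`p256_point_mul_public`.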
---
 Cargo.toml                      |  3 +++
 build.rs                        |  5 ++++
 crypto/fipsmodule/ec/internal.h |  5 ++--
 crypto/fipsmodule/ec/p256.c     | 31 +++++++++++--------------
 crypto/fipsmodule/ec/wnaf.c     | 41 +++++++++++----------------------
 crypto/internal.h               |  2 ++
 src/ec/suite_b/ops/p256.rs      | 26 ++++++++++++++++++---
 7 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index d2dd40127b..670b59c426 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,12 +65,14 @@ include = [
     "crypto/fipsmodule/bn/internal.h",
     "crypto/fipsmodule/bn/montgomery.c",
     "crypto/fipsmodule/bn/montgomery_inv.c",
+    "crypto/fipsmodule/bn/shift.c",
     "crypto/fipsmodule/ec/asm/p256-armv8-asm.pl",
     "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
     "crypto/fipsmodule/ec/ecp_nistz.c",
     "crypto/fipsmodule/ec/ecp_nistz.h",
     "crypto/fipsmodule/ec/ecp_nistz384.h",
     "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/internal.h",
     "crypto/fipsmodule/ec/gfp_p256.c",
     "crypto/fipsmodule/ec/gfp_p384.c",
     "crypto/fipsmodule/ec/p256.c",
@@ -80,6 +82,7 @@ include = [
     "crypto/fipsmodule/ec/p256_shared.h",
     "crypto/fipsmodule/ec/p256_table.h",
     "crypto/fipsmodule/ec/util.h",
+    "crypto/fipsmodule/ec/wnaf.c",
     "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
     "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl",
     "crypto/fipsmodule/modes/asm/ghash-armv4.pl",
diff --git a/build.rs b/build.rs
index 3cbae9d926..6add88b4f7 100644
--- a/build.rs
+++ b/build.rs
@@ -38,10 +38,12 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[], "crypto/fipsmodule/aes/aes_nohw.c"),
     (&[], "crypto/fipsmodule/bn/montgomery.c"),
     (&[], "crypto/fipsmodule/bn/montgomery_inv.c"),
+    (&[], "crypto/fipsmodule/bn/shift.c"),
     (&[], "crypto/fipsmodule/ec/ecp_nistz.c"),
     (&[], "crypto/fipsmodule/ec/gfp_p256.c"),
     (&[], "crypto/fipsmodule/ec/gfp_p384.c"),
     (&[], "crypto/fipsmodule/ec/p256.c"),
+    (&[], "crypto/fipsmodule/ec/wnaf.c"),
     (&[], "crypto/limbs/limbs.c"),
     (&[], "crypto/mem.c"),
     (&[], "crypto/poly1305/poly1305.c"),
@@ -919,6 +921,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "aesni_gcm_decrypt",
         "aesni_gcm_encrypt",
         "bn_from_montgomery_in_place",
+        "bn_is_bit_set_words",
         "bn_gather5",
         "bn_mul_mont",
         "bn_mul_mont_gather5",
@@ -933,6 +936,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "bssl_constant_time_test_main",
         "chacha20_poly1305_open",
         "chacha20_poly1305_seal",
+        "ec_compute_wNAF",
         "fiat_curve25519_adx_mul",
         "fiat_curve25519_adx_square",
         "gcm_ghash_avx",
@@ -959,6 +963,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "p256_point_mul",
         "p256_point_mul_base",
         "p256_point_mul_base_vartime",
+        "p256_point_mul_public",
         "p256_scalar_mul_mont",
         "p256_scalar_sqr_rep_mont",
         "p256_sqr_mont",
diff --git a/crypto/fipsmodule/ec/internal.h b/crypto/fipsmodule/ec/internal.h
index cf7c807256..99c47bae39 100644
--- a/crypto/fipsmodule/ec/internal.h
+++ b/crypto/fipsmodule/ec/internal.h
@@ -68,7 +68,7 @@
 #ifndef OPENSSL_HEADER_EC_INTERNAL_H
 #define OPENSSL_HEADER_EC_INTERNAL_H
 
-#include <openssl/base.h>
+#include <ring-core/base.h>
 
 // ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
 // |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
@@ -78,7 +78,6 @@
 // where at most one of any  w+1  consecutive digits is non-zero
 // with the exception that the most significant digit may be only
 // w-1 zeros away from that next non-zero digit.
-void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
-                     const EC_SCALAR *scalar, size_t bits, int w);
+void ec_compute_wNAF(int8_t *out, const BN_ULONG *scalar, size_t scalar_limbs, size_t bits, int w);
 
 #endif  // OPENSSL_HEADER_EC_INTERNAL_H
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index dc67a71f07..8461a42067 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -23,6 +23,7 @@
 
 #include "p256_shared.h"
 
+#include "internal.h"
 #include "../../internal.h"
 #include "./util.h"
 
@@ -473,19 +474,17 @@ void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
   fiat_p256_to_words(r[2], nq[2]);
 }
 
-#if 0
-
-static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
-                                             EC_JACOBIAN *r,
-                                             const EC_SCALAR *g_scalar,
-                                             const EC_JACOBIAN *p,
-                                             const EC_SCALAR *p_scalar) {
+void p256_point_mul_public(Limb r[3][P256_LIMBS],
+                           const Limb g_scalar[P256_LIMBS],
+                           const Limb p_scalar[P256_LIMBS],
+                           const Limb p_x[P256_LIMBS],
+                           const Limb p_y[P256_LIMBS]) {
 #define P256_WSIZE_PUBLIC 4
   // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|.
   fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3];
-  fiat_p256_from_generic(p_pre_comp[0][0], &p->X);
-  fiat_p256_from_generic(p_pre_comp[0][1], &p->Y);
-  fiat_p256_from_generic(p_pre_comp[0][2], &p->Z);
+  fiat_p256_from_words(p_pre_comp[0][0], p_x);
+  fiat_p256_from_words(p_pre_comp[0][1], p_y);
+  fiat_p256_copy(p_pre_comp[0][2], fiat_p256_one);
   fiat_p256_felem p2[3];
   fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0],
                          p_pre_comp[0][1], p_pre_comp[0][2]);
@@ -498,7 +497,7 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
 
   // Set up the coefficients for |p_scalar|.
   int8_t p_wNAF[257];
-  ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC);
+  ec_compute_wNAF(p_wNAF, p_scalar, P256_LIMBS, 256, P256_WSIZE_PUBLIC);
 
   // Set |ret| to the point at infinity.
   int skip = 1;  // Save some point operations.
@@ -542,7 +541,7 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
 
     int digit = p_wNAF[i];
     if (digit != 0) {
-      assert(digit & 1);
+      debug_assert_nonsecret(digit & 1);
       size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1);
       fiat_p256_felem *y = &p_pre_comp[idx][1], tmp;
       if (digit < 0) {
@@ -562,13 +561,11 @@ static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group,
     }
   }
 
-  fiat_p256_to_generic(&r->X, ret[0]);
-  fiat_p256_to_generic(&r->Y, ret[1]);
-  fiat_p256_to_generic(&r->Z, ret[2]);
+  fiat_p256_to_words(r[0], ret[0]);
+  fiat_p256_to_words(r[1], ret[1]);
+  fiat_p256_to_words(r[2], ret[2]);
 }
 
-#endif
-
 void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
                    const Limb b[P256_LIMBS]) {
   fiat_p256_felem a_, b_;
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
index 56de6cfec5..cd8c1161f1 100644
--- a/crypto/fipsmodule/ec/wnaf.c
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -65,17 +65,6 @@
  * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
  * Laboratories. */
 
-#include <openssl/ec.h>
-
-#include <assert.h>
-#include <string.h>
-
-#include <openssl/bn.h>
-#include <openssl/err.h>
-#include <openssl/mem.h>
-#include <openssl/thread.h>
-
-#include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 
@@ -85,27 +74,26 @@
 //   http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
 //   http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf
 
-void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
-                     const EC_SCALAR *scalar, size_t bits, int w) {
+void ec_compute_wNAF(int8_t *out, const BN_ULONG scalar[], size_t scalar_limbs, size_t bits, int w) {
   // 'int8_t' can represent integers with absolute values less than 2^7.
-  assert(0 < w && w <= 7);
-  assert(bits != 0);
+  debug_assert_nonsecret(0 < w && w <= 7);
+  debug_assert_nonsecret(bits != 0);
   int bit = 1 << w;         // 2^w, at most 128
   int next_bit = bit << 1;  // 2^(w+1), at most 256
   int mask = next_bit - 1;  // at most 255
 
-  int window_val = scalar->words[0] & mask;
+  int window_val = ((int)scalar[0]) & mask;
   for (size_t j = 0; j < bits + 1; j++) {
-    assert(0 <= window_val && window_val <= next_bit);
+    debug_assert_nonsecret(0 <= window_val && window_val <= next_bit);
     int digit = 0;
     if (window_val & 1) {
-      assert(0 < window_val && window_val < next_bit);
+      debug_assert_nonsecret(0 < window_val && window_val < next_bit);
       if (window_val & bit) {
         digit = window_val - next_bit;
         // We know -next_bit < digit < 0 and window_val - digit = next_bit.
 
         // modified wNAF
-        if (j + w + 1 >= bits) {
+        if (j + ((size_t)w) + 1 >= bits) {
           // special case for generating modified wNAFs:
           // no new bits will be added into window_val,
           // so using a positive digit here will decrease
@@ -125,24 +113,23 @@ void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
       // For modified window NAFs, it may also be 2^w.
       //
       // See the comments above for the derivation of each of these bounds.
-      assert(window_val == 0 || window_val == next_bit || window_val == bit);
-      assert(-bit < digit && digit < bit);
+      debug_assert_nonsecret(window_val == 0 || window_val == next_bit || window_val == bit);
+      debug_assert_nonsecret(-bit < digit && digit < bit);
 
       // window_val was odd, so digit is also odd.
-      assert(digit & 1);
+      debug_assert_nonsecret(digit & 1);
     }
 
-    out[j] = digit;
+    out[j] = (int8_t)digit;
 
     // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if
     // we shift and add at most one copy of |bit|, this will continue to hold
     // afterwards.
     window_val >>= 1;
-    window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width,
-                                            j + w + 1);
-    assert(window_val <= next_bit);
+    window_val += bit * bn_is_bit_set_words(scalar, scalar_limbs, j + (size_t)w + 1);
+    debug_assert_nonsecret(window_val <= next_bit);
   }
 
   // bits + 1 entries should be sufficient to consume all bits.
-  assert(window_val == 0);
+  debug_assert_nonsecret(window_val == 0);
 }
diff --git a/crypto/internal.h b/crypto/internal.h
index 7beb1d44fd..062ca564c6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -150,6 +150,8 @@ typedef __int128_t int128_t;
 typedef __uint128_t uint128_t;
 #endif
 
+#define OPENSSL_ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
 // Pointer utility functions.
 
 // buffers_alias returns one if |a| and |b| alias and zero otherwise.
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index adbed60936..104c2e10fc 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -119,9 +119,7 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     twin_mul: twin_mul_nistz256,
 
     #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
-    twin_mul: |g_scalar, p_scalar, p_xy| {
-        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
-    },
+    twin_mul: twin_mul_fiat,
 
     q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),
 };
@@ -147,6 +145,28 @@ fn point_mul_base_vartime(g_scalar: &Scalar) -> Point {
     scaled_g
 }
 
+/// Variable-time `g_scalar * G + p_scalar * (p_x, p_y)` via C `p256_point_mul_public`.
+#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
+fn twin_mul_fiat(g_scalar: &Scalar, p_scalar: &Scalar, (p_x, p_y): &(Elem<R>, Elem<R>)) -> Point {
+    prefixed_extern! {
+        fn p256_point_mul_public(r: *mut Limb, g_scalar: *const Limb, p_scalar: *const Limb,
+                                 p_x: *const Limb, p_y: *const Limb);
+    }
+    let mut r = Point::new_at_infinity();
+    // SAFETY: the C side reads P256_LIMBS limbs per input and writes 3 * P256_LIMBS
+    // limbs to `r`; assumes `xyz`/`limbs` are at least that large -- TODO confirm.
+    unsafe {
+        p256_point_mul_public(
+            r.xyz.as_mut_ptr(),
+            g_scalar.limbs.as_ptr(),
+            p_scalar.limbs.as_ptr(),
+            p_x.limbs.as_ptr(),
+            p_y.limbs.as_ptr(),
+        );
+    }
+    r
+}
+
 pub static PRIVATE_SCALAR_OPS: PrivateScalarOps = PrivateScalarOps {
     scalar_ops: &SCALAR_OPS,