diff --git a/Cargo.toml b/Cargo.toml index 3378dc84bc..2e5524ea8c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,8 +69,7 @@ include = [ "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl", "crypto/fipsmodule/ec/ecp_nistz.c", "crypto/fipsmodule/ec/ecp_nistz.h", - "crypto/fipsmodule/ec/ecp_nistz384.h", - "crypto/fipsmodule/ec/ecp_nistz384.inl", + "crypto/fipsmodule/ec/ecp_nistz.inl", "crypto/fipsmodule/ec/gfp_p256.c", "crypto/fipsmodule/ec/gfp_p384.c", "crypto/fipsmodule/ec/p256.c", diff --git a/crypto/fipsmodule/ec/ecp_nistz.inl b/crypto/fipsmodule/ec/ecp_nistz.inl new file mode 100644 index 0000000000..a3063b6f12 --- /dev/null +++ b/crypto/fipsmodule/ec/ecp_nistz.inl @@ -0,0 +1,449 @@ +/* Copyright (c) 2014, Intel Corporation. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* Developers and authors: + * Shay Gueron (1, 2), and Vlad Krasnov (1) + * (1) Intel Corporation, Israel Development Center + * (2) University of Haifa + * Reference: + * Shay Gueron and Vlad Krasnov + * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" + * http://eprint.iacr.org/2013/816 */ +#include "ecp_nistz.h" + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#endif + +#define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func + +#define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add) +#define point_double(prefix, bits) RENAME_FUNC(prefix, bits, point_double) +#define point_mul(prefix, bits) RENAME_FUNC(prefix, bits, point_mul) + +typedef Limb Elem[FE_LIMBS]; +typedef Limb ScalarMont[FE_LIMBS]; +typedef Limb Scalar[FE_LIMBS]; + +typedef struct { + Limb X[FE_LIMBS]; + Limb Y[FE_LIMBS]; + Limb Z[FE_LIMBS]; +} NIST_POINT; + +typedef struct { + Limb X[FE_LIMBS]; + Limb Y[FE_LIMBS]; +} NIST_POINT_AFFINE; + +#define TBL_SZ (1 << (W_BITS - 1)) +#define W_MASK ((1 << (W_BITS + 1)) - 1) + +static inline Limb is_equal(const Elem a, const Elem b) { + return LIMBS_equal(a, b, FE_LIMBS); +} + +static inline Limb is_zero(const BN_ULONG a[FE_LIMBS]) { + return LIMBS_are_zero(a, FE_LIMBS); +} + +static inline void copy_conditional(Elem r, const Elem a, + const Limb condition) { + for (size_t i = 0; i < FE_LIMBS; ++i) { + r[i] = constant_time_select_w(condition, a[i], r[i]); + } +} + +static inline void elem_add(Elem r, const Elem a, const Elem b) { + LIMBS_add_mod(r, a, b, Q, FE_LIMBS); +} + +static inline void elem_sub(Elem r, const Elem a, const Elem b) { + LIMBS_sub_mod(r, a, b, Q, FE_LIMBS); +} + +static void elem_div_by_2(Elem r, const Elem a) { + /* Consider the case where `a` is even. Then we can shift `a` right one bit + * and the result will still be valid because we didn't lose any bits and so + * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. 
+ * + * The remainder of this comment is considering the case where `a` is odd. + * + * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` + * because the lowest bit is lost during the shift. For example, consider: + * + * ```python + * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 + * a = 2**383 + * two_a = a * 2 % q + * assert two_a == 0x100000000ffffffffffffffff00000001 + * ``` + * + * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When + * we divide `two_a` by two (mod q), we need to get the value `2**383`, which + * we obviously can't get with just a right shift. + * + * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate + * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to + * keep track of an extra most significant bit. We can avoid that by instead + * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least + * significant bit of `a`. `q + 1` is even, which means it can be shifted + * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest + * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know + * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of + * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant + * bit of `a`, which is 1. Thus: + * + * sum = ((q + 1) >> 1) + (a >> 1) + * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) + * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) + * <= (q + 1)/2 + (q - 3)/2 (simplifying) + * <= (q + 1 + q - 3)/2 (factoring out the common divisor) + * <= (2q - 2)/2 (simplifying) + * <= q - 1 (simplifying) + * + * Thus, no reduction of the sum mod `q` is necessary. */ + + Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); + + /* r = a >> 1. */ + Limb carry = a[FE_LIMBS - 1] & 1; + r[FE_LIMBS - 1] = a[FE_LIMBS - 1] >> 1; + for (size_t i = 1; i < FE_LIMBS; ++i) { + Limb new_carry = a[FE_LIMBS - i - 1]; + r[FE_LIMBS - i - 1] = + (a[FE_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); + carry = new_carry; + } + + Elem adjusted; + BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, FE_LIMBS); + dev_assert_secret(carry2 == 0); + (void)carry2; + copy_conditional(r, adjusted, is_odd); +} + +static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { + /* XXX: Not (clearly) constant-time; inefficient.*/ + bn_mul_mont(r, a, b, Q, Q_N0, FE_LIMBS); +} + +static inline void elem_mul_by_2(Elem r, const Elem a) { + LIMBS_shl_mod(r, a, Q, FE_LIMBS); +} + +static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { + /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ + Elem doubled; + elem_add(doubled, a, a); + elem_add(r, doubled, a); +} + +static inline void elem_sqr_mont(Elem r, const Elem a) { + /* XXX: Inefficient. TODO: Add a dedicated squaring routine. 
*/ + elem_mul_mont(r, a, a); +} + +static void elem_neg(Elem r, const Elem a) { + Limb is_zero = LIMBS_are_zero(a, FE_LIMBS); + Carry borrow = limbs_sub(r, Q, a, FE_LIMBS); + dev_assert_secret(borrow == 0); + (void)borrow; + for (size_t i = 0; i < FE_LIMBS; ++i) { + r[i] = constant_time_select_w(is_zero, 0, r[i]); + } +} + +/* Point double: r = 2*a */ +static void point_double(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a) { + BN_ULONG S[FE_LIMBS]; + BN_ULONG M[FE_LIMBS]; + BN_ULONG Zsqr[FE_LIMBS]; + BN_ULONG tmp0[FE_LIMBS]; + + const BN_ULONG *in_x = a->X; + const BN_ULONG *in_y = a->Y; + const BN_ULONG *in_z = a->Z; + + BN_ULONG *res_x = r->X; + BN_ULONG *res_y = r->Y; + BN_ULONG *res_z = r->Z; + + elem_mul_by_2(S, in_y); + + elem_sqr_mont(Zsqr, in_z); + + elem_sqr_mont(S, S); + + elem_mul_mont(res_z, in_z, in_y); + elem_mul_by_2(res_z, res_z); + + elem_add(M, in_x, Zsqr); + elem_sub(Zsqr, in_x, Zsqr); + + elem_sqr_mont(res_y, S); + elem_div_by_2(res_y, res_y); + + elem_mul_mont(M, M, Zsqr); + elem_mul_by_3(M, M); + + elem_mul_mont(S, S, in_x); + elem_mul_by_2(tmp0, S); + + elem_sqr_mont(res_x, M); + + elem_sub(res_x, res_x, tmp0); + elem_sub(S, S, res_x); + + elem_mul_mont(S, S, M); + elem_sub(res_y, S, res_y); +} + +/* Point addition: r = a+b */ +static void point_add(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a, + const NIST_POINT *b) { + BN_ULONG U2[FE_LIMBS], S2[FE_LIMBS]; + BN_ULONG U1[FE_LIMBS], S1[FE_LIMBS]; + BN_ULONG Z1sqr[FE_LIMBS]; + BN_ULONG Z2sqr[FE_LIMBS]; + BN_ULONG H[FE_LIMBS], R[FE_LIMBS]; + BN_ULONG Hsqr[FE_LIMBS]; + BN_ULONG Rsqr[FE_LIMBS]; + BN_ULONG Hcub[FE_LIMBS]; + + BN_ULONG res_x[FE_LIMBS]; + BN_ULONG res_y[FE_LIMBS]; + BN_ULONG res_z[FE_LIMBS]; + + const BN_ULONG *in1_x = a->X; + const BN_ULONG *in1_y = a->Y; + const BN_ULONG *in1_z = a->Z; + + const BN_ULONG *in2_x = b->X; + const BN_ULONG *in2_y = b->Y; + const BN_ULONG *in2_z = b->Z; + + BN_ULONG in1infty = is_zero(a->Z); + BN_ULONG in2infty = is_zero(b->Z); + + elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ + elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ + + elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ + elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ + + elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ + elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ + elem_sub(R, S2, S1); /* R = S2 - S1 */ + + elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ + elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ + elem_sub(H, U2, U1); /* H = U2 - U1 */ + + BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty; + if (is_exceptional) { + if (is_equal(S1, S2)) { + point_double(nistz, BITS)(r, a); + } else { + limbs_zero(r->X, FE_LIMBS); + limbs_zero(r->Y, FE_LIMBS); + limbs_zero(r->Z, FE_LIMBS); + } + return; + } + + elem_sqr_mont(Rsqr, R); /* R^2 */ + elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ + elem_sqr_mont(Hsqr, H); /* H^2 */ + elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ + elem_mul_mont(Hcub, Hsqr, H); /* H^3 */ + + elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ + elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ + + elem_sub(res_x, Rsqr, Hsqr); + elem_sub(res_x, res_x, Hcub); + + elem_sub(res_y, U2, res_x); + + elem_mul_mont(S2, S1, Hcub); + elem_mul_mont(res_y, R, res_y); + elem_sub(res_y, res_y, S2); + + copy_conditional(res_x, in2_x, in1infty); + copy_conditional(res_y, in2_y, in1infty); + copy_conditional(res_z, in2_z, in1infty); + + copy_conditional(res_x, in1_x, in2infty); + copy_conditional(res_y, in1_y, in2infty); + copy_conditional(res_z, in1_z, in2infty); + + limbs_copy(r->X, res_x, 
FE_LIMBS);
+  limbs_copy(r->Y, res_y, FE_LIMBS);
+  limbs_copy(r->Z, res_z, FE_LIMBS);
+}
+
+static void NIST_POINT_select_w(NIST_POINT *out,
+                                const NIST_POINT table[TBL_SZ], size_t index) {
+  Elem x; limbs_zero(x, FE_LIMBS);
+  Elem y; limbs_zero(y, FE_LIMBS);
+  Elem z; limbs_zero(z, FE_LIMBS);
+
+  // TODO: Rewrite in terms of |limbs_select|.
+  for (size_t i = 0; i < TBL_SZ; ++i) {
+    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
+    for (size_t j = 0; j < FE_LIMBS; ++j) {
+      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
+      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
+      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
+    }
+  }
+
+  limbs_copy(out->X, x, FE_LIMBS);
+  limbs_copy(out->Y, y, FE_LIMBS);
+  limbs_copy(out->Z, z, FE_LIMBS);
+}
+
+static void add_precomputed_w(NIST_POINT *r, crypto_word_t wvalue,
+                              const NIST_POINT table[TBL_SZ]) {
+  crypto_word_t recoded_is_negative;
+  crypto_word_t recoded;
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
+
+  alignas(64) NIST_POINT h;
+  NIST_POINT_select_w(&h, table, recoded);
+
+  alignas(64) BN_ULONG tmp[FE_LIMBS];
+  elem_neg(tmp, h.Y);
+  copy_conditional(h.Y, tmp, recoded_is_negative);
+
+  point_add(nistz, BITS)(r, r, &h);
+}
+
+/* r = p * p_scalar */
+static void point_mul(nistz, BITS)(NIST_POINT *r, const BN_ULONG p_scalar[FE_LIMBS],
+                                   const BN_ULONG p_x[FE_LIMBS],
+                                   const BN_ULONG p_y[FE_LIMBS]) {
+  uint8_t p_str[(FE_LIMBS * sizeof(Limb)) + 1];
+  little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
+                                  p_scalar, FE_LIMBS);
+
+  /* A |NIST_POINT| is 3 * FE_LIMBS limbs; for P-384 that is (3 * 48) = 144
+   * bytes, and the 64-byte alignment should add no more than 63 bytes of
+   * overhead. Thus, with W_BITS = 5 (TBL_SZ = 16), |table| should require
+   * ~2367 ((144 * 16) + 63) bytes of stack space. */
+  alignas(64) NIST_POINT table[TBL_SZ];
+
+  /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
+   * not stored. All other values are actually stored with an offset of -1 in
+   * table. */
+  NIST_POINT *row = table;
+
+  limbs_copy(row[0].X, p_x, FE_LIMBS);
+  limbs_copy(row[0].Y, p_y, FE_LIMBS);
+  limbs_copy(row[0].Z, ONE, FE_LIMBS);
+
+  point_double(nistz, BITS)(&row[1], &row[0]);
+
+  for (int i = 2; i < TBL_SZ; i += 2) {
+    point_add(nistz, BITS)(&row[i], &row[i - 1], &row[0]);
+    point_double(nistz, BITS)(&row[i + 1], &row[i / 2]);
+  }
+
+  static const size_t ROUND_SIZE = (BITS + W_BITS - 1) / W_BITS * W_BITS;
+  /* START_INDEX depends on ROUND_SIZE, which is not a constant expression in
+   * C, so it must not have static storage duration. */
+  const size_t START_INDEX = ROUND_SIZE == BITS + 1 ?
ROUND_SIZE - W_BITS: ROUND_SIZE; + size_t index = START_INDEX; + + BN_ULONG recoded_is_negative; + crypto_word_t recoded; + + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK; + + booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS); + dev_assert_secret(!recoded_is_negative); + + NIST_POINT_select_w(r, table, recoded); + + while (index >= W_BITS) { + if (index != START_INDEX) { + size_t off = (index - 1) / 8; + + wvalue = p_str[off] | p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK; + add_precomputed_w(r, wvalue, table); + } + + index -= W_BITS; + + for (int i = 0; i < W_BITS; i++) { + point_double(nistz, BITS)(r, r); + } + } + + /* Final window */ + wvalue = p_str[0]; + wvalue = (wvalue << 1) & W_MASK; + add_precomputed_w(r, wvalue, table); +} + + +void point_double(p, BITS)(Limb r[3][FE_LIMBS], const Limb a[3][FE_LIMBS]) +{ + NIST_POINT t; + limbs_copy(t.X, a[0], FE_LIMBS); + limbs_copy(t.Y, a[1], FE_LIMBS); + limbs_copy(t.Z, a[2], FE_LIMBS); + point_double(nistz, BITS)(&t, &t); + limbs_copy(r[0], t.X, FE_LIMBS); + limbs_copy(r[1], t.Y, FE_LIMBS); + limbs_copy(r[2], t.Z, FE_LIMBS); +} + +void point_add(p, BITS)(Limb r[3][FE_LIMBS], + const Limb a[3][FE_LIMBS], + const Limb b[3][FE_LIMBS]) +{ + NIST_POINT t1; + limbs_copy(t1.X, a[0], FE_LIMBS); + limbs_copy(t1.Y, a[1], FE_LIMBS); + limbs_copy(t1.Z, a[2], FE_LIMBS); + + NIST_POINT t2; + limbs_copy(t2.X, b[0], FE_LIMBS); + limbs_copy(t2.Y, b[1], FE_LIMBS); + limbs_copy(t2.Z, b[2], FE_LIMBS); + + point_add(nistz, BITS)(&t1, &t1, &t2); + + limbs_copy(r[0], t1.X, FE_LIMBS); + limbs_copy(r[1], t1.Y, FE_LIMBS); + limbs_copy(r[2], t1.Z, FE_LIMBS); +} + +void point_mul(p, BITS)(Limb r[3][FE_LIMBS], + const BN_ULONG p_scalar[FE_LIMBS], + const Limb p_x[FE_LIMBS], + const Limb p_y[FE_LIMBS]) +{ + alignas(64) NIST_POINT acc; + point_mul(nistz, BITS)(&acc, p_scalar, p_x, p_y); + limbs_copy(r[0], acc.X, FE_LIMBS); + limbs_copy(r[1], acc.Y, FE_LIMBS); + limbs_copy(r[2], acc.Z, FE_LIMBS); +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif diff --git a/crypto/fipsmodule/ec/ecp_nistz384.h b/crypto/fipsmodule/ec/ecp_nistz384.h deleted file mode 100644 index ca87e60721..0000000000 --- a/crypto/fipsmodule/ec/ecp_nistz384.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2014, Intel Corporation. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H -#define OPENSSL_HEADER_EC_ECP_NISTZ384_H - -#include "../../limbs/limbs.h" - -#define P384_LIMBS (384u / LIMB_BITS) - -typedef struct { - Limb X[P384_LIMBS]; - Limb Y[P384_LIMBS]; - Limb Z[P384_LIMBS]; -} P384_POINT; - -typedef struct { - Limb X[P384_LIMBS]; - Limb Y[P384_LIMBS]; -} P384_POINT_AFFINE; - - -#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz384.inl deleted file mode 100644 index ae28f97ae5..0000000000 --- a/crypto/fipsmodule/ec/ecp_nistz384.inl +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright (c) 2014, Intel Corporation. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -/* Developers and authors: - * Shay Gueron (1, 2), and Vlad Krasnov (1) - * (1) Intel Corporation, Israel Development Center - * (2) University of Haifa - * Reference: - * Shay Gueron and Vlad Krasnov - * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes" - * http://eprint.iacr.org/2013/816 */ - -#include "ecp_nistz.h" - -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-conversion" -#endif - -/* Point double: r = 2*a */ -static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) { - BN_ULONG S[P384_LIMBS]; - BN_ULONG M[P384_LIMBS]; - BN_ULONG Zsqr[P384_LIMBS]; - BN_ULONG tmp0[P384_LIMBS]; - - const BN_ULONG *in_x = a->X; - const BN_ULONG *in_y = a->Y; - const BN_ULONG *in_z = a->Z; - - BN_ULONG *res_x = r->X; - BN_ULONG *res_y = r->Y; - BN_ULONG *res_z = r->Z; - - elem_mul_by_2(S, in_y); - - elem_sqr_mont(Zsqr, in_z); - - elem_sqr_mont(S, S); - - elem_mul_mont(res_z, in_z, in_y); - elem_mul_by_2(res_z, res_z); - - elem_add(M, in_x, Zsqr); - elem_sub(Zsqr, in_x, Zsqr); - - elem_sqr_mont(res_y, S); - elem_div_by_2(res_y, res_y); - - elem_mul_mont(M, M, Zsqr); - elem_mul_by_3(M, M); - - elem_mul_mont(S, S, in_x); - elem_mul_by_2(tmp0, S); - - elem_sqr_mont(res_x, M); - - elem_sub(res_x, res_x, tmp0); - elem_sub(S, S, res_x); - - elem_mul_mont(S, S, M); - elem_sub(res_y, S, res_y); -} - -/* Point addition: r = a+b */ -static void nistz384_point_add(P384_POINT *r, const P384_POINT *a, - const P384_POINT *b) { - BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS]; - BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS]; - BN_ULONG Z1sqr[P384_LIMBS]; - BN_ULONG Z2sqr[P384_LIMBS]; - BN_ULONG H[P384_LIMBS], R[P384_LIMBS]; - BN_ULONG Hsqr[P384_LIMBS]; - BN_ULONG Rsqr[P384_LIMBS]; - BN_ULONG Hcub[P384_LIMBS]; - - BN_ULONG res_x[P384_LIMBS]; - BN_ULONG res_y[P384_LIMBS]; - BN_ULONG res_z[P384_LIMBS]; - - const BN_ULONG *in1_x = a->X; - const BN_ULONG *in1_y = a->Y; - const BN_ULONG *in1_z = a->Z; - - const BN_ULONG *in2_x = b->X; - const BN_ULONG *in2_y = b->Y; - const BN_ULONG *in2_z = b->Z; - - BN_ULONG in1infty = is_zero(a->Z); - BN_ULONG in2infty = 
is_zero(b->Z); - - elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */ - elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */ - - elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */ - elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */ - - elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */ - elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */ - elem_sub(R, S2, S1); /* R = S2 - S1 */ - - elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */ - elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */ - elem_sub(H, U2, U1); /* H = U2 - U1 */ - - BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty; - if (is_exceptional) { - if (is_equal(S1, S2)) { - nistz384_point_double(r, a); - } else { - limbs_zero(r->X, P384_LIMBS); - limbs_zero(r->Y, P384_LIMBS); - limbs_zero(r->Z, P384_LIMBS); - } - return; - } - - elem_sqr_mont(Rsqr, R); /* R^2 */ - elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */ - elem_sqr_mont(Hsqr, H); /* H^2 */ - elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */ - elem_mul_mont(Hcub, Hsqr, H); /* H^3 */ - - elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */ - elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */ - - elem_sub(res_x, Rsqr, Hsqr); - elem_sub(res_x, res_x, Hcub); - - elem_sub(res_y, U2, res_x); - - elem_mul_mont(S2, S1, Hcub); - elem_mul_mont(res_y, R, res_y); - elem_sub(res_y, res_y, S2); - - copy_conditional(res_x, in2_x, in1infty); - copy_conditional(res_y, in2_y, in1infty); - copy_conditional(res_z, in2_z, in1infty); - - copy_conditional(res_x, in1_x, in2infty); - copy_conditional(res_y, in1_y, in2infty); - copy_conditional(res_z, in1_z, in2infty); - - limbs_copy(r->X, res_x, P384_LIMBS); - limbs_copy(r->Y, res_y, P384_LIMBS); - limbs_copy(r->Z, res_z, P384_LIMBS); -} - -static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue, - const P384_POINT table[16]) { - crypto_word_t recoded_is_negative; - crypto_word_t recoded; - booth_recode(&recoded_is_negative, &recoded, wvalue, 5); - - alignas(64) P384_POINT h; - p384_point_select_w5(&h, table, recoded); - - alignas(64) BN_ULONG tmp[P384_LIMBS]; - p384_elem_neg(tmp, h.Y); - copy_conditional(h.Y, tmp, recoded_is_negative); - - nistz384_point_add(r, r, &h); -} - -/* r = p * p_scalar */ -static void nistz384_point_mul(P384_POINT *r, - const BN_ULONG p_scalar[P384_LIMBS], - const Limb p_x[P384_LIMBS], - const Limb p_y[P384_LIMBS]) { - static const size_t kWindowSize = 5; - static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; - - uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1]; - little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]), - p_scalar, P384_LIMBS); - - /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should - * add no more than 63 bytes of overhead. Thus, |table| should require - * ~2367 ((144 * 16) + 63) bytes of stack space. */ - alignas(64) P384_POINT table[16]; - - /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is - * not stored. All other values are actually stored with an offset of -1 in - * table. 
*/ - P384_POINT *row = table; - - limbs_copy(row[1 - 1].X, p_x, P384_LIMBS); - limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS); - limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS); - - nistz384_point_double(&row[2 - 1], &row[1 - 1]); - nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); - nistz384_point_double(&row[4 - 1], &row[2 - 1]); - nistz384_point_double(&row[6 - 1], &row[3 - 1]); - nistz384_point_double(&row[8 - 1], &row[4 - 1]); - nistz384_point_double(&row[12 - 1], &row[6 - 1]); - nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); - nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); - nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); - nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); - nistz384_point_double(&row[14 - 1], &row[7 - 1]); - nistz384_point_double(&row[10 - 1], &row[5 - 1]); - nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); - nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); - nistz384_point_double(&row[16 - 1], &row[8 - 1]); - - static const size_t START_INDEX = 384 - 4; - size_t index = START_INDEX; - - BN_ULONG recoded_is_negative; - crypto_word_t recoded; - - crypto_word_t wvalue = p_str[(index - 1) / 8]; - wvalue = (wvalue >> ((index - 1) % 8)) & kMask; - - booth_recode(&recoded_is_negative, &recoded, wvalue, 5); - dev_assert_secret(!recoded_is_negative); - - p384_point_select_w5(r, table, recoded); - - while (index >= kWindowSize) { - if (index != START_INDEX) { - size_t off = (index - 1) / 8; - - wvalue = p_str[off] | p_str[off + 1] << 8; - wvalue = (wvalue >> ((index - 1) % 8)) & kMask; - add_precomputed_w5(r, wvalue, table); - } - - index -= kWindowSize; - - nistz384_point_double(r, r); - nistz384_point_double(r, r); - nistz384_point_double(r, r); - nistz384_point_double(r, r); - nistz384_point_double(r, r); - } - - /* Final window */ - wvalue = p_str[0]; - wvalue = (wvalue << 1) & kMask; - add_precomputed_w5(r, wvalue, table); -} - -void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS]) -{ - P384_POINT t; - limbs_copy(t.X, a[0], P384_LIMBS); - limbs_copy(t.Y, a[1], P384_LIMBS); - limbs_copy(t.Z, a[2], P384_LIMBS); - nistz384_point_double(&t, &t); - limbs_copy(r[0], t.X, P384_LIMBS); - limbs_copy(r[1], t.Y, P384_LIMBS); - limbs_copy(r[2], t.Z, P384_LIMBS); -} - -void p384_point_add(Limb r[3][P384_LIMBS], - const Limb a[3][P384_LIMBS], - const Limb b[3][P384_LIMBS]) -{ - P384_POINT t1; - limbs_copy(t1.X, a[0], P384_LIMBS); - limbs_copy(t1.Y, a[1], P384_LIMBS); - limbs_copy(t1.Z, a[2], P384_LIMBS); - - P384_POINT t2; - limbs_copy(t2.X, b[0], P384_LIMBS); - limbs_copy(t2.Y, b[1], P384_LIMBS); - limbs_copy(t2.Z, b[2], P384_LIMBS); - - nistz384_point_add(&t1, &t1, &t2); - - limbs_copy(r[0], t1.X, P384_LIMBS); - limbs_copy(r[1], t1.Y, P384_LIMBS); - limbs_copy(r[2], t1.Z, P384_LIMBS); -} - -void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS], - const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { - alignas(64) P384_POINT acc; - nistz384_point_mul(&acc, p_scalar, p_x, p_y); - limbs_copy(r[0], acc.X, P384_LIMBS); - limbs_copy(r[1], acc.Y, P384_LIMBS); - limbs_copy(r[2], acc.Z, P384_LIMBS); -} - -#if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic pop -#endif diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c index 18382856d7..41aac8774d 100644 --- a/crypto/fipsmodule/ec/gfp_p384.c +++ b/crypto/fipsmodule/ec/gfp_p384.c @@ -14,21 +14,12 @@ #include "../../limbs/limbs.h" -#include "ecp_nistz384.h" #include "../bn/internal.h" 
#include "../../internal.h" #include "../../limbs/limbs.inl" - /* XXX: Here we assume that the conversion from |Carry| to |Limb| is - * constant-time, but we haven't verified that assumption. TODO: Fix it so - * we don't need to make that assumption. */ - - -typedef Limb Elem[P384_LIMBS]; -typedef Limb ScalarMont[P384_LIMBS]; -typedef Limb Scalar[P384_LIMBS]; - +#define P384_LIMBS (384u / LIMB_BITS) static const BN_ULONG Q[P384_LIMBS] = { TOBN(0x00000000, 0xffffffff), @@ -48,12 +39,23 @@ static const BN_ULONG N[P384_LIMBS] = { TOBN(0xffffffff, 0xffffffff), }; - static const BN_ULONG ONE[P384_LIMBS] = { TOBN(0xffffffff, 1), TOBN(0, 0xffffffff), TOBN(0, 1), TOBN(0, 0), TOBN(0, 0), TOBN(0, 0), }; +static const BN_ULONG Q_PLUS_1_SHR_1[P384_LIMBS] = { + TOBN(0x00000000, 0x80000000), + TOBN(0x7fffffff, 0x80000000), + TOBN(0xffffffff, 0xffffffff), + TOBN(0xffffffff, 0xffffffff), + TOBN(0xffffffff, 0xffffffff), + TOBN(0x7fffffff, 0xffffffff), +}; + +static const BN_ULONG Q_N0[] = { + BN_MONT_CTX_N0(0x1, 0x1) +}; /* XXX: MSVC for x86 warns when it fails to inline these functions it should * probably inline. */ @@ -63,120 +65,12 @@ static const BN_ULONG ONE[P384_LIMBS] = { #define INLINE_IF_POSSIBLE inline #endif -static inline Limb is_equal(const Elem a, const Elem b) { - return LIMBS_equal(a, b, P384_LIMBS); -} +#define BITS 384 +/* Window values that are Ok for P384 (look at `ecp_nistz.h`): 2, 5, 6, 7 */ +#define W_BITS 5 +#define FE_LIMBS P384_LIMBS -static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) { - return LIMBS_are_zero(a, P384_LIMBS); -} - -static inline void copy_conditional(Elem r, const Elem a, - const Limb condition) { - for (size_t i = 0; i < P384_LIMBS; ++i) { - r[i] = constant_time_select_w(condition, a[i], r[i]); - } -} - - -static inline void elem_add(Elem r, const Elem a, const Elem b) { - LIMBS_add_mod(r, a, b, Q, P384_LIMBS); -} - -static inline void elem_sub(Elem r, const Elem a, const Elem b) { - LIMBS_sub_mod(r, a, b, Q, P384_LIMBS); -} - -static void elem_div_by_2(Elem r, const Elem a) { - /* Consider the case where `a` is even. Then we can shift `a` right one bit - * and the result will still be valid because we didn't lose any bits and so - * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. - * - * The remainder of this comment is considering the case where `a` is odd. - * - * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` - * because the lowest bit is lost during the shift. For example, consider: - * - * ```python - * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 - * a = 2**383 - * two_a = a * 2 % q - * assert two_a == 0x100000000ffffffffffffffff00000001 - * ``` - * - * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When - * we divide `two_a` by two (mod q), we need to get the value `2**383`, which - * we obviously can't get with just a right shift. - * - * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate - * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to - * keep track of an extra most significant bit. We can avoid that by instead - * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least - * significant bit of `a`. `q + 1` is even, which means it can be shifted - * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest - * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know - * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. 
The value of - * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant - * bit of `a`, which is 1. Thus: - * - * sum = ((q + 1) >> 1) + (a >> 1) - * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) - * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) - * <= (q + 1)/2 + (q - 3)/2 (simplifying) - * <= (q + 1 + q - 3)/2 (factoring out the common divisor) - * <= (2q - 2)/2 (simplifying) - * <= q - 1 (simplifying) - * - * Thus, no reduction of the sum mod `q` is necessary. */ - - Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); - - /* r = a >> 1. */ - Limb carry = a[P384_LIMBS - 1] & 1; - r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1; - for (size_t i = 1; i < P384_LIMBS; ++i) { - Limb new_carry = a[P384_LIMBS - i - 1]; - r[P384_LIMBS - i - 1] = - (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); - carry = new_carry; - } - - static const Elem Q_PLUS_1_SHR_1 = { - TOBN(0x00000000, 0x80000000), TOBN(0x7fffffff, 0x80000000), - TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0xffffffff), - TOBN(0xffffffff, 0xffffffff), TOBN(0x7fffffff, 0xffffffff), - }; - - Elem adjusted; - BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS); - dev_assert_secret(carry2 == 0); - (void)carry2; - copy_conditional(r, adjusted, is_odd); -} - -static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { - static const BN_ULONG Q_N0[] = { - BN_MONT_CTX_N0(0x1, 0x1) - }; - /* XXX: Not (clearly) constant-time; inefficient.*/ - bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS); -} - -static inline void elem_mul_by_2(Elem r, const Elem a) { - LIMBS_shl_mod(r, a, Q, P384_LIMBS); -} - -static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { - /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ - Elem doubled; - elem_add(doubled, a, a); - elem_add(r, doubled, a); -} - -static inline void elem_sqr_mont(Elem r, const Elem a) { - /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ - elem_mul_mont(r, a, a); -} +#include "ecp_nistz.inl" void p384_elem_sub(Elem r, const Elem a, const Elem b) { elem_sub(r, a, b); @@ -191,48 +85,14 @@ void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) { } void p384_elem_neg(Elem r, const Elem a) { - Limb is_zero = LIMBS_are_zero(a, P384_LIMBS); - Carry borrow = limbs_sub(r, Q, a, P384_LIMBS); - dev_assert_secret(borrow == 0); - (void)borrow; - for (size_t i = 0; i < P384_LIMBS; ++i) { - r[i] = constant_time_select_w(is_zero, 0, r[i]); - } + elem_neg(r, a); } - void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a, const ScalarMont b) { static const BN_ULONG N_N0[] = { BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45) }; /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */ - bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS); -} - - -/* TODO(perf): Optimize this. */ - -static void p384_point_select_w5(P384_POINT *out, - const P384_POINT table[16], size_t index) { - Elem x; limbs_zero(x, P384_LIMBS); - Elem y; limbs_zero(y, P384_LIMBS); - Elem z; limbs_zero(z, P384_LIMBS); - - // TODO: Rewrite in terms of |limbs_select|. 
- for (size_t i = 0; i < 16; ++i) { - crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1); - for (size_t j = 0; j < P384_LIMBS; ++j) { - x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); - y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); - z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]); - } - } - - limbs_copy(out->X, x, P384_LIMBS); - limbs_copy(out->Y, y, P384_LIMBS); - limbs_copy(out->Z, z, P384_LIMBS); + bn_mul_mont(r, a, b, N, N_N0, FE_LIMBS); } - - -#include "ecp_nistz384.inl" diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs index 54ec00aa8f..c87753c143 100644 --- a/src/ec/suite_b/ops/p384.rs +++ b/src/ec/suite_b/ops/p384.rs @@ -291,6 +291,7 @@ prefixed_extern! { a: *const Limb, // [3][COMMON_OPS.num_limbs] b: *const Limb, // [3][COMMON_OPS.num_limbs] ); + fn p384_point_mul( r: *mut Limb, // [3][COMMON_OPS.num_limbs] p_scalar: *const Limb, // [COMMON_OPS.num_limbs]