From fcf5cd651eb548d0f45677cb9cf9e399c10de08d Mon Sep 17 00:00:00 2001
From: Vlad Krasnov
Date: Thu, 31 Aug 2023 15:34:54 -0400
Subject: [PATCH] ec: make P384 code a little bit more generic

This change makes it easier to reuse the P384 code, which is already
quite generic. No algorithmic changes are made; only some code is
shuffled around. This prepares the ground for the P521 implementation.
---
 Cargo.toml                                  |   4 +-
 .../ec/{ecp_nistz384.inl => ecp_nistz.inl}  | 221 +++++++++---------
 crypto/fipsmodule/ec/ecp_nistz384.h         |  34 ---
 crypto/fipsmodule/ec/gfp.h                  | 193 +++++++++++++++
 crypto/fipsmodule/ec/gfp_p384.c             | 191 ++-------------
 mk/generate_curves.py                       |  39 +++-
 6 files changed, 352 insertions(+), 330 deletions(-)
 rename crypto/fipsmodule/ec/{ecp_nistz384.inl => ecp_nistz.inl} (50%)
 delete mode 100644 crypto/fipsmodule/ec/ecp_nistz384.h
 create mode 100644 crypto/fipsmodule/ec/gfp.h

diff --git a/Cargo.toml b/Cargo.toml
index c8ac2a3978..0585b60927 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,8 +69,8 @@ include = [
     "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
     "crypto/fipsmodule/ec/ecp_nistz.c",
     "crypto/fipsmodule/ec/ecp_nistz.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/ecp_nistz.inl",
+    "crypto/fipsmodule/ec/gfp.h",
     "crypto/fipsmodule/ec/gfp_p256.c",
     "crypto/fipsmodule/ec/gfp_p384.c",
     "crypto/fipsmodule/ec/p256.c",
diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz.inl
similarity index 50%
rename from crypto/fipsmodule/ec/ecp_nistz384.inl
rename to crypto/fipsmodule/ec/ecp_nistz.inl
index ae28f97ae5..a9b2211a1f 100644
--- a/crypto/fipsmodule/ec/ecp_nistz384.inl
+++ b/crypto/fipsmodule/ec/ecp_nistz.inl
@@ -20,20 +20,24 @@
  * Shay Gueron and Vlad Krasnov
  * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
  * http://eprint.iacr.org/2013/816 */
-
 #include "ecp_nistz.h"
+#include "gfp.h"
 
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #endif
 
+#define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add)
+#define point_double(prefix, bits) RENAME_FUNC(prefix, bits, point_double)
+#define point_mul(prefix, bits) RENAME_FUNC(prefix, bits, point_mul)
+
 /* Point double: r = 2*a */
-static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
-  BN_ULONG S[P384_LIMBS];
-  BN_ULONG M[P384_LIMBS];
-  BN_ULONG Zsqr[P384_LIMBS];
-  BN_ULONG tmp0[P384_LIMBS];
+static void point_double(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a) {
+  BN_ULONG S[FE_LIMBS];
+  BN_ULONG M[FE_LIMBS];
+  BN_ULONG Zsqr[FE_LIMBS];
+  BN_ULONG tmp0[FE_LIMBS];
 
   const BN_ULONG *in_x = a->X;
   const BN_ULONG *in_y = a->Y;
@@ -74,20 +78,20 @@ static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
 }
 
 /* Point addition: r = a+b */
-static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
-                               const P384_POINT *b) {
-  BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
-  BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
-  BN_ULONG Z1sqr[P384_LIMBS];
-  BN_ULONG Z2sqr[P384_LIMBS];
-  BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
-  BN_ULONG Hsqr[P384_LIMBS];
-  BN_ULONG Rsqr[P384_LIMBS];
-  BN_ULONG Hcub[P384_LIMBS];
-
-  BN_ULONG res_x[P384_LIMBS];
-  BN_ULONG res_y[P384_LIMBS];
-  BN_ULONG res_z[P384_LIMBS];
+static void point_add(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a,
+                                   const NIST_POINT *b) {
+  BN_ULONG U2[FE_LIMBS], S2[FE_LIMBS];
+  BN_ULONG U1[FE_LIMBS], S1[FE_LIMBS];
+  BN_ULONG Z1sqr[FE_LIMBS];
+  BN_ULONG Z2sqr[FE_LIMBS];
+  BN_ULONG H[FE_LIMBS], R[FE_LIMBS];
+  BN_ULONG Hsqr[FE_LIMBS];
+  BN_ULONG Rsqr[FE_LIMBS];
+  BN_ULONG Hcub[FE_LIMBS];
+
+  BN_ULONG res_x[FE_LIMBS];
+  BN_ULONG res_y[FE_LIMBS];
+  BN_ULONG res_z[FE_LIMBS];
 
   const BN_ULONG *in1_x = a->X;
   const BN_ULONG *in1_y = a->Y;
@@ -117,11 +121,11 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
   if (is_exceptional) {
     if (is_equal(S1, S2)) {
-      nistz384_point_double(r, a);
+      point_double(nistz, BITS)(r, a);
     } else {
-      limbs_zero(r->X, P384_LIMBS);
-      limbs_zero(r->Y, P384_LIMBS);
-      limbs_zero(r->Z, P384_LIMBS);
+      limbs_zero(r->X, FE_LIMBS);
+      limbs_zero(r->Y, FE_LIMBS);
+      limbs_zero(r->Z, FE_LIMBS);
     }
     return;
   }
@@ -152,147 +156,136 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   copy_conditional(res_y, in1_y, in2infty);
   copy_conditional(res_z, in1_z, in2infty);
 
-  limbs_copy(r->X, res_x, P384_LIMBS);
-  limbs_copy(r->Y, res_y, P384_LIMBS);
-  limbs_copy(r->Z, res_z, P384_LIMBS);
+  limbs_copy(r->X, res_x, FE_LIMBS);
+  limbs_copy(r->Y, res_y, FE_LIMBS);
+  limbs_copy(r->Z, res_z, FE_LIMBS);
 }
 
-static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue,
-                               const P384_POINT table[16]) {
+static void add_precomputed_w(NIST_POINT *r, crypto_word_t wvalue,
+                              const NIST_POINT table[TBL_SZ]) {
   crypto_word_t recoded_is_negative;
   crypto_word_t recoded;
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
 
-  alignas(64) P384_POINT h;
-  p384_point_select_w5(&h, table, recoded);
+  alignas(64) NIST_POINT h;
+  NIST_POINT_select_w(&h, table, recoded);
 
-  alignas(64) BN_ULONG tmp[P384_LIMBS];
-  p384_elem_neg(tmp, h.Y);
+  alignas(64) BN_ULONG tmp[FE_LIMBS];
+  elem_neg(tmp, h.Y);
   copy_conditional(h.Y, tmp, recoded_is_negative);
 
-  nistz384_point_add(r, r, &h);
+  point_add(nistz, BITS)(r, r, &h);
 }
 
 /* r = p * p_scalar */
-static void nistz384_point_mul(P384_POINT *r,
-                               const BN_ULONG p_scalar[P384_LIMBS],
-                               const Limb p_x[P384_LIMBS],
-                               const Limb p_y[P384_LIMBS]) {
-  static const size_t kWindowSize = 5;
-  static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
-
-  uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
+static void point_mul(nistz, BITS)(NIST_POINT *r, const BN_ULONG p_scalar[FE_LIMBS],
+                                   const BN_ULONG p_x[FE_LIMBS],
+                                   const BN_ULONG p_y[FE_LIMBS]) {
+  uint8_t p_str[(FE_LIMBS * sizeof(Limb)) + 1];
   little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
-                                  p_scalar, P384_LIMBS);
+                                  p_scalar, FE_LIMBS);
 
-  /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
+  /* A |NIST_POINT| is (3 * FE_LIMBS * sizeof(Limb)) bytes, e.g. (3 * 48) = 144 for P-384, and the 64-byte alignment should
    * add no more than 63 bytes of overhead. Thus, |table| should require
    * ~2367 ((144 * 16) + 63) bytes of stack space. */
-  alignas(64) P384_POINT table[16];
+  alignas(64) NIST_POINT table[TBL_SZ];
 
   /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
    * not stored. All other values are actually stored with an offset of -1 in
   * table. */
-  P384_POINT *row = table;
-
-  limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
-  limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
-  limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
-
-  nistz384_point_double(&row[2 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[4 - 1], &row[2 - 1]);
-  nistz384_point_double(&row[6 - 1], &row[3 - 1]);
-  nistz384_point_double(&row[8 - 1], &row[4 - 1]);
-  nistz384_point_double(&row[12 - 1], &row[6 - 1]);
-  nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[14 - 1], &row[7 - 1]);
-  nistz384_point_double(&row[10 - 1], &row[5 - 1]);
-  nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[16 - 1], &row[8 - 1]);
-
-  static const size_t START_INDEX = 384 - 4;
+  NIST_POINT *row = table;
+
+  limbs_copy(row[0].X, p_x, FE_LIMBS);
+  limbs_copy(row[0].Y, p_y, FE_LIMBS);
+  limbs_copy(row[0].Z, ONE, FE_LIMBS);
+
+  point_double(nistz, BITS)(&row[1], &row[0]);
+
+  for (int i = 2; i < TBL_SZ; i += 2) {
+    point_add(nistz, BITS)(&row[i], &row[i - 1], &row[0]);
+    point_double(nistz, BITS)(&row[i + 1], &row[i / 2]);
+  }
+
+  static const size_t ROUND_SIZE = (BITS + W_BITS - 1) / W_BITS * W_BITS;
+  size_t START_INDEX = ROUND_SIZE == BITS + 1 ? ROUND_SIZE - W_BITS : ROUND_SIZE;
   size_t index = START_INDEX;
 
   BN_ULONG recoded_is_negative;
   crypto_word_t recoded;
 
   crypto_word_t wvalue = p_str[(index - 1) / 8];
-  wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
+  wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
 
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
   dev_assert_secret(!recoded_is_negative);
 
-  p384_point_select_w5(r, table, recoded);
+  NIST_POINT_select_w(r, table, recoded);
 
-  while (index >= kWindowSize) {
+  while (index >= W_BITS) {
     if (index != START_INDEX) {
       size_t off = (index - 1) / 8;
 
       wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      add_precomputed_w5(r, wvalue, table);
+      wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
+      add_precomputed_w(r, wvalue, table);
     }
 
-    index -= kWindowSize;
+    index -= W_BITS;
 
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
+    for (int i = 0; i < W_BITS; i++) {
+      point_double(nistz, BITS)(r, r);
+    }
   }
 
   /* Final window */
   wvalue = p_str[0];
-  wvalue = (wvalue << 1) & kMask;
-  add_precomputed_w5(r, wvalue, table);
+  wvalue = (wvalue << 1) & W_MASK;
+  add_precomputed_w(r, wvalue, table);
 }
 
-void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS])
+void point_double(p, BITS)(Limb r[3][FE_LIMBS], const Limb a[3][FE_LIMBS])
 {
-  P384_POINT t;
-  limbs_copy(t.X, a[0], P384_LIMBS);
-  limbs_copy(t.Y, a[1], P384_LIMBS);
-  limbs_copy(t.Z, a[2], P384_LIMBS);
-  nistz384_point_double(&t, &t);
-  limbs_copy(r[0], t.X, P384_LIMBS);
-  limbs_copy(r[1], t.Y, P384_LIMBS);
-  limbs_copy(r[2], t.Z, P384_LIMBS);
+  NIST_POINT t;
+  limbs_copy(t.X, a[0], FE_LIMBS);
+  limbs_copy(t.Y, a[1], FE_LIMBS);
+  limbs_copy(t.Z, a[2], FE_LIMBS);
+  point_double(nistz, BITS)(&t, &t);
+  limbs_copy(r[0], t.X, FE_LIMBS);
+  limbs_copy(r[1], t.Y, FE_LIMBS);
+  limbs_copy(r[2], t.Z, FE_LIMBS);
 }
 
-void 
p384_point_add(Limb r[3][P384_LIMBS], - const Limb a[3][P384_LIMBS], - const Limb b[3][P384_LIMBS]) +void point_add(p, BITS)(Limb r[3][FE_LIMBS], + const Limb a[3][FE_LIMBS], + const Limb b[3][FE_LIMBS]) { - P384_POINT t1; - limbs_copy(t1.X, a[0], P384_LIMBS); - limbs_copy(t1.Y, a[1], P384_LIMBS); - limbs_copy(t1.Z, a[2], P384_LIMBS); + NIST_POINT t1; + limbs_copy(t1.X, a[0], FE_LIMBS); + limbs_copy(t1.Y, a[1], FE_LIMBS); + limbs_copy(t1.Z, a[2], FE_LIMBS); - P384_POINT t2; - limbs_copy(t2.X, b[0], P384_LIMBS); - limbs_copy(t2.Y, b[1], P384_LIMBS); - limbs_copy(t2.Z, b[2], P384_LIMBS); + NIST_POINT t2; + limbs_copy(t2.X, b[0], FE_LIMBS); + limbs_copy(t2.Y, b[1], FE_LIMBS); + limbs_copy(t2.Z, b[2], FE_LIMBS); - nistz384_point_add(&t1, &t1, &t2); + point_add(nistz, BITS)(&t1, &t1, &t2); - limbs_copy(r[0], t1.X, P384_LIMBS); - limbs_copy(r[1], t1.Y, P384_LIMBS); - limbs_copy(r[2], t1.Z, P384_LIMBS); + limbs_copy(r[0], t1.X, FE_LIMBS); + limbs_copy(r[1], t1.Y, FE_LIMBS); + limbs_copy(r[2], t1.Z, FE_LIMBS); } -void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS], - const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) { - alignas(64) P384_POINT acc; - nistz384_point_mul(&acc, p_scalar, p_x, p_y); - limbs_copy(r[0], acc.X, P384_LIMBS); - limbs_copy(r[1], acc.Y, P384_LIMBS); - limbs_copy(r[2], acc.Z, P384_LIMBS); +void point_mul(p, BITS)(Limb r[3][FE_LIMBS], + const BN_ULONG p_scalar[FE_LIMBS], + const Limb p_x[FE_LIMBS], + const Limb p_y[FE_LIMBS]) +{ + alignas(64) NIST_POINT acc; + point_mul(nistz, BITS)(&acc, p_scalar, p_x, p_y); + limbs_copy(r[0], acc.X, FE_LIMBS); + limbs_copy(r[1], acc.Y, FE_LIMBS); + limbs_copy(r[2], acc.Z, FE_LIMBS); } #if defined(__GNUC__) || defined(__clang__) diff --git a/crypto/fipsmodule/ec/ecp_nistz384.h b/crypto/fipsmodule/ec/ecp_nistz384.h deleted file mode 100644 index ca87e60721..0000000000 --- a/crypto/fipsmodule/ec/ecp_nistz384.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2014, Intel Corporation. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H -#define OPENSSL_HEADER_EC_ECP_NISTZ384_H - -#include "../../limbs/limbs.h" - -#define P384_LIMBS (384u / LIMB_BITS) - -typedef struct { - Limb X[P384_LIMBS]; - Limb Y[P384_LIMBS]; - Limb Z[P384_LIMBS]; -} P384_POINT; - -typedef struct { - Limb X[P384_LIMBS]; - Limb Y[P384_LIMBS]; -} P384_POINT_AFFINE; - - -#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H diff --git a/crypto/fipsmodule/ec/gfp.h b/crypto/fipsmodule/ec/gfp.h new file mode 100644 index 0000000000..172ccc787d --- /dev/null +++ b/crypto/fipsmodule/ec/gfp.h @@ -0,0 +1,193 @@ +/* Copyright 2016-2024 Brian Smith. 
+ * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func + +typedef struct { + Limb X[FE_LIMBS]; + Limb Y[FE_LIMBS]; + Limb Z[FE_LIMBS]; +} NIST_POINT; + +typedef struct { + Limb X[FE_LIMBS]; + Limb Y[FE_LIMBS]; +} NIST_POINT_AFFINE; + +#define TBL_SZ (1 << (W_BITS - 1)) +#define W_MASK ((1 << (W_BITS + 1)) - 1) + +static inline Limb is_equal(const Elem a, const Elem b) { + return LIMBS_equal(a, b, FE_LIMBS); +} + +static inline Limb is_zero(const BN_ULONG a[FE_LIMBS]) { + return LIMBS_are_zero(a, FE_LIMBS); +} + +static inline void copy_conditional(Elem r, const Elem a, + const Limb condition) { + for (size_t i = 0; i < FE_LIMBS; ++i) { + r[i] = constant_time_select_w(condition, a[i], r[i]); + } +} + +static inline void elem_add(Elem r, const Elem a, const Elem b) { + LIMBS_add_mod(r, a, b, Q, FE_LIMBS); +} + +static inline void elem_sub(Elem r, const Elem a, const Elem b) { + LIMBS_sub_mod(r, a, b, Q, FE_LIMBS); +} + +static void elem_div_by_2(Elem r, const Elem a) { + /* Consider the case where `a` is even. Then we can shift `a` right one bit + * and the result will still be valid because we didn't lose any bits and so + * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. + * + * The remainder of this comment is considering the case where `a` is odd. + * + * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` + * because the lowest bit is lost during the shift. For example, consider: + * + * ```python + * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 + * a = 2**383 + * two_a = a * 2 % q + * assert two_a == 0x100000000ffffffffffffffff00000001 + * ``` + * + * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When + * we divide `two_a` by two (mod q), we need to get the value `2**383`, which + * we obviously can't get with just a right shift. + * + * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate + * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to + * keep track of an extra most significant bit. We can avoid that by instead + * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least + * significant bit of `a`. `q + 1` is even, which means it can be shifted + * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest + * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know + * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of + * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant + * bit of `a`, which is 1. 
Thus: + * + * sum = ((q + 1) >> 1) + (a >> 1) + * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) + * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) + * <= (q + 1)/2 + (q - 3)/2 (simplifying) + * <= (q + 1 + q - 3)/2 (factoring out the common divisor) + * <= (2q - 2)/2 (simplifying) + * <= q - 1 (simplifying) + * + * Thus, no reduction of the sum mod `q` is necessary. */ + + Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); + + /* r = a >> 1. */ + Limb carry = a[FE_LIMBS - 1] & 1; + r[FE_LIMBS - 1] = a[FE_LIMBS - 1] >> 1; + for (size_t i = 1; i < FE_LIMBS; ++i) { + Limb new_carry = a[FE_LIMBS - i - 1]; + r[FE_LIMBS - i - 1] = + (a[FE_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); + carry = new_carry; + } + + Elem adjusted; + BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, FE_LIMBS); + dev_assert_secret(carry2 == 0); + (void)carry2; + copy_conditional(r, adjusted, is_odd); +} + +static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { + /* XXX: Not (clearly) constant-time; inefficient.*/ + bn_mul_mont(r, a, b, Q, Q_N0, FE_LIMBS); +} + +static inline void elem_mul_by_2(Elem r, const Elem a) { + LIMBS_shl_mod(r, a, Q, FE_LIMBS); +} + +static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { + /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ + Elem doubled; + elem_add(doubled, a, a); + elem_add(r, doubled, a); +} + +static inline void elem_sqr_mont(Elem r, const Elem a) { + /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ + elem_mul_mont(r, a, a); +} + +static void elem_neg(Elem r, const Elem a) { + Limb is_zero = LIMBS_are_zero(a, FE_LIMBS); + Carry borrow = limbs_sub(r, Q, a, FE_LIMBS); + dev_assert_secret(borrow == 0); + (void)borrow; + for (size_t i = 0; i < FE_LIMBS; ++i) { + r[i] = constant_time_select_w(is_zero, 0, r[i]); + } +} + +static void NIST_POINT_select_w(NIST_POINT *out, + const NIST_POINT table[TBL_SZ], size_t index) { + Elem x; limbs_zero(x, FE_LIMBS); + Elem y; limbs_zero(y, FE_LIMBS); + Elem z; limbs_zero(z, FE_LIMBS); + + // TODO: Rewrite in terms of |limbs_select|. + for (size_t i = 0; i < TBL_SZ; ++i) { + crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1); + for (size_t j = 0; j < FE_LIMBS; ++j) { + x[j] = constant_time_select_w(equal, table[i].X[j], x[j]); + y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]); + z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]); + } + } + + limbs_copy(out->X, x, FE_LIMBS); + limbs_copy(out->Y, y, FE_LIMBS); + limbs_copy(out->Z, z, FE_LIMBS); +} + +#define bits_elem_neg(prefix, bits) RENAME_FUNC(prefix, bits, elem_neg) +#define bits_elem_sub(prefix, bits) RENAME_FUNC(prefix, bits, elem_sub) +#define bits_elem_div_by_2(prefix, bits) RENAME_FUNC(prefix, bits, elem_div_by_2) +#define bits_elem_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, elem_mul_mont) +#define bits_scalar_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, scalar_mul_mont) + +void bits_elem_neg(p, BITS)(Elem r, const Elem a) { + elem_neg(r, a); +} + +void bits_elem_sub(p, BITS)(Elem r, const Elem a, const Elem b) { + elem_sub(r, a, b); +} + +void bits_elem_div_by_2(p, BITS)(Elem r, const Elem a) { + elem_div_by_2(r, a); +} + +void bits_elem_mul_mont(p, BITS)(Elem r, const Elem a, const Elem b) { + elem_mul_mont(r, a, b); +} + +void bits_scalar_mul_mont(p, BITS)(ScalarMont r, const ScalarMont a, + const ScalarMont b) { + /* XXX: Inefficient. TODO: Add dedicated multiplication routine. 
*/ + bn_mul_mont(r, a, b, N, N_N0, FE_LIMBS); +} diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c index 90065eaeb0..8e1e53ac1d 100644 --- a/crypto/fipsmodule/ec/gfp_p384.c +++ b/crypto/fipsmodule/ec/gfp_p384.c @@ -1,4 +1,5 @@ -/* Copyright 2016 Brian Smith. + +/* Copyright 2016-2023 Brian Smith. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -13,23 +14,22 @@ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "../../limbs/limbs.h" - -#include "ecp_nistz384.h" #include "../bn/internal.h" #include "../../internal.h" #include "../../limbs/limbs.inl" - /* XXX: Here we assume that the conversion from |Carry| to |Limb| is - * constant-time, but we haven't verified that assumption. TODO: Fix it so - * we don't need to make that assumption. */ +#define BITS 384 + +#define P384_LIMBS (384u / LIMB_BITS) +#define FE_LIMBS P384_LIMBS -typedef Limb Elem[P384_LIMBS]; -typedef Limb ScalarMont[P384_LIMBS]; -typedef Limb Scalar[P384_LIMBS]; +typedef Limb Elem[FE_LIMBS]; +typedef Limb ScalarMont[FE_LIMBS]; +typedef Limb Scalar[FE_LIMBS]; -static const BN_ULONG Q[P384_LIMBS] = { +static const Elem Q = { #if defined(OPENSSL_64_BIT) 0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff @@ -39,7 +39,7 @@ static const BN_ULONG Q[P384_LIMBS] = { #endif }; -static const BN_ULONG N[P384_LIMBS] = { +static const Elem N = { #if defined(OPENSSL_64_BIT) 0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff @@ -49,7 +49,7 @@ static const BN_ULONG N[P384_LIMBS] = { #endif }; -static const BN_ULONG ONE[P384_LIMBS] = { +static const Elem ONE = { #if defined(OPENSSL_64_BIT) 0xffffffff00000001, 0xffffffff, 1, 0, 0 #else @@ -83,164 +83,9 @@ static const BN_ULONG N_N0[] = { #define INLINE_IF_POSSIBLE inline #endif -static inline Limb is_equal(const Elem a, const Elem b) { - return LIMBS_equal(a, b, P384_LIMBS); -} - -static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) { - return LIMBS_are_zero(a, P384_LIMBS); -} - -static inline void copy_conditional(Elem r, const Elem a, - const Limb condition) { - for (size_t i = 0; i < P384_LIMBS; ++i) { - r[i] = constant_time_select_w(condition, a[i], r[i]); - } -} - - -static inline void elem_add(Elem r, const Elem a, const Elem b) { - LIMBS_add_mod(r, a, b, Q, P384_LIMBS); -} - -static inline void elem_sub(Elem r, const Elem a, const Elem b) { - LIMBS_sub_mod(r, a, b, Q, P384_LIMBS); -} - -static void elem_div_by_2(Elem r, const Elem a) { - /* Consider the case where `a` is even. Then we can shift `a` right one bit - * and the result will still be valid because we didn't lose any bits and so - * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy. - * - * The remainder of this comment is considering the case where `a` is odd. - * - * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)` - * because the lowest bit is lost during the shift. For example, consider: - * - * ```python - * q = 2**384 - 2**128 - 2**96 + 2**32 - 1 - * a = 2**383 - * two_a = a * 2 % q - * assert two_a == 0x100000000ffffffffffffffff00000001 - * ``` - * - * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When - * we divide `two_a` by two (mod q), we need to get the value `2**383`, which - * we obviously can't get with just a right shift. 
- * - * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate - * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to - * keep track of an extra most significant bit. We can avoid that by instead - * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least - * significant bit of `a`. `q + 1` is even, which means it can be shifted - * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest - * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know - * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of - * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant - * bit of `a`, which is 1. Thus: - * - * sum = ((q + 1) >> 1) + (a >> 1) - * sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2) - * <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2) - * <= (q + 1)/2 + (q - 3)/2 (simplifying) - * <= (q + 1 + q - 3)/2 (factoring out the common divisor) - * <= (2q - 2)/2 (simplifying) - * <= q - 1 (simplifying) - * - * Thus, no reduction of the sum mod `q` is necessary. */ - - Limb is_odd = constant_time_is_nonzero_w(a[0] & 1); - - /* r = a >> 1. */ - Limb carry = a[P384_LIMBS - 1] & 1; - r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1; - for (size_t i = 1; i < P384_LIMBS; ++i) { - Limb new_carry = a[P384_LIMBS - i - 1]; - r[P384_LIMBS - i - 1] = - (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1)); - carry = new_carry; - } - - Elem adjusted; - BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS); - dev_assert_secret(carry2 == 0); - (void)carry2; - copy_conditional(r, adjusted, is_odd); -} - -static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) { - /* XXX: Not (clearly) constant-time; inefficient.*/ - bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS); -} - -static inline void elem_mul_by_2(Elem r, const Elem a) { - LIMBS_shl_mod(r, a, Q, P384_LIMBS); -} - -static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) { - /* XXX: inefficient. TODO: Replace with an integrated shift + add. */ - Elem doubled; - elem_add(doubled, a, a); - elem_add(r, doubled, a); -} - -static inline void elem_sqr_mont(Elem r, const Elem a) { - /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */ - elem_mul_mont(r, a, a); -} - -void p384_elem_sub(Elem r, const Elem a, const Elem b) { - elem_sub(r, a, b); -} - -void p384_elem_div_by_2(Elem r, const Elem a) { - elem_div_by_2(r, a); -} - -void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) { - elem_mul_mont(r, a, b); -} - -void p384_elem_neg(Elem r, const Elem a) { - Limb is_zero = LIMBS_are_zero(a, P384_LIMBS); - Carry borrow = limbs_sub(r, Q, a, P384_LIMBS); - dev_assert_secret(borrow == 0); - (void)borrow; - for (size_t i = 0; i < P384_LIMBS; ++i) { - r[i] = constant_time_select_w(is_zero, 0, r[i]); - } -} - - -void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a, - const ScalarMont b) { - /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */ - bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS); -} - - -/* TODO(perf): Optimize this. */ - -static void p384_point_select_w5(P384_POINT *out, - const P384_POINT table[16], size_t index) { - Elem x; limbs_zero(x, P384_LIMBS); - Elem y; limbs_zero(y, P384_LIMBS); - Elem z; limbs_zero(z, P384_LIMBS); - - // TODO: Rewrite in terms of |limbs_select|. 
-  for (size_t i = 0; i < 16; ++i) {
-    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
-    for (size_t j = 0; j < P384_LIMBS; ++j) {
-      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
-      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
-      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
-    }
-  }
-
-  limbs_copy(out->X, x, P384_LIMBS);
-  limbs_copy(out->Y, y, P384_LIMBS);
-  limbs_copy(out->Z, z, P384_LIMBS);
-}
-
-
-#include "ecp_nistz384.inl"
+/* Window values that are OK for P384 (see `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are OK for P521 (see `ecp_nistz.h`): 4 */
+#define W_BITS 5
+
+#include "ecp_nistz.inl"
+
diff --git a/mk/generate_curves.py b/mk/generate_curves.py
index 92efecadce..fed6d96f36 100644
--- a/mk/generate_curves.py
+++ b/mk/generate_curves.py
@@ -17,7 +17,7 @@ from textwrap import wrap
 
 rs_template = """
-// Copyright 2016-2023 Brian Smith.
+// Copyright 2016-2024 Brian Smith.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
@@ -215,6 +215,7 @@
         a: *const Limb, // [3][COMMON_OPS.num_limbs]
         b: *const Limb, // [3][COMMON_OPS.num_limbs]
     );
+
     fn p%(bits)s_point_mul(
         r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
         p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
@@ -345,19 +346,25 @@ def generate_rs(g, out_dir):
 
 #include "../../limbs/limbs.inl"
 
-typedef Limb Elem[P%(bits)d_LIMBS];
-typedef Limb ScalarMont[P%(bits)d_LIMBS];
-typedef Limb Scalar[P%(bits)d_LIMBS];
+#define BITS %(bits)d
+
+#define P%(bits)d_LIMBS (%(bits)du / LIMB_BITS)
+
+#define FE_LIMBS P%(bits)d_LIMBS
+
+typedef Limb Elem[FE_LIMBS];
+typedef Limb ScalarMont[FE_LIMBS];
+typedef Limb Scalar[FE_LIMBS];
 
-static const BN_ULONG Q[P%(bits)d_LIMBS] = {
+static const Elem Q = {
 %(q)s
 };
 
-static const BN_ULONG N[P%(bits)d_LIMBS] = {
+static const Elem N = {
 %(n)s
 };
 
-static const BN_ULONG ONE[P%(bits)d_LIMBS] = {
+static const Elem ONE = {
 %(q_one)s
 };
 
@@ -373,6 +380,20 @@ def generate_rs(g, out_dir):
 %(n_n0)s
 };
 
+/* XXX: MSVC for x86 warns when it fails to inline these functions it should
+ * probably inline. */
+#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
+#define INLINE_IF_POSSIBLE __forceinline
+#else
+#define INLINE_IF_POSSIBLE inline
+#endif
+
+/* Window values that are OK for P384 (see `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are OK for P521 (see `ecp_nistz.h`): 4 */
+#define W_BITS %(w_bits)d
+
+#include "ecp_nistz.inl"
+
 """
 
 # Given a number |x|, return a generator of a sequence |a| such that
@@ -437,6 +458,7 @@ def generate_c(g, out_dir):
         "q_plus_1_shr_1": format_big_int(const((q + 1) >> 1), big_int_limbs(q)),
         "n" : format_big_int(const(n), big_int_limbs(q)),
         "n_n0": format_n0(n),
+        "w_bits": g["w_bits"],
     }
 
     out_path = os.path.join(out_dir, "gfp_%s.c" % name)
@@ -476,6 +498,7 @@ def generate(g, out_dir):
     "Gx": 0x6b17d1f2_e12c4247_f8bce6e5_63a440f2_77037d81_2deb33a0_f4a13945_d898c296,
    "Gy": 0x4fe342e2_fe1a7f9b_8ee7eb4a_7c0f9e16_2bce3357_6b315ece_cbb64068_37bf51f5,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p384 = {
@@ -488,6 +511,7 @@
     "Gx": 0xaa87ca22_be8b0537_8eb1c71e_f320ad74_6e1d3b62_8ba79b98_59f741e0_82542a38_5502f25d_bf55296c_3a545e38_72760ab7,
     "Gy": 0x3617de4a_96262c6f_5d9e98bf_9292dc29_f8f41dbd_289a147c_e9da3113_b5f0b8c0_0a60b1ce_1d7e819d_7a431d7c_90ea0e5f,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p521 = {
@@ -500,6 +524,7 @@
     "Gx": 0xc6_858e06b7_0404e9cd_9e3ecb66_2395b442_9c648139_053fb521_f828af60_6b4d3dba_a14b5e77_efe75928_fe1dc127_a2ffa8de_3348b3c1_856a429b_f97e7e31_c2e5bd66,
     "Gy": 0x118_39296a78_9a3bc004_5c8a5fb4_2c7d1bd9_98f54449_579b4468_17afbd17_273e662c_97ee7299_5ef42640_c550b901_3fad0761_353c7086_a272c240_88be9476_9fd16650,
     "cofactor": 1,
+    "w_bits": 4,
 }
 
 import os
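
A few standalone C sketches follow for reviewers. None of them are part of the change, and any name not taken from the patch (the demo `main`s, `booth_digit`, `select_w`, `FE_LIMBS_CEIL`) is hypothetical and exists only for illustration.

First, the renaming trick. `RENAME_FUNC` needs the two-step expansion through `point_double(prefix, bits)` so that `BITS` is macro-expanded before token pasting (arguments that are operands of `##` are not expanded, so a single-level paste would produce `nistzBITS_point_double`). A minimal model of how one shared definition becomes a per-curve symbol:

```c
#include <stdio.h>

#define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func
#define point_double(prefix, bits) RENAME_FUNC(prefix, bits, point_double)

#define BITS 384 /* normally supplied by gfp_p384.c before the #include */

/* Expands to `static void nistz384_point_double(int *r)`: `bits` is
 * expanded to 384 at the point_double() level, where it is not yet an
 * operand of `##`, and only then pasted by RENAME_FUNC. */
static void point_double(nistz, BITS)(int *r) { *r *= 2; }

int main(void) {
  int x = 21;
  nistz384_point_double(&x); /* the pasted name is a normal identifier */
  printf("%d\n", x);         /* prints 42 */
  return 0;
}
```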
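Second, the window constants. The former hard-coded values now fall out of `W_BITS`; a quick check (plain C, using the patch's values `W_BITS 5` and `BITS 384`) that the generic expressions reproduce the numbers the old code spelled out (`kMask`, the 16-entry table, and `START_INDEX = 384 - 4`):

```c
#include <assert.h>
#include <stddef.h>

#define BITS 384
#define W_BITS 5
#define TBL_SZ (1 << (W_BITS - 1))       /* 16, as in `table[16]` before */
#define W_MASK ((1 << (W_BITS + 1)) - 1) /* 0x3f, the old kMask */

int main(void) {
  /* Round BITS up to a whole number of windows: 385 for 384/5. */
  size_t round_size = (BITS + W_BITS - 1) / W_BITS * W_BITS;
  /* Overshooting by exactly one bit starts one window lower, which
   * reproduces the old hard-coded START_INDEX = 384 - 4 = 380. */
  size_t start_index = round_size == BITS + 1 ? round_size - W_BITS : round_size;

  assert(TBL_SZ == 16);
  assert(W_MASK == 0x3f);
  assert(round_size == 385);
  assert(start_index == 380);
  return 0;
}
```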
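Third, the table-building loop that replaces the unrolled `row[k - 1]` ladder. Its invariant is that `row[k]` ends up holding (k+1)·P; a toy integer model (multiples of an integer standing in for point doubling and addition) checks that the loop fills every entry the unrolled code used to:

```c
#include <assert.h>

#define TBL_SZ 16

int main(void) {
  /* row[k] models the point (k+1)*P as the integer k+1. */
  int row[TBL_SZ];
  row[0] = 1;          /* P, copied from (p_x, p_y, ONE)  */
  row[1] = 2 * row[0]; /* point_double(&row[1], &row[0])  */
  for (int i = 2; i < TBL_SZ; i += 2) {
    row[i] = row[i - 1] + row[0]; /* (i)P + P   -> (i+1)P */
    row[i + 1] = 2 * row[i / 2];  /* 2*(i/2+1)P -> (i+2)P */
  }
  for (int k = 0; k < TBL_SZ; k++) {
    assert(row[k] == k + 1);
  }
  return 0;
}
```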
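Fourth, why `TBL_SZ` is 2^(W_BITS-1) = 16 rather than 2^W_BITS: Booth recoding turns each (W_BITS+1)-bit window into a signed digit of magnitude at most 2^(W_BITS-1), and `add_precomputed_w` handles the sign by conditionally negating Y, so only 1P through 16P need storing. The helper below is a plain-C rephrasing of the recoding arithmetic (modeled on OpenSSL's booth_recode_w5, from which this code descends), not the library's constant-time `booth_recode`:

```c
#include <assert.h>
#include <stdint.h>

#define W_BITS 5
#define TBL_SZ (1 << (W_BITS - 1))

/* Non-constant-time model: map a (W_BITS+1)-bit window value to its
 * signed Booth digit in [-TBL_SZ, TBL_SZ]. */
static int booth_digit(uint32_t wvalue) {
  int neg = (wvalue >> W_BITS) & 1; /* top bit is the sign */
  uint32_t mag = neg ? ((1u << (W_BITS + 1)) - wvalue) >> 1
                     : (wvalue + 1) >> 1;
  return neg ? -(int)mag : (int)mag;
}

int main(void) {
  for (uint32_t v = 0; v < (1u << (W_BITS + 1)); v++) {
    int d = booth_digit(v);
    /* Every digit's magnitude fits the TBL_SZ-entry table. */
    assert(-TBL_SZ <= d && d <= TBL_SZ);
    /* Equivalent closed form: ceil(v/2) minus the sign bit's weight. */
    assert(d == (int)((v + 1) >> 1) - (int)(((v >> W_BITS) & 1) << W_BITS));
  }
  return 0;
}
```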
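Fifth, the selection pattern. `NIST_POINT_select_w` and `copy_conditional` read every table entry and blend with masks instead of indexing, so the secret index never influences the memory-access or branch pattern. A model of the core blend (a sketch of the `constant_time_select_w` idea, not the library routine):

```c
#include <assert.h>
#include <stdint.h>

/* mask must be all-ones (select a) or all-zeros (select b). */
static uint64_t select_w(uint64_t mask, uint64_t a, uint64_t b) {
  return (mask & a) | (~mask & b);
}

int main(void) {
  assert(select_w(~(uint64_t)0, 1, 2) == 1);
  assert(select_w(0, 1, 2) == 2);

  /* A table scan selects entry `index` by comparing against every i,
   * touching all entries regardless of the secret value. */
  uint64_t table[4] = {10, 20, 30, 40}, out = 0;
  uint64_t index = 2; /* secret in the real code */
  for (uint64_t i = 0; i < 4; i++) {
    uint64_t eq = (i == index) ? ~(uint64_t)0 : 0; /* constant_time_eq_w model */
    out = select_w(eq, table[i], out);
  }
  assert(out == 30);
  return 0;
}
```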
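Sixth, the `elem_div_by_2` derivation (the long comment moved verbatim into gfp.h). It argues that for odd `a`, `(a >> 1) + ((q + 1) >> 1)` equals `a/2 (mod q)` and never needs reduction. That can be checked exhaustively with a small odd prime standing in for q:

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint64_t q = 101; /* any odd prime works as a stand-in */
  for (uint64_t a = 0; a < q; a++) {
    uint64_t r = (a & 1) ? (a >> 1) + ((q + 1) >> 1) /* odd case  */
                         : a >> 1;                   /* even case */
    assert(r < q);            /* the comment's bound: sum <= q - 1 */
    assert((2 * r) % q == a); /* invariant: r * 2 == a (mod q)     */
  }
  return 0;
}
```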
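Finally, one forward-looking note on the generator (an observation about the P521 follow-up this patch prepares for, not something changed here): the `P%(bits)d_LIMBS (%(bits)du / LIMB_BITS)` pattern in the template truncates, which is exact for 256 and 384 but one limb short for 521, so the eventual gfp_p521.c will need a rounded-up limb count:

```c
#include <assert.h>

#define LIMB_BITS 64 /* assumed 64-bit limbs for the illustration */
/* Hypothetical rounding helper the P521 follow-up would need. */
#define FE_LIMBS_CEIL(bits) (((bits) + LIMB_BITS - 1) / LIMB_BITS)

int main(void) {
  assert(384u / LIMB_BITS == 6);    /* exact: truncation is harmless */
  assert(521u / LIMB_BITS == 8);    /* truncated: one limb short     */
  assert(FE_LIMBS_CEIL(521u) == 9); /* what P-521 actually requires  */
  return 0;
}
```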