diff --git a/Cargo.toml b/Cargo.toml
index c8ac2a3978..0585b60927 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -69,8 +69,8 @@ include = [
     "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl",
     "crypto/fipsmodule/ec/ecp_nistz.c",
     "crypto/fipsmodule/ec/ecp_nistz.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.h",
-    "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/ecp_nistz.inl",
+    "crypto/fipsmodule/ec/gfp.h",
    "crypto/fipsmodule/ec/gfp_p256.c",
    "crypto/fipsmodule/ec/gfp_p384.c",
    "crypto/fipsmodule/ec/p256.c",
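A note on the renaming machinery the rest of this patch relies on: ecp_nistz.inl (below) is compiled once per curve, and RENAME_FUNC pastes the curve's bit size into each function name. Because BITS passes through a function-like macro before any ## pasting, it is fully expanded first, so point_add(nistz, BITS) becomes nistz384_point_add when BITS is 384. A minimal, self-contained sketch of the expansion (the main/puts scaffolding is illustrative only, not part of the patch):

    #include <stdio.h>

    /* As in the patch: `bits` is expanded before pasting because it first
     * passes through a macro whose replacement list does not apply ## to it. */
    #define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func
    #define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add)

    #define BITS 384

    /* Expands to: static void nistz384_point_add(void) { ... } */
    static void point_add(nistz, BITS)(void) {
      puts("nistz384_point_add");
    }

    int main(void) {
      point_add(nistz, BITS)();  /* calls nistz384_point_add() */
      return 0;
    }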
diff --git a/crypto/fipsmodule/ec/ecp_nistz384.inl b/crypto/fipsmodule/ec/ecp_nistz.inl
similarity index 50%
rename from crypto/fipsmodule/ec/ecp_nistz384.inl
rename to crypto/fipsmodule/ec/ecp_nistz.inl
index ae28f97ae5..a9b2211a1f 100644
--- a/crypto/fipsmodule/ec/ecp_nistz384.inl
+++ b/crypto/fipsmodule/ec/ecp_nistz.inl
@@ -20,20 +20,24 @@
  * Shay Gueron and Vlad Krasnov
  * "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
  * http://eprint.iacr.org/2013/816 */
-
 #include "ecp_nistz.h"
+#include "gfp.h"
 
 #if defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wsign-conversion"
 #endif
 
+#define point_add(prefix, bits) RENAME_FUNC(prefix, bits, point_add)
+#define point_double(prefix, bits) RENAME_FUNC(prefix, bits, point_double)
+#define point_mul(prefix, bits) RENAME_FUNC(prefix, bits, point_mul)
+
 /* Point double: r = 2*a */
-static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
-  BN_ULONG S[P384_LIMBS];
-  BN_ULONG M[P384_LIMBS];
-  BN_ULONG Zsqr[P384_LIMBS];
-  BN_ULONG tmp0[P384_LIMBS];
+static void point_double(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a) {
+  BN_ULONG S[FE_LIMBS];
+  BN_ULONG M[FE_LIMBS];
+  BN_ULONG Zsqr[FE_LIMBS];
+  BN_ULONG tmp0[FE_LIMBS];
 
   const BN_ULONG *in_x = a->X;
   const BN_ULONG *in_y = a->Y;
@@ -74,20 +78,20 @@ static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
 }
 
 /* Point addition: r = a+b */
-static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
-                               const P384_POINT *b) {
-  BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
-  BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
-  BN_ULONG Z1sqr[P384_LIMBS];
-  BN_ULONG Z2sqr[P384_LIMBS];
-  BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
-  BN_ULONG Hsqr[P384_LIMBS];
-  BN_ULONG Rsqr[P384_LIMBS];
-  BN_ULONG Hcub[P384_LIMBS];
-
-  BN_ULONG res_x[P384_LIMBS];
-  BN_ULONG res_y[P384_LIMBS];
-  BN_ULONG res_z[P384_LIMBS];
+static void point_add(nistz, BITS)(NIST_POINT *r, const NIST_POINT *a,
+                                   const NIST_POINT *b) {
+  BN_ULONG U2[FE_LIMBS], S2[FE_LIMBS];
+  BN_ULONG U1[FE_LIMBS], S1[FE_LIMBS];
+  BN_ULONG Z1sqr[FE_LIMBS];
+  BN_ULONG Z2sqr[FE_LIMBS];
+  BN_ULONG H[FE_LIMBS], R[FE_LIMBS];
+  BN_ULONG Hsqr[FE_LIMBS];
+  BN_ULONG Rsqr[FE_LIMBS];
+  BN_ULONG Hcub[FE_LIMBS];
+
+  BN_ULONG res_x[FE_LIMBS];
+  BN_ULONG res_y[FE_LIMBS];
+  BN_ULONG res_z[FE_LIMBS];
 
   const BN_ULONG *in1_x = a->X;
   const BN_ULONG *in1_y = a->Y;
@@ -117,11 +121,11 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
   if (is_exceptional) {
     if (is_equal(S1, S2)) {
-      nistz384_point_double(r, a);
+      point_double(nistz, BITS)(r, a);
     } else {
-      limbs_zero(r->X, P384_LIMBS);
-      limbs_zero(r->Y, P384_LIMBS);
-      limbs_zero(r->Z, P384_LIMBS);
+      limbs_zero(r->X, FE_LIMBS);
+      limbs_zero(r->Y, FE_LIMBS);
+      limbs_zero(r->Z, FE_LIMBS);
     }
     return;
   }
@@ -152,147 +156,136 @@ static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
   copy_conditional(res_y, in1_y, in2infty);
   copy_conditional(res_z, in1_z, in2infty);
 
-  limbs_copy(r->X, res_x, P384_LIMBS);
-  limbs_copy(r->Y, res_y, P384_LIMBS);
-  limbs_copy(r->Z, res_z, P384_LIMBS);
+  limbs_copy(r->X, res_x, FE_LIMBS);
+  limbs_copy(r->Y, res_y, FE_LIMBS);
+  limbs_copy(r->Z, res_z, FE_LIMBS);
 }
 
-static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue,
-                               const P384_POINT table[16]) {
+static void add_precomputed_w(NIST_POINT *r, crypto_word_t wvalue,
+                              const NIST_POINT table[TBL_SZ]) {
   crypto_word_t recoded_is_negative;
   crypto_word_t recoded;
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
 
-  alignas(64) P384_POINT h;
-  p384_point_select_w5(&h, table, recoded);
+  alignas(64) NIST_POINT h;
+  NIST_POINT_select_w(&h, table, recoded);
 
-  alignas(64) BN_ULONG tmp[P384_LIMBS];
-  p384_elem_neg(tmp, h.Y);
+  alignas(64) BN_ULONG tmp[FE_LIMBS];
+  elem_neg(tmp, h.Y);
   copy_conditional(h.Y, tmp, recoded_is_negative);
 
-  nistz384_point_add(r, r, &h);
+  point_add(nistz, BITS)(r, r, &h);
 }
 
 /* r = p * p_scalar */
-static void nistz384_point_mul(P384_POINT *r,
-                               const BN_ULONG p_scalar[P384_LIMBS],
-                               const Limb p_x[P384_LIMBS],
-                               const Limb p_y[P384_LIMBS]) {
-  static const size_t kWindowSize = 5;
-  static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
-
-  uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
+static void point_mul(nistz, BITS)(NIST_POINT *r, const BN_ULONG p_scalar[FE_LIMBS],
+                                   const BN_ULONG p_x[FE_LIMBS],
+                                   const BN_ULONG p_y[FE_LIMBS]) {
+  uint8_t p_str[(FE_LIMBS * sizeof(Limb)) + 1];
   little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
-                                  p_scalar, P384_LIMBS);
+                                  p_scalar, FE_LIMBS);
 
-  /* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
+  /* A |NIST_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
    * add no more than 63 bytes of overhead. Thus, |table| should require
    * ~2367 ((144 * 16) + 63) bytes of stack space. */
-  alignas(64) P384_POINT table[16];
+  alignas(64) NIST_POINT table[TBL_SZ];
 
   /* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
    * not stored. All other values are actually stored with an offset of -1 in
    * table. */
-  P384_POINT *row = table;
-
-  limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
-  limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
-  limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
-
-  nistz384_point_double(&row[2 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[4 - 1], &row[2 - 1]);
-  nistz384_point_double(&row[6 - 1], &row[3 - 1]);
-  nistz384_point_double(&row[8 - 1], &row[4 - 1]);
-  nistz384_point_double(&row[12 - 1], &row[6 - 1]);
-  nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[14 - 1], &row[7 - 1]);
-  nistz384_point_double(&row[10 - 1], &row[5 - 1]);
-  nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
-  nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
-  nistz384_point_double(&row[16 - 1], &row[8 - 1]);
-
-  static const size_t START_INDEX = 384 - 4;
+  NIST_POINT *row = table;
+
+  limbs_copy(row[0].X, p_x, FE_LIMBS);
+  limbs_copy(row[0].Y, p_y, FE_LIMBS);
+  limbs_copy(row[0].Z, ONE, FE_LIMBS);
+
+  point_double(nistz, BITS)(&row[1], &row[0]);
+
+  for (int i = 2; i < TBL_SZ; i += 2) {
+    point_add(nistz, BITS)(&row[i], &row[i - 1], &row[0]);
+    point_double(nistz, BITS)(&row[i + 1], &row[i / 2]);
+  }
+
+  static const size_t ROUND_SIZE = (BITS + W_BITS - 1) / W_BITS * W_BITS;
+  size_t START_INDEX = ROUND_SIZE == BITS + 1 ? ROUND_SIZE - W_BITS : ROUND_SIZE;
   size_t index = START_INDEX;
 
   BN_ULONG recoded_is_negative;
   crypto_word_t recoded;
 
   crypto_word_t wvalue = p_str[(index - 1) / 8];
-  wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
+  wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
 
-  booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
+  booth_recode(&recoded_is_negative, &recoded, wvalue, W_BITS);
   dev_assert_secret(!recoded_is_negative);
 
-  p384_point_select_w5(r, table, recoded);
+  NIST_POINT_select_w(r, table, recoded);
 
-  while (index >= kWindowSize) {
+  while (index >= W_BITS) {
     if (index != START_INDEX) {
       size_t off = (index - 1) / 8;
 
       wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      add_precomputed_w5(r, wvalue, table);
+      wvalue = (wvalue >> ((index - 1) % 8)) & W_MASK;
+      add_precomputed_w(r, wvalue, table);
     }
 
-    index -= kWindowSize;
+    index -= W_BITS;
 
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
-    nistz384_point_double(r, r);
+    for (int i = 0; i < W_BITS; i++) {
+      point_double(nistz, BITS)(r, r);
+    }
   }
 
   /* Final window */
   wvalue = p_str[0];
-  wvalue = (wvalue << 1) & kMask;
-  add_precomputed_w5(r, wvalue, table);
+  wvalue = (wvalue << 1) & W_MASK;
+  add_precomputed_w(r, wvalue, table);
 }
 
-void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS])
+void point_double(p, BITS)(Limb r[3][FE_LIMBS], const Limb a[3][FE_LIMBS])
 {
-  P384_POINT t;
-  limbs_copy(t.X, a[0], P384_LIMBS);
-  limbs_copy(t.Y, a[1], P384_LIMBS);
-  limbs_copy(t.Z, a[2], P384_LIMBS);
-  nistz384_point_double(&t, &t);
-  limbs_copy(r[0], t.X, P384_LIMBS);
-  limbs_copy(r[1], t.Y, P384_LIMBS);
-  limbs_copy(r[2], t.Z, P384_LIMBS);
+  NIST_POINT t;
+  limbs_copy(t.X, a[0], FE_LIMBS);
+  limbs_copy(t.Y, a[1], FE_LIMBS);
+  limbs_copy(t.Z, a[2], FE_LIMBS);
+  point_double(nistz, BITS)(&t, &t);
+  limbs_copy(r[0], t.X, FE_LIMBS);
+  limbs_copy(r[1], t.Y, FE_LIMBS);
+  limbs_copy(r[2], t.Z, FE_LIMBS);
 }
 
-void p384_point_add(Limb r[3][P384_LIMBS],
-                    const Limb a[3][P384_LIMBS],
-                    const Limb b[3][P384_LIMBS])
+void point_add(p, BITS)(Limb r[3][FE_LIMBS],
+                        const Limb a[3][FE_LIMBS],
+                        const Limb b[3][FE_LIMBS])
 {
-  P384_POINT t1;
-  limbs_copy(t1.X, a[0], P384_LIMBS);
-  limbs_copy(t1.Y, a[1], P384_LIMBS);
-  limbs_copy(t1.Z, a[2], P384_LIMBS);
+  NIST_POINT t1;
+  limbs_copy(t1.X, a[0], FE_LIMBS);
+  limbs_copy(t1.Y, a[1], FE_LIMBS);
+  limbs_copy(t1.Z, a[2], FE_LIMBS);
 
-  P384_POINT t2;
-  limbs_copy(t2.X, b[0], P384_LIMBS);
-  limbs_copy(t2.Y, b[1], P384_LIMBS);
-  limbs_copy(t2.Z, b[2], P384_LIMBS);
+  NIST_POINT t2;
+  limbs_copy(t2.X, b[0], FE_LIMBS);
+  limbs_copy(t2.Y, b[1], FE_LIMBS);
+  limbs_copy(t2.Z, b[2], FE_LIMBS);
 
-  nistz384_point_add(&t1, &t1, &t2);
+  point_add(nistz, BITS)(&t1, &t1, &t2);
 
-  limbs_copy(r[0], t1.X, P384_LIMBS);
-  limbs_copy(r[1], t1.Y, P384_LIMBS);
-  limbs_copy(r[2], t1.Z, P384_LIMBS);
+  limbs_copy(r[0], t1.X, FE_LIMBS);
+  limbs_copy(r[1], t1.Y, FE_LIMBS);
+  limbs_copy(r[2], t1.Z, FE_LIMBS);
 }
 
-void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS],
-                    const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) {
-  alignas(64) P384_POINT acc;
-  nistz384_point_mul(&acc, p_scalar, p_x, p_y);
-  limbs_copy(r[0], acc.X, P384_LIMBS);
-  limbs_copy(r[1], acc.Y, P384_LIMBS);
-  limbs_copy(r[2], acc.Z, P384_LIMBS);
+void point_mul(p, BITS)(Limb r[3][FE_LIMBS],
+                        const BN_ULONG p_scalar[FE_LIMBS],
+                        const Limb p_x[FE_LIMBS],
+                        const Limb p_y[FE_LIMBS])
+{
+  alignas(64) NIST_POINT acc;
+  point_mul(nistz, BITS)(&acc, p_scalar, p_x, p_y);
+  limbs_copy(r[0], acc.X, FE_LIMBS);
+  limbs_copy(r[1], acc.Y, FE_LIMBS);
+  limbs_copy(r[2], acc.Z, FE_LIMBS);
 }
 
 #if defined(__GNUC__) || defined(__clang__)
diff --git a/crypto/fipsmodule/ec/ecp_nistz384.h b/crypto/fipsmodule/ec/ecp_nistz384.h
deleted file mode 100644
index ca87e60721..0000000000
--- a/crypto/fipsmodule/ec/ecp_nistz384.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2014, Intel Corporation.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H
-#define OPENSSL_HEADER_EC_ECP_NISTZ384_H
-
-#include "../../limbs/limbs.h"
-
-#define P384_LIMBS (384u / LIMB_BITS)
-
-typedef struct {
-  Limb X[P384_LIMBS];
-  Limb Y[P384_LIMBS];
-  Limb Z[P384_LIMBS];
-} P384_POINT;
-
-typedef struct {
-  Limb X[P384_LIMBS];
-  Limb Y[P384_LIMBS];
-} P384_POINT_AFFINE;
-
-
-#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H
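The new gfp.h (below) derives the precomputation-table size and the Booth window mask from W_BITS instead of hard-coding the old 16-entry table and 6-bit mask. For W_BITS = 5 the derived values match the old constants exactly; a small compile-time check (the _Static_assert lines are illustrative, not part of the patch):

    /* w-bit windowed Booth recoding: 2^(w-1) stored points (the point at
     * infinity is implicit), and each digit reads w+1 overlapping bits. */
    #define W_BITS 5
    #define TBL_SZ (1 << (W_BITS - 1))
    #define W_MASK ((1 << (W_BITS + 1)) - 1)

    _Static_assert(TBL_SZ == 16, "the old P-384 code stored 16 points");
    _Static_assert(W_MASK == 0x3f, "the old kMask read 5 + 1 = 6 bits");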
diff --git a/crypto/fipsmodule/ec/gfp.h b/crypto/fipsmodule/ec/gfp.h
new file mode 100644
index 0000000000..172ccc787d
--- /dev/null
+++ b/crypto/fipsmodule/ec/gfp.h
@@ -0,0 +1,193 @@
+/* Copyright 2016-2024 Brian Smith.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#define RENAME_FUNC(prefix, bits, func) prefix ## bits ## _ ## func
+
+typedef struct {
+  Limb X[FE_LIMBS];
+  Limb Y[FE_LIMBS];
+  Limb Z[FE_LIMBS];
+} NIST_POINT;
+
+typedef struct {
+  Limb X[FE_LIMBS];
+  Limb Y[FE_LIMBS];
+} NIST_POINT_AFFINE;
+
+#define TBL_SZ (1 << (W_BITS - 1))
+#define W_MASK ((1 << (W_BITS + 1)) - 1)
+
+static inline Limb is_equal(const Elem a, const Elem b) {
+  return LIMBS_equal(a, b, FE_LIMBS);
+}
+
+static inline Limb is_zero(const BN_ULONG a[FE_LIMBS]) {
+  return LIMBS_are_zero(a, FE_LIMBS);
+}
+
+static inline void copy_conditional(Elem r, const Elem a,
+                                    const Limb condition) {
+  for (size_t i = 0; i < FE_LIMBS; ++i) {
+    r[i] = constant_time_select_w(condition, a[i], r[i]);
+  }
+}
+
+static inline void elem_add(Elem r, const Elem a, const Elem b) {
+  LIMBS_add_mod(r, a, b, Q, FE_LIMBS);
+}
+
+static inline void elem_sub(Elem r, const Elem a, const Elem b) {
+  LIMBS_sub_mod(r, a, b, Q, FE_LIMBS);
+}
+
+static void elem_div_by_2(Elem r, const Elem a) {
+  /* Consider the case where `a` is even. Then we can shift `a` right one bit
+   * and the result will still be valid because we didn't lose any bits and so
+   * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
+   *
+   * The remainder of this comment is considering the case where `a` is odd.
+   *
+   * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
+   * because the lowest bit is lost during the shift. For example, consider:
+   *
+   * ```python
+   * q = 2**384 - 2**128 - 2**96 + 2**32 - 1
+   * a = 2**383
+   * two_a = a * 2 % q
+   * assert two_a == 0x100000000ffffffffffffffff00000001
+   * ```
+   *
+   * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
+   * we divide `two_a` by two (mod q), we need to get the value `2**383`, which
+   * we obviously can't get with just a right shift.
+   *
+   * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
+   * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
+   * keep track of an extra most significant bit. We can avoid that by instead
+   * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
+   * significant bit of `a`. `q + 1` is even, which means it can be shifted
+   * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
+   * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
+   * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
+   * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
+   * bit of `a`, which is 1. Thus:
+   *
+   * sum  =  ((q + 1) >> 1) + (a >> 1)
+   * sum  =  (q + 1)/2 + (a >> 1)       (substituting (q + 1)/2)
+   *     <=  (q + 1)/2 + (q - 2 - 1)/2  (substituting a <= q - 2)
+   *     <=  (q + 1)/2 + (q - 3)/2      (simplifying)
+   *     <=  (q + 1 + q - 3)/2          (factoring out the common divisor)
+   *     <=  (2q - 2)/2                 (simplifying)
+   *     <=  q - 1                      (simplifying)
+   *
+   * Thus, no reduction of the sum mod `q` is necessary. */
+
+  Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
+
+  /* r = a >> 1. */
+  Limb carry = a[FE_LIMBS - 1] & 1;
+  r[FE_LIMBS - 1] = a[FE_LIMBS - 1] >> 1;
+  for (size_t i = 1; i < FE_LIMBS; ++i) {
+    Limb new_carry = a[FE_LIMBS - i - 1];
+    r[FE_LIMBS - i - 1] =
+        (a[FE_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
+    carry = new_carry;
+  }
+
+  Elem adjusted;
+  BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, FE_LIMBS);
+  dev_assert_secret(carry2 == 0);
+  (void)carry2;
+  copy_conditional(r, adjusted, is_odd);
+}
+
+static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
+  /* XXX: Not (clearly) constant-time; inefficient.*/
+  bn_mul_mont(r, a, b, Q, Q_N0, FE_LIMBS);
+}
+
+static inline void elem_mul_by_2(Elem r, const Elem a) {
+  LIMBS_shl_mod(r, a, Q, FE_LIMBS);
+}
+
+static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
+  /* XXX: inefficient. TODO: Replace with an integrated shift + add. */
+  Elem doubled;
+  elem_add(doubled, a, a);
+  elem_add(r, doubled, a);
+}
+
+static inline void elem_sqr_mont(Elem r, const Elem a) {
+  /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
+  elem_mul_mont(r, a, a);
+}
+
+static void elem_neg(Elem r, const Elem a) {
+  Limb is_zero = LIMBS_are_zero(a, FE_LIMBS);
+  Carry borrow = limbs_sub(r, Q, a, FE_LIMBS);
+  dev_assert_secret(borrow == 0);
+  (void)borrow;
+  for (size_t i = 0; i < FE_LIMBS; ++i) {
+    r[i] = constant_time_select_w(is_zero, 0, r[i]);
+  }
+}
+
+static void NIST_POINT_select_w(NIST_POINT *out,
+                                const NIST_POINT table[TBL_SZ], size_t index) {
+  Elem x; limbs_zero(x, FE_LIMBS);
+  Elem y; limbs_zero(y, FE_LIMBS);
+  Elem z; limbs_zero(z, FE_LIMBS);
+
+  // TODO: Rewrite in terms of |limbs_select|.
+  for (size_t i = 0; i < TBL_SZ; ++i) {
+    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
+    for (size_t j = 0; j < FE_LIMBS; ++j) {
+      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
+      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
+      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
+    }
+  }
+
+  limbs_copy(out->X, x, FE_LIMBS);
+  limbs_copy(out->Y, y, FE_LIMBS);
+  limbs_copy(out->Z, z, FE_LIMBS);
+}
+
+#define bits_elem_neg(prefix, bits) RENAME_FUNC(prefix, bits, elem_neg)
+#define bits_elem_sub(prefix, bits) RENAME_FUNC(prefix, bits, elem_sub)
+#define bits_elem_div_by_2(prefix, bits) RENAME_FUNC(prefix, bits, elem_div_by_2)
+#define bits_elem_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, elem_mul_mont)
+#define bits_scalar_mul_mont(prefix, bits) RENAME_FUNC(prefix, bits, scalar_mul_mont)
+
+void bits_elem_neg(p, BITS)(Elem r, const Elem a) {
+  elem_neg(r, a);
+}
+
+void bits_elem_sub(p, BITS)(Elem r, const Elem a, const Elem b) {
+  elem_sub(r, a, b);
+}
+
+void bits_elem_div_by_2(p, BITS)(Elem r, const Elem a) {
+  elem_div_by_2(r, a);
+}
+
+void bits_elem_mul_mont(p, BITS)(Elem r, const Elem a, const Elem b) {
+  elem_mul_mont(r, a, b);
+}
+
+void bits_scalar_mul_mont(p, BITS)(ScalarMont r, const ScalarMont a,
+                                   const ScalarMont b) {
+  /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
+  bn_mul_mont(r, a, b, N, N_N0, FE_LIMBS);
+}
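The long comment in elem_div_by_2 above argues that, for odd a, halving mod q is (a >> 1) + ((q + 1) >> 1) with no final reduction. The identity is easy to check exhaustively with a single-limb stand-in; a toy verification (q here is a small odd prime chosen for the test, not a curve constant):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint32_t q = 65521; /* small odd prime standing in for the field */
      for (uint32_t a = 0; a < q; ++a) {
        /* Even: plain shift. Odd: shift plus (q + 1)/2, as in elem_div_by_2. */
        uint32_t half = (a >> 1) + ((a & 1) ? (q + 1) >> 1 : 0);
        if (half >= q || (2 * half) % q != a) {
          printf("mismatch at a = %u\n", a);
          return 1;
        }
      }
      puts("ok: 2 * half == a (mod q) and half < q for every a");
      return 0;
    }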
diff --git a/crypto/fipsmodule/ec/gfp_p384.c b/crypto/fipsmodule/ec/gfp_p384.c
index 90065eaeb0..8e1e53ac1d 100644
--- a/crypto/fipsmodule/ec/gfp_p384.c
+++ b/crypto/fipsmodule/ec/gfp_p384.c
@@ -1,4 +1,5 @@
-/* Copyright 2016 Brian Smith.
+
+/* Copyright 2016-2023 Brian Smith.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -13,23 +14,22 @@
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 
 #include "../../limbs/limbs.h"
-
-#include "ecp_nistz384.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 
 #include "../../limbs/limbs.inl"
 
-/* XXX: Here we assume that the conversion from |Carry| to |Limb| is
- * constant-time, but we haven't verified that assumption. TODO: Fix it so
- * we don't need to make that assumption. */
+#define BITS 384
+
+#define P384_LIMBS (384u / LIMB_BITS)
+#define FE_LIMBS P384_LIMBS
 
-typedef Limb Elem[P384_LIMBS];
-typedef Limb ScalarMont[P384_LIMBS];
-typedef Limb Scalar[P384_LIMBS];
+typedef Limb Elem[FE_LIMBS];
+typedef Limb ScalarMont[FE_LIMBS];
+typedef Limb Scalar[FE_LIMBS];
 
-static const BN_ULONG Q[P384_LIMBS] = {
+static const Elem Q = {
 #if defined(OPENSSL_64_BIT)
   0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff,
   0xffffffffffffffff, 0xffffffffffffffff
@@ -39,7 +39,7 @@ static const BN_ULONG Q[P384_LIMBS] = {
 #endif
 };
 
-static const BN_ULONG N[P384_LIMBS] = {
+static const Elem N = {
 #if defined(OPENSSL_64_BIT)
   0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf,
   0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff
@@ -49,7 +49,7 @@ static const BN_ULONG N[P384_LIMBS] = {
 #endif
 };
 
-static const BN_ULONG ONE[P384_LIMBS] = {
+static const Elem ONE = {
 #if defined(OPENSSL_64_BIT)
   0xffffffff00000001, 0xffffffff, 1, 0, 0
 #else
@@ -83,164 +83,9 @@ static const BN_ULONG N_N0[] = {
 #define INLINE_IF_POSSIBLE inline
 #endif
 
-static inline Limb is_equal(const Elem a, const Elem b) {
-  return LIMBS_equal(a, b, P384_LIMBS);
-}
-
-static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) {
-  return LIMBS_are_zero(a, P384_LIMBS);
-}
-
-static inline void copy_conditional(Elem r, const Elem a,
-                                    const Limb condition) {
-  for (size_t i = 0; i < P384_LIMBS; ++i) {
-    r[i] = constant_time_select_w(condition, a[i], r[i]);
-  }
-}
-
-
-static inline void elem_add(Elem r, const Elem a, const Elem b) {
-  LIMBS_add_mod(r, a, b, Q, P384_LIMBS);
-}
-
-static inline void elem_sub(Elem r, const Elem a, const Elem b) {
-  LIMBS_sub_mod(r, a, b, Q, P384_LIMBS);
-}
-
-static void elem_div_by_2(Elem r, const Elem a) {
-  /* Consider the case where `a` is even. Then we can shift `a` right one bit
-   * and the result will still be valid because we didn't lose any bits and so
-   * `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
-   *
-   * The remainder of this comment is considering the case where `a` is odd.
-   *
-   * Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
-   * because the lowest bit is lost during the shift. For example, consider:
-   *
-   * ```python
-   * q = 2**384 - 2**128 - 2**96 + 2**32 - 1
-   * a = 2**383
-   * two_a = a * 2 % q
-   * assert two_a == 0x100000000ffffffffffffffff00000001
-   * ```
-   *
-   * Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
-   * we divide `two_a` by two (mod q), we need to get the value `2**383`, which
-   * we obviously can't get with just a right shift.
-   *
-   * `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
-   * `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
-   * keep track of an extra most significant bit. We can avoid that by instead
-   * calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
-   * significant bit of `a`. `q + 1` is even, which means it can be shifted
-   * without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
-   * odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
-   * `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
-   * `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
-   * bit of `a`, which is 1. Thus:
-   *
-   * sum  =  ((q + 1) >> 1) + (a >> 1)
-   * sum  =  (q + 1)/2 + (a >> 1)       (substituting (q + 1)/2)
-   *     <=  (q + 1)/2 + (q - 2 - 1)/2  (substituting a <= q - 2)
-   *     <=  (q + 1)/2 + (q - 3)/2      (simplifying)
-   *     <=  (q + 1 + q - 3)/2          (factoring out the common divisor)
-   *     <=  (2q - 2)/2                 (simplifying)
-   *     <=  q - 1                      (simplifying)
-   *
-   * Thus, no reduction of the sum mod `q` is necessary. */
-
-  Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
-
-  /* r = a >> 1. */
-  Limb carry = a[P384_LIMBS - 1] & 1;
-  r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1;
-  for (size_t i = 1; i < P384_LIMBS; ++i) {
-    Limb new_carry = a[P384_LIMBS - i - 1];
-    r[P384_LIMBS - i - 1] =
-        (a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
-    carry = new_carry;
-  }
-
-  Elem adjusted;
-  BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS);
-  dev_assert_secret(carry2 == 0);
-  (void)carry2;
-  copy_conditional(r, adjusted, is_odd);
-}
-
-static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
-  /* XXX: Not (clearly) constant-time; inefficient.*/
-  bn_mul_mont(r, a, b, Q, Q_N0, P384_LIMBS);
-}
-
-static inline void elem_mul_by_2(Elem r, const Elem a) {
-  LIMBS_shl_mod(r, a, Q, P384_LIMBS);
-}
-
-static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
-  /* XXX: inefficient. TODO: Replace with an integrated shift + add. */
-  Elem doubled;
-  elem_add(doubled, a, a);
-  elem_add(r, doubled, a);
-}
-
-static inline void elem_sqr_mont(Elem r, const Elem a) {
-  /* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
-  elem_mul_mont(r, a, a);
-}
-
-void p384_elem_sub(Elem r, const Elem a, const Elem b) {
-  elem_sub(r, a, b);
-}
-
-void p384_elem_div_by_2(Elem r, const Elem a) {
-  elem_div_by_2(r, a);
-}
-
-void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) {
-  elem_mul_mont(r, a, b);
-}
-
-void p384_elem_neg(Elem r, const Elem a) {
-  Limb is_zero = LIMBS_are_zero(a, P384_LIMBS);
-  Carry borrow = limbs_sub(r, Q, a, P384_LIMBS);
-  dev_assert_secret(borrow == 0);
-  (void)borrow;
-  for (size_t i = 0; i < P384_LIMBS; ++i) {
-    r[i] = constant_time_select_w(is_zero, 0, r[i]);
-  }
-}
-
-
-void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a,
-                          const ScalarMont b) {
-  /* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
-  bn_mul_mont(r, a, b, N, N_N0, P384_LIMBS);
-}
-
-
-/* TODO(perf): Optimize this. */
-
-static void p384_point_select_w5(P384_POINT *out,
-                                 const P384_POINT table[16], size_t index) {
-  Elem x; limbs_zero(x, P384_LIMBS);
-  Elem y; limbs_zero(y, P384_LIMBS);
-  Elem z; limbs_zero(z, P384_LIMBS);
-
-  // TODO: Rewrite in terms of |limbs_select|.
-  for (size_t i = 0; i < 16; ++i) {
-    crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
-    for (size_t j = 0; j < P384_LIMBS; ++j) {
-      x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
-      y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
-      z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
-    }
-  }
-
-  limbs_copy(out->X, x, P384_LIMBS);
-  limbs_copy(out->Y, y, P384_LIMBS);
-  limbs_copy(out->Z, z, P384_LIMBS);
-}
-
-
-#include "ecp_nistz384.inl"
+/* Window values that are Ok for P384 (look at `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are Ok for P521 (look at `ecp_nistz.h`): 4 */
+#define W_BITS 5
+
+#include "ecp_nistz.inl"
+
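Two details of the generalized point_mul are worth spelling out. First, the ROUND_SIZE/START_INDEX expression reproduces the old hard-coded start: for BITS = 384 and W_BITS = 5, ROUND_SIZE is 385 = BITS + 1, so START_INDEX is 385 - 5 = 380, exactly the old `384 - 4`. Second, each loop iteration reads W_BITS + 1 overlapping scalar bits starting one bit below the window boundary, which is what Booth recoding consumes. The read can be sketched in isolation (window_at and the sample bytes are illustrative, not patch API):

    #include <stdint.h>
    #include <stdio.h>

    #define W_BITS 5
    #define W_MASK ((1 << (W_BITS + 1)) - 1)

    /* Read W_BITS + 1 bits starting at bit (index - 1) of the little-endian
     * byte string, as the main loop of point_mul does; two bytes always
     * suffice for W_BITS <= 7. */
    static uint32_t window_at(const uint8_t *p_str, size_t index) {
      size_t off = (index - 1) / 8;
      uint32_t wvalue = (uint32_t)p_str[off] | ((uint32_t)p_str[off + 1] << 8);
      return (wvalue >> ((index - 1) % 8)) & W_MASK;
    }

    int main(void) {
      const uint8_t p_str[2] = {0x6d, 0x01}; /* scalar 0x016d, little-endian */
      /* Bits 4..9 of 0x016d are 0b010110 = 0x16. */
      printf("window at index 5 = 0x%02x\n", (unsigned)window_at(p_str, 5));
      return 0;
    }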
diff --git a/mk/generate_curves.py b/mk/generate_curves.py
index 92efecadce..fed6d96f36 100644
--- a/mk/generate_curves.py
+++ b/mk/generate_curves.py
@@ -17,7 +17,7 @@ from textwrap import wrap
 
 rs_template = """
-// Copyright 2016-2023 Brian Smith.
+// Copyright 2016-2024 Brian Smith.
 //
 // Permission to use, copy, modify, and/or distribute this software for any
 // purpose with or without fee is hereby granted, provided that the above
 // copyright notice and this permission notice appear in all copies.
@@ -215,6 +215,7 @@
         a: *const Limb, // [3][COMMON_OPS.num_limbs]
         b: *const Limb, // [3][COMMON_OPS.num_limbs]
     );
+
     fn p%(bits)s_point_mul(
         r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
         p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
@@ -345,19 +346,25 @@ def generate_rs(g, out_dir):
 
 #include "../../limbs/limbs.inl"
 
-typedef Limb Elem[P%(bits)d_LIMBS];
-typedef Limb ScalarMont[P%(bits)d_LIMBS];
-typedef Limb Scalar[P%(bits)d_LIMBS];
+#define BITS %(bits)d
+
+#define P%(bits)d_LIMBS (%(bits)du / LIMB_BITS)
+
+#define FE_LIMBS P%(bits)d_LIMBS
+
+typedef Limb Elem[FE_LIMBS];
+typedef Limb ScalarMont[FE_LIMBS];
+typedef Limb Scalar[FE_LIMBS];
 
-static const BN_ULONG Q[P%(bits)d_LIMBS] = {
+static const Elem Q = {
 %(q)s
 };
 
-static const BN_ULONG N[P%(bits)d_LIMBS] = {
+static const Elem N = {
 %(n)s
 };
 
-static const BN_ULONG ONE[P%(bits)d_LIMBS] = {
+static const Elem ONE = {
 %(q_one)s
 };
 
@@ -373,6 +380,20 @@ def generate_rs(g, out_dir):
 %(n_n0)s
 };
 
+/* XXX: MSVC for x86 warns when it fails to inline these functions it should
+ * probably inline. */
+#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
+#define INLINE_IF_POSSIBLE __forceinline
+#else
+#define INLINE_IF_POSSIBLE inline
+#endif
+
+/* Window values that are Ok for P384 (look at `ecp_nistz.h`): 2, 5, 6, 7 */
+/* Window values that are Ok for P521 (look at `ecp_nistz.h`): 4 */
+#define W_BITS %(w_bits)d
+
+#include "ecp_nistz.inl"
+
 """
 
 # Given a number |x|, return a generator of a sequence |a| such that
@@ -437,6 +458,7 @@ def generate_c(g, out_dir):
         "q_plus_1_shr_1": format_big_int(const((q + 1) >> 1), big_int_limbs(q)),
         "n" : format_big_int(const(n), big_int_limbs(q)),
         "n_n0": format_n0(n),
+        "w_bits": g["w_bits"],
     }
 
     out_path = os.path.join(out_dir, "gfp_%s.c" % name)
@@ -476,6 +498,7 @@ def generate(g, out_dir):
     "Gx": 0x6b17d1f2_e12c4247_f8bce6e5_63a440f2_77037d81_2deb33a0_f4a13945_d898c296,
     "Gy": 0x4fe342e2_fe1a7f9b_8ee7eb4a_7c0f9e16_2bce3357_6b315ece_cbb64068_37bf51f5,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p384 = {
@@ -488,6 +511,7 @@
     "Gx": 0xaa87ca22_be8b0537_8eb1c71e_f320ad74_6e1d3b62_8ba79b98_59f741e0_82542a38_5502f25d_bf55296c_3a545e38_72760ab7,
     "Gy": 0x3617de4a_96262c6f_5d9e98bf_9292dc29_f8f41dbd_289a147c_e9da3113_b5f0b8c0_0a60b1ce_1d7e819d_7a431d7c_90ea0e5f,
     "cofactor": 1,
+    "w_bits": 5,
 }
 
 p521 = {
@@ -500,6 +524,7 @@
     "Gx": 0xc6_858e06b7_0404e9cd_9e3ecb66_2395b442_9c648139_053fb521_f828af60_6b4d3dba_a14b5e77_efe75928_fe1dc127_a2ffa8de_3348b3c1_856a429b_f97e7e31_c2e5bd66,
     "Gy": 0x118_39296a78_9a3bc004_5c8a5fb4_2c7d1bd9_98f54449_579b4468_17afbd17_273e662c_97ee7299_5ef42640_c550b901_3fad0761_353c7086_a272c240_88be9476_9fd16650,
     "cofactor": 1,
+    "w_bits": 4,
 }
 
 import os
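Finally, note what did not change: table lookups stay constant-time. NIST_POINT_select_w (like the p384_point_select_w5 it replaces) scans every table entry and mask-merges the one whose 1-based position matches, so the memory-access pattern is independent of the secret Booth digit. The pattern in miniature, with single-limb "points" and a hand-rolled equality mask standing in for constant_time_eq_w (all names here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t Limb;

    /* All-ones when a == b, all-zeros otherwise, with no branches. */
    static Limb ct_eq(Limb a, Limb b) {
      Limb diff = a ^ b;
      return ((diff | (0 - diff)) >> 63) - 1;
    }

    /* Select table[index - 1]; index 0 selects nothing, mirroring the
     * offset-by-one layout where the point at infinity is never stored. */
    static Limb select_w(const Limb table[4], Limb index) {
      Limb acc = 0;
      for (Limb i = 0; i < 4; ++i) {
        acc |= table[i] & ct_eq(index, i + 1);
      }
      return acc;
    }

    int main(void) {
      const Limb table[4] = {10, 20, 30, 40};
      printf("%llu %llu\n",
             (unsigned long long)select_w(table, 3),   /* 30 */
             (unsigned long long)select_w(table, 0));  /* 0 (infinity) */
      return 0;
    }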