diff --git a/src/aead/gcm.rs b/src/aead/gcm.rs
index f1a2ef296a..27716e4ba8 100644
--- a/src/aead/gcm.rs
+++ b/src/aead/gcm.rs
@@ -12,17 +12,25 @@
 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
+use self::ffi::{Block, BLOCK_LEN, ZERO_BLOCK};
 use super::{aes_gcm, Aad};
 use crate::{
     bits::{BitLength, FromByteLen as _},
-    constant_time, cpu, error,
+    cpu, error,
     polyfill::{sliceutil::overwrite_at_start, ArraySplitMap as _},
 };
-use core::ops::BitXorAssign;
+use cfg_if::cfg_if;
 
-// GCM uses the same block type as AES.
-use super::aes::{Block, BLOCK_LEN, ZERO_BLOCK};
+cfg_if! {
+    if #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))] {
+        pub(super) use self::ffi::{HTable, Xi};
+    } else {
+        use self::ffi::{HTable, Xi};
+    }
+}
 
+#[macro_use]
+mod ffi;
 mod gcm_nohw;
 
 #[derive(Clone)]
@@ -33,118 +41,21 @@ pub struct Key {
 impl Key {
     pub(super) fn new(h_be: Block, cpu_features: cpu::Features) -> Self {
         let h: [u64; 2] = h_be.array_split_map(u64::from_be_bytes);
-
-        let mut key = Self {
-            h_table: HTable {
-                Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN],
-            },
-        };
-        let h_table = &mut key.h_table;
-
-        match detect_implementation(cpu_features) {
+        let h_table = match detect_implementation(cpu_features) {
             #[cfg(target_arch = "x86_64")]
-            Implementation::CLMUL if has_avx_movbe(cpu_features) => {
-                prefixed_extern! {
-                    fn gcm_init_avx(HTable: &mut HTable, h: &[u64; 2]);
-                }
-                unsafe {
-                    gcm_init_avx(h_table, &h);
-                }
-            }
+            Implementation::CLMUL if has_avx_movbe(cpu_features) => unsafe {
+                htable_new!(gcm_init_avx, &h, cpu_features)
+            },
 
             #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
-            Implementation::CLMUL => {
-                prefixed_extern! {
-                    fn gcm_init_clmul(Htable: &mut HTable, h: &[u64; 2]);
-                }
-                unsafe {
-                    gcm_init_clmul(h_table, &h);
-                }
-            }
+            Implementation::CLMUL => unsafe { htable_new!(gcm_init_clmul, &h, cpu_features) },
 
             #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
-            Implementation::NEON => {
-                prefixed_extern! {
-                    fn gcm_init_neon(Htable: &mut HTable, h: &[u64; 2]);
-                }
-                unsafe {
-                    gcm_init_neon(h_table, &h);
-                }
-            }
-
-            Implementation::Fallback => {
-                h_table.Htable[0] = gcm_nohw::init(h);
-            }
-        }
-
-        key
-    }
-}
-
-/// SAFETY:
-///  * The function `$name` must meet the contract of the `f` paramweter of
-///    `ghash()`.
-#[cfg(any(
-    target_arch = "aarch64",
-    target_arch = "arm",
-    target_arch = "x86",
-    target_arch = "x86_64"
-))]
-macro_rules! ghash {
-    ( $name:ident, $xi:expr, $h_table:expr, $input:expr, $cpu_features:expr ) => {{
-        prefixed_extern! {
-            fn $name(
-                xi: &mut Xi,
-                Htable: &HTable,
-                inp: *const u8,
-                len: crate::c::NonZero_size_t,
-            );
-        }
-        ghash($name, $xi, $h_table, $input, $cpu_features);
-    }};
-}
-
-/// SAFETY:
-///   * `f` must read `len` bytes from `inp`; it may assume
-///     that `len` is a (non-zero) multiple of `BLOCK_LEN`.
-///   * `f` may inspect CPU features.
-#[cfg(any(
-    target_arch = "aarch64",
-    target_arch = "arm",
-    target_arch = "x86",
-    target_arch = "x86_64"
-))]
-unsafe fn ghash(
-    f: unsafe extern "C" fn(
-        xi: &mut Xi,
-        Htable: &HTable,
-        inp: *const u8,
-        len: crate::c::NonZero_size_t,
-    ),
-    xi: &mut Xi,
-    h_table: &HTable,
-    input: &[[u8; BLOCK_LEN]],
-    cpu_features: cpu::Features,
-) {
-    use crate::polyfill::slice;
-    use core::num::NonZeroUsize;
-
-    let input = slice::flatten(input);
+            Implementation::NEON => unsafe { htable_new!(gcm_init_neon, &h, cpu_features) },
 
-    let input_len = match NonZeroUsize::new(input.len()) {
-        Some(len) => len,
-        None => {
-            return;
-        }
-    };
-
-    let _: cpu::Features = cpu_features;
-    // SAFETY:
-    //  * There are `input_len: NonZeroUsize` bytes available at `input` for
-    //    `f` to read.
-    //  * CPU feature detection has been done.
-    unsafe {
-        f(xi, h_table, input.as_ptr(), input_len);
+            Implementation::Fallback => HTable::new_single_entry(gcm_nohw::init(h)),
+        };
+        Self { h_table }
     }
 }
 
@@ -209,7 +120,7 @@ impl<'key> Context<'key> {
 
     pub fn update_blocks(&mut self, input: &[[u8; BLOCK_LEN]]) {
         let xi = &mut self.Xi;
-        let h_table = &self.h_table;
+        let h_table = self.h_table;
 
         match detect_implementation(self.cpu_features) {
             #[cfg(target_arch = "x86_64")]
@@ -240,7 +151,7 @@ impl<'key> Context<'key> {
             },
 
             Implementation::Fallback => {
-                gcm_nohw::ghash(xi, h_table.Htable[0], input);
+                gcm_nohw::ghash(xi, h_table.first_entry(), input);
             }
         }
     }
@@ -257,31 +168,21 @@ impl<'key> Context<'key> {
         self.Xi.bitxor_assign(a);
 
         let xi = &mut self.Xi;
-        let h_table = &self.h_table;
+        let h_table = self.h_table;
 
         match detect_implementation(self.cpu_features) {
             #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
-            Implementation::CLMUL => {
-                prefixed_extern! {
-                    fn gcm_gmult_clmul(xi: &mut Xi, Htable: &HTable);
-                }
-                unsafe {
-                    gcm_gmult_clmul(xi, h_table);
-                }
-            }
+            Implementation::CLMUL => unsafe {
+                gmult!(gcm_gmult_clmul, xi, h_table, self.cpu_features)
+            },
 
             #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
-            Implementation::NEON => {
-                prefixed_extern! {
-                    fn gcm_gmult_neon(xi: &mut Xi, Htable: &HTable);
-                }
-                unsafe {
-                    gcm_gmult_neon(xi, h_table);
-                }
-            }
+            Implementation::NEON => unsafe {
+                gmult!(gcm_gmult_neon, xi, h_table, self.cpu_features)
+            },
 
             Implementation::Fallback => {
-                gcm_nohw::gmult(xi, h_table.Htable[0]);
+                gcm_nohw::gmult(xi, h_table.first_entry());
             }
         }
     }
@@ -295,7 +196,7 @@ impl<'key> Context<'key> {
         alen.copy_from_slice(&BitLength::<u64>::to_be_bytes(self.aad_len));
         clen.copy_from_slice(&BitLength::<u64>::to_be_bytes(self.in_out_len));
         self.update_block(block);
-        f(self.Xi.0, self.cpu_features)
+        f(self.Xi.into_block(), self.cpu_features)
     }
 
     #[cfg(target_arch = "x86_64")]
@@ -315,32 +216,6 @@ impl<'key> Context<'key> {
     }
 }
 
-// The alignment is required by some assembly code.
-#[derive(Clone)]
-#[repr(C, align(16))]
-pub(super) struct HTable {
-    Htable: [U128; HTABLE_LEN],
-}
-
-#[derive(Clone, Copy)]
-#[repr(C)]
-struct U128 {
-    hi: u64,
-    lo: u64,
-}
-
-const HTABLE_LEN: usize = 16;
-
-#[repr(transparent)]
-pub struct Xi(Block);
-
-impl BitXorAssign<Block> for Xi {
-    #[inline]
-    fn bitxor_assign(&mut self, a: Block) {
-        self.0 = constant_time::xor_16(self.0, a)
-    }
-}
-
 #[allow(clippy::upper_case_acronyms)]
 enum Implementation {
     #[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))]
diff --git a/src/aead/gcm/ffi.rs b/src/aead/gcm/ffi.rs
new file mode 100644
index 0000000000..41f679f068
--- /dev/null
+++ b/src/aead/gcm/ffi.rs
@@ -0,0 +1,186 @@
+// Copyright 2018 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use crate::constant_time;
+
+pub(in super::super) const BLOCK_LEN: usize = 16; // Length of a `Block`, in bytes.
+pub(in super::super) type Block = [u8; BLOCK_LEN];
+pub(super) const ZERO_BLOCK: Block = [0u8; BLOCK_LEN]; // A `Block` with every byte zero.
+
+#[cfg(any(
+    target_arch = "aarch64",
+    target_arch = "arm",
+    target_arch = "x86",
+    target_arch = "x86_64"
+))]
+macro_rules! htable_new {
+    ( $name:ident, $input:expr, $cpu_features:expr ) => {{
+        use crate::aead::gcm::ffi::HTable;
+        prefixed_extern! {
+            fn $name(HTable: &mut HTable, h: &[u64; 2]); // Declared here; called by `HTable::new`.
+        }
+        HTable::new($name, $input) // NOTE: `$cpu_features` is not used in this expansion.
+    }};
+}
+
+#[cfg(any(
+    target_arch = "aarch64",
+    target_arch = "arm",
+    target_arch = "x86",
+    target_arch = "x86_64"
+))]
+macro_rules! gmult {
+    ( $name:ident, $xi:expr, $h_table:expr, $cpu_features:expr ) => {{
+        use crate::aead::gcm::ffi::{HTable, Xi};
+        prefixed_extern! {
+            fn $name(xi: &mut Xi, Htable: &HTable); // Declared here; called by `HTable::gmult`.
+        }
+        $h_table.gmult($name, $xi, $cpu_features) // Calls an `unsafe fn`; expand inside `unsafe`.
+    }};
+}
+
+/// SAFETY:
+///  * The function `$name` must meet the contract of the `f` parameter of
+///    `HTable::ghash()`.
+#[cfg(any(
+    target_arch = "aarch64",
+    target_arch = "arm",
+    target_arch = "x86",
+    target_arch = "x86_64"
+))]
+macro_rules! ghash {
+    ( $name:ident, $xi:expr, $h_table:expr, $input:expr, $cpu_features:expr ) => {{
+        use crate::aead::gcm::ffi::{HTable, Xi};
+        prefixed_extern! {
+            fn $name(
+                xi: &mut Xi,
+                Htable: &HTable,
+                inp: *const u8,
+                len: crate::c::NonZero_size_t,
+            );
+        }
+        $h_table.ghash($name, $xi, $input, $cpu_features)
+    }};
+}
+
+/// SAFETY (contract for the `f` argument of `ghash`):
+///   * `f` must read `len` bytes from `inp`; it may assume
+///     that `len` is a (non-zero) multiple of `BLOCK_LEN`.
+///   * `f` may inspect CPU features.
+#[cfg(any(
+    target_arch = "aarch64",
+    target_arch = "arm",
+    target_arch = "x86",
+    target_arch = "x86_64"
+))]
+impl HTable {
+    pub(super) unsafe fn new(
+        init: unsafe extern "C" fn(HTable: &mut HTable, h: &[u64; 2]),
+        value: &[u64; 2],
+    ) -> Self {
+        let mut r = Self {
+            Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN],
+        };
+        unsafe { init(&mut r, value) };
+        r
+    }
+
+    pub(super) unsafe fn gmult(
+        &self,
+        f: unsafe extern "C" fn(xi: &mut Xi, h_table: &HTable),
+        xi: &mut Xi,
+        _cpu_features: crate::cpu::Features,
+    ) {
+        unsafe { f(xi, self) }
+    }
+
+    pub(super) unsafe fn ghash(
+        &self,
+        f: unsafe extern "C" fn(
+            xi: &mut Xi,
+            Htable: &HTable,
+            inp: *const u8,
+            len: crate::c::NonZero_size_t,
+        ),
+        xi: &mut Xi,
+        input: &[[u8; BLOCK_LEN]],
+        cpu_features: crate::cpu::Features,
+    ) {
+        use crate::polyfill::slice;
+        use core::num::NonZeroUsize;
+
+        let input = slice::flatten(input);
+
+        let input_len = match NonZeroUsize::new(input.len()) {
+            Some(len) => len,
+            None => {
+                return;
+            }
+        };
+
+        let _: crate::cpu::Features = cpu_features;
+        // SAFETY:
+        //  * There are `input_len: NonZeroUsize` bytes available at `input` for
+        //    `f` to read.
+        //  * CPU feature detection has been done.
+        unsafe {
+            f(xi, self, input.as_ptr(), input_len);
+        }
+    }
+}
+
+impl HTable {
+    pub(super) fn new_single_entry(first_entry: U128) -> Self {
+        let mut r = Self {
+            Htable: [U128 { hi: 0, lo: 0 }; HTABLE_LEN],
+        };
+        r.Htable[0] = first_entry; // Every other entry stays zeroed.
+        r
+    }
+
+    pub(super) fn first_entry(&self) -> U128 {
+        self.Htable[0] // Must read the same slot that `new_single_entry` writes.
+    }
+}
+
+// The alignment is required by some assembly code.
+#[derive(Clone)]
+#[repr(C, align(16))]
+pub(in super::super) struct HTable {
+    Htable: [U128; HTABLE_LEN], // `new_single_entry` populates only entry 0.
+}
+
+#[derive(Clone, Copy)]
+#[repr(C)]
+pub(super) struct U128 { // One `HTable` entry, stored as two 64-bit halves.
+    pub(super) hi: u64,
+    pub(super) lo: u64,
+}
+
+const HTABLE_LEN: usize = 16; // Number of `U128` entries in an `HTable`.
+
+#[repr(transparent)] // Single-field wrapper around `Block`; passed as `&mut Xi` to the FFI.
+pub(in super::super) struct Xi(pub(super) Block);
+
+impl Xi {
+    #[inline]
+    pub(super) fn bitxor_assign(&mut self, a: Block) {
+        self.0 = constant_time::xor_16(self.0, a) // Stores `xor_16(self.0, a)` back in place.
+    }
+
+    #[inline]
+    pub(super) fn into_block(self) -> Block { // Consumes `self`, returning the inner block.
+        self.0
+    }
+}
diff --git a/src/aead/gcm/gcm_nohw.rs b/src/aead/gcm/gcm_nohw.rs
index edc8a04769..77ca08e056 100644
--- a/src/aead/gcm/gcm_nohw.rs
+++ b/src/aead/gcm/gcm_nohw.rs
@@ -22,7 +22,7 @@
 //
 // Unlike the BearSSL notes, we use u128 in the 64-bit implementation.
 
-use super::{Xi, BLOCK_LEN};
+use super::{ffi::U128, Xi, BLOCK_LEN};
 use crate::polyfill::ArraySplitMap as _;
 
 #[cfg(target_pointer_width = "64")]
@@ -138,7 +138,7 @@ fn gcm_mul64_nohw(a: u64, b: u64) -> (u64, u64) {
     (lo ^ (mid << 32), hi ^ (mid >> 32))
 }
 
-pub(super) fn init(xi: [u64; 2]) -> super::U128 {
+pub(super) fn init(xi: [u64; 2]) -> U128 {
     // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
     // avoids a shift by 1 in the multiplication, needed to account for bit
     // reversal losing a bit after multiplication, that is,
@@ -165,10 +165,10 @@ pub(super) fn init(xi: [u64; 2]) -> super::U128 {
     hi ^= carry & 0xc200000000000000;
 
     // This implementation does not use the rest of |Htable|.
-    super::U128 { hi, lo }
+    U128 { hi, lo }
 }
 
-fn gcm_polyval_nohw(xi: &mut [u64; 2], h: super::U128) {
+fn gcm_polyval_nohw(xi: &mut [u64; 2], h: U128) {
     // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
     // through |r3|. Note there is no byte or bit reversal because we are
     // evaluating POLYVAL.
@@ -217,13 +217,13 @@ fn gcm_polyval_nohw(xi: &mut [u64; 2], h: super::U128) {
     *xi = [r2, r3];
 }
 
-pub(super) fn gmult(xi: &mut Xi, h: super::U128) {
+pub(super) fn gmult(xi: &mut Xi, h: U128) {
     with_swapped_xi(xi, |swapped| {
         gcm_polyval_nohw(swapped, h);
     })
 }
 
-pub(super) fn ghash(xi: &mut Xi, h: super::U128, input: &[[u8; BLOCK_LEN]]) {
+pub(super) fn ghash(xi: &mut Xi, h: U128, input: &[[u8; BLOCK_LEN]]) {
     with_swapped_xi(xi, |swapped| {
         input.iter().for_each(|&input| {
             let input = input.array_split_map(u64::from_be_bytes);