diff --git a/Cargo.lock b/Cargo.lock
index 8bdfe0138..1db6a343e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -47,6 +47,12 @@ version = "1.0.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
 
+[[package]]
+name = "cfg-if"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+
 [[package]]
 name = "crypto-mac"
 version = "0.8.0"
@@ -213,6 +219,7 @@ name = "sha-1"
 version = "0.9.0"
 dependencies = [
  "block-buffer",
+ "cfg-if",
  "digest",
  "hex-literal",
  "libc",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index aba0700d6..985ef5579 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -20,6 +20,7 @@ block-buffer = "0.9"
 opaque-debug = "0.2"
 sha1-asm = { version = "0.4", optional = true }
 libc = { version = "0.2.68", optional = true }
+cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
diff --git a/sha1/src/aarch64.rs b/sha1/src/aarch64.rs
deleted file mode 100644
index 8d1a916cc..000000000
--- a/sha1/src/aarch64.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
-
-#[inline(always)]
-pub fn sha1_supported() -> bool {
-    #[allow(unsafe_code)]
-    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
-    (hwcaps & HWCAP_SHA1) != 0
-}
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
new file mode 100644
index 000000000..b595199ba
--- /dev/null
+++ b/sha1/src/compress.rs
@@ -0,0 +1,32 @@
+use digest::consts::U64;
+use digest::generic_array::GenericArray;
+
+mod aarch64;
+#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
+mod soft;
+mod x86;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "asm-aarch64")] {
+        use aarch64::compress as compress_inner;
+    } else if #[cfg(feature = "asm")] {
+        // TODO: replace after sha1-asm rework
+        fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha1_asm::compress(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        use x86::compress as compress_inner;
+    } else {
+        use soft::compress as compress_inner;
+    }
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[GenericArray<u8, U64>]) {
+    // SAFETY: GenericArray<u8, U64> and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+    compress_inner(state, blocks);
+}
diff --git a/sha1/src/compress/aarch64.rs b/sha1/src/compress/aarch64.rs
new file mode 100644
index 000000000..85295f052
--- /dev/null
+++ b/sha1/src/compress/aarch64.rs
@@ -0,0 +1,21 @@
+#![cfg(feature = "asm-aarch64")]
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
+
+fn sha1_supported() -> bool {
+    #[allow(unsafe_code)]
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA1) != 0
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
+    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
+    // to let us use it on no_std.
+    if sha1_supported() {
+        for block in blocks {
+            sha1_asm::compress(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/utils.rs b/sha1/src/compress/soft.rs
similarity index 66%
rename from sha1/src/utils.rs
rename to sha1/src/compress/soft.rs
index 1d746fb9f..94a019b98 100644
--- a/sha1/src/utils.rs
+++ b/sha1/src/compress/soft.rs
@@ -1,10 +1,6 @@
 #![allow(clippy::many_single_char_names)]
 use crate::consts::{BLOCK_LEN, K0, K1, K2, K3};
 use core::convert::TryInto;
-use digest::generic_array::typenum::U64;
-use digest::generic_array::GenericArray;
-
-type Block = GenericArray<u8, U64>;
 
 #[inline(always)]
 fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
@@ -21,27 +17,18 @@ fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
 }
 
-/// Not an intrinsic, but gets the first element of a vector.
-#[inline]
-pub fn sha1_first(w0: [u32; 4]) -> u32 {
-    w0[0]
-}
-
-/// Not an intrinsic, but adds a word to the first element of a vector.
 #[inline]
 pub fn sha1_first_add(e: u32, w0: [u32; 4]) -> [u32; 4] {
     let [a, b, c, d] = w0;
     [e.wrapping_add(a), b, c, d]
 }
 
-/// Emulates `llvm.x86.sha1msg1` intrinsic.
 fn sha1msg1(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [_, _, w2, w3] = a;
     let [w4, w5, _, _] = b;
     [a[0] ^ w2, a[1] ^ w3, a[2] ^ w4, a[3] ^ w5]
 }
 
-/// Emulates `llvm.x86.sha1msg2` intrinsic.
 fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [x0, x1, x2, x3] = a;
     let [_, w13, w14, w15] = b;
@@ -54,21 +41,11 @@ fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [w16, w17, w18, w19]
 }
 
-/// Performs 4 rounds of the message schedule update.
-/*
-pub fn sha1_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha1msg2(sha1msg1(v0, v1) ^ v2, v3)
-}
-*/
-
-/// Emulates `llvm.x86.sha1nexte` intrinsic.
 #[inline]
 fn sha1_first_half(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
-    sha1_first_add(sha1_first(abcd).rotate_left(30), msg)
+    sha1_first_add(abcd[0].rotate_left(30), msg)
 }
 
-/// Emulates `llvm.x86.sha1rnds4` intrinsic.
-/// Performs 4 rounds of the message block digest.
 fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] {
     const K0V: [u32; 4] = [K0, K0, K0, K0];
     const K1V: [u32; 4] = [K1, K1, K1, K1];
@@ -84,7 +61,6 @@ fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] {
     }
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
 fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -123,7 +99,6 @@ fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
 fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -162,7 +137,6 @@ fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -201,7 +175,7 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Process a block with the SHA-1 algorithm.
+#[inline(always)]
 fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     macro_rules! schedule {
         ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
@@ -216,7 +190,6 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     }
 
     // Rounds 0..20
-    // TODO: replace with `[u32; 4]::load`
     let mut h0 = [state[0], state[1], state[2], state[3]];
     let mut w0 = [block[0], block[1], block[2], block[3]];
     let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0);
@@ -265,7 +238,7 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     w4 = schedule!(w0, w1, w2, w3);
     h0 = rounds4!(h1, h0, w4, 3);
 
-    let e = sha1_first(h1).rotate_left(30);
+    let e = h1[0].rotate_left(30);
     let [a, b, c, d] = h0;
 
     state[0] = state[0].wrapping_add(a);
@@ -275,58 +248,16 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     state[4] = state[4].wrapping_add(e);
 }
 
-/// Process a block with the SHA-1 algorithm. (See more...)
-///
-/// SHA-1 is a cryptographic hash function, and as such, it operates
-/// on an arbitrary number of bytes. This function operates on a fixed
-/// number of bytes. If you call this function with anything other than
-/// 64 bytes, then it will panic! This function takes two arguments:
-///
-/// * `state` is reference to an **array** of 5 words.
-/// * `block` is reference to a **slice** of 64 bytes.
-///
-/// If you want the function that performs a message digest on an arbitrary
-/// number of bytes, then see also the `Sha1` struct above.
-///
-/// # Implementation
-///
-/// First, some background. Both ARM and Intel are releasing documentation
-/// that they plan to include instruction set extensions for SHA1 and SHA256
-/// sometime in the near future. Second, LLVM won't lower these intrinsics yet,
-/// so these functions were written emulate these instructions. Finally,
-/// the block function implemented with these emulated intrinsics turned out
-/// to be quite fast! What follows is a discussion of this CPU-level view
-/// of the SHA-1 algorithm and how it relates to the mathematical definition.
-///
-/// The SHA instruction set extensions can be divided up into two categories:
-///
-/// * message work schedule update calculation ("schedule" v., "work" n.)
-/// * message block 80-round digest calculation ("digest" v., "block" n.)
-///
-/// The schedule-related functions can be used to easily perform 4 rounds
-/// of the message work schedule update calculation, as shown below:
-///
-/// ```ignore
-/// macro_rules! schedule_x4 {
-///     ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => (
-///         sha1msg2(sha1msg1($v0, $v1) ^ $v2, $v3)
-///     )
-/// }
-///
-/// macro_rules! round_x4 {
-///     ($h0:ident, $h1:ident, $wk:expr, $i:expr) => (
-///         sha1rnds4($h0, sha1_first_half($h1, $wk), $i)
-///     )
-/// }
-/// ```
-///
-/// and also shown above is how the digest-related functions can be used to
-/// perform 4 rounds of the message block digest calculation.
-///
-pub fn compress(state: &mut [u32; 5], block: &Block) {
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
     let mut block_u32 = [0u32; BLOCK_LEN];
-    for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
-        *o = u32::from_be_bytes(chunk.try_into().unwrap());
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks.iter() {
+        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+            *o = u32::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha1_digest_block_u32(&mut state_cpy, &block_u32);
     }
-    sha1_digest_block_u32(state, &block_u32);
+    *state = state_cpy;
 }
diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs
new file mode 100644
index 000000000..fd8eebfaf
--- /dev/null
+++ b/sha1/src/compress/x86.rs
@@ -0,0 +1,132 @@
+#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#![allow(unsafe_code)]
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+#[cfg(not(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+)))]
+fn sha1_supported() -> bool {
+    false
+}
+
+#[cfg(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+))]
+fn sha1_supported() -> bool {
+    true
+}
+
+macro_rules! rounds4 {
+    ($h0:ident, $h1:ident, $wk:expr, $i:expr) => {
+        _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i)
+    };
+}
+
+macro_rules! schedule {
+    ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
+        _mm_sha1msg2_epu32(_mm_xor_si128(_mm_sha1msg1_epu32($v0, $v1), $v2), $v3)
+    };
+}
+
+#[target_feature(enable = "sha,ssse3,sse4.1")]
+unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    #[allow(non_snake_case)]
+    let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F);
+
+    let mut state_abcd = _mm_set_epi32(
+        state[0] as i32,
+        state[1] as i32,
+        state[2] as i32,
+        state[3] as i32,
+    );
+    let mut state_e = _mm_set_epi32(state[4] as i32, 0, 0, 0);
+
+    for block in blocks {
+        // SAFETY: we use only unaligned loads with this pointer
+        #[allow(clippy::cast_ptr_alignment)]
+        let block_ptr = block.as_ptr() as *const __m128i;
+
+        let h0 = state_abcd;
+        let e0 = state_e;
+
+        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK);
+        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK);
+        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK);
+        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK);
+
+        // Rounds 0..20
+        let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0);
+        let mut h0 = rounds4!(h1, h0, w1, 0);
+        h1 = rounds4!(h0, h1, w2, 0);
+        h0 = rounds4!(h1, h0, w3, 0);
+        let mut w4 = schedule!(w0, w1, w2, w3);
+        h1 = rounds4!(h0, h1, w4, 0);
+
+        // Rounds 20..40
+        w0 = schedule!(w1, w2, w3, w4);
+        h0 = rounds4!(h1, h0, w0, 1);
+        w1 = schedule!(w2, w3, w4, w0);
+        h1 = rounds4!(h0, h1, w1, 1);
+        w2 = schedule!(w3, w4, w0, w1);
+        h0 = rounds4!(h1, h0, w2, 1);
+        w3 = schedule!(w4, w0, w1, w2);
+        h1 = rounds4!(h0, h1, w3, 1);
+        w4 = schedule!(w0, w1, w2, w3);
+        h0 = rounds4!(h1, h0, w4, 1);
+
+        // Rounds 40..60
+        w0 = schedule!(w1, w2, w3, w4);
+        h1 = rounds4!(h0, h1, w0, 2);
+        w1 = schedule!(w2, w3, w4, w0);
+        h0 = rounds4!(h1, h0, w1, 2);
+        w2 = schedule!(w3, w4, w0, w1);
+        h1 = rounds4!(h0, h1, w2, 2);
+        w3 = schedule!(w4, w0, w1, w2);
+        h0 = rounds4!(h1, h0, w3, 2);
+        w4 = schedule!(w0, w1, w2, w3);
+        h1 = rounds4!(h0, h1, w4, 2);
+
+        // Rounds 60..80
+        w0 = schedule!(w1, w2, w3, w4);
+        h0 = rounds4!(h1, h0, w0, 3);
+        w1 = schedule!(w2, w3, w4, w0);
+        h1 = rounds4!(h0, h1, w1, 3);
+        w2 = schedule!(w3, w4, w0, w1);
+        h0 = rounds4!(h1, h0, w2, 3);
+        w3 = schedule!(w4, w0, w1, w2);
+        h1 = rounds4!(h0, h1, w3, 3);
+        w4 = schedule!(w0, w1, w2, w3);
+        h0 = rounds4!(h1, h0, w4, 3);
+
+        state_abcd = _mm_add_epi32(state_abcd, h0);
+        state_e = _mm_sha1nexte_epu32(h1, state_e);
+    }
+
+    state[0] = _mm_extract_epi32(state_abcd, 3) as u32;
+    state[1] = _mm_extract_epi32(state_abcd, 2) as u32;
+    state[2] = _mm_extract_epi32(state_abcd, 1) as u32;
+    state[3] = _mm_extract_epi32(state_abcd, 0) as u32;
+    state[4] = _mm_extract_epi32(state_e, 3) as u32;
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha1_supported() {
+        unsafe {
+            digest_blocks(state, blocks);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index abe0bb492..da93d5549 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -54,33 +54,20 @@ compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when build
 ))]
 compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU.");
 
-#[macro_use]
-extern crate opaque_debug;
-#[cfg(feature = "asm")]
-extern crate sha1_asm;
 #[cfg(feature = "std")]
 extern crate std;
 
-#[cfg(feature = "asm-aarch64")]
-mod aarch64;
+mod compress;
 mod consts;
-#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
-mod utils;
-
-pub use digest::{self, Digest};
 
+use crate::compress::compress;
 use crate::consts::{H, STATE_LEN};
 use block_buffer::BlockBuffer;
 use digest::consts::{U20, U64};
 use digest::impl_write;
+pub use digest::{self, Digest};
 use digest::{BlockInput, FixedOutputDirty, Reset, Update};
 
-#[cfg(not(feature = "asm"))]
-use crate::utils::compress;
-
-#[cfg(feature = "asm")]
-use digest::generic_array::GenericArray;
-
 /// Structure representing the state of a SHA-1 computation
 #[derive(Clone)]
 pub struct Sha1 {
@@ -109,7 +96,7 @@ impl Update for Sha1 {
         // Assumes that `length_bits<<3` will not overflow
         self.len += input.len() as u64;
         let state = &mut self.h;
-        self.buffer.input_block(input, |d| compress(state, d));
+        self.buffer.input_blocks(input, |d| compress(state, d));
     }
 }
 
@@ -119,7 +106,8 @@ impl FixedOutputDirty for Sha1 {
     fn finalize_into_dirty(&mut self, out: &mut digest::Output<Self>) {
         let s = &mut self.h;
         let l = self.len << 3;
-        self.buffer.len64_padding_be(l, |d| compress(s, d));
+        self.buffer
+            .len64_padding_be(l, |d| compress(s, core::slice::from_ref(d)));
         for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) {
             chunk.copy_from_slice(&v.to_be_bytes());
         }
@@ -134,28 +122,5 @@ impl Reset for Sha1 {
     }
 }
 
-#[cfg(all(feature = "asm", not(feature = "asm-aarch64")))]
-#[inline(always)]
-fn compress(state: &mut [u32; 5], block: &GenericArray<u8, U64>) {
-    #[allow(unsafe_code)]
-    let block: &[u8; 64] = unsafe { core::mem::transmute(block) };
-    sha1_asm::compress(state, block);
-}
-
-#[cfg(feature = "asm-aarch64")]
-#[inline(always)]
-fn compress(state: &mut [u32; 5], block: &GenericArray<u8, U64>) {
-    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
-    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
-    // to let us use it on no_std.
-    if aarch64::sha1_supported() {
-        #[allow(unsafe_code)]
-        let block: &[u8; 64] = unsafe { core::mem::transmute(block) };
-        sha1_asm::compress(state, block);
-    } else {
-        utils::compress(state, block);
-    }
-}
-
-impl_opaque_debug!(Sha1);
+opaque_debug::impl_opaque_debug!(Sha1);
 impl_write!(Sha1);
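
For reference, a minimal usage sketch of the refactored crate (not part of the diff). It goes through the public `Digest` API from digest 0.9; the dispatch between the `soft`, `x86`, and `aarch64` back ends happens inside the private `compress` module and is invisible to callers. The input string and the length check are illustrative only.

```rust
use sha1::{Digest, Sha1};

fn main() {
    // Incremental hashing through the `Digest`/`Update` traits re-exported by the crate.
    let mut hasher = Sha1::new();
    hasher.update(b"hello world");

    // `finalize` drives the buffered blocks through `compress`, which selects the
    // soft, x86 (SHA-NI), or aarch64 implementation depending on features and target.
    let digest = hasher.finalize();
    assert_eq!(digest.len(), 20); // SHA-1 always yields a 160-bit digest

    for byte in digest.iter() {
        print!("{:02x}", byte);
    }
    println!();
}
```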