From 6e920452c53fb5310de976666c77e6949af5d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 11:21:07 +0300
Subject: [PATCH 01/14] sha1: add x86 intrinsics support

---
 Cargo.lock | 7 ++
 sha1/Cargo.toml | 1 +
 sha1/src/aarch64.rs | 8 --
 sha1/src/compress.rs | 34 +++++++
 sha1/src/compress/aarch64.rs | 21 ++++
 sha1/src/{utils.rs => compress/soft.rs} | 89 ++---------------
 sha1/src/compress/x86.rs | 125 ++++++++++++++++++++++++
 sha1/src/lib.rs | 49 ++--------
 sha1/tests/lib.rs | 10 ++
 9 files changed, 213 insertions(+), 131 deletions(-)
 delete mode 100644 sha1/src/aarch64.rs
 create mode 100644 sha1/src/compress.rs
 create mode 100644 sha1/src/compress/aarch64.rs
 rename sha1/src/{utils.rs => compress/soft.rs} (67%)
 create mode 100644 sha1/src/compress/x86.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8bdfe0138..1db6a343e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -47,6 +47,12 @@ version = "1.0.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
 
+[[package]]
+name = "cfg-if"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+
 [[package]]
 name = "crypto-mac"
 version = "0.8.0"
@@ -213,6 +219,7 @@ name = "sha-1"
 version = "0.9.0"
 dependencies = [
  "block-buffer",
+ "cfg-if",
  "digest",
  "hex-literal",
  "libc",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index aba0700d6..985ef5579 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -20,6 +20,7 @@ block-buffer = "0.9"
 opaque-debug = "0.2"
 sha1-asm = { version = "0.4", optional = true }
 libc = { version = "0.2.68", optional = true }
+cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
diff --git a/sha1/src/aarch64.rs b/sha1/src/aarch64.rs
deleted file mode 100644
index 8d1a916cc..000000000
--- a/sha1/src/aarch64.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
-
-#[inline(always)]
-pub fn sha1_supported() -> bool {
-    #[allow(unsafe_code)]
-    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
-    (hwcaps & HWCAP_SHA1) != 0
-}
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
new file mode 100644
index 000000000..2c298f26a
--- /dev/null
+++ b/sha1/src/compress.rs
@@ -0,0 +1,34 @@
+use digest::generic_array::GenericArray;
+use digest::consts::U64;
+
+#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
+mod soft;
+mod aarch64;
+mod x86;
+
+type Block = GenericArray<u8, U64>;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "asm-aarch64")] {
+        use aarch64::compress as compress_inner;
+    } else if #[cfg(feature = "asm")] {
+        // TODO: replace after sha1-asm rework
+        fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha1_asm::compress(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        use x86::compress as compress_inner;
+    } else {
+        use soft::compress as compress_inner;
+    }
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[Block]) {
+    // SAFETY: GenericArray and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { core::mem::transmute(blocks) };
+    compress_inner(state, blocks);
+}
diff --git a/sha1/src/compress/aarch64.rs b/sha1/src/compress/aarch64.rs
new file mode 100644
index 000000000..85295f052
--- /dev/null
+++ b/sha1/src/compress/aarch64.rs
@@ -0,0 +1,21 @@
+#![cfg(feature = "asm-aarch64")]
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
+
+fn sha1_supported() -> bool {
+    #[allow(unsafe_code)]
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA1) != 0
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
+    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
+    // to let us use it on no_std.
+    if sha1_supported() {
+        for block in blocks {
+            sha1_asm::compress(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/utils.rs b/sha1/src/compress/soft.rs
similarity index 67%
rename from sha1/src/utils.rs
rename to sha1/src/compress/soft.rs
index 1d746fb9f..77907b47b 100644
--- a/sha1/src/utils.rs
+++ b/sha1/src/compress/soft.rs
@@ -1,10 +1,6 @@
 #![allow(clippy::many_single_char_names)]
 use crate::consts::{BLOCK_LEN, K0, K1, K2, K3};
 use core::convert::TryInto;
-use digest::generic_array::typenum::U64;
-use digest::generic_array::GenericArray;
-
-type Block = GenericArray<u8, U64>;
 
 #[inline(always)]
 fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
@@ -21,27 +17,18 @@ fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
 }
 
-/// Not an intrinsic, but gets the first element of a vector.
-#[inline]
-pub fn sha1_first(w0: [u32; 4]) -> u32 {
-    w0[0]
-}
-
-/// Not an intrinsic, but adds a word to the first element of a vector.
 #[inline]
 pub fn sha1_first_add(e: u32, w0: [u32; 4]) -> [u32; 4] {
     let [a, b, c, d] = w0;
     [e.wrapping_add(a), b, c, d]
 }
 
-/// Emulates `llvm.x86.sha1msg1` intrinsic.
 fn sha1msg1(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [_, _, w2, w3] = a;
     let [w4, w5, _, _] = b;
     [a[0] ^ w2, a[1] ^ w3, a[2] ^ w4, a[3] ^ w5]
 }
 
-/// Emulates `llvm.x86.sha1msg2` intrinsic.
 fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [x0, x1, x2, x3] = a;
     let [_, w13, w14, w15] = b;
@@ -54,21 +41,11 @@ fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [w16, w17, w18, w19]
 }
 
-/// Performs 4 rounds of the message schedule update.
-/*
-pub fn sha1_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha1msg2(sha1msg1(v0, v1) ^ v2, v3)
-}
-*/
-
-/// Emulates `llvm.x86.sha1nexte` intrinsic.
 #[inline]
 fn sha1_first_half(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
-    sha1_first_add(sha1_first(abcd).rotate_left(30), msg)
+    sha1_first_add(abcd[0].rotate_left(30), msg)
 }
 
-/// Emulates `llvm.x86.sha1rnds4` intrinsic.
-/// Performs 4 rounds of the message block digest.
fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] { const K0V: [u32; 4] = [K0, K0, K0, K0]; const K1V: [u32; 4] = [K1, K1, K1, K1]; @@ -84,7 +61,6 @@ fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] { } } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -123,7 +99,6 @@ fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -162,7 +137,6 @@ fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -216,7 +190,6 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { } // Rounds 0..20 - // TODO: replace with `[u32; 4]::load` let mut h0 = [state[0], state[1], state[2], state[3]]; let mut w0 = [block[0], block[1], block[2], block[3]]; let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0); @@ -265,7 +238,7 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { w4 = schedule!(w0, w1, w2, w3); h0 = rounds4!(h1, h0, w4, 3); - let e = sha1_first(h1).rotate_left(30); + let e = h1[0].rotate_left(30); let [a, b, c, d] = h0; state[0] = state[0].wrapping_add(a); @@ -275,58 +248,12 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { state[4] = state[4].wrapping_add(e); } -/// Process a block with the SHA-1 algorithm. (See more...) -/// -/// SHA-1 is a cryptographic hash function, and as such, it operates -/// on an arbitrary number of bytes. This function operates on a fixed -/// number of bytes. If you call this function with anything other than -/// 64 bytes, then it will panic! This function takes two arguments: -/// -/// * `state` is reference to an **array** of 5 words. -/// * `block` is reference to a **slice** of 64 bytes. -/// -/// If you want the function that performs a message digest on an arbitrary -/// number of bytes, then see also the `Sha1` struct above. -/// -/// # Implementation -/// -/// First, some background. Both ARM and Intel are releasing documentation -/// that they plan to include instruction set extensions for SHA1 and SHA256 -/// sometime in the near future. Second, LLVM won't lower these intrinsics yet, -/// so these functions were written emulate these instructions. Finally, -/// the block function implemented with these emulated intrinsics turned out -/// to be quite fast! What follows is a discussion of this CPU-level view -/// of the SHA-1 algorithm and how it relates to the mathematical definition. -/// -/// The SHA instruction set extensions can be divided up into two categories: -/// -/// * message work schedule update calculation ("schedule" v., "work" n.) -/// * message block 80-round digest calculation ("digest" v., "block" n.) -/// -/// The schedule-related functions can be used to easily perform 4 rounds -/// of the message work schedule update calculation, as shown below: -/// -/// ```ignore -/// macro_rules! schedule_x4 { -/// ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => ( -/// sha1msg2(sha1msg1($v0, $v1) ^ $v2, $v3) -/// ) -/// } -/// -/// macro_rules! 
round_x4 { -/// ($h0:ident, $h1:ident, $wk:expr, $i:expr) => ( -/// sha1rnds4($h0, sha1_first_half($h1, $wk), $i) -/// ) -/// } -/// ``` -/// -/// and also shown above is how the digest-related functions can be used to -/// perform 4 rounds of the message block digest calculation. -/// -pub fn compress(state: &mut [u32; 5], block: &Block) { +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut block_u32 = [0u32; BLOCK_LEN]; - for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { - *o = u32::from_be_bytes(chunk.try_into().unwrap()); + for block in blocks.iter() { + for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { + *o = u32::from_be_bytes(chunk.try_into().unwrap()); + } + sha1_digest_block_u32(state, &block_u32); } - sha1_digest_block_u32(state, &block_u32); } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs new file mode 100644 index 000000000..ced7d21fa --- /dev/null +++ b/sha1/src/compress/x86.rs @@ -0,0 +1,125 @@ +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#![allow(unsafe_code)] + +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; +#[cfg(target_arch = "x86")] +use core::arch::x86::*; + +#[cfg(not(target_feature = "sha"))] +fn sha1_supported() -> bool { + true +} + +#[cfg(target_feature = "sha")] +fn sha1_supported() -> bool { + true +} + +macro_rules! rounds4 { + ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { + _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i) + }; +} + +macro_rules! schedule { + ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { + _mm_sha1msg2_epu32(_mm_xor_si128(_mm_sha1msg1_epu32($v0, $v1), $v2), $v3) + }; +} + +#[target_feature(enable = "sha,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x(0x0001020304050607, 0x08090a0b0c0d0e0f); + + let mut state_abcd = _mm_set_epi32( + state[0] as i32, + state[1] as i32, + state[2] as i32, + state[3] as i32, + ); + let mut state_e = _mm_set_epi32( + state[4] as i32, + 0, + 0, + 0, + ); + + for block in blocks { + let block_ptr = block.as_ptr() as *const __m128i; + + let h0 = state_abcd; + let e0 = state_e; + + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK); + let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK); + + // Rounds 0..20 + let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0); + let mut h0 = rounds4!(h1, h0, w1, 0); + h1 = rounds4!(h0, h1, w2, 0); + h0 = rounds4!(h1, h0, w3, 0); + let mut w4 = schedule!(w0, w1, w2, w3); + h1 = rounds4!(h0, h1, w4, 0); + + // Rounds 20..40 + w0 = schedule!(w1, w2, w3, w4); + h0 = rounds4!(h1, h0, w0, 1); + w1 = schedule!(w2, w3, w4, w0); + h1 = rounds4!(h0, h1, w1, 1); + w2 = schedule!(w3, w4, w0, w1); + h0 = rounds4!(h1, h0, w2, 1); + w3 = schedule!(w4, w0, w1, w2); + h1 = rounds4!(h0, h1, w3, 1); + w4 = schedule!(w0, w1, w2, w3); + h0 = rounds4!(h1, h0, w4, 1); + + // Rounds 40..60 + w0 = schedule!(w1, w2, w3, w4); + h1 = rounds4!(h0, h1, w0, 2); + w1 = schedule!(w2, w3, w4, w0); + h0 = rounds4!(h1, h0, w1, 2); + w2 = schedule!(w3, w4, w0, w1); + h1 = rounds4!(h0, h1, w2, 2); + w3 = schedule!(w4, w0, w1, w2); + h0 = rounds4!(h1, h0, w3, 2); + w4 = schedule!(w0, w1, w2, w3); + h1 = rounds4!(h0, h1, w4, 2); + + // Rounds 60..80 + w0 = schedule!(w1, w2, w3, w4); + h0 = rounds4!(h1, h0, w0, 
3); + w1 = schedule!(w2, w3, w4, w0); + h1 = rounds4!(h0, h1, w1, 3); + w2 = schedule!(w3, w4, w0, w1); + h0 = rounds4!(h1, h0, w2, 3); + w3 = schedule!(w4, w0, w1, w2); + h1 = rounds4!(h0, h1, w3, 3); + w4 = schedule!(w0, w1, w2, w3); + h0 = rounds4!(h1, h0, w4, 3); + + state_abcd = _mm_add_epi32(state_abcd, h0); + state_e = _mm_sha1nexte_epu32(h1, state_e); + } + + state[0] = _mm_extract_epi32(state_abcd, 3) as u32; + state[1] = _mm_extract_epi32(state_abcd, 2) as u32; + state[2] = _mm_extract_epi32(state_abcd, 1) as u32; + state[3] = _mm_extract_epi32(state_abcd, 0) as u32; + state[4] = _mm_extract_epi32(state_e, 3) as u32; +} + +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if sha1_supported() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index abe0bb492..f7e1f97aa 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -54,32 +54,20 @@ compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when build ))] compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); -#[macro_use] -extern crate opaque_debug; -#[cfg(feature = "asm")] -extern crate sha1_asm; #[cfg(feature = "std")] extern crate std; -#[cfg(feature = "asm-aarch64")] -mod aarch64; mod consts; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] -mod utils; +mod compress; -pub use digest::{self, Digest}; - -use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; +pub use digest::{self, Digest}; use digest::consts::{U20, U64}; use digest::impl_write; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; +use crate::consts::{H, STATE_LEN}; +use crate::compress::compress; -#[cfg(not(feature = "asm"))] -use crate::utils::compress; - -#[cfg(feature = "asm")] -use digest::generic_array::GenericArray; /// Structure representing the state of a SHA-1 computation #[derive(Clone)] @@ -109,7 +97,7 @@ impl Update for Sha1 { // Assumes that `length_bits<<3` will not overflow self.len += input.len() as u64; let state = &mut self.h; - self.buffer.input_block(input, |d| compress(state, d)); + self.buffer.input_blocks(input, |d| compress(state, d)); } } @@ -119,7 +107,7 @@ impl FixedOutputDirty for Sha1 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { let s = &mut self.h; let l = self.len << 3; - self.buffer.len64_padding_be(l, |d| compress(s, d)); + self.buffer.len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } @@ -134,28 +122,5 @@ impl Reset for Sha1 { } } -#[cfg(all(feature = "asm", not(feature = "asm-aarch64")))] -#[inline(always)] -fn compress(state: &mut [u32; 5], block: &GenericArray) { - #[allow(unsafe_code)] - let block: &[u8; 64] = unsafe { core::mem::transmute(block) }; - sha1_asm::compress(state, block); -} - -#[cfg(feature = "asm-aarch64")] -#[inline(always)] -fn compress(state: &mut [u32; 5], block: &GenericArray) { - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. 
- if aarch64::sha1_supported() { - #[allow(unsafe_code)] - let block: &[u8; 64] = unsafe { core::mem::transmute(block) }; - sha1_asm::compress(state, block); - } else { - utils::compress(state, block); - } -} - -impl_opaque_debug!(Sha1); +opaque_debug::impl_opaque_debug!(Sha1); impl_write!(Sha1); diff --git a/sha1/tests/lib.rs b/sha1/tests/lib.rs index c7452c902..2af982ad1 100644 --- a/sha1/tests/lib.rs +++ b/sha1/tests/lib.rs @@ -10,3 +10,13 @@ fn sha1_1million_a() { let output = include_bytes!("data/one_million_a.bin"); one_million_a::(output); } + +#[test] +fn foo() { + use digest::Digest; + let msg = [0x10; 64]; + let res = sha1::Sha1::digest(&msg); + assert_eq!(res.as_slice(), &[ + 168, 179, 203, 62, 143, 158, 186, 31, 28, 98, 170, 152, 153, 17, 169, 72, 151, 49, 99, 53 + ]); +} \ No newline at end of file From b7e621ac758560dd300269f6d8b9c4f5348b2ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 11:36:23 +0300 Subject: [PATCH 02/14] fix fmt --- sha1/src/compress.rs | 4 ++-- sha1/src/compress/x86.rs | 13 ++++--------- sha1/src/lib.rs | 12 ++++++------ sha1/tests/lib.rs | 10 ---------- 4 files changed, 12 insertions(+), 27 deletions(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 2c298f26a..843d92494 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -1,9 +1,9 @@ -use digest::generic_array::GenericArray; use digest::consts::U64; +use digest::generic_array::GenericArray; +mod aarch64; #[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod soft; -mod aarch64; mod x86; type Block = GenericArray; diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index ced7d21fa..3599814d6 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -1,10 +1,10 @@ #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] #![allow(unsafe_code)] -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; #[cfg(target_arch = "x86")] use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; #[cfg(not(target_feature = "sha"))] fn sha1_supported() -> bool { @@ -31,7 +31,7 @@ macro_rules! 
schedule { #[target_feature(enable = "sha,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] - let MASK: __m128i = _mm_set_epi64x(0x0001020304050607, 0x08090a0b0c0d0e0f); + let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F); let mut state_abcd = _mm_set_epi32( state[0] as i32, @@ -39,12 +39,7 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { state[2] as i32, state[3] as i32, ); - let mut state_e = _mm_set_epi32( - state[4] as i32, - 0, - 0, - 0, - ); + let mut state_e = _mm_set_epi32(state[4] as i32, 0, 0, 0); for block in blocks { let block_ptr = block.as_ptr() as *const __m128i; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index f7e1f97aa..da93d5549 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -57,17 +57,16 @@ compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use #[cfg(feature = "std")] extern crate std; -mod consts; mod compress; +mod consts; +use crate::compress::compress; +use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; -pub use digest::{self, Digest}; use digest::consts::{U20, U64}; use digest::impl_write; +pub use digest::{self, Digest}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -use crate::consts::{H, STATE_LEN}; -use crate::compress::compress; - /// Structure representing the state of a SHA-1 computation #[derive(Clone)] @@ -107,7 +106,8 @@ impl FixedOutputDirty for Sha1 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { let s = &mut self.h; let l = self.len << 3; - self.buffer.len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); + self.buffer + .len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } diff --git a/sha1/tests/lib.rs b/sha1/tests/lib.rs index 2af982ad1..c7452c902 100644 --- a/sha1/tests/lib.rs +++ b/sha1/tests/lib.rs @@ -10,13 +10,3 @@ fn sha1_1million_a() { let output = include_bytes!("data/one_million_a.bin"); one_million_a::(output); } - -#[test] -fn foo() { - use digest::Digest; - let msg = [0x10; 64]; - let res = sha1::Sha1::digest(&msg); - assert_eq!(res.as_slice(), &[ - 168, 179, 203, 62, 143, 158, 186, 31, 28, 98, 170, 152, 153, 17, 169, 72, 151, 49, 99, 53 - ]); -} \ No newline at end of file From 9855b5fb11d1ab5e37ded29438b5f191dd3d69e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 11:43:41 +0300 Subject: [PATCH 03/14] fix clippy warnings --- sha1/src/compress.rs | 4 +++- sha1/src/compress/x86.rs | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 843d92494..ffb051576 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -29,6 +29,8 @@ pub fn compress(state: &mut [u32; 5], blocks: &[Block]) { // SAFETY: GenericArray and [u8; 64] have // exactly the same memory layout #[allow(unsafe_code)] - let blocks: &[[u8; 64]] = unsafe { core::mem::transmute(blocks) }; + let blocks: &[[u8; 64]] = unsafe { + &*(blocks as *const [Block] as *const [[u8; 64]]) + }; compress_inner(state, blocks); } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index 3599814d6..29da145c5 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -42,6 +42,8 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut state_e = 
_mm_set_epi32(state[4] as i32, 0, 0, 0);
 
     for block in blocks {
+        // SAFETY: we use only unaligned loads with this pointer
+        #[allow(clippy::cast_ptr_alignment)]
         let block_ptr = block.as_ptr() as *const __m128i;

From ae0b503da72c7c3f0736ac92ae183c7da2c15b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 13:03:22 +0300
Subject: [PATCH 04/14] minor changes

---
 sha1/src/compress.rs | 8 ++------
 sha1/src/compress/soft.rs | 8 ++++++--
 sha1/src/compress/x86.rs | 16 +++++++++++++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index ffb051576..b595199ba 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -6,8 +6,6 @@ mod aarch64;
 mod soft;
 mod x86;
 
-type Block = GenericArray<u8, U64>;
-
 cfg_if::cfg_if! {
     if #[cfg(feature = "asm-aarch64")] {
         use aarch64::compress as compress_inner;
@@ -25,12 +23,10 @@ cfg_if::cfg_if! {
     }
 }
 
-pub fn compress(state: &mut [u32; 5], blocks: &[Block]) {
+pub fn compress(state: &mut [u32; 5], blocks: &[GenericArray<u8, U64>]) {
     // SAFETY: GenericArray and [u8; 64] have
     // exactly the same memory layout
     #[allow(unsafe_code)]
-    let blocks: &[[u8; 64]] = unsafe {
-        &*(blocks as *const [Block] as *const [[u8; 64]])
-    };
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
     compress_inner(state, blocks);
 }
diff --git a/sha1/src/compress/soft.rs b/sha1/src/compress/soft.rs
index 77907b47b..94a019b98 100644
--- a/sha1/src/compress/soft.rs
+++ b/sha1/src/compress/soft.rs
@@ -175,7 +175,7 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Process a block with the SHA-1 algorithm.
+#[inline(always)]
 fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     macro_rules!
schedule { ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { @@ -250,10 +250,14 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut block_u32 = [0u32; BLOCK_LEN]; + // since LLVM can't properly use aliasing yet it will make + // unnecessary state stores without this copy + let mut state_cpy = *state; for block in blocks.iter() { for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { *o = u32::from_be_bytes(chunk.try_into().unwrap()); } - sha1_digest_block_u32(state, &block_u32); + sha1_digest_block_u32(&mut state_cpy, &block_u32); } + *state = state_cpy; } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index 29da145c5..fd8eebfaf 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -6,12 +6,22 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -#[cfg(not(target_feature = "sha"))] +#[cfg(not(all( + target_feature = "sha", + target_feature = "sse2", + target_feature = "ssse3", + target_feature = "sse4.1", +)))] fn sha1_supported() -> bool { - true + false } -#[cfg(target_feature = "sha")] +#[cfg(all( + target_feature = "sha", + target_feature = "sse2", + target_feature = "ssse3", + target_feature = "sse4.1", +))] fn sha1_supported() -> bool { true } From b60c81e71ce3adbc7ce8bdce081c4ca5d555df22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 15:45:12 +0300 Subject: [PATCH 05/14] sha2: add x86 intrinsics support --- Cargo.lock | 1 + sha2/Cargo.toml | 10 +- sha2/src/aarch64.rs | 7 - sha2/src/lib.rs | 39 +----- sha2/src/sha256.rs | 77 +++------- sha2/src/sha256_compress.rs | 32 +++++ sha2/src/sha256_compress/aarch64.rs | 20 +++ .../soft.rs} | 124 ++--------------- sha2/src/sha256_compress/x86.rs | 131 ++++++++++++++++++ sha2/src/sha512.rs | 79 ++++------- sha2/src/sha512_compress.rs | 24 ++++ .../soft.rs} | 114 ++------------- sha2/tests/lib.rs | 2 - 13 files changed, 286 insertions(+), 374 deletions(-) delete mode 100644 sha2/src/aarch64.rs create mode 100644 sha2/src/sha256_compress.rs create mode 100644 sha2/src/sha256_compress/aarch64.rs rename sha2/src/{sha256_utils.rs => sha256_compress/soft.rs} (60%) create mode 100644 sha2/src/sha256_compress/x86.rs create mode 100644 sha2/src/sha512_compress.rs rename sha2/src/{sha512_utils.rs => sha512_compress/soft.rs} (66%) diff --git a/Cargo.lock b/Cargo.lock index 1db6a343e..659c6a8dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,6 +241,7 @@ name = "sha2" version = "0.9.0" dependencies = [ "block-buffer", + "cfg-if", "digest", "hex-literal", "libc", diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 7f7b79d8b..ec690a63e 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -18,7 +18,10 @@ categories = ["cryptography", "no-std"] digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" +cfg-if = "0.1" sha2-asm = { version = "0.5", optional = true } + +[target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true } [dev-dependencies] @@ -28,9 +31,8 @@ hex-literal = "0.2" [features] default = ["std"] std = ["digest/std"] -asm = ["sha2-asm"] +asm = ["sha2-asm", "libc"] compress = [] -# TODO: Remove this feature once is_aarch64_feature_detected!() is stabilised. -# Only used on AArch64 Linux systems, when built without the crypto target_feature. 
-asm-aarch64 = ["asm", "libc"] +# DEPRECATED: use `asm` isntead +asm-aarch64 = ["asm"] diff --git a/sha2/src/aarch64.rs b/sha2/src/aarch64.rs deleted file mode 100644 index 7cba76519..000000000 --- a/sha2/src/aarch64.rs +++ /dev/null @@ -1,7 +0,0 @@ -use libc::{getauxval, AT_HWCAP, HWCAP_SHA2}; - -#[inline(always)] -pub fn sha2_supported() -> bool { - let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) }; - (hwcaps & HWCAP_SHA2) != 0 -} diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index c87c064b4..eccc78e67 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -56,50 +56,19 @@ #![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")] #![warn(missing_docs, rust_2018_idioms)] -// Give relevant error messages if the user tries to enable AArch64 asm on unsupported platforms. -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - not(target_os = "linux") -))] -compile_error!("Your OS isn’t yet supported for runtime-checking of AArch64 features."); -#[cfg(all(feature = "asm-aarch64", not(target_arch = "aarch64")))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" on non-AArch64 systems."); -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - target_feature = "crypto" -))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when building for AArch64 systems with crypto extensions."); -#[cfg(all( - not(feature = "asm-aarch64"), - feature = "asm", - target_arch = "aarch64", - not(target_feature = "crypto"), - target_os = "linux" -))] -compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); - -#[macro_use] -extern crate opaque_debug; - #[cfg(feature = "std")] extern crate std; -#[cfg(feature = "asm-aarch64")] -mod aarch64; mod consts; mod sha256; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64", feature = "compress"))] -mod sha256_utils; +mod sha256_compress; mod sha512; -#[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] -mod sha512_utils; +mod sha512_compress; pub use crate::sha256::{Sha224, Sha256}; pub use crate::sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] -pub use sha256_utils::compress256; +pub use sha256_compress::compress256; #[cfg(feature = "compress")] -pub use sha512_utils::compress512; +pub use sha512_compress::compress512; diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index c30671b86..dc741f61a 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -1,62 +1,20 @@ //! 
SHA-256 - use crate::consts::{H224, H256, STATE_LEN}; +use crate::sha256_compress::compress256; use block_buffer::BlockBuffer; -use digest::impl_write; -use digest::{ - consts::{U28, U32, U64}, - generic_array::GenericArray, -}; +use core::slice::from_ref; +use digest::consts::{U28, U32, U64}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(not(feature = "asm"))] -use crate::sha256_utils::compress256; - -#[cfg(feature = "asm")] -use sha2_asm::compress256; - type BlockSize = U64; -type Block = GenericArray; - -/// A structure that represents that state of a digest computation for the -/// SHA-2 512 family of digest functions -#[derive(Clone)] -struct Engine256State { - h: [u32; 8], -} - -impl Engine256State { - fn new(h: &[u32; STATE_LEN]) -> Engine256State { - Engine256State { h: *h } - } - - #[cfg(not(feature = "asm-aarch64"))] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - compress256(&mut self.h, block); - } - - #[cfg(feature = "asm-aarch64")] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha2") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. - if ::aarch64::sha2_supported() { - compress256(&mut self.h, block); - } else { - ::sha256_utils::compress256(&mut self.h, block); - } - } -} -/// A structure that keeps track of the state of the Sha-256 operation and +/// Structure that keeps state of the Sha-256 operation and /// contains the logic necessary to perform the final calculations. #[derive(Clone)] struct Engine256 { len: u64, buffer: BlockBuffer, - state: Engine256State, + state: [u32; 8], } impl Engine256 { @@ -64,7 +22,7 @@ impl Engine256 { Engine256 { len: 0, buffer: Default::default(), - state: Engine256State::new(h), + state: *h, } } @@ -72,19 +30,20 @@ impl Engine256 { // Assumes that input.len() can be converted to u64 without overflow self.len += (input.len() as u64) << 3; let s = &mut self.state; - self.buffer.input_block(input, |b| s.process_block(b)); + self.buffer.input_blocks(input, |b| compress256(s, b)); } fn finish(&mut self) { let s = &mut self.state; let l = self.len; - self.buffer.len64_padding_be(l, |b| s.process_block(b)); + self.buffer + .len64_padding_be(l, |b| compress256(s, from_ref(b))); } fn reset(&mut self, h: &[u32; STATE_LEN]) { self.len = 0; self.buffer.reset(); - self.state = Engine256State::new(h); + self.state = *h; } } @@ -117,8 +76,8 @@ impl FixedOutputDirty for Sha256 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(4).zip(h.iter()) { + let s = self.engine.state; + for (chunk, v) in out.chunks_exact_mut(4).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -160,8 +119,8 @@ impl FixedOutputDirty for Sha224 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..7]; - for (chunk, v) in out.chunks_exact_mut(4).zip(h.iter()) { + let s = &self.engine.state[..7]; + for (chunk, v) in out.chunks_exact_mut(4).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -173,8 +132,8 @@ impl Reset for Sha224 { } } -impl_opaque_debug!(Sha224); -impl_opaque_debug!(Sha256); +opaque_debug::impl_opaque_debug!(Sha224); +opaque_debug::impl_opaque_debug!(Sha256); 
-impl_write!(Sha224);
-impl_write!(Sha256);
+digest::impl_write!(Sha224);
+digest::impl_write!(Sha256);
diff --git a/sha2/src/sha256_compress.rs b/sha2/src/sha256_compress.rs
new file mode 100644
index 000000000..b0b52cb4d
--- /dev/null
+++ b/sha2/src/sha256_compress.rs
@@ -0,0 +1,32 @@
+use digest::consts::U64;
+use digest::generic_array::GenericArray;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+        mod soft;
+        mod aarch64;
+        use aarch64::compress;
+    } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+        // TODO: replace after sha2-asm rework
+        fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha2_asm::compress256(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        mod soft;
+        mod x86;
+        use x86::compress;
+    } else {
+        mod soft;
+        use soft::compress;
+    }
+}
+
+pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
+    // SAFETY: GenericArray and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+    compress(state, blocks)
+}
diff --git a/sha2/src/sha256_compress/aarch64.rs b/sha2/src/sha256_compress/aarch64.rs
new file mode 100644
index 000000000..a5967ca0c
--- /dev/null
+++ b/sha2/src/sha256_compress/aarch64.rs
@@ -0,0 +1,20 @@
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA2};
+
+#[inline(always)]
+pub fn sha2_supported() -> bool {
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA2) != 0
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha2_supported() {
+        // TODO: replace after sha2-asm rework
+        for block in blocks {
+            sha2_asm::compress256(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha2/src/sha256_utils.rs b/sha2/src/sha256_compress/soft.rs
similarity index 60%
rename from sha2/src/sha256_utils.rs
rename to sha2/src/sha256_compress/soft.rs
index 7d2ec9f63..fe133cd80 100644
--- a/sha2/src/sha256_utils.rs
+++ b/sha2/src/sha256_compress/soft.rs
@@ -32,20 +32,14 @@ fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     ]
 }
 
-/// Not an intrinsic, but works like an unaligned load.
-#[inline]
 fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     [v3[3], v2[0], v2[1], v2[2]]
 }
 
-/// Not an intrinsic, but useful for swapping vectors.
-#[inline]
 fn sha256swap(v0: [u32; 4]) -> [u32; 4] {
     [v0[2], v0[3], v0[0], v0[1]]
 }
 
-/// Emulates `llvm.x86.sha256msg1` intrinsic.
-// #[inline]
 fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] {
     // sigma 0 on vectors
     #[inline]
@@ -59,8 +53,6 @@ fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] {
     add(v0, sigma0x4(sha256load(v0, v1)))
 }
 
-/// Emulates `llvm.x86.sha256msg2` intrinsic.
-// #[inline]
 fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     macro_rules! sigma1 {
         ($a:expr) => {
@@ -79,14 +71,6 @@ fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     [w19, w18, w17, w16]
 }
 
-/*
-/// Performs 4 rounds of the SHA-256 message schedule update.
-fn sha256_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha256msg2(sha256msg1(v0, v1) + sha256load(v2, v3), v3)
-}*/
-
-/// Emulates `llvm.x86.sha256rnds2` intrinsic.
-// #[inline]
 fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; 4] {
     macro_rules!
big_sigma0 { ($a:expr) => { @@ -170,6 +154,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { let mut abef = [state[0], state[1], state[4], state[5]]; let mut cdgh = [state[2], state[3], state[6], state[7]]; + // Rounds 0..64 let mut w0 = [block[3], block[2], block[1], block[0]]; rounds4!(abef, cdgh, add(k[0], w0)); @@ -179,6 +164,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { rounds4!(abef, cdgh, add(k[2], w2)); let mut w3 = [block[15], block[14], block[13], block[12]]; rounds4!(abef, cdgh, add(k[3], w3)); + let mut w4 = schedule!(w0, w1, w2, w3); rounds4!(abef, cdgh, add(k[4], w4)); w0 = schedule!(w1, w2, w3, w4); @@ -217,102 +203,16 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { state[7] = state[7].wrapping_add(h); } -/// Process a block with the SHA-256 algorithm. (See more...) -/// -/// Internally, this uses functions which resemble the new Intel SHA instruction -/// sets, and so it's data locality properties may improve performance. However, -/// to benefit the most from this implementation, replace these functions with -/// x86 intrinsics to get a possible speed boost. -/// -/// # Implementation -/// -/// The `Sha256` algorithm is implemented with functions that resemble the new -/// Intel SHA instruction set extensions. These intructions fall into two -/// categories: message schedule calculation, and the message block 64-round -/// digest calculation. The schedule-related instructions allow 4 rounds to be -/// calculated as: -/// -/// ```ignore -/// use std::simd::[u32; 4]; -/// use self::crypto::sha2::{ -/// sha256msg1, -/// sha256msg2, -/// sha256load -/// }; -/// -/// fn schedule4_data(work: &mut [[u32; 4]], w: &[u32]) { -/// -/// // this is to illustrate the data order -/// work[0] = [w[3], w[2], w[1], w[0]); -/// work[1] = [w[7], w[6], w[5], w[4]); -/// work[2] = [w[11], w[10], w[9], w[8]); -/// work[3] = [w[15], w[14], w[13], w[12]); -/// } -/// -/// fn schedule4_work(work: &mut [[u32; 4]], t: usize) { -/// -/// // this is the core expression -/// work[t] = sha256msg2(sha256msg1(work[t - 4], work[t - 3]) + -/// sha256load(work[t - 2], work[t - 1]), -/// work[t - 1]) -/// } -/// ``` -/// -/// instead of 4 rounds of: -/// -/// ```ignore -/// fn schedule_work(w: &mut [u32], t: usize) { -/// w[t] = sigma1!(w[t - 2]) + w[t - 7] + sigma0!(w[t - 15]) + w[t - 16]; -/// } -/// ``` -/// -/// and the digest-related instructions allow 4 rounds to be calculated as: -/// -/// ```ignore -/// use std::simd::[u32; 4]; -/// use self::crypto::sha2::{K32X4, -/// sha256rnds2, -/// sha256swap -/// }; -/// -/// fn rounds4(state: &mut [u32; 8], work: &mut [[u32; 4]], t: usize) { -/// let [a, b, c, d, e, f, g, h]: [u32; 8] = *state; -/// -/// // this is to illustrate the data order -/// let mut abef = [a, b, e, f); -/// let mut cdgh = [c, d, g, h); -/// let temp = K32X4[t] + work[t]; -/// -/// // this is the core expression -/// cdgh = sha256rnds2(cdgh, abef, temp); -/// abef = sha256rnds2(abef, cdgh, sha256swap(temp)); -/// -/// *state = [abef[0], abef[1], cdgh[0], cdgh[1], -/// abef[2], abef[3], cdgh[2], cdgh[3]]; -/// } -/// ``` -/// -/// instead of 4 rounds of: -/// -/// ```ignore -/// fn round(state: &mut [u32; 8], w: &mut [u32], t: usize) { -/// let [a, b, c, mut d, e, f, g, mut h]: [u32; 8] = *state; -/// -/// h += big_sigma1!(e) + choose!(e, f, g) + K32[t] + w[t]; d += h; -/// h += big_sigma0!(a) + majority!(a, b, c); -/// -/// *state = [h, a, b, c, d, e, f, g]; -/// } -/// ``` -/// -/// **NOTE**: It is important 
to note, however, that these instructions are not
-/// implemented by any CPU (at the time of this writing), and so they are
-/// emulated in this library until the instructions become more common, and gain
-/// support in LLVM (and GCC, etc.).
-pub fn compress256(state: &mut [u32; 8], block: &[u8; 64]) {
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
     let mut block_u32 = [0u32; BLOCK_LEN];
-    for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
-        *o = u32::from_be_bytes(chunk.try_into().unwrap());
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks {
+        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+            *o = u32::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha256_digest_block_u32(&mut state_cpy, &block_u32);
     }
-    sha256_digest_block_u32(state, &block_u32);
+    *state = state_cpy;
 }
diff --git a/sha2/src/sha256_compress/x86.rs b/sha2/src/sha256_compress/x86.rs
new file mode 100644
index 000000000..b9ed6ce0d
--- /dev/null
+++ b/sha2/src/sha256_compress/x86.rs
@@ -0,0 +1,131 @@
+#![allow(clippy::many_single_char_names)]
+
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+
+#[cfg(not(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+)))]
+fn is_supported() -> bool {
+    false
+}
+
+#[cfg(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+))]
+fn is_supported() -> bool {
+    true
+}
+
+unsafe fn add_k(v: __m128i, i: usize) -> __m128i {
+    let k = &crate::consts::K32X4[i];
+    let t = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
+    _mm_add_epi32(v, t)
+}
+
+unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
+    let t1 = _mm_sha256msg1_epu32(v0, v1);
+    let t2 = _mm_alignr_epi8(v3, v2, 4);
+    let t3 = _mm_add_epi32(t1, t2);
+    _mm_sha256msg2_epu32(t3, v3)
+}
+
+macro_rules!
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr) => {{ + $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, $rest); + $abef = _mm_sha256rnds2_epu32($abef, $cdgh, _mm_shuffle_epi32($rest, 0x0E)); + }}; +} + +// we use unaligned loads with `__m128i` pointers +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "sha,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0C0D_0E0F_0809_0A0Bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let state_ptr = state.as_ptr() as *const __m128i; + let dcba = _mm_loadu_si128(state_ptr.add(0)); + let efgh = _mm_loadu_si128(state_ptr.add(1)); + + let cdab = _mm_shuffle_epi32(dcba, 0xB1); + let efgh = _mm_shuffle_epi32(efgh, 0x1B); + let mut abef = _mm_alignr_epi8(cdab, efgh, 8); + let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0); + + for block in blocks { + let abef_save = abef; + let cdgh_save = cdgh; + + let data_ptr = block.as_ptr() as *const __m128i; + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); + let mut w3 = _mm_shuffle_epi8( _mm_loadu_si128(data_ptr.add(3)), MASK); + + rounds4!(abef, cdgh, add_k(w0, 0)); + rounds4!(abef, cdgh, add_k(w1, 1)); + rounds4!(abef, cdgh, add_k(w2, 2)); + rounds4!(abef, cdgh, add_k(w3, 3)); + + let mut w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 4)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 5)); + w1 = schedule(w2, w3, w4, w0); + rounds4!(abef, cdgh, add_k(w1, 6)); + w2 = schedule(w3, w4, w0, w1); + rounds4!(abef, cdgh, add_k(w2, 7)); + w3 = schedule(w4, w0, w1, w2); + rounds4!(abef, cdgh, add_k(w3, 8)); + w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 9)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 10)); + w1 = schedule(w2, w3, w4, w0); + rounds4!(abef, cdgh, add_k(w1, 11)); + w2 = schedule(w3, w4, w0, w1); + rounds4!(abef, cdgh, add_k(w2, 12)); + w3 = schedule(w4, w0, w1, w2); + rounds4!(abef, cdgh, add_k(w3, 13)); + w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 14)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 15)); + + abef = _mm_add_epi32(abef, abef_save); + cdgh = _mm_add_epi32(cdgh, cdgh_save); + } + + let feba = _mm_shuffle_epi32(abef, 0x1B); + let dchg = _mm_shuffle_epi32(cdgh, 0xB1); + let dcba = _mm_blend_epi16(feba, dchg, 0xF0); + let hgef = _mm_alignr_epi8(dchg, feba, 8); + + let state_ptr_mut = state.as_mut_ptr() as *mut __m128i; + _mm_storeu_si128(state_ptr_mut.add(0), dcba); + _mm_storeu_si128(state_ptr_mut.add(1), hgef); +} + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if is_supported() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs index ed3a1cccf..a49773222 100644 --- a/sha2/src/sha512.rs +++ b/sha2/src/sha512.rs @@ -1,48 +1,21 @@ //! 
SHA-512 - use crate::consts::{H384, H512, H512_TRUNC_224, H512_TRUNC_256, STATE_LEN}; use block_buffer::BlockBuffer; -use digest::impl_write; -use digest::{ - consts::{U128, U28, U32, U48, U64}, - generic_array::GenericArray, -}; +use core::slice::from_ref; +use digest::consts::{U128, U28, U32, U48, U64}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(any(not(feature = "asm"), target_arch = "aarch64"))] -use crate::sha512_utils::compress512; - -#[cfg(all(feature = "asm", not(target_arch = "aarch64")))] -use sha2_asm::compress512; +use crate::sha512_compress::compress512; type BlockSize = U128; -type Block = GenericArray; - -/// A structure that represents that state of a digest computation for the -/// SHA-2 512 family of digest functions -#[derive(Clone)] -struct Engine512State { - h: [u64; 8], -} - -impl Engine512State { - fn new(h: &[u64; 8]) -> Engine512State { - Engine512State { h: *h } - } - - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 128]) }; - compress512(&mut self.h, block); - } -} -/// A structure that keeps track of the state of the Sha-512 operation and +/// Structure that keeps state of the Sha-512 operation and /// contains the logic necessary to perform the final calculations. #[derive(Clone)] struct Engine512 { len: u128, buffer: BlockBuffer, - state: Engine512State, + state: [u64; 8], } impl Engine512 { @@ -50,26 +23,26 @@ impl Engine512 { Engine512 { len: 0, buffer: Default::default(), - state: Engine512State::new(h), + state: *h, } } fn update(&mut self, input: &[u8]) { self.len += (input.len() as u128) << 3; let s = &mut self.state; - self.buffer.input_block(input, |d| s.process_block(d)); + self.buffer.input_blocks(input, |b| compress512(s, b)); } fn finish(&mut self) { let s = &mut self.state; self.buffer - .len128_padding_be(self.len, |d| s.process_block(d)); + .len128_padding_be(self.len, |d| compress512(s, from_ref(d))); } fn reset(&mut self, h: &[u64; STATE_LEN]) { self.len = 0; self.buffer.reset(); - self.state = Engine512State::new(h); + self.state = *h; } } @@ -102,8 +75,8 @@ impl FixedOutputDirty for Sha512 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = self.engine.state; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -145,8 +118,8 @@ impl FixedOutputDirty for Sha384 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..6]; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = &self.engine.state[..6]; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -188,8 +161,8 @@ impl FixedOutputDirty for Sha512Trunc256 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..4]; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = &self.engine.state[..4]; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -231,11 +204,11 @@ impl FixedOutputDirty for Sha512Trunc224 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(8).zip(h[..3].iter()) { + let s = &self.engine.state; + for (chunk, v) in 
out.chunks_exact_mut(8).zip(s[..3].iter()) {
             chunk.copy_from_slice(&v.to_be_bytes());
         }
-        out[24..28].copy_from_slice(&h[3].to_be_bytes()[..4]);
+        out[24..28].copy_from_slice(&s[3].to_be_bytes()[..4]);
     }
 }
 
@@ -245,12 +218,12 @@ impl Reset for Sha512Trunc224 {
     }
 }
 
-impl_opaque_debug!(Sha384);
-impl_opaque_debug!(Sha512);
-impl_opaque_debug!(Sha512Trunc224);
-impl_opaque_debug!(Sha512Trunc256);
+opaque_debug::impl_opaque_debug!(Sha384);
+opaque_debug::impl_opaque_debug!(Sha512);
+opaque_debug::impl_opaque_debug!(Sha512Trunc224);
+opaque_debug::impl_opaque_debug!(Sha512Trunc256);
 
-impl_write!(Sha384);
-impl_write!(Sha512);
-impl_write!(Sha512Trunc224);
-impl_write!(Sha512Trunc256);
+digest::impl_write!(Sha384);
+digest::impl_write!(Sha512);
+digest::impl_write!(Sha512Trunc224);
+digest::impl_write!(Sha512Trunc256);
diff --git a/sha2/src/sha512_compress.rs b/sha2/src/sha512_compress.rs
new file mode 100644
index 000000000..baa6b2765
--- /dev/null
+++ b/sha2/src/sha512_compress.rs
@@ -0,0 +1,24 @@
+use digest::consts::U128;
+use digest::generic_array::GenericArray;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+        // TODO: replace after sha2-asm rework
+        fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+            for block in blocks {
+                sha2_asm::compress512(state, block);
+            }
+        }
+    } else {
+        mod soft;
+        use soft::compress;
+    }
+}
+
+pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
+    // SAFETY: GenericArray and [u8; 128] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
+    compress(state, blocks)
+}
diff --git a/sha2/src/sha512_utils.rs b/sha2/src/sha512_compress/soft.rs
similarity index 66%
rename from sha2/src/sha512_utils.rs
rename to sha2/src/sha512_compress/soft.rs
index eaa9d51fc..f307f0e54 100644
--- a/sha2/src/sha512_utils.rs
+++ b/sha2/src/sha512_compress/soft.rs
@@ -2,13 +2,11 @@
 use crate::consts::{BLOCK_LEN, K64X2};
 use core::convert::TryInto;
 
-#[inline(always)]
 fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
     [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])]
 }
 
 /// Not an intrinsic, but works like an unaligned load.
-#[inline]
 fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] {
     [v1[1], v0[0]]
 }
@@ -202,105 +200,17 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
     state[7] = state[7].wrapping_add(h);
 }
 
-/// Process a block with the SHA-512 algorithm. (See more...)
-///
-/// Internally, this uses functions that resemble the new Intel SHA
-/// instruction set extensions, but since no architecture seems to
-/// have any designs, these may not be the final designs if and/or when
-/// there are instruction set extensions with SHA-512. So to summarize:
-/// SHA-1 and SHA-256 are being implemented in hardware soon (at the time
-/// of this writing), but it doesn't look like SHA-512 will be hardware
-/// accelerated any time soon.
-///
-/// # Implementation
-///
-/// These functions fall into two categories: message schedule calculation, and
-/// the message block 64-round digest calculation. The schedule-related
-/// functions allow 4 rounds to be calculated as:
-///
-/// ```ignore
-/// use std::simd::[u64; 2];
-/// use self::crypto::sha2::{
-///     sha512msg,
-///     sha512load
-/// };
-///
-/// fn schedule4_data(work: &mut [[u64; 2]], w: &[u64]) {
-///
-///     // this is to illustrate the data order
-///     work[0] = [w[1], w[0]);
-///     work[1] = [w[3], w[2]);
-///     work[2] = [w[5], w[4]);
-///     work[3] = [w[7], w[6]);
-///     work[4] = [w[9], w[8]);
-///     work[5] = [w[11], w[10]);
-///     work[6] = [w[13], w[12]);
-///     work[7] = [w[15], w[14]);
-/// }
-///
-/// fn schedule4_work(work: &mut [[u64; 2]], t: usize) {
-///
-///     // this is the core expression
-///     work[t] = sha512msg(work[t - 8],
-///                         work[t - 7],
-///                         sha512load(work[t - 4], work[t - 3]),
-///                         work[t - 1]);
-/// }
-/// ```
-///
-/// instead of 4 rounds of:
-///
-/// ```ignore
-/// fn schedule_work(w: &mut [u64], t: usize) {
-///     w[t] = sigma1!(w[t - 2]) + w[t - 7] + sigma0!(w[t - 15]) + w[t - 16];
-/// }
-/// ```
-///
-/// and the digest-related functions allow 4 rounds to be calculated as:
-///
-/// ```ignore
-/// use std::simd::[u64; 2];
-/// use self::crypto::sha2::{K64X2, sha512rnd};
-///
-/// fn rounds4(state: &mut [u64; 8], work: &mut [[u64; 2]], t: usize) {
-///     let [a, b, c, d, e, f, g, h]: [u64; 8] = *state;
-///
-///     // this is to illustrate the data order
-///     let mut ae = [a, e);
-///     let mut bf = [b, f);
-///     let mut cg = [c, g);
-///     let mut dh = [d, h);
-///     let [w1, w0) = K64X2[2*t] + work[2*t];
-///     let [w3, w2) = K64X2[2*t + 1] + work[2*t + 1];
-///
-///     // this is the core expression
-///     dh = sha512rnd(ae, bf, cg, dh, w0);
-///     cg = sha512rnd(dh, ae, bf, cg, w1);
-///     bf = sha512rnd(cg, dh, ae, bf, w2);
-///     ae = sha512rnd(bf, cg, dh, ae, w3);
-///
-///     *state = [ae[0], bf[0], cg[0], dh[0],
-///               ae[1], bf[1], cg[1], dh[1]];
-/// }
-/// ```
-///
-/// instead of 4 rounds of:
-///
-/// ```ignore
-/// fn round(state: &mut [u64; 8], w: &mut [u64], t: usize) {
-///     let [a, b, c, mut d, e, f, g, mut h]: [u64; 8] = *state;
-///
-///     h += big_sigma1!(e) + choose!(e, f, g) + K64[t] + w[t]; d += h;
-///     h += big_sigma0!(a) + majority!(a, b, c);
-///
-///     *state = [h, a, b, c, d, e, f, g];
-/// }
-/// ```
-///
-pub fn compress512(state: &mut [u64; 8], block: &[u8; 128]) {
-    let mut block_u64 = [0u64; BLOCK_LEN];
-    for (o, chunk) in block_u64.iter_mut().zip(block.chunks_exact(8)) {
-        *o = u64::from_be_bytes(chunk.try_into().unwrap());
+
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let mut block_u64 = [0u64; BLOCK_LEN];
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks {
+        for (o, chunk) in block_u64.iter_mut().zip(block.chunks_exact(8)) {
+            *o = u64::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha512_digest_block_u64(&mut state_cpy, &block_u64);
     }
-    sha512_digest_block_u64(state, &block_u64);
+    *state = state_cpy;
 }
diff --git a/sha2/tests/lib.rs b/sha2/tests/lib.rs
index 72de11686..b9cb8628a 100644
--- a/sha2/tests/lib.rs
+++ b/sha2/tests/lib.rs
@@ -1,5 +1,3 @@
-#![no_std]
-
 use digest::dev::{digest_test, one_million_a};
 use digest::new_test;

From c8e35eb5e65786cbaf8ebef709c948a7a62bd4c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 15:56:09 +0300
Subject: [PATCH 06/14] simplify aarch64

---
 sha1/Cargo.toml | 11 ++++++-----
 sha1/src/compress.rs | 14 +++++++-------
 sha1/src/lib.rs | 29 +----------------------------
 sha2/Cargo.toml | 2 +-
 4 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index 985ef5579..183cd5002 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -18,9 +18,11 @@ name = "sha1"
 digest = "0.9"
 block-buffer = "0.9"
 opaque-debug = "0.2"
+cfg-if = "0.1"
 sha1-asm = { version = "0.4", optional = true }
+
+[target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies]
 libc = { version = "0.2.68", optional = true }
-cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
@@ -29,8 +31,7 @@ hex-literal = "0.2"
 [features]
 default = ["std"]
 std = ["digest/std"]
-asm = ["sha1-asm"]
+asm = ["sha1-asm", "libc"]
 
-# TODO: Remove this feature once is_aarch64_feature_detected!() is stabilised.
-# Only used on AArch64 Linux systems, when built without the crypto target_feature.
-asm-aarch64 = ["asm", "libc"]
+# DEPRECATED: use `asm` instead
+asm-aarch64 = ["asm"]
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index b595199ba..51a4cdc87 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -1,15 +1,12 @@
 use digest::consts::U64;
 use digest::generic_array::GenericArray;
 
-mod aarch64;
-#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
-mod soft;
-mod x86;
-
 cfg_if::cfg_if! {
-    if #[cfg(feature = "asm-aarch64")] {
+    if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+        mod soft;
+        mod aarch64;
         use aarch64::compress as compress_inner;
-    } else if #[cfg(feature = "asm")] {
+    } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
         // TODO: replace after sha1-asm rework
         fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
             for block in blocks {
@@ -17,8 +14,11 @@ cfg_if::cfg_if! {
         }
     }
     } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        mod soft;
+        mod x86;
         use x86::compress as compress_inner;
     } else {
+        mod soft;
         use soft::compress as compress_inner;
     }
 }
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index da93d5549..a83425027 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -28,32 +28,6 @@
 #![deny(unsafe_code)]
 #![warn(missing_docs, rust_2018_idioms)]
 
-// Give relevant error messages if the user tries to enable AArch64 asm on unsupported platforms.
-#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - not(target_os = "linux") -))] -compile_error!("Your OS isn’t yet supported for runtime-checking of AArch64 features."); - -#[cfg(all(feature = "asm-aarch64", not(target_arch = "aarch64")))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" on non-AArch64 systems."); -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - target_feature = "crypto" -))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when building for AArch64 systems with crypto extensions."); - -#[cfg(all( - not(feature = "asm-aarch64"), - feature = "asm", - target_arch = "aarch64", - not(target_feature = "crypto"), - target_os = "linux" -))] -compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); - #[cfg(feature = "std")] extern crate std; @@ -64,7 +38,6 @@ use crate::compress::compress; use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; use digest::consts::{U20, U64}; -use digest::impl_write; pub use digest::{self, Digest}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; @@ -123,4 +96,4 @@ impl Reset for Sha1 { } opaque_debug::impl_opaque_debug!(Sha1); -impl_write!(Sha1); +digest::impl_write!(Sha1); diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index ec690a63e..b4b9b4227 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -34,5 +34,5 @@ std = ["digest/std"] asm = ["sha2-asm", "libc"] compress = [] -# DEPRECATED: use `asm` isntead +# DEPRECATED: use `asm` instead asm-aarch64 = ["asm"] From 979863d0d4b9b36bda0b11bb7ba73a391049c755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 15:59:11 +0300 Subject: [PATCH 07/14] remove compress modules --- sha2/src/lib.rs | 10 +++--- sha2/src/sha256.rs | 32 ++++++++++++++++++- .../{sha256_compress => sha256}/aarch64.rs | 0 sha2/src/{sha256_compress => sha256}/soft.rs | 0 sha2/src/{sha256_compress => sha256}/x86.rs | 0 sha2/src/sha256_compress.rs | 32 ------------------- sha2/src/sha512.rs | 25 +++++++++++++-- sha2/src/{sha512_compress => sha512}/soft.rs | 0 sha2/src/sha512_compress.rs | 24 -------------- 9 files changed, 58 insertions(+), 65 deletions(-) rename sha2/src/{sha256_compress => sha256}/aarch64.rs (100%) rename sha2/src/{sha256_compress => sha256}/soft.rs (100%) rename sha2/src/{sha256_compress => sha256}/x86.rs (100%) delete mode 100644 sha2/src/sha256_compress.rs rename sha2/src/{sha512_compress => sha512}/soft.rs (100%) delete mode 100644 sha2/src/sha512_compress.rs diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index eccc78e67..9b804a471 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -61,14 +61,12 @@ extern crate std; mod consts; mod sha256; -mod sha256_compress; mod sha512; -mod sha512_compress; -pub use crate::sha256::{Sha224, Sha256}; -pub use crate::sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; +pub use sha256::{Sha224, Sha256}; +pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] -pub use sha256_compress::compress256; +pub use sha256::compress256; #[cfg(feature = "compress")] -pub use sha512_compress::compress512; +pub use sha512::compress512; diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index dc741f61a..fdd5d5b69 100644 --- 
a/sha2/src/sha256.rs
+++ b/sha2/src/sha256.rs
@@ -1,9 +1,9 @@
//! SHA-256
use crate::consts::{H224, H256, STATE_LEN};
-use crate::sha256_compress::compress256;
use block_buffer::BlockBuffer;
use core::slice::from_ref;
use digest::consts::{U28, U32, U64};
+use digest::generic_array::GenericArray;
use digest::{BlockInput, FixedOutputDirty, Reset, Update};

type BlockSize = U64;
@@ -137,3 +137,33 @@ opaque_debug::impl_opaque_debug!(Sha256);

digest::impl_write!(Sha224);
digest::impl_write!(Sha256);
+
+cfg_if::cfg_if! {
+ if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+ mod soft;
+ mod aarch64;
+ use aarch64::compress;
+ } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+ // TODO: replace after sha2-asm rework
+ fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ for block in blocks {
+ sha2_asm::compress256(state, block);
+ }
+ }
+ } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ mod soft;
+ mod x86;
+ use x86::compress;
+ } else {
+ mod soft;
+ use soft::compress;
+ }
+}
+
+pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
+ // SAFETY: GenericArray<u8, U64> and [u8; 64] have
+ // exactly the same memory layout
+ #[allow(unsafe_code)]
+ let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+ compress(state, blocks)
+}
diff --git a/sha2/src/sha256_compress/aarch64.rs b/sha2/src/sha256/aarch64.rs
similarity index 100%
rename from sha2/src/sha256_compress/aarch64.rs
rename to sha2/src/sha256/aarch64.rs
diff --git a/sha2/src/sha256_compress/soft.rs b/sha2/src/sha256/soft.rs
similarity index 100%
rename from sha2/src/sha256_compress/soft.rs
rename to sha2/src/sha256/soft.rs
diff --git a/sha2/src/sha256_compress/x86.rs b/sha2/src/sha256/x86.rs
similarity index 100%
rename from sha2/src/sha256_compress/x86.rs
rename to sha2/src/sha256/x86.rs
diff --git a/sha2/src/sha256_compress.rs b/sha2/src/sha256_compress.rs
deleted file mode 100644
index b0b52cb4d..000000000
--- a/sha2/src/sha256_compress.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use digest::consts::U64;
-use digest::generic_array::GenericArray;
-
-cfg_if::cfg_if!
{
- if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
- mod soft;
- mod aarch64;
- use aarch64::compress;
- } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
- // TODO: replace after sha2-asm rework
- fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
- for block in blocks {
- sha2_asm::compress256(state, block);
- }
- }
- } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
- mod soft;
- mod x86;
- use x86::compress;
- } else {
- mod soft;
- use soft::compress;
- }
-}
-
-pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
- // SAFETY: GenericArray<u8, U64> and [u8; 64] have
- // exactly the same memory layout
- #[allow(unsafe_code)]
- let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
- compress(state, blocks)
-}
diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
index a49773222..63471f683 100644
--- a/sha2/src/sha512.rs
+++ b/sha2/src/sha512.rs
@@ -3,10 +3,9 @@
use crate::consts::{H384, H512, H512_TRUNC_224, H512_TRUNC_256, STATE_LEN};
use block_buffer::BlockBuffer;
use core::slice::from_ref;
use digest::consts::{U128, U28, U32, U48, U64};
+use digest::generic_array::GenericArray;
use digest::{BlockInput, FixedOutputDirty, Reset, Update};

-use crate::sha512_compress::compress512;
-
type BlockSize = U128;
@@ -227,3 +226,25 @@ digest::impl_write!(Sha384);
digest::impl_write!(Sha512);
digest::impl_write!(Sha512Trunc224);
digest::impl_write!(Sha512Trunc256);
+
+cfg_if::cfg_if! {
+ if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+ // TODO: replace after sha2-asm rework
+ fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+ for block in blocks {
+ sha2_asm::compress512(state, block);
+ }
+ }
+ } else {
+ mod soft;
+ use soft::compress;
+ }
+}
+
+pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
+ // SAFETY: GenericArray<u8, U128> and [u8; 128] have
+ // exactly the same memory layout
+ #[allow(unsafe_code)]
+ let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
+ compress(state, blocks)
+}
diff --git a/sha2/src/sha512_compress/soft.rs b/sha2/src/sha512/soft.rs
similarity index 100%
rename from sha2/src/sha512_compress/soft.rs
rename to sha2/src/sha512/soft.rs
diff --git a/sha2/src/sha512_compress.rs b/sha2/src/sha512_compress.rs
deleted file mode 100644
index baa6b2765..000000000
--- a/sha2/src/sha512_compress.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use digest::consts::U128;
-use digest::generic_array::GenericArray;
-
-cfg_if::cfg_if!
{
- if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
- // TODO: replace after sha2-asm rework
- fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
- for block in blocks {
- sha2_asm::compress512(state, block);
- }
- }
- } else {
- mod soft;
- use soft::compress;
- }
-}
-
-pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
- // SAFETY: GenericArray<u8, U128> and [u8; 128] have
- // exactly the same memory layout
- #[allow(unsafe_code)]
- let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
- compress(state, blocks)
-}
From 1da5fe9cfdfbd1f6ad2b9ceb1f7cfb8f748275e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 16:15:49 +0300
Subject: [PATCH 08/14] simplify rounds
---
sha2/src/sha256/x86.rs | 69 +++++++++++++++++++++---------------------
1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs
index b9ed6ce0d..0822d942d 100644
--- a/sha2/src/sha256/x86.rs
+++ b/sha2/src/sha256/x86.rs
@@ -31,7 +31,7 @@ unsafe fn add_k(v: __m128i, i: usize) -> __m128i {
_mm_add_epi32(v, t)
}

-unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i, ) -> __m128i {
+unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
let t1 = _mm_sha256msg1_epu32(v0, v1);
let t2 = _mm_alignr_epi8(v3, v2, 4);
let t3 = _mm_add_epi32(t1, t2);
@@ -39,9 +39,22 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i, ) -> __m1
}

macro_rules! rounds4 {
- ($abef:ident, $cdgh:ident, $rest:expr) => {{
- $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, $rest);
- $abef = _mm_sha256rnds2_epu32($abef, $cdgh, _mm_shuffle_epi32($rest, 0x0E));
+ ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
+ let t1 = add_k($rest, $i);
+ $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
+ let t2 = _mm_shuffle_epi32(t1, 0x0E);
+ $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
+ }};
+}
+
+macro_rules!
schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i) }}; } @@ -73,36 +86,24 @@ unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); let mut w3 = _mm_shuffle_epi8( _mm_loadu_si128(data_ptr.add(3)), MASK); - - rounds4!(abef, cdgh, add_k(w0, 0)); - rounds4!(abef, cdgh, add_k(w1, 1)); - rounds4!(abef, cdgh, add_k(w2, 2)); - rounds4!(abef, cdgh, add_k(w3, 3)); - - let mut w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 4)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 5)); - w1 = schedule(w2, w3, w4, w0); - rounds4!(abef, cdgh, add_k(w1, 6)); - w2 = schedule(w3, w4, w0, w1); - rounds4!(abef, cdgh, add_k(w2, 7)); - w3 = schedule(w4, w0, w1, w2); - rounds4!(abef, cdgh, add_k(w3, 8)); - w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 9)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 10)); - w1 = schedule(w2, w3, w4, w0); - rounds4!(abef, cdgh, add_k(w1, 11)); - w2 = schedule(w3, w4, w0, w1); - rounds4!(abef, cdgh, add_k(w2, 12)); - w3 = schedule(w4, w0, w1, w2); - rounds4!(abef, cdgh, add_k(w3, 13)); - w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 14)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 15)); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); abef = _mm_add_epi32(abef, abef_save); cdgh = _mm_add_epi32(cdgh, cdgh_save); From 8808c35c7531e561ddc9c73d62fdb04e32a9e627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:17:29 +0300 Subject: [PATCH 09/14] fmt --- sha2/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 9b804a471..08829eab2 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -63,10 +63,10 @@ mod consts; mod sha256; mod sha512; -pub use sha256::{Sha224, Sha256}; -pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] pub use sha256::compress256; +pub use sha256::{Sha224, Sha256}; #[cfg(feature = "compress")] pub use sha512::compress512; +pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; From aedab56c58234677ca4cedf7e87e191c927a1793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:26:04 +0300 Subject: [PATCH 10/14] 
additional simplifications --- sha2/src/sha256/soft.rs | 89 +++++++++++++++++++++-------------------- sha2/src/sha256/x86.rs | 14 +++---- 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/sha2/src/sha256/soft.rs b/sha2/src/sha256/soft.rs index fe133cd80..d7be01dc6 100644 --- a/sha2/src/sha256/soft.rs +++ b/sha2/src/sha256/soft.rs @@ -1,5 +1,5 @@ #![allow(clippy::many_single_char_names)] -use crate::consts::{BLOCK_LEN, K32X4}; +use crate::consts::BLOCK_LEN; use core::convert::TryInto; #[inline(always)] @@ -134,61 +134,62 @@ fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; [a2, b2, e2, f2] } -/// Process a block with the SHA-256 algorithm. -fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { - let k = &K32X4; +fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + let t1 = sha256msg1(v0, v1); + let t2 = sha256load(v2, v3); + let t3 = add(t1, t2); + sha256msg2(t3, v3) +} - macro_rules! schedule { - ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { - sha256msg2(add(sha256msg1($v0, $v1), sha256load($v2, $v3)), $v3) - }; - } +macro_rules! rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let t1 = add($rest, crate::consts::K32X4[$i]); + $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); + let t2 = sha256swap(t1); + $abef = sha256_digest_round_x2($abef, $cdgh, t2); + }}; +} - macro_rules! rounds4 { - ($abef:ident, $cdgh:ident, $rest:expr) => {{ - $cdgh = sha256_digest_round_x2($cdgh, $abef, $rest); - $abef = sha256_digest_round_x2($abef, $cdgh, sha256swap($rest)); - }}; - } +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} +/// Process a block with the SHA-256 algorithm. 
+fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { let mut abef = [state[0], state[1], state[4], state[5]]; let mut cdgh = [state[2], state[3], state[6], state[7]]; // Rounds 0..64 let mut w0 = [block[3], block[2], block[1], block[0]]; - rounds4!(abef, cdgh, add(k[0], w0)); let mut w1 = [block[7], block[6], block[5], block[4]]; - rounds4!(abef, cdgh, add(k[1], w1)); let mut w2 = [block[11], block[10], block[9], block[8]]; - rounds4!(abef, cdgh, add(k[2], w2)); let mut w3 = [block[15], block[14], block[13], block[12]]; - rounds4!(abef, cdgh, add(k[3], w3)); - - let mut w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[4], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[5], w0)); - w1 = schedule!(w2, w3, w4, w0); - rounds4!(abef, cdgh, add(k[6], w1)); - w2 = schedule!(w3, w4, w0, w1); - rounds4!(abef, cdgh, add(k[7], w2)); - w3 = schedule!(w4, w0, w1, w2); - rounds4!(abef, cdgh, add(k[8], w3)); - w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[9], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[10], w0)); - w1 = schedule!(w2, w3, w4, w0); - rounds4!(abef, cdgh, add(k[11], w1)); - w2 = schedule!(w3, w4, w0, w1); - rounds4!(abef, cdgh, add(k[12], w2)); - w3 = schedule!(w4, w0, w1, w2); - rounds4!(abef, cdgh, add(k[13], w3)); - w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[14], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[15], w0)); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); let [a, b, e, f] = abef; let [c, d, g, h] = cdgh; diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 0822d942d..8c84fd2ab 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -12,7 +12,7 @@ use core::arch::x86::*; target_feature = "sse4.1", )))] fn is_supported() -> bool { - true + false } #[cfg(all( @@ -25,12 +25,6 @@ fn is_supported() -> bool { true } -unsafe fn add_k(v: __m128i, i: usize) -> __m128i { - let k = &crate::consts::K32X4[i]; - let t = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); - _mm_add_epi32(v, t) -} - unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { let t1 = _mm_sha256msg1_epu32(v0, v1); let t2 = _mm_alignr_epi8(v3, v2, 4); @@ -40,7 +34,9 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128 macro_rules! 
rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let t1 = add_k($rest, $i); + let k = &crate::consts::K32X4[$i]; + let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); + let t1 = _mm_add_epi32($rest, kv); $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); let t2 = _mm_shuffle_epi32(t1, 0x0E); $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2); @@ -54,7 +50,7 @@ macro_rules! schedule_rounds4 { $i: expr ) => {{ $w4 = schedule($w0, $w1, $w2, $w3); - rounds4!($abef, $cdgh, $w4, $i) + rounds4!($abef, $cdgh, $w4, $i); }}; } From 22ee1dd769456d1fe20aa774fd413aa2d64eee40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:55:20 +0300 Subject: [PATCH 11/14] simplify sha1 compression --- sha1/src/compress/soft.rs | 95 +++++++++++++++++++-------------------- sha1/src/compress/x86.rs | 70 ++++++++++++++--------------- 2 files changed, 80 insertions(+), 85 deletions(-) diff --git a/sha1/src/compress/soft.rs b/sha1/src/compress/soft.rs index 94a019b98..19366fb13 100644 --- a/sha1/src/compress/soft.rs +++ b/sha1/src/compress/soft.rs @@ -175,68 +175,67 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } +macro_rules! rounds4 { + ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { + sha1_digest_round_x4($h0, sha1_first_half($h1, $wk), $i) + }; +} + +macro_rules! schedule { + ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { + sha1msg2(xor(sha1msg1($v0, $v1), $v2), $v3) + }; +} + +macro_rules! schedule_rounds4 { + ( + $h0:ident, $h1:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i:expr + ) => { + $w4 = schedule!($w0, $w1, $w2, $w3); + $h1 = rounds4!($h0, $h1, $w4, $i); + }; +} + #[inline(always)] fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { - macro_rules! schedule { - ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { - sha1msg2(xor(sha1msg1($v0, $v1), $v2), $v3) - }; - } + let mut w0 = [block[0], block[1], block[2], block[3]]; + let mut w1 = [block[4], block[5], block[6], block[7]]; + let mut w2 = [block[8], block[9], block[10], block[11]]; + let mut w3 = [block[12], block[13], block[14], block[15]]; + let mut w4; - macro_rules! 
rounds4 { - ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { - sha1_digest_round_x4($h0, sha1_first_half($h1, $wk), $i) - }; - } + let mut h0 = [state[0], state[1], state[2], state[3]]; + let mut h1 = sha1_first_add(state[4], w0); // Rounds 0..20 - let mut h0 = [state[0], state[1], state[2], state[3]]; - let mut w0 = [block[0], block[1], block[2], block[3]]; - let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0); - let mut w1 = [block[4], block[5], block[6], block[7]]; + h1 = sha1_digest_round_x4(h0, h1, 0); h0 = rounds4!(h1, h0, w1, 0); - let mut w2 = [block[8], block[9], block[10], block[11]]; h1 = rounds4!(h0, h1, w2, 0); - let mut w3 = [block[12], block[13], block[14], block[15]]; h0 = rounds4!(h1, h0, w3, 0); - let mut w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 0); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0); // Rounds 20..40 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 1); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 1); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 1); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 1); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 1); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1); // Rounds 40..60 - w0 = schedule!(w1, w2, w3, w4); - h1 = rounds4!(h0, h1, w0, 2); - w1 = schedule!(w2, w3, w4, w0); - h0 = rounds4!(h1, h0, w1, 2); - w2 = schedule!(w3, w4, w0, w1); - h1 = rounds4!(h0, h1, w2, 2); - w3 = schedule!(w4, w0, w1, w2); - h0 = rounds4!(h1, h0, w3, 2); - w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 2); + schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2); + schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2); + schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2); + schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2); // Rounds 60..80 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 3); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 3); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 3); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 3); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 3); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3); let e = h1[0].rotate_left(30); let [a, b, c, d] = h0; diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index fd8eebfaf..a4c7c91e7 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -38,6 +38,17 @@ macro_rules! schedule { }; } +macro_rules! 
schedule_rounds4 { + ( + $h0:ident, $h1:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i:expr + ) => { + $w4 = schedule!($w0, $w1, $w2, $w3); + $h1 = rounds4!($h0, $h1, $w4, $i); + }; +} + #[target_feature(enable = "sha,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] @@ -56,57 +67,42 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(clippy::cast_ptr_alignment)] let block_ptr = block.as_ptr() as *const __m128i; - let h0 = state_abcd; - let e0 = state_e; - let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK); let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK); let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK); let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK); + let mut w4; + + let mut h0 = state_abcd; + let mut h1 = _mm_add_epi32(state_e, w0); // Rounds 0..20 - let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0); - let mut h0 = rounds4!(h1, h0, w1, 0); + h1 = _mm_sha1rnds4_epu32(h0, h1, 0); + h0 = rounds4!(h1, h0, w1, 0); h1 = rounds4!(h0, h1, w2, 0); h0 = rounds4!(h1, h0, w3, 0); - let mut w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 0); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0); // Rounds 20..40 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 1); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 1); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 1); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 1); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 1); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1); // Rounds 40..60 - w0 = schedule!(w1, w2, w3, w4); - h1 = rounds4!(h0, h1, w0, 2); - w1 = schedule!(w2, w3, w4, w0); - h0 = rounds4!(h1, h0, w1, 2); - w2 = schedule!(w3, w4, w0, w1); - h1 = rounds4!(h0, h1, w2, 2); - w3 = schedule!(w4, w0, w1, w2); - h0 = rounds4!(h1, h0, w3, 2); - w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 2); + schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2); + schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2); + schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2); + schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2); // Rounds 60..80 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 3); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 3); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 3); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 3); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 3); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3); state_abcd = _mm_add_epi32(state_abcd, h0); state_e = _mm_sha1nexte_epu32(h1, state_e); From dd960b125192e2da918acde69ddb64dc2752f613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 17:02:48 +0300 Subject: [PATCH 12/14] fix signature --- sha1/src/compress.rs | 2 +- 
sha2/src/sha256/x86.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 51a4cdc87..d00dbd1a0 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -8,7 +8,7 @@ cfg_if::cfg_if! { use aarch64::compress as compress_inner; } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] { // TODO: replace after sha1-asm rework - fn compress_inner(state: &mut [u32; 5], blocks: &[u8; 64]) { + fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) { for block in blocks { sha1_asm::compress(state, block); } diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 8c84fd2ab..90e5d7381 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -34,7 +34,7 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128 macro_rules! rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let k = &crate::consts::K32X4[$i]; + let k = crate::consts::K32X4[$i]; let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); let t1 = _mm_add_epi32($rest, kv); $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); From b49a5270eed002ea5d496b06c72eb8579893d51e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 22:02:57 +0300 Subject: [PATCH 13/14] add runtime detection --- Cargo.lock | 8 ++++++++ sha1/Cargo.toml | 1 + sha1/src/compress/x86.rs | 24 ++---------------------- sha2/Cargo.toml | 1 + sha2/src/sha256/x86.rs | 24 ++---------------------- 5 files changed, 14 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 659c6a8dd..62658d5ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -53,6 +53,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "cpuid-bool" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d375c433320f6c5057ae04a04376eef4d04ce2801448cf8863a78da99107be4" + [[package]] name = "crypto-mac" version = "0.8.0" @@ -220,6 +226,7 @@ version = "0.9.0" dependencies = [ "block-buffer", "cfg-if", + "cpuid-bool", "digest", "hex-literal", "libc", @@ -242,6 +249,7 @@ version = "0.9.0" dependencies = [ "block-buffer", "cfg-if", + "cpuid-bool", "digest", "hex-literal", "libc", diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 183cd5002..e2241b81a 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" cfg-if = "0.1" +cpuid-bool = "0.1" sha1-asm = { version = "0.4", optional = true } [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index a4c7c91e7..05d90cc98 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -6,26 +6,6 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -#[cfg(not(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -)))] -fn sha1_supported() -> bool { - false -} - -#[cfg(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -))] -fn sha1_supported() -> bool { - true -} - macro_rules! 
rounds4 { ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i) @@ -49,7 +29,7 @@ macro_rules! schedule_rounds4 { }; } -#[target_feature(enable = "sha,ssse3,sse4.1")] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F); @@ -118,7 +98,7 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 // after stabilization - if sha1_supported() { + if cpuid_bool::cpuid_bool!("sha", "sse2", "ssse3", "sse4.1") { unsafe { digest_blocks(state, blocks); } diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index b4b9b4227..c8c8f61ee 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" cfg-if = "0.1" +cpuid-bool = "0.1" sha2-asm = { version = "0.5", optional = true } [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 90e5d7381..04a7d26d0 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -5,26 +5,6 @@ use core::arch::x86_64::*; #[cfg(target_arch = "x86")] use core::arch::x86::*; -#[cfg(not(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -)))] -fn is_supported() -> bool { - false -} - -#[cfg(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -))] -fn is_supported() -> bool { - true -} - unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { let t1 = _mm_sha256msg1_epu32(v0, v1); let t2 = _mm_alignr_epi8(v3, v2, 4); @@ -56,7 +36,7 @@ macro_rules! 
schedule_rounds4 { // we use unaligned loads with `__m128i` pointers #[allow(clippy::cast_ptr_alignment)] -#[target_feature(enable = "sha,ssse3,sse4.1")] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] let MASK: __m128i = _mm_set_epi64x( @@ -118,7 +98,7 @@ unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 // after stabilization - if is_supported() { + if cpuid_bool::cpuid_bool!("sha", "sse2", "ssse3", "sse4.1") { unsafe { digest_blocks(state, blocks); } From 1ee078849e324b4c523b647793c9579d469f9298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 22:11:51 +0300 Subject: [PATCH 14/14] make cpuid-bool x86-only dependency --- sha1/Cargo.toml | 4 +++- sha2/Cargo.toml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 5d5868902..eebb7cb4c 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -19,9 +19,11 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.3" cfg-if = "0.1" -cpuid-bool = "0.1" sha1-asm = { version = "0.4", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +cpuid-bool = "0.1" + [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true } diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 3e049ec50..ac3893458 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -19,9 +19,11 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.3" cfg-if = "0.1" -cpuid-bool = "0.1" sha2-asm = { version = "0.5", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +cpuid-bool = "0.1" + [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true }
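
---

A closing note on the resulting public API: with the `compress` feature enabled, the series leaves `sha2` exporting the block-level `compress256`/`compress512` functions, which now take a slice of blocks so the CPU-feature check and the state copy are amortized over the whole slice. Below is a minimal usage sketch, not code from the patches themselves: the `main` wrapper, the zero-filled input blocks, and the printed output are illustrative assumptions, while `compress256`, the `compress` feature, and the `GenericArray<u8, U64>` block type are as introduced above.

```rust
use sha2::compress256; // exported only when the `compress` feature is enabled
use sha2::digest::consts::U64;
use sha2::digest::generic_array::GenericArray;

fn main() {
    // SHA-256 initialization vector (H256 in sha2/src/consts.rs).
    let mut state: [u32; 8] = [
        0x6a09_e667, 0xbb67_ae85, 0x3c6e_f372, 0xa54f_f53a,
        0x510e_527f, 0x9b05_688c, 0x1f83_d9ab, 0x5be0_cd19,
    ];

    // Two all-zero 64-byte message blocks; passing a slice of blocks is
    // the point of the new signature.
    let block: GenericArray<u8, U64> = GenericArray::clone_from_slice(&[0u8; 64]);
    let blocks = [block.clone(), block];

    compress256(&mut state, &blocks);
    println!("{:08x?}", state);
}
```

Since this drives the raw compression function, no padding or length block is appended, so the resulting state is not the SHA-256 digest of 128 zero bytes; it is only the chaining value after two compressions from the IV.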