From 6e920452c53fb5310de976666c77e6949af5d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 11:21:07 +0300
Subject: [PATCH 01/14] sha1: add x86 intrinsics support

---
 Cargo.lock | 7 ++
 sha1/Cargo.toml | 1 +
 sha1/src/aarch64.rs | 8 --
 sha1/src/compress.rs | 34 +++++++
 sha1/src/compress/aarch64.rs | 21 ++++
 sha1/src/{utils.rs => compress/soft.rs} | 89 ++---------------
 sha1/src/compress/x86.rs | 125 ++++++++++++++++++++++++
 sha1/src/lib.rs | 49 ++--------
 sha1/tests/lib.rs | 10 ++
 9 files changed, 213 insertions(+), 131 deletions(-)
 delete mode 100644 sha1/src/aarch64.rs
 create mode 100644 sha1/src/compress.rs
 create mode 100644 sha1/src/compress/aarch64.rs
 rename sha1/src/{utils.rs => compress/soft.rs} (67%)
 create mode 100644 sha1/src/compress/x86.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8bdfe0138..1db6a343e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -47,6 +47,12 @@ version = "1.0.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
 
+[[package]]
+name = "cfg-if"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+
 [[package]]
 name = "crypto-mac"
 version = "0.8.0"
@@ -213,6 +219,7 @@ name = "sha-1"
 version = "0.9.0"
 dependencies = [
  "block-buffer",
+ "cfg-if",
  "digest",
  "hex-literal",
  "libc",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index aba0700d6..985ef5579 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -20,6 +20,7 @@ block-buffer = "0.9"
 opaque-debug = "0.2"
 sha1-asm = { version = "0.4", optional = true }
 libc = { version = "0.2.68", optional = true }
+cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
diff --git a/sha1/src/aarch64.rs b/sha1/src/aarch64.rs
deleted file mode 100644
index 8d1a916cc..000000000
--- a/sha1/src/aarch64.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
-
-#[inline(always)]
-pub fn sha1_supported() -> bool {
-    #[allow(unsafe_code)]
-    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
-    (hwcaps & HWCAP_SHA1) != 0
-}
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
new file mode 100644
index 000000000..2c298f26a
--- /dev/null
+++ b/sha1/src/compress.rs
@@ -0,0 +1,34 @@
+use digest::generic_array::GenericArray;
+use digest::consts::U64;
+
+#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
+mod soft;
+mod aarch64;
+mod x86;
+
+type Block = GenericArray<u8, U64>;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "asm-aarch64")] {
+        use aarch64::compress as compress_inner;
+    } else if #[cfg(feature = "asm")] {
+        // TODO: replace after sha1-asm rework
+        fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha1_asm::compress(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        use x86::compress as compress_inner;
+    } else {
+        use soft::compress as compress_inner;
+    }
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[Block]) {
+    // SAFETY: GenericArray and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { core::mem::transmute(blocks) };
+    compress_inner(state, blocks);
+}
diff --git a/sha1/src/compress/aarch64.rs b/sha1/src/compress/aarch64.rs
new file mode 100644
index 000000000..85295f052
--- /dev/null
+++ b/sha1/src/compress/aarch64.rs
@@ -0,0 +1,21 @@
+#![cfg(feature = "asm-aarch64")]
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
+
+fn sha1_supported() -> bool {
+    #[allow(unsafe_code)]
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA1) != 0
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
+    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
+    // to let us use it on no_std.
+    if sha1_supported() {
+        for block in blocks {
+            sha1_asm::compress(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/utils.rs b/sha1/src/compress/soft.rs
similarity index 67%
rename from sha1/src/utils.rs
rename to sha1/src/compress/soft.rs
index 1d746fb9f..77907b47b 100644
--- a/sha1/src/utils.rs
+++ b/sha1/src/compress/soft.rs
@@ -1,10 +1,6 @@
 #![allow(clippy::many_single_char_names)]
 use crate::consts::{BLOCK_LEN, K0, K1, K2, K3};
 use core::convert::TryInto;
-use digest::generic_array::typenum::U64;
-use digest::generic_array::GenericArray;
-
-type Block = GenericArray<u8, U64>;
 
 #[inline(always)]
 fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
@@ -21,27 +17,18 @@ fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
 }
 
-/// Not an intrinsic, but gets the first element of a vector.
-#[inline]
-pub fn sha1_first(w0: [u32; 4]) -> u32 {
-    w0[0]
-}
-
-/// Not an intrinsic, but adds a word to the first element of a vector.
 #[inline]
 pub fn sha1_first_add(e: u32, w0: [u32; 4]) -> [u32; 4] {
     let [a, b, c, d] = w0;
     [e.wrapping_add(a), b, c, d]
 }
 
-/// Emulates `llvm.x86.sha1msg1` intrinsic.
 fn sha1msg1(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [_, _, w2, w3] = a;
     let [w4, w5, _, _] = b;
     [a[0] ^ w2, a[1] ^ w3, a[2] ^ w4, a[3] ^ w5]
 }
 
-/// Emulates `llvm.x86.sha1msg2` intrinsic.
 fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [x0, x1, x2, x3] = a;
     let [_, w13, w14, w15] = b;
@@ -54,21 +41,11 @@ fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [w16, w17, w18, w19]
 }
 
-/// Performs 4 rounds of the message schedule update.
-/*
-pub fn sha1_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha1msg2(sha1msg1(v0, v1) ^ v2, v3)
-}
-*/
-
-/// Emulates `llvm.x86.sha1nexte` intrinsic.
 #[inline]
 fn sha1_first_half(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
-    sha1_first_add(sha1_first(abcd).rotate_left(30), msg)
+    sha1_first_add(abcd[0].rotate_left(30), msg)
 }
 
-/// Emulates `llvm.x86.sha1rnds4` intrinsic.
-/// Performs 4 rounds of the message block digest.
fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] { const K0V: [u32; 4] = [K0, K0, K0, K0]; const K1V: [u32; 4] = [K1, K1, K1, K1]; @@ -84,7 +61,6 @@ fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] { } } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -123,7 +99,6 @@ fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -162,7 +137,6 @@ fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } -/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic. fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { let [mut a, mut b, mut c, mut d] = abcd; let [t, u, v, w] = msg; @@ -216,7 +190,6 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { } // Rounds 0..20 - // TODO: replace with `[u32; 4]::load` let mut h0 = [state[0], state[1], state[2], state[3]]; let mut w0 = [block[0], block[1], block[2], block[3]]; let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0); @@ -265,7 +238,7 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { w4 = schedule!(w0, w1, w2, w3); h0 = rounds4!(h1, h0, w4, 3); - let e = sha1_first(h1).rotate_left(30); + let e = h1[0].rotate_left(30); let [a, b, c, d] = h0; state[0] = state[0].wrapping_add(a); @@ -275,58 +248,12 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { state[4] = state[4].wrapping_add(e); } -/// Process a block with the SHA-1 algorithm. (See more...) -/// -/// SHA-1 is a cryptographic hash function, and as such, it operates -/// on an arbitrary number of bytes. This function operates on a fixed -/// number of bytes. If you call this function with anything other than -/// 64 bytes, then it will panic! This function takes two arguments: -/// -/// * `state` is reference to an **array** of 5 words. -/// * `block` is reference to a **slice** of 64 bytes. -/// -/// If you want the function that performs a message digest on an arbitrary -/// number of bytes, then see also the `Sha1` struct above. -/// -/// # Implementation -/// -/// First, some background. Both ARM and Intel are releasing documentation -/// that they plan to include instruction set extensions for SHA1 and SHA256 -/// sometime in the near future. Second, LLVM won't lower these intrinsics yet, -/// so these functions were written emulate these instructions. Finally, -/// the block function implemented with these emulated intrinsics turned out -/// to be quite fast! What follows is a discussion of this CPU-level view -/// of the SHA-1 algorithm and how it relates to the mathematical definition. -/// -/// The SHA instruction set extensions can be divided up into two categories: -/// -/// * message work schedule update calculation ("schedule" v., "work" n.) -/// * message block 80-round digest calculation ("digest" v., "block" n.) -/// -/// The schedule-related functions can be used to easily perform 4 rounds -/// of the message work schedule update calculation, as shown below: -/// -/// ```ignore -/// macro_rules! schedule_x4 { -/// ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => ( -/// sha1msg2(sha1msg1($v0, $v1) ^ $v2, $v3) -/// ) -/// } -/// -/// macro_rules! 
round_x4 { -/// ($h0:ident, $h1:ident, $wk:expr, $i:expr) => ( -/// sha1rnds4($h0, sha1_first_half($h1, $wk), $i) -/// ) -/// } -/// ``` -/// -/// and also shown above is how the digest-related functions can be used to -/// perform 4 rounds of the message block digest calculation. -/// -pub fn compress(state: &mut [u32; 5], block: &Block) { +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut block_u32 = [0u32; BLOCK_LEN]; - for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { - *o = u32::from_be_bytes(chunk.try_into().unwrap()); + for block in blocks.iter() { + for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { + *o = u32::from_be_bytes(chunk.try_into().unwrap()); + } + sha1_digest_block_u32(state, &block_u32); } - sha1_digest_block_u32(state, &block_u32); } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs new file mode 100644 index 000000000..ced7d21fa --- /dev/null +++ b/sha1/src/compress/x86.rs @@ -0,0 +1,125 @@ +#![cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#![allow(unsafe_code)] + +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; +#[cfg(target_arch = "x86")] +use core::arch::x86::*; + +#[cfg(not(target_feature = "sha"))] +fn sha1_supported() -> bool { + true +} + +#[cfg(target_feature = "sha")] +fn sha1_supported() -> bool { + true +} + +macro_rules! rounds4 { + ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { + _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i) + }; +} + +macro_rules! schedule { + ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { + _mm_sha1msg2_epu32(_mm_xor_si128(_mm_sha1msg1_epu32($v0, $v1), $v2), $v3) + }; +} + +#[target_feature(enable = "sha,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x(0x0001020304050607, 0x08090a0b0c0d0e0f); + + let mut state_abcd = _mm_set_epi32( + state[0] as i32, + state[1] as i32, + state[2] as i32, + state[3] as i32, + ); + let mut state_e = _mm_set_epi32( + state[4] as i32, + 0, + 0, + 0, + ); + + for block in blocks { + let block_ptr = block.as_ptr() as *const __m128i; + + let h0 = state_abcd; + let e0 = state_e; + + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK); + let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK); + + // Rounds 0..20 + let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0); + let mut h0 = rounds4!(h1, h0, w1, 0); + h1 = rounds4!(h0, h1, w2, 0); + h0 = rounds4!(h1, h0, w3, 0); + let mut w4 = schedule!(w0, w1, w2, w3); + h1 = rounds4!(h0, h1, w4, 0); + + // Rounds 20..40 + w0 = schedule!(w1, w2, w3, w4); + h0 = rounds4!(h1, h0, w0, 1); + w1 = schedule!(w2, w3, w4, w0); + h1 = rounds4!(h0, h1, w1, 1); + w2 = schedule!(w3, w4, w0, w1); + h0 = rounds4!(h1, h0, w2, 1); + w3 = schedule!(w4, w0, w1, w2); + h1 = rounds4!(h0, h1, w3, 1); + w4 = schedule!(w0, w1, w2, w3); + h0 = rounds4!(h1, h0, w4, 1); + + // Rounds 40..60 + w0 = schedule!(w1, w2, w3, w4); + h1 = rounds4!(h0, h1, w0, 2); + w1 = schedule!(w2, w3, w4, w0); + h0 = rounds4!(h1, h0, w1, 2); + w2 = schedule!(w3, w4, w0, w1); + h1 = rounds4!(h0, h1, w2, 2); + w3 = schedule!(w4, w0, w1, w2); + h0 = rounds4!(h1, h0, w3, 2); + w4 = schedule!(w0, w1, w2, w3); + h1 = rounds4!(h0, h1, w4, 2); + + // Rounds 60..80 + w0 = schedule!(w1, w2, w3, w4); + h0 = rounds4!(h1, h0, w0, 
3); + w1 = schedule!(w2, w3, w4, w0); + h1 = rounds4!(h0, h1, w1, 3); + w2 = schedule!(w3, w4, w0, w1); + h0 = rounds4!(h1, h0, w2, 3); + w3 = schedule!(w4, w0, w1, w2); + h1 = rounds4!(h0, h1, w3, 3); + w4 = schedule!(w0, w1, w2, w3); + h0 = rounds4!(h1, h0, w4, 3); + + state_abcd = _mm_add_epi32(state_abcd, h0); + state_e = _mm_sha1nexte_epu32(h1, state_e); + } + + state[0] = _mm_extract_epi32(state_abcd, 3) as u32; + state[1] = _mm_extract_epi32(state_abcd, 2) as u32; + state[2] = _mm_extract_epi32(state_abcd, 1) as u32; + state[3] = _mm_extract_epi32(state_abcd, 0) as u32; + state[4] = _mm_extract_epi32(state_e, 3) as u32; +} + +pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if sha1_supported() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index abe0bb492..f7e1f97aa 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -54,32 +54,20 @@ compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when build ))] compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); -#[macro_use] -extern crate opaque_debug; -#[cfg(feature = "asm")] -extern crate sha1_asm; #[cfg(feature = "std")] extern crate std; -#[cfg(feature = "asm-aarch64")] -mod aarch64; mod consts; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] -mod utils; +mod compress; -pub use digest::{self, Digest}; - -use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; +pub use digest::{self, Digest}; use digest::consts::{U20, U64}; use digest::impl_write; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; +use crate::consts::{H, STATE_LEN}; +use crate::compress::compress; -#[cfg(not(feature = "asm"))] -use crate::utils::compress; - -#[cfg(feature = "asm")] -use digest::generic_array::GenericArray; /// Structure representing the state of a SHA-1 computation #[derive(Clone)] @@ -109,7 +97,7 @@ impl Update for Sha1 { // Assumes that `length_bits<<3` will not overflow self.len += input.len() as u64; let state = &mut self.h; - self.buffer.input_block(input, |d| compress(state, d)); + self.buffer.input_blocks(input, |d| compress(state, d)); } } @@ -119,7 +107,7 @@ impl FixedOutputDirty for Sha1 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { let s = &mut self.h; let l = self.len << 3; - self.buffer.len64_padding_be(l, |d| compress(s, d)); + self.buffer.len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } @@ -134,28 +122,5 @@ impl Reset for Sha1 { } } -#[cfg(all(feature = "asm", not(feature = "asm-aarch64")))] -#[inline(always)] -fn compress(state: &mut [u32; 5], block: &GenericArray) { - #[allow(unsafe_code)] - let block: &[u8; 64] = unsafe { core::mem::transmute(block) }; - sha1_asm::compress(state, block); -} - -#[cfg(feature = "asm-aarch64")] -#[inline(always)] -fn compress(state: &mut [u32; 5], block: &GenericArray) { - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. 
- if aarch64::sha1_supported() { - #[allow(unsafe_code)] - let block: &[u8; 64] = unsafe { core::mem::transmute(block) }; - sha1_asm::compress(state, block); - } else { - utils::compress(state, block); - } -} - -impl_opaque_debug!(Sha1); +opaque_debug::impl_opaque_debug!(Sha1); impl_write!(Sha1); diff --git a/sha1/tests/lib.rs b/sha1/tests/lib.rs index c7452c902..2af982ad1 100644 --- a/sha1/tests/lib.rs +++ b/sha1/tests/lib.rs @@ -10,3 +10,13 @@ fn sha1_1million_a() { let output = include_bytes!("data/one_million_a.bin"); one_million_a::(output); } + +#[test] +fn foo() { + use digest::Digest; + let msg = [0x10; 64]; + let res = sha1::Sha1::digest(&msg); + assert_eq!(res.as_slice(), &[ + 168, 179, 203, 62, 143, 158, 186, 31, 28, 98, 170, 152, 153, 17, 169, 72, 151, 49, 99, 53 + ]); +} \ No newline at end of file From b7e621ac758560dd300269f6d8b9c4f5348b2ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 11:36:23 +0300 Subject: [PATCH 02/14] fix fmt --- sha1/src/compress.rs | 4 ++-- sha1/src/compress/x86.rs | 13 ++++--------- sha1/src/lib.rs | 12 ++++++------ sha1/tests/lib.rs | 10 ---------- 4 files changed, 12 insertions(+), 27 deletions(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 2c298f26a..843d92494 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -1,9 +1,9 @@ -use digest::generic_array::GenericArray; use digest::consts::U64; +use digest::generic_array::GenericArray; +mod aarch64; #[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))] mod soft; -mod aarch64; mod x86; type Block = GenericArray; diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index ced7d21fa..3599814d6 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -1,10 +1,10 @@ #![cfg(any(target_arch = "x86", target_arch = "x86_64"))] #![allow(unsafe_code)] -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; #[cfg(target_arch = "x86")] use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; #[cfg(not(target_feature = "sha"))] fn sha1_supported() -> bool { @@ -31,7 +31,7 @@ macro_rules! 
schedule { #[target_feature(enable = "sha,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] - let MASK: __m128i = _mm_set_epi64x(0x0001020304050607, 0x08090a0b0c0d0e0f); + let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F); let mut state_abcd = _mm_set_epi32( state[0] as i32, @@ -39,12 +39,7 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { state[2] as i32, state[3] as i32, ); - let mut state_e = _mm_set_epi32( - state[4] as i32, - 0, - 0, - 0, - ); + let mut state_e = _mm_set_epi32(state[4] as i32, 0, 0, 0); for block in blocks { let block_ptr = block.as_ptr() as *const __m128i; diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs index f7e1f97aa..da93d5549 100644 --- a/sha1/src/lib.rs +++ b/sha1/src/lib.rs @@ -57,17 +57,16 @@ compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use #[cfg(feature = "std")] extern crate std; -mod consts; mod compress; +mod consts; +use crate::compress::compress; +use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; -pub use digest::{self, Digest}; use digest::consts::{U20, U64}; use digest::impl_write; +pub use digest::{self, Digest}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -use crate::consts::{H, STATE_LEN}; -use crate::compress::compress; - /// Structure representing the state of a SHA-1 computation #[derive(Clone)] @@ -107,7 +106,8 @@ impl FixedOutputDirty for Sha1 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { let s = &mut self.h; let l = self.len << 3; - self.buffer.len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); + self.buffer + .len64_padding_be(l, |d| compress(s, core::slice::from_ref(d))); for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } diff --git a/sha1/tests/lib.rs b/sha1/tests/lib.rs index 2af982ad1..c7452c902 100644 --- a/sha1/tests/lib.rs +++ b/sha1/tests/lib.rs @@ -10,13 +10,3 @@ fn sha1_1million_a() { let output = include_bytes!("data/one_million_a.bin"); one_million_a::(output); } - -#[test] -fn foo() { - use digest::Digest; - let msg = [0x10; 64]; - let res = sha1::Sha1::digest(&msg); - assert_eq!(res.as_slice(), &[ - 168, 179, 203, 62, 143, 158, 186, 31, 28, 98, 170, 152, 153, 17, 169, 72, 151, 49, 99, 53 - ]); -} \ No newline at end of file From 9855b5fb11d1ab5e37ded29438b5f191dd3d69e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 11:43:41 +0300 Subject: [PATCH 03/14] fix clippy warnings --- sha1/src/compress.rs | 4 +++- sha1/src/compress/x86.rs | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 843d92494..ffb051576 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -29,6 +29,8 @@ pub fn compress(state: &mut [u32; 5], blocks: &[Block]) { // SAFETY: GenericArray and [u8; 64] have // exactly the same memory layout #[allow(unsafe_code)] - let blocks: &[[u8; 64]] = unsafe { core::mem::transmute(blocks) }; + let blocks: &[[u8; 64]] = unsafe { + &*(blocks as *const [Block] as *const [[u8; 64]]) + }; compress_inner(state, blocks); } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index 3599814d6..29da145c5 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -42,6 +42,8 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut state_e = 
_mm_set_epi32(state[4] as i32, 0, 0, 0);
 
     for block in blocks {
+        // SAFETY: we use only unaligned loads with this pointer
+        #[allow(clippy::cast_ptr_alignment)]
         let block_ptr = block.as_ptr() as *const __m128i;

From ae0b503da72c7c3f0736ac92ae183c7da2c15b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 13:03:22 +0300
Subject: [PATCH 04/14] minor changes

---
 sha1/src/compress.rs | 8 ++------
 sha1/src/compress/soft.rs | 8 ++++++--
 sha1/src/compress/x86.rs | 16 +++++++++++++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index ffb051576..b595199ba 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -6,8 +6,6 @@ mod aarch64;
 mod soft;
 mod x86;
 
-type Block = GenericArray<u8, U64>;
-
 cfg_if::cfg_if! {
     if #[cfg(feature = "asm-aarch64")] {
         use aarch64::compress as compress_inner;
@@ -25,12 +23,10 @@ cfg_if::cfg_if! {
     }
 }
 
-pub fn compress(state: &mut [u32; 5], blocks: &[Block]) {
+pub fn compress(state: &mut [u32; 5], blocks: &[GenericArray<u8, U64>]) {
     // SAFETY: GenericArray and [u8; 64] have
     // exactly the same memory layout
     #[allow(unsafe_code)]
-    let blocks: &[[u8; 64]] = unsafe {
-        &*(blocks as *const [Block] as *const [[u8; 64]])
-    };
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
     compress_inner(state, blocks);
 }
diff --git a/sha1/src/compress/soft.rs b/sha1/src/compress/soft.rs
index 77907b47b..94a019b98 100644
--- a/sha1/src/compress/soft.rs
+++ b/sha1/src/compress/soft.rs
@@ -175,7 +175,7 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Process a block with the SHA-1 algorithm.
+#[inline(always)]
 fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     macro_rules!
schedule { ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { @@ -250,10 +250,14 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { let mut block_u32 = [0u32; BLOCK_LEN]; + // since LLVM can't properly use aliasing yet it will make + // unnecessary state stores without this copy + let mut state_cpy = *state; for block in blocks.iter() { for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) { *o = u32::from_be_bytes(chunk.try_into().unwrap()); } - sha1_digest_block_u32(state, &block_u32); + sha1_digest_block_u32(&mut state_cpy, &block_u32); } + *state = state_cpy; } diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index 29da145c5..fd8eebfaf 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -6,12 +6,22 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -#[cfg(not(target_feature = "sha"))] +#[cfg(not(all( + target_feature = "sha", + target_feature = "sse2", + target_feature = "ssse3", + target_feature = "sse4.1", +)))] fn sha1_supported() -> bool { - true + false } -#[cfg(target_feature = "sha")] +#[cfg(all( + target_feature = "sha", + target_feature = "sse2", + target_feature = "ssse3", + target_feature = "sse4.1", +))] fn sha1_supported() -> bool { true } From b60c81e71ce3adbc7ce8bdce081c4ca5d555df22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 15:45:12 +0300 Subject: [PATCH 05/14] sha2: add x86 intrinsics support --- Cargo.lock | 1 + sha2/Cargo.toml | 10 +- sha2/src/aarch64.rs | 7 - sha2/src/lib.rs | 39 +----- sha2/src/sha256.rs | 77 +++------- sha2/src/sha256_compress.rs | 32 +++++ sha2/src/sha256_compress/aarch64.rs | 20 +++ .../soft.rs} | 124 ++--------------- sha2/src/sha256_compress/x86.rs | 131 ++++++++++++++++++ sha2/src/sha512.rs | 79 ++++------- sha2/src/sha512_compress.rs | 24 ++++ .../soft.rs} | 114 ++------------- sha2/tests/lib.rs | 2 - 13 files changed, 286 insertions(+), 374 deletions(-) delete mode 100644 sha2/src/aarch64.rs create mode 100644 sha2/src/sha256_compress.rs create mode 100644 sha2/src/sha256_compress/aarch64.rs rename sha2/src/{sha256_utils.rs => sha256_compress/soft.rs} (60%) create mode 100644 sha2/src/sha256_compress/x86.rs create mode 100644 sha2/src/sha512_compress.rs rename sha2/src/{sha512_utils.rs => sha512_compress/soft.rs} (66%) diff --git a/Cargo.lock b/Cargo.lock index 1db6a343e..659c6a8dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -241,6 +241,7 @@ name = "sha2" version = "0.9.0" dependencies = [ "block-buffer", + "cfg-if", "digest", "hex-literal", "libc", diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 7f7b79d8b..ec690a63e 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -18,7 +18,10 @@ categories = ["cryptography", "no-std"] digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" +cfg-if = "0.1" sha2-asm = { version = "0.5", optional = true } + +[target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true } [dev-dependencies] @@ -28,9 +31,8 @@ hex-literal = "0.2" [features] default = ["std"] std = ["digest/std"] -asm = ["sha2-asm"] +asm = ["sha2-asm", "libc"] compress = [] -# TODO: Remove this feature once is_aarch64_feature_detected!() is stabilised. -# Only used on AArch64 Linux systems, when built without the crypto target_feature. 
-asm-aarch64 = ["asm", "libc"] +# DEPRECATED: use `asm` isntead +asm-aarch64 = ["asm"] diff --git a/sha2/src/aarch64.rs b/sha2/src/aarch64.rs deleted file mode 100644 index 7cba76519..000000000 --- a/sha2/src/aarch64.rs +++ /dev/null @@ -1,7 +0,0 @@ -use libc::{getauxval, AT_HWCAP, HWCAP_SHA2}; - -#[inline(always)] -pub fn sha2_supported() -> bool { - let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) }; - (hwcaps & HWCAP_SHA2) != 0 -} diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index c87c064b4..eccc78e67 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -56,50 +56,19 @@ #![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")] #![warn(missing_docs, rust_2018_idioms)] -// Give relevant error messages if the user tries to enable AArch64 asm on unsupported platforms. -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - not(target_os = "linux") -))] -compile_error!("Your OS isn’t yet supported for runtime-checking of AArch64 features."); -#[cfg(all(feature = "asm-aarch64", not(target_arch = "aarch64")))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" on non-AArch64 systems."); -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - target_feature = "crypto" -))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when building for AArch64 systems with crypto extensions."); -#[cfg(all( - not(feature = "asm-aarch64"), - feature = "asm", - target_arch = "aarch64", - not(target_feature = "crypto"), - target_os = "linux" -))] -compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); - -#[macro_use] -extern crate opaque_debug; - #[cfg(feature = "std")] extern crate std; -#[cfg(feature = "asm-aarch64")] -mod aarch64; mod consts; mod sha256; -#[cfg(any(not(feature = "asm"), feature = "asm-aarch64", feature = "compress"))] -mod sha256_utils; +mod sha256_compress; mod sha512; -#[cfg(any(not(feature = "asm"), target_arch = "aarch64", feature = "compress"))] -mod sha512_utils; +mod sha512_compress; pub use crate::sha256::{Sha224, Sha256}; pub use crate::sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] -pub use sha256_utils::compress256; +pub use sha256_compress::compress256; #[cfg(feature = "compress")] -pub use sha512_utils::compress512; +pub use sha512_compress::compress512; diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index c30671b86..dc741f61a 100644 --- a/sha2/src/sha256.rs +++ b/sha2/src/sha256.rs @@ -1,62 +1,20 @@ //! 
SHA-256 - use crate::consts::{H224, H256, STATE_LEN}; +use crate::sha256_compress::compress256; use block_buffer::BlockBuffer; -use digest::impl_write; -use digest::{ - consts::{U28, U32, U64}, - generic_array::GenericArray, -}; +use core::slice::from_ref; +use digest::consts::{U28, U32, U64}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(not(feature = "asm"))] -use crate::sha256_utils::compress256; - -#[cfg(feature = "asm")] -use sha2_asm::compress256; - type BlockSize = U64; -type Block = GenericArray; - -/// A structure that represents that state of a digest computation for the -/// SHA-2 512 family of digest functions -#[derive(Clone)] -struct Engine256State { - h: [u32; 8], -} - -impl Engine256State { - fn new(h: &[u32; STATE_LEN]) -> Engine256State { - Engine256State { h: *h } - } - - #[cfg(not(feature = "asm-aarch64"))] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - compress256(&mut self.h, block); - } - - #[cfg(feature = "asm-aarch64")] - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 64]) }; - // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha2") once - // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented - // to let us use it on no_std. - if ::aarch64::sha2_supported() { - compress256(&mut self.h, block); - } else { - ::sha256_utils::compress256(&mut self.h, block); - } - } -} -/// A structure that keeps track of the state of the Sha-256 operation and +/// Structure that keeps state of the Sha-256 operation and /// contains the logic necessary to perform the final calculations. #[derive(Clone)] struct Engine256 { len: u64, buffer: BlockBuffer, - state: Engine256State, + state: [u32; 8], } impl Engine256 { @@ -64,7 +22,7 @@ impl Engine256 { Engine256 { len: 0, buffer: Default::default(), - state: Engine256State::new(h), + state: *h, } } @@ -72,19 +30,20 @@ impl Engine256 { // Assumes that input.len() can be converted to u64 without overflow self.len += (input.len() as u64) << 3; let s = &mut self.state; - self.buffer.input_block(input, |b| s.process_block(b)); + self.buffer.input_blocks(input, |b| compress256(s, b)); } fn finish(&mut self) { let s = &mut self.state; let l = self.len; - self.buffer.len64_padding_be(l, |b| s.process_block(b)); + self.buffer + .len64_padding_be(l, |b| compress256(s, from_ref(b))); } fn reset(&mut self, h: &[u32; STATE_LEN]) { self.len = 0; self.buffer.reset(); - self.state = Engine256State::new(h); + self.state = *h; } } @@ -117,8 +76,8 @@ impl FixedOutputDirty for Sha256 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(4).zip(h.iter()) { + let s = self.engine.state; + for (chunk, v) in out.chunks_exact_mut(4).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -160,8 +119,8 @@ impl FixedOutputDirty for Sha224 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..7]; - for (chunk, v) in out.chunks_exact_mut(4).zip(h.iter()) { + let s = &self.engine.state[..7]; + for (chunk, v) in out.chunks_exact_mut(4).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -173,8 +132,8 @@ impl Reset for Sha224 { } } -impl_opaque_debug!(Sha224); -impl_opaque_debug!(Sha256); +opaque_debug::impl_opaque_debug!(Sha224); +opaque_debug::impl_opaque_debug!(Sha256); 
-impl_write!(Sha224);
-impl_write!(Sha256);
+digest::impl_write!(Sha224);
+digest::impl_write!(Sha256);
diff --git a/sha2/src/sha256_compress.rs b/sha2/src/sha256_compress.rs
new file mode 100644
index 000000000..b0b52cb4d
--- /dev/null
+++ b/sha2/src/sha256_compress.rs
@@ -0,0 +1,32 @@
+use digest::consts::U64;
+use digest::generic_array::GenericArray;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+        mod soft;
+        mod aarch64;
+        use aarch64::compress;
+    } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+        // TODO: replace after sha2-asm rework
+        fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha2_asm::compress256(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        mod soft;
+        mod x86;
+        use x86::compress;
+    } else {
+        mod soft;
+        use soft::compress;
+    }
+}
+
+pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
+    // SAFETY: GenericArray and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+    compress(state, blocks)
+}
diff --git a/sha2/src/sha256_compress/aarch64.rs b/sha2/src/sha256_compress/aarch64.rs
new file mode 100644
index 000000000..a5967ca0c
--- /dev/null
+++ b/sha2/src/sha256_compress/aarch64.rs
@@ -0,0 +1,20 @@
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA2};
+
+#[inline(always)]
+pub fn sha2_supported() -> bool {
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA2) != 0
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha2_supported() {
+        // TODO: replace after sha2-asm rework
+        for block in blocks {
+            sha2_asm::compress256(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha2/src/sha256_utils.rs b/sha2/src/sha256_compress/soft.rs
similarity index 60%
rename from sha2/src/sha256_utils.rs
rename to sha2/src/sha256_compress/soft.rs
index 7d2ec9f63..fe133cd80 100644
--- a/sha2/src/sha256_utils.rs
+++ b/sha2/src/sha256_compress/soft.rs
@@ -32,20 +32,14 @@ fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     ]
 }
 
-/// Not an intrinsic, but works like an unaligned load.
-#[inline]
 fn sha256load(v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     [v3[3], v2[0], v2[1], v2[2]]
 }
 
-/// Not an intrinsic, but useful for swapping vectors.
-#[inline]
 fn sha256swap(v0: [u32; 4]) -> [u32; 4] {
     [v0[2], v0[3], v0[0], v0[1]]
 }
 
-/// Emulates `llvm.x86.sha256msg1` intrinsic.
-// #[inline]
 fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] {
     // sigma 0 on vectors
     #[inline]
@@ -59,8 +53,6 @@ fn sha256msg1(v0: [u32; 4], v1: [u32; 4]) -> [u32; 4] {
     add(v0, sigma0x4(sha256load(v0, v1)))
 }
 
-/// Emulates `llvm.x86.sha256msg2` intrinsic.
-// #[inline]
 fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     macro_rules! sigma1 {
         ($a:expr) => {
@@ -79,14 +71,6 @@ fn sha256msg2(v4: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
     [w19, w18, w17, w16]
 }
 
-/*
-/// Performs 4 rounds of the SHA-256 message schedule update.
-fn sha256_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha256msg2(sha256msg1(v0, v1) + sha256load(v2, v3), v3)
-}*/
-
-/// Emulates `llvm.x86.sha256rnds2` intrinsic.
-// #[inline]
 fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; 4] {
     macro_rules!
big_sigma0 { ($a:expr) => { @@ -170,6 +154,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { let mut abef = [state[0], state[1], state[4], state[5]]; let mut cdgh = [state[2], state[3], state[6], state[7]]; + // Rounds 0..64 let mut w0 = [block[3], block[2], block[1], block[0]]; rounds4!(abef, cdgh, add(k[0], w0)); @@ -179,6 +164,7 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { rounds4!(abef, cdgh, add(k[2], w2)); let mut w3 = [block[15], block[14], block[13], block[12]]; rounds4!(abef, cdgh, add(k[3], w3)); + let mut w4 = schedule!(w0, w1, w2, w3); rounds4!(abef, cdgh, add(k[4], w4)); w0 = schedule!(w1, w2, w3, w4); @@ -217,102 +203,16 @@ fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { state[7] = state[7].wrapping_add(h); } -/// Process a block with the SHA-256 algorithm. (See more...) -/// -/// Internally, this uses functions which resemble the new Intel SHA instruction -/// sets, and so it's data locality properties may improve performance. However, -/// to benefit the most from this implementation, replace these functions with -/// x86 intrinsics to get a possible speed boost. -/// -/// # Implementation -/// -/// The `Sha256` algorithm is implemented with functions that resemble the new -/// Intel SHA instruction set extensions. These intructions fall into two -/// categories: message schedule calculation, and the message block 64-round -/// digest calculation. The schedule-related instructions allow 4 rounds to be -/// calculated as: -/// -/// ```ignore -/// use std::simd::[u32; 4]; -/// use self::crypto::sha2::{ -/// sha256msg1, -/// sha256msg2, -/// sha256load -/// }; -/// -/// fn schedule4_data(work: &mut [[u32; 4]], w: &[u32]) { -/// -/// // this is to illustrate the data order -/// work[0] = [w[3], w[2], w[1], w[0]); -/// work[1] = [w[7], w[6], w[5], w[4]); -/// work[2] = [w[11], w[10], w[9], w[8]); -/// work[3] = [w[15], w[14], w[13], w[12]); -/// } -/// -/// fn schedule4_work(work: &mut [[u32; 4]], t: usize) { -/// -/// // this is the core expression -/// work[t] = sha256msg2(sha256msg1(work[t - 4], work[t - 3]) + -/// sha256load(work[t - 2], work[t - 1]), -/// work[t - 1]) -/// } -/// ``` -/// -/// instead of 4 rounds of: -/// -/// ```ignore -/// fn schedule_work(w: &mut [u32], t: usize) { -/// w[t] = sigma1!(w[t - 2]) + w[t - 7] + sigma0!(w[t - 15]) + w[t - 16]; -/// } -/// ``` -/// -/// and the digest-related instructions allow 4 rounds to be calculated as: -/// -/// ```ignore -/// use std::simd::[u32; 4]; -/// use self::crypto::sha2::{K32X4, -/// sha256rnds2, -/// sha256swap -/// }; -/// -/// fn rounds4(state: &mut [u32; 8], work: &mut [[u32; 4]], t: usize) { -/// let [a, b, c, d, e, f, g, h]: [u32; 8] = *state; -/// -/// // this is to illustrate the data order -/// let mut abef = [a, b, e, f); -/// let mut cdgh = [c, d, g, h); -/// let temp = K32X4[t] + work[t]; -/// -/// // this is the core expression -/// cdgh = sha256rnds2(cdgh, abef, temp); -/// abef = sha256rnds2(abef, cdgh, sha256swap(temp)); -/// -/// *state = [abef[0], abef[1], cdgh[0], cdgh[1], -/// abef[2], abef[3], cdgh[2], cdgh[3]]; -/// } -/// ``` -/// -/// instead of 4 rounds of: -/// -/// ```ignore -/// fn round(state: &mut [u32; 8], w: &mut [u32], t: usize) { -/// let [a, b, c, mut d, e, f, g, mut h]: [u32; 8] = *state; -/// -/// h += big_sigma1!(e) + choose!(e, f, g) + K32[t] + w[t]; d += h; -/// h += big_sigma0!(a) + majority!(a, b, c); -/// -/// *state = [h, a, b, c, d, e, f, g]; -/// } -/// ``` -/// -/// **NOTE**: It is important 
to note, however, that these instructions are not
-/// implemented by any CPU (at the time of this writing), and so they are
-/// emulated in this library until the instructions become more common, and gain
-/// support in LLVM (and GCC, etc.).
-pub fn compress256(state: &mut [u32; 8], block: &[u8; 64]) {
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
     let mut block_u32 = [0u32; BLOCK_LEN];
-    for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
-        *o = u32::from_be_bytes(chunk.try_into().unwrap());
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks {
+        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+            *o = u32::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha256_digest_block_u32(&mut state_cpy, &block_u32);
     }
-    sha256_digest_block_u32(state, &block_u32);
+    *state = state_cpy;
 }
diff --git a/sha2/src/sha256_compress/x86.rs b/sha2/src/sha256_compress/x86.rs
new file mode 100644
index 000000000..b9ed6ce0d
--- /dev/null
+++ b/sha2/src/sha256_compress/x86.rs
@@ -0,0 +1,131 @@
+#![allow(clippy::many_single_char_names)]
+
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+
+#[cfg(not(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+)))]
+fn is_supported() -> bool {
+    false
+}
+
+#[cfg(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+))]
+fn is_supported() -> bool {
+    true
+}
+
+unsafe fn add_k(v: __m128i, i: usize) -> __m128i {
+    let k = &crate::consts::K32X4[i];
+    let t = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
+    _mm_add_epi32(v, t)
+}
+
+unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
+    let t1 = _mm_sha256msg1_epu32(v0, v1);
+    let t2 = _mm_alignr_epi8(v3, v2, 4);
+    let t3 = _mm_add_epi32(t1, t2);
+    _mm_sha256msg2_epu32(t3, v3)
+}
+
+macro_rules!
rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr) => {{ + $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, $rest); + $abef = _mm_sha256rnds2_epu32($abef, $cdgh, _mm_shuffle_epi32($rest, 0x0E)); + }}; +} + +// we use unaligned loads with `__m128i` pointers +#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "sha,ssse3,sse4.1")] +unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + #[allow(non_snake_case)] + let MASK: __m128i = _mm_set_epi64x( + 0x0C0D_0E0F_0809_0A0Bu64 as i64, + 0x0405_0607_0001_0203u64 as i64, + ); + + let state_ptr = state.as_ptr() as *const __m128i; + let dcba = _mm_loadu_si128(state_ptr.add(0)); + let efgh = _mm_loadu_si128(state_ptr.add(1)); + + let cdab = _mm_shuffle_epi32(dcba, 0xB1); + let efgh = _mm_shuffle_epi32(efgh, 0x1B); + let mut abef = _mm_alignr_epi8(cdab, efgh, 8); + let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0); + + for block in blocks { + let abef_save = abef; + let cdgh_save = cdgh; + + let data_ptr = block.as_ptr() as *const __m128i; + let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK); + let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); + let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); + let mut w3 = _mm_shuffle_epi8( _mm_loadu_si128(data_ptr.add(3)), MASK); + + rounds4!(abef, cdgh, add_k(w0, 0)); + rounds4!(abef, cdgh, add_k(w1, 1)); + rounds4!(abef, cdgh, add_k(w2, 2)); + rounds4!(abef, cdgh, add_k(w3, 3)); + + let mut w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 4)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 5)); + w1 = schedule(w2, w3, w4, w0); + rounds4!(abef, cdgh, add_k(w1, 6)); + w2 = schedule(w3, w4, w0, w1); + rounds4!(abef, cdgh, add_k(w2, 7)); + w3 = schedule(w4, w0, w1, w2); + rounds4!(abef, cdgh, add_k(w3, 8)); + w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 9)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 10)); + w1 = schedule(w2, w3, w4, w0); + rounds4!(abef, cdgh, add_k(w1, 11)); + w2 = schedule(w3, w4, w0, w1); + rounds4!(abef, cdgh, add_k(w2, 12)); + w3 = schedule(w4, w0, w1, w2); + rounds4!(abef, cdgh, add_k(w3, 13)); + w4 = schedule(w0, w1, w2, w3); + rounds4!(abef, cdgh, add_k(w4, 14)); + w0 = schedule(w1, w2, w3, w4); + rounds4!(abef, cdgh, add_k(w0, 15)); + + abef = _mm_add_epi32(abef, abef_save); + cdgh = _mm_add_epi32(cdgh, cdgh_save); + } + + let feba = _mm_shuffle_epi32(abef, 0x1B); + let dchg = _mm_shuffle_epi32(cdgh, 0xB1); + let dcba = _mm_blend_epi16(feba, dchg, 0xF0); + let hgef = _mm_alignr_epi8(dchg, feba, 8); + + let state_ptr_mut = state.as_mut_ptr() as *mut __m128i; + _mm_storeu_si128(state_ptr_mut.add(0), dcba); + _mm_storeu_si128(state_ptr_mut.add(1), hgef); +} + +pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if is_supported() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs index ed3a1cccf..a49773222 100644 --- a/sha2/src/sha512.rs +++ b/sha2/src/sha512.rs @@ -1,48 +1,21 @@ //! 
SHA-512 - use crate::consts::{H384, H512, H512_TRUNC_224, H512_TRUNC_256, STATE_LEN}; use block_buffer::BlockBuffer; -use digest::impl_write; -use digest::{ - consts::{U128, U28, U32, U48, U64}, - generic_array::GenericArray, -}; +use core::slice::from_ref; +use digest::consts::{U128, U28, U32, U48, U64}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; -#[cfg(any(not(feature = "asm"), target_arch = "aarch64"))] -use crate::sha512_utils::compress512; - -#[cfg(all(feature = "asm", not(target_arch = "aarch64")))] -use sha2_asm::compress512; +use crate::sha512_compress::compress512; type BlockSize = U128; -type Block = GenericArray; - -/// A structure that represents that state of a digest computation for the -/// SHA-2 512 family of digest functions -#[derive(Clone)] -struct Engine512State { - h: [u64; 8], -} - -impl Engine512State { - fn new(h: &[u64; 8]) -> Engine512State { - Engine512State { h: *h } - } - - pub fn process_block(&mut self, block: &Block) { - let block = unsafe { &*(block.as_ptr() as *const [u8; 128]) }; - compress512(&mut self.h, block); - } -} -/// A structure that keeps track of the state of the Sha-512 operation and +/// Structure that keeps state of the Sha-512 operation and /// contains the logic necessary to perform the final calculations. #[derive(Clone)] struct Engine512 { len: u128, buffer: BlockBuffer, - state: Engine512State, + state: [u64; 8], } impl Engine512 { @@ -50,26 +23,26 @@ impl Engine512 { Engine512 { len: 0, buffer: Default::default(), - state: Engine512State::new(h), + state: *h, } } fn update(&mut self, input: &[u8]) { self.len += (input.len() as u128) << 3; let s = &mut self.state; - self.buffer.input_block(input, |d| s.process_block(d)); + self.buffer.input_blocks(input, |b| compress512(s, b)); } fn finish(&mut self) { let s = &mut self.state; self.buffer - .len128_padding_be(self.len, |d| s.process_block(d)); + .len128_padding_be(self.len, |d| compress512(s, from_ref(d))); } fn reset(&mut self, h: &[u64; STATE_LEN]) { self.len = 0; self.buffer.reset(); - self.state = Engine512State::new(h); + self.state = *h; } } @@ -102,8 +75,8 @@ impl FixedOutputDirty for Sha512 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = self.engine.state; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -145,8 +118,8 @@ impl FixedOutputDirty for Sha384 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..6]; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = &self.engine.state[..6]; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -188,8 +161,8 @@ impl FixedOutputDirty for Sha512Trunc256 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h[..4]; - for (chunk, v) in out.chunks_exact_mut(8).zip(h.iter()) { + let s = &self.engine.state[..4]; + for (chunk, v) in out.chunks_exact_mut(8).zip(s.iter()) { chunk.copy_from_slice(&v.to_be_bytes()); } } @@ -231,11 +204,11 @@ impl FixedOutputDirty for Sha512Trunc224 { fn finalize_into_dirty(&mut self, out: &mut digest::Output) { self.engine.finish(); - let h = &self.engine.state.h; - for (chunk, v) in out.chunks_exact_mut(8).zip(h[..3].iter()) { + let s = &self.engine.state; + for (chunk, v) in 
out.chunks_exact_mut(8).zip(s[..3].iter()) {
             chunk.copy_from_slice(&v.to_be_bytes());
         }
-        out[24..28].copy_from_slice(&h[3].to_be_bytes()[..4]);
+        out[24..28].copy_from_slice(&s[3].to_be_bytes()[..4]);
     }
 }
 
@@ -245,12 +218,12 @@ impl Reset for Sha512Trunc224 {
     }
 }
 
-impl_opaque_debug!(Sha384);
-impl_opaque_debug!(Sha512);
-impl_opaque_debug!(Sha512Trunc224);
-impl_opaque_debug!(Sha512Trunc256);
+opaque_debug::impl_opaque_debug!(Sha384);
+opaque_debug::impl_opaque_debug!(Sha512);
+opaque_debug::impl_opaque_debug!(Sha512Trunc224);
+opaque_debug::impl_opaque_debug!(Sha512Trunc256);
 
-impl_write!(Sha384);
-impl_write!(Sha512);
-impl_write!(Sha512Trunc224);
-impl_write!(Sha512Trunc256);
+digest::impl_write!(Sha384);
+digest::impl_write!(Sha512);
+digest::impl_write!(Sha512Trunc224);
+digest::impl_write!(Sha512Trunc256);
diff --git a/sha2/src/sha512_compress.rs b/sha2/src/sha512_compress.rs
new file mode 100644
index 000000000..baa6b2765
--- /dev/null
+++ b/sha2/src/sha512_compress.rs
@@ -0,0 +1,24 @@
+use digest::consts::U128;
+use digest::generic_array::GenericArray;
+
+cfg_if::cfg_if! {
+    if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+        // TODO: replace after sha2-asm rework
+        fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+            for block in blocks {
+                sha2_asm::compress512(state, block);
+            }
+        }
+    } else {
+        mod soft;
+        use soft::compress;
+    }
+}
+
+pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
+    // SAFETY: GenericArray and [u8; 128] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
+    compress(state, blocks)
+}
diff --git a/sha2/src/sha512_utils.rs b/sha2/src/sha512_compress/soft.rs
similarity index 66%
rename from sha2/src/sha512_utils.rs
rename to sha2/src/sha512_compress/soft.rs
index eaa9d51fc..f307f0e54 100644
--- a/sha2/src/sha512_utils.rs
+++ b/sha2/src/sha512_compress/soft.rs
@@ -2,13 +2,11 @@
 use crate::consts::{BLOCK_LEN, K64X2};
 use core::convert::TryInto;
 
-#[inline(always)]
 fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
     [a[0].wrapping_add(b[0]), a[1].wrapping_add(b[1])]
 }
 
 /// Not an intrinsic, but works like an unaligned load.
-#[inline]
 fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] {
     [v1[1], v0[0]]
 }
@@ -202,105 +200,17 @@ pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
     state[7] = state[7].wrapping_add(h);
 }
 
-/// Process a block with the SHA-512 algorithm. (See more...)
-///
-/// Internally, this uses functions that resemble the new Intel SHA
-/// instruction set extensions, but since no architecture seems to
-/// have any designs, these may not be the final designs if and/or when
-/// there are instruction set extensions with SHA-512. So to summarize:
-/// SHA-1 and SHA-256 are being implemented in hardware soon (at the time
-/// of this writing), but it doesn't look like SHA-512 will be hardware
-/// accelerated any time soon.
-///
-/// # Implementation
-///
-/// These functions fall into two categories: message schedule calculation, and
-/// the message block 64-round digest calculation. The schedule-related
-/// functions allow 4 rounds to be calculated as:
-///
-/// ```ignore
-/// use std::simd::[u64; 2];
-/// use self::crypto::sha2::{
-///     sha512msg,
-///     sha512load
-/// };
-///
-/// fn schedule4_data(work: &mut [[u64; 2]], w: &[u64]) {
-///
-///     // this is to illustrate the data order
-///     work[0] = [w[1], w[0]);
-///     work[1] = [w[3], w[2]);
-///     work[2] = [w[5], w[4]);
-///     work[3] = [w[7], w[6]);
-///     work[4] = [w[9], w[8]);
-///     work[5] = [w[11], w[10]);
-///     work[6] = [w[13], w[12]);
-///     work[7] = [w[15], w[14]);
-/// }
-///
-/// fn schedule4_work(work: &mut [[u64; 2]], t: usize) {
-///
-///     // this is the core expression
-///     work[t] = sha512msg(work[t - 8],
-///                         work[t - 7],
-///                         sha512load(work[t - 4], work[t - 3]),
-///                         work[t - 1]);
-/// }
-/// ```
-///
-/// instead of 4 rounds of:
-///
-/// ```ignore
-/// fn schedule_work(w: &mut [u64], t: usize) {
-///     w[t] = sigma1!(w[t - 2]) + w[t - 7] + sigma0!(w[t - 15]) + w[t - 16];
-/// }
-/// ```
-///
-/// and the digest-related functions allow 4 rounds to be calculated as:
-///
-/// ```ignore
-/// use std::simd::[u64; 2];
-/// use self::crypto::sha2::{K64X2, sha512rnd};
-///
-/// fn rounds4(state: &mut [u64; 8], work: &mut [[u64; 2]], t: usize) {
-///     let [a, b, c, d, e, f, g, h]: [u64; 8] = *state;
-///
-///     // this is to illustrate the data order
-///     let mut ae = [a, e);
-///     let mut bf = [b, f);
-///     let mut cg = [c, g);
-///     let mut dh = [d, h);
-///     let [w1, w0) = K64X2[2*t] + work[2*t];
-///     let [w3, w2) = K64X2[2*t + 1] + work[2*t + 1];
-///
-///     // this is the core expression
-///     dh = sha512rnd(ae, bf, cg, dh, w0);
-///     cg = sha512rnd(dh, ae, bf, cg, w1);
-///     bf = sha512rnd(cg, dh, ae, bf, w2);
-///     ae = sha512rnd(bf, cg, dh, ae, w3);
-///
-///     *state = [ae[0], bf[0], cg[0], dh[0],
-///               ae[1], bf[1], cg[1], dh[1]];
-/// }
-/// ```
-///
-/// instead of 4 rounds of:
-///
-/// ```ignore
-/// fn round(state: &mut [u64; 8], w: &mut [u64], t: usize) {
-///     let [a, b, c, mut d, e, f, g, mut h]: [u64; 8] = *state;
-///
-///     h += big_sigma1!(e) + choose!(e, f, g) + K64[t] + w[t]; d += h;
-///     h += big_sigma0!(a) + majority!(a, b, c);
-///
-///     *state = [h, a, b, c, d, e, f, g];
-/// }
-/// ```
-///
-pub fn compress512(state: &mut [u64; 8], block: &[u8; 128]) {
-    let mut block_u64 = [0u64; BLOCK_LEN];
-    for (o, chunk) in block_u64.iter_mut().zip(block.chunks_exact(8)) {
-        *o = u64::from_be_bytes(chunk.try_into().unwrap());
+
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let mut block_u64 = [0u64; BLOCK_LEN];
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks {
+        for (o, chunk) in block_u64.iter_mut().zip(block.chunks_exact(8)) {
+            *o = u64::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha512_digest_block_u64(&mut state_cpy, &block_u64);
     }
-    sha512_digest_block_u64(state, &block_u64);
+    *state = state_cpy;
 }
diff --git a/sha2/tests/lib.rs b/sha2/tests/lib.rs
index 72de11686..b9cb8628a 100644
--- a/sha2/tests/lib.rs
+++ b/sha2/tests/lib.rs
@@ -1,5 +1,3 @@
-#![no_std]
-
 use digest::dev::{digest_test, one_million_a};
 use digest::new_test;

From c8e35eb5e65786cbaf8ebef709c948a7a62bd4c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?=
 =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 15:56:09 +0300
Subject: [PATCH 06/14] simplify aarch64

---
 sha1/Cargo.toml | 11 ++++++-----
 sha1/src/compress.rs | 14 +++++++-------
 sha1/src/lib.rs | 29 +----------------------------
 sha2/Cargo.toml | 2 +-
 4 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index 985ef5579..183cd5002 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -18,9 +18,11 @@ name = "sha1"
 digest = "0.9"
 block-buffer = "0.9"
 opaque-debug = "0.2"
+cfg-if = "0.1"
 sha1-asm = { version = "0.4", optional = true }
+
+[target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies]
 libc = { version = "0.2.68", optional = true }
-cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
@@ -29,8 +31,7 @@ hex-literal = "0.2"
 [features]
 default = ["std"]
 std = ["digest/std"]
-asm = ["sha1-asm"]
+asm = ["sha1-asm", "libc"]
 
-# TODO: Remove this feature once is_aarch64_feature_detected!() is stabilised.
-# Only used on AArch64 Linux systems, when built without the crypto target_feature.
-asm-aarch64 = ["asm", "libc"]
+# DEPRECATED: use `asm` instead
+asm-aarch64 = ["asm"]
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
index b595199ba..51a4cdc87 100644
--- a/sha1/src/compress.rs
+++ b/sha1/src/compress.rs
@@ -1,15 +1,12 @@
 use digest::consts::U64;
 use digest::generic_array::GenericArray;
 
-mod aarch64;
-#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
-mod soft;
-mod x86;
-
 cfg_if::cfg_if! {
-    if #[cfg(feature = "asm-aarch64")] {
+    if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+        mod soft;
+        mod aarch64;
         use aarch64::compress as compress_inner;
-    } else if #[cfg(feature = "asm")] {
+    } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
         // TODO: replace after sha1-asm rework
         fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
             for block in blocks {
@@ -17,8 +14,11 @@ cfg_if::cfg_if! {
         }
     }
     } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        mod soft;
+        mod x86;
         use x86::compress as compress_inner;
     } else {
+        mod soft;
         use soft::compress as compress_inner;
     }
 }
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index da93d5549..a83425027 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -28,32 +28,6 @@
 #![deny(unsafe_code)]
 #![warn(missing_docs, rust_2018_idioms)]
 
-// Give relevant error messages if the user tries to enable AArch64 asm on unsupported platforms.
-#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - not(target_os = "linux") -))] -compile_error!("Your OS isn’t yet supported for runtime-checking of AArch64 features."); - -#[cfg(all(feature = "asm-aarch64", not(target_arch = "aarch64")))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" on non-AArch64 systems."); -#[cfg(all( - feature = "asm-aarch64", - target_arch = "aarch64", - target_feature = "crypto" -))] -compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when building for AArch64 systems with crypto extensions."); - -#[cfg(all( - not(feature = "asm-aarch64"), - feature = "asm", - target_arch = "aarch64", - not(target_feature = "crypto"), - target_os = "linux" -))] -compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU."); - #[cfg(feature = "std")] extern crate std; @@ -64,7 +38,6 @@ use crate::compress::compress; use crate::consts::{H, STATE_LEN}; use block_buffer::BlockBuffer; use digest::consts::{U20, U64}; -use digest::impl_write; pub use digest::{self, Digest}; use digest::{BlockInput, FixedOutputDirty, Reset, Update}; @@ -123,4 +96,4 @@ impl Reset for Sha1 { } opaque_debug::impl_opaque_debug!(Sha1); -impl_write!(Sha1); +digest::impl_write!(Sha1); diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index ec690a63e..b4b9b4227 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -34,5 +34,5 @@ std = ["digest/std"] asm = ["sha2-asm", "libc"] compress = [] -# DEPRECATED: use `asm` isntead +# DEPRECATED: use `asm` instead asm-aarch64 = ["asm"] From 979863d0d4b9b36bda0b11bb7ba73a391049c755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 15:59:11 +0300 Subject: [PATCH 07/14] remove compress modules --- sha2/src/lib.rs | 10 +++--- sha2/src/sha256.rs | 32 ++++++++++++++++++- .../{sha256_compress => sha256}/aarch64.rs | 0 sha2/src/{sha256_compress => sha256}/soft.rs | 0 sha2/src/{sha256_compress => sha256}/x86.rs | 0 sha2/src/sha256_compress.rs | 32 ------------------- sha2/src/sha512.rs | 25 +++++++++++++-- sha2/src/{sha512_compress => sha512}/soft.rs | 0 sha2/src/sha512_compress.rs | 24 -------------- 9 files changed, 58 insertions(+), 65 deletions(-) rename sha2/src/{sha256_compress => sha256}/aarch64.rs (100%) rename sha2/src/{sha256_compress => sha256}/soft.rs (100%) rename sha2/src/{sha256_compress => sha256}/x86.rs (100%) delete mode 100644 sha2/src/sha256_compress.rs rename sha2/src/{sha512_compress => sha512}/soft.rs (100%) delete mode 100644 sha2/src/sha512_compress.rs diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index eccc78e67..9b804a471 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -61,14 +61,12 @@ extern crate std; mod consts; mod sha256; -mod sha256_compress; mod sha512; -mod sha512_compress; -pub use crate::sha256::{Sha224, Sha256}; -pub use crate::sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; +pub use sha256::{Sha224, Sha256}; +pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] -pub use sha256_compress::compress256; +pub use sha256::compress256; #[cfg(feature = "compress")] -pub use sha512_compress::compress512; +pub use sha512::compress512; diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs index dc741f61a..fdd5d5b69 100644 --- 
a/sha2/src/sha256.rs
+++ b/sha2/src/sha256.rs
@@ -1,9 +1,9 @@
//! SHA-256
use crate::consts::{H224, H256, STATE_LEN};
-use crate::sha256_compress::compress256;
use block_buffer::BlockBuffer;
use core::slice::from_ref;
use digest::consts::{U28, U32, U64};
+use digest::generic_array::GenericArray;
use digest::{BlockInput, FixedOutputDirty, Reset, Update};

type BlockSize = U64;
@@ -137,3 +137,33 @@ opaque_debug::impl_opaque_debug!(Sha256);

digest::impl_write!(Sha224);
digest::impl_write!(Sha256);
+
+cfg_if::cfg_if! {
+ if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
+ mod soft;
+ mod aarch64;
+ use aarch64::compress;
+ } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+ // TODO: replace after sha2-asm rework
+ fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+ for block in blocks {
+ sha2_asm::compress256(state, block);
+ }
+ }
+ } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ mod soft;
+ mod x86;
+ use x86::compress;
+ } else {
+ mod soft;
+ use soft::compress;
+ }
+}
+
+pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
+ // SAFETY: GenericArray<u8, U64> and [u8; 64] have
+ // exactly the same memory layout
+ #[allow(unsafe_code)]
+ let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+ compress(state, blocks)
+}
diff --git a/sha2/src/sha256_compress/aarch64.rs b/sha2/src/sha256/aarch64.rs
similarity index 100%
rename from sha2/src/sha256_compress/aarch64.rs
rename to sha2/src/sha256/aarch64.rs
diff --git a/sha2/src/sha256_compress/soft.rs b/sha2/src/sha256/soft.rs
similarity index 100%
rename from sha2/src/sha256_compress/soft.rs
rename to sha2/src/sha256/soft.rs
diff --git a/sha2/src/sha256_compress/x86.rs b/sha2/src/sha256/x86.rs
similarity index 100%
rename from sha2/src/sha256_compress/x86.rs
rename to sha2/src/sha256/x86.rs
diff --git a/sha2/src/sha256_compress.rs b/sha2/src/sha256_compress.rs
deleted file mode 100644
index b0b52cb4d..000000000
--- a/sha2/src/sha256_compress.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use digest::consts::U64;
-use digest::generic_array::GenericArray;
-
-cfg_if::cfg_if!
{
- if #[cfg(all(feature = "asm", target_arch = "aarch64", target_os = "linux"))] {
- mod soft;
- mod aarch64;
- use aarch64::compress;
- } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
- // TODO: replace after sha2-asm rework
- fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
- for block in blocks {
- sha2_asm::compress256(state, block);
- }
- }
- } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
- mod soft;
- mod x86;
- use x86::compress;
- } else {
- mod soft;
- use soft::compress;
- }
-}
-
-pub fn compress256(state: &mut [u32; 8], blocks: &[GenericArray<u8, U64>]) {
- // SAFETY: GenericArray<u8, U64> and [u8; 64] have
- // exactly the same memory layout
- #[allow(unsafe_code)]
- let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
- compress(state, blocks)
-}
diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
index a49773222..63471f683 100644
--- a/sha2/src/sha512.rs
+++ b/sha2/src/sha512.rs
@@ -3,10 +3,9 @@
use crate::consts::{H384, H512, H512_TRUNC_224, H512_TRUNC_256, STATE_LEN};
use block_buffer::BlockBuffer;
use core::slice::from_ref;
use digest::consts::{U128, U28, U32, U48, U64};
+use digest::generic_array::GenericArray;
use digest::{BlockInput, FixedOutputDirty, Reset, Update};

-use crate::sha512_compress::compress512;
-
type BlockSize = U128;
@@ -227,3 +226,25 @@ digest::impl_write!(Sha384);
digest::impl_write!(Sha512);
digest::impl_write!(Sha512Trunc224);
digest::impl_write!(Sha512Trunc256);
+
+cfg_if::cfg_if! {
+ if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+ // TODO: replace after sha2-asm rework
+ fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+ for block in blocks {
+ sha2_asm::compress512(state, block);
+ }
+ }
+ } else {
+ mod soft;
+ use soft::compress;
+ }
+}
+
+pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
+ // SAFETY: GenericArray<u8, U128> and [u8; 128] have
+ // exactly the same memory layout
+ #[allow(unsafe_code)]
+ let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
+ compress(state, blocks)
+}
diff --git a/sha2/src/sha512_compress/soft.rs b/sha2/src/sha512/soft.rs
similarity index 100%
rename from sha2/src/sha512_compress/soft.rs
rename to sha2/src/sha512/soft.rs
diff --git a/sha2/src/sha512_compress.rs b/sha2/src/sha512_compress.rs
deleted file mode 100644
index baa6b2765..000000000
--- a/sha2/src/sha512_compress.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use digest::consts::U128;
-use digest::generic_array::GenericArray;
-
-cfg_if::cfg_if!
{
- if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
- // TODO: replace after sha2-asm rework
- fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
- for block in blocks {
- sha2_asm::compress512(state, block);
- }
- }
- } else {
- mod soft;
- use soft::compress;
- }
-}
-
-pub fn compress512(state: &mut [u64; 8], blocks: &[GenericArray<u8, U128>]) {
- // SAFETY: GenericArray<u8, U128> and [u8; 128] have
- // exactly the same memory layout
- #[allow(unsafe_code)]
- let blocks: &[[u8; 128]] = unsafe { &*(blocks as *const _ as *const [[u8; 128]]) };
- compress(state, blocks)
-}
From 1da5fe9cfdfbd1f6ad2b9ceb1f7cfb8f748275e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?=
Date: Thu, 11 Jun 2020 16:15:49 +0300
Subject: [PATCH 08/14] simplify rounds
---
sha2/src/sha256/x86.rs | 69 +++++++++++++++++++++---------------------
1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs
index b9ed6ce0d..0822d942d 100644
--- a/sha2/src/sha256/x86.rs
+++ b/sha2/src/sha256/x86.rs
@@ -31,7 +31,7 @@ unsafe fn add_k(v: __m128i, i: usize) -> __m128i {
_mm_add_epi32(v, t)
}

-unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i, ) -> __m128i {
+unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
let t1 = _mm_sha256msg1_epu32(v0, v1);
let t2 = _mm_alignr_epi8(v3, v2, 4);
let t3 = _mm_add_epi32(t1, t2);
@@ -39,9 +39,22 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i, ) -> __m1
}

macro_rules! rounds4 {
- ($abef:ident, $cdgh:ident, $rest:expr) => {{
- $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, $rest);
- $abef = _mm_sha256rnds2_epu32($abef, $cdgh, _mm_shuffle_epi32($rest, 0x0E));
+ ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
+ let t1 = add_k($rest, $i);
+ $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
+ let t2 = _mm_shuffle_epi32(t1, 0x0E);
+ $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
+ }};
+}
+
+macro_rules!
schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i) }}; } @@ -73,36 +86,24 @@ unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK); let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK); let mut w3 = _mm_shuffle_epi8( _mm_loadu_si128(data_ptr.add(3)), MASK); - - rounds4!(abef, cdgh, add_k(w0, 0)); - rounds4!(abef, cdgh, add_k(w1, 1)); - rounds4!(abef, cdgh, add_k(w2, 2)); - rounds4!(abef, cdgh, add_k(w3, 3)); - - let mut w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 4)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 5)); - w1 = schedule(w2, w3, w4, w0); - rounds4!(abef, cdgh, add_k(w1, 6)); - w2 = schedule(w3, w4, w0, w1); - rounds4!(abef, cdgh, add_k(w2, 7)); - w3 = schedule(w4, w0, w1, w2); - rounds4!(abef, cdgh, add_k(w3, 8)); - w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 9)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 10)); - w1 = schedule(w2, w3, w4, w0); - rounds4!(abef, cdgh, add_k(w1, 11)); - w2 = schedule(w3, w4, w0, w1); - rounds4!(abef, cdgh, add_k(w2, 12)); - w3 = schedule(w4, w0, w1, w2); - rounds4!(abef, cdgh, add_k(w3, 13)); - w4 = schedule(w0, w1, w2, w3); - rounds4!(abef, cdgh, add_k(w4, 14)); - w0 = schedule(w1, w2, w3, w4); - rounds4!(abef, cdgh, add_k(w0, 15)); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); abef = _mm_add_epi32(abef, abef_save); cdgh = _mm_add_epi32(cdgh, cdgh_save); From 8808c35c7531e561ddc9c73d62fdb04e32a9e627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:17:29 +0300 Subject: [PATCH 09/14] fmt --- sha2/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sha2/src/lib.rs b/sha2/src/lib.rs index 9b804a471..08829eab2 100644 --- a/sha2/src/lib.rs +++ b/sha2/src/lib.rs @@ -63,10 +63,10 @@ mod consts; mod sha256; mod sha512; -pub use sha256::{Sha224, Sha256}; -pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; pub use digest::{self, Digest}; #[cfg(feature = "compress")] pub use sha256::compress256; +pub use sha256::{Sha224, Sha256}; #[cfg(feature = "compress")] pub use sha512::compress512; +pub use sha512::{Sha384, Sha512, Sha512Trunc224, Sha512Trunc256}; From aedab56c58234677ca4cedf7e87e191c927a1793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:26:04 +0300 Subject: [PATCH 10/14] 
additional simplifications --- sha2/src/sha256/soft.rs | 89 +++++++++++++++++++++-------------------- sha2/src/sha256/x86.rs | 14 +++---- 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/sha2/src/sha256/soft.rs b/sha2/src/sha256/soft.rs index fe133cd80..d7be01dc6 100644 --- a/sha2/src/sha256/soft.rs +++ b/sha2/src/sha256/soft.rs @@ -1,5 +1,5 @@ #![allow(clippy::many_single_char_names)] -use crate::consts::{BLOCK_LEN, K32X4}; +use crate::consts::BLOCK_LEN; use core::convert::TryInto; #[inline(always)] @@ -134,61 +134,62 @@ fn sha256_digest_round_x2(cdgh: [u32; 4], abef: [u32; 4], wk: [u32; 4]) -> [u32; [a2, b2, e2, f2] } -/// Process a block with the SHA-256 algorithm. -fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { - let k = &K32X4; +fn schedule(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] { + let t1 = sha256msg1(v0, v1); + let t2 = sha256load(v2, v3); + let t3 = add(t1, t2); + sha256msg2(t3, v3) +} - macro_rules! schedule { - ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { - sha256msg2(add(sha256msg1($v0, $v1), sha256load($v2, $v3)), $v3) - }; - } +macro_rules! rounds4 { + ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ + let t1 = add($rest, crate::consts::K32X4[$i]); + $cdgh = sha256_digest_round_x2($cdgh, $abef, t1); + let t2 = sha256swap(t1); + $abef = sha256_digest_round_x2($abef, $cdgh, t2); + }}; +} - macro_rules! rounds4 { - ($abef:ident, $cdgh:ident, $rest:expr) => {{ - $cdgh = sha256_digest_round_x2($cdgh, $abef, $rest); - $abef = sha256_digest_round_x2($abef, $cdgh, sha256swap($rest)); - }}; - } +macro_rules! schedule_rounds4 { + ( + $abef:ident, $cdgh:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i: expr + ) => {{ + $w4 = schedule($w0, $w1, $w2, $w3); + rounds4!($abef, $cdgh, $w4, $i); + }}; +} +/// Process a block with the SHA-256 algorithm. 
+fn sha256_digest_block_u32(state: &mut [u32; 8], block: &[u32; 16]) { let mut abef = [state[0], state[1], state[4], state[5]]; let mut cdgh = [state[2], state[3], state[6], state[7]]; // Rounds 0..64 let mut w0 = [block[3], block[2], block[1], block[0]]; - rounds4!(abef, cdgh, add(k[0], w0)); let mut w1 = [block[7], block[6], block[5], block[4]]; - rounds4!(abef, cdgh, add(k[1], w1)); let mut w2 = [block[11], block[10], block[9], block[8]]; - rounds4!(abef, cdgh, add(k[2], w2)); let mut w3 = [block[15], block[14], block[13], block[12]]; - rounds4!(abef, cdgh, add(k[3], w3)); - - let mut w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[4], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[5], w0)); - w1 = schedule!(w2, w3, w4, w0); - rounds4!(abef, cdgh, add(k[6], w1)); - w2 = schedule!(w3, w4, w0, w1); - rounds4!(abef, cdgh, add(k[7], w2)); - w3 = schedule!(w4, w0, w1, w2); - rounds4!(abef, cdgh, add(k[8], w3)); - w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[9], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[10], w0)); - w1 = schedule!(w2, w3, w4, w0); - rounds4!(abef, cdgh, add(k[11], w1)); - w2 = schedule!(w3, w4, w0, w1); - rounds4!(abef, cdgh, add(k[12], w2)); - w3 = schedule!(w4, w0, w1, w2); - rounds4!(abef, cdgh, add(k[13], w3)); - w4 = schedule!(w0, w1, w2, w3); - rounds4!(abef, cdgh, add(k[14], w4)); - w0 = schedule!(w1, w2, w3, w4); - rounds4!(abef, cdgh, add(k[15], w0)); + let mut w4; + + rounds4!(abef, cdgh, w0, 0); + rounds4!(abef, cdgh, w1, 1); + rounds4!(abef, cdgh, w2, 2); + rounds4!(abef, cdgh, w3, 3); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10); + schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11); + schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12); + schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13); + schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14); + schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15); let [a, b, e, f] = abef; let [c, d, g, h] = cdgh; diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 0822d942d..8c84fd2ab 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -12,7 +12,7 @@ use core::arch::x86::*; target_feature = "sse4.1", )))] fn is_supported() -> bool { - true + false } #[cfg(all( @@ -25,12 +25,6 @@ fn is_supported() -> bool { true } -unsafe fn add_k(v: __m128i, i: usize) -> __m128i { - let k = &crate::consts::K32X4[i]; - let t = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); - _mm_add_epi32(v, t) -} - unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { let t1 = _mm_sha256msg1_epu32(v0, v1); let t2 = _mm_alignr_epi8(v3, v2, 4); @@ -40,7 +34,9 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128 macro_rules! 
rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let t1 = add_k($rest, $i); + let k = &crate::consts::K32X4[$i]; + let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); + let t1 = _mm_add_epi32($rest, kv); $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); let t2 = _mm_shuffle_epi32(t1, 0x0E); $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2); @@ -54,7 +50,7 @@ macro_rules! schedule_rounds4 { $i: expr ) => {{ $w4 = schedule($w0, $w1, $w2, $w3); - rounds4!($abef, $cdgh, $w4, $i) + rounds4!($abef, $cdgh, $w4, $i); }}; } From 22ee1dd769456d1fe20aa774fd413aa2d64eee40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 16:55:20 +0300 Subject: [PATCH 11/14] simplify sha1 compression --- sha1/src/compress/soft.rs | 95 +++++++++++++++++++-------------------- sha1/src/compress/x86.rs | 70 ++++++++++++++--------------- 2 files changed, 80 insertions(+), 85 deletions(-) diff --git a/sha1/src/compress/soft.rs b/sha1/src/compress/soft.rs index 94a019b98..19366fb13 100644 --- a/sha1/src/compress/soft.rs +++ b/sha1/src/compress/soft.rs @@ -175,68 +175,67 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] { [b, c, d, e] } +macro_rules! rounds4 { + ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { + sha1_digest_round_x4($h0, sha1_first_half($h1, $wk), $i) + }; +} + +macro_rules! schedule { + ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { + sha1msg2(xor(sha1msg1($v0, $v1), $v2), $v3) + }; +} + +macro_rules! schedule_rounds4 { + ( + $h0:ident, $h1:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i:expr + ) => { + $w4 = schedule!($w0, $w1, $w2, $w3); + $h1 = rounds4!($h0, $h1, $w4, $i); + }; +} + #[inline(always)] fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) { - macro_rules! schedule { - ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => { - sha1msg2(xor(sha1msg1($v0, $v1), $v2), $v3) - }; - } + let mut w0 = [block[0], block[1], block[2], block[3]]; + let mut w1 = [block[4], block[5], block[6], block[7]]; + let mut w2 = [block[8], block[9], block[10], block[11]]; + let mut w3 = [block[12], block[13], block[14], block[15]]; + let mut w4; - macro_rules! 
rounds4 { - ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { - sha1_digest_round_x4($h0, sha1_first_half($h1, $wk), $i) - }; - } + let mut h0 = [state[0], state[1], state[2], state[3]]; + let mut h1 = sha1_first_add(state[4], w0); // Rounds 0..20 - let mut h0 = [state[0], state[1], state[2], state[3]]; - let mut w0 = [block[0], block[1], block[2], block[3]]; - let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0); - let mut w1 = [block[4], block[5], block[6], block[7]]; + h1 = sha1_digest_round_x4(h0, h1, 0); h0 = rounds4!(h1, h0, w1, 0); - let mut w2 = [block[8], block[9], block[10], block[11]]; h1 = rounds4!(h0, h1, w2, 0); - let mut w3 = [block[12], block[13], block[14], block[15]]; h0 = rounds4!(h1, h0, w3, 0); - let mut w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 0); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0); // Rounds 20..40 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 1); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 1); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 1); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 1); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 1); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1); // Rounds 40..60 - w0 = schedule!(w1, w2, w3, w4); - h1 = rounds4!(h0, h1, w0, 2); - w1 = schedule!(w2, w3, w4, w0); - h0 = rounds4!(h1, h0, w1, 2); - w2 = schedule!(w3, w4, w0, w1); - h1 = rounds4!(h0, h1, w2, 2); - w3 = schedule!(w4, w0, w1, w2); - h0 = rounds4!(h1, h0, w3, 2); - w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 2); + schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2); + schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2); + schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2); + schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2); // Rounds 60..80 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 3); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 3); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 3); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 3); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 3); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3); let e = h1[0].rotate_left(30); let [a, b, c, d] = h0; diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index fd8eebfaf..a4c7c91e7 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -38,6 +38,17 @@ macro_rules! schedule { }; } +macro_rules! 
schedule_rounds4 { + ( + $h0:ident, $h1:ident, + $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr, + $i:expr + ) => { + $w4 = schedule!($w0, $w1, $w2, $w3); + $h1 = rounds4!($h0, $h1, $w4, $i); + }; +} + #[target_feature(enable = "sha,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] @@ -56,57 +67,42 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(clippy::cast_ptr_alignment)] let block_ptr = block.as_ptr() as *const __m128i; - let h0 = state_abcd; - let e0 = state_e; - let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK); let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK); let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK); let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK); + let mut w4; + + let mut h0 = state_abcd; + let mut h1 = _mm_add_epi32(state_e, w0); // Rounds 0..20 - let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0); - let mut h0 = rounds4!(h1, h0, w1, 0); + h1 = _mm_sha1rnds4_epu32(h0, h1, 0); + h0 = rounds4!(h1, h0, w1, 0); h1 = rounds4!(h0, h1, w2, 0); h0 = rounds4!(h1, h0, w3, 0); - let mut w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 0); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 0); // Rounds 20..40 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 1); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 1); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 1); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 1); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 1); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 1); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 1); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 1); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 1); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 1); // Rounds 40..60 - w0 = schedule!(w1, w2, w3, w4); - h1 = rounds4!(h0, h1, w0, 2); - w1 = schedule!(w2, w3, w4, w0); - h0 = rounds4!(h1, h0, w1, 2); - w2 = schedule!(w3, w4, w0, w1); - h1 = rounds4!(h0, h1, w2, 2); - w3 = schedule!(w4, w0, w1, w2); - h0 = rounds4!(h1, h0, w3, 2); - w4 = schedule!(w0, w1, w2, w3); - h1 = rounds4!(h0, h1, w4, 2); + schedule_rounds4!(h0, h1, w1, w2, w3, w4, w0, 2); + schedule_rounds4!(h1, h0, w2, w3, w4, w0, w1, 2); + schedule_rounds4!(h0, h1, w3, w4, w0, w1, w2, 2); + schedule_rounds4!(h1, h0, w4, w0, w1, w2, w3, 2); + schedule_rounds4!(h0, h1, w0, w1, w2, w3, w4, 2); // Rounds 60..80 - w0 = schedule!(w1, w2, w3, w4); - h0 = rounds4!(h1, h0, w0, 3); - w1 = schedule!(w2, w3, w4, w0); - h1 = rounds4!(h0, h1, w1, 3); - w2 = schedule!(w3, w4, w0, w1); - h0 = rounds4!(h1, h0, w2, 3); - w3 = schedule!(w4, w0, w1, w2); - h1 = rounds4!(h0, h1, w3, 3); - w4 = schedule!(w0, w1, w2, w3); - h0 = rounds4!(h1, h0, w4, 3); + schedule_rounds4!(h1, h0, w1, w2, w3, w4, w0, 3); + schedule_rounds4!(h0, h1, w2, w3, w4, w0, w1, 3); + schedule_rounds4!(h1, h0, w3, w4, w0, w1, w2, 3); + schedule_rounds4!(h0, h1, w4, w0, w1, w2, w3, 3); + schedule_rounds4!(h1, h0, w0, w1, w2, w3, w4, 3); state_abcd = _mm_add_epi32(state_abcd, h0); state_e = _mm_sha1nexte_epu32(h1, state_e); From dd960b125192e2da918acde69ddb64dc2752f613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 17:02:48 +0300 Subject: [PATCH 12/14] fix signature --- sha1/src/compress.rs | 2 +- 
sha2/src/sha256/x86.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs index 51a4cdc87..d00dbd1a0 100644 --- a/sha1/src/compress.rs +++ b/sha1/src/compress.rs @@ -8,7 +8,7 @@ cfg_if::cfg_if! { use aarch64::compress as compress_inner; } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] { // TODO: replace after sha1-asm rework - fn compress_inner(state: &mut [u32; 5], blocks: &[u8; 64]) { + fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) { for block in blocks { sha1_asm::compress(state, block); } diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 8c84fd2ab..90e5d7381 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -34,7 +34,7 @@ unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128 macro_rules! rounds4 { ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{ - let k = &crate::consts::K32X4[$i]; + let k = crate::consts::K32X4[$i]; let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32); let t1 = _mm_add_epi32($rest, kv); $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1); From b49a5270eed002ea5d496b06c72eb8579893d51e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 22:02:57 +0300 Subject: [PATCH 13/14] add runtime detection --- Cargo.lock | 8 ++++++++ sha1/Cargo.toml | 1 + sha1/src/compress/x86.rs | 24 ++---------------------- sha2/Cargo.toml | 1 + sha2/src/sha256/x86.rs | 24 ++---------------------- 5 files changed, 14 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 659c6a8dd..62658d5ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -53,6 +53,12 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "cpuid-bool" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d375c433320f6c5057ae04a04376eef4d04ce2801448cf8863a78da99107be4" + [[package]] name = "crypto-mac" version = "0.8.0" @@ -220,6 +226,7 @@ version = "0.9.0" dependencies = [ "block-buffer", "cfg-if", + "cpuid-bool", "digest", "hex-literal", "libc", @@ -242,6 +249,7 @@ version = "0.9.0" dependencies = [ "block-buffer", "cfg-if", + "cpuid-bool", "digest", "hex-literal", "libc", diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 183cd5002..e2241b81a 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" cfg-if = "0.1" +cpuid-bool = "0.1" sha1-asm = { version = "0.4", optional = true } [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs index a4c7c91e7..05d90cc98 100644 --- a/sha1/src/compress/x86.rs +++ b/sha1/src/compress/x86.rs @@ -6,26 +6,6 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -#[cfg(not(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -)))] -fn sha1_supported() -> bool { - false -} - -#[cfg(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -))] -fn sha1_supported() -> bool { - true -} - macro_rules! 
rounds4 { ($h0:ident, $h1:ident, $wk:expr, $i:expr) => { _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i) @@ -49,7 +29,7 @@ macro_rules! schedule_rounds4 { }; } -#[target_feature(enable = "sha,ssse3,sse4.1")] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F); @@ -118,7 +98,7 @@ unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) { pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) { // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 // after stabilization - if sha1_supported() { + if cpuid_bool::cpuid_bool!("sha", "sse2", "ssse3", "sse4.1") { unsafe { digest_blocks(state, blocks); } diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index b4b9b4227..c8c8f61ee 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -19,6 +19,7 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.2" cfg-if = "0.1" +cpuid-bool = "0.1" sha2-asm = { version = "0.5", optional = true } [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] diff --git a/sha2/src/sha256/x86.rs b/sha2/src/sha256/x86.rs index 90e5d7381..04a7d26d0 100644 --- a/sha2/src/sha256/x86.rs +++ b/sha2/src/sha256/x86.rs @@ -5,26 +5,6 @@ use core::arch::x86_64::*; #[cfg(target_arch = "x86")] use core::arch::x86::*; -#[cfg(not(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -)))] -fn is_supported() -> bool { - false -} - -#[cfg(all( - target_feature = "sha", - target_feature = "sse2", - target_feature = "ssse3", - target_feature = "sse4.1", -))] -fn is_supported() -> bool { - true -} - unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i { let t1 = _mm_sha256msg1_epu32(v0, v1); let t2 = _mm_alignr_epi8(v3, v2, 4); @@ -56,7 +36,7 @@ macro_rules! 
schedule_rounds4 { // we use unaligned loads with `__m128i` pointers #[allow(clippy::cast_ptr_alignment)] -#[target_feature(enable = "sha,ssse3,sse4.1")] +#[target_feature(enable = "sha,sse2,ssse3,sse4.1")] unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { #[allow(non_snake_case)] let MASK: __m128i = _mm_set_epi64x( @@ -118,7 +98,7 @@ unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) { pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 // after stabilization - if is_supported() { + if cpuid_bool::cpuid_bool!("sha", "sse2", "ssse3", "sse4.1") { unsafe { digest_blocks(state, blocks); } From 1ee078849e324b4c523b647793c9579d469f9298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D1=91=D0=BC=20=D0=9F=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=BE=D0=B2=20=5BArtyom=20Pavlov=5D?= Date: Thu, 11 Jun 2020 22:11:51 +0300 Subject: [PATCH 14/14] make cpuid-bool x86-only dependency --- sha1/Cargo.toml | 4 +++- sha2/Cargo.toml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml index 5d5868902..eebb7cb4c 100644 --- a/sha1/Cargo.toml +++ b/sha1/Cargo.toml @@ -19,9 +19,11 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.3" cfg-if = "0.1" -cpuid-bool = "0.1" sha1-asm = { version = "0.4", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +cpuid-bool = "0.1" + [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true } diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml index 3e049ec50..ac3893458 100644 --- a/sha2/Cargo.toml +++ b/sha2/Cargo.toml @@ -19,9 +19,11 @@ digest = "0.9" block-buffer = "0.9" opaque-debug = "0.3" cfg-if = "0.1" -cpuid-bool = "0.1" sha2-asm = { version = "0.5", optional = true } +[target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] +cpuid-bool = "0.1" + [target.'cfg(all(target_arch = "aarch64", target_os = "linux"))'.dependencies] libc = { version = "0.2.68", optional = true }
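
---

A closing note on the resulting public API: with the `compress` feature enabled, the series leaves `sha2` exporting the block-level `compress256`/`compress512` functions, which now take a slice of blocks so the CPU-feature check and the state copy are amortized over the whole slice. Below is a minimal usage sketch, not code from the patches themselves: the `main` wrapper, the zero-filled input blocks, and the printed output are illustrative assumptions, while `compress256`, the `compress` feature, and the `GenericArray<u8, U64>` block type are as introduced above.

```rust
use sha2::compress256; // exported only when the `compress` feature is enabled
use sha2::digest::consts::U64;
use sha2::digest::generic_array::GenericArray;

fn main() {
    // SHA-256 initialization vector (H256 in sha2/src/consts.rs).
    let mut state: [u32; 8] = [
        0x6a09_e667, 0xbb67_ae85, 0x3c6e_f372, 0xa54f_f53a,
        0x510e_527f, 0x9b05_688c, 0x1f83_d9ab, 0x5be0_cd19,
    ];

    // Two all-zero 64-byte message blocks; passing a slice of blocks is
    // the point of the new signature.
    let block: GenericArray<u8, U64> = GenericArray::clone_from_slice(&[0u8; 64]);
    let blocks = [block.clone(), block];

    compress256(&mut state, &blocks);
    println!("{:08x?}", state);
}
```

Since this drives the raw compression function, no padding or length block is appended, so the resulting state is not the SHA-256 digest of 128 zero bytes; it is only the chaining value after two compressions from the IV.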