diff --git a/Cargo.lock b/Cargo.lock
index 8bdfe0138..1db6a343e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -47,6 +47,12 @@ version = "1.0.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
 
+[[package]]
+name = "cfg-if"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+
 [[package]]
 name = "crypto-mac"
 version = "0.8.0"
@@ -213,6 +219,7 @@ name = "sha-1"
 version = "0.9.0"
 dependencies = [
  "block-buffer",
+ "cfg-if",
  "digest",
  "hex-literal",
  "libc",
diff --git a/sha1/Cargo.toml b/sha1/Cargo.toml
index aba0700d6..985ef5579 100644
--- a/sha1/Cargo.toml
+++ b/sha1/Cargo.toml
@@ -20,6 +20,7 @@ block-buffer = "0.9"
 opaque-debug = "0.2"
 sha1-asm = { version = "0.4", optional = true }
 libc = { version = "0.2.68", optional = true }
+cfg-if = "0.1"
 
 [dev-dependencies]
 digest = { version = "0.9", features = ["dev"] }
diff --git a/sha1/src/aarch64.rs b/sha1/src/aarch64.rs
deleted file mode 100644
index 8d1a916cc..000000000
--- a/sha1/src/aarch64.rs
+++ /dev/null
@@ -1,8 +0,0 @@
-use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
-
-#[inline(always)]
-pub fn sha1_supported() -> bool {
-    #[allow(unsafe_code)]
-    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
-    (hwcaps & HWCAP_SHA1) != 0
-}
diff --git a/sha1/src/compress.rs b/sha1/src/compress.rs
new file mode 100644
index 000000000..b595199ba
--- /dev/null
+++ b/sha1/src/compress.rs
@@ -0,0 +1,32 @@
+use digest::consts::U64;
+use digest::generic_array::GenericArray;
+
+mod aarch64;
+#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
+mod soft;
+mod x86;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "asm-aarch64")] {
+        use aarch64::compress as compress_inner;
+    } else if #[cfg(feature = "asm")] {
+        // TODO: replace after sha1-asm rework
+        fn compress_inner(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+            for block in blocks {
+                sha1_asm::compress(state, block);
+            }
+        }
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        use x86::compress as compress_inner;
+    } else {
+        use soft::compress as compress_inner;
+    }
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[GenericArray<u8, U64>]) {
+    // SAFETY: GenericArray<u8, U64> and [u8; 64] have
+    // exactly the same memory layout
+    #[allow(unsafe_code)]
+    let blocks: &[[u8; 64]] = unsafe { &*(blocks as *const _ as *const [[u8; 64]]) };
+    compress_inner(state, blocks);
+}
diff --git a/sha1/src/compress/aarch64.rs b/sha1/src/compress/aarch64.rs
new file mode 100644
index 000000000..85295f052
--- /dev/null
+++ b/sha1/src/compress/aarch64.rs
@@ -0,0 +1,21 @@
+#![cfg(feature = "asm-aarch64")]
+use libc::{getauxval, AT_HWCAP, HWCAP_SHA1};
+
+fn sha1_supported() -> bool {
+    #[allow(unsafe_code)]
+    let hwcaps: u64 = unsafe { getauxval(AT_HWCAP) };
+    (hwcaps & HWCAP_SHA1) != 0
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
+    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
+    // to let us use it on no_std.
+    if sha1_supported() {
+        for block in blocks {
+            sha1_asm::compress(state, block);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/utils.rs b/sha1/src/compress/soft.rs
similarity index 66%
rename from sha1/src/utils.rs
rename to sha1/src/compress/soft.rs
index 1d746fb9f..94a019b98 100644
--- a/sha1/src/utils.rs
+++ b/sha1/src/compress/soft.rs
@@ -1,10 +1,6 @@
 #![allow(clippy::many_single_char_names)]
 use crate::consts::{BLOCK_LEN, K0, K1, K2, K3};
 use core::convert::TryInto;
-use digest::generic_array::typenum::U64;
-use digest::generic_array::GenericArray;
-
-type Block = GenericArray<u8, U64>;
 
 #[inline(always)]
 fn add(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
@@ -21,27 +17,18 @@ fn xor(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [a[0] ^ b[0], a[1] ^ b[1], a[2] ^ b[2], a[3] ^ b[3]]
 }
 
-/// Not an intrinsic, but gets the first element of a vector.
-#[inline]
-pub fn sha1_first(w0: [u32; 4]) -> u32 {
-    w0[0]
-}
-
-/// Not an intrinsic, but adds a word to the first element of a vector.
 #[inline]
 pub fn sha1_first_add(e: u32, w0: [u32; 4]) -> [u32; 4] {
     let [a, b, c, d] = w0;
     [e.wrapping_add(a), b, c, d]
 }
 
-/// Emulates `llvm.x86.sha1msg1` intrinsic.
 fn sha1msg1(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [_, _, w2, w3] = a;
     let [w4, w5, _, _] = b;
     [a[0] ^ w2, a[1] ^ w3, a[2] ^ w4, a[3] ^ w5]
 }
 
-/// Emulates `llvm.x86.sha1msg2` intrinsic.
 fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     let [x0, x1, x2, x3] = a;
     let [_, w13, w14, w15] = b;
@@ -54,21 +41,11 @@ fn sha1msg2(a: [u32; 4], b: [u32; 4]) -> [u32; 4] {
     [w16, w17, w18, w19]
 }
 
-/// Performs 4 rounds of the message schedule update.
-/*
-pub fn sha1_schedule_x4(v0: [u32; 4], v1: [u32; 4], v2: [u32; 4], v3: [u32; 4]) -> [u32; 4] {
-    sha1msg2(sha1msg1(v0, v1) ^ v2, v3)
-}
-*/
-
-/// Emulates `llvm.x86.sha1nexte` intrinsic.
 #[inline]
 fn sha1_first_half(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
-    sha1_first_add(sha1_first(abcd).rotate_left(30), msg)
+    sha1_first_add(abcd[0].rotate_left(30), msg)
 }
 
-/// Emulates `llvm.x86.sha1rnds4` intrinsic.
-/// Performs 4 rounds of the message block digest.
 fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] {
     const K0V: [u32; 4] = [K0, K0, K0, K0];
     const K1V: [u32; 4] = [K1, K1, K1, K1];
@@ -84,7 +61,6 @@ fn sha1_digest_round_x4(abcd: [u32; 4], work: [u32; 4], i: i8) -> [u32; 4] {
     }
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
 fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -123,7 +99,6 @@ fn sha1rnds4c(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
 fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -162,7 +137,6 @@ fn sha1rnds4p(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Not an intrinsic, but helps emulate `llvm.x86.sha1rnds4` intrinsic.
fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     let [mut a, mut b, mut c, mut d] = abcd;
     let [t, u, v, w] = msg;
@@ -201,7 +175,7 @@ fn sha1rnds4m(abcd: [u32; 4], msg: [u32; 4]) -> [u32; 4] {
     [b, c, d, e]
 }
 
-/// Process a block with the SHA-1 algorithm.
+#[inline(always)]
 fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     macro_rules! schedule {
         ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
@@ -216,7 +190,6 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     }
 
     // Rounds 0..20
-    // TODO: replace with `[u32; 4]::load`
     let mut h0 = [state[0], state[1], state[2], state[3]];
     let mut w0 = [block[0], block[1], block[2], block[3]];
     let mut h1 = sha1_digest_round_x4(h0, sha1_first_add(state[4], w0), 0);
@@ -265,7 +238,7 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     w4 = schedule!(w0, w1, w2, w3);
     h0 = rounds4!(h1, h0, w4, 3);
 
-    let e = sha1_first(h1).rotate_left(30);
+    let e = h1[0].rotate_left(30);
     let [a, b, c, d] = h0;
 
     state[0] = state[0].wrapping_add(a);
@@ -275,58 +248,16 @@ fn sha1_digest_block_u32(state: &mut [u32; 5], block: &[u32; 16]) {
     state[4] = state[4].wrapping_add(e);
 }
 
-/// Process a block with the SHA-1 algorithm. (See more...)
-///
-/// SHA-1 is a cryptographic hash function, and as such, it operates
-/// on an arbitrary number of bytes. This function operates on a fixed
-/// number of bytes. If you call this function with anything other than
-/// 64 bytes, then it will panic! This function takes two arguments:
-///
-/// * `state` is reference to an **array** of 5 words.
-/// * `block` is reference to a **slice** of 64 bytes.
-///
-/// If you want the function that performs a message digest on an arbitrary
-/// number of bytes, then see also the `Sha1` struct above.
-///
-/// # Implementation
-///
-/// First, some background. Both ARM and Intel are releasing documentation
-/// that they plan to include instruction set extensions for SHA1 and SHA256
-/// sometime in the near future. Second, LLVM won't lower these intrinsics yet,
-/// so these functions were written emulate these instructions. Finally,
-/// the block function implemented with these emulated intrinsics turned out
-/// to be quite fast! What follows is a discussion of this CPU-level view
-/// of the SHA-1 algorithm and how it relates to the mathematical definition.
-///
-/// The SHA instruction set extensions can be divided up into two categories:
-///
-/// * message work schedule update calculation ("schedule" v., "work" n.)
-/// * message block 80-round digest calculation ("digest" v., "block" n.)
-///
-/// The schedule-related functions can be used to easily perform 4 rounds
-/// of the message work schedule update calculation, as shown below:
-///
-/// ```ignore
-/// macro_rules! schedule_x4 {
-///     ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => (
-///         sha1msg2(sha1msg1($v0, $v1) ^ $v2, $v3)
-///     )
-/// }
-///
-/// macro_rules! round_x4 {
-///     ($h0:ident, $h1:ident, $wk:expr, $i:expr) => (
-///         sha1rnds4($h0, sha1_first_half($h1, $wk), $i)
-///     )
-/// }
-/// ```
-///
-/// and also shown above is how the digest-related functions can be used to
-/// perform 4 rounds of the message block digest calculation.
-///
-pub fn compress(state: &mut [u32; 5], block: &Block) {
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
     let mut block_u32 = [0u32; BLOCK_LEN];
-    for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
-        *o = u32::from_be_bytes(chunk.try_into().unwrap());
+    // since LLVM can't properly use aliasing yet it will make
+    // unnecessary state stores without this copy
+    let mut state_cpy = *state;
+    for block in blocks.iter() {
+        for (o, chunk) in block_u32.iter_mut().zip(block.chunks_exact(4)) {
+            *o = u32::from_be_bytes(chunk.try_into().unwrap());
+        }
+        sha1_digest_block_u32(&mut state_cpy, &block_u32);
     }
-    sha1_digest_block_u32(state, &block_u32);
+    *state = state_cpy;
 }
diff --git a/sha1/src/compress/x86.rs b/sha1/src/compress/x86.rs
new file mode 100644
index 000000000..fd8eebfaf
--- /dev/null
+++ b/sha1/src/compress/x86.rs
@@ -0,0 +1,132 @@
+#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#![allow(unsafe_code)]
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+#[cfg(not(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+)))]
+fn sha1_supported() -> bool {
+    false
+}
+
+#[cfg(all(
+    target_feature = "sha",
+    target_feature = "sse2",
+    target_feature = "ssse3",
+    target_feature = "sse4.1",
+))]
+fn sha1_supported() -> bool {
+    true
+}
+
+macro_rules! rounds4 {
+    ($h0:ident, $h1:ident, $wk:expr, $i:expr) => {
+        _mm_sha1rnds4_epu32($h0, _mm_sha1nexte_epu32($h1, $wk), $i)
+    };
+}
+
+macro_rules! schedule {
+    ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {
+        _mm_sha1msg2_epu32(_mm_xor_si128(_mm_sha1msg1_epu32($v0, $v1), $v2), $v3)
+    };
+}
+
+#[target_feature(enable = "sha,ssse3,sse4.1")]
+unsafe fn digest_blocks(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    #[allow(non_snake_case)]
+    let MASK: __m128i = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0A0B_0C0D_0E0F);
+
+    let mut state_abcd = _mm_set_epi32(
+        state[0] as i32,
+        state[1] as i32,
+        state[2] as i32,
+        state[3] as i32,
+    );
+    let mut state_e = _mm_set_epi32(state[4] as i32, 0, 0, 0);
+
+    for block in blocks {
+        // SAFETY: we use only unaligned loads with this pointer
+        #[allow(clippy::cast_ptr_alignment)]
+        let block_ptr = block.as_ptr() as *const __m128i;
+
+        let h0 = state_abcd;
+        let e0 = state_e;
+
+        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(0)), MASK);
+        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(1)), MASK);
+        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(2)), MASK);
+        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.offset(3)), MASK);
+
+        // Rounds 0..20
+        let mut h1 = _mm_sha1rnds4_epu32(h0, _mm_add_epi32(e0, w0), 0);
+        let mut h0 = rounds4!(h1, h0, w1, 0);
+        h1 = rounds4!(h0, h1, w2, 0);
+        h0 = rounds4!(h1, h0, w3, 0);
+        let mut w4 = schedule!(w0, w1, w2, w3);
+        h1 = rounds4!(h0, h1, w4, 0);
+
+        // Rounds 20..40
+        w0 = schedule!(w1, w2, w3, w4);
+        h0 = rounds4!(h1, h0, w0, 1);
+        w1 = schedule!(w2, w3, w4, w0);
+        h1 = rounds4!(h0, h1, w1, 1);
+        w2 = schedule!(w3, w4, w0, w1);
+        h0 = rounds4!(h1, h0, w2, 1);
+        w3 = schedule!(w4, w0, w1, w2);
+        h1 = rounds4!(h0, h1, w3, 1);
+        w4 = schedule!(w0, w1, w2, w3);
+        h0 = rounds4!(h1, h0, w4, 1);
+
+        // Rounds 40..60
+        w0 = schedule!(w1, w2, w3, w4);
+        h1 = rounds4!(h0, h1, w0, 2);
+        w1 = schedule!(w2, w3, w4, w0);
+        h0 = rounds4!(h1, h0, w1, 2);
+        w2 = schedule!(w3, w4, w0, w1);
+        h1 = rounds4!(h0, h1, w2, 2);
+        w3 = schedule!(w4, w0, w1, w2);
+        h0 = rounds4!(h1, h0, w3, 2);
+        w4 = schedule!(w0, w1, w2, w3);
+        h1 = rounds4!(h0, h1, w4, 2);
+
+        // Rounds 60..80
+        w0 = schedule!(w1, w2, w3, w4);
+        h0 = rounds4!(h1, h0, w0, 3);
+        w1 = schedule!(w2, w3, w4, w0);
+        h1 = rounds4!(h0, h1, w1, 3);
+        w2 = schedule!(w3, w4, w0, w1);
+        h0 = rounds4!(h1, h0, w2, 3);
+        w3 = schedule!(w4, w0, w1, w2);
+        h1 = rounds4!(h0, h1, w3, 3);
+        w4 = schedule!(w0, w1, w2, w3);
+        h0 = rounds4!(h1, h0, w4, 3);
+
+        state_abcd = _mm_add_epi32(state_abcd, h0);
+        state_e = _mm_sha1nexte_epu32(h1, state_e);
+    }
+
+    state[0] = _mm_extract_epi32(state_abcd, 3) as u32;
+    state[1] = _mm_extract_epi32(state_abcd, 2) as u32;
+    state[2] = _mm_extract_epi32(state_abcd, 1) as u32;
+    state[3] = _mm_extract_epi32(state_abcd, 0) as u32;
+    state[4] = _mm_extract_epi32(state_e, 3) as u32;
+}
+
+pub fn compress(state: &mut [u32; 5], blocks: &[[u8; 64]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha1_supported() {
+        unsafe {
+            digest_blocks(state, blocks);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
diff --git a/sha1/src/lib.rs b/sha1/src/lib.rs
index abe0bb492..da93d5549 100644
--- a/sha1/src/lib.rs
+++ b/sha1/src/lib.rs
@@ -54,33 +54,20 @@ compile_error!("Enable the \"asm\" feature instead of \"asm-aarch64\" when build
 ))]
 compile_error!("Enable the \"asm-aarch64\" feature on AArch64 if you want to use asm detected at runtime, or build with the crypto extensions support, for instance with RUSTFLAGS='-C target-cpu=native' on a compatible CPU.");
 
-#[macro_use]
-extern crate opaque_debug;
-#[cfg(feature = "asm")]
-extern crate sha1_asm;
 #[cfg(feature = "std")]
 extern crate std;
 
-#[cfg(feature = "asm-aarch64")]
-mod aarch64;
+mod compress;
 mod consts;
-#[cfg(any(not(feature = "asm"), feature = "asm-aarch64"))]
-mod utils;
-
-pub use digest::{self, Digest};
 
+use crate::compress::compress;
 use crate::consts::{H, STATE_LEN};
 use block_buffer::BlockBuffer;
 use digest::consts::{U20, U64};
 use digest::impl_write;
+pub use digest::{self, Digest};
 use digest::{BlockInput, FixedOutputDirty, Reset, Update};
 
-#[cfg(not(feature = "asm"))]
-use crate::utils::compress;
-
-#[cfg(feature = "asm")]
-use digest::generic_array::GenericArray;
-
 /// Structure representing the state of a SHA-1 computation
 #[derive(Clone)]
 pub struct Sha1 {
@@ -109,7 +96,7 @@ impl Update for Sha1 {
         // Assumes that `length_bits<<3` will not overflow
         self.len += input.len() as u64;
         let state = &mut self.h;
-        self.buffer.input_block(input, |d| compress(state, d));
+        self.buffer.input_blocks(input, |d| compress(state, d));
     }
 }
 
@@ -119,7 +106,8 @@ impl FixedOutputDirty for Sha1 {
     fn finalize_into_dirty(&mut self, out: &mut digest::Output<Self>) {
         let s = &mut self.h;
         let l = self.len << 3;
-        self.buffer.len64_padding_be(l, |d| compress(s, d));
+        self.buffer
+            .len64_padding_be(l, |d| compress(s, core::slice::from_ref(d)));
         for (chunk, v) in out.chunks_exact_mut(4).zip(self.h.iter()) {
             chunk.copy_from_slice(&v.to_be_bytes());
         }
@@ -134,28 +122,5 @@ impl Reset for Sha1 {
     }
 }
 
-#[cfg(all(feature = "asm", not(feature = "asm-aarch64")))]
-#[inline(always)]
-fn compress(state: &mut [u32; 5], block: &GenericArray<u8, U64>) {
-    #[allow(unsafe_code)]
-    let block: &[u8; 64] = unsafe { core::mem::transmute(block) };
-    sha1_asm::compress(state, block);
-}
-
-#[cfg(feature = "asm-aarch64")]
-#[inline(always)]
-fn compress(state: &mut [u32; 5], block: &GenericArray<u8, U64>) {
-    // TODO: Replace this platform-specific call with is_aarch64_feature_detected!("sha1") once
-    // that macro is stabilised and https://github.com/rust-lang/rfcs/pull/2725 is implemented
-    // to let us use it on no_std.
-    if aarch64::sha1_supported() {
-        #[allow(unsafe_code)]
-        let block: &[u8; 64] = unsafe { core::mem::transmute(block) };
-        sha1_asm::compress(state, block);
-    } else {
-        utils::compress(state, block);
-    }
-}
-
-impl_opaque_debug!(Sha1);
+opaque_debug::impl_opaque_debug!(Sha1);
 impl_write!(Sha1);
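
For reference, a minimal usage sketch of the refactored crate (not part of the diff). It goes through the public `Digest` API from digest 0.9; the dispatch between the `soft`, `x86`, and `aarch64` back ends happens inside the private `compress` module and is invisible to callers. The input string and the length check are illustrative only.

```rust
use sha1::{Digest, Sha1};

fn main() {
    // Incremental hashing through the `Digest`/`Update` traits re-exported by the crate.
    let mut hasher = Sha1::new();
    hasher.update(b"hello world");

    // `finalize` drives the buffered blocks through `compress`, which selects the
    // soft, x86 (SHA-NI), or aarch64 implementation depending on features and target.
    let digest = hasher.finalize();
    assert_eq!(digest.len(), 20); // SHA-1 always yields a 160-bit digest

    for byte in digest.iter() {
        print!("{:02x}", byte);
    }
    println!();
}
```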