Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use embedded broadcast for avx512 constants. #139

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aes/cbc_std_vectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ static unsigned char C11[] = { 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0
#define vector(N) \
{ \
K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N, C##N), P##N, C##N, NULL, \
NULL, /*NULL, NULL*/ \
NULL, /*NULL, NULL*/ \
}
struct cbc_vector const cbc_vectors[] = {
vector(1), vector(2), vector(3), vector(4), vector(5), vector(6),
Expand Down
6 changes: 2 additions & 4 deletions aes/gcm_vectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,10 +475,8 @@ static uint8_t T11[] = { 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68,
///////

#define vector(N) \
{ \
K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), \
C##N, T##N, sizeof(T##N) \
}
{ K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, \
P##N, sizeof(P##N), C##N, T##N, sizeof(T##N) }

gcm_vector const gcm_vectors[] = {
// field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen};
Expand Down
14 changes: 4 additions & 10 deletions fips/aes_self_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,7 @@ static uint8_t aes_gcm_256_tag[] = { 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0
0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b };

#define ADD_CBC_VECTOR(_key, _iv, _plain, _cipher, _descr) \
{ \
_key, sizeof(_key), _iv, _plain, sizeof(_plain), _cipher, _descr \
}
{ _key, sizeof(_key), _iv, _plain, sizeof(_plain), _cipher, _descr }

static const struct self_test_cbc_vector cbc_vectors[] = {
ADD_CBC_VECTOR(aes_cbc_128_key, aes_cbc_128_iv, aes_cbc_128_plaintext,
Expand Down Expand Up @@ -420,9 +418,7 @@ _aes_cbc_self_test(void)
}

#define ADD_XTS_VECTOR(_key1, _key2, _tweak, _plain, _cipher, _descr) \
{ \
_key1, _key2, sizeof(_key1), _tweak, _plain, sizeof(_plain), _cipher, _descr \
}
{ _key1, _key2, sizeof(_key1), _tweak, _plain, sizeof(_plain), _cipher, _descr }

static const struct self_test_xts_vector xts_vectors[] = {
ADD_XTS_VECTOR(aes_xts_128_key1, aes_xts_128_key2, aes_xts_128_tweak, aes_xts_128_plaintext,
Expand Down Expand Up @@ -557,10 +553,8 @@ _aes_xts_self_test(void)
}

#define ADD_GCM_VECTOR(_key, _iv, _plain, _cipher, _aad, _tag, _descr) \
{ \
_key, sizeof(_key), _iv, _plain, sizeof(_plain), _cipher, _aad, sizeof(_aad), \
_tag, sizeof(_tag), _descr \
}
{ _key, sizeof(_key), _iv, _plain, sizeof(_plain), _cipher, \
_aad, sizeof(_aad), _tag, sizeof(_tag), _descr }

static const struct self_test_gcm_vector gcm_vectors[] = {
ADD_GCM_VECTOR(aes_gcm_128_key, aes_gcm_128_iv, aes_gcm_128_plaintext,
Expand Down
16 changes: 8 additions & 8 deletions mh_sha1/mh_sha1_block_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ func(mh_sha1_block_avx2)
;;
;; perform 0-79 steps
;;
vpbroadcastq K, [K00_19]
vpbroadcastd K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
Expand All @@ -426,14 +426,14 @@ func(mh_sha1_block_avx2)
PREFETCH_X [mh_in_p + pref+128*0]
PREFETCH_X [mh_in_p + pref+128*1]
;; do rounds 20...39
vpbroadcastq K, [K20_39]
vpbroadcastd K, [K20_39]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
ROTATE_ARGS
%assign I (I+1)
%endrep
;; do rounds 40...59
vpbroadcastq K, [K40_59]
vpbroadcastd K, [K40_59]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
ROTATE_ARGS
Expand All @@ -442,7 +442,7 @@ func(mh_sha1_block_avx2)
PREFETCH_X [mh_in_p + pref+128*2]
PREFETCH_X [mh_in_p + pref+128*3]
;; do rounds 60...79
vpbroadcastq K, [K60_79]
vpbroadcastd K, [K60_79]
%rep 20
SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
ROTATE_ARGS
Expand Down Expand Up @@ -502,7 +502,7 @@ section .rodata align=32

align 32
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19: dq 0x5A8279995A827999
K20_39: dq 0x6ED9EBA16ED9EBA1
K40_59: dq 0x8F1BBCDC8F1BBCDC
K60_79: dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6
57 changes: 10 additions & 47 deletions mh_sha1/mh_sha1_block_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ func(mh_sha1_block_avx512)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; align rsp to 64 Bytes needed by avx512
Expand All @@ -271,7 +271,7 @@ func(mh_sha1_block_avx512)
VMOVPS HH3, [mh_digests_p + 64*3]
VMOVPS HH4, [mh_digests_p + 64*4]
;a mask used to transform to big-endian data
vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti32x4 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]

.block_loop:
;transform to big-endian data and store on aligned_frame
Expand All @@ -293,7 +293,7 @@ func(mh_sha1_block_avx512)
vmovdqa64 D, HH3
vmovdqa64 E, HH4

vmovdqa32 KT, [K00_19]
vpbroadcastd KT, [K00_19]
%assign I 0xCA
%assign J 0
%assign K 2
Expand All @@ -306,13 +306,13 @@ func(mh_sha1_block_avx512)
MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
%endif
%if N = 19
vmovdqa32 KT, [K20_39]
vpbroadcastd KT, [K20_39]
%assign I 0x96
%elif N = 39
vmovdqa32 KT, [K40_59]
vpbroadcastd KT, [K40_59]
%assign I 0xE8
%elif N = 59
vmovdqa32 KT, [K60_79]
vpbroadcastd KT, [K60_79]
%assign I 0x96
%endif
%if N % 10 = 9
Expand Down Expand Up @@ -355,48 +355,11 @@ section .data align=64
align 64
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b

K00_19: dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999

K20_39: dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1

K40_59: dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC

K60_79: dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6

%else
%ifidn __OUTPUT_FORMAT__, win64
Expand Down
27 changes: 11 additions & 16 deletions mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; leave enough space to store segs_digests
Expand Down Expand Up @@ -507,7 +507,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)

.block_loop:
;transform to big-endian data and store on aligned_frame
vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti128 F, [PSHUFFLE_BYTE_FLIP_MASK]
;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
%assign I 0
%rep 16
Expand All @@ -521,7 +521,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
%assign I (I+1)
%endrep

mov mh_segs, 0 ;start from the first 8 segments
xor mh_segs, mh_segs ;start from the first 8 segments
mov pref, 1024 ;avoid prefetch repeatedly
.segs_loop:
;; Initialize digests
Expand All @@ -539,7 +539,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
;;
;; perform 0-79 steps
;;
vmovdqa K, [K00_19]
vpbroadcastd K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
Expand All @@ -560,15 +560,15 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
PREFETCH_X [mh_in_p + pref+128*0]
PREFETCH_X [mh_in_p + pref+128*1]
;; do rounds 20...39
vmovdqa K, [K20_39]
vpbroadcastd K, [K20_39]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
ROTATE_ARGS
%assign I (I+1)
%endrep
;; do rounds 40...59
vmovdqa K, [K40_59]
vpbroadcastd K, [K40_59]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
Expand All @@ -578,7 +578,7 @@ func(mh_sha1_murmur3_x64_128_block_avx2)
PREFETCH_X [mh_in_p + pref+128*2]
PREFETCH_X [mh_in_p + pref+128*3]
;; do rounds 60...79
vmovdqa K, [K60_79]
vpbroadcastd K, [K60_79]
%rep 20
%assign J (I % 2)
SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
Expand Down Expand Up @@ -642,12 +642,7 @@ section .data align=32

align 32
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6
57 changes: 10 additions & 47 deletions mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
; save rsp
mov RSP_SAVE, rsp

cmp loops, 0
test loops, loops
jle .return

; align rsp to 64 Bytes needed by avx512
Expand All @@ -354,7 +354,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
VMOVPS HH3, [mh_digests_p + 64*3]
VMOVPS HH4, [mh_digests_p + 64*4]
;a mask used to transform to big-endian data
vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
vbroadcasti32x4 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All these broadcasts can be part of the first commit.


;init murmur variables
mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1
Expand Down Expand Up @@ -384,7 +384,7 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
vmovdqa64 D, HH3
vmovdqa64 E, HH4

vmovdqa32 KT, [K00_19]
vpbroadcastd KT, [K00_19]
%assign I 0xCA
%assign J 0
%assign K 2
Expand All @@ -399,13 +399,13 @@ func(mh_sha1_murmur3_x64_128_block_avx512)
PROCESS_LOOP APPEND(W,J), I
%endif
%if N = 19
vmovdqa32 KT, [K20_39]
vpbroadcastd KT, [K20_39]
%assign I 0x96
%elif N = 39
vmovdqa32 KT, [K40_59]
vpbroadcastd KT, [K40_59]
%assign I 0xE8
%elif N = 59
vmovdqa32 KT, [K60_79]
vpbroadcastd KT, [K60_79]
%assign I 0x96
%endif
%if N % 20 = 19
Expand Down Expand Up @@ -453,48 +453,11 @@ section .data align=64
align 64
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b
dq 0x0405060700010203
dq 0x0c0d0e0f08090a0b

K00_19: dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999
dq 0x5A8279995A827999

K20_39: dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1
dq 0x6ED9EBA16ED9EBA1

K40_59: dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC
dq 0x8F1BBCDC8F1BBCDC

K60_79: dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
dq 0xCA62C1D6CA62C1D6
K00_19: dq 0x5A827999
K20_39: dq 0x6ED9EBA1
K40_59: dq 0x8F1BBCDC
K60_79: dq 0xCA62C1D6

%else
%ifidn __OUTPUT_FORMAT__, win64
Expand Down
Loading