Skip to content

Commit b4facba

Browse files
committed
vectorized polyw1_pack
1 parent 69d91bc commit b4facba

File tree

5 files changed

+82
-34
lines changed

5 files changed

+82
-34
lines changed

SHA256SUMS

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,3 @@
1-
6e7e295859101ce0aa3c66600deb1a194c9d6874a08d9d5f858a25bfbb0d34ca tvecs2
2-
33db29f32cbf87fa3f5b0206a68d08703b9149f3fce112190991420649fbb46a tvecs3
3-
d71584fc3cc709fbbe2817e410e26e5729539242f0cc1d0b193cd8ccc69f29e5 tvecs4
4-
358fcf0bcc7bec69b92aeb42c6bba4c4f989693a87e0384415ced93fe9c17069 tvecs2aes
5-
edd793047ed12fbe27e9b188b474c34ee5c75e8f4c6e6f0312dd18f0b38318db tvecs3aes
6-
61f9357202efc7b564d4426755727248a3fb1f45cd74cdeae1a9533c43bd7322 tvecs4aes
71
81ff60e3ef698751e5572f0bb7f831f069605229c220ee1cf27a92572d6ebc7e PQCsignKAT_Dilithium2.req
82
532f4a7a416bba96b607395a6d07fc0eaab1f1f968e49758d2a97c718de832e7 PQCsignKAT_Dilithium2.rsp
93
81ff60e3ef698751e5572f0bb7f831f069605229c220ee1cf27a92572d6ebc7e PQCsignKAT_Dilithium3.req
@@ -16,3 +10,9 @@ edd793047ed12fbe27e9b188b474c34ee5c75e8f4c6e6f0312dd18f0b38318db tvecs3aes
1610
f3c5fcceafa9fb2462721f272791a26c9a123b3a07fad7e07dfec232085fdd7f PQCsignKAT_Dilithium3-AES.rsp
1711
81ff60e3ef698751e5572f0bb7f831f069605229c220ee1cf27a92572d6ebc7e PQCsignKAT_Dilithium4-AES.req
1812
8de4e2ac2032f714263aa0d045275ec62b6f192f8828cfe82b63ec0b0b32deb6 PQCsignKAT_Dilithium4-AES.rsp
13+
592010c7047b68ed07424b8fda5c337d70eccb5dad99e2b3dbc9a124bef89ccd tvecs2
14+
d6cd6334975b89e851b43af35213fdc606b334a9364a86d7f0533d8a87fa54d3 tvecs2aes
15+
190476ba894321b66bbff6af8ced1bb2df3dcb95beef80fb73f1c8c058eaf526 tvecs3
16+
057382ca533828c302c06e8fdbc9d6a313d27b0415a7f5ef004e664dae52fd2a tvecs3aes
17+
f1523cc865df73a8e210c0363dc285e2b59f4c4fbde81c38507ac08279635997 tvecs4
18+
4f0e3cefd83bae430e5d9773040631fcb6ad0cfe1d1472ab5e2d1dfc8977cd40 tvecs4aes

avx2/poly.c

Lines changed: 48 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1045,14 +1045,56 @@ void polyz_unpack(poly * restrict r, const uint8_t * restrict a) {
10451045
**************************************************/
10461046
void polyw1_pack(uint8_t * restrict r, const poly * restrict a) {
10471047
unsigned int i;
1048-
// _mm256i vec;
1048+
__m256i f0, f1, f2, f3, f4, f5, f6, f7;
1049+
const __m256i mask = _mm256_set1_epi64x(0xFF00FF00FF00FF00);
1050+
const __m256i idx = _mm256_set_epi8(15,13,14,12,11, 9,10, 8,
1051+
7, 5, 6, 4, 3, 1, 2, 0,
1052+
15,13,14,12,11, 9,10, 8,
1053+
7, 5, 6, 4, 3, 1, 2, 0);
10491054
DBENCH_START();
10501055

1051-
// for(i = 0; i < N/8: ++i) {
1052-
// vec = _mm256_load_si256((__m256i *)&a->coeffs[8*i]);
1053-
1054-
for(i = 0; i < N/2; ++i)
1055-
r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4);
1056+
for(i = 0; i < N/64; ++i) {
1057+
f0 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+ 0]);
1058+
f1 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+ 8]);
1059+
f2 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+16]);
1060+
f3 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+24]);
1061+
1062+
f0 = _mm256_and_si256(f0, _mm256_set1_epi32(15));
1063+
f1 = _mm256_and_si256(f1, _mm256_set1_epi32(15));
1064+
f2 = _mm256_and_si256(f2, _mm256_set1_epi32(15));
1065+
f3 = _mm256_and_si256(f3, _mm256_set1_epi32(15));
1066+
1067+
f0 = _mm256_packus_epi32(f0, f1);
1068+
f4 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+32]);
1069+
f5 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+40]);
1070+
1071+
f1 = _mm256_packus_epi32(f2, f3);
1072+
f6 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+48]);
1073+
f7 = _mm256_load_si256((__m256i *)&a->coeffs[64*i+56]);
1074+
1075+
f4 = _mm256_and_si256(f4, _mm256_set1_epi32(15));
1076+
f5 = _mm256_and_si256(f5, _mm256_set1_epi32(15));
1077+
f6 = _mm256_and_si256(f6, _mm256_set1_epi32(15));
1078+
f7 = _mm256_and_si256(f7, _mm256_set1_epi32(15));
1079+
1080+
f2 = _mm256_packus_epi32(f4, f5);
1081+
f3 = _mm256_packus_epi32(f6, f7);
1082+
f0 = _mm256_packus_epi16(f0, f1);
1083+
f1 = _mm256_packus_epi16(f2, f3);
1084+
f2 = _mm256_permute2x128_si256(f0, f1, 0x20); /* ABCD */
1085+
f3 = _mm256_permute2x128_si256(f0, f1, 0x31); /* EFGH */
1086+
1087+
f4 = _mm256_srli_epi16(f2, 8); /* B0D0 */
1088+
f5 = _mm256_slli_epi16(f3, 8); /* 0E0G */
1089+
f0 = _mm256_blendv_epi8(f2, f5, mask); /* AECG */
1090+
f1 = _mm256_blendv_epi8(f4, f3, mask); /* BFDH */
1091+
1092+
f1 = _mm256_slli_epi16(f1, 4);
1093+
f0 = _mm256_or_si256(f0, f1);
1094+
1095+
f0 = _mm256_shuffle_epi8(f0, idx);
1096+
_mm256_storeu_si256((__m256i *)&r[32*i], f0);
1097+
}
10561098

10571099
DBENCH_STOP(*tpack);
10581100
}

avx2/sign.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,9 +80,9 @@ void challenge(poly *c,
8080
**************************************************/
8181
int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
8282
unsigned int i;
83-
__attribute__((aligned(32)))
83+
__attribute__((aligned(32)))
8484
uint8_t seedbuf[3*SEEDBYTES];
85-
__attribute__((aligned(32)))
85+
__attribute__((aligned(32)))
8686
uint8_t tr[CRHBYTES];
8787
const uint8_t *rho, *rhoprime, *key;
8888
polyvecl mat[K];

avx2/test/test_vectors.c

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#include <stdint.h>
22
#include <stdio.h>
3+
#include <string.h>
34
#include "../params.h"
45
#include "../sign.h"
56
#include "../poly.h"
@@ -172,9 +173,8 @@ int main(void) {
172173

173174
if(polyveck_chknorm(&w1, 16))
174175
fprintf(stderr, "ERROR in polyveck_chknorm(&w1, 16)!\n");
175-
h = w0;
176-
polyveck_csubq(&h);
177-
if(polyveck_chknorm(&h, ALPHA/2+1))
176+
polyveck_csubq(&w0);
177+
if(polyveck_chknorm(&w0, ALPHA/2+1))
178178
fprintf(stderr, "ERROR in polyveck_chknorm(&w0 ,ALPHA/2+1)!\n");
179179

180180
printf("w1 = ((");
@@ -189,7 +189,8 @@ int main(void) {
189189
printf("w0 = ((");
190190
for(j = 0; j < K; ++j) {
191191
for(k = 0; k < N; ++k) {
192-
u = w0.vec[j].coeffs[k] - Q;
192+
u = w0.vec[j].coeffs[k];
193+
if(u > (Q-1)/2) u -= Q;
193194
printf("%7d", u);
194195
if(k < N-1) printf(", ");
195196
else if(j < K-1) printf("),\n (");
@@ -220,9 +221,8 @@ int main(void) {
220221

221222
if(polyveck_chknorm(&t1, 512))
222223
fprintf(stderr, "ERROR in polyveck_chknorm(&t1, 512)!\n");
223-
h = t0;
224-
polyveck_csubq(&h);
225-
if(polyveck_chknorm(&h, (1U << (D-1)) + 1))
224+
polyveck_csubq(&t0);
225+
if(polyveck_chknorm(&t0, (1U << (D-1)) + 1))
226226
fprintf(stderr, "ERROR in polyveck_chknorm(&t0, 1 << (D-1) + 1)!\n");
227227

228228
printf("t1 = ((");
@@ -237,15 +237,16 @@ int main(void) {
237237
printf("t0 = ((");
238238
for(j = 0; j < K; ++j) {
239239
for(k = 0; k < N; ++k) {
240-
u = t0.vec[j].coeffs[k] - Q;
240+
u = t0.vec[j].coeffs[k];
241+
if(u > (Q-1)/2) u -= Q;
241242
printf("%5d", u);
242243
if(k < N-1) printf(", ");
243244
else if(j < K-1) printf("),\n (");
244245
else printf(")\n");
245246
}
246247
}
247248

248-
challenge(&c, seed, &w); //FIXME: w1
249+
challenge(&c, seed, &w1);
249250
printf("c = (");
250251
for(j = 0; j < N; ++j) {
251252
u = c.coeffs[j];
@@ -257,10 +258,12 @@ int main(void) {
257258

258259
polyveck_make_hint(&h, &w0, &w1);
259260
pack_sig(buf, &y, &h, &c);
260-
unpack_sig(&y, &h, &tmp, buf);
261+
unpack_sig(&y, &w, &tmp, buf);
261262
for(j = 0; j < N; j++)
262263
if(c.coeffs[j] != tmp.coeffs[j])
263264
fprintf(stderr, "ERROR in (un)pack_sig!\n");
265+
if(memcmp(&h,&w,sizeof(h)))
266+
fprintf(stderr, "ERROR in (un)pack_sig!\n");
264267

265268
printf("\n");
266269
}

ref/test/test_vectors.c

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#include <stdint.h>
22
#include <stdio.h>
3+
#include <string.h>
34
#include "../params.h"
45
#include "../sign.h"
56
#include "../poly.h"
@@ -124,9 +125,8 @@ int main(void) {
124125

125126
if(polyveck_chknorm(&w1, 16))
126127
fprintf(stderr, "ERROR in polyveck_chknorm(&w1, 16)!\n");
127-
h = w0;
128-
polyveck_csubq(&h);
129-
if(polyveck_chknorm(&h, ALPHA/2+1))
128+
polyveck_csubq(&w0);
129+
if(polyveck_chknorm(&w0, ALPHA/2+1))
130130
fprintf(stderr, "ERROR in polyveck_chknorm(&w0 ,ALPHA/2+1)!\n");
131131

132132
printf("w1 = ((");
@@ -141,7 +141,8 @@ int main(void) {
141141
printf("w0 = ((");
142142
for(j = 0; j < K; ++j) {
143143
for(k = 0; k < N; ++k) {
144-
u = w0.vec[j].coeffs[k] - Q;
144+
u = w0.vec[j].coeffs[k];
145+
if(u > (Q-1)/2) u -= Q;
145146
printf("%7d", u);
146147
if(k < N-1) printf(", ");
147148
else if(j < K-1) printf("),\n (");
@@ -172,9 +173,8 @@ int main(void) {
172173

173174
if(polyveck_chknorm(&t1, 512))
174175
fprintf(stderr, "ERROR in polyveck_chknorm(&t1, 512)!\n");
175-
h = t0;
176-
polyveck_csubq(&h);
177-
if(polyveck_chknorm(&h, (1U << (D-1)) + 1))
176+
polyveck_csubq(&t0);
177+
if(polyveck_chknorm(&t0, (1U << (D-1)) + 1))
178178
fprintf(stderr, "ERROR in polyveck_chknorm(&t0, 1 << (D-1) + 1)!\n");
179179

180180
printf("t1 = ((");
@@ -189,15 +189,16 @@ int main(void) {
189189
printf("t0 = ((");
190190
for(j = 0; j < K; ++j) {
191191
for(k = 0; k < N; ++k) {
192-
u = t0.vec[j].coeffs[k] - Q;
192+
u = t0.vec[j].coeffs[k];
193+
if(u > (Q-1)/2) u -= Q;
193194
printf("%5d", u);
194195
if(k < N-1) printf(", ");
195196
else if(j < K-1) printf("),\n (");
196197
else printf(")\n");
197198
}
198199
}
199200

200-
challenge(&c, seed, &w); //FIXME: w1
201+
challenge(&c, seed, &w1);
201202
printf("c = (");
202203
for(j = 0; j < N; ++j) {
203204
u = c.coeffs[j];
@@ -209,10 +210,12 @@ int main(void) {
209210

210211
polyveck_make_hint(&h, &w0, &w1);
211212
pack_sig(buf, &y, &h, &c);
212-
unpack_sig(&y, &h, &tmp, buf);
213+
unpack_sig(&y, &w, &tmp, buf);
213214
for(j = 0; j < N; j++)
214215
if(c.coeffs[j] != tmp.coeffs[j])
215216
fprintf(stderr, "ERROR in (un)pack_sig!\n");
217+
if(memcmp(&h,&w,sizeof(h)))
218+
fprintf(stderr, "ERROR in (un)pack_sig!\n");
216219

217220
printf("\n");
218221
}

0 commit comments

Comments
 (0)