Skip to content

Commit c087948

Browse files
authored
Turbo Base64: Encode/Decode avx512
1 parent 1ce41f0 commit c087948

File tree

1 file changed

+137
-125
lines changed

1 file changed

+137
-125
lines changed

turbob64v512.c

Lines changed: 137 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,150 @@
2525

2626
#define PREFETCH(_ip_,_i_,_rw_) __builtin_prefetch(_ip_+(_i_),_rw_)
2727

28+
//-------------------- Encode ----------------------------------------------------------------------
29+
//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI
30+
//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi
31+
#define ES512(_i_) { __m512i v0,v1;\
32+
v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192) ),\
33+
v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\
34+
u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\
35+
u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\
36+
_mm512_storeu_si512((__m512i*)(op+_i_*256), u0);\
37+
_mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\
38+
\
39+
u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\
40+
u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\
41+
v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\
42+
v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\
43+
_mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\
44+
_mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\
45+
}
46+
47+
size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) {
48+
const unsigned char *ip = in;
49+
unsigned char *op = out;
50+
unsigned outlen = TB64ENCLEN(inlen);
51+
52+
const __m512i vlut = _mm512_setr_epi64(0x4847464544434241ull, 0x504F4E4D4C4B4A49ull, // ABCDEF...789+/
53+
0x5857565554535251ull, 0x6665646362615A59ull,
54+
0x6E6D6C6B6A696867ull, 0x767574737271706Full,
55+
0x333231307A797877ull, 0x2F2B393837363534ull),
56+
vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
57+
0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
58+
0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
59+
0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e),
60+
vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10
61+
62+
#define EN 256
63+
if(outlen >= 128+256) {
64+
__m512i u0 = _mm512_loadu_si512((__m512i *) ip );
65+
__m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48));
66+
for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) {
67+
ES512(0);
68+
#if EN > 256
69+
ES512(1);
70+
#endif
71+
PREFETCH(ip, 384, 0);
72+
}
73+
#if EN > 256
74+
if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; }
75+
#endif
76+
}
77+
78+
const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
79+
10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
80+
for(; op < out+outlen-32; op += 32, ip += 32*3/4) {
81+
__m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) );
82+
v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1);
83+
v = _mm256_shuffle_epi8(v, vh);
84+
v = bitunpack256v8_6(v);
85+
v = bitmap256v8_6(v);
86+
_mm256_storeu_si256((__m256i*) op, v);
87+
}
88+
EXTAIL();
89+
return outlen;
90+
}
91+
2892
//--------------------- Decode ----------------------------------------------------------------------
2993
#define CHECK0(a) a
3094
#ifdef B64CHECK
3195
#define CHECK1(a) a
3296
#else
3397
#define CHECK1(a)
3498
#endif
35-
#if 0 // Not faster than avx2
99+
//----------------------------------------------------------
100+
#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI
101+
102+
#define BITPACK512V8_6(v) {\
103+
__m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\
104+
vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\
105+
v = _mm512_permutexvar_epi8(vp, vm);\
106+
}
107+
108+
#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe)
109+
110+
#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1; \
111+
iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)), \
112+
iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\
113+
\
114+
BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\
115+
BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\
116+
\
117+
iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\
118+
iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\
119+
\
120+
_mm512_storeu_si512((__m128i*)(op+_i_*192), ou0);\
121+
_mm512_storeu_si512((__m128i*)(op+_i_*192+48), ou1);\
122+
\
123+
BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\
124+
BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\
125+
\
126+
_mm512_storeu_si512((__m128i*)(op+_i_*192+ 96), ov0);\
127+
_mm512_storeu_si512((__m128i*)(op+_i_*192+144), ov1);\
128+
}
129+
130+
//-----------------------------------------------
131+
size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) {
132+
const unsigned char *ip = in;
133+
unsigned char *op = out;
134+
#define DN 512
135+
__m512i vx = _mm512_setzero_si512();
136+
if(inlen > 56+128) {
137+
const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080,
138+
0x80808080, 0x80808080, 0x80808080, 0x80808080,
139+
0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
140+
0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080),
141+
vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
142+
0x1211100f, 0x16151413, 0x80191817, 0x80808080,
143+
0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
144+
0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080),
145+
vp = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
146+
0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
147+
0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
148+
0x00000000, 0x00000000, 0x00000000, 0x00000000);
149+
150+
__m512i iu0 = _mm512_loadu_si512((__m512i *) ip),
151+
iu1 = _mm512_loadu_si512((__m512i *)(ip+64));
152+
for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0);
153+
DS512(0);
154+
#if DN > 256
155+
DS512(1);
156+
#endif
157+
}
158+
for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) {
159+
__m512i iv = _mm512_loadu_si512((__m512i *) ip), ov;
160+
BITMAP256V8_6(iv, ov);
161+
CHECK0(B64CHK(iv, ov, vx));
162+
BITPACK512V8_6(ov);
163+
_mm512_storeu_si512((__m128i*) op, ov);
164+
}
165+
}
166+
unsigned rc, r = inlen-(ip-in);
167+
if(r && !(rc=tb64xdec(ip, r, op)) || _mm512_movepi8_mask(vx)) return 0;
168+
return (op-out)+rc;
169+
}
170+
171+
#if 0 // AVX512F but Not faster than avx2
36172
#define BITPACK512V8_6_(v) {\
37173
const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140));\
38174
v = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));\
@@ -124,127 +260,3 @@ size_t tb64v512dec0(const unsigned char *in, size_t inlen, unsigned char *out) {
124260
return (op-out)+rc;
125261
}
126262
#endif
127-
//----------------------------------------------------------
128-
#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI
129-
130-
#define BITPACK512V8_6(v) {\
131-
__m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\
132-
vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\
133-
v = _mm512_permutexvar_epi8(vp, vm);\
134-
}
135-
136-
#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe)
137-
138-
#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1;\
139-
iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)), \
140-
iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\
141-
BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\
142-
BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\
143-
iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\
144-
iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\
145-
_mm512_storeu_si512((__m128i*)(op+_i_*192), ou0);\
146-
_mm512_storeu_si512((__m128i*)(op+_i_*192+48), ou1);\
147-
BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\
148-
BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\
149-
_mm512_storeu_si512((__m128i*)(op+_i_*192+ 96), ov0);\
150-
_mm512_storeu_si512((__m128i*)(op+_i_*192+144), ov1);\
151-
}
152-
153-
//-----------------------------------------------
154-
size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) {
155-
const unsigned char *ip = in;
156-
unsigned char *op = out;
157-
#define DN 512
158-
__m512i vx = _mm512_setzero_si512();
159-
if(inlen > 56+128) {
160-
const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080,
161-
0x80808080, 0x80808080, 0x80808080, 0x80808080,
162-
0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
163-
0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080),
164-
vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
165-
0x1211100f, 0x16151413, 0x80191817, 0x80808080,
166-
0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
167-
0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080),
168-
vp = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
169-
0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
170-
0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
171-
0x00000000, 0x00000000, 0x00000000, 0x00000000);
172-
173-
__m512i iu0 = _mm512_loadu_si512((__m512i *) ip),
174-
iu1 = _mm512_loadu_si512((__m512i *)(ip+64));
175-
for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0);
176-
DS512(0);
177-
#if DN > 256
178-
DS512(1);
179-
#endif
180-
}
181-
for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) {
182-
__m512i iv = _mm512_loadu_si512((__m512i *) ip), ov;
183-
BITMAP256V8_6(iv, ov); CHECK0(B64CHK(iv, ov, vx)); BITPACK512V8_6(ov);
184-
_mm512_storeu_si512((__m128i*) op, ov);
185-
}
186-
}
187-
unsigned rc, r = inlen-(ip-in);
188-
if(r && !(rc=tb64xdec(ip, r, op)) || _mm512_movepi8_mask(vx)) return 0;
189-
return (op-out)+rc;
190-
}
191-
192-
//-------------------- Encode ----------------------------------------------------------------------
193-
//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI
194-
//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi
195-
#define ES512(_i_) { __m512i v0,v1;\
196-
v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192) ),\
197-
v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\
198-
u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\
199-
u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\
200-
_mm512_storeu_si512((__m512i*)(op+_i_*256), u0);\
201-
_mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\
202-
\
203-
u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\
204-
u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\
205-
v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\
206-
v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\
207-
_mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\
208-
_mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\
209-
}
210-
211-
size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) {
212-
const unsigned char *ip = in;
213-
unsigned char *op = out;
214-
unsigned outlen = TB64ENCLEN(inlen);
215-
216-
static const char *lut = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
217-
const __m512i vlut = _mm512_loadu_si512((const __m512i*)lut);
218-
const __m512i vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
219-
0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
220-
0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
221-
0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
222-
const __m512i vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10
223-
224-
#define EN 256
225-
if(outlen >= 128+256) {
226-
__m512i u0 = _mm512_loadu_si512((__m512i *) ip );
227-
__m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48));
228-
for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) {
229-
ES512(0);
230-
#if EN > 256
231-
ES512(1);
232-
#endif
233-
PREFETCH(ip, 384, 0);
234-
}
235-
#if EN > 256
236-
if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; }
237-
#endif
238-
}
239-
240-
const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
241-
10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
242-
for(; op < out+outlen-32; op += 32, ip += 32*3/4) {
243-
__m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) );
244-
v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1);
245-
v = _mm256_shuffle_epi8(v, vh); v = bitunpack256v8_6(v); v = bitmap256v8_6(v);
246-
_mm256_storeu_si256((__m256i*) op, v);
247-
}
248-
EXTAIL();
249-
return outlen;
250-
}

0 commit comments

Comments
 (0)