|
25 | 25 |
|
26 | 26 | #define PREFETCH(_ip_,_i_,_rw_) __builtin_prefetch(_ip_+(_i_),_rw_)
|
27 | 27 |
|
//-------------------- Encode ----------------------------------------------------------------------
//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI
//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi
// ES512(_i_): encode 4x48 = 192 source bytes into 4x64 = 256 base64 chars.
// Caller-scope dependencies: ip/op (stream pointers), u0/u1 (zmm pre-loaded with
// the next 2x48 input bytes), vf (byte-gather permute pattern), vs (multishift
// bit offsets 48,54,36,42,16,22,4,10 packed per qword) and vlut (the 64-char
// base64 alphabet).  The loads for the following round (v0/v1, then u0/u1)
// are interleaved with the permute/multishift conversions to hide load latency,
// which is why u0/u1 live in the caller's scope across iterations.
// NOTE(review): comments cannot go inside the macro body — a // comment before
// the '\' continuation would swallow the continuation.
#define ES512(_i_) { __m512i v0,v1;\
  v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192)   ),\
  v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\
  u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\
  u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\
  _mm512_storeu_si512((__m512i*)(op+_i_*256),    u0);\
  _mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\
  \
  u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\
  u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\
  v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\
  v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\
  _mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\
  _mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\
}
| 46 | + |
| 47 | +size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) { |
| 48 | + const unsigned char *ip = in; |
| 49 | + unsigned char *op = out; |
| 50 | + unsigned outlen = TB64ENCLEN(inlen); |
| 51 | + |
| 52 | + const __m512i vlut = _mm512_setr_epi64(0x4847464544434241ull, 0x504F4E4D4C4B4A49ull, // ABCDEF...789+/ |
| 53 | + 0x5857565554535251ull, 0x6665646362615A59ull, |
| 54 | + 0x6E6D6C6B6A696867ull, 0x767574737271706Full, |
| 55 | + 0x333231307A797877ull, 0x2F2B393837363534ull), |
| 56 | + vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, |
| 57 | + 0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516, |
| 58 | + 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, |
| 59 | + 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e), |
| 60 | + vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10 |
| 61 | + |
| 62 | + #define EN 256 |
| 63 | + if(outlen >= 128+256) { |
| 64 | + __m512i u0 = _mm512_loadu_si512((__m512i *) ip ); |
| 65 | + __m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48)); |
| 66 | + for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) { |
| 67 | + ES512(0); |
| 68 | + #if EN > 256 |
| 69 | + ES512(1); |
| 70 | + #endif |
| 71 | + PREFETCH(ip, 384, 0); |
| 72 | + } |
| 73 | + #if EN > 256 |
| 74 | + if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; } |
| 75 | + #endif |
| 76 | + } |
| 77 | + |
| 78 | + const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, |
| 79 | + 10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); |
| 80 | + for(; op < out+outlen-32; op += 32, ip += 32*3/4) { |
| 81 | + __m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) ); |
| 82 | + v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1); |
| 83 | + v = _mm256_shuffle_epi8(v, vh); |
| 84 | + v = bitunpack256v8_6(v); |
| 85 | + v = bitmap256v8_6(v); |
| 86 | + _mm256_storeu_si256((__m256i*) op, v); |
| 87 | + } |
| 88 | + EXTAIL(); |
| 89 | + return outlen; |
| 90 | +} |
| 91 | + |
28 | 92 | //--------------------- Decode ----------------------------------------------------------------------
|
29 | 93 | #define CHECK0(a) a
|
30 | 94 | #ifdef B64CHECK
|
31 | 95 | #define CHECK1(a) a
|
32 | 96 | #else
|
33 | 97 | #define CHECK1(a)
|
34 | 98 | #endif
|
35 |
| - #if 0 // Not faster than avx2 |
//----------------------------------------------------------
// Map 64 base64 characters to their 6-bit values with a single two-table
// byte permute (AVX-512_VBMI).  vlut0/vlut1 come from the caller's scope;
// table entries of 0x80 mark invalid characters (bit 7 set) for B64CHK.
#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI

// Pack 64 bytes of 6-bit values down to 48 payload bytes:
// maddubs/madd merge the 6-bit fields pairwise, then the vp permute (caller
// scope) compacts the 24-bit groups to the low 3/4 of v.
#define BITPACK512V8_6(v) {\
  __m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\
          vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\
  v = _mm512_permutexvar_epi8(vp, vm);\
}

// Accumulate error bits: imm 0xfe is the 3-input OR, so vx |= ov | iv.
// Only bit 7 of each byte matters to the final check — valid ASCII input and
// valid mapped values (0..63) keep it clear; invalid chars map to 0x80.
#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe)
| 109 | + |
// DS512(_i_): decode 4x64 = 256 base64 chars into 4x48 = 192 output bytes.
// Caller-scope dependencies: ip/op (stream pointers), iu0/iu1 (zmm pre-loaded
// with the next 2x64 chars), vlut0/vlut1 (char -> 6-bit tables for the
// two-register permute), vp (pack permute) and vx (error accumulator fed by
// B64CHK; CHECK1 checks are compiled in only with B64CHECK).  Loads for the
// next round are interleaved with the map/check/pack pipeline to hide latency.
// Each store writes 64 bytes but only advances op by 48 — the stores overlap
// by design; the final 16 bytes are overwritten by the next store.
// fix: destination casts were (__m128i*) on 512-bit stores — only accepted
// because newer headers declare _mm512_storeu_si512 with void*.
#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1; \
  iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)),   \
  iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\
  \
  BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\
  BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\
  \
  iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\
  iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\
  \
  _mm512_storeu_si512((__m512i*)(op+_i_*192),    ou0);\
  _mm512_storeu_si512((__m512i*)(op+_i_*192+48), ou1);\
  \
  BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\
  BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\
  \
  _mm512_storeu_si512((__m512i*)(op+_i_*192+ 96), ov0);\
  _mm512_storeu_si512((__m512i*)(op+_i_*192+144), ov1);\
}
| 129 | + |
//-----------------------------------------------
// tb64v512dec: base64-decode in[0..inlen) into out[] using AVX512_VBMI.
// Returns the decoded length, or 0 when any invalid base64 character was seen
// (accumulated in vx) or the scalar tail decoder tb64xdec rejects its input.
size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) {
  const unsigned char *ip = in;
        unsigned char *op = out;
  #define DN 512                                  // input chars consumed per main-loop iteration
  __m512i vx = _mm512_setzero_si512();            // error accumulator: bit 7 set per bad byte
  if(inlen > 56+128) {
    // vlut0/vlut1: char -> 6-bit value tables for permutex2var; 0x80 = invalid.
    const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                            0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                            0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
                                            0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080),
                  vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
                                            0x1211100f, 0x16151413, 0x80191817, 0x80808080,
                                            0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
                                            0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080),
                  // vp: byte permute compacting packed 24-bit groups to the low 48 bytes.
                  vp    = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
                                            0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
                                            0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
                                            0x00000000, 0x00000000, 0x00000000, 0x00000000);

    // Pre-load the first 2x64 chars; DS512 keeps the pipeline rolling.
    __m512i iu0 = _mm512_loadu_si512((__m512i *) ip),
            iu1 = _mm512_loadu_si512((__m512i *)(ip+64));
    for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0);
      DS512(0);
      #if DN > 256
      DS512(1);
      #endif
    }
    // Medium tail: one 64-char block at a time (keep >= 4 chars for the scalar tail).
    for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) {
      __m512i iv = _mm512_loadu_si512((__m512i *) ip), ov;
      BITMAP256V8_6(iv, ov);
      CHECK0(B64CHK(iv, ov, vx));
      BITPACK512V8_6(ov);
      _mm512_storeu_si512((__m512i*) op, ov);     // fix: cast was (__m128i*) on a 512-bit store
    }
  }
  // Scalar tail; rc = 0 fixes reading an uninitialized rc in the return when r == 0 (UB).
  unsigned rc = 0, r = inlen-(ip-in);
  // Explicit parentheses: reject if the tail fails to decode OR any bad char was flagged.
  if((r && !(rc = tb64xdec(ip, r, op))) || _mm512_movepi8_mask(vx)) return 0;
  return (op-out)+rc;
}
| 170 | + |
| 171 | + #if 0 // AVX512F but Not faster than avx2 |
36 | 172 | #define BITPACK512V8_6_(v) {\
|
37 | 173 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140));\
|
38 | 174 | v = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));\
|
@@ -124,127 +260,3 @@ size_t tb64v512dec0(const unsigned char *in, size_t inlen, unsigned char *out) {
|
124 | 260 | return (op-out)+rc;
|
125 | 261 | }
|
126 | 262 | #endif
|
127 |
| -//---------------------------------------------------------- |
128 |
| -#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI |
129 |
| - |
130 |
| -#define BITPACK512V8_6(v) {\ |
131 |
| - __m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\ |
132 |
| - vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\ |
133 |
| - v = _mm512_permutexvar_epi8(vp, vm);\ |
134 |
| -} |
135 |
| - |
136 |
| -#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe) |
137 |
| - |
138 |
| -#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1;\ |
139 |
| - iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)), \ |
140 |
| - iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\ |
141 |
| - BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\ |
142 |
| - BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\ |
143 |
| - iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\ |
144 |
| - iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\ |
145 |
| - _mm512_storeu_si512((__m128i*)(op+_i_*192), ou0);\ |
146 |
| - _mm512_storeu_si512((__m128i*)(op+_i_*192+48), ou1);\ |
147 |
| - BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\ |
148 |
| - BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\ |
149 |
| - _mm512_storeu_si512((__m128i*)(op+_i_*192+ 96), ov0);\ |
150 |
| - _mm512_storeu_si512((__m128i*)(op+_i_*192+144), ov1);\ |
151 |
| -} |
152 |
| - |
153 |
| -//----------------------------------------------- |
154 |
| -size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) { |
155 |
| - const unsigned char *ip = in; |
156 |
| - unsigned char *op = out; |
157 |
| - #define DN 512 |
158 |
| - __m512i vx = _mm512_setzero_si512(); |
159 |
| - if(inlen > 56+128) { |
160 |
| - const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080, |
161 |
| - 0x80808080, 0x80808080, 0x80808080, 0x80808080, |
162 |
| - 0x80808080, 0x80808080, 0x3e808080, 0x3f808080, |
163 |
| - 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080), |
164 |
| - vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, |
165 |
| - 0x1211100f, 0x16151413, 0x80191817, 0x80808080, |
166 |
| - 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625, |
167 |
| - 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080), |
168 |
| - vp = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, |
169 |
| - 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, |
170 |
| - 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, |
171 |
| - 0x00000000, 0x00000000, 0x00000000, 0x00000000); |
172 |
| - |
173 |
| - __m512i iu0 = _mm512_loadu_si512((__m512i *) ip), |
174 |
| - iu1 = _mm512_loadu_si512((__m512i *)(ip+64)); |
175 |
| - for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0); |
176 |
| - DS512(0); |
177 |
| - #if DN > 256 |
178 |
| - DS512(1); |
179 |
| - #endif |
180 |
| - } |
181 |
| - for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) { |
182 |
| - __m512i iv = _mm512_loadu_si512((__m512i *) ip), ov; |
183 |
| - BITMAP256V8_6(iv, ov); CHECK0(B64CHK(iv, ov, vx)); BITPACK512V8_6(ov); |
184 |
| - _mm512_storeu_si512((__m128i*) op, ov); |
185 |
| - } |
186 |
| - } |
187 |
| - unsigned rc, r = inlen-(ip-in); |
188 |
| - if(r && !(rc=tb64xdec(ip, r, op)) || _mm512_movepi8_mask(vx)) return 0; |
189 |
| - return (op-out)+rc; |
190 |
| -} |
191 |
| - |
192 |
| -//-------------------- Encode ---------------------------------------------------------------------- |
193 |
| -//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI |
194 |
| -//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi |
195 |
| -#define ES512(_i_) { __m512i v0,v1;\ |
196 |
| - v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192) ),\ |
197 |
| - v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\ |
198 |
| - u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\ |
199 |
| - u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\ |
200 |
| - _mm512_storeu_si512((__m512i*)(op+_i_*256), u0);\ |
201 |
| - _mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\ |
202 |
| - \ |
203 |
| - u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\ |
204 |
| - u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\ |
205 |
| - v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\ |
206 |
| - v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\ |
207 |
| - _mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\ |
208 |
| - _mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\ |
209 |
| -} |
210 |
| - |
211 |
| -size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) { |
212 |
| - const unsigned char *ip = in; |
213 |
| - unsigned char *op = out; |
214 |
| - unsigned outlen = TB64ENCLEN(inlen); |
215 |
| - |
216 |
| - static const char *lut = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; |
217 |
| - const __m512i vlut = _mm512_loadu_si512((const __m512i*)lut); |
218 |
| - const __m512i vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, |
219 |
| - 0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516, |
220 |
| - 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, |
221 |
| - 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); |
222 |
| - const __m512i vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10 |
223 |
| - |
224 |
| - #define EN 256 |
225 |
| - if(outlen >= 128+256) { |
226 |
| - __m512i u0 = _mm512_loadu_si512((__m512i *) ip ); |
227 |
| - __m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48)); |
228 |
| - for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) { |
229 |
| - ES512(0); |
230 |
| - #if EN > 256 |
231 |
| - ES512(1); |
232 |
| - #endif |
233 |
| - PREFETCH(ip, 384, 0); |
234 |
| - } |
235 |
| - #if EN > 256 |
236 |
| - if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; } |
237 |
| - #endif |
238 |
| - } |
239 |
| - |
240 |
| - const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, |
241 |
| - 10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); |
242 |
| - for(; op < out+outlen-32; op += 32, ip += 32*3/4) { |
243 |
| - __m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) ); |
244 |
| - v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1); |
245 |
| - v = _mm256_shuffle_epi8(v, vh); v = bitunpack256v8_6(v); v = bitmap256v8_6(v); |
246 |
| - _mm256_storeu_si256((__m256i*) op, v); |
247 |
| - } |
248 |
| - EXTAIL(); |
249 |
| - return outlen; |
250 |
| -} |
|
0 commit comments