Skip to content

Commit a9cb0e6

Browse files
author
Guillaume Piolat
committed
Remove AVX-512 presence enums.
Add first two F16C intrinsics (only generic path for now, taken from stb_image_resize2.h).
1 parent 95c610c commit a9cb0e6

File tree

3 files changed

+95
-99
lines changed

3 files changed

+95
-99
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
| SSE4.2| Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+sse4.2`) | Yes (`-mattr=+crc`) | Yes (`-msse4.2`) |
3535
| BMI2 | Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+bmi2`) | Yes | Yes (`-mbmi2`) |
3636
| AVX | Yes but ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | Yes (`-mattr=+avx`) | Yes | Yes (`-mavx`) |
37+
| F16C | WIP, ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | WIP (`-mattr=+f16c`) | WIP | WIP (`-mf16c`) |
3738
| AVX2 | WIP and ([#42](https://github.com/AuburnSounds/intel-intrinsics/issues/42)) | WIP (`-mattr=+avx2`) | WIP | WIP (`-mavx2`) |
3839

3940
The intrinsics implemented follow the syntax and semantics at: https://software.intel.com/sites/landingpage/IntrinsicsGuide/

source/inteli/avxintrin.d

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* AVX intrinsics.
2+
* AVX and FP16C intrinsics.
33
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX
44
*
55
* Copyright: Guillaume Piolat 2022.
@@ -10,13 +10,19 @@
1010
module inteli.avxintrin;
1111

1212
// AVX instructions
13-
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX
13+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX
1414
// Note: this header will work whether you have AVX enabled or not.
1515
// With LDC, use "dflags-ldc": ["-mattr=+avx"] or equivalent to actively
1616
// generate AVX instructions.
1717
// With GDC, use "dflags-gdc": ["-mavx"] or equivalent to actively
1818
// generate AVX instructions.
1919

20+
// This header also implements FP16C intrinsics.
21+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C
22+
// With LDC, use "dflags-ldc": ["-mattr=+f16c"] or equivalent to actively
23+
// generate F16C instructions.
24+
// With GDC, use "dflags-gdc": ["-mf16c"] or equivalent to actively
25+
// generate F16C instructions.
2026

2127
/// IMPORTANT NOTE ABOUT MASK LOAD/STORE:
2228
///
@@ -4889,3 +4895,84 @@ unittest
48894895
long[4] correct = [-1, 99, 0, 0];
48904896
assert(R.array == correct);
48914897
}
4898+
4899+
4900+
// F16C start here
4901+
4902+
/// Convert 4 packed half-precision (16-bit) floating-point elements
4903+
/// in `a` to packed single-precision (32-bit) floating-point elements.
4904+
/// Note: Only lowest 64-bit of input considered.
4905+
/// Preserve infinities, sign of zeroes, and NaN-ness.
4906+
__m128 _mm_cvtph_ps(__m128i a) pure @trusted
4907+
{
4908+
short8 sa = cast(short8)a;
4909+
4910+
// PERF F16C actual instruction
4911+
{
4912+
// Reference: stb_image_resize2.h has F16C emulation.
4913+
// See:
4914+
// Originated from:
4915+
__m128i mask_nosign = _mm_set1_epi32(0x7fff);
4916+
__m128i smallest_normal = _mm_set1_epi32(0x0400);
4917+
__m128i infinity = _mm_set1_epi32(0x7c00);
4918+
__m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
4919+
__m128i magic_denorm = _mm_set1_epi32(113 << 23);
4920+
__m128i i = a;
4921+
__m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
4922+
__m128i mnosign = mask_nosign;
4923+
__m128i eadjust = expadjust_normal;
4924+
__m128i smallest = smallest_normal;
4925+
__m128i infty = infinity;
4926+
__m128i expmant = _mm_and_si128(mnosign, h);
4927+
__m128i justsign = _mm_xor_si128(h, expmant);
4928+
__m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
4929+
__m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant);
4930+
__m128i shifted = _mm_slli_epi32(expmant, 13);
4931+
__m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust);
4932+
__m128i adjusted = _mm_add_epi32(eadjust, shifted);
4933+
__m128i den1 = _mm_add_epi32(shifted, magic_denorm);
4934+
__m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan);
4935+
__m128 den2 = _mm_sub_ps(cast(__m128)den1, *cast(const(__m128)*)&magic_denorm);
4936+
__m128 adjusted3 = _mm_and_ps(den2, cast(__m128)b_isdenorm);
4937+
__m128 adjusted4 = _mm_andnot_ps(cast(__m128)b_isdenorm, cast(__m128)adjusted2);
4938+
__m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4);
4939+
__m128i sign = _mm_slli_epi32(justsign, 16);
4940+
__m128 final_ = _mm_or_ps(adjusted5, cast(__m128)sign);
4941+
return final_;
4942+
}
4943+
}
4944+
unittest
4945+
{
4946+
__m128i A = _mm_setr_epi16(cast(short)0x8000, 0x7C00, cast(short)0xDA90, 0x5000, 0, 0, 0, 0);
4947+
float[4] correct = [-0.0f, float.infinity, -210.0f, 32.0f];
4948+
__m128 R = _mm_cvtph_ps(A);
4949+
assert(R.array == correct);
4950+
}
4951+
4952+
/// Convert 8 packed half-precision (16-bit) floating-point elements
4953+
/// in `a` to packed single-precision (32-bit) floating-point elements.
4954+
/// Note: Preserve infinities, sign of zeroes, and NaN-ness.
4955+
__m256 _mm256_cvtph_ps(__m128i a) pure @trusted
4956+
{
4957+
// PERF F16C actual instruction
4958+
{
4959+
// In stb_image_resize2.h, _mm_cvtph_ps is simply hand-inlined 2x
4960+
// so we do the same here.
4961+
int4 ihi;
4962+
ihi.ptr[0] = a.array[2];
4963+
ihi.ptr[1] = a.array[3];
4964+
__m128 lo = _mm_cvtph_ps(a);
4965+
__m128 hi = _mm_cvtph_ps(ihi);
4966+
return _mm256_set_m128(hi, lo);
4967+
}
4968+
}
4969+
unittest
4970+
{
4971+
__m128i A = _mm_setr_epi16(0, cast(short)-32768, 0, cast(short)0xFC00, 0x7C00, 0x5A90,cast(short)0xDA90, 0x5000);
4972+
float[8] correct = [0.0f, -0.0f, 0.0f, -float.infinity, float.infinity, 210.0f, -210.0f, 32.0f];
4973+
__m256 R = _mm256_cvtph_ps(A);
4974+
assert(R.array == correct);
4975+
}
4976+
4977+
// __m128i _mm_cvtps_ph (__m128 a, int imm8) TODO
4978+
// __m128i _mm256_cvtps_ph (__m256 a, int imm8) TODO

source/inteli/internals.d

Lines changed: 5 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -180,28 +180,10 @@ version(LDC)
180180
enum LDC_with_SSE42 = false;
181181
enum LDC_with_CRC32 = false;
182182
enum LDC_with_AVX = false;
183+
enum LDC_with_F16C = false;
183184
enum LDC_with_AVX2 = false;
184185
enum LDC_with_SHA = false;
185186
enum LDC_with_BMI2 = false;
186-
187-
enum LDC_with_AVX512F = false;
188-
enum LDC_with_AVX512CD = false;
189-
enum LDC_with_AVX512ER = false;
190-
enum LDC_with_AVX512PF = false;
191-
enum LDC_with_AVX512BW = false;
192-
enum LDC_with_AVX512DQ = false;
193-
enum LDC_with_AVX512VL = false;
194-
enum LDC_with_AVX512IFMA = false;
195-
enum LDC_with_AVX512VBMI = false;
196-
enum LDC_with_AVX512VBMI2 = false;
197-
198-
enum LDC_with_AVX512FP16 = false;
199-
enum LDC_with_AVX512BF16 = false;
200-
enum LDC_with_AVX512VNNI = false;
201-
enum LDC_with_AVX512BITALG = false;
202-
203-
enum LDC_with_AVX512VP2INTERSECT = false;
204-
enum LDC_with_AVX512VPOPCNTDQ = false;
205187
}
206188
else version(AArch64)
207189
{
@@ -217,28 +199,10 @@ version(LDC)
217199
enum LDC_with_SSE42 = false;
218200
enum LDC_with_CRC32 = false;
219201
enum LDC_with_AVX = false;
202+
enum LDC_with_F16C = false;
220203
enum LDC_with_AVX2 = false;
221204
enum LDC_with_SHA = false;
222205
enum LDC_with_BMI2 = false;
223-
224-
enum LDC_with_AVX512F = false;
225-
enum LDC_with_AVX512CD = false;
226-
enum LDC_with_AVX512ER = false;
227-
enum LDC_with_AVX512PF = false;
228-
enum LDC_with_AVX512BW = false;
229-
enum LDC_with_AVX512DQ = false;
230-
enum LDC_with_AVX512VL = false;
231-
enum LDC_with_AVX512IFMA = false;
232-
enum LDC_with_AVX512VBMI = false;
233-
enum LDC_with_AVX512VBMI2 = false;
234-
235-
enum LDC_with_AVX512FP16 = false;
236-
enum LDC_with_AVX512BF16 = false;
237-
enum LDC_with_AVX512VNNI = false;
238-
enum LDC_with_AVX512BITALG = false;
239-
240-
enum LDC_with_AVX512VP2INTERSECT = false;
241-
enum LDC_with_AVX512VPOPCNTDQ = false;
242206
}
243207
else static if (some_x86)
244208
{
@@ -276,30 +240,10 @@ version(LDC)
276240
}
277241

278242
enum LDC_with_AVX = __traits(targetHasFeature, "avx") && LDC_with_ia32_builtins;
243+
enum LDC_with_F16C = __traits(targetHasFeature, "f16c") && LDC_with_ia32_builtins;
279244
enum LDC_with_AVX2 = __traits(targetHasFeature, "avx2") && LDC_with_ia32_builtins;
280245
enum LDC_with_SHA = __traits(targetHasFeature, "sha") && LDC_with_ia32_builtins;
281246
enum LDC_with_BMI2 = __traits(targetHasFeature, "bmi2") && LDC_with_ia32_builtins;
282-
283-
// All of the feature flags at https://github.com/cetio/sim-d/blob/main/source/simd/features.d
284-
// but I haven't added them because I don't feel inclined, these suffice.
285-
enum LDC_with_AVX512F = __traits(targetHasFeature, "avx512f") && LDC_with_ia32_builtins;
286-
enum LDC_with_AVX512CD = __traits(targetHasFeature, "avx512cd") && LDC_with_ia32_builtins;
287-
enum LDC_with_AVX512ER = __traits(targetHasFeature, "avx512er") && LDC_with_ia32_builtins;
288-
enum LDC_with_AVX512PF = __traits(targetHasFeature, "avx512pf") && LDC_with_ia32_builtins;
289-
enum LDC_with_AVX512BW = __traits(targetHasFeature, "avx512bw") && LDC_with_ia32_builtins;
290-
enum LDC_with_AVX512DQ = __traits(targetHasFeature, "avx512dq") && LDC_with_ia32_builtins;
291-
enum LDC_with_AVX512VL = __traits(targetHasFeature, "avx512vl") && LDC_with_ia32_builtins;
292-
enum LDC_with_AVX512IFMA = __traits(targetHasFeature, "avx512ifma") && LDC_with_ia32_builtins;
293-
enum LDC_with_AVX512VBMI = __traits(targetHasFeature, "avx512vbmi") && LDC_with_ia32_builtins;
294-
enum LDC_with_AVX512VBMI2 = __traits(targetHasFeature, "avx512vbmi2") && LDC_with_ia32_builtins;
295-
296-
enum LDC_with_AVX512FP16 = (__VERSION__ > 2101) && __traits(targetHasFeature, "avx512fp16") && LDC_with_ia32_builtins;
297-
enum LDC_with_AVX512BF16 = __traits(targetHasFeature, "avx512bf16") && LDC_with_ia32_builtins;
298-
enum LDC_with_AVX512VNNI = __traits(targetHasFeature, "avx512vnni") && LDC_with_ia32_builtins;
299-
enum LDC_with_AVX512BITALG = __traits(targetHasFeature, "avx512bitalg") && LDC_with_ia32_builtins;
300-
301-
enum LDC_with_AVX512VP2INTERSECT = __traits(targetHasFeature, "avx512vp2intersect") && LDC_with_ia32_builtins;
302-
enum LDC_with_AVX512VPOPCNTDQ = __traits(targetHasFeature, "avx512vpopcntdq") && LDC_with_ia32_builtins;
303247
}
304248
else
305249
{
@@ -314,28 +258,10 @@ version(LDC)
314258
enum LDC_with_SSE42 = false;
315259
enum LDC_with_CRC32 = false;
316260
enum LDC_with_AVX = false;
261+
enum LDC_with_F16C = false;
317262
enum LDC_with_AVX2 = false;
318263
enum LDC_with_SHA = false;
319264
enum LDC_with_BMI2 = false;
320-
321-
enum LDC_with_AVX512F = false;
322-
enum LDC_with_AVX512CD = false;
323-
enum LDC_with_AVX512ER = false;
324-
enum LDC_with_AVX512PF = false;
325-
enum LDC_with_AVX512BW = false;
326-
enum LDC_with_AVX512DQ = false;
327-
enum LDC_with_AVX512VL = false;
328-
enum LDC_with_AVX512IFMA = false;
329-
enum LDC_with_AVX512VBMI = false;
330-
enum LDC_with_AVX512VBMI2 = false;
331-
332-
enum LDC_with_AVX512FP16 = false;
333-
enum LDC_with_AVX512BF16 = false;
334-
enum LDC_with_AVX512VNNI = false;
335-
enum LDC_with_AVX512BITALG = false;
336-
337-
enum LDC_with_AVX512VP2INTERSECT = false;
338-
enum LDC_with_AVX512VPOPCNTDQ = false;
339265
}
340266

341267
// Should we use inline x86 assembly with DMD syntax, in LDC?
@@ -368,29 +294,11 @@ else
368294
enum LDC_with_SSE42 = false;
369295
enum LDC_with_CRC32 = false;
370296
enum LDC_with_AVX = false;
297+
enum LDC_with_F16C = false;
371298
enum LDC_with_AVX2 = false;
372299
enum LDC_with_SHA = false;
373300
enum LDC_with_BMI2 = false;
374301

375-
enum LDC_with_AVX512F = false;
376-
enum LDC_with_AVX512CD = false;
377-
enum LDC_with_AVX512ER = false;
378-
enum LDC_with_AVX512PF = false;
379-
enum LDC_with_AVX512BW = false;
380-
enum LDC_with_AVX512DQ = false;
381-
enum LDC_with_AVX512VL = false;
382-
enum LDC_with_AVX512IFMA = false;
383-
enum LDC_with_AVX512VBMI = false;
384-
enum LDC_with_AVX512VBMI2 = false;
385-
386-
enum LDC_with_AVX512FP16 = false;
387-
enum LDC_with_AVX512BF16 = false;
388-
enum LDC_with_AVX512VNNI = false;
389-
enum LDC_with_AVX512BITALG = false;
390-
391-
enum LDC_with_AVX512VP2INTERSECT = false;
392-
enum LDC_with_AVX512VPOPCNTDQ = false;
393-
394302
enum LDC_with_InlineIREx = false;
395303
enum bool LDC_with_optimizations = false;
396304
enum bool LDC_with_32b_x86_asm = false;

0 commit comments

Comments
 (0)