From f99d4e252fed9d5ff7bc8460efe85e822b7e56ef Mon Sep 17 00:00:00 2001 From: Guillaume Piolat Date: Tue, 3 Oct 2023 16:57:36 +0200 Subject: [PATCH] Add support for _mm256_max_epu32 --- source/inteli/avx2intrin.d | 47 ++++++++++++++++++++++++++++++++++++-- source/inteli/smmintrin.d | 17 +++++++++++--- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/source/inteli/avx2intrin.d b/source/inteli/avx2intrin.d index 6464ca1..e748407 100644 --- a/source/inteli/avx2intrin.d +++ b/source/inteli/avx2intrin.d @@ -1475,10 +1475,53 @@ unittest } // TODO __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @safe +// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values. +__m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe +{ + // PERF D_SIMD + version(GNU) + enum bool split = true; + else static if (SIMD_COMPARISON_MASKS_32B) + enum bool split = false; + else + enum bool split = true; + + static if (GDC_with_AVX2) + { + return cast(__m256i) __builtin_ia32_pmaxud256(cast(int8)a, cast(int8)b); + } + else static if (split) + { + // split + __m128i a_lo = _mm256_extractf128_si256!0(a); + __m128i a_hi = _mm256_extractf128_si256!1(a); + __m128i b_lo = _mm256_extractf128_si256!0(b); + __m128i b_hi = _mm256_extractf128_si256!1(b); + __m128i r_lo = _mm_max_epu32(a_lo, b_lo); + __m128i r_hi = _mm_max_epu32(a_hi, b_hi); + return _mm256_set_m128i(r_hi, r_lo); + } + else static if (SIMD_COMPARISON_MASKS_32B) + { + // catastrophic with GDC x86 for some reason, like for 16-bit numbers. + uint8 sa = cast(uint8)a; + uint8 sb = cast(uint8)b; + uint8 greater = sa > sb; + return cast(__m256i)( (greater & sa) | (~greater & sb) ); + } + else + static assert(0); +} +unittest +{ + int8 R = cast(int8) _mm256_max_epu32(_mm256_setr_epi32(0x7fffffff, 1, 4, -7, 0x7fffffff, 1, 11, -7), + _mm256_setr_epi32( -4,-8, 9, -8, -4,-8, 9, -8)); + int[8] correct = [ -4,-8, 9, -7, -4,-8, 11, -7]; + assert(R.array == correct); +} -// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe -// TODO __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe // TODO __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe // Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values. diff --git a/source/inteli/smmintrin.d b/source/inteli/smmintrin.d index bef80e9..651de32 100644 --- a/source/inteli/smmintrin.d +++ b/source/inteli/smmintrin.d @@ -1302,7 +1302,7 @@ unittest } /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted +__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) @@ -1323,6 +1323,17 @@ __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted } else { + // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128" + /+ + movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000] + movdqa xmm3, xmm1 + pxor xmm3, xmm2 + pxor xmm2, xmm0 + pcmpgtd xmm2, xmm3 + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + +/ __m128i valueShift = _mm_set1_epi32(-0x80000000); __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift)); __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b @@ -1448,7 +1459,7 @@ unittest } /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted +__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) @@ -1463,7 +1474,7 @@ __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted uint4 sb = cast(uint4)b; static if (SIMD_COMPARISON_MASKS_16B) uint4 greater = sa > sb; - else + else uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); return cast(__m128i)( (~greater & sa) | (greater & sb) ); }