Skip to content

Commit

Permalink
Add support for _mm256_max_epu32
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume Piolat committed Oct 3, 2023
1 parent b5dcfea commit f99d4e2
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 5 deletions.
47 changes: 45 additions & 2 deletions source/inteli/avx2intrin.d
Original file line number Diff line number Diff line change
Expand Up @@ -1475,10 +1475,53 @@ unittest
}

// TODO __m256i _mm256_max_epi8 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe

/// Return, for each of the eight 32-bit lanes, the larger of the corresponding
/// elements of `a` and `b`, with both operands interpreted as unsigned integers.
__m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
{
    // PERF D_SIMD
    // Pick the implementation at compile time. The native 256-bit comparison
    // path is never selected for GDC; elsewhere it is used only when the
    // compiler provides 32-byte comparison masks.
    version(GNU)
        enum bool useVectorCompare = false;
    else static if (SIMD_COMPARISON_MASKS_32B)
        enum bool useVectorCompare = true;
    else
        enum bool useVectorCompare = false;

    static if (GDC_with_AVX2)
    {
        // Direct builtin when GDC targets AVX2.
        int8 va = cast(int8)a;
        int8 vb = cast(int8)b;
        return cast(__m256i) __builtin_ia32_pmaxud256(va, vb);
    }
    else static if (useVectorCompare)
    {
        // catastrophic with GDC x86 for some reason, like for 16-bit numbers.
        uint8 ua = cast(uint8)a;
        uint8 ub = cast(uint8)b;
        uint8 aIsGreater = ua > ub;        // per-lane mask of the unsigned comparison
        uint8 fromA = aIsGreater & ua;     // lanes where `a` wins
        uint8 fromB = ~aIsGreater & ub;    // remaining lanes come from `b`
        return cast(__m256i)(fromA | fromB);
    }
    else
    {
        // Process each 128-bit half with the SSE4.1-level intrinsic, then recombine.
        __m128i lowMax  = _mm_max_epu32(_mm256_extractf128_si256!0(a),
                                        _mm256_extractf128_si256!0(b));
        __m128i highMax = _mm_max_epu32(_mm256_extractf128_si256!1(a),
                                        _mm256_extractf128_si256!1(b));
        return _mm256_set_m128i(highMax, lowMax);
    }
}
unittest
{
    // Negative lanes have their top bit set, so as unsigned values they are
    // larger than any non-negative lane (e.g. -4 beats 0x7fffffff).
    __m256i A = _mm256_setr_epi32(0x7fffffff,  1, 4, -7, 0x7fffffff,  1, 11, -7);
    __m256i B = _mm256_setr_epi32(        -4, -8, 9, -8,         -4, -8,  9, -8);
    int8 R = cast(int8) _mm256_max_epu32(A, B);
    int[8] correct = [-4, -8, 9, -7, -4, -8, 11, -7];
    assert(R.array == correct);
}

// TODO __m256i _mm256_max_epu16 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu32 (__m256i a, __m256i b) pure @safe
// TODO __m256i _mm256_max_epu8 (__m256i a, __m256i b) pure @safe

// Compare packed signed 16-bit integers in `a` and `b`, and return packed minimum values.
Expand Down
17 changes: 14 additions & 3 deletions source/inteli/smmintrin.d
Original file line number Diff line number Diff line change
Expand Up @@ -1302,7 +1302,7 @@ unittest
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
{
// PERF DMD
static if (GDC_with_SSE41)
Expand All @@ -1323,6 +1323,17 @@ __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
}
else
{
// PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
/+
movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
movdqa xmm3, xmm1
pxor xmm3, xmm2
pxor xmm2, xmm0
pcmpgtd xmm2, xmm3
pand xmm0, xmm2
pandn xmm2, xmm1
por xmm0, xmm2
+/
__m128i valueShift = _mm_set1_epi32(-0x80000000);
__m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
Expand Down Expand Up @@ -1448,7 +1459,7 @@ unittest
}

/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
{
// PERF DMD
static if (GDC_with_SSE41)
Expand All @@ -1463,7 +1474,7 @@ __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
uint4 sb = cast(uint4)b;
static if (SIMD_COMPARISON_MASKS_16B)
uint4 greater = sa > sb;
else
else
uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
return cast(__m128i)( (~greater & sa) | (greater & sb) );
}
Expand Down

0 comments on commit f99d4e2

Please sign in to comment.