Skip to content

Commit 66848bc

Browse files
committed
loongarch: add Blit8888to8888PixelAlphaSwizzleLSX opt
1 parent 933beeb commit 66848bc

File tree

3 files changed

+115
-3
lines changed

3 files changed

+115
-3
lines changed

CMakeLists.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -931,7 +931,15 @@ if(SDL_ASSEMBLY)
931931
cmake_pop_check_state()
932932

933933
if(COMPILER_SUPPORTS_LSX AND HAVE_LSXINTRIN_H)
934-
set_property(SOURCE "${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c" APPEND PROPERTY COMPILE_OPTIONS "-mlsx")
934+
# Append -mlsx to the per-file compile options of both sources listed below
# (the YUV converter and the alpha blitter).
set_property(SOURCE
935+
"${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c"
936+
"${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c"
937+
APPEND PROPERTY COMPILE_OPTIONS "-mlsx")
938+
939+
# Opt the same two sources out of precompiled headers.
# NOTE(review): presumably because their per-file -mlsx option differs from
# the flags the precompiled header is built with -- confirm.
set_property(SOURCE
940+
"${SDL3_SOURCE_DIR}/src/video/yuv2rgb/yuv_rgb_lsx.c"
941+
"${SDL3_SOURCE_DIR}/src/video/SDL_blit_A.c"
942+
PROPERTY SKIP_PRECOMPILE_HEADERS 1)
935943
# Record that the LSX code path is enabled for this configuration.
set(HAVE_LSX TRUE)
936944
endif()
937945
endif()

include/SDL3/SDL_intrin.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -281,12 +281,14 @@ _m_prefetch(void *__P)
281281
* \sa SDL_TARGETING
282282
*/
283283
#define SDL_HAS_TARGET_ATTRIBS
284-
284+
#elif defined(__loongarch64) && defined(__GNUC__) && (__GNUC__ >= 15)
285+
/* LoongArch requires GCC 15+ for target attribute support */
286+
# define SDL_HAS_TARGET_ATTRIBS
285287
#elif defined(__clang__) && defined(__has_attribute)
286288
# if __has_attribute(target)
287289
# define SDL_HAS_TARGET_ATTRIBS
288290
# endif
289-
#elif defined(__GNUC__) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
291+
#elif defined(__GNUC__) && !defined(__loongarch64) && (__GNUC__ + (__GNUC_MINOR__ >= 9) > 4) /* gcc >= 4.9 */
290292
# define SDL_HAS_TARGET_ATTRIBS
291293
#elif defined(__ICC) && __ICC >= 1600
292294
# define SDL_HAS_TARGET_ATTRIBS

src/video/SDL_blit_A.c

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,103 @@ static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *inf
242242

243243
#endif
244244

245+
#ifdef SDL_LSX_INTRINSICS
246+
247+
// Per-pixel alpha blend of 32-bit source pixels onto 32-bit destination
// pixels, reordering the source channels into the destination layout.
//
// Each row is processed four pixels (16 bytes) at a time with LSX vector
// intrinsics; the remaining (fewer than four) pixels of the row go through
// the scalar ALPHA_BLEND_SWIZZLE_8888 macro.
//
// info supplies the blit size (dst_w, dst_h), the src/dst pixel pointers,
// the per-row skips (src_skip, dst_skip) and the two pixel format
// descriptions (src_fmt, dst_fmt). The destination pixels are overwritten
// in place; nothing is returned.
//
// When the destination format has no alpha mask (fill_alpha), the alpha
// mask reported by SDL_Get8888AlphaMaskAndShift is OR'ed into every pixel
// that is written.
static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info)
248+
{
249+
int width = info->dst_w;
250+
int height = info->dst_h;
251+
Uint8 *src = info->src;
252+
int srcskip = info->src_skip;
253+
Uint8 *dst = info->dst;
254+
int dstskip = info->dst_skip;
255+
const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
256+
const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
257+
// True when the destination format declares no alpha mask of its own.
bool fill_alpha = !dstfmt->Amask;
258+
// dstAshift is filled in by the helper below but is not used afterwards in this function.
Uint32 dstAmask, dstAshift;
259+
// Byte offset of each of the four pixels within a 16-byte vector, repeated for all four bytes of the pixel.
const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
260+
261+
SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
262+
263+
// 0xff00 in every 16-bit lane: XOR with it flips only the upper byte of each lane.
const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00);
264+
// 128 in every byte: bias subtracted from src and dst before the widening multiplies.
const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128);
265+
// 32641 == 1 + 128 * 255. The two blend weights sum to 255, so 128 * 255 is
// the amount removed by the 128 bias; the extra 1 is added before the
// multiply-by-257 step below.
const __m128i const_32641 = __lsx_vreplgr2vr_h(32641);
266+
// Multiplying by 257 and keeping the high 16 bits computes (x * 257) >> 16,
// which stands in for a division by 255.
const __m128i const_257 = __lsx_vreplgr2vr_h(257);
267+
268+
// The byte offsets for the start of each pixel
269+
const __m128i mask_offsets = __lsx_vld(offsets, 0);
270+
271+
// Shuffle control for the channel reorder: the byte index of each source
// colour channel (shift >> 3) is placed at that channel's bit position in
// the destination pixel, then the per-pixel byte offset is added.
const __m128i convert_mask = __lsx_vadd_w(
272+
__lsx_vreplgr2vr_w(
273+
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
274+
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
275+
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
276+
mask_offsets);
277+
278+
// Shuffle control that selects the source alpha byte of each pixel for all four of that pixel's bytes.
const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets);
279+
// Destination alpha mask replicated into every 32-bit pixel lane.
const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask);
280+
281+
while (height--) {
282+
int i = 0;
283+
284+
// Vector path: four pixels per iteration.
for (; i + 4 <= width; i += 4) {
285+
__m128i src128 = __lsx_vld(src, 0);
286+
__m128i dst128 = __lsx_vld(dst, 0);
287+
288+
// Broadcast each pixel's source alpha across that pixel's four bytes.
__m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask);
289+
// Reorder the source channels into the destination layout.
src128 = __lsx_vshuf_b(src128, src128, convert_mask);
290+
291+
// Set the destination-alpha bits in the converted source pixels.
src128 = __lsx_vor_v(src128, alpha_fill_mask);
292+
293+
// Duplicate every alpha byte so each 16-bit lane holds the pair (srcA, srcA).
__m128i srca_lo = __lsx_vilvl_b(srcA, srcA);
294+
__m128i srca_hi = __lsx_vilvh_b(srcA, srcA);
295+
296+
// Flip the upper byte of each pair: lanes become (srcA, srcA ^ 0xff), i.e. (srcA, 255 - srcA).
srca_lo = __lsx_vxor_v(srca_lo, const_0xff00);
297+
srca_hi = __lsx_vxor_v(srca_hi, const_0xff00);
298+
299+
// Remove the 128 bias from every byte of src and dst.
// NOTE(review): presumably needed because the *_bu_b multiplies below treat
// their second operand as signed bytes -- confirm against the LSX reference.
src128 = __lsx_vsub_b(src128, const_128);
300+
dst128 = __lsx_vsub_b(dst128, const_128);
301+
302+
// Interleave src and dst bytes, then add the even-byte and odd-byte widening
// products so each 16-bit lane holds
// srcA * (src - 128) + (255 - srcA) * (dst - 128).
__m128i tmp = __lsx_vilvl_b(dst128, src128);
303+
__m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp));
304+
tmp = __lsx_vilvh_b(dst128, src128);
305+
__m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp));
306+
307+
// Add back 128 * 255 (undoing the bias) plus 1.
dst_lo = __lsx_vadd_h(dst_lo, const_32641);
308+
dst_hi = __lsx_vadd_h(dst_hi, const_32641);
309+
310+
// (x * 257) >> 16: scale the 16-bit sums back down to 8-bit channel values.
dst_lo = __lsx_vmuh_hu(dst_lo, const_257);
311+
dst_hi = __lsx_vmuh_hu(dst_hi, const_257);
312+
313+
// Narrow the two vectors of 16-bit results back into one vector of bytes (shift amount 0).
dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0);
314+
if (fill_alpha) {
315+
dst128 = __lsx_vor_v(dst128, alpha_fill_mask);
316+
}
317+
__lsx_vst(dst128, dst, 0);
318+
319+
src += 16;
320+
dst += 16;
321+
}
322+
323+
// Scalar path: the remaining (fewer than four) pixels of this row.
for (; i < width; ++i) {
324+
Uint32 src32 = *(Uint32 *)src;
325+
Uint32 dst32 = *(Uint32 *)dst;
326+
ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
327+
if (fill_alpha) {
328+
dst32 |= dstAmask;
329+
}
330+
*(Uint32 *)dst = dst32;
331+
src += 4;
332+
dst += 4;
333+
}
334+
335+
// Apply the per-row skips from info to reach the start of the next row.
src += srcskip;
336+
dst += dstskip;
337+
}
338+
}
339+
340+
#endif
341+
245342
// fast RGB888->(A)RGB888 blending with surface alpha=128 special case
246343
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
247344
{
@@ -1402,6 +1499,11 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
14021499
return Blit8888to8888PixelAlphaSwizzleSSE41;
14031500
}
14041501
#endif
1502+
#ifdef SDL_LSX_INTRINSICS
1503+
if (SDL_HasLSX()) {
1504+
return Blit8888to8888PixelAlphaSwizzleLSX;
1505+
}
1506+
#endif
14051507
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8)
14061508
// To prevent "unused function" compiler warnings/errors
14071509
(void)Blit8888to8888PixelAlpha;

0 commit comments

Comments
 (0)