Skip to content

Commit 66f5ac9

Browse files
Add fast_matrix_mul_4x4_lsx function for LoongArch64
1 parent d5bd7d2 commit 66f5ac9

File tree

5 files changed

+48
-0
lines changed

5 files changed

+48
-0
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ if(CMAKE_SYSTEM_PROCESSOR)
6969
set(RISCV64_DEVICE ON)
7070
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^loongarch64")
7171
set(LOONGARCH64_DEVICE ON)
72+
add_compile_options(-mlsx)
73+
add_compile_options(-mlasx)
7274
else()
7375
message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
7476
endif()

Common/Math/SIMDHeaders.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
#endif
3232
#endif
3333

34+
#if PPSSPP_ARCH(LOONGARCH64)
35+
#if PPSSPP_ARCH(LOONGARCH64_LSX)
36+
#include <lsxintrin.h>
37+
#endif
38+
#endif
39+
3440
// Basic types
3541

3642
#if PPSSPP_ARCH(ARM64_NEON)

Common/Math/fast/fast_matrix.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,42 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
2222
}
2323
}
2424

25+
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
26+
27+
typedef union
28+
{
29+
int32_t i;
30+
float f;
31+
} FloatInt;
32+
33+
static __m128 __lsx_vreplfr2vr_s(float val)
34+
{
35+
FloatInt tmpval = {.f = val};
36+
return (__m128)__lsx_vreplgr2vr_w(tmpval.i);
37+
}
38+
39+
void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b) {
40+
__m128 a_col_1 = (__m128)__lsx_vld(a, 0);
41+
__m128 a_col_2 = (__m128)__lsx_vld(a + 4, 0);
42+
__m128 a_col_3 = (__m128)__lsx_vld(a + 8, 0);
43+
__m128 a_col_4 = (__m128)__lsx_vld(a + 12, 0);
44+
45+
for (int i = 0; i < 16; i += 4) {
46+
47+
__m128 b1 = __lsx_vreplfr2vr_s(b[i]);
48+
__m128 b2 = __lsx_vreplfr2vr_s(b[i + 1]);
49+
__m128 b3 = __lsx_vreplfr2vr_s(b[i + 2]);
50+
__m128 b4 = __lsx_vreplfr2vr_s(b[i + 3]);
51+
52+
__m128 result = __lsx_vfmul_s(a_col_1, b1);
53+
result = __lsx_vfmadd_s(a_col_2, b2, result);
54+
result = __lsx_vfmadd_s(a_col_3, b3, result);
55+
result = __lsx_vfmadd_s(a_col_4, b4, result);
56+
57+
__lsx_vst(result, &dest[i], 0);
58+
}
59+
}
60+
2561
#elif PPSSPP_ARCH(ARM_NEON)
2662

2763
// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example

Common/Math/fast/fast_matrix.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,15 @@ extern "C" {
1111
extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
1212
extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
1313
extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
14+
extern void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b);
1415

1516
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
1617
// Hard link to SSE implementations on x86/amd64
1718
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
1819
#elif PPSSPP_ARCH(ARM_NEON)
1920
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
21+
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
22+
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_lsx
2023
#else
2124
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
2225
#endif

ppsspp_config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
//https://github.com/gcc-mirror/gcc/blob/master/gcc/config/loongarch/loongarch-c.cc
8282
#define PPSSPP_ARCH_LOONGARCH64 1
8383
#define PPSSPP_ARCH_64BIT 1
84+
#define PPSSPP_ARCH_LOONGARCH64_LSX 1
8485
#endif
8586

8687
// PLATFORM defines

0 commit comments

Comments
 (0)