Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added .clangd
Empty file.
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

108 changes: 87 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,67 @@ option(PERNIX_DISABLE_AVX2 "Disable AVX2 optimizations" off)
option(PERNIX_DISABLE_AVX512 "Disable AVX512 optimizations" off)

option(PERNIX_USE_SIMDE "Use SIMDe library for portable SIMD support" off)
set(PERNIX_SIMDE_PROVIDER "AUTO" CACHE STRING "SIMDe provider when PERNIX_USE_SIMDE is enabled (AUTO, PACKAGE, FETCH)")
set_property(CACHE PERNIX_SIMDE_PROVIDER PROPERTY STRINGS AUTO PACKAGE FETCH)
set(PERNIX_ARCH_BACKEND "AUTO" CACHE STRING "Pernix architecture backend (AUTO, FALLBACK, X86, ARM64_NEON, ARM64_SVE, ARM64_SVE2)")
set_property(CACHE PERNIX_ARCH_BACKEND PROPERTY STRINGS AUTO FALLBACK X86 ARM64_NEON ARM64_SVE ARM64_SVE2)

option(PERNIX_ENABLE_FORTRAN_BINDINGS "Build Fortran bindings for pernix" off)

string(TOUPPER "${PERNIX_ARCH_BACKEND}" PERNIX_ARCH_BACKEND)
set(PERNIX_VALID_ARCH_BACKENDS AUTO FALLBACK X86 ARM64_NEON ARM64_SVE ARM64_SVE2)
if (NOT PERNIX_ARCH_BACKEND IN_LIST PERNIX_VALID_ARCH_BACKENDS)
message(FATAL_ERROR "Unsupported PERNIX_ARCH_BACKEND='${PERNIX_ARCH_BACKEND}'. Expected one of: ${PERNIX_VALID_ARCH_BACKENDS}")
endif ()

set(PERNIX_SELECTED_ARCH_BACKEND "${PERNIX_ARCH_BACKEND}")
if (PERNIX_SELECTED_ARCH_BACKEND STREQUAL "AUTO")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|i[3-6]86|i686)$")
set(PERNIX_SELECTED_ARCH_BACKEND "X86")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|ARM64)$")
set(PERNIX_SELECTED_ARCH_BACKEND "ARM64_NEON")
else ()
set(PERNIX_SELECTED_ARCH_BACKEND "FALLBACK")
endif ()
endif ()
message(STATUS "Pernix architecture backend: ${PERNIX_SELECTED_ARCH_BACKEND}")

string(TOUPPER "${PERNIX_SIMDE_PROVIDER}" PERNIX_SIMDE_PROVIDER)
set(PERNIX_VALID_SIMDE_PROVIDERS AUTO PACKAGE FETCH)
if (NOT PERNIX_SIMDE_PROVIDER IN_LIST PERNIX_VALID_SIMDE_PROVIDERS)
message(FATAL_ERROR "Unsupported PERNIX_SIMDE_PROVIDER='${PERNIX_SIMDE_PROVIDER}'. Expected one of: ${PERNIX_VALID_SIMDE_PROVIDERS}")
endif ()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

set(PERNIX_BUNDLE_SIMDE_FOR_INSTALL OFF)
if (PERNIX_USE_SIMDE)
add_subdirectory(external/simde EXCLUDE_FROM_ALL)
if (PERNIX_SIMDE_PROVIDER STREQUAL "AUTO" OR PERNIX_SIMDE_PROVIDER STREQUAL "PACKAGE")
find_package(simde CONFIG QUIET)
endif ()

if (NOT TARGET simde::simde AND (PERNIX_SIMDE_PROVIDER STREQUAL "AUTO" OR PERNIX_SIMDE_PROVIDER STREQUAL "FETCH"))
include(FetchContent)
set(SIMDE_TEST_CMAKE_PACKAGING OFF CACHE BOOL "Test SIMDe CMake packaging" FORCE)
FetchContent_Declare(
simde
GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
GIT_TAG f3e8262173b7089db9a9d57a9ecef8dd07ad9c97
GIT_PROGRESS TRUE
EXCLUDE_FROM_ALL
)
FetchContent_MakeAvailable(simde)
set(PERNIX_BUNDLE_SIMDE_FOR_INSTALL ON)
endif ()

if (NOT TARGET simde::simde AND DEFINED simde_SOURCE_DIR AND EXISTS "${simde_SOURCE_DIR}/simde")
add_library(simde::simde INTERFACE IMPORTED GLOBAL)
target_include_directories(simde::simde INTERFACE "${simde_SOURCE_DIR}")
endif ()

if (NOT TARGET simde::simde)
message(FATAL_ERROR "PERNIX_USE_SIMDE is enabled, but simde::simde was not found. Set PERNIX_SIMDE_PROVIDER=FETCH or install SIMDe's CMake package.")
endif ()
endif ()

include(CTest)
Expand All @@ -40,28 +94,42 @@ else ()
endif ()
message(STATUS "Pernix version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")

set(BENCHMARK_CXX_STANDARD 20)

set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS OFF)

include(AddCXXCompilerFlag)
if (MSVC)
message(FATAL_ERROR "MSVC compiler is not supported")
else ()
add_cxx_compiler_flag(-Wall)
add_cxx_compiler_flag(-Wextra)
add_cxx_compiler_flag(-Wshadow)
add_cxx_compiler_flag(-Wfloat-equal)
add_cxx_compiler_flag(-Wold-style-cast)
add_cxx_compiler_flag(-Wconversion)
add_cxx_compiler_flag(-fstrict-aliasing)
add_cxx_compiler_flag(-Wno-ignored-attributes)
include(CheckCXXCompilerFlag)
set(PERNIX_PRIVATE_COMPILE_OPTIONS)
foreach (PERNIX_CXX_FLAG
-Wall
-Wextra
-Wshadow
-Wfloat-equal
-Wold-style-cast
-Wconversion
-fstrict-aliasing
-Wno-ignored-attributes
)
string(MAKE_C_IDENTIFIER "PERNIX_HAS_CXX_FLAG_${PERNIX_CXX_FLAG}" PERNIX_CXX_FLAG_VARIABLE)
check_cxx_compiler_flag("${PERNIX_CXX_FLAG}" "${PERNIX_CXX_FLAG_VARIABLE}")
if (${PERNIX_CXX_FLAG_VARIABLE})
list(APPEND PERNIX_PRIVATE_COMPILE_OPTIONS "${PERNIX_CXX_FLAG}")
else ()
message(STATUS "Compiler flag not supported: ${PERNIX_CXX_FLAG}")
endif ()
endforeach ()

if (PERNIX_ENABLE_LTO)
add_cxx_compiler_flag(-flto=auto)
add_cxx_compiler_flag(-Wno-lto-type-mismatch)
include(CheckIPOSupported)
check_ipo_supported(RESULT PERNIX_IPO_SUPPORTED OUTPUT PERNIX_IPO_ERROR)
if (NOT PERNIX_IPO_SUPPORTED)
message(FATAL_ERROR "PERNIX_ENABLE_LTO is enabled, but IPO/LTO is not supported: ${PERNIX_IPO_ERROR}")
endif ()

check_cxx_compiler_flag("-Wno-lto-type-mismatch" PERNIX_HAS_CXX_FLAG_WNO_LTO_TYPE_MISMATCH)
if (PERNIX_HAS_CXX_FLAG_WNO_LTO_TYPE_MISMATCH)
list(APPEND PERNIX_PRIVATE_COMPILE_OPTIONS "-Wno-lto-type-mismatch")
endif ()

if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
find_program(GCC_AR gcc-ar)
if (GCC_AR)
Expand All @@ -84,8 +152,6 @@ else ()
endif ()
endif ()

include_directories(${PROJECT_SOURCE_DIR}/include)

add_subdirectory(src)

if (PERNIX_ENABLE_FORTRAN_BINDINGS)
Expand All @@ -97,4 +163,4 @@ endif ()
if (PERNIX_ENABLE_TESTS)
enable_testing()
add_subdirectory(tests)
endif ()
endif ()
1 change: 0 additions & 1 deletion external/simde
Submodule simde deleted from 1747b2
200 changes: 200 additions & 0 deletions include/pernix/arm64/neon/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#ifndef PERNIX_ARM64_NEON_COMMON_H
#define PERNIX_ARM64_NEON_COMMON_H

#include <pernix/simd_compat.h>

#include <cstring>

namespace pernix::arm64::neon::internal {
struct float64x2x8_t {
float64x2_t val[8];
};

static constexpr uint32_t tail_bytes(const uint8_t bit_width, const uint32_t remaining_elements) {
const uint32_t tail_bits = remaining_elements * bit_width;
const uint32_t tail_bytes = (tail_bits + 7u) / 8u;
return tail_bytes;
}

__always_inline int32x4x4_t neon_convert_int8x16_int32x4x4(const int8x16_t& input) {
const int16x8_t s16_lo = vmovl_s8(vget_low_s8(input));
const int16x8_t s16_hi = vmovl_s8(vget_high_s8(input));

return {{
vmovl_s16(vget_low_s16(s16_lo)),
vmovl_s16(vget_high_s16(s16_lo)),
vmovl_s16(vget_low_s16(s16_hi)),
vmovl_s16(vget_high_s16(s16_hi)),
}};
}

__always_inline int32x4x2_t neon_convert_int16x8_int32x4x2(const int16x8_t& input) {
return {{
vmovl_s16(vget_low_s16(input)),
vmovl_s16(vget_high_s16(input)),
}};
}

__always_inline float32x4x4_t neon_dequantize_epi32(const int32x4x4_t& input, const float32x4_t& scale) {
return {{
vmulq_f32(vcvtq_f32_s32(input.val[0]), scale),
vmulq_f32(vcvtq_f32_s32(input.val[1]), scale),
vmulq_f32(vcvtq_f32_s32(input.val[2]), scale),
vmulq_f32(vcvtq_f32_s32(input.val[3]), scale),
}};
}

__always_inline float32x4x2_t neon_dequantize_epi32(const int32x4x2_t& input, const float32x4_t& scale) {
return {{
vmulq_f32(vcvtq_f32_s32(input.val[0]), scale),
vmulq_f32(vcvtq_f32_s32(input.val[1]), scale),
}};
}

__always_inline float32x4_t neon_dequantize_epi32(const int32x4_t& input, const float32x4_t& scale) {
return vmulq_f32(vcvtq_f32_s32(input), scale);
}

__always_inline float64x2_t neon_dequantize_epi32_f64(const int32x2_t& input, const float64x2_t& scale) {
return vmulq_f64(vcvtq_f64_s64(vmovl_s32(input)), scale);
}

__always_inline float64x2x2_t neon_dequantize_epi32_f64(const int32x4_t& input, const float64x2_t& scale) {
return {{
neon_dequantize_epi32_f64(vget_low_s32(input), scale),
neon_dequantize_epi32_f64(vget_high_s32(input), scale),
}};
}

__always_inline float64x2x4_t neon_dequantize_epi32_f64(const int32x4x2_t& input, const float64x2_t& scale) {
const float64x2x2_t dequantized_low = neon_dequantize_epi32_f64(input.val[0], scale);
const float64x2x2_t dequantized_high = neon_dequantize_epi32_f64(input.val[1], scale);

return {{
dequantized_low.val[0],
dequantized_low.val[1],
dequantized_high.val[0],
dequantized_high.val[1],
}};
}

__always_inline float64x2x8_t neon_dequantize_epi32_f64(const int32x4x4_t& input, const float64x2_t& scale) {
const float64x2x2_t dequantized0 = neon_dequantize_epi32_f64(input.val[0], scale);
const float64x2x2_t dequantized1 = neon_dequantize_epi32_f64(input.val[1], scale);
const float64x2x2_t dequantized2 = neon_dequantize_epi32_f64(input.val[2], scale);
const float64x2x2_t dequantized3 = neon_dequantize_epi32_f64(input.val[3], scale);

return {{
dequantized0.val[0],
dequantized0.val[1],
dequantized1.val[0],
dequantized1.val[1],
dequantized2.val[0],
dequantized2.val[1],
dequantized3.val[0],
dequantized3.val[1],
}};
}

__always_inline uint8x16_t neon_load_tail_elements_int8(const uint8_t* input, const uint32_t tail_bytes_count) {
uint8_t buffer[16] = {0};
std::memcpy(buffer, input, tail_bytes_count);
return vld1q_u8(buffer);
}

__always_inline uint16x8_t neon_load_tail_elements_int16(const uint8_t* input, const uint32_t tail_bytes_count) {
uint16_t buffer[8] = {0};
std::memcpy(buffer, input, tail_bytes_count);
return vld1q_u16(buffer);
}

__always_inline uint32x4_t neon_load_tail_elements_int32(const uint8_t* input, const uint32_t tail_bytes_count) {
uint32_t buffer[4] = {0};
std::memcpy(buffer, input, tail_bytes_count);
return vld1q_u32(buffer);
}

__always_inline float32x4_t neon_load_tail_elements_f32(const uint8_t* input, const uint32_t tail_elements) {
float32_t buffer[4] = {0.0f};
std::memcpy(buffer, input, tail_elements * sizeof(float32_t));
return vld1q_f32(buffer);
}

__always_inline float64x2_t neon_load_tail_elements_f64(const uint8_t* input, const uint32_t tail_elements) {
float64_t buffer[2] = {0.0};
std::memcpy(buffer, input, tail_elements * sizeof(float64_t));
return vld1q_f64(buffer);
}

__always_inline void neon_store_tail_elements_int8(uint8_t* output, const uint8x16x4_t& data, const uint32_t tail_elements) {
uint8_t buffer[16 * 4];
for (uint32_t i = 0; i < 4; ++i) {
vst1q_u8(buffer + i * 16, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(uint8_t));
}

__always_inline void neon_store_tail_elements_int16(uint16_t* output, const uint16x8x4_t& data, const uint32_t tail_elements) {
uint16_t buffer[8 * 4];
for (uint32_t i = 0; i < 4; ++i) {
vst1q_u16(buffer + i * 8, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(uint16_t));
}

__always_inline void neon_store_tail_elements_int32(uint32_t* output, const uint32x4x4_t& data, const uint32_t tail_elements) {
uint32_t buffer[4 * 4];
for (uint32_t i = 0; i < 4; ++i) {
vst1q_u32(buffer + i * 4, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(uint32_t));
}

__always_inline void neon_store_tail_elements_f32(float32_t* output, const float32x4x4_t& data, const uint32_t tail_elements) {
float32_t buffer[16 * 4];
for (uint32_t i = 0; i < 4; ++i) {
vst1q_f32(buffer + i * 4, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(float32_t));
}

__always_inline void neon_store_tail_elements_f32(float32_t* output, const float32x4x2_t& data, const uint32_t tail_elements) {
float32_t buffer[8 * 2];
for (uint32_t i = 0; i < 2; ++i) {
vst1q_f32(buffer + i * 4, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(float32_t));
}

__always_inline void neon_store_tail_elements_f32(float32_t* output, const float32x4_t& data, const uint32_t tail_elements) {
float32_t buffer[4];
vst1q_f32(buffer, data);
std::memcpy(output, buffer, tail_elements * sizeof(float32_t));
}

__always_inline void neon_store_tail_elements_f64(float64_t* output, const float64x2x4_t& data, const uint32_t tail_elements) {
float64_t buffer[2 * 4];
for (uint32_t i = 0; i < 4; ++i) {
vst1q_f64(buffer + i * 2, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(float64_t));
}

__always_inline void neon_store_tail_elements_f64(float64_t* output, const float64x2x2_t& data, const uint32_t tail_elements) {
float64_t buffer[2 * 2];
for (uint32_t i = 0; i < 2; ++i) {
vst1q_f64(buffer + i * 2, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(float64_t));
}

__always_inline void neon_store_tail_elements_f64(float64_t* output, const float64x2x8_t& data, const uint32_t tail_elements) {
float64_t buffer[2 * 8];
for (uint32_t i = 0; i < 8; ++i) {
vst1q_f64(buffer + i * 2, data.val[i]);
}
std::memcpy(output, buffer, tail_elements * sizeof(float64_t));
}
} // namespace pernix::arm64::neon::internal

#endif // PERNIX_ARM64_NEON_COMMON_H
Loading