From 2b34485c5a95cbe4bbe3f1b2bc95fe5055a5cf66 Mon Sep 17 00:00:00 2001 From: yuchengliu1 Date: Sat, 1 Jul 2023 12:46:03 +0800 Subject: [PATCH] [Graph] remove CPU vector warnings. (#1070) --- .../neural_engine/graph/layers/ele_wise.h | 34 +++----- .../graph/vectors/CMakeLists.txt | 2 +- .../neural_engine/graph/vectors/cpu/vec.hpp | 1 - .../graph/vectors/cpu/vec_arithmetic.cpp | 78 ++++++++++------- .../graph/vectors/cpu/vec_arithmetic.hpp | 32 +++---- .../graph/vectors/cpu/vec_base.hpp | 54 ++++++++++-- .../graph/vectors/cpu/vec_compare.cpp | 16 ++-- .../graph/vectors/cpu/vec_compare.hpp | 8 +- .../graph/vectors/cpu/vec_convert.cpp | 77 ++++++++++------- .../graph/vectors/cpu/vec_convert.hpp | 25 +++--- .../graph/vectors/cpu/vec_load.cpp | 35 -------- .../graph/vectors/cpu/vec_load.hpp | 26 ++++-- .../graph/vectors/cpu/vec_set.cpp | 40 +++------ .../graph/vectors/cpu/vec_set.hpp | 19 ++-- .../graph/vectors/cpu/vec_store.cpp | 40 --------- .../graph/vectors/cpu/vec_store.hpp | 47 ++++++++-- .../neural_engine/graph/vectors/ele_wise.cpp | 86 +++++++++++++++++++ .../neural_engine/graph/vectors/ele_wise.h | 40 +++++++++ 18 files changed, 396 insertions(+), 264 deletions(-) create mode 100644 intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.cpp create mode 100644 intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.h diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/layers/ele_wise.h b/intel_extension_for_transformers/backends/neural_engine/graph/layers/ele_wise.h index c897ec4e6eb..eab255b3f8a 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/layers/ele_wise.h +++ b/intel_extension_for_transformers/backends/neural_engine/graph/layers/ele_wise.h @@ -2,7 +2,7 @@ #include #include "core/data_types.h" -#include "vectors/cpu/simd.h" +#include "vectors/ele_wise.h" #ifdef __cplusplus extern "C" { @@ -11,41 +11,29 @@ extern "C" { // fundamental operations // -inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) { - for (int i = 0; i < n; ++i) x[i] = v; -} +inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) { ne_vec_set_i8_(n, x, v); } -inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) { - for (int i = 0; i < n; ++i) x[i] = v; -} +inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) { ne_vec_set_i16_(n, x, v); } -inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) { - for (int i = 0; i < n; ++i) x[i] = v; -} +inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) { ne_vec_set_i32_(n, x, v); } -inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) { - for (int i = 0; i < n; ++i) x[i] = v; -} +inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) { ne_vec_set_f16_(n, x, v); } inline static void ne_vec_add_f32(const int n, float* z, const float* x, const float* y) { - for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; + ne_vec_add_f32_(n, z, x, y); } inline static void ne_vec_add1_f32(const int n, float* z, const float* x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void ne_vec_acc_f32(const int n, float* y, const float* x) { - for (int i = 0; i < n; ++i) y[i] += x[i]; -} +inline static void ne_vec_acc_f32(const int n, float* y, const float* x) { ne_vec_acc_f32_(n, y, x); } inline static void ne_vec_acc1_f32(const int n, float* y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } 
inline static void ne_vec_sub_f32(const int n, float* z, const float* x, const float* y) { - for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; + ne_vec_sub_f32_(n, z, x, y); } -inline static void ne_vec_set_f32(const int n, float* x, const float v) { - for (int i = 0; i < n; ++i) x[i] = v; -} +inline static void ne_vec_set_f32(const int n, float* x, const float v) { ne_vec_set_f32_(n, x, v); } inline static void ne_vec_cpy_f32(const int n, float* y, const float* x) { for (int i = 0; i < n; ++i) y[i] = x[i]; @@ -54,10 +42,10 @@ inline static void ne_vec_neg_f32(const int n, float* y, const float* x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ne_vec_mul_f32(const int n, float* z, const float* x, const float* y) { - for (int i = 0; i < n; ++i) z[i] = x[i] * y[i]; + ne_vec_mul_f32_(n, z, x, y); } inline static void ne_vec_div_f32(const int n, float* z, const float* x, const float* y) { - for (int i = 0; i < n; ++i) z[i] = x[i] / y[i]; + ne_vec_div_f32_(n, z, x, y); } inline static void ne_vec_mad_f32(const int n, float* __restrict y, const float* __restrict x, const float v) { diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/CMakeLists.txt b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/CMakeLists.txt index 020aaf5d1c1..344c6ca6cc8 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/CMakeLists.txt +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/CMakeLists.txt @@ -17,5 +17,5 @@ if (NE_GPU) add_subdirectory(gpu) endif() -add_library_w_warning(ne_vec ele_reduce.cpp) +add_library_w_warning(ne_vec ele_reduce.cpp ele_wise.cpp) target_link_libraries(ne_vec PUBLIC cpu_vec) diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec.hpp index ca1a5c3d1be..584e75130de 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec.hpp @@ -19,7 +19,6 @@ #include "vec_base.hpp" #include "vec_compare.hpp" #include "vec_convert.hpp" -#include "vec_load.hpp" #include "vec_set.hpp" #endif // ENGINE_EXECUTOR_INCLUDE_VEC_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.cpp index 64ea15b5c16..d7d0a6aeb38 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.cpp @@ -12,27 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "vec_load.hpp" +#include "vec_store.hpp" #include "vec_arithmetic.hpp" +#include "cmath" -inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_sub_ps(x, y); + return {_mm512_sub_ps(x.first, y.first)}; #else return {_mm256_sub_ps(x.first, y.first), _mm256_sub_ps(x.second, y.second)}; #endif } -inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) { +fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) { #if __AVX512F__ - return _mm512_fmsub_ps(x, y, z); + return {_mm512_fmsub_ps(x.first, y.first, z.first)}; #else return {_mm256_fmsub_ps(x.first, y.first, z.first), _mm256_fmsub_ps(x.second, y.second, z.second)}; #endif } -inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) { +fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) { #if __AVX512F__ - return _mm512_maskz_fmsub_ps(mask, x, y, z); + return {_mm512_maskz_fmsub_ps(mask, x.first, y.first, z.first)}; #else __m256 first, second; MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_fmsub_ps(x.first, y.first, z.first), mask & 255, first); @@ -42,33 +45,33 @@ inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) { #endif } -inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_add_ps(x, y); + return {_mm512_add_ps(x.first, y.first)}; #else return {_mm256_add_ps(x.first, y.first), _mm256_add_ps(x.second, y.second)}; #endif } -inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) { +fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) { #if __AVX512F__ - return _mm512_fmadd_ps(x, y, z); + return {_mm512_fmadd_ps(x.first, y.first, z.first)}; #else return {_mm256_fmadd_ps(x.first, y.first, z.first), _mm256_fmadd_ps(x.second, y.second, z.second)}; #endif } -inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_mul_ps(x, y); + return {_mm512_mul_ps(x.first, y.first)}; #else return {_mm256_mul_ps(x.first, y.first), _mm256_mul_ps(x.second, y.second)}; #endif } -inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) { +fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_maskz_mul_ps(mask, x, y); + return {_mm512_maskz_mul_ps(mask, x.first, y.first)}; #else __m256 first, second; MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_mul_ps(x.first, y.first), mask & 255, first); @@ -78,31 +81,31 @@ inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) { } template -inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) { static_assert(rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_CUR_DIRECTION), "ERROR: Not support rounding"); #if __AVX512F__ - return _mm512_mul_round_ps(x, y, rounding); + return {_mm512_mul_round_ps(x.first, y.first, rounding)}; #else return {_mm256_round_ps(_mm256_mul_ps(x.first, y.first), rounding), _mm256_round_ps(_mm256_mul_ps(x.second, y.second), rounding)}; #endif } -inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_div_ps(x, y); + return {_mm512_div_ps(x.first, y.first)}; #else return 
{_mm256_div_ps(x.first, y.first), _mm256_div_ps(x.second, y.second)}; #endif } -inline float reduce_add_fp32x16(fp32x16 x) { +float reduce_add_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_reduce_add_ps(x); + return {_mm512_reduce_add_ps(x.first)}; #else const __m256 x256 = _mm256_add_ps(x.first, x.second); const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256)); @@ -112,46 +115,55 @@ inline float reduce_add_fp32x16(fp32x16 x) { #endif } -inline fp32x16 sqrt_fp32x16(fp32x16 x) { +fp32x16 sqrt_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_sqrt_ps(x); + return {_mm512_sqrt_ps(x.first)}; #else return {_mm256_sqrt_ps(x.first), _mm256_sqrt_ps(x.second)}; #endif } -inline fp32x16 rsqrt14_fp32x16(fp32x16 x) { +fp32x16 rsqrt14_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_rsqrt14_ps(x); + return {_mm512_rsqrt14_ps(x.first)}; #else // the max relative error is 6x than avx512 return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)}; #endif } -inline fp32x16 ceil_fp32x16(fp32x16 x) { +fp32x16 ceil_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_ceil_ps(x); + return {_mm512_ceil_ps(x.first)}; #else // the max relative error is 6x than avx512 return {_mm256_ceil_ps(x.first), _mm256_ceil_ps(x.second)}; #endif } -inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) { +fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) { #if __AVX512F__ - return _mm512_scalef_ps(x, y); + return {_mm512_scalef_ps(x.first, y.first)}; #else // No intrinsic - assert("No intrinsic"); - return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)}; + float* vec_x = new float[16]; + float* vec_y = new float[16]; + float* vec_z = new float[16]; + store_fp32x16(vec_x, x); + store_fp32x16(vec_y, y); + for (int i = 0; i < 16; i++) vec_z[i] = vec_x[i] * exp2(vec_y[i]); + fp32x16 res = load_fp32x16(vec_z); + delete[] vec_x; + delete[] vec_y; + delete[] vec_z; + return res; #endif } -inline float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); } +float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); } -inline fp32x16 abs_fp32x16(fp32x16 x) { +fp32x16 abs_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_abs_ps(x); + return {_mm512_abs_ps(x.first)}; #else return {_mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.first))), _mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.second)))}; diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.hpp index 7ec75dd020b..17629a62d4b 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_arithmetic.hpp @@ -17,50 +17,50 @@ #include "vec_base.hpp" -inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(sub_fp32x16, fp32x16, fp32x16, fp32x16); -inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); + fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); REGISTER_KERNEL_T(fmsub_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); -inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z); + fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z); -inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 add_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(add_fp32x16, fp32x16, fp32x16, 
fp32x16); -inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); + fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z); REGISTER_KERNEL_T(fmadd_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16); -inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(mul_fp32x16, fp32x16, fp32x16, fp32x16); -inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y); + fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y); template <int rounding> -inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y); -inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 div_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(div_fp32x16, fp32x16, fp32x16, fp32x16); -inline float reduce_add_fp32x16(fp32x16 x); + float reduce_add_fp32x16(fp32x16 x); REGISTER_KERNEL_T(reduce_add_fp32x16, float, fp32x16); -inline fp32x16 sqrt_fp32x16(fp32x16 x); + fp32x16 sqrt_fp32x16(fp32x16 x); REGISTER_KERNEL_T(sqrt_fp32x16, fp32x16, fp32x16); -inline fp32x16 rsqrt14_fp32x16(fp32x16 x); + fp32x16 rsqrt14_fp32x16(fp32x16 x); REGISTER_KERNEL_T(rsqrt14_fp32x16, fp32x16, fp32x16); -inline fp32x16 ceil_fp32x16(fp32x16 x); + fp32x16 ceil_fp32x16(fp32x16 x); REGISTER_KERNEL_T(ceil_fp32x16, fp32x16, fp32x16); -inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y); + fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(scale_fp32x16, fp32x16, fp32x16, fp32x16); -inline float dot_fp32x16(fp32x16 x, fp32x16 y); + float dot_fp32x16(fp32x16 x, fp32x16 y); REGISTER_KERNEL_T(dot_fp32x16, float, fp32x16, fp32x16); -inline fp32x16 abs_fp32x16(fp32x16 x); + fp32x16 abs_fp32x16(fp32x16 x); REGISTER_KERNEL_T(abs_fp32x16, fp32x16, fp32x16); #endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_base.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_base.hpp index 06709bce6f6..363d47e5fe3 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_base.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_base.hpp @@ -19,12 +19,27 @@ #include #if __AVX512F__ -typedef __m512 fp32x16; -typedef __m512i int32x16; +struct fp32x16 { + __m512 first; +}; + +struct s32x16 { + __m512i first; +}; +struct u32x16 { + __m512i first; +}; #else -typedef std::pair<__m256, __m256> fp32x16; -typedef std::pair<__m256i, __m256i> int32x16; +struct fp32x16 { + __m256 first, second; +}; +struct s32x16 { + __m256i first, second; +}; +struct u32x16 { + __m256i first, second; +}; #define MASK_DECORATOR(blend_func, a, b, mask, res) \ switch ((mask)) { \ case 1: \ @@ -53,16 +68,37 @@ typedef std::pair<__m256i, __m256i> int32x16; } #endif -typedef __m256i bf16x16; -typedef __m256i int16x16; -typedef __m128i int8x16; + +struct bf16x16 { + __m256i first; +}; + +struct fp16x16 { + __m256i first; +}; + +struct s16x16 { + __m256i first; +}; +struct s8x16 { + __m128i first; +}; +struct u8x16 { + __m128i first; +}; + #define CPU_VEC_STEP 16 template <typename T> -T load_kernel_t(const void*); +T load_kernel_t(const void* src) { + return *reinterpret_cast<const T*>(src); +} template <typename T> -void store_kernel_t(void*, T); +void store_kernel_t(void* dst, T src) { + T* dst_T = reinterpret_cast<T*>(dst); + *dst_T = src; +} template struct kernel_t { diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.cpp
b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.cpp index ac960be5de3..61d3df12939 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.cpp @@ -14,33 +14,33 @@ #include "vec_compare.hpp" -inline fp32x16 min_fp32x16(fp32x16 a, fp32x16 b) { + fp32x16 min_fp32x16(fp32x16 a, fp32x16 b) { #if __AVX512F__ - return _mm512_min_ps(a, b); + return {_mm512_min_ps(a.first, b.first)}; #else return {_mm256_min_ps(a.first, b.first), _mm256_min_ps(a.second, b.second)}; #endif } -inline int32x16 max_int32x16(int32x16 a, int32x16 b) { + s32x16 max_s32x16(s32x16 a, s32x16 b) { #if __AVX512F__ - return _mm512_max_epi32(a, b); + return {_mm512_max_epi32(a.first, b.first)}; #else return {_mm256_max_epi32(a.first, b.first), _mm256_max_epi32(a.second, b.second)}; #endif } -inline fp32x16 max_fp32x16(fp32x16 a, fp32x16 b) { + fp32x16 max_fp32x16(fp32x16 a, fp32x16 b) { #if __AVX512F__ - return _mm512_max_ps(a, b); + return {_mm512_max_ps(a.first, b.first)}; #else return {_mm256_max_ps(a.first, b.first), _mm256_max_ps(a.second, b.second)}; #endif } -inline float reduce_max_fp32x16(fp32x16 x) { + float reduce_max_fp32x16(fp32x16 x) { #if __AVX512F__ - return _mm512_reduce_max_ps(x); + return {_mm512_reduce_max_ps(x.first)}; #else const __m256 x256 = _mm256_max_ps(x.first, x.second); const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256)); diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.hpp index 3656249b660..a9d819532e8 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_compare.hpp @@ -17,13 +17,13 @@ #include "vec_base.hpp" -inline fp32x16 min_fp32x16(fp32x16 a, fp32x16 b); + fp32x16 min_fp32x16(fp32x16 a, fp32x16 b); -inline int32x16 max_int32x16(int32x16 a, int32x16 b); + s32x16 max_s32x16(s32x16 a, s32x16 b); -inline fp32x16 max_fp32x16(fp32x16 a, fp32x16 b); + fp32x16 max_fp32x16(fp32x16 a, fp32x16 b); -inline float reduce_max_fp32x16(fp32x16 x); + float reduce_max_fp32x16(fp32x16 x); REGISTER_KERNEL_T(reduce_max_fp32x16, float, fp32x16); #endif // ENGINE_EXECUTOR_INCLUDE_VEC_COMPARE_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.cpp index 633375ec4d9..6bb176a646a 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.cpp @@ -12,31 +12,32 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "vec_store.hpp" #include "vec_convert.hpp" template -inline int32x16 cvt_roundfp32x16_int32x16(fp32x16 a) { +s32x16 cvt_roundfp32x16_s32x16(fp32x16 a) { static_assert(rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_CUR_DIRECTION), "ERROR: Not support rounding"); #if __AVX512F__ - return _mm512_cvt_roundps_epi32(a, rounding); + return {_mm512_cvt_roundps_epi32(a.first, rounding)}; #else return {_mm256_cvtps_epi32(_mm256_round_ps(a.first, rounding)), _mm256_cvtps_epi32(_mm256_round_ps(a.second, rounding))}; #endif } template -inline int32x16 maskz_cvt_roundfp32x16_int32x16(int mask, fp32x16 a) { +s32x16 maskz_cvt_roundfp32x16_s32x16(int mask, fp32x16 a) { static_assert(rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_CUR_DIRECTION), "ERROR: Not support rounding"); #if __AVX512F__ - return _mm512_maskz_cvt_roundps_epi32(mask, a, rounding); + return {_mm512_maskz_cvt_roundps_epi32(mask, a.first, rounding)}; #else __m256i first, second; first = _mm256_cvtps_epi32(_mm256_round_ps(a.first, rounding)); @@ -47,48 +48,48 @@ inline int32x16 maskz_cvt_roundfp32x16_int32x16(int mask, fp32x16 a) { #endif } -inline bf16x16 cvt_fp32x16_bf16x16(fp32x16 a) { +bf16x16 cvt_fp32x16_bf16x16(fp32x16 a) { #if __AVX512F__ #if __AVX512BF16__ && __GNUC__ > 11 - return _mm512_cvtneps_pbh(a); + return {_mm512_cvtneps_pbh(a.first)}; #else - return _mm512_cvtepi32_epi16(_mm512_bsrli_epi128(_mm512_castps_si512(a), 2)); + return {_mm512_cvtepi32_epi16(_mm512_bsrli_epi128(_mm512_castps_si512(a.first), 2))}; #endif #else __m256i first = _mm256_bsrli_epi128(_mm256_castps_si256(a.first), 2); __m256i second = _mm256_bsrli_epi128(_mm256_castps_si256(a.second), 2); __m256i res = _mm256_packus_epi32(first, second); - return _mm256_permute4x64_epi64(res, 0x18); + return {_mm256_permute4x64_epi64(res, 0x18)}; #endif } -inline fp32x16 cvt_bf16x16_fp32x16(bf16x16 a) { +fp32x16 cvt_bf16x16_fp32x16(bf16x16 a) { #if __AVX512F__ #if __AVX512BF16__ && __GNUC__ > 11 - return _mm512_cvtpbh_ps(a); + return {_mm512_cvtpbh_ps(a.first)}; #else - return _mm512_castsi512_ps(_mm512_bslli_epi128(_mm512_cvtepu16_epi32(a), 2)); + return {_mm512_castsi512_ps(_mm512_bslli_epi128(_mm512_cvtepu16_epi32(a.first), 2))}; #endif #else - __m128i second = _mm256_extractf128_si256(a, 1); + __m128i second = _mm256_extractf128_si256(a.first, 1); __m256 second_fp32 = _mm256_castsi256_ps(_mm256_bslli_epi128(_mm256_cvtepu16_epi32(second), 2)); - __m128i first = _mm256_castsi256_si128(a); + __m128i first = _mm256_castsi256_si128(a.first); __m256 first_fp32 = _mm256_castsi256_ps(_mm256_bslli_epi128(_mm256_cvtepu16_epi32(first), 2)); return {first_fp32, second_fp32}; #endif } -inline fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a) { +fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a) { #if __AVX512F__ #if __AVX512BF16__ && __GNUC__ > 11 - return _mm512_maskz_cvtpbh_ps(mask, a); + return {_mm512_maskz_cvtpbh_ps(mask, a.first)}; #else - return _mm512_castsi512_ps(_mm512_bslli_epi128(_mm512_maskz_cvtepu16_epi32(mask, a), 2)); + return {_mm512_castsi512_ps(_mm512_bslli_epi128(_mm512_maskz_cvtepu16_epi32(mask, a.first), 2))}; #endif 
#else - __m128i second = _mm256_extractf128_si256(a, 1); + __m128i second = _mm256_extractf128_si256(a.first, 1); __m256 second_fp32 = _mm256_castsi256_ps(_mm256_bslli_epi128(_mm256_cvtepu16_epi32(second), 2)); - __m128i first = _mm256_castsi256_si128(a); + __m128i first = _mm256_castsi256_si128(a.first); __m256 first_fp32 = _mm256_castsi256_ps(_mm256_bslli_epi128(_mm256_cvtepu16_epi32(first), 2)); MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), first_fp32, mask & 255, first_fp32); MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), second_fp32, mask >> 8, second_fp32); @@ -96,9 +97,9 @@ inline fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a) { #endif } -inline int8x16 cvt_uint32x16_uint8x16(int32x16 a) { +u8x16 cvt_u32x16_u8x16(u32x16 a) { #if __AVX512F__ - return _mm512_cvtusepi32_epi8(a); + return {_mm512_cvtusepi32_epi8(a.first)}; #else __m256i first = _mm256_min_epi32(_mm256_set1_epi32(255), a.first); __m256i second = _mm256_min_epi32(_mm256_set1_epi32(255), a.second); @@ -108,13 +109,13 @@ inline int8x16 cvt_uint32x16_uint8x16(int32x16 a) { -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1)); __m256i result = _mm256_or_si256(first, second); result = _mm256_permutevar8x32_epi32(result, _mm256_set_epi32(7, 6, 3, 2, 5, 1, 4, 0)); - return _mm256_castsi256_si128(result); + return {_mm256_castsi256_si128(result)}; #endif } -inline int8x16 maskz_cvt_uint32x16_uint8x16(int mask, int32x16 a) { +u8x16 maskz_cvt_u32x16_u8x16(int mask, u32x16 a) { #if __AVX512F__ - return _mm512_maskz_cvtusepi32_epi8(mask, a); + return {_mm512_maskz_cvtusepi32_epi8(mask, a.first)}; #else __m256i first, second; MASK_DECORATOR(_mm256_blend_epi32, _mm256_setzero_si256(), _mm256_min_epi32(_mm256_set1_epi32(255), a.first), @@ -127,13 +128,13 @@ inline int8x16 maskz_cvt_uint32x16_uint8x16(int mask, int32x16 a) { -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1)); __m256i result = _mm256_or_si256(first, second); result = _mm256_permutevar8x32_epi32(result, _mm256_set_epi32(7, 6, 3, 2, 5, 1, 4, 0)); - return _mm256_castsi256_si128(result); + return {_mm256_castsi256_si128(result)}; #endif } -inline int8x16 cvt_int32x16_int8x16(int32x16 a) { +s8x16 cvt_s32x16_s8x16(s32x16 a) { #if __AVX512F__ - return _mm512_cvtsepi32_epi8(a); + return {_mm512_cvtsepi32_epi8(a.first)}; #else __m256i first = _mm256_min_epi32(_mm256_set1_epi32(127), a.first); __m256i second = _mm256_min_epi32(_mm256_set1_epi32(127), a.second); @@ -145,13 +146,13 @@ inline int8x16 cvt_int32x16_int8x16(int32x16 a) { -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1)); __m256i result = _mm256_or_si256(first, second); result = _mm256_permutevar8x32_epi32(result, _mm256_set_epi32(7, 6, 3, 2, 5, 1, 4, 0)); - return _mm256_castsi256_si128(result); + return {_mm256_castsi256_si128(result)}; #endif } -inline int8x16 maskz_cvt_int32x16_int8x16(const int mask, int32x16 a) { +s8x16 maskz_cvt_s32x16_s8x16(const int mask, s32x16 a) { #if __AVX512F__ - return _mm512_maskz_cvtsepi32_epi8(mask, a); + return {_mm512_maskz_cvtsepi32_epi8(mask, a.first)}; #else __m256i first, second; MASK_DECORATOR(_mm256_blend_epi32, _mm256_setzero_si256(), _mm256_min_epi32(_mm256_set1_epi32(127), a.first), @@ -166,6 +167,22 @@ inline int8x16 maskz_cvt_int32x16_int8x16(const int mask, int32x16 a) { -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1)); __m256i result = _mm256_or_si256(first, second); result = _mm256_permutevar8x32_epi32(result, _mm256_set_epi32(7, 6, 3, 2, 5, 1, 4, 0)); - return _mm256_castsi256_si128(result); + return 
{_mm256_castsi256_si128(result)}; +#endif +} + +void cvtu32x16_store_u8x16(void* base_addr, u32x16 a) { +#ifdef __AVX512F__ + _mm512_mask_cvtusepi32_storeu_epi8(base_addr, 0xffff, a.first); +#else + store_u8x16(base_addr, cvt_u32x16_u8x16(a)); +#endif +} + +void mask_cvtu32x16_store_u8x16(void* base_addr, int mask, u32x16 a) { +#ifdef __AVX512F__ + _mm512_mask_cvtusepi32_storeu_epi8(base_addr, mask, a.first); +#else + mask_store_u8x16(base_addr, mask, maskz_cvt_u32x16_u8x16(mask, a)); #endif } diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.hpp index 30e0fd9ef40..a04c2048add 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_convert.hpp @@ -18,25 +18,26 @@ #include "vec_base.hpp" template -inline int32x16 cvt_roundfp32x16_int32x16(fp32x16 a); +s32x16 cvt_roundfp32x16_s32x16(fp32x16 a); template -struct ne_cvt_roundfp32x16_int32x16_kernel_t : public kernel_t { - ne_cvt_roundfp32x16_int32x16_kernel_t() { func_ = cvt_roundfp32x16_int32x16; } +struct ne_cvt_roundfp32x16_s32x16_kernel_t : public kernel_t { + ne_cvt_roundfp32x16_s32x16_kernel_t() { func_ = cvt_roundfp32x16_s32x16; } }; template -inline int32x16 maskz_cvt_roundfp32x16_int32x16(int mask, fp32x16 a); -inline bf16x16 cvt_fp32x16_bf16x16(fp32x16 a); +s32x16 maskz_cvt_roundfp32x16_s32x16(int mask, fp32x16 a); +bf16x16 cvt_fp32x16_bf16x16(fp32x16 a); -inline fp32x16 cvt_bf16x16_fp32x16(bf16x16 a); +fp32x16 cvt_bf16x16_fp32x16(bf16x16 a); -inline fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a); +fp32x16 maskz_cvt_bf16x16_fp32x16(int mask, bf16x16 a); -inline int8x16 cvt_uint32x16_uint8x16(int32x16 a); +u8x16 cvt_u32x16_u8x16(u32x16 a); +u8x16 maskz_cvt_u32x16_u8x16(int mask, u32x16 a); -inline int8x16 maskz_cvt_uint32x16_uint8x16(int mask, int32x16 a); - -inline int8x16 cvt_int32x16_int8x16(int32x16 a); -inline int8x16 maskz_cvt_int32x16_int8x16(const int mask, int32x16 a); +s8x16 cvt_s32x16_s8x16(s32x16 a); +s8x16 maskz_cvt_s32x16_s8x16(const int mask, s32x16 a); +void cvtu32x16_store_u8x16(void* base_addr, u32x16 a); +void mask_cvtu32x16_store_u8x16(void* base_addr, int mask, u32x16 a); #endif // ENGINE_EXECUTOR_INCLUDE_VEC_CONVERT_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.cpp index 4b020a7be5c..6c404ac746e 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.cpp @@ -13,38 +13,3 @@ // limitations under the License. 
#include "vec_load.hpp" - -inline fp32x16 load_fp32x16(void const* mem_addr) { -#if __AVX512F__ - return _mm512_loadu_ps(mem_addr); -#else - float const* mem_addr_fp32 = reinterpret_cast(mem_addr); - return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; -#endif -} - -inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) { -#if __AVX512F__ - return _mm512_mask_loadu_ps(src, mask, mem_addr); -#else - float const* mem_addr_fp32 = reinterpret_cast(mem_addr); - return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; -#endif -} - -inline bf16x16 load_bf16x16(void const* mem_addr) { - __m256i const* mem_addr_bf16 = reinterpret_cast<__m256i const*>(mem_addr); - return _mm256_loadu_si256(mem_addr_bf16); -} - -inline bf16x16 maskz_load_bf16x16(int mask, void const* mem_addr) { -#if __AVX512F__ - __m256i const* mem_addr_bf16 = reinterpret_cast<__m256i const*>(mem_addr); - return _mm256_maskz_loadu_epi16(mask, mem_addr_bf16); -#else - bf16x16 res; - MASK_DECORATOR(_mm256_blend_epi16, _mm256_setzero_si256(), - _mm256_loadu_si256(reinterpret_cast<__m256i const*>(mem_addr)), mask, res); - return res; -#endif -} diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.hpp index 6d4d82a700d..b15832e40c4 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_load.hpp @@ -17,19 +17,31 @@ #include "vec_base.hpp" -template <> -float load_kernel_t(const void* src) { - return *reinterpret_cast(src); +inline fp32x16 load_fp32x16(void const* mem_addr) { +#if __AVX512F__ + return {_mm512_loadu_ps(mem_addr)}; +#else + float const* mem_addr_fp32 = reinterpret_cast(mem_addr); + return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; +#endif } - -inline fp32x16 load_fp32x16(void const* mem_addr); template <> fp32x16 load_kernel_t(const void* src) { return load_fp32x16(src); } -inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr); +inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) { +#if __AVX512F__ + return {_mm512_mask_loadu_ps(src.first, mask, mem_addr)}; +#else + float const* mem_addr_fp32 = reinterpret_cast(mem_addr); + return {_mm256_loadu_ps(mem_addr_fp32), _mm256_loadu_ps(mem_addr_fp32 + 8)}; +#endif +} -inline bf16x16 load_bf16x16(void const* mem_addr); +inline bf16x16 load_bf16x16(void const* mem_addr) { + __m256i const* mem_addr_bf16 = reinterpret_cast<__m256i const*>(mem_addr); + return {_mm256_loadu_si256(mem_addr_bf16)}; +} template <> bf16x16 load_kernel_t(const void* src) { return load_bf16x16(src); diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.cpp index 33b818bffda..b0f6bbfbb2e 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.cpp @@ -14,57 +14,39 @@ #include "vec_set.hpp" -inline fp32x16 set1_fp32x16(const float x) { +fp32x16 set1_fp32x16(const float x) { #if __AVX512F__ - return _mm512_set1_ps(x); + return {_mm512_set1_ps(x)}; #else return {_mm256_set1_ps(x), _mm256_set1_ps(x)}; #endif } -inline int32x16 set1_int8x16(const int8_t x) { -#if 
__AVX512F__ - return _mm512_set1_epi8(x); -#else - return {_mm256_set1_epi8(x), _mm256_set1_epi8(x)}; -#endif -} +s8x16 set1_s8x16(const int8_t x) { return {_mm_set1_epi8(x)}; } -inline int32x16 set1_int16x16(const int16_t x) { -#if __AVX512F__ - return _mm512_set1_epi16(x); -#else - return {_mm256_set1_epi16(x), _mm256_set1_epi16(x)}; -#endif -} +s16x16 set1_s16x16(const int16_t x) { return {_mm256_set1_epi16(x)}; } -inline int32x16 set1_fp16x16(const uint16_t x) { -#if __AVX512F__ - return _mm512_set1_epi16(x); -#else - return {_mm256_set1_epi16(x), _mm256_set1_epi16(x)}; -#endif -} +fp16x16 set1_fp16x16(const uint16_t x) { return {_mm256_set1_epi16(x)}; } -inline int32x16 set1_int32x16(const int16_t x) { +s32x16 set1_s32x16(const int32_t x) { #if __AVX512F__ - return _mm512_set1_epi32(x); + return {_mm512_set1_epi32(x)}; #else return {_mm256_set1_epi32(x), _mm256_set1_epi32(x)}; #endif } -inline int32x16 setzero_int32x16() { +s32x16 setzero_s32x16() { #if __AVX512F__ - return _mm512_setzero_epi32(); + return {_mm512_setzero_epi32()}; #else return {_mm256_setzero_si256(), _mm256_setzero_si256()}; #endif } -inline fp32x16 setzero_fp32x16() { +fp32x16 setzero_fp32x16() { #if __AVX512F__ - return _mm512_setzero_ps(); + return {_mm512_setzero_ps()}; #else return {_mm256_setzero_ps(), _mm256_setzero_ps()}; #endif diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.hpp index 2aae8ac4d37..3f5f890c69b 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_set.hpp @@ -17,18 +17,23 @@ #include "vec_base.hpp" -inline fp32x16 set1_fp32x16(const float x); +fp32x16 set1_fp32x16(const float x); +REGISTER_KERNEL_T(set1_fp32x16, fp32x16, float); -inline int32x16 set1_int8x16(const int8_t x); +s8x16 set1_s8x16(const int8_t x); +REGISTER_KERNEL_T(set1_s8x16, s8x16, int8_t); -inline int32x16 set1_int16x16(const int16_t x); +s16x16 set1_s16x16(const int16_t x); +REGISTER_KERNEL_T(set1_s16x16, s16x16, int16_t); -inline int32x16 set1_fp16x16(const uint16_t x); +fp16x16 set1_fp16x16(const uint16_t x); +REGISTER_KERNEL_T(set1_fp16x16, fp16x16, uint16_t); -inline int32x16 set1_int32x16(const int16_t x); +s32x16 set1_s32x16(const int32_t x); +REGISTER_KERNEL_T(set1_s32x16, s32x16, int32_t); -inline int32x16 setzero_int32x16(); +s32x16 setzero_s32x16(); -inline fp32x16 setzero_fp32x16(); +fp32x16 setzero_fp32x16(); #endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.cpp index 072f01c03ba..8e907c00644 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.cpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.cpp @@ -13,43 +13,3 @@ // limitations under the License. 
#include "vec_store.hpp" - -inline void store_int8x16(void* mem_addr, int8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a); } -inline void mask_store_int8x16(void* mem_addr, const int mask, int8x16 a) { -#ifdef __AVX512F__ - _mm_mask_storeu_epi8(mem_addr, mask, a); -#else - __m128i mask_reg = - _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, - mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); - _mm_maskmoveu_si128(a, mask_reg, reinterpret_cast(mem_addr)); -#endif -} - -inline void store_fp32x16(void* mem_addr, fp32x16 a) { -#ifdef __AVX512F__ - _mm512_storeu_ps(mem_addr, a); -#else - float* mem_addr_fp32 = reinterpret_cast(mem_addr); - _mm256_storeu_ps(mem_addr_fp32, a.first); - _mm256_storeu_ps(mem_addr_fp32 + 8, a.second); -#endif -} - -inline void store_bf16x16(void* mem_addr, bf16x16 a) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem_addr), a); } - -inline void cvtuint32x16_store_int8x16(void* base_addr, int32x16 a) { -#ifdef __AVX512F__ - _mm512_mask_cvtusepi32_storeu_epi8(base_addr, 0xffff, a); -#else - store_int8x16(base_addr, cvt_uint32x16_uint8x16(a)); -#endif -} - -inline void mask_cvtuint32x16_store_int8x16(void* base_addr, int mask, int32x16 a) { -#ifdef __AVX512F__ - _mm512_mask_cvtusepi32_storeu_epi8(base_addr, mask, a); -#else - mask_store_int8x16(base_addr, mask, maskz_cvt_uint32x16_uint8x16(mask, a)); -#endif -} diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.hpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.hpp index fcdfe455bdd..c8b8bd01ae7 100644 --- a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.hpp +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/cpu/vec_store.hpp @@ -17,15 +17,44 @@ #include "vec_base.hpp" -inline void store_int8x16(void* mem_addr, int8x16 a); +inline void store_s8x16(void* mem_addr, s8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } +inline void store_u8x16(void* mem_addr, u8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } template <> -void store_kernel_t(void* dst, int8x16 src) { - store_int8x16(dst, src); +void store_kernel_t(void* dst, s8x16 src) { + store_s8x16(dst, src); } -inline void mask_store_int8x16(void* mem_addr, const int mask, int8x16 a); +inline void mask_store_s8x16(void* mem_addr, const int mask, s8x16 a) { +#ifdef __AVX512F__ + _mm_mask_storeu_epi8(mem_addr, mask, a.first); +#else + __m128i mask_reg = + _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, + mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); + _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); +#endif +} + +inline void mask_store_u8x16(void* mem_addr, const int mask, u8x16 a) { +#ifdef __AVX512F__ + _mm_mask_storeu_epi8(mem_addr, mask, a.first); +#else + __m128i mask_reg = + _mm_set_epi8(mask & 32768, mask & 16384, mask & 8192, mask & 4096, mask & 2048, mask & 1024, mask & 512, + mask & 256, mask & 128, mask & 64, mask & 32, mask & 16, mask & 8, mask & 4, mask & 2, mask & 1); + _mm_maskmoveu_si128(a.first, mask_reg, reinterpret_cast(mem_addr)); +#endif +} -inline void store_fp32x16(void* mem_addr, fp32x16 a); +inline void store_fp32x16(void* mem_addr, fp32x16 a) { +#ifdef __AVX512F__ + _mm512_storeu_ps(mem_addr, a.first); 
+#else + float* mem_addr_fp32 = reinterpret_cast(mem_addr); + _mm256_storeu_ps(mem_addr_fp32, a.first); + _mm256_storeu_ps(mem_addr_fp32 + 8, a.second); +#endif +} template <> void store_kernel_t(void* dst, float src) { float* dst_fp32 = reinterpret_cast(dst); @@ -37,13 +66,13 @@ void store_kernel_t(void* dst, fp32x16 src) { store_fp32x16(dst, src); } -inline void store_bf16x16(void* mem_addr, bf16x16 a); +inline void store_bf16x16(void* mem_addr, bf16x16 a) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(mem_addr), a.first); +} + template <> void store_kernel_t(void* dst, bf16x16 src) { store_bf16x16(dst, src); } -inline void cvtuint32x16_store_int8x16(void* base_addr, int32x16 a); - -inline void mask_cvtuint32x16_store_int8x16(void* base_addr, int mask, int32x16 a); #endif // ENGINE_EXECUTOR_INCLUDE_VEC_STORE_HPP_ diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.cpp b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.cpp new file mode 100644 index 00000000000..1364f4f62b2 --- /dev/null +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.cpp @@ -0,0 +1,86 @@ +#include "vectors/cpu/vec.hpp" +#include "vectors/ele_wise.h" +#include "cmath" +#ifdef __cplusplus +extern "C" { +#endif +void ne_vec_set_i8_(const int n, int8_t* x, const int8_t v) { + ne_set1_s8x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(x), reinterpret_cast(&v)); + } + for (int i = n / 16 * 16; i < n; i++) x[i] = v; +} + +void ne_vec_set_i16_(const int n, int16_t* x, const int16_t v) { + ne_set1_s16x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(x), reinterpret_cast(&v)); + } + for (int i = n / 16 * 16; i < n; i++) x[i] = v; +} + +void ne_vec_set_i32_(const int n, int32_t* x, const int32_t v) { + ne_set1_s32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(x), reinterpret_cast(&v)); + } + for (int i = n / 16 * 16; i < n; i++) x[i] = v; +} + +void ne_vec_set_f16_(const int n, uint16_t* x, const int32_t v) { + ne_set1_fp16x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(x), reinterpret_cast(&v)); + } + for (int i = n / 16 * 16; i < n; i++) x[i] = v; +} + +void ne_vec_add_f32_(const int n, float* z, const float* x, const float* y) { + ne_add_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(z), reinterpret_cast(x), reinterpret_cast(y)); + } + for (int i = n / 16 * 16; i < n; i++) z[i] = x[i] + y[i]; +} +void ne_vec_acc_f32_(const int n, float* y, const float* x) { + ne_add_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(y), reinterpret_cast(x), reinterpret_cast(y)); + } + for (int i = n / 16 * 16; i < n; i++) y[i] = x[i] + y[i]; +} +void ne_vec_sub_f32_(const int n, float* z, const float* x, const float* y) { + ne_sub_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(z), reinterpret_cast(x), reinterpret_cast(y)); + } + for (int i = n / 16 * 16; i < n; i++) z[i] = x[i] - y[i]; +} + +void ne_vec_set_f32_(const int n, float* x, const float v) { + ne_set1_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(x), reinterpret_cast(&v)); + } + for (int i = n / 16 * 16; i < n; i++) x[i] = v; +} + +void ne_vec_mul_f32_(const int n, float* z, const float* x, const float* y) { + ne_mul_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(z), reinterpret_cast(x), 
reinterpret_cast(y)); + } + for (int i = n / 16 * 16; i < n; i++) z[i] = x[i] * y[i]; +} +void ne_vec_div_f32_(const int n, float* z, const float* x, const float* y) { + ne_div_fp32x16_kernel_t k_t; + for (int i = 0; i < n / 16; ++i) { + k_t(reinterpret_cast(z), reinterpret_cast(x), reinterpret_cast(y)); + } + for (int i = n / 16 * 16; i < n; i++) z[i] = x[i] / y[i]; +} + +#ifdef __cplusplus +} +#endif diff --git a/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.h b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.h new file mode 100644 index 00000000000..5a28dc6d545 --- /dev/null +++ b/intel_extension_for_transformers/backends/neural_engine/graph/vectors/ele_wise.h @@ -0,0 +1,40 @@ +#pragma once +#include <stdint.h> +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef VEC_SHARED +#if defined(_WIN32) && !defined(__MINGW32__) +#ifdef VEC_BUILD +#define VEC_API __declspec(dllexport) +#else +#define VEC_API __declspec(dllimport) +#endif +#else +#define VEC_API __attribute__((visibility("default"))) +#endif +#else +#define VEC_API +#endif + +void ne_vec_set_i8_(const int n, int8_t* x, const int8_t v); + +void ne_vec_set_i16_(const int n, int16_t* x, const int16_t v); + +void ne_vec_set_i32_(const int n, int32_t* x, const int32_t v); + +void ne_vec_set_f16_(const int n, uint16_t* x, const int32_t v); + +void ne_vec_add_f32_(const int n, float* z, const float* x, const float* y); +void ne_vec_acc_f32_(const int n, float* y, const float* x); +void ne_vec_sub_f32_(const int n, float* z, const float* x, const float* y); + +void ne_vec_set_f32_(const int n, float* x, const float v); + +void ne_vec_mul_f32_(const int n, float* z, const float* x, const float* y); +void ne_vec_div_f32_(const int n, float* z, const float* x, const float* y); + +#ifdef __cplusplus +} +#endif
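For reference, a minimal usage sketch (not part of the patch) of the refactored element-wise helpers exposed through layers/ele_wise.h above. The include path and the buffer length of 20 are illustrative assumptions; 20 elements exercise one 16-lane SIMD kernel iteration plus the 4-element scalar tail loop added in vectors/ele_wise.cpp.

#include <stdio.h>
#include "layers/ele_wise.h"  // assumed include path; adjust to the project's include directories

int main(void) {
  float x[20], y[20], z[20];
  ne_vec_set_f32(20, x, 1.5f);  // forwards to ne_vec_set_f32_: one fp32x16 kernel iteration + scalar tail
  ne_vec_set_f32(20, y, 2.0f);
  ne_vec_add_f32(20, z, x, y);  // intended result: z[i] = 3.5f
  ne_vec_mul_f32(20, z, z, y);  // intended result: z[i] = 7.0f
  printf("z[0] = %f, z[19] = %f\n", z[0], z[19]);
  return 0;
}

Building such a sketch would require linking against the ne_vec library that now compiles ele_wise.cpp (see the CMakeLists.txt change above), since the ne_vec_*_ symbols live in vectors/ele_wise.cpp rather than in the header.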