Skip to content

Commit

Permalink
[Graph] remove CPU vector warnings. (intel#1070)
Browse files Browse the repository at this point in the history
  • Loading branch information
yuchengliu1 committed Jul 1, 2023
1 parent f8a6ea6 commit 2b34485
Show file tree
Hide file tree
Showing 18 changed files with 396 additions and 264 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include <math.h>
#include "core/data_types.h"
#include "vectors/cpu/simd.h"
#include "vectors/ele_wise.h"

#ifdef __cplusplus
extern "C" {
Expand All @@ -11,41 +11,29 @@ extern "C" {
// fundamental operations
//

inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) {
for (int i = 0; i < n; ++i) x[i] = v;
}
inline static void ne_vec_set_i8(const int n, int8_t* x, const int8_t v) { ne_vec_set_i8_(n, x, v); }

inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) {
for (int i = 0; i < n; ++i) x[i] = v;
}
inline static void ne_vec_set_i16(const int n, int16_t* x, const int16_t v) { ne_vec_set_i16_(n, x, v); }

inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) {
for (int i = 0; i < n; ++i) x[i] = v;
}
inline static void ne_vec_set_i32(const int n, int32_t* x, const int32_t v) { ne_vec_set_i32_(n, x, v); }

inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) {
for (int i = 0; i < n; ++i) x[i] = v;
}
inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) { ne_vec_set_f16_(n, x, v); }

inline static void ne_vec_add_f32(const int n, float* z, const float* x, const float* y) {
for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
ne_vec_add_f32_(n, z, x, y);
}
/* Element-wise scalar add: writes z[i] = x[i] + v for each i in [0, n). */
inline static void ne_vec_add1_f32(const int n, float* z, const float* x, const float v) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] + v;
    ++idx;
  }
}
inline static void ne_vec_acc_f32(const int n, float* y, const float* x) {
for (int i = 0; i < n; ++i) y[i] += x[i];
}
inline static void ne_vec_acc_f32(const int n, float* y, const float* x) { ne_vec_acc_f32_(n, y, x); }
/* In-place scalar accumulate: y[i] += v for each i in [0, n). */
inline static void ne_vec_acc1_f32(const int n, float* y, const float v) {
  float* const end = y + n;
  for (float* p = y; p < end; ++p) {
    *p += v;
  }
}
inline static void ne_vec_sub_f32(const int n, float* z, const float* x, const float* y) {
for (int i = 0; i < n; ++i) z[i] = x[i] - y[i];
ne_vec_sub_f32_(n, z, x, y);
}

inline static void ne_vec_set_f32(const int n, float* x, const float v) {
for (int i = 0; i < n; ++i) x[i] = v;
}
inline static void ne_vec_set_f32(const int n, float* x, const float v) { ne_vec_set_f32_(n, x, v); }

inline static void ne_vec_cpy_f32(const int n, float* y, const float* x) {
for (int i = 0; i < n; ++i) y[i] = x[i];
Expand All @@ -54,10 +42,10 @@ inline static void ne_vec_neg_f32(const int n, float* y, const float* x) {
for (int i = 0; i < n; ++i) y[i] = -x[i];
}
inline static void ne_vec_mul_f32(const int n, float* z, const float* x, const float* y) {
for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];
ne_vec_mul_f32_(n, z, x, y);
}
inline static void ne_vec_div_f32(const int n, float* z, const float* x, const float* y) {
for (int i = 0; i < n; ++i) z[i] = x[i] / y[i];
ne_vec_div_f32_(n, z, x, y);
}

inline static void ne_vec_mad_f32(const int n, float* __restrict y, const float* __restrict x, const float v) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ if (NE_GPU)
add_subdirectory(gpu)
endif()

add_library_w_warning(ne_vec ele_reduce.cpp)
add_library_w_warning(ne_vec ele_reduce.cpp ele_wise.cpp)
target_link_libraries(ne_vec PUBLIC cpu_vec)
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include "vec_base.hpp"
#include "vec_compare.hpp"
#include "vec_convert.hpp"
#include "vec_load.hpp"
#include "vec_set.hpp"

#endif // ENGINE_EXECUTOR_INCLUDE_VEC_HPP_
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,30 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "vec_load.hpp"
#include "vec_store.hpp"
#include "vec_arithmetic.hpp"
#include "cmath"

inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_sub_ps(x, y);
return {_mm512_sub_ps(x.first, y.first)};
#else
return {_mm256_sub_ps(x.first, y.first), _mm256_sub_ps(x.second, y.second)};
#endif
}

inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
#if __AVX512F__
return _mm512_fmsub_ps(x, y, z);
return {_mm512_fmsub_ps(x.first, y.first, z.first)};
#else
return {_mm256_fmsub_ps(x.first, y.first, z.first), _mm256_fmsub_ps(x.second, y.second, z.second)};
#endif
}

inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
#if __AVX512F__
return _mm512_maskz_fmsub_ps(mask, x, y, z);
return {_mm512_maskz_fmsub_ps(mask, x.first, y.first, z.first)};
#else
__m256 first, second;
MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_fmsub_ps(x.first, y.first, z.first), mask & 255, first);
Expand All @@ -42,33 +45,33 @@ inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z) {
#endif
}

inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 add_fp32x16(fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_add_ps(x, y);
return {_mm512_add_ps(x.first, y.first)};
#else
return {_mm256_add_ps(x.first, y.first), _mm256_add_ps(x.second, y.second)};
#endif
}

inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z) {
#if __AVX512F__
return _mm512_fmadd_ps(x, y, z);
return {_mm512_fmadd_ps(x.first, y.first, z.first)};
#else
return {_mm256_fmadd_ps(x.first, y.first, z.first), _mm256_fmadd_ps(x.second, y.second, z.second)};
#endif
}

inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_mul_ps(x, y);
return {_mm512_mul_ps(x.first, y.first)};
#else
return {_mm256_mul_ps(x.first, y.first), _mm256_mul_ps(x.second, y.second)};
#endif
}

inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_maskz_mul_ps(mask, x, y);
return {_mm512_maskz_mul_ps(mask, x.first, y.first)};
#else
__m256 first, second;
MASK_DECORATOR(_mm256_blend_ps, _mm256_setzero_ps(), _mm256_mul_ps(x.first, y.first), mask & 255, first);
Expand All @@ -78,31 +81,31 @@ inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y) {
}

template <int rounding>
inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y) {
static_assert(rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) ||
rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || rounding == (_MM_FROUND_CUR_DIRECTION),
"ERROR: Not support rounding");
#if __AVX512F__
return _mm512_mul_round_ps(x, y, rounding);
return {_mm512_mul_round_ps(x.first, y.first, rounding)};
#else
return {_mm256_round_ps(_mm256_mul_ps(x.first, y.first), rounding),
_mm256_round_ps(_mm256_mul_ps(x.second, y.second), rounding)};
#endif
}

inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 div_fp32x16(fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_div_ps(x, y);
return {_mm512_div_ps(x.first, y.first)};
#else
return {_mm256_div_ps(x.first, y.first), _mm256_div_ps(x.second, y.second)};
#endif
}

inline float reduce_add_fp32x16(fp32x16 x) {
float reduce_add_fp32x16(fp32x16 x) {
#if __AVX512F__
return _mm512_reduce_add_ps(x);
return {_mm512_reduce_add_ps(x.first)};
#else
const __m256 x256 = _mm256_add_ps(x.first, x.second);
const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x256, 1), _mm256_castps256_ps128(x256));
Expand All @@ -112,46 +115,55 @@ inline float reduce_add_fp32x16(fp32x16 x) {
#endif
}

inline fp32x16 sqrt_fp32x16(fp32x16 x) {
fp32x16 sqrt_fp32x16(fp32x16 x) {
#if __AVX512F__
return _mm512_sqrt_ps(x);
return {_mm512_sqrt_ps(x.first)};
#else
return {_mm256_sqrt_ps(x.first), _mm256_sqrt_ps(x.second)};
#endif
}

inline fp32x16 rsqrt14_fp32x16(fp32x16 x) {
fp32x16 rsqrt14_fp32x16(fp32x16 x) {
#if __AVX512F__
return _mm512_rsqrt14_ps(x);
return {_mm512_rsqrt14_ps(x.first)};
#else
// the max relative error is 6x than avx512
return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)};
#endif
}
inline fp32x16 ceil_fp32x16(fp32x16 x) {
fp32x16 ceil_fp32x16(fp32x16 x) {
#if __AVX512F__
return _mm512_ceil_ps(x);
return {_mm512_ceil_ps(x.first)};
#else
// the max relative error is 6x than avx512
return {_mm256_ceil_ps(x.first), _mm256_ceil_ps(x.second)};
#endif
}

inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) {
fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y) {
#if __AVX512F__
return _mm512_scalef_ps(x, y);
return {_mm512_scalef_ps(x.first, y.first)};
#else
// No intrinsic
assert("No intrinsic");
return {_mm256_rsqrt_ps(x.first), _mm256_rsqrt_ps(x.second)};
float* vec_x = new float[16];
float* vec_y = new float[16];
float* vec_z = new float[16];
store_fp32x16(vec_x, x);
store_fp32x16(vec_y, y);
for (int i = 0; i < 16; i++) vec_z[i] = vec_x[i] * exp2(vec_y[i]);
fp32x16 res = load_fp32x16(vec_z);
delete[] vec_x;
delete[] vec_y;
delete[] vec_z;
return res;
#endif
}

inline float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); }
float dot_fp32x16(fp32x16 x, fp32x16 y) { return reduce_add_fp32x16(mul_fp32x16(x, y)); }

inline fp32x16 abs_fp32x16(fp32x16 x) {
fp32x16 abs_fp32x16(fp32x16 x) {
#if __AVX512F__
return _mm512_abs_ps(x);
return {_mm512_abs_ps(x.first)};
#else
return {_mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.first))),
_mm256_castsi256_ps(_mm256_abs_epi32(_mm256_castps_si256(x.second)))};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,50 +17,50 @@

#include "vec_base.hpp"

inline fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 sub_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(sub_fp32x16, fp32x16, fp32x16, fp32x16);

inline fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
fp32x16 fmsub_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
REGISTER_KERNEL_T(fmsub_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16);

inline fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z);
fp32x16 maskz_fmsub_fp32x16(int mask, fp32x16 x, fp32x16 y, fp32x16 z);

inline fp32x16 add_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 add_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(add_fp32x16, fp32x16, fp32x16, fp32x16);

inline fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
fp32x16 fmadd_fp32x16(fp32x16 x, fp32x16 y, fp32x16 z);
REGISTER_KERNEL_T(fmadd_fp32x16, fp32x16, fp32x16, fp32x16, fp32x16);

inline fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 mul_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(mul_fp32x16, fp32x16, fp32x16, fp32x16);

inline fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y);
fp32x16 maskz_mul_fp32x16(int mask, fp32x16 x, fp32x16 y);

template <int rounding>
inline fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 mul_round_fp32x16(fp32x16 x, fp32x16 y);

inline fp32x16 div_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 div_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(div_fp32x16, fp32x16, fp32x16, fp32x16);

inline float reduce_add_fp32x16(fp32x16 x);
float reduce_add_fp32x16(fp32x16 x);
REGISTER_KERNEL_T(reduce_add_fp32x16, float, fp32x16);

inline fp32x16 sqrt_fp32x16(fp32x16 x);
fp32x16 sqrt_fp32x16(fp32x16 x);
REGISTER_KERNEL_T(sqrt_fp32x16, fp32x16, fp32x16);

inline fp32x16 rsqrt14_fp32x16(fp32x16 x);
fp32x16 rsqrt14_fp32x16(fp32x16 x);
REGISTER_KERNEL_T(rsqrt14_fp32x16, fp32x16, fp32x16);

inline fp32x16 ceil_fp32x16(fp32x16 x);
fp32x16 ceil_fp32x16(fp32x16 x);
REGISTER_KERNEL_T(ceil_fp32x16, fp32x16, fp32x16);

inline fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y);
fp32x16 scale_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(scale_fp32x16, fp32x16, fp32x16, fp32x16);

inline float dot_fp32x16(fp32x16 x, fp32x16 y);
float dot_fp32x16(fp32x16 x, fp32x16 y);
REGISTER_KERNEL_T(dot_fp32x16, float, fp32x16, fp32x16);

inline fp32x16 abs_fp32x16(fp32x16 x);
fp32x16 abs_fp32x16(fp32x16 x);
REGISTER_KERNEL_T(abs_fp32x16, fp32x16, fp32x16);

#endif // ENGINE_EXECUTOR_INCLUDE_VEC_SET_HPP_
Loading

0 comments on commit 2b34485

Please sign in to comment.