Skip to content

Commit 1c4235d

Browse files
committed
minor corrections and added some support for llvm-mca profiler
1 parent a7ae4d3 commit 1c4235d

File tree

6 files changed

+60
-23
lines changed

6 files changed

+60
-23
lines changed

simd_test.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6059,13 +6059,13 @@ printf("\n");
60596059

60606060
clock_gettime(CLOCK_REALTIME, &start);
60616061
for (l = 0; l < loop; l++)
6062-
tan256d(inoutd, inoutd2, len);
6062+
tan512d(inoutd, inoutd2, len);
60636063
clock_gettime(CLOCK_REALTIME, &stop);
60646064
elapsed = ((stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) * 1e-3) / (double) loop;
60656065
printf("tan512d %d %lf\n", len, elapsed);
60666066

60676067
l2_errd(inoutd_ref, inoutd2, len);
6068-
// for(int i = 0; i < len; i++) printf("%lf %lf %lf \n",inoutd[i],inoutd_ref[i],inoutd2[i]);
6068+
// for(int i = 0; i < len; i++) printf("%lf %lf %lf \n",inoutd[i],inoutd_ref[i],inoutd2[i]);
60696069
#endif
60706070

60716071
printf("\n");

simd_utils_avx512_float.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2493,6 +2493,10 @@ static inline void tanh512f(float *src, float *dst, int len)
24932493
#if 1
24942494
static inline v16sf tan512f_ps(v16sf xx)
24952495
{
2496+
#ifdef LLVMMCA
2497+
__asm volatile("# LLVM-MCA-BEGIN tan512f_ps" ::
2498+
: "memory");
2499+
#endif
24962500
v16sf x, y, z, zz;
24972501
v16si j; // long?
24982502
__mmask16 sign, xsupem4;
@@ -2537,7 +2541,10 @@ static inline v16sf tan512f_ps(v16sf xx)
25372541

25382542
sign = _mm512_cmp_ps_mask(xx, _mm512_setzero_ps(), _CMP_LT_OS); // one mask bit set per lane where xx < 0.0
25392543
y = _mm512_mask_blend_ps(sign, y, _mm512_xor_ps(*(v16sf *) _ps512_neg_sign_mask, y));
2540-
2544+
#ifdef LLVMMCA
2545+
__asm volatile("# LLVM-MCA-END tan512f_ps" ::
2546+
: "memory");
2547+
#endif
25412548
return (y);
25422549
}
25432550

simd_utils_avx_float.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
4747

4848
static inline v8sf log10256_ps(v8sf x)
4949
{
50+
#ifdef LLVMMCA
51+
__asm volatile("# LLVM-MCA-BEGIN log10256_ps" ::
52+
: "memory");
53+
#endif
5054
v8si imm0;
5155
v8sf one = *(v8sf *) _ps256_1;
5256

@@ -97,6 +101,10 @@ static inline v8sf log10256_ps(v8sf x)
97101
x = _mm256_fmadd_ps_custom(e, *(v8sf *) _ps256_cephes_L102A, z);
98102

99103
x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
104+
#ifdef LLVMMCA
105+
__asm volatile("# LLVM-MCA-END log10256_ps" ::
106+
: "memory");
107+
#endif
100108
return x;
101109
}
102110

@@ -2745,6 +2753,10 @@ static inline void tanh256f(float *src, float *dst, int len)
27452753
#if 1
27462754
static inline v8sf tan256f_ps(v8sf xx)
27472755
{
2756+
#ifdef LLVMMCA
2757+
__asm volatile("# LLVM-MCA-BEGIN tan256f_ps" ::
2758+
: "memory");
2759+
#endif
27482760
v8sf x, y, z, zz;
27492761
v8si j; // long?
27502762
v8sf sign, xsupem4;
@@ -2821,7 +2833,10 @@ static inline v8sf tan256f_ps(v8sf xx)
28212833

28222834
sign = _mm256_cmp_ps(xx, _mm256_setzero_ps(), _CMP_LT_OS); // 0xFFFFFFFF if xx < 0.0
28232835
y = _mm256_blendv_ps(y, _mm256_xor_ps(*(v8sf *) _ps256_neg_sign_mask, y), sign);
2824-
2836+
#ifdef LLVMMCA
2837+
__asm volatile("# LLVM-MCA-END tan256f_ps" ::
2838+
: "memory");
2839+
#endif
28252840
return (y);
28262841
}
28272842

simd_utils_constants.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,8 @@ typedef float32x4x2_t v4sfx2;
185185

186186
#else
187187

188-
typedef __m128 v4sf; // vector of 4 float (sse1)
189-
typedef __m128i v4si; // vector of 4 int (sse2)
190-
typedef __m128i v2sid; // vector of 2 int64 (sse2)
188+
typedef __m128 v4sf; // vector of 4 float (sse1)
189+
typedef __m128i v4si; // vector of 4 int (sse2)
191190
typedef struct {
192191
v4sf val[2];
193192
} v4sfx2;
@@ -207,8 +206,8 @@ typedef struct {
207206
#define ROUNDTOCEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
208207
#define ROUNDTOZERO (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
209208

210-
typedef __m128d v2sd; // vector of 2 double (sse)
211-
typedef __m128i v2si; // vector of 2 int 64 (sse)
209+
typedef __m128d v2sd; // vector of 2 double (sse)
210+
typedef __m128i v2sid; // vector of 2 int64 (sse2)
212211

213212
typedef struct {
214213
v2sd val[2];
@@ -1257,7 +1256,7 @@ typedef __vector char v16s8;
12571256
#endif
12581257

12591258
/// PRINT FUNCTIONS */
1260-
#if 1
1259+
#if 0
12611260

12621261
#ifdef SSE
12631262
/*

simd_utils_sse_double.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ static inline void vectorSlope128d(double *dst, int len, double offset, double s
460460
// in SSE, missing _mm_cvtepi64_pd, _mm_cvttpd_epi64
461461
// See : https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
462462

463-
static inline v2sd _mm_cvtepi64_pd_custom(v2si x)
463+
static inline v2sd _mm_cvtepi64_pd_custom(v2sid x)
464464
{
465465
#if 0
466466
//Signed
@@ -473,7 +473,7 @@ static inline v2sd _mm_cvtepi64_pd_custom(v2si x)
473473
#endif
474474
}
475475

476-
static inline v2si _mm_cvttpd_epi64_custom(v2sd x)
476+
static inline v2sid _mm_cvttpd_epi64_custom(v2sd x)
477477
{
478478
// Signed
479479
#if 0
@@ -495,7 +495,7 @@ static inline void sincos_pd(v2sd x, v2sd *s, v2sd *c)
495495
{
496496
v2sd xmm1, xmm2, xmm3 = _mm_setzero_pd(), sign_bit_sin, y;
497497

498-
v2si emm0, emm2, emm4;
498+
v2sid emm0, emm2, emm4;
499499

500500
sign_bit_sin = x;
501501
/* take the absolute value */
@@ -510,21 +510,21 @@ static inline void sincos_pd(v2sd x, v2sd *s, v2sd *c)
510510
/* store the integer part of y in emm2 */
511511
emm2 = _mm_cvttpd_epi64_custom(y);
512512
/* j=(j+1) & (~1) (see the cephes sources) */
513-
emm2 = _mm_add_epi64(emm2, *(v2si *) _pi64_1);
513+
emm2 = _mm_add_epi64(emm2, *(v2sid *) _pi64_1);
514514

515-
emm2 = _mm_and_si128(emm2, *(v2si *) _pi64_inv1);
515+
emm2 = _mm_and_si128(emm2, *(v2sid *) _pi64_inv1);
516516
y = _mm_cvtepi64_pd_custom(emm2);
517517
emm4 = emm2;
518518

519519
/* get the swap sign flag for the sine */
520-
emm0 = _mm_and_si128(emm2, *(v2si *) _pi64_4);
520+
emm0 = _mm_and_si128(emm2, *(v2sid *) _pi64_4);
521521
// print2i(emm0);
522522
emm0 = _mm_slli_epi64(emm0, 61);
523523
// print2i(emm0);
524524
v2sd swap_sign_bit_sin = _mm_castsi128_pd(emm0);
525525

526526
/* get the polynomial selection mask for the sine */
527-
emm2 = _mm_and_si128(emm2, *(v2si *) _pi64_2);
527+
emm2 = _mm_and_si128(emm2, *(v2sid *) _pi64_2);
528528
// SSE4.1 (_mm_cmpeq_epi64)
529529
emm2 = _mm_cmpeq_epi64(emm2, _mm_setzero_si128());
530530
v2sd poly_mask = _mm_castsi128_pd(emm2);
@@ -535,8 +535,8 @@ static inline void sincos_pd(v2sd x, v2sd *s, v2sd *c)
535535
x = _mm_fmadd_pd_custom(y, *(v2sd *) _pd_minus_cephes_DP2, x);
536536
x = _mm_fmadd_pd_custom(y, *(v2sd *) _pd_minus_cephes_DP3, x);
537537

538-
emm4 = _mm_sub_epi64(emm4, *(v2si *) _pi64_2);
539-
emm4 = _mm_andnot_si128(emm4, *(v2si *) _pi64_4);
538+
emm4 = _mm_sub_epi64(emm4, *(v2sid *) _pi64_2);
539+
emm4 = _mm_andnot_si128(emm4, *(v2sid *) _pi64_4);
540540
emm4 = _mm_slli_epi64(emm4, 61);
541541
v2sd sign_bit_cos = _mm_castsi128_pd(emm4);
542542

@@ -977,7 +977,7 @@ static inline v2sd exp_pd(v2sd x)
977977
{
978978
v2sd tmp = _mm_setzero_pd(), fx;
979979

980-
v2si emm0;
980+
v2sid emm0;
981981

982982
v2sd one = *(v2sd *) _pd_1;
983983
v2sd two = *(v2sd *) _pd_2;
@@ -1015,7 +1015,7 @@ static inline v2sd exp_pd(v2sd x)
10151015

10161016
/* build 2^n */
10171017
emm0 = _mm_cvttpd_epi64_custom(fx);
1018-
emm0 = _mm_add_epi64(emm0, *(v2si *) _pi64_0x7f);
1018+
emm0 = _mm_add_epi64(emm0, *(v2sid *) _pi64_0x7f);
10191019
emm0 = _mm_slli_epi64(emm0, 52);
10201020
v2sd pow2n = _mm_castsi128_pd(emm0);
10211021

@@ -1025,7 +1025,7 @@ static inline v2sd exp_pd(v2sd x)
10251025

10261026
static inline v2sd log_pd(v2sd x)
10271027
{
1028-
v2si emm0;
1028+
v2sid emm0;
10291029
v2sd one = *(v2sd *) _pd_1;
10301030

10311031
v2sd invalid_mask = _mm_cmple_pd(x, _mm_setzero_pd());

simd_utils_sse_float.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@
2020

2121
static inline v4sf log10_ps(v4sf x)
2222
{
23+
#ifdef LLVMMCA
24+
__asm volatile("# LLVM-MCA-BEGIN log10_ps" ::
25+
: "memory");
26+
#endif
27+
2328
v4si emm0;
2429
v4sf one = *(v4sf *) _ps_1;
2530
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
@@ -62,6 +67,10 @@ static inline v4sf log10_ps(v4sf x)
6267
x = _mm_fmadd_ps_custom(e, *(v4sf *) _ps_cephes_L102A, z);
6368

6469
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
70+
#ifdef LLVMMCA
71+
__asm volatile("# LLVM-MCA-END log10_ps" ::
72+
: "memory");
73+
#endif
6574
return x;
6675
}
6776

@@ -2694,6 +2703,10 @@ static inline void tanh128f(float *src, float *dst, int len)
26942703

26952704
static inline v4sf tanf_ps(v4sf xx)
26962705
{
2706+
#ifdef LLVMMCA
2707+
__asm volatile("# LLVM-MCA-BEGIN tanf_ps" ::
2708+
: "memory");
2709+
#endif
26972710
v4sf x, y, z, zz;
26982711
v4si j; // long?
26992712
v4sf sign, xsupem4;
@@ -2740,7 +2753,10 @@ static inline v4sf tanf_ps(v4sf xx)
27402753
// xor(rcp(y)) gives not good enough result
27412754
y = _mm_blendv_ps(y, _mm_div_ps(*(v4sf *) _ps_min1, y), (v4sf) (jandtwo));
27422755
y = _mm_xor_ps(y, sign);
2743-
2756+
#ifdef LLVMMCA
2757+
__asm volatile("# LLVM-MCA-END tanf_ps" ::
2758+
: "memory");
2759+
#endif
27442760
return (y);
27452761
}
27462762

0 commit comments

Comments
 (0)