-
Notifications
You must be signed in to change notification settings - Fork 4
/
avx.cpp
133 lines (111 loc) · 6.34 KB
/
avx.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include "common.hpp"
/**
 * Build the Xmm operand that carries the same register index as `ymm`.
 *
 * Used by the load-latency benchmarks below, which need a 128-bit view of
 * the destination register in order to move its contents into a
 * general-purpose register with movq.
 */
static Xbyak::Xmm
ymm_to_xmm(Xbyak::Ymm ymm)
{
    const auto reg_index = ymm.getIdx();
    return Xbyak::Xmm(reg_index);
}
/**
 * Emit the AVX, AVX2 and FMA instruction benchmarks.
 *
 * Each group is guarded by the matching feature flag in `info`, so nothing
 * is generated for an instruction set the flag reports as absent.
 *
 * The GEN* macros are defined outside this file (presumably in common.hpp);
 * inside their code arguments `g` is the Xbyak generator and `dst` / `src`
 * are operands supplied by the macro.  Where a macro takes two code
 * sequences, the first is the throughput variant and the second is the
 * latency variant (see the in-line markers on the gather emulations).
 *
 * Memory benchmarks address through rdx.  The latency variants additionally
 * add rdi to the address and finish by moving the loaded value back into
 * rdi/edi, so the next address computation depends on the previous load.
 * NOTE(review): this presumably relies on the buffer at rdx holding zeros so
 * that rdi stays 0 - confirm against the harness.
 */
void
test_avx()
{
    using namespace Xbyak;

    if (info.have_avx) {
        /* ---- 256-bit loads: aligned, +1, +63 and +2MB-1 byte offsets ---- */
        GEN_throughput_only(Ymm, "movaps [mem]",
                            (g->vmovaps(dst, g->ptr[g->rdx])),
                            false, OT_FP32);

        GEN_latency_only(Ymm, "movaps [mem] -> movq",
                         (g->vmovaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, ymm_to_xmm(dst))); ,
                         false, OT_FP32);

        GEN_throughput_only(Ymm, "vmovdqu [mem+1]",
                            (g->vmovdqu(dst, g->ptr[g->rdx + 1])),
                            false, OT_FP32);

        GEN_latency_only(Ymm, "vmovdqu [mem+1] -> movq",
                         (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + 1])); (g->movq(g->rdi, ymm_to_xmm(dst))); ,
                         false, OT_FP32);

        GEN_throughput_only(Ymm, "vmovdqu [mem+63] (cross cache)",
                            (g->vmovdqu(dst, g->ptr[g->rdx + 63])),
                            false, OT_FP32);

        GEN_latency_only(Ymm, "vmovdqu [mem+63] (cross cache) -> movq",
                         (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + 63])); (g->movq(g->rdi, ymm_to_xmm(dst))); ,
                         false, OT_FP32);

        GEN_throughput_only(Ymm, "vmovdqu [mem+2MB-1] (cross page)",
                            (g->vmovdqu(dst, g->ptr[g->rdx + (2048*1024-1)])),
                            false, OT_FP32);

        GEN_latency_only(Ymm, "vmovdqu [mem+2MB-1] (cross page) -> movq",
                         (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + (2048*1024-1)])); (g->movq(g->rdi, ymm_to_xmm(dst))); ,
                         false, OT_FP32);

        /* ---- 256-bit floating-point arithmetic / logic / permute ---- */
        GEN(Ymm, "vxorps", (g->vxorps(dst, dst, src)), false, OT_FP32);
        GEN(Ymm, "vmulps", (g->vmulps(dst, dst, src)), false, OT_FP32);
        GEN(Ymm, "vaddps", (g->vaddps(dst, dst, src)), false, OT_FP32);
        GEN(Ymm, "vdivps", (g->vdivps(dst, dst, src)), false, OT_FP32);
        GEN(Ymm, "vdivpd", (g->vdivpd(dst, dst, src)), false, OT_FP64);
        GEN(Ymm, "vrsqrtps", (g->vrsqrtps(dst, dst)), false, OT_FP32);
        GEN(Ymm, "vrcpps", (g->vrcpps(dst, dst)), false, OT_FP32);
        GEN(Ymm, "vsqrtps", (g->vsqrtps(dst, dst)), false, OT_FP32);
        GEN(Ymm, "vperm2f128", (g->vperm2f128(dst,dst,src,0)), false, OT_FP32);
    }

    if (info.have_avx2) {
        /* ---- 256-bit integer ops and cross-lane permutes ---- */
        GEN(Ymm, "vpxor", (g->vpxor(dst, dst, src)), false, OT_INT);
        GEN(Ymm, "vpaddd", (g->vpaddd(dst, dst, src)), false, OT_INT);
        GEN(Ymm, "vpermps", (g->vpermps(dst, dst, src)), false, OT_FP32);
        GEN(Ymm, "vpermpd", (g->vpermpd(dst, dst, 0)), false, OT_FP64);
        GEN(Ymm, "vpblendvb", (g->vpblendvb(dst, src, src, src)), false, OT_INT);

        /* Fixed registers: the destination is a general-purpose register. */
        GEN_throughput_only(Ymm, "vpmovmskb", (g->vpmovmskb(g->edx,g->ymm0)), false, OT_INT);

        /* Throughput form writes ymm1; latency form writes ymm0, which is
         * also its own source (xmm0), so consecutive copies depend on each
         * other. */
        GEN_latency(Ymm, "vpmovsxwd",
                    (g->vpmovsxwd(g->ymm1,g->xmm0)),
                    (g->vpmovsxwd(g->ymm0,g->xmm0)),
                    false, OT_INT);

        /* Hardware gather.  The latency form copies the gathered result into
         * ymm0, the index register of the gather. */
        GEN_latency(Ymm, "vpgatherdd",
                    (g->vpgatherdd(g->ymm2, g->ptr[g->rdx + g->ymm0*1], g->ymm1)),
                    (g->vpgatherdd(g->ymm2, g->ptr[g->rdx + g->ymm0*1], g->ymm1)); (g->vmovdqa(g->ymm0,g->ymm2)),
                    false, OT_INT);

        /* Software gather of 8 dwords: two xmm halves are each built from
         * one vmovd plus three vpinsrd, then merged into ymm2.
         * Each vpinsrd targets its own lane (1, 2, 3) so that no insert
         * overwrites an earlier one, and vperm2i128 uses 0x20
         * (low lane <- ymm2 low, high lane <- ymm3 low) so that both halves
         * end up in the result. */
        GEN_latency(Ymm, "gather32(<ld+ins>x8 + perm)",
                    /* throughput */
                    (g->vmovd(g->xmm2, g->ptr[g->rdx]));
                    (g->vmovd(g->xmm3, g->ptr[g->rdx]));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 4], 1));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 4], 1));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 8], 2));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 8], 2));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 12], 3));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 12], 3));
                    (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0x20));,
                    /* latency */
                    (g->vmovd(g->xmm2, g->ptr[g->rdx + g->rdi]));
                    (g->vmovd(g->xmm3, g->ptr[g->rdx + g->rdi]));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 4], 1));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 4], 1));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 8], 2));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 8], 2));
                    (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 12], 3));
                    (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 12], 3));
                    (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0x20));
                    (g->vmovd(g->edi, g->xmm2));
                    ,false, OT_FP32);

        GEN_latency(Ymm, "vgatherdpd",
                    (g->vgatherdpd(g->ymm2, g->ptr[g->rdx + g->xmm0*1], g->ymm1)),
                    (g->vgatherdpd(g->ymm2, g->ptr[g->rdx + g->xmm0*1], g->ymm1)); (g->vmovdqa(g->ymm0,g->ymm2)),
                    false, OT_INT);

        /* Software gather of 4 qwords: two xmm halves are each built from
         * one vmovq plus one vpinsrq, then merged into ymm2.
         * Both halves use the 64-bit vpinsrq (the xmm3 half previously used
         * the 32-bit vpinsrd), the latency inserts include rdi in their
         * address like every other load of the latency sequence, and
         * vperm2i128 uses 0x20 so that ymm3 contributes the high lane. */
        GEN_latency(Ymm, "gather64(<ld+ins>x4 + perm)",
                    /* throughput */
                    (g->vmovq(g->xmm2, g->ptr[g->rdx]));
                    (g->vmovq(g->xmm3, g->ptr[g->rdx]));
                    (g->vpinsrq(g->xmm2, g->xmm2, g->ptr[g->rdx + 8], 1));
                    (g->vpinsrq(g->xmm3, g->xmm3, g->ptr[g->rdx + 8], 1));
                    (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0x20));,
                    /* latency */
                    (g->vmovq(g->xmm2, g->ptr[g->rdx + g->rdi]));
                    (g->vmovq(g->xmm3, g->ptr[g->rdx + g->rdi]));
                    (g->vpinsrq(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 8], 1));
                    (g->vpinsrq(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 8], 1));
                    (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0x20));
                    (g->vmovd(g->edi, g->xmm2));,
                    false, OT_FP32);

        GEN(Ymm, "vpshufb", (g->vpshufb(dst, src, src)), false, OT_INT);
    }

    if (info.have_fma) {
        /* Fused multiply-add, 256-bit then 128-bit, single then double. */
        GEN(Ymm, "vfmaps", (g->vfmadd132ps(dst, src, src)), false, OT_FP32);
        GEN(Ymm, "vfmapd", (g->vfmadd132pd(dst, src, src)), false, OT_FP64);
        GEN(Xmm, "vfmaps", (g->vfmadd132ps(dst, src, src)), false, OT_FP32);
        GEN(Xmm, "vfmapd", (g->vfmadd132pd(dst, src, src)), false, OT_FP64);
    }
}