-
Notifications
You must be signed in to change notification settings - Fork 13
/
valgrind.fma_arm64.diff
206 lines (201 loc) · 8.31 KB
/
valgrind.fma_arm64.diff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 97b094175..c4490c3fa 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -39,11 +39,6 @@
FMULX is treated the same as FMUL. That's also not correct.
- * Floating multiply-add (etc) insns. Are split into a multiply and
- an add, and so suffer double rounding and hence sometimes the
- least significant mantissa bit is incorrect. Fix: use the IR
- multiply-add IROps instead.
-
* FRINTX might be need updating to set the inexact computation FPSR flag
* Ditto FCVTXN. No idea what "round to odd" means. This implementation
@@ -2224,6 +2219,108 @@ static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
}
+/* Helper to unroll FMA*/
+static void unroll2_V128(IRExpr* aExpr, IRTemp* ai_F64){
+ IRTemp ai_I64[2];
+ for(int i=0; i<2; i++) ai_I64[i]= newTemp(Ity_I64);
+
+ assign(ai_I64[0], unop(Iop_V128to64, aExpr ));
+ assign(ai_I64[1], unop(Iop_V128HIto64, aExpr ));
+
+ for(int i=0; i<2; i++){
+ ai_F64[i]= newTemp(Ity_F64);
+ assign(ai_F64[i], unop(Iop_ReinterpI64asF64, mkexpr(ai_I64[i]) ));
+ }
+}
+
+static void unroll4_V128(IRExpr* aExpr, IRTemp* ai_F32){
+ IRTemp a64_LO = newTemp(Ity_I64);
+ IRTemp a64_HI = newTemp(Ity_I64);
+ assign(a64_LO, unop(Iop_V128to64, aExpr ));
+ assign(a64_HI, unop(Iop_V128HIto64, aExpr ));
+
+ IRTemp ai_I32[4];
+ for(int i=0; i<4 ; i++) ai_I32[i]=newTemp(Ity_I32);
+
+ assign(ai_I32[0], unop(Iop_64to32, mkexpr(a64_LO) ));
+ assign(ai_I32[1], unop(Iop_64HIto32, mkexpr(a64_LO) ));
+ assign(ai_I32[2], unop(Iop_64to32, mkexpr(a64_HI) ));
+ assign(ai_I32[3], unop(Iop_64HIto32, mkexpr(a64_HI) ));
+
+ for(int i=0; i<4 ; i++){
+ ai_F32[i]=newTemp(Ity_F32);
+ assign(ai_F32[i], unop(Iop_ReinterpI32asF32, mkexpr(ai_I32[i]) ));
+ }
+}
+
+static IRTemp unroll_vectorized2_fma(Bool isSub, IRExpr* rm, IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+ //a*b+c
+ //rm : rounding mode
+
+ // unroll arguments
+ IRTemp ai_F64[2], bi_F64[2], ci_F64[2];
+ unroll2_V128(aExpr,ai_F64);
+ unroll2_V128(bExpr,bi_F64);
+ unroll2_V128(cExpr,ci_F64);
+
+ // fma loop with type conversion (I32->F32 and F32->I32 )
+ IRTemp ri_F64[2],ri_I64[2];
+ IROp opMaddOrSub = isSub ? Iop_MSubF64 : Iop_MAddF64;
+ for(int i=0; i<2; i++){
+ ri_F64[i]= newTemp(Ity_F64);
+ //apply fma or fms
+ assign(ri_F64[i], qop( opMaddOrSub, rm,
+ mkexpr(ai_F64[i]), mkexpr(bi_F64[i]), mkexpr(ci_F64[i])));
+ ri_I64[i]=newTemp(Ity_I64);
+ assign(ri_I64[i], unop(Iop_ReinterpF64asI64, mkexpr(ri_F64[i])));
+ }
+
+ //roll up the results
+ IRTemp res = newTempV128();
+ assign(res, binop(Iop_64HLtoV128, mkexpr(ri_I64[1]), mkexpr(ri_I64[0])));
+ return res;
+}
+
+
+static IRTemp unroll_vectorized4_fma(Bool isSub, IRExpr* rm, IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+ //a*b+c rm : rounding mode
+
+ // unroll arguments
+ IRTemp ai_F32[4],bi_F32[4],ci_F32[4];
+ unroll4_V128(aExpr, ai_F32);
+ unroll4_V128(bExpr, bi_F32);
+ unroll4_V128(cExpr, ci_F32);
+
+ // fma loop with type conversion (I32->F32 and F32->I32 )
+ IRTemp ri_F32[4],ri_I32[4];
+ IROp opMaddOrSub = isSub ? Iop_MSubF32 : Iop_MAddF32;
+ for(int i=0; i<4 ; i++){
+ ri_F32[i]=newTemp(Ity_F32);
+ //apply fma or fms
+ assign(ri_F32[i], qop( opMaddOrSub, rm,
+ mkexpr(ai_F32[i]),mkexpr(bi_F32[i]),mkexpr(ci_F32[i])));
+ ri_I32[i]=newTemp(Ity_I32);
+ assign(ri_I32[i], unop(Iop_ReinterpF32asI32, mkexpr(ri_F32[i])));
+ }
+
+ //roll up the results
+ IRTemp resLo = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I64);
+ assign(resLo, binop(Iop_32HLto64, mkexpr(ri_I32[1]), mkexpr(ri_I32[0])));
+ assign(resHi, binop(Iop_32HLto64, mkexpr(ri_I32[3]), mkexpr(ri_I32[2])));
+ IRTemp res = newTempV128();
+ assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
+ return res;
+}
+
+static IRTemp unroll_vectorized_fma(UInt unrollSize, Bool isSub, IRExpr* rm,
+ IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+ vassert(unrollSize == 2 || unrollSize == 4 );
+ if(unrollSize==2){
+ return unroll_vectorized2_fma(isSub, rm, aExpr, bExpr,cExpr);
+ }
+ return unroll_vectorized4_fma(isSub, rm, aExpr, bExpr,cExpr);
+}
/*------------------------------------------------------------*/
/*--- FP comparison helpers ---*/
@@ -11488,19 +11585,17 @@ Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
UInt mm = (bitM << 4) | mmLO4;
assign(elem, getQRegLane(mm, index, ity));
IRTemp dupd = math_DUP_TO_V128(elem, ity);
- IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
- IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
- IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
IRTemp rm = mk_get_IR_rounding_mode();
- IRTemp t1 = newTempV128();
- IRTemp t2 = newTempV128();
- // FIXME: double rounding; use FMA primops instead
- assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
- assign(t2, triop(isSUB ? opSUB : opADD,
- mkexpr(rm), getQReg128(dd), mkexpr(t1)));
+
+ UInt unrollSize= (isD ? 2 : 4);
+ IRTemp res=unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+ getQReg128(nn),
+ mkexpr(dupd),
+ getQReg128(dd));
+
putQReg128(dd,
mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
- mkexpr(t2))));
+ mkexpr(res))));
const HChar c = isD ? 'd' : 's';
DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
c, dd, c, nn, nameQReg128(mm), c, index);
@@ -12945,18 +13040,15 @@ Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
Bool isD = (size & 1) == 1;
Bool isSUB = (size & 2) == 2;
if (bitQ == 0 && isD) return False; // implied 1d case
- IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
- IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
- IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
IRTemp rm = mk_get_IR_rounding_mode();
- IRTemp t1 = newTempV128();
- IRTemp t2 = newTempV128();
- // FIXME: double rounding; use FMA primops instead
- assign(t1, triop(opMUL,
- mkexpr(rm), getQReg128(nn), getQReg128(mm)));
- assign(t2, triop(isSUB ? opSUB : opADD,
- mkexpr(rm), getQReg128(dd), mkexpr(t1)));
- putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
+
+ UInt unrollSize= (isD ? 2 : 4);
+ IRTemp res= unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+ getQReg128(nn),
+ getQReg128(mm),
+ getQReg128(dd));
+
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
@@ -14159,17 +14251,15 @@ Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
UInt mm = (bitM << 4) | mmLO4;
assign(elem, getQRegLane(mm, index, ity));
IRTemp dupd = math_DUP_TO_V128(elem, ity);
- IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
- IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
- IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
IRTemp rm = mk_get_IR_rounding_mode();
- IRTemp t1 = newTempV128();
- IRTemp t2 = newTempV128();
- // FIXME: double rounding; use FMA primops instead
- assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
- assign(t2, triop(isSUB ? opSUB : opADD,
- mkexpr(rm), getQReg128(dd), mkexpr(t1)));
- putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
+
+ UInt unrollSize= (isD ? 2 : 4);
+ IRTemp res=unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+ getQReg128(nn),
+ mkexpr(dupd),
+ getQReg128(dd));
+
+ putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),