Skip to content

Commit f38e968

Browse files
committed
[dev.simd] cmd/compile: zero only the low 128 bits of X15
Zeroing the upper part of X15 may make the CPU think it is "dirty" and slow down SSE operations. For now, just don't zero the upper part, and construct a zero value on the fly if we need a 256- or 512-bit zero value. Maybe VZEROUPPER works better than explicitly zeroing X15, but we need to evaluate. Long term, we probably want to move more things from SSE to AVX. This essentially undoes CL 698237 and CL 698238, except that we keep using X15 for 128-bit zeroing for SIMD. Change-Id: I1564e6332c4c57f9721397c92c7c734c5497534c Reviewed-on: https://go-review.googlesource.com/c/go/+/728240 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: David Chase <[email protected]>
1 parent 144cf17 commit f38e968

File tree

12 files changed

+25
-102
lines changed

12 files changed

+25
-102
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import (
1818
"cmd/internal/obj"
1919
"cmd/internal/obj/x86"
2020
"internal/abi"
21-
"internal/buildcfg"
2221
)
2322

2423
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1718,7 +1717,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
17181717
case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
17191718
s.Prog(v.Op.Asm())
17201719

1721-
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted
1720+
case ssa.OpAMD64Zero128: // no code emitted
1721+
1722+
case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
1723+
p := s.Prog(v.Op.Asm())
1724+
p.From.Type = obj.TYPE_REG
1725+
p.From.Reg = simdReg(v)
1726+
p.AddRestSourceReg(simdReg(v))
1727+
p.To.Type = obj.TYPE_REG
1728+
p.To.Reg = simdReg(v)
17221729

17231730
case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
17241731
// These are for initializing the least 32/64 bits of a SIMD register from a "float".
@@ -1871,34 +1878,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
18711878

18721879
// zeroX15 zeroes the X15 register.
18731880
func zeroX15(s *ssagen.State) {
1874-
if !buildcfg.Experiment.SIMD {
1875-
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
1876-
return
1877-
}
1878-
vxorps := func(s *ssagen.State) {
1879-
p := s.Prog(x86.AVXORPS)
1880-
p.From.Type = obj.TYPE_REG
1881-
p.From.Reg = x86.REG_X15
1882-
p.AddRestSourceReg(x86.REG_X15)
1883-
p.To.Type = obj.TYPE_REG
1884-
p.To.Reg = x86.REG_X15
1885-
}
1886-
if buildcfg.GOAMD64 >= 3 {
1887-
vxorps(s)
1888-
return
1889-
}
1890-
// AVX may not be available, check before zeroing the high bits.
1891-
p := s.Prog(x86.ACMPB)
1892-
p.From.Type = obj.TYPE_MEM
1893-
p.From.Name = obj.NAME_EXTERN
1894-
p.From.Sym = ir.Syms.X86HasAVX
1895-
p.To.Type = obj.TYPE_CONST
1896-
p.To.Offset = 1
1897-
jmp := s.Prog(x86.AJNE)
1898-
jmp.To.Type = obj.TYPE_BRANCH
1899-
vxorps(s)
1900-
sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
1901-
jmp.To.SetTarget(sse)
1881+
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
19021882
}
19031883

19041884
// Example instruction: VRSQRTPS X1, X1

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ func init() {
214214
vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
215215
vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
216216

217+
v01 = regInfo{inputs: nil, outputs: vonly}
217218
v11 = regInfo{inputs: vonly, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
218219
v21 = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
219220
vk = regInfo{inputs: vzonly, outputs: maskonly}
@@ -232,6 +233,7 @@ func init() {
232233
gpv = regInfo{inputs: []regMask{gp}, outputs: vonly}
233234
v2flags = regInfo{inputs: []regMask{vz, vz}}
234235

236+
w01 = regInfo{inputs: nil, outputs: wonly}
235237
w11 = regInfo{inputs: wonly, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
236238
w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
237239
wk = regInfo{inputs: wzonly, outputs: maskonly}
@@ -1398,12 +1400,15 @@ func init() {
13981400
{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
13991401
{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
14001402

1403+
// X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
14011404
{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
1402-
{name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
1403-
{name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
1405+
{name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"},
1406+
{name: "Zero512", argLength: 0, reg: w01, asm: "VPXORQ"},
14041407

1408+
// Move a 32/64 bit float to a 128-bit SIMD register.
14051409
{name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"},
14061410
{name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"},
1411+
14071412
{name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"},
14081413
{name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"},
14091414

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 8 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/runtime/asm_amd64.s

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,11 +1093,6 @@ needm:
10931093
// there's no need to handle that. Clear R14 so that there's
10941094
// a bad value in there, in case needm tries to use it.
10951095
XORPS X15, X15
1096-
#ifdef GOEXPERIMENT_simd
1097-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
1098-
JNE 2(PC)
1099-
VXORPS X15, X15, X15
1100-
#endif
11011096
XORQ R14, R14
11021097
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
11031098
CALL AX
@@ -1795,11 +1790,6 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
17951790
get_tls(R14)
17961791
MOVQ g(R14), R14
17971792
XORPS X15, X15
1798-
#ifdef GOEXPERIMENT_simd
1799-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
1800-
JNE 2(PC)
1801-
VXORPS X15, X15, X15
1802-
#endif
18031793
JMP ·sigpanic<ABIInternal>(SB)
18041794

18051795
// gcWriteBarrier informs the GC about heap pointer writes.

src/runtime/race_amd64.s

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -456,11 +456,6 @@ call:
456456
// Back to Go world, set special registers.
457457
// The g register (R14) is preserved in C.
458458
XORPS X15, X15
459-
#ifdef GOEXPERIMENT_simd
460-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
461-
JNE 2(PC)
462-
VXORPS X15, X15, X15
463-
#endif
464459
RET
465460

466461
// C->Go callback thunk that allows to call runtime·racesymbolize from C code.

src/runtime/sys_darwin_amd64.s

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
177177
get_tls(R12)
178178
MOVQ g(R12), R14
179179
PXOR X15, X15
180-
#ifdef GOEXPERIMENT_simd
181-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
182-
JNE 2(PC)
183-
VXORPS X15, X15, X15
184-
#endif
185180

186181
// Reserve space for spill slots.
187182
NOP SP // disable vet stack checking

src/runtime/sys_dragonfly_amd64.s

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
228228
get_tls(R12)
229229
MOVQ g(R12), R14
230230
PXOR X15, X15
231-
#ifdef GOEXPERIMENT_simd
232-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
233-
JNE 2(PC)
234-
VXORPS X15, X15, X15
235-
#endif
236231

237232
// Reserve space for spill slots.
238233
NOP SP // disable vet stack checking

src/runtime/sys_freebsd_amd64.s

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -265,11 +265,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
265265
get_tls(R12)
266266
MOVQ g(R12), R14
267267
PXOR X15, X15
268-
#ifdef GOEXPERIMENT_simd
269-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
270-
JNE 2(PC)
271-
VXORPS X15, X15, X15
272-
#endif
273268

274269
// Reserve space for spill slots.
275270
NOP SP // disable vet stack checking
@@ -295,11 +290,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
295290
get_tls(R12)
296291
MOVQ g(R12), R14
297292
PXOR X15, X15
298-
#ifdef GOEXPERIMENT_simd
299-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
300-
JNE 2(PC)
301-
VXORPS X15, X15, X15
302-
#endif
303293

304294
// Reserve space for spill slots.
305295
NOP SP // disable vet stack checking

src/runtime/sys_linux_amd64.s

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -352,11 +352,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
352352
get_tls(R12)
353353
MOVQ g(R12), R14
354354
PXOR X15, X15
355-
#ifdef GOEXPERIMENT_simd
356-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
357-
JNE 2(PC)
358-
VXORPS X15, X15, X15
359-
#endif
360355

361356
// Reserve space for spill slots.
362357
NOP SP // disable vet stack checking
@@ -382,11 +377,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
382377
get_tls(R12)
383378
MOVQ g(R12), R14
384379
PXOR X15, X15
385-
#ifdef GOEXPERIMENT_simd
386-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
387-
JNE 2(PC)
388-
VXORPS X15, X15, X15
389-
#endif
390380

391381
// Reserve space for spill slots.
392382
NOP SP // disable vet stack checking

src/runtime/sys_netbsd_amd64.s

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -310,11 +310,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
310310
get_tls(R12)
311311
MOVQ g(R12), R14
312312
PXOR X15, X15
313-
#ifdef GOEXPERIMENT_simd
314-
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
315-
JNE 2(PC)
316-
VXORPS X15, X15, X15
317-
#endif
318313

319314
// Reserve space for spill slots.
320315
NOP SP // disable vet stack checking

0 commit comments

Comments
 (0)