Skip to content

Commit bfb237d

Browse files
authored
Fix SIMD register destruction (ocaml-flambda#2311)
* simd destroyed_at_oper * destroyed_at_basic/max_register_pressure * enable more warnings
1 parent 3171d50 commit bfb237d

13 files changed

+191
-122
lines changed

backend/amd64/emit.mlp

+1-1
Original file line numberDiff line numberDiff line change
@@ -963,7 +963,7 @@ let emit_atomic instr op (size : Cmm.atomic_bitwidth) addr =
963963
I.movzx res8 res
964964

965965
let emit_simd_instr op i =
966-
(match Simd_selection.register_behavior op with
966+
(match Simd_proc.register_behavior op with
967967
| R_to_fst ->
968968
assert (Reg.same_loc i.arg.(0) i.res.(0));
969969
assert (Reg.is_reg i.arg.(0))

backend/amd64/proc.ml

+40-8
Original file line numberDiff line numberDiff line change
@@ -170,15 +170,16 @@ let phys_reg ty n =
170170

171171
let rax = phys_reg Int 0
172172
let rdx = phys_reg Int 4
173+
let rcx = phys_reg Int 5
173174
let r10 = phys_reg Int 10
174175
let r11 = phys_reg Int 11
175176
let rbp = phys_reg Int 12
176177

177178
(* CSE needs to know that all versions of xmm15 are destroyed. *)
178-
let destroy_xmm15 () =
179+
let destroy_xmm n =
179180
if Language_extension.is_enabled SIMD
180-
then [| phys_reg Float 115; phys_reg Vec128 115 |]
181-
else [| phys_reg Float 115 |]
181+
then [| phys_reg Float (100 + n); phys_reg Vec128 (100 + n) |]
182+
else [| phys_reg Float (100 + n) |]
182183

183184
let destroyed_by_plt_stub =
184185
if not X86_proc.use_plt then [| |] else [| r10; r11 |]
@@ -399,6 +400,21 @@ let destroyed_at_pushtrap =
399400
let has_pushtrap traps =
400401
List.exists (function Cmm.Push _ -> true | Pop _ -> false) traps
401402

403+
let destroyed_by_simd_op op =
404+
match Simd_proc.register_behavior op with
405+
| R_RM_rax_rdx_to_xmm0
406+
| R_RM_to_xmm0 -> destroy_xmm 0
407+
| R_RM_rax_rdx_to_rcx
408+
| R_RM_to_rcx -> [| rcx |]
409+
| R_to_fst
410+
| R_to_R
411+
| R_to_RM
412+
| RM_to_R
413+
| R_R_to_fst
414+
| R_RM_to_fst
415+
| R_RM_to_R
416+
| R_RM_xmm0_to_fst -> [||]
417+
402418
(* note: keep this function in sync with `destroyed_at_{basic,terminator}` below. *)
403419
let destroyed_at_oper = function
404420
Iop(Icall_ind | Icall_imm _) ->
@@ -410,7 +426,7 @@ let destroyed_at_oper = function
410426
| Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _))
411427
-> [| rax; rdx |]
412428
| Iop(Istore(Single, _, _))
413-
-> destroy_xmm15 ()
429+
-> destroy_xmm 15
414430
| Iop(Ialloc _ | Ipoll _) -> destroyed_at_alloc_or_poll
415431
| Iop(Iintop(Imulh _ | Icomp _) | Iintop_imm((Icomp _), _))
416432
-> [| rax |]
@@ -420,9 +436,10 @@ let destroyed_at_oper = function
420436
| Ireturn traps when has_pushtrap traps -> assert false
421437
| Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax; rdx |]
422438
| Iop(Ispecific(Ilfence | Isfence | Imfence)) -> [||]
439+
| Iop(Ispecific(Isimd op)) -> destroyed_by_simd_op op
423440
| Iop(Ispecific(Isextend32 | Izextend32 | Ilea _
424441
| Istore_int (_, _, _) | Ioffset_loc (_, _)
425-
| Ipause | Iprefetch _ | Isimd _
442+
| Ipause | Iprefetch _
426443
| Ifloatarithmem (_, _) | Ifloatsqrtf _ | Ibswap _))
427444
| Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
428445
| Ipopcnt | Iclz _ | Ictz _ ))
@@ -465,14 +482,15 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
465482
| Op (Intop (Idiv | Imod)) | Op (Intop_imm ((Idiv | Imod), _)) ->
466483
[| rax; rdx |]
467484
| Op(Store(Single, _, _)) ->
468-
destroy_xmm15 ()
485+
destroy_xmm 15
469486
| Op(Intop(Imulh _ | Icomp _) | Intop_imm((Icomp _), _)) ->
470487
[| rax |]
471488
| Op (Specific (Irdtsc | Irdpmc)) ->
472489
[| rax; rdx |]
473490
| Op Poll -> destroyed_at_alloc_or_poll
474491
| Op (Alloc _) ->
475492
destroyed_at_alloc_or_poll
493+
| Op (Specific (Isimd op)) -> destroyed_by_simd_op op
476494
| Op (Move | Spill | Reload
477495
| Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
478496
| Stackoffset _
@@ -497,7 +515,7 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
497515
| Begin_region
498516
| End_region
499517
| Specific (Ilea _ | Istore_int _ | Ioffset_loc _
500-
| Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _ | Isimd _
518+
| Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _
501519
| Isextend32 | Izextend32 | Ipause
502520
| Iprefetch _ | Ilfence | Isfence | Imfence)
503521
| Name_for_debugger _ | Dls_get)
@@ -596,6 +614,20 @@ let max_register_pressure =
596614
consumes ~int:1 ~float:0
597615
| Istore(Single, _, _) | Icompf _ ->
598616
consumes ~int:0 ~float:1
617+
| Ispecific(Isimd op) ->
618+
(match Simd_proc.register_behavior op with
619+
| R_RM_rax_rdx_to_xmm0
620+
| R_RM_to_xmm0 -> consumes ~int:0 ~float:1
621+
| R_RM_rax_rdx_to_rcx
622+
| R_RM_to_rcx -> consumes ~int:1 ~float:0
623+
| R_to_fst
624+
| R_to_R
625+
| R_to_RM
626+
| RM_to_R
627+
| R_R_to_fst
628+
| R_RM_to_fst
629+
| R_RM_to_R
630+
| R_RM_xmm0_to_fst -> consumes ~int:0 ~float:0)
599631
| Iintop(Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
600632
| Ipopcnt|Iclz _| Ictz _)
601633
| Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr
@@ -613,7 +645,7 @@ let max_register_pressure =
613645
| Istackoffset _ | Iload _
614646
| Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
615647
| Irdtsc | Irdpmc | Istore_int (_, _, _)
616-
| Ilfence | Isfence | Imfence | Isimd _
648+
| Ilfence | Isfence | Imfence
617649
| Ioffset_loc (_, _) | Ifloatarithmem (_, _) | Ifloatsqrtf _
618650
| Ibswap _)
619651
| Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque

backend/amd64/regalloc_stack_operands.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
166166
| Op (Addf | Subf | Mulf | Divf) ->
167167
may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true
168168
| Op (Specific (Isimd op)) ->
169-
(match Simd_selection.register_behavior op with
169+
(match Simd_proc.register_behavior op with
170170
| R_to_fst | R_to_R | R_R_to_fst -> May_still_have_spilled_registers
171171
| R_RM_to_fst ->
172172
may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true

backend/amd64/simd.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
(* *)
1313
(**************************************************************************)
1414

15-
[@@@ocaml.warning "+a-4-30-40-41-42"]
15+
[@@@ocaml.warning "+a-40-42"]
1616

1717
(* SIMD instructions for AMD64 *)
1818

backend/amd64/simd_proc.ml

+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
(**************************************************************************)
2+
(* *)
3+
(* OCaml *)
4+
(* *)
5+
(* Max Slater, Jane Street *)
6+
(* *)
7+
(* Copyright 2024 Jane Street Group LLC *)
8+
(* *)
9+
(* All rights reserved. This file is distributed under the terms of *)
10+
(* the GNU Lesser General Public License version 2.1, with the *)
11+
(* special exception on linking described in the file LICENSE. *)
12+
(* *)
13+
(**************************************************************************)
14+
15+
[@@@ocaml.warning "+a-40-42"]
16+
17+
(* SIMD register behavior for AMD64 *)
18+
19+
open Simd
20+
21+
(* This will need to be expanded with the addition of three and four argument
22+
operations in AVX2 and AVX512. *)
23+
type register_behavior =
24+
| R_to_fst
25+
| R_to_R
26+
| R_to_RM
27+
| RM_to_R
28+
| R_R_to_fst
29+
| R_RM_to_fst
30+
| R_RM_to_R
31+
| R_RM_xmm0_to_fst
32+
| R_RM_rax_rdx_to_rcx
33+
| R_RM_to_rcx
34+
| R_RM_rax_rdx_to_xmm0
35+
| R_RM_to_xmm0
36+
37+
let register_behavior_clmul = function Clmul_64 _ -> R_RM_to_fst
38+
39+
let register_behavior_bmi2 = function Extract_64 | Deposit_64 -> R_RM_to_R
40+
41+
let register_behavior_sse = function
42+
| Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
43+
| Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
44+
R_RM_to_fst
45+
| Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
46+
| High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
47+
| Movemask_32 -> R_to_R
48+
49+
let register_behavior_sse2 = function
50+
| Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Add_saturating_i8
51+
| Min_scalar_f64 | Max_scalar_f64 | Add_saturating_i16
52+
| Add_saturating_unsigned_i8 | Add_saturating_unsigned_i16 | Sub_i8 | Sub_i16
53+
| Sub_i32 | Sub_i64 | Sub_f64 | Sub_saturating_i8 | Sub_saturating_i16
54+
| Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16 | Max_unsigned_i8
55+
| Max_i16 | Max_f64 | Min_unsigned_i8 | Min_i16 | Min_f64 | Mul_f64 | Div_f64
56+
| And_bits | Andnot_bits | Or_bits | Xor_bits | Cmpeq_i8 | Cmpeq_i16
57+
| Cmpeq_i32 | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | Cmp_f64 _ | SLL_i16 | SLL_i32
58+
| SLL_i64 | SRL_i16 | SRL_i32 | SRL_i64 | SRA_i16 | SRA_i32 | Avg_unsigned_i8
59+
| Avg_unsigned_i16 | SAD_unsigned_i8 | Shuffle_64 _ | Interleave_high_8
60+
| Interleave_high_16 | Interleave_high_64 | Interleave_low_8
61+
| Interleave_low_16 | Interleave_low_64 | I16_to_i8 | I32_to_i16
62+
| I16_to_unsigned_i8 | I32_to_unsigned_i16 | Mulhi_i16 | Mulhi_unsigned_i16
63+
| Mullo_i16 | Mul_hadd_i16_to_i32 ->
64+
R_RM_to_fst
65+
| Shuffle_high_16 _ | Shuffle_low_16 _ | I32_to_f64 | I32_to_f32 | F64_to_i32
66+
| Cast_scalar_f64_i64 | F64_to_f32 | F32_to_i32 | F32_to_f64 | Sqrt_f64 ->
67+
RM_to_R
68+
| SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
69+
| SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ ->
70+
R_to_fst
71+
| Movemask_8 | Movemask_64 -> R_to_R
72+
| Sqrt_scalar_f64 -> (* Backwards compatibility *) R_to_R
73+
74+
let register_behavior_sse3 = function
75+
| Addsub_f32 | Addsub_f64 | Hadd_f32 | Hadd_f64 | Hsub_f32 | Hsub_f64 ->
76+
R_RM_to_fst
77+
| Dup_low_64 | Dup_odd_32 | Dup_even_32 -> RM_to_R
78+
79+
let register_behavior_ssse3 = function
80+
| Hadd_i16 | Hadd_i32 | Hadd_saturating_i16 | Hsub_i16 | Hsub_i32
81+
| Hsub_saturating_i16 | Mulsign_i8 | Mulsign_i16 | Mulsign_i32 | Shuffle_8
82+
| Alignr_i8 _ | Mul_unsigned_hadd_saturating_i8_to_i16 ->
83+
R_RM_to_fst
84+
| Abs_i8 | Abs_i16 | Abs_i32 -> RM_to_R
85+
86+
let register_behavior_sse41 = function
87+
| Blend_16 _ | Blend_32 _ | Blend_64 _ | Cmpeq_i64 | Dp_f32 _ | Dp_f64 _
88+
| Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32 | Min_i8 | Min_i32
89+
| Min_unsigned_i16 | Min_unsigned_i32 | Insert_i8 _ | Insert_i16 _
90+
| Insert_i32 _ | Insert_i64 _ | Multi_sad_unsigned_i8 _ | Mullo_i32 ->
91+
R_RM_to_fst
92+
| I8_sx_i16 | I8_sx_i32 | I8_sx_i64 | I16_sx_i32 | I16_sx_i64 | I32_sx_i64
93+
| I8_zx_i16 | I8_zx_i32 | I8_zx_i64 | I16_zx_i32 | I16_zx_i64 | I32_zx_i64
94+
| Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _ ->
95+
RM_to_R
96+
| Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
97+
| Extract_i64 _ | Extract_i32 _ -> R_to_RM
98+
| Extract_i8 _ | Extract_i16 _ ->
99+
(* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
100+
R_to_R
101+
102+
let register_behavior_sse42 = function
103+
| Crc32_64 | Cmpgt_i64 -> R_RM_to_fst
104+
| Cmpestrm _ -> R_RM_rax_rdx_to_xmm0
105+
| Cmpistrm _ -> R_RM_to_xmm0
106+
| Cmpestra _ | Cmpestrc _ | Cmpestri _ | Cmpestro _ | Cmpestrs _ | Cmpestrz _
107+
->
108+
R_RM_rax_rdx_to_rcx
109+
| Cmpistra _ | Cmpistrc _ | Cmpistri _ | Cmpistro _ | Cmpistrs _ | Cmpistrz _
110+
->
111+
R_RM_to_rcx
112+
113+
let register_behavior = function
114+
| CLMUL op -> register_behavior_clmul op
115+
| BMI2 op -> register_behavior_bmi2 op
116+
| SSE op -> register_behavior_sse op
117+
| SSE2 op -> register_behavior_sse2 op
118+
| SSE3 op -> register_behavior_sse3 op
119+
| SSSE3 op -> register_behavior_ssse3 op
120+
| SSE41 op -> register_behavior_sse41 op
121+
| SSE42 op -> register_behavior_sse42 op

backend/amd64/simd_reload.ml

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@
1212
(* *)
1313
(**************************************************************************)
1414

15-
[@@@ocaml.warning "+a-4-30-40-41-42"]
15+
[@@@ocaml.warning "+a-40-42"]
1616

1717
(* SIMD instruction reload for AMD64 *)
1818

1919
let reload_operation makereg op arg res =
2020
let stackp r =
2121
match r.Reg.loc with Stack _ -> true | Reg _ | Unknown -> false
2222
in
23-
match Simd_selection.register_behavior op with
23+
match Simd_proc.register_behavior op with
2424
| R_to_fst ->
2525
(* Argument must be in a register; result must be the argument. *)
2626
let arg0 = if stackp arg.(0) then makereg arg.(0) else arg.(0) in

0 commit comments

Comments
 (0)