dkalinichenko-js
diff --git a/‎backend/amd64/emit.mlp
Lines changed: 1 addition & 1 deletion b/‎backend/amd64/emit.mlp
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/amd64/proc.ml
Lines changed: 40 additions & 8 deletions b/‎backend/amd64/proc.ml
Lines changed: 40 additions & 8 deletions
diff --git a/‎backend/amd64/regalloc_stack_operands.ml
Lines changed: 1 addition & 1 deletion b/‎backend/amd64/regalloc_stack_operands.ml
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/amd64/simd.ml
Lines changed: 1 addition & 1 deletion b/‎backend/amd64/simd.ml
Lines changed: 1 addition & 1 deletion
diff --git a/‎backend/amd64/simd_proc.ml
Lines changed: 121 additions & 0 deletions b/‎backend/amd64/simd_proc.ml
Lines changed: 121 additions & 0 deletions
diff --git a/‎backend/amd64/simd_reload.ml
Lines changed: 2 additions & 2 deletions b/‎backend/amd64/simd_reload.ml
Lines changed: 2 additions & 2 deletions
@@ -963,7 +963,7 @@ let emit_atomic instr op (size : Cmm.atomic_bitwidth) addr =
     I.movzx res8 res
 
 let emit_simd_instr op i =
-  (match Simd_selection.register_behavior op with
+  (match Simd_proc.register_behavior op with
   | R_to_fst ->
     assert (Reg.same_loc i.arg.(0) i.res.(0));
     assert (Reg.is_reg i.arg.(0))
 
@@ -170,15 +170,16 @@ let phys_reg ty n =
 
 let rax = phys_reg Int 0
 let rdx = phys_reg Int 4
+let rcx = phys_reg Int 5
 let r10 = phys_reg Int 10
 let r11 = phys_reg Int 11
 let rbp = phys_reg Int 12
 
 (* [destroy_xmm n] returns every view of the n-th xmm physical register: the
    Float view always, plus the Vec128 view when the SIMD language extension is
    enabled. CSE needs to know that all versions of the register are destroyed,
    not just one of them. The register is looked up as [phys_reg _ (100 + n)];
    the former xmm15-only helper used 115. *)
-let destroy_xmm15 () =
+let destroy_xmm n =
   if Language_extension.is_enabled SIMD
-  then [| phys_reg Float 115; phys_reg Vec128 115 |]
-  else [| phys_reg Float 115 |]
+  then [| phys_reg Float (100 + n); phys_reg Vec128 (100 + n) |]
+  else [| phys_reg Float (100 + n) |]
 
 (* Registers named as destroyed by the PLT stub: r10 and r11 when
    [X86_proc.use_plt] is set, and no register otherwise. *)
 let destroyed_by_plt_stub =
   if not X86_proc.use_plt then [| |] else [| r10; r11 |]
@@ -399,6 +400,21 @@ let destroyed_at_pushtrap =
 (* [has_pushtrap traps] is [true] iff at least one action in [traps] is a
    [Cmm.Push]; [Pop] actions never count. *)
 let has_pushtrap traps =
   List.exists (function Cmm.Push _ -> true | Pop _ -> false) traps
 
+let destroyed_by_simd_op op = (* Physical registers reported as destroyed by
+     the SIMD operation [op], chosen from [Simd_proc.register_behavior op]:
+     the [*_to_xmm0] behaviors destroy every view of xmm0 ([destroy_xmm 0]),
+     the [*_to_rcx] behaviors destroy rcx, and all other behaviors destroy
+     nothing. Every behavior is listed explicitly, with no wildcard arm. *)
+  match Simd_proc.register_behavior op with
+  | R_RM_rax_rdx_to_xmm0
+  | R_RM_to_xmm0 -> destroy_xmm 0
+  | R_RM_rax_rdx_to_rcx
+  | R_RM_to_rcx -> [| rcx |]
+  | R_to_fst
+  | R_to_R
+  | R_to_RM
+  | RM_to_R
+  | R_R_to_fst
+  | R_RM_to_fst
+  | R_RM_to_R
+  | R_RM_xmm0_to_fst -> [||]
+
 (* note: keep this function in sync with `destroyed_at_{basic,terminator}` below. *)
 let destroyed_at_oper = function
     Iop(Icall_ind | Icall_imm _) ->
@@ -410,7 +426,7 @@ let destroyed_at_oper = function
   | Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _))
         -> [| rax; rdx |]
   | Iop(Istore(Single, _, _))
-        -> destroy_xmm15 ()
+        -> destroy_xmm 15
   | Iop(Ialloc _ | Ipoll _) -> destroyed_at_alloc_or_poll
   | Iop(Iintop(Imulh _ | Icomp _) | Iintop_imm((Icomp _), _))
         -> [| rax |]
@@ -420,9 +436,10 @@ let destroyed_at_oper = function
   | Ireturn traps when has_pushtrap traps -> assert false
   | Iop(Ispecific (Irdtsc | Irdpmc)) -> [| rax; rdx |]
   | Iop(Ispecific(Ilfence | Isfence | Imfence)) -> [||]
+  | Iop(Ispecific(Isimd op)) -> destroyed_by_simd_op op
   | Iop(Ispecific(Isextend32 | Izextend32 | Ilea _
                  | Istore_int (_, _, _) | Ioffset_loc (_, _)
-                 | Ipause | Iprefetch _ | Isimd _
+                 | Ipause | Iprefetch _
                  | Ifloatarithmem (_, _) | Ifloatsqrtf _ | Ibswap _))
   | Iop(Iintop(Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
               | Ipopcnt | Iclz _ | Ictz _ ))
@@ -465,14 +482,15 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
   | Op (Intop (Idiv | Imod)) | Op (Intop_imm ((Idiv | Imod), _)) ->
     [| rax; rdx |]
   | Op(Store(Single, _, _)) ->
-    destroy_xmm15 ()
+    destroy_xmm 15
   | Op(Intop(Imulh _ | Icomp _) | Intop_imm((Icomp _), _)) ->
     [| rax |]
   | Op (Specific (Irdtsc | Irdpmc)) ->
     [| rax; rdx |]
   | Op Poll -> destroyed_at_alloc_or_poll
   | Op (Alloc _) ->
     destroyed_at_alloc_or_poll
+  | Op (Specific (Isimd op)) -> destroyed_by_simd_op op
   | Op (Move | Spill | Reload
        | Const_int _ | Const_float _ | Const_symbol _ | Const_vec128 _
        | Stackoffset _
@@ -497,7 +515,7 @@ let destroyed_at_basic (basic : Cfg_intf.S.basic) =
        | Begin_region
        | End_region
        | Specific (Ilea _ | Istore_int _ | Ioffset_loc _
-                  | Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _ | Isimd _
+                  | Ifloatarithmem _ | Ifloatsqrtf _ | Ibswap _
                   | Isextend32 | Izextend32 | Ipause
                   | Iprefetch _ | Ilfence | Isfence | Imfence)
        | Name_for_debugger _ | Dls_get)
@@ -596,6 +614,20 @@ let max_register_pressure =
     consumes ~int:1 ~float:0
   | Istore(Single, _, _) | Icompf _ ->
     consumes ~int:0 ~float:1
+  | Ispecific(Isimd op) ->
+    (match Simd_proc.register_behavior op with
+    | R_RM_rax_rdx_to_xmm0
+    | R_RM_to_xmm0 -> consumes ~int:0 ~float:1
+    | R_RM_rax_rdx_to_rcx
+    | R_RM_to_rcx -> consumes ~int:1 ~float:0
+    | R_to_fst
+    | R_to_R
+    | R_to_RM
+    | RM_to_R
+    | R_R_to_fst
+    | R_RM_to_fst
+    | R_RM_to_R
+    | R_RM_xmm0_to_fst -> consumes ~int:0 ~float:0)
   | Iintop(Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr
            | Ipopcnt|Iclz _| Ictz _)
   | Iintop_imm((Iadd | Isub | Imul | Imulh _ | Iand | Ior | Ixor | Ilsl | Ilsr
@@ -613,7 +645,7 @@ let max_register_pressure =
   | Istackoffset _ | Iload _
   | Ispecific(Ilea _ | Isextend32 | Izextend32 | Iprefetch _ | Ipause
              | Irdtsc | Irdpmc | Istore_int (_, _, _)
-             | Ilfence | Isfence | Imfence | Isimd _
+             | Ilfence | Isfence | Imfence
              | Ioffset_loc (_, _) | Ifloatarithmem (_, _) | Ifloatsqrtf _
              | Ibswap _)
   | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque
 
@@ -166,7 +166,7 @@ let basic (map : spilled_map) (instr : Cfg.basic Cfg.instruction) =
   | Op (Addf | Subf | Mulf | Divf) ->
     may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true
   | Op (Specific (Isimd op)) ->
-    (match Simd_selection.register_behavior op with
+    (match Simd_proc.register_behavior op with
     | R_to_fst | R_to_R | R_R_to_fst -> May_still_have_spilled_registers
     | R_RM_to_fst ->
       may_use_stack_operand_for_second_argument map instr ~num_args:2 ~res_is_fst:true
 
@@ -12,7 +12,7 @@
 (*                                                                        *)
 (**************************************************************************)
 
-[@@@ocaml.warning "+a-4-30-40-41-42"]
+[@@@ocaml.warning "+a-40-42"]
 
 (* SIMD instructions for AMD64 *)
 
 
@@ -0,0 +1,121 @@
+(**************************************************************************)
+(*                                                                        *)
+(*                                 OCaml                                  *)
+(*                                                                        *)
+(*                      Max Slater, Jane Street                           *)
+(*                                                                        *)
+(*   Copyright 2024 Jane Street Group LLC                                 *)
+(*                                                                        *)
+(*   All rights reserved.  This file is distributed under the terms of    *)
+(*   the GNU Lesser General Public License version 2.1, with the          *)
+(*   special exception on linking described in the file LICENSE.          *)
+(*                                                                        *)
+(**************************************************************************)
+
+[@@@ocaml.warning "+a-40-42"]
+
+(* SIMD register behavior for AMD64 *)
+
+open Simd
+
+(* This will need to be expanded with the addition of three and four argument
+   operations in AVX2 and AVX512.
+
+   Constructor names read as [OPERANDS_to_RESULT]. NOTE(review): the glosses
+   below are inferred from the names and from the uses visible alongside this
+   type; confirm them against the emitter and the reload code.
+   - [R]: presumably an operand that must live in a register.
+   - [RM]: presumably an operand that may be a register or a stack slot.
+   - [fst]: the result shares the location of the first argument.
+   - [rax], [rdx], [xmm0] on the operand side: presumably operands pinned to
+     that fixed register.
+   - [_to_rcx] and [_to_xmm0]: proc.ml ([destroyed_by_simd_op]) reports rcx,
+     respectively xmm0, as destroyed for these behaviors. *)
+type register_behavior =
+  | R_to_fst
+  | R_to_R
+  | R_to_RM
+  | RM_to_R
+  | R_R_to_fst
+  | R_RM_to_fst
+  | R_RM_to_R
+  | R_RM_xmm0_to_fst
+  | R_RM_rax_rdx_to_rcx
+  | R_RM_to_rcx
+  | R_RM_rax_rdx_to_xmm0
+  | R_RM_to_xmm0
+
+let register_behavior_clmul = function Clmul_64 _ -> R_RM_to_fst (* Register
+   behavior of the CLMUL operations; [Clmul_64] is the only case matched. *)
+
+let register_behavior_bmi2 = function Extract_64 | Deposit_64 -> R_RM_to_R (*
+   Register behavior of the BMI2 operations; both cases are [R_RM_to_R]. *)
+
+let register_behavior_sse = function (* Register behavior of each SSE
+     operation. Every case is listed explicitly, with no wildcard arm. *)
+  | Cmp_f32 _ | Add_f32 | Sub_f32 | Mul_f32 | Div_f32 | Max_f32 | Min_f32
+  | Interleave_low_32 | Interleave_high_32 | Shuffle_32 _ ->
+    R_RM_to_fst
+  | Rcp_f32 | Sqrt_f32 | Rsqrt_f32 -> RM_to_R
+  | High_64_to_low_64 | Low_64_to_high_64 -> R_R_to_fst
+  | Movemask_32 -> R_to_R
+
+let register_behavior_sse2 = function (* Register behavior of each SSE2
+     operation. Every case is listed explicitly, with no wildcard arm. The
+     operations carrying an immediate shift amount ([SLLi_*], [SRLi_*],
+     [SRAi_*], [Shift_*_bytes]) are the ones mapped to [R_to_fst]. *)
+  | Add_i8 | Add_i16 | Add_i32 | Add_i64 | Add_f64 | Add_saturating_i8
+  | Min_scalar_f64 | Max_scalar_f64 | Add_saturating_i16
+  | Add_saturating_unsigned_i8 | Add_saturating_unsigned_i16 | Sub_i8 | Sub_i16
+  | Sub_i32 | Sub_i64 | Sub_f64 | Sub_saturating_i8 | Sub_saturating_i16
+  | Sub_saturating_unsigned_i8 | Sub_saturating_unsigned_i16 | Max_unsigned_i8
+  | Max_i16 | Max_f64 | Min_unsigned_i8 | Min_i16 | Min_f64 | Mul_f64 | Div_f64
+  | And_bits | Andnot_bits | Or_bits | Xor_bits | Cmpeq_i8 | Cmpeq_i16
+  | Cmpeq_i32 | Cmpgt_i8 | Cmpgt_i16 | Cmpgt_i32 | Cmp_f64 _ | SLL_i16 | SLL_i32
+  | SLL_i64 | SRL_i16 | SRL_i32 | SRL_i64 | SRA_i16 | SRA_i32 | Avg_unsigned_i8
+  | Avg_unsigned_i16 | SAD_unsigned_i8 | Shuffle_64 _ | Interleave_high_8
+  | Interleave_high_16 | Interleave_high_64 | Interleave_low_8
+  | Interleave_low_16 | Interleave_low_64 | I16_to_i8 | I32_to_i16
+  | I16_to_unsigned_i8 | I32_to_unsigned_i16 | Mulhi_i16 | Mulhi_unsigned_i16
+  | Mullo_i16 | Mul_hadd_i16_to_i32 ->
+    R_RM_to_fst
+  | Shuffle_high_16 _ | Shuffle_low_16 _ | I32_to_f64 | I32_to_f32 | F64_to_i32
+  | Cast_scalar_f64_i64 | F64_to_f32 | F32_to_i32 | F32_to_f64 | Sqrt_f64 ->
+    RM_to_R
+  | SLLi_i16 _ | SLLi_i32 _ | SLLi_i64 _ | SRLi_i16 _ | SRLi_i32 _ | SRLi_i64 _
+  | SRAi_i16 _ | SRAi_i32 _ | Shift_left_bytes _ | Shift_right_bytes _ ->
+    R_to_fst
+  | Movemask_8 | Movemask_64 -> R_to_R
+  | Sqrt_scalar_f64 -> (* Backwards compatibility *) R_to_R
+
+let register_behavior_sse3 = function (* Register behavior of each SSE3
+     operation. Every case is listed explicitly, with no wildcard arm. *)
+  | Addsub_f32 | Addsub_f64 | Hadd_f32 | Hadd_f64 | Hsub_f32 | Hsub_f64 ->
+    R_RM_to_fst
+  | Dup_low_64 | Dup_odd_32 | Dup_even_32 -> RM_to_R
+
+let register_behavior_ssse3 = function (* Register behavior of each SSSE3
+     operation. Every case is listed explicitly, with no wildcard arm. *)
+  | Hadd_i16 | Hadd_i32 | Hadd_saturating_i16 | Hsub_i16 | Hsub_i32
+  | Hsub_saturating_i16 | Mulsign_i8 | Mulsign_i16 | Mulsign_i32 | Shuffle_8
+  | Alignr_i8 _ | Mul_unsigned_hadd_saturating_i8_to_i16 ->
+    R_RM_to_fst
+  | Abs_i8 | Abs_i16 | Abs_i32 -> RM_to_R
+
+let register_behavior_sse41 = function (* Register behavior of each SSE4.1
+     operation. Every case is listed explicitly, with no wildcard arm. The
+     [Blendv_*] operations are the only ones mapped to [R_RM_xmm0_to_fst]. *)
+  | Blend_16 _ | Blend_32 _ | Blend_64 _ | Cmpeq_i64 | Dp_f32 _ | Dp_f64 _
+  | Max_i8 | Max_i32 | Max_unsigned_i16 | Max_unsigned_i32 | Min_i8 | Min_i32
+  | Min_unsigned_i16 | Min_unsigned_i32 | Insert_i8 _ | Insert_i16 _
+  | Insert_i32 _ | Insert_i64 _ | Multi_sad_unsigned_i8 _ | Mullo_i32 ->
+    R_RM_to_fst
+  | I8_sx_i16 | I8_sx_i32 | I8_sx_i64 | I16_sx_i32 | I16_sx_i64 | I32_sx_i64
+  | I8_zx_i16 | I8_zx_i32 | I8_zx_i64 | I16_zx_i32 | I16_zx_i64 | I32_zx_i64
+  | Round_f64 _ | Round_f32 _ | Minpos_unsigned_i16 | Round_scalar_f64 _ ->
+    RM_to_R
+  | Blendv_8 | Blendv_32 | Blendv_64 -> R_RM_xmm0_to_fst
+  | Extract_i64 _ | Extract_i32 _ -> R_to_RM
+  | Extract_i8 _ | Extract_i16 _ ->
+    (* CR mslater: (SIMD): replace once we have int8/int16/float32 *)
+    R_to_R
+
+let register_behavior_sse42 = function (* Register behavior of each SSE4.2
+     operation. The [Cmpestr*] string comparisons map to the [*_rax_rdx_*]
+     behaviors and the [Cmpistr*] ones to the variants without rax/rdx; in
+     both families the [*m] form targets xmm0 and every other form targets
+     rcx. Every case is listed explicitly, with no wildcard arm. *)
+  | Crc32_64 | Cmpgt_i64 -> R_RM_to_fst
+  | Cmpestrm _ -> R_RM_rax_rdx_to_xmm0
+  | Cmpistrm _ -> R_RM_to_xmm0
+  | Cmpestra _ | Cmpestrc _ | Cmpestri _ | Cmpestro _ | Cmpestrs _ | Cmpestrz _
+    ->
+    R_RM_rax_rdx_to_rcx
+  | Cmpistra _ | Cmpistrc _ | Cmpistri _ | Cmpistro _ | Cmpistrs _ | Cmpistrz _
+    ->
+    R_RM_to_rcx
+
+let register_behavior = function (* Entry point: returns the register behavior
+     of a SIMD operation by dispatching on its instruction-set constructor to
+     the matching [register_behavior_*] function above. *)
+  | CLMUL op -> register_behavior_clmul op
+  | BMI2 op -> register_behavior_bmi2 op
+  | SSE op -> register_behavior_sse op
+  | SSE2 op -> register_behavior_sse2 op
+  | SSE3 op -> register_behavior_sse3 op
+  | SSSE3 op -> register_behavior_ssse3 op
+  | SSE41 op -> register_behavior_sse41 op
+  | SSE42 op -> register_behavior_sse42 op
@@ -12,15 +12,15 @@
 (*                                                                        *)
 (**************************************************************************)
 
-[@@@ocaml.warning "+a-4-30-40-41-42"]
+[@@@ocaml.warning "+a-40-42"]
 
 (* SIMD instruction reload for AMD64 *)
 
 let reload_operation makereg op arg res =
   let stackp r =
     match r.Reg.loc with Stack _ -> true | Reg _ | Unknown -> false
   in
-  match Simd_selection.register_behavior op with
+  match Simd_proc.register_behavior op with
   | R_to_fst ->
     (* Argument must be in a register; result must be the argument. *)
     let arg0 = if stackp arg.(0) then makereg arg.(0) else arg.(0) in