diff --git a/backend/arm64/arch.ml b/backend/arm64/arch.ml
index e0fb3cc4b74..5153a2f70d5 100644
--- a/backend/arm64/arch.ml
+++ b/backend/arm64/arch.ml
@@ -83,7 +83,7 @@ let size_float = 8
 let size_vec128 = 16

-let allow_unaligned_access = true
+let allow_unaligned_access = true

 (* Behavior of division *)

@@ -93,10 +93,9 @@ let division_crashes_on_overflow = false

 let identity_addressing = Iindexed 0

-let offset_addressing addr delta =
-  match addr with
-  | Iindexed n -> Iindexed(n + delta)
-  | Ibased(s, n) -> Ibased(s, n + delta)
+let offset_addressing _addr _delta =
+  (* Resulting offset might not be representable. *)
+  Misc.fatal_error "Arch.offset_addressing not supported"

 let num_args_addressing = function
   | Iindexed _ -> 1
diff --git a/backend/arm64/cfg_selection.ml b/backend/arm64/cfg_selection.ml
index 5d744944f24..cf99fead37d 100644
--- a/backend/arm64/cfg_selection.ml
+++ b/backend/arm64/cfg_selection.ml
@@ -173,6 +173,56 @@ class selector =
       (* Other operations are regular *)
       | _ -> super#select_operation op args dbg ~label_after

+    method! emit_stores env dbg data regs_addr =
+      (* Override [emit_stores] to ensure that addressing mode always uses a
+         legal offset. *)
+      let offset = ref (-Arch.size_int) in
+      let base =
+        assert (Array.length regs_addr = 1);
+        ref regs_addr
+      in
+      List.iter
+        (fun arg ->
+          match self#emit_expr env arg ~bound_name:None with
+          | None -> assert false
+          | Some regs ->
+            for i = 0 to Array.length regs - 1 do
+              let r = regs.(i) in
+              let kind =
+                match r.Reg.typ with
+                | Float -> Double
+                | Float32 -> Single { reg = Float32 }
+                | Vec128 ->
+                  (* 128-bit memory operations are default unaligned. Aligned
+                     (big)array operations are handled separately via cmm. *)
+                  Onetwentyeight_unaligned
+                | Val | Addr | Int -> Word_val
+                | Valx2 ->
+                  Misc.fatal_error "Unexpected machtype_component Valx2"
+              in
+              if not (Selection_utils.is_offset kind !offset)
+              then (
+                (* Use a temporary to store the address [!base + offset]. *)
+                let tmp = self#regs_for typ_int in
+                self#insert_debug env
+                  (self#lift_op
+                     (self#make_const_int (Nativeint.of_int !offset)))
+                  dbg [||] tmp;
+                self#insert_debug env
+                  (self#lift_op (Operation.Intop Iadd))
+                  dbg (Array.append !base tmp) tmp;
+                (* Use the temporary as the new base address. *)
+                base := tmp;
+                offset := 0);
+              self#insert_debug env
+                (self#make_store kind (Iindexed !offset) false)
+                dbg
+                (Array.append [| r |] !base)
+                [||];
+              offset := !offset + Select_utils.size_component r.Reg.typ
+            done)
+        data
+
     method! insert_move_extcall_arg env ty_arg src dst =
       let ty_arg_is_int32 =
         match ty_arg with
diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
index f57c1217fc6..c909aa8dbaa 100644
--- a/backend/arm64/emit.mlp
+++ b/backend/arm64/emit.mlp
@@ -383,29 +383,42 @@ let function_name = ref ""
 let tailrec_entry_point = ref None
 (* Pending floating-point literals *)
 let float_literals = ref ([] : (int64 * label) list)
+let vec128_literals = ref ([] : (Cmm.vec128_bits * label) list)

 (* Label a floating-point literal *)
-let float_literal f =
+let add_literal p f =
   try
-    List.assoc f !float_literals
+    List.assoc f !p
   with Not_found ->
     let lbl = Cmm.new_label() in
-    float_literals := (f, lbl) :: !float_literals;
+    p := (f, lbl) :: !p;
     lbl

+let float_literal f = add_literal float_literals f
+let vec128_literal f = add_literal vec128_literals f
+
 (* Emit all pending literals *)
-let emit_literals() =
-  if !float_literals <> [] then begin
+let emit_literals p align emit_literal =
+  if !p <> [] then begin
     if macosx then
-      `	.section	__TEXT,__literal8,8byte_literals\n`;
-    `	.align	3\n`;
-    List.iter
-      (fun (f, lbl) ->
-        `{emit_label lbl}:`; emit_float64_directive ".quad" f)
-      !float_literals;
-    float_literals := []
+      `	.section	__TEXT,__literal{emit_int align},{emit_int align}byte_literals\n`;
+    `	.balign	{emit_int align}\n`;
+    List.iter emit_literal !p;
+    p := []
   end

+let emit_float_literal (f, lbl) =
+  `{emit_label lbl}:`; emit_float64_directive ".quad" f
+
+let emit_vec128_literal (({ high; low; } : Cmm.vec128_bits), lbl) =
+  `{emit_label lbl}:\n`;
+  emit_float64_directive ".quad" low;
+  emit_float64_directive ".quad" high
+
+let emit_literals () =
+  emit_literals float_literals size_float emit_float_literal;
+  emit_literals vec128_literals size_vec128 emit_vec128_literal
+
 (* Emit code to load the address of a symbol *)
 let emit_load_symbol_addr dst s =
@@ -710,10 +723,8 @@ module BR = Branch_relaxation.Make (struct
       num_instructions_for_intconst n
     | Lop (Const_float32 _) -> 2
     | Lop (Const_float _) -> 2
+    | Lop (Const_vec128 _) -> 2
     | Lop (Const_symbol _) -> 2
-    | Lop (Const_vec128 _) ->
-      (* CR mslater: (SIMD) arm64 *)
-      Misc.fatal_error "SIMD is not supported on this architecture"
     | Lop (Intop_atomic _) ->
       (* Never generated; builtins are not yet translated to atomics *)
       assert false
@@ -866,7 +877,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
     let domain_local_top_offset = DS.(idx_of_field Domain_local_top) * 8 in
     ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
     ` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
-    ` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
+    emit_subimm r r n;
     ` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
     ` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
     let lbl_call = Cmm.new_label () in
@@ -893,7 +904,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
     assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
     let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
     ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
-    ` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
+    emit_subimm reg_alloc_ptr reg_alloc_ptr n;
     ` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
     if not far then begin
       ` b.lo {emit_label lbl_call_gc}\n`
@@ -1064,9 +1075,16 @@ let emit_instr i =
         let lbl = float_literal f in
         emit_load_literal i.res.(0) lbl
       end
-  | Lop(Const_vec128 _) ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "SIMD is not supported on this architecture"
+  | Lop(Const_vec128 ({high; low} as l)) ->
+    DSL.check_reg Vec128 i.res.(0);
+    begin match (high, low) with
+    | 0x0000_0000_0000_0000L, 0x0000_0000_0000_0000L ->
+      let dst = DSL.emit_reg_v2d i.res.(0) in
+      DSL.ins I.MOVI [| dst; DSL.imm 0 |]
+    | _ ->
+      let lbl = vec128_literal l in
+      emit_load_literal i.res.(0) lbl
+    end
   | Lop(Const_symbol s) ->
       emit_load_symbol_addr i.res.(0) s.sym_name
   | Lcall_op(Lcall_ind) ->
@@ -1157,8 +1175,9 @@
         DSL.check_reg Float32 dst;
         ` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
       | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got 128 bit memory chunk"
+        (* CR gyorsh: check alignment *)
+        DSL.check_reg Vec128 dst;
+        ` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
       end
   | Lop(Store(size, addr, assignment)) ->
     (* NB: assignments other than Word_int and Word_val do not follow the
@@ -1192,8 +1211,9 @@
         DSL.check_reg Float32 src;
         ` str {emit_reg src}, {emit_addressing addr base}\n`
      | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got 128 bit memory chunk"
+        (* CR gyorsh: check alignment *)
+        DSL.check_reg Vec128 src;
+        ` str {emit_reg src}, {emit_addressing addr base}\n`
      end
   | Lop(Alloc { bytes = n; dbginfo; mode = Heap }) ->
     assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
@@ -1548,9 +1568,9 @@ let emit_item (d : Cmm.data_item) =
   | Cint n -> ` .quad {emit_nativeint n}\n`
   | Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
   | Cdouble f -> emit_float64_directive ".quad" (Int64.bits_of_float f)
-  | Cvec128 _ ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "SIMD is not supported on this architecture"
+  | Cvec128 { high; low; } ->
+    emit_float64_directive ".quad" low;
+    emit_float64_directive ".quad" high;
   | Csymbol_address s -> ` .quad {emit_symbol s.sym_name}\n`
   | Csymbol_offset (s, o) -> ` .quad {emit_symbol s.sym_name}+{emit_int o}\n`
   | Cstring s -> emit_string_directive " .ascii " s
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 751404eb975..1b4c4c09d26 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -65,52 +65,47 @@ let float32_reg_name =
      "s16"; "s17"; "s18"; "s19"; "s20"; "s21"; "s22"; "s23";
      "s24"; "s25"; "s26"; "s27"; "s28"; "s29"; "s30"; "s31" |]

+let vec128_reg_name =
+  [| "q0"; "q1"; "q2"; "q3"; "q4"; "q5"; "q6"; "q7";
+     "q8"; "q9"; "q10"; "q11"; "q12"; "q13"; "q14"; "q15";
+     "q16"; "q17"; "q18"; "q19"; "q20"; "q21"; "q22"; "q23";
+     "q24"; "q25"; "q26"; "q27"; "q28"; "q29"; "q30"; "q31" |]
+
 let num_register_classes = 2

 let register_class_of_machtype_component typ =
   match (typ : Cmm.machtype_component) with
   | Val | Int | Addr -> 0
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float | Float32 -> 1
+  | Vec128 -> 1
+  | Valx2 -> 1

 let register_class r = register_class_of_machtype_component r.typ

-let num_stack_slot_classes = 2
+let num_stack_slot_classes = 3

 let stack_slot_class typ =
   match (typ : Cmm.machtype_component) with
   | Val | Int | Addr -> 0
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float | Float32 -> 1
+  | Vec128 -> 2
+  | Valx2 -> 2

 let types_are_compatible left right =
   match left.typ, right.typ with
   | (Int | Val | Addr), (Int | Val | Addr)
   | Float, Float -> true
   | Float32, Float32 -> true
-  | Vec128, _ | _, Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2, _ | _, Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
-  | (Int | Val | Addr | Float | Float32), _ -> false
+  | Vec128, Vec128 -> true
+  | Valx2,Valx2 -> true
+  | (Int | Val | Addr | Float | Float32 | Vec128 | Valx2), _ -> false

 let stack_class_tag c =
   match c with
   | 0 -> "i"
   | 1 -> "f"
+  | 2 -> "x"
   | c -> Misc.fatal_errorf "Unspecified stack slot class %d" c

 let num_available_registers =
@@ -125,14 +120,10 @@ let register_name ty r =
     int_reg_name.(r - first_available_register.(0))
   | Float ->
     float_reg_name.(r - first_available_register.(1))
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
   | Float32 ->
     float32_reg_name.(r - first_available_register.(1))
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
+  | Vec128 | Valx2 ->
+    vec128_reg_name.(r - first_available_register.(1))

 (* Representation of hard registers by pseudo-registers *)

@@ -150,8 +141,10 @@ v
 let hard_int_reg = hard_reg_gen Int (Array.length int_reg_name)
 let hard_float_reg = hard_reg_gen Float (Array.length float_reg_name)
 let hard_float32_reg = hard_reg_gen Float32 (Array.length float32_reg_name)
+let hard_vec128_reg = hard_reg_gen Vec128 (Array.length vec128_reg_name)
+
 let all_phys_regs =
-  Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; ]
+  Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; hard_vec128_reg; ]

 let precolored_regs =
   let phys_regs = Reg.set_of_array all_phys_regs in
@@ -161,13 +154,8 @@ let phys_reg ty n =
   match (ty : Cmm.machtype_component) with
   | Int | Addr | Val -> hard_int_reg.(n)
   | Float -> hard_float_reg.(n - 100)
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float32 -> hard_float32_reg.(n - 100)
+  | Vec128 | Valx2 -> hard_vec128_reg.(n - 100)

 let gc_regs_offset _ =
   fatal_error "arm64: gc_reg_offset unreachable"
@@ -204,6 +192,8 @@ let loc_float_gen kind size last_float make_stack float ofs =
 let loc_float = loc_float_gen Float Arch.size_float
 (* float32 slots still take up a full word *)
 let loc_float32 = loc_float_gen Float32 Arch.size_float
+let loc_vec128 = loc_float_gen Vec128 Arch.size_vec128
+
 let loc_int32 last_int make_stack int ofs =
   if !int <= last_int then begin
     let l = phys_reg Int !int in
@@ -227,14 +217,13 @@ let calling_conventions
       | Float ->
         loc.(i) <- loc_float last_float make_stack float ofs
       | Vec128 ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got vec128 register"
+        loc.(i) <- loc_vec128 last_float make_stack float ofs
       | Float32 ->
         loc.(i) <- loc_float32 last_float make_stack float ofs
       | Valx2 ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got valx2 register"
+        Misc.fatal_error "Unexpected machtype_component Valx2"
   done;
+  (* CR mslater: (SIMD) will need to be 32/64 if vec256/512 are used. *)
   (loc, Misc.align (max 0 !ofs) 16)  (* keep stack 16-aligned *)

 let incoming ofs =
@@ -297,8 +286,7 @@ let external_calling_conventions
         | XFloat ->
           loc.(i) <- [| loc_float last_float make_stack float ofs |]
         | XVec128 ->
-          (* CR mslater: (SIMD) arm64 *)
-          fatal_error "arm64: got vec128 register"
+          loc.(i) <- [| loc_vec128 last_float make_stack float ofs |]
        | XFloat32 ->
          loc.(i) <- [| loc_float32 last_float make_stack float ofs |]
      end)
@@ -356,11 +344,12 @@ let destroyed_at_c_noalloc_call =
     Array.map (phys_reg Int) int_regs_destroyed_at_c_noalloc_call;
     Array.map (phys_reg Float) float_regs_destroyed_at_c_noalloc_call;
     Array.map (phys_reg Float32) float_regs_destroyed_at_c_noalloc_call;
+    Array.map (phys_reg Vec128) float_regs_destroyed_at_c_noalloc_call;
   ]

 (* CSE needs to know that all versions of neon are destroyed. *)
 let destroy_neon_reg n =
-  [| phys_reg Float (100 + n); phys_reg Float32 (100 + n); |]
+  [| phys_reg Float (100 + n); phys_reg Float32 (100 + n); phys_reg Vec128 (100 + n); |]

 let destroy_neon_reg7 = destroy_neon_reg 7

@@ -461,6 +450,7 @@ let is_destruction_point ~(more_destruction_points : bool) (terminator : Cfg_int
 let initial_stack_offset ~num_stack_slots ~contains_calls =
   (8 * num_stack_slots.(0))
   + (8 * num_stack_slots.(1))
+  + (16 * num_stack_slots.(2))
   + if contains_calls then 8 else 0

 let trap_frame_size_in_bytes = 16
@@ -497,9 +487,12 @@ let slot_offset (loc : Reg.stack_location) ~stack_class ~stack_offset
   | Local n ->
     let offset =
       stack_offset
+      (* Preserves original ordering: int below float. *)
       + (match stack_class with
-        | 0 -> n * 8
-        | 1 -> fun_num_stack_slots.(0) * 8 + n * 8
+        | 2 -> n * 16
+        | 0 -> fun_num_stack_slots.(2) * 16 + n * 8
+        | 1 -> fun_num_stack_slots.(2) * 16
+               + fun_num_stack_slots.(0) * 8 + n * 8
        | _ -> Misc.fatal_errorf "Unknown stack class %d" stack_class)
     in
     Bytes_relative_to_stack_pointer offset
@@ -521,9 +514,8 @@ let assemble_file infile outfile =
 let init () = ()

 let operation_supported : Cmm.operation -> bool = function
+  | Cprefetch _ | Catomic _ -> false
   | Cpopcnt
-  | Cprefetch _ | Catomic _
-    (* CR mslater: (float32) arm64 *)
   | Cnegf Float32 | Cabsf Float32 | Caddf Float32 | Csubf Float32
   | Cmulf Float32 | Cdivf Float32 | Cpackf32
@@ -533,7 +525,6 @@ let operation_supported : Cmm.operation -> bool = function
   | Cstatic_cast (Float_of_float32 | Float32_of_float |
                   Int_of_float Float32 | Float_of_int Float32 |
                   V128_of_scalar _ | Scalar_of_v128 _)
-    -> false   (* Not implemented *)
   | Cclz _ | Cctz _ | Cbswap _
   | Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
   | Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
diff --git a/backend/arm64/selection_utils.ml b/backend/arm64/selection_utils.ml
index 712c6806c0a..6295efc9b81 100644
--- a/backend/arm64/selection_utils.ml
+++ b/backend/arm64/selection_utils.ml
@@ -36,8 +36,7 @@ let is_offset chunk n =
     n land 3 = 0 && n lsr 2 < 0x1000
   | Word_int | Word_val | Double ->
     n land 7 = 0 && n lsr 3 < 0x1000
   | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "arm64: got 128 bit memory chunk"
+    n land 15 = 0 && n lsr 4 < 0x1000

 let is_logical_immediate_int n =
   Arch.is_logical_immediate (Nativeint.of_int n)
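
As context for the `Selection_utils.is_offset` and `emit_stores` changes above: AArch64 `ldr`/`str` with an unsigned immediate offset requires the byte offset to be a multiple of the access size and, once scaled, to fit in 12 bits. That is what the new `Onetwentyeight_*` case encodes, and what the overridden `emit_stores` works around by rebasing the address when an initializing store's offset falls out of range. The sketch below is standalone OCaml illustrating that rule only; it is not part of the patch, and the names `fits_unsigned_scaled_imm12` and `rebase_if_needed` are illustrative.

```ocaml
(* Sketch only: AArch64 LDR/STR (immediate, unsigned offset) needs the byte
   offset to be a multiple of the access size and, once scaled, to fit in
   12 bits.  [access_size] is the access size in bytes: 4, 8 or 16. *)
let fits_unsigned_scaled_imm12 ~access_size offset =
  offset >= 0
  && offset land (access_size - 1) = 0
  && offset / access_size < 0x1000

(* If an offset is out of range, fold it into the base and restart from
   offset 0, mirroring what the overridden [emit_stores] does with a
   temporary register. *)
let rebase_if_needed ~access_size base offset =
  if fits_unsigned_scaled_imm12 ~access_size offset
  then (base, offset)
  else (base + offset, 0)

let () =
  (* A 16-byte access at byte offset 65536 does not fit (65536 / 16 = 0x1000),
     so the base is advanced and the offset reset. *)
  assert (not (fits_unsigned_scaled_imm12 ~access_size:16 65536));
  assert (rebase_if_needed ~access_size:16 0x1000 65536 = (0x11000, 0));
  assert (fits_unsigned_scaled_imm12 ~access_size:16 65520)
```

For a 16-byte chunk this is the same check as `n land 15 = 0 && n lsr 4 < 0x1000` in the patched `is_offset`.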