Skip to content

Arm64: Vec128 support #3709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions backend/arm64/arch.ml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ let size_float = 8

(* Size in bytes of a 128-bit vector value. *)
let size_vec128 = 16

(* NOTE(review): this binding appears twice in a row with the same value; the
   second shadows the first, so the repetition has no effect on behaviour. *)
let allow_unaligned_access = true
let allow_unaligned_access = true

(* Behavior of division *)

Expand All @@ -93,10 +93,8 @@ let division_crashes_on_overflow = false

let identity_addressing = Iindexed 0

(* [offset_addressing addr delta] rebuilds [addr] with its integer displacement
   increased by [delta]. The constructor, and the first component of [Ibased],
   are carried over unchanged. *)
let offset_addressing addr delta =
  let shifted disp = disp + delta in
  match addr with
  | Iindexed disp -> Iindexed (shifted disp)
  | Ibased (sym, disp) -> Ibased (sym, shifted disp)
(* Offsetting an addressing mode is not supported here: both arguments are
   ignored and the function unconditionally calls [Misc.fatal_error] with a
   fixed message. *)
let offset_addressing _addr _delta =
Misc.fatal_error "Arch.offset_addressing not supported"

let num_args_addressing = function
| Iindexed _ -> 1
Expand Down
46 changes: 46 additions & 0 deletions backend/arm64/cfg_selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,52 @@ class selector =
(* Other operations are regular *)
| _ -> super#select_operation op args dbg ~label_after

(* Emits one store instruction for every register produced by evaluating each
   expression of [data], in order. Stores are addressed relative to the single
   register in [regs_addr], at a running byte offset that starts at
   [-Arch.size_int] and advances by the size of each stored component.
   NOTE(review): the initial negative offset presumably targets the word just
   before the address held in [regs_addr] (e.g. a block header) — confirm
   against the caller. *)
method! emit_stores env dbg data regs_addr =
let offset = ref (-Arch.size_int) in
let base =
(* Exactly one address register is expected. *)
assert (Array.length regs_addr = 1);
ref regs_addr
in
List.iter
(fun arg ->
match self#emit_expr env arg ~bound_name:None with
(* An expression that yields no registers is treated as impossible here. *)
| None -> assert false
| Some regs ->
for i = 0 to Array.length regs - 1 do
let r = regs.(i) in
(* Choose the memory chunk kind of the store from the register's type. *)
let kind =
match r.Reg.typ with
| Float -> Double
| Float32 -> Single { reg = Float32 }
| Vec128 ->
(* 128-bit memory operations are unaligned by default. Aligned
(big)array operations are handled separately via cmm. *)
Onetwentyeight_unaligned
| Val | Addr | Int -> Word_val
| Valx2 ->
Misc.fatal_error "Unexpected machtype_component Valx2"
in
(* If [Selection_utils.is_offset] rejects the current offset for this chunk
   kind, compute base + offset into a fresh integer register, make that the
   new base and restart the running offset at 0. *)
if not (Selection_utils.is_offset kind !offset)
then (
let tmp = self#regs_for typ_int in
(* tmp <- constant offset *)
self#insert_debug env
(self#lift_op
(self#make_const_int (Nativeint.of_int !offset)))
dbg [||] tmp;
(* tmp <- base + tmp *)
self#insert_debug env
(self#lift_op (Operation.Intop Iadd))
dbg (Array.append !base tmp) tmp;
base := tmp;
offset := 0);
(* The store itself: operands are the value register followed by the current
   base register; it produces no result registers. *)
self#insert_debug env
(self#make_store kind (Iindexed !offset) false)
dbg
(Array.append [| r |] !base)
[||];
(* Advance past the component that was just stored. *)
offset := !offset + Select_utils.size_component r.Reg.typ
done)
data

method! insert_move_extcall_arg env ty_arg src dst =
let ty_arg_is_int32 =
match ty_arg with
Expand Down
74 changes: 47 additions & 27 deletions backend/arm64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -383,29 +383,42 @@ let function_name = ref ""
let tailrec_entry_point = ref None
(* Pending floating-point and 128-bit vector literals. Each entry pairs the
   literal's bit pattern with the label it is emitted under. *)
let float_literals = ref ([] : (int64 * label) list)
let vec128_literals = ref ([] : (Cmm.vec128_bits * label) list)

(* Label a floating-point literal *)
let float_literal f =
let add_literal p f =
try
List.assoc f !float_literals
List.assoc f !p
with Not_found ->
let lbl = Cmm.new_label() in
float_literals := (f, lbl) :: !float_literals;
p := (f, lbl) :: !p;
lbl

(* Label lookup for the two literal pools. Each function forwards to
   [add_literal] with its own pending list: 64-bit float bit patterns for
   [float_literal], 128-bit vector bit patterns for [vec128_literal]. *)
let float_literal bits = add_literal float_literals bits
let vec128_literal bits = add_literal vec128_literals bits

(* Emit all pending literals *)
let emit_literals() =
if !float_literals <> [] then begin
let emit_literals p align emit_literal =
if !p <> [] then begin
if macosx then
` .section __TEXT,__literal8,8byte_literals\n`;
` .align 3\n`;
List.iter
(fun (f, lbl) ->
`{emit_label lbl}:`; emit_float64_directive ".quad" f)
!float_literals;
float_literals := []
` .section __TEXT,__literal{emit_int align},{emit_int align}byte_literals\n`;
` .balign {emit_int align}\n`;
List.iter emit_literal !p;
p := []
end

(* Emits one pending float literal: its label, then the 64-bit pattern as a
   [.quad] directive. *)
let emit_float_literal (bits, label) =
  `{emit_label label}:`;
  emit_float64_directive ".quad" bits

(* Emits one pending 128-bit literal: its label on a line of its own, then the
   low 64-bit half followed by the high 64-bit half, each as a [.quad]
   directive. *)
let emit_vec128_literal ((bits : Cmm.vec128_bits), label) =
  `{emit_label label}:\n`;
  emit_float64_directive ".quad" bits.Cmm.low;
  emit_float64_directive ".quad" bits.Cmm.high

(* Emit all pending literals: the float literals first, then the 128-bit vector
   literals. Each call passes the element size of its list as the alignment
   argument. The calls in the body refer to the earlier three-argument
   [emit_literals], which this zero-argument definition shadows from here on. *)
let emit_literals () =
emit_literals float_literals size_float emit_float_literal;
emit_literals vec128_literals size_vec128 emit_vec128_literal

(* Emit code to load the address of a symbol *)

let emit_load_symbol_addr dst s =
Expand Down Expand Up @@ -710,10 +723,8 @@ module BR = Branch_relaxation.Make (struct
num_instructions_for_intconst n
| Lop (Const_float32 _) -> 2
| Lop (Const_float _) -> 2
| Lop (Const_vec128 _) -> 2
| Lop (Const_symbol _) -> 2
| Lop (Const_vec128 _) ->
(* CR mslater: (SIMD) arm64 *)
Misc.fatal_error "SIMD is not supported on this architecture"
| Lop (Intop_atomic _) ->
(* Never generated; builtins are not yet translated to atomics *)
assert false
Expand Down Expand Up @@ -866,7 +877,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
let domain_local_top_offset = DS.(idx_of_field Domain_local_top) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
emit_subimm r r n;
` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
let lbl_call = Cmm.new_label () in
Expand All @@ -893,7 +904,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
emit_subimm reg_alloc_ptr reg_alloc_ptr n;
` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
if not far then begin
` b.lo {emit_label lbl_call_gc}\n`
Expand Down Expand Up @@ -1064,9 +1075,16 @@ let emit_instr i =
let lbl = float_literal f in
emit_load_literal i.res.(0) lbl
end
| Lop(Const_vec128 _) ->
(* CR mslater: (SIMD) arm64 *)
Misc.fatal_error "SIMD is not supported on this architecture"
| Lop(Const_vec128 ({high; low} as l)) ->
DSL.check_reg Vec128 i.res.(0);
begin match (high, low) with
| 0x0000_0000_0000_0000L, 0x0000_0000_0000_0000L ->
let dst = DSL.emit_reg_v2d i.res.(0) in
DSL.ins I.MOVI [| dst; DSL.imm 0 |]
| _ ->
let lbl = vec128_literal l in
emit_load_literal i.res.(0) lbl
end
| Lop(Const_symbol s) ->
emit_load_symbol_addr i.res.(0) s.sym_name
| Lcall_op(Lcall_ind) ->
Expand Down Expand Up @@ -1157,8 +1175,9 @@ let emit_instr i =
DSL.check_reg Float32 dst;
` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
| Onetwentyeight_aligned | Onetwentyeight_unaligned ->
(* CR mslater: (SIMD) arm64 *)
fatal_error "arm64: got 128 bit memory chunk"
(* CR gyorsh: check alignment *)
DSL.check_reg Vec128 dst;
` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
end
| Lop(Store(size, addr, assignment)) ->
(* NB: assignments other than Word_int and Word_val do not follow the
Expand Down Expand Up @@ -1192,8 +1211,9 @@ let emit_instr i =
DSL.check_reg Float32 src;
` str {emit_reg src}, {emit_addressing addr base}\n`
| Onetwentyeight_aligned | Onetwentyeight_unaligned ->
(* CR mslater: (SIMD) arm64 *)
fatal_error "arm64: got 128 bit memory chunk"
(* CR gyorsh: check alignment *)
DSL.check_reg Vec128 src;
` str {emit_reg src}, {emit_addressing addr base}\n`
end
| Lop(Alloc { bytes = n; dbginfo; mode = Heap }) ->
assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
Expand Down Expand Up @@ -1548,9 +1568,9 @@ let emit_item (d : Cmm.data_item) =
| Cint n -> ` .quad {emit_nativeint n}\n`
| Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
| Cdouble f -> emit_float64_directive ".quad" (Int64.bits_of_float f)
| Cvec128 _ ->
(* CR mslater: (SIMD) arm64 *)
Misc.fatal_error "SIMD is not supported on this architecture"
| Cvec128 { high; low; } ->
emit_float64_directive ".quad" low;
emit_float64_directive ".quad" high;
| Csymbol_address s -> ` .quad {emit_symbol s.sym_name}\n`
| Csymbol_offset (s, o) -> ` .quad {emit_symbol s.sym_name}+{emit_int o}\n`
| Cstring s -> emit_string_directive " .ascii " s
Expand Down
Loading