diff --git a/backend/arm64/arch.ml b/backend/arm64/arch.ml
index e0fb3cc4b74..5153a2f70d5 100644
--- a/backend/arm64/arch.ml
+++ b/backend/arm64/arch.ml
@@ -83,7 +83,7 @@ let size_float = 8
 let size_vec128 = 16

-let allow_unaligned_access = true
+let allow_unaligned_access = true

 (* Behavior of division *)

@@ -93,10 +93,9 @@ let division_crashes_on_overflow = false

 let identity_addressing = Iindexed 0

-let offset_addressing addr delta =
-  match addr with
-  | Iindexed n -> Iindexed(n + delta)
-  | Ibased(s, n) -> Ibased(s, n + delta)
+let offset_addressing _addr _delta =
+  (* Resulting offset might not be representable. *)
+  Misc.fatal_error "Arch.offset_addressing not supported"

 let num_args_addressing = function
   | Iindexed _ -> 1
diff --git a/backend/arm64/cfg_selection.ml b/backend/arm64/cfg_selection.ml
index 5d744944f24..cf99fead37d 100644
--- a/backend/arm64/cfg_selection.ml
+++ b/backend/arm64/cfg_selection.ml
@@ -173,6 +173,56 @@ class selector =
       (* Other operations are regular *)
       | _ -> super#select_operation op args dbg ~label_after

+    method! emit_stores env dbg data regs_addr =
+      (* Override [emit_stores] to ensure that addressing mode always uses a
+         legal offset. *)
+      let offset = ref (-Arch.size_int) in
+      let base =
+        assert (Array.length regs_addr = 1);
+        ref regs_addr
+      in
+      List.iter
+        (fun arg ->
+          match self#emit_expr env arg ~bound_name:None with
+          | None -> assert false
+          | Some regs ->
+            for i = 0 to Array.length regs - 1 do
+              let r = regs.(i) in
+              let kind =
+                match r.Reg.typ with
+                | Float -> Double
+                | Float32 -> Single { reg = Float32 }
+                | Vec128 ->
+                  (* 128-bit memory operations are default unaligned. Aligned
+                     (big)array operations are handled separately via cmm. *)
+                  Onetwentyeight_unaligned
+                | Val | Addr | Int -> Word_val
+                | Valx2 ->
+                  Misc.fatal_error "Unexpected machtype_component Valx2"
+              in
+              if not (Selection_utils.is_offset kind !offset)
+              then (
+                (* Use a temporary to store the address [!base + offset]. *)
+                let tmp = self#regs_for typ_int in
+                self#insert_debug env
+                  (self#lift_op
+                     (self#make_const_int (Nativeint.of_int !offset)))
+                  dbg [||] tmp;
+                self#insert_debug env
+                  (self#lift_op (Operation.Intop Iadd))
+                  dbg (Array.append !base tmp) tmp;
+                (* Use the temporary as the new base address. *)
+                base := tmp;
+                offset := 0);
+              self#insert_debug env
+                (self#make_store kind (Iindexed !offset) false)
+                dbg
+                (Array.append [| r |] !base)
+                [||];
+              offset := !offset + Select_utils.size_component r.Reg.typ
+            done)
+        data
+
     method! insert_move_extcall_arg env ty_arg src dst =
       let ty_arg_is_int32 =
         match ty_arg with
diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
index f57c1217fc6..c909aa8dbaa 100644
--- a/backend/arm64/emit.mlp
+++ b/backend/arm64/emit.mlp
@@ -383,29 +383,42 @@ let function_name = ref ""
 let tailrec_entry_point = ref None
 (* Pending floating-point literals *)
 let float_literals = ref ([] : (int64 * label) list)
+let vec128_literals = ref ([] : (Cmm.vec128_bits * label) list)

 (* Label a floating-point literal *)
-let float_literal f =
+let add_literal p f =
   try
-    List.assoc f !float_literals
+    List.assoc f !p
   with Not_found ->
     let lbl = Cmm.new_label() in
-    float_literals := (f, lbl) :: !float_literals;
+    p := (f, lbl) :: !p;
     lbl

+let float_literal f = add_literal float_literals f
+let vec128_literal f = add_literal vec128_literals f
+
 (* Emit all pending literals *)
-let emit_literals() =
-  if !float_literals <> [] then begin
+let emit_literals p align emit_literal =
+  if !p <> [] then begin
     if macosx then
-      `	.section	__TEXT,__literal8,8byte_literals\n`;
-    `	.align	3\n`;
-    List.iter
-      (fun (f, lbl) ->
-        `{emit_label lbl}:`; emit_float64_directive ".quad" f)
-      !float_literals;
-    float_literals := []
+      `	.section	__TEXT,__literal{emit_int align},{emit_int align}byte_literals\n`;
+    `	.balign	{emit_int align}\n`;
+    List.iter emit_literal !p;
+    p := []
   end

+let emit_float_literal (f, lbl) =
+  `{emit_label lbl}:`; emit_float64_directive ".quad" f
+
+let emit_vec128_literal (({ high; low; } : Cmm.vec128_bits), lbl) =
+  `{emit_label lbl}:\n`;
+  emit_float64_directive ".quad" low;
+  emit_float64_directive ".quad" high
+
+let emit_literals () =
+  emit_literals float_literals size_float emit_float_literal;
+  emit_literals vec128_literals size_vec128 emit_vec128_literal
+
 (* Emit code to load the address of a symbol *)
 let emit_load_symbol_addr dst s =
@@ -710,10 +723,8 @@ module BR = Branch_relaxation.Make (struct
       num_instructions_for_intconst n
     | Lop (Const_float32 _) -> 2
     | Lop (Const_float _) -> 2
+    | Lop (Const_vec128 _) -> 2
     | Lop (Const_symbol _) -> 2
-    | Lop (Const_vec128 _) ->
-      (* CR mslater: (SIMD) arm64 *)
-      Misc.fatal_error "SIMD is not supported on this architecture"
     | Lop (Intop_atomic _) ->
       (* Never generated; builtins are not yet translated to atomics *)
       assert false
@@ -866,7 +877,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
     let domain_local_top_offset = DS.(idx_of_field Domain_local_top) * 8 in
     ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_limit_offset}]\n`;
     ` ldr {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
-    ` sub {emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
+    emit_subimm r r n;
     ` str {emit_reg r}, [{emit_reg reg_domain_state_ptr}, #{emit_int domain_local_sp_offset}]\n`;
     ` cmp {emit_reg r}, {emit_reg reg_tmp1}\n`;
     let lbl_call = Cmm.new_label () in
@@ -893,7 +904,7 @@ let assembly_code_for_allocation i ~local ~n ~far ~dbginfo =
     assert (16 <= n && n < 0x1_000 && n land 0x7 = 0);
     let offset = Domainstate.(idx_of_field Domain_young_limit) * 8 in
     ` ldr {emit_reg reg_tmp1}, [{emit_reg reg_domain_state_ptr}, #{emit_int offset}]\n`;
-    ` sub {emit_reg reg_alloc_ptr}, {emit_reg reg_alloc_ptr}, #{emit_int n}\n`;
+    emit_subimm reg_alloc_ptr reg_alloc_ptr n;
     ` cmp {emit_reg reg_alloc_ptr}, {emit_reg reg_tmp1}\n`;
     if not far then begin
       ` b.lo {emit_label lbl_call_gc}\n`
@@ -1064,9 +1075,16 @@ let emit_instr i =
         let lbl = float_literal f in
         emit_load_literal i.res.(0) lbl
       end
-  | Lop(Const_vec128 _) ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "SIMD is not supported on this architecture"
+  | Lop(Const_vec128 ({high; low} as l)) ->
+    DSL.check_reg Vec128 i.res.(0);
+    begin match (high, low) with
+    | 0x0000_0000_0000_0000L, 0x0000_0000_0000_0000L ->
+      let dst = DSL.emit_reg_v2d i.res.(0) in
+      DSL.ins I.MOVI [| dst; DSL.imm 0 |]
+    | _ ->
+      let lbl = vec128_literal l in
+      emit_load_literal i.res.(0) lbl
+    end
   | Lop(Const_symbol s) ->
       emit_load_symbol_addr i.res.(0) s.sym_name
   | Lcall_op(Lcall_ind) ->
@@ -1157,8 +1175,9 @@
         DSL.check_reg Float32 dst;
         ` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
       | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got 128 bit memory chunk"
+        (* CR gyorsh: check alignment *)
+        DSL.check_reg Vec128 dst;
+        ` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
       end
   | Lop(Store(size, addr, assignment)) ->
     (* NB: assignments other than Word_int and Word_val do not follow the
@@ -1192,8 +1211,9 @@
         DSL.check_reg Float32 src;
         ` str {emit_reg src}, {emit_addressing addr base}\n`
      | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got 128 bit memory chunk"
+        (* CR gyorsh: check alignment *)
+        DSL.check_reg Vec128 src;
+        ` str {emit_reg src}, {emit_addressing addr base}\n`
      end
   | Lop(Alloc { bytes = n; dbginfo; mode = Heap }) ->
     assembly_code_for_allocation i ~n ~local:false ~far:false ~dbginfo
@@ -1548,9 +1568,9 @@ let emit_item (d : Cmm.data_item) =
   | Cint n -> ` .quad {emit_nativeint n}\n`
   | Csingle f -> emit_float32_directive ".long" (Int32.bits_of_float f)
   | Cdouble f -> emit_float64_directive ".quad" (Int64.bits_of_float f)
-  | Cvec128 _ ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "SIMD is not supported on this architecture"
+  | Cvec128 { high; low; } ->
+    emit_float64_directive ".quad" low;
+    emit_float64_directive ".quad" high;
   | Csymbol_address s -> ` .quad {emit_symbol s.sym_name}\n`
   | Csymbol_offset (s, o) -> ` .quad {emit_symbol s.sym_name}+{emit_int o}\n`
   | Cstring s -> emit_string_directive " .ascii " s
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 751404eb975..1b4c4c09d26 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -65,52 +65,47 @@ let float32_reg_name =
      "s16"; "s17"; "s18"; "s19"; "s20"; "s21"; "s22"; "s23";
      "s24"; "s25"; "s26"; "s27"; "s28"; "s29"; "s30"; "s31" |]

+let vec128_reg_name =
+  [| "q0"; "q1"; "q2"; "q3"; "q4"; "q5"; "q6"; "q7";
+     "q8"; "q9"; "q10"; "q11"; "q12"; "q13"; "q14"; "q15";
+     "q16"; "q17"; "q18"; "q19"; "q20"; "q21"; "q22"; "q23";
+     "q24"; "q25"; "q26"; "q27"; "q28"; "q29"; "q30"; "q31" |]
+
 let num_register_classes = 2

 let register_class_of_machtype_component typ =
   match (typ : Cmm.machtype_component) with
   | Val | Int | Addr -> 0
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float | Float32 -> 1
+  | Vec128 -> 1
+  | Valx2 -> 1

 let register_class r = register_class_of_machtype_component r.typ

-let num_stack_slot_classes = 2
+let num_stack_slot_classes = 3

 let stack_slot_class typ =
   match (typ : Cmm.machtype_component) with
   | Val | Int | Addr -> 0
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float | Float32 -> 1
+  | Vec128 -> 2
+  | Valx2 -> 2

 let types_are_compatible left right =
   match left.typ, right.typ with
   | (Int | Val | Addr), (Int | Val | Addr)
   | Float, Float -> true
   | Float32, Float32 -> true
-  | Vec128, _ | _, Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2, _ | _, Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
-  | (Int | Val | Addr | Float | Float32), _ -> false
+  | Vec128, Vec128 -> true
+  | Valx2,Valx2 -> true
+  | (Int | Val | Addr | Float | Float32 | Vec128 | Valx2), _ -> false

 let stack_class_tag c =
   match c with
   | 0 -> "i"
   | 1 -> "f"
+  | 2 -> "x"
   | c -> Misc.fatal_errorf "Unspecified stack slot class %d" c

 let num_available_registers =
@@ -125,14 +120,10 @@ let register_name ty r =
     int_reg_name.(r - first_available_register.(0))
   | Float ->
     float_reg_name.(r - first_available_register.(1))
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
   | Float32 ->
     float32_reg_name.(r - first_available_register.(1))
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
+  | Vec128 | Valx2 ->
+    vec128_reg_name.(r - first_available_register.(1))

 (* Representation of hard registers by pseudo-registers *)

@@ -150,8 +141,10 @@ v
 let hard_int_reg = hard_reg_gen Int (Array.length int_reg_name)
 let hard_float_reg = hard_reg_gen Float (Array.length float_reg_name)
 let hard_float32_reg = hard_reg_gen Float32 (Array.length float32_reg_name)
+let hard_vec128_reg = hard_reg_gen Vec128 (Array.length vec128_reg_name)
+
 let all_phys_regs =
-  Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; ]
+  Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; hard_vec128_reg; ]

 let precolored_regs =
   let phys_regs = Reg.set_of_array all_phys_regs in
@@ -161,13 +154,8 @@ let phys_reg ty n =
   match (ty : Cmm.machtype_component) with
   | Int | Addr | Val -> hard_int_reg.(n)
   | Float -> hard_float_reg.(n - 100)
-  | Vec128 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got vec128 register"
-  | Valx2 ->
-    (* CR mslater: (SIMD) arm64 *)
-    fatal_error "arm64: got valx2 register"
   | Float32 -> hard_float32_reg.(n - 100)
+  | Vec128 | Valx2 -> hard_vec128_reg.(n - 100)

 let gc_regs_offset _ =
   fatal_error "arm64: gc_reg_offset unreachable"
@@ -204,6 +192,8 @@ let loc_float_gen kind size last_float make_stack float ofs =
 let loc_float = loc_float_gen Float Arch.size_float
 (* float32 slots still take up a full word *)
 let loc_float32 = loc_float_gen Float32 Arch.size_float
+let loc_vec128 = loc_float_gen Vec128 Arch.size_vec128
+
 let loc_int32 last_int make_stack int ofs =
   if !int <= last_int then begin
     let l = phys_reg Int !int in
@@ -227,14 +217,13 @@ let calling_conventions
       | Float ->
         loc.(i) <- loc_float last_float make_stack float ofs
       | Vec128 ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got vec128 register"
+        loc.(i) <- loc_vec128 last_float make_stack float ofs
       | Float32 ->
         loc.(i) <- loc_float32 last_float make_stack float ofs
       | Valx2 ->
-        (* CR mslater: (SIMD) arm64 *)
-        fatal_error "arm64: got valx2 register"
+        Misc.fatal_error "Unexpected machtype_component Valx2"
   done;
+  (* CR mslater: (SIMD) will need to be 32/64 if vec256/512 are used. *)
   (loc, Misc.align (max 0 !ofs) 16)  (* keep stack 16-aligned *)

 let incoming ofs =
@@ -297,8 +286,7 @@ let external_calling_conventions
         | XFloat ->
           loc.(i) <- [| loc_float last_float make_stack float ofs |]
         | XVec128 ->
-          (* CR mslater: (SIMD) arm64 *)
-          fatal_error "arm64: got vec128 register"
+          loc.(i) <- [| loc_vec128 last_float make_stack float ofs |]
        | XFloat32 ->
          loc.(i) <- [| loc_float32 last_float make_stack float ofs |]
      end)
@@ -356,11 +344,12 @@ let destroyed_at_c_noalloc_call =
     Array.map (phys_reg Int) int_regs_destroyed_at_c_noalloc_call;
     Array.map (phys_reg Float) float_regs_destroyed_at_c_noalloc_call;
     Array.map (phys_reg Float32) float_regs_destroyed_at_c_noalloc_call;
+    Array.map (phys_reg Vec128) float_regs_destroyed_at_c_noalloc_call;
   ]

 (* CSE needs to know that all versions of neon are destroyed. *)
 let destroy_neon_reg n =
-  [| phys_reg Float (100 + n); phys_reg Float32 (100 + n); |]
+  [| phys_reg Float (100 + n); phys_reg Float32 (100 + n); phys_reg Vec128 (100 + n); |]

 let destroy_neon_reg7 = destroy_neon_reg 7

@@ -461,6 +450,7 @@ let is_destruction_point ~(more_destruction_points : bool) (terminator : Cfg_int
 let initial_stack_offset ~num_stack_slots ~contains_calls =
   (8 * num_stack_slots.(0))
   + (8 * num_stack_slots.(1))
+  + (16 * num_stack_slots.(2))
   + if contains_calls then 8 else 0

 let trap_frame_size_in_bytes = 16
@@ -497,9 +487,12 @@ let slot_offset (loc : Reg.stack_location) ~stack_class ~stack_offset
   | Local n ->
     let offset =
       stack_offset
+      (* Preserves original ordering: int below float. *)
       + (match stack_class with
-        | 0 -> n * 8
-        | 1 -> fun_num_stack_slots.(0) * 8 + n * 8
+        | 2 -> n * 16
+        | 0 -> fun_num_stack_slots.(2) * 16 + n * 8
+        | 1 -> fun_num_stack_slots.(2) * 16
+               + fun_num_stack_slots.(0) * 8 + n * 8
        | _ -> Misc.fatal_errorf "Unknown stack class %d" stack_class)
     in
     Bytes_relative_to_stack_pointer offset
@@ -521,9 +514,8 @@ let assemble_file infile outfile =
 let init () = ()

 let operation_supported : Cmm.operation -> bool = function
+  | Cprefetch _ | Catomic _ -> false
   | Cpopcnt
-  | Cprefetch _ | Catomic _
-    (* CR mslater: (float32) arm64 *)
   | Cnegf Float32 | Cabsf Float32 | Caddf Float32 | Csubf Float32
   | Cmulf Float32 | Cdivf Float32 | Cpackf32
@@ -533,7 +525,6 @@ let operation_supported : Cmm.operation -> bool = function
   | Cstatic_cast (Float_of_float32 | Float32_of_float |
                   Int_of_float Float32 | Float_of_int Float32 |
                   V128_of_scalar _ | Scalar_of_v128 _)
-    -> false   (* Not implemented *)
   | Cclz _ | Cctz _ | Cbswap _
   | Capply _ | Cextcall _ | Cload _ | Calloc _ | Cstore _
   | Caddi | Csubi | Cmuli | Cmulhi _ | Cdivi | Cmodi
diff --git a/backend/arm64/selection_utils.ml b/backend/arm64/selection_utils.ml
index 712c6806c0a..6295efc9b81 100644
--- a/backend/arm64/selection_utils.ml
+++ b/backend/arm64/selection_utils.ml
@@ -36,8 +36,7 @@ let is_offset chunk n =
     n land 3 = 0 && n lsr 2 < 0x1000
   | Word_int | Word_val | Double ->
     n land 7 = 0 && n lsr 3 < 0x1000
   | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
-    (* CR mslater: (SIMD) arm64 *)
-    Misc.fatal_error "arm64: got 128 bit memory chunk"
+    n land 15 = 0 && n lsr 4 < 0x1000

 let is_logical_immediate_int n =
   Arch.is_logical_immediate (Nativeint.of_int n)
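
As context for the `Selection_utils.is_offset` and `emit_stores` changes above: AArch64 `ldr`/`str` with an unsigned immediate offset requires the byte offset to be a multiple of the access size and, once scaled, to fit in 12 bits. That is what the new `Onetwentyeight_*` case encodes, and what the overridden `emit_stores` works around by rebasing the address when an initializing store's offset falls out of range. The sketch below is standalone OCaml illustrating that rule only; it is not part of the patch, and the names `fits_unsigned_scaled_imm12` and `rebase_if_needed` are illustrative.

```ocaml
(* Sketch only: AArch64 LDR/STR (immediate, unsigned offset) needs the byte
   offset to be a multiple of the access size and, once scaled, to fit in
   12 bits.  [access_size] is the access size in bytes: 4, 8 or 16. *)
let fits_unsigned_scaled_imm12 ~access_size offset =
  offset >= 0
  && offset land (access_size - 1) = 0
  && offset / access_size < 0x1000

(* If an offset is out of range, fold it into the base and restart from
   offset 0, mirroring what the overridden [emit_stores] does with a
   temporary register. *)
let rebase_if_needed ~access_size base offset =
  if fits_unsigned_scaled_imm12 ~access_size offset
  then (base, offset)
  else (base + offset, 0)

let () =
  (* A 16-byte access at byte offset 65536 does not fit (65536 / 16 = 0x1000),
     so the base is advanced and the offset reset. *)
  assert (not (fits_unsigned_scaled_imm12 ~access_size:16 65536));
  assert (rebase_if_needed ~access_size:16 0x1000 65536 = (0x11000, 0));
  assert (fits_unsigned_scaled_imm12 ~access_size:16 65520)
```

For a 16-byte chunk this is the same check as `n land 15 = 0 && n lsr 4 < 0x1000` in the patched `is_offset`.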