From 6883d5b3fddd1316a5b752f29640b2d9ea3c352e Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Tue, 18 Mar 2025 16:36:56 +0000
Subject: [PATCH 1/4] arm64 float32 regs

---
 backend/arm64/cfg_selection.ml   |  3 +-
 backend/arm64/proc.ml            | 91 +++++++++++++++++---------------
 backend/arm64/selection_utils.ml |  6 +--
 3 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/backend/arm64/cfg_selection.ml b/backend/arm64/cfg_selection.ml
index 89de872bc44..5d744944f24 100644
--- a/backend/arm64/cfg_selection.ml
+++ b/backend/arm64/cfg_selection.ml
@@ -159,8 +159,9 @@ class selector =
         | [Cop (Cmulf Float64, args, _); arg] ->
           specific Inegmulsubf, arg :: args
         | _ -> super#select_operation op args dbg ~label_after)
+      | Cpackf32 -> specific (Isimd Zip1_f32), args
       (* Recognize floating-point square root *)
-      | Cextcall { func = "sqrt" } -> specific Isqrtf, args
+      | Cextcall { func = "sqrt" | "sqrtf" } -> specific Isqrtf, args
       | Cextcall { func; builtin = true; _ } -> (
         match Simd_selection.select_operation_cfg func args with
         | Some (op, args) -> Basic (Op op), args
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 0fb2c21c22e..e85a99f2512 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -59,53 +59,50 @@ let float_reg_name =
      "d16"; "d17"; "d18"; "d19"; "d20"; "d21"; "d22"; "d23";
      "d24"; "d25"; "d26"; "d27"; "d28"; "d29"; "d30"; "d31" |]
 
+let float32_reg_name =
+  [| "s0";  "s1";  "s2";  "s3";  "s4";  "s5";  "s6";  "s7";
+     "s8";  "s9";  "s10"; "s11"; "s12"; "s13"; "s14"; "s15";
+     "s16"; "s17"; "s18"; "s19"; "s20"; "s21"; "s22"; "s23";
+     "s24"; "s25"; "s26"; "s27"; "s28"; "s29"; "s30"; "s31" |]
+
 let num_register_classes = 2
 
 let register_class r =
   match (r.typ : Cmm.machtype_component) with
   | Val | Int | Addr  -> 0
-  | Float -> 1
   | Vec128 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got vec128 register"
-  | Float32 ->
-    (* CR mslater: (float32) arm64 *)
-    fatal_error "arm64: got float32 register"
   | Valx2 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
+  | Float | Float32 -> 1
 
 let num_stack_slot_classes = 2
 
 let stack_slot_class typ =
   match (typ : Cmm.machtype_component) with
   | Val | Int | Addr  -> 0
-  | Float -> 1
   | Vec128 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got vec128 register"
-  | Float32 ->
-    (* CR mslater: (float32) arm64 *)
-    fatal_error "arm64: got float32 register"
   | Valx2 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
+  | Float | Float32 -> 1
 
 let types_are_compatible left right =
   match left.typ, right.typ with
   | (Int | Val | Addr), (Int | Val | Addr)
-  | Float, Float ->
-    true
-  | Float32, _ | _, Float32 ->
-    (* CR mslater: (float32) arm64 *)
-    fatal_error "arm64: got float32 register"
+  | Float, Float -> true
+  | Float32, Float32 -> true
   | Vec128, _ | _, Vec128 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got vec128 register"
   | Valx2, _ | _, Valx2 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
-  | (Int | Val | Addr | Float), _ -> false
+  | (Int | Val | Addr | Float | Float32), _ -> false
 
 let stack_class_tag c =
   match c with
@@ -129,12 +126,13 @@ let register_name ty r =
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got vec128 register"
   | Float32 ->
-    (* CR mslater: (float32) arm64 *)
-    fatal_error "arm64: got float32 register"
+    float32_reg_name.(r - first_available_register.(1))
   | Valx2 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
 
+(* CR gyorsh for xclerc: [rotate_registers] used in [coloring] on Mach,
+   but not in IRC on CFG. Are we dropping an optimization here? *)
 let rotate_registers = true
 
 (* Representation of hard registers by pseudo-registers *)
@@ -146,15 +144,17 @@ let hard_int_reg =
   done;
   v
 
-let hard_float_reg =
+let hard_float_reg_gen kind =
   let v = Array.make 32 Reg.dummy in
   for i = 0 to 31 do
-    v.(i) <- Reg.at_location Float (Reg(100 + i))
+    v.(i) <- Reg.at_location kind (Reg(100 + i))
   done;
   v
 
+let hard_float_reg = hard_float_reg_gen Float
+let hard_float32_reg = hard_float_reg_gen Float32
 let all_phys_regs =
-  Array.append hard_int_reg hard_float_reg
+  Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; ]
 
 let precolored_regs =
   let phys_regs = Reg.set_of_array all_phys_regs in
@@ -167,19 +167,15 @@ let phys_reg ty n =
   | Vec128 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got vec128 register"
-  | Float32 ->
-    (* CR mslater: (float32) arm64 *)
-    fatal_error "arm64: got float32 register"
   | Valx2 ->
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
+  | Float32 -> hard_float32_reg.(n - 100)
 
 let gc_regs_offset _ =
-    (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: gc_reg_offset unreachable"
 
 let reg_x8 = phys_reg Int 8
-let reg_d7 = phys_reg Float 107
 
 let stack_slot slot ty =
   Reg.at_location ty (Stack slot)
@@ -198,16 +194,19 @@ let loc_int last_int make_stack int ofs =
     ofs := !ofs + size_int; l
   end
 
-let loc_float last_float make_stack float ofs =
+let loc_float_gen kind size last_float make_stack float ofs =
   if !float <= last_float then begin
-    let l = phys_reg Float !float in
+    let l = phys_reg kind !float in
     incr float; l
   end else begin
-    ofs := Misc.align !ofs size_float;
-    let l = stack_slot (make_stack !ofs) Float in
-    ofs := !ofs + size_float; l
+    ofs := Misc.align !ofs size;
+    let l = stack_slot (make_stack !ofs) kind in
+    ofs := !ofs + size; l
   end
 
+let loc_float = loc_float_gen Float Arch.size_float
+(* float32 slots still take up a full word *)
+let loc_float32 = loc_float_gen Float32 Arch.size_float
 let loc_int32 last_int make_stack int ofs =
   if !int <= last_int then begin
     let l = phys_reg Int !int in
@@ -234,8 +233,7 @@ let calling_conventions
         (* CR mslater: (SIMD) arm64 *)
         fatal_error "arm64: got vec128 register"
     | Float32 ->
-        (* CR mslater: (float32) arm64 *)
-        fatal_error "arm64: got float32 register"
+        loc.(i) <- loc_float32 last_float make_stack float ofs
     | Valx2 ->
       (* CR mslater: (SIMD) arm64 *)
       fatal_error "arm64: got valx2 register"
@@ -305,8 +303,7 @@ let external_calling_conventions
         (* CR mslater: (SIMD) arm64 *)
         fatal_error "arm64: got vec128 register"
     | XFloat32 ->
-        (* CR mslater: (float32) arm64 *)
-        fatal_error "arm64: got float32 register"
+        loc.(i) <- [| loc_float32 last_float make_stack float ofs |]
     end)
     ty_args;
   (loc, Misc.align !ofs 16)  (* keep stack 16-aligned *)
@@ -350,13 +347,25 @@ let domainstate_ptr_dwarf_register_number = 28
 
 let destroyed_at_c_noalloc_call =
   (* x19-x28, d8-d15 preserved *)
-  Array.append
-  (Array.of_list (List.map (phys_reg Int)
-    [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15]))
-  (Array.of_list (List.map (phys_reg Float)
-    [100;101;102;103;104;105;106;107;
-     116;117;118;119;120;121;122;123;
-     124;125;126;127;128;129;130;131]))
+  let int_regs_destroyed_at_c_noalloc_call =
+    [| 0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15 |]
+  in
+  let float_regs_destroyed_at_c_noalloc_call =
+    [|100;101;102;103;104;105;106;107;
+      116;117;118;119;120;121;122;123;
+      124;125;126;127;128;129;130;131|]
+  in
+  Array.concat [
+    Array.map (phys_reg Int) int_regs_destroyed_at_c_noalloc_call;
+    Array.map (phys_reg Float) float_regs_destroyed_at_c_noalloc_call;
+    Array.map (phys_reg Float32) float_regs_destroyed_at_c_noalloc_call;
+  ]
+
+(* For CSE: destroy both the Float and Float32 views of a neon register. *)
+let destroy_neon_reg n =
+  [| phys_reg Float (100 + n); phys_reg Float32 (100 + n); |]
+
+let destroy_neon_reg7 = destroy_neon_reg 7
 
 let destroyed_at_raise = all_phys_regs
 
@@ -366,8 +375,6 @@ let destroyed_at_pushtrap = [| |]
 
 let destroyed_at_alloc_or_poll = [| reg_x8 |]
 
-let destroy_neon_reg7 = [| reg_d7 |]
-
 let destroyed_at_basic (basic : Cfg_intf.S.basic) =
   match basic with
   | Reloadretaddr ->
diff --git a/backend/arm64/selection_utils.ml b/backend/arm64/selection_utils.ml
index bb731579904..712c6806c0a 100644
--- a/backend/arm64/selection_utils.ml
+++ b/backend/arm64/selection_utils.ml
@@ -31,15 +31,13 @@ let is_offset chunk n =
      (* 12 bits unsigned, scaled by chunk size *)
      | Byte_unsigned | Byte_signed -> n < 0x1000
      | Sixteen_unsigned | Sixteen_signed -> n land 1 = 0 && n lsr 1 < 0x1000
-     | Thirtytwo_unsigned | Thirtytwo_signed | Single { reg = Float64 } ->
+     | Thirtytwo_unsigned | Thirtytwo_signed
+     | Single { reg = Float64 | Float32 } ->
        n land 3 = 0 && n lsr 2 < 0x1000
      | Word_int | Word_val | Double -> n land 7 = 0 && n lsr 3 < 0x1000
      | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
        (* CR mslater: (SIMD) arm64 *)
        Misc.fatal_error "arm64: got 128 bit memory chunk"
-     | Single { reg = Float32 } ->
-       (* CR mslater: (float32) arm64 *)
-       Misc.fatal_error "arm64: got float32 memory chunk"
 
 let is_logical_immediate_int n = Arch.is_logical_immediate (Nativeint.of_int n)
 

From fd2c0953bc637e0e0e28327ef0c34ae3912afba8 Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Tue, 18 Mar 2025 17:18:40 +0000
Subject: [PATCH 2/4] arm64 float32 emit

---
 backend/arm64/emit.mlp | 88 +++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/backend/arm64/emit.mlp b/backend/arm64/emit.mlp
index dcb150fd3e4..f57c1217fc6 100644
--- a/backend/arm64/emit.mlp
+++ b/backend/arm64/emit.mlp
@@ -317,6 +317,11 @@ let is_immediate_float bits =
   let mant = Int64.logand bits 0xF_FFFF_FFFF_FFFFL in
   exp >= -3 && exp <= 4 && Int64.logand mant 0xF_0000_0000_0000L = mant
 
+let is_immediate_float32 bits = (* binary32: 8-bit exponent field, bias 127 *)
+  let exp = (Int32.(to_int (shift_right_logical bits 23)) land 0xFF) - 127 in
+  let mant = Int32.logand bits 0x7F_FFFFl in (* 23-bit fraction *)
+  exp >= -3 && exp <= 4 && Int32.logand mant 0x78_0000l = mant
+
 (* Adjust sp (up or down) by the given byte amount *)
 
 let emit_stack_adjustment n =
@@ -703,9 +708,7 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Move | Spill | Reload) -> 1
     | Lop (Const_int n) ->
       num_instructions_for_intconst n
-    | Lop (Const_float32 _) ->
-      (* CR mslater: (float32) arm64 *)
-      Misc.fatal_error "float32 is not supported on this architecture"
+    | Lop (Const_float32 _) -> 2
     | Lop (Const_float _) -> 2
     | Lop (Const_symbol _) -> 2
     | Lop (Const_vec128 _) ->
@@ -755,22 +758,19 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Begin_region | End_region) -> 1
     | Lop (Intop (Icomp _)) -> 2
     | Lop (Floatop (Float64, Icompf _)) -> 2
-    | Lop (Floatop (Float32, Icompf _)) ->
-      (* CR mslater: (float32) arm64 *)
-      Misc.fatal_error "float32 is not supported on this architecture"
+    | Lop (Floatop (Float32, Icompf _)) -> 2
     | Lop (Intop_imm (Icomp _, _)) -> 2
     | Lop (Intop Imod) -> 2
     | Lop (Intop (Imulh _)) -> 1
     | Lop (Intop (Iclz _)) -> 1
     | Lop (Intop (Ictz _)) -> 2
-    | Lop (Floatop (Float64, (Iabsf | Inegf)) | Specific Isqrtf) -> 1
-    | Lop (Floatop (Float32, (Iabsf | Inegf))) ->
-      (* CR mslater: (float32) arm64 *)
-      Misc.fatal_error "float32 is not supported on this architecture"
     | Lop (Intop (Iadd|Isub|Imul|Idiv|Iand|Ior|Ixor|Ilsl|Ilsr|Iasr|Ipopcnt)) -> 1
     | Lop (Intop_imm
              ((Iadd|Isub|Imul|Idiv|Imod|Imulh _|Iand|Ior|Ixor|Ilsl|Ilsr|Iasr
               | Iclz _ | Ictz _ |Ipopcnt),_)) -> 1
+    | Lop (Floatop (Float64, (Iabsf | Inegf))) -> 1
+    | Lop (Floatop (Float32, (Iabsf | Inegf))) -> 1
+    | Lop (Specific Isqrtf) -> 1
     | Lop (Reinterpret_cast (Value_of_int | Int_of_value |
                               Float_of_int64 | Int64_of_float)) -> 1
     | Lop (Reinterpret_cast (Float32_of_float | Float_of_float32 |
@@ -788,10 +788,9 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Static_cast (V128_of_scalar _ | Scalar_of_v128 _)) ->
       (* CR mslater: (SIMD) arm64 *)
       Misc.fatal_error "SIMD is not supported on this architecture"
-    | Lop (Floatop (Float64, (Iaddf | Isubf | Imulf | Idivf)) | Specific Inegmulf) -> 1
-    | Lop (Floatop (Float32, (Iaddf | Isubf | Imulf | Idivf))) ->
-      (* CR mslater: (float32) arm64 *)
-      Misc.fatal_error "float32 is not supported on this architecture"
+    | Lop (Floatop (Float64, (Iaddf | Isubf | Imulf | Idivf))) -> 1
+    | Lop (Floatop (Float32, (Iaddf | Isubf | Imulf | Idivf))) -> 1
+    | Lop (Specific Inegmulf) -> 1
     | Lop (Opaque) -> 0
     | Lop (Specific (Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf)) -> 1
     | Lop (Specific (Ishiftarith _)) -> 1
@@ -1030,7 +1029,7 @@ let emit_instr i =
         move i.arg.(0) i.res.(0)
     | Lop(Specific Imove32) ->
         let src = i.arg.(0) and dst = i.res.(0) in
-        if src.loc <> dst.loc then begin
+        if not (Reg.same_loc src dst) then begin
           match (src, dst) with
           | {loc = Reg _}, {loc = Reg _} ->
               `	mov	{emit_wreg dst}, {emit_wreg src}\n`
@@ -1045,9 +1044,17 @@ let emit_instr i =
         end
     | Lop(Const_int n) ->
         emit_intconst i.res.(0) n
-    | Lop (Const_float32 _) ->
-        (* CR mslater: (float32) arm64 *)
-        Misc.fatal_error "float32 is not supported on this architecture"
+    | Lop (Const_float32 f) ->
+        DSL.check_reg Float32 i.res.(0);
+        if f = 0l then
+          `	fmov	{emit_reg i.res.(0)}, wzr\n`
+        else if is_immediate_float32 f then
+          `	fmov	{emit_reg i.res.(0)}, #{emit_printf "%.7f" (Int32.float_of_bits f)}\n`
+        else begin
+          (* float32 constants still take up 8 bytes; we load the lower half. *)
+          let lbl = float_literal (Int64.of_int32 f) in
+          emit_load_literal i.res.(0) lbl
+        end
     | Lop(Const_float f) ->
         if f = 0L then
           `	fmov	{emit_reg i.res.(0)}, xzr\n`
@@ -1134,6 +1141,7 @@ let emit_instr i =
         | Thirtytwo_signed ->
             `	ldrsw	{emit_reg dst}, {emit_addressing addressing_mode base}\n`
         | Single { reg = Float64 } ->
+            DSL.check_reg Float dst;
             `	ldr	s7, {emit_addressing addressing_mode base}\n`;
             `	fcvt	{emit_reg dst}, s7\n`
         | Word_int | Word_val ->
@@ -1146,8 +1154,8 @@ let emit_instr i =
         | Double ->
                       `	ldr	{emit_reg dst}, {emit_addressing addressing_mode base}\n`
         | Single { reg = Float32 } ->
-            (* CR mslater: (float32) arm64 *)
-            fatal_error "arm64: got float32 memory chunk"
+            DSL.check_reg Float32 dst;
+            ` ldr {emit_reg dst}, {emit_addressing addressing_mode base}\n`
         | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
             (* CR mslater: (SIMD) arm64 *)
             fatal_error "arm64: got 128 bit memory chunk"
@@ -1171,6 +1179,7 @@ let emit_instr i =
         | Thirtytwo_unsigned | Thirtytwo_signed ->
             `	str	{emit_wreg src}, {emit_addressing addr base}\n`
         | Single { reg = Float64 } ->
+            DSL.check_reg Float src;
             `	fcvt	s7, {emit_reg src}\n`;
             `	str	s7, {emit_addressing addr base}\n`;
         | Word_int | Word_val ->
@@ -1180,8 +1189,8 @@ let emit_instr i =
         | Double ->
           `	str	{emit_reg src}, {emit_addressing addr base}\n`
         | Single { reg = Float32 } ->
-            (* CR mslater: (float32) arm64 *)
-            fatal_error "arm64: got float32 memory chunk"
+          DSL.check_reg Float32 src;
+          ` str {emit_reg src}, {emit_addressing addr base}\n`
         | Onetwentyeight_aligned | Onetwentyeight_unaligned ->
             (* CR mslater: (SIMD) arm64 *)
             fatal_error "arm64: got 128 bit memory chunk"
@@ -1213,9 +1222,10 @@ let emit_instr i =
         let comp = name_for_float_comparison cmp in
         `	fcmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
         `	cset	{emit_reg i.res.(0)}, {emit_string comp}\n`
-    | Lop(Floatop(Float32, Icompf _)) ->
-        (* CR mslater: (float32) arm64 *)
-        Misc.fatal_error "float32 is not supported on this architecture"
+    | Lop(Floatop(Float32, Icompf cmp)) ->
+        let comp = name_for_float_comparison cmp in
+        `	fcmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
+        `	cset	{emit_reg i.res.(0)}, {emit_string comp}\n`
     | Lop(Intop_imm(Icomp cmp, n)) ->
         emit_cmpimm i.arg.(0) n;
         `	cset	{emit_reg i.res.(0)}, {emit_string (name_for_comparison cmp)}\n`
@@ -1240,25 +1250,19 @@ let emit_instr i =
     | Lop(Intop_imm(op, n)) ->
         let instr = name_for_int_operation op in
         `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`
-    | Lop(Floatop (Float32, (Iabsf | Inegf))) ->
-        (* CR mslater: (float32) arm64 *)
-        Misc.fatal_error "float32 is not supported on this architecture"
-    | Lop(Floatop (Float32, (Iaddf | Isubf | Imulf | Idivf))) ->
-        (* CR mslater: (float32) arm64 *)
-        Misc.fatal_error "float32 is not supported on this architecture"
     | Lop(Specific Isqrtf) ->
       `	fsqrt	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
-    | Lop(Floatop ((Float64), Iabsf)) ->
+    | Lop(Floatop ((Float32 | Float64), Iabsf)) ->
       `	fabs	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
-    | Lop(Floatop ((Float64), Inegf)) ->
+    | Lop(Floatop ((Float32 | Float64), Inegf)) ->
       `	fneg	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`
-    | Lop(Floatop ((Float64), Iaddf)) ->
+    | Lop(Floatop ((Float32 | Float64), Iaddf)) ->
      `	fadd	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
-    | Lop(Floatop ((Float64), Isubf)) ->
+    | Lop(Floatop ((Float32 | Float64), Isubf)) ->
      `	fsub	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
-    | Lop(Floatop ((Float64), Imulf)) ->
+    | Lop(Floatop ((Float32 | Float64), Imulf)) ->
      `	fmul	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
-    | Lop(Floatop ((Float64), Idivf)) ->
+    | Lop(Floatop ((Float32 | Float64), Idivf)) ->
      `	fdiv	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
     | Lop(Specific Inegmulf) ->
      `	fnmul	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`
@@ -1341,10 +1345,7 @@ let emit_instr i =
             let comp = name_for_comparison cmp in
             emit_cmpimm i.arg.(0) n;
             `	csel	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}, {emit_string comp}\n`
-        | Ifloattest (Float32, _cmp) ->
-            (* CR mslater: (float32) arm64 *)
-            Misc.fatal_error "float32 is not supported on this architecture"
-        | Ifloattest (Float64, cmp) ->
+        | Ifloattest ((Float32 | Float64), cmp) ->
             let comp = name_for_float_comparison cmp in
             `	fcmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
             `	csel	{emit_reg i.res.(0)}, {emit_reg i.arg.(2)}, {emit_reg i.arg.(3)}, {emit_string comp}\n`
@@ -1377,10 +1378,7 @@ let emit_instr i =
             emit_cmpimm i.arg.(0) n;
             let comp = name_for_comparison cmp in
             `	b.{emit_string comp}	{emit_label lbl}\n`
-        | Ifloattest (Float32, _cmp) ->
-            (* CR mslater: (float32) arm64 *)
-            Misc.fatal_error "float32 is not supported on this architecture"
-        | Ifloattest (Float64, cmp) ->
+        | Ifloattest ((Float32 | Float64), cmp) ->
             let comp = name_for_float_comparison cmp in
             `	fcmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
             `	b.{emit_string comp}	{emit_label lbl}\n`

From c2d3de79517fbf9930cf218f7fd036ffef69d296 Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Thu, 20 Mar 2025 13:58:07 +0000
Subject: [PATCH 3/4] remove unused rotate_registers

---
 backend/amd64/proc.ml | 4 ----
 backend/arm64/proc.ml | 3 ---
 backend/proc.mli      | 1 -
 3 files changed, 8 deletions(-)

diff --git a/backend/amd64/proc.ml b/backend/amd64/proc.ml
index dc9d2f010ec..fb270fb5070 100644
--- a/backend/amd64/proc.ml
+++ b/backend/amd64/proc.ml
@@ -140,10 +140,6 @@ let register_name ty r =
   | Float | Float32 | Vec128 | Valx2 ->
     float_reg_name.(r - first_available_register.(1))
 
-(* Pack registers starting at %rax so as to reduce the number of REX
-   prefixes and thus improve code density *)
-let rotate_registers = false
-
 (* Representation of hard registers by pseudo-registers *)
 
 let hard_int_reg =
diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index e85a99f2512..71491f70024 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -131,9 +131,6 @@ let register_name ty r =
     (* CR mslater: (SIMD) arm64 *)
     fatal_error "arm64: got valx2 register"
 
-(* CR gyorsh for xclerc: [rotate_registers] used in [coloring] on Mach,
-   but not in IRC on CFG. Are we dropping an optimization here? *)
-let rotate_registers = true
 
 (* Representation of hard registers by pseudo-registers *)
 
diff --git a/backend/proc.mli b/backend/proc.mli
index 2a6982fb35a..4789bfd3345 100644
--- a/backend/proc.mli
+++ b/backend/proc.mli
@@ -26,7 +26,6 @@ val first_available_register: int array
 val register_name: Cmm.machtype_component -> int -> string
 val phys_reg: Cmm.machtype_component -> int -> Reg.t
 val gc_regs_offset : Reg.t -> int
-val rotate_registers: bool
 val precolored_regs : unit -> Reg.Set.t
 
 (* The number of stack slot classes may differ from the number of register classes.

From edcb49fcf4588f803d0e8e60a25e632d09a246ee Mon Sep 17 00:00:00 2001
From: Greta Yorsh <45005955+gretay-js@users.noreply.github.com>
Date: Thu, 20 Mar 2025 14:09:32 +0000
Subject: [PATCH 4/4] Generalize [hard_float_reg_gen] to cover [hard_int_reg]

and avoid magic constants: [hard_reg_gen]
---
 backend/arm64/proc.ml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/backend/arm64/proc.ml b/backend/arm64/proc.ml
index 71491f70024..751404eb975 100644
--- a/backend/arm64/proc.ml
+++ b/backend/arm64/proc.ml
@@ -67,8 +67,8 @@ let float32_reg_name =
 
 let num_register_classes = 2
 
-let register_class r =
-  match (r.typ : Cmm.machtype_component) with
+let register_class_of_machtype_component typ =
+  match (typ : Cmm.machtype_component) with
   | Val | Int | Addr  -> 0
   | Vec128 ->
     (* CR mslater: (SIMD) arm64 *)
@@ -78,6 +78,9 @@ let register_class r =
     fatal_error "arm64: got valx2 register"
   | Float | Float32 -> 1
 
+let register_class r =
+  register_class_of_machtype_component r.typ
+
 let num_stack_slot_classes = 2
 
 let stack_slot_class typ =
@@ -134,22 +137,19 @@ let register_name ty r =
 
 (* Representation of hard registers by pseudo-registers *)
 
-let hard_int_reg =
-  let v = Array.make 28 Reg.dummy in
-  for i = 0 to 27 do
-    v.(i) <- Reg.at_location Int (Reg i)
-  done;
-  v
 
-let hard_float_reg_gen kind =
-  let v = Array.make 32 Reg.dummy in
-  for i = 0 to 31 do
-    v.(i) <- Reg.at_location kind (Reg(100 + i))
+let hard_reg_gen typ n = (* [n] regs of [typ], from its class's first index *)
+  let reg_class = register_class_of_machtype_component typ in
+  let first = first_available_register.(reg_class) in
+  let v = Array.make n Reg.dummy in
+  for i = 0 to n - 1 do
+    v.(i) <- Reg.at_location typ (Reg(first + i))
   done;
   v
 
-let hard_float_reg = hard_float_reg_gen Float
-let hard_float32_reg = hard_float_reg_gen Float32
+let hard_int_reg = hard_reg_gen Int (Array.length int_reg_name)
+let hard_float_reg = hard_reg_gen Float (Array.length float_reg_name)
+let hard_float32_reg = hard_reg_gen Float32 (Array.length float32_reg_name)
 let all_phys_regs =
   Array.concat [hard_int_reg; hard_float_reg; hard_float32_reg; ]