@@ -2202,21 +2202,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
2202
2202
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
2203
2203
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
2204
2204
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
2205
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
2206
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
2207
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2205
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2206
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2208
2207
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
2208
+ ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l
2209
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2210
+ ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
2209
2211
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
2210
2212
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
2211
- ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7 , -v5 , v0, v6 op_sel_hi:[1,0,1]
2213
+ ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5 , -v3 , v0, v2 op_sel_hi:[1,0,1]
2212
2214
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2213
- ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v4
2214
- ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v0, v6 op_sel_hi:[1,0,1]
2215
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2215
+ ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
2216
+ ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
2216
2217
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2217
- ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l
2218
- ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
2219
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2220
2218
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
2221
2219
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
2222
2220
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2226,27 +2224,26 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
2226
2224
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2227
2225
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v4.l
2228
2226
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
2229
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_2)
2230
- ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v5, v7
2231
2227
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2228
+ ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v5, v7
2232
2229
; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
2233
- ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2234
2230
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2231
+ ; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2235
2232
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
2233
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2236
2234
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v8, v7
2237
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2238
2235
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2239
- ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v7
2240
2236
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2237
+ ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v7
2241
2238
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
2242
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
2243
2239
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2240
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
2244
2241
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
2245
- ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l
2246
2242
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2243
+ ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l
2247
2244
; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
2245
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2248
2246
; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v6.l, v4.l
2249
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
2250
2247
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
2251
2248
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
2252
2249
; GFX11-TRUE16-NEXT: s_endpgm
0 commit comments