Skip to content

Commit cd6c4b6

Browse files
authored
[AMDGPU][True16][CodeGen] optimize codegen for mad-mix in true16 (#124995)
Remove unnecessary COPY instructions in the SelectionDAG (SDAG) path for the mad-mix pattern.
1 parent 3f1267e commit cd6c4b6

File tree

4 files changed

+152
-323
lines changed

4 files changed

+152
-323
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3661,6 +3661,11 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
36613661
// TODO: Should we try to look for neg/abs here?
36623662
}
36633663

3664+
// Prevent unnecessary subreg COPY to VGPR_16
3665+
if (Src.getOpcode() == ISD::TRUNCATE &&
3666+
Src.getOperand(0).getValueType() == MVT::i32) {
3667+
Src = Src.getOperand(0);
3668+
}
36643669
return true;
36653670
}
36663671

llvm/test/CodeGen/AMDGPU/frem.ll

Lines changed: 15 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -2202,21 +2202,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
22022202
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
22032203
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
22042204
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
2205-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
2206-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
2207-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2205+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2206+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
22082207
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
2208+
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l
2209+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2210+
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
22092211
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
22102212
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
2211-
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v0, v6 op_sel_hi:[1,0,1]
2213+
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
22122214
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2213-
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v4
2214-
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v0, v6 op_sel_hi:[1,0,1]
2215-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
2215+
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
2216+
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
22162217
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2217-
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l
2218-
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
2219-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
22202218
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
22212219
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
22222220
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2226,27 +2224,26 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
22262224
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
22272225
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v4.l
22282226
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
2229-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_2)
2230-
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v5, v7
22312227
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2228+
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v5, v5, v7
22322229
; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
2233-
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
22342230
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2231+
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
22352232
; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
2233+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
22362234
; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v8, v7
2237-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
22382235
; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
2239-
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v7
22402236
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2237+
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v7
22412238
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2
2242-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
22432239
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2240+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
22442241
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
2245-
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l
22462242
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2243+
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l
22472244
; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
2245+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
22482246
; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v6.l, v4.l
2249-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
22502247
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
22512248
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
22522249
; GFX11-TRUE16-NEXT: s_endpgm

0 commit comments

Comments
 (0)