diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 99a176731599c..be0936ce74835 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2232,12 +2232,14 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { } static int -GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) { - // 2 pass -> 3 - // 4 pass -> 5 - // 8 pass -> 9 - // 16 pass -> 17 - return NumPasses + 1; +GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 3 4 + // 4 pass | 5 6 + // 8 pass | 9 10 + // 16 pass | 17 18 + return NumPasses + 1 + IsGFX950; } static int @@ -2373,7 +2375,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { NeedWaitStates = isXDL(ST, *MI1) ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( - NumPasses) + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates( NumPasses); break; diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir index d2b2f226404da..b9135dbd46fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir @@ -145,7 +145,8 @@ body: | ... # GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: sgemm4x4_mfma_write_agpr_mfma_read_overlap body: | @@ -155,7 +156,8 @@ body: | ... # GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap body: | @@ -165,7 +167,8 @@ body: | ... # GCN-LABEL: name: sgemm4x4_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_SMFMAC name: sgemm4x4_mfma_write_agpr_smfmac_read_overlap body: | @@ -175,8 +178,11 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 7 +# GFX940-NEXT: S_NOP 0 + +# GFX950-NEXT: S_NOP 7 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -186,8 +192,11 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 7 +# GFX940-NEXT: S_NOP 0 + +# GFX950-NEXT: S_NOP 7 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap body: | @@ -216,8 +225,11 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 7 +# GFX940-NEXT: S_NOP 0 + +# GFX950-NEXT: S_NOP 7 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap body: | @@ -229,7 +241,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -241,7 +254,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap body: | @@ -273,7 +287,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap body: | @@ -325,7 +340,8 @@ body: | ... # GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -336,7 +352,8 @@ body: | # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -348,7 +365,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -359,7 +377,8 @@ body: | # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial body: | @@ -370,7 +389,8 @@ body: | # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial body: | @@ -1345,7 +1365,8 @@ body: | ... # GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap body: | @@ -1356,7 +1377,8 @@ body: | # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap body: | @@ -1368,7 +1390,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap body: | @@ -1589,7 +1612,8 @@ body: | ... # GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: sgemm16X16X16_mfma_write_agpr_mfma_read_overlap body: | @@ -1599,7 +1623,8 @@ body: | ... # GCN-LABEL: name: sgemm16X16X32_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: sgemm16X16X32_mfma_write_agpr_mfma_read_overlap body: | @@ -1609,7 +1634,8 @@ body: | ... # GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_dgemm_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: sgemm16X16X16_mfma_write_agpr_dgemm_read_overlap body: | @@ -1619,7 +1645,8 @@ body: | ... # GCN-LABEL: name: sgemm16X16X16_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_SMFMAC name: sgemm16X16X16_mfma_write_agpr_smfmac_read_overlap body: | @@ -1629,7 +1656,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16_write_agpr_smfmac_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_SMFMAC name: smfmac16x16_write_agpr_smfmac_read_overlap body: | @@ -1727,7 +1755,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16x32_mfma_write_agpr_mfma_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_SMFMAC name: smfmac16x16x32_mfma_write_agpr_mfma_read_overlap body: | @@ -1738,7 +1767,8 @@ body: | # GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -1804,7 +1834,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm4x4_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_sgemm4x4_mfma_write_agpr_mfma_read_overlap body: | @@ -1923,7 +1954,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_4pass_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_4pass_mfma_write_agpr_mfma_read_overlap body: | @@ -1933,7 +1965,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: smfmac16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -2047,7 +2080,8 @@ body: | # 2 pass source # GCN-LABEL: name: xdl_mfma_2pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_2pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2092,7 +2126,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2134,7 +2169,8 @@ body: | # 2 pass source # GCN-LABEL: name: xdl_mfma_2pass_write_vgpr_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_2pass_write_vgpr_sgemm_mfma_read_overlap_srcc body: | @@ -2179,7 +2215,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940-NEXT: S_NOP 4 +# GFX950-NEXT: S_NOP 5 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcc body: | @@ -2222,7 +2259,8 @@ body: | # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc body: | @@ -2268,7 +2306,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc body: | @@ -2356,7 +2395,8 @@ body: | # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2399,7 +2439,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2446,7 +2487,8 @@ body: | # 2 pass source # GCN-LABEL: name: xdl_mfma_2pass_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_SMFMAC_ name: xdl_mfma_2pass_write_agpr_smfmac_read_overlap_srcc body: | @@ -2460,7 +2502,8 @@ body: | ... # GCN-LABEL: name: xdl_4pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 4 +# GFX940: S_NOP 4 +# GFX950: S_NOP 5 # GCN-NEXT: V_SMFMAC_ name: xdl_4pass_mfma_write_agpr_smfmac_read_overlap_srcc body: | @@ -2474,7 +2517,8 @@ body: | # GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC_ name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc body: | @@ -2488,7 +2532,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GFX940-NEXT: S_NOP 0 +# GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC_ name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc body: | diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index bf51efab2cdf1..f68b84c7140ba 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -16,7 +16,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec @@ -38,7 +38,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec @@ -60,7 +60,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec @@ -82,7 +82,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec @@ -102,7 +102,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 2, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 4 + ; GCN-NEXT: S_NOP 5 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 2, implicit $mode, implicit $exec @@ -122,7 +122,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 3, 3, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 4 + ; GCN-NEXT: S_NOP 5 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 3, 3, implicit $mode, implicit $exec @@ -142,7 +142,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 4, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 4 + ; GCN-NEXT: S_NOP 5 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 4, 4, implicit $mode, implicit $exec @@ -165,7 +165,7 @@ body: | ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -187,7 +187,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -210,7 +210,7 @@ body: | ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -232,7 +232,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $sgpr4, $vgpr32, 12, 4, implicit $mode, implicit $exec