Conversation

@changpeng
Contributor

When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.

@llvmbot
Member

llvmbot commented Nov 7, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

Changes

When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.
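
A minimal sketch of the intended expansion, written as straight-line IR for readability (the actual change operates on SelectionDAG nodes via ISD::FP_EXTEND, and the intrinsic suffixes below are abbreviated the way this PR's test file abbreviates them):

; Before: a bf16 compare with no corresponding hardware instruction.
;   %r = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %a, bfloat %b, i32 1)
;
; After: extend both sources to f32, then compare at f32.
;   %a32 = fpext bfloat %a to float
;   %b32 = fpext bfloat %b to float
;   %r = call i32 @llvm.amdgcn.fcmp.f32(float %a32, float %b32, i32 1)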


Full diff: https://github.com/llvm/llvm-project/pull/166877.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+1-1)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll (+297)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bb28084159e8..aac546f6c8213 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7012,7 +7012,7 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
   EVT CmpVT = Src0.getValueType();
   SDLoc SL(N);
 
-  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
+  if ((CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) || CmpVT == MVT::bf16) {
     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
new file mode 100644
index 0000000000000..831e63669c335
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_with_fabs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat %temp, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_both_operands_with_fabs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX12-NEXT:    s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %src_input = call bfloat @llvm.fabs.bf16(bfloat %src)
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src_input, bfloat %temp, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 -1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oeq:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_one(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_one:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 6)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ogt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ogt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oge:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 3)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_olt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_olt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 4)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ole(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ole:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 5)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ueq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ueq:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 9)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_une(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_une:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 14)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ugt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ugt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 10)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uge:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 11)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ult(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ult:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 12)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_o(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_o:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 7)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uo(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 8)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ule(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ule:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 13)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
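
A note for reading the tests above: the trailing i32 operand follows LLVM's CmpInst::Predicate numbering for floating-point compares, which is why i32 1 selects v_cmp_eq_f32 and i32 14 selects v_cmp_neq_f32.

; FCmp predicate values used as the i32 condition operand:
;   0 = false  1 = oeq   2 = ogt   3 = oge   4 = olt   5 = ole   6 = one   7 = ord
;   8 = uno    9 = ueq  10 = ugt  11 = uge  12 = ult  13 = ule  14 = une  15 = true
; An out-of-range value (the i32 -1 in @v_fcmp_bf16) folds away entirely,
; which is why that kernel compiles to a bare s_endpgm.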

@changpeng changpeng requested review from arsenm and shiltian November 7, 2025 01:09
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s

declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
Contributor

These intrinsics are deprecated, why is someone trying to use them with bfloat?

Contributor Author

I didn't find any use of the intrinsic with bfloat. You are right, this intrinsic seems unnecessary. Will clang emit a warning for a deprecated intrinsic? I am thinking of abandoning this PR.

Contributor

clang does not directly emit intrinsics, it would have to go through a builtin

Contributor Author

Yes, is __builtin_amdgcn_fcmp deprecated then?

Contributor

Yes, but that doesn't have a bfloat version

Contributor Author

> Yes, but that doesn't have a bfloat version

Right, and it also does not have a half version. However, when you mentioned "deprecated", I think you meant types other than bfloat. I think amdgcn_fcmp is not necessary, and thus deprecated, no matter the type.

Contributor

If a combine is folding an FP cast into the intrinsic, that would be a problem
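
A hedged illustration of that concern (hypothetical IR, not taken from an existing combine):

; If a combine narrowed the compare by folding the extends into the
; intrinsic, a legal f32 compare:
;   %a32 = fpext bfloat %a to float
;   %r = call i32 @llvm.amdgcn.fcmp.f32(float %a32, float 1.0, i32 1)
; would become a bf16 compare with no hardware instruction behind it:
;   %r = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %a, bfloat 0xR3F80, i32 1)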

@@ -0,0 +1,297 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s

Contributor

Can you add a check that these error out appropriately for a target without bfloat compares?
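
A minimal sketch of such a check, assuming the backend diagnoses the unsupported type (the chosen target, check prefix, and message are assumptions):

; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 \
; RUN:   | FileCheck -check-prefix=ERR %s
; ERR: error: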

; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s3, s3, 16
; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
Contributor

If we don't have bfloat compares, we shouldn't lower the intrinsic

Contributor

+1. The intrinsic should not be used like a builtin.

