Conversation

@changpeng
Contributor

When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.

@llvmbot
Member

llvmbot commented Nov 7, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

Changes

When the source is of type bfloat, we should first expand the bfloats to floats, and then do the comparison.
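
A minimal sketch of the intended expansion, written as straight-line IR for readability (the actual change operates on SelectionDAG nodes via ISD::FP_EXTEND, and the intrinsic suffixes below are abbreviated the way this PR's test file abbreviates them):

; Before: a bf16 compare with no corresponding hardware instruction.
;   %r = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %a, bfloat %b, i32 1)
;
; After: extend both sources to f32, then compare at f32.
;   %a32 = fpext bfloat %a to float
;   %b32 = fpext bfloat %b to float
;   %r = call i32 @llvm.amdgcn.fcmp.f32(float %a32, float %b32, i32 1)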


Full diff: https://github.com/llvm/llvm-project/pull/166877.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+1-1)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll (+297)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bb28084159e8..aac546f6c8213 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7012,7 +7012,7 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
   EVT CmpVT = Src0.getValueType();
   SDLoc SL(N);
 
-  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
+  if ((CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) || CmpVT == MVT::bf16) {
     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
new file mode 100644
index 0000000000000..831e63669c335
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
+declare bfloat @llvm.fabs.bf16(bfloat)
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_with_fabs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat %temp, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
+; GFX12-LABEL: v_fcmp_bf16_oeq_both_operands_with_fabs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX12-NEXT:    s_and_b32 s3, s3, 0x7fff
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, s2, s3
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %temp = call bfloat @llvm.fabs.bf16(bfloat %a)
+  %src_input = call bfloat @llvm.fabs.bf16(bfloat %src)
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src_input, bfloat %temp, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 -1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oeq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oeq:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 1)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_one(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_one:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 6)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ogt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ogt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 2)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_oge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_oge:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 3)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_olt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_olt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 4)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ole(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ole:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 5)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ueq(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ueq:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 9)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_une(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_une:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 14)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ugt(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ugt:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 10)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uge(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uge:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 11)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ult(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ult:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 12)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_o(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_o:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 7)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_uo(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_uo:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 8)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_fcmp_bf16_ule(ptr addrspace(1) %out, bfloat %src) {
+; GFX12-LABEL: v_fcmp_bf16_ule:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_endpgm
+  %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 13)
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
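
A note for reading the tests above: the trailing i32 operand follows LLVM's CmpInst::Predicate numbering for floating-point compares, which is why i32 1 selects v_cmp_eq_f32 and i32 14 selects v_cmp_neq_f32.

; FCmp predicate values used as the i32 condition operand:
;   0 = false  1 = oeq   2 = ogt   3 = oge   4 = olt   5 = ole   6 = one   7 = ord
;   8 = uno    9 = ueq  10 = ugt  11 = uge  12 = ult  13 = ule  14 = une  15 = true
; An out-of-range value (the i32 -1 in @v_fcmp_bf16) folds away entirely,
; which is why that kernel compiles to a bare s_endpgm.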

@changpeng changpeng requested review from arsenm and shiltian November 7, 2025 01:09
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s

declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
Contributor

These intrinsics are deprecated, why is someone trying to use them with bfloat?

Contributor Author

I didn't find any use of the intrinsic with bfloat. You are right, this intrinsic seems unnecessary. Will clang emit a warning for a deprecated intrinsic? I am thinking of abandoning this PR.

Contributor

clang does not directly emit intrinsics, it would have to go through a builtin

Contributor Author

Yes, is __builtin_amdgcn_fcmp deprecated then?

Contributor

Yes, but that doesn't have a bfloat version

Contributor Author

> Yes, but that doesn't have a bfloat version

Right, and it also does not have a half version. However, when you mentioned "deprecated", I think you meant types other than bfloat. I think amdgcn_fcmp is not necessary, and thus deprecated, no matter the type.

Contributor

If a combine is folding an FP cast into the intrinsic, that would be a problem
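
A hedged illustration of that concern (hypothetical IR, not taken from an existing combine):

; If a combine narrowed the compare by folding the extends into the
; intrinsic, a legal f32 compare:
;   %a32 = fpext bfloat %a to float
;   %r = call i32 @llvm.amdgcn.fcmp.f32(float %a32, float 1.0, i32 1)
; would become a bf16 compare with no hardware instruction behind it:
;   %r = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %a, bfloat 0xR3F80, i32 1)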

@@ -0,0 +1,297 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s

Contributor

Can you add a check that these error out appropriately for a target without bfloat compares?
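
A minimal sketch of such a check, assuming the backend diagnoses the unsupported type (the chosen target, check prefix, and message are assumptions):

; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 \
; RUN:   | FileCheck -check-prefix=ERR %s
; ERR: error: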

; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s3, s3, 16
; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
Contributor

If we don't have bfloat compares, we shouldn't lower the intrinsic

Contributor

+1. The intrinsic should not be used like a builtin.

