Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7012,7 +7012,7 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
EVT CmpVT = Src0.getValueType();
SDLoc SL(N);

if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
if ((CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) || CmpVT == MVT::bf16) {
Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
}
Expand Down
297 changes: 297 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a check that these error appropriately for a target without bfloat compares?

declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These intrinsics are deprecated, why is someone trying to use them with bfloat?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't find any use of the intrinsic with bfloat. You are right, this intrinsic does not seem necessary. Will clang emit a warning for a deprecated intrinsic? I am thinking of abandoning this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clang does not directly emit intrinsics, it would have to go through a builtin

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, is __builtin_amdgcn_fcmp deprecated then?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but that doesn't have a bfloat version

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but that doesn't have a bfloat version

Right, and it also does not have a half version. However, when you mentioned "deprecated", I think you meant types other than bfloat. I think amdgcn_fcmp is not necessary and thus deprecated, regardless of the type.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a combine is folding an FP cast into the intrinsic, that would be a problem

declare bfloat @llvm.fabs.bf16(bfloat)

; Compares %src against fabs(%a) through llvm.amdgcn.fcmp.bf16 with predicate
; code 1 (labelled oeq by the test name) and stores the i32 result to %out.
; The expected gfx1200 output masks the %a half with 0x7fff, shifts both
; operands left by 16 and issues a single v_cmp_eq_f32.
define amdgpu_kernel void @v_fcmp_bf16_oeq_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
; GFX12-LABEL: v_fcmp_bf16_oeq_with_fabs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_lshl_b32 s3, s3, 16
; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't have bfloat compares, we shouldn't lower the intrinsic

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1. intrinsic should not be used like a builtin.

; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%temp = call bfloat @llvm.fabs.bf16(bfloat %a)
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat %temp, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Same comparison as the single-fabs case, but fabs is applied to both %src
; and %a before llvm.amdgcn.fcmp.bf16 (predicate code 1). The expected gfx1200
; output masks both halves with 0x7fff, shifts each left by 16 and issues
; v_cmp_eq_f32; the i32 result is stored to %out.
define amdgpu_kernel void @v_fcmp_bf16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) {
; GFX12-LABEL: v_fcmp_bf16_oeq_both_operands_with_fabs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0x7fff
; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_lshl_b32 s3, s3, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%temp = call bfloat @llvm.fabs.bf16(bfloat %a)
%src_input = call bfloat @llvm.fabs.bf16(bfloat %src)
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src_input, bfloat %temp, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Calls llvm.amdgcn.fcmp.bf16 with predicate code -1. The expected gfx1200
; output is only s_endpgm: no compare and no store are emitted for this
; predicate value.
define amdgpu_kernel void @v_fcmp_bf16(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 1 (labelled oeq
; by the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_eq_f32 against 0x42c80000
; (100.0 as an f32 bit pattern).
define amdgpu_kernel void @v_fcmp_bf16_oeq(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_oeq:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 6 (labelled one
; by the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_neq_f32 against 0x42c80000.
; REVIEW: this is the same instruction the une test (predicate code 14)
; expects - confirm that selecting it for the ordered form is intended.
define amdgpu_kernel void @v_fcmp_bf16_one(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_one:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 6)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 2 (labelled ogt
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand, so the
; compare appears in swapped form as v_cmp_lt_f32.
define amdgpu_kernel void @v_fcmp_bf16_ogt(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ogt:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 2)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 3 (labelled oge
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand, so the
; compare appears in swapped form as v_cmp_le_f32.
define amdgpu_kernel void @v_fcmp_bf16_oge(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_oge:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 3)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 4 (labelled olt
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand, so the
; compare appears in swapped form as v_cmp_gt_f32.
define amdgpu_kernel void @v_fcmp_bf16_olt(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_olt:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 4)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 5 (labelled ole
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand, so the
; compare appears in swapped form as v_cmp_ge_f32.
define amdgpu_kernel void @v_fcmp_bf16_ole(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ole:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 5)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 9 (labelled ueq
; by the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_nlg_f32 against 0x42c80000.
define amdgpu_kernel void @v_fcmp_bf16_ueq(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ueq:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 9)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 14 (labelled une
; by the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_neq_f32 against 0x42c80000.
define amdgpu_kernel void @v_fcmp_bf16_une(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_une:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 14)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 10 (labelled ugt
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand and the
; compare is emitted as v_cmp_nge_f32.
define amdgpu_kernel void @v_fcmp_bf16_ugt(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ugt:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 10)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 11 (labelled uge
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand and the
; compare is emitted as v_cmp_ngt_f32.
define amdgpu_kernel void @v_fcmp_bf16_uge(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_uge:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 11)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 12 (labelled ult
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand and the
; compare is emitted as v_cmp_nle_f32.
define amdgpu_kernel void @v_fcmp_bf16_ult(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ult:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 12)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 7 (labelled o by
; the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_o_f32 against 0x42c80000.
define amdgpu_kernel void @v_fcmp_bf16_o(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_o:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 7)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 8 (labelled uo
; by the test name) and stores the i32 result to %out. The expected gfx1200
; output shifts %src left by 16 and issues v_cmp_u_f32 against 0x42c80000.
define amdgpu_kernel void @v_fcmp_bf16_uo(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_uo:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 8)
store i32 %result, ptr addrspace(1) %out
ret void
}

; Compares %src with the constant 100.0 using predicate code 13 (labelled ule
; by the test name) and stores the i32 result to %out. In the expected
; gfx1200 output the constant 0x42c80000 is the first source operand and the
; compare is emitted as v_cmp_nlt_f32.
define amdgpu_kernel void @v_fcmp_bf16_ule(ptr addrspace(1) %out, bfloat %src) {
; GFX12-LABEL: v_fcmp_bf16_ule:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s2, s2, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 13)
store i32 %result, ptr addrspace(1) %out
ret void
}