-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] Fix llvm.amdgcn.fcmp.bf16 #166877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,297 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s | ||
|
|
||
| declare i32 @llvm.amdgcn.fcmp.bf16(bfloat, bfloat, i32) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These intrinsics are deprecated, why is someone trying to use them with bfloat?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't find any use of the intrinsic with bfloat. You are right, this intrinsic does not seem necessary. Will clang emit a warning for the deprecated intrinsic? I am thinking of abandoning this PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Clang does not directly emit intrinsics; it would have to go through a builtin.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, is __builtin_amdgcn_fcmp deprecated then?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but that doesn't have a bfloat version
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Right, and it also does not have a half version. However, when you mentioned "deprecated", I think you meant types other than bfloat. I think amdgcn_fcmp is not necessary and is thus deprecated, regardless of type.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If a combine is folding an FP cast into the intrinsic, that would be a problem |
||
| declare bfloat @llvm.fabs.bf16(bfloat) | ||
|
|
||
| ; fabs is applied to %a only, then %src is compared against it with condition code 1 (oeq per the test name). | ||
| ; The checks expect a 0x7fff mask on one operand, both operands shifted left by 16, then v_cmp_eq_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_oeq_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_oeq_with_fabs: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) | ||
| ; GFX12-NEXT: s_lshl_b32 s3, s3, 16 | ||
| ; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we don't have bfloat compares, we shouldn't lower the intrinsic
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1. intrinsic should not be used like a builtin. |
||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %temp = call bfloat @llvm.fabs.bf16(bfloat %a) | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat %temp, i32 1) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; fabs is applied to both %src and %a before the compare with condition code 1 (oeq per the test name). | ||
| ; The checks expect a 0x7fff mask on both operands, each shifted left by 16, then v_cmp_eq_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_oeq_both_operands_with_fabs(ptr addrspace(1) %out, bfloat %src, bfloat %a) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_oeq_both_operands_with_fabs: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshr_b32 s3, s2, 16 | ||
| ; GFX12-NEXT: s_and_b32 s2, s2, 0x7fff | ||
| ; GFX12-NEXT: s_and_b32 s3, s3, 0x7fff | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_lshl_b32 s3, s3, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_eq_f32_e64 s2, s2, s3 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %temp = call bfloat @llvm.fabs.bf16(bfloat %a) | ||
| %src_input = call bfloat @llvm.fabs.bf16(bfloat %src) | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src_input, bfloat %temp, i32 1) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Passes condition code -1 to the intrinsic. The checks expect the kernel to contain only s_endpgm, | ||
| ; i.e. no argument load, compare, or store is emitted for this call. | ||
| define amdgpu_kernel void @v_fcmp_bf16(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 -1) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 1 (oeq per the test name): %src against the constant 100.0. | ||
| ; The checks expect %src shifted left by 16 and v_cmp_eq_f32_e64 with the constant (0x42c80000) as first operand. | ||
| define amdgpu_kernel void @v_fcmp_bf16_oeq(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_oeq: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 1) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 6 (one per the test name): %src against the constant 100.0 (0x42c80000 in the checks). | ||
| ; NOTE(review): the checks expect v_cmp_neq_f32_e64, the same instruction v_fcmp_bf16_une expects - confirm this is intended. | ||
| define amdgpu_kernel void @v_fcmp_bf16_one(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_one: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 6) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 2 (ogt per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_lt_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ogt(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ogt: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 2) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 3 (oge per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_le_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_oge(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_oge: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 3) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 4 (olt per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_gt_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_olt(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_olt: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 4) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 5 (ole per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_ge_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ole(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ole: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 5) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 9 (ueq per the test name): %src against the constant 100.0. | ||
| ; The checks expect v_cmp_nlg_f32_e64 with the constant (0x42c80000) as first operand. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ueq(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ueq: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 9) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 14 (une per the test name): %src against the constant 100.0. | ||
| ; The checks expect v_cmp_neq_f32_e64 with the constant (0x42c80000) as first operand. | ||
| define amdgpu_kernel void @v_fcmp_bf16_une(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_une: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 14) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 10 (ugt per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_nge_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ugt(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ugt: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 10) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 11 (uge per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_ngt_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_uge(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_uge: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 11) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 12 (ult per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_nle_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ult(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ult: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 12) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 7 (the "o" case per the test name): %src against the constant 100.0. | ||
| ; The checks expect v_cmp_o_f32_e64 with the constant (0x42c80000) as first operand. | ||
| define amdgpu_kernel void @v_fcmp_bf16_o(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_o: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 7) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 8 (uo per the test name): %src against the constant 100.0. | ||
| ; The checks expect v_cmp_u_f32_e64 with the constant (0x42c80000) as first operand. | ||
| define amdgpu_kernel void @v_fcmp_bf16_uo(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_uo: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 8) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
|
|
||
| ; Condition code 13 (ule per the test name): %src against the constant 100.0. | ||
| ; The checks place the constant (0x42c80000) first, so the expected instruction is v_cmp_nlt_f32_e64. | ||
| define amdgpu_kernel void @v_fcmp_bf16_ule(ptr addrspace(1) %out, bfloat %src) { | ||
| ; GFX12-LABEL: v_fcmp_bf16_ule: | ||
| ; GFX12: ; %bb.0: | ||
| ; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 | ||
| ; GFX12-NEXT: s_wait_kmcnt 0x0 | ||
| ; GFX12-NEXT: s_lshl_b32 s2, s2, 16 | ||
| ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) | ||
| ; GFX12-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2 | ||
| ; GFX12-NEXT: s_wait_alu 0xf1ff | ||
| ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 | ||
| ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] | ||
| ; GFX12-NEXT: s_endpgm | ||
| %result = call i32 @llvm.amdgcn.fcmp.bf16(bfloat %src, bfloat 100.00, i32 13) | ||
| store i32 %result, ptr addrspace(1) %out | ||
| ret void | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a check that these produce an appropriate error for a target without bfloat compares?