Skip to content

Commit d613fe2

Browse files
Pravin Jagtap authored and arsenm committed
AMDGPU: Add v_prng_b32 instruction for gfx950
Random-number instruction (v_prng_b32) for stochastic rounding.
1 parent 54f5e02 commit d613fe2

File tree

16 files changed

+252
-3
lines changed

16 files changed

+252
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,5 +522,7 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs",
522522
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
523523
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
524524

525+
TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst")
526+
525527
#undef BUILTIN
526528
#undef TARGET_BUILTIN

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9090
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
9191
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92-
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
92+
// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
9393
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9494
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
9595
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -emit-llvm \
2+
// RUN: -verify -o - %s
3+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \
4+
// RUN: -verify -o - %s
5+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \
6+
// RUN: -verify -o - %s
7+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \
8+
// RUN: -verify -o - %s
9+
10+
11+
// REQUIRES: amdgpu-registered-target
12+
13+
typedef unsigned int uint;
14+
void test_prng_b32(global uint* out, uint a) {
15+
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
16+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
3+
// REQUIRES: amdgpu-registered-target
4+
5+
typedef unsigned int uint;
6+
7+
// CHECK-LABEL: @test_prng_b32(
8+
// CHECK-NEXT: entry:
9+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
10+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
11+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
12+
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
13+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
14+
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
15+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
16+
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
17+
// CHECK-NEXT: ret void
18+
//
19+
void test_prng_b32(global uint* out, uint a) {
20+
*out = __builtin_amdgcn_prng_b32(a);
21+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
594594
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
595595
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
596596

597+
def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
598+
[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
599+
>, ClangBuiltin<"__builtin_amdgcn_prng_b32">;
600+
597601
} // TargetPrefix = "amdgcn"
598602

599603
// New-style image intrinsics

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -978,6 +978,12 @@ def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
978978
"VMEM instructions of the same type write VGPR results in order"
979979
>;
980980

981+
def FeaturePrngInst : SubtargetFeature<"prng-inst",
982+
"HasPrngInst",
983+
"true",
984+
"Has v_prng_b32 instruction"
985+
>;
986+
981987
//===------------------------------------------------------------===//
982988
// Subtarget Features (options and debugging)
983989
//===------------------------------------------------------------===//
@@ -1495,7 +1501,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
14951501
FeatureFP8ConversionInsts,
14961502
FeatureCvtFP8VOP1Bug,
14971503
FeatureGFX950Insts,
1498-
FeatureAddressableLocalMemorySize163840
1504+
FeatureAddressableLocalMemorySize163840,
1505+
FeaturePrngInst
14991506
])>;
15001507

15011508
def FeatureISAVersion9_4_0 : FeatureSet<
@@ -2344,6 +2351,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
23442351
def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
23452352
AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
23462353

2354+
def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
2355+
AssemblerPredicate<(all_of FeaturePrngInst)>;
2356+
23472357
def HasGDS : Predicate<"Subtarget->hasGDS()">;
23482358

23492359
def HasGWS : Predicate<"Subtarget->hasGWS()">;

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
12531253

12541254
break;
12551255
}
1256+
case Intrinsic::amdgcn_prng_b32: {
1257+
auto *Src = II.getArgOperand(0);
1258+
if (isa<UndefValue>(Src)) {
1259+
return IC.replaceInstUsesWith(II, Src);
1260+
}
1261+
}
12561262
}
12571263
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12581264
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4515,6 +4515,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45154515
case Intrinsic::amdgcn_cvt_pk_u8_f32:
45164516
case Intrinsic::amdgcn_alignbyte:
45174517
case Intrinsic::amdgcn_perm:
4518+
case Intrinsic::amdgcn_prng_b32:
45184519
case Intrinsic::amdgcn_fdot2:
45194520
case Intrinsic::amdgcn_sdot2:
45204521
case Intrinsic::amdgcn_udot2:

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
220220
bool HasSALUFloatInsts = false;
221221
bool HasPseudoScalarTrans = false;
222222
bool HasRestrictedSOffset = false;
223-
223+
bool HasPrngInst = false;
224224
bool HasVcmpxPermlaneHazard = false;
225225
bool HasVMEMtoScalarWriteHazard = false;
226226
bool HasSMEMtoVectorWriteHazard = false;
@@ -1321,6 +1321,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13211321
/// instruction.
13221322
unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
13231323

1324+
bool hasPrngInst() const { return HasPrngInst; }
1325+
13241326
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
13251327
/// SGPRs
13261328
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,9 @@ let SubtargetPredicate = isGFX11Plus in {
761761
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
762762
} // End SubtargetPredicate = isGFX11Plus
763763

764+
let SubtargetPredicate = HasPrngInst in
765+
defm V_PRNG_B32 : VOP1Inst <"v_prng_b32", VOP_I32_I32, int_amdgcn_prng_b32>;
766+
764767
foreach vt = Reg32Types.types in {
765768
def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
766769
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
@@ -1516,6 +1519,8 @@ defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
15161519
defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
15171520
defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
15181521

1522+
defm V_PRNG_B32 : VOP1_Real_gfx9 <0x58>;
1523+
15191524
class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat <
15201525
(vt (int_amdgcn_mov_dpp8 vt:$src, timm:$dpp8)),
15211526
(Inst VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))> {

0 commit comments

Comments
 (0)