From ff2a819e6165ca2811826fd3539cd5d2dfdbaf3e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 9 Dec 2024 15:41:44 -0600 Subject: [PATCH] WIP: AMDGPU: Always select the VGPR version of MFMAs We do not want to use AGPRs unless absolutely required due to register pressure. Rely on a post-regalloc pass to replace VGPR MFMAs with the AGPR version if it avoids the copies introduced due to live range splitting. --- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 10 ++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 +------ .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 6 -- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6 -- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 55 ++++++++++--------- 5 files changed, 35 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b20760c356263..a8cabcb5831c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4867,31 +4867,29 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // for srcA/srcB? // // vdst, srcA, srcB, srcC - const SIMachineFunctionInfo *Info = MF.getInfo(); OpdsMapping[0] = - Info->mayNeedAGPRs() + !Subtarget.hasGFX90AInsts() ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + !Subtarget.hasGFX90AInsts() ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4: case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: { - const SIMachineFunctionInfo *Info = MF.getInfo(); OpdsMapping[0] = - Info->mayNeedAGPRs() + !Subtarget.hasGFX90AInsts() ? 
getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->mayNeedAGPRs() + !Subtarget.hasGFX90AInsts() ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d7dcf8c4a064..405535b7d27db 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16227,7 +16227,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - SIMachineFunctionInfo *Info = MF->getInfo(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. @@ -16238,7 +16237,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // use between vgpr and agpr as agpr tuples tend to be big. if (!MI.getDesc().operands().empty()) { unsigned Opc = MI.getOpcode(); - bool HasAGPRs = Info->mayNeedAGPRs(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); for (auto I : @@ -16246,7 +16244,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { if (I == -1) break; - if ((I == Src2Idx) && (HasAGPRs)) + if (I == Src2Idx) break; MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !Op.getReg().isVirtual()) @@ -16280,22 +16278,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, TII->legalizeOpWithMove(MI, Src1Idx); } } - - if (!HasAGPRs) - return; - - // Resolve the rest of AV operands to AGPRs. 
- if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { - if (Src2->isReg() && Src2->getReg().isVirtual()) { - auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); - if (TRI->isVectorSuperClass(RC)) { - auto *NewRC = TRI->getEquivalentAGPRClass(RC); - MRI.setRegClass(Src2->getReg(), NewRC); - if (Src2->isTied()) - MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); - } - } - } } return; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 67ad28661da43..a33dafac85b08 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -69,12 +69,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); - if (ST.hasGFX90AInsts() && - ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && - !mayUseAGPRs(F)) - MayNeedAGPRs = false; // We will select all MAI with VGPR operands. - if (AMDGPU::isChainCC(CC)) { // Chain functions don't receive an SP from their caller, but are free to // set one up. For now, we can use s32 to match what amdgpu_gfx functions diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 274a60adb8d07..1b45737f45106 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -497,8 +497,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; - bool MayNeedAGPRs : 1; - // The hard-wired high half of the address of the global information table // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since // current hardware only allows a 16 bit value. 
@@ -1172,10 +1170,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; } - bool mayNeedAGPRs() const { - return MayNeedAGPRs; - } - // \returns true if a function has a use of AGPRs via inline asm or // has a call which may use it. bool mayUseAGPRs(const Function &F) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index e8db879ca5077..6b6b74234cfef 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -856,17 +856,11 @@ defvar MayNotNeedAGPRs_gisel = [{ return !MF.getInfo()->mayNeedAGPRs(); }]; -class AgprMAIFrag : - MAIFrag { - let GISelPredicateCode = MayNeedAGPRs_gisel; -} +class AgprMAIFrag + : MAIFrag {} -class VgprMAIFrag : - MAIFrag { - let GISelPredicateCode = MayNotNeedAGPRs_gisel; -} +class VgprMAIFrag + : MAIFrag {} let isAsCheapAsAMove = 1, isReMaterializable = 1 in { defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>; @@ -917,10 +911,14 @@ multiclass MAIInst), Scaled>, MFMATable<0, "AGPR", NAME # "_e64">; - let OtherPredicates = [isGFX90APlus], Mnemonic = OpName in - def _vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), - !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag), Scaled>, - MFMATable<0, "VGPR", NAME # "_vgprcd_e64", NAME # "_e64">; + let OtherPredicates = [isGFX90APlus], Mnemonic = OpName, + AddedComplexity = 10 in def _vgprcd_e64 + : MAIInst("VOPProfileMAI_"#P#"_VCD"), + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, + VgprMAIFrag), + Scaled>, + MFMATable<0, "VGPR", NAME#"_vgprcd_e64", NAME#"_e64">; } if NoDstOverlap then { @@ -931,16 +929,22 @@ multiclass MAIInst), Scaled>, MFMATable<1, "AGPR", NAME # "_e64", NAME # "_mac_e64">; - let OtherPredicates = [isGFX90APlus] in - def _mac_vgprcd_e64 : MAIInst("VOPProfileMAI_" # P # "_VCD"), - !if(!eq(node, null_frag), null_frag, 
VgprMAIFrag), Scaled>, - MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">; + let OtherPredicates = [isGFX90APlus], + AddedComplexity = 10 in def _mac_vgprcd_e64 + : MAIInst("VOPProfileMAI_"#P#"_VCD"), + !if(!eq(node, null_frag), null_frag, + VgprMAIFrag), + Scaled>, + MFMATable<1, "VGPR", NAME#"_vgprcd_e64", NAME#"_mac_e64">; } } } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 } -// Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32 +// Provide a wrapper around MAIInst that provides the appended operands from +// V_MFMA_LD_SCALE_B32. AGPR variants are never selected; VGPR is selected and +// may later be rewritten to AGPR. multiclass ScaledMAIInst_mc { defvar VariantSuffix = !subst(!toupper(OpName), "", NAME); // Drop the main opcode name prefix to get the "_fN_fM" suffix. defvar UnscaledOpName = UnscaledOpName_#VariantSuffix; @@ -949,9 +953,9 @@ multiclass ScaledMAIInst_mc(!cast(UnscaledOpName#"_e64").Pfl).NoDstOverlap; - def _e64 : ScaledMAIInst(UnscaledOpName#"_e64"), !if(NoDstOverlap, null_frag, AgprMAIFrag)>, - MFMATable<0, "AGPR", NAME # "_e64">; + def _e64 + : ScaledMAIInst(UnscaledOpName#"_e64"), null_frag>, + MFMATable<0, "AGPR", NAME#"_e64">; def _vgprcd_e64 : ScaledMAIInst(UnscaledOpName#"_vgprcd_e64"), !if(NoDstOverlap, null_frag, VgprMAIFrag)>, @@ -961,9 +965,10 @@ multiclass ScaledMAIInst_mc(UnscaledOpName # "_mac_e64"), AgprMAIFrag>, - MFMATable<1, "AGPR", NAME # "_e64">; + def _mac_e64 + : ScaledMAIInst(UnscaledOpName#"_mac_e64"), null_frag>, + MFMATable<1, "AGPR", NAME#"_e64">; def _mac_vgprcd_e64 : ScaledMAIInst(UnscaledOpName # "_mac_vgprcd_e64"), VgprMAIFrag>,