[AMDGPU] Add TDM Descriptor Optimization Pass #173324
base: main
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: None (tyb0807)

Changes: AMDGPU code generation frequently creates tensor and address descriptors using sequences of `insertelement` instructions. When these patterns have significant field reuse or cross-iteration dependencies, the current approach generates inefficient `REG_SEQUENCE` operations in MIR instead of `INSERT_SUBREG` chains, leading to suboptimal register allocation. This PR introduces the `AMDGPUTDMOptimization` pass.
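To illustrate the intended rewrite, here is a minimal hand-written before/after sketch (the descriptor template values and the `@use` callee are hypothetical, not taken from the patch):

```llvm
; Before: descriptor assembled by an insertelement chain.
%d0 = insertelement <4 x i32> <i32 0, i32 1024, i32 7, i32 0>, i32 %lo, i32 0
%d1 = insertelement <4 x i32> %d0, i32 %hi, i32 1
call void @use(<4 x i32> %d1)

; After: the constant template is stored once to private (addrspace(5))
; storage, only the variable fields are rewritten, and the descriptor is
; reloaded at the use. SROA can then turn this into subregister updates.
%storage = alloca <4 x i32>, align 16, addrspace(5)
store <4 x i32> <i32 0, i32 1024, i32 7, i32 0>, ptr addrspace(5) %storage
%f0 = getelementptr <4 x i32>, ptr addrspace(5) %storage, i32 0, i32 0
store i32 %lo, ptr addrspace(5) %f0
%f1 = getelementptr <4 x i32>, ptr addrspace(5) %storage, i32 0, i32 1
store i32 %hi, ptr addrspace(5) %f1
%desc = load <4 x i32>, ptr addrspace(5) %storage
call void @use(<4 x i32> %desc)
```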
Patch is 50.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/173324.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..e0110d33cf5f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass();
FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createAMDGPUTDMOptimizationPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
@@ -170,6 +171,8 @@ extern char &AMDGPUPrepareAGPRAllocLegacyID;
void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsLegacyID;
+void initializeAMDGPUTDMOptimizationPass(PassRegistry &);
+
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp
new file mode 100644
index 0000000000000..6bcb4acd87b77
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp
@@ -0,0 +1,495 @@
+//===-- AMDGPUTDMOptimization.cpp - TDM Descriptor Optimization ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes Tensor Data Movement (TDM) descriptor creation patterns.
+// It identifies insertelement chains that create descriptors and transforms them
+// to use alloca+field updates, which SROA later optimizes to INSERT_SUBREG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-tdm-optimization"
+
+static cl::opt<unsigned>
+TDMOptBenefitThreshold("amdgpu-tdm-opt-threshold", cl::Hidden, cl::init(10),
+ cl::desc("Minimum optimization benefit threshold for TDM descriptor optimization"));
+
+namespace llvm {
+ void initializeAMDGPUTDMOptimizationPass(PassRegistry &);
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Pattern Detection Data Structures
+//===----------------------------------------------------------------------===//
+
+/// Represents a single descriptor creation pattern
+struct DescriptorPattern {
+ Type *DescType; ///< <4 x i32> or <8 x i32>
+ Value *BaseValue; ///< Base template (constant or computed)
+ SmallVector<InsertElementInst *, 8> Chain; ///< Chain of insertelement instructions
+ SmallVector<unsigned, 8> VariableFields; ///< Fields that change
+ SmallVector<unsigned, 8> ConstantFields; ///< Fields that stay constant
+ BasicBlock *Location; ///< Where the pattern is located
+ Loop *ContainingLoop; ///< Loop containing this pattern (if any)
+
+ /// Calculate field reuse ratio (constant fields / total fields)
+ float getFieldReuseRatio() const {
+ unsigned totalFields = cast<FixedVectorType>(DescType)->getNumElements();
+ return (float)ConstantFields.size() / totalFields;
+ }
+
+ /// Check if this pattern is worth optimizing
+ bool isWorthOptimizing() const {
+ // Always optimize if in loop with reuse potential
+ if (ContainingLoop && getFieldReuseRatio() >= 0.5f)
+ return true;
+
+ // Optimize if significant field reuse
+ if (getFieldReuseRatio() >= 0.75f)
+ return true;
+
+ // Optimize address descriptors (common case)
+ if (isAddressDescriptor() && ConstantFields.size() >= 1)
+ return true;
+
+ return false;
+ }
+
+ /// Check if this is an address descriptor (<4 x i32>)
+ bool isAddressDescriptor() const {
+ auto *VecTy = cast<FixedVectorType>(DescType);
+ return VecTy->getNumElements() == 4 && VecTy->getElementType()->isIntegerTy(32);
+ }
+
+ /// Check if this is a tensor descriptor (<8 x i32>)
+ bool isTensorDescriptor() const {
+ auto *VecTy = cast<FixedVectorType>(DescType);
+ return VecTy->getNumElements() == 8 && VecTy->getElementType()->isIntegerTy(32);
+ }
+};
+
+/// Groups similar descriptor patterns for optimization
+struct DescriptorGroup {
+ SmallVector<DescriptorPattern, 4> Patterns;
+ Type *SharedType;
+ Value *SharedBase; ///< Common base value (if any)
+ SmallVector<unsigned, 8> SharedConstantFields;
+
+ /// Calculate total optimization benefit
+ unsigned getOptimizationBenefit() const {
+ unsigned benefit = 0;
+ for (const auto &pattern : Patterns) {
+ // Base benefit from field reuse
+ benefit += pattern.ConstantFields.size() * 2;
+
+ // Extra benefit for loop patterns
+ if (pattern.ContainingLoop)
+ benefit *= 5;
+ }
+ return benefit;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// AMDGPUTDMOptimization Pass
+//===----------------------------------------------------------------------===//
+
+class AMDGPUTDMOptimization : public FunctionPass {
+private:
+ LoopInfo *LI = nullptr;
+
+ /// Detected patterns in the function
+ SmallVector<DescriptorPattern, 16> DetectedPatterns;
+
+ /// Groups of optimizable patterns
+ SmallVector<DescriptorGroup, 8> OptimizationGroups;
+
+public:
+ static char ID;
+
+ AMDGPUTDMOptimization() : FunctionPass(ID) {
+ initializeAMDGPUTDMOptimizationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ /// Main optimization phases
+ bool detectDescriptorPatterns(Function &F);
+ void groupSimilarPatterns();
+ bool transformPatterns(Function &F);
+
+ /// Pattern detection helpers
+ bool isDescriptorType(Type *Ty) const;
+ DescriptorPattern analyzeInsertChain(InsertElementInst *FinalInsert);
+ Value *extractBaseValue(const DescriptorPattern &Pattern);
+
+ /// Transformation helpers
+ bool transformDescriptorGroup(DescriptorGroup &Group, Function &F);
+ Value *createSharedStorage(DescriptorGroup &Group, IRBuilder<> &Builder);
+ void transformSinglePattern(DescriptorPattern &Pattern, Value *SharedStorage,
+ IRBuilder<> &Builder);
+
+ /// Utility functions
+ Loop *getContainingLoop(BasicBlock *BB);
+ bool arePatternsSimilar(const DescriptorPattern &A, const DescriptorPattern &B);
+};
+
+//===----------------------------------------------------------------------===//
+// Pass Implementation
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ LLVM_DEBUG(dbgs() << "Running TDM optimization on function: " << F.getName() << "\n");
+
+ // Phase 1: Detect descriptor patterns
+ if (!detectDescriptorPatterns(F)) {
+ LLVM_DEBUG(dbgs() << "No descriptor patterns found\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Found " << DetectedPatterns.size() << " descriptor patterns\n");
+
+ // Phase 2: Group similar patterns for optimization
+ groupSimilarPatterns();
+
+ LLVM_DEBUG(dbgs() << "Created " << OptimizationGroups.size() << " optimization groups\n");
+
+ // Phase 3: Transform patterns
+ bool Changed = transformPatterns(F);
+
+ // Cleanup for next function
+ DetectedPatterns.clear();
+ OptimizationGroups.clear();
+
+ return Changed;
+}
+
+void AMDGPUTDMOptimization::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Detection
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::detectDescriptorPatterns(Function &F) {
+ bool FoundPatterns = false;
+
+ // Scan function for insertelement instructions that create descriptors
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *IE = dyn_cast<InsertElementInst>(&I);
+ if (!IE || !isDescriptorType(IE->getType()))
+ continue;
+
+ // Check if this is the final insert in a descriptor creation chain
+ if (!IE->hasOneUse() || isa<InsertElementInst>(*IE->user_begin()))
+ continue;
+
+ // Analyze the complete chain
+ DescriptorPattern Pattern = analyzeInsertChain(IE);
+ if (Pattern.Chain.empty())
+ continue;
+
+ // Check if worth optimizing
+ if (!Pattern.isWorthOptimizing()) {
+ LLVM_DEBUG(dbgs() << "Pattern not worth optimizing: field reuse ratio = "
+ << Pattern.getFieldReuseRatio() << "\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Found optimizable pattern: "
+ << (Pattern.isAddressDescriptor() ? "Address" : "Tensor")
+ << " descriptor with " << Pattern.ConstantFields.size()
+ << " constant fields\n");
+
+ DetectedPatterns.push_back(std::move(Pattern));
+ FoundPatterns = true;
+ }
+ }
+
+ return FoundPatterns;
+}
+
+bool AMDGPUTDMOptimization::isDescriptorType(Type *Ty) const {
+ auto *VecTy = dyn_cast<FixedVectorType>(Ty);
+ if (!VecTy || !VecTy->getElementType()->isIntegerTy(32))
+ return false;
+
+ unsigned NumElements = VecTy->getNumElements();
+ return NumElements == 4 || NumElements == 8; // Address or tensor descriptors
+}
+
+DescriptorPattern AMDGPUTDMOptimization::analyzeInsertChain(InsertElementInst *FinalInsert) {
+ DescriptorPattern Pattern;
+ Pattern.DescType = FinalInsert->getType();
+ Pattern.Location = FinalInsert->getParent();
+ Pattern.ContainingLoop = getContainingLoop(Pattern.Location);
+
+ // Trace back the insertelement chain
+ SmallVector<InsertElementInst *, 8> Chain;
+ Value *CurrentVal = FinalInsert;
+
+ while (auto *IE = dyn_cast<InsertElementInst>(CurrentVal)) {
+ Chain.push_back(IE);
+ CurrentVal = IE->getOperand(0); // Vector being inserted into
+ }
+
+ // Reverse to get forward order
+ std::reverse(Chain.begin(), Chain.end());
+ Pattern.Chain = Chain;
+
+ // Extract base value (the initial vector)
+ Pattern.BaseValue = extractBaseValue(Pattern);
+
+ // Analyze which fields are constant vs variable
+ unsigned NumElements = cast<FixedVectorType>(Pattern.DescType)->getNumElements();
+ SmallBitVector FieldSet(NumElements, false);
+
+ for (auto *IE : Chain) {
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ unsigned Idx = CI->getZExtValue();
+ if (Idx < NumElements) {
+ FieldSet.set(Idx);
+ Pattern.VariableFields.push_back(Idx);
+ }
+ }
+ }
+
+ // Fields not in chain are constant
+ for (unsigned i = 0; i < NumElements; ++i) {
+ if (!FieldSet[i])
+ Pattern.ConstantFields.push_back(i);
+ }
+
+ return Pattern;
+}
+
+Value *AMDGPUTDMOptimization::extractBaseValue(const DescriptorPattern &Pattern) {
+ if (Pattern.Chain.empty())
+ return nullptr;
+
+ // Get the vector being inserted into by the first insert
+ Value *Base = Pattern.Chain[0]->getOperand(0);
+
+ // If base is a constant vector or another recognizable pattern, return it
+ if (isa<Constant>(Base))
+ return Base;
+
+ // For shufflevector results, we might want to trace further back
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(Base))
+ return SV; // Keep shufflevector as base for now
+
+ return Base;
+}
+
+Loop *AMDGPUTDMOptimization::getContainingLoop(BasicBlock *BB) {
+ return LI ? LI->getLoopFor(BB) : nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Grouping
+//===----------------------------------------------------------------------===//
+
+void AMDGPUTDMOptimization::groupSimilarPatterns() {
+ // Simple grouping strategy: group by type and base similarity
+ for (auto &Pattern : DetectedPatterns) {
+ bool Added = false;
+
+ // Try to add to existing group
+ for (auto &Group : OptimizationGroups) {
+ if (Group.SharedType == Pattern.DescType &&
+ arePatternsSimilar(Group.Patterns[0], Pattern)) {
+ Group.Patterns.push_back(Pattern);
+ Added = true;
+ break;
+ }
+ }
+
+ // Create new group if needed
+ if (!Added) {
+ DescriptorGroup NewGroup;
+ NewGroup.SharedType = Pattern.DescType;
+ NewGroup.SharedBase = Pattern.BaseValue;
+ NewGroup.Patterns.push_back(Pattern);
+ OptimizationGroups.push_back(std::move(NewGroup));
+ }
+ }
+
+ // Remove groups that don't meet optimization criteria
+ OptimizationGroups.erase(
+ std::remove_if(OptimizationGroups.begin(), OptimizationGroups.end(),
+ [](const DescriptorGroup &Group) {
+ return Group.getOptimizationBenefit() < TDMOptBenefitThreshold;
+ }),
+ OptimizationGroups.end()
+ );
+}
+
+bool AMDGPUTDMOptimization::arePatternsSimilar(const DescriptorPattern &A,
+ const DescriptorPattern &B) {
+ // Patterns are similar if they have same type and similar field usage
+ if (A.DescType != B.DescType)
+ return false;
+
+ // Check if constant fields overlap significantly
+ SmallBitVector AConstants(cast<FixedVectorType>(A.DescType)->getNumElements());
+ SmallBitVector BConstants(cast<FixedVectorType>(B.DescType)->getNumElements());
+
+ for (unsigned Field : A.ConstantFields)
+ AConstants.set(Field);
+ for (unsigned Field : B.ConstantFields)
+ BConstants.set(Field);
+
+ // Count overlapping constant fields
+ auto Intersection = AConstants & BConstants;
+ unsigned OverlapCount = Intersection.count();
+ unsigned TotalConstants = std::max(AConstants.count(), BConstants.count());
+
+ return TotalConstants > 0 && (float)OverlapCount / TotalConstants >= 0.5f;
+}
+
+//===----------------------------------------------------------------------===//
+// Pattern Transformation
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTDMOptimization::transformPatterns(Function &F) {
+ bool Changed = false;
+
+ for (auto &Group : OptimizationGroups) {
+ LLVM_DEBUG(dbgs() << "Transforming group with " << Group.Patterns.size()
+ << " patterns, benefit = " << Group.getOptimizationBenefit() << "\n");
+
+ if (transformDescriptorGroup(Group, F))
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool AMDGPUTDMOptimization::transformDescriptorGroup(DescriptorGroup &Group, Function &F) {
+ if (Group.Patterns.empty())
+ return false;
+
+ // Find the best location to place shared storage
+ BasicBlock *StorageLocation = Group.Patterns[0].Location;
+
+ // If patterns are in a loop, try to hoist storage outside loop
+ if (auto *Loop = Group.Patterns[0].ContainingLoop) {
+ if (auto *Preheader = Loop->getLoopPreheader()) {
+ StorageLocation = Preheader;
+ LLVM_DEBUG(dbgs() << "Hoisting storage outside loop\n");
+ }
+ }
+
+ // Create shared storage
+ IRBuilder<> Builder(StorageLocation->getTerminator());
+ Value *SharedStorage = createSharedStorage(Group, Builder);
+
+ if (!SharedStorage)
+ return false;
+
+ // Transform each pattern in the group
+ for (auto &Pattern : Group.Patterns) {
+ IRBuilder<> PatternBuilder(Pattern.Chain.back());
+ transformSinglePattern(Pattern, SharedStorage, PatternBuilder);
+ }
+
+ return true;
+}
+
+Value *AMDGPUTDMOptimization::createSharedStorage(DescriptorGroup &Group, IRBuilder<> &Builder) {
+ // Create alloca in address space 5 (AMDGPU private memory)
+ auto *StorageType = Group.SharedType;
+ auto *Storage = Builder.CreateAlloca(StorageType, /*AddrSpace=*/5, /*ArraySize=*/nullptr, "tdm_desc_storage");
+
+ // Initialize with base template if available
+ if (Group.SharedBase) {
+ auto *BaseConstant = dyn_cast<Constant>(Group.SharedBase);
+ if (BaseConstant) {
+ Builder.CreateStore(BaseConstant, Storage);
+ LLVM_DEBUG(dbgs() << "Initialized storage with constant base\n");
+ }
+ }
+
+ return Storage;
+}
+
+void AMDGPUTDMOptimization::transformSinglePattern(DescriptorPattern &Pattern,
+ Value *SharedStorage,
+ IRBuilder<> &Builder) {
+ // Create field pointers for variable fields
+ SmallVector<Value *, 8> FieldPointers;
+ for (unsigned FieldIdx : Pattern.VariableFields) {
+ Value *FieldPtr = Builder.CreateGEP(
+ Pattern.DescType, SharedStorage,
+ {Builder.getInt32(0), Builder.getInt32(FieldIdx)},
+ "tdm_field_" + Twine(FieldIdx) + "_ptr"
+ );
+ FieldPointers.push_back(FieldPtr);
+ }
+
+ // Update variable fields with values from the original chain
+ for (unsigned i = 0; i < Pattern.VariableFields.size() && i < Pattern.Chain.size(); ++i) {
+ auto *InsertInst = Pattern.Chain[i];
+ Value *NewValue = InsertInst->getOperand(1); // Value being inserted
+ Builder.CreateStore(NewValue, FieldPointers[i]);
+ }
+
+ // Replace final result with load from shared storage
+ Value *OptimizedDescriptor = Builder.CreateLoad(Pattern.DescType, SharedStorage,
+ "tdm_optimized_desc");
+
+ // Replace all uses of the final insert with the load
+ Pattern.Chain.back()->replaceAllUsesWith(OptimizedDescriptor);
+
+ // Let DCE (Dead Code Elimination) clean up the now-unused insertelement chains
+ // The instructions should be dead after replaceAllUsesWith above
+
+ LLVM_DEBUG(dbgs() << "Transformed pattern with " << Pattern.VariableFields.size()
+ << " variable fields\n");
+}
+
+} // end anonymous namespace
+
+char AMDGPUTDMOptimization::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUTDMOptimization, DEBUG_TYPE,
+ "AMDGPU TDM Descriptor Optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUTDMOptimization, DEBUG_TYPE,
+ "AMDGPU TDM Descriptor Optimization", false, false)
+
+namespace llvm {
+FunctionPass *createAMDGPUTDMOptimizationPass() {
+ return new AMDGPUTDMOptimization();
+}
+} // namespace llvm
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 309e92a2ee88e..459c693a5b979 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -129,6 +129,10 @@
using namespace llvm;
using namespace llvm::PatternMatch;
+static cl::opt<bool>
+EnableTDMOptimization("amdgpu-enable-tdm-opt", cl::Hidden, cl::init(true),
+ cl::desc("Enable AMDGPU TDM descriptor optimization"));
+
namespace {
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
@@ -593,6 +597,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
+ initializeAMDGPUTDMOptimizationPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1413,6 +1418,12 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
+ // Add TDM Descriptor Optimization + SROA sequence
+ if (EnableTDMOptimization) {
+ addPass(createAMDGPUTDMOptimizationPass()); // Create alloca patterns
+ addPass(createSROAPass()); // Convert to INSERT_SUBREG
+ }
+
TargetPassConfig::addCodeGenPrepare();
if (isPassEnabled(Ena...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
You can test this locally with the following command:

```
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Target/AMDGPU/AMDGPUTDMOptimization.cpp llvm/test/CodeGen/AMDGPU/tdm-optimization.ll llvm/lib/Target/AMDGPU/AMDGPU.h llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
```

The following files introduce new uses of undef:

`undef` is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. You should use `poison` values for placeholders instead.

In tests, avoid using `undef` and `poison` where possible.

For example, this is considered a bad practice:

```llvm
define void @fn() {
  ...
  br i1 undef, ...
}
```

Please use the following instead:

```llvm
define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}
```

Please refer to the Undefined Behavior Manual for more information.
```cpp
Loop *ContainingLoop; ///< Loop containing this pattern (if any)

/// Calculate field reuse ratio (constant fields / total fields)
float getFieldReuseRatio() const {
```
Maybe use unsigned, and just produce a number between 0 and 10 (i.e., the ratio scaled by 10).
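A minimal sketch of what that integer version could look like (a hypothetical member of `DescriptorPattern`, not part of the patch):

```cpp
/// Field reuse ratio scaled to 0..10, avoiding floating-point math.
unsigned getFieldReuseRatioX10() const {
  unsigned TotalFields = cast<FixedVectorType>(DescType)->getNumElements();
  return (ConstantFields.size() * 10) / TotalFields;
}
```

The float comparisons in `isWorthOptimizing()` would then be rewritten against this 0-10 scale (e.g., `>= 0.5f` becomes `>= 5`).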
```cpp
// Optimize if significant field reuse
if (getFieldReuseRatio() >= 0.75f)
  return true;
```
For a first iteration the somewhat magic constant could be fine, but ultimately I'd like to understand what our objective function is: what makes the transformation worth it? We should gear the analysis towards that.
```cpp
// Optimize address descriptors (common case)
if (isAddressDescriptor() && ConstantFields.size() >= 1)
  return true;
```
If that's the common case, should it come first?
Suggested change:
```diff
- if (isAddressDescriptor() && ConstantFields.size() >= 1)
+ if (isAddressDescriptor() && !ConstantFields.empty())
```
```cpp
if (!IE || !isDescriptorType(IE->getType()))
  continue;

// Check if this is the final insert in a descriptor creation chain
```
Here and everywhere, period at the end of the comment, per the coding standard.
```cpp
// Check if this is the final insert in a descriptor creation chain
if (!IE->hasOneUse() || isa<InsertElementInst>(*IE->user_begin()))
  continue;
```
Is the one-use limitation in line with what's produced by the frontend currently?
```cpp
bool AMDGPUTDMOptimization::runOnFunction(Function &F) {
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
```
Could you assert that DetectedPatterns is empty here?
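Something like the following would do it (a sketch; both member names are from the patch):

```cpp
assert(DetectedPatterns.empty() && OptimizationGroups.empty() &&
       "per-function state should have been cleared by the previous run");
```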
```cpp
Loop *AMDGPUTDMOptimization::getContainingLoop(BasicBlock *BB) {
  return LI ? LI->getLoopFor(BB) : nullptr;
```
LI shouldn't be able to be nullptr here, if I'm not mistaken.
Force-pushed from 1d7f41c to 319bec4 (Compare)
🪟 Windows x64 Test Results

Failed Tests (click on a test name to see its output):

- LLVM.CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
- LLVM.CodeGen/AMDGPU/GlobalISel/insertelement.ll
- LLVM.CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
- LLVM.CodeGen/AMDGPU/captured-frame-index.ll
- LLVM.CodeGen/AMDGPU/cgp-addressing-modes.ll
- LLVM.CodeGen/AMDGPU/extload-private.ll
- LLVM.CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
- LLVM.CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll
- LLVM.CodeGen/AMDGPU/flat-scratch.ll
- LLVM.CodeGen/AMDGPU/frame-index-elimination.ll
- LLVM.CodeGen/AMDGPU/insert_vector_elt.ll
- LLVM.CodeGen/AMDGPU/llc-pipeline.ll
- LLVM.CodeGen/AMDGPU/load-hi16.ll
- LLVM.CodeGen/AMDGPU/load-lo16.ll
- LLVM.CodeGen/AMDGPU/mdt-preserving-crash.ll
- LLVM.CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
- LLVM.CodeGen/AMDGPU/nested-calls.ll
- LLVM.CodeGen/AMDGPU/parallelandifcollapse.ll
- LLVM.CodeGen/AMDGPU/required-export-priority.ll
- LLVM.CodeGen/AMDGPU/resource-optimization-remarks.ll
- LLVM.CodeGen/AMDGPU/sibling-call.ll
- LLVM.CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
- LLVM.CodeGen/AMDGPU/stacksave_stackrestore.ll
- LLVM.CodeGen/AMDGPU/store-hi16.ll
- LLVM.DebugInfo/AMDGPU/pointer-address-space.ll
- LLVM.DebugInfo/AMDGPU/variable-locations.ll
- LLVM.tools/llvm-objdump/ELF/AMDGPU/source-lines.ll

If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the
🐧 Linux x64 Test Results

Failed Tests (click on a test name to see its output):

- LLVM.CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
- LLVM.CodeGen/AMDGPU/GlobalISel/insertelement.ll
- LLVM.CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll
- LLVM.CodeGen/AMDGPU/captured-frame-index.ll
- LLVM.CodeGen/AMDGPU/cgp-addressing-modes.ll
- LLVM.CodeGen/AMDGPU/extload-private.ll
- LLVM.CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
- LLVM.CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll
- LLVM.CodeGen/AMDGPU/flat-scratch.ll
- LLVM.CodeGen/AMDGPU/frame-index-elimination.ll
- LLVM.CodeGen/AMDGPU/insert_vector_elt.ll
- LLVM.CodeGen/AMDGPU/llc-pipeline.ll
- LLVM.CodeGen/AMDGPU/load-hi16.ll
- LLVM.CodeGen/AMDGPU/load-lo16.ll
- LLVM.CodeGen/AMDGPU/mdt-preserving-crash.ll
- LLVM.CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
- LLVM.CodeGen/AMDGPU/nested-calls.ll
- LLVM.CodeGen/AMDGPU/parallelandifcollapse.ll
- LLVM.CodeGen/AMDGPU/required-export-priority.ll
- LLVM.CodeGen/AMDGPU/resource-optimization-remarks.ll
- LLVM.CodeGen/AMDGPU/sibling-call.ll
- LLVM.CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
- LLVM.CodeGen/AMDGPU/stacksave_stackrestore.ll
- LLVM.CodeGen/AMDGPU/store-hi16.ll
- LLVM.DebugInfo/AMDGPU/pointer-address-space.ll
- LLVM.DebugInfo/AMDGPU/variable-locations.ll
- LLVM.tools/UpdateTestChecks/update_llc_test_checks/amdgpu-isel-support.test
- LLVM.tools/UpdateTestChecks/update_llc_test_checks/amdgpu_generated_funcs.test
- LLVM.tools/llvm-objdump/ELF/AMDGPU/source-lines.ll

If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the
AMDGPU code generation frequently creates tensor and address descriptors using sequences of `insertelement` instructions. When these patterns have significant field reuse or cross-iteration dependencies, the current approach generates inefficient `REG_SEQUENCE` operations in MIR, instead of `INSERT_SUBREG` chains, leading to suboptimal register allocation.

This PR introduces the `AMDGPUTDMOptimization` pass that:

- Pattern Detection: identifies descriptor creation chains with reusable constant fields
- Grouping: groups similar patterns to maximize optimization opportunities
- Transformation: converts `insertelement` chains to `alloca` + field updates in address space 5
- Integration: works with SROA to generate `INSERT_SUBREG` operations for optimal register allocation
```cpp
for (auto &BB : F) {
  for (auto &I : BB) {
```
You can remove one of the loops by using `llvm::instructions()` from InstIterator.h:
```diff
- for (auto &BB : F) {
-   for (auto &I : BB) {
+ for (auto &I : instructions(F)) {
```
| << " constant fields\n"); | ||
|
|
||
| DetectedPatterns.push_back(std::move(Pattern)); | ||
| FoundPatterns = true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can remove FoundPatterns. It's equal to !DetectedPatterns.empty().
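That is (a sketch), `detectDescriptorPatterns` could simply end with:

```cpp
return !DetectedPatterns.empty();
```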
```cpp
if (isa<Constant>(Base))
  return Base;

// For shufflevector results, we might want to trace further back
if (auto *SV = dyn_cast<ShuffleVectorInst>(Base))
  return SV; // Keep shufflevector as base for now

return Base;
```
All this code is equivalent to `return Base` at the moment, right?
```diff
- if (isa<Constant>(Base))
-   return Base;
-
- // For shufflevector results, we might want to trace further back
- if (auto *SV = dyn_cast<ShuffleVectorInst>(Base))
-   return SV; // Keep shufflevector as base for now
-
- return Base;
+ return Base;
```
If you're thinking "we might want to trace further back": sure, but let's keep that for a future PR and keep this one as simple as possible.
```cpp
SmallBitVector FieldSet(NumElements, false);

for (auto *IE : Chain) {
  if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
```
What happens if one of the indices is dynamic and not a constant?
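For context: in the current patch a non-constant index is silently skipped, so its field would end up classified as constant. One conservative option (a sketch, not part of the PR) is to give up on such chains, which the caller already handles via the empty-`Chain` check:

```cpp
auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
if (!CI)
  return DescriptorPattern(); // Dynamic index: bail out with an empty Chain.
```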
```cpp
//===----------------------------------------------------------------------===//
// Pattern Grouping
//===----------------------------------------------------------------------===//
```
This is a very subjective take, but I find the use of comments for structuring code problematic. Carefully choosing the names of the functions works better (which you did, actually). Somebody will add code below that is not related to "pattern grouping", and the comment will add to the confusion.
```cpp
// Try to add to existing group
for (auto &Group : OptimizationGroups) {
  if (Group.SharedType == Pattern.DescType &&
```
I think this condition is redundant with the first condition in arePatternsSimilar:
```cpp
if (Group.SharedType == Pattern.DescType &&
```
If instead of arePatternsSimilar we had a function `bool isPatternInGroup(const DescriptorGroup &G, const DescriptorPattern &Pattern)`, the code would be clearer, since you would be able to sink the condition into the function and remove the duplication.
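A sketch of that refactor (hypothetical helper, not in the patch):

```cpp
bool AMDGPUTDMOptimization::isPatternInGroup(const DescriptorGroup &G,
                                             const DescriptorPattern &P) {
  // arePatternsSimilar already rejects mismatched DescTypes, so no separate
  // type comparison is needed at the call site.
  return arePatternsSimilar(G.Patterns[0], P);
}
```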
```cpp
NewGroup.SharedType = Pattern.DescType;
NewGroup.SharedBase = Pattern.BaseValue;
```
Since SharedType and SharedBase seem to always alias Group.Patterns[0].DescType and Group.Patterns[0].BaseValue, I'd add a method to DescriptorGroup that computes them from the first pattern.
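For example (a sketch of such accessors, not in the patch):

```cpp
// On DescriptorGroup: derive the shared values from the first pattern
// instead of storing separate copies.
Type *getSharedType() const { return Patterns.front().DescType; }
Value *getSharedBase() const { return Patterns.front().BaseValue; }
```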
```cpp
// Create alloca in address space 5 (AMDGPU private memory)
auto *StorageType = Group.SharedType;
auto *Storage = Builder.CreateAlloca(
    StorageType, /*AddrSpace=*/5, /*ArraySize=*/nullptr, "tdm_desc_storage");
```
Don't create alloca instructions outside of the Function's entry block (especially if they have a constant known size).

And don't hardcode the address space; take it from the data layout (or at least use the enum).
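A sketch combining both suggestions (assumes `F` and `Group` are in scope, using the patch's naming):

```cpp
// Allocas with a known constant size belong in the entry block.
IRBuilder<> EntryBuilder(&*F.getEntryBlock().getFirstInsertionPt());
// Private (scratch) address space from the data layout, not a magic 5;
// AMDGPUAS::PRIVATE_ADDRESS would also work.
unsigned AllocaAS = F.getParent()->getDataLayout().getAllocaAddrSpace();
AllocaInst *Storage = EntryBuilder.CreateAlloca(
    Group.SharedType, AllocaAS, /*ArraySize=*/nullptr, "tdm_desc_storage");
```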
```cpp
if (!SharedStorage)
  return false;
```
createSharedStorage always returns something, right?
```cpp
// Add TDM Descriptor Optimization + SROA sequence
if (EnableTDMOptimization) {
  addPass(createAMDGPUTDMOptimizationPass()); // Create alloca patterns
  addPass(createSROAPass());                  // Convert to INSERT_SUBREG
```
I do not want to introduce a run of SROA into the backend pipeline; we just managed to finally remove it. I also don't follow how this is "convert to INSERT_SUBREG".
```cpp
FunctionPass *createAMDGPUTDMOptimizationPass() {
  return new AMDGPUTDMOptimization();
}
} // namespace llvm
```
Missing newline at end of file.
```cpp
BasicBlock *StorageLocation = Group.Patterns[0].Location;

// If patterns are in a loop, try to hoist storage outside loop
if (auto *Loop = Group.Patterns[0].ContainingLoop) {
```
Remove all the autos. LLVM policy is almost never auto.
```cpp
  return false;

unsigned NumElements = VecTy->getNumElements();
return NumElements == 4 || NumElements == 8; // Address or tensor descriptors
```
Is this another case that really should be a fat pointer?