Skip to content

Commit 47589ee

Browse files
committed
[AMDGPU] Support merging 16-bit TBUFFER load/store instructions
1 parent 64fe323 commit 47589ee

File tree

2 files changed

+491
-10
lines changed

2 files changed

+491
-10
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,32 +1040,58 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
10401040
if (CI.Offset == Paired.Offset)
10411041
return false;
10421042

1043+
// Use 2-byte element size if both tbuffer formats are 16-bit.
1044+
unsigned EltSize = CI.EltSize;
1045+
auto Has16BitComponents = [&](unsigned Format) -> bool {
1046+
const auto *Info = AMDGPU::getGcnBufferFormatInfo(Format, STI);
1047+
return Info && Info->BitsPerComp == 16;
1048+
};
1049+
1050+
if ((CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE)) {
1051+
// TODO: Support merging 8-bit tbuffer load/store instructions
1052+
if (Has16BitComponents(CI.Format) && Has16BitComponents(Paired.Format))
1053+
EltSize = 2;
1054+
}
1055+
10431056
// This won't be valid if the offset isn't aligned.
1044-
if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1057+
if ((CI.Offset % EltSize != 0) || (Paired.Offset % EltSize != 0))
10451058
return false;
10461059

10471060
if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
10481061

1049-
const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1050-
llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1062+
const AMDGPU::GcnBufferFormatInfo *Info0 =
1063+
AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
10511064
if (!Info0)
10521065
return false;
1053-
const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1054-
llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1066+
const AMDGPU::GcnBufferFormatInfo *Info1 =
1067+
AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
10551068
if (!Info1)
10561069
return false;
10571070

10581071
if (Info0->BitsPerComp != Info1->BitsPerComp ||
10591072
Info0->NumFormat != Info1->NumFormat)
10601073
return false;
10611074

1062-
// TODO: Should be possible to support more formats, but if format loads
1063-
// are not dword-aligned, the merged load might not be valid.
1064-
if (Info0->BitsPerComp != 32)
1075+
// Buffer instructions support up to 4 components per access (e.g., x, xy,
1076+
// xyz, xyzw).
1077+
unsigned NumCombinedComponents = CI.Width + Paired.Width;
1078+
if (NumCombinedComponents > 4)
10651079
return false;
10661080

1067-
if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1081+
if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1082+
0)
10681083
return false;
1084+
1085+
// Merge only when the two access ranges are strictly back-to-back;
1086+
// any gap or overlap can overwrite data or leave holes.
1087+
unsigned BytePerComp = Info0->BitsPerComp / 8;
1088+
unsigned ElemIndex0 = CI.Offset / BytePerComp;
1089+
unsigned ElemIndex1 = Paired.Offset / BytePerComp;
1090+
if (!(ElemIndex0 + CI.Width == ElemIndex1 ||
1091+
ElemIndex1 + Paired.Width == ElemIndex0))
1092+
return false;
1093+
1094+
return true;
10691095
}
10701096

10711097
uint32_t EltOffset0 = CI.Offset / CI.EltSize;
@@ -1076,7 +1102,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
10761102
// Handle all non-DS instructions.
10771103
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
10781104
if (EltOffset0 + CI.Width != EltOffset1 &&
1079-
EltOffset1 + Paired.Width != EltOffset0)
1105+
EltOffset1 + Paired.Width != EltOffset0)
10801106
return false;
10811107
if (CI.CPol != Paired.CPol)
10821108
return false;

0 commit comments

Comments
 (0)