@@ -1040,32 +1040,58 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1040
1040
if (CI.Offset == Paired.Offset )
1041
1041
return false ;
1042
1042
1043
+ // Use 2-byte element size if both tbuffer formats are 16-bit.
1044
+ unsigned EltSize = CI.EltSize ;
1045
+ auto Has16BitComponents = [&](unsigned Format) -> bool {
1046
+ const auto *Info = AMDGPU::getGcnBufferFormatInfo (Format, STI);
1047
+ return Info && Info->BitsPerComp == 16 ;
1048
+ };
1049
+
1050
+ if ((CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE)) {
1051
+ // TODO: Support merging 8-bit tbuffer load/store instructions
1052
+ if (Has16BitComponents (CI.Format ) && Has16BitComponents (Paired.Format ))
1053
+ EltSize = 2 ;
1054
+ }
1055
+
1043
1056
// This won't be valid if the offset isn't aligned.
1044
- if ((CI.Offset % CI. EltSize != 0 ) || (Paired.Offset % CI. EltSize != 0 ))
1057
+ if ((CI.Offset % EltSize != 0 ) || (Paired.Offset % EltSize != 0 ))
1045
1058
return false ;
1046
1059
1047
1060
if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1048
1061
1049
- const llvm:: AMDGPU::GcnBufferFormatInfo *Info0 =
1050
- llvm:: AMDGPU::getGcnBufferFormatInfo (CI.Format , STI);
1062
+ const AMDGPU::GcnBufferFormatInfo *Info0 =
1063
+ AMDGPU::getGcnBufferFormatInfo (CI.Format , STI);
1051
1064
if (!Info0)
1052
1065
return false ;
1053
- const llvm:: AMDGPU::GcnBufferFormatInfo *Info1 =
1054
- llvm:: AMDGPU::getGcnBufferFormatInfo (Paired.Format , STI);
1066
+ const AMDGPU::GcnBufferFormatInfo *Info1 =
1067
+ AMDGPU::getGcnBufferFormatInfo (Paired.Format , STI);
1055
1068
if (!Info1)
1056
1069
return false ;
1057
1070
1058
1071
if (Info0->BitsPerComp != Info1->BitsPerComp ||
1059
1072
Info0->NumFormat != Info1->NumFormat )
1060
1073
return false ;
1061
1074
1062
- // TODO: Should be possible to support more formats, but if format loads
1063
- // are not dword-aligned, the merged load might not be valid.
1064
- if (Info0->BitsPerComp != 32 )
1075
+ // Buffer instructions support up to 4 components per access (e.g., x, xy,
1076
+ // xyz, xyzw).
1077
+ unsigned NumCombinedComponents = CI.Width + Paired.Width ;
1078
+ if (NumCombinedComponents > 4 )
1065
1079
return false ;
1066
1080
1067
- if (getBufferFormatWithCompCount (CI.Format , CI.Width + Paired.Width , STI) == 0 )
1081
+ if (getBufferFormatWithCompCount (CI.Format , NumCombinedComponents, STI) ==
1082
+ 0 )
1068
1083
return false ;
1084
+
1085
+ // Merge only when the two access ranges are strictly back-to-back;
1086
+ // any gap or overlap can overwrite data or leave holes.
1087
+ unsigned BytePerComp = Info0->BitsPerComp / 8 ;
1088
+ unsigned ElemIndex0 = CI.Offset / BytePerComp;
1089
+ unsigned ElemIndex1 = Paired.Offset / BytePerComp;
1090
+ if (!(ElemIndex0 + CI.Width == ElemIndex1 ||
1091
+ ElemIndex1 + Paired.Width == ElemIndex0))
1092
+ return false ;
1093
+
1094
+ return true ;
1069
1095
}
1070
1096
1071
1097
uint32_t EltOffset0 = CI.Offset / CI.EltSize ;
@@ -1076,7 +1102,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1076
1102
// Handle all non-DS instructions.
1077
1103
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1078
1104
if (EltOffset0 + CI.Width != EltOffset1 &&
1079
- EltOffset1 + Paired.Width != EltOffset0)
1105
+ EltOffset1 + Paired.Width != EltOffset0)
1080
1106
return false ;
1081
1107
if (CI.CPol != Paired.CPol )
1082
1108
return false ;
0 commit comments