[AArch64] Use correct regclass for spills of ZPR2/ZPR4 #148806
Conversation
Commit 8a397b6 forced the register class ZPR[24]StridedOrContiguous for spills/fills of ZPR2 and ZPR4, but this may cause issues when the regclass for the fill is ZPR2/ZPR4: that class allows the register allocator to pick `z1_z2`, which is not a supported register for ZPR2StridedOrContiguous, since it only supports tuples of the form (strided) `z0_z8`, `z1_z9` or (contiguous, starting at a multiple of 2) `z0_z1`, `z2_z3`. For spills we could add a new register class that supports any of the tuple forms, but I've decided to use two pseudos, similar to the fills, for consistency.
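To make the constraint concrete, here is a minimal standalone sketch of which ZPR2 tuples each class accepts. It is a model, not LLVM API: the strided ranges (first register in z0-z7 or z16-z23) are an assumption drawn from the SME2 strided-register encoding, wrap-around pairs like z31_z0 are ignored, and the helper names are illustrative.

```cpp
#include <cassert>

// A ZPR2 tuple modeled by its first z-register and the stride to the
// second (1 = contiguous pair z_n,z_{n+1}; 8 = strided pair z_n,z_{n+8}).
struct ZPR2Tuple {
  unsigned First;
  unsigned Stride;
};

// ZPR2 holds every contiguous pair, regardless of alignment.
// (Wrap-around pairs are left out of this sketch.)
bool isInZPR2(const ZPR2Tuple &T) { return T.Stride == 1 && T.First < 31; }

// ZPR2StridedOrContiguous holds strided pairs whose first register is in
// z0-z7 or z16-z23 (assumed ranges, per the SME2 strided encoding), plus
// contiguous pairs starting at a multiple of 2: z0_z1, z2_z3, ...
bool isInZPR2StridedOrContiguous(const ZPR2Tuple &T) {
  if (T.Stride == 8)
    return T.First < 8 || (T.First >= 16 && T.First < 24);
  return T.Stride == 1 && T.First < 31 && T.First % 2 == 0;
}

int main() {
  // z1_z2 is a valid ZPR2 tuple but not a valid member of
  // ZPR2StridedOrContiguous -- exactly the mismatch this patch fixes.
  assert(isInZPR2({1, 1}));
  assert(!isInZPR2StridedOrContiguous({1, 1}));
  assert(isInZPR2StridedOrContiguous({0, 1})); // z0_z1, contiguous
  assert(isInZPR2StridedOrContiguous({1, 8})); // z1_z9, strided
}
```

With a single pseudo constrained to the strided-or-contiguous operand class, a fill whose regclass is plain ZPR2 lets the allocator assign `z1_z2` and then violates the pseudo's operand constraint; splitting the pseudos gives each regclass an operand class that matches it.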
@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

Changes: as described above.

Full diff: https://github.com/llvm/llvm-project/pull/148806.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 36f3a670808d4..07b36d20b0c6d 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1591,18 +1591,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
"Non-writeback variants of STGloop / STZGloop should not "
"survive past PrologEpilogInserter.");
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
case AArch64::STR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
case AArch64::STR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
case AArch64::LDR_ZZZXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
case AArch64::LDR_PPXI:
return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
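The new cases reuse the existing expansion: an N-register tuple pseudo becomes N single-vector STR_ZXI/LDR_ZXI instructions at consecutive immediate offsets, one per subregister. A standalone model of that scheme follows (illustrative names; the real expandSVESpillFill operates on MachineInstr operands):

```cpp
#include <cstdio>
#include <vector>

// Stand-in for a tuple spill pseudo: the z-register numbers in the
// tuple, the base register, and a scaled immediate offset.
struct SpillPseudo {
  std::vector<unsigned> ZRegs; // e.g. {1, 2} for the z1_z2 tuple
  const char *Base;            // e.g. "sp"
  int Imm;                     // in multiples of the scalable vector size
};

// Emit one STR_ZXI per tuple element at consecutive offsets -- the same
// shape expandSVESpillFill(..., AArch64::STR_ZXI, N) produces.
void expandSpill(const SpillPseudo &P) {
  for (unsigned I = 0; I < P.ZRegs.size(); ++I)
    std::printf("STR_ZXI $z%u, $%s, %d\n", P.ZRegs[I], P.Base,
                P.Imm + static_cast<int>(I));
}

int main() {
  expandSpill({{1, 2}, "sp", 0});
  // STR_ZXI $z1, $sp, 0
  // STR_ZXI $z2, $sp, 1  -- matches the EXPAND checks in the test below
}
```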
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa76..5420545cc3cec 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2482,8 +2482,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::LDR_PXI:
case AArch64::LDR_ZXI:
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZXI:
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDRBBui:
case AArch64::LDRBui:
case AArch64::LDRDui:
@@ -2525,8 +2527,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STR_PXI:
case AArch64::STR_ZXI:
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::STR_ZZZXI:
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::STRBBui:
case AArch64::STRBui:
case AArch64::STRDui:
@@ -4318,7 +4322,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
break;
// SVE
case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 4);
MinOffset = -256;
@@ -4332,7 +4338,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
MaxOffset = 253;
break;
case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
Scale = TypeSize::getScalable(16);
Width = TypeSize::getScalable(16 * 2);
MinOffset = -256;
@@ -5559,8 +5567,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
- } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register store without SVE store instructions");
+ Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZXI;
@@ -5584,8 +5596,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
- } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register store without SVE store instructions");
+ Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register store without SVE store instructions");
Opc = AArch64::STR_ZZZZXI;
@@ -5736,8 +5752,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
- } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register load without SVE load instructions");
+ Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZXI;
@@ -5761,8 +5781,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
- } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
- AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+ "Unexpected register load without SVE load instructions");
+ Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
+ StackID = TargetStackID::ScalableVector;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
"Unexpected register load without SVE load instructions");
Opc = AArch64::LDR_ZZZZXI;
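The getMemOpInfo numbers fall out of the underlying LDR_ZXI/STR_ZXI encoding: a signed 9-bit immediate scaled by the 16-byte scalable vector size gives a single-register offset range of [-256, 255], and an N-register pseudo that expands to offsets imm .. imm+N-1 loses N-1 from the top of that range. The hunks above cut off before the 4-tuple MaxOffset, but under this relation it would be 252. A quick check of the relation:

```cpp
#include <cassert>

// Offset range of a single STR_ZXI/LDR_ZXI: a signed 9-bit immediate,
// scaled by one scalable vector register (16 bytes x vscale).
constexpr int SingleMin = -256;
constexpr int SingleMax = 255;

// An N-register tuple pseudo touches offsets [Imm, Imm + N - 1], so its
// own immediate must keep the last access within the single-reg range.
constexpr int tupleMaxOffset(int N) { return SingleMax - (N - 1); }

int main() {
  assert(tupleMaxOffset(2) == 254); // STR_ZZXI / LDR_ZZXI
  assert(tupleMaxOffset(3) == 253); // matches the 253 visible above
  assert(tupleMaxOffset(4) == 252); // STR_ZZZZXI (assumed; cut off above)
}
```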
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eddb96979f7b8..0c4b4f4c3ed88 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in {
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
// AArch64ExpandPseudoInsts.
let mayLoad = 1, hasSideEffects = 0 in {
- def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+ def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let mayStore = 1, hasSideEffects = 0 in {
- def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+ def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
- def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
}
let AddedComplexity = 1 in {
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 83c9b73c57570..2b16dd0f29ecc 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s
-# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND
--- |
; ModuleID = '<stdin>'
source_filename = "<stdin>"
@@ -14,13 +14,14 @@
define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
- define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable }
+ define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
- define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable }
+ define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable }
attributes #0 = { nounwind "target-features"="+sve" }
attributes #1 = { nounwind "target-features"="+sve2p1" }
+ attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" }
...
---
@@ -318,10 +319,10 @@ registers:
- { id: 0, class: zpr2 }
stack:
liveins:
- - { reg: '$z0_z1', virtual-reg: '%0' }
+ - { reg: '$z1_z2', virtual-reg: '%0' }
body: |
bb.0.entry:
- liveins: $z0_z1
+ liveins: $z1_z2
; CHECK-LABEL: name: spills_fills_stack_id_zpr2
; CHECK: stack:
@@ -329,12 +330,12 @@ body: |
; CHECK-NEXT: stack-id: scalable-vector
; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
- ; EXPAND: STR_ZXI $z0, $sp, 0
- ; EXPAND: STR_ZXI $z1, $sp, 1
- ; EXPAND: $z0 = LDR_ZXI $sp, 0
- ; EXPAND: $z1 = LDR_ZXI $sp, 1
+ ; EXPAND: STR_ZXI $z1, $sp, 0
+ ; EXPAND: STR_ZXI $z2, $sp, 1
+ ; EXPAND: $z1 = LDR_ZXI $sp, 0
+ ; EXPAND: $z2 = LDR_ZXI $sp, 1
- %0:zpr2 = COPY $z0_z1
+ %0:zpr2 = COPY $z1_z2
$z0_z1_z2_z3 = IMPLICIT_DEF
$z4_z5_z6_z7 = IMPLICIT_DEF
@@ -345,7 +346,7 @@ body: |
$z24_z25_z26_z27 = IMPLICIT_DEF
$z28_z29_z30_z31 = IMPLICIT_DEF
- $z0_z1 = COPY %0
+ $z1_z2 = COPY %0
RET_ReallyLR
...
---
@@ -439,10 +440,10 @@ registers:
- { id: 0, class: zpr4 }
stack:
liveins:
- - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' }
+ - { reg: '$z1_z2_z3_z4', virtual-reg: '%0' }
body: |
bb.0.entry:
- liveins: $z0_z1_z2_z3
+ liveins: $z1_z2_z3_z4
; CHECK-LABEL: name: spills_fills_stack_id_zpr4
; CHECK: stack:
@@ -450,16 +451,16 @@ body: |
; CHECK-NEXT: stack-id: scalable-vector
; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
- ; EXPAND: STR_ZXI $z0, $sp, 0
- ; EXPAND: STR_ZXI $z1, $sp, 1
- ; EXPAND: STR_ZXI $z2, $sp, 2
- ; EXPAND: STR_ZXI $z3, $sp, 3
- ; EXPAND: $z0 = LDR_ZXI $sp, 0
- ; EXPAND: $z1 = LDR_ZXI $sp, 1
- ; EXPAND: $z2 = LDR_ZXI $sp, 2
- ; EXPAND: $z3 = LDR_ZXI $sp, 3
+ ; EXPAND: STR_ZXI $z1, $sp, 0
+ ; EXPAND: STR_ZXI $z2, $sp, 1
+ ; EXPAND: STR_ZXI $z3, $sp, 2
+ ; EXPAND: STR_ZXI $z4, $sp, 3
+ ; EXPAND: $z1 = LDR_ZXI $sp, 0
+ ; EXPAND: $z2 = LDR_ZXI $sp, 1
+ ; EXPAND: $z3 = LDR_ZXI $sp, 2
+ ; EXPAND: $z4 = LDR_ZXI $sp, 3
- %0:zpr4 = COPY $z0_z1_z2_z3
+ %0:zpr4 = COPY $z1_z2_z3_z4
$z0_z1_z2_z3 = IMPLICIT_DEF
$z4_z5_z6_z7 = IMPLICIT_DEF
@@ -470,7 +471,7 @@ body: |
$z24_z25_z26_z27 = IMPLICIT_DEF
$z28_z29_z30_z31 = IMPLICIT_DEF
- $z0_z1_z2_z3 = COPY %0
+ $z1_z2_z3_z4 = COPY %0
RET_ReallyLR
...
---
Comment on llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:

def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
nit: not all of this is from this PR, but it bugs me that there are three different ways to refer to the same concept (which makes future grepping harder):
strided_and_contiguous
StridedOrContiguous
STRIDED_CONTIGUOUS
(my vote is for "strided or contiguous")
Yeah, we've not been very consistent with the naming, I'll fix that up in a follow-up NFC patch. My vote is also for "strided or contiguous" in that case.
LGTM
Commit a629322 forced the register class of ZPR[24]StridedOrContiguous for spills/fills of ZPR2 and ZPR4, but this may result in issues when the regclass for the fill is ZPR2/ZPR4, which would allow the register allocator to pick `z1_z2`, which is not a supported register for ZPR2StridedOrContiguous; that class only supports tuples of the form (strided) `z0_z8`, `z1_z9` or (contiguous, starting at a multiple of 2) `z0_z1`, `z2_z3`. For spills we could add a new register class that supports any of the tuple forms, but I've decided to use two pseudos, similar to the fills, for consistency.

Fixes #148655
Fixes #148655