[X86] X86FixupInstTuning - prefer VPBLENDD to VPBLENDW shuffles on AVX2+ targets #144269
Conversation
On many Intel AVX2 targets (Haswell+), VPBLENDD has notably better throughput than VPBLENDW - and the remaining Intel/AMD targets have no preference.

This patch replaces VPBLENDW shuffles if the shuffle mask can be safely widened from vXi16 to vXi32 and the scheduler model doesn't consider it a regression (I haven't found any target where it would be, but we should retain the model check).

Noticed while working on llvm#142972, where VMOVSS nodes were regressing to VPBLENDW nodes during domain switching.
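For intuition, here is a minimal standalone sketch of the mask-widening check in plain C++ (not the APInt-based code in the patch itself; the helper names are hypothetical). A VPBLENDW imm8 picks a source per word lane, so the rewrite is legal only when the two bits of every adjacent word pair agree - that is, scaling the 8-bit word mask down to a 4-bit dword mask and back reproduces the original mask:

```cpp
#include <cassert>
#include <cstdint>

// Narrow an 8-bit VPBLENDW mask to a 4-bit VPBLENDD mask: a dword bit is set
// only when *both* word bits of its pair are set (mirroring the
// MatchAllBits=true semantics of APIntOps::ScaleBitMask used in the patch).
static uint8_t narrowWordMask(uint8_t MaskW) {
  uint8_t MaskD = 0;
  for (unsigned I = 0; I != 4; ++I)
    if (((MaskW >> (2 * I)) & 0x3) == 0x3)
      MaskD |= 1u << I;
  return MaskD;
}

// Widen a 4-bit dword mask back to an 8-bit word mask.
static uint8_t widenDwordMask(uint8_t MaskD) {
  uint8_t MaskW = 0;
  for (unsigned I = 0; I != 4; ++I)
    if (MaskD & (1u << I))
      MaskW |= 0x3u << (2 * I);
  return MaskW;
}

// The VPBLENDW -> VPBLENDD rewrite is safe iff the round trip is lossless.
static bool canWidenBlendMask(uint8_t MaskW, uint8_t &MaskD) {
  MaskD = narrowWordMask(MaskW);
  return widenDwordMask(MaskD) == MaskW;
}

int main() {
  uint8_t MaskD;
  // 0xFC takes words 2..7 from the second source -> dwords 1..3 (mask 0xE),
  // matching the vpblendw -> vpblendd rewrites in the tests below.
  assert(canWidenBlendMask(0xFC, MaskD) && MaskD == 0xE);
  // 0x01 takes only word 0 from the second source - no dword equivalent.
  assert(!canWidenBlendMask(0x01, MaskD));
  return 0;
}
```

For the 256-bit forms the same 4-bit dword mask is splatted to 8 bits, since VPBLENDW's imm8 applies identically to each 128-bit lane - hence the xmm cases pass NumElts=4 and the ymm cases NumElts=8 below.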
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: On many Intel AVX2 targets (Haswell+), VPBLENDD has notably better throughput than VPBLENDW - and the remaining Intel/AMD targets have no preference. This patch replaces VPBLENDW shuffles if the shuffle mask can be safely widened from vXi16 to vXi32 and the scheduler model doesn't consider it a regression (I haven't found any target where it would be, but we should retain the model check). Noticed while working on #142972, where VMOVSS nodes were regressing to VPBLENDW nodes during domain switching.

Patch is 26.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144269.diff

10 Files Affected:
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 89093b2e1a3f5..68d6d2a76508c 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -242,6 +242,26 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKToIntDomain(NewOpc);
};
+ auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
+ if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
+ return false;
+ // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
+ APInt MaskW =
+ APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
+ APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
+ if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
+ return false;
+ APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
+ LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+ {
+ MI.setDesc(TII->get(MovOpc));
+ MI.removeOperand(NumOperands - 1);
+ MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
+ }
+ LLVM_DEBUG(dbgs() << " With: " << MI);
+ return true;
+ };
+
auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
unsigned MovImm) -> bool {
if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
@@ -270,6 +290,15 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
+ case X86::VPBLENDWrri:
+ return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
+ case X86::VPBLENDWrmi:
+ return ProcessBLENDWToBLENDD(X86::VPBLENDDrmi, 4);
+ case X86::VPBLENDWYrri:
+ return ProcessBLENDWToBLENDD(X86::VPBLENDDYrri, 8);
+ case X86::VPBLENDWYrmi:
+ return ProcessBLENDWToBLENDD(X86::VPBLENDDYrmi, 8);
+
case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 14e3767f65564..38ea796c0fcb0 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -424,7 +424,7 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 04d7a9691b645..3aa77c3955c63 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -317,8 +317,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT: vmovd %xmm2, %eax
; AVXVNNI-NEXT: addl %edx, %eax
@@ -328,9 +328,9 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT: vmovd %xmm2, %eax
@@ -343,8 +343,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index dfae853f9961e..456e6e8f263aa 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -27,7 +27,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_4xi8_zc:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -36,7 +36,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VNNI-LABEL: mul_4xi8_zc:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VNNI-NEXT: vmovd %xmm1, %eax
@@ -47,7 +47,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI-LABEL: mul_4xi8_zc:
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
@@ -67,7 +67,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -78,7 +78,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VNNI-NEXT: vmovd %xmm1, %eax
@@ -107,7 +107,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_4xi8_cs:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
@@ -117,7 +117,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VNNI-LABEL: mul_4xi8_cs:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
@@ -129,7 +129,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI-LABEL: mul_4xi8_cs:
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 84ae818d91832..05c855ed90b3f 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1014,7 +1014,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1023,7 +1023,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1038,7 +1038,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 4898ae98faea2..983ae594e3ab1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -112,7 +112,7 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index 937ac3d2db885..d99b200385585 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -231,7 +231,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
; AVX2-LABEL: test_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
@@ -239,7 +239,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
; AVX512-LABEL: test_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 6cc0e1e73fcdb..aed4e023e340c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -1025,19 +1025,28 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
+; AVX1-LABEL: test_v4i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index ddd7f10168936..cacc43e96b6ea 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1329,7 +1329,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
@@ -1340,7 +1340,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -1351,7 +1351,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -2428,7 +2428,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
@@ -2439,7 +2439,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -2450,7 +2450,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4996,7 +4996,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
@@ -5063,7 +5063,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -5282,7 +5282,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
@@ -5295,7 +5295,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1
; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx)
@@ -5308,7 +5308,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512BW-NEXT: ...
[truncated]
return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
case X86::VPBLENDWYrri:
// TODO: Add X86::VPBLENDWYrmi handling
return ProcessBLENDWToBLENDD(X86::VPBLENDDYrmi, 8);
I don't see a test for it either.
LGTM.
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/138/builds/14614

Here is the relevant piece of the build log for reference: