Skip to content

Commit fe28ea3

Browse files
authored
[LoongArch] Add demanded bits support for [X]VMSKLTZ (#143528)
This patch adds a DAG combine hook for the [X]VMSKLTZ nodes to simplify their input when possible. It also implements target-specific logic in SimplifyDemandedBitsForTargetNode to optimize away unnecessary computations when only a subset of the sign bits in the vector results is actually used.
1 parent 20d5d09 commit fe28ea3

File tree

4 files changed

+82
-28
lines changed

4 files changed

+82
-28
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5634,6 +5634,21 @@ static SDValue performMOVFR2GR_SCombine(SDNode *N, SelectionDAG &DAG,
56345634
return SDValue();
56355635
}
56365636

5637+
// DAG combine for LoongArchISD::VMSKLTZ / LoongArchISD::XVMSKLTZ nodes; this is
// the handler PerformDAGCombine dispatches to for those two opcodes.
//
// Runs SimplifyDemandedBits on result 0 of N with every bit of the result
// demanded. Returns SDValue(N, 0) when that call reports a change and an empty
// SDValue otherwise. DAG is only used to obtain the TargetLowering object;
// Subtarget is not referenced in the body.
// NOTE(review): returning N itself presumably follows the PerformDAGCombine
// convention for "node was already updated in place" -- confirm against the
// TargetLowering documentation.
static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG,
5638+
TargetLowering::DAGCombinerInfo &DCI,
5639+
const LoongArchSubtarget &Subtarget) {
5640+
MVT VT = N->getSimpleValueType(0);
5641+
// Bit width of the node's result type; used to size the demanded mask below.
unsigned NumBits = VT.getScalarSizeInBits();
5642+
5643+
// Simplify the inputs.
5644+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5645+
// All NumBits result bits are demanded at this level.
APInt DemandedMask(APInt::getAllOnes(NumBits));
5646+
if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
5647+
return SDValue(N, 0);
5648+
5649+
// SimplifyDemandedBits reported no change.
return SDValue();
5650+
}
5651+
56375652
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
56385653
DAGCombinerInfo &DCI) const {
56395654
SelectionDAG &DAG = DCI.DAG;
@@ -5658,6 +5673,9 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
56585673
return performMOVGR2FR_WCombine(N, DAG, DCI, Subtarget);
56595674
case LoongArchISD::MOVFR2GR_S_LA64:
56605675
return performMOVFR2GR_SCombine(N, DAG, DCI, Subtarget);
5676+
case LoongArchISD::VMSKLTZ:
5677+
case LoongArchISD::XVMSKLTZ:
5678+
return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
56615679
}
56625680
return SDValue();
56635681
}
@@ -8192,3 +8210,58 @@ unsigned LoongArchTargetLowering::getNumRegistersForCallingConv(
81928210

81938211
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
81948212
}
8213+
8214+
// Demanded-bits simplification for LoongArch-specific nodes.
//
// Only LoongArchISD::VMSKLTZ and LoongArchISD::XVMSKLTZ are handled here; any
// other opcode is forwarded unchanged to
// TargetLowering::SimplifyDemandedBitsForTargetNode.
//
// For the two mask nodes the code maps the demanded result bits directly onto
// demanded elements of operand 0 (result bit i <-> source element i) and
// demands only the sign bit of each of those elements. Known is filled in
// along the way.
//
// Returns true when a nested simplification helper reports a change, the
// result of TLO.CombineTo when Op itself is replaced, and false when this case
// rewrites nothing.
bool LoongArchTargetLowering::SimplifyDemandedBitsForTargetNode(
8215+
SDValue Op, const APInt &OriginalDemandedBits,
8216+
const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
8217+
unsigned Depth) const {
8218+
EVT VT = Op.getValueType();
8219+
// Width of the demanded-bits mask; used below to size Known.Zero.
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
8220+
unsigned Opc = Op.getOpcode();
8221+
switch (Opc) {
8222+
default:
8223+
break;
8224+
case LoongArchISD::VMSKLTZ:
8225+
case LoongArchISD::XVMSKLTZ: {
8226+
SDValue Src = Op.getOperand(0);
8227+
MVT SrcVT = Src.getSimpleValueType();
8228+
unsigned SrcBits = SrcVT.getScalarSizeInBits();
8229+
unsigned NumElts = SrcVT.getVectorNumElements();
8230+
8231+
// If we don't need the sign bits at all just return zero.
// (countr_zero() >= NumElts means no demanded bit lies in the low NumElts
// positions of the result.)
8232+
if (OriginalDemandedBits.countr_zero() >= NumElts)
8233+
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
8234+
8235+
// Only demand the vector elements of the sign bits we need.
8236+
APInt KnownUndef, KnownZero;
8237+
// The low NumElts demanded result bits become the demanded-elements mask.
APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
8238+
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
8239+
TLO, Depth + 1))
8240+
return true;
8241+
8242+
// Widen the per-element KnownZero mask to the result width, then mark every
// result bit at or above NumElts as known zero.
Known.Zero = KnownZero.zext(BitWidth);
8243+
Known.Zero.setHighBits(BitWidth - NumElts);
8244+
8245+
// [X]VMSKLTZ only uses the MSB from each vector element.
8246+
KnownBits KnownSrc;
8247+
APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
8248+
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
8249+
Depth + 1))
8250+
return true;
8251+
8252+
// If the source sign bit is known, the low NumElts result bits take that
// known value.
if (KnownSrc.One[SrcBits - 1])
8253+
Known.One.setLowBits(NumElts);
8254+
else if (KnownSrc.Zero[SrcBits - 1])
8255+
Known.Zero.setLowBits(NumElts);
8256+
8257+
// Attempt to avoid multi-use ops if we don't need anything from it.
// On success the node is rebuilt with the same opcode and type on NewSrc.
8258+
if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
8259+
Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
8260+
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
8261+
return false;
8262+
}
8263+
}
8264+
8265+
// Opcodes not handled above fall back to the generic implementation.
return TargetLowering::SimplifyDemandedBitsForTargetNode(
8266+
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
8267+
}

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,12 @@ class LoongArchTargetLowering : public TargetLowering {
314314
bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const;
315315
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
316316

317+
/// Demanded-bits simplification hook for LoongArch target nodes (declared
/// `override`). The definition in LoongArchISelLowering.cpp special-cases
/// LoongArchISD::VMSKLTZ and LoongArchISD::XVMSKLTZ and forwards every other
/// opcode to TargetLowering::SimplifyDemandedBitsForTargetNode.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
318+
const APInt &DemandedElts,
319+
KnownBits &Known,
320+
TargetLoweringOpt &TLO,
321+
unsigned Depth) const override;
322+
317323
private:
318324
/// Target-specific function used to lower LoongArch calling conventions.
319325
typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,

llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,8 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
383383
; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32:
384384
; CHECK: # %bb.0:
385385
; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1
386-
; CHECK-NEXT: xvslti.w $xr1, $xr2, 0
387-
; CHECK-NEXT: xvrepli.b $xr2, -1
388-
; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0
386+
; CHECK-NEXT: xvrepli.b $xr1, -1
387+
; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr1, $xr0
389388
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
390389
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
391390
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -408,8 +407,7 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
408407
; CHECK-NEXT: xvreplgr2vr.w $xr4, $a0
409408
; CHECK-NEXT: xvand.v $xr2, $xr2, $xr4
410409
; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1
411-
; CHECK-NEXT: xvslti.w $xr1, $xr3, 0
412-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
410+
; CHECK-NEXT: xvor.v $xr0, $xr3, $xr0
413411
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2
414412
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
415413
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
@@ -530,7 +528,6 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
530528
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
531529
; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1
532530
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
533-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
534531
; CHECK-NEXT: vmskltz.h $vr0, $vr0
535532
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
536533
; CHECK-NEXT: ret
@@ -558,7 +555,6 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
558555
; CHECK-NEXT: st.h $a0, $sp, 0
559556
; CHECK-NEXT: vld $vr0, $sp, 0
560557
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
561-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
562558
; CHECK-NEXT: vmskltz.h $vr0, $vr0
563559
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
564560
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -586,7 +582,6 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) {
586582
; CHECK-NEXT: st.h $a0, $sp, 0
587583
; CHECK-NEXT: vld $vr0, $sp, 0
588584
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
589-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
590585
; CHECK-NEXT: vmskltz.h $vr0, $vr0
591586
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
592587
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -601,7 +596,6 @@ define i32 @xvmsk_trunc_i8(<32 x i8> %a) {
601596
; CHECK-LABEL: xvmsk_trunc_i8:
602597
; CHECK: # %bb.0:
603598
; CHECK-NEXT: xvslli.b $xr0, $xr0, 7
604-
; CHECK-NEXT: xvsrai.b $xr0, $xr0, 7
605599
; CHECK-NEXT: xvmskltz.b $xr0, $xr0
606600
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
607601
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -616,7 +610,6 @@ define i16 @xvmsk_trunc_i16(<16 x i16> %a) {
616610
; CHECK-LABEL: xvmsk_trunc_i16:
617611
; CHECK: # %bb.0:
618612
; CHECK-NEXT: xvslli.h $xr0, $xr0, 15
619-
; CHECK-NEXT: xvsrai.h $xr0, $xr0, 15
620613
; CHECK-NEXT: xvmskltz.h $xr0, $xr0
621614
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
622615
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -631,7 +624,6 @@ define i8 @xvmsk_trunc_i32(<8 x i32> %a) {
631624
; CHECK-LABEL: xvmsk_trunc_i32:
632625
; CHECK: # %bb.0:
633626
; CHECK-NEXT: xvslli.w $xr0, $xr0, 31
634-
; CHECK-NEXT: xvsrai.w $xr0, $xr0, 31
635627
; CHECK-NEXT: xvmskltz.w $xr0, $xr0
636628
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
637629
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4
@@ -646,7 +638,6 @@ define i4 @xvmsk_trunc_i64(<4 x i64> %a) {
646638
; CHECK-LABEL: xvmsk_trunc_i64:
647639
; CHECK: # %bb.0:
648640
; CHECK-NEXT: xvslli.d $xr0, $xr0, 63
649-
; CHECK-NEXT: xvsrai.d $xr0, $xr0, 63
650641
; CHECK-NEXT: xvmskltz.d $xr0, $xr0
651642
; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0
652643
; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4

llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ define i2 @vmsk_sgt_v2i8(<2 x i8> %a, <2 x i8> %b) {
181181
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
182182
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
183183
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
184-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
185184
; CHECK-NEXT: vmskltz.d $vr0, $vr0
186185
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
187186
; CHECK-NEXT: ret
@@ -197,7 +196,6 @@ define i2 @vmsk_sgt_v2i16(<2 x i16> %a, <2 x i16> %b) {
197196
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
198197
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
199198
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
200-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
201199
; CHECK-NEXT: vmskltz.d $vr0, $vr0
202200
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
203201
; CHECK-NEXT: ret
@@ -212,7 +210,6 @@ define i2 @vmsk_sgt_v2i32(<2 x i32> %a, <2 x i32> %b) {
212210
; CHECK-NEXT: vslt.w $vr0, $vr1, $vr0
213211
; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
214212
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
215-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
216213
; CHECK-NEXT: vmskltz.d $vr0, $vr0
217214
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
218215
; CHECK-NEXT: ret
@@ -252,7 +249,6 @@ define i4 @vmsk_sgt_v4i8(<4 x i8> %a, <4 x i8> %b) {
252249
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
253250
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
254251
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
255-
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
256252
; CHECK-NEXT: vmskltz.w $vr0, $vr0
257253
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
258254
; CHECK-NEXT: ret
@@ -267,7 +263,6 @@ define i4 @vmsk_sgt_v4i16(<4 x i16> %a, <4 x i16> %b) {
267263
; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0
268264
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
269265
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
270-
; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
271266
; CHECK-NEXT: vmskltz.w $vr0, $vr0
272267
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
273268
; CHECK-NEXT: ret
@@ -306,7 +301,6 @@ define i8 @vmsk_sgt_v8i8(<8 x i8> %a, <8 x i8> %b) {
306301
; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0
307302
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
308303
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
309-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
310304
; CHECK-NEXT: vmskltz.h $vr0, $vr0
311305
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
312306
; CHECK-NEXT: ret
@@ -349,7 +343,6 @@ define i2 @vmsk_sgt_and_sgt_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8>
349343
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
350344
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
351345
; CHECK-NEXT: vslli.d $vr0, $vr0, 56
352-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 56
353346
; CHECK-NEXT: vmskltz.d $vr0, $vr0
354347
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
355348
; CHECK-NEXT: ret
@@ -369,7 +362,6 @@ define i2 @vmsk_sgt_and_sgt_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
369362
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
370363
; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr0
371364
; CHECK-NEXT: vslli.d $vr0, $vr0, 48
372-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 48
373365
; CHECK-NEXT: vmskltz.d $vr0, $vr0
374366
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
375367
; CHECK-NEXT: ret
@@ -388,7 +380,6 @@ define i2 @vmsk_sgt_and_sgt_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
388380
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
389381
; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16
390382
; CHECK-NEXT: vslli.d $vr0, $vr0, 32
391-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 32
392383
; CHECK-NEXT: vmskltz.d $vr0, $vr0
393384
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
394385
; CHECK-NEXT: ret
@@ -440,7 +431,6 @@ define i4 @vmsk_sgt_and_sgt_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
440431
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
441432
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
442433
; CHECK-NEXT: vslli.w $vr0, $vr0, 24
443-
; CHECK-NEXT: vsrai.w $vr0, $vr0, 24
444434
; CHECK-NEXT: vmskltz.w $vr0, $vr0
445435
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
446436
; CHECK-NEXT: ret
@@ -459,7 +449,6 @@ define i4 @vmsk_sgt_and_sgt_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
459449
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
460450
; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
461451
; CHECK-NEXT: vslli.w $vr0, $vr0, 16
462-
; CHECK-NEXT: vsrai.w $vr0, $vr0, 16
463452
; CHECK-NEXT: vmskltz.w $vr0, $vr0
464453
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
465454
; CHECK-NEXT: ret
@@ -510,7 +499,6 @@ define i8 @vmsk_sgt_and_sgt_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8>
510499
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
511500
; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
512501
; CHECK-NEXT: vslli.h $vr0, $vr0, 8
513-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 8
514502
; CHECK-NEXT: vmskltz.h $vr0, $vr0
515503
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
516504
; CHECK-NEXT: ret
@@ -557,7 +545,6 @@ define i16 @vmsk_trunc_i8(<16 x i8> %a) {
557545
; CHECK-LABEL: vmsk_trunc_i8:
558546
; CHECK: # %bb.0:
559547
; CHECK-NEXT: vslli.b $vr0, $vr0, 7
560-
; CHECK-NEXT: vsrai.b $vr0, $vr0, 7
561548
; CHECK-NEXT: vmskltz.b $vr0, $vr0
562549
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
563550
; CHECK-NEXT: ret
@@ -570,7 +557,6 @@ define i8 @vmsk_trunc_i16(<8 x i16> %a) {
570557
; CHECK-LABEL: vmsk_trunc_i16:
571558
; CHECK: # %bb.0:
572559
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
573-
; CHECK-NEXT: vsrai.h $vr0, $vr0, 15
574560
; CHECK-NEXT: vmskltz.h $vr0, $vr0
575561
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
576562
; CHECK-NEXT: ret
@@ -583,7 +569,6 @@ define i4 @vmsk_trunc_i32(<4 x i32> %a) {
583569
; CHECK-LABEL: vmsk_trunc_i32:
584570
; CHECK: # %bb.0:
585571
; CHECK-NEXT: vslli.w $vr0, $vr0, 31
586-
; CHECK-NEXT: vsrai.w $vr0, $vr0, 31
587572
; CHECK-NEXT: vmskltz.w $vr0, $vr0
588573
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
589574
; CHECK-NEXT: ret
@@ -596,7 +581,6 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
596581
; CHECK-LABEL: vmsk_trunc_i64:
597582
; CHECK: # %bb.0:
598583
; CHECK-NEXT: vslli.d $vr0, $vr0, 63
599-
; CHECK-NEXT: vsrai.d $vr0, $vr0, 63
600584
; CHECK-NEXT: vmskltz.d $vr0, $vr0
601585
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
602586
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)