@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
35523552 return true ;
35533553}
35543554
3555+ // Match BITOP3 operation and return a number of matched instructions plus
3556+ // truth table.
3557+ static std::pair<unsigned , uint8_t > BitOp3_Op (SDValue In,
3558+ SmallVectorImpl<SDValue> &Src) {
3559+ unsigned NumOpcodes = 0 ;
3560+ uint8_t LHSBits, RHSBits;
3561+
3562+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3563+ // Define truth table given Src0, Src1, Src2 bits permutations:
3564+ // 0 0 0
3565+ // 0 0 1
3566+ // 0 1 0
3567+ // 0 1 1
3568+ // 1 0 0
3569+ // 1 0 1
3570+ // 1 1 0
3571+ // 1 1 1
3572+ const uint8_t SrcBits[3 ] = { 0xf0 , 0xcc , 0xaa };
3573+
3574+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3575+ if (C->isAllOnes ()) {
3576+ Bits = 0xff ;
3577+ return true ;
3578+ }
3579+ if (C->isZero ()) {
3580+ Bits = 0 ;
3581+ return true ;
3582+ }
3583+ }
3584+
3585+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3586+ // Try to find existing reused operand
3587+ if (Src[I] == Op) {
3588+ Bits = SrcBits[I];
3589+ return true ;
3590+ }
3591+ // Try to replace parent operator
3592+ if (Src[I] == In) {
3593+ Bits = SrcBits[I];
3594+ Src[I] = Op;
3595+ return true ;
3596+ }
3597+ }
3598+
3599+ if (Src.size () == 3 ) {
3600+ // No room left for operands. Try one last time, there can be a 'not' of
3601+ // one of our source operands. In this case we can compute the bits
3602+ // without growing Src vector.
3603+ if (Op.getOpcode () == ISD::XOR) {
3604+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand (1 ))) {
3605+ if (C->isAllOnes ()) {
3606+ SDValue LHS = Op.getOperand (0 );
3607+ for (unsigned I = 0 ; I < Src.size (); ++I) {
3608+ if (Src[I] == LHS) {
3609+ Bits = ~SrcBits[I];
3610+ return true ;
3611+ }
3612+ }
3613+ }
3614+ }
3615+ }
3616+
3617+ return false ;
3618+ }
3619+
3620+ Bits = SrcBits[Src.size ()];
3621+ Src.push_back (Op);
3622+ return true ;
3623+ };
3624+
3625+ switch (In.getOpcode ()) {
3626+ case ISD::AND:
3627+ case ISD::OR:
3628+ case ISD::XOR: {
3629+ SDValue LHS = In.getOperand (0 );
3630+ SDValue RHS = In.getOperand (1 );
3631+
3632+ SmallVector<SDValue, 3 > Backup (Src.begin (), Src.end ());
3633+ if (!getOperandBits (LHS, LHSBits) ||
3634+ !getOperandBits (RHS, RHSBits)) {
3635+ Src = Backup;
3636+ return std::make_pair (0 , 0 );
3637+ }
3638+
3639+ // Recursion is naturally limited by the size of the operand vector.
3640+ auto Op = BitOp3_Op (LHS, Src);
3641+ if (Op.first ) {
3642+ NumOpcodes += Op.first ;
3643+ LHSBits = Op.second ;
3644+ }
3645+
3646+ Op = BitOp3_Op (RHS, Src);
3647+ if (Op.first ) {
3648+ NumOpcodes += Op.first ;
3649+ RHSBits = Op.second ;
3650+ }
3651+ break ;
3652+ }
3653+ default :
3654+ return std::make_pair (0 , 0 );
3655+ }
3656+
3657+ uint8_t TTbl;
3658+ switch (In.getOpcode ()) {
3659+ case ISD::AND:
3660+ TTbl = LHSBits & RHSBits;
3661+ break ;
3662+ case ISD::OR:
3663+ TTbl = LHSBits | RHSBits;
3664+ break ;
3665+ case ISD::XOR:
3666+ TTbl = LHSBits ^ RHSBits;
3667+ break ;
3668+ default :
3669+ break ;
3670+ }
3671+
3672+ return std::make_pair (NumOpcodes + 1 , TTbl);
3673+ }
3674+
3675+ bool AMDGPUDAGToDAGISel::SelectBITOP3 (SDValue In, SDValue &Src0, SDValue &Src1,
3676+ SDValue &Src2, SDValue &Tbl) const {
3677+ SmallVector<SDValue, 3 > Src;
3678+ uint8_t TTbl;
3679+ unsigned NumOpcodes;
3680+
3681+ std::tie (NumOpcodes, TTbl) = BitOp3_Op (In, Src);
3682+
3683+ // Src.empty() case can happen if all operands are all zero or all ones.
3684+ // Normally it shall be optimized out before reaching this.
3685+ if (NumOpcodes < 2 || Src.empty ())
3686+ return false ;
3687+
3688+ // For a uniform case threshold should be higher to account for moves between
3689+ // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
3690+ // and a readtfirstlane after.
3691+ if (NumOpcodes < 4 && !In->isDivergent ())
3692+ return false ;
3693+
3694+ if (NumOpcodes == 2 && In.getValueType () == MVT::i32 ) {
3695+ // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3696+ // asm more readable. This cannot be modeled with AddedComplexity because
3697+ // selector does not know how many operations did we match.
3698+ if ((In.getOpcode () == ISD::XOR || In.getOpcode () == ISD::OR) &&
3699+ (In.getOperand (0 ).getOpcode () == In.getOpcode () ||
3700+ In.getOperand (1 ).getOpcode () == In.getOpcode ()))
3701+ return false ;
3702+
3703+ if (In.getOpcode () == ISD::OR &&
3704+ (In.getOperand (0 ).getOpcode () == ISD::AND ||
3705+ In.getOperand (1 ).getOpcode () == ISD::AND))
3706+ return false ;
3707+ }
3708+
3709+ // Last operand can be ignored, turning a ternary operation into a binary.
3710+ // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3711+ // 'c' with 'a' here without changing the answer. In some pathological
3712+ // cases it should be possible to get an operation with a single operand
3713+ // too if optimizer would not catch it.
3714+ while (Src.size () < 3 )
3715+ Src.push_back (Src[0 ]);
3716+
3717+ Src0 = Src[0 ];
3718+ Src1 = Src[1 ];
3719+ Src2 = Src[2 ];
3720+
3721+ Tbl = CurDAG->getTargetConstant (TTbl, SDLoc (In), MVT::i32 );
3722+ return true ;
3723+ }
3724+
35553725SDValue AMDGPUDAGToDAGISel::getHi16Elt (SDValue In) const {
35563726 if (In.isUndef ())
35573727 return CurDAG->getUNDEF (MVT::i32 );
0 commit comments