From e714da12266a98b8b612242f35c6b11eb5b69b46 Mon Sep 17 00:00:00 2001 From: midronij Date: Mon, 30 Oct 2023 19:16:24 -0400 Subject: [PATCH] Use Vector Instructions to Optimize Unsafe.setMemory() on PPC On P8 and higher, we can make use of vector stores (stxvd2x and, for P10 specifically, stxvl) to reduce the number of memory accesses and avoid checks needed to set residual bytes in the assembly code that is generated for Unsafe.setMemory(). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 238 ++++++++++++++++++------ 1 file changed, 183 insertions(+), 55 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index b2abcabb3ae..df8091d3bc7 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5937,25 +5937,82 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED); bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; - bool stopUsingCopyRegOffset = (dstOffsetNode && !useOffsetAsImmVal) ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; - bool stopUsingCopyRegLen, stopUsingCopyRegVal; + bool stopUsingCopyRegOffset, stopUsingCopyRegLen, stopUsingCopyRegVal; + //dstOffsetNode (type: long) + if (dstOffsetNode && !useOffsetAsImmVal) //only want to allocate a register for dstoffset if we're using it for the array check AND it isn't a constant + { + if (!cg->canClobberNodesRegister(lengthNode)) //only need to copy dstOffset into another register if the current one isn't clobberable + { + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + TR::Register *offsetCopyReg = cg->allocateRegister(); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, dstOffsetNode, offsetCopyReg, dstOffsetReg->getLowOrder()); + + dstOffsetReg = offsetCopyReg; + stopUsingCopyRegOffset = true; + } + else + { + stopUsingCopyRegOffset = TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg); + } + } + else + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + dstOffsetReg = dstOffsetReg->getLowOrder(); + + stopUsingCopyRegOffset = false; + } + } + else + { + stopUsingCopyRegOffset = false; + } + + //lengthNode (type: long) lengthReg = cg->evaluate(lengthNode); if (!cg->canClobberNodesRegister(lengthNode)) { - TR::Register *lenCopyReg = cg->allocateRegister(); - generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + TR::Register *lenCopyReg = cg->allocateRegister(); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of length from the register pair + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg->getLowOrder()); + else //on 64-bit system, can just do a normal copy + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + lengthReg = lenCopyReg; stopUsingCopyRegLen = true; } + else + { + if (cg->comp()->target().is32Bit()) //on 32-bit system, need to grab lower 32 bits of length from the register pair + lengthReg = lengthReg->getLowOrder(); + + stopUsingCopyRegLen = false; + } + //valueNode (type: byte) valueReg = cg->evaluate(valueNode); - if (!cg->canClobberNodesRegister(valueNode)) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) { - TR::Register *valCopyReg = cg->allocateRegister(); + //on P8 or higher, we can use vector instructions to cut down on loop iterations and residual tests -> need to copy valueReg into a VSX register + TR::Register *valVectorReg = cg->allocateRegister(TR_VRF); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mtvsrd, valueNode, valVectorReg, valueReg); + + valueReg = valVectorReg; + stopUsingCopyRegVal = true; + } + else if (!cg->canClobberNodesRegister(valueNode)) + { + TR::Register *valCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); + valueReg = valCopyReg; stopUsingCopyRegVal = true; } @@ -5963,13 +6020,25 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); TR::LabelSymbol * loopStartLabel = generateLabelSymbol(cg); TR::LabelSymbol * doneLabel = generateLabelSymbol(cg); - TR::LabelSymbol * label8aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label4aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label2aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); + + //these labels are not needed for the vector approach to storing to residual bytes (i.e.: P10+) + TR::LabelSymbol *label8aligned, *label4aligned, *label2aligned, *label1aligned; + + if (!cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + label8aligned = generateLabelSymbol(cg); + label4aligned = generateLabelSymbol(cg); + label2aligned = generateLabelSymbol(cg); + label1aligned = generateLabelSymbol(cg); + } TR::RegisterDependencyConditions *conditions; - int32_t numDeps = (!arrayCheckNeeded || useOffsetAsImmVal) ? 6 : 7; + int32_t numDeps = 6; + + //need extra register for offset only if it isn't already included in the destination address AND it isn't a constant + if (arrayCheckNeeded && !useOffsetAsImmVal) + numDeps++; + conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); @@ -6005,7 +6074,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: #if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) if (arrayCheckNeeded) // CASE (3) - { + { // There are two scenarios in which we DON'T want to modify the dest base address: // 1.) If the object is NULL (since we can't load dataAddr from a NULL pointer) // 2.) If the object is a non-array object @@ -6020,7 +6089,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //generate NULL test generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); - } + } //Array Check TR::Register *dstClassInfoReg = temp1Reg; @@ -6033,7 +6102,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: else generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); - + TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); @@ -6049,7 +6118,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); - + //arrayCHK will skip to here if object is not an array generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr); @@ -6057,20 +6126,27 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: dstAddrReg = dstBaseAddrReg; if (useOffsetAsImmVal) - { + { int offsetImmVal = dstOffsetNode->getConstValue(); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal); - } + } else generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); - } + } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ // assemble the double word value from byte value - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000); + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::vspltb, valueNode, valueReg, valueReg, 7); + } + else + { + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 8, CONSTANT64(0x000000000000FF00)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 16, CONSTANT64(0x00000000FFFF0000)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, CONSTANT64(0xFFFFFFFF00000000)); + } generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); @@ -6078,42 +6154,94 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + + //store designated value to memory in chunks of 32 bytes + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //on P8 and higher, we can use vector instructions to cut down on loop iterations/number of stores + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + } + else + { + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + } + + //decrement counter and return to start of loop generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg); - generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg); + //loop exit + generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); + + //Set residual bytes (max number of residual bytes = 31 = 0x1F) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) //on P10, we can use stxvl to store all residual bytes efficiently + { + //First 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); //get first hex char (can only be 0 or 1) + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, temp2Reg, temp1Reg); //keep a copy of first hex char + + //store to memory + //NOTE: due to a quirk of the stxvl instruction on P10, the number of residual bytes must be shifted over before it can be used + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); + + //advance to next 16 byte chunk IF number of residual bytes >= 16 + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstAddrReg, temp2Reg); + + //Second 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 15); //get second hex char + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); //shift num residual bytes + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); //store to memory + } + else + { + TR::Register *valueResidueReg; + + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //since P8 and P9 used the vector approach, we first need to copy valueReg back into a GPR + generateTrg1Src1Instruction(cg, TR::InstOpCode::mfvsrd, node, temp2Reg, valueReg); + valueResidueReg = temp2Reg; + } + else + valueResidueReg = valueReg; + + //check if residual < 16 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check if residual < 8 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check if residual < 4 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check if residual < 2 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //residual <= 1 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueResidueReg); + } generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);