From a4f64329d224fce702a259bb089010a156b0c2f1 Mon Sep 17 00:00:00 2001 From: midronij Date: Tue, 12 Sep 2023 12:30:20 -0400 Subject: [PATCH 1/5] Offheap Adjustments for Unsafe.setMemory() When Unsafe.setMemory() is called on an array and offheap changes are enabled, adjust arguments so that dataAddr is passed in as base address of object. Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 132 +++++++++++++++++++----- 1 file changed, 107 insertions(+), 25 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 232167eebf0..792faa0845e 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5902,23 +5902,28 @@ OMR::Power::TreeEvaluator::generateHelperBranchAndLinkInstruction( TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR::CodeGenerator *cg) { TR::Compilation *comp = cg->comp(); - TR::Node *dstAddrNode, *lengthNode, *valueNode; - dstAddrNode = node->getChild(0); - lengthNode = node->getChild(1); - valueNode = node->getChild(2); - TR::Register *dstAddrReg, *lengthReg, *valueReg; - bool stopUsingCopyReg1, stopUsingCopyReg2 = false, stopUsingCopyReg3 = false; + TR::Node *dstBaseAddrNode = node->getChild(0); + TR::Node *dstOffsetNode = node->getChild(1); + TR::Node *lengthNode = node->getChild(2); + TR::Node *valueNode = node->getChild(3); + + TR::Register *dstBaseAddrReg, *dstOffsetReg, *lengthReg, *valueReg; + bool stopUsingCopyReg1, stopUsingCopyReg2, stopUsingCopyReg3 = false, stopUsingCopyReg4 = false; + + bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; + bool stopUsingCopyRegOffset = dstOffsetNode ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; + bool stopUsingCopyRegAddr = dstAddrNode ? 
TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; - stopUsingCopyReg1 = TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg); + bool stopUsingCopyRegLen, stopUsingCopyRegVal; lengthReg = cg->evaluate(lengthNode); if (!cg->canClobberNodesRegister(lengthNode)) { - TR::Register *lenCopyReg = cg->allocateRegister(); + TR::Register *lenCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); lengthReg = lenCopyReg; - stopUsingCopyReg2 = true; + stopUsingCopyReg3 = true; } valueReg = cg->evaluate(valueNode); @@ -5927,7 +5932,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *valCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); valueReg = valCopyReg; - stopUsingCopyReg3 = true; + stopUsingCopyReg4 = true; } TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); @@ -5939,15 +5944,88 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = 5; + int32_t numDeps = 7; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); - TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, lengthReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, valueReg, TR::RealRegister::NoReg, TR_GPR, cg); - TR::Register * tempReg = cg->allocateRegister(); - TR::addDependency(conditions, tempReg, 
TR::RealRegister::NoReg, TR_GPR, cg); + TR::Register * temp1Reg = cg->allocateRegister(); + TR::Register * temp2Reg = cg->allocateRegister(); + TR::addDependency(conditions, temp1Reg, TR::RealRegister::NoReg, TR_GPR, cg); + TR::addDependency(conditions, temp2Reg, TR::RealRegister::NoReg, TR_GPR, cg); + + +#if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) + // When using balanced GC policy with offheap allocation enabled, there are three possible cases: + // 1.) The object at dstBaseAddr is known to be a non-array object at compile time. In this scenario, no arrayCHK is + // generated, and no adjustments are made to dstBaseAddr or dstOffset. The behavior in this case should be identical + // to that under gencon GC policy. + // 2.) The object at dstBaseAddr is known to be an array at compile time. In this scenario, no arrayCHK is generated, but + // the dstBaseAddr and dstOffset with be adjusted as needed for offheap. + // 3.) The type of the object at dstBaseAddr is unknown at compile time. In this scenario, a runtime arrayCHK will generated, + // with two possible outcomes: if the object is an array, the dstBaseAddr and dstOffset will be adjusted, and if not, + // no adjustments will be made. 
+ + //check dstBaseAddrNode type at compile time + int length; + const char *objTypeSig = dstBaseAddrNode->getSymbolReference()->getTypeSignature(length); + + //generate arrayCHK in case (3) only + bool arrayCheckNeeded = TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && + (objTypeSig == NULL || strstr(objTypeSig, "Ljava/lang/Object")); + + //adjust dstBaseAddr and dstOffset in cases (2) and (3) + bool adjustmentNeeded = arrayCheckNeeded || + TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && objTypeSig[0] == '['; + + //generate array check if needed + TR::LabelSymbol *notArray = generateLabelSymbol(cg); + + if (arrayCheckNeeded) + { + TR::Register *dstClassInfoReg = temp1Reg; + TR::Register *arrayFlagReg = temp2Reg; + + //load dst class info into temp1Reg + if (TR::Compiler->om.compressObjectReferences()) + generateTrg1MemInstruction(cg, TR::InstOpCode::lwz, node, dstClassInfoReg, + TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), 4)); + else + generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, + TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); + TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); + + TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); + generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg, dstClassMR); + + //generate arrayCHK + loadConstant(cg, node, comp->fej9()->getFlagValueForArrayCheck(), arrayFlagReg); + generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, arrayFlagReg, dstClassInfoReg, arrayFlagReg); + generateTrg1Src1ImmInstruction(cg,TR::InstOpCode::cmpi8, node, cndReg, arrayFlagReg, 0); + + //if 
object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); + } + + //adjust dstBaseAddr if needed + if (adjustmentNeeded) + { + //load dataAddr + TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); + generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); + } + + //arrayCHK will skip to here if object is not an array + generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + +#endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ + + //calculate dstAddr = dstBaseAddr + dstOffset + TR::Register *dstAddrReg = dstBaseAddrReg; + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); // assemble the double word value from byte value generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); @@ -5957,8 +6035,8 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::cmpli4 : TR::InstOpCode::cmpli8, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); - generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, tempReg, lengthReg, 5); - generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, tempReg); + generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); + generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); @@ -5968,48 +6046,52 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg); generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 16); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 8); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, 
dstAddrReg, dstAddrReg, 8); generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 4); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 2); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, lengthReg, 1); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg); generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions); if (stopUsingCopyReg1) - cg->stopUsingRegister(dstAddrReg); + cg->stopUsingRegister(dstBaseAddrReg); if (stopUsingCopyReg2) - cg->stopUsingRegister(lengthReg); + cg->stopUsingRegister(dstOffsetReg); 
if (stopUsingCopyReg3) + cg->stopUsingRegister(lengthReg); + if (stopUsingCopyReg4) cg->stopUsingRegister(valueReg); cg->stopUsingRegister(cndReg); - cg->stopUsingRegister(tempReg); + cg->stopUsingRegister(temp1Reg); + cg->stopUsingRegister(temp2Reg); - cg->decReferenceCount(dstAddrNode); + cg->decReferenceCount(dstBaseAddrNode); + cg->decReferenceCount(dstOffsetNode); cg->decReferenceCount(lengthNode); cg->decReferenceCount(valueNode); From 310722641a65dabb3650e80b25d4f28ed6e9b67b Mon Sep 17 00:00:00 2001 From: midronij Date: Tue, 24 Oct 2023 13:52:19 -0400 Subject: [PATCH 2/5] Skip unnecessary runtime array check for Unsafe.setMemory() Only generate runtime array check in setmemoryEvaluator() if it is needed (i.e.: object type is unknown at compile time). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 138 +++++++++++++----------- 1 file changed, 78 insertions(+), 60 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 792faa0845e..3a931a5e79f 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5902,14 +5902,35 @@ OMR::Power::TreeEvaluator::generateHelperBranchAndLinkInstruction( TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR::CodeGenerator *cg) { TR::Compilation *comp = cg->comp(); + TR::Node *dstBaseAddrNode, *dstOffsetNode, *dstAddrNode, *lengthNode, *valueNode; - TR::Node *dstBaseAddrNode = node->getChild(0); - TR::Node *dstOffsetNode = node->getChild(1); - TR::Node *lengthNode = node->getChild(2); - TR::Node *valueNode = node->getChild(3); + bool arrayCheckNeeded; + + // IL tree structure depends on whether or not it's been determined that a runtime arrayCHK is needed: + // if node has four children (i.e.: object base address and offset are separate), need array check + // if node has three children (i.e.: object base address and offset have already been added together), don't need array check + if 
(node->getNumChildren() == 4) + { + arrayCheckNeeded = true; + + dstBaseAddrNode = node->getChild(0); + dstOffsetNode = node->getChild(1); + dstAddrNode = NULL; + lengthNode = node->getChild(2); + valueNode = node->getChild(3); + } + else //i.e.: node->getNumChildren() == 3 + { + arrayCheckNeeded = false; + + dstBaseAddrNode = NULL; + dstOffsetNode = NULL; + dstAddrNode = node->getChild(0); + lengthNode = node->getChild(1); + valueNode = node->getChild(2); + } - TR::Register *dstBaseAddrReg, *dstOffsetReg, *lengthReg, *valueReg; - bool stopUsingCopyReg1, stopUsingCopyReg2, stopUsingCopyReg3 = false, stopUsingCopyReg4 = false; + TR::Register *dstBaseAddrReg, *dstOffsetReg, *dstAddrReg, *lengthReg, *valueReg; bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; bool stopUsingCopyRegOffset = dstOffsetNode ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; @@ -5923,16 +5944,16 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *lenCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); lengthReg = lenCopyReg; - stopUsingCopyReg3 = true; + stopUsingCopyRegLen = true; } valueReg = cg->evaluate(valueNode); if (!cg->canClobberNodesRegister(valueNode)) { - TR::Register *valCopyReg = cg->allocateRegister(); + TR::Register *valCopyReg = cg->allocateRegister(); generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); valueReg = valCopyReg; - stopUsingCopyReg4 = true; + stopUsingCopyRegVal = true; } TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); @@ -5944,48 +5965,46 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = 7; + int32_t numDeps = arrayCheckNeeded ? 
7 : 6; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); - TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); - TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); + + if (arrayCheckNeeded) + { + //dstBaseAddrReg holds the address of the object being written to, so need to exclude GPR0 + TR::addDependency(conditions, dstBaseAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0(); + + if (!useOffsetAsImmVal) + TR::addDependency(conditions, dstOffsetReg, TR::RealRegister::NoReg, TR_GPR, cg); + } + else + { + //dstAddrReg holds the address of the object being written to, so need to exclude GPR0 + TR::addDependency(conditions, dstAddrReg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(1)->setExcludeGPR0(); + } + TR::addDependency(conditions, lengthReg, TR::RealRegister::NoReg, TR_GPR, cg); TR::addDependency(conditions, valueReg, TR::RealRegister::NoReg, TR_GPR, cg); + + //temp1Reg will later be used to hold the J9Class flags for the object at dst, so need to exclude GPR0 TR::Register * temp1Reg = cg->allocateRegister(); - TR::Register * temp2Reg = cg->allocateRegister(); TR::addDependency(conditions, temp1Reg, TR::RealRegister::NoReg, TR_GPR, cg); + conditions->getPostConditions()->getRegisterDependency(conditions->getAddCursorForPost() - 1)->setExcludeGPR0(); + + TR::Register * temp2Reg = cg->allocateRegister(); TR::addDependency(conditions, temp2Reg, TR::RealRegister::NoReg, TR_GPR, cg); #if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) - // When using balanced GC policy with offheap allocation enabled, there are three possible cases: - // 1.) 
The object at dstBaseAddr is known to be a non-array object at compile time. In this scenario, no arrayCHK is - // generated, and no adjustments are made to dstBaseAddr or dstOffset. The behavior in this case should be identical - // to that under gencon GC policy. - // 2.) The object at dstBaseAddr is known to be an array at compile time. In this scenario, no arrayCHK is generated, but - // the dstBaseAddr and dstOffset with be adjusted as needed for offheap. - // 3.) The type of the object at dstBaseAddr is unknown at compile time. In this scenario, a runtime arrayCHK will generated, - // with two possible outcomes: if the object is an array, the dstBaseAddr and dstOffset will be adjusted, and if not, - // no adjustments will be made. - - //check dstBaseAddrNode type at compile time - int length; - const char *objTypeSig = dstBaseAddrNode->getSymbolReference()->getTypeSignature(length); - - //generate arrayCHK in case (3) only - bool arrayCheckNeeded = TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && - (objTypeSig == NULL || strstr(objTypeSig, "Ljava/lang/Object")); - - //adjust dstBaseAddr and dstOffset in cases (2) and (3) - bool adjustmentNeeded = arrayCheckNeeded || - TR::Compiler->om.isOffHeapAllocationEnabled() && comp->target().is64Bit() && objTypeSig[0] == '['; - - //generate array check if needed - TR::LabelSymbol *notArray = generateLabelSymbol(cg); - if (arrayCheckNeeded) + if (arrayCheckNeeded) // CASE (3) { + //generate array check if needed + TR::LabelSymbol *notArray = generateLabelSymbol(cg); + TR::Register *dstClassInfoReg = temp1Reg; TR::Register *arrayFlagReg = temp2Reg; @@ -5996,37 +6015,33 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: else generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); 
+ TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstClassInfoReg, dstClassMR); - //generate arrayCHK - loadConstant(cg, node, comp->fej9()->getFlagValueForArrayCheck(), arrayFlagReg); - generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, arrayFlagReg, dstClassInfoReg, arrayFlagReg); - generateTrg1Src1ImmInstruction(cg,TR::InstOpCode::cmpi8, node, cndReg, arrayFlagReg, 0); + //generate array check + int32_t arrayFlagValue = comp->fej9()->getFlagValueForArrayCheck(); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, arrayFlagReg, dstClassInfoReg, arrayFlagValue >> 16); //if object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); - } - //adjust dstBaseAddr if needed - if (adjustmentNeeded) - { - //load dataAddr + //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); - } + + //arrayCHK will skip to here if object is not an array + generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); - //arrayCHK will skip to here if object is not an array - generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + //calculate dstAddr = dstBaseAddr + dstOffset + dstAddrReg = dstBaseAddrReg; + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); + } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ - //calculate dstAddr = 
dstBaseAddr + dstOffset - TR::Register *dstAddrReg = dstBaseAddrReg; - generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); - // assemble the double word value from byte value generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); @@ -6077,21 +6092,24 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions); - if (stopUsingCopyReg1) + if (stopUsingCopyRegBase) cg->stopUsingRegister(dstBaseAddrReg); - if (stopUsingCopyReg2) + if (stopUsingCopyRegOffset) cg->stopUsingRegister(dstOffsetReg); - if (stopUsingCopyReg3) + if (stopUsingCopyRegAddr) + cg->stopUsingRegister(dstAddrReg); + if (stopUsingCopyRegLen) cg->stopUsingRegister(lengthReg); - if (stopUsingCopyReg4) + if (stopUsingCopyRegVal) cg->stopUsingRegister(valueReg); cg->stopUsingRegister(cndReg); cg->stopUsingRegister(temp1Reg); cg->stopUsingRegister(temp2Reg); - cg->decReferenceCount(dstBaseAddrNode); - cg->decReferenceCount(dstOffsetNode); + if (dstBaseAddrNode) cg->decReferenceCount(dstBaseAddrNode); + if (dstOffsetNode) cg->decReferenceCount(dstOffsetNode); + if (dstAddrNode) cg->decReferenceCount(dstAddrNode); cg->decReferenceCount(lengthNode); cg->decReferenceCount(valueNode); From 038794441609ff0102fc76a5617caef582ce1145 Mon Sep 17 00:00:00 2001 From: midronij Date: Wed, 29 May 2024 00:36:23 -0400 Subject: [PATCH 3/5] Add NULLCHK on object address passed in to Unsafe.setMemory() In situations where an array check is needed, there are scenarios in which we do not want to modify the dest base address: 1.) If the object is a NULL reference (since we can't load dataAddr from a NULL pointer) 2.) 
If the object is a non-array object Thus, before the array check is performed, a null test is needed to account for situation (1). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 3a931a5e79f..806c192b214 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -6002,9 +6002,23 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: if (arrayCheckNeeded) // CASE (3) { - //generate array check if needed - TR::LabelSymbol *notArray = generateLabelSymbol(cg); + // There are two scenarios in which we DON'T want to modify the dest base address: + // 1.) If the object is NULL (since we can't load dataAddr from a NULL pointer) + // 2.) If the object is a non-array object + // So two checks are required (NULL, Array) to determine whether dataAddr should be loaded or not + TR::LabelSymbol *noDataAddr = generateLabelSymbol(cg); + + // We only want to generate a runtime NULL check if the status of the object (i.e.: whether it is NULL or non-NULL) + // is NOT known. Note that if the object is known to be NULL, arrayCheckNeeded will be false, so there is no need to check + // that condition here. 
+ if (!dstBaseAddrNode->isNonNull()) + { + //generate NULL test + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); + } + //Array Check TR::Register *dstClassInfoReg = temp1Reg; TR::Register *arrayFlagReg = temp2Reg; @@ -6026,14 +6040,14 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, arrayFlagReg, dstClassInfoReg, arrayFlagValue >> 16); //if object is not an array (i.e.: temp1Reg & temp2Reg == 0), skip adjusting dstBaseAddr and dstOffset - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, notArray, cndReg); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); //arrayCHK will skip to here if object is not an array - generateLabelInstruction(cg, TR::InstOpCode::label, node, notArray); + generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr); //calculate dstAddr = dstBaseAddr + dstOffset dstAddrReg = dstBaseAddrReg; @@ -6047,7 +6061,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000); - generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::cmpli4 : TR::InstOpCode::cmpli8, node, cndReg, lengthReg, 32); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); From cf8b0b509ebac44b18df5ce78375e67cb8706e9c Mon Sep 17 00:00:00 2001 From: midronij Date: Mon, 30 Oct 2023 17:06:34 -0400 Subject: [PATCH 4/5] Avoid allocating extra register when Unsafe.setMemory() offset is constant When destOffset is a constant 16-bit value, it can be represented as the immediate value argument to addi when calculating the final destination address (i.e.: dest = base address + offset). This allows us to allocate one less register when generating the assembly code sequence for Unsafe.setMemory(). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index 806c192b214..b2abcabb3ae 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5932,8 +5932,12 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::Register *dstBaseAddrReg, *dstOffsetReg, *dstAddrReg, *lengthReg, *valueReg; + // if the offset is a constant value that fits in 16 bits, then we don't need a separate register for it + bool useOffsetAsImmVal = dstOffsetNode && dstOffsetNode->getOpCode().isLoadConst() && + (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED); + bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; - bool stopUsingCopyRegOffset = dstOffsetNode ? 
TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; + bool stopUsingCopyRegOffset = (dstOffsetNode && !useOffsetAsImmVal) ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; bool stopUsingCopyRegLen, stopUsingCopyRegVal; @@ -5965,7 +5969,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); TR::RegisterDependencyConditions *conditions; - int32_t numDeps = arrayCheckNeeded ? 7 : 6; + int32_t numDeps = (!arrayCheckNeeded || useOffsetAsImmVal) ? 6 : 7; conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); @@ -6051,7 +6055,14 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //calculate dstAddr = dstBaseAddr + dstOffset dstAddrReg = dstBaseAddrReg; - generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); + + if (useOffsetAsImmVal) + { + int offsetImmVal = dstOffsetNode->getConstValue(); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal); + } + else + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ From e714da12266a98b8b612242f35c6b11eb5b69b46 Mon Sep 17 00:00:00 2001 From: midronij Date: Mon, 30 Oct 2023 19:16:24 -0400 Subject: [PATCH 5/5] Use Vector Instructions to Optimize Unsafe.setMemory() on PPC On P8 and higher, we can make use of vector stores (stxvd2x and, for P10 specifically, stxvl) to reduce the number of memory accesses and avoid checks needed to set residual bytes in the assembly code 
that is generated for Unsafe.setMemory(). Signed-off-by: midronij --- compiler/p/codegen/OMRTreeEvaluator.cpp | 238 ++++++++++++++++++------ 1 file changed, 183 insertions(+), 55 deletions(-) diff --git a/compiler/p/codegen/OMRTreeEvaluator.cpp b/compiler/p/codegen/OMRTreeEvaluator.cpp index b2abcabb3ae..df8091d3bc7 100644 --- a/compiler/p/codegen/OMRTreeEvaluator.cpp +++ b/compiler/p/codegen/OMRTreeEvaluator.cpp @@ -5937,25 +5937,82 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: (dstOffsetNode->getConstValue() >= LOWER_IMMED) && (dstOffsetNode->getConstValue() <= UPPER_IMMED); bool stopUsingCopyRegBase = dstBaseAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstBaseAddrNode, dstBaseAddrReg, cg) : false; - bool stopUsingCopyRegOffset = (dstOffsetNode && !useOffsetAsImmVal) ? TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg) : false; bool stopUsingCopyRegAddr = dstAddrNode ? TR::TreeEvaluator::stopUsingCopyReg(dstAddrNode, dstAddrReg, cg) : false ; - bool stopUsingCopyRegLen, stopUsingCopyRegVal; + bool stopUsingCopyRegOffset, stopUsingCopyRegLen, stopUsingCopyRegVal; + //dstOffsetNode (type: long) + if (dstOffsetNode && !useOffsetAsImmVal) //only want to allocate a register for dstoffset if we're using it for the array check AND it isn't a constant + { + if (!cg->canClobberNodesRegister(lengthNode)) //only need to copy dstOffset into another register if the current one isn't clobberable + { + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + TR::Register *offsetCopyReg = cg->allocateRegister(); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, dstOffsetNode, offsetCopyReg, dstOffsetReg->getLowOrder()); + + dstOffsetReg = offsetCopyReg; + stopUsingCopyRegOffset = true; + } + else + { + stopUsingCopyRegOffset = TR::TreeEvaluator::stopUsingCopyReg(dstOffsetNode, dstOffsetReg, cg); + } + } 
+ else + { + dstOffsetReg = cg->evaluate(dstOffsetNode); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of offset from the register pair + dstOffsetReg = dstOffsetReg->getLowOrder(); + + stopUsingCopyRegOffset = false; + } + } + else + { + stopUsingCopyRegOffset = false; + } + + //lengthNode (type: long) lengthReg = cg->evaluate(lengthNode); if (!cg->canClobberNodesRegister(lengthNode)) { - TR::Register *lenCopyReg = cg->allocateRegister(); - generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + TR::Register *lenCopyReg = cg->allocateRegister(); + + if (cg->comp()->target().is32Bit()) //on 32-bit systems, need to grab the lower 32 bits of length from the register pair + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg->getLowOrder()); + else //on 64-bit system, can just do a normal copy + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, lengthNode, lenCopyReg, lengthReg); + lengthReg = lenCopyReg; stopUsingCopyRegLen = true; } + else + { + if (cg->comp()->target().is32Bit()) //on 32-bit system, need to grab lower 32 bits of length from the register pair + lengthReg = lengthReg->getLowOrder(); + + stopUsingCopyRegLen = false; + } + //valueNode (type: byte) valueReg = cg->evaluate(valueNode); - if (!cg->canClobberNodesRegister(valueNode)) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) { - TR::Register *valCopyReg = cg->allocateRegister(); + //on P8 or higher, we can use vector instructions to cut down on loop iterations and residual tests -> need to copy valueReg into a VSX register + TR::Register *valVectorReg = cg->allocateRegister(TR_VRF); + generateTrg1Src1Instruction(cg, TR::InstOpCode::mtvsrd, valueNode, valVectorReg, valueReg); + + valueReg = valVectorReg; + stopUsingCopyRegVal = true; + } + else if (!cg->canClobberNodesRegister(valueNode)) + { + TR::Register *valCopyReg = cg->allocateRegister(); 
generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, valueNode, valCopyReg, valueReg); + valueReg = valCopyReg; stopUsingCopyRegVal = true; } @@ -5963,13 +6020,25 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: TR::LabelSymbol * residualLabel = generateLabelSymbol(cg); TR::LabelSymbol * loopStartLabel = generateLabelSymbol(cg); TR::LabelSymbol * doneLabel = generateLabelSymbol(cg); - TR::LabelSymbol * label8aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label4aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label2aligned = generateLabelSymbol(cg); - TR::LabelSymbol * label1aligned = generateLabelSymbol(cg); + + //these labels are not needed for the vector approach to storing to residual bytes (i.e.: P10+) + TR::LabelSymbol *label8aligned, *label4aligned, *label2aligned, *label1aligned; + + if (!cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) + { + label8aligned = generateLabelSymbol(cg); + label4aligned = generateLabelSymbol(cg); + label2aligned = generateLabelSymbol(cg); + label1aligned = generateLabelSymbol(cg); + } TR::RegisterDependencyConditions *conditions; - int32_t numDeps = (!arrayCheckNeeded || useOffsetAsImmVal) ? 6 : 7; + int32_t numDeps = 6; + + //need extra register for offset only if it isn't already included in the destination address AND it isn't a constant + if (arrayCheckNeeded && !useOffsetAsImmVal) + numDeps++; + conditions = new (cg->trHeapMemory()) TR::RegisterDependencyConditions(numDeps, numDeps, cg->trMemory()); TR::Register *cndReg = cg->allocateRegister(TR_CCR); TR::addDependency(conditions, cndReg, TR::RealRegister::cr0, TR_CCR, cg); @@ -6005,7 +6074,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: #if defined (J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION) if (arrayCheckNeeded) // CASE (3) - { + { // There are two scenarios in which we DON'T want to modify the dest base address: // 1.) 
If the object is NULL (since we can't load dataAddr from a NULL pointer) // 2.) If the object is a non-array object @@ -6020,7 +6089,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //generate NULL test generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpi, node, cndReg, dstBaseAddrReg, 0); generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, noDataAddr, cndReg); - } + } //Array Check TR::Register *dstClassInfoReg = temp1Reg; @@ -6033,7 +6102,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: else generateTrg1MemInstruction(cg,TR::InstOpCode::Op_load, node, dstClassInfoReg, TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, static_cast(TR::Compiler->om.offsetOfObjectVftField()), TR::Compiler->om.sizeofReferenceAddress())); - + TR::TreeEvaluator::generateVFTMaskInstruction(cg, node, dstClassInfoReg); TR::MemoryReference *dstClassMR = TR::MemoryReference::createWithDisplacement(cg, dstClassInfoReg, offsetof(J9Class, classDepthAndFlags), TR::Compiler->om.sizeofReferenceAddress()); @@ -6049,7 +6118,7 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: //load dataAddr if object is array: TR::MemoryReference *dataAddrSlotMR = TR::MemoryReference::createWithDisplacement(cg, dstBaseAddrReg, comp->fej9()->getOffsetOfContiguousDataAddrField(), TR::Compiler->om.sizeofReferenceAddress()); generateTrg1MemInstruction(cg, TR::InstOpCode::Op_load, node, dstBaseAddrReg, dataAddrSlotMR); - + //arrayCHK will skip to here if object is not an array generateLabelInstruction(cg, TR::InstOpCode::label, node, noDataAddr); @@ -6057,20 +6126,27 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: dstAddrReg = dstBaseAddrReg; if (useOffsetAsImmVal) - { + { int offsetImmVal = dstOffsetNode->getConstValue(); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstBaseAddrReg, offsetImmVal); - } + } else 
generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstBaseAddrReg, dstOffsetReg); - } + } #endif /* J9VM_GC_ENABLE_SPARSE_HEAP_ALLOCATION */ // assemble the double word value from byte value - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 8, 0xff00); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rlwimi, node, valueReg, valueReg, 16, 0xffff0000); - generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, 0xffffffff00000000); + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::vspltb, valueNode, valueReg, valueReg, 7); + } + else + { + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 8, CONSTANT64(0x000000000000FF00)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 16, CONSTANT64(0x00000000FFFF0000)); + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldimi, node, valueReg, valueReg, 32, CONSTANT64(0xFFFFFFFF00000000)); + } generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::Op_cmpli, node, cndReg, lengthReg, 32); generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, residualLabel, cndReg); @@ -6078,42 +6154,94 @@ TR::Register *OMR::Power::TreeEvaluator::setmemoryEvaluator(TR::Node *node, TR:: generateTrg1Src1ImmInstruction(cg, lengthNode->getType().isInt32() ? 
TR::InstOpCode::srawi : TR::InstOpCode::sradi, node, temp1Reg, lengthReg, 5); generateSrc1Instruction(cg, TR::InstOpCode::mtctr, node, temp1Reg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopStartLabel); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + + //store designated value to memory in chunks of 32 bytes + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //on P8 and higher, we can use vector instructions to cut down on loop iterations/number of stores + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + generateMemSrc1Instruction(cg, TR::InstOpCode::stxvd2x, node, TR::MemoryReference::createWithIndexReg(cg, NULL, dstAddrReg, 16), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + } + else + { + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 16, 8), 
valueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 24, 8), valueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 32); + } + + //decrement counter and return to start of loop generateConditionalBranchInstruction(cg, TR::InstOpCode::bdnz, node, loopStartLabel, cndReg); - generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); //check 16 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check 8 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 8); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check 4 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueReg); - 
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check 2 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueReg); - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); - - generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //check 1 aligned - generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); - generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); - generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueReg); + //loop exit + generateLabelInstruction(cg, TR::InstOpCode::label, node, residualLabel); + + //Set residual bytes (max number of residual bytes = 31 = 0x1F) + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P10)) //on P10, we can use stxvl to store all residual bytes efficiently + { + //First 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); //get first hex char (can only be 0 or 1) + generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, temp2Reg, temp1Reg); //keep a copy of first hex char + + //store to memory + //NOTE: due to a quirk of the stxvl instruction on P10, the number of residual bytes must be shifted over before it can be used + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); + + //advance to next 16 byte chunk IF 
number of residual bytes >= 16 + generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, dstAddrReg, dstAddrReg, temp2Reg); + + //Second 16 byte segment + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 15); //get second hex char + generateTrg1Src1Imm2Instruction(cg, TR::InstOpCode::rldicr, node, temp1Reg, temp1Reg, 56, CONSTANT64(0xFF00000000000000)); //shift num residual bytes + generateSrc3Instruction(cg, TR::InstOpCode::stxvl, node, valueReg, dstAddrReg, temp1Reg); //store to memory + } + else + { + TR::Register *valueResidueReg; + + if (cg->comp()->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8)) + { + //since P8 and P9 used the vector approach, we first need to copy valueReg back into a GPR + generateTrg1Src1Instruction(cg, TR::InstOpCode::mfvsrd, node, temp2Reg, valueReg); + valueResidueReg = temp2Reg; + } + else + valueResidueReg = valueReg; + + //check if residual < 16 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 16); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label8aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 8, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 16); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label8aligned); //check if residual < 8 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 8); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label4aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::std, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 8), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, 
dstAddrReg, dstAddrReg, 8); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label4aligned); //check if residual < 4 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 4); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label2aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stw, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 4), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 4); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label2aligned); //check if residual < 2 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 2); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, label1aligned, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::sth, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 2), valueResidueReg); + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, dstAddrReg, dstAddrReg, 2); + + generateLabelInstruction(cg, TR::InstOpCode::label, node, label1aligned); //residual <= 1 + generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, temp1Reg, lengthReg, 1); + generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, doneLabel, cndReg); + generateMemSrc1Instruction(cg, TR::InstOpCode::stb, node, TR::MemoryReference::createWithDisplacement(cg, dstAddrReg, 0, 1), valueResidueReg); + } generateDepLabelInstruction(cg, TR::InstOpCode::label, node, doneLabel, conditions);