Optimize stackalloc zeroing via BLK (#83255)
Co-authored-by: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com>
EgorBo and SingleAccretion committed Apr 4, 2023
1 parent 3e6ad47 commit e13f0dc
Showing 6 changed files with 502 additions and 118 deletions.
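
In brief: for a constant-size stackalloc in a method that must zero-initialize its locals (compInitMem), lowering now rewrites the LCLHEAP into a simple stack-pointer bump plus an explicit STORE_BLK that zeroes the new block, so the existing block-store unrolling (wide SIMD stores) replaces codegen's old "push 0" sequences. The IR shape produced by the new LowerLclHeap, taken from the comment in the lower.cpp diff below (V01 is illustrative):

    * STORE_BLK struct<alignedSize> (init) (Unroll)
    +--* LCL_VAR long V01
    \--* CNS_INT int 0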
95 changes: 26 additions & 69 deletions src/coreclr/jit/codegenxarch.cpp
@@ -2742,18 +2742,10 @@ void CodeGen::genLclHeap(GenTree* tree)

// compute the amount of memory to allocate to properly STACK_ALIGN.
size_t amount = 0;
if (size->IsCnsIntOrI())
if (size->IsCnsIntOrI() && size->isContained())
{
// If size is a constant, then it must be contained.
assert(size->isContained());

// If amount is zero then return null in targetReg
amount = size->AsIntCon()->gtIconVal;
if (amount == 0)
{
instGen_Set_Reg_To_Zero(EA_PTRSIZE, targetReg);
goto BAILOUT;
}
assert((amount > 0) && (amount <= UINT_MAX));

// 'amount' is the total number of bytes to localloc to properly STACK_ALIGN
amount = AlignUp(amount, STACK_ALIGN);
@@ -2848,77 +2840,44 @@ void CodeGen::genLclHeap(GenTree* tree)
goto ALLOC_DONE;
}

inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
stackAdjustment += (target_size_t)compiler->lvaOutgoingArgSpaceSize;
locAllocStackOffset = stackAdjustment;
if (size->IsCnsIntOrI() && size->isContained())
{
stackAdjustment = 0;
locAllocStackOffset = (target_size_t)compiler->lvaOutgoingArgSpaceSize;
}
else
{
inst_RV_IV(INS_add, REG_SPBASE, compiler->lvaOutgoingArgSpaceSize, EA_PTRSIZE);
stackAdjustment += (target_size_t)compiler->lvaOutgoingArgSpaceSize;
locAllocStackOffset = stackAdjustment;
}
}
#endif

if (size->IsCnsIntOrI())
if (size->IsCnsIntOrI() && size->isContained())
{
// We should reach here only for non-zero, constant size allocations.
assert(amount > 0);
assert((amount % STACK_ALIGN) == 0);
assert((amount % REGSIZE_BYTES) == 0);

// For small allocations we will generate up to six push 0 inline
size_t cntRegSizedWords = amount / REGSIZE_BYTES;
if (compiler->info.compInitMem && (cntRegSizedWords <= 6))
// We should reach here only for non-zero, constant size allocations which we zero
// via BLK explicitly, so just bump the stack pointer.
if ((amount >= compiler->eeGetPageSize()) || (TARGET_POINTER_SIZE == 4))
{
for (; cntRegSizedWords != 0; cntRegSizedWords--)
{
inst_IV(INS_push_hide, 0); // push_hide means don't track the stack
}

lastTouchDelta = 0;

goto ALLOC_DONE;
}

#ifdef TARGET_X86
bool needRegCntRegister = true;
#else // !TARGET_X86
bool needRegCntRegister = initMemOrLargeAlloc;
#endif // !TARGET_X86

if (needRegCntRegister)
{
// If compInitMem=true, we can reuse targetReg as regcnt.
// Since size is a constant, regCnt is not yet initialized.
assert(regCnt == REG_NA);
if (compiler->info.compInitMem)
{
assert(tree->AvailableTempRegCount() == 0);
regCnt = targetReg;
}
else
{
regCnt = tree->GetSingleTempReg();
}
regCnt = tree->GetSingleTempReg();
instGen_Set_Reg_To_Imm(EA_PTRSIZE, regCnt, -(ssize_t)amount);
genStackPointerDynamicAdjustmentWithProbe(regCnt);
// lastTouchDelta is dynamic, and can be up to a page. So if we have outgoing arg space,
// we're going to assume the worst and probe.
}

if (!initMemOrLargeAlloc)
else
{
// Since the size is less than a page, and we don't need to zero init memory, simply adjust ESP.
// ESP might already be in the guard page, so we must touch it BEFORE
// the alloc, not after.

assert(amount < compiler->eeGetPageSize()); // must be < not <=
// ESP might already be in the guard page, so we must touch it BEFORE the alloc, not after.
lastTouchDelta = genStackPointerConstantAdjustmentLoopWithProbe(-(ssize_t)amount,
/* trackSpAdjustments */ regCnt == REG_NA);
goto ALLOC_DONE;
/* trackSpAdjustments */ true);
}

// else, "mov regCnt, amount"

if (compiler->info.compInitMem)
{
// When initializing memory, we want 'amount' to be the loop count.
assert((amount % STACK_ALIGN) == 0);
amount /= STACK_ALIGN;
}

instGen_Set_Reg_To_Imm(((size_t)(int)amount == amount) ? EA_4BYTE : EA_8BYTE, regCnt, amount);
goto ALLOC_DONE;
}

// We should not have any temp registers at this point.
@@ -2996,8 +2955,6 @@ void CodeGen::genLclHeap(GenTree* tree)
genDefineTempLabel(endLabel);
}

BAILOUT:

#ifdef JIT32_GCENCODER
if (compiler->lvaLocAllocSPvar != BAD_VAR_NUM)
{
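
For context on the two probe helpers used above: any stack adjustment that could skip an entire page must touch each page on the way down, because the OS commits stack memory lazily through a guard page. A minimal illustrative sketch of such a probing adjustment (not the JIT's emitter code; sp, amount, and pageSize are stand-ins):

    #include <cstddef>
    #include <cstdint>

    // Lower a stack pointer by 'amount', touching each newly exposed page so
    // the OS guard-page mechanism can commit it ("stack probing").
    inline uintptr_t adjustSpWithProbe(uintptr_t sp, size_t amount, size_t pageSize)
    {
        while (amount >= pageSize)
        {
            sp -= pageSize;
            *reinterpret_cast<volatile char*>(sp) = 0; // probe the new page
            amount -= pageSize;
        }
        return sp - amount;
    }

This is also why the new constant-size path branches the way it does: on x64 an allocation below a page can take the cheaper constant adjustment, while x86 and page-or-larger allocations take the dynamic probing loop via regCnt.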
66 changes: 65 additions & 1 deletion src/coreclr/jit/lower.cpp
@@ -580,7 +580,7 @@ GenTree* Lowering::LowerNode(GenTree* node)
break;

case GT_LCLHEAP:
ContainCheckLclHeap(node->AsOp());
LowerLclHeap(node);
break;

#ifdef TARGET_XARCH
@@ -7992,6 +7992,70 @@ void Lowering::TransformUnusedIndirection(GenTreeIndir* ind, Compiler* comp, Bas
}
}

//------------------------------------------------------------------------
// LowerLclHeap: a common logic to lower LCLHEAP.
//
// Arguments:
// blkNode - the LCLHEAP node we are lowering.
//
void Lowering::LowerLclHeap(GenTree* node)
{
assert(node->OperIs(GT_LCLHEAP));

#if defined(TARGET_XARCH)
if (node->gtGetOp1()->IsCnsIntOrI())
{
GenTreeIntCon* sizeNode = node->gtGetOp1()->AsIntCon();
ssize_t size = sizeNode->IconValue();

if (size == 0)
{
// Replace with null for LCLHEAP(0)
node->BashToZeroConst(TYP_I_IMPL);
BlockRange().Remove(sizeNode);
return;
}

if (comp->info.compInitMem)
{
ssize_t alignedSize = ALIGN_UP(size, STACK_ALIGN);
if ((size > UINT_MAX) || (alignedSize > UINT_MAX))
{
// Size is too big - don't mark sizeNode as contained
return;
}

LIR::Use use;
if (BlockRange().TryGetUse(node, &use))
{
// Align LCLHEAP size for more efficient zeroing via BLK
sizeNode->SetIconValue(alignedSize);

// Emit STORE_BLK to zero it
//
// * STORE_BLK struct<alignedSize> (init) (Unroll)
// +--* LCL_VAR long V01
// \--* CNS_INT int 0
//
GenTree* heapLcl = comp->gtNewLclvNode(use.ReplaceWithLclVar(comp), TYP_I_IMPL);
GenTree* zero = comp->gtNewIconNode(0);
GenTreeBlk* storeBlk = new (comp, GT_STORE_BLK)
GenTreeBlk(GT_STORE_BLK, TYP_STRUCT, heapLcl, zero, comp->typGetBlkLayout((unsigned)alignedSize));
storeBlk->gtFlags |= (GTF_IND_UNALIGNED | GTF_ASG | GTF_EXCEPT | GTF_GLOB_REF);
BlockRange().InsertAfter(use.Def(), heapLcl, zero, storeBlk);
LowerNode(storeBlk);
}
else
{
// Value is unused and we don't mark the size node as contained
return;
}
}
}
#endif
ContainCheckLclHeap(node->AsOp());
}

//------------------------------------------------------------------------
// LowerBlockStoreCommon: a common logic to lower STORE_OBJ/BLK/DYN_BLK.
//
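
LowerLclHeap rounds the allocation size up with ALIGN_UP before emitting the zeroing STORE_BLK. A self-contained equivalent of that helper, assuming the alignment is a power of two (which STACK_ALIGN, 16 bytes on x64, is):

    #include <cstddef>

    // Round 'size' up to the next multiple of 'alignment' (power of two).
    inline size_t alignUp(size_t size, size_t alignment)
    {
        return (size + alignment - 1) & ~(alignment - 1);
    }

    // Example: alignUp(1, 16) == 16; alignUp(48, 16) == 48.

Aligning the constant up front lets the block layout from typGetBlkLayout cover the whole allocation, and the UINT_MAX check above guards the narrowing cast to unsigned.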
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
@@ -315,6 +315,7 @@ class Lowering final : public Phase
GenTree* LowerSignedDivOrMod(GenTree* node);
void LowerBlockStore(GenTreeBlk* blkNode);
void LowerBlockStoreCommon(GenTreeBlk* blkNode);
void LowerLclHeap(GenTree* node);
void ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenTree* addr, GenTree* addrParent);
void LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode);
#ifdef TARGET_XARCH
53 changes: 5 additions & 48 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1843,60 +1843,17 @@ int LinearScan::BuildLclHeap(GenTree* tree)
{
int srcCount = 1;

// Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
// Here '-' means don't care.
//
// Size? Init Memory? # temp regs
// 0 - 0 (returns 0)
// const and <=6 reg words - 0 (pushes '0')
// const and >6 reg words Yes 0 (pushes '0')
// const and <PageSize No 0 (amd64) 1 (x86)
//
// const and >=PageSize No 1 (regCnt)
// Non-const Yes 0 (regCnt=targetReg and pushes '0')
// Non-const No 1 (regCnt)
//
// Note: Here we don't need internal register to be different from targetReg.
// Rather, require it to be different from operand's reg.

GenTree* size = tree->gtGetOp1();
if (size->IsCnsIntOrI())
if (size->IsCnsIntOrI() && size->isContained())
{
assert(size->isContained());
srcCount = 0;
size_t sizeVal = size->AsIntCon()->gtIconVal;
size_t sizeVal = AlignUp((size_t)size->AsIntCon()->gtIconVal, STACK_ALIGN);

if (sizeVal == 0)
// Explicitly zeroed LCLHEAP also needs a regCnt in case of x86 or large page
if ((TARGET_POINTER_SIZE == 4) || (sizeVal >= compiler->eeGetPageSize()))
{
// For regCnt
buildInternalIntRegisterDefForNode(tree);
}
else
{
// Compute the amount of memory to properly STACK_ALIGN.
// Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
// This should also help in debugging as we can examine the original size specified with localloc.
sizeVal = AlignUp(sizeVal, STACK_ALIGN);

// For small allocations up to 6 pointer sized words (i.e. 48 bytes of localloc)
// we will generate 'push 0'.
assert((sizeVal % REGSIZE_BYTES) == 0);

if (!compiler->info.compInitMem)
{
#ifdef TARGET_X86
// x86 always needs regCnt.
// For regCnt
buildInternalIntRegisterDefForNode(tree);
#else // !TARGET_X86
if (sizeVal >= compiler->eeGetPageSize())
{
// For regCnt
buildInternalIntRegisterDefForNode(tree);
}
#endif // !TARGET_X86
}
}
}
else
{
@@ -1905,7 +1862,7 @@ int LinearScan::BuildLclHeap(GenTree* tree)
// For regCnt
buildInternalIntRegisterDefForNode(tree);
}
BuildUse(size);
BuildUse(size); // could be a non-contained constant
}
buildInternalRegisterUses();
BuildDef(tree);
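
The old temp-register table in BuildLclHeap collapses to a single predicate for the contained-constant case; a hedged restatement (function name illustrative):

    #include <cstddef>

    // A contained-constant LCLHEAP needs a temp register (regCnt) only when the
    // stack adjustment must be probed in a dynamic loop: always on x86, and on
    // x64 once the aligned allocation reaches a page.
    inline bool lclHeapNeedsRegCnt(size_t alignedSize, size_t pageSize, bool is32BitTarget)
    {
        return is32BitTarget || (alignedSize >= pageSize);
    }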
(The remaining two changed files did not load in this capture.)
