# Local heap optimizations on Arm64 (dotnet#64481)

1. When the JIT is not required to zero the allocated local heap space (for sizes up to 64 bytes), do not emit the zeroing sequence that was previously used even in this case. Instead, probe the stack and adjust the stack pointer with a single instruction:

```diff
-            stp     xzr, xzr, [sp,#-16]!
-            stp     xzr, xzr, [sp,#-16]!
-            stp     xzr, xzr, [sp,#-16]!
-            stp     xzr, xzr, [sp,#-16]!
+            ldr     wzr, [sp],#-64
```
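A post-indexed load accesses memory at the current base address first and only then writes back the decremented base, so the single `ldr wzr, [sp],#-64` above both probes the stack (faulting if SP has moved into the guard page) and allocates the 64 bytes. Below is a minimal C++ model of those two steps; the simulated stack, pointer names, and sizes are invented for illustration, and this is not JIT code:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // Simulated stack; the real instruction operates on the machine SP register.
    uint8_t stack[256] = {};
    uint8_t* sp = stack + 192; // pretend stack pointer

    // "ldr wzr, [sp], #-64" in two conceptual steps:
    volatile uint8_t probed = *sp; // 1) load at [sp]: this is the probe; it would fault
    (void)probed;                  //    if sp pointed into the guard page (the loaded
                                   //    value is discarded, wzr being the zero register)
    sp -= 64;                      // 2) post-index write-back: sp -= 64 allocates the space

    printf("allocated %td bytes\n", (stack + 192) - sp);
    return 0;
}
```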

2. For sizes less than one `PAGE_SIZE`, use `ldr wzr, [sp], #-amount`, which probes at `[sp]` and allocates the space at the same time. This saves one instruction for such local heap allocations:

```diff
-            ldr     wzr, [sp]
-            sub     sp, sp, #208
+            ldr     wzr, [sp],#-208
```

Use `ldp tmpReg, xzr, [sp], #-amount` when the offset is not encodable by the post-index variant of `ldr`:
```diff
-            ldr     wzr, [sp]
-            sub     sp, sp, #512
+            ldp     x0, xzr, [sp],#-512
```
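Which of the three forms applies follows from the AArch64 immediate encodings: post-index `ldr` takes a 9-bit signed byte offset, so `-amount` is encodable only for `amount <= 256`, while post-index `ldp` with 64-bit registers takes a 7-bit signed offset scaled by 8, reaching down to `-512`. The sketch below walks that decision for a few sample sizes; the sizes are made-up examples, and the selection logic mirrors the committed C++ shown further down:

```cpp
#include <cstdio>

int main()
{
    // Example allocation sizes (already 16-byte aligned, each below a 4K page).
    for (long amount : {208, 512, 4080})
    {
        if (amount <= 256)
        {
            // Post-index ldr: 9-bit signed byte offset, range [-256, 255].
            printf("ldr     wzr, [sp],#-%ld\n", amount);
        }
        else if ((amount <= 512) && (amount % 8 == 0))
        {
            // Post-index ldp (64-bit regs): 7-bit signed offset scaled by 8, range [-512, 504].
            printf("ldp     x0, xzr, [sp],#-%ld\n", amount);
        }
        else
        {
            // Fall back to an explicit probe followed by an SP adjustment.
            printf("ldr     wzr, [sp]\nsub     sp, sp, #%ld\n", amount);
        }
    }
    return 0;
}
```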

3. Allow non-loop zeroing (i.e., an unrolled sequence) for sizes up to 128 bytes (i.e., up to `LCLHEAP_UNROLL_LIMIT`). This frees up two internal integer registers for such cases:

```diff
-            mov     w11, #128
-                                               ;; bbWeight=0.50 PerfScore 0.25
-G_M44913_IG19:        ; gcrefRegs=00F9 {x0 x3 x4 x5 x6 x7}, byrefRegs=0000 {}, byref, isz
             stp     xzr, xzr, [sp,#-16]!
-            subs    x11, x11, #16
-            bne     G_M44913_IG19
+            stp     xzr, xzr, [sp,#-112]!
+            stp     xzr, xzr, [sp,#16]
+            stp     xzr, xzr, [sp,#32]
+            stp     xzr, xzr, [sp,#48]
+            stp     xzr, xzr, [sp,#64]
+            stp     xzr, xzr, [sp,#80]
+            stp     xzr, xzr, [sp,#96]
```
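The unrolled sequence has a fixed shape: one pre-indexed `stp` zeroes the top 16 bytes of the allocation (and probes the page), a second pre-indexed `stp` drops SP to its final value and zeroes the bottom 16 bytes, and unsigned-offset `stp`s fill everything in between. Here is a standalone sketch that prints the sequence for one size, assuming a 16-byte-aligned `amount`; it models the emitter logic rather than reproducing it:

```cpp
#include <cstdio>

int main()
{
    const long kPairBytes = 16; // one "stp xzr, xzr" zeroes 16 bytes
    long amount = 128;          // example localloc size matching the diff above

    // Zero the last 16 bytes of the allocation and probe the page.
    printf("stp     xzr, xzr, [sp,#-16]!\n");

    if (amount > kPairBytes)
    {
        // Drop SP to its final value and zero the first 16 bytes.
        long finalSpDelta = amount - kPairBytes;
        printf("stp     xzr, xzr, [sp,#-%ld]!\n", finalSpDelta);

        // Zero everything in between with unsigned-offset stores.
        for (long offset = kPairBytes; offset < finalSpDelta; offset += kPairBytes)
            printf("stp     xzr, xzr, [sp,#%ld]\n", offset);
    }
    return 0;
}
```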

4. Do the zeroing in ascending order of the effective addresses:

```diff
-            mov     w7, #96
-G_M49279_IG13:
             stp     xzr, xzr, [sp,#-16]!
-            subs    x7, x7, #16
-            bne     G_M49279_IG13
+            stp     xzr, xzr, [sp,#-80]!
+            stp     xzr, xzr, [sp,#16]
+            stp     xzr, xzr, [sp,#32]
+            stp     xzr, xzr, [sp,#48]
+            stp     xzr, xzr, [sp,#64]
```

In this example, the zeroing stores land at `[initialSp-16]`, `[initialSp-96]`, `[initialSp-80]`, `[initialSp-64]`, `[initialSp-48]`, `[initialSp-32]`, in that order. After the first two pre-indexed stores, the effective addresses strictly ascend, which lets the CPU detect a sequential memset-to-zero pattern and switch into write-streaming mode.
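For concreteness, this small sketch computes those effective addresses for the 96-byte example; the `initialSp` value is an arbitrary stand-in:

```cpp
#include <cstdio>

int main()
{
    const long kPairBytes = 16;
    long amount    = 96;      // the example size from the diff above
    long initialSp = 0x10000; // arbitrary incoming SP value, for illustration

    long sp = initialSp - kPairBytes; // stp xzr, xzr, [sp,#-16]!
    printf("store at initialSp-%ld\n", initialSp - sp);

    long finalSpDelta = amount - kPairBytes;
    sp -= finalSpDelta;               // stp xzr, xzr, [sp,#-80]!
    printf("store at initialSp-%ld\n", initialSp - sp);

    for (long offset = kPairBytes; offset < finalSpDelta; offset += kPairBytes)
        printf("store at initialSp-%ld\n", initialSp - (sp + offset)); // ascending addresses
    return 0;
}
```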
echesakov committed Feb 8, 2022
1 parent b11469f commit 012278b
Showing 3 changed files with 64 additions and 24 deletions.
74 changes: 58 additions & 16 deletions src/coreclr/jit/codegenarm64.cpp
```diff
@@ -2311,34 +2311,76 @@ void CodeGen::genLclHeap(GenTree* tree)
         // We should reach here only for non-zero, constant size allocations.
         assert(amount > 0);
 
-        // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
-        static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
-        assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
-        size_t stpCount = amount / (REGSIZE_BYTES * 2);
-        if (stpCount <= 4)
+        const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;
+
+        static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
+        assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
+
+        if (compiler->info.compInitMem)
         {
-            while (stpCount != 0)
+            if (amount <= LCLHEAP_UNROLL_LIMIT)
             {
-                // We can use pre-indexed addressing.
-                // stp ZR, ZR, [SP, #-16]! // STACK_ALIGN is 16
-                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
-                stpCount -= 1;
-            }
+                // The following zeroes the last 16 bytes and probes the page containing the [sp, #16] address.
+                // stp xzr, xzr, [sp, #-16]!
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
+                                              INS_OPTS_PRE_INDEX);
 
-            lastTouchDelta = 0;
+                if (amount > storePairRegsWritesBytes)
+                {
+                    // The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
+                    // stp xzr, xzr, [sp, #-amount+16]!
+                    const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
+                    GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
+                                                  INS_OPTS_PRE_INDEX);
 
-            goto ALLOC_DONE;
+                    // The following zeroes the remaining space in the [finalSp+16, initialSp-16) interval
+                    // using a sequence of stp instructions with unsigned offsets.
+                    for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
+                         offset += storePairRegsWritesBytes)
+                    {
+                        // stp xzr, xzr, [sp, #offset]
+                        GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
+                    }
+                }
+
+                lastTouchDelta = 0;
+
+                goto ALLOC_DONE;
+            }
         }
-        else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
+        else if (amount < compiler->eeGetPageSize()) // must be < not <=
         {
             // Since the size is less than a page, simply adjust the SP value.
             // The SP might already be in the guard page, so we must touch it BEFORE
             // the alloc, not after.
 
-            // ldr wz, [SP, #0]
-            GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
+            // Note that we check against the lower boundary of the post-index immediate range [-256, 256)
+            // since the offset is -amount.
+            const bool canEncodeLoadRegPostIndexOffset = amount <= 256;
 
-            genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            if (canEncodeLoadRegPostIndexOffset)
+            {
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                            INS_OPTS_POST_INDEX);
+            }
+            else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE))
+            {
+                // The following probes the page and allocates the local heap.
+                // ldp tmpReg, xzr, [sp], #-amount
+                // Note that we cannot use ldp xzr, xzr since the behaviour of ldp
+                // where the two destination registers are the same is unpredictable.
+                const regNumber tmpReg = targetReg;
+                GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                              INS_OPTS_POST_INDEX);
+            }
+            else
+            {
+                // ldr wzr, [sp]
+                // sub sp, sp, #amount
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, 0);
+                genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            }
 
             lastTouchDelta = amount;
```
13 changes: 5 additions & 8 deletions src/coreclr/jit/lsraarm64.cpp
```diff
@@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
         {
             assert(dstCount == 1);
 
-            // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+            // Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
             // Here '-' means don't care.
             //
             //  Size?                     Init Memory?    # temp regs
             //   0                        -               0
-            //   const and <=6 ptr words  -               0
+            //   const and <=UnrollLimit  -               0
             //   const and <PageSize      No              0
-            //   >6 ptr words             Yes             0
+            //   >UnrollLimit             Yes             0
             //   Non-const                Yes             0
             //   Non-const                No              2
             //
@@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
             // Note: The GenTree node is not updated here as it is cheap to recompute stack aligned size.
             // This should also help in debugging as we can examine the original size specified with
             // localloc.
-            sizeVal         = AlignUp(sizeVal, STACK_ALIGN);
-            size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
+            sizeVal = AlignUp(sizeVal, STACK_ALIGN);
 
-            // For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
-            //
-            if (stpCount <= 4)
+            if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
             {
                 // Need no internal registers
             }
```
1 change: 1 addition & 0 deletions src/coreclr/jit/targetarm64.h
```diff
@@ -15,6 +15,7 @@
 #define CPBLK_LCL_UNROLL_LIMIT   128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
 #define INITBLK_UNROLL_LIMIT     64  // Upper bound to let the code generator to loop unroll InitBlk
 #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
+#define LCLHEAP_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
 
 #ifdef FEATURE_SIMD
 #define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
```
