From 012278bb91610ae415cf695a1d3846299a20e754 Mon Sep 17 00:00:00 2001
From: Egor Chesakov
Date: Mon, 7 Feb 2022 20:37:37 -0800
Subject: [PATCH] Local heap optimizations on Arm64 (#64481)

# Local heap optimizations on Arm64

1. When not required to zero the allocated space for a local heap (for sizes up to 64 bytes), do not emit a zeroing sequence. Instead, do stack probing and adjust the stack pointer:

```diff
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
+ ldr wzr, [sp],#-64
```

2. For sizes less than one `PAGE_SIZE`, use `ldr wzr, [sp], #-amount`, which probes at `[sp]` and allocates the space at the same time. This saves one instruction for such local heap allocations:

```diff
- ldr wzr, [sp]
- sub sp, sp, #208
+ ldr wzr, [sp],#-208
```

Use `ldp tmpReg, xzr, [sp], #-amount` when the offset is not encodable by the post-index variant of `ldr`:

```diff
- ldr wzr, [sp]
- sub sp, sp, #512
+ ldp x0, xzr, [sp],#-512
```

3. Allow non-loop zeroing (i.e. an unrolled sequence) for sizes up to 128 bytes (i.e. up to `LCLHEAP_UNROLL_LIMIT`). This frees up two internal integer registers for such cases:

```diff
- mov w11, #128
- ;; bbWeight=0.50 PerfScore 0.25
-G_M44913_IG19: ; gcrefRegs=00F9 {x0 x3 x4 x5 x6 x7}, byrefRegs=0000 {}, byref, isz
  stp xzr, xzr, [sp,#-16]!
- subs x11, x11, #16
- bne G_M44913_IG19
+ stp xzr, xzr, [sp,#-112]!
+ stp xzr, xzr, [sp,#16]
+ stp xzr, xzr, [sp,#32]
+ stp xzr, xzr, [sp,#48]
+ stp xzr, xzr, [sp,#64]
+ stp xzr, xzr, [sp,#80]
+ stp xzr, xzr, [sp,#96]
```

4. Do zeroing in ascending order of the effective address:

```diff
- mov w7, #96
-G_M49279_IG13:
  stp xzr, xzr, [sp,#-16]!
- subs x7, x7, #16
- bne G_M49279_IG13
+ stp xzr, xzr, [sp,#-80]!
+ stp xzr, xzr, [sp,#16]
+ stp xzr, xzr, [sp,#32]
+ stp xzr, xzr, [sp,#48]
+ stp xzr, xzr, [sp,#64]
```

In this example, the zeroing is done at the `[initialSp-16], [initialSp-96], [initialSp-80], [initialSp-64], [initialSp-48], [initialSp-32]` addresses. The idea here is to allow a CPU to detect the sequential `memset` to `0` pattern and switch into write streaming mode. (A standalone sketch of this emission order follows the patch below.)
---
 src/coreclr/jit/codegenarm64.cpp | 74 +++++++++++++++++++++++++-------
 src/coreclr/jit/lsraarm64.cpp    | 13 +++---
 src/coreclr/jit/targetarm64.h    |  1 +
 3 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 595e2a232e541..edc756f18e04c 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -2311,34 +2311,76 @@ void CodeGen::genLclHeap(GenTree* tree)
         // We should reach here only for non-zero, constant size allocations.
         assert(amount > 0);

+        const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;
+
         // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
-        static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
-        assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
-        size_t stpCount = amount / (REGSIZE_BYTES * 2);
-        if (stpCount <= 4)
+        static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
+        assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
+
+        if (compiler->info.compInitMem)
         {
-            while (stpCount != 0)
+            if (amount <= LCLHEAP_UNROLL_LIMIT)
             {
-                // We can use pre-indexed addressing.
-                // stp ZR, ZR, [SP, #-16]!   // STACK_ALIGN is 16
-                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
-                stpCount -= 1;
-            }
+                // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
+                // stp xzr, xzr, [sp, #-16]!
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
+                                              INS_OPTS_PRE_INDEX);

-            lastTouchDelta = 0;
+                if (amount > storePairRegsWritesBytes)
+                {
+                    // The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
+                    // stp xzr, xzr, [sp, #-amount+16]!
+                    const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
+                    GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
+                                                  INS_OPTS_PRE_INDEX);
+
+                    // The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
+                    // using a sequence of stp instruction with unsigned offset.
+                    for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
+                         offset += storePairRegsWritesBytes)
+                    {
+                        // stp xzr, xzr, [sp, #offset]
+                        GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
+                    }
+                }

-            goto ALLOC_DONE;
+                lastTouchDelta = 0;
+
+                goto ALLOC_DONE;
+            }
         }
-        else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
+        else if (amount < compiler->eeGetPageSize()) // must be < not <=
         {
             // Since the size is less than a page, simply adjust the SP value.
             // The SP might already be in the guard page, so we must touch it BEFORE
             // the alloc, not after.
-            // ldr wz, [SP, #0]
-            GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
+            // Note the we check against the lower boundary of the post-index immediate range [-256, 256)
+            // since the offset is -amount.
+            const bool canEncodeLoadRegPostIndexOffset = amount <= 256;

-            genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            if (canEncodeLoadRegPostIndexOffset)
+            {
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                            INS_OPTS_POST_INDEX);
+            }
+            else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE))
+            {
+                // The following probes the page and allocates the local heap.
+                // ldp tmpReg, xzr, [sp], #-amount
+                // Note that we cannot use ldp xzr, xzr since
+                // the behaviour of ldp where two source registers are the same is unpredictable.
+                const regNumber tmpReg = targetReg;
+                GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                              INS_OPTS_POST_INDEX);
+            }
+            else
+            {
+                // ldr wzr, [sp]
+                // sub, sp, #amount
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount);
+                genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            }

             lastTouchDelta = amount;
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index f18099f4f573e..3091700df4222 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
         {
             assert(dstCount == 1);

-            // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+            // Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
             // Here '-' means don't care.
             //
             //  Size?                     Init Memory?   # temp regs
             //   0                        -              0
-            //   const and <=6 ptr words  -              0
+            //   const and <=UnrollLimit  -              0
             //   const and <PageSize      No             0
-            //   >6 ptr words             Yes            0
+            //   >UnrollLimit             Yes            0
             //   Non-const                Yes            0
             //   Non-const                No             2
             //
@@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
                 // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
                 // This should also help in debugging as we can examine the original size specified with
                 // localloc.
-                sizeVal         = AlignUp(sizeVal, STACK_ALIGN);
-                size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
+                sizeVal = AlignUp(sizeVal, STACK_ALIGN);

-                // For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
-                //
-                if (stpCount <= 4)
+                if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
                 {
                     // Need no internal registers
                 }
diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h
index 4cc6b63f73009..a4ef3a96782f7 100644
--- a/src/coreclr/jit/targetarm64.h
+++ b/src/coreclr/jit/targetarm64.h
@@ -15,6 +15,7 @@
   #define CPBLK_LCL_UNROLL_LIMIT   128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
   #define INITBLK_UNROLL_LIMIT     64  // Upper bound to let the code generator to loop unroll InitBlk
   #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
+  #define LCLHEAP_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)

   #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1   // whether SIMD type locals are to be aligned
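For reference, the following is a minimal standalone sketch (not part of the patch and not JIT code) that models the unrolled zeroing order described in points 3 and 4 of the commit message: one `stp xzr, xzr, [sp,#-16]!` that zeroes the last 16 bytes and probes, one pre-indexed `stp` that drops SP to its final value and zeroes the first 16 bytes, then `stp` stores at ascending unsigned offsets. The `storePairWritesBytes` and `unrollLimit` names and the printed trace are illustrative assumptions; the program simply prints, for each size up to the 128-byte limit, which `[initialSp-…]` addresses get zeroed and in what order.

```cpp
// Standalone model of the unrolled LclHeap zeroing strategy (not JIT code).
// Prints the stp sequence and the effective addresses relative to the initial SP.
#include <cstdio>

int main()
{
    const int storePairWritesBytes = 16;  // one "stp xzr, xzr" writes 16 bytes
    const int unrollLimit          = 128; // assumed LCLHEAP_UNROLL_LIMIT

    for (int amount = 16; amount <= unrollLimit; amount += storePairWritesBytes)
    {
        int sp = 0; // SP modeled relative to its initial value

        printf("amount=%d\n", amount);

        // Zero the last 16 bytes of the allocation and probe the page.
        sp -= storePairWritesBytes;
        printf("  stp xzr, xzr, [sp,#-16]!   ; zeroes [initialSp%+d]\n", sp);

        if (amount > storePairWritesBytes)
        {
            // Drop SP to its final value and zero the first 16 bytes of the allocation.
            const int finalSpDelta = amount - storePairWritesBytes;
            sp -= finalSpDelta;
            printf("  stp xzr, xzr, [sp,#-%d]!   ; zeroes [initialSp%+d]\n", finalSpDelta, sp);

            // Zero the rest with unsigned offsets, in ascending address order.
            for (int offset = storePairWritesBytes; offset < finalSpDelta; offset += storePairWritesBytes)
            {
                printf("  stp xzr, xzr, [sp,#%d]     ; zeroes [initialSp%+d]\n", offset, sp + offset);
            }
        }
    }
    return 0;
}
```

For `amount=96` the printed trace reproduces the order quoted in point 4: `[initialSp-16], [initialSp-96], [initialSp-80], [initialSp-64], [initialSp-48], [initialSp-32]`.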