From 012278bb91610ae415cf695a1d3846299a20e754 Mon Sep 17 00:00:00 2001
From: Egor Chesakov
Date: Mon, 7 Feb 2022 20:37:37 -0800
Subject: [PATCH] Local heap optimizations on Arm64 (#64481)

# Local heap optimizations on Arm64

1. When not required to zero the allocated space for a local heap (for sizes up to 64 bytes), do not emit a zeroing sequence. Instead, do stack probing and adjust the stack pointer:

```diff
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
- stp xzr, xzr, [sp,#-16]!
+ ldr wzr, [sp],#-64
```

2. For sizes less than one `PAGE_SIZE`, use `ldr wzr, [sp], #-amount`, which probes at `[sp]` and allocates the space at the same time. This saves one instruction for such local heap allocations:

```diff
- ldr wzr, [sp]
- sub sp, sp, #208
+ ldr wzr, [sp],#-208
```

Use `ldp tmpReg, xzr, [sp], #-amount` when the offset is not encodable by the post-index variant of `ldr`:

```diff
- ldr wzr, [sp]
- sub sp, sp, #512
+ ldp x0, xzr, [sp],#-512
```

3. Allow non-loop zeroing (i.e. an unrolled sequence) for sizes up to 128 bytes (i.e. up to `LCLHEAP_UNROLL_LIMIT`). This frees up two internal integer registers for such cases:

```diff
- mov w11, #128
- ;; bbWeight=0.50 PerfScore 0.25
-G_M44913_IG19: ; gcrefRegs=00F9 {x0 x3 x4 x5 x6 x7}, byrefRegs=0000 {}, byref, isz
  stp xzr, xzr, [sp,#-16]!
- subs x11, x11, #16
- bne G_M44913_IG19
+ stp xzr, xzr, [sp,#-112]!
+ stp xzr, xzr, [sp,#16]
+ stp xzr, xzr, [sp,#32]
+ stp xzr, xzr, [sp,#48]
+ stp xzr, xzr, [sp,#64]
+ stp xzr, xzr, [sp,#80]
+ stp xzr, xzr, [sp,#96]
```

4. Do zeroing in ascending order of the effective address:

```diff
- mov w7, #96
-G_M49279_IG13:
  stp xzr, xzr, [sp,#-16]!
- subs x7, x7, #16
- bne G_M49279_IG13
+ stp xzr, xzr, [sp,#-80]!
+ stp xzr, xzr, [sp,#16]
+ stp xzr, xzr, [sp,#32]
+ stp xzr, xzr, [sp,#48]
+ stp xzr, xzr, [sp,#64]
```

In this example, the zeroing is done at the `[initialSp-16], [initialSp-96], [initialSp-80], [initialSp-64], [initialSp-48], [initialSp-32]` addresses. The idea here is to allow a CPU to detect the sequential `memset` to `0` pattern and switch into write streaming mode. (A standalone sketch of this emission order follows the patch below.)
---
 src/coreclr/jit/codegenarm64.cpp | 74 +++++++++++++++++++++++++-------
 src/coreclr/jit/lsraarm64.cpp    | 13 +++---
 src/coreclr/jit/targetarm64.h    |  1 +
 3 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 595e2a232e541..edc756f18e04c 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -2311,34 +2311,76 @@ void CodeGen::genLclHeap(GenTree* tree)
         // We should reach here only for non-zero, constant size allocations.
         assert(amount > 0);

+        const int storePairRegsWritesBytes = 2 * REGSIZE_BYTES;
+
         // For small allocations we will generate up to four stp instructions, to zero 16 to 64 bytes.
-        static_assert_no_msg(STACK_ALIGN == (REGSIZE_BYTES * 2));
-        assert(amount % (REGSIZE_BYTES * 2) == 0); // stp stores two registers at a time
-        size_t stpCount = amount / (REGSIZE_BYTES * 2);
-        if (stpCount <= 4)
+        static_assert_no_msg(STACK_ALIGN == storePairRegsWritesBytes);
+        assert(amount % storePairRegsWritesBytes == 0); // stp stores two registers at a time
+
+        if (compiler->info.compInitMem)
         {
-            while (stpCount != 0)
+            if (amount <= LCLHEAP_UNROLL_LIMIT)
             {
-                // We can use pre-indexed addressing.
-                // stp ZR, ZR, [SP, #-16]!   // STACK_ALIGN is 16
-                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, REG_SPBASE, -16, INS_OPTS_PRE_INDEX);
-                stpCount -= 1;
-            }
+                // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
+                // stp xzr, xzr, [sp, #-16]!
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -storePairRegsWritesBytes,
+                                              INS_OPTS_PRE_INDEX);

-            lastTouchDelta = 0;
+                if (amount > storePairRegsWritesBytes)
+                {
+                    // The following sets SP to its final value and zeroes the first 16 bytes of the allocated space.
+                    // stp xzr, xzr, [sp, #-amount+16]!
+                    const ssize_t finalSpDelta = (ssize_t)amount - storePairRegsWritesBytes;
+                    GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, -finalSpDelta,
+                                                  INS_OPTS_PRE_INDEX);
+
+                    // The following zeroes the remaining space in [finalSp+16, initialSp-16) interval
+                    // using a sequence of stp instruction with unsigned offset.
+                    for (ssize_t offset = storePairRegsWritesBytes; offset < finalSpDelta;
+                         offset += storePairRegsWritesBytes)
+                    {
+                        // stp xzr, xzr, [sp, #offset]
+                        GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_8BYTE, REG_ZR, REG_ZR, REG_SPBASE, offset);
+                    }
+                }

-            goto ALLOC_DONE;
+                lastTouchDelta = 0;
+
+                goto ALLOC_DONE;
+            }
         }
-        else if (!compiler->info.compInitMem && (amount < compiler->eeGetPageSize())) // must be < not <=
+        else if (amount < compiler->eeGetPageSize()) // must be < not <=
         {
             // Since the size is less than a page, simply adjust the SP value.
             // The SP might already be in the guard page, so we must touch it BEFORE
             // the alloc, not after.
-            // ldr wz, [SP, #0]
-            GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SP, 0);
+            // Note the we check against the lower boundary of the post-index immediate range [-256, 256)
+            // since the offset is -amount.
+            const bool canEncodeLoadRegPostIndexOffset = amount <= 256;

-            genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            if (canEncodeLoadRegPostIndexOffset)
+            {
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                            INS_OPTS_POST_INDEX);
+            }
+            else if (emitter::canEncodeLoadOrStorePairOffset(-(ssize_t)amount, EA_8BYTE))
+            {
+                // The following probes the page and allocates the local heap.
+                // ldp tmpReg, xzr, [sp], #-amount
+                // Note that we cannot use ldp xzr, xzr since
+                // the behaviour of ldp where two source registers are the same is unpredictable.
+                const regNumber tmpReg = targetReg;
+                GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, REG_ZR, REG_SPBASE, -(ssize_t)amount,
+                                              INS_OPTS_POST_INDEX);
+            }
+            else
+            {
+                // ldr wzr, [sp]
+                // sub, sp, #amount
+                GetEmitter()->emitIns_R_R_I(INS_ldr, EA_4BYTE, REG_ZR, REG_SPBASE, amount);
+                genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, amount, rsGetRsvdReg());
+            }

             lastTouchDelta = amount;
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index f18099f4f573e..3091700df4222 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -543,14 +543,14 @@ int LinearScan::BuildNode(GenTree* tree)
         {
             assert(dstCount == 1);

-            // Need a variable number of temp regs (see genLclHeap() in codegenamd64.cpp):
+            // Need a variable number of temp regs (see genLclHeap() in codegenarm64.cpp):
             // Here '-' means don't care.
             //
             //  Size?                     Init Memory?   # temp regs
             //   0                        -              0
-            //   const and <=6 ptr words  -              0
+            //   const and <=UnrollLimit  -              0
             //   const and <PageSize      No             0
-            //   >6 ptr words             Yes            0
+            //   >UnrollLimit             Yes            0
             //   Non-const                Yes            0
             //   Non-const                No             2
             //
@@ -569,12 +569,9 @@ int LinearScan::BuildNode(GenTree* tree)
                 // Note: The Gentree node is not updated here as it is cheap to recompute stack aligned size.
                 // This should also help in debugging as we can examine the original size specified with
                 // localloc.
-                sizeVal         = AlignUp(sizeVal, STACK_ALIGN);
-                size_t stpCount = sizeVal / (REGSIZE_BYTES * 2);
+                sizeVal = AlignUp(sizeVal, STACK_ALIGN);

-                // For small allocations up to 4 'stp' instructions (i.e. 16 to 64 bytes of localloc)
-                //
-                if (stpCount <= 4)
+                if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
                 {
                     // Need no internal registers
                 }
diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h
index 4cc6b63f73009..a4ef3a96782f7 100644
--- a/src/coreclr/jit/targetarm64.h
+++ b/src/coreclr/jit/targetarm64.h
@@ -15,6 +15,7 @@
   #define CPBLK_LCL_UNROLL_LIMIT   128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
   #define INITBLK_UNROLL_LIMIT     64  // Upper bound to let the code generator to loop unroll InitBlk
   #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
+  #define LCLHEAP_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)

   #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1   // whether SIMD type locals are to be aligned
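For reference, the following is a minimal standalone sketch (not part of the patch and not JIT code) that models the unrolled zeroing order described in points 3 and 4 of the commit message: one `stp xzr, xzr, [sp,#-16]!` that zeroes the last 16 bytes and probes, one pre-indexed `stp` that drops SP to its final value and zeroes the first 16 bytes, then `stp` stores at ascending unsigned offsets. The `storePairWritesBytes` and `unrollLimit` names and the printed trace are illustrative assumptions; the program simply prints, for each size up to the 128-byte limit, which `[initialSp-…]` addresses get zeroed and in what order.

```cpp
// Standalone model of the unrolled LclHeap zeroing strategy (not JIT code).
// Prints the stp sequence and the effective addresses relative to the initial SP.
#include <cstdio>

int main()
{
    const int storePairWritesBytes = 16;  // one "stp xzr, xzr" writes 16 bytes
    const int unrollLimit          = 128; // assumed LCLHEAP_UNROLL_LIMIT

    for (int amount = 16; amount <= unrollLimit; amount += storePairWritesBytes)
    {
        int sp = 0; // SP modeled relative to its initial value

        printf("amount=%d\n", amount);

        // Zero the last 16 bytes of the allocation and probe the page.
        sp -= storePairWritesBytes;
        printf("  stp xzr, xzr, [sp,#-16]!   ; zeroes [initialSp%+d]\n", sp);

        if (amount > storePairWritesBytes)
        {
            // Drop SP to its final value and zero the first 16 bytes of the allocation.
            const int finalSpDelta = amount - storePairWritesBytes;
            sp -= finalSpDelta;
            printf("  stp xzr, xzr, [sp,#-%d]!   ; zeroes [initialSp%+d]\n", finalSpDelta, sp);

            // Zero the rest with unsigned offsets, in ascending address order.
            for (int offset = storePairWritesBytes; offset < finalSpDelta; offset += storePairWritesBytes)
            {
                printf("  stp xzr, xzr, [sp,#%d]     ; zeroes [initialSp%+d]\n", offset, sp + offset);
            }
        }
    }
    return 0;
}
```

For `amount=96` the printed trace reproduces the order quoted in point 4: `[initialSp-16], [initialSp-96], [initialSp-80], [initialSp-64], [initialSp-48], [initialSp-32]`.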