From ee3f7daed083477689a4c6240025afa45ffa3352 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 26 Feb 2021 15:25:07 -0800 Subject: [PATCH] Separate refactoring changes in 43250 (#48199) --- src/coreclr/jit/codegenarm.cpp | 34 +- src/coreclr/jit/codegenarm64.cpp | 11 +- src/coreclr/jit/codegenarmarch.cpp | 572 ++++++++++++++++++++ src/coreclr/jit/codegencommon.cpp | 644 +---------------------- src/coreclr/jit/codegenxarch.cpp | 57 ++ src/coreclr/jit/compiler.h | 13 - src/coreclr/vm/amd64/JitHelpers_Fast.asm | 6 +- src/coreclr/vm/amd64/jithelpers_fast.S | 6 +- src/coreclr/vm/arm/asmhelpers.S | 18 +- src/coreclr/vm/arm/asmhelpers.asm | 18 +- src/coreclr/vm/i386/jithelp.S | 6 +- src/coreclr/vm/i386/jithelp.asm | 12 +- 12 files changed, 698 insertions(+), 699 deletions(-) diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 4eae6d636ae8d..c5dc53c712a6a 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -1849,28 +1849,22 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni { GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, frameSize); } - else if (frameSize < compiler->getVeryLargeFrameSize()) - { - for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) - { - // Generate: - // movw initReg, -probeOffset - // ldr initReg, [SP + initReg] - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, -(ssize_t)probeOffset); - GetEmitter()->emitIns_R_R_R(INS_ldr, EA_PTRSIZE, initReg, REG_SPBASE, initReg); - } - - regSet.verifyRegUsed(initReg); - *pInitRegZeroed = false; // The initReg does not contain zero - - instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, frameSize); - compiler->unwindPadding(); - GetEmitter()->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, initReg); - } else { - assert(frameSize >= compiler->getVeryLargeFrameSize()); + // Generate the following code: + // + // movw r4, #frameSize + // sub r4, sp, r4 + // bl 
CORINFO_HELP_STACK_PROBE + // mov sp, r4 + // + // If frameSize can not be encoded by movw immediate this becomes: + // + // movw r4, #frameSizeLo16 + // movt r4, #frameSizeHi16 + // sub r4, sp, r4 + // bl CORINFO_HELP_STACK_PROBE + // mov sp, r4 genInstrWithConstant(INS_sub, EA_PTRSIZE, REG_STACK_PROBE_HELPER_ARG, REG_SPBASE, frameSize, INS_FLAGS_DONT_CARE, REG_STACK_PROBE_HELPER_ARG); diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 5efee1618dce5..a92367389bc72 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -9658,8 +9658,15 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni { lastTouchDelta = frameSize; } - else if (frameSize < compiler->getVeryLargeFrameSize()) + else if (frameSize < 3 * pageSize) { + // The probing loop in "else"-case below would require at least 6 instructions (and more if + // 'frameSize' or 'pageSize' can not be encoded with mov-instruction immediate). + // Hence for frames that are smaller than 3 * PAGE_SIZE the JIT inlines the following probing code + // to decrease code size. + // TODO-ARM64: The probing mechanisms should be replaced by a call to stack probe helper + // as it is done on other platforms. + lastTouchDelta = frameSize; for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) @@ -9681,8 +9688,6 @@ void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pIni } else { - assert(frameSize >= compiler->getVeryLargeFrameSize()); - // Emit the following sequence to 'tickle' the pages. Note it is important that stack pointer not change // until this is complete since the tickles could cause a stack overflow, and we need to be able to crawl // the stack afterward (which means the stack pointer needs to be known). 
diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 499a89faf4deb..51ab100a33eac 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -3667,4 +3667,576 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, ReturnTypeDesc* retTypeDesc) } #endif // FEATURE_SIMD +//------------------------------------------------------------------------ +// genPushCalleeSavedRegisters: Push any callee-saved registers we have used. +// +// Arguments (arm64): +// initReg - A scratch register (that gets set to zero on some platforms). +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'true' if this method sets initReg register to zero, +// 'false' if initReg was set to a non-zero value, and left unchanged if initReg was not touched. +// +#if defined(TARGET_ARM64) +void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed) +#else +void CodeGen::genPushCalleeSavedRegisters() +#endif +{ + assert(compiler->compGeneratingProlog); + +#ifdef TARGET_ARM64 + // Probe large frames now, if necessary, since genPushCalleeSavedRegisters() will allocate the frame. Note that + // for arm64, genAllocLclFrame only probes the frame; it does not actually allocate it (it does not change SP). + // For arm64, we are probing the frame before the callee-saved registers are saved. The 'initReg' might have + // been calculated to be one of the callee-saved registers (say, if all the integer argument registers are + // in use, and perhaps with other conditions being satisfied). This is ok in other cases, after the callee-saved + // registers have been saved. So instead of letting genAllocLclFrame use initReg as a temporary register, + // always use REG_SCRATCH. We don't care if it trashes it, so ignore the initRegZeroed output argument. 
+ bool ignoreInitRegZeroed = false; + genAllocLclFrame(compiler->compLclFrameSize, REG_SCRATCH, &ignoreInitRegZeroed, + intRegState.rsCalleeRegArgMaskLiveIn); +#endif + + regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; + +#if ETW_EBP_FRAMED + if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) + { + noway_assert(!"Used register RBM_FPBASE as a scratch register!"); + } +#endif + +#ifdef TARGET_ARMARCH + // On ARM we push the FP (frame-pointer) here along with all other callee saved registers + if (isFramePointerUsed()) + rsPushRegs |= RBM_FPBASE; + + // + // It may be possible to skip pushing/popping lr for leaf methods. However, such optimization would require + // changes in GC suspension architecture. + // + // We would need to guarantee that a tight loop calling a virtual leaf method can be suspended for GC. Today, we + // generate partially interruptible code for both the method that contains the tight loop with the call and the leaf + // method. GC suspension depends on return address hijacking in this case. Return address hijacking depends + // on the return address to be saved on the stack. If we skipped pushing/popping lr, the return address would never + // be saved on the stack and the GC suspension would time out. + // + // So if we wanted to skip pushing/popping lr for leaf frames, we would also need to do one of + // the following to make GC suspension work in the above scenario: + // - Make return address hijacking work even when lr is not saved on the stack. + // - Generate fully interruptible code for loops that contain calls + // - Generate fully interruptible code for leaf methods + // + // Given the limited benefit from this optimization (<10k for CoreLib NGen image), the extra complexity + // is not worth it.
+ // + rsPushRegs |= RBM_LR; // We must save the return address (in the LR register) + + regSet.rsMaskCalleeSaved = rsPushRegs; +#endif // TARGET_ARMARCH + +#ifdef DEBUG + if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) + { + printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", + compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); + dspRegMask(rsPushRegs); + printf("\n"); + assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); + } +#endif // DEBUG + +#if defined(TARGET_ARM) + regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat; + + maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat); + + assert(FitsIn(maskPushRegsInt)); + inst_IV(INS_push, (int)maskPushRegsInt); + compiler->unwindPushMaskInt(maskPushRegsInt); + + if (maskPushRegsFloat != 0) + { + genPushFltRegs(maskPushRegsFloat); + compiler->unwindPushMaskFloat(maskPushRegsFloat); + } +#elif defined(TARGET_ARM64) + // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details or requirements and + // options. Case numbers in comments here refer to this document. See also Compiler::lvaAssignFrameOffsets() + // for pictures of the general frame layouts, and CodeGen::genFuncletProlog() implementations (per architecture) + // for pictures of the funclet frame layouts. + // + // For most frames, generate, e.g.: + // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. + // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area + // // at top of frame (highest addresses). + // stp r21, r22, [sp, 0x70] + // + // Notes: + // 1. We don't always need to save FP. If FP isn't saved, then LR is saved with the other callee-saved registers + // at the top of the frame. + // 2. If we save FP, then the first store is FP, LR. + // 3. 
General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only + // preserve their lower 8 bytes, by calling convention. + // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are + // consecutive, and at the top of the frame. + // 5. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). + // + // For functions with GS and localloc, we change the frame so the frame pointer and LR are saved at the top + // of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same + // rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP. + // Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. + // + // The frames look like the following (simplified to only include components that matter for establishing the + // frames). See also Compiler::lvaAssignFrameOffsets(). + // + // Frames with FP, LR saved at bottom of frame (above outgoing argument space): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. 
| + // |-----------------------| + // | possible GS cookie | + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // + // Frames with FP, LR saved at top of frame (below saved varargs incoming arguments): + // + // | | + // |-----------------------| + // | incoming arguments | + // +=======================+ <---- Caller's SP + // | Varargs regs space | // Only for varargs functions; 64 bytes + // |-----------------------| + // | Saved LR | // 8 bytes + // |-----------------------| + // | Saved FP | // 8 bytes + // |-----------------------| + // |Callee saved registers | // not including FP/LR; multiple of 8 bytes + // |-----------------------| + // | PSP slot | // 8 bytes (omitted in CoreRT ABI) + // |-----------------------| + // | locals, temps, etc. | + // |-----------------------| + // | possible GS cookie | + // |-----------------------| + // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + // |-----------------------| <---- Ambient SP + // | | | + // ~ | Stack grows ~ + // | | downward | + // V + // + + int totalFrameSize = genTotalFrameSize(); + + int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. + + regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; + regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; + +#ifdef DEBUG + if (verbose) + { + printf("Save float regs: "); + dspRegMask(maskSaveRegsFloat); + printf("\n"); + printf("Save int regs: "); + dspRegMask(maskSaveRegsInt); + printf("\n"); + } +#endif // DEBUG + + // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we + // generate based on various sizes. 
+ int frameType = 0; + + // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the + // first save instruction as a "predecrement" amount, if possible. + int calleeSaveSPDelta = 0; + + if (isFramePointerUsed()) + { + // We need to save both FP and LR. + + assert((maskSaveRegsInt & RBM_FP) != 0); + assert((maskSaveRegsInt & RBM_LR) != 0); + + // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address + // (FP and LR) are protected from buffer overrun by the GS cookie. If FP/LR are at the lowest addresses, + // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will + // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our + // saved FP/LR. In that case, we save FP/LR along with the rest of the callee-saved registers, above + // the GS cookie. + // + // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to + // create a frame pointer chain. + // + // Do we need another frame pointer register to get good code quality in the case of having the frame pointer + // point high in the frame, so we can take advantage of arm64's preference for positive offsets? C++ native + // code dedicates callee-saved x19 to this, so generates: + // mov x19, sp + // in the prolog, then uses x19 for local var accesses. Given that this case is so rare, we currently do + // not do this. That means that negative offsets from FP might need to use the reserved register to form + // the local variable offset for an addressing mode. + + if (((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize <= 504)) && + !genSaveFpLrWithAllCalleeSavedRegisters) + { + // Case #1. + // + // Generate: + // stp fp,lr,[sp,#-framesz]! 
+ // + // The (totalFrameSize <= 504) condition ensures that both the pre-index STP instruction + // used in the prolog, and the post-index LDP instruction used in the epilog, can be generated. + // Note that STP and the unwind codes can handle -512, but LDP with a positive post-index value + // can only handle up to 504, and we want our prolog and epilog to match. + // + // After saving callee-saved registers, we establish the frame pointer with: + // mov fp,sp + // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + + JITDUMP("Frame type 1. #outsz=0; #framesz=%d; LclFrameSize=%d\n", totalFrameSize, + compiler->compLclFrameSize); + + frameType = 1; + + assert(totalFrameSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); + + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize, + INS_OPTS_PRE_INDEX); + compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize); + + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR + offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + } + else if (totalFrameSize <= 512) + { + // Case #2. + // + // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP + // with signed offset encoding. The maximum positive STP offset is 504, but when storing a pair of + // 8 byte registers, the largest actual offset we use would be 512 - 8 * 2 = 496. And STR with positive + // offset has a range 0 to 32760. + // + // After saving callee-saved registers, we establish the frame pointer with: + // add fp,sp,#outsz + // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 4 (save FP/LR at top). 
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 4; + + // The frame will be allocated below, when the callee-saved registers are saved. This might mean a + // separate SUB instruction or the SP adjustment might be folded in to the first STP if there is + // no outgoing argument space AND no local frame space, that is, if the only thing the frame does + // is save callee-saved registers (and possibly varargs argument registers). + calleeSaveSPDelta = totalFrameSize; + + offset = (int)compiler->compLclFrameSize; + } + else + { + JITDUMP("Frame type 2 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 2; + + // Generate: + // sub sp,sp,#framesz + // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. + + assert(totalFrameSize - compiler->lvaOutgoingArgSpaceSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); + + GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); + compiler->unwindAllocStack(totalFrameSize); + + assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); + + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, + compiler->lvaOutgoingArgSpaceSize); + compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); + + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR + offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR + } + } + else + { + // Case 5 or 6. + // + // First, the callee-saved registers will be saved, and the callee-saved register code must use + // pre-index to subtract from SP as the first instruction. It must also leave space for varargs + // registers to be stored. For example: + // stp r19,r20,[sp,#-96]! 
+ // stp d8,d9,[sp,#16] + // ... save varargs incoming integer registers ... + // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be + // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate + // alignment). So, if there is an odd number of callee-saved registers, we use (for example, with just + // one saved register): + // sub sp,sp,#16 + // str r19,[sp,#8] + // This is one additional instruction, but it centralizes the aligned space. Otherwise, it might be + // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one + // above them. If that is preferable, we could implement it. + // + // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument + // registers. + // + // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment + // padding from above). Note that #remainingFrameSz must not be zero, since we still need to save FP,LR. + // + // Generate: + // sub sp,sp,#remainingFrameSz + // or, for large frames: + // mov rX, #remainingFrameSz // maybe multiple instructions + // sub sp,sp,rX + // + // followed by: + // stp fp,lr,[sp,#outsz] + // add fp,sp,#outsz + // + // However, we need to handle the case where #outsz is larger than the constant signed offset encoding + // can handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., + // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of + // the following sequences: + // + // Define #remainingFrameSz2 = #remainingFrameSz - #outsz.
+ // + // sub sp,sp,#remainingFrameSz2 // if #remainingFrameSz2 is 16-byte aligned + // stp fp,lr,[sp] + // mov fp,sp + // sub sp,sp,#outsz // in this case, #outsz must also be 16-byte aligned + // + // Or: + // + // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is + // // always guaranteed to be 8 byte aligned). + // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case + // add fp,sp,#8 + // sub sp,sp,#outsz - #8 + // + // (As usual, for a large constant "#outsz - #8", we might need multiple instructions: + // mov rX, #outsz - #8 // maybe multiple instructions + // sub sp,sp,rX + // ) + // + // Note that even if we align the SP alterations, that does not imply that we are creating empty alignment + // slots. In fact, we are not; any empty alignment slots were calculated in + // Compiler::lvaAssignFrameOffsets() and its callees. + + int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; + if (genSaveFpLrWithAllCalleeSavedRegisters) + { + JITDUMP("Frame type 5 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + // This case is much simpler, because we allocate space for the callee-saved register area, including + // FP/LR. Note the SP adjustment might be SUB or be folded into the first store as a predecrement. + // Then, we use a single SUB to establish the rest of the frame. We need to be careful about where + // to establish the frame pointer, as there is a limit of 2040 bytes offset from SP to FP in the + // unwind codes when FP is established. + frameType = 5; + } + else + { + JITDUMP("Frame type 3 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + + frameType = 3; + + calleeSaveSPDeltaUnaligned -= 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. 
+ + // We'll take care of these later, but callee-saved regs code shouldn't see them. + maskSaveRegsInt &= ~(RBM_FP | RBM_LR); + } + + assert(calleeSaveSPDeltaUnaligned >= 0); + assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. + calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); + + offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; + + JITDUMP(" calleeSaveSPDelta=%d, offset=%d\n", calleeSaveSPDelta, offset); + + // At most one alignment slot between SP and where we store the callee-saved registers. + assert((offset == 0) || (offset == REGSIZE_BYTES)); + } + } + else + { + // No frame pointer (no chaining). + assert((maskSaveRegsInt & RBM_FP) == 0); + assert((maskSaveRegsInt & RBM_LR) != 0); + + // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using + // 'stp' if we only have one callee-saved register plus LR to save. + + NYI("Frame without frame pointer"); + offset = 0; + } + + assert(frameType != 0); + + JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); + genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); + + offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; + + // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, + // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't + // need to add codes at all. + + if (compiler->info.compIsVarArgs) + { + JITDUMP(" compIsVarArgs=true\n"); + + // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. 
+ assert((offset % 16) == 0); + for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) + { + regNumber reg2 = REG_NEXT(reg1); + // stp REG, REG + 1, [SP, #offset] + GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, offset); + compiler->unwindNop(); + offset += 2 * REGSIZE_BYTES; + } + } + + // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) + bool establishFramePointer = true; + + // If we do establish the frame pointer, what is the amount we add to SP to do so? + unsigned offsetSpToSavedFp = 0; + + if (frameType == 1) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + assert(offsetSpToSavedFp == 0); + } + else if (frameType == 2) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; + } + else if (frameType == 3) + { + assert(!genSaveFpLrWithAllCalleeSavedRegisters); + + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. + + if (compiler->lvaOutgoingArgSpaceSize > 504) + { + // We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big. + // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. 
+ assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); + int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; + int spAdjustment2 = (int)roundUp((unsigned)spAdjustment2Unaligned, STACK_ALIGN); + int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; + assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); + + JITDUMP(" spAdjustment2=%d\n", spAdjustment2); + + genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed); + offset += spAdjustment2; + + // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" + // included some of it) + + int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; + assert(spAdjustment3 > 0); + assert((spAdjustment3 % 16) == 0); + + JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); + genEstablishFramePointer(alignmentAdjustment2, /* reportUnwindData */ true); + + // We just established the frame pointer chain; don't do it again. + establishFramePointer = false; + + JITDUMP(" spAdjustment3=%d\n", spAdjustment3); + + // We've already established the frame pointer, so no need to report the stack pointer change to unwind + // info. + genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false); + offset += spAdjustment3; + } + else + { + genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -remainingFrameSz, false, initReg, + pInitRegZeroed); + offset += remainingFrameSz; + + offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; + } + } + else if (frameType == 4) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? 
MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + } + else if (frameType == 5) + { + assert(genSaveFpLrWithAllCalleeSavedRegisters); + + offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - + 2 * REGSIZE_BYTES; // -2 for FP, LR + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + + // We just established the frame pointer chain; don't do it again. + establishFramePointer = false; + + int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; + assert(remainingFrameSz > 0); + assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- + // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. + + JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); + + // We've already established the frame pointer, so no need to report the stack pointer change to unwind info. + genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, /* reportUnwindData */ false); + offset += remainingFrameSz; + } + else + { + unreached(); + } + + if (establishFramePointer) + { + JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); + genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + } + + assert(offset == totalFrameSize); +#endif // TARGET_ARM64 +} + #endif // TARGET_ARMARCH diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 66ad2f340951f..75f038612b3c2 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4822,601 +4822,6 @@ void CodeGen::genCheckUseBlockInit() } } -/*----------------------------------------------------------------------------- - * - * Push any callee-saved registers we have used - */ - -#if defined(TARGET_ARM64) -void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed) -#else -void CodeGen::genPushCalleeSavedRegisters() -#endif 
-{ - assert(compiler->compGeneratingProlog); - -#if defined(TARGET_XARCH) - // x86/x64 doesn't support push of xmm/ymm regs, therefore consider only integer registers for pushing onto stack - // here. Space for float registers to be preserved is stack allocated and saved as part of prolog sequence and not - // here. - regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_INT_CALLEE_SAVED; -#else // !defined(TARGET_XARCH) - regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; -#endif - -#if ETW_EBP_FRAMED - if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) - { - noway_assert(!"Used register RBM_FPBASE as a scratch register!"); - } -#endif - -#ifdef TARGET_XARCH - // On X86/X64 we have already pushed the FP (frame-pointer) prior to calling this method - if (isFramePointerUsed()) - { - rsPushRegs &= ~RBM_FPBASE; - } -#endif - -#ifdef TARGET_ARMARCH - // On ARM we push the FP (frame-pointer) here along with all other callee saved registers - if (isFramePointerUsed()) - rsPushRegs |= RBM_FPBASE; - - // - // It may be possible to skip pushing/popping lr for leaf methods. However, such optimization would require - // changes in GC suspension architecture. - // - // We would need to guarantee that a tight loop calling a virtual leaf method can be suspended for GC. Today, we - // generate partially interruptible code for both the method that contains the tight loop with the call and the leaf - // method. GC suspension depends on return address hijacking in this case. Return address hijacking depends - // on the return address to be saved on the stack. If we skipped pushing/popping lr, the return address would never - // be saved on the stack and the GC suspension would time out. 
- // - // So if we wanted to skip pushing pushing/popping lr for leaf frames, we would also need to do one of - // the following to make GC suspension work in the above scenario: - // - Make return address hijacking work even when lr is not saved on the stack. - // - Generate fully interruptible code for loops that contains calls - // - Generate fully interruptible code for leaf methods - // - // Given the limited benefit from this optimization (<10k for CoreLib NGen image), the extra complexity - // is not worth it. - // - rsPushRegs |= RBM_LR; // We must save the return address (in the LR register) - - regSet.rsMaskCalleeSaved = rsPushRegs; -#endif // TARGET_ARMARCH - -#ifdef DEBUG - if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) - { - printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", - compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); - dspRegMask(rsPushRegs); - printf("\n"); - assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); - } -#endif // DEBUG - -#if defined(TARGET_ARM) - regMaskTP maskPushRegsFloat = rsPushRegs & RBM_ALLFLOAT; - regMaskTP maskPushRegsInt = rsPushRegs & ~maskPushRegsFloat; - - maskPushRegsInt |= genStackAllocRegisterMask(compiler->compLclFrameSize, maskPushRegsFloat); - - assert(FitsIn(maskPushRegsInt)); - inst_IV(INS_push, (int)maskPushRegsInt); - compiler->unwindPushMaskInt(maskPushRegsInt); - - if (maskPushRegsFloat != 0) - { - genPushFltRegs(maskPushRegsFloat); - compiler->unwindPushMaskFloat(maskPushRegsFloat); - } -#elif defined(TARGET_ARM64) - // See the document "ARM64 JIT Frame Layout" and/or "ARM64 Exception Data" for more details or requirements and - // options. Case numbers in comments here refer to this document. See also Compiler::lvaAssignFrameOffsets() - // for pictures of the general frame layouts, and CodeGen::genFuncletProlog() implementations (per architecture) - // for pictures of the funclet frame layouts. 
- // - // For most frames, generate, e.g.: - // stp fp, lr, [sp,-0x80]! // predecrement SP with full frame size, and store FP/LR pair. - // stp r19, r20, [sp, 0x60] // store at positive offset from SP established above, into callee-saved area - // // at top of frame (highest addresses). - // stp r21, r22, [sp, 0x70] - // - // Notes: - // 1. We don't always need to save FP. If FP isn't saved, then LR is saved with the other callee-saved registers - // at the top of the frame. - // 2. If we save FP, then the first store is FP, LR. - // 3. General-purpose registers are 8 bytes, floating-point registers are 16 bytes, but FP/SIMD registers only - // preserve their lower 8 bytes, by calling convention. - // 4. For frames with varargs, we spill the integer register arguments to the stack, so all the arguments are - // consecutive, and at the top of the frame. - // 5. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). - // - // For functions with GS and localloc, we change the frame so the frame pointer and LR are saved at the top - // of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same - // rule, and both main frame and funclet frames (if any) must put PSPSym in the same offset from Caller-SP. - // Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. - // - // The frames look like the following (simplified to only include components that matter for establishing the - // frames). See also Compiler::lvaAssignFrameOffsets(). 
- // - // Frames with FP, LR saved at bottom of frame (above outgoing argument space): - // - // | | - // |-----------------------| - // | incoming arguments | - // +=======================+ <---- Caller's SP - // | Varargs regs space | // Only for varargs functions; 64 bytes - // |-----------------------| - // |Callee saved registers | // not including FP/LR; multiple of 8 bytes - // |-----------------------| - // | PSP slot | // 8 bytes (omitted in CoreRT ABI) - // |-----------------------| - // | locals, temps, etc. | - // |-----------------------| - // | possible GS cookie | - // |-----------------------| - // | Saved LR | // 8 bytes - // |-----------------------| - // | Saved FP | // 8 bytes - // |-----------------------| - // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) - // |-----------------------| <---- Ambient SP - // | | | - // ~ | Stack grows ~ - // | | downward | - // V - // - // Frames with FP, LR saved at top of frame (below saved varargs incoming arguments): - // - // | | - // |-----------------------| - // | incoming arguments | - // +=======================+ <---- Caller's SP - // | Varargs regs space | // Only for varargs functions; 64 bytes - // |-----------------------| - // | Saved LR | // 8 bytes - // |-----------------------| - // | Saved FP | // 8 bytes - // |-----------------------| - // |Callee saved registers | // not including FP/LR; multiple of 8 bytes - // |-----------------------| - // | PSP slot | // 8 bytes (omitted in CoreRT ABI) - // |-----------------------| - // | locals, temps, etc. 
| - // |-----------------------| - // | possible GS cookie | - // |-----------------------| - // | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) - // |-----------------------| <---- Ambient SP - // | | | - // ~ | Stack grows ~ - // | | downward | - // V - // - - int totalFrameSize = genTotalFrameSize(); - - int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. - - regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; - -#ifdef DEBUG - if (verbose) - { - printf("Save float regs: "); - dspRegMask(maskSaveRegsFloat); - printf("\n"); - printf("Save int regs: "); - dspRegMask(maskSaveRegsInt); - printf("\n"); - } -#endif // DEBUG - - // The frameType number is arbitrary, is defined below, and corresponds to one of the frame styles we - // generate based on various sizes. - int frameType = 0; - - // The amount to subtract from SP before starting to store the callee-saved registers. It might be folded into the - // first save instruction as a "predecrement" amount, if possible. - int calleeSaveSPDelta = 0; - - if (isFramePointerUsed()) - { - // We need to save both FP and LR. - - assert((maskSaveRegsInt & RBM_FP) != 0); - assert((maskSaveRegsInt & RBM_LR) != 0); - - // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address - // (FP and LR) are protected from buffer overrun by the GS cookie. If FP/LR are at the lowest addresses, - // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will - // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our - // saved FP/LR. In that case, we save FP/LR along with the rest of the callee-saved registers, above - // the GS cookie. 
- // - // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to - // create a frame pointer chain. - // - // Do we need another frame pointer register to get good code quality in the case of having the frame pointer - // point high in the frame, so we can take advantage of arm64's preference for positive offsets? C++ native - // code dedicates callee-saved x19 to this, so generates: - // mov x19, sp - // in the prolog, then uses x19 for local var accesses. Given that this case is so rare, we currently do - // not do this. That means that negative offsets from FP might need to use the reserved register to form - // the local variable offset for an addressing mode. - - if (((compiler->lvaOutgoingArgSpaceSize == 0) && (totalFrameSize <= 504)) && - !genSaveFpLrWithAllCalleeSavedRegisters) - { - // Case #1. - // - // Generate: - // stp fp,lr,[sp,#-framesz]! - // - // The (totalFrameSize <= 504) condition ensures that both the pre-index STP instruction - // used in the prolog, and the post-index LDP instruction used in the epilog, can be generated. - // Note that STP and the unwind codes can handle -512, but LDP with a positive post-index value - // can only handle up to 504, and we want our prolog and epilog to match. - // - // After saving callee-saved registers, we establish the frame pointer with: - // mov fp,sp - // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. - - JITDUMP("Frame type 1. 
#outsz=0; #framesz=%d; LclFrameSize=%d\n", totalFrameSize, - compiler->compLclFrameSize); - - frameType = 1; - - assert(totalFrameSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); - - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, -totalFrameSize, - INS_OPTS_PRE_INDEX); - compiler->unwindSaveRegPairPreindexed(REG_FP, REG_LR, -totalFrameSize); - - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR - offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR - } - else if (totalFrameSize <= 512) - { - // Case #2. - // - // The (totalFrameSize <= 512) condition ensures the callee-saved registers can all be saved using STP - // with signed offset encoding. The maximum positive STP offset is 504, but when storing a pair of - // 8 byte registers, the largest actual offset we use would be 512 - 8 * 2 = 496. And STR with positive - // offset has a range 0 to 32760. - // - // After saving callee-saved registers, we establish the frame pointer with: - // add fp,sp,#outsz - // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. - - if (genSaveFpLrWithAllCalleeSavedRegisters) - { - JITDUMP("Frame type 4 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 4; - - // The frame will be allocated below, when the callee-saved registers are saved. This might mean a - // separate SUB instruction or the SP adjustment might be folded in to the first STP if there is - // no outgoing argument space AND no local frame space, that is, if the only thing the frame does - // is save callee-saved registers (and possibly varargs argument registers). - calleeSaveSPDelta = totalFrameSize; - - offset = (int)compiler->compLclFrameSize; - } - else - { - JITDUMP("Frame type 2 (save FP/LR at bottom). 
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 2; - - // Generate: - // sub sp,sp,#framesz - // stp fp,lr,[sp,#outsz] // note that by necessity, #outsz <= #framesz - 16, so #outsz <= 496. - - assert(totalFrameSize - compiler->lvaOutgoingArgSpaceSize <= STACK_PROBE_BOUNDARY_THRESHOLD_BYTES); - - GetEmitter()->emitIns_R_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); - compiler->unwindAllocStack(totalFrameSize); - - assert(compiler->lvaOutgoingArgSpaceSize + 2 * REGSIZE_BYTES <= (unsigned)totalFrameSize); - - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, - compiler->lvaOutgoingArgSpaceSize); - compiler->unwindSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize); - - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); // We've already saved FP/LR - offset = (int)compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // 2 for FP/LR - } - } - else - { - // Case 5 or 6. - // - // First, the callee-saved registers will be saved, and the callee-saved register code must use - // pre-index to subtract from SP as the first instruction. It must also leave space for varargs - // registers to be stored. For example: - // stp r19,r20,[sp,#-96]! - // stp d8,d9,[sp,#16] - // ... save varargs incoming integer registers ... - // Note that all SP alterations must be 16-byte aligned. We have already calculated any alignment to be - // lower on the stack than the callee-saved registers (see lvaAlignFrame() for how we calculate - // alignment). So, if there is an odd number of callee-saved registers, we use (for example, with just - // one saved register): - // sub sp,sp,#16 - // str r19,[sp,#8] - // This is one additional instruction, but it centralizes the aligned space. Otherwise, it might be - // possible to have two 8-byte alignment padding words, one below the callee-saved registers, and one - // above them. 
If that is preferable, we could implement it. - // - // Note that any varargs saved space will always be 16-byte aligned, since there are 8 argument - // registers. - // - // Then, define #remainingFrameSz = #framesz - (callee-saved size + varargs space + possible alignment - // padding from above). Note that #remainingFrameSz must not be zero, since we still need to save FP,SP. - // - // Generate: - // sub sp,sp,#remainingFrameSz - // or, for large frames: - // mov rX, #remainingFrameSz // maybe multiple instructions - // sub sp,sp,rX - // - // followed by: - // stp fp,lr,[sp,#outsz] - // add fp,sp,#outsz - // - // However, we need to handle the case where #outsz is larger than the constant signed offset encoding - // can handle. And, once again, we might need to deal with #outsz that is not aligned to 16-bytes (i.e., - // STACK_ALIGN). So, in the case of large #outsz we will have an additional SP adjustment, using one of - // the following sequences: - // - // Define #remainingFrameSz2 = #remainingFrameSz - #outsz. - // - // sub sp,sp,#remainingFrameSz2 // if #remainingFrameSz2 is 16-byte aligned - // stp fp,lr,[sp] - // mov fp,sp - // sub sp,sp,#outsz // in this case, #outsz must also be 16-byte aligned - // - // Or: - // - // sub sp,sp,roundUp(#remainingFrameSz2,16) // if #remainingFrameSz2 is not 16-byte aligned (it is - // // always guaranteed to be 8 byte aligned). - // stp fp,lr,[sp,#8] // it will always be #8 in the unaligned case - // add fp,sp,#8 - // sub sp,sp,#outsz - #8 - // - // (As usual, for a large constant "#outsz - #8", we might need multiple instructions: - // mov rX, #outsz - #8 // maybe multiple instructions - // sub sp,sp,rX - // ) - // - // Note that even if we align the SP alterations, that does not imply that we are creating empty alignment - // slots. In fact, we are not; any empty alignment slots were calculated in - // Compiler::lvaAssignFrameOffsets() and its callees. 
- - int calleeSaveSPDeltaUnaligned = totalFrameSize - compiler->compLclFrameSize; - if (genSaveFpLrWithAllCalleeSavedRegisters) - { - JITDUMP("Frame type 5 (save FP/LR at top). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - // This case is much simpler, because we allocate space for the callee-saved register area, including - // FP/LR. Note the SP adjustment might be SUB or be folded into the first store as a predecrement. - // Then, we use a single SUB to establish the rest of the frame. We need to be careful about where - // to establish the frame pointer, as there is a limit of 2040 bytes offset from SP to FP in the - // unwind codes when FP is established. - frameType = 5; - } - else - { - JITDUMP("Frame type 3 (save FP/LR at bottom). #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 3; - - calleeSaveSPDeltaUnaligned -= 2 * REGSIZE_BYTES; // 2 for FP, LR which we'll save later. - - // We'll take care of these later, but callee-saved regs code shouldn't see them. - maskSaveRegsInt &= ~(RBM_FP | RBM_LR); - } - - assert(calleeSaveSPDeltaUnaligned >= 0); - assert((calleeSaveSPDeltaUnaligned % 8) == 0); // It better at least be 8 byte aligned. - calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDeltaUnaligned, STACK_ALIGN); - - offset = calleeSaveSPDelta - calleeSaveSPDeltaUnaligned; - - JITDUMP(" calleeSaveSPDelta=%d, offset=%d\n", calleeSaveSPDelta, offset); - - // At most one alignment slot between SP and where we store the callee-saved registers. - assert((offset == 0) || (offset == REGSIZE_BYTES)); - } - } - else - { - // No frame pointer (no chaining). 
- assert((maskSaveRegsInt & RBM_FP) == 0); - assert((maskSaveRegsInt & RBM_LR) != 0); - - // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using - // 'stp' if we only have one callee-saved register plus LR to save. - - NYI("Frame without frame pointer"); - offset = 0; - } - - assert(frameType != 0); - - JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); - genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); - - offset += genCountBits(maskSaveRegsInt | maskSaveRegsFloat) * REGSIZE_BYTES; - - // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, - // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't - // need to add codes at all. - - if (compiler->info.compIsVarArgs) - { - JITDUMP(" compIsVarArgs=true\n"); - - // There are 8 general-purpose registers to home, thus 'offset' must be 16-byte aligned here. - assert((offset % 16) == 0); - for (regNumber reg1 = REG_ARG_FIRST; reg1 < REG_ARG_LAST; reg1 = REG_NEXT(REG_NEXT(reg1))) - { - regNumber reg2 = REG_NEXT(reg1); - // stp REG, REG + 1, [SP, #offset] - GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, offset); - compiler->unwindNop(); - offset += 2 * REGSIZE_BYTES; - } - } - - // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) - bool establishFramePointer = true; - - // If we do establish the frame pointer, what is the amount we add to SP to do so? 
- unsigned offsetSpToSavedFp = 0; - - if (frameType == 1) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - assert(offsetSpToSavedFp == 0); - } - else if (frameType == 2) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - - offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; - } - else if (frameType == 3) - { - assert(!genSaveFpLrWithAllCalleeSavedRegisters); - - int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; - assert(remainingFrameSz > 0); - assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- - // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. - - if (compiler->lvaOutgoingArgSpaceSize > 504) - { - // We can't do "stp fp,lr,[sp,#outsz]" because #outsz is too big. - // If compiler->lvaOutgoingArgSpaceSize is not aligned, we need to align the SP adjustment. - assert(remainingFrameSz > (int)compiler->lvaOutgoingArgSpaceSize); - int spAdjustment2Unaligned = remainingFrameSz - compiler->lvaOutgoingArgSpaceSize; - int spAdjustment2 = (int)roundUp((unsigned)spAdjustment2Unaligned, STACK_ALIGN); - int alignmentAdjustment2 = spAdjustment2 - spAdjustment2Unaligned; - assert((alignmentAdjustment2 == 0) || (alignmentAdjustment2 == 8)); - - JITDUMP(" spAdjustment2=%d\n", spAdjustment2); - - genPrologSaveRegPair(REG_FP, REG_LR, alignmentAdjustment2, -spAdjustment2, false, initReg, pInitRegZeroed); - offset += spAdjustment2; - - // Now subtract off the #outsz (or the rest of the #outsz if it was unaligned, and the above "sub" - // included some of it) - - int spAdjustment3 = compiler->lvaOutgoingArgSpaceSize - alignmentAdjustment2; - assert(spAdjustment3 > 0); - assert((spAdjustment3 % 16) == 0); - - JITDUMP(" alignmentAdjustment2=%d\n", alignmentAdjustment2); - genEstablishFramePointer(alignmentAdjustment2, /* reportUnwindData */ true); - - // We just established the frame pointer chain; don't do it again. 
- establishFramePointer = false; - - JITDUMP(" spAdjustment3=%d\n", spAdjustment3); - - // We've already established the frame pointer, so no need to report the stack pointer change to unwind - // info. - genStackPointerAdjustment(-spAdjustment3, initReg, pInitRegZeroed, /* reportUnwindData */ false); - offset += spAdjustment3; - } - else - { - genPrologSaveRegPair(REG_FP, REG_LR, compiler->lvaOutgoingArgSpaceSize, -remainingFrameSz, false, initReg, - pInitRegZeroed); - offset += remainingFrameSz; - - offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; - } - } - else if (frameType == 4) - { - assert(genSaveFpLrWithAllCalleeSavedRegisters); - offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - - 2 * REGSIZE_BYTES; // -2 for FP, LR - } - else if (frameType == 5) - { - assert(genSaveFpLrWithAllCalleeSavedRegisters); - - offsetSpToSavedFp = calleeSaveSPDelta - (compiler->info.compIsVarArgs ? MAX_REG_ARG * REGSIZE_BYTES : 0) - - 2 * REGSIZE_BYTES; // -2 for FP, LR - JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); - genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); - - // We just established the frame pointer chain; don't do it again. - establishFramePointer = false; - - int remainingFrameSz = totalFrameSize - calleeSaveSPDelta; - assert(remainingFrameSz > 0); - assert((remainingFrameSz % 16) == 0); // this is guaranteed to be 16-byte aligned because each component -- - // totalFrameSize and calleeSaveSPDelta -- is 16-byte aligned. - - JITDUMP(" remainingFrameSz=%d\n", remainingFrameSz); - - // We've already established the frame pointer, so no need to report the stack pointer change to unwind info. 
- genStackPointerAdjustment(-remainingFrameSz, initReg, pInitRegZeroed, /* reportUnwindData */ false); - offset += remainingFrameSz; - } - else - { - unreached(); - } - - if (establishFramePointer) - { - JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); - genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); - } - - assert(offset == totalFrameSize); - -#elif defined(TARGET_XARCH) - // Push backwards so we match the order we will pop them in the epilog - // and all the other code that expects it to be in this order. - for (regNumber reg = REG_INT_LAST; rsPushRegs != RBM_NONE; reg = REG_PREV(reg)) - { - regMaskTP regBit = genRegMask(reg); - - if ((regBit & rsPushRegs) != 0) - { - inst_RV(INS_push, reg, TYP_REF); - compiler->unwindPush(reg); -#ifdef USING_SCOPE_INFO - if (!doubleAlignOrFramePointerUsed()) - { - psiAdjustStackLevel(REGSIZE_BYTES); - } -#endif // USING_SCOPE_INFO - rsPushRegs &= ~regBit; - } - } - -#else - assert(!"Unknown TARGET"); -#endif // TARGET* -} - #if defined(TARGET_ARM) void CodeGen::genPushFltRegs(regMaskTP regMask) @@ -7227,10 +6632,8 @@ void CodeGen::genFinalizeFrame() #endif // TARGET_X86 #ifdef TARGET_ARM - // Make sure that callee-saved registers used by call to a stack probing helper generated for very large stack - // frames - // (see `getVeryLargeFrameSize`) are pushed on stack. - if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) + // Make sure that callee-saved registers used by call to a stack probing helper generated are pushed on stack. 
+ if (compiler->compLclFrameSize >= compiler->eeGetPageSize()) { regSet.rsSetRegsModified(RBM_STACK_PROBE_HELPER_ARG | RBM_STACK_PROBE_HELPER_CALL_TARGET | RBM_STACK_PROBE_HELPER_TRASH); @@ -7815,37 +7218,26 @@ void CodeGen::genFnProlog() } #endif // TARGET_ARM -#if defined(TARGET_XARCH) - if (compiler->compLclFrameSize >= compiler->getVeryLargeFrameSize()) + tempMask = initRegs & ~excludeMask & ~regSet.rsMaskResvd; + + if (tempMask != RBM_NONE) { - // We currently must use REG_EAX on x86 here - // because the loop's backwards branch depends upon the size of EAX encodings - assert(initReg == REG_EAX); + // We will use one of the registers that we were planning to zero init anyway. + // We pick the lowest register number. + tempMask = genFindLowestBit(tempMask); + initReg = genRegNumFromMask(tempMask); } + // Next we prefer to use one of the unused argument registers. + // If they aren't available we use one of the caller-saved integer registers. else -#endif // TARGET_XARCH { - tempMask = initRegs & ~excludeMask & ~regSet.rsMaskResvd; - + tempMask = regSet.rsGetModifiedRegsMask() & RBM_ALLINT & ~excludeMask & ~regSet.rsMaskResvd; if (tempMask != RBM_NONE) { - // We will use one of the registers that we were planning to zero init anyway. - // We pick the lowest register number. + // We pick the lowest register number tempMask = genFindLowestBit(tempMask); initReg = genRegNumFromMask(tempMask); } - // Next we prefer to use one of the unused argument registers. - // If they aren't available we use one of the caller-saved integer registers. 
- else - { - tempMask = regSet.rsGetModifiedRegsMask() & RBM_ALLINT & ~excludeMask & ~regSet.rsMaskResvd; - if (tempMask != RBM_NONE) - { - // We pick the lowest register number - tempMask = genFindLowestBit(tempMask); - initReg = genRegNumFromMask(tempMask); - } - } } noway_assert(!compiler->compMethodRequiresPInvokeFrame() || (initReg != REG_PINVOKE_FRAME)); @@ -7907,16 +7299,6 @@ void CodeGen::genFnProlog() #endif // TARGET_XARCH #ifdef TARGET_ARM64 - // Probe large frames now, if necessary, since genPushCalleeSavedRegisters() will allocate the frame. Note that - // for arm64, genAllocLclFrame only probes the frame; it does not actually allocate it (it does not change SP). - // For arm64, we are probing the frame before the callee-saved registers are saved. The 'initReg' might have - // been calculated to be one of the callee-saved registers (say, if all the integer argument registers are - // in use, and perhaps with other conditions being satisfied). This is ok in other cases, after the callee-saved - // registers have been saved. So instead of letting genAllocLclFrame use initReg as a temporary register, - // always use REG_SCRATCH. We don't care if it trashes it, so ignore the initRegZeroed output argument. - bool ignoreInitRegZeroed = false; - genAllocLclFrame(compiler->compLclFrameSize, REG_SCRATCH, &ignoreInitRegZeroed, - intRegState.rsCalleeRegArgMaskLiveIn); genPushCalleeSavedRegisters(initReg, &initRegZeroed); #else // !TARGET_ARM64 genPushCalleeSavedRegisters(); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index b4f4c08070bd5..f1f08a68e855e 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -8950,4 +8950,61 @@ void CodeGen::genProfilingLeaveCallback(unsigned helper) #endif // PROFILING_SUPPORTED +//------------------------------------------------------------------------ +// genPushCalleeSavedRegisters: Push any callee-saved registers we have used. 
+// +void CodeGen::genPushCalleeSavedRegisters() +{ + assert(compiler->compGeneratingProlog); + + // x86/x64 doesn't support push of xmm/ymm regs, therefore consider only integer registers for pushing onto stack + // here. Space for float registers to be preserved is stack allocated and saved as part of prolog sequence and not + // here. + regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_INT_CALLEE_SAVED; + +#if ETW_EBP_FRAMED + if (!isFramePointerUsed() && regSet.rsRegsModified(RBM_FPBASE)) + { + noway_assert(!"Used register RBM_FPBASE as a scratch register!"); + } +#endif + + // On X86/X64 we have already pushed the FP (frame-pointer) prior to calling this method + if (isFramePointerUsed()) + { + rsPushRegs &= ~RBM_FPBASE; + } + +#ifdef DEBUG + if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) + { + printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", + compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); + dspRegMask(rsPushRegs); + printf("\n"); + assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); + } +#endif // DEBUG + + // Push backwards so we match the order we will pop them in the epilog + // and all the other code that expects it to be in this order. 
+ for (regNumber reg = REG_INT_LAST; rsPushRegs != RBM_NONE; reg = REG_PREV(reg)) + { + regMaskTP regBit = genRegMask(reg); + + if ((regBit & rsPushRegs) != 0) + { + inst_RV(INS_push, reg, TYP_REF); + compiler->unwindPush(reg); +#ifdef USING_SCOPE_INFO + if (!doubleAlignOrFramePointerUsed()) + { + psiAdjustStackLevel(REGSIZE_BYTES); + } +#endif // USING_SCOPE_INFO + rsPushRegs &= ~regBit; + } + } +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 1f9cbb6bfd431..4a40dcb0d9734 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7510,19 +7510,6 @@ class Compiler return (target_size_t)eeGetEEInfo()->osPageSize; } - // Returns the frame size at which we will generate a loop to probe the stack. - target_size_t getVeryLargeFrameSize() - { -#ifdef TARGET_ARM - // The looping probe code is 40 bytes, whereas the straight-line probing for - // the (0x2000..0x3000) case is 44, so use looping for anything 0x2000 bytes - // or greater, to generate smaller code. - return 2 * eeGetPageSize(); -#else - return 3 * eeGetPageSize(); -#endif - } - //------------------------------------------------------------------------ // VirtualStubParam: virtual stub dispatch extra parameter (slot address). // diff --git a/src/coreclr/vm/amd64/JitHelpers_Fast.asm b/src/coreclr/vm/amd64/JitHelpers_Fast.asm index 28631e9f7b24f..82a301bb0cbd1 100644 --- a/src/coreclr/vm/amd64/JitHelpers_Fast.asm +++ b/src/coreclr/vm/amd64/JitHelpers_Fast.asm @@ -395,7 +395,7 @@ LEAF_END_MARKED JIT_ByRefWriteBarrier, _TEXT ; ; NOTE: this helper will NOT modify a value of rsp and can be defined as a leaf function. -PAGE_SIZE equ 1000h +PROBE_PAGE_SIZE equ 1000h LEAF_ENTRY JIT_StackProbe, _TEXT ; On entry: @@ -408,11 +408,11 @@ LEAF_ENTRY JIT_StackProbe, _TEXT ; NOTE: this helper will probe at least one page below the one pointed by rsp. 
mov rax, rsp ; rax points to some byte on the last probed page - and rax, -PAGE_SIZE ; rax points to the **lowest address** on the last probed page + and rax, -PROBE_PAGE_SIZE ; rax points to the **lowest address** on the last probed page ; This is done to make the following loop end condition simpler. ProbeLoop: - sub rax, PAGE_SIZE ; rax points to the lowest address of the **next page** to probe + sub rax, PROBE_PAGE_SIZE ; rax points to the lowest address of the **next page** to probe test dword ptr [rax], eax ; rax points to the lowest address on the **last probed** page cmp rax, r11 jg ProbeLoop ; If (rax > r11), then we need to probe at least one more page. diff --git a/src/coreclr/vm/amd64/jithelpers_fast.S b/src/coreclr/vm/amd64/jithelpers_fast.S index a1788a37afeaf..a13afb4878511 100644 --- a/src/coreclr/vm/amd64/jithelpers_fast.S +++ b/src/coreclr/vm/amd64/jithelpers_fast.S @@ -425,7 +425,7 @@ LEAF_END JIT_WriteBarrier_Callable, _TEXT // // See also https://github.com/dotnet/runtime/issues/9899#issue-303331518 for more information. -#define PAGE_SIZE 0x1000 +#define PROBE_PAGE_SIZE 0x1000 LEAF_ENTRY JIT_StackProbe, _TEXT // On entry: @@ -442,11 +442,11 @@ LEAF_ENTRY JIT_StackProbe, _TEXT END_PROLOGUE - and rsp, -PAGE_SIZE // rsp points to the **lowest address** on the last probed page + and rsp, -PROBE_PAGE_SIZE // rsp points to the **lowest address** on the last probed page // This is done to make the following loop end condition simpler. LOCAL_LABEL(ProbeLoop): - sub rsp, PAGE_SIZE // rsp points to the lowest address of the **next page** to probe + sub rsp, PROBE_PAGE_SIZE // rsp points to the lowest address of the **next page** to probe test dword ptr [rsp], eax // rsp points to the lowest address on the **last probed** page cmp rsp, r11 jg LOCAL_LABEL(ProbeLoop) // if (rsp > r11), then we need to probe at least one more page. 
diff --git a/src/coreclr/vm/arm/asmhelpers.S b/src/coreclr/vm/arm/asmhelpers.S index dcdfda4df350d..930395b56dc7e 100644 --- a/src/coreclr/vm/arm/asmhelpers.S +++ b/src/coreclr/vm/arm/asmhelpers.S @@ -1106,7 +1106,7 @@ DelayLoad_Helper\suffix: // The following helper will access ("probe") a word on each page of the stack // starting with the page right beneath sp down to the one pointed to by r4. // The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. -// The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. +// The call to the helper will be emitted by the JIT in the function/funclet prolog when the stack frame is larger than an OS page. // On entry: // r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) // sp - points to some byte on the last probed page @@ -1115,23 +1115,23 @@ DelayLoad_Helper\suffix: // r5 - is not preserved // // NOTE: this helper will probe at least one page below the one pointed to by sp. -#define PAGE_SIZE 0x1000 -#define PAGE_SIZE_LOG2 12 +#define PROBE_PAGE_SIZE 4096 +#define PROBE_PAGE_SIZE_LOG2 12 LEAF_ENTRY JIT_StackProbe, _TEXT PROLOG_PUSH "{r7}" PROLOG_STACK_SAVE r7 - mov r5, sp // r5 points to some byte on the last probed page - bfc r5, #0, #PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page + mov r5, sp // r5 points to some byte on the last probed page + bfc r5, #0, #PROBE_PAGE_SIZE_LOG2 // r5 points to the **lowest address** on the last probed page mov sp, r5 ProbeLoop: - // Immediate operand for the following instruction can not be greater than 4095. - sub sp, #(PAGE_SIZE - 4) // sp points to the **fourth** byte on the **next page** to probe - ldr r5, [sp, #-4]! // sp points to the lowest address on the **last probed** page + // Immediate operand for the following instruction can not be greater than 4095, so the PROBE_PAGE_SIZE step is split: (PROBE_PAGE_SIZE - 4) here and 4 more in the pre-indexed load below.
+ sub sp, #(PROBE_PAGE_SIZE - 4) // sp points 4 bytes above the lowest address of the **next page** to probe + ldr r5, [sp, #-4]! // sp points to the lowest address on the **last probed** page cmp sp, r4 - bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page. + bhi ProbeLoop // If (sp > r4), then we need to probe at least one more page. EPILOG_STACK_RESTORE r7 EPILOG_POP "{r7}" diff --git a/src/coreclr/vm/arm/asmhelpers.asm b/src/coreclr/vm/arm/asmhelpers.asm index 1565c13d3d456..d20540e62090e 100644 --- a/src/coreclr/vm/arm/asmhelpers.asm +++ b/src/coreclr/vm/arm/asmhelpers.asm @@ -1835,7 +1835,7 @@ $__RealName ;; The following helper will access ("probe") a word on each page of the stack ;; starting with the page right beneath sp down to the one pointed to by r4. ;; The procedure is needed to make sure that the "guard" page is pushed down below the allocated stack frame. -;; The call to the helper will be emitted by JIT in the function/funclet prolog when large (larger than 0x3000 bytes) stack frame is required. +;; The call to the helper will be emitted by the JIT in the function/funclet prolog when the stack frame is larger than an OS page. ;;----------------------------------------------------------------------------- ; On entry: ; r4 - points to the lowest address on the stack frame being allocated (i.e. [InitialSp - FrameSize]) @@ -1845,21 +1845,23 @@ $__RealName ; r5 - is not preserved ; ; NOTE: this helper will probe at least one page below the one pointed to by sp.
-#define PAGE_SIZE_LOG2 12 +#define PROBE_PAGE_SIZE 4096 +#define PROBE_PAGE_SIZE_LOG2 12 + LEAF_ENTRY JIT_StackProbe PROLOG_PUSH {r7} PROLOG_STACK_SAVE r7 - mov r5, sp ; r5 points to some byte on the last probed page - bfc r5, #0, #PAGE_SIZE_LOG2 ; r5 points to the **lowest address** on the last probed page + mov r5, sp ; r5 points to some byte on the last probed page + bfc r5, #0, #PROBE_PAGE_SIZE_LOG2 ; r5 points to the **lowest address** on the last probed page mov sp, r5 ProbeLoop - ; Immediate operand for the following instruction can not be greater than 4095. - sub sp, #(PAGE_SIZE - 4) ; sp points to the **fourth** byte on the **next page** to probe - ldr r5, [sp, #-4]! ; sp points to the lowest address on the **last probed** page + ; Immediate operand for the following instruction can not be greater than 4095, so the PROBE_PAGE_SIZE step is split: (PROBE_PAGE_SIZE - 4) here and 4 more in the pre-indexed load below. + sub sp, #(PROBE_PAGE_SIZE - 4) ; sp points 4 bytes above the lowest address of the **next page** to probe + ldr r5, [sp, #-4]! ; sp points to the lowest address on the **last probed** page cmp sp, r4 - bhi ProbeLoop ; if (sp > r4), then we need to probe at least one more page. + bhi ProbeLoop ; if (sp > r4), then we need to probe at least one more page. EPILOG_STACK_RESTORE r7 EPILOG_POP {r7} diff --git a/src/coreclr/vm/i386/jithelp.S b/src/coreclr/vm/i386/jithelp.S index 435b22ba4f244..facce7cacd3ef 100644 --- a/src/coreclr/vm/i386/jithelp.S +++ b/src/coreclr/vm/i386/jithelp.S @@ -618,7 +618,7 @@ LEAF_END JIT_Dbl2IntSSE2, _TEXT // NOTE: this helper will modify a value of esp and must establish the frame pointer. // NOTE: On Linux we must advance the stack pointer as we probe - it is not allowed to access 65535 bytes below esp. // -#define PAGE_SIZE 0x1000 +#define PROBE_PAGE_SIZE 0x1000 NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler // On entry: // eax - the lowest address of the stack frame being allocated (i.e.
[InitialSp - FrameSize]) @@ -627,11 +627,11 @@ NESTED_ENTRY JIT_StackProbe, _TEXT, NoHandler PROLOG_BEG PROLOG_END - and esp, -PAGE_SIZE // esp points to the **lowest address** on the last probed page + and esp, -PROBE_PAGE_SIZE // esp points to the **lowest address** on the last probed page // This is done to make the loop end condition simpler. LOCAL_LABEL(ProbeLoop): - sub esp, PAGE_SIZE // esp points to the lowest address of the **next page** to probe + sub esp, PROBE_PAGE_SIZE // esp points to the lowest address of the **next page** to probe test [esp], eax // esp points to the lowest address on the **last probed** page cmp esp, eax jg LOCAL_LABEL(ProbeLoop) // if esp > eax, then we need to probe at least one more page. diff --git a/src/coreclr/vm/i386/jithelp.asm b/src/coreclr/vm/i386/jithelp.asm index 0d013a6724bff..3743ac3cbe02f 100644 --- a/src/coreclr/vm/i386/jithelp.asm +++ b/src/coreclr/vm/i386/jithelp.asm @@ -1313,7 +1313,7 @@ JIT_EndCatch ENDP ; The call to the helper will be emitted by JIT in the function prolog when large (larger than 0x3000 bytes) stack frame is required. ; ; NOTE: this helper will modify a value of esp and must establish the frame pointer. -PAGE_SIZE equ 1000h +PROBE_PAGE_SIZE equ 1000h _JIT_StackProbe@0 PROC public ; On entry: @@ -1323,13 +1323,13 @@ _JIT_StackProbe@0 PROC public push ebp mov ebp, esp - and esp, -PAGE_SIZE ; esp points to the **lowest address** on the last probed page - ; This is done to make the loop end condition simpler. + and esp, -PROBE_PAGE_SIZE ; esp points to the **lowest address** on the last probed page + ; This is done to make the loop end condition simpler.
ProbeLoop: - test [esp - 4], eax ; esp points to the lowest address on the **last probed** page - sub esp, PAGE_SIZE ; esp points to the lowest address of the **next page** to probe + test [esp - 4], eax ; probes the page right beneath esp; esp still points to the lowest address on the **last probed** page + sub esp, PROBE_PAGE_SIZE ; esp points to the lowest address of the page that was just probed cmp esp, eax - jg ProbeLoop ; if esp > eax, then we need to probe at least one more page. + jg ProbeLoop ; if esp > eax, then we need to probe at least one more page. mov esp, ebp pop ebp