Skip to content

Commit

Permalink
Emit armv8.4 ldapur* for volatile loads with contained offsets (#89681)
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorBo committed Aug 31, 2023
1 parent ff2de36 commit f0463a9
Show file tree
Hide file tree
Showing 20 changed files with 162 additions and 44 deletions.
1 change: 1 addition & 0 deletions src/coreclr/inc/clrconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rdm, W("EnableArm64Rd
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha1, W("EnableArm64Sha1"), 1, "Allows Arm64 Sha1+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha256, W("EnableArm64Sha256"), 1, "Allows Arm64 Sha256+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc, W("EnableArm64Rcpc"), 1, "Allows Arm64 Rcpc+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc2, W("EnableArm64Rcpc2"), 1, "Allows Arm64 Rcpc2+ hardware intrinsics to be disabled")
#endif

///
Expand Down
20 changes: 12 additions & 8 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@ enum CORINFO_InstructionSet
InstructionSet_Dczva=12,
InstructionSet_Rcpc=13,
InstructionSet_VectorT128=14,
InstructionSet_ArmBase_Arm64=15,
InstructionSet_AdvSimd_Arm64=16,
InstructionSet_Aes_Arm64=17,
InstructionSet_Crc32_Arm64=18,
InstructionSet_Dp_Arm64=19,
InstructionSet_Rdm_Arm64=20,
InstructionSet_Sha1_Arm64=21,
InstructionSet_Sha256_Arm64=22,
InstructionSet_Rcpc2=15,
InstructionSet_ArmBase_Arm64=16,
InstructionSet_AdvSimd_Arm64=17,
InstructionSet_Aes_Arm64=18,
InstructionSet_Crc32_Arm64=19,
InstructionSet_Dp_Arm64=20,
InstructionSet_Rdm_Arm64=21,
InstructionSet_Sha1_Arm64=22,
InstructionSet_Sha256_Arm64=23,
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
InstructionSet_X86Base=1,
Expand Down Expand Up @@ -761,6 +762,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "Rcpc";
case InstructionSet_VectorT128 :
return "VectorT128";
case InstructionSet_Rcpc2 :
return "Rcpc2";
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
case InstructionSet_X86Base :
Expand Down Expand Up @@ -994,6 +997,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Atomics: return InstructionSet_Atomics;
case READYTORUN_INSTRUCTION_Rcpc: return InstructionSet_Rcpc;
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_Rcpc2: return InstructionSet_Rcpc2;
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* 5bf301d6-d08e-4c74-ab9b-1d9c1975950f */
0x5bf301d6,
0xd08e,
0x4c74,
{0xab, 0x9b, 0x1d, 0x9c, 0x19, 0x75, 0x95, 0x0f}
constexpr GUID JITEEVersionIdentifier = { /* a2974440-e8ee-4d95-9e6e-799a330be1a0 */
0xa2974440,
0xe8ee,
0x4d95,
{0x9e, 0x6e, 0x79, 0x9a, 0x33, 0x0b, 0xe1, 0xa0}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_VectorT128=39,
READYTORUN_INSTRUCTION_VectorT256=40,
READYTORUN_INSTRUCTION_VectorT512=41,
READYTORUN_INSTRUCTION_Rcpc2=42,

};

Expand Down
29 changes: 27 additions & 2 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1761,10 +1761,35 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
bool addrIsInReg = tree->Addr()->isUsedFromReg();
bool addrIsAligned = ((tree->gtFlags & GTF_IND_UNALIGNED) == 0);

// on arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
// On arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
// full memory barriers if mixed with STLR
bool hasRcpc = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc);

// On arm64-v8.4+ we can use ldapur* instructions with acquire/release semantics to
// avoid full memory barriers if address is contained and unscaled
bool hasRcpc2 = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc2);

bool handledWithLdapur = false;
if (hasRcpc2 && !addrIsInReg && tree->Addr()->OperIs(GT_LEA) && !tree->HasIndex() && (tree->Scale() == 1) &&
emitter::emitIns_valid_imm_for_unscaled_ldst_offset(tree->Offset()))
{
if (ins == INS_ldrb)
{
ins = INS_ldapurb;
handledWithLdapur = true;
}
else if (ins == INS_ldrh)
{
ins = INS_ldapurh;
handledWithLdapur = true;
}
else if (ins == INS_ldr)
{
ins = INS_ldapur;
handledWithLdapur = true;
}
}

if ((ins == INS_ldrb) && addrIsInReg)
{
ins = hasRcpc ? INS_ldaprb : INS_ldarb;
Expand All @@ -1777,7 +1802,7 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
{
ins = hasRcpc ? INS_ldapr : INS_ldar;
}
else
else if (!handledWithLdapur)
#endif // TARGET_ARM64
{
emitBarrier = true;
Expand Down
18 changes: 11 additions & 7 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,8 +628,8 @@ class emitter
#define MAX_ENCODED_SIZE 15
#elif defined(TARGET_ARM64)
#define INSTR_ENCODED_SIZE 4
static_assert_no_msg(INS_count <= 512);
instruction _idIns : 9;
static_assert_no_msg(INS_count <= 1024);
instruction _idIns : 10;
#elif defined(TARGET_LOONGARCH64)
// TODO-LoongArch64: not include SIMD-vector.
static_assert_no_msg(INS_count <= 512);
Expand Down Expand Up @@ -712,7 +712,7 @@ class emitter
// x86: 17 bits
// amd64: 17 bits
// arm: 16 bits
// arm64: 17 bits
// arm64: 18 bits
// loongarch64: 14 bits
// risc-v: 14 bits

Expand Down Expand Up @@ -754,7 +754,7 @@ class emitter
// x86: 38 bits
// amd64: 38 bits
// arm: 32 bits
// arm64: 31 bits
// arm64: 32 bits
// loongarch64: 28 bits
// risc-v: 28 bits

Expand All @@ -763,10 +763,12 @@ class emitter
unsigned _idLargeDsp : 1; // does a large displacement follow?
unsigned _idLargeCall : 1; // large call descriptor used

unsigned _idBound : 1; // jump target / frame offset bound
unsigned _idBound : 1; // jump target / frame offset bound
#ifndef TARGET_ARMARCH
unsigned _idCallRegPtr : 1; // IL indirect calls: addr in reg
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
#endif
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
#if defined(TARGET_XARCH)
unsigned _idEvexbContext : 1; // does EVEX.b need to be set.
#endif // TARGET_XARCH
Expand Down Expand Up @@ -1509,6 +1511,7 @@ class emitter
_idBound = 1;
}

#ifndef TARGET_ARMARCH
bool idIsCallRegPtr() const
{
return _idCallRegPtr != 0;
Expand All @@ -1517,6 +1520,7 @@ class emitter
{
_idCallRegPtr = 1;
}
#endif

// Only call instructions that call helper functions may be marked as "IsNoGC", indicating
// that a thread executing such a call cannot be stopped for GC. Thus, in partially-interruptible
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/emitarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4770,8 +4770,6 @@ void emitter::emitIns_Call(EmitCallType callType,
{
/* This is an indirect call (either a virtual call or func ptr call) */

id->idSetIsCallRegPtr();

if (isJump)
{
ins = INS_bx; // INS_bx Reg
Expand Down
26 changes: 24 additions & 2 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1157,7 +1157,9 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_ldrb:
case INS_strb:
case INS_ldurb:
case INS_ldapurb:
case INS_sturb:
case INS_stlurb:
result = EA_4BYTE;
break;

Expand All @@ -1172,6 +1174,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_strh:
case INS_ldurh:
case INS_sturh:
case INS_ldapurh:
case INS_stlurh:
result = EA_4BYTE;
break;

Expand Down Expand Up @@ -1209,6 +1213,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_str:
case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
result = id->idOpSize();
break;

Expand Down Expand Up @@ -1237,7 +1243,9 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_ldrb:
case INS_strb:
case INS_ldurb:
case INS_ldapurb:
case INS_sturb:
case INS_stlurb:
case INS_ldrsb:
case INS_ldursb:
result = EA_1BYTE;
Expand All @@ -1252,6 +1260,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_sturh:
case INS_ldrsh:
case INS_ldursh:
case INS_ldapurh:
case INS_stlurh:
result = EA_2BYTE;
break;

Expand All @@ -1275,6 +1285,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_str:
case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
result = id->idOpSize();
break;

Expand Down Expand Up @@ -2372,6 +2384,12 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
return false;
}

// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction
/*static*/ bool emitter::emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm)
{
return (imm >= -256) && (imm <= 255);
}

// true if this 'imm' can be encoded as the offset in a ldr/str instruction
/*static*/ bool emitter::emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr attr)
{
Expand Down Expand Up @@ -5505,6 +5523,8 @@ void emitter::emitIns_R_R_I(
isLdSt = true;
break;

case INS_ldapurb:
case INS_stlurb:
case INS_ldurb:
case INS_sturb:
// size is ignored
Expand All @@ -5522,7 +5542,9 @@ void emitter::emitIns_R_R_I(
break;

case INS_ldurh:
case INS_ldapurh:
case INS_sturh:
case INS_stlurh:
// size is ignored
unscaledOp = true;
scale = 0;
Expand Down Expand Up @@ -5550,6 +5572,8 @@ void emitter::emitIns_R_R_I(

case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
// Is the target a vector register?
if (isVectorRegister(reg1))
{
Expand Down Expand Up @@ -8813,8 +8837,6 @@ void emitter::emitIns_Call(EmitCallType callType,
{
/* This is an indirect call (either a virtual call or func ptr call) */

id->idSetIsCallRegPtr();

if (isJump)
{
ins = INS_br_tail; // INS_br_tail Reg
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/emitarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,9 @@ static bool emitIns_valid_imm_for_alu(INT64 imm, emitAttr size);
// true if this 'imm' can be encoded as the offset in a ldr/str instruction
static bool emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr size);

// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction
static bool emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm);

// true if this 'imm' can be encoded as a input operand to a ccmp instruction
static bool emitIns_valid_imm_for_ccmp(INT64 imm);

Expand Down
26 changes: 21 additions & 5 deletions src/coreclr/jit/instrsarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1053,17 +1053,15 @@ INST1(ldarb, "ldarb", LD, IF_LS_2A, 0x08DFFC00)
INST1(ldarh, "ldarh", LD, IF_LS_2A, 0x48DFFC00)
// ldarh Rt,[Xn] LS_2A 0100100011011111 111111nnnnnttttt 48DF FC00


INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
// ldapr Rt,[Xn] LS_2A 1X11100010111111 110000nnnnnttttt B8BF C000 Rm Rt Rn ARMv8.3 LRCPC

INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
// ldaprb Rt,[Xn] LS_2A 0011100010111111 110000nnnnnttttt 38BF C000 Rm Rt Rn ARMv8.3 LRCPC

INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
// ldaprh Rt,[Xn] LS_2A 0111100010111111 110000nnnnnttttt 78BF C000 Rm Rt Rn ARMv8.3 LRCPC


INST1(ldxr, "ldxr", LD, IF_LS_2A, 0x885F7C00)
// ldxr Rt,[Xn] LS_2A 1X00100001011111 011111nnnnnttttt 885F 7C00

Expand Down Expand Up @@ -1100,6 +1098,15 @@ INST1(ldursh, "ldursh", LD, IF_LS_2C, 0x78800000)
INST1(ldursw, "ldursw", LD, IF_LS_2C, 0xB8800000)
// ldursw Rt,[Xn+simm9] LS_2C 10111000100iiiii iiii00nnnnnttttt B880 0000 [Xn imm(-256..+255)]

INST1(ldapur, "ldapur", LD, IF_LS_2C, 0x99400000)
// ldapur Rt,[Xn+simm9] LS_2C 1X011001010iiiii iiii00nnnnnttttt 9940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(ldapurb, "ldapurb", LD, IF_LS_2C, 0x19400000)
// ldapurb Rt,[Xn+simm9] LS_2C 00011001010iiiii iiii00nnnnnttttt 1940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(ldapurh, "ldapurh", LD, IF_LS_2C, 0x59400000)
// ldapurh Rt,[Xn+simm9] LS_2C 01011001010iiiii iiii00nnnnnttttt 5940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlr, "stlr", ST, IF_LS_2A, 0x889FFC00)
// stlr Rt,[Xn] LS_2A 1X00100010011111 111111nnnnnttttt 889F FC00

Expand Down Expand Up @@ -1136,6 +1143,15 @@ INST1(sturb, "sturb", ST, IF_LS_2C, 0x38000000)
INST1(sturh, "sturh", ST, IF_LS_2C, 0x78000000)
// sturh Rt,[Xn+simm9] LS_2C 01111000000iiiii iiii00nnnnnttttt 7800 0000 [Xn imm(-256..+255)]

INST1(stlur, "stlur", ST, IF_LS_2C, 0x99000000)
// stlur Rt,[Xn+simm9] LS_2C 1X011001000iiiii iiii00nnnnnttttt 9900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlurb, "stlurb", ST, IF_LS_2C, 0x19000000)
// stlurb Rt,[Xn+simm9] LS_2C 00011001000iiiii iiii00nnnnnttttt 1900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlurh, "stlurh", ST, IF_LS_2C, 0x59000000)
// stlurh Rt,[Xn+simm9] LS_2C 01011001000iiiii iiii00nnnnnttttt 5900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(casb, "casb", LD|ST, IF_LS_3E, 0x08A07C00)
// casb Wm, Wt, [Xn] LS_3E 00001000101mmmmm 011111nnnnnttttt 08A0 7C00 Rm Rt Rn ARMv8.1 LSE Atomics

Expand Down
17 changes: 16 additions & 1 deletion src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6033,7 +6033,8 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par
}

#ifdef TARGET_ARM64
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile())
const bool hasRcpc2 = comp->compOpportunisticallyDependsOn(InstructionSet_Rcpc2);
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile() && !hasRcpc2)
{
// For Arm64 we avoid using LEA for volatile INDs
// because we won't be able to use ldar/star
Expand All @@ -6056,6 +6057,20 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par
&scale, // scaling
&offset); // displacement

#ifdef TARGET_ARM64
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile())
{
// Generally, we try to avoid creating addressing modes for volatile INDs so we can then use
// ldar/stlr instead of ldr/str + dmb. Although, with Arm 8.4+'s RCPC2 we can handle unscaled
// addressing modes (if the offset fits into 9 bits)
assert(hasRcpc2);
if ((scale > 1) || (!emitter::emitIns_valid_imm_for_unscaled_ldst_offset(offset)) || (index != nullptr))
{
return false;
}
}
#endif

var_types targetType = parent->OperIsIndir() ? parent->TypeGet() : TYP_UNDEF;

#ifdef TARGET_ARMARCH
Expand Down
Loading

0 comments on commit f0463a9

Please sign in to comment.