Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit armv8.4 ldapur* for volatile loads with contained offsets #89681

Merged
merged 22 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/coreclr/inc/clrconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rdm, W("EnableArm64Rd
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha1, W("EnableArm64Sha1"), 1, "Allows Arm64 Sha1+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha256, W("EnableArm64Sha256"), 1, "Allows Arm64 Sha256+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc, W("EnableArm64Rcpc"), 1, "Allows Arm64 Rcpc+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc2, W("EnableArm64Rcpc2"), 1, "Allows Arm64 Rcpc2+ hardware intrinsics to be disabled")
#endif

///
Expand Down
20 changes: 12 additions & 8 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,15 @@ enum CORINFO_InstructionSet
InstructionSet_Dczva=12,
InstructionSet_Rcpc=13,
InstructionSet_VectorT128=14,
InstructionSet_ArmBase_Arm64=15,
InstructionSet_AdvSimd_Arm64=16,
InstructionSet_Aes_Arm64=17,
InstructionSet_Crc32_Arm64=18,
InstructionSet_Dp_Arm64=19,
InstructionSet_Rdm_Arm64=20,
InstructionSet_Sha1_Arm64=21,
InstructionSet_Sha256_Arm64=22,
InstructionSet_Rcpc2=15,
InstructionSet_ArmBase_Arm64=16,
InstructionSet_AdvSimd_Arm64=17,
InstructionSet_Aes_Arm64=18,
InstructionSet_Crc32_Arm64=19,
InstructionSet_Dp_Arm64=20,
InstructionSet_Rdm_Arm64=21,
InstructionSet_Sha1_Arm64=22,
InstructionSet_Sha256_Arm64=23,
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
InstructionSet_X86Base=1,
Expand Down Expand Up @@ -761,6 +762,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "Rcpc";
case InstructionSet_VectorT128 :
return "VectorT128";
case InstructionSet_Rcpc2 :
return "Rcpc2";
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
case InstructionSet_X86Base :
Expand Down Expand Up @@ -994,6 +997,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Atomics: return InstructionSet_Atomics;
case READYTORUN_INSTRUCTION_Rcpc: return InstructionSet_Rcpc;
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_Rcpc2: return InstructionSet_Rcpc2;
#endif // TARGET_ARM64
#ifdef TARGET_AMD64
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* 5bf301d6-d08e-4c74-ab9b-1d9c1975950f */
0x5bf301d6,
0xd08e,
0x4c74,
{0xab, 0x9b, 0x1d, 0x9c, 0x19, 0x75, 0x95, 0x0f}
constexpr GUID JITEEVersionIdentifier = { /* a2974440-e8ee-4d95-9e6e-799a330be1a0 */
0xa2974440,
0xe8ee,
0x4d95,
{0x9e, 0x6e, 0x79, 0x9a, 0x33, 0x0b, 0xe1, 0xa0}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_VectorT128=39,
READYTORUN_INSTRUCTION_VectorT256=40,
READYTORUN_INSTRUCTION_VectorT512=41,
READYTORUN_INSTRUCTION_Rcpc2=42,

};

Expand Down
29 changes: 27 additions & 2 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1761,10 +1761,35 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
bool addrIsInReg = tree->Addr()->isUsedFromReg();
bool addrIsAligned = ((tree->gtFlags & GTF_IND_UNALIGNED) == 0);

// on arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
// On arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
// full memory barriers if mixed with STLR
bool hasRcpc = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc);

// On arm64-v8.4+ we can use ldapur* instructions with acquire/release semantics to
// avoid full memory barriers if address is contained and unscaled
bool hasRcpc2 = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc2);

bool handledWithLdapur = false;
if (hasRcpc2 && !addrIsInReg && tree->Addr()->OperIs(GT_LEA) && !tree->HasIndex() && (tree->Scale() == 1) &&
emitter::emitIns_valid_imm_for_unscaled_ldst_offset(tree->Offset()))
{
if (ins == INS_ldrb)
{
ins = INS_ldapurb;
handledWithLdapur = true;
}
else if (ins == INS_ldrh)
{
ins = INS_ldapurh;
handledWithLdapur = true;
}
else if (ins == INS_ldr)
{
ins = INS_ldapur;
handledWithLdapur = true;
}
}

if ((ins == INS_ldrb) && addrIsInReg)
{
ins = hasRcpc ? INS_ldaprb : INS_ldarb;
Expand All @@ -1777,7 +1802,7 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
{
ins = hasRcpc ? INS_ldapr : INS_ldar;
}
else
else if (!handledWithLdapur)
#endif // TARGET_ARM64
{
emitBarrier = true;
Expand Down
18 changes: 11 additions & 7 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,8 +628,8 @@ class emitter
#define MAX_ENCODED_SIZE 15
#elif defined(TARGET_ARM64)
#define INSTR_ENCODED_SIZE 4
static_assert_no_msg(INS_count <= 512);
instruction _idIns : 9;
static_assert_no_msg(INS_count <= 1024);
instruction _idIns : 10;
Copy link
Member Author

@EgorBo EgorBo Jul 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a bit here since we now have more than 512 instructions (and that for sure will be increasing in future) but I also removed an used (on arm) _idCallRegPtr bit so the total size of the struct remains the same - 16b

#elif defined(TARGET_LOONGARCH64)
// TODO-LoongArch64: not include SIMD-vector.
static_assert_no_msg(INS_count <= 512);
Expand Down Expand Up @@ -712,7 +712,7 @@ class emitter
// x86: 17 bits
// amd64: 17 bits
// arm: 16 bits
// arm64: 17 bits
// arm64: 18 bits
// loongarch64: 14 bits
// risc-v: 14 bits

Expand Down Expand Up @@ -754,7 +754,7 @@ class emitter
// x86: 38 bits
// amd64: 38 bits
// arm: 32 bits
// arm64: 31 bits
// arm64: 32 bits
// loongarch64: 28 bits
// risc-v: 28 bits

Expand All @@ -763,10 +763,12 @@ class emitter
unsigned _idLargeDsp : 1; // does a large displacement follow?
unsigned _idLargeCall : 1; // large call descriptor used

unsigned _idBound : 1; // jump target / frame offset bound
unsigned _idBound : 1; // jump target / frame offset bound
#ifndef TARGET_ARMARCH
unsigned _idCallRegPtr : 1; // IL indirect calls: addr in reg
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
#endif
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
#if defined(TARGET_XARCH)
unsigned _idEvexbContext : 1; // does EVEX.b need to be set.
#endif // TARGET_XARCH
Expand Down Expand Up @@ -1509,6 +1511,7 @@ class emitter
_idBound = 1;
}

#ifndef TARGET_ARMARCH
bool idIsCallRegPtr() const
{
return _idCallRegPtr != 0;
Expand All @@ -1517,6 +1520,7 @@ class emitter
{
_idCallRegPtr = 1;
}
#endif

// Only call instructions that call helper functions may be marked as "IsNoGC", indicating
// that a thread executing such a call cannot be stopped for GC. Thus, in partially-interruptible
Expand Down
2 changes: 0 additions & 2 deletions src/coreclr/jit/emitarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4770,8 +4770,6 @@ void emitter::emitIns_Call(EmitCallType callType,
{
/* This is an indirect call (either a virtual call or func ptr call) */

id->idSetIsCallRegPtr();

if (isJump)
{
ins = INS_bx; // INS_bx Reg
Expand Down
26 changes: 24 additions & 2 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1157,7 +1157,9 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_ldrb:
case INS_strb:
case INS_ldurb:
case INS_ldapurb:
case INS_sturb:
case INS_stlurb:
result = EA_4BYTE;
break;

Expand All @@ -1172,6 +1174,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_strh:
case INS_ldurh:
case INS_sturh:
case INS_ldapurh:
case INS_stlurh:
result = EA_4BYTE;
break;

Expand Down Expand Up @@ -1209,6 +1213,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
case INS_str:
case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
result = id->idOpSize();
break;

Expand Down Expand Up @@ -1237,7 +1243,9 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_ldrb:
case INS_strb:
case INS_ldurb:
case INS_ldapurb:
case INS_sturb:
case INS_stlurb:
case INS_ldrsb:
case INS_ldursb:
result = EA_1BYTE;
Expand All @@ -1252,6 +1260,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_sturh:
case INS_ldrsh:
case INS_ldursh:
case INS_ldapurh:
case INS_stlurh:
result = EA_2BYTE;
break;

Expand All @@ -1275,6 +1285,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
case INS_str:
case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
result = id->idOpSize();
break;

Expand Down Expand Up @@ -2372,6 +2384,12 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
return false;
}

// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction
/*static*/ bool emitter::emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm)
{
return (imm >= -256) && (imm <= 255);
}

// true if this 'imm' can be encoded as the offset in a ldr/str instruction
/*static*/ bool emitter::emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr attr)
{
Expand Down Expand Up @@ -5505,6 +5523,8 @@ void emitter::emitIns_R_R_I(
isLdSt = true;
break;

case INS_ldapurb:
case INS_stlurb:
case INS_ldurb:
case INS_sturb:
// size is ignored
Expand All @@ -5522,7 +5542,9 @@ void emitter::emitIns_R_R_I(
break;

case INS_ldurh:
case INS_ldapurh:
case INS_sturh:
case INS_stlurh:
// size is ignored
unscaledOp = true;
scale = 0;
Expand Down Expand Up @@ -5550,6 +5572,8 @@ void emitter::emitIns_R_R_I(

case INS_ldur:
case INS_stur:
case INS_ldapur:
case INS_stlur:
// Is the target a vector register?
if (isVectorRegister(reg1))
{
Expand Down Expand Up @@ -8813,8 +8837,6 @@ void emitter::emitIns_Call(EmitCallType callType,
{
/* This is an indirect call (either a virtual call or func ptr call) */

id->idSetIsCallRegPtr();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this removed for arm32?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned it in https://github.com/dotnet/runtime/pull/89681/files#r1278410873 comment, basically, it sets a bit that is unused for arm, so I removed that bit because I needed to extend _idIns's width


if (isJump)
{
ins = INS_br_tail; // INS_br_tail Reg
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/emitarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,9 @@ static bool emitIns_valid_imm_for_alu(INT64 imm, emitAttr size);
// true if this 'imm' can be encoded as the offset in a ldr/str instruction
static bool emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr size);

// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction
static bool emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm);

// true if this 'imm' can be encoded as a input operand to a ccmp instruction
static bool emitIns_valid_imm_for_ccmp(INT64 imm);

Expand Down
26 changes: 21 additions & 5 deletions src/coreclr/jit/instrsarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1053,17 +1053,15 @@ INST1(ldarb, "ldarb", LD, IF_LS_2A, 0x08DFFC00)
INST1(ldarh, "ldarh", LD, IF_LS_2A, 0x48DFFC00)
// ldarh Rt,[Xn] LS_2A 0100100011011111 111111nnnnnttttt 48DF FC00


INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
// ldapr Rt,[Xn] LS_2A 1X11100010111111 110000nnnnnttttt B8BF C000 Rm Rt Rn ARMv8.3 LRCPC

INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
// ldaprb Rt,[Xn] LS_2A 0011100010111111 110000nnnnnttttt 38BF C000 Rm Rt Rn ARMv8.3 LRCPC

INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
// ldaprh Rt,[Xn] LS_2A 0111100010111111 110000nnnnnttttt 78BF C000 Rm Rt Rn ARMv8.3 LRCPC


INST1(ldxr, "ldxr", LD, IF_LS_2A, 0x885F7C00)
// ldxr Rt,[Xn] LS_2A 1X00100001011111 011111nnnnnttttt 885F 7C00

Expand Down Expand Up @@ -1100,6 +1098,15 @@ INST1(ldursh, "ldursh", LD, IF_LS_2C, 0x78800000)
INST1(ldursw, "ldursw", LD, IF_LS_2C, 0xB8800000)
// ldursw Rt,[Xn+simm9] LS_2C 10111000100iiiii iiii00nnnnnttttt B880 0000 [Xn imm(-256..+255)]

INST1(ldapur, "ldapur", LD, IF_LS_2C, 0x99400000)
// ldapur Rt,[Xn+simm9] LS_2C 1X011001010iiiii iiii00nnnnnttttt 9940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(ldapurb, "ldapurb", LD, IF_LS_2C, 0x19400000)
// ldapurb Rt,[Xn+simm9] LS_2C 00011001010iiiii iiii00nnnnnttttt 1940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(ldapurh, "ldapurh", LD, IF_LS_2C, 0x59400000)
// ldapurh Rt,[Xn+simm9] LS_2C 01011001010iiiii iiii00nnnnnttttt 5940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlr, "stlr", ST, IF_LS_2A, 0x889FFC00)
// stlr Rt,[Xn] LS_2A 1X00100010011111 111111nnnnnttttt 889F FC00

Expand Down Expand Up @@ -1136,6 +1143,15 @@ INST1(sturb, "sturb", ST, IF_LS_2C, 0x38000000)
INST1(sturh, "sturh", ST, IF_LS_2C, 0x78000000)
// sturh Rt,[Xn+simm9] LS_2C 01111000000iiiii iiii00nnnnnttttt 7800 0000 [Xn imm(-256..+255)]

INST1(stlur, "stlur", ST, IF_LS_2C, 0x99000000)
// stlur Rt,[Xn+simm9] LS_2C 1X011001000iiiii iiii00nnnnnttttt 9900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlurb, "stlurb", ST, IF_LS_2C, 0x19000000)
// stlurb Rt,[Xn+simm9] LS_2C 00011001000iiiii iiii00nnnnnttttt 1900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(stlurh, "stlurh", ST, IF_LS_2C, 0x59000000)
// stlurh Rt,[Xn+simm9] LS_2C 01011001000iiiii iiii00nnnnnttttt 5900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2

INST1(casb, "casb", LD|ST, IF_LS_3E, 0x08A07C00)
// casb Wm, Wt, [Xn] LS_3E 00001000101mmmmm 011111nnnnnttttt 08A0 7C00 Rm Rt Rn ARMv8.1 LSE Atomics

Expand Down
17 changes: 16 additions & 1 deletion src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6033,7 +6033,8 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par
}

#ifdef TARGET_ARM64
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile())
const bool hasRcpc2 = comp->compOpportunisticallyDependsOn(InstructionSet_Rcpc2);
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile() && !hasRcpc2)
{
// For Arm64 we avoid using LEA for volatile INDs
// because we won't be able to use ldar/star
Expand All @@ -6056,6 +6057,20 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par
&scale, // scaling
&offset); // displacement

#ifdef TARGET_ARM64
if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile())
{
// Generally, we try to avoid creating addressing modes for volatile INDs so we can then use
// ldar/stlr instead of ldr/str + dmb. Although, with Arm 8.4+'s RCPC2 we can handle unscaled
// addressing modes (if the offset fits into 9 bits)
assert(hasRcpc2);
if ((scale > 1) || (!emitter::emitIns_valid_imm_for_unscaled_ldst_offset(offset)) || (index != nullptr))
{
return false;
}
}
#endif

var_types targetType = parent->OperIsIndir() ? parent->TypeGet() : TYP_UNDEF;

#ifdef TARGET_ARMARCH
Expand Down
Loading
Loading