Decompose some bitwise operations in HIR to allow more overall optimizations to kick in #104517

Merged 14 commits · Jul 13, 2024

Changes from all commits
205 changes: 63 additions & 142 deletions src/coreclr/jit/gentree.cpp
@@ -20531,10 +20531,17 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si

GenTree* bitMask;

-    bitMask = gtNewDconNode(-0.0, simdBaseType);
-    bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, simdBaseJitType, simdSize);
-
-    return gtNewSimdBinOpNode(GT_AND_NOT, type, op1, bitMask, simdBaseJitType, simdSize);
+    if (simdBaseType == TYP_FLOAT)
+    {
+        bitMask = gtNewIconNode(0x7FFFFFFF);
+        bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_INT, simdSize);
+    }
+    else
+    {
+        bitMask = gtNewLconNode(0x7FFFFFFFFFFFFFFF);
+        bitMask = gtNewSimdCreateBroadcastNode(type, bitMask, CORINFO_TYPE_LONG, simdSize);
+    }
+    return gtNewSimdBinOpNode(GT_AND, type, op1, bitMask, simdBaseJitType, simdSize);
}
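Why the new mask is equivalent: for IEEE 754 floats the sign lives in the top bit, and -0.0 is exactly that bit (0x80000000 for float), so the old `x & ~(-0.0)` and the new `x & 0x7FFFFFFF` clear the same bit. A minimal scalar sketch (plain C++, not JIT code) of the identity; using an integer mask also lets the resulting GT_AND participate in the integer-level folds this PR enables:

```cpp
// Clearing the IEEE 754 sign bit computes abs() for floats.
#include <cassert>
#include <cstdint>
#include <cstring>

static float AbsViaMask(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // type-pun without UB
    bits &= 0x7FFFFFFFu;                  // clear the sign bit
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

int main()
{
    assert(AbsViaMask(-3.5f) == 3.5f);
    assert(AbsViaMask(+3.5f) == 3.5f);
}
```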

NamedIntrinsic intrinsic = NI_Illegal;
@@ -20775,12 +20782,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
}
}
}

-    if (op == GT_AND_NOT)
-    {
-        // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
-        needsReverseOps = true;
-    }
break;
}
#endif // TARGET_XARCH
@@ -20811,11 +20812,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(

if (intrinsic != NI_Illegal)
{
+        if (op == GT_AND_NOT)
+        {
+            assert(fgNodeThreading == NodeThreading::LIR);
+
+#if defined(TARGET_XARCH)
+            // GT_AND_NOT expects `op1 & ~op2`, but xarch does `~op1 & op2`
+            // We specially handle this here since we're only producing a
+            // native intrinsic node in LIR
+
+            std::swap(op1, op2);
[Review thread on the std::swap(op1, op2) line]

Member: can you remind me - is it fine to do this swap here in terms of side-effect reordering?

Member: ah, it's LIR so I guess it is

Member (Author): Right, in this case it's fine specifically because we've validated we're in LIR already. Otherwise, we should be applying GTF_REVERSE_OPS or have some comment about expecting the user to have already spilled side effects, etc.

+#endif // TARGET_XARCH
+        }
return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize);
}
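The swap discussed above comes from an operand-order mismatch: GT_AND_NOT means `op1 & ~op2`, while x86's (V)PANDN computes `~src1 & src2`. A scalar sketch (illustrative only, not JIT code) of why the operands must trade places:

```cpp
#include <cassert>
#include <cstdint>

static uint32_t IrAndNot(uint32_t op1, uint32_t op2)    { return op1 & ~op2; }  // GT_AND_NOT semantics
static uint32_t XarchAndn(uint32_t src1, uint32_t src2) { return ~src1 & src2; } // PANDN semantics

int main()
{
    uint32_t a = 0xF0F0F0F0u, b = 0x0FF00FF0u;
    assert(IrAndNot(a, b) == XarchAndn(b, a)); // matches once swapped
    assert(IrAndNot(a, b) != XarchAndn(a, b)); // differs for these inputs unswapped
}
```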

switch (op)
{
+        case GT_AND_NOT:
+        {
+            // Prior to LIR, we want to explicitly decompose this operation so that downstream phases can
+            // appropriately optimize around the individual operations being performed, particularly ~op2,
+            // and produce overall better codegen.
+            assert(fgNodeThreading != NodeThreading::LIR);
+
+            op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize);
+            return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize);
+        }
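A self-contained scalar sketch (not JIT code) of the folds the decomposition unlocks; an opaque AND_NOT node would hide these from the generic AND/NOT optimizations:

```cpp
#include <cassert>
#include <cstdint>

static uint32_t AndNot(uint32_t x, uint32_t y) { return x & ~y; } // decomposed form

int main()
{
    uint32_t x = 0xDEADBEEFu, y = 0x00FF00FFu;

    assert(AndNot(x, ~y) == (x & y));     // double negation folds away
    assert(AndNot(x, 0u) == x);           // ~0 is AllBitsSet, AND by it is identity
    assert(AndNot(x, 0xFFFFFFFFu) == 0u); // ~AllBitsSet is 0, so the AND is 0
}
```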

#if defined(TARGET_XARCH)
case GT_RSZ:
{
@@ -20980,9 +21004,6 @@ GenTree* Compiler::gtNewSimdBinOpNode(
vecCon1->gtSimdVal.u64[i] = 0x00FF00FF00FF00FF;
}

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

// Vector256<short> maskedProduct = Avx2.And(widenedProduct, vecCon1).AsInt16()
GenTree* maskedProduct = gtNewSimdBinOpNode(GT_AND, widenedType, widenedProduct, vecCon1,
widenedSimdBaseJitType, widenedSimdSize);
@@ -21947,9 +21968,6 @@ GenTree* Compiler::gtNewSimdCmpOpNode(
v = gtNewSimdHWIntrinsicNode(type, v, gtNewIconNode(SHUFFLE_ZZXX, TYP_INT), NI_SSE2_Shuffle,
CORINFO_TYPE_INT, simdSize);

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

op2 = gtNewSimdBinOpNode(GT_AND, type, u, v, simdBaseJitType, simdSize);
return gtNewSimdBinOpNode(GT_OR, type, op1, op2, simdBaseJitType, simdSize);
}
@@ -24340,9 +24358,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE,
@@ -24381,9 +24396,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);
tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
@@ -24485,9 +24497,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -24524,9 +24533,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(

GenTree* vecCon2 = gtCloneExpr(vecCon1);

-            // Validate we can't use AVX512F_VL_TernaryLogic here
-            assert(!canUseEvexEncodingDebugOnly());

tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize);
tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize);

@@ -28145,6 +28151,14 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
assert(!isScalar);
assert(op2->TypeIs(simdType));

+            if (comp->fgNodeThreading != NodeThreading::LIR)
+            {
+                // We don't want to support creating AND_NOT nodes prior to LIR
+                // as it can break important optimizations. We'll produce this
+                // in lowering instead.
+                break;
+            }

#if defined(TARGET_XARCH)
if (simdSize == 64)
{
@@ -29212,6 +29226,21 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec
return IsUserCall() && (operand == Op(2));
}

+#if defined(TARGET_XARCH)
+        case NI_SSE_Xor:
+        case NI_SSE2_Xor:
+        case NI_AVX_Xor:
+        case NI_AVX2_Xor:
+        case NI_AVX512F_Xor:
+        case NI_AVX512DQ_Xor:
+        case NI_AVX10v1_V512_Xor:
+        {
+            // We recognize this as GT_NOT which can enable other optimizations
+            assert(GetOperandCount() == 2);
+            return vecCon->IsVectorAllBitsSet();
+        }
+#endif // TARGET_XARCH
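The identity behind this new constant-prop case, sketched in scalar C++ (illustrative only): xor with an all-bits-set constant is a bitwise NOT, so propagating the constant lets the node be recognized as GT_NOT.

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x = 0x12345678u;
    assert((x ^ 0xFFFFFFFFu) == ~x); // x ^ AllBitsSet == ~x
}
```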

default:
{
break;
@@ -29961,7 +29990,8 @@ bool GenTreeLclVar::IsNeverNegative(Compiler* comp) const
unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3)
{
#if defined(TARGET_XARCH)
-    assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId));
+    assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId) || HWIntrinsicInfo::IsPermuteVar2x(gtHWIntrinsicId) ||
+           HWIntrinsicInfo::IsTernaryLogic(gtHWIntrinsicId));
#elif defined(TARGET_ARM64)
assert(HWIntrinsicInfo::IsFmaIntrinsic(gtHWIntrinsicId));
#endif
@@ -30005,85 +30035,6 @@ unsigned GenTreeHWIntrinsic::GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree

return 0;
}

-//------------------------------------------------------------------------
-// GetTernaryControlByte: calculate the value of the control byte for ternary node
-// with given logic nodes on the input.
-//
-// Return value: the value of the ternary control byte.
-uint8_t GenTreeHWIntrinsic::GetTernaryControlByte(GenTreeHWIntrinsic* second) const
-{
-    // we assume we have a structure like:
-    /*
-           /- A
-           +- B
-       t1 = binary logical op1
-
-           /- C
-           +- t1
-       t2 = binary logical op2
-    */
-
-    // To calculate the control byte value:
-    // The way the constants work is we have three keys:
-    // * A: 0xF0
-    // * B: 0xCC
-    // * C: 0xAA
-    //
-    // To compute the correct control byte, you simply perform the corresponding operation on these keys. So, if you
-    // wanted to do (A & B) ^ C, you would compute (0xF0 & 0xCC) ^ 0xAA or 0x6A.
-    assert(second->Op(1) == this || second->Op(2) == this);
-    const uint8_t A = 0xF0;
-    const uint8_t B = 0xCC;
-    const uint8_t C = 0xAA;
-
-    bool isScalar = false;
-
-    genTreeOps firstOper = GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    genTreeOps secondOper = second->GetOperForHWIntrinsicId(&isScalar);
-    assert(!isScalar);
-
-    uint8_t AB  = 0;
-    uint8_t ABC = 0;
-
-    if (firstOper == GT_AND)
-    {
-        AB = A & B;
-    }
-    else if (firstOper == GT_OR)
-    {
-        AB = A | B;
-    }
-    else if (firstOper == GT_XOR)
-    {
-        AB = A ^ B;
-    }
-    else
-    {
-        unreached();
-    }
-
-    if (secondOper == GT_AND)
-    {
-        ABC = AB & C;
-    }
-    else if (secondOper == GT_OR)
-    {
-        ABC = AB | C;
-    }
-    else if (secondOper == GT_XOR)
-    {
-        ABC = AB ^ C;
-    }
-    else
-    {
-        unreached();
-    }
-
-    return ABC;
-}
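The control-byte scheme described in the removed helper can be sanity-checked in scalar form (a sketch, not JIT code):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t A = 0xF0, B = 0xCC, C = 0xAA; // the three vpternlog keys

    assert(((A & B) ^ C) == 0x6A); // the (A & B) ^ C example from the comment
    assert(((A & B) & C) == 0x80); // A & B & C
    assert(((A | B) | C) == 0xFE); // A | B | C
}
```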
#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS

unsigned GenTreeLclFld::GetSize() const
@@ -30479,13 +30430,8 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
bool isScalar = false;
genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar);

-#if defined(TARGET_XARCH)
-    if (oper == GT_AND_NOT)
-    {
-        // xarch does: ~op1 & op2, we need op1 & ~op2
-        std::swap(op1, op2);
-    }
-#endif // TARGET_XARCH
+    // We shouldn't find AND_NOT nodes since they should only be produced in lowering
+    assert(oper != GT_AND_NOT);

GenTree* cnsNode = nullptr;
GenTree* otherNode = nullptr;
@@ -30998,31 +30944,6 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
break;
}

-            case GT_AND_NOT:
-            {
-                // Handle `x & ~0 == x` and `0 & ~x == 0`
-                if (cnsNode->IsVectorZero())
-                {
-                    if (cnsNode == op1)
-                    {
-                        resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                        break;
-                    }
-                    else
-                    {
-                        resultNode = otherNode;
-                    }
-                    break;
-                }
-
-                // Handle `x & ~AllBitsSet == 0`
-                if (cnsNode->IsVectorAllBitsSet() && (cnsNode == op2))
-                {
-                    resultNode = gtWrapWithSideEffects(cnsNode, otherNode, GTF_ALL_EFFECT);
-                }
-                break;
-            }
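For reference, scalar checks of the three folds this removed case handled (a sketch, not JIT code); after the decomposition they fall out of the generic NOT and AND folds instead:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x = 0xCAFEBABEu;

    assert((x & ~0u) == x);           // x & ~0 == x
    assert((0u & ~x) == 0u);          // 0 & ~x == 0
    assert((x & ~0xFFFFFFFFu) == 0u); // x & ~AllBitsSet == 0
}
```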

case GT_DIV:
{
if (varTypeIsFloating(simdBaseType))
@@ -31413,12 +31334,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
switch (ni)
{
-            case NI_Vector128_ConditionalSelect:
 #if defined(TARGET_XARCH)
+            case NI_Vector128_ConditionalSelect:
             case NI_Vector256_ConditionalSelect:
             case NI_Vector512_ConditionalSelect:
 #elif defined(TARGET_ARM64)
             case NI_Vector64_ConditionalSelect:
             case NI_AdvSimd_BitwiseSelect:
             case NI_Sve_ConditionalSelect:
#endif
{
1 change: 0 additions & 1 deletion src/coreclr/jit/gentree.h
@@ -6529,7 +6529,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
bool OperRequiresGlobRefFlag() const;

unsigned GetResultOpNumForRmwIntrinsic(GenTree* use, GenTree* op1, GenTree* op2, GenTree* op3);
-    uint8_t GetTernaryControlByte(GenTreeHWIntrinsic* second) const;

ClassLayout* GetLayout(Compiler* compiler) const;
