Skip to content

Commit

Permalink
[VPlan] First step towards VPlan cost modeling.
Browse files Browse the repository at this point in the history
This adds a new computeCost interface to VPRecipeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds a getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
  • Loading branch information
fhahn committed Oct 14, 2023
1 parent cdb42aa commit 9557529
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 3 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,8 @@ class LoopVectorizationPlanner {
/// A builder used to construct the current plan.
VPBuilder Builder;

/// Compute and return the cost of the vector loop region of \p Plan for
/// vectorization factor \p VF. Recipes that do not provide a valid cost via
/// VPRecipeBase::computeCost are costed through the legacy cost model using
/// their underlying instruction.
InstructionCost computeCost(VPlan &Plan, ElementCount VF);

public:
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
const TargetTransformInfo &TTI,
Expand All @@ -339,6 +341,8 @@ class LoopVectorizationPlanner {
/// Return the best VPlan for \p VF.
VPlan &getBestPlanFor(ElementCount VF) const;

/// Compute the cost of every (VPlan, VF) candidate and return the most
/// profitable VPlan together with the most profitable VF. If there is only a
/// single VPlan with a single VF, it is returned directly without costing.
std::pair<VPlan &, ElementCount> getBestPlan();

/// Generate the IR code for the body of the vectorized loop according to the
/// best selected \p VF, \p UF and VPlan \p BestPlan.
/// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
Expand Down
114 changes: 111 additions & 3 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
// LoopVectorizationPlanner::computeCost reads this cost model's TTI member
// directly (CM.TTI) to build its VPlan cost context.
// NOTE(review): presumably TTI is not publicly accessible, hence the
// friendship - confirm against the member's declaration.
friend class LoopVectorizationPlanner;

public:
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
PredicatedScalarEvolution &PSE, LoopInfo *LI,
Expand Down Expand Up @@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
: Candidate.Width.getFixedValue();
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
<< " costs: " << (Candidate.Cost / Width));
<< " costs: " << Candidate.Cost / Width);
if (i.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")");
Expand Down Expand Up @@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VF;
}

/// Compute the cost of the vector loop region of \p Plan for vectorization
/// factor \p VF.
///
/// The result is the sum of:
///  * a flat cost of 1 for every legal induction variable that is not
///    modelled by a wide induction recipe,
///  * the legacy-cost-model cost of every VPReplicateRecipe found in the
///    "then" block of a replicate region,
///  * the cost of every other recipe, taken from VPRecipeBase::computeCost
///    when it is valid and from the legacy cost model of the underlying
///    instruction otherwise (recipes without an underlying instruction are
///    skipped), and
///  * a trailing constant of 1.
///
/// When -force-target-instruction-cost is given on the command line, each
/// costed recipe contributes exactly the forced value.
InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
                                                      ElementCount VF) {
  InstructionCost Cost = 0;

  VPBasicBlock *Header =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());

  // Cost modeling for inductions is inaccurate in the legacy cost model. Try
  // to match it here initially during VPlan cost model bring up:
  // * VPWidenIntOrFpInductionRecipes implement computeCost,
  // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
  // * other inductions only have a cost of 1 (i.e. the cost of the scalar
  //   induction increment).
  unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) {
    return isa<VPWidenPointerInductionRecipe>(&R) ||
           (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
            !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst());
  });
  Cost += Legal->getInductionVars().size() - NumWideIVs;

  // The cost context does not depend on the block being visited, so build it
  // once up front instead of once per basic block.
  VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext());

  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
      assert(Region->isReplicator() &&
             "only replicate regions are expected inside the vector loop");
      // Only the "then" block (first successor of the region entry) is
      // costed; its replicate recipes use the legacy cost of the underlying
      // instruction.
      VPBasicBlock *Then =
          cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
      for (VPRecipeBase &R : *Then) {
        if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
          continue;
        auto *RepR = cast<VPReplicateRecipe>(&R);
        Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first;
      }
      continue;
    }

    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
      InstructionCost RecipeCost = R.computeCost(VF, Ctx);
      if (!RecipeCost.isValid()) {
        // The recipe does not implement computeCost; fall back to the legacy
        // cost of the instruction it was created from, if there is one.
        if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
          RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
        } else if (auto *WidenMem =
                       dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
          RecipeCost =
              CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
        } else if (auto *I = dyn_cast_or_null<Instruction>(
                       R.getVPSingleValue()->getUnderlyingValue())) {
          RecipeCost = CM.getInstructionCost(I, VF).first;
        } else {
          continue;
        }
      }
      // A forced per-instruction cost replaces the cost of this recipe only.
      // It must not overwrite the running total, which would discard the cost
      // of everything visited so far.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);

      LLVM_DEBUG({
        dbgs() << "Cost of " << RecipeCost << " for " << VF << ": ";
        R.dump();
      });
      Cost += RecipeCost;
    }
  }
  // NOTE(review): flat extra cost of 1, presumably added to stay in sync with
  // the legacy cost model's total - confirm what it stands for.
  Cost += 1;
  LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n");
  return Cost;
}

/// Select the most profitable VPlan and the most profitable VF among all
/// candidate plans, using computeCost and isMoreProfitable.
///
/// \returns the chosen plan together with the chosen VF. If no vector VF is
/// more profitable than the scalar baseline, the first plan is returned with a
/// VF of 1.
std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
  VPlan &FirstPlan = *VPlans[0];

  // Nothing to choose from when there is exactly one plan with one VF.
  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
    return {FirstPlan, *FirstPlan.vectorFactors().begin()};

  const ElementCount ScalarVF = ElementCount::getFixed(1);
  assert(hasPlanWithVF(ScalarVF));

  // The scalar plan provides the baseline every vector candidate is compared
  // against.
  const InstructionCost ScalarCost =
      computeCost(getBestPlanFor(ScalarVF), ScalarVF);

  // If the user explicitly requested vectorization, ignore the scalar cost by
  // starting from the maximum cost, so that the first vector VF evaluated is
  // at least chosen.
  const bool ForceVectorization =
      Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  InstructionCost BestCost =
      ForceVectorization ? InstructionCost::getMax() : ScalarCost;
  ElementCount BestVF = ScalarVF;
  VPlan *BestPlan = &FirstPlan;

  for (const VPlanPtr &Candidate : VPlans) {
    for (ElementCount VF : Candidate->vectorFactors()) {
      // The scalar VF is only the baseline, never a candidate.
      if (!VF.isScalar()) {
        InstructionCost CandidateCost = computeCost(*Candidate, VF);
        VectorizationFactor Challenger(VF, CandidateCost, ScalarCost);
        VectorizationFactor Incumbent(BestVF, BestCost, ScalarCost);
        if (isMoreProfitable(Challenger, Incumbent)) {
          BestPlan = Candidate.get();
          BestVF = VF;
          BestCost = CandidateCost;
        }
      }
    }
  }
  return {*BestPlan, BestVF};
}

VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
assert(count_if(VPlans,
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
Expand Down Expand Up @@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
PSI, Checks);

VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
const auto &[BestPlan, Width] = LVP.getBestPlan();
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
<< "\n");
assert(VF.Width == Width &&
"VPlan cost model and legacy cost model disagreed");
LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;

// Add metadata to disable runtime unrolling a scalar loop when there
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H

#include "VPlanAnalysis.h"
#include "VPlanValue.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
Expand All @@ -38,6 +39,7 @@
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/InstructionCost.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
Expand Down Expand Up @@ -697,6 +699,14 @@ class VPLiveOut : public VPUser {
#endif
};

/// State handed to VPRecipeBase::computeCost: the target cost hooks and a
/// type analysis used to infer the types of VPValues.
struct VPCostContext {
  /// Target information queried for instruction costs. Held by reference, so
  /// the referenced object must outlive this context.
  const TargetTransformInfo &TTI;

  /// Type inference for VPValues, constructed from the LLVMContext passed to
  /// the constructor.
  VPTypeAnalysis Types;

  VPCostContext(const TargetTransformInfo &TargetInfo, LLVMContext &LLVMCtx)
      : TTI(TargetInfo), Types(LLVMCtx) {}
};

/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
Expand Down Expand Up @@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();

/// Compute the cost of this recipe for vectorization factor \p VF, using the
/// target information and type analysis in \p Ctx. The default implementation
/// returns an invalid cost, which callers treat as "this recipe does not
/// implement cost computation" and handle with a fallback.
virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
return InstructionCost::getInvalid();
}

/// Returns the underlying instruction, if the recipe is a VPValue or nullptr
/// otherwise.
Instruction *getUnderlyingInstr() {
Expand Down Expand Up @@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {

unsigned getOpcode() const { return Opcode; }

/// Return the cost of the widened instruction for vectorization factor \p VF,
/// obtained by querying the TTI in \p Ctx based on this recipe's opcode.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
Type *getScalarType() const {
return Trunc ? Trunc->getType() : IV->getType();
}

/// Return the cost of the widened induction for vectorization factor \p VF:
/// 0 if the induction has a truncate instruction, otherwise the TTI cost of a
/// vector add on the induction's scalar type widened by \p VF.
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
};

class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
Expand Down Expand Up @@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
"Op must be an operand of the recipe");
return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
}

/// Return the insert position of the interleave group IG (forwards to
/// IG->getInsertPos()).
Instruction *getInsertPos() const { return IG->getInsertPos(); }
};

/// A recipe to represent inloop reduction operations, performing a reduction on
Expand Down Expand Up @@ -2598,6 +2618,10 @@ class VPlan {

bool hasVF(ElementCount VF) { return VFs.count(VF); }

/// Return a range covering all vectorization factors held in VFs, from
/// VFs.begin() to VFs.end().
iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
  using VFIterator = SmallSetVector<ElementCount, 2>::iterator;
  return iterator_range<VFIterator>(VFs.begin(), VFs.end());
}

bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }

bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
Expand Down
80 changes: 80 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) {
#endif
}

/// Return the TTI cost of the widened instruction for vectorization factor
/// \p VF.
///
/// Unary and binary arithmetic opcodes and freeze are costed via
/// getArithmeticInstrCost on the widened result type; compares are costed via
/// getCmpSelInstrCost on the widened type of the first operand. Any other
/// opcode is unreachable.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) {
  const TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;

  // Widen the inferred scalar type of a VPValue to VF lanes.
  auto WidenedTypeOf = [&Ctx, VF](VPValue *V) -> Type * {
    return ToVectorTy(Ctx.Types.inferType(V), VF);
  };

  switch (Opcode) {
  case Instruction::FNeg: {
    Type *ResultTy = WidenedTypeOf(this->getVPSingleValue());
    const TTI::OperandValueInfo AnyOperand = {TTI::OK_AnyValue, TTI::OP_None};
    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Kind, AnyOperand,
                                          AnyOperand);
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Some instructions are cheaper to vectorize when their second operand is
    // a constant (e.g. shifts on x86), so describe that operand as precisely
    // as possible.
    VPValue *RHS = getOperand(1);
    TTI::OperandValueInfo RHSInfo = {TTI::OK_AnyValue, TTI::OP_None};
    if (RHS->isLiveIn())
      RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());

    // An otherwise unclassified operand defined outside the vector regions is
    // treated as uniform.
    if (RHSInfo.Kind == TTI::OK_AnyValue &&
        RHS->isDefinedOutsideVectorRegions())
      RHSInfo.Kind = TTI::OK_UniformValue;

    Type *ResultTy = WidenedTypeOf(this->getVPSingleValue());

    // Pass the original IR operands and instruction as context when the
    // recipe has an underlying instruction.
    auto *UnderlyingI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
    SmallVector<const Value *, 4> IROperands;
    if (UnderlyingI)
      IROperands.append(UnderlyingI->value_op_begin(),
                        UnderlyingI->value_op_end());

    const TTI::OperandValueInfo LHSInfo = {TTI::OK_AnyValue, TTI::OP_None};
    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Kind, LHSInfo,
                                          RHSInfo, IROperands, UnderlyingI);
  }
  case Instruction::Freeze: {
    // This opcode is unknown. Assume that it is the same as 'mul'.
    Type *ResultTy = WidenedTypeOf(this->getVPSingleValue());
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, Kind);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the type of the values being compared.
    Type *ComparedTy = WidenedTypeOf(getOperand(0));
    return Ctx.TTI.getCmpSelInstrCost(Opcode, ComparedTy, nullptr,
                                      getPredicate(), Kind);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
Expand Down Expand Up @@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
return StartC && StartC->isZero() && StepC && StepC->isOne();
}

/// Return the cost of this widened induction for vectorization factor \p VF.
///
/// Inductions with a truncate instruction cost 0 here. All other inductions
/// are costed as a single vector add on the induction's scalar type widened
/// to \p VF lanes.
InstructionCost
VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) {
  const bool HasTrunc = getTruncInst() != nullptr;
  if (HasTrunc)
    return 0;

  Type *WideIVTy = ToVectorTy(getScalarType(), VF);
  return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, WideIVTy,
                                        TTI::TCK_RecipThroughput);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
Expand Down

0 comments on commit 9557529

Please sign in to comment.