diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 9691e1cd4f2ed0..08142fa014c178 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -316,6 +316,8 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; + InstructionCost computeCost(VPlan &Plan, ElementCount VF); + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo &TTI, @@ -339,6 +341,8 @@ class LoopVectorizationPlanner { /// Return the best VPlan for \p VF. VPlan &getBestPlanFor(ElementCount VF) const; + std::pair getBestPlan(); + /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2ca7e75f97f0f0..9355b6f89bacab 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair; /// TargetTransformInfo to query the different backends for the cost of /// different operations. class LoopVectorizationCostModel { + friend class LoopVectorizationPlanner; + public: LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale : Candidate.Width.getFixedValue(); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i - << " costs: " << (Candidate.Cost / Width)); + << " costs: " << Candidate.Cost / Width); if (i.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); @@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VF; } +InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan, + ElementCount VF) { + InstructionCost Cost = 0; + + VPBasicBlock *Header = + cast(Plan.getVectorLoopRegion()->getEntry()); + + // Cost modeling for inductions is inaccurate in the legacy cost model. Try as + // to match it here initially during VPlan cost model bring up: + // * VPWidenIntOrFpInductionRecipes implement computeCost, + // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model + // * other inductions only have a cost of 1 (i.e. the cost of the scalar + // induction increment). + unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) { + return isa(&R) || + (isa(&R) && + !cast(&R)->getTruncInst()); + }); + Cost += Legal->getInductionVars().size() - NumWideIVs; + + for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) { + if (auto *Region = dyn_cast(Block)) { + assert(Region->isReplicator()); + VPBasicBlock *Then = + cast(Region->getEntry()->getSuccessors()[0]); + for (VPRecipeBase &R : *Then) { + if (isa(&R)) + continue; + auto *RepR = cast(&R); + Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first; + } + continue; + } + + VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext()); + for (VPRecipeBase &R : *cast(Block)) { + InstructionCost RecipeCost = R.computeCost(VF, Ctx); + if (!RecipeCost.isValid()) { + if (auto *IG = dyn_cast(&R)) { + RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first; + } else if (auto *WidenMem = + dyn_cast(&R)) { + RecipeCost = + CM.getInstructionCost(&WidenMem->getIngredient(), VF).first; + } else if (auto *I = dyn_cast_or_null( + R.getVPSingleValue()->getUnderlyingValue())) + RecipeCost = CM.getInstructionCost(I, VF).first; + else + continue; + } + if (ForceTargetInstructionCost.getNumOccurrences() > 0) + Cost = InstructionCost(ForceTargetInstructionCost); + + LLVM_DEBUG({ + dbgs() << "Cost of " << RecipeCost << " for " << VF << ": "; + R.dump(); + }); + Cost += RecipeCost; + } + } + Cost += 1; + LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n"); + return Cost; +} + +std::pair LoopVectorizationPlanner::getBestPlan() { + // If there is a single VPlan with a single VF, return it directly. + if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) { + ElementCount VF = *VPlans[0]->vectorFactors().begin(); + return {*VPlans[0], VF}; + } + + VPlan *BestPlan = &*VPlans[0]; + assert(hasPlanWithVF(ElementCount::getFixed(1))); + ElementCount BestVF = ElementCount::getFixed(1); + InstructionCost ScalarCost = computeCost( + getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1)); + InstructionCost BestCost = ScalarCost; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (ForceVectorization) { + // Ignore scalar width, because the user explicitly wants vectorization. + // Initialize cost to max so that VF = 2 is, at least, chosen during cost + // evaluation. + BestCost = InstructionCost::getMax(); + } + + for (auto &P : VPlans) { + for (ElementCount VF : P->vectorFactors()) { + if (VF.isScalar()) + continue; + InstructionCost Cost = computeCost(*P, VF); + if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost), + VectorizationFactor(BestVF, BestCost, ScalarCost))) { + BestCost = Cost; + BestVF = VF; + BestPlan = &*P; + } + } + } + return {*BestPlan, BestVF}; +} + VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == @@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + const auto &[BestPlan, Width] = LVP.getBestPlan(); + LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width + << "\n"); + assert(VF.Width == Width && + "VPlan cost model and legacy cost model disagreed"); + LVP.executePlan(Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index ea1f8a5b9d1e9a..02d93915e3c8d6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -23,6 +23,7 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H +#include "VPlanAnalysis.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" @@ -38,6 +39,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" #include "llvm/IR/Operator.h" +#include "llvm/Support/InstructionCost.h" #include #include #include @@ -697,6 +699,14 @@ class VPLiveOut : public VPUser { #endif }; +struct VPCostContext { + const TargetTransformInfo &TTI; + VPTypeAnalysis Types; + + VPCostContext(const TargetTransformInfo &TTI, LLVMContext &Ctx) + : TTI(TTI), Types(Ctx) {} +}; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent, /// \returns an iterator pointing to the element after the erased one iplist::iterator eraseFromParent(); + virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) { + return InstructionCost::getInvalid(); + } + /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { @@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { unsigned getOpcode() const { return Opcode; } + InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { Type *getScalarType() const { return Trunc ? Trunc->getType() : IV->getType(); } + + InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override; }; class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { @@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase { "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); } + + Instruction *getInsertPos() const { return IG->getInsertPos(); } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2598,6 +2618,10 @@ class VPlan { bool hasVF(ElementCount VF) { return VFs.count(VF); } + iterator_range::iterator> vectorFactors() { + return {VFs.begin(), VFs.end()}; + } + bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); } bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b616abddb00f99..c58e947075032d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) { #endif } +InstructionCost VPWidenRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + switch (Opcode) { + case Instruction::FNeg: { + Type *VectorTy = + ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF); + return Ctx.TTI.getArithmeticInstrCost( + Opcode, VectorTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + VPValue *Op2 = getOperand(1); + // Certain instructions can be cheaper to vectorize if they have a constant + // second vector operand. One example of this are shifts on x86. + TargetTransformInfo::OperandValueInfo Op2Info = { + TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}; + if (Op2->isLiveIn()) + Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue()); + + if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && + getOperand(1)->isDefinedOutsideVectorRegions()) + Op2Info.Kind = TargetTransformInfo::OK_UniformValue; + Type *VectorTy = + ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF); + Instruction *CtxI = dyn_cast_or_null(getUnderlyingValue()); + + SmallVector Operands; + if (CtxI) + Operands.append(CtxI->value_op_begin(), CtxI->value_op_end()); + return Ctx.TTI.getArithmeticInstrCost( + Opcode, VectorTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + Op2Info, Operands, CtxI); + } + case Instruction::Freeze: { + // This opcode is unknown. Assume that it is the same as 'mul'. + Type *VectorTy = + ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF); + return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *VectorTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF); + return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(), + CostKind); + } + default: + llvm_unreachable("Unsupported opcode for instruction"); + } +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const { return StartC && StartC->isZero() && StepC && StepC->isOne(); } +InstructionCost VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) { + + if (getTruncInst()) + return 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Type *VectorTy = ToVectorTy(getScalarType(), VF); + return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VectorTy, CostKind); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {