diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index c03c278fcebe78..6011e160762202 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -344,6 +344,16 @@ class LoopVectorizationPlanner { /// A builder used to construct the current plan. VPBuilder Builder; + /// Computes the cost of \p Plan for vectorization factor \p VF. + /// + /// The current implementation requires access to the + /// LoopVectorizationLegality to handle inductions and reductions, which is + /// why it is kept separate from the VPlan-only cost infrastructure. + /// + /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has + /// been retired. + InstructionCost cost(VPlan &Plan, ElementCount VF) const; + public: LoopVectorizationPlanner( Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, @@ -365,6 +375,9 @@ class LoopVectorizationPlanner { /// Return the best VPlan for \p VF. VPlan &getBestPlanFor(ElementCount VF) const; + /// Return the most profitable plan and fix its VF to the most profitable one. + VPlan &getBestPlan() const; + /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan /// according to the best selected \p VF and \p UF. /// @@ -443,7 +456,9 @@ class LoopVectorizationPlanner { ElementCount MinVF); /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. + /// This method checks every VF in \p CandidateVFs. This is now only used to + /// verify the decisions by the new VPlan-based cost-model and will be retired + /// once the VPlan-based cost-model is stabilized. VectorizationFactor selectVectorizationFactor(const ElementCountSet &CandidateVFs); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9571cfe358bf34..d4a2399f09369f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -290,7 +290,7 @@ static cl::opt ForceTargetMaxVectorInterleaveFactor( cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops.")); -static cl::opt ForceTargetInstructionCost( +cl::opt ForceTargetInstructionCost( "force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " @@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); } -/// A helper function that returns the reciprocal of the block probability of -/// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for every X iterations of the loop header. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -static unsigned getReciprocalPredBlockProb() { return 2; } - /// Returns "best known" trip count for the specified loop \p L as defined by /// the following procedure: /// 1) Returns exact trip count if it is known. @@ -1621,6 +1613,16 @@ class LoopVectorizationCostModel { /// \p VF is the vectorization factor chosen for the original loop. bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. + std::optional + getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, + TTI::TargetCostKind CostKind) const; + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + private: unsigned NumPredStores = 0; @@ -1646,21 +1648,11 @@ class LoopVectorizationCostModel { /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); - /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. InstructionCost getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); - /// Return the cost of instructions in an inloop reduction pattern, if I is - /// part of that pattern. - std::optional - getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, - TTI::TargetCostKind CostKind) const; - /// Calculate vectorization cost of memory instruction \p I. InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -7297,7 +7289,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (!MaxFactors.hasVector()) return VectorizationFactor::Disabled(); - // Select the optimal vectorization factor. + // Select the optimal vectorization factor according to the legacy cost-model. + // This is now only used to verify the decisions by the new VPlan-based + // cost-model and will be retired once the VPlan-based cost-model is + // stabilized. VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); if (!hasPlanWithVF(VF.Width)) { @@ -7308,6 +7303,196 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return VF; } +InstructionCost VPCostContext::getLegacyCost(Instruction *UI, + ElementCount VF) const { + return CM.getInstructionCost(UI, VF).first; +} + +bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { + return (IsVector && CM.VecValuesToIgnore.contains(UI)) || + SkipCostComputation.contains(UI); +} + +InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, + ElementCount VF) const { + InstructionCost Cost = 0; + LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext(); + VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM); + + // Cost modeling for inductions is inaccurate in the legacy cost model + // compared to the recipes that are generated. To match here initially during + // VPlan cost model bring up directly use the induction costs from the legacy + // cost model. Note that we do this as pre-processing; the VPlan may not have + // any recipes associated with the original induction increment instruction + // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute + // the cost of both induction increment instructions that are represented by + // recipes and those that are not, to avoid distinguishing between them here, + // and skip all recipes that represent induction increments (the former case) + // later on, if they exist, to avoid counting them twice. Similarly we + // pre-compute the cost of any optimized truncates. + // TODO: Switch to more accurate costing based on VPlan. + for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { + Instruction *IVInc = cast( + IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + if (CostCtx.SkipCostComputation.insert(IVInc).second) { + InstructionCost InductionCost = CostCtx.getLegacyCost(IVInc, VF); + LLVM_DEBUG({ + dbgs() << "Cost of " << InductionCost << " for VF " << VF + << ":\n induction increment " << *IVInc << "\n"; + IVInc->dump(); + }); + Cost += InductionCost; + } + for (User *U : IV->users()) { + auto *CI = cast(U); + if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF)) + continue; + assert(!CostCtx.SkipCostComputation.contains(CI) && + "Same cast for multiple inductions?"); + CostCtx.SkipCostComputation.insert(CI); + InstructionCost CastCost = CostCtx.getLegacyCost(CI, VF); + LLVM_DEBUG({ + dbgs() << "Cost of " << CastCost << " for VF " << VF + << ":\n induction cast " << *CI << "\n"; + CI->dump(); + }); + Cost += CastCost; + } + } + + /// Compute the cost of all exiting conditions of the loop using the legacy + /// cost model. This is to match the legacy behavior, which adds the cost of + /// all exit conditions. Note that this over-estimates the cost, as there will + /// be a single condition to control the vector loop. + SmallVector Exiting; + CM.TheLoop->getExitingBlocks(Exiting); + SetVector ExitInstrs; + // Collect all exit conditions. + for (BasicBlock *EB : Exiting) { + auto *Term = dyn_cast(EB->getTerminator()); + if (!Term) + continue; + if (auto *CondI = dyn_cast(Term->getOperand(0))) { + ExitInstrs.insert(CondI); + } + } + // Compute the cost of all instructions only feeding the exit conditions. + for (unsigned I = 0; I != ExitInstrs.size(); ++I) { + Instruction *CondI = ExitInstrs[I]; + if (!OrigLoop->contains(CondI) || + !CostCtx.SkipCostComputation.insert(CondI).second) + continue; + Cost += CostCtx.getLegacyCost(CondI, VF); + for (Value *Op : CondI->operands()) { + auto *OpI = dyn_cast(Op); + if (!OpI || any_of(OpI->users(), [&ExitInstrs](User *U) { + return !ExitInstrs.contains(cast(U)); + })) + continue; + ExitInstrs.insert(OpI); + } + } + + // The legacy cost model has special logic to compute the cost of in-loop + // reductions, which may be smaller than the sum of all instructions involved + // in the reduction. For AnyOf reductions, VPlan codegen may remove the select + // which the legacy cost model uses to assign cost. Pre-compute their costs + // for now. + // TODO: Switch to costing based on VPlan once the logic has been ported. + for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) { + if (!CM.isInLoopReduction(RedPhi) && + !RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) + continue; + + // AnyOf reduction codegen may remove the select. To match the legacy cost + // model, pre-compute the cost for AnyOf reductions here. + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Select = cast(*find_if( + RedPhi->users(), [](User *U) { return isa(U); })); + assert(!CostCtx.SkipCostComputation.contains(Select) && + "reduction op visited multiple times"); + CostCtx.SkipCostComputation.insert(Select); + auto ReductionCost = CostCtx.getLegacyCost(Select, VF); + LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF + << ":\n any-of reduction " << *Select << "\n"); + Cost += ReductionCost; + continue; + } + + const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop); + SetVector ChainOpsAndOperands(ChainOps.begin(), + ChainOps.end()); + // Also include the operands of instructions in the chain, as the cost-model + // may mark extends as free. + for (auto *ChainOp : ChainOps) { + for (Value *Op : ChainOp->operands()) { + if (auto *I = dyn_cast(Op)) + ChainOpsAndOperands.insert(I); + } + } + + // Pre-compute the cost for I, if it has a reduction pattern cost. + for (Instruction *I : ChainOpsAndOperands) { + auto ReductionCost = CM.getReductionPatternCost( + I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput); + if (!ReductionCost) + continue; + + assert(!CostCtx.SkipCostComputation.contains(I) && + "reduction op visited multiple times"); + CostCtx.SkipCostComputation.insert(I); + LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF + << ":\n in-loop reduction " << *I << "\n"); + Cost += *ReductionCost; + } + } + + // Now compute and add the VPlan-based cost. + Cost += Plan.cost(VF, CostCtx); + LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); + return Cost; +} + +VPlan &LoopVectorizationPlanner::getBestPlan() const { + // If there is a single VPlan with a single VF, return it directly. + VPlan &FirstPlan = *VPlans[0]; + if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1) + return FirstPlan; + + VPlan *BestPlan = &FirstPlan; + ElementCount ScalarVF = ElementCount::getFixed(1); + assert(hasPlanWithVF(ScalarVF) && + "More than a single plan/VF w/o any plan having scalar VF"); + + InstructionCost ScalarCost = cost(getBestPlanFor(ScalarVF), ScalarVF); + VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost); + + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (ForceVectorization) { + // Ignore scalar width, because the user explicitly wants vectorization. + // Initialize cost to max so that VF = 2 is, at least, chosen during cost + // evaluation. + BestFactor.Cost = InstructionCost::getMax(); + } + + for (auto &P : VPlans) { + for (ElementCount VF : P->vectorFactors()) { + if (VF.isScalar()) + continue; + InstructionCost Cost = cost(*P, VF); + VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); + if (isMoreProfitable(CurrentFactor, BestFactor)) { + BestFactor = CurrentFactor; + BestPlan = &*P; + } + } + } + BestPlan->setVF(BestFactor.Width); + return *BestPlan; +} + VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { assert(count_if(VPlans, [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == @@ -10166,8 +10351,15 @@ bool LoopVectorizePass::processLoop(Loop *L) { VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + VPlan &BestPlan = LVP.getBestPlan(); + assert(size(BestPlan.vectorFactors()) == 1 && + "Plan should have a single VF"); + ElementCount Width = *BestPlan.vectorFactors().begin(); + LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width + << "\n"); + assert(VF.Width == Width && + "VPlan cost model and legacy cost model disagreed"); + LVP.executePlan(Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5070950f87c275..3ad1ae36811846 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -752,6 +752,67 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance.reset(); } +InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) { + InstructionCost Cost = 0; + for (VPRecipeBase &R : Recipes) + Cost += R.cost(VF, Ctx); + return Cost; +} + +InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { + if (!isReplicator()) { + InstructionCost Cost = 0; + for (VPBlockBase *Block : vp_depth_first_shallow(getEntry())) + Cost += Block->cost(VF, Ctx); + return Cost; + } + + // Compute the cost of a replicate region. Replicating isn't supported for + // scalable vectors, return an invalid cost for them. + // TODO: Discard scalable VPlans with replicate recipes earlier after + // construction. + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + // First compute the cost of the conditionally executed recipes, followed by + // account for the branching cost, except if the mask is a header mask or + // uniform condition. + using namespace llvm::VPlanPatternMatch; + VPBasicBlock *Then = cast(getEntry()->getSuccessors()[0]); + InstructionCost ThenCost = Then->cost(VF, Ctx); + + // Note the cost estimates below closely match the current legacy cost model. + auto *BOM = cast(&getEntryBasicBlock()->front()); + VPValue *Cond = BOM->getOperand(0); + + // Check if Cond is a header mask and don't account for branching costs as the + // header mask will always be true except in the last iteration. + if (vputils::isHeaderMask(Cond, *getPlan())) + return ThenCost; + + // For the scalar case, we may not always execute the original predicated + // block, Thus, scale the block's cost by the probability of executing it. + if (VF.isScalar()) + return ThenCost / getReciprocalPredBlockProb(); + + // Check if Cond is a uniform compare and don't account for branching costs as + // a uniform condition corresponds to a single branch per VF. + if (vputils::isUniformBoolean(Cond)) + return ThenCost; + + // Add the cost for branches around scalarized and predicated blocks. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.LLVMCtx), VF); + auto FixedVF = VF.getFixedValue(); // Known to be non scalable. + InstructionCost Cost = ThenCost; + Cost += Ctx.TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnes(FixedVF), + /*Insert*/ false, /*Extract*/ true, + CostKind); + Cost += Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) * FixedVF; + return Cost; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -941,6 +1002,12 @@ void VPlan::execute(VPTransformState *State) { "DT not preserved correctly"); } +InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) { + // For now only return the cost of the vector loop region, ignoring any other + // blocks, like the preheader or middle blocks. + return getVectorLoopRegion()->cost(VF, Ctx); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlan::printLiveIns(raw_ostream &O) const { VPSlotTracker SlotTracker(this); @@ -1483,7 +1550,8 @@ bool vputils::isHeaderMask(VPValue *V, VPlan &Plan) { auto IsWideCanonicalIV = [](VPValue *A) { return isa(A) || (isa(A) && - cast(A)->isCanonical()); + cast(A)->isCanonical()) || + match(A, m_ScalarIVSteps(m_CanonicalIV(), m_SpecificInt(1))); }; VPValue *A, *B; @@ -1495,3 +1563,17 @@ bool vputils::isHeaderMask(VPValue *V, VPlan &Plan) { return match(V, m_Binary(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount(); } + +bool vputils::isUniformBoolean(VPValue *Cond) { + if (match(Cond, m_Not(m_VPValue()))) + Cond = Cond->getDefiningRecipe()->getOperand(0); + auto *R = Cond->getDefiningRecipe(); + if (!R) + return true; + // TODO: match additional patterns preserving uniformity of booleans, e.g., + // AND/OR/etc. + return match(R, m_Binary(m_VPValue(), m_VPValue())) && + all_of(R->operands(), [](VPValue *Op) { + return vputils::isUniformAfterVectorization(Op); + }); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fc25ed907b1530..6a51023aed5016 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -42,6 +42,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" #include "llvm/IR/Operator.h" +#include "llvm/Support/InstructionCost.h" #include #include #include @@ -64,8 +65,11 @@ class VPlan; class VPReplicateRecipe; class VPlanSlp; class Value; +class LoopVectorizationCostModel; class LoopVersioning; +struct VPCostContext; + namespace Intrinsic { typedef unsigned ID; } @@ -82,6 +86,14 @@ Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, Loop *CurLoop = nullptr); +/// A helper function that returns the reciprocal of the block probability of +/// predicated blocks. If we return X, we are assuming the predicated block +/// will execute once for every X iterations of the loop header. +/// +/// TODO: We should use actual block probability here, if available. Currently, +/// we always assume predicated blocks have a 50% chance of executing. +inline unsigned getReciprocalPredBlockProb() { return 2; } + /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: /// [1, 16) = {1, 2, 4, 8} @@ -624,6 +636,9 @@ class VPBlockBase { /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(VPTransformState *State) = 0; + /// Return the cost of the block. + virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; + /// Delete all blocks reachable from a given VPBlockBase, inclusive. static void deleteCFG(VPBlockBase *Entry); @@ -707,6 +722,27 @@ class VPLiveOut : public VPUser { #endif }; +/// Struct to hold various analysis needed for cost computations. +struct VPCostContext { + const TargetTransformInfo &TTI; + VPTypeAnalysis Types; + LLVMContext &LLVMCtx; + LoopVectorizationCostModel &CM; + SmallPtrSet SkipCostComputation; + + VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, + LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM) + : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {} + + /// Return the cost for \p UI with \p VF using the legacy cost model as + /// fallback until computing the cost of all recipes migrates to VPlan. + InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const; + + /// Return true if the cost for \p UI shouldn't be computed, e.g. because it + /// has already been pre-computed. + bool skipCostComputation(Instruction *UI, bool IsVector) const; +}; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -746,6 +782,11 @@ class VPRecipeBase : public ilist_node_with_parent, /// this VPRecipe, thereby "executing" the VPlan. virtual void execute(VPTransformState &State) = 0; + /// Return the cost of this recipe, taking into account if the cost + /// computation should be skipped and the ForceTargetInstructionCost flag. + /// Also takes care of printing the cost for debugging. + virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx); + /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); @@ -806,6 +847,11 @@ class VPRecipeBase : public ilist_node_with_parent, /// Returns the debug location of the recipe. DebugLoc getDebugLoc() const { return DL; } + +protected: + /// Compute the cost of this recipe using the legacy cost model and the + /// underlying instructions. + InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const; }; // Helper macro to define common classof implementations for recipes. @@ -1381,8 +1427,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags { ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); - assert(UI.getType() == ResultTy && - "result type of underlying cast doesn't match"); } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) @@ -2096,6 +2140,8 @@ class VPInterleaveRecipe : public VPRecipeBase { "Op must be an operand of the recipe"); return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op); } + + Instruction *getInsertPos() const { return IG->getInsertPos(); } }; /// A recipe to represent inloop reduction operations, performing a reduction on @@ -2910,6 +2956,9 @@ class VPBasicBlock : public VPBlockBase { /// this VPBasicBlock, thereby "executing" the VPlan. void execute(VPTransformState *State) override; + /// Return the cost of this VPBasicBlock. + InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; + /// Return the position of the first non-phi node recipe in the block. iterator getFirstNonPhi(); @@ -3084,6 +3133,9 @@ class VPRegionBlock : public VPBlockBase { /// this VPRegionBlock, thereby "executing" the VPlan. void execute(VPTransformState *State) override; + // Return the cost of this region. + InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; + void dropAllReferences(VPValue *NewValue) override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -3203,6 +3255,9 @@ class VPlan { /// Generate the IR code for this VPlan. void execute(VPTransformState *State); + /// Return the cost of this plan. + InstructionCost cost(ElementCount VF, VPCostContext &Ctx); + VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } @@ -3246,6 +3301,11 @@ class VPlan { return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); }); } + iterator_range::iterator> + vectorFactors() const { + return {VFs.begin(), VFs.end()}; + } + bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); } bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); } @@ -3669,6 +3729,9 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { /// Return true if \p V is a header mask in \p Plan. bool isHeaderMask(VPValue *V, VPlan &Plan); +/// Return true if \p Cond is a uniform boolean. +bool isUniformBoolean(VPValue *Cond); + } // end namespace vputils } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index a3ff6395bb39ed..972d895f49d9ff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -39,6 +39,7 @@ using VectorParts = SmallVector; namespace llvm { extern cl::opt EnableVPlanNativePath; } +extern cl::opt ForceTargetInstructionCost; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -255,6 +256,40 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, insertBefore(BB, I); } +InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { + if (auto *S = dyn_cast(this)) { + auto *UI = dyn_cast_or_null(S->getUnderlyingValue()); + if (UI && Ctx.skipCostComputation(UI, VF.isVector())) + return 0; + } + + InstructionCost RecipeCost = computeCost(VF, Ctx); + if (ForceTargetInstructionCost.getNumOccurrences() > 0 && + RecipeCost.isValid()) + RecipeCost = InstructionCost(ForceTargetInstructionCost); + + LLVM_DEBUG({ + dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; + dump(); + }); + return RecipeCost; +} + +InstructionCost VPRecipeBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + // Compute the cost for the recipe falling back to the legacy cost model using + // the underlying instruction. If there is no underlying instruction, returns + // 0. + Instruction *UI = nullptr; + if (auto *S = dyn_cast(this)) + UI = dyn_cast_or_null(S->getUnderlyingValue()); + else if (auto *IG = dyn_cast(this)) + UI = IG->getInsertPos(); + else if (auto *WidenMem = dyn_cast(this)) + UI = &WidenMem->getIngredient(); + return UI ? Ctx.getLegacyCost(UI, VF) : 0; +} + FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e2b7b0c7a219df..d979ca331cb5ff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -999,6 +999,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { : Instruction::ZExt; auto *VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + // UnderlyingExt has distinct return type, used to retain legacy cost. + VPC->setUnderlyingValue(UnderlyingExt); + } VPC->insertBefore(&R); Trunc->replaceAllUsesWith(VPC); } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { @@ -1515,6 +1519,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( VPInstruction *New = Builder.createOverflowingOp( Instruction::Add, {A, B}, {false, false}, RecWithFlags->getDebugLoc()); + New->setUnderlyingValue(RecWithFlags->getUnderlyingValue()); RecWithFlags->replaceAllUsesWith(New); RecWithFlags->eraseFromParent(); CurRec = New; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 8d945f6f2b8ea8..fa6a65ff2f3ada 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -74,8 +74,7 @@ class VPValue { public: /// Return the underlying Value attached to this VPValue. - Value *getUnderlyingValue() { return UnderlyingVal; } - const Value *getUnderlyingValue() const { return UnderlyingVal; } + Value *getUnderlyingValue() const { return UnderlyingVal; } /// An enumeration for keeping track of the concrete subclass of VPValue that /// are actually instantiated. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b5aa96eb23f5e5..41879f3ebef5a5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop +; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. @@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop +; CHECK-NEXT: VF picked by VPlan cost model: vscale x 4 ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop.