llvm · fhahn · Jun 13, 2024 · Sep 27, 2023 · May 9, 2024 · May 9, 2024
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -344,6 +344,15 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  ///
+  /// The current implementation requires access to the legacy cost model which
+  /// is why it is kept separate from the VPlan-only cost infrastructure.
+  ///
+  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
-  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// TODO: Move to VPlan::computeCost once the use of Legal
-  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// TODO: Move to VPlan::computeCost once the use of Legal
+  /// has been retired.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -365,6 +374,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  VPlan &getBestPlan() const;
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -289,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
     cl::desc("A flag that overrides the target's max interleave factor for "
              "vectorized loops."));
 
-static cl::opt<unsigned> ForceTargetInstructionCost(
+cl::opt<unsigned> ForceTargetInstructionCost(
     "force-target-instruction-cost", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's expected cost for "
              "an instruction to a single constant value. Mostly "
@@ -1621,6 +1622,16 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
+  /// Return the cost of instructions in an inloop reduction pattern, if \p I
+  /// is part of that pattern.
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
+  /// Return the cost of instructions in an inloop reduction pattern, if \p I
+  /// is part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind) const;
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1646,21 +1657,11 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -7396,6 +7397,122 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+/// Returns the cost that the legacy cost model (CM) reports for \p UI at
+/// vectorization factor \p VF. Only the first member of the pair returned by
+/// CM.getInstructionCost is forwarded.
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) {
+  const auto LegacyResult = CM.getInstructionCost(UI, VF);
+  return LegacyResult.first;
+}
+
+/// Returns true if \p UI is contained in CM.VecValuesToIgnore or in this
+/// context's SkipCostComputation.
+bool VPCostContext::skipCostComputation(Instruction *UI) const {
+  if (CM.VecValuesToIgnore.contains(UI))
+    return true;
+  return SkipCostComputation.contains(UI);
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
-InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
-InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
+                                                      ElementCount VF) const {
+  InstructionCost Cost = 0;
+  LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated. To match the legacy results
+  // during initial VPlan cost model bring-up, directly use the induction
+  // costs from the legacy cost model and skip induction bump recipes. Note
+  // that we do this as pre-processing; the VPlan may not have any recipes
+  // associated with the original induction increment instruction.
+  // TODO: Switch to more accurate costing based on VPlan.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
+      IVInc->dump();
+    });
+    Cost += InductionCost;
+    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
+           "Same IV increment for multiple inductions?");
+    CostCtx.SkipCostComputation.insert(IVInc);
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << InductionCost << " for VF " << VF
-             << ":\n induction increment " << *IVInc << "\n";
-      IVInc->dump();
-    });
-    Cost += InductionCost;
-    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
-           "Same IV increment for multiple inductions?");
-    CostCtx.SkipCostComputation.insert(IVInc);
+    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
+           "Same IV increment for multiple inductions?");
+    CostCtx.SkipCostComputation.insert(IVInc);
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
+      IVInc->dump();
+    });
+    Cost += InductionCost;
-    LLVM_DEBUG({
-      dbgs() << "Cost of " << InductionCost << " for VF " << VF
-             << ":\n induction increment " << *IVInc << "\n";
-      IVInc->dump();
-    });
-    Cost += InductionCost;
-    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
-           "Same IV increment for multiple inductions?");
-    CostCtx.SkipCostComputation.insert(IVInc);
+    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
+           "Same IV increment for multiple inductions?");
+    CostCtx.SkipCostComputation.insert(IVInc);
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
+      IVInc->dump();
+    });
+    Cost += InductionCost;
+  }
+
+  // The legacy cost model has special logic to compute the cost of in-loop
+  // reductions, which may be smaller than the sum of all instructions involved
+  // in the reduction. Pre-compute the cost for now.
+  // TODO: Switch to costing based on VPlan once the logic has been ported.
+  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+    if (!CM.isInLoopReduction(RedPhi))
+      continue;
+
+    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    SetVector<Instruction *> ReductionOperations(ChainOps.begin(),
-    SetVector<Instruction *> ReductionOperations(ChainOps.begin(),
+    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
-    SetVector<Instruction *> ReductionOperations(ChainOps.begin(),
+    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
+                                                 ChainOps.end());
+    // Also include the operands of instructions in the chain, as the cost-model
+    // may mark extends as free.
+    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
+      for (Value *Op : ReductionOperations[I]->operands()) {
+        if (auto *I = dyn_cast<Instruction>(Op))
+          ReductionOperations.insert(I);
+      }
+    }
+    for (Instruction *I : ReductionOperations) {
+      auto ReductionCost = CM.getReductionPatternCost(
+          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+      if (!ReductionCost)
+        continue;
+
+      assert(!CostCtx.SkipCostComputation.contains(I) &&
+             "reduction op visited multiple times");
+      CostCtx.SkipCostComputation.insert(I);
+      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+                        << ":\n in-loop reduction " << *I << "\n");
+      Cost += *ReductionCost;
+    }
+  }
+
-
+  // Now compute and add the VPlan-based cost.
-
+  // Now compute and add the VPlan-based cost.
+  Cost += Plan.computeCost(VF, CostCtx);
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
+  // If there is a single VPlan with a single VF, return it directly.
+  VPlan &FirstPlan = *VPlans[0];
+  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+    return FirstPlan;
+
+  VPlan *BestPlan = &FirstPlan;
+  ElementCount ScalarVF = ElementCount::getFixed(1);
+  assert(hasPlanWithVF(ScalarVF) &&
+         "More than a single plan/VF w/o any plan having scalar VF");
+
+  InstructionCost ScalarCost =
+      computeCost(getBestPlanFor(ElementCount::getFixed(1)), ScalarVF);
-      computeCost(getBestPlanFor(ElementCount::getFixed(1)), ScalarVF);
+      computeCost(getBestPlanFor(ScalarVF), ScalarVF);
-      computeCost(getBestPlanFor(ElementCount::getFixed(1)), ScalarVF);
+      computeCost(getBestPlanFor(ScalarVF), ScalarVF);
+  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestFactor.Cost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+        BestFactor = CurrentFactor;
+        BestPlan = &*P;
+      }
+    }
+  }
+  BestPlan->setVF(BestFactor.Width);
+  return *BestPlan;
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10253,8 +10370,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        VPlan &BestPlan = LVP.getBestPlan();
+        assert(size(BestPlan.vectorFactors()) == 1 &&
+               "Plan should have a single VF");
+        ElementCount Width = *BestPlan.vectorFactors().begin();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -52,6 +52,7 @@ using namespace llvm::VPlanPatternMatch;
 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
+extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 #define DEBUG_TYPE "vplan"
 
@@ -730,6 +731,89 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
+/// Returns the cost of recipe \p R for vectorization factor \p VF.
+///
+/// Yields 0 when \p R is a VPSingleDefRecipe whose underlying value is an
+/// Instruction that \p Ctx asks to skip. Otherwise the recipe's own cost is
+/// used; it is replaced by the value of ForceTargetInstructionCost when that
+/// option has at least one occurrence and the computed cost is valid.
+static InstructionCost computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                                            VPCostContext &Ctx) {
+  if (auto *SingleDef = dyn_cast<VPSingleDefRecipe>(R)) {
+    auto *Underlying =
+        dyn_cast_or_null<Instruction>(SingleDef->getUnderlyingValue());
+    const bool Skip = Underlying && Ctx.skipCostComputation(Underlying);
+    if (Skip)
+      return 0;
+  }
+
+  InstructionCost Result = R->computeCost(VF, Ctx);
+
+  // Only valid costs are overridden, so invalid costs stay invalid.
+  const bool CostIsForced = ForceTargetInstructionCost.getNumOccurrences() > 0;
+  if (CostIsForced && Result.isValid())
+    Result = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << Result << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return Result;
+}
+
+InstructionCost VPBasicBlock::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  for (VPRecipeBase &R : *this)
-  for (VPRecipeBase &R : *this)
+  for (VPRecipeBase &R : Recipes)
-  for (VPRecipeBase &R : *this)
+  for (VPRecipeBase &R : Recipes)
+    Cost += computeCostForRecipe(&R, VF, Ctx);
+  return Cost;
+}
+
+InstructionCost VPRegionBlock::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  if (!isReplicator()) {
-  InstructionCost Cost = 0;
-  if (!isReplicator()) {
+  if (!isReplicator()) {
+    InstructionCost Cost = 0;
-  InstructionCost Cost = 0;
-  if (!isReplicator()) {
+  if (!isReplicator()) {
+    InstructionCost Cost = 0;
+    for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
+      Cost += Block->computeCost(VF, Ctx);
+    return Cost;
+  }
+
+  // Compute the cost of a replicate region. Replicating isn't supported for
+  // scalable vectors, return an invalid cost for them.
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+
+  // First compute the cost of the conditionally executed recipes, then
+  // account for the branching cost, except if the mask is a header mask or
+  // uniform condition.
+  using namespace llvm::VPlanPatternMatch;
+  VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    Cost += computeCostForRecipe(&R, VF, Ctx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare or a header mask and don't account for
+  // branching costs. A uniform condition corresponds to a single branch per
+  // VF, and the header mask will always be true except in the last iteration.
+  VPValue *Op;
+  bool IsHeaderMaskOrUniformCond =
+      vputils::isUniformBoolean(Cond) || isa<VPActiveLaneMaskPHIRecipe>(Cond) ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+       Op == getPlan()->getOrCreateBackedgeTakenCount());
+  if (IsHeaderMaskOrUniformCond)
+    return Cost;
-    return Cost;
+    return ThenCost;
-    return Cost;
+    return ThenCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return Cost / 2;
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.Ctx), VF);
+  return Cost +
+         Ctx.TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -900,6 +984,13 @@ void VPlan::execute(VPTransformState *State) {
   }
 }
 
+/// Returns the sum of the costs, for vectorization factor \p VF, of every
+/// block visited by a shallow depth-first traversal from the plan's entry.
+InstructionCost VPlan::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost Total = 0;
+  auto TopLevelBlocks = vp_depth_first_shallow(getEntry());
+  for (VPBlockBase *VPB : TopLevelBlocks)
+    Total += VPB->computeCost(VF, Ctx);
+  return Total;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPlan::printLiveIns(raw_ostream &O) const {
   VPSlotTracker SlotTracker(this);
@@ -1472,3 +1563,15 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
 }
+
+/// Returns true if \p Cond (looking through a single m_Not) either has no
+/// defining recipe, or is defined by an ICmp whose operands are all uniform
+/// after vectorization. Returns false otherwise.
+bool vputils::isUniformBoolean(VPValue *Cond) {
+  // Look through a negation and inspect the negated value instead.
+  if (match(Cond, m_Not(m_VPValue())))
+    Cond = Cond->getDefiningRecipe()->getOperand(0);
+
+  // A value without a defining recipe is treated as uniform.
+  auto *Def = Cond->getDefiningRecipe();
+  if (!Def)
+    return true;
+
+  if (!match(Def, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+    return false;
+
+  for (VPValue *Operand : Def->operands())
+    if (!vputils::isUniformAfterVectorization(Operand))
+      return false;
+  return true;
+}