[VPlan] Introduce recipes for VP loads and stores.
Introduce new subclasses of VPWidenMemoryRecipe for VP
(vector-predicated) loads and stores to address multiple TODOs from
llvm#76172.

Note that the introduction of the new recipes also improves code-gen for
VP gather/scatters by removing the redundant header mask. With the new
approach, it is not sufficient to look at users of the widened canonical
IV to find all uses of the header mask.

In some cases, a widened IV is used instead of separately widening the
canonical IV. To handle those cases, iterate over all recipes in the
vector loop region to make sure all widened memory recipes are
processed.

Depends on llvm#87411.
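
To illustrate the last point, the replacement boils down to swapping each plain widened memory recipe for its VP counterpart once the EVL value is available. The following is a minimal sketch, not code from this commit: the collection of MemoryRecipes, the helper name, and the enclosing transform (presumably in VPlanTransforms.cpp, which is not rendered on this page) are assumptions; the constructors and accessors match the classes added to VPlan.h below, and the usual LoopVectorize.cpp headers are assumed to be in scope.

// Hedged sketch only: swap widened memory recipes for their VP variants.
static void replaceMemoryRecipesWithVPVariants(ArrayRef<VPRecipeBase *> MemoryRecipes,
                                               VPValue &EVL) {
  for (VPRecipeBase *R : MemoryRecipes) {
    if (auto *L = dyn_cast<VPWidenLoadRecipe>(R)) {
      // Re-create the load as a VP recipe that takes the EVL explicitly.
      auto *N = new VPWidenVPLoadRecipe(cast<LoadInst>(L->getIngredient()),
                                        L->getAddr(), &EVL, L->getMask(),
                                        L->isConsecutive(), L->getDebugLoc());
      N->insertBefore(L);
      L->replaceAllUsesWith(N); // Load recipes define a VPValue.
      L->eraseFromParent();
    } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(R)) {
      auto *N = new VPWidenVPStoreRecipe(cast<StoreInst>(S->getIngredient()),
                                         S->getStoredValue(), S->getAddr(),
                                         &EVL, S->getMask(), S->isConsecutive(),
                                         S->getDebugLoc());
      N->insertBefore(S);
      S->eraseFromParent(); // Store recipes define no values.
    }
  }
}

Because a widened IV may be used instead of the widened canonical IV, the collection step walks all recipes in the vector loop region rather than only the users of the header mask, as explained above.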
fhahn committed Apr 5, 2024
1 parent 2ec0e32 commit 6dcd584
Showing 8 changed files with 260 additions and 120 deletions.
172 changes: 89 additions & 83 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9417,52 +9417,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}

/// Creates either vp_store or vp_scatter intrinsics calls to represent
/// predicated store/scatter.
static Instruction *
lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
Value *StoredVal, bool IsScatter, Value *Mask,
Value *EVL, const Align &Alignment) {
CallInst *Call;
if (IsScatter) {
Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
Call = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Store, Type::getVoidTy(EVL->getContext()),
{StoredVal, Addr}));
}
Call->addParamAttr(
1, Attribute::getWithAlignment(Call->getContext(), Alignment));
return Call;
}

/// Creates either vp_load or vp_gather intrinsics calls to represent
/// predicated load/gather.
static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
VectorType *DataTy,
Value *Addr, bool IsGather,
Value *Mask, Value *EVL,
const Align &Alignment) {
CallInst *Call;
if (IsGather) {
Call =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
Call = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Load, DataTy, Addr, "vp.op.load"));
}
Call->addParamAttr(
0, Attribute::getWithAlignment(Call->getContext(), Alignment));
return Call;
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
// Attempt to issue a wide load.
auto *LI = cast<LoadInst>(&Ingredient);
@@ -9491,25 +9445,7 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
// TODO: split this into several classes for better design.
if (State.EVL) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
VPInstruction::ExplicitVectorLength &&
"EVL must be VPInstruction::ExplicitVectorLength.");
Value *EVL = State.get(State.EVL, VPIteration(0, 0));
// If EVL is not nullptr, then EVL must be a valid value set during plan
// creation, possibly default value = whole vector register length. EVL
// is created only if TTI prefers predicated vectorization, thus if EVL
// is not nullptr it also implies preference for predicated
// vectorization.
// FIXME: Support reverse loading after vp_reverse is added.
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
NewLI = lowerLoadUsingVectorIntrinsics(
Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
CreateGather, MaskPart, EVL, Alignment);
} else if (CreateGather) {
if (CreateGather) {
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
@@ -9535,6 +9471,51 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
}
}

void VPWidenVPLoadRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse loading after vp_reverse is added.
assert(!isReverse() && "Reverse loads are not implemented yet.");

// Attempt to issue a wide load.
auto *LI = cast<LoadInst>(&Ingredient);

Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
bool CreateGather = !isConsecutive();

auto &Builder = State.Builder;
// Handle loads.
assert(LI && "Must have a load instruction");
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
CallInst *NewLI;
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
Value *Addr = State.get(getAddr(), Part, !CreateGather);
Value *Mask = getMask()
                  ? State.get(getMask(), Part)
                  : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
if (CreateGather) {
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
{Addr, Mask, EVL}, nullptr,
"wide.masked.gather");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Load, DataTy, Addr, "vp.op.load"));
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));

// Add metadata to the load.
State.addMetadata(NewLI, LI);
State.set(this, NewLI, Part);
}
}

void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);

@@ -9562,24 +9543,7 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
Instruction *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
// TODO: split this into several classes for better design.
if (State.EVL) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
VPInstruction::ExplicitVectorLength &&
"EVL must be VPInstruction::ExplicitVectorLength.");
Value *EVL = State.get(State.EVL, VPIteration(0, 0));
// If EVL is not nullptr, then EVL must be a valid value set during plan
// creation, possibly default value = whole vector register length. EVL
// is created only if TTI prefers predicated vectorization, thus if EVL
// is not nullptr it also implies preference for predicated
// vectorization.
// FIXME: Support reverse store after vp_reverse is added.
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
NewSI = lowerStoreUsingVectorIntrinsics(
Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
CreateScatter, MaskPart, EVL, Alignment);
} else if (CreateScatter) {
if (CreateScatter) {
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9603,6 +9567,48 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
}

void VPWidenVPStoreRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse stores after vp_reverse is added.
assert(!isReverse() && "Reverse stores are not implemented yet.");

auto *SI = cast<StoreInst>(&Ingredient);

VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);

auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());

for (unsigned Part = 0; Part < State.UF; ++Part) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
// FIXME: Support reverse store after vp_reverse is added.
Value *Mask = getMask()
                  ? State.get(getMask(), Part)
                  : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *Addr = State.get(getAddr(), Part, !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Store, Type::getVoidTy(EVL->getContext()),
{StoredVal, Addr}));
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));

State.addMetadata(NewSI, SI);
}
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
114 changes: 103 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,15 +242,6 @@ struct VPTransformState {
ElementCount VF;
unsigned UF;

/// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
/// value set during plan transformation, possibly a default value = whole
/// vector register length. EVL is created only if TTI prefers predicated
/// vectorization, thus if EVL is not nullptr it also implies preference for
/// predicated vectorization.
/// TODO: this is a temporarily solution, the EVL must be explicitly used by
/// the recipes and must be removed here.
VPValue *EVL = nullptr;

/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
@@ -2304,8 +2295,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPRecipeBase *clone() override = 0;

static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPDef::VPWidenLoadSC ||
R->getVPDefID() == VPDef::VPWidenStoreSC;
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenVPLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenVPStoreSC;
}

static inline bool classof(const VPUser *U) {
@@ -2320,6 +2313,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
return getNumOperands() == 2;
case VPDef::VPWidenStoreSC:
return getNumOperands() == 3;
case VPDef::VPWidenVPLoadSC:
return getNumOperands() == 3;
case VPDef::VPWidenVPStoreSC:
return getNumOperands() == 4;
default:
llvm_unreachable("unhandled recipe");
}
@@ -2329,8 +2326,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPValue *getAddr() const {
switch (getVPDefID()) {
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenVPLoadSC:
return getOperand(0);
case VPDef::VPWidenStoreSC:
case VPDef::VPWidenVPStoreSC:
return getOperand(1);
default:
llvm_unreachable("unhandled recipe");
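
For reference, the operand layouts implied by the operand-count check and getAddr() above, derived from the constructors added further down (a reading aid, not text from the patch):

// VPWidenLoadRecipe    : {Addr[, Mask]}                 -> mask present iff 2 operands
// VPWidenStoreRecipe   : {StoredVal, Addr[, Mask]}      -> mask present iff 3 operands
// VPWidenVPLoadRecipe  : {Addr, EVL[, Mask]}            -> mask present iff 3 operands
// VPWidenVPStoreRecipe : {StoredVal, Addr, EVL[, Mask]} -> mask present iff 4 operands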
@@ -2392,7 +2391,51 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive memory operations only demand the first lane of
// their address, unless the same operand is also stored. The latter can
// happen with opaque pointers.
return Op == getAddr() && isConsecutive();
}
};

/// A recipe for widening load operations with vector-predication intrinsics,
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenVPLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenVPLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *EVL,
VPValue *Mask, bool IsConsecutive, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenVPLoadSC, Load, {Addr, EVL},
IsConsecutive, false, DL),
VPValue(this, &Load) {
setMask(Mask);
}

VPRecipeBase *clone() override {
return new VPWidenVPLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
getEVL(), getMask(), isConsecutive(),
getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenVPLoadSC)

/// Return the EVL operand.
VPValue *getEVL() const { return getOperand(1); }

/// Generate the wide load or gather.
void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL())
return true;
// Widened, consecutive load operations only demand the first lane of
// their address.
return Op == getAddr() && isConsecutive();
@@ -2439,6 +2482,55 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};

/// A recipe for widening store operations with vector-predication intrinsics,
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenVPStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenVPStoreRecipe(StoreInst &Store, VPValue *StoredVal, VPValue *Addr,
VPValue *EVL, VPValue *Mask, bool IsConsecutive,
DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenVPStoreSC, Store,
{StoredVal, Addr, EVL}, IsConsecutive, false, DL) {
setMask(Mask);
}

VPRecipeBase *clone() override {
return new VPWidenVPStoreRecipe(cast<StoreInst>(Ingredient),
getStoredValue(), getAddr(), getEVL(),
getMask(), isConsecutive(), getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenVPStoreSC)

/// Return the value stored by this recipe.
VPValue *getStoredValue() const { return getOperand(0); }

/// Return the EVL operand.
VPValue *getEVL() const { return getOperand(2); }

/// Generate the wide store or scatter.
void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL())
return true;
// Widened, consecutive memory operations only demand the first lane of
// their address, unless the same operand is also stored. The latter can
// happen with opaque pointers.
return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};

/// Recipe to expand a SCEV expression.
class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *Expr;
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert(isa<VPWidenLoadRecipe>(R) &&
assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenVPLoadRecipe>(R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
[Diffs for the remaining changed files were not loaded in this view.]