[VPlan] Introduce recipes for VP loads and stores.
Introduce new subclasses of VPWidenMemoryRecipe for VP
(vector-predicated) loads and stores to address multiple TODOs from
llvm#76172.

Note that the introduction of the new recipes also improves code-gen for
VP gather/scatters by removing the redundant header mask. With the new
approach, it is not sufficient to look at users of the widened canonical
IV to find all uses of the header mask.

In some cases, a widened IV is used instead of separately widening the
canonical IV. To handle those cases, iterate over all recipes in the
vector loop region to make sure all widened memory recipes are
processed.

Depends on llvm#87411.
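
To illustrate the last point, the replacement boils down to swapping each plain widened memory recipe for its VP counterpart once the EVL value is available. The following is a minimal sketch, not code from this commit: the collection of MemoryRecipes, the helper name, and the enclosing transform (presumably in VPlanTransforms.cpp, which is not rendered on this page) are assumptions; the constructors and accessors match the classes added to VPlan.h below, and the usual LoopVectorize.cpp headers are assumed to be in scope.

// Hedged sketch only: swap widened memory recipes for their VP variants.
static void replaceMemoryRecipesWithVPVariants(ArrayRef<VPRecipeBase *> MemoryRecipes,
                                               VPValue &EVL) {
  for (VPRecipeBase *R : MemoryRecipes) {
    if (auto *L = dyn_cast<VPWidenLoadRecipe>(R)) {
      // Re-create the load as a VP recipe that takes the EVL explicitly.
      auto *N = new VPWidenVPLoadRecipe(cast<LoadInst>(L->getIngredient()),
                                        L->getAddr(), &EVL, L->getMask(),
                                        L->isConsecutive(), L->getDebugLoc());
      N->insertBefore(L);
      L->replaceAllUsesWith(N); // Load recipes define a VPValue.
      L->eraseFromParent();
    } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(R)) {
      auto *N = new VPWidenVPStoreRecipe(cast<StoreInst>(S->getIngredient()),
                                         S->getStoredValue(), S->getAddr(),
                                         &EVL, S->getMask(), S->isConsecutive(),
                                         S->getDebugLoc());
      N->insertBefore(S);
      S->eraseFromParent(); // Store recipes define no values.
    }
  }
}

Because a widened IV may be used instead of the widened canonical IV, the collection step walks all recipes in the vector loop region rather than only the users of the header mask, as explained above.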
fhahn committed Apr 5, 2024
1 parent 2ec0e32 commit 6dcd584
Showing 8 changed files with 260 additions and 120 deletions.
172 changes: 89 additions & 83 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9417,52 +9417,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
}

/// Creates either vp_store or vp_scatter intrinsics calls to represent
/// predicated store/scatter.
static Instruction *
lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
Value *StoredVal, bool IsScatter, Value *Mask,
Value *EVL, const Align &Alignment) {
CallInst *Call;
if (IsScatter) {
Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
Call = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Store, Type::getVoidTy(EVL->getContext()),
{StoredVal, Addr}));
}
Call->addParamAttr(
1, Attribute::getWithAlignment(Call->getContext(), Alignment));
return Call;
}

/// Creates either vp_load or vp_gather intrinsics calls to represent
/// predicated load/gather.
static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
VectorType *DataTy,
Value *Addr, bool IsGather,
Value *Mask, Value *EVL,
const Align &Alignment) {
CallInst *Call;
if (IsGather) {
Call =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
Call = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Load, DataTy, Addr, "vp.op.load"));
}
Call->addParamAttr(
0, Attribute::getWithAlignment(Call->getContext(), Alignment));
return Call;
}

void VPWidenLoadRecipe::execute(VPTransformState &State) {
// Attempt to issue a wide load.
auto *LI = cast<LoadInst>(&Ingredient);
@@ -9491,25 +9445,7 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *NewLI;
// TODO: split this into several classes for better design.
if (State.EVL) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
VPInstruction::ExplicitVectorLength &&
"EVL must be VPInstruction::ExplicitVectorLength.");
Value *EVL = State.get(State.EVL, VPIteration(0, 0));
// If EVL is not nullptr, then EVL must be a valid value set during plan
// creation, possibly default value = whole vector register length. EVL
// is created only if TTI prefers predicated vectorization, thus if EVL
// is not nullptr it also implies preference for predicated
// vectorization.
// FIXME: Support reverse loading after vp_reverse is added.
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
NewLI = lowerLoadUsingVectorIntrinsics(
Builder, DataTy, State.get(getAddr(), Part, !CreateGather),
CreateGather, MaskPart, EVL, Alignment);
} else if (CreateGather) {
if (CreateGather) {
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
@@ -9535,6 +9471,51 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) {
}
}

void VPWidenVPLoadRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse loading after vp_reverse is added.
assert(!isReverse() && "Reverse loads are not implemented yet.");

// Attempt to issue a wide load.
auto *LI = cast<LoadInst>(&Ingredient);

Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
bool CreateGather = !isConsecutive();

auto &Builder = State.Builder;
// Handle loads.
assert(LI && "Must have a load instruction");
State.setDebugLocFrom(getDebugLoc());
for (unsigned Part = 0; Part < State.UF; ++Part) {
CallInst *NewLI;
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
Value *Addr = State.get(getAddr(), Part, !CreateGather);
Value *Mask = getMask()
                  ? State.get(getMask(), Part)
                  : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
if (CreateGather) {
NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather,
{Addr, Mask, EVL}, nullptr,
"wide.masked.gather");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Load, DataTy, Addr, "vp.op.load"));
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));

// Add metadata to the load.
State.addMetadata(NewLI, LI);
State.set(this, NewLI, Part);
}
}

void VPWidenStoreRecipe::execute(VPTransformState &State) {
auto *SI = cast<StoreInst>(&Ingredient);

@@ -9562,24 +9543,7 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
Instruction *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
// TODO: split this into several classes for better design.
if (State.EVL) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
assert(cast<VPInstruction>(State.EVL)->getOpcode() ==
VPInstruction::ExplicitVectorLength &&
"EVL must be VPInstruction::ExplicitVectorLength.");
Value *EVL = State.get(State.EVL, VPIteration(0, 0));
// If EVL is not nullptr, then EVL must be a valid value set during plan
// creation, possibly default value = whole vector register length. EVL
// is created only if TTI prefers predicated vectorization, thus if EVL
// is not nullptr it also implies preference for predicated
// vectorization.
// FIXME: Support reverse store after vp_reverse is added.
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
NewSI = lowerStoreUsingVectorIntrinsics(
Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal,
CreateScatter, MaskPart, EVL, Alignment);
} else if (CreateScatter) {
if (CreateScatter) {
Value *MaskPart = IsMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
@@ -9603,6 +9567,48 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) {
}
}

void VPWidenVPStoreRecipe::execute(VPTransformState &State) {
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
"explicit vector length.");
// FIXME: Support reverse stores after vp_reverse is added.
assert(!isReverse() && "Reverse stores are not implemented yet.");

auto *SI = cast<StoreInst>(&Ingredient);

VPValue *StoredValue = getStoredValue();
bool CreateScatter = !isConsecutive();
const Align Alignment = getLoadStoreAlignment(&Ingredient);

auto &Builder = State.Builder;
State.setDebugLocFrom(getDebugLoc());

for (unsigned Part = 0; Part < State.UF; ++Part) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue, Part);
Value *EVL = State.get(getEVL(), VPIteration(0, 0));
// FIXME: Support reverse store after vp_reverse is added.
Value *Mask = getMask()
                  ? State.get(getMask(), Part)
                  : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *Addr = State.get(getAddr(), Part, !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
Instruction::Store, Type::getVoidTy(EVL->getContext()),
{StoredVal, Addr}));
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));

State.addMetadata(NewSI, SI);
}
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
114 changes: 103 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,15 +242,6 @@ struct VPTransformState {
ElementCount VF;
unsigned UF;

/// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid
/// value set during plan transformation, possibly a default value = whole
/// vector register length. EVL is created only if TTI prefers predicated
/// vectorization, thus if EVL is not nullptr it also implies preference for
/// predicated vectorization.
/// TODO: this is a temporarily solution, the EVL must be explicitly used by
/// the recipes and must be removed here.
VPValue *EVL = nullptr;

/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
/// instructions.
@@ -2304,8 +2295,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPRecipeBase *clone() override = 0;

static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPDef::VPWidenLoadSC ||
R->getVPDefID() == VPDef::VPWidenStoreSC;
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenVPLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenVPStoreSC;
}

static inline bool classof(const VPUser *U) {
@@ -2320,6 +2313,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
return getNumOperands() == 2;
case VPDef::VPWidenStoreSC:
return getNumOperands() == 3;
case VPDef::VPWidenVPLoadSC:
return getNumOperands() == 3;
case VPDef::VPWidenVPStoreSC:
return getNumOperands() == 4;
default:
llvm_unreachable("unhandled recipe");
}
@@ -2329,8 +2326,10 @@ class VPWidenMemoryRecipe : public VPRecipeBase {
VPValue *getAddr() const {
switch (getVPDefID()) {
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenVPLoadSC:
return getOperand(0);
case VPDef::VPWidenStoreSC:
case VPDef::VPWidenVPStoreSC:
return getOperand(1);
default:
llvm_unreachable("unhandled recipe");
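
For reference, the operand layouts implied by the operand-count check and getAddr() above, derived from the constructors added further down (a reading aid, not text from the patch):

// VPWidenLoadRecipe    : {Addr[, Mask]}                 -> mask present iff 2 operands
// VPWidenStoreRecipe   : {StoredVal, Addr[, Mask]}      -> mask present iff 3 operands
// VPWidenVPLoadRecipe  : {Addr, EVL[, Mask]}            -> mask present iff 3 operands
// VPWidenVPStoreRecipe : {StoredVal, Addr, EVL[, Mask]} -> mask present iff 4 operands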
@@ -2392,7 +2391,51 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Widened, consecutive memory operations only demand the first lane of
// their address, unless the same operand is also stored. The latter can
// happen with opaque pointers.
return Op == getAddr() && isConsecutive();
}
};

/// A recipe for widening load operations with vector-predication intrinsics,
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenVPLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
VPWidenVPLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *EVL,
VPValue *Mask, bool IsConsecutive, DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenVPLoadSC, Load, {Addr, EVL},
IsConsecutive, false, DL),
VPValue(this, &Load) {
setMask(Mask);
}

VPRecipeBase *clone() override {
return new VPWidenVPLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
getEVL(), getMask(), isConsecutive(),
getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenVPLoadSC)

/// Return the EVL operand.
VPValue *getEVL() const { return getOperand(1); }

/// Generate the wide load or gather.
void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL())
return true;
// Widened, consecutive load operations only demand the first lane of
// their address.
return Op == getAddr() && isConsecutive();
@@ -2439,6 +2482,55 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};

/// A recipe for widening store operations with vector-predication intrinsics,
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenVPStoreRecipe final : public VPWidenMemoryRecipe {
VPWidenVPStoreRecipe(StoreInst &Store, VPValue *StoredVal, VPValue *Addr,
VPValue *EVL, VPValue *Mask, bool IsConsecutive,
DebugLoc DL)
: VPWidenMemoryRecipe(VPDef::VPWidenVPStoreSC, Store,
{StoredVal, Addr, EVL}, IsConsecutive, false, DL) {
setMask(Mask);
}

VPRecipeBase *clone() override {
return new VPWidenVPStoreRecipe(cast<StoreInst>(Ingredient),
getStoredValue(), getAddr(), getEVL(),
getMask(), isConsecutive(), getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenVPStoreSC)

/// Return the value stored by this recipe.
VPValue *getStoredValue() const { return getOperand(0); }

/// Return the EVL operand.
VPValue *getEVL() const { return getOperand(2); }

/// Generate the wide store or scatter.
void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
if (Op == getEVL())
return true;
// Widened, consecutive memory operations only demand the first lane of
// their address, unless the same operand is also stored. The latter can
// happen with opaque pointers.
return Op == getAddr() && isConsecutive() && Op != getStoredValue();
}
};

/// Recipe to expand a SCEV expression.
class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *Expr;
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -109,7 +109,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert(isa<VPWidenLoadRecipe>(R) &&
assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenVPLoadRecipe>(R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
[Diffs for the remaining changed files were not loaded in this view.]