From c2d15ac4d4432788557e77c15ce572ac655a8fec Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 18 Jan 2025 13:22:34 +0000
Subject: [PATCH] [VPlan] Update final IV exit value via VPlan. (#112147)

Model updating IV users directly in VPlan, replacing fixupIVUsers. Now
simple extracts are created for all phis in the exit block during
initial VPlan construction. A later VPlan transform
(optimizeInductionExitUsers) replaces extracts of inductions with their
pre-computed values if possible.

This completes the transition towards modeling all live-outs directly
in VPlan.

There are a few follow-ups:
* emit extracts initially also for resume phis, and optimize them
  together with IV exit users
* support for VPlans with multiple exits in optimizeInductionExitUsers.

Depends on https://github.com/llvm/llvm-project/pull/110004,
https://github.com/llvm/llvm-project/pull/109975 and
https://github.com/llvm/llvm-project/pull/112145.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 214 ++----------------
 llvm/lib/Transforms/Vectorize/VPlan.h         |   6 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 126 +++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   7 +
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    |   4 +-
 .../RISCV/riscv-vector-reverse.ll             |   2 -
 .../LoopVectorize/X86/multi-exit-cost.ll      |  14 --
 .../LoopVectorize/iv_outside_user.ll          | 120 ++++++++++
 .../single_early_exit_live_outs.ll            |   5 +-
 .../unused-blend-mask-for-first-operand.ll    |   6 +-
 10 files changed, 288 insertions(+), 216 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d79d9e8445b3..34c5bc3312ae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -543,11 +543,6 @@ class InnerLoopVectorizer {
 protected:
   friend class LoopVectorizationPlanner;
 
-  /// Set up the values of the IVs correctly when exiting the vector loop.
-  virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
-                            Value *VectorTripCount, BasicBlock *MiddleBlock,
-                            VPTransformState &State);
-
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
@@ -785,10 +780,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
   void printDebugTracesAtStart() override;
   void printDebugTracesAtEnd() override;
-
-  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
-                    Value *VectorTripCount, BasicBlock *MiddleBlock,
-                    VPTransformState &State) override {};
 };
 
 // A specialized derived class of inner loop vectorizer that performs
@@ -2782,97 +2773,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
   return LoopVectorPreHeader;
 }
 
-// Fix up external users of the induction variable. At this point, we are
-// in LCSSA form, with all external PHIs that use the IV having one input value,
-// coming from the remainder loop. We need those PHIs to also have a correct
-// value for the IV when arriving directly from the middle block.
-void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
-                                       const InductionDescriptor &II,
-                                       Value *VectorTripCount,
-                                       BasicBlock *MiddleBlock,
-                                       VPTransformState &State) {
-  // There are two kinds of external IV usages - those that use the value
-  // computed in the last iteration (the PHI) and those that use the penultimate
-  // value (the value that feeds into the phi from the loop latch).
- // We allow both, but they, obviously, have different values. - - DenseMap MissingVals; - - Value *EndValue = cast(OrigPhi->getIncomingValueForBlock( - OrigLoop->getLoopPreheader())) - ->getIncomingValueForBlock(MiddleBlock); - - // An external user of the last iteration's value should see the value that - // the remainder loop uses to initialize its own IV. - Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); - for (User *U : PostInc->users()) { - Instruction *UI = cast(U); - if (!OrigLoop->contains(UI)) { - assert(isa(UI) && "Expected LCSSA form"); - MissingVals[UI] = EndValue; - } - } - - // An external user of the penultimate value need to see EndValue - Step. - // The simplest way to get this is to recompute it from the constituent SCEVs, - // that is Start + (Step * (CRD - 1)). - for (User *U : OrigPhi->users()) { - auto *UI = cast(U); - if (!OrigLoop->contains(UI)) { - assert(isa(UI) && "Expected LCSSA form"); - IRBuilder<> B(MiddleBlock->getTerminator()); - - // Fast-math-flags propagate from the original induction instruction. - if (isa_and_nonnull(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - - VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); - assert(StepVPV && "step must have been expanded during VPlan execution"); - Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() - : State.get(StepVPV, VPLane(0)); - Value *Escape = nullptr; - if (EndValue->getType()->isIntegerTy()) - Escape = B.CreateSub(EndValue, Step); - else if (EndValue->getType()->isPointerTy()) - Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); - else { - assert(EndValue->getType()->isFloatingPointTy() && - "Unexpected induction type"); - Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == - Instruction::FAdd - ? Instruction::FSub - : Instruction::FAdd, - EndValue, Step); - } - Escape->setName("ind.escape"); - MissingVals[UI] = Escape; - } - } - - assert((MissingVals.empty() || - all_of(MissingVals, - [MiddleBlock, this](const std::pair &P) { - return all_of( - predecessors(cast(P.first)->getParent()), - [MiddleBlock, this](BasicBlock *Pred) { - return Pred == MiddleBlock || - Pred == OrigLoop->getLoopLatch(); - }); - })) && - "Expected escaping values from latch/middle.block only"); - - for (auto &I : MissingVals) { - PHINode *PHI = cast(I.first); - // One corner case we have to handle is two IVs "chasing" each-other, - // that is %IV2 = phi [...], [ %IV1, %latch ] - // In this case, if IV1 has an external use, we need to avoid adding both - // "last value of IV1" and "penultimate value of IV2". So, verify that we - // don't already have an incoming value for the middle block. - if (PHI->getBasicBlockIndex(MiddleBlock) == -1) - PHI->addIncoming(I.second, MiddleBlock); - } -} - namespace { struct CSEDenseMapInfo { @@ -2999,24 +2899,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - if (Cost->requiresScalarEpilogue(VF.isVector())) { - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - } else { - // TODO: Check in VPlan to see if IV users need fixing instead of checking - // the cost model. - - // If we inserted an edge from the middle block to the unique exit block, - // update uses outside the loop (phis) to account for the newly inserted - // edge. 
- - // Fix-up external users of the induction variables. - for (const auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State); - } - // Don't apply optimizations below when no vector region remains, as they all // require a vector loop at the moment. if (!State.Plan->getVectorLoopRegion()) @@ -9049,11 +8931,9 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute /// the end value of the induction. -static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, - VPBuilder &VectorPHBuilder, - VPBuilder &ScalarPHBuilder, - VPTypeAnalysis &TypeInfo, - VPValue *VectorTC) { +static VPInstruction *addResumePhiRecipeForInduction( + VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, + VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { auto *WideIntOrFp = dyn_cast(WideIV); // Truncated wide inductions resume from the last lane of their vector value // in the last vector iteration which is handled elsewhere. @@ -9087,8 +8967,10 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, /// Create resume phis in the scalar preheader for first-order recurrences, /// reductions and inductions, and update the VPIRInstructions wrapping the -/// original phis in the scalar header. -static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { +/// original phis in the scalar header. End values for inductions are added to +/// \p IVEndValues. +static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, + DenseMap &IVEndValues) { VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); auto *ScalarPH = Plan.getScalarPreheader(); auto *MiddleVPBB = cast(ScalarPH->getSinglePredecessor()); @@ -9105,11 +8987,16 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { if (!ScalarPhiI) break; + // TODO: Extract final value from induction recipe initially, optimize to + // pre-computed end value together in optimizeInductionExitUsers. auto *VectorPhiR = cast(Builder.getRecipe(ScalarPhiI)); if (auto *WideIVR = dyn_cast(VectorPhiR)) { - if (VPValue *ResumePhi = addResumePhiRecipeForInduction( + if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, &Plan.getVectorTripCount())) { + assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi && + "Expected a ResumePhi"); + IVEndValues[WideIVR] = ResumePhi->getOperand(0); ScalarPhiIRI->addOperand(ResumePhi); continue; } @@ -9140,65 +9027,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) { } } -/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is -/// either an untruncated wide induction, or if it increments a wide induction -/// by its step. -static bool isOptimizableIVOrUse(VPValue *VPV) { - VPRecipeBase *Def = VPV->getDefiningRecipe(); - if (!Def) - return false; - auto *WideIV = dyn_cast(Def); - if (WideIV) { - // VPV itself is a wide induction, separately compute the end value for exit - // users if it is not a truncated IV. - return isa(WideIV) || - !cast(WideIV)->getTruncInst(); - } - - // Check if VPV is an optimizable induction increment. 
- if (Def->getNumOperands() != 2) - return false; - WideIV = dyn_cast(Def->getOperand(0)); - if (!WideIV) - WideIV = dyn_cast(Def->getOperand(1)); - if (!WideIV) - return false; - - using namespace VPlanPatternMatch; - auto &ID = WideIV->getInductionDescriptor(); - - // Check if VPV increments the induction by the induction step. - VPValue *IVStep = WideIV->getStepValue(); - switch (ID.getInductionOpcode()) { - case Instruction::Add: - return match(VPV, m_c_Binary(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::FAdd: - return match(VPV, m_c_Binary(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::FSub: - return match(VPV, m_Binary(m_Specific(WideIV), - m_Specific(IVStep))); - case Instruction::Sub: { - // IVStep will be the negated step of the subtraction. Check if Step == -1 * - // IVStep. - VPValue *Step; - if (!match(VPV, m_Binary(m_VPValue(), m_VPValue(Step))) || - !Step->isLiveIn() || !IVStep->isLiveIn()) - return false; - auto *StepCI = dyn_cast(Step->getLiveInIRValue()); - auto *IVStepCI = dyn_cast(IVStep->getLiveInIRValue()); - return StepCI && IVStepCI && - StepCI->getValue() == (-1 * IVStepCI->getValue()); - } - default: - return ID.getKind() == InductionDescriptor::IK_PtrInduction && - match(VPV, m_GetElementPtr(m_Specific(WideIV), - m_Specific(WideIV->getStepValue()))); - } - llvm_unreachable("should have been covered by switch above"); -} - // Collect VPIRInstructions for phis in the exit blocks that are modeled // in VPlan and add the exiting VPValue as operand. Some exiting values are not // modeled explicitly yet and won't be included. Those are un-truncated @@ -9228,12 +9056,6 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, } Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); - // Exit values for inductions are computed and updated outside of VPlan - // and independent of induction recipes. - // TODO: Compute induction exit values in VPlan. - if (isOptimizableIVOrUse(V) && - ExitVPBB->getSinglePredecessor() == MiddleVPBB) - continue; ExitUsersToFix.insert(ExitIRI); ExitIRI->addOperand(V); } @@ -9253,6 +9075,7 @@ addUsersInExitBlocks(VPlan &Plan, auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); // Introduce extract for exiting values and update the VPIRInstructions // modeling the corresponding LCSSA phis. 
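The extracts referred to in the comment above are last-lane extracts (VPInstruction::ExtractFromEnd with an offset of 1 at the time of this patch): they take the final lane of the value produced by the last vector iteration, and optimizeInductionExitUsers later pattern-matches exactly that extract. As a reading aid only, here is a standalone C++ sketch of that semantics; the names are hypothetical and this is not VPlan API code.

#include <cassert>
#include <vector>

// Scalar model of ExtractFromEnd(V, Offset): read the value Offset lanes from
// the end of the last vector iteration's result. Offset 1 is the last lane,
// which is what the exit-block LCSSA phis consume.
int extractFromEnd(const std::vector<int> &LastVectorValue, unsigned Offset) {
  assert(Offset >= 1 && Offset <= LastVectorValue.size() && "offset out of range");
  return LastVectorValue[LastVectorValue.size() - Offset];
}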
@@ -9574,7 +9397,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPlanTransforms::handleUncountableEarlyExit( *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); } - addScalarResumePhis(RecipeBuilder, *Plan); + DenseMap IVEndValues; + addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); SetVector ExitUsersToFix = collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); @@ -9657,6 +9481,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, WithoutRuntimeCheck); } + VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; @@ -9708,7 +9533,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { auto *HeaderR = cast(&R); RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR); } - addScalarResumePhis(RecipeBuilder, *Plan); + DenseMap IVEndValues; + // TODO: IVEndValues are not used yet in the native path, to optimize exit + // values. + addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 784cee6ed4b0..db45ad8aadbb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1422,6 +1422,12 @@ class VPIRInstruction : public VPRecipeBase { "Op must be an operand of the recipe"); return true; } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// VPWidenRecipe is a recipe for producing a widened instruction using the diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a04ad1b37053..9febd612c644 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -667,6 +667,131 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { } } +/// Check if \p VPV is an untruncated wide induction, either before or after the +/// increment. If so return the header IV (before the increment), otherwise +/// return null. +static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) { + auto *WideIV = dyn_cast(VPV); + if (WideIV) { + // VPV itself is a wide induction, separately compute the end value for exit + // users if it is not a truncated IV. + auto *IntOrFpIV = dyn_cast(WideIV); + return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV; + } + + // Check if VPV is an optimizable induction increment. + VPRecipeBase *Def = VPV->getDefiningRecipe(); + if (!Def || Def->getNumOperands() != 2) + return nullptr; + WideIV = dyn_cast(Def->getOperand(0)); + if (!WideIV) + WideIV = dyn_cast(Def->getOperand(1)); + if (!WideIV) + return nullptr; + + auto IsWideIVInc = [&]() { + using namespace VPlanPatternMatch; + auto &ID = WideIV->getInductionDescriptor(); + + // Check if VPV increments the induction by the induction step. 
+ VPValue *IVStep = WideIV->getStepValue(); + switch (ID.getInductionOpcode()) { + case Instruction::Add: + return match(VPV, m_c_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FAdd: + return match(VPV, m_c_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::FSub: + return match(VPV, m_Binary(m_Specific(WideIV), + m_Specific(IVStep))); + case Instruction::Sub: { + // IVStep will be the negated step of the subtraction. Check if Step == -1 + // * IVStep. + VPValue *Step; + if (!match(VPV, + m_Binary(m_VPValue(), m_VPValue(Step))) || + !Step->isLiveIn() || !IVStep->isLiveIn()) + return false; + auto *StepCI = dyn_cast(Step->getLiveInIRValue()); + auto *IVStepCI = dyn_cast(IVStep->getLiveInIRValue()); + return StepCI && IVStepCI && + StepCI->getValue() == (-1 * IVStepCI->getValue()); + } + default: + return ID.getKind() == InductionDescriptor::IK_PtrInduction && + match(VPV, m_GetElementPtr(m_Specific(WideIV), + m_Specific(WideIV->getStepValue()))); + } + llvm_unreachable("should have been covered by switch above"); + }; + return IsWideIVInc() ? WideIV : nullptr; +} + +void VPlanTransforms::optimizeInductionExitUsers( + VPlan &Plan, DenseMap &EndValues) { + using namespace VPlanPatternMatch; + SmallVector ExitVPBBs(Plan.getExitBlocks()); + if (ExitVPBBs.size() != 1) + return; + + VPIRBasicBlock *ExitVPBB = ExitVPBBs[0]; + VPBlockBase *PredVPBB = ExitVPBB->getSinglePredecessor(); + if (!PredVPBB) + return; + assert(PredVPBB == Plan.getMiddleBlock() && + "predecessor must be the middle block"); + + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + VPBuilder B(Plan.getMiddleBlock()->getTerminator()); + for (VPRecipeBase &R : *ExitVPBB) { + auto *ExitIRI = cast(&R); + if (!isa(ExitIRI->getInstruction())) + break; + + VPValue *Incoming; + if (!match(ExitIRI->getOperand(0), + m_VPInstruction( + m_VPValue(Incoming), m_SpecificInt(1)))) + continue; + + auto *WideIV = getOptimizableIVOf(Incoming); + if (!WideIV) + continue; + VPValue *EndValue = EndValues.lookup(WideIV); + assert(EndValue && "end value must have been pre-computed"); + + if (Incoming != WideIV) { + ExitIRI->setOperand(0, EndValue); + continue; + } + + VPValue *Escape = nullptr; + VPValue *Step = WideIV->getStepValue(); + Type *ScalarTy = TypeInfo.inferScalarType(WideIV); + if (ScalarTy->isIntegerTy()) { + Escape = + B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape"); + } else if (ScalarTy->isPointerTy()) { + auto *Zero = Plan.getOrAddLiveIn( + ConstantInt::get(Step->getLiveInIRValue()->getType(), 0)); + Escape = B.createPtrAdd(EndValue, + B.createNaryOp(Instruction::Sub, {Zero, Step}), + {}, "ind.escape"); + } else if (ScalarTy->isFloatingPointTy()) { + const auto &ID = WideIV->getInductionDescriptor(); + Escape = B.createNaryOp( + ID.getInductionBinOp()->getOpcode() == Instruction::FAdd + ? Instruction::FSub + : Instruction::FAdd, + {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()}); + } else { + llvm_unreachable("all possible induction types must be handled"); + } + ExitIRI->setOperand(0, Escape); + } +} + /// Remove redundant EpxandSCEVRecipes in \p Plan's entry block by replacing /// them with already existing recipes expanding the same SCEV expression. 
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { @@ -1318,6 +1443,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { removeRedundantInductionCasts(Plan); simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); + removeDeadRecipes(Plan); legalizeAndOptimizeInductions(Plan); removeRedundantExpandSCEVRecipes(Plan); simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index fddde8689116..a751b8b5e8dc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -137,6 +137,13 @@ struct VPlanTransforms { /// Lower abstract recipes to concrete ones, that can be codegen'd. static void convertToConcreteRecipes(VPlan &Plan); + + /// If there's a single exit block, optimize its phi recipes that use exiting + /// IV values by feeding them precomputed end values instead, possibly taken + /// one step backwards. + static void + optimizeInductionExitUsers(VPlan &Plan, + DenseMap &EndValues); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 777944264f45..1395202c10d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -45,8 +45,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { assert(Def && "Must have definition for value defined inside vector region"); if (auto *Rep = dyn_cast(Def)) return Rep->isUniform(); - if (auto *GEP = dyn_cast(Def)) - return all_of(GEP->operands(), isUniformAfterVectorization); + if (isa(Def)) + return all_of(Def->operands(), isUniformAfterVectorization); if (auto *VPI = dyn_cast(Def)) return VPI->isSingleScalar() || VPI->isVectorToScalar(); // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. 
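Before the test updates that follow, it may help to restate the arithmetic the new optimizeInductionExitUsers transform relies on. This is an illustrative standalone sketch, not code from the patch; Start, Step and VectorTripCount are assumed stand-ins for the induction descriptor's start and step values and the vector trip count.

#include <cstdint>

// Value observed by a user of the post-increment IV when the loop exits via
// the middle block: the same end value the scalar remainder loop resumes from.
int64_t ivEndValue(int64_t Start, int64_t Step, int64_t VectorTripCount) {
  return Start + Step * VectorTripCount;
}

// Value observed by a user of the header phi itself: the penultimate value,
// i.e. the end value stepped back once. This is the "ind.escape" value the
// transform materializes in the middle block.
int64_t ivEscapeValue(int64_t Start, int64_t Step, int64_t VectorTripCount) {
  return ivEndValue(Start, Step, VectorTripCount) - Step;
}

For pointer inductions the step-back is emitted as a ptradd of the negated step, and for floating-point inductions the opposite binary operator is used (FSub for an FAdd induction, FAdd for an FSub one), carrying the induction's fast-math flags; those are the three cases handled at the end of optimizeInductionExitUsers above.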
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 951d833fa941..f630f4f21e06 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -230,7 +230,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK: LV: Loop does not require scalar epilogue ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -480,7 +479,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK: LV: Loop does not require scalar epilogue ; entry: %cmp7 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index 7b29d0ef7cbb..6c97ab362fc8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -29,21 +29,7 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = and i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP13]], 1 -; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP14]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP15]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = and i64 [[TMP16]], 1 -; CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP17]], 1 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -7 diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 3e61546da2ce..eb1dc9debc6b 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -860,6 +860,126 @@ exit: ret float %add } +define float @fp_postinc_use_fadd_ops_swapped(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) { +; VEC-LABEL: define float @fp_postinc_use_fadd_ops_swapped( +; VEC-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) { +; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VEC: [[VECTOR_PH]]: +; VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; 
VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; VEC-NEXT: [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; VEC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 +; VEC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer +; VEC-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0 +; VEC-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT1]], <2 x float> poison, <2 x i32> zeroinitializer +; VEC-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> , [[DOTSPLAT2]] +; VEC-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP2]] +; VEC-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00 +; VEC-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 +; VEC-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT3]], <2 x float> poison, <2 x i32> zeroinitializer +; VEC-NEXT: br label %[[VECTOR_BODY:.*]] +; VEC: [[VECTOR_BODY]]: +; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VEC-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 +; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4 +; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]] +; VEC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; VEC: [[MIDDLE_BLOCK]]: +; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; VEC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VEC: [[SCALAR_PH]]: +; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VEC-NEXT: [[BC_RESUME_VAL5:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ] +; VEC-NEXT: br label %[[LOOP:.*]] +; VEC: [[LOOP]]: +; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; VEC-NEXT: store float [[FP_IV]], ptr [[GEP_A]], align 4 +; VEC-NEXT: [[ADD]] = fadd fast float [[FPINC]], [[FP_IV]] +; VEC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; VEC-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; VEC: [[EXIT]]: +; VEC-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] +; VEC-NEXT: ret float [[ADD_LCSSA]] +; +; INTERLEAVE-LABEL: define float @fp_postinc_use_fadd_ops_swapped( +; INTERLEAVE-SAME: float [[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; INTERLEAVE-NEXT: 
[[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; INTERLEAVE-NEXT: [[TMP0:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; INTERLEAVE-NEXT: [[TMP1:%.*]] = fadd fast float [[INIT]], [[TMP0]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[DOTCAST1:%.*]] = sitofp i64 [[INDEX]] to float +; INTERLEAVE-NEXT: [[TMP4:%.*]] = fmul fast float [[FPINC]], [[DOTCAST1]] +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[INIT]], [[TMP4]] +; INTERLEAVE-NEXT: [[TMP5:%.*]] = fmul fast float 0.000000e+00, [[FPINC]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP5]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = fmul fast float 1.000000e+00, [[FPINC]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP7]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP3]] +; INTERLEAVE-NEXT: store float [[TMP6]], ptr [[TMP9]], align 4 +; INTERLEAVE-NEXT: store float [[TMP8]], ptr [[TMP10]], align 4 +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL2:%.*]] = phi float [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[INIT]], %[[ENTRY]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[FP_IV:%.*]] = phi float [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; INTERLEAVE-NEXT: store float [[FP_IV]], ptr [[GEP_A]], align 4 +; INTERLEAVE-NEXT: [[ADD]] = fadd fast float [[FPINC]], [[FP_IV]] +; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; INTERLEAVE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}} +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %fp.iv = phi float [ %init, %entry ], [ %add, %loop ] + %gep.A = getelementptr inbounds float, ptr %A, i64 %iv + store float %fp.iv, ptr %gep.A, align 4 + %add = fadd fast float %fpinc, %fp.iv + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + ret float %add +} + define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N, float %fpinc) { ; VEC-LABEL: define float @fp_postinc_use_fsub( ; VEC-SAME: float 
[[INIT:%.*]], ptr noalias nocapture [[A:%.*]], i64 [[N:%.*]], float [[FPINC:%.*]]) { diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 085438aa80f2..6e542bd873b8 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -532,6 +532,7 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] @@ -545,11 +546,13 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] @@ -570,7 +573,7 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ 67, [[MIDDLE_SPLIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll index d7d7d5d9c5da..50c1f74d2aac 100644 --- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll @@ -84,8 +84,6 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -94,8 +92,8 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x 
i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]] ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]] ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
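The iv_outside_user.ll checks earlier in the patch show what the pre-computed end value looks like for a floating-point induction: [[TMP0]] = fmul fast float [[FPINC]], [[DOTCAST]] followed by [[TMP1]] = fadd fast float [[INIT]], [[TMP0]], with [[TMP1]] feeding both the scalar resume phi and the exit LCSSA phi. A minimal standalone C++ restatement of that computation (illustrative only; the names are assumptions, not code from the patch):

#include <cstdint>

// End value of the float induction after the vector loop has run N_VEC
// iterations, as in the fp_postinc_use_fadd_ops_swapped checks:
// init + fpinc * sitofp(n.vec). The post-increment user in the exit block
// receives exactly this value from the middle block.
float fpIVEndValue(float Init, float FpInc, int64_t VectorTripCount) {
  return Init + FpInc * static_cast<float>(VectorTripCount);
}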