From 156661a47bfaeb5dcd0588683c34ec91eeb8402a Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 21 Jan 2025 19:43:15 -0800 Subject: [PATCH 01/12] [IA][RISCV] Support VP intrinsics in InterleavedAccessPass Teach InterleavedAccessPass to recognize the following patterns: - vp.store an interleaved scalable vector - Deinterleaving a scalable vector loaded from vp.load Upon recognizing these patterns, IA will collect the interleaved / deinterleaved operands and delegate them over to their respective newly-added TLI hooks. For RISC-V, these patterns are lowered into segmented loads/stores (except when we're interleaving constant splats, in which case a unit-stride store will be generated) Right now we only recognize power-of-two (de)interleave cases, in which (de)interleave4/8 are synthesized from a tree of (de)interleave2. Co-authored-by: Nikolay Panchenko --- llvm/include/llvm/CodeGen/TargetLowering.h | 29 + llvm/lib/CodeGen/InterleavedAccessPass.cpp | 110 +++- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 257 ++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 + .../scalable-vectors-interleaved-access.ll | 619 ++++++++++++++++++ 5 files changed, 1005 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59743dbe4d2ea..50fa3a39c0a0c 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -94,6 +94,7 @@ class TargetRegisterClass; class TargetRegisterInfo; class TargetTransformInfo; class Value; +class VPIntrinsic; namespace Sched { @@ -3152,6 +3153,34 @@ class TargetLoweringBase { return false; } + /// Lower an interleaved load to target specific intrinsics. Return + /// true on success. + /// + /// \p Load is a vp.load instruction. + /// \p Mask is a mask value + /// \p DeinterleaveIntrin is vector.deinterleave intrinsic + /// \p DeinterleaveRes is a list of deinterleaved results. + virtual bool + lowerInterleavedScalableLoad(VPIntrinsic *Load, Value *Mask, + IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeinterleaveRes) const { + return false; + } + + /// Lower an interleaved store to target specific intrinsics. Return + /// true on success. + /// + /// \p Store is the vp.store instruction. + /// \p Mask is a mask value + /// \p InterleaveIntrin is vector.interleave intrinsic + /// \p InterleaveOps is a list of values being interleaved. + virtual bool + lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, + IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOps) const { + return false; + } + + /// Lower a deinterleave intrinsic to a target specific load intrinsic. /// Return true on success.
Currently only supports /// llvm.vector.deinterleave2 diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 3f6a69ecb7d72..9b15d0351ebe1 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,11 +630,34 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } +/// Check the interleaved mask +/// +/// - if a value within the optional is non-nullptr, the value corresponds to +/// deinterleaved mask +/// - if a value within the option is nullptr, the value corresponds to all-true +/// mask +/// - return nullopt if mask cannot be deinterleaved +static std::optional getMask(Value *WideMask, unsigned Factor) { + using namespace llvm::PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && + std::equal(Operands.begin(), Operands.end(), Operands.begin())) + return Operands.front(); + } + } + if (match(WideMask, m_AllOnes())) + return nullptr; + return std::nullopt; +} + bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { - LoadInst *LI = dyn_cast(DI->getOperand(0)); - - if (!LI || !LI->hasOneUse() || !LI->isSimple()) + Value *LoadedVal = DI->getOperand(0); + if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; SmallVector DeinterleaveValues; @@ -643,16 +666,42 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( DeinterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI - << " with factor = " << DeinterleaveValues.size() << "\n"); + const unsigned Factor = DeinterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) - return false; + if (auto *VPLoad = dyn_cast(LoadedVal)) { + if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) + return false; + // Check mask operand. Handle both all-true and interleaved mask. + Value *WideMask = VPLoad->getOperand(1); + std::optional Mask = getMask(WideMask, Factor); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " + << *DI << " and factor = " << Factor << "\n"); + + // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, + DeinterleaveValues)) + return false; + + } else { + auto *LI = cast(LoadedVal); + if (!LI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) + return false; + } DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end()); // We now have a target-specific load, so delete the old one. 
- DeadInsts.insert(LI); + DeadInsts.insert(cast(LoadedVal)); return true; } @@ -660,10 +709,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( IntrinsicInst *II, SmallSetVector &DeadInsts) { if (!II->hasOneUse()) return false; - - StoreInst *SI = dyn_cast(*(II->users().begin())); - - if (!SI || !SI->isSimple()) + Value *StoredBy = II->user_back(); + if (!isa(StoredBy)) return false; SmallVector InterleaveValues; @@ -671,15 +718,40 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II - << " with factor = " << InterleaveValues.size() << "\n"); + const unsigned Factor = InterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; + if (auto *VPStore = dyn_cast(StoredBy)) { + if (VPStore->getIntrinsicID() != Intrinsic::vp_store) + return false; + + Value *WideMask = VPStore->getOperand(2); + std::optional Mask = getMask(WideMask, Factor); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " + << *II << " and factor = " << Factor << "\n"); + + // Since lowerInterleavedStore expects Shuffle and StoreInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, + InterleaveValues)) + return false; + } else { + auto *SI = cast(StoredBy); + if (!SI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) + return false; + } // We now have a target-specific store, so delete the old one. - DeadInsts.insert(SI); + DeadInsts.insert(cast(StoredBy)); DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end()); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 295fd315c56da..b5155106a506b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22529,6 +22529,263 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } +/// Lower an interleaved vp.load into a vlsegN intrinsic. +/// +/// E.g. 
Lower an interleaved vp.load (Factor = 2): +/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// %dl = tail call { , } +/// @llvm.vector.deinterleave2.nxv64i8( +/// %l) +/// %r0 = extractvalue { , } %dl, 0 +/// %r1 = extractvalue { , } %dl, 1 +/// +/// Into: +/// %rvl = udiv %wide.rvl, 2 +/// %sl = call { , } +/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, +/// undef, +/// ptr %ptr, +/// %mask, +/// i64 %rvl, +/// i64 1) +/// %r0 = extractvalue { , } %sl, 0 +/// %r1 = extractvalue { , } %sl, 1 +/// +/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be +/// removed by the caller +bool RISCVTargetLowering::lowerInterleavedScalableLoad( + VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeInterleaveResults) const { + assert(Load->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + + const unsigned Factor = DeInterleaveResults.size(); + + auto *WideVTy = cast(Load->getType()); + unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); + assert(WideNumElements % Factor == 0 && + "ElementCount of a wide load must be divisible by interleave factor"); + auto *VTy = + VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, + WideVTy->isScalableTy()); + // FIXME: Should pass alignment attribute from pointer, but vectorizer needs + // to emit it first. + auto &DL = Load->getModule()->getDataLayout(); + Align Alignment = Align(DL.getTypeStoreSize(WideVTy->getScalarType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Load); + Value *WideEVL = Load->getArgOperand(2); + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExtOrTrunc( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); + + Value *PoisonVal = PoisonValue::get(VecTupTy); + SmallVector Operands; + Operands.append({PoisonVal, Load->getArgOperand(0)}); + + if (!Mask) + Mask = ConstantVector::getSplat(VTy->getElementCount(), + ConstantInt::getTrue(Load->getContext())); + + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Operands.push_back(Mask); + + Operands.push_back(EVL); + + // Tail-policy + Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC)); + + Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + + SmallVector AggrTypes{Factor, VTy}; + Value *Return = + PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + 
Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } + + for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) { + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. + Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIO->replaceAllUsesWith(NewEV); + } + + return true; +} + +/// If we're interleaving 2 constant splats, for instance ` +/// ` and ` `, we can create a +/// larger splat +/// ` ` first before casting it +/// into +/// ``. This will resuling a simple unit stride store rather +/// than a segment store, which is more expensive in this case. +static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, + VectorType *VTy, + const TargetLowering *TLI, + Instruction *VPStore) { + // We only handle Factor = 2 for now. + assert(InterleaveIntrin->arg_size() == 2); + auto *SplatVal0 = dyn_cast_or_null( + getSplatValue(InterleaveIntrin->getArgOperand(0))); + auto *SplatVal1 = dyn_cast_or_null( + getSplatValue(InterleaveIntrin->getArgOperand(1))); + if (!SplatVal0 || !SplatVal1) + return nullptr; + + auto &Ctx = VPStore->getContext(); + auto &DL = VPStore->getModule()->getDataLayout(); + + auto *NewVTy = VectorType::getExtendedElementVectorType(VTy); + if (!TLI->isTypeLegal(TLI->getValueType(DL, NewVTy))) + return nullptr; + + // InterleavedAccessPass will remove VPStore after this but we still want to + // preserve it, hence clone another one here. + auto *ClonedVPStore = VPStore->clone(); + ClonedVPStore->insertBefore(VPStore); + IRBuilder<> Builder(ClonedVPStore); + + Type *ETy = VTy->getElementType(); + unsigned Width = ETy->getIntegerBitWidth(); + + APInt NewSplatVal(Width * 2, SplatVal1->getZExtValue()); + NewSplatVal <<= Width; + NewSplatVal |= SplatVal0->getZExtValue(); + auto *NewSplat = ConstantVector::getSplat(NewVTy->getElementCount(), + ConstantInt::get(Ctx, NewSplatVal)); + return Builder.CreateBitCast(NewSplat, + VectorType::getDoubleElementsVectorType(VTy)); +} + +/// Lower an interleaved vp.store into a vssegN intrinsic. +/// +/// E.g. Lower an interleaved vp.store (Factor = 2): +/// +/// %is = tail call +/// @llvm.vector.interleave2.nxv64i8( +/// %load0, +/// %load1 +/// %wide.rvl = shl nuw nsw i32 %rvl, 1 +/// tail call void @llvm.vp.store.nxv64i8.p0( +/// %is, ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// +/// Into: +/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( +/// %load1, +/// %load2, ptr %ptr, +/// %mask, +/// i64 %rvl) +bool RISCVTargetLowering::lowerInterleavedScalableStore( + VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOperands) const { + assert(Store->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + + const unsigned Factor = InterleaveOperands.size(); + + VectorType *VTy = cast(InterleaveOperands[0]->getType()); + + // FIXME: Should pass alignment attribute from pointer, but vectorizer needs + // to emit it first. 
+ const DataLayout &DL = Store->getDataLayout(); + Align Alignment = Align(DL.getTypeStoreSize(VTy->getScalarType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) + return false; + + if (Factor == 2) + if (Value *BC = + foldInterleaved2OfConstSplats(InterleaveIntrin, VTy, this, Store)) { + InterleaveIntrin->replaceAllUsesWith(BC); + return true; + } + + IRBuilder<> Builder(Store); + Value *WideEVL = Store->getArgOperand(3); + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExtOrTrunc( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); + + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateCall( + VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); + + SmallVector Operands; + Operands.push_back(StoredVal); + Operands.push_back(Store->getArgOperand(1)); + + if (!Mask) + Mask = ConstantVector::getSplat(VTy->getElementCount(), + ConstantInt::getTrue(Store->getContext())); + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Operands.push_back(Mask); + Operands.push_back(EVL); + Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + + Builder.CreateCall(VssegNFunc, Operands); + return true; +} + MachineInstr * RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 77605a3076a80..f779f4fdc26c1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,6 +910,15 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; + bool lowerInterleavedScalableLoad( + VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeinterleaveRes) const override; + + bool + lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, + IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOps) const override; + bool supportKCFIBundles() const override { return true; } SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll new file mode 100644 index 0000000000000..ac254792e167a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll @@ -0,0 +1,619 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s + +define {, } @load_factor2_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @load_factor4_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg8e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv16i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) + %d1.0 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) + %d2.0 = extractvalue { , } %d2, 0 + %d2.1 = extractvalue { , } %d2, 1 + + %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) + %t0 = extractvalue { , } %d3, 0 + %t4 = extractvalue { , } %d3, 1 + %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d4, 0 + %t6 = extractvalue { , } %d4, 1 + %d5 = call { 
, } @llvm.vector.deinterleave2.nxv4i32( %d2.0) + %t1 = extractvalue { , } %d5, 0 + %t5 = extractvalue { , } %d5, 1 + %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) + %t3 = extractvalue { , } %d6, 0 + %t7 = extractvalue { , } %d6, 1 + + %res0 = insertvalue { , , , , , , , } undef, %t0, 0 + %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 + %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 + ret { , , , , , , , } %res7 +} + +define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg2e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) + call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +; Expecting unit-stride store here rather than segmented store. +define void @store_factor2_const_splat(ptr %dst) { +; RV32-LABEL: store_factor2_const_splat: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: li a1, 777 +; RV32-NEXT: li a2, 666 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: li a1, 87 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor2_const_splat: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 777 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: addi a1, a1, 666 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: li a1, 87 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: ret + %interleave2 = call @llvm.vector.interleave2.nxv16i32( + shufflevector ( insertelement ( poison, i32 666, i64 0), poison, zeroinitializer), + shufflevector ( insertelement ( poison, i32 777, i64 0), poison, zeroinitializer) + ) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, + shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + i32 87) + ret void +} + +define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( 
%interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vmv1r.v v12, v8 +; RV32-NEXT: vmv1r.v v13, v9 +; RV32-NEXT: vmv1r.v v14, v8 +; RV32-NEXT: vmv1r.v v15, v9 +; RV32-NEXT: vsseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vmv1r.v v12, v8 +; RV64-NEXT: vmv1r.v v13, v9 +; RV64-NEXT: vmv1r.v v14, v8 +; RV64-NEXT: vmv1r.v v15, v9 +; RV64-NEXT: vsseg8e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) + %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask0 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) + %d0 = call { , } 
@llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v9, v8 +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v8, v0 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v9, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v11, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v11, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v11, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: 
vslideup.vx v10, v9, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v13, v10, a2 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vnsrl.wi v12, v10, 0 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v8, v0 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v9, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v11, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v11, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmsne.vi v0, v11, 0 +; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v9, a3 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v13, v10, a2 +; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: vnsrl.wi v12, v10, 0 +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %r0 = extractelement %wide.masked.load, i32 0 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret i32 %r0 +} + +define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) + %interleaved.vec0 = call 
@llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + ret void +} + +; Negative tests + +; We should not transform this function because the deinterleave tree is not in a desired form. +define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %rvl) { +; RV32-LABEL: incorrect_extract_value_index: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wi v12, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v12, a0 +; RV32-NEXT: vnsrl.wi v8, v12, 0 +; RV32-NEXT: vmv.v.v v10, v9 +; RV32-NEXT: vmv.v.v v11, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: incorrect_extract_value_index: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wi v12, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v12, a0 +; RV64-NEXT: vnsrl.wi v8, v12, 0 +; RV64-NEXT: vmv.v.v v10, v9 +; RV64-NEXT: vmv.v.v v11, v9 +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 0 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 1 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +; We should not transform this function because the expression is not a balanced tree. 
+define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_balanced_load_tree: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v8, v12, a0 +; RV32-NEXT: vnsrl.wi v16, v12, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wi v10, v16, 0 +; RV32-NEXT: vnsrl.wx v11, v16, a0 +; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v11, a0 +; RV32-NEXT: vnsrl.wi v11, v11, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_load_tree: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wx v8, v12, a0 +; RV64-NEXT: vnsrl.wi v16, v12, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wi v10, v16, 0 +; RV64-NEXT: vnsrl.wx v11, v16, a0 +; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v12, v11, a0 +; RV64-NEXT: vnsrl.wi v11, v11, 0 +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %t0 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t1 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_balanced_store_tree: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV32-NEXT: vwaddu.vv v12, v8, v8 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vwmaccu.vx v12, a2, v8 +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vx v8, v12, a3 +; RV32-NEXT: add a4, a3, a3 +; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v12, v8, a3 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vwaddu.vv v14, v12, v9 +; RV32-NEXT: vwmaccu.vx v14, a2, v9 +; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV32-NEXT: vwaddu.vv v16, v14, v10 +; RV32-NEXT: vwmaccu.vx v16, a2, v10 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vse32.v v16, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_store_tree: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV64-NEXT: vwaddu.vv v12, v8, v8 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: vwmaccu.vx v12, a2, v8 +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vx v8, v12, a3 +; RV64-NEXT: add a4, a3, a3 +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v12, v8, a3 +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vwaddu.vv v14, v12, v9 +; RV64-NEXT: vwmaccu.vx v14, a2, v9 +; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV64-NEXT: vwaddu.vv v16, 
v14, v10 +; RV64-NEXT: vwmaccu.vx v16, a2, v10 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vse32.v v16, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} From c869bec122d63858c7196040d6e29e679761d459 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 13:34:29 -0800 Subject: [PATCH 02/12] fixup! Address review comments - Change what getMask returns - Change the name of TLI hooks - Limit the TLI hooks to scalable vectors only for now --- llvm/include/llvm/CodeGen/TargetLowering.h | 12 ++---- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 19 +++++---- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 46 ++++++++++----------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 11 +++-- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 50fa3a39c0a0c..9411dc66b2931 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3158,12 +3158,10 @@ class TargetLoweringBase { /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value - /// \p DeinterleaveIntrin is vector.deinterleave intrinsic /// \p DeinterleaveRes is a list of deinterleaved results. virtual bool - lowerInterleavedScalableLoad(VPIntrinsic *Load, Value *Mask, - IntrinsicInst *DeinterleaveIntrin, - ArrayRef DeinterleaveRes) const { + lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } @@ -3172,12 +3170,10 @@ class TargetLoweringBase { /// /// \p Store is the vp.store instruction. /// \p Mask is a mask value - /// \p InterleaveIntrin is vector.interleave intrinsic /// \p InterleaveOps is a list of values being interleaved. 
virtual bool - lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, - IntrinsicInst *InterleaveIntrin, - ArrayRef InterleaveOps) const { + lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9b15d0351ebe1..32ca90938d7ab 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -637,7 +637,7 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, /// - if a value within the option is nullptr, the value corresponds to all-true /// mask /// - return nullopt if mask cannot be deinterleaved -static std::optional getMask(Value *WideMask, unsigned Factor) { +static Value *getMask(Value *WideMask, unsigned Factor) { using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -650,8 +650,9 @@ static std::optional getMask(Value *WideMask, unsigned Factor) { } } if (match(WideMask, m_AllOnes())) - return nullptr; - return std::nullopt; + return WideMask; + + return nullptr; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -673,7 +674,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - std::optional Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor); if (!Mask) return false; @@ -682,8 +683,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, - DeinterleaveValues)) + if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, + DeinterleaveValues)) return false; } else { @@ -725,7 +726,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - std::optional Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor); if (!Mask) return false; @@ -734,8 +735,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // Since lowerInterleavedStore expects Shuffle and StoreInst, use special // TLI function to emit target-specific interleaved instruction. 
- if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, - InterleaveValues)) + if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, + InterleaveValues)) return false; } else { auto *SI = cast(StoredBy); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b5155106a506b..a93f05a85d3aa 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22555,15 +22555,20 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( /// /// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be /// removed by the caller -bool RISCVTargetLowering::lowerInterleavedScalableLoad( - VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, +bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, ArrayRef DeInterleaveResults) const { + assert(Mask && "Expect a valid mask"); assert(Load->getIntrinsicID() == Intrinsic::vp_load && "Unexpected intrinsic"); const unsigned Factor = DeInterleaveResults.size(); - auto *WideVTy = cast(Load->getType()); + auto *WideVTy = dyn_cast(Load->getType()); + // TODO: Support fixed vectors. + if (!WideVTy) + return false; + unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); assert(WideNumElements % Factor == 0 && "ElementCount of a wide load must be divisible by interleave factor"); @@ -22605,10 +22610,6 @@ bool RISCVTargetLowering::lowerInterleavedScalableLoad( SmallVector Operands; Operands.append({PoisonVal, Load->getArgOperand(0)}); - if (!Mask) - Mask = ConstantVector::getSplat(VTy->getElementCount(), - ConstantInt::getTrue(Load->getContext())); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); @@ -22653,16 +22654,12 @@ bool RISCVTargetLowering::lowerInterleavedScalableLoad( /// into /// ``. This will resuling a simple unit stride store rather /// than a segment store, which is more expensive in this case. -static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, +static Value *foldInterleaved2OfConstSplats(Value *Op0, Value *Op1, VectorType *VTy, const TargetLowering *TLI, Instruction *VPStore) { - // We only handle Factor = 2 for now. - assert(InterleaveIntrin->arg_size() == 2); - auto *SplatVal0 = dyn_cast_or_null( - getSplatValue(InterleaveIntrin->getArgOperand(0))); - auto *SplatVal1 = dyn_cast_or_null( - getSplatValue(InterleaveIntrin->getArgOperand(1))); + auto *SplatVal0 = dyn_cast_or_null(getSplatValue(Op0)); + auto *SplatVal1 = dyn_cast_or_null(getSplatValue(Op1)); if (!SplatVal0 || !SplatVal1) return nullptr; @@ -22711,15 +22708,19 @@ static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, /// %load2, ptr %ptr, /// %mask, /// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedScalableStore( - VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin, +bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOperands) const { + assert(Mask && "Expect a valid mask"); assert(Store->getIntrinsicID() == Intrinsic::vp_store && "Unexpected intrinsic"); const unsigned Factor = InterleaveOperands.size(); - VectorType *VTy = cast(InterleaveOperands[0]->getType()); + auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); + // TODO: Support fixed vectors. 
+ if (!VTy) + return false; // FIXME: Should pass alignment attribute from pointer, but vectorizer needs // to emit it first. @@ -22731,9 +22732,10 @@ bool RISCVTargetLowering::lowerInterleavedScalableStore( return false; if (Factor == 2) - if (Value *BC = - foldInterleaved2OfConstSplats(InterleaveIntrin, VTy, this, Store)) { - InterleaveIntrin->replaceAllUsesWith(BC); + if (Value *BC = foldInterleaved2OfConstSplats( + InterleaveOperands[0], InterleaveOperands[1], VTy, this, Store)) { + // Store is guranteed to be the only user of the interleaved intrinsic. + Store->getOperand(0)->replaceAllUsesWith(BC); return true; } @@ -22770,10 +22772,6 @@ bool RISCVTargetLowering::lowerInterleavedScalableStore( Operands.push_back(StoredVal); Operands.push_back(Store->getArgOperand(1)); - if (!Mask) - Mask = ConstantVector::getSplat(VTy->getElementCount(), - ConstantInt::getTrue(Store->getContext())); - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f779f4fdc26c1..e9dd8ff96fa37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,14 +910,13 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool lowerInterleavedScalableLoad( - VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + bool lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveRes) const override; - bool - lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, - IntrinsicInst *InterleaveIntrin, - ArrayRef InterleaveOps) const override; + bool lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; bool supportKCFIBundles() const override { return true; } From 3688672841431cb526b2a2e7f347c3d07437ce15 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 13:45:25 -0800 Subject: [PATCH 03/12] fixup! Rename test And add negative test case for fixed vectors. --- ...ess.ll => vp-vector-interleaved-access.ll} | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) rename llvm/test/CodeGen/RISCV/rvv/{scalable-vectors-interleaved-access.ll => vp-vector-interleaved-access.ll} (93%) diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll similarity index 93% rename from llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll rename to llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index ac254792e167a..648f6e324c59d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -615,5 +615,56 @@ define void @not_balanced_store_tree( %v0, ret void } +; We only support scalable vectors for now. 
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_scalable_vectors: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v11, v8, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v10, v11, a0 +; RV32-NEXT: vnsrl.wi v8, v11, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_scalable_vectors: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v11, v8, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v10, v11, a0 +; RV64-NEXT: vnsrl.wi v8, v11, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 32d8de5a1988a3ff80fc236a8c6a96b361fc62f3 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 16:43:25 -0800 Subject: [PATCH 04/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 5 +- .../RISCV/rvv/vp-vector-interleaved-access.ll | 86 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 32ca90938d7ab..beffc193695ac 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -644,9 +644,10 @@ static Value *getMask(Value *WideMask, unsigned Factor) { SmallVector DeadInsts; if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); + Value *FirstOp = Operands[0]; if (Operands.size() == Factor && - std::equal(Operands.begin(), Operands.end(), Operands.begin())) - return Operands.front(); + llvm::all_of(Operands, [=](Value *Op) { return Op == FirstOp; })) + return FirstOp; } } if (match(WideMask, m_AllOnes())) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 648f6e324c59d..2fb8661ca38f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -666,5 +666,91 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 } +define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_same_mask: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v8, 1, v0 +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vmerge.vim v9, v8, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v9, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v9, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v8, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v10, a0 +; RV32-NEXT: vnsrl.wi v8, v10, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_same_mask: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v9, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v8, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vmerge.vim v9, v8, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v9, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v9, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, 
ta, ma +; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v8, a3 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v10, a0 +; RV64-NEXT: vnsrl.wi v8, v10, 0 +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 6b7f8e9ab5259f29859ef8ed7abbba65bb3b8414 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 16:51:52 -0800 Subject: [PATCH 05/12] fixup! fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index beffc193695ac..cec7ed5840ed5 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -644,10 +644,8 @@ static Value *getMask(Value *WideMask, unsigned Factor) { SmallVector DeadInsts; if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); - Value *FirstOp = Operands[0]; - if (Operands.size() == Factor && - llvm::all_of(Operands, [=](Value *Op) { return Op == FirstOp; })) - return FirstOp; + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; } } if (match(WideMask, m_AllOnes())) From f7242dfd188e8291a1bc7435e87893004253ec7e Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 28 Jan 2025 14:26:07 -0800 Subject: [PATCH 06/12] fixup! Address some of the review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 9 ++----- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +++-- .../RISCV/rvv/vp-vector-interleaved-access.ll | 27 ++++++++----------- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index cec7ed5840ed5..92a2d8b36947b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,13 +630,8 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -/// Check the interleaved mask -/// -/// - if a value within the optional is non-nullptr, the value corresponds to -/// deinterleaved mask -/// - if a value within the option is nullptr, the value corresponds to all-true -/// mask -/// - return nullopt if mask cannot be deinterleaved +// Return nullptr if the value corresponds to a all-true mask. Otherwise, +// return the value that is corresponded to a deinterleaved mask. 
static Value *getMask(Value *WideMask, unsigned Factor) { using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a93f05a85d3aa..2f9381f6b9d82 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22607,8 +22607,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Factor); Value *PoisonVal = PoisonValue::get(VecTupTy); - SmallVector Operands; - Operands.append({PoisonVal, Load->getArgOperand(0)}); + SmallVector Operands{PoisonVal, Load->getArgOperand(0)}; Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], @@ -22618,8 +22617,8 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Operands.push_back(EVL); - // Tail-policy - Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC)); + Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | + RISCVII::MASK_AGNOSTIC)); Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 2fb8661ca38f1..1e2c611d260cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -16,7 +16,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret - %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 @@ -39,7 +39,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 1 @@ -71,7 +71,7 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 1 @@ -121,7 +121,7 @@ define void @store_factor2_v2( %v0, %v1, pt ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) - call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -156,13 +156,8 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret - %interleave2 = call @llvm.vector.interleave2.nxv16i32( - shufflevector ( insertelement ( poison, i32 666, i64 0), poison, 
zeroinitializer), - shufflevector ( insertelement ( poison, i32 777, i64 0), poison, zeroinitializer) - ) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), - i32 87) + %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 87) ret void } @@ -187,7 +182,7 @@ define void @store_factor4_v2( %v0, %v1, pt %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -224,7 +219,7 @@ define void @store_factor8_v2( %v0, %v1, pt %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -489,7 +484,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 0 @@ -542,7 +537,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %t0 = extractvalue { , } %d0, 1 @@ -611,7 +606,7 @@ define void @not_balanced_store_tree( %v0, %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) ret void } From 0e4bf695150d0c578bf899c87d251e4645a9f239 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 15:28:58 -0800 Subject: [PATCH 07/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 38 ++- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 23 ++ .../RISCV/rvv/vp-vector-interleaved-access.ll | 223 +++++++++++++----- 3 files changed, 214 insertions(+), 70 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 92a2d8b36947b..2eb22434bf9b3 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,9 +630,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -// Return nullptr if the value corresponds to a all-true mask. Otherwise, -// return the value that is corresponded to a deinterleaved mask. -static Value *getMask(Value *WideMask, unsigned Factor) { +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + Value *MaskVal = nullptr; + using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -640,13 +643,28 @@ static Value *getMask(Value *WideMask, unsigned Factor) { if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; + MaskVal = Operands[0]; } } - if (match(WideMask, m_AllOnes())) - return WideMask; - return nullptr; + if (match(WideMask, m_AllOnes())) { + // Scale the vector length. + ElementCount OrigEC = + cast(WideMask->getType())->getElementCount(); + MaskVal = + ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), + cast(WideMask)->getSplatValue()); + } + + if (MaskVal) { + // Check if the vector length of mask matches that of the leaf values. + auto *MaskTy = cast(MaskVal->getType()); + if (!MaskTy->getElementType()->isIntegerTy(/*Bitwidth=*/1) || + MaskTy->getElementCount() != LeafValueTy->getElementCount()) + return nullptr; + } + + return MaskVal; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -668,7 +686,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true and interleaved mask. 
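// As a concrete illustration of the all-true case handled by getMask (taking the
// factor-4 loads of <vscale x 8 x i32> from the tests in this series): the
// <vscale x 8 x i1> splat(true) mask of the wide vp.load is rescaled to a
// <vscale x 2 x i1> splat(true) mask for the factor-4 segment access, matching
// the element count of each deinterleaved leaf value.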
Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); if (!Mask) return false; @@ -720,7 +739,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2f9381f6b9d82..27049cdb42623 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -22529,6 +22530,22 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } else { + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; + } +} + /// Lower an interleaved vp.load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved vp.load (Factor = 2): @@ -22586,6 +22603,9 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( IRBuilder<> Builder(Load); Value *WideEVL = Load->getArgOperand(2); + if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) + return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); Value *EVL = Builder.CreateZExtOrTrunc( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), @@ -22740,6 +22760,9 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); + if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) + return false; + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); Value *EVL = Builder.CreateZExtOrTrunc( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 1e2c611d260cc..419dd4d86ed8f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -2,9 +2,10 @@ ; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s -define {, } @load_factor2_v2(ptr %ptr, i32 %rvl) { +define {, } @load_factor2_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0) @@ -12,10 +13,12 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; ; RV64-LABEL: load_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, 
e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 @@ -25,9 +28,10 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ret { , } %res1 } -define {, , , } @load_factor4_v2(ptr %ptr, i32 %rvl) { +define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg4e32.v v8, (a0) @@ -35,10 +39,12 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -57,9 +63,10 @@ define {, , , , , , } %res3 } -define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %rvl) { +define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor8_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg8e32.v v8, (a0) @@ -67,10 +74,12 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -106,9 +115,10 @@ define {, , , , , , , , , , } %res7 } -define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg2e32.v v8, (a0) @@ -116,10 +126,12 @@ define void @store_factor2_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -138,7 +150,7 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: li a1, 87 +; RV32-NEXT: li a1, 88 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 @@ -152,18 +164,19 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV64-NEXT: addi a1, a1, 666 ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: li a1, 87 +; RV64-NEXT: li a1, 88 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 87) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 88) ret void } -define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, 
ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -173,12 +186,14 @@ define void @store_factor4_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor4_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 34 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) @@ -186,9 +201,10 @@ define void @store_factor4_v2( %v0, %v1, pt ret void } -define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor8_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -202,7 +218,8 @@ define void @store_factor8_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor8_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 35 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 @@ -212,6 +229,7 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v15, v9 ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) @@ -223,9 +241,10 @@ define void @store_factor8_v2( %v0, %v1, pt ret void } -define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %rvl) { +define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t @@ -233,10 +252,12 @@ define {, } @masked_load_factor2_v2( @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -247,9 +268,10 @@ define {, } @masked_load_factor2_v2(, } %res1 } -define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %rvl) { +define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t @@ -257,10 +279,12 @@ define {, , , @llvm.vector.interleave2.nxv4i1( %mask, %mask) %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) @@ -282,31 +306,37 @@ define {, , , , , , } %res3 } -define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_store_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv1r.v v9, v8 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli 
zero, a1, e32, mf2, ta, ma -; RV32-NEXT: vmv1r.v v9, v8 ; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: masked_store_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 -; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } -define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %rvl) { +define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_store_factor2_v2_shared_mask: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t @@ -315,11 +345,13 @@ define void @masked_load_store_factor2_v2_shared_mask( %mask, p ; ; RV64-LABEL: masked_load_store_factor2_v2_shared_mask: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -330,7 +362,7 @@ define void @masked_load_store_factor2_v2_shared_mask( %mask, p ret void } -define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %rvl) { +define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma @@ -357,18 +389,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV32-NEXT: vslideup.vx v10, v9, a3 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t -; RV32-NEXT: li a2, 32 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v13, v10, a2 -; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vnsrl.wx v13, v10, a1 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vnsrl.wi v12, v10, 0 -; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: srli a2, a2, 1 ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract: @@ -380,38 +413,39 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: slli a4, a1, 33 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; 
RV64-NEXT: vmerge.vim v11, v9, 1, v0 ; RV64-NEXT: srli a3, a3, 2 ; RV64-NEXT: vwaddu.vv v12, v11, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vx v11, v12, a3 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v11, 0 -; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: add a1, a3, a3 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vx v10, v9, a3 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; RV64-NEXT: srli a1, a4, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v10, (a0), v0.t -; RV64-NEXT: li a2, 32 -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v13, v10, a2 -; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v13, v10, a1 +; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vnsrl.wi v12, v10, 0 -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: srli a4, a4, 33 ; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -423,9 +457,10 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ret i32 %r0 } -define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_store_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -435,12 +470,14 @@ define void @masked_store_factor4_v2( %mask, ; ; RV64-LABEL: masked_store_factor4_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 34 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) @@ -454,9 +491,10 @@ define void @masked_store_factor4_v2( %mask, ; Negative tests ; We should not transform this function because the deinterleave tree is not in a desired form. 
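; For reference, the form the pass expects is a balanced deinterleave2 tree, as in the
; positive factor-4 tests above: the wide load is deinterleaved once, each of the two
; results is deinterleaved exactly once more, and every extractvalue index is used
; exactly once.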
-define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %rvl) { +define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { ; RV32-LABEL: incorrect_extract_value_index: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: li a0, 32 @@ -471,7 +509,7 @@ define {, , , , , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -503,9 +542,10 @@ define {, , , , , , } @not_balanced_load_tree(ptr %ptr, i32 %rvl) { +define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { ; RV32-LABEL: not_balanced_load_tree: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v12, (a0) ; RV32-NEXT: li a0, 32 @@ -522,7 +562,7 @@ define {, , , , , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -555,9 +596,10 @@ define {, , , , , , } %res3 } -define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %rvl) { +define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_balanced_store_tree: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; RV32-NEXT: vwaddu.vv v12, v8, v8 ; RV32-NEXT: li a2, -1 @@ -585,7 +627,7 @@ define void @not_balanced_store_tree( %v0, ; RV64-NEXT: vwaddu.vv v12, v8, v8 ; RV64-NEXT: li a2, -1 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 34 ; RV64-NEXT: vwmaccu.vx v12, a2, v8 ; RV64-NEXT: srli a3, a3, 3 ; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma @@ -603,6 +645,7 @@ define void @not_balanced_store_tree( %v0, ; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV64-NEXT: vse32.v v16, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) @@ -611,9 +654,10 @@ define void @not_balanced_store_tree( %v0, } ; We only support scalable vectors for now. 
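; In the fixed-length case below no segment load is formed; the checks show a plain
; vle32.v followed by the vnsrl-based deinterleave expansion instead.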
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %rvl) { +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { ; RV32-LABEL: not_scalable_vectors: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: li a0, 32 @@ -629,7 +673,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ; ; RV64-LABEL: not_scalable_vectors: ; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 34 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) @@ -643,6 +687,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ; RV64-NEXT: vnsrl.wx v11, v12, a0 ; RV64-NEXT: vnsrl.wi v9, v12, 0 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 @@ -661,7 +706,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 } -define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %rvl) { +define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma @@ -691,6 +736,7 @@ define {, } @not_same_mask( ; RV32-NEXT: vslideup.vx v10, v8, a3 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t ; RV32-NEXT: li a0, 32 @@ -709,7 +755,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 33 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV64-NEXT: vmerge.vim v11, v8, 1, v0 ; RV64-NEXT: vmv1r.v v0, v9 @@ -737,6 +783,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vnsrl.wx v9, v10, a0 ; RV64-NEXT: vnsrl.wi v8, v10, 0 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -747,5 +794,59 @@ define {, } @not_same_mask( ret { , } %res1 } +; EVL should be a multiple of factor +define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { +; RV32-LABEL: invalid_evl: +; RV32: # %bb.0: +; RV32-NEXT: ori a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v10, v14, a0 +; RV32-NEXT: vnsrl.wi v8, v14, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_evl: +; RV64: # %bb.0: +; RV64-NEXT: ori a1, a1, 1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; 
RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v10, v14, a0 +; RV64-NEXT: vnsrl.wi v8, v14, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %rvl = or i32 %evl, 1 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 1afd2cc11caa6edb2028af9001921a54a05c6ad5 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 15:47:26 -0800 Subject: [PATCH 08/12] fixup! Add more comments --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27049cdb42623..623f6e87bd2b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22603,6 +22603,8 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( IRBuilder<> Builder(Load); Value *WideEVL = Load->getArgOperand(2); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) return false; @@ -22760,6 +22762,8 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) return false; From 16a88d9a8180635e398aade94306161ea8bee9bf Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 16:00:17 -0800 Subject: [PATCH 09/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 21 ++++++--------------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 +++++----- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 2eb22434bf9b3..92a4ef75e63a1 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -634,8 +634,6 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, // mask. 
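// (For a factor-2 access, a wide mask built as interleave2(%m, %m) deinterleaves
// back to %m; an all-ones wide mask is rescaled to the segment element count, as
// handled in the hunk below.)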
static Value *getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy) { - Value *MaskVal = nullptr; - using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -643,28 +641,21 @@ static Value *getMask(Value *WideMask, unsigned Factor, if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); if (Operands.size() == Factor && llvm::all_equal(Operands)) - MaskVal = Operands[0]; + return Operands[0]; } } if (match(WideMask, m_AllOnes())) { - // Scale the vector length. + // Scale the vector length of all-ones mask. ElementCount OrigEC = cast(WideMask->getType())->getElementCount(); - MaskVal = - ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), - cast(WideMask)->getSplatValue()); - } - - if (MaskVal) { - // Check if the vector length of mask matches that of the leaf values. - auto *MaskTy = cast(MaskVal->getType()); - if (!MaskTy->getElementType()->isIntegerTy(/*Bitwidth=*/1) || - MaskTy->getElementCount() != LeafValueTy->getElementCount()) + if (OrigEC.getKnownMinValue() % Factor) return nullptr; + return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), + cast(WideMask)->getSplatValue()); } - return MaskVal; + return nullptr; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 623f6e87bd2b1..5dfbd614df9e4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22538,12 +22538,12 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { if (isPowerOf2_32(N)) { KnownBits KB = llvm::computeKnownBits(V, DL); return KB.countMinTrailingZeros() >= Log2_32(N); - } else { - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; } + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; } /// Lower an interleaved vp.load into a vlsegN intrinsic. From 5cd5be43257f9eb72618a2f0c52c4d6d774e1eab Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 30 Jan 2025 12:03:18 -0800 Subject: [PATCH 10/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 3 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 +++++++++------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 92a4ef75e63a1..3261f2858b236 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -649,8 +649,7 @@ static Value *getMask(Value *WideMask, unsigned Factor, // Scale the vector length of all-ones mask. 
ElementCount OrigEC = cast(WideMask->getType())->getElementCount(); - if (OrigEC.getKnownMinValue() % Factor) - return nullptr; + assert(OrigEC.getKnownMinValue() % Factor == 0); return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), cast(WideMask)->getSplatValue()); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5dfbd614df9e4..6d997f5d41e27 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22574,12 +22574,12 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// removed by the caller bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( VPIntrinsic *Load, Value *Mask, - ArrayRef DeInterleaveResults) const { + ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); assert(Load->getIntrinsicID() == Intrinsic::vp_load && "Unexpected intrinsic"); - const unsigned Factor = DeInterleaveResults.size(); + const unsigned Factor = DeinterleaveResults.size(); auto *WideVTy = dyn_cast(Load->getType()); // TODO: Support fixed vectors. @@ -22592,10 +22592,9 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( auto *VTy = VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, WideVTy->isScalableTy()); - // FIXME: Should pass alignment attribute from pointer, but vectorizer needs - // to emit it first. auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Align(DL.getTypeStoreSize(WideVTy->getScalarType())); + Align Alignment = Load->getParamAlign(0).value_or( + DL.getABITypeAlign(WideVTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) @@ -22629,20 +22628,18 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Factor); Value *PoisonVal = PoisonValue::get(VecTupTy); - SmallVector Operands{PoisonVal, Load->getArgOperand(0)}; Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); - Operands.push_back(Mask); - - Operands.push_back(EVL); - - Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | - RISCVII::MASK_AGNOSTIC)); - - Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); @@ -22657,7 +22654,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Return = Builder.CreateInsertValue(Return, VecExtract, i); } - for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) { + for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. Value *NewEV = @@ -22743,10 +22740,9 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( if (!VTy) return false; - // FIXME: Should pass alignment attribute from pointer, but vectorizer needs - // to emit it first. 
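// The replacement below takes the segment access alignment from the pointer
// argument's align attribute when present, and otherwise falls back to the ABI
// alignment of the element type.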
const DataLayout &DL = Store->getDataLayout(); - Align Alignment = Align(DL.getTypeStoreSize(VTy->getScalarType())); + Align Alignment = Store->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) From 16915c94915dbdaeae72f6d2b283c141b32256b9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 30 Jan 2025 16:45:43 -0800 Subject: [PATCH 11/12] fixup! Split out the folding rule on interleave2 of two const splats --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 49 ------------------- .../RISCV/rvv/vp-vector-interleaved-access.ll | 36 -------------- 2 files changed, 85 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6d997f5d41e27..8f3dd51a45a86 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22665,47 +22665,6 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( return true; } -/// If we're interleaving 2 constant splats, for instance ` -/// ` and ` `, we can create a -/// larger splat -/// ` ` first before casting it -/// into -/// ``. This will resuling a simple unit stride store rather -/// than a segment store, which is more expensive in this case. -static Value *foldInterleaved2OfConstSplats(Value *Op0, Value *Op1, - VectorType *VTy, - const TargetLowering *TLI, - Instruction *VPStore) { - auto *SplatVal0 = dyn_cast_or_null(getSplatValue(Op0)); - auto *SplatVal1 = dyn_cast_or_null(getSplatValue(Op1)); - if (!SplatVal0 || !SplatVal1) - return nullptr; - - auto &Ctx = VPStore->getContext(); - auto &DL = VPStore->getModule()->getDataLayout(); - - auto *NewVTy = VectorType::getExtendedElementVectorType(VTy); - if (!TLI->isTypeLegal(TLI->getValueType(DL, NewVTy))) - return nullptr; - - // InterleavedAccessPass will remove VPStore after this but we still want to - // preserve it, hence clone another one here. - auto *ClonedVPStore = VPStore->clone(); - ClonedVPStore->insertBefore(VPStore); - IRBuilder<> Builder(ClonedVPStore); - - Type *ETy = VTy->getElementType(); - unsigned Width = ETy->getIntegerBitWidth(); - - APInt NewSplatVal(Width * 2, SplatVal1->getZExtValue()); - NewSplatVal <<= Width; - NewSplatVal |= SplatVal0->getZExtValue(); - auto *NewSplat = ConstantVector::getSplat(NewVTy->getElementCount(), - ConstantInt::get(Ctx, NewSplatVal)); - return Builder.CreateBitCast(NewSplat, - VectorType::getDoubleElementsVectorType(VTy)); -} - /// Lower an interleaved vp.store into a vssegN intrinsic. /// /// E.g. Lower an interleaved vp.store (Factor = 2): @@ -22748,14 +22707,6 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) return false; - if (Factor == 2) - if (Value *BC = foldInterleaved2OfConstSplats( - InterleaveOperands[0], InterleaveOperands[1], VTy, this, Store)) { - // Store is guranteed to be the only user of the interleaved intrinsic. 
- Store->getOperand(0)->replaceAllUsesWith(BC); - return true; - } - IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); // Conservatively check if EVL is a multiple of factor, otherwise some diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 419dd4d86ed8f..1474c32f85d49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -137,42 +137,6 @@ define void @store_factor2_v2( %v0, %v1, pt ret void } -; Expecting unit-stride store here rather than segmented store. -define void @store_factor2_const_splat(ptr %dst) { -; RV32-LABEL: store_factor2_const_splat: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: li a1, 777 -; RV32-NEXT: li a2, 666 -; RV32-NEXT: sw a2, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: li a1, 88 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: store_factor2_const_splat: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 777 -; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: addi a1, a1, 666 -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: li a1, 88 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: ret - %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 88) - ret void -} - define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor4_v2: ; RV32: # %bb.0: From 725dfad46b81ec4d1b336fb64c9d211ce825bf07 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 31 Jan 2025 11:25:03 -0800 Subject: [PATCH 12/12] fixup! Address review comments --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16 +++++++-------- .../RISCV/rvv/vp-vector-interleaved-access.ll | 20 +++++++++---------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8f3dd51a45a86..fc59b7e690ba1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22572,6 +22572,9 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// /// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be /// removed by the caller +/// TODO: We probably can loosen the dependency on matching extractvalue when +/// dealing with factor of 2 (extractvalue is still required for most of other +/// factors though). 
bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { @@ -22608,7 +22611,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( return false; auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExtOrTrunc( + Value *EVL = Builder.CreateZExt( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); @@ -22715,7 +22718,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( return false; auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExtOrTrunc( + Value *EVL = Builder.CreateZExt( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); @@ -22741,17 +22744,12 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( StoredVal = Builder.CreateCall( VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); - SmallVector Operands; - Operands.push_back(StoredVal); - Operands.push_back(Store->getArgOperand(1)); - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); - Operands.push_back(Mask); - Operands.push_back(EVL); - Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; Builder.CreateCall(VssegNFunc, Operands); return true; diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 1474c32f85d49..e481891dfd52f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -23,7 +23,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -56,7 +56,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -104,7 +104,7 @@ define {, , , , } %d6, 0 %t7 = extractvalue { , } %d6, 1 - %res0 = insertvalue { , , , , , , , } undef, %t0, 0 + %res0 = insertvalue { , , , , , , , } poison, %t0, 0 %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 @@ -227,7 +227,7 @@ define {, } @masked_load_factor2_v2(, } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -263,7 +263,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ 
-498,7 +498,7 @@ define {, , , , } %d2, 1 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -553,7 +553,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -663,7 +663,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 @@ -753,7 +753,7 @@ define {, } @not_same_mask( %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -805,7 +805,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3
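For readers skimming the series, the EVL handling is the subtle part: the transform divides the wide EVL by the interleave factor, so an EVL that is not a multiple of the factor would silently drop trailing elements, which is why isMultipleOfN conservatively rejects such cases. A minimal standalone sketch of that arithmetic (plain C++, not the LLVM code; the variable names and the sample values 10 and 4 are made up for illustration):

    #include <cstdio>

    int main() {
      unsigned WideEVL = 10, Factor = 4;     // hypothetical values
      unsigned SegEVL = WideEVL / Factor;    // what the segment lowering would use
      unsigned Covered = SegEVL * Factor;    // elements actually accessed
      std::printf("segments cover %u of %u elements\n", Covered, WideEVL); // 8 of 10
      return 0;
    }

With a wide EVL of the form mul i32 %evl, 4, as in the updated tests, Covered always equals WideEVL and the segment lowering loses nothing.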