From 156661a47bfaeb5dcd0588683c34ec91eeb8402a Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 21 Jan 2025 19:43:15 -0800 Subject: [PATCH 01/12] [IA][RISCV] Support VP intrinsics in InterleavedAccessPass Teach InterleavedAccessPass to recognize the following patterns: - vp.store an interleaved scalable vector - Deinterleaving a scalable vector loaded from vp.load Upon recognizing these patterns, IA will collect the interleaved / deinterleaved operands and delegate them over to their respective newly-added TLI hooks. For RISC-V, these patterns are lowered into segmented loads/stores (except when we're interleaving constant splats, in which case a unit-stride store will be generated) Right now we only recognize power-of-two (de)interleave cases, in which (de)interleave4/8 are synthesized from a tree of (de)interleave2. Co-authored-by: Nikolay Panchenko --- llvm/include/llvm/CodeGen/TargetLowering.h | 29 + llvm/lib/CodeGen/InterleavedAccessPass.cpp | 110 +++- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 257 ++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 + .../scalable-vectors-interleaved-access.ll | 619 ++++++++++++++++++ 5 files changed, 1005 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59743dbe4d2ea..50fa3a39c0a0c 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -94,6 +94,7 @@ class TargetRegisterClass; class TargetRegisterInfo; class TargetTransformInfo; class Value; +class VPIntrinsic; namespace Sched { @@ -3152,6 +3153,34 @@ class TargetLoweringBase { return false; } + /// Lower an interleaved load to target specific intrinsics. Return + /// true on success. + /// + /// \p Load is a vp.load instruction. + /// \p Mask is a mask value + /// \p DeinterleaveIntrin is vector.deinterleave intrinsic + /// \p DeinterleaveRes is a list of deinterleaved results. + virtual bool + lowerInterleavedScalableLoad(VPIntrinsic *Load, Value *Mask, + IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeinterleaveRes) const { + return false; + } + + /// Lower an interleaved store to target specific intrinsics. Return + /// true on success. + /// + /// \p Store is the vp.store instruction. + /// \p Mask is a mask value + /// \p InterleaveIntrin is vector.interleave intrinsic + /// \p InterleaveOps is a list of values being interleaved. + virtual bool + lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, + IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOps) const { + return false; + } + + /// Lower a deinterleave intrinsic to a target specific load intrinsic. /// Return true on success.
Currently only supports /// llvm.vector.deinterleave2 diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 3f6a69ecb7d72..9b15d0351ebe1 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,11 +630,34 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } +/// Check the interleaved mask +/// +/// - if a value within the optional is non-nullptr, the value corresponds to +/// deinterleaved mask +/// - if a value within the option is nullptr, the value corresponds to all-true +/// mask +/// - return nullopt if mask cannot be deinterleaved +static std::optional getMask(Value *WideMask, unsigned Factor) { + using namespace llvm::PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && + std::equal(Operands.begin(), Operands.end(), Operands.begin())) + return Operands.front(); + } + } + if (match(WideMask, m_AllOnes())) + return nullptr; + return std::nullopt; +} + bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { - LoadInst *LI = dyn_cast(DI->getOperand(0)); - - if (!LI || !LI->hasOneUse() || !LI->isSimple()) + Value *LoadedVal = DI->getOperand(0); + if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; SmallVector DeinterleaveValues; @@ -643,16 +666,42 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( DeinterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI - << " with factor = " << DeinterleaveValues.size() << "\n"); + const unsigned Factor = DeinterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) - return false; + if (auto *VPLoad = dyn_cast(LoadedVal)) { + if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) + return false; + // Check mask operand. Handle both all-true and interleaved mask. + Value *WideMask = VPLoad->getOperand(1); + std::optional Mask = getMask(WideMask, Factor); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " + << *DI << " and factor = " << Factor << "\n"); + + // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, + DeinterleaveValues)) + return false; + + } else { + auto *LI = cast(LoadedVal); + if (!LI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) + return false; + } DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end()); // We now have a target-specific load, so delete the old one. 
- DeadInsts.insert(LI); + DeadInsts.insert(cast(LoadedVal)); return true; } @@ -660,10 +709,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( IntrinsicInst *II, SmallSetVector &DeadInsts) { if (!II->hasOneUse()) return false; - - StoreInst *SI = dyn_cast(*(II->users().begin())); - - if (!SI || !SI->isSimple()) + Value *StoredBy = II->user_back(); + if (!isa(StoredBy)) return false; SmallVector InterleaveValues; @@ -671,15 +718,40 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II - << " with factor = " << InterleaveValues.size() << "\n"); + const unsigned Factor = InterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; + if (auto *VPStore = dyn_cast(StoredBy)) { + if (VPStore->getIntrinsicID() != Intrinsic::vp_store) + return false; + + Value *WideMask = VPStore->getOperand(2); + std::optional Mask = getMask(WideMask, Factor); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " + << *II << " and factor = " << Factor << "\n"); + + // Since lowerInterleavedStore expects Shuffle and StoreInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, + InterleaveValues)) + return false; + } else { + auto *SI = cast(StoredBy); + if (!SI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) + return false; + } // We now have a target-specific store, so delete the old one. - DeadInsts.insert(SI); + DeadInsts.insert(cast(StoredBy)); DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end()); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 295fd315c56da..b5155106a506b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22529,6 +22529,263 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } +/// Lower an interleaved vp.load into a vlsegN intrinsic. +/// +/// E.g. 
Lower an interleaved vp.load (Factor = 2): +/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// %dl = tail call { , } +/// @llvm.vector.deinterleave2.nxv64i8( +/// %l) +/// %r0 = extractvalue { , } %dl, 0 +/// %r1 = extractvalue { , } %dl, 1 +/// +/// Into: +/// %rvl = udiv %wide.rvl, 2 +/// %sl = call { , } +/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, +/// undef, +/// ptr %ptr, +/// %mask, +/// i64 %rvl, +/// i64 1) +/// %r0 = extractvalue { , } %sl, 0 +/// %r1 = extractvalue { , } %sl, 1 +/// +/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be +/// removed by the caller +bool RISCVTargetLowering::lowerInterleavedScalableLoad( + VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeInterleaveResults) const { + assert(Load->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + + const unsigned Factor = DeInterleaveResults.size(); + + auto *WideVTy = cast(Load->getType()); + unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); + assert(WideNumElements % Factor == 0 && + "ElementCount of a wide load must be divisible by interleave factor"); + auto *VTy = + VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, + WideVTy->isScalableTy()); + // FIXME: Should pass alignment attribute from pointer, but vectorizer needs + // to emit it first. + auto &DL = Load->getModule()->getDataLayout(); + Align Alignment = Align(DL.getTypeStoreSize(WideVTy->getScalarType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Load); + Value *WideEVL = Load->getArgOperand(2); + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExtOrTrunc( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); + + Value *PoisonVal = PoisonValue::get(VecTupTy); + SmallVector Operands; + Operands.append({PoisonVal, Load->getArgOperand(0)}); + + if (!Mask) + Mask = ConstantVector::getSplat(VTy->getElementCount(), + ConstantInt::getTrue(Load->getContext())); + + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Operands.push_back(Mask); + + Operands.push_back(EVL); + + // Tail-policy + Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC)); + + Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + + SmallVector AggrTypes{Factor, VTy}; + Value *Return = + PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + 
Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } + + for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) { + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. + Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIO->replaceAllUsesWith(NewEV); + } + + return true; +} + +/// If we're interleaving 2 constant splats, for instance ` +/// ` and ` `, we can create a +/// larger splat +/// ` ` first before casting it +/// into +/// ``. This will resuling a simple unit stride store rather +/// than a segment store, which is more expensive in this case. +static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, + VectorType *VTy, + const TargetLowering *TLI, + Instruction *VPStore) { + // We only handle Factor = 2 for now. + assert(InterleaveIntrin->arg_size() == 2); + auto *SplatVal0 = dyn_cast_or_null( + getSplatValue(InterleaveIntrin->getArgOperand(0))); + auto *SplatVal1 = dyn_cast_or_null( + getSplatValue(InterleaveIntrin->getArgOperand(1))); + if (!SplatVal0 || !SplatVal1) + return nullptr; + + auto &Ctx = VPStore->getContext(); + auto &DL = VPStore->getModule()->getDataLayout(); + + auto *NewVTy = VectorType::getExtendedElementVectorType(VTy); + if (!TLI->isTypeLegal(TLI->getValueType(DL, NewVTy))) + return nullptr; + + // InterleavedAccessPass will remove VPStore after this but we still want to + // preserve it, hence clone another one here. + auto *ClonedVPStore = VPStore->clone(); + ClonedVPStore->insertBefore(VPStore); + IRBuilder<> Builder(ClonedVPStore); + + Type *ETy = VTy->getElementType(); + unsigned Width = ETy->getIntegerBitWidth(); + + APInt NewSplatVal(Width * 2, SplatVal1->getZExtValue()); + NewSplatVal <<= Width; + NewSplatVal |= SplatVal0->getZExtValue(); + auto *NewSplat = ConstantVector::getSplat(NewVTy->getElementCount(), + ConstantInt::get(Ctx, NewSplatVal)); + return Builder.CreateBitCast(NewSplat, + VectorType::getDoubleElementsVectorType(VTy)); +} + +/// Lower an interleaved vp.store into a vssegN intrinsic. +/// +/// E.g. Lower an interleaved vp.store (Factor = 2): +/// +/// %is = tail call +/// @llvm.vector.interleave2.nxv64i8( +/// %load0, +/// %load1 +/// %wide.rvl = shl nuw nsw i32 %rvl, 1 +/// tail call void @llvm.vp.store.nxv64i8.p0( +/// %is, ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// +/// Into: +/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( +/// %load1, +/// %load2, ptr %ptr, +/// %mask, +/// i64 %rvl) +bool RISCVTargetLowering::lowerInterleavedScalableStore( + VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOperands) const { + assert(Store->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + + const unsigned Factor = InterleaveOperands.size(); + + VectorType *VTy = cast(InterleaveOperands[0]->getType()); + + // FIXME: Should pass alignment attribute from pointer, but vectorizer needs + // to emit it first. 
+ const DataLayout &DL = Store->getDataLayout(); + Align Alignment = Align(DL.getTypeStoreSize(VTy->getScalarType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) + return false; + + if (Factor == 2) + if (Value *BC = + foldInterleaved2OfConstSplats(InterleaveIntrin, VTy, this, Store)) { + InterleaveIntrin->replaceAllUsesWith(BC); + return true; + } + + IRBuilder<> Builder(Store); + Value *WideEVL = Store->getArgOperand(3); + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExtOrTrunc( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); + + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateCall( + VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); + + SmallVector Operands; + Operands.push_back(StoredVal); + Operands.push_back(Store->getArgOperand(1)); + + if (!Mask) + Mask = ConstantVector::getSplat(VTy->getElementCount(), + ConstantInt::getTrue(Store->getContext())); + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Operands.push_back(Mask); + Operands.push_back(EVL); + Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + + Builder.CreateCall(VssegNFunc, Operands); + return true; +} + MachineInstr * RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 77605a3076a80..f779f4fdc26c1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,6 +910,15 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; + bool lowerInterleavedScalableLoad( + VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + ArrayRef DeinterleaveRes) const override; + + bool + lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, + IntrinsicInst *InterleaveIntrin, + ArrayRef InterleaveOps) const override; + bool supportKCFIBundles() const override { return true; } SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll new file mode 100644 index 0000000000000..ac254792e167a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll @@ -0,0 +1,619 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s + +define {, } @load_factor2_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @load_factor4_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %rvl) { +; RV32-LABEL: load_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg8e32.v v8, (a0) +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv16i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) + %d1.0 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) + %d2.0 = extractvalue { , } %d2, 0 + %d2.1 = extractvalue { , } %d2, 1 + + %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) + %t0 = extractvalue { , } %d3, 0 + %t4 = extractvalue { , } %d3, 1 + %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d4, 0 + %t6 = extractvalue { , } %d4, 1 + %d5 = call { 
, } @llvm.vector.deinterleave2.nxv4i32( %d2.0) + %t1 = extractvalue { , } %d5, 0 + %t5 = extractvalue { , } %d5, 1 + %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) + %t3 = extractvalue { , } %d6, 0 + %t7 = extractvalue { , } %d6, 1 + + %res0 = insertvalue { , , , , , , , } undef, %t0, 0 + %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 + %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 + ret { , , , , , , , } %res7 +} + +define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg2e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) + call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +; Expecting unit-stride store here rather than segmented store. +define void @store_factor2_const_splat(ptr %dst) { +; RV32-LABEL: store_factor2_const_splat: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: li a1, 777 +; RV32-NEXT: li a2, 666 +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a1), zero +; RV32-NEXT: li a1, 87 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor2_const_splat: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 777 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: addi a1, a1, 666 +; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: li a1, 87 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: ret + %interleave2 = call @llvm.vector.interleave2.nxv16i32( + shufflevector ( insertelement ( poison, i32 666, i64 0), poison, zeroinitializer), + shufflevector ( insertelement ( poison, i32 777, i64 0), poison, zeroinitializer) + ) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, + shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + i32 87) + ret void +} + +define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( 
%interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: store_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vmv1r.v v12, v8 +; RV32-NEXT: vmv1r.v v13, v9 +; RV32-NEXT: vmv1r.v v14, v8 +; RV32-NEXT: vmv1r.v v15, v9 +; RV32-NEXT: vsseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vmv1r.v v12, v8 +; RV64-NEXT: vmv1r.v v13, v9 +; RV64-NEXT: vmv1r.v v14, v8 +; RV64-NEXT: vmv1r.v v15, v9 +; RV64-NEXT: vsseg8e32.v v8, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) + %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask0 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) + %d0 = call { , } 
@llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v9, v8 +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v8, v0 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v9, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v11, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v11, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v11, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: 
vslideup.vx v10, v9, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v13, v10, a2 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vnsrl.wi v12, v10, 0 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v8, v0 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v9, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v11, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v11, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmsne.vi v0, v11, 0 +; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v9, a3 +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v13, v10, a2 +; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: vnsrl.wi v12, v10, 0 +; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %r0 = extractelement %wide.masked.load, i32 0 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret i32 %r0 +} + +define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: masked_store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) + %interleaved.vec0 = call 
@llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + ret void +} + +; Negative tests + +; We should not transform this function because the deinterleave tree is not in a desired form. +define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %rvl) { +; RV32-LABEL: incorrect_extract_value_index: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wi v12, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v12, a0 +; RV32-NEXT: vnsrl.wi v8, v12, 0 +; RV32-NEXT: vmv.v.v v10, v9 +; RV32-NEXT: vmv.v.v v11, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: incorrect_extract_value_index: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wi v12, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v12, a0 +; RV64-NEXT: vnsrl.wi v8, v12, 0 +; RV64-NEXT: vmv.v.v v10, v9 +; RV64-NEXT: vmv.v.v v11, v9 +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 0 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 1 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +; We should not transform this function because the expression is not a balanced tree. 
+define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_balanced_load_tree: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v8, v12, a0 +; RV32-NEXT: vnsrl.wi v16, v12, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wi v10, v16, 0 +; RV32-NEXT: vnsrl.wx v11, v16, a0 +; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v11, a0 +; RV32-NEXT: vnsrl.wi v11, v11, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_load_tree: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wx v8, v12, a0 +; RV64-NEXT: vnsrl.wi v16, v12, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wi v10, v16, 0 +; RV64-NEXT: vnsrl.wx v11, v16, a0 +; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v12, v11, a0 +; RV64-NEXT: vnsrl.wi v11, v11, 0 +; RV64-NEXT: ret + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %t0 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t1 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_balanced_store_tree: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV32-NEXT: vwaddu.vv v12, v8, v8 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vwmaccu.vx v12, a2, v8 +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vx v8, v12, a3 +; RV32-NEXT: add a4, a3, a3 +; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v12, v8, a3 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vwaddu.vv v14, v12, v9 +; RV32-NEXT: vwmaccu.vx v14, a2, v9 +; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV32-NEXT: vwaddu.vv v16, v14, v10 +; RV32-NEXT: vwmaccu.vx v16, a2, v10 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vse32.v v16, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_store_tree: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV64-NEXT: vwaddu.vv v12, v8, v8 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: vwmaccu.vx v12, a2, v8 +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vx v8, v12, a3 +; RV64-NEXT: add a4, a3, a3 +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v12, v8, a3 +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vwaddu.vv v14, v12, v9 +; RV64-NEXT: vwmaccu.vx v14, a2, v9 +; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV64-NEXT: vwaddu.vv v16, 
v14, v10 +; RV64-NEXT: vwmaccu.vx v16, a2, v10 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vse32.v v16, (a0) +; RV64-NEXT: ret + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + ret void +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} From c869bec122d63858c7196040d6e29e679761d459 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 13:34:29 -0800 Subject: [PATCH 02/12] fixup! Address review comments - Change what getMask returns - Change the name of TLI hooks - Limit the TLI hooks to scalable vectors only for now --- llvm/include/llvm/CodeGen/TargetLowering.h | 12 ++---- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 19 +++++---- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 46 ++++++++++----------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 11 +++-- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 50fa3a39c0a0c..9411dc66b2931 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3158,12 +3158,10 @@ class TargetLoweringBase { /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value - /// \p DeinterleaveIntrin is vector.deinterleave intrinsic /// \p DeinterleaveRes is a list of deinterleaved results. virtual bool - lowerInterleavedScalableLoad(VPIntrinsic *Load, Value *Mask, - IntrinsicInst *DeinterleaveIntrin, - ArrayRef DeinterleaveRes) const { + lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } @@ -3172,12 +3170,10 @@ class TargetLoweringBase { /// /// \p Store is the vp.store instruction. /// \p Mask is a mask value - /// \p InterleaveIntrin is vector.interleave intrinsic /// \p InterleaveOps is a list of values being interleaved. 
virtual bool - lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, - IntrinsicInst *InterleaveIntrin, - ArrayRef InterleaveOps) const { + lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9b15d0351ebe1..32ca90938d7ab 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -637,7 +637,7 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, /// - if a value within the option is nullptr, the value corresponds to all-true /// mask /// - return nullopt if mask cannot be deinterleaved -static std::optional getMask(Value *WideMask, unsigned Factor) { +static Value *getMask(Value *WideMask, unsigned Factor) { using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -650,8 +650,9 @@ static std::optional getMask(Value *WideMask, unsigned Factor) { } } if (match(WideMask, m_AllOnes())) - return nullptr; - return std::nullopt; + return WideMask; + + return nullptr; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -673,7 +674,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - std::optional Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor); if (!Mask) return false; @@ -682,8 +683,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, - DeinterleaveValues)) + if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, + DeinterleaveValues)) return false; } else { @@ -725,7 +726,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - std::optional Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor); if (!Mask) return false; @@ -734,8 +735,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // Since lowerInterleavedStore expects Shuffle and StoreInst, use special // TLI function to emit target-specific interleaved instruction. 
- if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, - InterleaveValues)) + if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, + InterleaveValues)) return false; } else { auto *SI = cast(StoredBy); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b5155106a506b..a93f05a85d3aa 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22555,15 +22555,20 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( /// /// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be /// removed by the caller -bool RISCVTargetLowering::lowerInterleavedScalableLoad( - VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, +bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, ArrayRef DeInterleaveResults) const { + assert(Mask && "Expect a valid mask"); assert(Load->getIntrinsicID() == Intrinsic::vp_load && "Unexpected intrinsic"); const unsigned Factor = DeInterleaveResults.size(); - auto *WideVTy = cast(Load->getType()); + auto *WideVTy = dyn_cast(Load->getType()); + // TODO: Support fixed vectors. + if (!WideVTy) + return false; + unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); assert(WideNumElements % Factor == 0 && "ElementCount of a wide load must be divisible by interleave factor"); @@ -22605,10 +22610,6 @@ bool RISCVTargetLowering::lowerInterleavedScalableLoad( SmallVector Operands; Operands.append({PoisonVal, Load->getArgOperand(0)}); - if (!Mask) - Mask = ConstantVector::getSplat(VTy->getElementCount(), - ConstantInt::getTrue(Load->getContext())); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); @@ -22653,16 +22654,12 @@ bool RISCVTargetLowering::lowerInterleavedScalableLoad( /// into /// ``. This will resuling a simple unit stride store rather /// than a segment store, which is more expensive in this case. -static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, +static Value *foldInterleaved2OfConstSplats(Value *Op0, Value *Op1, VectorType *VTy, const TargetLowering *TLI, Instruction *VPStore) { - // We only handle Factor = 2 for now. - assert(InterleaveIntrin->arg_size() == 2); - auto *SplatVal0 = dyn_cast_or_null( - getSplatValue(InterleaveIntrin->getArgOperand(0))); - auto *SplatVal1 = dyn_cast_or_null( - getSplatValue(InterleaveIntrin->getArgOperand(1))); + auto *SplatVal0 = dyn_cast_or_null(getSplatValue(Op0)); + auto *SplatVal1 = dyn_cast_or_null(getSplatValue(Op1)); if (!SplatVal0 || !SplatVal1) return nullptr; @@ -22711,15 +22708,19 @@ static Value *foldInterleaved2OfConstSplats(IntrinsicInst *InterleaveIntrin, /// %load2, ptr %ptr, /// %mask, /// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedScalableStore( - VPIntrinsic *Store, Value *Mask, IntrinsicInst *InterleaveIntrin, +bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOperands) const { + assert(Mask && "Expect a valid mask"); assert(Store->getIntrinsicID() == Intrinsic::vp_store && "Unexpected intrinsic"); const unsigned Factor = InterleaveOperands.size(); - VectorType *VTy = cast(InterleaveOperands[0]->getType()); + auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); + // TODO: Support fixed vectors. 
+ if (!VTy) + return false; // FIXME: Should pass alignment attribute from pointer, but vectorizer needs // to emit it first. @@ -22731,9 +22732,10 @@ bool RISCVTargetLowering::lowerInterleavedScalableStore( return false; if (Factor == 2) - if (Value *BC = - foldInterleaved2OfConstSplats(InterleaveIntrin, VTy, this, Store)) { - InterleaveIntrin->replaceAllUsesWith(BC); + if (Value *BC = foldInterleaved2OfConstSplats( + InterleaveOperands[0], InterleaveOperands[1], VTy, this, Store)) { + // Store is guranteed to be the only user of the interleaved intrinsic. + Store->getOperand(0)->replaceAllUsesWith(BC); return true; } @@ -22770,10 +22772,6 @@ bool RISCVTargetLowering::lowerInterleavedScalableStore( Operands.push_back(StoredVal); Operands.push_back(Store->getArgOperand(1)); - if (!Mask) - Mask = ConstantVector::getSplat(VTy->getElementCount(), - ConstantInt::getTrue(Store->getContext())); - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f779f4fdc26c1..e9dd8ff96fa37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,14 +910,13 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool lowerInterleavedScalableLoad( - VPIntrinsic *Load, Value *Mask, IntrinsicInst *DeinterleaveIntrin, + bool lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveRes) const override; - bool - lowerInterleavedScalableStore(VPIntrinsic *Store, Value *Mask, - IntrinsicInst *InterleaveIntrin, - ArrayRef InterleaveOps) const override; + bool lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; bool supportKCFIBundles() const override { return true; } From 3688672841431cb526b2a2e7f347c3d07437ce15 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 13:45:25 -0800 Subject: [PATCH 03/12] fixup! Rename test And add negative test case for fixed vectors. --- ...ess.ll => vp-vector-interleaved-access.ll} | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) rename llvm/test/CodeGen/RISCV/rvv/{scalable-vectors-interleaved-access.ll => vp-vector-interleaved-access.ll} (93%) diff --git a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll similarity index 93% rename from llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll rename to llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index ac254792e167a..648f6e324c59d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/scalable-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -615,5 +615,56 @@ define void @not_balanced_store_tree( %v0, ret void } +; We only support scalable vectors for now. 
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_scalable_vectors: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v11, v8, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v10, v11, a0 +; RV32-NEXT: vnsrl.wi v8, v11, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_scalable_vectors: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v11, v8, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v10, v11, a0 +; RV64-NEXT: vnsrl.wi v8, v11, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 32d8de5a1988a3ff80fc236a8c6a96b361fc62f3 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 16:43:25 -0800 Subject: [PATCH 04/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 5 +- .../RISCV/rvv/vp-vector-interleaved-access.ll | 86 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 32ca90938d7ab..beffc193695ac 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -644,9 +644,10 @@ static Value *getMask(Value *WideMask, unsigned Factor) { SmallVector DeadInsts; if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); + Value *FirstOp = Operands[0]; if (Operands.size() == Factor && - std::equal(Operands.begin(), Operands.end(), Operands.begin())) - return Operands.front(); + llvm::all_of(Operands, [=](Value *Op) { return Op == FirstOp; })) + return FirstOp; } } if (match(WideMask, m_AllOnes())) diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 648f6e324c59d..2fb8661ca38f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -666,5 +666,91 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 } +define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %rvl) { +; RV32-LABEL: not_same_mask: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v8, 1, v0 +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vmerge.vim v9, v8, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v9, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v9, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v8, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v10, a0 +; RV32-NEXT: vnsrl.wi v8, v10, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_same_mask: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v9, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v8, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vmerge.vim v9, v8, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v9, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v9, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, 
ta, ma +; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v8, a3 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v10, a0 +; RV64-NEXT: vnsrl.wi v8, v10, 0 +; RV64-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } undef, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 6b7f8e9ab5259f29859ef8ed7abbba65bb3b8414 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 27 Jan 2025 16:51:52 -0800 Subject: [PATCH 05/12] fixup! fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index beffc193695ac..cec7ed5840ed5 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -644,10 +644,8 @@ static Value *getMask(Value *WideMask, unsigned Factor) { SmallVector DeadInsts; if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); - Value *FirstOp = Operands[0]; - if (Operands.size() == Factor && - llvm::all_of(Operands, [=](Value *Op) { return Op == FirstOp; })) - return FirstOp; + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; } } if (match(WideMask, m_AllOnes())) From f7242dfd188e8291a1bc7435e87893004253ec7e Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 28 Jan 2025 14:26:07 -0800 Subject: [PATCH 06/12] fixup! Address some of the review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 9 ++----- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +++-- .../RISCV/rvv/vp-vector-interleaved-access.ll | 27 ++++++++----------- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index cec7ed5840ed5..92a2d8b36947b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,13 +630,8 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -/// Check the interleaved mask -/// -/// - if a value within the optional is non-nullptr, the value corresponds to -/// deinterleaved mask -/// - if a value within the option is nullptr, the value corresponds to all-true -/// mask -/// - return nullopt if mask cannot be deinterleaved +// Return nullptr if the value corresponds to a all-true mask. Otherwise, +// return the value that is corresponded to a deinterleaved mask. 
static Value *getMask(Value *WideMask, unsigned Factor) { using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a93f05a85d3aa..2f9381f6b9d82 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22607,8 +22607,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Factor); Value *PoisonVal = PoisonValue::get(VecTupTy); - SmallVector Operands; - Operands.append({PoisonVal, Load->getArgOperand(0)}); + SmallVector Operands{PoisonVal, Load->getArgOperand(0)}; Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], @@ -22618,8 +22617,8 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Operands.push_back(EVL); - // Tail-policy - Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC)); + Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | + RISCVII::MASK_AGNOSTIC)); Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 2fb8661ca38f1..1e2c611d260cc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -16,7 +16,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret - %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 @@ -39,7 +39,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 1 @@ -71,7 +71,7 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 1 @@ -121,7 +121,7 @@ define void @store_factor2_v2( %v0, %v1, pt ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) - call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -156,13 +156,8 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret - %interleave2 = call @llvm.vector.interleave2.nxv16i32( - shufflevector ( insertelement ( poison, i32 666, i64 0), poison, 
zeroinitializer), - shufflevector ( insertelement ( poison, i32 777, i64 0), poison, zeroinitializer) - ) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), - i32 87) + %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 87) ret void } @@ -187,7 +182,7 @@ define void @store_factor4_v2( %v0, %v1, pt %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -224,7 +219,7 @@ define void @store_factor8_v2( %v0, %v1, pt %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -489,7 +484,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %d0.1 = extractvalue { , } %d0, 0 @@ -542,7 +537,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 %t0 = extractvalue { , } %d0, 1 @@ -611,7 +606,7 @@ define void @not_balanced_store_tree( %v0, %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %rvl) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) ret void } From 0e4bf695150d0c578bf899c87d251e4645a9f239 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 15:28:58 -0800 Subject: [PATCH 07/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 38 ++- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 23 ++ .../RISCV/rvv/vp-vector-interleaved-access.ll | 223 +++++++++++++----- 3 files changed, 214 insertions(+), 70 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 92a2d8b36947b..2eb22434bf9b3 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,9 +630,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -// Return nullptr if the value corresponds to a all-true mask. Otherwise, -// return the value that is corresponded to a deinterleaved mask. -static Value *getMask(Value *WideMask, unsigned Factor) { +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + Value *MaskVal = nullptr; + using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -640,13 +643,28 @@ static Value *getMask(Value *WideMask, unsigned Factor) { if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; + MaskVal = Operands[0]; } } - if (match(WideMask, m_AllOnes())) - return WideMask; - return nullptr; + if (match(WideMask, m_AllOnes())) { + // Scale the vector length. + ElementCount OrigEC = + cast(WideMask->getType())->getElementCount(); + MaskVal = + ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), + cast(WideMask)->getSplatValue()); + } + + if (MaskVal) { + // Check if the vector length of mask matches that of the leaf values. + auto *MaskTy = cast(MaskVal->getType()); + if (!MaskTy->getElementType()->isIntegerTy(/*Bitwidth=*/1) || + MaskTy->getElementCount() != LeafValueTy->getElementCount()) + return nullptr; + } + + return MaskVal; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( @@ -668,7 +686,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true and interleaved mask. 
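// As a concrete illustration of the all-true case handled by getMask (taking the
// factor-4 loads of <vscale x 8 x i32> from the tests in this series): the
// <vscale x 8 x i1> splat(true) mask of the wide vp.load is rescaled to a
// <vscale x 2 x i1> splat(true) mask for the factor-4 segment access, matching
// the element count of each deinterleaved leaf value.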
Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); if (!Mask) return false; @@ -720,7 +739,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2f9381f6b9d82..27049cdb42623 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -22529,6 +22530,22 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } else { + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; + } +} + /// Lower an interleaved vp.load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved vp.load (Factor = 2): @@ -22586,6 +22603,9 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( IRBuilder<> Builder(Load); Value *WideEVL = Load->getArgOperand(2); + if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) + return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); Value *EVL = Builder.CreateZExtOrTrunc( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), @@ -22740,6 +22760,9 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); + if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) + return false; + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); Value *EVL = Builder.CreateZExtOrTrunc( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 1e2c611d260cc..419dd4d86ed8f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -2,9 +2,10 @@ ; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s -define {, } @load_factor2_v2(ptr %ptr, i32 %rvl) { +define {, } @load_factor2_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0) @@ -12,10 +13,12 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; ; RV64-LABEL: load_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, 
e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 @@ -25,9 +28,10 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ret { , } %res1 } -define {, , , } @load_factor4_v2(ptr %ptr, i32 %rvl) { +define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg4e32.v v8, (a0) @@ -35,10 +39,12 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -57,9 +63,10 @@ define {, , , , , , } %res3 } -define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %rvl) { +define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor8_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg8e32.v v8, (a0) @@ -67,10 +74,12 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -106,9 +115,10 @@ define {, , , , , , , , , , } %res7 } -define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg2e32.v v8, (a0) @@ -116,10 +126,12 @@ define void @store_factor2_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -138,7 +150,7 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: li a1, 87 +; RV32-NEXT: li a1, 88 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 @@ -152,18 +164,19 @@ define void @store_factor2_const_splat(ptr %dst) { ; RV64-NEXT: addi a1, a1, 666 ; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: li a1, 87 +; RV64-NEXT: li a1, 88 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 87) + call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 88) ret void } -define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, 
ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -173,12 +186,14 @@ define void @store_factor4_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor4_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 34 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) @@ -186,9 +201,10 @@ define void @store_factor4_v2( %v0, %v1, pt ret void } -define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %rvl) { +define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor8_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: srli a1, a1, 3 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -202,7 +218,8 @@ define void @store_factor8_v2( %v0, %v1, pt ; ; RV64-LABEL: store_factor8_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 3 +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 35 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 @@ -212,6 +229,7 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v15, v9 ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) @@ -223,9 +241,10 @@ define void @store_factor8_v2( %v0, %v1, pt ret void } -define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %rvl) { +define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t @@ -233,10 +252,12 @@ define {, } @masked_load_factor2_v2( @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -247,9 +268,10 @@ define {, } @masked_load_factor2_v2(, } %res1 } -define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %rvl) { +define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t @@ -257,10 +279,12 @@ define {, , , @llvm.vector.interleave2.nxv4i1( %mask, %mask) %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) @@ -282,31 +306,37 @@ define {, , , , , , } %res3 } -define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_store_factor2_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv1r.v v9, v8 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli 
zero, a1, e32, mf2, ta, ma -; RV32-NEXT: vmv1r.v v9, v8 ; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: masked_store_factor2_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 -; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } -define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %rvl) { +define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_store_factor2_v2_shared_mask: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg2e32.v v8, (a0), v0.t @@ -315,11 +345,13 @@ define void @masked_load_store_factor2_v2_shared_mask( %mask, p ; ; RV64-LABEL: masked_load_store_factor2_v2_shared_mask: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -330,7 +362,7 @@ define void @masked_load_store_factor2_v2_shared_mask( %mask, p ret void } -define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %rvl) { +define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma @@ -357,18 +389,19 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV32-NEXT: vslideup.vx v10, v9, a3 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t -; RV32-NEXT: li a2, 32 +; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v13, v10, a2 -; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vnsrl.wx v13, v10, a1 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vnsrl.wi v12, v10, 0 -; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: srli a2, a2, 1 ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma ; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t -; RV32-NEXT: mv a0, a2 +; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract: @@ -380,38 +413,39 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: slli a4, a1, 33 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; 
RV64-NEXT: vmerge.vim v11, v9, 1, v0 ; RV64-NEXT: srli a3, a3, 2 ; RV64-NEXT: vwaddu.vv v12, v11, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vslidedown.vx v11, v12, a3 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v11, 0 -; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: add a1, a3, a3 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vx v10, v9, a3 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; RV64-NEXT: srli a1, a4, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v10, (a0), v0.t -; RV64-NEXT: li a2, 32 -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v13, v10, a2 -; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v13, v10, a1 +; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vnsrl.wi v12, v10, 0 -; RV64-NEXT: srliw a1, a1, 1 +; RV64-NEXT: srli a4, a4, 33 ; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t -; RV64-NEXT: mv a0, a2 +; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -423,9 +457,10 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ret i32 %r0 } -define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %rvl) { +define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: masked_store_factor4_v2: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vmv1r.v v10, v8 @@ -435,12 +470,14 @@ define void @masked_store_factor4_v2( %mask, ; ; RV64-LABEL: masked_store_factor4_v2: ; RV64: # %bb.0: -; RV64-NEXT: srliw a1, a1, 2 +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 34 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vmv1r.v v10, v8 ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) @@ -454,9 +491,10 @@ define void @masked_store_factor4_v2( %mask, ; Negative tests ; We should not transform this function because the deinterleave tree is not in a desired form. 
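; For reference, the form the pass expects is a balanced deinterleave2 tree, as in the
; positive factor-4 tests above: the wide load is deinterleaved once, each of the two
; results is deinterleaved exactly once more, and every extractvalue index is used
; exactly once.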
-define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %rvl) { +define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { ; RV32-LABEL: incorrect_extract_value_index: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: li a0, 32 @@ -471,7 +509,7 @@ define {, , , , , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -503,9 +542,10 @@ define {, , , , , , } @not_balanced_load_tree(ptr %ptr, i32 %rvl) { +define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { ; RV32-LABEL: not_balanced_load_tree: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v12, (a0) ; RV32-NEXT: li a0, 32 @@ -522,7 +562,7 @@ define {, , , , , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) %d0.0 = extractvalue { , } %d0, 0 @@ -555,9 +596,10 @@ define {, , , , , , } %res3 } -define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %rvl) { +define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_balanced_store_tree: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; RV32-NEXT: vwaddu.vv v12, v8, v8 ; RV32-NEXT: li a2, -1 @@ -585,7 +627,7 @@ define void @not_balanced_store_tree( %v0, ; RV64-NEXT: vwaddu.vv v12, v8, v8 ; RV64-NEXT: li a2, -1 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 34 ; RV64-NEXT: vwmaccu.vx v12, a2, v8 ; RV64-NEXT: srli a3, a3, 3 ; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma @@ -603,6 +645,7 @@ define void @not_balanced_store_tree( %v0, ; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV64-NEXT: vse32.v v16, (a0) ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) @@ -611,9 +654,10 @@ define void @not_balanced_store_tree( %v0, } ; We only support scalable vectors for now. 
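; In the fixed-length case below no segment load is formed; the checks show a plain
; vle32.v followed by the vnsrl-based deinterleave expansion instead.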
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %rvl) { +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { ; RV32-LABEL: not_scalable_vectors: ; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: li a0, 32 @@ -629,7 +673,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ; ; RV64-LABEL: not_scalable_vectors: ; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 34 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) @@ -643,6 +687,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ; RV64-NEXT: vnsrl.wx v11, v12, a0 ; RV64-NEXT: vnsrl.wi v9, v12, 0 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 @@ -661,7 +706,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 } -define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %rvl) { +define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma @@ -691,6 +736,7 @@ define {, } @not_same_mask( ; RV32-NEXT: vslideup.vx v10, v8, a3 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vle32.v v10, (a0), v0.t ; RV32-NEXT: li a0, 32 @@ -709,7 +755,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a1, a1, 33 ; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma ; RV64-NEXT: vmerge.vim v11, v8, 1, v0 ; RV64-NEXT: vmv1r.v v0, v9 @@ -737,6 +783,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vnsrl.wx v9, v10, a0 ; RV64-NEXT: vnsrl.wi v8, v10, 0 ; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -747,5 +794,59 @@ define {, } @not_same_mask( ret { , } %res1 } +; EVL should be a multiple of factor +define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { +; RV32-LABEL: invalid_evl: +; RV32: # %bb.0: +; RV32-NEXT: ori a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v10, v14, a0 +; RV32-NEXT: vnsrl.wi v8, v14, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_evl: +; RV64: # %bb.0: +; RV64-NEXT: ori a1, a1, 1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; 
RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v10, v14, a0 +; RV64-NEXT: vnsrl.wi v8, v14, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %rvl = or i32 %evl, 1 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } undef, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} From 1afd2cc11caa6edb2028af9001921a54a05c6ad5 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 15:47:26 -0800 Subject: [PATCH 08/12] fixup! Add more comments --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27049cdb42623..623f6e87bd2b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22603,6 +22603,8 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( IRBuilder<> Builder(Load); Value *WideEVL = Load->getArgOperand(2); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) return false; @@ -22760,6 +22762,8 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) return false; From 16a88d9a8180635e398aade94306161ea8bee9bf Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 29 Jan 2025 16:00:17 -0800 Subject: [PATCH 09/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 21 ++++++--------------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 +++++----- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 2eb22434bf9b3..92a4ef75e63a1 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -634,8 +634,6 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, // mask. 
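// (For a factor-2 access, a wide mask built as interleave2(%m, %m) deinterleaves
// back to %m; an all-ones wide mask is rescaled to the segment element count, as
// handled in the hunk below.)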
static Value *getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy) { - Value *MaskVal = nullptr; - using namespace llvm::PatternMatch; if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; @@ -643,28 +641,21 @@ static Value *getMask(Value *WideMask, unsigned Factor, if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { assert(!Operands.empty()); if (Operands.size() == Factor && llvm::all_equal(Operands)) - MaskVal = Operands[0]; + return Operands[0]; } } if (match(WideMask, m_AllOnes())) { - // Scale the vector length. + // Scale the vector length of all-ones mask. ElementCount OrigEC = cast(WideMask->getType())->getElementCount(); - MaskVal = - ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), - cast(WideMask)->getSplatValue()); - } - - if (MaskVal) { - // Check if the vector length of mask matches that of the leaf values. - auto *MaskTy = cast(MaskVal->getType()); - if (!MaskTy->getElementType()->isIntegerTy(/*Bitwidth=*/1) || - MaskTy->getElementCount() != LeafValueTy->getElementCount()) + if (OrigEC.getKnownMinValue() % Factor) return nullptr; + return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), + cast(WideMask)->getSplatValue()); } - return MaskVal; + return nullptr; } bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 623f6e87bd2b1..5dfbd614df9e4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22538,12 +22538,12 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { if (isPowerOf2_32(N)) { KnownBits KB = llvm::computeKnownBits(V, DL); return KB.countMinTrailingZeros() >= Log2_32(N); - } else { - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; } + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; } /// Lower an interleaved vp.load into a vlsegN intrinsic. From 5cd5be43257f9eb72618a2f0c52c4d6d774e1eab Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 30 Jan 2025 12:03:18 -0800 Subject: [PATCH 10/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 3 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 +++++++++------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 92a4ef75e63a1..3261f2858b236 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -649,8 +649,7 @@ static Value *getMask(Value *WideMask, unsigned Factor, // Scale the vector length of all-ones mask. 
ElementCount OrigEC = cast(WideMask->getType())->getElementCount(); - if (OrigEC.getKnownMinValue() % Factor) - return nullptr; + assert(OrigEC.getKnownMinValue() % Factor == 0); return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), cast(WideMask)->getSplatValue()); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5dfbd614df9e4..6d997f5d41e27 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22574,12 +22574,12 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// removed by the caller bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( VPIntrinsic *Load, Value *Mask, - ArrayRef DeInterleaveResults) const { + ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); assert(Load->getIntrinsicID() == Intrinsic::vp_load && "Unexpected intrinsic"); - const unsigned Factor = DeInterleaveResults.size(); + const unsigned Factor = DeinterleaveResults.size(); auto *WideVTy = dyn_cast(Load->getType()); // TODO: Support fixed vectors. @@ -22592,10 +22592,9 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( auto *VTy = VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, WideVTy->isScalableTy()); - // FIXME: Should pass alignment attribute from pointer, but vectorizer needs - // to emit it first. auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Align(DL.getTypeStoreSize(WideVTy->getScalarType())); + Align Alignment = Load->getParamAlign(0).value_or( + DL.getABITypeAlign(WideVTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) @@ -22629,20 +22628,18 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Factor); Value *PoisonVal = PoisonValue::get(VecTupTy); - SmallVector Operands{PoisonVal, Load->getArgOperand(0)}; Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( Load->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); - Operands.push_back(Mask); - - Operands.push_back(EVL); - - Operands.push_back(ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | - RISCVII::MASK_AGNOSTIC)); - - Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); @@ -22657,7 +22654,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Return = Builder.CreateInsertValue(Return, VecExtract, i); } - for (auto [Idx, DIO] : enumerate(DeInterleaveResults)) { + for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. Value *NewEV = @@ -22743,10 +22740,9 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( if (!VTy) return false; - // FIXME: Should pass alignment attribute from pointer, but vectorizer needs - // to emit it first. 
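// The replacement below takes the segment access alignment from the pointer
// argument's align attribute when present, and otherwise falls back to the ABI
// alignment of the element type.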
const DataLayout &DL = Store->getDataLayout(); - Align Alignment = Align(DL.getTypeStoreSize(VTy->getScalarType())); + Align Alignment = Store->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) From 16915c94915dbdaeae72f6d2b283c141b32256b9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 30 Jan 2025 16:45:43 -0800 Subject: [PATCH 11/12] fixup! Split out the folding rule on interleave2 of two const splats --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 49 ------------------- .../RISCV/rvv/vp-vector-interleaved-access.ll | 36 -------------- 2 files changed, 85 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6d997f5d41e27..8f3dd51a45a86 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22665,47 +22665,6 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( return true; } -/// If we're interleaving 2 constant splats, for instance ` -/// ` and ` `, we can create a -/// larger splat -/// ` ` first before casting it -/// into -/// ``. This will resuling a simple unit stride store rather -/// than a segment store, which is more expensive in this case. -static Value *foldInterleaved2OfConstSplats(Value *Op0, Value *Op1, - VectorType *VTy, - const TargetLowering *TLI, - Instruction *VPStore) { - auto *SplatVal0 = dyn_cast_or_null(getSplatValue(Op0)); - auto *SplatVal1 = dyn_cast_or_null(getSplatValue(Op1)); - if (!SplatVal0 || !SplatVal1) - return nullptr; - - auto &Ctx = VPStore->getContext(); - auto &DL = VPStore->getModule()->getDataLayout(); - - auto *NewVTy = VectorType::getExtendedElementVectorType(VTy); - if (!TLI->isTypeLegal(TLI->getValueType(DL, NewVTy))) - return nullptr; - - // InterleavedAccessPass will remove VPStore after this but we still want to - // preserve it, hence clone another one here. - auto *ClonedVPStore = VPStore->clone(); - ClonedVPStore->insertBefore(VPStore); - IRBuilder<> Builder(ClonedVPStore); - - Type *ETy = VTy->getElementType(); - unsigned Width = ETy->getIntegerBitWidth(); - - APInt NewSplatVal(Width * 2, SplatVal1->getZExtValue()); - NewSplatVal <<= Width; - NewSplatVal |= SplatVal0->getZExtValue(); - auto *NewSplat = ConstantVector::getSplat(NewVTy->getElementCount(), - ConstantInt::get(Ctx, NewSplatVal)); - return Builder.CreateBitCast(NewSplat, - VectorType::getDoubleElementsVectorType(VTy)); -} - /// Lower an interleaved vp.store into a vssegN intrinsic. /// /// E.g. Lower an interleaved vp.store (Factor = 2): @@ -22748,14 +22707,6 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) return false; - if (Factor == 2) - if (Value *BC = foldInterleaved2OfConstSplats( - InterleaveOperands[0], InterleaveOperands[1], VTy, this, Store)) { - // Store is guranteed to be the only user of the interleaved intrinsic. 
- Store->getOperand(0)->replaceAllUsesWith(BC); - return true; - } - IRBuilder<> Builder(Store); Value *WideEVL = Store->getArgOperand(3); // Conservatively check if EVL is a multiple of factor, otherwise some diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 419dd4d86ed8f..1474c32f85d49 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -137,42 +137,6 @@ define void @store_factor2_v2( %v0, %v1, pt ret void } -; Expecting unit-stride store here rather than segmented store. -define void @store_factor2_const_splat(ptr %dst) { -; RV32-LABEL: store_factor2_const_splat: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: li a1, 777 -; RV32-NEXT: li a2, 666 -; RV32-NEXT: sw a2, 8(sp) -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v8, (a1), zero -; RV32-NEXT: li a1, 88 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: store_factor2_const_splat: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 777 -; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: addi a1, a1, 666 -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: li a1, 88 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: ret - %interleave2 = call @llvm.vector.interleave2.nxv16i32( splat (i32 666), splat (i32 777)) - call void @llvm.vp.store.nxv16i32.p0( %interleave2, ptr %dst, splat (i1 true), i32 88) - ret void -} - define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor4_v2: ; RV32: # %bb.0: From 725dfad46b81ec4d1b336fb64c9d211ce825bf07 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 31 Jan 2025 11:25:03 -0800 Subject: [PATCH 12/12] fixup! Address review comments --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16 +++++++-------- .../RISCV/rvv/vp-vector-interleaved-access.ll | 20 +++++++++---------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8f3dd51a45a86..fc59b7e690ba1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22572,6 +22572,9 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// /// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be /// removed by the caller +/// TODO: We probably can loosen the dependency on matching extractvalue when +/// dealing with factor of 2 (extractvalue is still required for most of other +/// factors though). 
bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { @@ -22608,7 +22611,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( return false; auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExtOrTrunc( + Value *EVL = Builder.CreateZExt( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); @@ -22715,7 +22718,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( return false; auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExtOrTrunc( + Value *EVL = Builder.CreateZExt( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); @@ -22741,17 +22744,12 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( StoredVal = Builder.CreateCall( VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); - SmallVector Operands; - Operands.push_back(StoredVal); - Operands.push_back(Store->getArgOperand(1)); - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( Store->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); - Operands.push_back(Mask); - Operands.push_back(EVL); - Operands.push_back(ConstantInt::get(XLenTy, Log2_64(SEW))); + Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; Builder.CreateCall(VssegNFunc, Operands); return true; diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 1474c32f85d49..e481891dfd52f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -23,7 +23,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -56,7 +56,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -104,7 +104,7 @@ define {, , , , } %d6, 0 %t7 = extractvalue { , } %d6, 1 - %res0 = insertvalue { , , , , , , , } undef, %t0, 0 + %res0 = insertvalue { , , , , , , , } poison, %t0, 0 %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 @@ -227,7 +227,7 @@ define {, } @masked_load_factor2_v2(, } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -263,7 +263,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ 
-498,7 +498,7 @@ define {, , , , } %d2, 1 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -553,7 +553,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3 @@ -663,7 +663,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %p %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 @@ -753,7 +753,7 @@ define {, } @not_same_mask( %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 %t1 = extractvalue { , } %deinterleaved.results, 1 - %res0 = insertvalue { , } undef, %t0, 0 + %res0 = insertvalue { , } poison, %t0, 0 %res1 = insertvalue { , } %res0, %t1, 1 ret { , } %res1 } @@ -805,7 +805,7 @@ define {, , , , } %d2, 0 %t3 = extractvalue { , } %d2, 1 - %res0 = insertvalue { , , , } undef, %t0, 0 + %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 %res2 = insertvalue { , , , } %res1, %t2, 2 %res3 = insertvalue { , , , } %res2, %t3, 3
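For readers skimming the series, the EVL handling is the subtle part: the transform divides the wide EVL by the interleave factor, so an EVL that is not a multiple of the factor would silently drop trailing elements, which is why isMultipleOfN conservatively rejects such cases. A minimal standalone sketch of that arithmetic (plain C++, not the LLVM code; the variable names and the sample values 10 and 4 are made up for illustration):

    #include <cstdio>

    int main() {
      unsigned WideEVL = 10, Factor = 4;     // hypothetical values
      unsigned SegEVL = WideEVL / Factor;    // what the segment lowering would use
      unsigned Covered = SegEVL * Factor;    // elements actually accessed
      std::printf("segments cover %u of %u elements\n", Covered, WideEVL); // 8 of 10
      return 0;
    }

With a wide EVL of the form mul i32 %evl, 4, as in the updated tests, Covered always equals WideEVL and the segment lowering loses nothing.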