-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MachineLICM] Allow hoisting loads from invariant address #70796
[MachineLICM] Allow hoisting loads from invariant address #70796
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86 Author: Igor Kirillov (igogo-x86) Changes: Sometimes, loads can appear in a loop after the LICM pass is executed. Patch is 35.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70796.diff 11 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index e29f28ecaea0dce..f1af74328f0025a 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -72,6 +72,11 @@ static cl::opt<bool>
HoistConstStores("hoist-const-stores",
cl::desc("Hoist invariant stores"),
cl::init(true), cl::Hidden);
+
+static cl::opt<bool> HoistConstLoads("hoist-const-loads",
+ cl::desc("Hoist invariant loads"),
+ cl::init(true), cl::Hidden);
+
// The default threshold of 100 (i.e. if target block is 100 times hotter)
// is based on empirical data on a single target and is subject to tuning.
static cl::opt<unsigned>
@@ -222,9 +227,11 @@ namespace {
void AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop);
- bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop);
+ bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad);
- bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop);
+ bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad);
bool HasLoopPHIUse(const MachineInstr *MI, MachineLoop *CurLoop);
@@ -277,7 +284,7 @@ namespace {
bool MayCSE(MachineInstr *MI);
unsigned Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
- MachineLoop *CurLoop);
+ MachineLoop *CurLoop, bool SafeToMoveLoad);
void InitCSEMap(MachineBasicBlock *BB);
@@ -494,7 +501,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// operands. FIXME: Consider unfold load folding instructions.
if (Def && !RuledOut) {
int FI = std::numeric_limits<int>::min();
- if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop)) ||
+ if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop, false)) ||
(TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
Candidates.push_back(CandidateInfo(MI, Def, FI));
}
@@ -772,6 +779,32 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
BackTrace.clear();
InitRegPressure(Preheader);
+ // Compute information about whether it is allowed to move load instruction
+ // out of the current loop or one of the inner loops
+ SmallDenseMap<MachineLoop *, bool> AllowedToHoistLoads;
+ if (HoistConstLoads) {
+ SmallVector<MachineLoop *, 4> Worklist{CurLoop};
+
+ while (!Worklist.empty()) {
+ auto *L = Worklist.pop_back_val();
+ AllowedToHoistLoads[L] = true;
+ Worklist.insert(Worklist.end(), L->getSubLoops().begin(),
+ L->getSubLoops().end());
+ }
+
+ for (auto *MBB : CurLoop->blocks()) {
+ for (auto &MI : *MBB) {
+ if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) {
+ for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop;
+ L = L->getParentLoop())
+ AllowedToHoistLoads[L] = false;
+ AllowedToHoistLoads[CurLoop] = false;
+ break;
+ }
+ }
+ }
+ }
+
// Now perform LICM.
for (MachineDomTreeNode *Node : Scopes) {
MachineBasicBlock *MBB = Node->getBlock();
@@ -780,9 +813,23 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
// Process the block
SpeculationState = SpeculateUnknown;
+
+ auto CanMoveLoad = [](MachineLoop *L) -> bool {
+ dbgs() << L << "\n";
+ for (auto *MBB : L->blocks()) {
+ for (auto &MI : *MBB) {
+ // Taken from MachineInstr::isSafeToMove
+ if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef()))
+ return false;
+ }
+ }
+ return true;
+ };
+
+ bool SafeToMoveLoad = HoistConstLoads && AllowedToHoistLoads[CurLoop];
for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
unsigned HoistRes = HoistResult::NotHoisted;
- HoistRes = Hoist(&MI, Preheader, CurLoop);
+ HoistRes = Hoist(&MI, Preheader, CurLoop, SafeToMoveLoad);
if (HoistRes & HoistResult::NotHoisted) {
// We have failed to hoist MI to outermost loop's preheader. If MI is in
// a subloop, try to hoist it to subloop's preheader.
@@ -793,9 +840,12 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
while (!InnerLoopWorkList.empty()) {
MachineLoop *InnerLoop = InnerLoopWorkList.pop_back_val();
+ bool SafeToMoveLoadInner =
+ HoistConstLoads && AllowedToHoistLoads[InnerLoop];
MachineBasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
if (InnerLoopPreheader) {
- HoistRes = Hoist(&MI, InnerLoopPreheader, InnerLoop);
+ HoistRes =
+ Hoist(&MI, InnerLoopPreheader, InnerLoop, SafeToMoveLoadInner);
if (HoistRes & HoistResult::Hoisted)
break;
}
@@ -990,9 +1040,10 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
/// Returns true if the instruction may be a suitable candidate for LICM.
/// e.g. If the instruction is a call, then it's obviously not safe to hoist it.
-bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
+bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad) {
// Check if it's safe to move the instruction.
- bool DontMoveAcrossStore = true;
+ bool DontMoveAcrossStore = !SafeToMoveLoad;
if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
!(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
@@ -1025,9 +1076,9 @@ bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
}
/// Returns true if the instruction is loop invariant.
-bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I,
- MachineLoop *CurLoop) {
- if (!IsLICMCandidate(I, CurLoop)) {
+bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad) {
+ if (!IsLICMCandidate(I, CurLoop, SafeToMoveLoad)) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n");
return false;
}
@@ -1305,7 +1356,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI,
MBB->insert(Pos, NewMIs[1]);
// If unfolding produced a load that wasn't loop-invariant or profitable to
// hoist, discard the new instructions and bail.
- if (!IsLoopInvariantInst(*NewMIs[0], CurLoop) ||
+ if (!IsLoopInvariantInst(*NewMIs[0], CurLoop, /*SafeToMoveLoad=*/false) ||
!IsProfitableToHoist(*NewMIs[0], CurLoop)) {
NewMIs[0]->eraseFromParent();
NewMIs[1]->eraseFromParent();
@@ -1432,7 +1483,7 @@ bool MachineLICMBase::MayCSE(MachineInstr *MI) {
/// that are safe to hoist, this instruction is called to do the dirty work.
/// It returns true if the instruction is hoisted.
unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
- MachineLoop *CurLoop) {
+ MachineLoop *CurLoop, bool SafeToMoveLoad) {
MachineBasicBlock *SrcBlock = MI->getParent();
// Disable the instruction hoisting due to block hotness
@@ -1444,7 +1495,7 @@ unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
}
// First check whether we should hoist this instruction.
bool HasExtractHoistableLoad = false;
- if (!IsLoopInvariantInst(*MI, CurLoop) ||
+ if (!IsLoopInvariantInst(*MI, CurLoop, SafeToMoveLoad) ||
!IsProfitableToHoist(*MI, CurLoop)) {
// If not, try unfolding a hoistable load.
MI = ExtractHoistableLoad(MI, CurLoop);
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
new file mode 100644
index 000000000000000..6b76b03fe00fc81
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -0,0 +1,433 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: one_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz x2, .LBB0_4
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB0_2: // %for.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x10, [x0], #8
+; CHECK-NEXT: ldr w10, [x10]
+; CHECK-NEXT: cmp w10, w9
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x2, x2, #1
+; CHECK-NEXT: b.ne .LBB0_2
+; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %cmp4 = icmp eq i64 %N, 0
+ br i1 %cmp4, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.body ]
+ ret i64 %sum.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
+ %0 = load ptr, ptr %arrayidx, align 8
+ %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.05, %add
+ %inc = add nuw i64 %i.06, 1
+ %exitcond = icmp eq i64 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: two_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cbz x2, .LBB1_6
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB1_6
+; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader
+; CHECK-NEXT: ldr w10, [x1]
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB1_4 Depth 2
+; CHECK-NEXT: ldr x11, [x0, x9, lsl #3]
+; CHECK-NEXT: mov x12, x3
+; CHECK-NEXT: .LBB1_4: // %for.body4
+; CHECK-NEXT: // Parent Loop BB1_3 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldr x13, [x11], #8
+; CHECK-NEXT: ldr w13, [x13]
+; CHECK-NEXT: cmp w13, w10
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x12, x12, #1
+; CHECK-NEXT: b.ne .LBB1_4
+; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge
+; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB1_3
+; CHECK-NEXT: .LBB1_6: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %cmp17 = icmp eq i64 %N, 0
+ %cmp214 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp17, %cmp214
+ br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge
+ %i.019 = phi i64 [ %inc7, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ]
+ %sum.018 = phi i64 [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.body4
+
+for.body4: ; preds = %for.cond1.preheader, %for.body4
+ %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+ %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
+ %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
+ %1 = load ptr, ptr %arrayidx5, align 8
+ %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.115, %add
+ %inc = add nuw i64 %j.016, 1
+ %exitcond = icmp eq i64 %inc, %M
+ br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge, label %for.body4
+
+for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4
+ %inc7 = add nuw i64 %i.019, 1
+ %exitcond22 = icmp eq i64 %inc7, %N
+ br i1 %exitcond22, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ]
+ ret i64 %sum.0.lcssa
+}
+
+define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: three_dimensional_middle:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: cbz x2, .LBB2_9
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB2_9
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: cbz x4, .LBB2_9
+; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: .LBB2_4: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB2_5 Depth 2
+; CHECK-NEXT: // Child Loop BB2_6 Depth 3
+; CHECK-NEXT: ldr x10, [x8, x9, lsl #3]
+; CHECK-NEXT: mov x11, xzr
+; CHECK-NEXT: .LBB2_5: // %for.cond5.preheader
+; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB2_6 Depth 3
+; CHECK-NEXT: lsl x12, x11, #3
+; CHECK-NEXT: mov x14, x4
+; CHECK-NEXT: ldr x13, [x1, x12]
+; CHECK-NEXT: ldr x12, [x10, x12]
+; CHECK-NEXT: ldr w13, [x13]
+; CHECK-NEXT: .LBB2_6: // %for.body8
+; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB2_5 Depth=2
+; CHECK-NEXT: // => This Inner Loop Header: Depth=3
+; CHECK-NEXT: ldr x15, [x12], #8
+; CHECK-NEXT: ldr w15, [x15]
+; CHECK-NEXT: cmp w15, w13
+; CHECK-NEXT: cinc x0, x0, ne
+; CHECK-NEXT: subs x14, x14, #1
+; CHECK-NEXT: b.ne .LBB2_6
+; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond
+; CHECK-NEXT: // in Loop: Header=BB2_5 Depth=2
+; CHECK-NEXT: add x11, x11, #1
+; CHECK-NEXT: cmp x11, x3
+; CHECK-NEXT: b.ne .LBB2_5
+; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond
+; CHECK-NEXT: // in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB2_4
+; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %cmp33 = icmp eq i64 %N, 0
+ %cmp229 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp33, %cmp229
+ %cmp626 = icmp eq i64 %K, 0
+ %or.cond48 = or i1 %or.cond, %cmp626
+ br i1 %or.cond48, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond
+ %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
+ %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader
+ %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
+ %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
+ %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
+ %1 = load ptr, ptr %arrayidx9, align 8
+ %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031
+ %2 = load ptr, ptr %arrayidx11, align 8
+ br label %for.body8
+
+for.body8: ; preds = %for.body8, %for.cond5.preheader
+ %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
+ %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
+ %3 = load ptr, ptr %arrayidx10, align 8
+ %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.227, %add
+ %inc = add nuw i64 %k.028, 1
+ %exitcond = icmp eq i64 %inc, %K
+ br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
+
+for.cond5.for.cond: ; preds = %for.body8
+ %inc13 = add nuw i64 %j.031, 1
+ %exitcond46 = icmp eq i64 %inc13, %M
+ br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader
+
+for.cond1.for.cond: ; preds = %for.cond5.for.cond
+ %inc16 = add nuw i64 %i.035, 1
+ %exitcond47 = icmp eq i64 %inc16, %N
+ br i1 %exitcond47, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ]
+ ret i64 %sum.0.lcssa
+}
+
+define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: three_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cbz x2, .LBB3_9
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB3_9
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: cbz x4, .LBB3_9
+; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader
+; CHECK-NEXT: ldr w10, [x1]
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB3_5 Depth 2
+; CHECK-NEXT: // Child Loop BB3_6 Depth 3
+; CHECK-NEXT: ldr x11, [x0, x9, lsl #3]
+; CHECK-NEXT: mov x12, xzr
+; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader
+; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB3_6 Depth 3
+; CHECK-NEXT: ldr x13, [x11, x12, lsl #3]
+; CHECK-NEXT: mov x14, x4
+; CHECK-NEXT: .LBB3_6: // %for.body8
+; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB3_5 Depth=2
+; CHECK-NEXT: // => This Inner Loop Header: Depth=3
+; CHECK-NEXT: ldr x15, [x13], #8
+; CHECK-NEXT: ldr w15, [x15]
+; CHECK-NEXT: cmp w15, w10
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x14, x14, #1
+; CHECK-NEXT: b.ne .LBB3_6
+; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond
+; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2
+; CHECK-NEXT: add x12, x12, #1
+; CHECK-NEXT: cmp x12, x3
+; CHECK-NEXT: b.ne .LBB3_5
+; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond
+; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB3_4
+; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %cmp31 = icmp eq i64 %N, 0
+ %cmp227 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp31, %cmp227
+ %cmp624 = icmp eq i64 %K, 0
+ %or.cond46 = or i1 %or.cond, %cmp624
+ br i1 %or.cond46, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond
+ %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
+ %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader
+ %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
+ %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
+ %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
+ %1 = load ptr, ptr %arrayidx9, align 8
+ br label %for.body8
+
+for.body8: ; preds = %for.body8, %for.cond5.preheader
+ %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
+ %sum.225 = phi i64 [ %s...
[truncated]
|
This patch affects several old tests, and I processed them differently:
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for this @igogo-x86! Looks like a nice improvement, and is in line with what we already do for hoisting stores. I've not reviewed the code changes, but left some comments on the tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this test may care about the load being in the loop. Perhaps a good candidate for disabling load hoisting?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This regression test has guarded against some crashes in the past, so it is probably better to keep the set of flags closer to the default.
eaf9568
to
c4defb0
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems like a useful improvement to have @igogo-x86! I'd expect the IR version of LICM to hoist out most cases, but things like ExpandMemCmp, CodeGenPrepare or other pre-isel passes may introduce loads in loops that we do want to hoist out where possible.
c4defb0
to
0aa4a4f
Compare
0aa4a4f
to
be3d1d5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks a lot neater now @igogo-x86, thanks! I just have two more minor comments, then it looks good I think.
llvm/lib/CodeGen/MachineLICM.cpp
Outdated
while (!TmpWorklist.empty()) { | ||
auto *L = TmpWorklist.pop_back_val(); | ||
AllowedToHoistLoads[L] = true; | ||
TmpWorklist.insert(TmpWorklist.end(), L->getSubLoops().begin(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, does MLI not already include subloops? I was hoping you could get away with just:
for (auto *L : Worklist) {
AllowedToHoistLoads[L] = true;
}
It's quite likely that was a bogus assumption. :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, at least it is the same behaviour as with IR Loops
e250cc6
to
8a9360b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! Thanks for all the changes @igogo-x86.
Sometimes, loads can appear in a loop after the LICM pass is executed the final time. For example, ExpandMemCmp pass creates loads in a loop, and one of the operands may be an invariant address. This patch extends the pre-regalloc stage MachineLICM by allowing to hoist invariant loads from loops that don't have any stores or calls and allows load reorderings.
AArch64/machine-licm-hoist-load.ll - added six memcmp-sized tests and comments, and removed irrelevant lines. AArch64/sinksplat.ll, AArch64/zext-to-tbl.ll - regenerated check lines without `-hoist-const-loads=false`. Adjusted tests after seeing that a manually hoisted load results in the same assembly as the code where MachineLICM does this job: * Hexagon/reg-scavengebug-2.ll * Mips/lcb5.ll * Hexagon/swp-const-tc2.ll (added the `-hoist-const-loads=false` flag as I am unsure about the test's purpose)
Also improved the algorithm to exit early when a loop chain is proven to be unhoistable.
8a9360b
to
37c7bc0
Compare
Sometimes, loads can appear in a loop after the LICM pass is executed the final time. For example, ExpandMemCmp pass creates loads in a loop, and one of the operands may be an invariant address. This patch extends the pre-regalloc stage MachineLICM by allowing to hoist invariant loads from loops that don't have any stores or calls and allows load reorderings.
Sometimes, loads can appear in a loop after the LICM pass is executed the final time. For example, ExpandMemCmp pass creates loads in a loop, and one of the operands may be an invariant address. This patch extends the pre-regalloc stage MachineLICM by allowing to hoist invariant loads from loops that don't have any stores or calls and allows load reorderings.
The improvements in 63917e1 / llvm#70796 do not check for memory barriers/unmodelled sideeffects, which means we may incorrectly hoist loads across memory barriers. Fix this by checking any machine instruction in the loop is a load-fold barrier.
if (!AllowedToHoistLoads[Loop]) | ||
continue; | ||
for (auto &MI : *MBB) { | ||
if (!MI.mayStore() && !MI.isCall() && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like this doesn't consider instructions with unmodelled sideeffects, like memory barriers and we now (incorrectly) hoist loads across memory barriers. I put up #116987
The improvements in 63917e1 / llvm#70796 do not check for memory barriers/unmodelled sideeffects, which means we may incorrectly hoist loads across memory barriers. Fix this by checking any machine instruction in the loop is a load-fold barrier.
llvm#116987) The improvements in 63917e1 / llvm#70796 do not check for memory barriers/unmodelled sideeffects, which means we may incorrectly hoist loads across memory barriers. Fix this by checking any machine instruction in the loop is a load-fold barrier. PR: llvm#116987 (cherry picked from commit ef102b4)
llvm#116987) The improvements in 63917e1 / llvm#70796 do not check for memory barriers/unmodelled sideeffects, which means we may incorrectly hoist loads across memory barriers. Fix this by checking any machine instruction in the loop is a load-fold barrier. PR: llvm#116987 (cherry picked from commit ef102b4)
Sometimes, loads can appear in a loop after the LICM pass is executed
the final time. For example, ExpandMemCmp pass creates loads in a loop,
and one of the operands may be an invariant address.
This patch extends the pre-regalloc stage MachineLICM by allowing to
hoist invariant loads from loops that don't have any stores or calls
and allows load reorderings.