-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Allow spilling to unused Zcmp Stack #125959
Conversation
@llvm/pr-subscribers-backend-risc-v Author: Sam Elliott (lenary) ChangesThis is a tiny change that can save up to 16 bytes of stack allocation, which is more beneficial on RV32 than RV64. cm.push allocates multiples of 16 bytes, but only uses a subset of those bytes for pushing callee-saved registers. Up to 12 (rv32) or 8 (rv64) bytes are left unused, depending on how many registers are pushed. Before this change, we told LLVM that the entire allocation was used, by creating a fixed stack object which covered the whole allocation. This change instead gives an accurate extent to the fixed stack object, to only cover the registers that have been pushed. This allows the PrologEpilogInserter to use any unused bytes for spills. Potentially this saves an extra move of the stack pointer after the push, because the push can allocate up to 48 more bytes than it needs for registers. We cannot do the same change for save/restore, because the restore routines restore in batches of This is stacked on #125939, ignore the first commit. Patch is 170.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125959.diff 6 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index bb2e5781c34db6..c125a7f0173dbd 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1847,11 +1847,15 @@ bool RISCVFrameLowering::assignCalleeSavedSpillSlots(
MFI.setStackID(FrameIdx, TargetStackID::ScalableVector);
}
- // Allocate a fixed object that covers the full push or libcall size.
if (RVFI->isPushable(MF)) {
- if (int64_t PushSize = RVFI->getRVPushStackSize())
- MFI.CreateFixedSpillStackObject(PushSize, -PushSize);
+ // Allocate a fixed object that covers all the registers that are pushed.
+ if (unsigned PushedRegs = RVFI->getRVPushRegs()) {
+ int64_t PushedRegsBytes = static_cast<int64_t>(PushedRegs) * (STI.getXLen() / 8);
+ MFI.CreateFixedSpillStackObject(PushedRegsBytes, -PushedRegsBytes);
+ }
} else if (int LibCallRegs = getLibCallID(MF, CSI) + 1) {
+ // Allocate a fixed object that covers all of the stack allocated by the
+ // libcall.
int64_t LibCallFrameSize =
alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign());
MFI.CreateFixedSpillStackObject(LibCallFrameSize, -LibCallFrameSize);
diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
index 90a1ebec5abffe..f9f1ba60a8ac01 100644
--- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
+++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll
@@ -37,10 +37,11 @@
; This function tests that RISCVRegisterInfo::getCalleeSavedRegs returns
; something appropriate.
-define void @callee() nounwind {
+define void @callee() {
; RV32I-LABEL: callee:
; RV32I: # %bb.0:
; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: .cfi_def_cfa_offset 80
; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 68(sp) # 4-byte Folded Spill
@@ -54,6 +55,19 @@ define void @callee() nounwind {
; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: .cfi_offset s2, -16
+; RV32I-NEXT: .cfi_offset s3, -20
+; RV32I-NEXT: .cfi_offset s4, -24
+; RV32I-NEXT: .cfi_offset s5, -28
+; RV32I-NEXT: .cfi_offset s6, -32
+; RV32I-NEXT: .cfi_offset s7, -36
+; RV32I-NEXT: .cfi_offset s8, -40
+; RV32I-NEXT: .cfi_offset s9, -44
+; RV32I-NEXT: .cfi_offset s10, -48
+; RV32I-NEXT: .cfi_offset s11, -52
; RV32I-NEXT: lui a7, %hi(var)
; RV32I-NEXT: lw a0, %lo(var)(a7)
; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
@@ -145,15 +159,33 @@ define void @callee() nounwind {
; RV32I-NEXT: lw s9, 36(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s10, 32(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s11, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: .cfi_restore s2
+; RV32I-NEXT: .cfi_restore s3
+; RV32I-NEXT: .cfi_restore s4
+; RV32I-NEXT: .cfi_restore s5
+; RV32I-NEXT: .cfi_restore s6
+; RV32I-NEXT: .cfi_restore s7
+; RV32I-NEXT: .cfi_restore s8
+; RV32I-NEXT: .cfi_restore s9
+; RV32I-NEXT: .cfi_restore s10
+; RV32I-NEXT: .cfi_restore s11
; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: .cfi_def_cfa_offset 0
; RV32I-NEXT: ret
;
; RV32I-ILP32E-LABEL: callee:
; RV32I-ILP32E: # %bb.0:
; RV32I-ILP32E-NEXT: addi sp, sp, -36
+; RV32I-ILP32E-NEXT: .cfi_def_cfa_offset 36
; RV32I-ILP32E-NEXT: sw ra, 32(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
; RV32I-ILP32E-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
+; RV32I-ILP32E-NEXT: .cfi_offset ra, -4
+; RV32I-ILP32E-NEXT: .cfi_offset s0, -8
+; RV32I-ILP32E-NEXT: .cfi_offset s1, -12
; RV32I-ILP32E-NEXT: lui a7, %hi(var)
; RV32I-ILP32E-NEXT: lw a0, %lo(var)(a7)
; RV32I-ILP32E-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
@@ -235,12 +267,17 @@ define void @callee() nounwind {
; RV32I-ILP32E-NEXT: lw ra, 32(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
; RV32I-ILP32E-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
+; RV32I-ILP32E-NEXT: .cfi_restore ra
+; RV32I-ILP32E-NEXT: .cfi_restore s0
+; RV32I-ILP32E-NEXT: .cfi_restore s1
; RV32I-ILP32E-NEXT: addi sp, sp, 36
+; RV32I-ILP32E-NEXT: .cfi_def_cfa_offset 0
; RV32I-ILP32E-NEXT: ret
;
; RV32I-WITH-FP-LABEL: callee:
; RV32I-WITH-FP: # %bb.0:
; RV32I-WITH-FP-NEXT: addi sp, sp, -80
+; RV32I-WITH-FP-NEXT: .cfi_def_cfa_offset 80
; RV32I-WITH-FP-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32I-WITH-FP-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32I-WITH-FP-NEXT: sw s1, 68(sp) # 4-byte Folded Spill
@@ -254,7 +291,21 @@ define void @callee() nounwind {
; RV32I-WITH-FP-NEXT: sw s9, 36(sp) # 4-byte Folded Spill
; RV32I-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill
; RV32I-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32I-WITH-FP-NEXT: .cfi_offset ra, -4
+; RV32I-WITH-FP-NEXT: .cfi_offset s0, -8
+; RV32I-WITH-FP-NEXT: .cfi_offset s1, -12
+; RV32I-WITH-FP-NEXT: .cfi_offset s2, -16
+; RV32I-WITH-FP-NEXT: .cfi_offset s3, -20
+; RV32I-WITH-FP-NEXT: .cfi_offset s4, -24
+; RV32I-WITH-FP-NEXT: .cfi_offset s5, -28
+; RV32I-WITH-FP-NEXT: .cfi_offset s6, -32
+; RV32I-WITH-FP-NEXT: .cfi_offset s7, -36
+; RV32I-WITH-FP-NEXT: .cfi_offset s8, -40
+; RV32I-WITH-FP-NEXT: .cfi_offset s9, -44
+; RV32I-WITH-FP-NEXT: .cfi_offset s10, -48
+; RV32I-WITH-FP-NEXT: .cfi_offset s11, -52
; RV32I-WITH-FP-NEXT: addi s0, sp, 80
+; RV32I-WITH-FP-NEXT: .cfi_def_cfa s0, 0
; RV32I-WITH-FP-NEXT: lui t0, %hi(var)
; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(t0)
; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill
@@ -335,6 +386,7 @@ define void @callee() nounwind {
; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(t0)
; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload
; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(t0)
+; RV32I-WITH-FP-NEXT: .cfi_def_cfa sp, 80
; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32I-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload
@@ -348,26 +400,54 @@ define void @callee() nounwind {
; RV32I-WITH-FP-NEXT: lw s9, 36(sp) # 4-byte Folded Reload
; RV32I-WITH-FP-NEXT: lw s10, 32(sp) # 4-byte Folded Reload
; RV32I-WITH-FP-NEXT: lw s11, 28(sp) # 4-byte Folded Reload
+; RV32I-WITH-FP-NEXT: .cfi_restore ra
+; RV32I-WITH-FP-NEXT: .cfi_restore s0
+; RV32I-WITH-FP-NEXT: .cfi_restore s1
+; RV32I-WITH-FP-NEXT: .cfi_restore s2
+; RV32I-WITH-FP-NEXT: .cfi_restore s3
+; RV32I-WITH-FP-NEXT: .cfi_restore s4
+; RV32I-WITH-FP-NEXT: .cfi_restore s5
+; RV32I-WITH-FP-NEXT: .cfi_restore s6
+; RV32I-WITH-FP-NEXT: .cfi_restore s7
+; RV32I-WITH-FP-NEXT: .cfi_restore s8
+; RV32I-WITH-FP-NEXT: .cfi_restore s9
+; RV32I-WITH-FP-NEXT: .cfi_restore s10
+; RV32I-WITH-FP-NEXT: .cfi_restore s11
; RV32I-WITH-FP-NEXT: addi sp, sp, 80
+; RV32I-WITH-FP-NEXT: .cfi_def_cfa_offset 0
; RV32I-WITH-FP-NEXT: ret
;
; RV32IZCMP-LABEL: callee:
; RV32IZCMP: # %bb.0:
-; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96
+; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -80
+; RV32IZCMP-NEXT: .cfi_def_cfa_offset 80
+; RV32IZCMP-NEXT: .cfi_offset ra, -52
+; RV32IZCMP-NEXT: .cfi_offset s0, -48
+; RV32IZCMP-NEXT: .cfi_offset s1, -44
+; RV32IZCMP-NEXT: .cfi_offset s2, -40
+; RV32IZCMP-NEXT: .cfi_offset s3, -36
+; RV32IZCMP-NEXT: .cfi_offset s4, -32
+; RV32IZCMP-NEXT: .cfi_offset s5, -28
+; RV32IZCMP-NEXT: .cfi_offset s6, -24
+; RV32IZCMP-NEXT: .cfi_offset s7, -20
+; RV32IZCMP-NEXT: .cfi_offset s8, -16
+; RV32IZCMP-NEXT: .cfi_offset s9, -12
+; RV32IZCMP-NEXT: .cfi_offset s10, -8
+; RV32IZCMP-NEXT: .cfi_offset s11, -4
; RV32IZCMP-NEXT: lui t0, %hi(var)
; RV32IZCMP-NEXT: lw a0, %lo(var)(t0)
-; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT: lw a0, %lo(var+4)(t0)
; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT: lw a0, %lo(var+8)(t0)
+; RV32IZCMP-NEXT: lw a0, %lo(var+4)(t0)
; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT: lw a0, %lo(var+12)(t0)
+; RV32IZCMP-NEXT: lw a0, %lo(var+8)(t0)
; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
+; RV32IZCMP-NEXT: lw a0, %lo(var+12)(t0)
+; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
; RV32IZCMP-NEXT: addi a5, t0, %lo(var)
; RV32IZCMP-NEXT: lw a0, 16(a5)
-; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32IZCMP-NEXT: lw a0, 20(a5)
; RV32IZCMP-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; RV32IZCMP-NEXT: lw a0, 20(a5)
+; RV32IZCMP-NEXT: sw a0, 4(sp) # 4-byte Folded Spill
; RV32IZCMP-NEXT: lw t4, 24(a5)
; RV32IZCMP-NEXT: lw t5, 28(a5)
; RV32IZCMP-NEXT: lw t6, 32(a5)
@@ -420,23 +500,24 @@ define void @callee() nounwind {
; RV32IZCMP-NEXT: sw t6, 32(a5)
; RV32IZCMP-NEXT: sw t5, 28(a5)
; RV32IZCMP-NEXT: sw t4, 24(a5)
-; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, 20(a5)
-; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, 16(a5)
-; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, %lo(var+12)(t0)
-; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, %lo(var+8)(t0)
-; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, %lo(var+4)(t0)
-; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload
+; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
; RV32IZCMP-NEXT: sw a0, %lo(var)(t0)
-; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96
+; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 80
;
; RV32IZCMP-WITH-FP-LABEL: callee:
; RV32IZCMP-WITH-FP: # %bb.0:
; RV32IZCMP-WITH-FP-NEXT: addi sp, sp, -80
+; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa_offset 80
; RV32IZCMP-WITH-FP-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32IZCMP-WITH-FP-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32IZCMP-WITH-FP-NEXT: sw s1, 68(sp) # 4-byte Folded Spill
@@ -450,7 +531,21 @@ define void @callee() nounwind {
; RV32IZCMP-WITH-FP-NEXT: sw s9, 36(sp) # 4-byte Folded Spill
; RV32IZCMP-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill
; RV32IZCMP-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset ra, -4
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s0, -8
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s1, -12
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s2, -16
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s3, -20
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s4, -24
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s5, -28
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s6, -32
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s7, -36
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s8, -40
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s9, -44
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s10, -48
+; RV32IZCMP-WITH-FP-NEXT: .cfi_offset s11, -52
; RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 80
+; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa s0, 0
; RV32IZCMP-WITH-FP-NEXT: lui t1, %hi(var)
; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(t1)
; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill
@@ -531,6 +626,7 @@ define void @callee() nounwind {
; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(t1)
; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload
; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(t1)
+; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa sp, 80
; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32IZCMP-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32IZCMP-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload
@@ -544,12 +640,27 @@ define void @callee() nounwind {
; RV32IZCMP-WITH-FP-NEXT: lw s9, 36(sp) # 4-byte Folded Reload
; RV32IZCMP-WITH-FP-NEXT: lw s10, 32(sp) # 4-byte Folded Reload
; RV32IZCMP-WITH-FP-NEXT: lw s11, 28(sp) # 4-byte Folded Reload
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore ra
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s0
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s1
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s2
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s3
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s4
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s5
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s6
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s7
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s8
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s9
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s10
+; RV32IZCMP-WITH-FP-NEXT: .cfi_restore s11
; RV32IZCMP-WITH-FP-NEXT: addi sp, sp, 80
+; RV32IZCMP-WITH-FP-NEXT: .cfi_def_cfa_offset 0
; RV32IZCMP-WITH-FP-NEXT: ret
;
; RV64I-LABEL: callee:
; RV64I: # %bb.0:
; RV64I-NEXT: addi sp, sp, -160
+; RV64I-NEXT: .cfi_def_cfa_offset 160
; RV64I-NEXT: sd ra, 152(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 144(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s1, 136(sp) # 8-byte Folded Spill
@@ -563,6 +674,19 @@ define void @callee() nounwind {
; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: .cfi_offset s2, -32
+; RV64I-NEXT: .cfi_offset s3, -40
+; RV64I-NEXT: .cfi_offset s4, -48
+; RV64I-NEXT: .cfi_offset s5, -56
+; RV64I-NEXT: .cfi_offset s6, -64
+; RV64I-NEXT: .cfi_offset s7, -72
+; RV64I-NEXT: .cfi_offset s8, -80
+; RV64I-NEXT: .cfi_offset s9, -88
+; RV64I-NEXT: .cfi_offset s10, -96
+; RV64I-NEXT: .cfi_offset s11, -104
; RV64I-NEXT: lui a7, %hi(var)
; RV64I-NEXT: lw a0, %lo(var)(a7)
; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
@@ -654,15 +778,33 @@ define void @callee() nounwind {
; RV64I-NEXT: ld s9, 72(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s10, 64(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s11, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: .cfi_restore s2
+; RV64I-NEXT: .cfi_restore s3
+; RV64I-NEXT: .cfi_restore s4
+; RV64I-NEXT: .cfi_restore s5
+; RV64I-NEXT: .cfi_restore s6
+; RV64I-NEXT: .cfi_restore s7
+; RV64I-NEXT: .cfi_restore s8
+; RV64I-NEXT: .cfi_restore s9
+; RV64I-NEXT: .cfi_restore s10
+; RV64I-NEXT: .cfi_restore s11
; RV64I-NEXT: addi sp, sp, 160
+; RV64I-NEXT: .cfi_def_cfa_offset 0
; RV64I-NEXT: ret
;
; RV64I-LP64E-LABEL: callee:
; RV64I-LP64E: # %bb.0:
; RV64I-LP64E-NEXT: addi sp, sp, -72
+; RV64I-LP64E-NEXT: .cfi_def_cfa_offset 72
; RV64I-LP64E-NEXT: sd ra, 64(sp) # 8-byte Folded Spill
; RV64I-LP64E-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
; RV64I-LP64E-NEXT: sd s1, 48(sp) # 8-byte Folded Spill
+; RV64I-LP64E-NEXT: .cfi_offset ra, -8
+; RV64I-LP64E-NEXT: .cfi_offset s0, -16
+; RV64I-LP64E-NEXT: .cfi_offset s1, -24
; RV64I-LP64E-NEXT: lui a7, %hi(var)
; RV64I-LP64E-NEXT: lw a0, %lo(var)(a7)
; RV64I-LP64E-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
@@ -744,12 +886,17 @@ define void @callee() nounwind {
; RV64I-LP64E-NEXT: ld ra, 64(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
; RV64I-LP64E-NEXT: ld s1, 48(sp) # 8-byte Folded Reload
+; RV64I-LP64E-NEXT: .cfi_restore ra
+; RV64I-LP64E-NEXT: .cfi_restore s0
+; RV64I-LP64E-NEXT: .cfi_restore s1
; RV64I-LP64E-NEXT: addi sp, sp, 72
+; RV64I-LP64E-NEXT: .cfi_def_cfa_offset 0
; RV64I-LP64E-NEXT: ret
;
; RV64I-WITH-FP-LABEL: callee:
; RV64I-WITH-FP: # %bb.0:
; RV64I-WITH-FP-NEXT: addi sp, sp, -160
+; RV64I-WITH-FP-NEXT: .cfi_def_cfa_offset 160
; RV64I-WITH-FP-NEXT: sd ra, 152(sp) # 8-byte Folded Spill
; RV64I-WITH-FP-NEXT: sd s0, 144(sp) # 8-byte Folded Spill
; RV64I-WITH-FP-NEXT: sd s1, 136(sp) # 8-byte Folded Spill
@@ -763,7 +910,21 @@ define void @callee() nounwind {
; RV64I-WITH-FP-NEXT: sd s9, 72(sp) # 8-byte Folded Spill
; RV64I-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill
; RV64I-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill
+; RV64I-WITH-FP-NEXT: .cfi_offset ra, -8
+; RV64I-WITH-FP-NEXT: .cfi_offset s0, -16
+; RV64I-WITH-FP-NEXT: .cfi_offset s1, -24
+; RV64I-WITH-FP-NEXT: .cfi_offset s2, -32
+; RV64I-WITH-FP-NEXT: .cfi_offset s3, -40
+; RV64I-WITH-FP-NEXT: .cfi_offset s4, -48
+; RV64I-WITH-FP-NEXT: .cfi_offset s5, -56
+; RV64I-WITH-FP-NEXT: .cfi_offset s6, -64
+; RV64I-WITH-FP-NEXT: .cfi_offset s7, -72
+; RV64I-WITH-FP-NEXT: .cfi_offset s8, -80
+; RV64I-WITH-FP-NEXT: .cfi_offset s9, -88
+; RV64I-WITH-FP-NEXT: .cfi_offset s10, -96
+; RV64I-WITH-FP-NEXT: .cfi_offset s11, -104
; RV64I-WITH-FP-NEXT: addi s0, sp, 160
+; RV64I-WITH-FP-NEXT: .cfi_def_cfa s0, 0
; RV64I-WITH-FP-NEXT: lui t0, %hi(var)
; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(t0)
; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill
@@ -844,6 +1005,7 @@ define void @callee() nounwind {
; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(t0)
; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload
; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(t0)
+; RV64I-WITH-FP-NEXT: .cfi_def_cfa sp, 160
; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload
; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload
; RV64I-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload
@@ -857,26 +1019,54 @@ define void @callee() nounwind {
; RV64I-WITH-FP-NEXT: ld s9, 72(sp) # 8-byte Folded Reload
; RV64I-WITH-FP-NEXT: ld s10, 64(sp) # 8-byte Folded Reload
; RV64I-WITH-FP-NEXT: ld s11, 56(sp) # 8-byte Folded Reload
+; RV64I-WITH-FP-NEXT: .cfi_restore ra
+; RV64I-WITH-FP-NEXT: .cfi_restore s0
+; RV64I-WITH-FP-NEXT: .cfi_restore s1
+; RV64I-WITH-FP-NEXT: .cfi_restore s2
+; RV64I-WITH-FP-NEXT: .cfi_restore s3
+; RV64I-WITH-FP-NEXT: .cfi_restore s4
+; RV64I-WITH-FP-NEXT: .cfi_restore s5
+; RV64I-WITH-FP-NEXT: .cfi_restore s6
+; RV64I-WITH-FP-NEXT: .cfi_restore s7
+; RV64I-WITH-FP-NEXT: .cfi_restore s8
+; RV64I-WITH-FP-NEXT: .cfi_restore s9
+; RV64I-WITH-FP-NEXT: .cfi_restore s10
+; RV64I-WITH-FP-NEXT: .cfi_restore s11
; RV64I-WITH-FP-NEXT: addi sp, sp, 160
+; RV64I-WITH-FP-NEXT: .cfi_def_cfa_offset 0
; RV64I-WITH-FP-NEXT: ret
;
; RV64IZCMP-LABEL: callee:
; RV64IZCMP: # %bb.0:
; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160
+; RV64IZCMP-NEXT: .cfi_def_cfa_offset 160
+; RV64IZCMP-NEXT: .cfi_offset ra, -104
+; RV64IZCMP-NEXT: .cfi_offset s0, -96
+; RV64IZCMP-NEXT: .cfi_offset s1, -88
+; RV64IZCMP-NEXT: .cfi_offset s2, -80
+; RV64IZCMP-NEXT: .cfi_offset s3, -72
+; RV64IZCMP-NEXT: .cfi_offset s4, -64
+; RV64IZCMP-NEXT: .cfi_offset s5, -56
+; RV64IZCMP-NEXT: .cfi_offset s6, -48
+; RV64IZCMP-NEXT: .cfi_offset s7, -40
+; RV64IZCMP-NEXT: .cfi_offset s8, -32
+; RV64IZCMP-NEXT: .cfi_offset s9, -24
+; RV64IZCMP-NEXT: .cfi_offset s10, -16
+; RV64IZCMP-NEXT: .cfi_offset s11, -8
; RV64IZCMP-NEXT: lui t0, %hi(var)
; RV64IZCMP-NEXT: lw a0, %lo(var)(t0)
-; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
+; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
; RV64IZCMP-NEXT: lw a0, %lo(var+4)(t0)
-; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
+; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
; RV64IZCMP-NEXT: lw a0, %lo(var+8)(t0)
-; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
+; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byt...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
; RV32-NEXT: addi sp, sp, -4 | ||
; RV32-NEXT: .cfi_def_cfa_offset 20 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This stack pointer looks unaligned but it's not - because the whole test uses ilp32e
if (RVFI->isPushable(MF)) { | ||
if (int64_t PushSize = RVFI->getRVPushStackSize()) | ||
MFI.CreateFixedSpillStackObject(PushSize, -PushSize); | ||
// Allocate a fixed object that covers all the registers that are pushed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm a bit curious if this object is needed at all. We already created an object for each register. The save/restore needs an object to cover the missing registers, but maybe push/pop doesn't?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might not have created an object for each register. See void @spill_x10()
in llvm/test/CodeGen/RISCV/push-pop-popret.ll
, where from what I can tell, we only have an object for s10
and s11
. For the same reasons as with save/restore, we cannot clobber the stack used by the other (lower-numbered) callee-saved registers, because we definitely restore into them with pop
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok thanks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This is a tiny change that can save up to 16 bytes of stack allocation, which is more beneficial on RV32 than RV64. cm.push allocates multiples of 16 bytes, but only uses a subset of those bytes for pushing callee-saved registers. Up to 12 (rv32) or 8 (rv64) bytes are left unused, depending on how many registers are pushed. Before this change, we told LLVM that the entire allocation was used, by creating a fixed stack object which covered the whole allocation. This change instead gives an accurate extent to the fixed stack object, to only cover the registers that have been pushed. This allows the PrologEpilogInserter to use any unused bytes for spills. Potentially this saves an extra move of the stack pointer after the push, because the push can allocate up to 48 more bytes than it needs for registers. We cannot do the same change for save/restore, because the restore routines restore in batches of `stackalign/(xlen/8)` registers, and we don't want to clobber the saved values of registers that we didn't tell the compiler we were saving/restoring - for instance `__riscv_restore_0` is used by the compiler when it only wants to save `ra`, but will end up restoring `ra` and `s0`.
8061805
to
9a0c3cf
Compare
The build issue is in |
This is a tiny change that can save up to 16 bytes of stack allocation, which is more beneficial on RV32 than RV64. cm.push allocates multiples of 16 bytes, but only uses a subset of those bytes for pushing callee-saved registers. Up to 12 (rv32) or 8 (rv64) bytes are left unused, depending on how many registers are pushed. Before this change, we told LLVM that the entire allocation was used, by creating a fixed stack object which covered the whole allocation. This change instead gives an accurate extent to the fixed stack object, to only cover the registers that have been pushed. This allows the PrologEpilogInserter to use any unused bytes for spills. Potentially this saves an extra move of the stack pointer after the push, because the push can allocate up to 48 more bytes than it needs for registers. We cannot do the same change for save/restore, because the restore routines restore in batches of `stackalign/(xlen/8)` registers, and we don't want to clobber the saved values of registers that we didn't tell the compiler we were saving/restoring - for instance `__riscv_restore_0` is used by the compiler when it only wants to save `ra`, but will end up restoring `ra` and `s0`.
This is a tiny change that can save up to 16 bytes of stack allocation, which is more beneficial on RV32 than RV64.
cm.push allocates multiples of 16 bytes, but only uses a subset of those bytes for pushing callee-saved registers. Up to 12 (rv32) or 8 (rv64) bytes are left unused, depending on how many registers are pushed. Before this change, we told LLVM that the entire allocation was used, by creating a fixed stack object which covered the whole allocation.
This change instead gives an accurate extent to the fixed stack object, to only cover the registers that have been pushed. This allows the PrologEpilogInserter to use any unused bytes for spills. Potentially this saves an extra move of the stack pointer after the push, because the push can allocate up to 48 more bytes than it needs for registers.
We cannot do the same change for save/restore, because the restore routines restore in batches of
stackalign/(xlen/8)
registers, and we don't want to clobber the saved values of registers that we didn't tell the compiler we were saving/restoring - for instance__riscv_restore_0
is used by the compiler when it only wants to savera
, but will end up restoringra
ands0
.