Skip to content

Commit

Permalink
Implement lazy FPU stacking
Browse files Browse the repository at this point in the history
Lazy FPU stacking greatly improves interrupt latency and context switch time
for an SDK build with FPU support. This initial implementation can be unstable,
has seen little testing and is not yet properly optimized. It may spawn evil
dragons that will leave a bloody mess in place of what once was your cute,
beloved kitten. Studies have shown that cleaning up mauled kittens from floors
is widely regarded as an unpleasant job, so be careful.
  • Loading branch information
ReservedField committed Jul 19, 2016
1 parent 64491fa commit c6f24c7
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 30 deletions.
32 changes: 20 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ OBJS := $(NUVOSDK)/Device/Nuvoton/M451Series/Source/system_M451Series.o \
src/startup/init.o \
src/startup/mainthread.o \
src/startup/sleep.o \
src/thread/Thread.o \
src/thread/Queue.o \
src/sysinfo/SysInfo.o \
src/dataflash/Dataflash.o \
src/display/Display_SSD.o \
Expand All @@ -41,6 +39,9 @@ OBJS := $(NUVOSDK)/Device/Nuvoton/M451Series/Source/system_M451Series.o \
src/battery/Battery.o \
src/atomizer/Atomizer.o

OBJS_NOFPU := src/thread/Thread.o \
src/thread/Queue.o

ifneq ($(EVICSDK_FPU_SUPPORT),)
FPUSPEC := fpu
else
Expand All @@ -62,6 +63,9 @@ DOCDIR := doc
ifneq ($(EVICSDK_FAULT_HANDLER),)
OBJS_CRT0 += src/startup/fault.o
endif
ifneq ($(EVICSDK_FPU_SUPPORT),)
OBJS_CRT0 += src/thread/UsageFault_fpu.o
endif

# We need to find out if on cygwin or not
ifeq ($(OS),Windows_NT)
Expand Down Expand Up @@ -119,14 +123,17 @@ ifneq ($(ARMGCC),)
ifdef NEED_FIXPATH
ifeq ($(WIN_CYG), 0)
OBJS_FIXPATH := $(OBJS)
OBJS_NOFPU_FIXPATH := $(OBJS_NOFPU)
OBJS_CRT0_FIXPATH := $(OBJS_CRT0)
else
OBJS_FIXPATH := $(shell cygpath -w $(OBJS))
OBJS_NOFPU_FIXPATH := $(shell cygpath -w $(OBJS_NOFPU))
OBJS_CRT0_FIXPATH := $(shell cygpath -w $(OBJS_CRT0))
EVICSDK := $(shell cygpath -w $(EVICSDK))
endif
else
OBJS_FIXPATH := $(OBJS)
OBJS_NOFPU_FIXPATH := $(OBJS_NOFPU)
OBJS_CRT0_FIXPATH := $(OBJS_CRT0)
endif
endif
Expand All @@ -151,34 +158,35 @@ INCDIRS := $(foreach d,$(shell arm-none-eabi-gcc -x c -v -E /dev/null 2>&1 | sed
-I$(NUVOSDK)/StdDriver/inc \
-Iinclude

CPUFLAGS := -mcpu=cortex-m4 -mthumb
CPUFLAGS_NOFPU := -mcpu=cortex-m4 -mthumb

ifneq ($(EVICSDK_FPU_SUPPORT),)
CPUFLAGS += -mfloat-abi=hard -mfpu=fpv4-sp-d16
CPUFLAGS := $(CPUFLAGS_NOFPU) -mfloat-abi=hard -mfpu=fpv4-sp-d16
CFLAGS += -DEVICSDK_FPU_SUPPORT
TARGET := libevicsdk_fpu
else
CPUFLAGS := $(CPUFLAGS_NOFPU)
TARGET := libevicsdk
endif

TARGET_CRT0 := $(TARGET)_crt0

CFLAGS += -Wall $(CPUFLAGS) -Os -fdata-sections -ffunction-sections
CFLAGS += -Wall -Os -fdata-sections -ffunction-sections
CFLAGS += $(INCDIRS)

ASFLAGS := $(CPUFLAGS)

all: env_check gen_tag $(TARGET_CRT0).o $(TARGET).a

$(OBJS_NOFPU_FIXPATH): CPUFLAGS := $(CPUFLAGS_NOFPU)

%.o: %.c
$(CC) $(CFLAGS) -c $< -o $@
$(CC) $(CPUFLAGS) $(CFLAGS) -c $< -o $@

%.o: %.s
$(AS) $(ASFLAGS) -o $@ $<
$(AS) $(CPUFLAGS) $(ASFLAGS) -o $@ $<

$(TARGET).a: $(OBJS_FIXPATH)
$(TARGET).a: $(OBJS_FIXPATH) $(OBJS_NOFPU_FIXPATH)
test -d $(OUTDIR) || mkdir $(OUTDIR)
$(AR) -rv $(OUTDIR)/$(TARGET).a $(OBJS_FIXPATH)
$(AR) -rv $(OUTDIR)/$(TARGET).a $(OBJS_FIXPATH) $(OBJS_NOFPU_FIXPATH)

$(TARGET_CRT0).o: $(OBJS_CRT0_FIXPATH)
test -d $(OUTDIR) || mkdir $(OUTDIR)
Expand All @@ -188,7 +196,7 @@ docs:
doxygen

clean:
rm -rf $(OBJS) $(OBJS_CRT0) $(AEABI_OBJS) $(OUTDIR)/$(TARGET).a $(OUTDIR)/$(TARGET_CRT0).o $(OUTDIR) $(DOCDIR)
rm -rf $(OBJS) $(OBJS_NOFPU) $(OBJS_CRT0) $(AEABI_OBJS) $(OUTDIR)/$(TARGET).a $(OUTDIR)/$(TARGET_CRT0).o $(OUTDIR) $(DOCDIR)

env_check:
ifeq ($(ARMGCC),)
Expand Down
9 changes: 8 additions & 1 deletion make/Base.mk
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,15 @@ CPPFLAGS += -fno-exceptions -fno-rtti

ASFLAGS += $(CPUFLAGS)

# Yes, I know what I'm doing with --no-warn-mismatch.
# The thread library is compiled without FPU support to avoid issues with FPU
# context switching, which would normally result in a linker error due to
# different ABIs (soft/hard) when the SDK is compiled with FPU support. Since
# no function in the thread library accepts FP arguments, they will work fine
# together. Of course this trainwrecks when the SDK is compiled with FPU support
# and the APROM is not. Oh well...
LDFLAGS += $(LIBDIRS)
LDFLAGS += -nostdlib -nostartfiles -T$(LDSCRIPT) --gc-sections
LDFLAGS += -nostdlib -nostartfiles -T$(LDSCRIPT) --gc-sections --no-warn-mismatch

all: env_check $(TARGET).bin

Expand Down
6 changes: 6 additions & 0 deletions src/startup/fault.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,12 @@ static void Fault_Delay(uint16_t delay) {
void Fault_HandleHardFault(uint32_t *stack) {
uint8_t isHigh, btnState, oldBtnState, pressRelease;

#ifdef EVICSDK_FPU_SUPPORT
// Re-enable UsageFault for the thread library
// in case this is an escalated usage fault
SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk;
#endif

Display_Clear();
Fault_DumpFaultLow(stack);
Display_Update();
Expand Down
8 changes: 3 additions & 5 deletions src/startup/fpsetup_fpu.s
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,11 @@
.global Startup_FpSetup
.thumb_func
Startup_FpSetup:
@ Disable lazy stacking, enable FPU state saving
@ (ASPEN = 1, LSPEN = 0, i.e. FPCCR[31:30] = 10)
@ TODO: use lazy stacking to improve interrupt latency
@ Enable lazy stacking
@ (ASPEN = 1, LSPEN = 1, i.e. FPCCR[31:30] = 11)
LDR R0, =0xE000EF34
LDR R1, [R0]
BIC R1, R1, #(0x1 << 30)
ORR R1, R1, #(0x2 << 30)
ORR R1, R1, #(0x3 << 30)
STR R1, [R0]

@ Enable FPU (enable CP10/CP11, i.e. CPACR[23:20] = 1111)
Expand Down
13 changes: 5 additions & 8 deletions src/thread/ContextSwitch_fpu.s
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,19 @@ PendSV_Handler:

@ If needed, save old thread context
TEQ R1, #0
ITTT NE
ITT NE
MRSNE R2, PSP
STMNE R1!, {R2, R4-R11}
VSTMNE R1!, {S16-S31}

@ If needed, switch context
@ Also clear any exclusive lock held by the old thread
TEQ R0, #0
ITTTT NE
ITTT NE
LDMNE R0!, {R2, R4-R11}
VLDMNE R0!, {S16-S31}
MSRNE PSP, R2
CLREXNE

@ Return to thread mode, use PSP, restore FP state
LDR LR, =0xFFFFFFED

@ Resume thread
@ Return to thread mode, use PSP, no FP state
@ TODO: not sure about EXC_RETURN[4] = 1
LDR LR, =0xFFFFFFFD
BX LR
2 changes: 0 additions & 2 deletions src/thread/ContextSwitch_nofpu.s
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,4 @@ PendSV_Handler:

@ Return to thread mode, use PSP, no FP state
LDR LR, =0xFFFFFFFD

@ Resume thread
BX LR
5 changes: 5 additions & 0 deletions src/thread/Queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
* Copyright (C) 2016 ReservedField
*/

/*
* NOTE: this file is always compiled without hardware FPU support, to avoid
* issues with FPU context switching (Thread.c uses this queue implementation).
*/

#include <stdlib.h>
#include <Queue.h>

Expand Down
42 changes: 40 additions & 2 deletions src/thread/Thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
* Copyright (C) 2016 ReservedField
*/

/*
* NOTE: this file is always compiled without hardware FPU support, to avoid
* issues with FPU context switching.
*/

#include <stdlib.h>
#include <malloc.h>
#include <M451Series.h>
Expand Down Expand Up @@ -194,11 +199,10 @@ static Queue_t Thread_chronoQueue;

/**
* Current thread TCB.
* Accessed by threads and scheduler.
* This must be NULL even before Thread_Init().
* Synchronization: global critical section.
*/
Thread_TCB_t *Thread_curTcb = NULL;
static Thread_TCB_t *Thread_curTcb = NULL;

/**
* Critical section counter. Zero when not in a critical section.
Expand All @@ -211,6 +215,24 @@ static volatile uint32_t Thread_criticalCount;
*/
volatile uint32_t Thread_sysTick;

#ifdef EVICSDK_FPU_SUPPORT
/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* of the thread that held FPU last. Will be saved when another
* thread uses the FPU. NULL if no thread holds the FPU state.
* Shared with the UsageFault handler.
*/
uint32_t *Thread_fpuHolderCtx = NULL;

/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* for the current thread. Will be restored when the current thread
* uses the FPU. NULL if the current thread has no FPU state.
* Shared with the UsageFault handler.
*/
uint32_t *Thread_fpuCurCtx = NULL;
#endif

/**
* Inserts a thread into a queue ordered by info.chronoTime.
* This is an internal function.
Expand Down Expand Up @@ -401,12 +423,23 @@ uint64_t Thread_Schedule() {
// Switch to next thread
Thread_curTcb = nextTcb;
newCtx = &Thread_curTcb->ctx;
#ifdef EVICSDK_FPU_SUPPORT
Thread_fpuCurCtx = Thread_curTcb->ctx.s;
#endif

// Configure stack guard: stack is at the beginning
// of the allocated block.
primask = Thread_IrqDisable();
Thread_SetupStackGuard(Thread_curTcb->blockPtr);
Thread_IrqRestore(primask);

#ifdef EVICSDK_FPU_SUPPORT
// Disable FPU (CP10/CP11), i.e. CPACR[23:20] = 0000
SCB->CPACR &= ~(0xFUL << 20);
// Ensure write completed, flush pipeline
__DMB();
__ISB();
#endif
}

// Reset quantum
Expand Down Expand Up @@ -488,6 +521,11 @@ void Thread_Init() {
SysTick->VAL = 0;
SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk | SysTick_CTRL_TICKINT_Msk;
NVIC_SetPriority(SysTick_IRQn, 0 << 2);

#ifdef EVICSDK_FPU_SUPPORT
// Enable UsageFault for lazy stacking
SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk;
#endif
}

Thread_Error_t Thread_Create(Thread_t *thread, Thread_EntryPtr_t entry, void *args, uint16_t stackSize) {
Expand Down
82 changes: 82 additions & 0 deletions src/thread/UsageFault_fpu.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
@ This file is part of eVic SDK.
@
@ eVic SDK is free software: you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation, either version 3 of the License, or
@ (at your option) any later version.
@
@ eVic SDK is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with eVic SDK. If not, see <http://www.gnu.org/licenses/>.
@
@ Copyright (C) 2016 ReservedField

.syntax unified

.global UsageFault_Handler
@ .thumb_func must come BEFORE the label: it marks the next symbol
@ defined as a Thumb function (same layout as Startup_FpSetup).
.thumb_func
UsageFault_Handler:
	@ UsageFault handler implementing lazy FPU context switching.
	@
	@ The scheduler disables the FPU (CPACR[23:20] = 0000) on every
	@ thread switch. The first FP instruction executed by the new
	@ thread then raises a NOCP usage fault, which lands here. We
	@ re-enable the FPU and, if the FPU registers belong to another
	@ thread, swap S16-S31 between the software-saved contexts.
	@ Any other usage fault is escalated to a hard fault.
	@
	@ Shared state (defined in Thread.c):
	@   Thread_fpuHolderCtx - S16-S31 save area of the thread whose
	@                         state is in the FPU (NULL if none).
	@   Thread_fpuCurCtx    - S16-S31 save area of the current
	@                         thread (NULL if it has none).
	@
	@ Only R0-R3 are used as scratch registers.

	@ Check if NOCP (CFSR[16+3]) is set
	@ If not, this is not a CP denial
	LDR R0, =0xE000ED28
	LDR R1, [R0]
	TST R1, #(1 << 19)
	BEQ UsageFault_Handler_escalate

	@ Check if FPU is enabled (i.e. CPACR[23:20] != 0000)
	@ If it is, this is not an FPU CP denial
	LDR R2, =0xE000ED88
	LDR R3, [R2]
	TST R3, #(0xF << 20)
	BNE UsageFault_Handler_escalate

	@ This fault is ours: clear the sticky NOCP flag (CFSR bits are
	@ write-one-to-clear) so that it doesn't linger and get mistaken
	@ for the cause of a later, unrelated usage fault.
	MOV R1, #(1 << 19)
	STR R1, [R0]

	@ Enable FPU (enable CP10/CP11, i.e. CPACR[23:20] = 1111)
	ORR R3, R3, #(0xF << 20)
	STR R3, [R2]
	@ FPU enabled: sync barrier, flush pipeline
	DSB
	ISB

	@ If the current thread is the FPU holder, we are done:
	@ the FPU registers already contain its state.
	LDR R0, =Thread_fpuHolderCtx
	LDR R2, [R0]
	LDR R1, =Thread_fpuCurCtx
	LDR R3, [R1]
	TEQ R2, R3
	IT EQ
	BXEQ LR

	@ If needed, save FPU context for FPU holder
	TEQ R2, #0
	IT NE
	VSTMNE R2, {S16-S31}

	@ The current thread now owns the FPU registers. Always record
	@ this, even when it has no save area (NULL): otherwise the old
	@ holder would still look like the owner and would skip the
	@ restore of the state we just saved, after the current thread
	@ has overwritten the registers.
	STR R3, [R0]

	@ If needed, restore FPU context for current thread
	TEQ R3, #0
	IT NE
	VLDMNE R3, {S16-S31}

	BX LR

UsageFault_Handler_escalate:
	@ If this is not an FPU CP denial, we want to escalate
	@ it to a hard fault to hand it over to the SDK fault
	@ handler (if enabled). We do this by disabling the
	@ UsageFault exception (i.e. SHCSR[18] = 0) and returning.
	@ The faulting instruction will be executed again, and
	@ this time it will escalate to a hard fault because the
	@ UsageFault handler is disabled. If the SDK fault handler
	@ is enabled, it will re-enable the UsageFault handler for
	@ us. If not, the system will halt.
	LDR R0, =0xE000ED24
	LDR R1, [R0]
	BIC R1, #(1 << 18)
	STR R1, [R0]
	BX LR

0 comments on commit c6f24c7

Please sign in to comment.