From c6f24c7116bac90224071466cca6f85993462c75 Mon Sep 17 00:00:00 2001 From: ReservedField Date: Wed, 20 Jul 2016 00:39:46 +0200 Subject: [PATCH] Implement lazy FPU stacking Lazy FPU stacking greatly improves interrupt latency and context switch time for an SDK build with FPU support. This initial implementation can be unstable, has seen little testing and is not yet properly optimized. It may spawn evil dragons that will leave a bloody mess in place of what once was your cute, beloved kitten. Studies have shown that cleaning up mauled kittens from floors is widely regarded as an unpleasant job, so be careful. --- Makefile | 32 ++++++++----- make/Base.mk | 9 +++- src/startup/fault.c | 6 +++ src/startup/fpsetup_fpu.s | 8 ++-- src/thread/ContextSwitch_fpu.s | 13 ++--- src/thread/ContextSwitch_nofpu.s | 2 - src/thread/Queue.c | 5 ++ src/thread/Thread.c | 42 +++++++++++++++- src/thread/UsageFault_fpu.s | 82 ++++++++++++++++++++++++++++++++ 9 files changed, 169 insertions(+), 30 deletions(-) create mode 100644 src/thread/UsageFault_fpu.s diff --git a/Makefile b/Makefile index 6e193c9..92fb812 100644 --- a/Makefile +++ b/Makefile @@ -24,8 +24,6 @@ OBJS := $(NUVOSDK)/Device/Nuvoton/M451Series/Source/system_M451Series.o \ src/startup/init.o \ src/startup/mainthread.o \ src/startup/sleep.o \ - src/thread/Thread.o \ - src/thread/Queue.o \ src/sysinfo/SysInfo.o \ src/dataflash/Dataflash.o \ src/display/Display_SSD.o \ @@ -41,6 +39,9 @@ OBJS := $(NUVOSDK)/Device/Nuvoton/M451Series/Source/system_M451Series.o \ src/battery/Battery.o \ src/atomizer/Atomizer.o +OBJS_NOFPU := src/thread/Thread.o \ + src/thread/Queue.o + ifneq ($(EVICSDK_FPU_SUPPORT),) FPUSPEC := fpu else @@ -62,6 +63,9 @@ DOCDIR := doc ifneq ($(EVICSDK_FAULT_HANDLER),) OBJS_CRT0 += src/startup/fault.o endif +ifneq ($(EVICSDK_FPU_SUPPORT),) + OBJS_CRT0 += src/thread/UsageFault_fpu.o +endif # We need to find out if on cygwin or not ifeq ($(OS),Windows_NT) @@ -119,14 +123,17 @@ ifneq ($(ARMGCC),) ifdef NEED_FIXPATH 
ifeq ($(WIN_CYG), 0) OBJS_FIXPATH := $(OBJS) + OBJS_NOFPU_FIXPATH := $(OBJS_NOFPU) OBJS_CRT0_FIXPATH := $(OBJS_CRT0) else OBJS_FIXPATH := $(shell cygpath -w $(OBJS)) + OBJS_NOFPU_FIXPATH := $(shell cygpath -w $(OBJS_NOFPU)) OBJS_CRT0_FIXPATH := $(shell cygpath -w $(OBJS_CRT0)) EVICSDK := $(shell cygpath -w $(EVICSDK)) endif else OBJS_FIXPATH := $(OBJS) + OBJS_NOFPU_FIXPATH := $(OBJS_NOFPU) OBJS_CRT0_FIXPATH := $(OBJS_CRT0) endif endif @@ -151,34 +158,35 @@ INCDIRS := $(foreach d,$(shell arm-none-eabi-gcc -x c -v -E /dev/null 2>&1 | sed -I$(NUVOSDK)/StdDriver/inc \ -Iinclude -CPUFLAGS := -mcpu=cortex-m4 -mthumb +CPUFLAGS_NOFPU := -mcpu=cortex-m4 -mthumb ifneq ($(EVICSDK_FPU_SUPPORT),) - CPUFLAGS += -mfloat-abi=hard -mfpu=fpv4-sp-d16 + CPUFLAGS := $(CPUFLAGS_NOFPU) -mfloat-abi=hard -mfpu=fpv4-sp-d16 CFLAGS += -DEVICSDK_FPU_SUPPORT TARGET := libevicsdk_fpu else + CPUFLAGS := $(CPUFLAGS_NOFPU) TARGET := libevicsdk endif TARGET_CRT0 := $(TARGET)_crt0 -CFLAGS += -Wall $(CPUFLAGS) -Os -fdata-sections -ffunction-sections +CFLAGS += -Wall -Os -fdata-sections -ffunction-sections CFLAGS += $(INCDIRS) -ASFLAGS := $(CPUFLAGS) - all: env_check gen_tag $(TARGET_CRT0).o $(TARGET).a +$(OBJS_NOFPU_FIXPATH): CPUFLAGS := $(CPUFLAGS_NOFPU) + %.o: %.c - $(CC) $(CFLAGS) -c $< -o $@ + $(CC) $(CPUFLAGS) $(CFLAGS) -c $< -o $@ %.o: %.s - $(AS) $(ASFLAGS) -o $@ $< + $(AS) $(CPUFLAGS) $(ASFLAGS) -o $@ $< -$(TARGET).a: $(OBJS_FIXPATH) +$(TARGET).a: $(OBJS_FIXPATH) $(OBJS_NOFPU_FIXPATH) test -d $(OUTDIR) || mkdir $(OUTDIR) - $(AR) -rv $(OUTDIR)/$(TARGET).a $(OBJS_FIXPATH) + $(AR) -rv $(OUTDIR)/$(TARGET).a $(OBJS_FIXPATH) $(OBJS_NOFPU_FIXPATH) $(TARGET_CRT0).o: $(OBJS_CRT0_FIXPATH) test -d $(OUTDIR) || mkdir $(OUTDIR) @@ -188,7 +196,7 @@ docs: doxygen clean: - rm -rf $(OBJS) $(OBJS_CRT0) $(AEABI_OBJS) $(OUTDIR)/$(TARGET).a $(OUTDIR)/$(TARGET_CRT0).o $(OUTDIR) $(DOCDIR) + rm -rf $(OBJS) $(OBJS_NOFPU) $(OBJS_CRT0) $(AEABI_OBJS) $(OUTDIR)/$(TARGET).a $(OUTDIR)/$(TARGET_CRT0).o $(OUTDIR) $(DOCDIR) 
env_check: ifeq ($(ARMGCC),) diff --git a/make/Base.mk b/make/Base.mk index a32f7a5..2a350d3 100644 --- a/make/Base.mk +++ b/make/Base.mk @@ -108,8 +108,15 @@ CPPFLAGS += -fno-exceptions -fno-rtti ASFLAGS += $(CPUFLAGS) +# Yes, I know what I'm doing with --no-warn-mismatch. +# The thread library is compiled without FPU support to avoid issues with FPU +# context switching, which would normally result in a linker error due to +# different ABIs (soft/hard) when the SDK is compiled with FPU support. Since +# no function in the thread library accepts FP arguments, they will work fine +# together. Of course this trainwrecks when SDK is compiled with FPU support +# and APROM is not. Oh well... LDFLAGS += $(LIBDIRS) -LDFLAGS += -nostdlib -nostartfiles -T$(LDSCRIPT) --gc-sections +LDFLAGS += -nostdlib -nostartfiles -T$(LDSCRIPT) --gc-sections --no-warn-mismatch all: env_check $(TARGET).bin diff --git a/src/startup/fault.c b/src/startup/fault.c index 7c808f7..d64341b 100644 --- a/src/startup/fault.c +++ b/src/startup/fault.c @@ -170,6 +170,12 @@ static void Fault_Delay(uint16_t delay) { void Fault_HandleHardFault(uint32_t *stack) { uint8_t isHigh, btnState, oldBtnState, pressRelease; +#ifdef EVICSDK_FPU_SUPPORT + // Re-enable UsageFault for the thread library + // in case this is an escalated usage fault + SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk; +#endif + Display_Clear(); Fault_DumpFaultLow(stack); Display_Update(); diff --git a/src/startup/fpsetup_fpu.s b/src/startup/fpsetup_fpu.s index 03a72a8..90de1b0 100644 --- a/src/startup/fpsetup_fpu.s +++ b/src/startup/fpsetup_fpu.s @@ -20,13 +20,11 @@ .global Startup_FpSetup .thumb_func Startup_FpSetup: - @ Disable lazy stacking, enable FPU state saving - @ (ASPEN = 1, LSPEN = 0, i.e. FPCCR[31:30] = 10) - @ TODO: use lazy stacking to improve interrupt latency + @ Enable lazy stacking + @ (ASPEN = 1, LSPEN = 1, i.e. 
FPCCR[31:30] = 11) LDR R0, =0xE000EF34 LDR R1, [R0] - BIC R1, R1, #(0x1 << 30) - ORR R1, R1, #(0x2 << 30) + ORR R1, R1, #(0x3 << 30) STR R1, [R0] @ Enable FPU (enable CP10/CP11, i.e. CPACR[23:20] = 1111) diff --git a/src/thread/ContextSwitch_fpu.s b/src/thread/ContextSwitch_fpu.s index a48f1cb..74995e2 100644 --- a/src/thread/ContextSwitch_fpu.s +++ b/src/thread/ContextSwitch_fpu.s @@ -36,22 +36,19 @@ PendSV_Handler: @ If needed, save old thread context TEQ R1, #0 - ITTT NE + ITT NE MRSNE R2, PSP STMNE R1!, {R2, R4-R11} - VSTMNE R1!, {S16-S31} @ If needed, switch context @ Also clear any exclusive lock held by the old thread TEQ R0, #0 - ITTTT NE + ITTT NE LDMNE R0!, {R2, R4-R11} - VLDMNE R0!, {S16-S31} MSRNE PSP, R2 CLREXNE - @ Return to thread mode, use PSP, restore FP state - LDR LR, =0xFFFFFFED - - @ Resume thread + @ Return to thread mode, use PSP, no FP state + @ TODO: not sure about EXC_RETURN[4] = 1 + LDR LR, =0xFFFFFFFD BX LR diff --git a/src/thread/ContextSwitch_nofpu.s b/src/thread/ContextSwitch_nofpu.s index f5d4767..662a754 100644 --- a/src/thread/ContextSwitch_nofpu.s +++ b/src/thread/ContextSwitch_nofpu.s @@ -50,6 +50,4 @@ PendSV_Handler: @ Return to thread mode, use PSP, no FP state LDR LR, =0xFFFFFFFD - - @ Resume thread BX LR diff --git a/src/thread/Queue.c b/src/thread/Queue.c index 4e8fdde..b14e976 100644 --- a/src/thread/Queue.c +++ b/src/thread/Queue.c @@ -17,6 +17,11 @@ * Copyright (C) 2016 ReservedField */ +/* + * NOTE: this file is always compiled without hardware FPU support, to avoid + * issues with FPU context switching (Thread.c uses this queue implementation). + */ + #include #include diff --git a/src/thread/Thread.c b/src/thread/Thread.c index 52677b5..d8e9bd4 100644 --- a/src/thread/Thread.c +++ b/src/thread/Thread.c @@ -17,6 +17,11 @@ * Copyright (C) 2016 ReservedField */ +/* + * NOTE: this file is always compiled without hardware FPU support, to avoid + * issues with FPU context switching. 
+ */ + #include #include #include @@ -194,11 +199,10 @@ static Queue_t Thread_chronoQueue; /** * Current thread TCB. - * Accessed by threads and scheduler. * This must be NULL even before Thread_Init(). * Synchronization: global critical section. */ -Thread_TCB_t *Thread_curTcb = NULL; +static Thread_TCB_t *Thread_curTcb = NULL; /** * Critical section counter. Zero when not in a critical section. @@ -211,6 +215,24 @@ static volatile uint32_t Thread_criticalCount; */ volatile uint32_t Thread_sysTick; +#ifdef EVICSDK_FPU_SUPPORT +/** + * Pointer to software-saved FPU state (Thread_SoftwareContext_t.s) + * of the thread that held FPU last. Will be saved when another + * thread uses the FPU. NULL if no thread holds the FPU state. + * Shared with the UsageFault handler. + */ +uint32_t *Thread_fpuHolderCtx = NULL; + +/** + * Pointer to software-saved FPU state (Thread_SoftwareContext_t.s) + * for the current thread. Will be restored when the current thread + * uses the FPU. NULL if the current thread has no FPU state. + * Shared with the UsageFault handler. + */ +uint32_t *Thread_fpuCurCtx = NULL; +#endif + /** * Inserts a thread into a queue ordered by info.chronoTime. * This is an internal function. @@ -401,12 +423,23 @@ uint64_t Thread_Schedule() { // Switch to next thread Thread_curTcb = nextTcb; newCtx = &Thread_curTcb->ctx; +#ifdef EVICSDK_FPU_SUPPORT + Thread_fpuCurCtx = Thread_curTcb->ctx.s; +#endif // Configure stack guard: stack is at the beginning // of the allocated block. primask = Thread_IrqDisable(); Thread_SetupStackGuard(Thread_curTcb->blockPtr); Thread_IrqRestore(primask); + +#ifdef EVICSDK_FPU_SUPPORT + // Disable FPU (CP10/CP11), i.e. 
CPACR[23:20] = 0000 + SCB->CPACR &= ~(0xFUL << 20); + // Ensure write completed, flush pipeline + __DSB(); + __ISB(); +#endif } // Reset quantum @@ -488,6 +521,11 @@ void Thread_Init() { SysTick->VAL = 0; SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk | SysTick_CTRL_TICKINT_Msk; NVIC_SetPriority(SysTick_IRQn, 0 << 2); + +#ifdef EVICSDK_FPU_SUPPORT + // Enable UsageFault for lazy stacking + SCB->SHCSR |= SCB_SHCSR_USGFAULTENA_Msk; +#endif } Thread_Error_t Thread_Create(Thread_t *thread, Thread_EntryPtr_t entry, void *args, uint16_t stackSize) { diff --git a/src/thread/UsageFault_fpu.s b/src/thread/UsageFault_fpu.s new file mode 100644 index 0000000..62472c8 --- /dev/null +++ b/src/thread/UsageFault_fpu.s @@ -0,0 +1,82 @@ +@ This file is part of eVic SDK. +@ +@ eVic SDK is free software: you can redistribute it and/or modify +@ it under the terms of the GNU General Public License as published by +@ the Free Software Foundation, either version 3 of the License, or +@ (at your option) any later version. +@ +@ eVic SDK is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +@ GNU General Public License for more details. +@ +@ You should have received a copy of the GNU General Public License +@ along with eVic SDK. If not, see . +@ +@ Copyright (C) 2016 ReservedField + +.syntax unified + +.global UsageFault_Handler +.thumb_func +UsageFault_Handler: + + @ Check if NOCP (CFSR[16+3]) is set + @ If not, this is not a CP denial + LDR R0, =0xE000ED28 + LDR R1, [R0] + TST R1, #(1 << 19) + BEQ UsageFault_Handler_escalate + + @ Check if FPU is enabled (i.e. CPACR[23:20] != 0000) + @ If it is, this is not an FPU CP denial + LDR R0, =0xE000ED88 + LDR R1, [R0] + TST R1, #(0xF << 20) + BNE UsageFault_Handler_escalate + + @ Enable FPU (enable CP10/CP11, i.e. 
CPACR[23:20] = 1111) + ORR R1, #(0xF << 20) + STR R1, [R0] + @ FPU enabled: sync barrier, flush pipeline + DSB + ISB + + @ If the current thread is the FPU holder, we are done + LDR R0, =Thread_fpuHolderCtx + LDR R2, [R0] + LDR R1, =Thread_fpuCurCtx + LDR R3, [R1] + TEQ R2, R3 + IT EQ + BXEQ LR + + @ If needed, save FPU context for FPU holder + TEQ R2, #0 + IT NE + VSTMNE R2, {S16-S31} + + @ If needed, restore FPU context for current + @ thread and set it as FPU holder + TEQ R3, #0 + ITT NE + VLDMNE R3, {S16-S31} + STRNE R3, [R0] + + BX LR + +UsageFault_Handler_escalate: + @ If this is not an FPU CP denial, we want to escalate + @ it to a hard fault to hand it over to the SDK fault + @ handler (if enabled). We do this by disabling the + @ UsageFault exception (i.e. SHCSR[18] = 0) and returning. + @ The faulting instruction will be executed again, and + @ this time it will escalate to a hard fault because the + @ UsageFault handler is disabled. If the SDK fault handler + @ is enabled, it will re-enable the UsageFault handler for + @ us. If not, the system will halt. + LDR R0, =0xE000ED24 + LDR R1, [R0] + BIC R1, #(1 << 18) + STR R1, [R0] + BX LR