Skip to content

Commit

Permalink
Improve lazy FPU stacking
Browse files Browse the repository at this point in the history
This commit optimizes the lazy stacking feature and makes it
more stable, fixing bugs in how the hardware stacking was handled.
  • Loading branch information
ReservedField committed Aug 9, 2016
1 parent 6588362 commit 7a884da
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 54 deletions.
18 changes: 10 additions & 8 deletions src/thread/ContextSwitch_fpu.s
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,30 @@ PendSV_Handler:
@ preempt ISRs, we always work with the PSP.
.thumb_func

@ Call Thread_Schedule()
PUSH {LR}

@ Call Thread_Schedule(EXC_RETURN)
@ We can delay the context push because the ABI enforces
@ routines to save and restore R4-R11 and S16-S31.
@ Return: R0 = new ctx (or NULL), R1 = old ctx (or NULL)
LDR R0, =Thread_Schedule
BLX R0
MOV R0, LR
LDR R1, =Thread_Schedule
BLX R1

POP {LR}

@ If needed, save old thread context
TEQ R1, #0
ITT NE
MRSNE R2, PSP
STMNE R1!, {R2, R4-R11}
STMNE R1, {R2, R4-R11, LR}

@ If needed, switch context
@ Also clear any exclusive lock held by the old thread
TEQ R0, #0
ITTT NE
LDMNE R0!, {R2, R4-R11}
LDMNE R0, {R2, R4-R11, LR}
MSRNE PSP, R2
CLREXNE

@ Return to thread mode, use PSP, no FP state
@ TODO: not sure about EXC_RETURN[4] = 1
LDR LR, =0xFFFFFFFD
BX LR
10 changes: 6 additions & 4 deletions src/thread/ContextSwitch_nofpu.s
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,29 @@ PendSV_Handler:
@ preempt ISRs, we always work with the PSP.
.thumb_func

PUSH {LR}

@ Call Thread_Schedule()
@ We can delay the context push because the ABI enforces
@ routines to save and restore R4-R11.
@ Return: R0 = new ctx (or NULL), R1 = old ctx (or NULL)
LDR R0, =Thread_Schedule
BLX R0

POP {LR}

@ If needed, save old thread context
TEQ R1, #0
ITT NE
MRSNE R2, PSP
STMNE R1!, {R2, R4-R11}
STMNE R1, {R2, R4-R11, LR}

@ If needed, switch context
@ Also clear any exclusive lock held by the old thread
TEQ R0, #0
ITTT NE
LDMNE R0!, {R2, R4-R11}
LDMNE R0, {R2, R4-R11, LR}
MSRNE PSP, R2
CLREXNE

@ Return to thread mode, use PSP, no FP state
LDR LR, =0xFFFFFFFD
BX LR
136 changes: 108 additions & 28 deletions src/thread/Thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,17 @@

/* Default PSR for new threads, only thumb flag set. */
#define THREAD_DEFAULT_PSR 0x01000000
/* Default EXC_RETURN for new threads: thread mode, PSP, no FP state. */
#define THREAD_DEFAULT_ER 0xFFFFFFFD

#ifdef EVICSDK_FPU_SUPPORT
/* NOT set in EXC_RETURN if the thread used FPU at least once. */
#define THREAD_ER_MSK_FPCTX (1 << 4)
#endif

/* Creates the return value for Thread_Schedule(). */
#define THREAD_MAKE_SCHEDRET(newCtx, oldCtx) ((((uint64_t)(uint32_t) (oldCtx)) << 32) | ((uint32_t) (newCtx)))
#define THREAD_MAKE_SCHEDRET(newCtx, oldCtx) \
((((uint64_t)(uint32_t) (oldCtx)) << 32) | ((uint32_t) (newCtx)))

/* Marks the scheduler as pending by flagging PendSV. */
#define THREAD_PEND_SCHED() do { SCB->ICSR |= SCB_ICSR_PENDSVSET_Msk; } while(0)
Expand Down Expand Up @@ -121,12 +129,36 @@ typedef struct {
uint32_t sp;
/**< Software-saved registers: R4-R11. */
uint32_t r[8];
/**< EXC_RETURN to be used when resuming. */
uint32_t er;
#ifdef EVICSDK_FPU_SUPPORT
/**< Software-saved FPU registers: S16-S31. */
uint32_t s[16];
/**< Software-saved FPU registers: S0-S31. */
uint32_t s[32];
#endif
} Thread_SoftwareContext_t;

#ifdef EVICSDK_FPU_SUPPORT
/**
* FPU context state.
* Keep in sync with UsageFault handler.
*/
typedef struct {
/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* of the thread that held FPU last. Will be saved when another
* thread uses the FPU. NULL if no thread holds the FPU state.
* Shared with the UsageFault handler.
*/
uint32_t *holderCtx;
/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* for the current thread. Will be restored when the current thread
* uses the FPU. NULL if the current thread has no FPU state.
* Shared with the UsageFault handler.
*/
uint32_t *curCtx;
} Thread_FpuState_t;
#endif

/**
* Thread control block.
* Field order is arranged first to minimize
Expand Down Expand Up @@ -217,20 +249,12 @@ volatile uint32_t Thread_sysTick;

#ifdef EVICSDK_FPU_SUPPORT
/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* of the thread that held FPU last. Will be saved when another
* thread uses the FPU. NULL if no thread holds the FPU state.
* Shared with the UsageFault handler.
*/
uint32_t *Thread_fpuHolderCtx = NULL;

/**
* Pointer to software-saved FPU state (Thread_SoftwareContext_t.s)
* for the current thread. Will be restored when the current thread
* uses the FPU. NULL if the current thread has no FPU state.
* Shared with the UsageFault handler.
* FPU context state.
* Shared with UsageFault handler. Non-atomic operations
* must be carried out with IRQs masked, to protect from
* faults generated by higher priority FPU-using ISRs.
*/
uint32_t *Thread_fpuCurCtx = NULL;
Thread_FpuState_t Thread_fpuState;
#endif

/**
Expand Down Expand Up @@ -336,17 +360,42 @@ static void Thread_SetupStackGuard(void *guardPtr) {
// thread gets control back (this must be called from ISR).
}

#ifdef EVICSDK_FPU_SUPPORT
/**
 * Enables or disables FPU access (coprocessors CP10/CP11).
 *
 * @param enable Non-zero to enable, zero to disable.
 */
static void Thread_FpuControl(uint8_t enable) {
	// CPACR[23:20] = 1111 grants full access to CP10/CP11;
	// 0000 revokes it, so FPU instructions fault (used to
	// trigger lazy context save/restore via UsageFault).
	uint32_t cpacr = SCB->CPACR & ~(0xFUL << 20);
	if(enable) {
		cpacr |= (0xFUL << 20);
	}
	SCB->CPACR = cpacr;

	// Ensure the write has completed and flush the pipeline so
	// the new access permissions take effect immediately.
	__DMB();
	__ISB();
}
#endif

/**
* Schedules the next thread. Called from PendSV.
* This is an internal function.
*
* @param er EXC_RETURN from the PendSV handler.
* Ignored if FPU support is disabled.
*
* @return The lower 32 bits are the address of the software-saved
* context for the new thread. If NULL, no context switch
* is performed. The higher 32 bits are the address of the
* software-saved context for the old thread. If NULL, the
* old context isn't saved.
*/
uint64_t Thread_Schedule() {
uint64_t Thread_Schedule(uint32_t er) {
Thread_TCB_t *nextTcb;
Thread_SoftwareContext_t *newCtx = NULL, *oldCtx = NULL;
uint8_t isCurReady;
Expand Down Expand Up @@ -420,26 +469,40 @@ uint64_t Thread_Schedule() {
}
}

#ifdef EVICSDK_FPU_SUPPORT
uint32_t *prevFpuCtx = (Thread_curTcb != NULL ? Thread_curTcb->ctx.s : NULL);
#endif

// Switch to next thread
Thread_curTcb = nextTcb;
newCtx = &Thread_curTcb->ctx;

primask = Thread_IrqDisable();

#ifdef EVICSDK_FPU_SUPPORT
Thread_fpuCurCtx = Thread_curTcb->ctx.s;
if(Thread_fpuState.curCtx == NULL && !(er & THREAD_ER_MSK_FPCTX)) {
// The previous thread used FPU for the first time
// The previous holder already had its context saved
Thread_fpuState.holderCtx = prevFpuCtx;
}
// Switch current FPU context. If a thread has never used FPU
// NULL its context to avoid useless saves. If it ends up using
// it, the holder will be updated (see above).
Thread_fpuState.curCtx = (Thread_curTcb->ctx.er & THREAD_ER_MSK_FPCTX ?
NULL : Thread_curTcb->ctx.s);
// If we're resuming the holder thread, enable FPU since
// registers are good. Otherwise, disable FPU and let lazy
// stacking do its job. Also disable FPU when curCtx is NULL,
// since we don't have FP context for that thread yet.
Thread_FpuControl(Thread_fpuState.curCtx != NULL &&
Thread_fpuState.curCtx == Thread_fpuState.holderCtx);
#endif

// Configure stack guard: stack is at the beginning
// of the allocated block.
primask = Thread_IrqDisable();
Thread_SetupStackGuard(Thread_curTcb->blockPtr);
Thread_IrqRestore(primask);

#ifdef EVICSDK_FPU_SUPPORT
// Disable FPU (CP10/CP11), i.e. CPACR[23:20] = 0000
SCB->CPACR &= ~(0xFUL << 20);
// Ensure write completed, flush pipeline
__DMB();
__ISB();
#endif
Thread_IrqRestore(primask);
}

// Reset quantum
Expand Down Expand Up @@ -471,6 +534,16 @@ void SysTick_Handler() {
static void Thread_ExitProc(void *ret) {
Thread_CriticalEnter();

#ifdef EVICSDK_FPU_SUPPORT
// We don't hold FPU anymore
uint32_t primask = Thread_IrqDisable();
if(Thread_fpuState.holderCtx == Thread_fpuState.curCtx) {
Thread_fpuState.holderCtx = NULL;
}
Thread_fpuState.curCtx = NULL;
Thread_IrqRestore(primask);
#endif

// If a thread has joined us, wake him up
if(Thread_curTcb->join.tcb != NULL) {
*Thread_curTcb->join.retPtr = ret;
Expand Down Expand Up @@ -500,6 +573,10 @@ void Thread_Init() {
Queue_Init(&Thread_readyQueue);
Queue_Init(&Thread_chronoQueue);
Thread_curTcb = NULL;
#ifdef EVICSDK_FPU_SUPPORT
Thread_fpuState.holderCtx = NULL;
Thread_fpuState.curCtx = NULL;
#endif
Thread_criticalCount = 0;
Thread_sysTick = 0;

Expand Down Expand Up @@ -562,6 +639,7 @@ Thread_Error_t Thread_Create(Thread_t *thread, Thread_EntryPtr_t entry, void *ar
ctx->lr = (uint32_t) Thread_ExitProc;
ctx->pc = (uint32_t) entry;
ctx->psr = THREAD_DEFAULT_PSR;
tcb->ctx.er = THREAD_DEFAULT_ER;
tcb->ctx.sp = (uint32_t) ctx;

// Push new thread to back of ready queue
Expand Down Expand Up @@ -690,6 +768,7 @@ void Thread_CriticalExit() {

/**
* Initializes a semaphore.
* This is an internal function.
*
* @param sema Semaphore.
* @param count Initial semaphore count.
Expand All @@ -702,6 +781,7 @@ static void Thread_SemaphoreInit(Thread_SemaphoreInternal_t *sema, int32_t count

/**
* Deletes and deallocates a semaphore.
* This is an internal function.
*
* @param sema Semaphore.
* @param doFree True to free the memory pointed by sema.
Expand Down Expand Up @@ -734,7 +814,7 @@ Thread_Error_t Thread_SemaphoreCreate(Thread_Semaphore_t *sema, int32_t count) {
return INVALID_VALUE;
}

// Allocate and initialize semaphore
// Allocate semaphore
sm = malloc(sizeof(Thread_SemaphoreInternal_t));
if(sm == NULL) {
return NO_MEMORY;
Expand Down
Loading

0 comments on commit 7a884da

Please sign in to comment.