diff --git a/smw.ini b/smw.ini
index b776b0b..b1678d8 100644
--- a/smw.ini
+++ b/smw.ini
@@ -2,8 +2,10 @@
 # Automatically save state on quit and reload on start
 Autosave = 0
 
-# Disable the SDL_Delay that happens each frame (Gives slightly better perf if your
-# display is set to exactly 60hz)
+# Disable the manual frame delaying that normally happens each frame and rely
+# on vsync for frame timing instead. Frames are then never dropped, at the
+# cost of more lag. Be sure to set your display to exactly 60hz when using
+# this.
 DisableFrameDelay = 0
 
 # Save a snapshot each time a level is completed
@@ -31,7 +33,19 @@ NoSpriteLimits = 1
 
 # Use either SDL, SDL-Software, or OpenGL as the output method
 # SDL-Software rendering might give better performance on Raspberry pi.
-#OutputMethod = SDL
+OutputMethod = SDL
+
+# The type of display sync to use when vsync is not forced on (Disabled, Vsync,
+# or Adaptive). Vsync is forced on when DisableFrameDelay is 1. Disabled and
+# Vsync should always work. Disabled produces less lag than Vsync, at the cost
+# of possible tearing. Adaptive sync is only supported when OutputMethod is
+# OpenGL. If your setup supports FreeSync or G-Sync, you'll get the least lag
+# with no tearing by leaving frame delay enabled and using Adaptive with the
+# OpenGL output method. Choosing Adaptive with the OpenGL output method on a
+# setup that doesn't support adaptive sync can result in regular vsync instead.
+#DisplaySync = Disabled
+DisplaySync = Vsync
+#DisplaySync = Adaptive
 
 # Set to true to use linear filtering. Gives less crisp pixels. Works with SDL and OpenGL.
 #LinearFiltering = 0
diff --git a/src/config.c b/src/config.c
index a63d286..c2bb5a6 100644
--- a/src/config.c
+++ b/src/config.c
@@ -383,6 +383,11 @@ static bool HandleIniConfig(int section, const char *key, char *value) {
     g_config.output_method = StringEqualsNoCase(value, "SDL-Software") ? kOutputMethod_SDLSoftware :
                              StringEqualsNoCase(value, "OpenGL") ? kOutputMethod_OpenGL : kOutputMethod_SDL;
     return true;
+  } else if (StringEqualsNoCase(key, "DisplaySync")) {
+    g_config.display_sync = StringEqualsNoCase(value, "Disabled") ? 0 :
+                            StringEqualsNoCase(value, "Vsync") ? 1 :
+                            StringEqualsNoCase(value, "Adaptive") ? -1 : 0;
+    return true;
   } else if (StringEqualsNoCase(key, "LinearFiltering")) {
     return ParseBool(value, &g_config.linear_filtering);
   } else if (StringEqualsNoCase(key, "NoSpriteLimits")) {
diff --git a/src/config.h b/src/config.h
index bf5ca30..875f9d8 100644
--- a/src/config.h
+++ b/src/config.h
@@ -68,6 +68,7 @@ typedef struct Config {
   uint8 enable_msu;
   bool resume_msu;
   bool disable_frame_delay;
+  int display_sync;
   bool save_playthrough;
   uint8 msuvolume;
   uint32 features0;
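For reviewers, here is a compact sketch of how the new tri-state setting is intended to map onto SDL2's two sync mechanisms. ChooseRendererFlags and ChooseSwapInterval are illustrative helper names, not functions in this patch; the mapping mirrors the main.c and opengl.c changes below.

#include <SDL.h>
#include <stdbool.h>

/* Illustrative helpers (not part of the patch): map the tri-state
 * display_sync value (0 = Disabled, 1 = Vsync, -1 = Adaptive) onto
 * SDL2's two sync mechanisms. */
static Uint32 ChooseRendererFlags(int display_sync, bool disable_frame_delay) {
  /* The SDL renderer API only exposes on/off vsync, so Adaptive (-1)
   * cannot be honored here and effectively degrades to Disabled. */
  bool vsync = disable_frame_delay || display_sync == 1;
  return SDL_RENDERER_ACCELERATED | (vsync ? SDL_RENDERER_PRESENTVSYNC : 0);
}

static void ChooseSwapInterval(int display_sync, bool disable_frame_delay) {
  /* SDL_GL_SetSwapInterval() takes the value directly: 0, 1, or -1
   * (adaptive), and returns a negative value if it's unsupported. */
  int interval = disable_frame_delay ? 1 : display_sync;
  if (SDL_GL_SetSwapInterval(interval) < 0)
    SDL_GL_SetSwapInterval(0);  /* fall back to no vsync */
}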
diff --git a/src/main.c b/src/main.c
index da046dd..f68bc03 100644
--- a/src/main.c
+++ b/src/main.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #endif
+#define NANOTIME_IMPLEMENTATION
+#include "third_party/nanotime/nanotime.h"
 
 #include "assets/smw_assets.h"
 
@@ -263,7 +265,26 @@ static bool SdlRenderer_Init(SDL_Window *window) {
 
   SDL_Renderer *renderer = SDL_CreateRenderer(g_window, -1,
       g_config.output_method == kOutputMethod_SDLSoftware ? SDL_RENDERER_SOFTWARE :
-      SDL_RENDERER_ACCELERATED | SDL_RENDERER_PRESENTVSYNC);
+      SDL_RENDERER_ACCELERATED | (g_config.disable_frame_delay || g_config.display_sync == 1 ? SDL_RENDERER_PRESENTVSYNC : 0));
+  if (g_config.disable_frame_delay) {
+    printf("Using vsync without frame delay\n");
+  }
+  else {
+    switch (g_config.display_sync) {
+      case 0:
+        printf("Disabled vsync\n");
+        break;
+      case 1:
+        printf("Using vsync\n");
+        break;
+      case -1:
+        printf("Attempted to use adaptive sync when not supported; defaulting to disabled vsync\n");
+        break;
+      default:
+        printf("Invalid g_config.display_sync value of %d; defaulting to disabled vsync\n", g_config.display_sync);
+        break;
+    }
+  }
   if (renderer == NULL) {
     printf("Failed to create renderer: %s\n", SDL_GetError());
     return false;
@@ -314,7 +335,7 @@ static void SdlRenderer_EndDraw(void) {
 //  printf("%f ms\n", v * 1000);
   SDL_RenderClear(g_renderer);
   SDL_RenderCopy(g_renderer, g_texture, &g_sdl_renderer_rect, NULL);
-  SDL_RenderPresent(g_renderer); // vsyncs to 60 FPS?
+  SDL_RenderPresent(g_renderer);
 }
 
 static const struct RendererFuncs kSdlRendererFuncs = {
@@ -478,12 +499,12 @@ error_reading:;
 
   HandleCommand(kKeys_Load + 0, true);
 
   bool running = true;
-  uint32 lastTick = SDL_GetTicks();
-  uint32 curTick = 0;
   uint32 frameCtr = 0;
   uint8 audiopaused = true;
   bool has_bug_in_title = false;
   GamepadInfo *gi;
+  nanotime_step_data stepper;
+  nanotime_step_init(&stepper, NANOTIME_NSEC_PER_SEC / 60, nanotime_now_max(), nanotime_now, nanotime_sleep);
 
   while (running) {
     SDL_Event event;
@@ -546,7 +567,7 @@
     }
 
     if (g_paused) {
-      SDL_Delay(16);
+      nanotime_sleep(stepper.sleep_duration);
       continue;
     }
 
@@ -576,24 +597,9 @@
       }
     }
 
-    // if vsync isn't working, delay manually
-    curTick = SDL_GetTicks();
-
+    // If not leaning on vsync to do timing, delay manually
     if (!g_snes->disableRender && !g_config.disable_frame_delay) {
-      static const uint8 delays[3] = { 17, 17, 16 }; // 60 fps
-      lastTick += delays[frameCtr % 3];
-
-      if (lastTick > curTick) {
-        uint32 delta = lastTick - curTick;
-        if (delta > 500) {
-          lastTick = curTick - 500;
-          delta = 500;
-        }
-        // printf("Sleeping %d\n", delta);
-        SDL_Delay(delta);
-      } else if (curTick - lastTick > 500) {
-        lastTick = curTick;
-      }
+      nanotime_step(&stepper);
     }
   }
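In isolation, the pattern main.c now follows looks like the minimal sketch below: one stepper drives both the per-frame pacing and the paused-state sleep. RunOneFrame, IsPaused, and GameLoop are hypothetical names for illustration, not code from this patch.

#define NANOTIME_IMPLEMENTATION
#include "third_party/nanotime/nanotime.h"

extern bool RunOneFrame(void);  /* hypothetical: update + render one frame */
extern bool IsPaused(void);     /* hypothetical: pause query */

void GameLoop(void) {
  nanotime_step_data stepper;
  /* 60 updates per second, matching the game's frame rate. */
  nanotime_step_init(&stepper, NANOTIME_NSEC_PER_SEC / 60,
                     nanotime_now_max(), nanotime_now, nanotime_sleep);
  for (;;) {
    if (IsPaused()) {
      /* While paused, just sleep one frame's worth of time per iteration. */
      nanotime_sleep(stepper.sleep_duration);
      continue;
    }
    if (!RunOneFrame())
      break;
    nanotime_step(&stepper);  /* precise sleep up to the next 1/60 s boundary */
  }
}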
diff --git a/src/opengl.c b/src/opengl.c
index 8e6e52c..48adc50 100644
--- a/src/opengl.c
+++ b/src/opengl.c
@@ -39,7 +39,31 @@ static bool OpenGLRenderer_Init(SDL_Window *window) {
   SDL_GLContext context = SDL_GL_CreateContext(window);
   (void)context;
 
-  SDL_GL_SetSwapInterval(1);
+  if (g_config.disable_frame_delay) {
+    SDL_GL_SetSwapInterval(1);
+    printf("Using vsync without frame delay\n");
+  }
+  else if (SDL_GL_SetSwapInterval(g_config.display_sync) < 0) {
+    SDL_GL_SetSwapInterval(0);
+    printf("Disabled vsync; chosen display sync setting not supported\n");
+  }
+  else {
+    switch (g_config.display_sync) {
+      case 0:
+        printf("Disabled vsync\n");
+        break;
+      case 1:
+        printf("Using vsync\n");
+        break;
+      case -1:
+        printf("Using adaptive sync\n");
+        break;
+      default:
+        printf("Unknown value of g_config.display_sync: %d\n", g_config.display_sync);
+        break;
+    }
+  }
+
 
   ogl_LoadFunctions();
 
   if (!ogl_IsVersionGEQ(3, 3))
diff --git a/third_party/nanotime/nanotime.h b/third_party/nanotime/nanotime.h
new file mode 100644
index 0000000..c12530a
--- /dev/null
+++ b/third_party/nanotime/nanotime.h
@@ -0,0 +1,765 @@
+#ifndef _include_guard_nanotime_
+#define _include_guard_nanotime_
+
+/*
+ * You can choose this license, if possible in your jurisdiction:
+ *
+ * Unlicense
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or distribute
+ * this software, either in source code form or as a compiled binary, for any
+ * purpose, commercial or non-commercial, and by any means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors of
+ * this software dedicate any and all copyright interest in the software to the
+ * public domain. We make this dedication for the benefit of the public at
+ * large and to the detriment of our heirs and successors. We intend this
+ * dedication to be an overt act of relinquishment in perpetuity of all present
+ * and future rights to this software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * For more information, please refer to <https://unlicense.org>
+ *
+ *
+ * Alternative license choice, if works can't be directly submitted to the
+ * public domain in your jurisdiction:
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright © 2022 Brandon McGriff <nightmareci@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the “Software”), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#if defined(_MSC_VER)
+	#if (_MSC_VER < 1600)
+		#error "Current Visual Studio version is not at least Visual Studio 2010, the nanotime library requires at least 2010."
+	#endif
+#elif defined(__cplusplus)
+	#if (__cplusplus < 201103L)
+		#error "Current C++ standard is not at least C++11, the nanotime library requires at least C++11."
+	#endif
+#elif defined(__STDC_VERSION__)
+	#if (__STDC_VERSION__ < 199901L)
+		#error "Current C standard is not at least C99, the nanotime library requires at least C99."
+	#endif
+#else
+	#error "Current C or C++ standard is unknown, the nanotime library requires stdint.h and stdbool.h to be available (C99 or higher, C++11 or higher, Visual Studio 2010 or higher)."
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Implementor's note: This library directly uses Win32 APIs both for MSVC and
+ * MinGW GCC, as they work for both, and produce better behavior in MinGW
+ * builds. Detection of them is accomplished via checking if _WIN32 is defined,
+ * as it's defined in both MSVC and MinGW GCC. Though it's convenient to have
+ * UNIX-like APIs on Windows provided by MinGW, they just aren't as good as
+ * directly using Win32 APIs on Windows.
+ */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#define NANOTIME_NSEC_PER_SEC UINT64_C(1000000000)
+
+#ifndef NANOTIME_ONLY_STEP
+
+/*
+ * Returns the current time since some unspecified epoch. With the exception of
+ * the standard C11 implementation and non-Apple/Mach kernel POSIX
+ * implementation when neither CLOCK_MONOTONIC_RAW nor CLOCK_MONOTONIC are
+ * available, the time values monotonically increase, so they're not equivalent
+ * to calendar time (i.e., no leap seconds are accounted for, etc.). Calendar
+ * time has to be used as a last resort sometimes, as monotonic time isn't
+ * always available.
+ */
+uint64_t nanotime_now();
+
+/*
+ * Returns the maximum possible timestamp value. Use of this value is required
+ * to properly handle overflow of timestamp values, such as when calculating the
+ * interval between a time value before overflow and the next time value after
+ * overflow.
+ */
+uint64_t nanotime_now_max();
+
+/*
+ * Sleeps the current thread for the requested count of nanoseconds. The slept
+ * duration may be less than, equal to, or greater than the time requested.
+ */
+void nanotime_sleep(uint64_t nsec_count);
+
+#endif
+
+#ifndef NANOTIME_ONLY_STEP
+
+/*
+ * The yield function is provided for some platforms, but in the case of
+ * unknown platforms, the function is defined as a no-op.
+ */
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <Windows.h>
+#define nanotime_yield() YieldProcessor()
+#define NANOTIME_YIELD_IMPLEMENTED
+#elif (defined(__unix__) || defined(__APPLE__)) && defined(_POSIX_VERSION) && (_POSIX_VERSION >= 200112L)
+#include <sched.h>
+#define nanotime_yield() (void)sched_yield()
+#define NANOTIME_YIELD_IMPLEMENTED
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_THREADS__)
+#include <threads.h>
+#define nanotime_yield() thrd_yield()
+#define NANOTIME_YIELD_IMPLEMENTED
+#elif defined(__cplusplus)
+extern void (* const nanotime_yield)();
+#elif defined(__SWITCH__)
+#include <switch.h>
+#define nanotime_yield() svcSleepThread(YieldType_ToAnyThread)
+#define NANOTIME_YIELD_IMPLEMENTED
+#else
+#define nanotime_yield()
+#define NANOTIME_YIELD_NOP
+#define NANOTIME_YIELD_IMPLEMENTED
+#endif
+
+#endif
+
+/*
+ * Calculates the time interval between two nanosecond time values, correctly
+ * handling the case when the end time value overflows past max. You should
+ * probably use this function when calculating time intervals, as not all
+ * platforms' maximum timestamp value is UINT64_MAX, which is required for the
+ * trivial "end - start" formula for interval calculation to work as expected.
+ */
+uint64_t nanotime_interval(const uint64_t start, const uint64_t end, const uint64_t max);
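+/*
+ * (Editor's illustration, not upstream nanotime code: suppose timestamps wrap
+ * at max = 999, so valid values run 0 through 999. A start reading of 990
+ * followed by a post-wraparound end reading of 5 gives
+ * nanotime_interval(990, 5, 999) == 15: 9 ticks from 990 up to 999, 1 tick
+ * wrapping from 999 to 0, and 5 more ticks up to 5. The unwrapped case is
+ * plain subtraction: nanotime_interval(5, 990, 999) == 985. With
+ * max == UINT64_MAX, both cases reduce to end - start.)
+ */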
+
+typedef struct nanotime_step_data {
+	uint64_t sleep_duration;
+	uint64_t now_max;
+	uint64_t (* now)();
+	void (* sleep)(uint64_t nsec_count);
+
+	#ifdef __APPLE__
+	uint64_t overhead_numer;
+	uint64_t overhead_denom;
+	uint64_t backoff;
+	#endif
+	uint64_t zero_sleep_duration;
+	uint64_t accumulator;
+	uint64_t sleep_point;
+} nanotime_step_data;
+
+/*
+ * Initializes the nanotime precise fixed timestep object. Call immediately
+ * before entering the loop using the stepper object.
+ */
+void nanotime_step_init(nanotime_step_data* const stepper, const uint64_t sleep_duration, const uint64_t now_max, uint64_t (* const now)(), void (* const sleep)(uint64_t nsec_count));
+
+/*
+ * Does one step of sleeping for a fixed timestep logic update cycle. It makes
+ * a best-attempt at a precise delay per iteration, but might skip a cycle of
+ * sleeping if skipping sleeps is required to catch up to the correct
+ * wall-clock time. Returns true if a sleep up to the latest target sleep end
+ * time occurred, otherwise returns false in the case of a sleep step skip.
+ */
+bool nanotime_step(nanotime_step_data* const stepper);
+
+#if !defined(NANOTIME_ONLY_STEP) && defined(NANOTIME_IMPLEMENTATION)
+
+/*
+ * Non-portable, platform-specific implementations are first. If none of them
+ * match the current platform, the standard C/C++ versions are used as a last
+ * resort.
+ */
+
+#ifdef _WIN32
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <Windows.h>
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+uint64_t nanotime_now() {
+	static uint64_t scale = UINT64_C(0);
+	static bool multiply;
+	if (scale == 0u) {
+		LARGE_INTEGER frequency;
+		QueryPerformanceFrequency(&frequency);
+		if (frequency.QuadPart < NANOTIME_NSEC_PER_SEC) {
+			scale = NANOTIME_NSEC_PER_SEC / frequency.QuadPart;
+			multiply = true;
+		}
+		else {
+			scale = frequency.QuadPart / NANOTIME_NSEC_PER_SEC;
+			multiply = false;
+		}
+	}
+	LARGE_INTEGER performanceCount;
+	QueryPerformanceCounter(&performanceCount);
+	if (multiply) {
+		return performanceCount.QuadPart * scale;
+	}
+	else {
+		return performanceCount.QuadPart / scale;
+	}
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_MAX_IMPLEMENTED
+uint64_t nanotime_now_max() {
+	static uint64_t now_max;
+	if (now_max == UINT64_C(0)) {
+		LARGE_INTEGER frequency;
+		QueryPerformanceFrequency(&frequency);
+		if (frequency.QuadPart < NANOTIME_NSEC_PER_SEC) {
+			now_max = UINT64_MAX * (NANOTIME_NSEC_PER_SEC / frequency.QuadPart);
+		}
+		else {
+			now_max = UINT64_MAX / (frequency.QuadPart / NANOTIME_NSEC_PER_SEC);
+		}
+	}
+	return now_max;
+}
+#define NANOTIME_NOW_MAX_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+void nanotime_sleep(uint64_t nsec_count) {
+	LARGE_INTEGER dueTime;
+
+	if (nsec_count < UINT64_C(100)) {
+		/*
+		 * Allows the OS to schedule another process for a single time
+		 * slice. Better than a delay of 0, which immediately returns
+		 * with no actual non-CPU-hogging delay. The time-slice-yield
+		 * behavior is specified in Microsoft's Windows documentation.
+		 */
+		SleepEx(0UL, FALSE);
+	}
+	else {
+		HANDLE timer = NULL;
+		if (
+#ifdef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION
+			/*
+			 * Requesting a high resolution timer can make quite the
+			 * difference, so always request high resolution if available. It's
+			 * available in Windows 10 1803 and above. This arrangement of
+			 * building it if the build system supports it will allow the
+			 * executable to use high resolution if available on a user's
+			 * system, but revert to low resolution if the user's system
+			 * doesn't support high resolution.
+			 */
+			(timer = CreateWaitableTimerEx(NULL, NULL, CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, TIMER_ALL_ACCESS)) == NULL &&
+#endif
+			(timer = CreateWaitableTimer(NULL, TRUE, NULL)) == NULL
+		) {
+			return;
+		}
+
+		dueTime.QuadPart = -(LONGLONG)(nsec_count / UINT64_C(100));
+
+		SetWaitableTimer(timer, &dueTime, 0L, NULL, NULL, FALSE);
+		WaitForSingleObject(timer, INFINITE);
+
+		CloseHandle(timer);
+	}
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#endif
+
+#if defined(__APPLE__) || defined(__MACH__)
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+// The current platform is some Apple operating system, or at least uses some
+// Mach kernel. The POSIX implementation below using clock_gettime works on at
+// least Apple platforms, though this version using Mach functions has lower
+// overhead.
+#include <mach/mach_time.h>
+uint64_t nanotime_now() {
+	static mach_timebase_info_data_t info = { 0 };
+	if (info.denom == UINT32_C(0)) {
+		const kern_return_t status = mach_timebase_info(&info);
+		assert(status == KERN_SUCCESS);
+		if (status != KERN_SUCCESS) {
+			return UINT64_C(0);
+		}
+	}
+	return (mach_absolute_time() * info.numer) / info.denom;
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_MAX_IMPLEMENTED
+#include <mach/mach_time.h>
+uint64_t nanotime_now_max() {
+	static uint64_t now_max = UINT64_C(0);
+	if (now_max == UINT64_C(0)) {
+		mach_timebase_info_data_t info;
+		const kern_return_t status = mach_timebase_info(&info);
+		assert(status == KERN_SUCCESS);
+		if (status != KERN_SUCCESS) {
+			return UINT64_C(0);
+		}
+		else {
+			now_max = UINT64_MAX / info.denom;
+		}
+	}
+	return now_max;
+}
+#define NANOTIME_NOW_MAX_IMPLEMENTED
+#endif
+
+#endif
+
+#if defined(__unix__) && defined(_POSIX_VERSION) && (_POSIX_VERSION >= 199309L) && !defined(NANOTIME_NOW_IMPLEMENTED)
+// Current platform is some version of POSIX, that might have clock_gettime.
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+uint64_t nanotime_now() {
+	struct timespec now;
+	const int status = clock_gettime(
+		#if defined(CLOCK_MONOTONIC_RAW)
+		// Monotonic raw is more precise, but not always available. For the
+		// sorts of applications this code is intended for, mainly soft real
+		// time applications such as game programming, the subtle
+		// inconsistencies of it vs. monotonic aren't an issue.
+		CLOCK_MONOTONIC_RAW
+		#elif defined(CLOCK_MONOTONIC)
+		// Monotonic is quite good, and widely available, but not as precise as
+		// monotonic raw, so it's only used if required.
+		CLOCK_MONOTONIC
+		#else
+		// Realtime isn't fully correct, as it's calendar time, but is even more
+		// widely available than monotonic. Monotonic is only unavailable on
+		// very old platforms though, so old they're likely unused now (as of
+		// last editing this, 2023).
+		CLOCK_REALTIME
+		#endif
+	, &now);
+	assert(status == 0 || (status == -1 && errno != EOVERFLOW));
+	if (status == 0 || (status == -1 && errno != EOVERFLOW)) {
+		return (uint64_t)now.tv_sec * NANOTIME_NSEC_PER_SEC + (uint64_t)now.tv_nsec;
+	}
+	else {
+		return UINT64_C(0);
+	}
+}
+#define NANOTIME_NOW_IMPLEMENTED
+
+#endif
+
+#if (defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(__MINGW32__) || defined(__MINGW64__)) && !defined(NANOTIME_SLEEP_IMPLEMENTED)
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+void nanotime_sleep(uint64_t nsec_count) {
+	const struct timespec req = {
+		.tv_sec = (time_t)(nsec_count / NANOTIME_NSEC_PER_SEC),
+		.tv_nsec = (long)(nsec_count % NANOTIME_NSEC_PER_SEC)
+	};
+#ifndef NDEBUG
+	const int status =
+#endif
+	nanosleep(&req, NULL);
+	assert(status == 0 || (status == -1 && errno != EINVAL));
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#if defined(__vita__)
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#include <psp2/kernel/threadmgr.h>
+void nanotime_sleep(uint64_t nsec_count) {
+	sceKernelDelayThreadCB(nsec_count / UINT64_C(1000));
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#include <psp2/kernel/processmgr.h>
+uint64_t nanotime_now() {
+	return sceKernelGetProcessTimeWide() * UINT64_C(1000);
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+#endif
+
+#ifdef __EMSCRIPTEN__
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#include <emscripten.h>
+/*
+ * NOTE: You *must* have asyncify enabled in the Emscripten build (pass
+ * -sASYNCIFY to the compiler/linker) or sleeping won't work.
+ */
+void nanotime_sleep(uint64_t nsec_count) {
+	emscripten_sleep(nsec_count / UINT64_C(1000000));
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#include <emscripten.h>
+uint64_t nanotime_now() {
+	const double now = emscripten_get_now();
+	return (uint64_t)now * UINT64_C(1000000);
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+#endif
+
+#ifdef __SWITCH__
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#include <switch.h>
+void nanotime_sleep(uint64_t nsec_count) {
+	if (nsec_count > INT64_MAX) {
+		svcSleepThread(INT64_MAX);
+	}
+	else {
+		svcSleepThread((s64)nsec_count);
+	}
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#include <switch.h>
+uint64_t nanotime_now() {
+	return svcGetSystemTick();
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+#endif
+
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#include <time.h>
+uint64_t nanotime_now() {
+	struct timespec now;
+	const int status = timespec_get(&now, TIME_UTC);
+	assert(status == TIME_UTC);
+	if (status == TIME_UTC) {
+		return (uint64_t)now.tv_sec * NANOTIME_NSEC_PER_SEC + (uint64_t)now.tv_nsec;
+	}
+	else {
+		return UINT64_C(0);
+	}
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+#endif
+
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_THREADS__)
+#include <threads.h>
+void nanotime_sleep(uint64_t nsec_count) {
+	const struct timespec req = {
+		.tv_sec = (time_t)(nsec_count / NANOTIME_NSEC_PER_SEC),
+		.tv_nsec = (long)(nsec_count % NANOTIME_NSEC_PER_SEC)
+	};
+	const int status = thrd_sleep(&req, NULL);
+	assert(status == 0 || status == -1);
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+#endif
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#if !defined(NANOTIME_ONLY_STEP) && defined(NANOTIME_IMPLEMENTATION) && defined(__cplusplus)
+
+#if !defined(NANOTIME_YIELD_IMPLEMENTED)
+#include <thread>
+extern "C" void (* const nanotime_yield)() = std::this_thread::yield;
+#define NANOTIME_YIELD_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#include <cstdint>
+#include <chrono>
+extern "C" uint64_t nanotime_now() {
+	return static_cast<uint64_t>(
+		std::chrono::time_point_cast<std::chrono::nanoseconds>(
+			std::chrono::steady_clock::now()
+		).time_since_epoch().count()
+	);
+}
+#define NANOTIME_NOW_IMPLEMENTED
+#endif
+
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#include <thread>
+#include <chrono>
+#include <exception>
+extern "C" void nanotime_sleep(uint64_t nsec_count) {
+	try {
+		std::this_thread::sleep_for(std::chrono::nanoseconds(nsec_count));
+	}
+	catch (const std::exception&) {
+	}
+}
+#define NANOTIME_SLEEP_IMPLEMENTED
+#endif
+
+#endif
+
+#if !defined(NANOTIME_ONLY_STEP) && defined(NANOTIME_IMPLEMENTATION)
+
+#ifndef NANOTIME_NOW_IMPLEMENTED
+#error "Failed to implement nanotime_now (try using C11 with C11 threads support or C++11)."
+#endif
+
+#ifndef NANOTIME_SLEEP_IMPLEMENTED
+#error "Failed to implement nanotime_sleep (try using C11 with C11 threads support or C++11)."
+#endif
+
+#endif
+
+#ifdef NANOTIME_IMPLEMENTATION
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef NANOTIME_NOW_MAX_IMPLEMENTED
+// Might not be correct on some platforms, but it's the best we can do as a last
+// resort.
+uint64_t nanotime_now_max() {
+	return UINT64_MAX;
+}
+#define NANOTIME_NOW_MAX_IMPLEMENTED
+#endif
+
+uint64_t nanotime_interval(const uint64_t start, const uint64_t end, const uint64_t max) {
+	assert(
+		max > UINT64_C(0) &&
+		start <= max &&
+		end <= max
+	);
+
+	if (end >= start) {
+		return end - start;
+	}
+	else {
+		return end + (max - start) + UINT64_C(1);
+	}
+}
+
+void nanotime_step_init(nanotime_step_data* const stepper, const uint64_t sleep_duration, const uint64_t now_max, uint64_t (* const now)(), void (* const sleep)(uint64_t nsec_count)) {
+	assert(
+		stepper != NULL &&
+		sleep_duration > UINT64_C(0) &&
+		now_max > UINT64_C(0) &&
+		now != NULL &&
+		sleep != NULL
+	);
+
+	stepper->sleep_duration = sleep_duration;
+	stepper->now_max = now_max;
+	stepper->now = now;
+	stepper->sleep = sleep;
+
+	const uint64_t start = now();
+	sleep(UINT64_C(0));
+	stepper->zero_sleep_duration = nanotime_interval(start, now(), now_max);
+	#ifdef __APPLE__
+	stepper->overhead_numer = UINT64_C(1);
+	stepper->overhead_denom = UINT64_C(1);
+	stepper->backoff = UINT64_C(0);
+	#endif
+	stepper->accumulator = UINT64_C(0);
+
+	// This should be last here, so the sleep point is close to what it
+	// should be.
+	stepper->sleep_point = now();
+}
+
+bool nanotime_step(nanotime_step_data* const stepper) {
+	assert(stepper != NULL);
+
+	bool slept;
+	if (stepper->accumulator < stepper->sleep_duration) {
+		const uint64_t total_sleep_duration = stepper->sleep_duration - stepper->accumulator;
+		uint64_t current_sleep_duration = total_sleep_duration;
+		const uint64_t shift = UINT64_C(4);
+
+		#ifdef __APPLE__
+		// Start with a big sleep. This helps reduce CPU/power use vs. many
+		// shorter sleeps. Shorter sleeps are still done below, but this reduces
+		// the number of shorter sleeps. It appears that the actually-slept
+		// duration is roughly equal to the requested delay time multiplied by a
+		// factor that remains relatively constant over the short run, greater
+		// than 1.0, in testing on ARM macOS; such behavior doesn't appear to be
+		// the case on all platforms, but is the case in testing on an M1 Mac.
+		// By only setting the overhead factor when the sleep overshoots, it
+		// levels off to a pretty stable value in a feedback loop, resulting in
+		// the big sleep approaching a value that does as big as possible of a
+		// big sleep while reducing overshoots. Also, a "backoff" duration is
+		// subtracted from the overhead factor-adjusted sleep duration, which
+		// reduces the frequency of overshoots, while still maintaining the
+		// desired longer sleep duration before the higher cost/higher precision
+		// sleeping below; the backoff duration is also updated in a feedback
+		// loop that causes it to approach a reasonably correct value.
+		// TODO: This was carefully tuned to be well-behaved on Apple Silicon
+		// M1, but hasn't been tested on any Intel Mac; test on an Intel Mac to
+		// see if this algorithm is appropriate there, if not, special-case for
+		// each CPU type.
+		// TODO: Implement "initial big sleep" for other platforms; it really
+		// does reduce wasted cycles regardless of platform. Or, if this
+		// algorithm seems to work fine on other platforms, change it to be used
+		// on all platforms, not just Apple's.
+		uint64_t overhead_sleep_duration = (current_sleep_duration * stepper->overhead_numer) / stepper->overhead_denom;
+		if (overhead_sleep_duration > stepper->backoff) {
+			overhead_sleep_duration -= stepper->backoff;
+		}
+		else {
+			stepper->backoff = UINT64_C(0);
+		}
+		const uint64_t overhead_start = stepper->now();
+		stepper->sleep(overhead_sleep_duration);
+		const uint64_t big_sleep_duration = nanotime_interval(overhead_start, stepper->now(), stepper->now_max);
+		const uint64_t slept_so_far = nanotime_interval(stepper->sleep_point, stepper->now(), stepper->now_max);
+		if (slept_so_far <= total_sleep_duration) {
+			if (stepper->backoff >= total_sleep_duration - slept_so_far) {
+				stepper->backoff -= total_sleep_duration - slept_so_far;
+			}
+			current_sleep_duration -= slept_so_far;
+		}
+		else {
+			stepper->overhead_numer = overhead_sleep_duration;
+			stepper->overhead_denom = big_sleep_duration > UINT64_C(0) ? big_sleep_duration : UINT64_C(1);
+			if (stepper->backoff <= UINT64_MAX - slept_so_far - total_sleep_duration) {
+				stepper->backoff += slept_so_far - total_sleep_duration;
+			}
+			if (stepper->overhead_numer > stepper->overhead_denom) {
+				stepper->overhead_numer = UINT64_C(1);
+				stepper->overhead_denom = UINT64_C(1);
+			}
+			goto step_end;
+		}
+		#endif
+
+		// This has the flavor of Zeno's dichotomous paradox of motion, as it
+		// successively divides the time remaining to sleep, but attempts to
+		// stop short of the deadline to hopefully be able to precisely sleep up
+		// to the deadline below this loop. The divisor is larger than two
+		// though, as it produces better behavior, and seems to work fine in
+		// testing on real hardware. This loop, and the one below, take the
+		// assumption that sleep requests of the same amount are roughly equal;
+		// by keeping track of the max of all the sleeps, the loops can be
+		// broken out of when the remaining time is less than the max, allowing
+		// the loop after this one to do shorter sleeps, with a corresponding
+		// smaller max of the sleeps. The overshoot possible in the loop below
+		// this one won't overshoot much, or in the best case won't overshoot so
+		// the busyloop can finish up the sleep precisely.
+		current_sleep_duration >>= shift;
+		for (
+			uint64_t max = stepper->zero_sleep_duration;
+			nanotime_interval(stepper->sleep_point, stepper->now(), stepper->now_max) + max < total_sleep_duration && current_sleep_duration > UINT64_C(0);
+			current_sleep_duration >>= shift
+		) {
+			max = stepper->zero_sleep_duration;
+			uint64_t start;
+			while (max < stepper->sleep_duration && nanotime_interval(stepper->sleep_point, start = stepper->now(), stepper->now_max) + max < total_sleep_duration) {
+				stepper->sleep(current_sleep_duration);
+				uint64_t slept_duration;
+				if ((slept_duration = nanotime_interval(start, stepper->now(), stepper->now_max)) > max) {
+					max = slept_duration;
+				}
+			}
+		}
+
+		{
+			// After (hopefully) stopping short of the deadline by a small
+			// amount, do small sleeps here to get closer to the deadline, but
+			// again attempting to stop short by an even smaller amount. It's
+			// best to do larger sleeps as done in the above loop, to reduce
+			// CPU/power usage, as each sleep iteration has a CPU/power usage
+			// cost.
+			uint64_t max = stepper->zero_sleep_duration;
+			uint64_t start;
+			while (nanotime_interval(stepper->sleep_point, start = stepper->now(), stepper->now_max) + max < total_sleep_duration) {
+				stepper->sleep(UINT64_C(0));
+				if ((stepper->zero_sleep_duration = nanotime_interval(start, stepper->now(), stepper->now_max)) > max) {
+					max = stepper->zero_sleep_duration;
+				}
+			}
+		}
+
+		#ifdef __APPLE__
+		step_end:
+		#endif
+		{
+			// Finally, do a busyloop to precisely sleep up to the
+			// deadline. The code above this loop attempts to reduce the
+			// remaining time to sleep to a minimum via process-yielding
+			// sleeps, so the amount of time spent spinning here is
+			// hopefully quite low.
+			uint64_t current_time;
+			uint64_t accumulated;
+			while ((accumulated = nanotime_interval(stepper->sleep_point, current_time = stepper->now(), stepper->now_max)) < total_sleep_duration);
+
+			stepper->accumulator += accumulated;
+			stepper->sleep_point = current_time;
+			slept = true;
+		}
+	}
+	else {
+		slept = false;
+	}
+	stepper->accumulator -= stepper->sleep_duration;
+	return slept;
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#endif /* _include_guard_nanotime_ */
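To sanity-check the new timing path in isolation, a small smoke test can be built against the vendored header. This is a hypothetical test, not part of the patch; it exercises nanotime_step_init, nanotime_step's return value (false means a sleep was skipped to catch up), and overflow-safe interval measurement.

#define NANOTIME_IMPLEMENTATION
#include "third_party/nanotime/nanotime.h"
#include <stdio.h>

int main(void) {
  nanotime_step_data stepper;
  /* Same parameters main.c uses: 60 steps per second. */
  nanotime_step_init(&stepper, NANOTIME_NSEC_PER_SEC / 60,
                     nanotime_now_max(), nanotime_now, nanotime_sleep);
  const uint64_t start = nanotime_now();
  int skips = 0;
  for (int i = 0; i < 60; i++) {
    if (!nanotime_step(&stepper))
      skips++;  /* a skipped sleep means that iteration ran over budget */
  }
  /* Measure elapsed time with the overflow-correct interval helper;
   * 60 steps should take roughly one second of wall-clock time. */
  const uint64_t elapsed = nanotime_interval(start, nanotime_now(), nanotime_now_max());
  printf("60 steps took %.3f s with %d skipped sleeps\n",
         (double)elapsed / NANOTIME_NSEC_PER_SEC, skips);
  return 0;
}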